summary refs log tree commit diff
path: root/synapse
diff options
context:
space:
mode:
authorRichard van der Hoff <1389908+richvdh@users.noreply.github.com>2019-12-09 11:55:30 +0000
committerGitHub <noreply@github.com>2019-12-09 11:55:30 +0000
commit18660a34d82ccb120efd2fa2480b42ac62dbe2b4 (patch)
tree60503faeb260b59e78f4f02fdcda8f6c7bea5ae8 /synapse
parentRemove fallback for missing /federation/v1/state_ids API (#6488) (diff)
downloadsynapse-18660a34d82ccb120efd2fa2480b42ac62dbe2b4.tar.xz
Fix inaccurate per-block metrics (#6491)
`Measure` incorrectly assumed that it was the only thing being done by the parent `LoggingContext`. For instance, during a "renew group attestations" operation, hundreds of `outbound_request` calls could take place in parallel, all using the same `LoggingContext`. This would mean that any resources used during *any* of those calls would be reported against *all* of them, producing wildly inaccurate results.

Instead, we now give each `Measure` block its own `LoggingContext` (using the parent `LoggingContext` mechanism to ensure that the log lines look correct and that the metrics are ultimately propogated to the top level for reporting against requests/backgrond tasks).
Diffstat (limited to 'synapse')
-rw-r--r--synapse/util/metrics.py60
1 files changed, 18 insertions, 42 deletions
diff --git a/synapse/util/metrics.py b/synapse/util/metrics.py
index 63ddaaba87..7b18455469 100644
--- a/synapse/util/metrics.py
+++ b/synapse/util/metrics.py
@@ -91,72 +91,48 @@ class Measure(object):
     __slots__ = [
         "clock",
         "name",
-        "start_context",
+        "_logging_context",
         "start",
-        "created_context",
-        "start_usage",
     ]
 
     def __init__(self, clock, name):
         self.clock = clock
         self.name = name
-        self.start_context = None
+        self._logging_context = None
         self.start = None
-        self.created_context = False
 
     def __enter__(self):
-        self.start = self.clock.time()
-        self.start_context = LoggingContext.current_context()
-        if not self.start_context:
-            self.start_context = LoggingContext("Measure")
-            self.start_context.__enter__()
-            self.created_context = True
-
-        self.start_usage = self.start_context.get_resource_usage()
+        if self._logging_context:
+            raise RuntimeError("Measure() objects cannot be re-used")
 
+        self.start = self.clock.time()
+        parent_context = LoggingContext.current_context()
+        self._logging_context = LoggingContext(
+            "Measure[%s]" % (self.name,), parent_context
+        )
+        self._logging_context.__enter__()
         in_flight.register((self.name,), self._update_in_flight)
 
     def __exit__(self, exc_type, exc_val, exc_tb):
-        if isinstance(exc_type, Exception) or not self.start_context:
-            return
-
-        in_flight.unregister((self.name,), self._update_in_flight)
+        if not self._logging_context:
+            raise RuntimeError("Measure() block exited without being entered")
 
         duration = self.clock.time() - self.start
+        usage = self._logging_context.get_resource_usage()
 
-        block_counter.labels(self.name).inc()
-        block_timer.labels(self.name).inc(duration)
-
-        context = LoggingContext.current_context()
-
-        if context != self.start_context:
-            logger.warning(
-                "Context has unexpectedly changed from '%s' to '%s'. (%r)",
-                self.start_context,
-                context,
-                self.name,
-            )
-            return
-
-        if not context:
-            logger.warning("Expected context. (%r)", self.name)
-            return
+        in_flight.unregister((self.name,), self._update_in_flight)
+        self._logging_context.__exit__(exc_type, exc_val, exc_tb)
 
-        current = context.get_resource_usage()
-        usage = current - self.start_usage
         try:
+            block_counter.labels(self.name).inc()
+            block_timer.labels(self.name).inc(duration)
             block_ru_utime.labels(self.name).inc(usage.ru_utime)
             block_ru_stime.labels(self.name).inc(usage.ru_stime)
             block_db_txn_count.labels(self.name).inc(usage.db_txn_count)
             block_db_txn_duration.labels(self.name).inc(usage.db_txn_duration_sec)
             block_db_sched_duration.labels(self.name).inc(usage.db_sched_duration_sec)
         except ValueError:
-            logger.warning(
-                "Failed to save metrics! OLD: %r, NEW: %r", self.start_usage, current
-            )
-
-        if self.created_context:
-            self.start_context.__exit__(exc_type, exc_val, exc_tb)
+            logger.warning("Failed to save metrics! Usage: %s", usage)
 
     def _update_in_flight(self, metrics):
         """Gets called when processing in flight metrics