summary refs log tree commit diff
diff options
context:
space:
mode:
authorRichard van der Hoff <1389908+richvdh@users.noreply.github.com>2019-12-09 11:55:30 +0000
committerGitHub <noreply@github.com>2019-12-09 11:55:30 +0000
commit18660a34d82ccb120efd2fa2480b42ac62dbe2b4 (patch)
tree60503faeb260b59e78f4f02fdcda8f6c7bea5ae8
parentRemove fallback for missing /federation/v1/state_ids API (#6488) (diff)
downloadsynapse-18660a34d82ccb120efd2fa2480b42ac62dbe2b4.tar.xz
Fix inaccurate per-block metrics (#6491)
`Measure` incorrectly assumed that it was the only thing being done by the parent `LoggingContext`. For instance, during a "renew group attestations" operation, hundreds of `outbound_request` calls could take place in parallel, all using the same `LoggingContext`. This would mean that any resources used during *any* of those calls would be reported against *all* of them, producing wildly inaccurate results.

Instead, we now give each `Measure` block its own `LoggingContext` (using the parent `LoggingContext` mechanism to ensure that the log lines look correct and that the metrics are ultimately propogated to the top level for reporting against requests/backgrond tasks).
-rw-r--r--changelog.d/6491.bugfix1
-rw-r--r--synapse/util/metrics.py60
2 files changed, 19 insertions, 42 deletions
diff --git a/changelog.d/6491.bugfix b/changelog.d/6491.bugfix
new file mode 100644

index 0000000000..78204693b0 --- /dev/null +++ b/changelog.d/6491.bugfix
@@ -0,0 +1 @@ +Fix inaccurate per-block Prometheus metrics. diff --git a/synapse/util/metrics.py b/synapse/util/metrics.py
index 63ddaaba87..7b18455469 100644 --- a/synapse/util/metrics.py +++ b/synapse/util/metrics.py
@@ -91,72 +91,48 @@ class Measure(object): __slots__ = [ "clock", "name", - "start_context", + "_logging_context", "start", - "created_context", - "start_usage", ] def __init__(self, clock, name): self.clock = clock self.name = name - self.start_context = None + self._logging_context = None self.start = None - self.created_context = False def __enter__(self): - self.start = self.clock.time() - self.start_context = LoggingContext.current_context() - if not self.start_context: - self.start_context = LoggingContext("Measure") - self.start_context.__enter__() - self.created_context = True - - self.start_usage = self.start_context.get_resource_usage() + if self._logging_context: + raise RuntimeError("Measure() objects cannot be re-used") + self.start = self.clock.time() + parent_context = LoggingContext.current_context() + self._logging_context = LoggingContext( + "Measure[%s]" % (self.name,), parent_context + ) + self._logging_context.__enter__() in_flight.register((self.name,), self._update_in_flight) def __exit__(self, exc_type, exc_val, exc_tb): - if isinstance(exc_type, Exception) or not self.start_context: - return - - in_flight.unregister((self.name,), self._update_in_flight) + if not self._logging_context: + raise RuntimeError("Measure() block exited without being entered") duration = self.clock.time() - self.start + usage = self._logging_context.get_resource_usage() - block_counter.labels(self.name).inc() - block_timer.labels(self.name).inc(duration) - - context = LoggingContext.current_context() - - if context != self.start_context: - logger.warning( - "Context has unexpectedly changed from '%s' to '%s'. (%r)", - self.start_context, - context, - self.name, - ) - return - - if not context: - logger.warning("Expected context. (%r)", self.name) - return + in_flight.unregister((self.name,), self._update_in_flight) + self._logging_context.__exit__(exc_type, exc_val, exc_tb) - current = context.get_resource_usage() - usage = current - self.start_usage try: + block_counter.labels(self.name).inc() + block_timer.labels(self.name).inc(duration) block_ru_utime.labels(self.name).inc(usage.ru_utime) block_ru_stime.labels(self.name).inc(usage.ru_stime) block_db_txn_count.labels(self.name).inc(usage.db_txn_count) block_db_txn_duration.labels(self.name).inc(usage.db_txn_duration_sec) block_db_sched_duration.labels(self.name).inc(usage.db_sched_duration_sec) except ValueError: - logger.warning( - "Failed to save metrics! OLD: %r, NEW: %r", self.start_usage, current - ) - - if self.created_context: - self.start_context.__exit__(exc_type, exc_val, exc_tb) + logger.warning("Failed to save metrics! Usage: %s", usage) def _update_in_flight(self, metrics): """Gets called when processing in flight metrics