summary refs log tree commit diff
diff options
context:
space:
mode:
authorRichard van der Hoff <1389908+richvdh@users.noreply.github.com>2019-06-24 17:12:54 +0100
committerGitHub <noreply@github.com>2019-06-24 17:12:54 +0100
commite59a8cd2e54e5e91dac4e1747849a683757dbc0d (patch)
treebc916952d19771c175c3b8d43e49316310172056
parentMerge pull request #5534 from matrix-org/babolivier/federation-publicrooms (diff)
parentblack (diff)
downloadsynapse-e59a8cd2e54e5e91dac4e1747849a683757dbc0d.tar.xz
Merge pull request #5499 from matrix-org/rav/cleanup_metrics
Cleanups and sanity-checking in cpu and db metrics
-rw-r--r--changelog.d/5499.misc1
-rw-r--r--synapse/storage/_base.py35
-rw-r--r--synapse/util/logcontext.py46
3 files changed, 53 insertions, 29 deletions
diff --git a/changelog.d/5499.misc b/changelog.d/5499.misc
new file mode 100644
index 0000000000..84de1f2dae
--- /dev/null
+++ b/changelog.d/5499.misc
@@ -0,0 +1 @@
+Some cleanups and sanity-checking in the CPU and database metrics.
\ No newline at end of file
diff --git a/synapse/storage/_base.py b/synapse/storage/_base.py
index aae43d0f99..29589853c6 100644
--- a/synapse/storage/_base.py
+++ b/synapse/storage/_base.py
@@ -175,22 +175,22 @@ class PerformanceCounters(object):
         self.current_counters = {}
         self.previous_counters = {}
 
-    def update(self, key, start_time, end_time=None):
-        if end_time is None:
-            end_time = time.time()
-        duration = end_time - start_time
+    def update(self, key, duration_secs):
         count, cum_time = self.current_counters.get(key, (0, 0))
         count += 1
-        cum_time += duration
+        cum_time += duration_secs
         self.current_counters[key] = (count, cum_time)
-        return end_time
 
-    def interval(self, interval_duration, limit=3):
+    def interval(self, interval_duration_secs, limit=3):
         counters = []
         for name, (count, cum_time) in iteritems(self.current_counters):
             prev_count, prev_time = self.previous_counters.get(name, (0, 0))
             counters.append(
-                ((cum_time - prev_time) / interval_duration, count - prev_count, name)
+                (
+                    (cum_time - prev_time) / interval_duration_secs,
+                    count - prev_count,
+                    name,
+                )
             )
 
         self.previous_counters = dict(self.current_counters)
@@ -221,7 +221,6 @@ class SQLBaseStore(object):
         #   is running in mainline, and we have some nice monitoring frontends
         #   to watch it
         self._txn_perf_counters = PerformanceCounters()
-        self._get_event_counters = PerformanceCounters()
 
         self._get_event_cache = Cache(
             "*getEvent*", keylen=3, max_entries=hs.config.event_cache_size
@@ -369,21 +368,13 @@ class SQLBaseStore(object):
             time_then = self._previous_loop_ts
             self._previous_loop_ts = time_now
 
-            ratio = (curr - prev) / (time_now - time_then)
-
-            top_three_counters = self._txn_perf_counters.interval(
-                time_now - time_then, limit=3
-            )
+            duration = time_now - time_then
+            ratio = (curr - prev) / duration
 
-            top_3_event_counters = self._get_event_counters.interval(
-                time_now - time_then, limit=3
-            )
+            top_three_counters = self._txn_perf_counters.interval(duration, limit=3)
 
             perf_logger.info(
-                "Total database time: %.3f%% {%s} {%s}",
-                ratio * 100,
-                top_three_counters,
-                top_3_event_counters,
+                "Total database time: %.3f%% {%s}", ratio * 100, top_three_counters
             )
 
         self._clock.looping_call(loop, 10000)
@@ -465,7 +456,7 @@ class SQLBaseStore(object):
             transaction_logger.debug("[TXN END] {%s} %f sec", name, duration)
 
             self._current_txn_total_time += duration
-            self._txn_perf_counters.update(desc, start, end)
+            self._txn_perf_counters.update(desc, duration)
             sql_txn_timer.labels(desc).observe(duration)
 
     @defer.inlineCallbacks
diff --git a/synapse/util/logcontext.py b/synapse/util/logcontext.py
index a9885cb507..6b0d2deea0 100644
--- a/synapse/util/logcontext.py
+++ b/synapse/util/logcontext.py
@@ -336,10 +336,9 @@ class LoggingContext(object):
             logger.warning("Called stop on logcontext %s without calling start", self)
             return
 
-        usage_end = get_thread_resource_usage()
-
-        self._resource_usage.ru_utime += usage_end.ru_utime - self.usage_start.ru_utime
-        self._resource_usage.ru_stime += usage_end.ru_stime - self.usage_start.ru_stime
+        utime_delta, stime_delta = self._get_cputime()
+        self._resource_usage.ru_utime += utime_delta
+        self._resource_usage.ru_stime += stime_delta
 
         self.usage_start = None
 
@@ -357,13 +356,44 @@ class LoggingContext(object):
         # can include resource usage so far.
         is_main_thread = threading.current_thread() is self.main_thread
         if self.alive and self.usage_start and is_main_thread:
-            current = get_thread_resource_usage()
-            res.ru_utime += current.ru_utime - self.usage_start.ru_utime
-            res.ru_stime += current.ru_stime - self.usage_start.ru_stime
+            utime_delta, stime_delta = self._get_cputime()
+            res.ru_utime += utime_delta
+            res.ru_stime += stime_delta
 
         return res
 
+    def _get_cputime(self):
+        """Get the cpu usage time so far
+
+        Returns: Tuple[float, float]: seconds in user mode, seconds in system mode
+        """
+        current = get_thread_resource_usage()
+
+        utime_delta = current.ru_utime - self.usage_start.ru_utime
+        stime_delta = current.ru_stime - self.usage_start.ru_stime
+
+        # sanity check
+        if utime_delta < 0:
+            logger.error(
+                "utime went backwards! %f < %f",
+                current.ru_utime,
+                self.usage_start.ru_utime,
+            )
+            utime_delta = 0
+
+        if stime_delta < 0:
+            logger.error(
+                "stime went backwards! %f < %f",
+                current.ru_stime,
+                self.usage_start.ru_stime,
+            )
+            stime_delta = 0
+
+        return utime_delta, stime_delta
+
     def add_database_transaction(self, duration_sec):
+        if duration_sec < 0:
+            raise ValueError("DB txn time can only be non-negative")
         self._resource_usage.db_txn_count += 1
         self._resource_usage.db_txn_duration_sec += duration_sec
 
@@ -374,6 +404,8 @@ class LoggingContext(object):
             sched_sec (float): number of seconds it took us to get a
                 connection
         """
+        if sched_sec < 0:
+            raise ValueError("DB scheduling time can only be non-negative")
         self._resource_usage.db_sched_duration_sec += sched_sec
 
     def record_event_fetch(self, event_count):