diff --git a/changelog.d/5499.misc b/changelog.d/5499.misc
new file mode 100644
index 0000000000..84de1f2dae
--- /dev/null
+++ b/changelog.d/5499.misc
@@ -0,0 +1 @@
+Some cleanups and sanity-checking in the CPU and database metrics.
\ No newline at end of file
diff --git a/synapse/storage/_base.py b/synapse/storage/_base.py
index aae43d0f99..29589853c6 100644
--- a/synapse/storage/_base.py
+++ b/synapse/storage/_base.py
@@ -175,22 +175,22 @@ class PerformanceCounters(object):
self.current_counters = {}
self.previous_counters = {}
- def update(self, key, start_time, end_time=None):
- if end_time is None:
- end_time = time.time()
- duration = end_time - start_time
+ def update(self, key, duration_secs):
count, cum_time = self.current_counters.get(key, (0, 0))
count += 1
- cum_time += duration
+ cum_time += duration_secs
self.current_counters[key] = (count, cum_time)
- return end_time
- def interval(self, interval_duration, limit=3):
+ def interval(self, interval_duration_secs, limit=3):
counters = []
for name, (count, cum_time) in iteritems(self.current_counters):
prev_count, prev_time = self.previous_counters.get(name, (0, 0))
counters.append(
- ((cum_time - prev_time) / interval_duration, count - prev_count, name)
+ (
+ (cum_time - prev_time) / interval_duration_secs,
+ count - prev_count,
+ name,
+ )
)
self.previous_counters = dict(self.current_counters)
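
For context, a minimal sketch (not part of the patch; the timed operation and the counter key are
illustrative) of how the reworked PerformanceCounters API is used after this change: the caller
measures the elapsed time itself and passes the duration in seconds to update(), and the periodic
reporter passes the length of the reporting window to interval().

    # Illustrative only; the real callers live in SQLBaseStore (see the hunks below).
    import time
    from synapse.storage._base import PerformanceCounters

    counters = PerformanceCounters()

    start = time.time()
    time.sleep(0.01)                      # stand-in for the timed database work
    duration_secs = time.time() - start

    # update() now takes the already-measured duration instead of start/end times
    counters.update("some_txn_desc", duration_secs)

    # report the busiest counters over a (here, hypothetical) 10 second window
    top_three = counters.interval(interval_duration_secs=10.0, limit=3)
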
@@ -221,7 +221,6 @@ class SQLBaseStore(object):
# is running in mainline, and we have some nice monitoring frontends
# to watch it
self._txn_perf_counters = PerformanceCounters()
- self._get_event_counters = PerformanceCounters()
self._get_event_cache = Cache(
"*getEvent*", keylen=3, max_entries=hs.config.event_cache_size
@@ -369,21 +368,13 @@ class SQLBaseStore(object):
time_then = self._previous_loop_ts
self._previous_loop_ts = time_now
- ratio = (curr - prev) / (time_now - time_then)
-
- top_three_counters = self._txn_perf_counters.interval(
- time_now - time_then, limit=3
- )
+ duration = time_now - time_then
+ ratio = (curr - prev) / duration
- top_3_event_counters = self._get_event_counters.interval(
- time_now - time_then, limit=3
- )
+ top_three_counters = self._txn_perf_counters.interval(duration, limit=3)
perf_logger.info(
- "Total database time: %.3f%% {%s} {%s}",
- ratio * 100,
- top_three_counters,
- top_3_event_counters,
+ "Total database time: %.3f%% {%s}", ratio * 100, top_three_counters
)
self._clock.looping_call(loop, 10000)
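
A standalone sketch of the per-interval arithmetic in the reporting loop above, with illustrative
numbers only: if the store spent 1.2 seconds of the last 10-second window inside transactions, the
logged "Total database time" is 12%.

    curr, prev = 101.2, 100.0          # cumulative txn time now vs. at the last loop
    time_now, time_then = 20.0, 10.0   # wall-clock timestamps of the two loops

    duration = time_now - time_then    # 10.0s reporting window
    ratio = (curr - prev) / duration   # ~0.12

    print("Total database time: %.3f%%" % (ratio * 100))   # Total database time: 12.000%
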
@@ -465,7 +456,7 @@ class SQLBaseStore(object):
transaction_logger.debug("[TXN END] {%s} %f sec", name, duration)
self._current_txn_total_time += duration
- self._txn_perf_counters.update(desc, start, end)
+ self._txn_perf_counters.update(desc, duration)
sql_txn_timer.labels(desc).observe(duration)
@defer.inlineCallbacks
diff --git a/synapse/util/logcontext.py b/synapse/util/logcontext.py
index a9885cb507..6b0d2deea0 100644
--- a/synapse/util/logcontext.py
+++ b/synapse/util/logcontext.py
@@ -336,10 +336,9 @@ class LoggingContext(object):
logger.warning("Called stop on logcontext %s without calling start", self)
return
- usage_end = get_thread_resource_usage()
-
- self._resource_usage.ru_utime += usage_end.ru_utime - self.usage_start.ru_utime
- self._resource_usage.ru_stime += usage_end.ru_stime - self.usage_start.ru_stime
+ utime_delta, stime_delta = self._get_cputime()
+ self._resource_usage.ru_utime += utime_delta
+ self._resource_usage.ru_stime += stime_delta
self.usage_start = None
@@ -357,13 +356,44 @@ class LoggingContext(object):
# can include resource usage so far.
is_main_thread = threading.current_thread() is self.main_thread
if self.alive and self.usage_start and is_main_thread:
- current = get_thread_resource_usage()
- res.ru_utime += current.ru_utime - self.usage_start.ru_utime
- res.ru_stime += current.ru_stime - self.usage_start.ru_stime
+ utime_delta, stime_delta = self._get_cputime()
+ res.ru_utime += utime_delta
+ res.ru_stime += stime_delta
return res
+ def _get_cputime(self):
+ """Get the cpu usage time so far
+
+ Returns: Tuple[float, float]: seconds in user mode, seconds in system mode
+ """
+ current = get_thread_resource_usage()
+
+ utime_delta = current.ru_utime - self.usage_start.ru_utime
+ stime_delta = current.ru_stime - self.usage_start.ru_stime
+
+ # sanity check
+ if utime_delta < 0:
+ logger.error(
+ "utime went backwards! %f < %f",
+ current.ru_utime,
+ self.usage_start.ru_utime,
+ )
+ utime_delta = 0
+
+ if stime_delta < 0:
+ logger.error(
+ "stime went backwards! %f < %f",
+ current.ru_stime,
+ self.usage_start.ru_stime,
+ )
+ stime_delta = 0
+
+ return utime_delta, stime_delta
+
def add_database_transaction(self, duration_sec):
+ if duration_sec < 0:
+ raise ValueError("DB txn time can only be non-negative")
self._resource_usage.db_txn_count += 1
self._resource_usage.db_txn_duration_sec += duration_sec
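
A small standalone sketch (hypothetical function name; only the logic mirrors _get_cputime above)
of the clamping behaviour the new sanity check gives us: if the per-thread counters ever appear to
go backwards, the bogus delta is logged and clamped to zero rather than pushing a negative number
into the metrics.

    import logging

    logger = logging.getLogger(__name__)

    def cpu_deltas(current_utime, current_stime, start_utime, start_stime):
        utime_delta = current_utime - start_utime
        stime_delta = current_stime - start_stime
        if utime_delta < 0:
            logger.error("utime went backwards! %f < %f", current_utime, start_utime)
            utime_delta = 0
        if stime_delta < 0:
            logger.error("stime went backwards! %f < %f", current_stime, start_stime)
            stime_delta = 0
        return utime_delta, stime_delta

    # a bogus utime reading is clamped instead of producing negative CPU time
    assert cpu_deltas(1.0, 2.0, 1.5, 1.5) == (0, 0.5)
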
@@ -374,6 +404,8 @@ class LoggingContext(object):
sched_sec (float): number of seconds it took us to get a
connection
"""
+ if sched_sec < 0:
+ raise ValueError("DB scheduling time can only be non-negative")
self._resource_usage.db_sched_duration_sec += sched_sec
def record_event_fetch(self, event_count):
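
For illustration only (constructing a LoggingContext directly like this is not how Synapse uses it),
the effect of the two new guards is that a negative duration is rejected at the point it is
recorded, rather than silently corrupting the aggregated usage figures.

    from synapse.util.logcontext import LoggingContext

    ctx = LoggingContext("example")
    ctx.add_database_transaction(0.025)      # fine: a 25ms transaction

    try:
        ctx.record_db_schedule(-0.001)       # clock skew or a caller bug: now raises
    except ValueError as e:
        print("rejected:", e)
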