From fe641df770ae38a70ce5c17b3112cf57ec93f06d Mon Sep 17 00:00:00 2001 From: Richard van der Hoff Date: Wed, 19 Jun 2019 20:55:53 +0100 Subject: Sanity-checking for metrics updates Check that our clocks go forward. --- synapse/util/logcontext.py | 40 +++++++++++++++++++++++++++++++++------- 1 file changed, 33 insertions(+), 7 deletions(-) (limited to 'synapse/util/logcontext.py') diff --git a/synapse/util/logcontext.py b/synapse/util/logcontext.py index fe412355d8..645d71282e 100644 --- a/synapse/util/logcontext.py +++ b/synapse/util/logcontext.py @@ -325,10 +325,9 @@ class LoggingContext(object): ) return - usage_end = get_thread_resource_usage() - - self._resource_usage.ru_utime += usage_end.ru_utime - self.usage_start.ru_utime - self._resource_usage.ru_stime += usage_end.ru_stime - self.usage_start.ru_stime + utime_delta, stime_delta = self._get_cputime() + self._resource_usage.ru_utime += utime_delta + self._resource_usage.ru_stime += stime_delta self.usage_start = None @@ -346,13 +345,38 @@ class LoggingContext(object): # can include resource usage so far. is_main_thread = threading.current_thread() is self.main_thread if self.alive and self.usage_start and is_main_thread: - current = get_thread_resource_usage() - res.ru_utime += current.ru_utime - self.usage_start.ru_utime - res.ru_stime += current.ru_stime - self.usage_start.ru_stime + utime_delta, stime_delta = self._get_cputime() + res.ru_utime += utime_delta + res.ru_stime += stime_delta return res + def _get_cputime(self): + """Get the cpu usage time so far + + Returns: Tuple[float, float]: seconds in user mode, seconds in system mode + """ + current = get_thread_resource_usage() + + utime_delta = current.ru_utime - self.usage_start.ru_utime + stime_delta = current.ru_stime - self.usage_start.ru_stime + + # sanity check + if utime_delta < 0: + raise ValueError("utime went backwards! %f < %f" % ( + current.ru_utime, self.usage_start.ru_utime, + )) + + if stime_delta < 0: + raise ValueError("stime went backwards! %f < %f" % ( + current.ru_stime, self.usage_start.ru_stime, + )) + + return utime_delta, stime_delta + def add_database_transaction(self, duration_sec): + if duration_sec < 0: + raise ValueError('DB txn time can only be non-negative') self._resource_usage.db_txn_count += 1 self._resource_usage.db_txn_duration_sec += duration_sec @@ -363,6 +387,8 @@ class LoggingContext(object): sched_sec (float): number of seconds it took us to get a connection """ + if sched_sec < 0: + raise ValueError('DB scheduling time can only be non-negative') self._resource_usage.db_sched_duration_sec += sched_sec def record_event_fetch(self, event_count): -- cgit 1.4.1 From dc94773e600fef29cf88d304ffb1515b145aea13 Mon Sep 17 00:00:00 2001 From: Richard van der Hoff Date: Mon, 24 Jun 2019 10:01:16 +0100 Subject: Avoid raising exceptions in metrics Sentry will catch the errors if they happen, so that should be good enough, and woun't make things explode if we hit the error condition. --- synapse/util/logcontext.py | 22 ++++++++++++++-------- 1 file changed, 14 insertions(+), 8 deletions(-) (limited to 'synapse/util/logcontext.py') diff --git a/synapse/util/logcontext.py b/synapse/util/logcontext.py index 10022ff620..6b0d2deea0 100644 --- a/synapse/util/logcontext.py +++ b/synapse/util/logcontext.py @@ -374,20 +374,26 @@ class LoggingContext(object): # sanity check if utime_delta < 0: - raise ValueError("utime went backwards! %f < %f" % ( - current.ru_utime, self.usage_start.ru_utime, - )) + logger.error( + "utime went backwards! %f < %f", + current.ru_utime, + self.usage_start.ru_utime, + ) + utime_delta = 0 if stime_delta < 0: - raise ValueError("stime went backwards! %f < %f" % ( - current.ru_stime, self.usage_start.ru_stime, - )) + logger.error( + "stime went backwards! %f < %f", + current.ru_stime, + self.usage_start.ru_stime, + ) + stime_delta = 0 return utime_delta, stime_delta def add_database_transaction(self, duration_sec): if duration_sec < 0: - raise ValueError('DB txn time can only be non-negative') + raise ValueError("DB txn time can only be non-negative") self._resource_usage.db_txn_count += 1 self._resource_usage.db_txn_duration_sec += duration_sec @@ -399,7 +405,7 @@ class LoggingContext(object): connection """ if sched_sec < 0: - raise ValueError('DB scheduling time can only be non-negative') + raise ValueError("DB scheduling time can only be non-negative") self._resource_usage.db_sched_duration_sec += sched_sec def record_event_fetch(self, event_count): -- cgit 1.4.1