From b01a75549893bf4f8f986a74a499e7cb6131868c Mon Sep 17 00:00:00 2001 From: Erik Johnston Date: Mon, 20 Aug 2018 17:27:52 +0100 Subject: Make the in flight requests metrics thread safe --- synapse/http/request_metrics.py | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) (limited to 'synapse/http/request_metrics.py') diff --git a/synapse/http/request_metrics.py b/synapse/http/request_metrics.py index 588e280571..284d5e05de 100644 --- a/synapse/http/request_metrics.py +++ b/synapse/http/request_metrics.py @@ -15,6 +15,7 @@ # limitations under the License. import logging +import threading from prometheus_client.core import Counter, Histogram @@ -111,6 +112,9 @@ in_flight_requests_db_sched_duration = Counter( # The set of all in flight requests, set[RequestMetrics] _in_flight_requests = set() +# Protects the _in_flight_requests set from concurrent accesss +_in_flight_reqeuests_lock = threading.Lock() + def _get_in_flight_counts(): """Returns a count of all in flight requests by (method, server_name) @@ -120,7 +124,8 @@ def _get_in_flight_counts(): """ # Cast to a list to prevent it changing while the Prometheus # thread is collecting metrics - reqs = list(_in_flight_requests) + with _in_flight_reqeuests_lock: + reqs = list(_in_flight_requests) for rm in reqs: rm.update_metrics() @@ -154,10 +159,12 @@ class RequestMetrics(object): # to the "in flight" metrics. self._request_stats = self.start_context.get_resource_usage() - _in_flight_requests.add(self) + with _in_flight_reqeuests_lock: + _in_flight_requests.add(self) def stop(self, time_sec, request): - _in_flight_requests.discard(self) + with _in_flight_reqeuests_lock: + _in_flight_requests.discard(self) context = LoggingContext.current_context() -- cgit 1.5.1 From cd6937fb266b72080f73a7a41da761f7f8775d96 Mon Sep 17 00:00:00 2001 From: Erik Johnston Date: Tue, 21 Aug 2018 16:28:10 +0100 Subject: Fix typo --- synapse/http/request_metrics.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'synapse/http/request_metrics.py') diff --git a/synapse/http/request_metrics.py b/synapse/http/request_metrics.py index 284d5e05de..72c2654678 100644 --- a/synapse/http/request_metrics.py +++ b/synapse/http/request_metrics.py @@ -113,7 +113,7 @@ in_flight_requests_db_sched_duration = Counter( _in_flight_requests = set() # Protects the _in_flight_requests set from concurrent accesss -_in_flight_reqeuests_lock = threading.Lock() +_in_flight_requests_lock = threading.Lock() def _get_in_flight_counts(): @@ -124,7 +124,7 @@ def _get_in_flight_counts(): """ # Cast to a list to prevent it changing while the Prometheus # thread is collecting metrics - with _in_flight_reqeuests_lock: + with _in_flight_requests_lock: reqs = list(_in_flight_requests) for rm in reqs: @@ -159,11 +159,11 @@ class RequestMetrics(object): # to the "in flight" metrics. self._request_stats = self.start_context.get_resource_usage() - with _in_flight_reqeuests_lock: + with _in_flight_requests_lock: _in_flight_requests.add(self) def stop(self, time_sec, request): - with _in_flight_reqeuests_lock: + with _in_flight_requests_lock: _in_flight_requests.discard(self) context = LoggingContext.current_context() -- cgit 1.5.1 From ac80cb08fe247812b46c70843b2d5c9764cca619 Mon Sep 17 00:00:00 2001 From: Richard van der Hoff Date: Mon, 17 Sep 2018 17:16:50 +0100 Subject: Fix more b'abcd' noise in metrics --- synapse/http/request_metrics.py | 22 +++++++++++----------- synapse/http/site.py | 2 +- 2 files changed, 12 insertions(+), 12 deletions(-) (limited to 'synapse/http/request_metrics.py') diff --git a/synapse/http/request_metrics.py b/synapse/http/request_metrics.py index 72c2654678..fedb4e6b18 100644 --- a/synapse/http/request_metrics.py +++ b/synapse/http/request_metrics.py @@ -162,7 +162,7 @@ class RequestMetrics(object): with _in_flight_requests_lock: _in_flight_requests.add(self) - def stop(self, time_sec, request): + def stop(self, time_sec, response_code, sent_bytes): with _in_flight_requests_lock: _in_flight_requests.discard(self) @@ -179,35 +179,35 @@ class RequestMetrics(object): ) return - response_code = str(request.code) + response_code = str(response_code) - outgoing_responses_counter.labels(request.method, response_code).inc() + outgoing_responses_counter.labels(self.method, response_code).inc() - response_count.labels(request.method, self.name, tag).inc() + response_count.labels(self.method, self.name, tag).inc() - response_timer.labels(request.method, self.name, tag, response_code).observe( + response_timer.labels(self.method, self.name, tag, response_code).observe( time_sec - self.start ) resource_usage = context.get_resource_usage() - response_ru_utime.labels(request.method, self.name, tag).inc( + response_ru_utime.labels(self.method, self.name, tag).inc( resource_usage.ru_utime, ) - response_ru_stime.labels(request.method, self.name, tag).inc( + response_ru_stime.labels(self.method, self.name, tag).inc( resource_usage.ru_stime, ) - response_db_txn_count.labels(request.method, self.name, tag).inc( + response_db_txn_count.labels(self.method, self.name, tag).inc( resource_usage.db_txn_count ) - response_db_txn_duration.labels(request.method, self.name, tag).inc( + response_db_txn_duration.labels(self.method, self.name, tag).inc( resource_usage.db_txn_duration_sec ) - response_db_sched_duration.labels(request.method, self.name, tag).inc( + response_db_sched_duration.labels(self.method, self.name, tag).inc( resource_usage.db_sched_duration_sec ) - response_size.labels(request.method, self.name, tag).inc(request.sentLength) + response_size.labels(self.method, self.name, tag).inc(sent_bytes) # We always call this at the end to ensure that we update the metrics # regardless of whether a call to /metrics while the request was in diff --git a/synapse/http/site.py b/synapse/http/site.py index 2191de9836..9579e8cd0d 100644 --- a/synapse/http/site.py +++ b/synapse/http/site.py @@ -288,7 +288,7 @@ class SynapseRequest(Request): ) try: - self.request_metrics.stop(self.finish_time, self) + self.request_metrics.stop(self.finish_time, self.code, self.sentLength) except Exception as e: logger.warn("Failed to stop metrics: %r", e) -- cgit 1.5.1 From b69216f7680b92ccd8f92e618e1a81c9ba3c3346 Mon Sep 17 00:00:00 2001 From: Amber Brown Date: Fri, 19 Oct 2018 21:45:45 +1100 Subject: Make the metrics less racy (#4061) --- changelog.d/4061.bugfix | 1 + synapse/http/request_metrics.py | 31 ++++++++++++++++++------------- synapse/notifier.py | 6 +++--- 3 files changed, 22 insertions(+), 16 deletions(-) create mode 100644 changelog.d/4061.bugfix (limited to 'synapse/http/request_metrics.py') diff --git a/changelog.d/4061.bugfix b/changelog.d/4061.bugfix new file mode 100644 index 0000000000..94ffcf7a51 --- /dev/null +++ b/changelog.d/4061.bugfix @@ -0,0 +1 @@ +Fix some metrics being racy and causing exceptions when polled by Prometheus. diff --git a/synapse/http/request_metrics.py b/synapse/http/request_metrics.py index fedb4e6b18..62045a918b 100644 --- a/synapse/http/request_metrics.py +++ b/synapse/http/request_metrics.py @@ -39,7 +39,8 @@ outgoing_responses_counter = Counter( ) response_timer = Histogram( - "synapse_http_server_response_time_seconds", "sec", + "synapse_http_server_response_time_seconds", + "sec", ["method", "servlet", "tag", "code"], ) @@ -79,15 +80,11 @@ response_size = Counter( # than when the response was written. in_flight_requests_ru_utime = Counter( - "synapse_http_server_in_flight_requests_ru_utime_seconds", - "", - ["method", "servlet"], + "synapse_http_server_in_flight_requests_ru_utime_seconds", "", ["method", "servlet"] ) in_flight_requests_ru_stime = Counter( - "synapse_http_server_in_flight_requests_ru_stime_seconds", - "", - ["method", "servlet"], + "synapse_http_server_in_flight_requests_ru_stime_seconds", "", ["method", "servlet"] ) in_flight_requests_db_txn_count = Counter( @@ -134,7 +131,7 @@ def _get_in_flight_counts(): # type counts = {} for rm in reqs: - key = (rm.method, rm.name,) + key = (rm.method, rm.name) counts[key] = counts.get(key, 0) + 1 return counts @@ -175,7 +172,8 @@ class RequestMetrics(object): if context != self.start_context: logger.warn( "Context have unexpectedly changed %r, %r", - context, self.start_context + context, + self.start_context, ) return @@ -192,10 +190,10 @@ class RequestMetrics(object): resource_usage = context.get_resource_usage() response_ru_utime.labels(self.method, self.name, tag).inc( - resource_usage.ru_utime, + resource_usage.ru_utime ) response_ru_stime.labels(self.method, self.name, tag).inc( - resource_usage.ru_stime, + resource_usage.ru_stime ) response_db_txn_count.labels(self.method, self.name, tag).inc( resource_usage.db_txn_count @@ -222,8 +220,15 @@ class RequestMetrics(object): diff = new_stats - self._request_stats self._request_stats = new_stats - in_flight_requests_ru_utime.labels(self.method, self.name).inc(diff.ru_utime) - in_flight_requests_ru_stime.labels(self.method, self.name).inc(diff.ru_stime) + # max() is used since rapid use of ru_stime/ru_utime can end up with the + # count going backwards due to NTP, time smearing, fine-grained + # correction, or floating points. Who knows, really? + in_flight_requests_ru_utime.labels(self.method, self.name).inc( + max(diff.ru_utime, 0) + ) + in_flight_requests_ru_stime.labels(self.method, self.name).inc( + max(diff.ru_stime, 0) + ) in_flight_requests_db_txn_count.labels(self.method, self.name).inc( diff.db_txn_count diff --git a/synapse/notifier.py b/synapse/notifier.py index 340b16ce25..de02b1017e 100644 --- a/synapse/notifier.py +++ b/synapse/notifier.py @@ -186,9 +186,9 @@ class Notifier(object): def count_listeners(): all_user_streams = set() - for x in self.room_to_user_streams.values(): + for x in list(self.room_to_user_streams.values()): all_user_streams |= x - for x in self.user_to_user_stream.values(): + for x in list(self.user_to_user_stream.values()): all_user_streams.add(x) return sum(stream.count_listeners() for stream in all_user_streams) @@ -196,7 +196,7 @@ class Notifier(object): LaterGauge( "synapse_notifier_rooms", "", [], - lambda: count(bool, self.room_to_user_streams.values()), + lambda: count(bool, list(self.room_to_user_streams.values())), ) LaterGauge( "synapse_notifier_users", "", [], -- cgit 1.5.1