diff --git a/synapse/http/request_metrics.py b/synapse/http/request_metrics.py
index 588e280571..62045a918b 100644
--- a/synapse/http/request_metrics.py
+++ b/synapse/http/request_metrics.py
@@ -15,6 +15,7 @@
# limitations under the License.
import logging
+import threading
from prometheus_client.core import Counter, Histogram
@@ -38,7 +39,8 @@ outgoing_responses_counter = Counter(
)
response_timer = Histogram(
- "synapse_http_server_response_time_seconds", "sec",
+ "synapse_http_server_response_time_seconds",
+ "sec",
["method", "servlet", "tag", "code"],
)
@@ -78,15 +80,11 @@ response_size = Counter(
# than when the response was written.
in_flight_requests_ru_utime = Counter(
- "synapse_http_server_in_flight_requests_ru_utime_seconds",
- "",
- ["method", "servlet"],
+ "synapse_http_server_in_flight_requests_ru_utime_seconds", "", ["method", "servlet"]
)
in_flight_requests_ru_stime = Counter(
- "synapse_http_server_in_flight_requests_ru_stime_seconds",
- "",
- ["method", "servlet"],
+ "synapse_http_server_in_flight_requests_ru_stime_seconds", "", ["method", "servlet"]
)
in_flight_requests_db_txn_count = Counter(
@@ -111,6 +109,9 @@ in_flight_requests_db_sched_duration = Counter(
# The set of all in flight requests, set[RequestMetrics]
_in_flight_requests = set()
+# Protects the _in_flight_requests set from concurrent accesss
+_in_flight_requests_lock = threading.Lock()
+
def _get_in_flight_counts():
"""Returns a count of all in flight requests by (method, server_name)
@@ -120,7 +121,8 @@ def _get_in_flight_counts():
"""
# Cast to a list to prevent it changing while the Prometheus
# thread is collecting metrics
- reqs = list(_in_flight_requests)
+ with _in_flight_requests_lock:
+ reqs = list(_in_flight_requests)
for rm in reqs:
rm.update_metrics()
@@ -129,7 +131,7 @@ def _get_in_flight_counts():
# type
counts = {}
for rm in reqs:
- key = (rm.method, rm.name,)
+ key = (rm.method, rm.name)
counts[key] = counts.get(key, 0) + 1
return counts
@@ -154,10 +156,12 @@ class RequestMetrics(object):
# to the "in flight" metrics.
self._request_stats = self.start_context.get_resource_usage()
- _in_flight_requests.add(self)
+ with _in_flight_requests_lock:
+ _in_flight_requests.add(self)
- def stop(self, time_sec, request):
- _in_flight_requests.discard(self)
+ def stop(self, time_sec, response_code, sent_bytes):
+ with _in_flight_requests_lock:
+ _in_flight_requests.discard(self)
context = LoggingContext.current_context()
@@ -168,39 +172,40 @@ class RequestMetrics(object):
if context != self.start_context:
logger.warn(
"Context have unexpectedly changed %r, %r",
- context, self.start_context
+ context,
+ self.start_context,
)
return
- response_code = str(request.code)
+ response_code = str(response_code)
- outgoing_responses_counter.labels(request.method, response_code).inc()
+ outgoing_responses_counter.labels(self.method, response_code).inc()
- response_count.labels(request.method, self.name, tag).inc()
+ response_count.labels(self.method, self.name, tag).inc()
- response_timer.labels(request.method, self.name, tag, response_code).observe(
+ response_timer.labels(self.method, self.name, tag, response_code).observe(
time_sec - self.start
)
resource_usage = context.get_resource_usage()
- response_ru_utime.labels(request.method, self.name, tag).inc(
- resource_usage.ru_utime,
+ response_ru_utime.labels(self.method, self.name, tag).inc(
+ resource_usage.ru_utime
)
- response_ru_stime.labels(request.method, self.name, tag).inc(
- resource_usage.ru_stime,
+ response_ru_stime.labels(self.method, self.name, tag).inc(
+ resource_usage.ru_stime
)
- response_db_txn_count.labels(request.method, self.name, tag).inc(
+ response_db_txn_count.labels(self.method, self.name, tag).inc(
resource_usage.db_txn_count
)
- response_db_txn_duration.labels(request.method, self.name, tag).inc(
+ response_db_txn_duration.labels(self.method, self.name, tag).inc(
resource_usage.db_txn_duration_sec
)
- response_db_sched_duration.labels(request.method, self.name, tag).inc(
+ response_db_sched_duration.labels(self.method, self.name, tag).inc(
resource_usage.db_sched_duration_sec
)
- response_size.labels(request.method, self.name, tag).inc(request.sentLength)
+ response_size.labels(self.method, self.name, tag).inc(sent_bytes)
# We always call this at the end to ensure that we update the metrics
# regardless of whether a call to /metrics while the request was in
@@ -215,8 +220,15 @@ class RequestMetrics(object):
diff = new_stats - self._request_stats
self._request_stats = new_stats
- in_flight_requests_ru_utime.labels(self.method, self.name).inc(diff.ru_utime)
- in_flight_requests_ru_stime.labels(self.method, self.name).inc(diff.ru_stime)
+ # max() is used since rapid use of ru_stime/ru_utime can end up with the
+ # count going backwards due to NTP, time smearing, fine-grained
+ # correction, or floating points. Who knows, really?
+ in_flight_requests_ru_utime.labels(self.method, self.name).inc(
+ max(diff.ru_utime, 0)
+ )
+ in_flight_requests_ru_stime.labels(self.method, self.name).inc(
+ max(diff.ru_stime, 0)
+ )
in_flight_requests_db_txn_count.labels(self.method, self.name).inc(
diff.db_txn_count
|