diff options
Diffstat (limited to 'synapse/metrics')
-rw-r--r-- | synapse/metrics/__init__.py | 39 | ||||
-rw-r--r-- | synapse/metrics/background_process_metrics.py | 179 |
2 files changed, 209 insertions, 9 deletions
diff --git a/synapse/metrics/__init__.py b/synapse/metrics/__init__.py index 429e79c472..a9158fc066 100644 --- a/synapse/metrics/__init__.py +++ b/synapse/metrics/__init__.py @@ -13,20 +13,19 @@ # See the License for the specific language governing permissions and # limitations under the License. -import logging import functools -import time import gc +import logging import os import platform -import attr +import time -from prometheus_client import Gauge, Histogram, Counter -from prometheus_client.core import GaugeMetricFamily, REGISTRY +import attr +from prometheus_client import Counter, Gauge, Histogram +from prometheus_client.core import REGISTRY, GaugeMetricFamily from twisted.internet import reactor - logger = logging.getLogger(__name__) running_on_pypy = platform.python_implementation() == "PyPy" @@ -62,7 +61,7 @@ class LaterGauge(object): calls = self.caller() except Exception: logger.exception( - "Exception running callback for LaterGuage(%s)", + "Exception running callback for LaterGauge(%s)", self.name, ) yield g @@ -140,14 +139,15 @@ gc_time = Histogram( class GCCounts(object): def collect(self): - cm = GaugeMetricFamily("python_gc_counts", "GC cycle counts", labels=["gen"]) + cm = GaugeMetricFamily("python_gc_counts", "GC object counts", labels=["gen"]) for n, m in enumerate(gc.get_count()): cm.add_metric([str(n)], m) yield cm -REGISTRY.register(GCCounts()) +if not running_on_pypy: + REGISTRY.register(GCCounts()) # # Twisted reactor metrics @@ -190,6 +190,22 @@ event_processing_last_ts = Gauge("synapse_event_processing_last_ts", "", ["name" # finished being processed. event_processing_lag = Gauge("synapse_event_processing_lag", "", ["name"]) +last_ticked = time.time() + + +class ReactorLastSeenMetric(object): + + def collect(self): + cm = GaugeMetricFamily( + "python_twisted_reactor_last_seen", + "Seconds since the Twisted reactor was last seen", + ) + cm.add_metric([], time.time() - last_ticked) + yield cm + + +REGISTRY.register(ReactorLastSeenMetric()) + def runUntilCurrentTimer(func): @@ -222,6 +238,11 @@ def runUntilCurrentTimer(func): tick_time.observe(end - start) pending_calls_metric.observe(num_pending) + # Update the time we last ticked, for the metric to test whether + # Synapse's reactor has frozen + global last_ticked + last_ticked = end + if running_on_pypy: return ret diff --git a/synapse/metrics/background_process_metrics.py b/synapse/metrics/background_process_metrics.py new file mode 100644 index 0000000000..9d820e44a6 --- /dev/null +++ b/synapse/metrics/background_process_metrics.py @@ -0,0 +1,179 @@ +# -*- coding: utf-8 -*- +# Copyright 2018 New Vector Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import six + +from prometheus_client.core import REGISTRY, Counter, GaugeMetricFamily + +from twisted.internet import defer + +from synapse.util.logcontext import LoggingContext, PreserveLoggingContext + +_background_process_start_count = Counter( + "synapse_background_process_start_count", + "Number of background processes started", + ["name"], +) + +# we set registry=None in all of these to stop them getting registered with +# the default registry. Instead we collect them all via the CustomCollector, +# which ensures that we can update them before they are collected. +# +_background_process_ru_utime = Counter( + "synapse_background_process_ru_utime_seconds", + "User CPU time used by background processes, in seconds", + ["name"], + registry=None, +) + +_background_process_ru_stime = Counter( + "synapse_background_process_ru_stime_seconds", + "System CPU time used by background processes, in seconds", + ["name"], + registry=None, +) + +_background_process_db_txn_count = Counter( + "synapse_background_process_db_txn_count", + "Number of database transactions done by background processes", + ["name"], + registry=None, +) + +_background_process_db_txn_duration = Counter( + "synapse_background_process_db_txn_duration_seconds", + ("Seconds spent by background processes waiting for database " + "transactions, excluding scheduling time"), + ["name"], + registry=None, +) + +_background_process_db_sched_duration = Counter( + "synapse_background_process_db_sched_duration_seconds", + "Seconds spent by background processes waiting for database connections", + ["name"], + registry=None, +) + +# map from description to a counter, so that we can name our logcontexts +# incrementally. (It actually duplicates _background_process_start_count, but +# it's much simpler to do so than to try to combine them.) +_background_process_counts = dict() # type: dict[str, int] + +# map from description to the currently running background processes. +# +# it's kept as a dict of sets rather than a big set so that we can keep track +# of process descriptions that no longer have any active processes. +_background_processes = dict() # type: dict[str, set[_BackgroundProcess]] + + +class _Collector(object): + """A custom metrics collector for the background process metrics. + + Ensures that all of the metrics are up-to-date with any in-flight processes + before they are returned. + """ + def collect(self): + background_process_in_flight_count = GaugeMetricFamily( + "synapse_background_process_in_flight_count", + "Number of background processes in flight", + labels=["name"], + ) + + for desc, processes in six.iteritems(_background_processes): + background_process_in_flight_count.add_metric( + (desc,), len(processes), + ) + for process in processes: + process.update_metrics() + + yield background_process_in_flight_count + + # now we need to run collect() over each of the static Counters, and + # yield each metric they return. + for m in ( + _background_process_ru_utime, + _background_process_ru_stime, + _background_process_db_txn_count, + _background_process_db_txn_duration, + _background_process_db_sched_duration, + ): + for r in m.collect(): + yield r + + +REGISTRY.register(_Collector()) + + +class _BackgroundProcess(object): + def __init__(self, desc, ctx): + self.desc = desc + self._context = ctx + self._reported_stats = None + + def update_metrics(self): + """Updates the metrics with values from this process.""" + new_stats = self._context.get_resource_usage() + if self._reported_stats is None: + diff = new_stats + else: + diff = new_stats - self._reported_stats + self._reported_stats = new_stats + + _background_process_ru_utime.labels(self.desc).inc(diff.ru_utime) + _background_process_ru_stime.labels(self.desc).inc(diff.ru_stime) + _background_process_db_txn_count.labels(self.desc).inc( + diff.db_txn_count, + ) + _background_process_db_txn_duration.labels(self.desc).inc( + diff.db_txn_duration_sec, + ) + _background_process_db_sched_duration.labels(self.desc).inc( + diff.db_sched_duration_sec, + ) + + +def run_as_background_process(desc, func, *args, **kwargs): + """Run the given function in its own logcontext, with resource metrics + + This should be used to wrap processes which are fired off to run in the + background, instead of being associated with a particular request. + + Args: + desc (str): a description for this background process type + func: a function, which may return a Deferred + args: positional args for func + kwargs: keyword args for func + + Returns: None + """ + @defer.inlineCallbacks + def run(): + count = _background_process_counts.get(desc, 0) + _background_process_counts[desc] = count + 1 + _background_process_start_count.labels(desc).inc() + + with LoggingContext(desc) as context: + context.request = "%s-%i" % (desc, count) + proc = _BackgroundProcess(desc, context) + _background_processes.setdefault(desc, set()).add(proc) + try: + yield func(*args, **kwargs) + finally: + proc.update_metrics() + _background_processes[desc].remove(proc) + + with PreserveLoggingContext(): + run() |