diff options
author | Neil Johnson <neil@matrix.org> | 2018-06-06 12:27:33 +0100 |
---|---|---|
committer | Neil Johnson <neil@matrix.org> | 2018-06-06 12:27:33 +0100 |
commit | 752b7b32ed1c33651c0c64fbbb4289c3b62ac89b (patch) | |
tree | e120f6fffa36a2d4f6eae37f555be078940b7854 /synapse/metrics/__init__.py | |
parent | Merge pull request #3290 from rubo77/patch-7 (diff) | |
parent | 7 char sha in changelog (diff) | |
download | synapse-752b7b32ed1c33651c0c64fbbb4289c3b62ac89b.tar.xz |
Merge tag 'v0.31.0'
Changes in synapse v0.31.0 (2018-06-06) ====================================== Most notable change from v0.30.0 is to switch to python prometheus library to improve system stats reporting. WARNING this changes a number of prometheus metrics in a backwards-incompatible manner. For more details, see `docs/metrics-howto.rst <docs/metrics-howto.rst#removal-of-deprecated-metrics--time-based-counters-becoming-histograms-in-0310>`_. Bug Fixes: * Fix metric documentation tables (PR #3341) * Fix LaterGuage error handling (694968f) * Fix replication metrics (b7e7fd2) Changes in synapse v0.31.0-rc1 (2018-06-04) ========================================== Features: * Switch to the Python Prometheus library (PR #3256, #3274) * Let users leave the server notice room after joining (PR #3287) Changes: * daily user type phone home stats (PR #3264) * Use iter* methods for _filter_events_for_server (PR #3267) * Docs on consent bits (PR #3268) * Remove users from user directory on deactivate (PR #3277) * Avoid sending consent notice to guest users (PR #3288) * disable CPUMetrics if no /proc/self/stat (PR #3299) * Add local and loopback IPv6 addresses to url_preview_ip_range_blacklist (PR #3312) Thanks to @thegcat! * Consistently use six's iteritems and wrap lazy keys/values in list() if they're not meant to be lazy (PR #3307) * Add private IPv6 addresses to example config for url preview blacklist (PR #3317) Thanks to @thegcat! * Reduce stuck read-receipts: ignore depth when updating (PR #3318) * Put python's logs into Trial when running unit tests (PR #3319) Changes, python 3 migration: * Replace some more comparisons with six (PR #3243) Thanks to @NotAFile! * replace some iteritems with six (PR #3244) Thanks to @NotAFile! * Add batch_iter to utils (PR #3245) Thanks to @NotAFile! * use repr, not str (PR #3246) Thanks to @NotAFile! * Misc Python3 fixes (PR #3247) Thanks to @NotAFile! * Py3 storage/_base.py (PR #3278) Thanks to @NotAFile! * more six iteritems (PR #3279) Thanks to @NotAFile! * More Misc. py3 fixes (PR #3280) Thanks to @NotAFile! * remaining isintance fixes (PR #3281) Thanks to @NotAFile! * py3-ize state.py (PR #3283) Thanks to @NotAFile! * extend tox testing for py3 to avoid regressions (PR #3302) Thanks to @krombel! * use memoryview in py3 (PR #3303) Thanks to @NotAFile! Bugs: * Fix federation backfill bugs (PR #3261) * federation: fix LaterGauge usage (PR #3328) Thanks to @intelfx!
Diffstat (limited to 'synapse/metrics/__init__.py')
-rw-r--r-- | synapse/metrics/__init__.py | 241 |
1 files changed, 127 insertions, 114 deletions
diff --git a/synapse/metrics/__init__.py b/synapse/metrics/__init__.py index e3b831db67..429e79c472 100644 --- a/synapse/metrics/__init__.py +++ b/synapse/metrics/__init__.py @@ -17,165 +17,178 @@ import logging import functools import time import gc +import os import platform +import attr -from twisted.internet import reactor +from prometheus_client import Gauge, Histogram, Counter +from prometheus_client.core import GaugeMetricFamily, REGISTRY -from .metric import ( - CounterMetric, CallbackMetric, DistributionMetric, CacheMetric, - MemoryUsageMetric, GaugeMetric, -) -from .process_collector import register_process_collector +from twisted.internet import reactor logger = logging.getLogger(__name__) - -running_on_pypy = platform.python_implementation() == 'PyPy' +running_on_pypy = platform.python_implementation() == "PyPy" all_metrics = [] all_collectors = [] +all_gauges = {} + +HAVE_PROC_SELF_STAT = os.path.exists("/proc/self/stat") + + +class RegistryProxy(object): + + @staticmethod + def collect(): + for metric in REGISTRY.collect(): + if not metric.name.startswith("__"): + yield metric -class Metrics(object): - """ A single Metrics object gives a (mutable) slice view of the all_metrics - dict, allowing callers to easily register new metrics that are namespaced - nicely.""" +@attr.s(hash=True) +class LaterGauge(object): - def __init__(self, name): - self.name_prefix = name + name = attr.ib() + desc = attr.ib() + labels = attr.ib(hash=False) + caller = attr.ib() - def make_subspace(self, name): - return Metrics("%s_%s" % (self.name_prefix, name)) + def collect(self): - def register_collector(self, func): - all_collectors.append(func) + g = GaugeMetricFamily(self.name, self.desc, labels=self.labels) - def _register(self, metric_class, name, *args, **kwargs): - full_name = "%s_%s" % (self.name_prefix, name) + try: + calls = self.caller() + except Exception: + logger.exception( + "Exception running callback for LaterGuage(%s)", + self.name, + ) + yield g + return - metric = metric_class(full_name, *args, **kwargs) + if isinstance(calls, dict): + for k, v in calls.items(): + g.add_metric(k, v) + else: + g.add_metric([], calls) - all_metrics.append(metric) - return metric + yield g - def register_counter(self, *args, **kwargs): - """ - Returns: - CounterMetric - """ - return self._register(CounterMetric, *args, **kwargs) + def __attrs_post_init__(self): + self._register() - def register_gauge(self, *args, **kwargs): - """ - Returns: - GaugeMetric - """ - return self._register(GaugeMetric, *args, **kwargs) + def _register(self): + if self.name in all_gauges.keys(): + logger.warning("%s already registered, reregistering" % (self.name,)) + REGISTRY.unregister(all_gauges.pop(self.name)) - def register_callback(self, *args, **kwargs): - """ - Returns: - CallbackMetric - """ - return self._register(CallbackMetric, *args, **kwargs) + REGISTRY.register(self) + all_gauges[self.name] = self - def register_distribution(self, *args, **kwargs): - """ - Returns: - DistributionMetric - """ - return self._register(DistributionMetric, *args, **kwargs) - def register_cache(self, *args, **kwargs): - """ - Returns: - CacheMetric - """ - return self._register(CacheMetric, *args, **kwargs) +# +# Detailed CPU metrics +# + +class CPUMetrics(object): + def __init__(self): + ticks_per_sec = 100 + try: + # Try and get the system config + ticks_per_sec = os.sysconf('SC_CLK_TCK') + except (ValueError, TypeError, AttributeError): + pass -def register_memory_metrics(hs): - try: - import psutil - process = psutil.Process() - process.memory_info().rss - except (ImportError, AttributeError): - logger.warn( - "psutil is not installed or incorrect version." - " Disabling memory metrics." - ) - return - metric = MemoryUsageMetric(hs, psutil) - all_metrics.append(metric) + self.ticks_per_sec = ticks_per_sec + def collect(self): + if not HAVE_PROC_SELF_STAT: + return -def get_metrics_for(pkg_name): - """ Returns a Metrics instance for conveniently creating metrics - namespaced with the given name prefix. """ + with open("/proc/self/stat") as s: + line = s.read() + raw_stats = line.split(") ", 1)[1].split(" ") - # Convert a "package.name" to "package_name" because Prometheus doesn't - # let us use . in metric names - return Metrics(pkg_name.replace(".", "_")) + user = GaugeMetricFamily("process_cpu_user_seconds_total", "") + user.add_metric([], float(raw_stats[11]) / self.ticks_per_sec) + yield user + sys = GaugeMetricFamily("process_cpu_system_seconds_total", "") + sys.add_metric([], float(raw_stats[12]) / self.ticks_per_sec) + yield sys -def render_all(): - strs = [] - for collector in all_collectors: - collector() +REGISTRY.register(CPUMetrics()) - for metric in all_metrics: - try: - strs += metric.render() - except Exception: - strs += ["# FAILED to render"] - logger.exception("Failed to render metric") +# +# Python GC metrics +# + +gc_unreachable = Gauge("python_gc_unreachable_total", "Unreachable GC objects", ["gen"]) +gc_time = Histogram( + "python_gc_time", + "Time taken to GC (sec)", + ["gen"], + buckets=[0.0025, 0.005, 0.01, 0.025, 0.05, 0.10, 0.25, 0.50, 1.00, 2.50, + 5.00, 7.50, 15.00, 30.00, 45.00, 60.00], +) - strs.append("") # to generate a final CRLF - return "\n".join(strs) +class GCCounts(object): + def collect(self): + cm = GaugeMetricFamily("python_gc_counts", "GC cycle counts", labels=["gen"]) + for n, m in enumerate(gc.get_count()): + cm.add_metric([str(n)], m) -register_process_collector(get_metrics_for("process")) + yield cm -python_metrics = get_metrics_for("python") +REGISTRY.register(GCCounts()) -gc_time = python_metrics.register_distribution("gc_time", labels=["gen"]) -gc_unreachable = python_metrics.register_counter("gc_unreachable_total", labels=["gen"]) -python_metrics.register_callback( - "gc_counts", lambda: {(i,): v for i, v in enumerate(gc.get_count())}, labels=["gen"] +# +# Twisted reactor metrics +# + +tick_time = Histogram( + "python_twisted_reactor_tick_time", + "Tick time of the Twisted reactor (sec)", + buckets=[0.001, 0.002, 0.005, 0.01, 0.025, 0.05, 0.1, 0.2, 0.5, 1, 2, 5], +) +pending_calls_metric = Histogram( + "python_twisted_reactor_pending_calls", + "Pending calls", + buckets=[1, 2, 5, 10, 25, 50, 100, 250, 500, 1000], ) -reactor_metrics = get_metrics_for("python.twisted.reactor") -tick_time = reactor_metrics.register_distribution("tick_time") -pending_calls_metric = reactor_metrics.register_distribution("pending_calls") +# +# Federation Metrics +# + +sent_edus_counter = Counter("synapse_federation_client_sent_edus", "") + +sent_transactions_counter = Counter("synapse_federation_client_sent_transactions", "") -synapse_metrics = get_metrics_for("synapse") +events_processed_counter = Counter("synapse_federation_client_events_processed", "") # Used to track where various components have processed in the event stream, # e.g. federation sending, appservice sending, etc. -event_processing_positions = synapse_metrics.register_gauge( - "event_processing_positions", labels=["name"], -) +event_processing_positions = Gauge("synapse_event_processing_positions", "", ["name"]) # Used to track the current max events stream position -event_persisted_position = synapse_metrics.register_gauge( - "event_persisted_position", -) +event_persisted_position = Gauge("synapse_event_persisted_position", "") # Used to track the received_ts of the last event processed by various # components -event_processing_last_ts = synapse_metrics.register_gauge( - "event_processing_last_ts", labels=["name"], -) +event_processing_last_ts = Gauge("synapse_event_processing_last_ts", "", ["name"]) # Used to track the lag processing events. This is the time difference # between the last processed event's received_ts and the time it was # finished being processed. -event_processing_lag = synapse_metrics.register_gauge( - "event_processing_lag", labels=["name"], -) +event_processing_lag = Gauge("synapse_event_processing_lag", "", ["name"]) def runUntilCurrentTimer(func): @@ -197,17 +210,17 @@ def runUntilCurrentTimer(func): num_pending += 1 num_pending += len(reactor.threadCallQueue) - start = time.time() * 1000 + start = time.time() ret = func(*args, **kwargs) - end = time.time() * 1000 + end = time.time() # record the amount of wallclock time spent running pending calls. # This is a proxy for the actual amount of time between reactor polls, # since about 25% of time is actually spent running things triggered by # I/O events, but that is harder to capture without rewriting half the # reactor. - tick_time.inc_by(end - start) - pending_calls_metric.inc_by(num_pending) + tick_time.observe(end - start) + pending_calls_metric.observe(num_pending) if running_on_pypy: return ret @@ -220,12 +233,12 @@ def runUntilCurrentTimer(func): if threshold[i] < counts[i]: logger.info("Collecting gc %d", i) - start = time.time() * 1000 + start = time.time() unreachable = gc.collect(i) - end = time.time() * 1000 + end = time.time() - gc_time.inc_by(end - start, i) - gc_unreachable.inc_by(unreachable, i) + gc_time.labels(i).observe(end - start) + gc_unreachable.labels(i).set(unreachable) return ret |