diff options
Diffstat (limited to 'synapse/metrics')
-rw-r--r-- | synapse/metrics/__init__.py | 62 | ||||
-rw-r--r-- | synapse/metrics/metric.py | 84 |
2 files changed, 113 insertions, 33 deletions
diff --git a/synapse/metrics/__init__.py b/synapse/metrics/__init__.py index 5664d5a381..76d5998d75 100644 --- a/synapse/metrics/__init__.py +++ b/synapse/metrics/__init__.py @@ -22,22 +22,20 @@ import functools import os import stat import time +import gc from twisted.internet import reactor from .metric import ( - CounterMetric, CallbackMetric, DistributionMetric, CacheMetric + CounterMetric, CallbackMetric, DistributionMetric, CacheMetric, + MemoryUsageMetric, ) logger = logging.getLogger(__name__) -# We'll keep all the available metrics in a single toplevel dict, one shared -# for the entire process. We don't currently support per-HomeServer instances -# of metrics, because in practice any one python VM will host only one -# HomeServer anyway. This makes a lot of implementation neater -all_metrics = {} +all_metrics = [] class Metrics(object): @@ -53,7 +51,7 @@ class Metrics(object): metric = metric_class(full_name, *args, **kwargs) - all_metrics[full_name] = metric + all_metrics.append(metric) return metric def register_counter(self, *args, **kwargs): @@ -69,6 +67,21 @@ class Metrics(object): return self._register(CacheMetric, *args, **kwargs) +def register_memory_metrics(hs): + try: + import psutil + process = psutil.Process() + process.memory_info().rss + except (ImportError, AttributeError): + logger.warn( + "psutil is not installed or incorrect version." + " Disabling memory metrics." + ) + return + metric = MemoryUsageMetric(hs, psutil) + all_metrics.append(metric) + + def get_metrics_for(pkg_name): """ Returns a Metrics instance for conveniently creating metrics namespaced with the given name prefix. """ @@ -84,12 +97,12 @@ def render_all(): # TODO(paul): Internal hack update_resource_metrics() - for name in sorted(all_metrics.keys()): + for metric in all_metrics: try: - strs += all_metrics[name].render() + strs += metric.render() except Exception: - strs += ["# FAILED to render %s" % name] - logger.exception("Failed to render %s metric", name) + strs += ["# FAILED to render"] + logger.exception("Failed to render metric") strs.append("") # to generate a final CRLF @@ -156,6 +169,13 @@ reactor_metrics = get_metrics_for("reactor") tick_time = reactor_metrics.register_distribution("tick_time") pending_calls_metric = reactor_metrics.register_distribution("pending_calls") +gc_time = reactor_metrics.register_distribution("gc_time", labels=["gen"]) +gc_unreachable = reactor_metrics.register_counter("gc_unreachable", labels=["gen"]) + +reactor_metrics.register_callback( + "gc_counts", lambda: {(i,): v for i, v in enumerate(gc.get_count())}, labels=["gen"] +) + def runUntilCurrentTimer(func): @@ -182,6 +202,22 @@ def runUntilCurrentTimer(func): end = time.time() * 1000 tick_time.inc_by(end - start) pending_calls_metric.inc_by(num_pending) + + # Check if we need to do a manual GC (since its been disabled), and do + # one if necessary. + threshold = gc.get_threshold() + counts = gc.get_count() + for i in (2, 1, 0): + if threshold[i] < counts[i]: + logger.info("Collecting gc %d", i) + + start = time.time() * 1000 + unreachable = gc.collect(i) + end = time.time() * 1000 + + gc_time.inc_by(end - start, i) + gc_unreachable.inc_by(unreachable, i) + return ret return f @@ -196,5 +232,9 @@ try: # runUntilCurrent is called when we have pending calls. It is called once # per iteratation after fd polling. reactor.runUntilCurrent = runUntilCurrentTimer(reactor.runUntilCurrent) + + # We manually run the GC each reactor tick so that we can get some metrics + # about time spent doing GC, + gc.disable() except AttributeError: pass diff --git a/synapse/metrics/metric.py b/synapse/metrics/metric.py index 368fc24984..e81af29895 100644 --- a/synapse/metrics/metric.py +++ b/synapse/metrics/metric.py @@ -47,9 +47,6 @@ class BaseMetric(object): for k, v in zip(self.labels, values)]) ) - def render(self): - return map_concat(self.render_item, sorted(self.counts.keys())) - class CounterMetric(BaseMetric): """The simplest kind of metric; one that stores a monotonically-increasing @@ -83,6 +80,9 @@ class CounterMetric(BaseMetric): def render_item(self, k): return ["%s%s %d" % (self.name, self._render_key(k), self.counts[k])] + def render(self): + return map_concat(self.render_item, sorted(self.counts.keys())) + class CallbackMetric(BaseMetric): """A metric that returns the numeric value returned by a callback whenever @@ -126,30 +126,70 @@ class DistributionMetric(object): class CacheMetric(object): - """A combination of two CounterMetrics, one to count cache hits and one to - count a total, and a callback metric to yield the current size. - - This metric generates standard metric name pairs, so that monitoring rules - can easily be applied to measure hit ratio.""" + __slots__ = ("name", "cache_name", "hits", "misses", "size_callback") - def __init__(self, name, size_callback, labels=[]): + def __init__(self, name, size_callback, cache_name): self.name = name + self.cache_name = cache_name - self.hits = CounterMetric(name + ":hits", labels=labels) - self.total = CounterMetric(name + ":total", labels=labels) + self.hits = 0 + self.misses = 0 - self.size = CallbackMetric( - name + ":size", - callback=size_callback, - labels=labels, - ) + self.size_callback = size_callback + + def inc_hits(self): + self.hits += 1 + + def inc_misses(self): + self.misses += 1 + + def render(self): + size = self.size_callback() + hits = self.hits + total = self.misses + self.hits + + return [ + """%s:hits{name="%s"} %d""" % (self.name, self.cache_name, hits), + """%s:total{name="%s"} %d""" % (self.name, self.cache_name, total), + """%s:size{name="%s"} %d""" % (self.name, self.cache_name, size), + ] + + +class MemoryUsageMetric(object): + """Keeps track of the current memory usage, using psutil. + + The class will keep the current min/max/sum/counts of rss over the last + WINDOW_SIZE_SEC, by polling UPDATE_HZ times per second + """ + + UPDATE_HZ = 2 # number of times to get memory per second + WINDOW_SIZE_SEC = 30 # the size of the window in seconds + + def __init__(self, hs, psutil): + clock = hs.get_clock() + self.memory_snapshots = [] + + self.process = psutil.Process() - def inc_hits(self, *values): - self.hits.inc(*values) - self.total.inc(*values) + clock.looping_call(self._update_curr_values, 1000 / self.UPDATE_HZ) - def inc_misses(self, *values): - self.total.inc(*values) + def _update_curr_values(self): + max_size = self.UPDATE_HZ * self.WINDOW_SIZE_SEC + self.memory_snapshots.append(self.process.memory_info().rss) + self.memory_snapshots[:] = self.memory_snapshots[-max_size:] def render(self): - return self.hits.render() + self.total.render() + self.size.render() + if not self.memory_snapshots: + return [] + + max_rss = max(self.memory_snapshots) + min_rss = min(self.memory_snapshots) + sum_rss = sum(self.memory_snapshots) + len_rss = len(self.memory_snapshots) + + return [ + "process_psutil_rss:max %d" % max_rss, + "process_psutil_rss:min %d" % min_rss, + "process_psutil_rss:total %d" % sum_rss, + "process_psutil_rss:count %d" % len_rss, + ] |