Diffstat (limited to 'synapse/metrics')
 -rw-r--r--  synapse/metrics/__init__.py                     260
 -rw-r--r--  synapse/metrics/background_process_metrics.py   179
 -rw-r--r--  synapse/metrics/metric.py                       195
 -rw-r--r--  synapse/metrics/process_collector.py            122
 -rw-r--r--  synapse/metrics/resource.py                      23
 5 files changed, 358 insertions(+), 421 deletions(-)
diff --git a/synapse/metrics/__init__.py b/synapse/metrics/__init__.py
index 2265e6e8d6..a9158fc066 100644
--- a/synapse/metrics/__init__.py
+++ b/synapse/metrics/__init__.py
@@ -13,118 +13,198 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import logging
 import functools
-import time
 import gc
+import logging
+import os
+import platform
+import time
 
-from twisted.internet import reactor
-
-from .metric import (
-    CounterMetric, CallbackMetric, DistributionMetric, CacheMetric,
-    MemoryUsageMetric,
-)
-from .process_collector import register_process_collector
+import attr
+from prometheus_client import Counter, Gauge, Histogram
+from prometheus_client.core import REGISTRY, GaugeMetricFamily
 
+from twisted.internet import reactor
 
 logger = logging.getLogger(__name__)
 
-
+running_on_pypy = platform.python_implementation() == "PyPy"
 all_metrics = []
 all_collectors = []
+all_gauges = {}
 
+HAVE_PROC_SELF_STAT = os.path.exists("/proc/self/stat")
 
-class Metrics(object):
-    """ A single Metrics object gives a (mutable) slice view of the all_metrics
-    dict, allowing callers to easily register new metrics that are namespaced
-    nicely."""
 
-    def __init__(self, name):
-        self.name_prefix = name
+class RegistryProxy(object):
 
-    def make_subspace(self, name):
-        return Metrics("%s_%s" % (self.name_prefix, name))
+    @staticmethod
+    def collect():
+        for metric in REGISTRY.collect():
+            if not metric.name.startswith("__"):
+                yield metric
 
-    def register_collector(self, func):
-        all_collectors.append(func)
 
-    def _register(self, metric_class, name, *args, **kwargs):
-        full_name = "%s_%s" % (self.name_prefix, name)
+@attr.s(hash=True)
+class LaterGauge(object):
 
-        metric = metric_class(full_name, *args, **kwargs)
+    name = attr.ib()
+    desc = attr.ib()
+    labels = attr.ib(hash=False)
+    caller = attr.ib()
 
-        all_metrics.append(metric)
-        return metric
+    def collect(self):
 
-    def register_counter(self, *args, **kwargs):
-        return self._register(CounterMetric, *args, **kwargs)
+        g = GaugeMetricFamily(self.name, self.desc, labels=self.labels)
 
-    def register_callback(self, *args, **kwargs):
-        return self._register(CallbackMetric, *args, **kwargs)
+        try:
+            calls = self.caller()
+        except Exception:
+            logger.exception(
+                "Exception running callback for LaterGauge(%s)",
+                self.name,
+            )
+            yield g
+            return
 
-    def register_distribution(self, *args, **kwargs):
-        return self._register(DistributionMetric, *args, **kwargs)
+        if isinstance(calls, dict):
+            for k, v in calls.items():
+                g.add_metric(k, v)
+        else:
+            g.add_metric([], calls)
 
-    def register_cache(self, *args, **kwargs):
-        return self._register(CacheMetric, *args, **kwargs)
+        yield g
 
+    def __attrs_post_init__(self):
+        self._register()
 
-def register_memory_metrics(hs):
-    try:
-        import psutil
-        process = psutil.Process()
-        process.memory_info().rss
-    except (ImportError, AttributeError):
-        logger.warn(
-            "psutil is not installed or incorrect version."
-            " Disabling memory metrics."
-        )
-        return
-    metric = MemoryUsageMetric(hs, psutil)
-    all_metrics.append(metric)
+    def _register(self):
+        if self.name in all_gauges.keys():
+        logger.warning("%s already registered, reregistering", self.name)
+            REGISTRY.unregister(all_gauges.pop(self.name))
 
+        REGISTRY.register(self)
+        all_gauges[self.name] = self
 
-def get_metrics_for(pkg_name):
-    """ Returns a Metrics instance for conveniently creating metrics
-    namespaced with the given name prefix. """
 
-    # Convert a "package.name" to "package_name" because Prometheus doesn't
-    # let us use . in metric names
-    return Metrics(pkg_name.replace(".", "_"))
+#
+# Detailed CPU metrics
+#
 
+class CPUMetrics(object):
 
-def render_all():
-    strs = []
+    def __init__(self):
+        ticks_per_sec = 100
+        try:
+            # Try and get the system config
+            ticks_per_sec = os.sysconf('SC_CLK_TCK')
+        except (ValueError, TypeError, AttributeError):
+            pass
 
-    for collector in all_collectors:
-        collector()
+        self.ticks_per_sec = ticks_per_sec
 
-    for metric in all_metrics:
-        try:
-            strs += metric.render()
-        except Exception:
-            strs += ["# FAILED to render"]
-            logger.exception("Failed to render metric")
+    def collect(self):
+        if not HAVE_PROC_SELF_STAT:
+            return
 
-    strs.append("")  # to generate a final CRLF
+        with open("/proc/self/stat") as s:
+            line = s.read()
+            raw_stats = line.split(") ", 1)[1].split(" ")
 
-    return "\n".join(strs)
+            user = GaugeMetricFamily("process_cpu_user_seconds_total", "")
+            user.add_metric([], float(raw_stats[11]) / self.ticks_per_sec)
+            yield user
 
+            sys = GaugeMetricFamily("process_cpu_system_seconds_total", "")
+            sys.add_metric([], float(raw_stats[12]) / self.ticks_per_sec)
+            yield sys
 
-register_process_collector(get_metrics_for("process"))
 
+REGISTRY.register(CPUMetrics())
 
-python_metrics = get_metrics_for("python")
+#
+# Python GC metrics
+#
 
-gc_time = python_metrics.register_distribution("gc_time", labels=["gen"])
-gc_unreachable = python_metrics.register_counter("gc_unreachable_total", labels=["gen"])
-python_metrics.register_callback(
-    "gc_counts", lambda: {(i,): v for i, v in enumerate(gc.get_count())}, labels=["gen"]
+gc_unreachable = Gauge("python_gc_unreachable_total", "Unreachable GC objects", ["gen"])
+gc_time = Histogram(
+    "python_gc_time",
+    "Time taken to GC (sec)",
+    ["gen"],
+    buckets=[0.0025, 0.005, 0.01, 0.025, 0.05, 0.10, 0.25, 0.50, 1.00, 2.50,
+             5.00, 7.50, 15.00, 30.00, 45.00, 60.00],
 )
 
-reactor_metrics = get_metrics_for("python.twisted.reactor")
-tick_time = reactor_metrics.register_distribution("tick_time")
-pending_calls_metric = reactor_metrics.register_distribution("pending_calls")
+
+class GCCounts(object):
+
+    def collect(self):
+        cm = GaugeMetricFamily("python_gc_counts", "GC object counts", labels=["gen"])
+        for n, m in enumerate(gc.get_count()):
+            cm.add_metric([str(n)], m)
+
+        yield cm
+
+
+if not running_on_pypy:
+    REGISTRY.register(GCCounts())
+
+#
+# Twisted reactor metrics
+#
+
+tick_time = Histogram(
+    "python_twisted_reactor_tick_time",
+    "Tick time of the Twisted reactor (sec)",
+    buckets=[0.001, 0.002, 0.005, 0.01, 0.025, 0.05, 0.1, 0.2, 0.5, 1, 2, 5],
+)
+pending_calls_metric = Histogram(
+    "python_twisted_reactor_pending_calls",
+    "Pending calls",
+    buckets=[1, 2, 5, 10, 25, 50, 100, 250, 500, 1000],
+)
+
+#
+# Federation Metrics
+#
+
+sent_edus_counter = Counter("synapse_federation_client_sent_edus", "")
+
+sent_transactions_counter = Counter("synapse_federation_client_sent_transactions", "")
+
+events_processed_counter = Counter("synapse_federation_client_events_processed", "")
+
+# Used to track where various components have processed in the event stream,
+# e.g. federation sending, appservice sending, etc.
+event_processing_positions = Gauge("synapse_event_processing_positions", "", ["name"])
+
+# Used to track the current max events stream position
+event_persisted_position = Gauge("synapse_event_persisted_position", "")
+
+# Used to track the received_ts of the last event processed by various
+# components
+event_processing_last_ts = Gauge("synapse_event_processing_last_ts", "", ["name"])
+
+# Used to track the lag processing events. This is the time difference
+# between the last processed event's received_ts and the time it was
+# finished being processed.
+event_processing_lag = Gauge("synapse_event_processing_lag", "", ["name"])
+
+last_ticked = time.time()
+
+
+class ReactorLastSeenMetric(object):
+
+    def collect(self):
+        cm = GaugeMetricFamily(
+            "python_twisted_reactor_last_seen",
+            "Seconds since the Twisted reactor was last seen",
+        )
+        cm.add_metric([], time.time() - last_ticked)
+        yield cm
+
+
+REGISTRY.register(ReactorLastSeenMetric())
 
 
 def runUntilCurrentTimer(func):
@@ -146,12 +226,25 @@ def runUntilCurrentTimer(func):
             num_pending += 1
 
         num_pending += len(reactor.threadCallQueue)
-
-        start = time.time() * 1000
+        start = time.time()
         ret = func(*args, **kwargs)
-        end = time.time() * 1000
-        tick_time.inc_by(end - start)
-        pending_calls_metric.inc_by(num_pending)
+        end = time.time()
+
+        # record the amount of wallclock time spent running pending calls.
+        # This is a proxy for the actual amount of time between reactor polls,
+        # since about 25% of time is actually spent running things triggered by
+        # I/O events, but that is harder to capture without rewriting half the
+        # reactor.
+        tick_time.observe(end - start)
+        pending_calls_metric.observe(num_pending)
+
+        # Update the time we last ticked, for the metric to test whether
+        # Synapse's reactor has frozen
+        global last_ticked
+        last_ticked = end
+
+        if running_on_pypy:
+            return ret
 
         # Check if we need to do a manual GC (since its been disabled), and do
         # one if necessary.
@@ -161,12 +254,12 @@ def runUntilCurrentTimer(func):
             if threshold[i] < counts[i]:
                 logger.info("Collecting gc %d", i)
 
-                start = time.time() * 1000
+                start = time.time()
                 unreachable = gc.collect(i)
-                end = time.time() * 1000
+                end = time.time()
 
-                gc_time.inc_by(end - start, i)
-                gc_unreachable.inc_by(unreachable, i)
+                gc_time.labels(i).observe(end - start)
+                gc_unreachable.labels(i).set(unreachable)
 
         return ret
 
@@ -185,6 +278,7 @@ try:
 
     # We manually run the GC each reactor tick so that we can get some metrics
     # about time spent doing GC,
-    gc.disable()
+    if not running_on_pypy:
+        gc.disable()
 except AttributeError:
     pass
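
For illustration, a minimal sketch of how the new LaterGauge is used. The
callback is invoked at scrape time and may return either a scalar or a dict
mapping label-value tuples to values; the metric name and state here are
hypothetical:

    from synapse.metrics import LaterGauge

    # hypothetical in-memory state we want to expose, keyed by label values
    pending_by_destination = {("example.com",): 3, ("matrix.org",): 7}

    LaterGauge(
        "synapse_example_pending_items",  # metric name
        "Pending items by destination",   # help text
        ["destination"],                  # label names
        lambda: pending_by_destination,   # called on every collection
    )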
diff --git a/synapse/metrics/background_process_metrics.py b/synapse/metrics/background_process_metrics.py
new file mode 100644
index 0000000000..9d820e44a6
--- /dev/null
+++ b/synapse/metrics/background_process_metrics.py
@@ -0,0 +1,179 @@
+# -*- coding: utf-8 -*-
+# Copyright 2018 New Vector Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import six
+
+from prometheus_client.core import REGISTRY, Counter, GaugeMetricFamily
+
+from twisted.internet import defer
+
+from synapse.util.logcontext import LoggingContext, PreserveLoggingContext
+
+_background_process_start_count = Counter(
+    "synapse_background_process_start_count",
+    "Number of background processes started",
+    ["name"],
+)
+
+# we set registry=None in all of these to stop them getting registered with
+# the default registry. Instead we collect them all via the _Collector below,
+# which ensures that we can update them before they are collected.
+#
+_background_process_ru_utime = Counter(
+    "synapse_background_process_ru_utime_seconds",
+    "User CPU time used by background processes, in seconds",
+    ["name"],
+    registry=None,
+)
+
+_background_process_ru_stime = Counter(
+    "synapse_background_process_ru_stime_seconds",
+    "System CPU time used by background processes, in seconds",
+    ["name"],
+    registry=None,
+)
+
+_background_process_db_txn_count = Counter(
+    "synapse_background_process_db_txn_count",
+    "Number of database transactions done by background processes",
+    ["name"],
+    registry=None,
+)
+
+_background_process_db_txn_duration = Counter(
+    "synapse_background_process_db_txn_duration_seconds",
+    ("Seconds spent by background processes waiting for database "
+     "transactions, excluding scheduling time"),
+    ["name"],
+    registry=None,
+)
+
+_background_process_db_sched_duration = Counter(
+    "synapse_background_process_db_sched_duration_seconds",
+    "Seconds spent by background processes waiting for database connections",
+    ["name"],
+    registry=None,
+)
+
+# map from description to a counter, so that we can name our logcontexts
+# incrementally. (It actually duplicates _background_process_start_count, but
+# it's much simpler to do so than to try to combine them.)
+_background_process_counts = dict()  # type: dict[str, int]
+
+# map from description to the currently running background processes.
+#
+# it's kept as a dict of sets rather than a big set so that we can keep track
+# of process descriptions that no longer have any active processes.
+_background_processes = dict()  # type: dict[str, set[_BackgroundProcess]]
+
+
+class _Collector(object):
+    """A custom metrics collector for the background process metrics.
+
+    Ensures that all of the metrics are up-to-date with any in-flight processes
+    before they are returned.
+    """
+    def collect(self):
+        background_process_in_flight_count = GaugeMetricFamily(
+            "synapse_background_process_in_flight_count",
+            "Number of background processes in flight",
+            labels=["name"],
+        )
+
+        for desc, processes in six.iteritems(_background_processes):
+            background_process_in_flight_count.add_metric(
+                (desc,), len(processes),
+            )
+            for process in processes:
+                process.update_metrics()
+
+        yield background_process_in_flight_count
+
+        # now we need to run collect() over each of the static Counters, and
+        # yield each metric they return.
+        for m in (
+                _background_process_ru_utime,
+                _background_process_ru_stime,
+                _background_process_db_txn_count,
+                _background_process_db_txn_duration,
+                _background_process_db_sched_duration,
+        ):
+            for r in m.collect():
+                yield r
+
+
+REGISTRY.register(_Collector())
+
+
+class _BackgroundProcess(object):
+    def __init__(self, desc, ctx):
+        self.desc = desc
+        self._context = ctx
+        self._reported_stats = None
+
+    def update_metrics(self):
+        """Updates the metrics with values from this process."""
+        new_stats = self._context.get_resource_usage()
+        if self._reported_stats is None:
+            diff = new_stats
+        else:
+            diff = new_stats - self._reported_stats
+        self._reported_stats = new_stats
+
+        _background_process_ru_utime.labels(self.desc).inc(diff.ru_utime)
+        _background_process_ru_stime.labels(self.desc).inc(diff.ru_stime)
+        _background_process_db_txn_count.labels(self.desc).inc(
+            diff.db_txn_count,
+        )
+        _background_process_db_txn_duration.labels(self.desc).inc(
+            diff.db_txn_duration_sec,
+        )
+        _background_process_db_sched_duration.labels(self.desc).inc(
+            diff.db_sched_duration_sec,
+        )
+
+
+def run_as_background_process(desc, func, *args, **kwargs):
+    """Run the given function in its own logcontext, with resource metrics
+
+    This should be used to wrap processes which are fired off to run in the
+    background, instead of being associated with a particular request.
+
+    Args:
+        desc (str): a description for this background process type
+        func: a function, which may return a Deferred
+        args: positional args for func
+        kwargs: keyword args for func
+
+    Returns: None
+    """
+    @defer.inlineCallbacks
+    def run():
+        count = _background_process_counts.get(desc, 0)
+        _background_process_counts[desc] = count + 1
+        _background_process_start_count.labels(desc).inc()
+
+        with LoggingContext(desc) as context:
+            context.request = "%s-%i" % (desc, count)
+            proc = _BackgroundProcess(desc, context)
+            _background_processes.setdefault(desc, set()).add(proc)
+            try:
+                yield func(*args, **kwargs)
+            finally:
+                proc.update_metrics()
+                _background_processes[desc].remove(proc)
+
+    with PreserveLoggingContext():
+        run()
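
A sketch of how callers are expected to use the new helper; the store and the
cleanup function are hypothetical:

    from twisted.internet import defer

    from synapse.metrics.background_process_metrics import run_as_background_process

    @defer.inlineCallbacks
    def clean_old_entries(store):
        yield store.delete_old_entries()

    def start_cleanup(store):
        # fire-and-forget: CPU and DB usage will be reported under the
        # "cleanup" label on the synapse_background_process_* metrics
        run_as_background_process("cleanup", clean_old_entries, store)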
diff --git a/synapse/metrics/metric.py b/synapse/metrics/metric.py
deleted file mode 100644
index e87b2b80a7..0000000000
--- a/synapse/metrics/metric.py
+++ /dev/null
@@ -1,195 +0,0 @@
-# -*- coding: utf-8 -*-
-# Copyright 2015, 2016 OpenMarket Ltd
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-
-from itertools import chain
-
-
-# TODO(paul): I can't believe Python doesn't have one of these
-def map_concat(func, items):
-    # flatten a list-of-lists
-    return list(chain.from_iterable(map(func, items)))
-
-
-class BaseMetric(object):
-
-    def __init__(self, name, labels=[]):
-        self.name = name
-        self.labels = labels  # OK not to clone as we never write it
-
-    def dimension(self):
-        return len(self.labels)
-
-    def is_scalar(self):
-        return not len(self.labels)
-
-    def _render_labelvalue(self, value):
-        # TODO: some kind of value escape
-        return '"%s"' % (value)
-
-    def _render_key(self, values):
-        if self.is_scalar():
-            return ""
-        return "{%s}" % (
-            ",".join(["%s=%s" % (k, self._render_labelvalue(v))
-                      for k, v in zip(self.labels, values)])
-        )
-
-
-class CounterMetric(BaseMetric):
-    """The simplest kind of metric; one that stores a monotonically-increasing
-    integer that counts events."""
-
-    def __init__(self, *args, **kwargs):
-        super(CounterMetric, self).__init__(*args, **kwargs)
-
-        self.counts = {}
-
-        # Scalar metrics are never empty
-        if self.is_scalar():
-            self.counts[()] = 0
-
-    def inc_by(self, incr, *values):
-        if len(values) != self.dimension():
-            raise ValueError(
-                "Expected as many values to inc() as labels (%d)" % (self.dimension())
-            )
-
-        # TODO: should assert that the tag values are all strings
-
-        if values not in self.counts:
-            self.counts[values] = incr
-        else:
-            self.counts[values] += incr
-
-    def inc(self, *values):
-        self.inc_by(1, *values)
-
-    def render_item(self, k):
-        return ["%s%s %d" % (self.name, self._render_key(k), self.counts[k])]
-
-    def render(self):
-        return map_concat(self.render_item, sorted(self.counts.keys()))
-
-
-class CallbackMetric(BaseMetric):
-    """A metric that returns the numeric value returned by a callback whenever
-    it is rendered. Typically this is used to implement gauges that yield the
-    size or other state of some in-memory object by actively querying it."""
-
-    def __init__(self, name, callback, labels=[]):
-        super(CallbackMetric, self).__init__(name, labels=labels)
-
-        self.callback = callback
-
-    def render(self):
-        value = self.callback()
-
-        if self.is_scalar():
-            return ["%s %.12g" % (self.name, value)]
-
-        return ["%s%s %.12g" % (self.name, self._render_key(k), value[k])
-                for k in sorted(value.keys())]
-
-
-class DistributionMetric(object):
-    """A combination of an event counter and an accumulator, which counts
-    both the number of events and accumulates the total value. Typically this
-    could be used to keep track of method-running times, or other distributions
-    of values that occur in discrete occurances.
-
-    TODO(paul): Try to export some heatmap-style stats?
-    """
-
-    def __init__(self, name, *args, **kwargs):
-        self.counts = CounterMetric(name + ":count", **kwargs)
-        self.totals = CounterMetric(name + ":total", **kwargs)
-
-    def inc_by(self, inc, *values):
-        self.counts.inc(*values)
-        self.totals.inc_by(inc, *values)
-
-    def render(self):
-        return self.counts.render() + self.totals.render()
-
-
-class CacheMetric(object):
-    __slots__ = ("name", "cache_name", "hits", "misses", "size_callback")
-
-    def __init__(self, name, size_callback, cache_name):
-        self.name = name
-        self.cache_name = cache_name
-
-        self.hits = 0
-        self.misses = 0
-
-        self.size_callback = size_callback
-
-    def inc_hits(self):
-        self.hits += 1
-
-    def inc_misses(self):
-        self.misses += 1
-
-    def render(self):
-        size = self.size_callback()
-        hits = self.hits
-        total = self.misses + self.hits
-
-        return [
-            """%s:hits{name="%s"} %d""" % (self.name, self.cache_name, hits),
-            """%s:total{name="%s"} %d""" % (self.name, self.cache_name, total),
-            """%s:size{name="%s"} %d""" % (self.name, self.cache_name, size),
-        ]
-
-
-class MemoryUsageMetric(object):
-    """Keeps track of the current memory usage, using psutil.
-
-    The class will keep the current min/max/sum/counts of rss over the last
-    WINDOW_SIZE_SEC, by polling UPDATE_HZ times per second
-    """
-
-    UPDATE_HZ = 2  # number of times to get memory per second
-    WINDOW_SIZE_SEC = 30  # the size of the window in seconds
-
-    def __init__(self, hs, psutil):
-        clock = hs.get_clock()
-        self.memory_snapshots = []
-
-        self.process = psutil.Process()
-
-        clock.looping_call(self._update_curr_values, 1000 / self.UPDATE_HZ)
-
-    def _update_curr_values(self):
-        max_size = self.UPDATE_HZ * self.WINDOW_SIZE_SEC
-        self.memory_snapshots.append(self.process.memory_info().rss)
-        self.memory_snapshots[:] = self.memory_snapshots[-max_size:]
-
-    def render(self):
-        if not self.memory_snapshots:
-            return []
-
-        max_rss = max(self.memory_snapshots)
-        min_rss = min(self.memory_snapshots)
-        sum_rss = sum(self.memory_snapshots)
-        len_rss = len(self.memory_snapshots)
-
-        return [
-            "process_psutil_rss:max %d" % max_rss,
-            "process_psutil_rss:min %d" % min_rss,
-            "process_psutil_rss:total %d" % sum_rss,
-            "process_psutil_rss:count %d" % len_rss,
-        ]
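
For reference, a sketch of how call sites migrate off the deleted classes to
their prometheus_client equivalents (the metric name below is hypothetical):

    from prometheus_client import Counter

    # before: requests = metrics.register_counter("requests", labels=["method"])
    #         requests.inc("GET")
    requests = Counter("synapse_example_requests", "Requests by method", ["method"])
    requests.labels("GET").inc()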
diff --git a/synapse/metrics/process_collector.py b/synapse/metrics/process_collector.py
deleted file mode 100644
index 6fec3de399..0000000000
--- a/synapse/metrics/process_collector.py
+++ /dev/null
@@ -1,122 +0,0 @@
-# -*- coding: utf-8 -*-
-# Copyright 2015, 2016 OpenMarket Ltd
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import os
-
-
-TICKS_PER_SEC = 100
-BYTES_PER_PAGE = 4096
-
-HAVE_PROC_STAT = os.path.exists("/proc/stat")
-HAVE_PROC_SELF_STAT = os.path.exists("/proc/self/stat")
-HAVE_PROC_SELF_LIMITS = os.path.exists("/proc/self/limits")
-HAVE_PROC_SELF_FD = os.path.exists("/proc/self/fd")
-
-# Field indexes from /proc/self/stat, taken from the proc(5) manpage
-STAT_FIELDS = {
-    "utime": 14,
-    "stime": 15,
-    "starttime": 22,
-    "vsize": 23,
-    "rss": 24,
-}
-
-
-stats = {}
-
-# In order to report process_start_time_seconds we need to know the
-# machine's boot time, because the value in /proc/self/stat is relative to
-# this
-boot_time = None
-if HAVE_PROC_STAT:
-    with open("/proc/stat") as _procstat:
-        for line in _procstat:
-            if line.startswith("btime "):
-                boot_time = int(line.split()[1])
-
-
-def update_resource_metrics():
-    if HAVE_PROC_SELF_STAT:
-        global stats
-        with open("/proc/self/stat") as s:
-            line = s.read()
-            # line is PID (command) more stats go here ...
-            raw_stats = line.split(") ", 1)[1].split(" ")
-
-            for (name, index) in STAT_FIELDS.iteritems():
-                # subtract 3 from the index, because proc(5) is 1-based, and
-                # we've lost the first two fields in PID and COMMAND above
-                stats[name] = int(raw_stats[index - 3])
-
-
-def _count_fds():
-    # Not every OS will have a /proc/self/fd directory
-    if not HAVE_PROC_SELF_FD:
-        return 0
-
-    return len(os.listdir("/proc/self/fd"))
-
-
-def register_process_collector(process_metrics):
-    process_metrics.register_collector(update_resource_metrics)
-
-    if HAVE_PROC_SELF_STAT:
-        process_metrics.register_callback(
-            "cpu_user_seconds_total",
-            lambda: float(stats["utime"]) / TICKS_PER_SEC
-        )
-        process_metrics.register_callback(
-            "cpu_system_seconds_total",
-            lambda: float(stats["stime"]) / TICKS_PER_SEC
-        )
-        process_metrics.register_callback(
-            "cpu_seconds_total",
-            lambda: (float(stats["utime"] + stats["stime"])) / TICKS_PER_SEC
-        )
-
-        process_metrics.register_callback(
-            "virtual_memory_bytes",
-            lambda: int(stats["vsize"])
-        )
-        process_metrics.register_callback(
-            "resident_memory_bytes",
-            lambda: int(stats["rss"]) * BYTES_PER_PAGE
-        )
-
-        process_metrics.register_callback(
-            "start_time_seconds",
-            lambda: boot_time + int(stats["starttime"]) / TICKS_PER_SEC
-        )
-
-    if HAVE_PROC_SELF_FD:
-        process_metrics.register_callback(
-            "open_fds",
-            lambda: _count_fds()
-        )
-
-    if HAVE_PROC_SELF_LIMITS:
-        def _get_max_fds():
-            with open("/proc/self/limits") as limits:
-                for line in limits:
-                    if not line.startswith("Max open files "):
-                        continue
-                    # Line is  Max open files  $SOFT  $HARD
-                    return int(line.split()[3])
-            return None
-
-        process_metrics.register_callback(
-            "max_fds",
-            lambda: _get_max_fds()
-        )
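
Most of the per-process stats this deleted module computed by hand (RSS, open
fds, start time) are covered by prometheus_client's standard ProcessCollector,
which is registered on the default registry at import time; only the split
user/system CPU metrics needed the custom CPUMetrics collector above. A sketch,
assuming a recent prometheus_client:

    from prometheus_client.process_collector import ProcessCollector

    # registry=None avoids re-registering; one instance already exists on
    # the default REGISTRY
    for metric in ProcessCollector(registry=None).collect():
        print(metric.name)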
diff --git a/synapse/metrics/resource.py b/synapse/metrics/resource.py
index 870f400600..9789359077 100644
--- a/synapse/metrics/resource.py
+++ b/synapse/metrics/resource.py
@@ -13,27 +13,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from twisted.web.resource import Resource
-
-import synapse.metrics
-
+from prometheus_client.twisted import MetricsResource
 
 METRICS_PREFIX = "/_synapse/metrics"
 
-
-class MetricsResource(Resource):
-    isLeaf = True
-
-    def __init__(self, hs):
-        Resource.__init__(self)  # Resource is old-style, so no super()
-
-        self.hs = hs
-
-    def render_GET(self, request):
-        response = synapse.metrics.render_all()
-
-        request.setHeader("Content-Type", "text/plain")
-        request.setHeader("Content-Length", str(len(response)))
-
-        # Encode as UTF-8 (default)
-        return response.encode()
+__all__ = ["MetricsResource", "METRICS_PREFIX"]
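
With the custom resource gone, a sketch of serving metrics through the
re-exported prometheus_client MetricsResource (the port is arbitrary; synapse
itself mounts the resource at METRICS_PREFIX):

    from twisted.internet import reactor
    from twisted.web.resource import Resource
    from twisted.web.server import Site

    from synapse.metrics import RegistryProxy
    from synapse.metrics.resource import MetricsResource

    root = Resource()
    # RegistryProxy filters out metrics whose names start with "__"
    root.putChild(b"metrics", MetricsResource(registry=RegistryProxy))
    reactor.listenTCP(9000, Site(root))
    reactor.run()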