From be4250c7a888e314e361df42042bfa344ab65d55 Mon Sep 17 00:00:00 2001
From: reivilibre <oliverw@matrix.org>
Date: Wed, 24 Aug 2022 11:35:54 +0000
Subject: Add experimental configuration option to allow disabling legacy
 Prometheus metric names. (#13540)

Co-authored-by: David Robertson <davidr@element.io>
---
 synapse/metrics/__init__.py           |   4 +-
 synapse/metrics/_exposition.py        | 262 -------------------------------
 synapse/metrics/_legacy_exposition.py | 284 ++++++++++++++++++++++++++++++++++
 3 files changed, 286 insertions(+), 264 deletions(-)
 delete mode 100644 synapse/metrics/_exposition.py
 create mode 100644 synapse/metrics/_legacy_exposition.py

(limited to 'synapse/metrics')

diff --git a/synapse/metrics/__init__.py b/synapse/metrics/__init__.py
index 496fce2ecc..c3d3daf877 100644
--- a/synapse/metrics/__init__.py
+++ b/synapse/metrics/__init__.py
@@ -46,12 +46,12 @@ from twisted.python.threadpool import ThreadPool
 
 # This module is imported for its side effects; flake8 needn't warn that it's unused.
 import synapse.metrics._reactor_metrics  # noqa: F401
-from synapse.metrics._exposition import (
+from synapse.metrics._gc import MIN_TIME_BETWEEN_GCS, install_gc_manager
+from synapse.metrics._legacy_exposition import (
     MetricsResource,
     generate_latest,
     start_http_server,
 )
-from synapse.metrics._gc import MIN_TIME_BETWEEN_GCS, install_gc_manager
 from synapse.metrics._types import Collector
 from synapse.util import SYNAPSE_VERSION
 
diff --git a/synapse/metrics/_exposition.py b/synapse/metrics/_exposition.py
deleted file mode 100644
index 353d0a63b6..0000000000
--- a/synapse/metrics/_exposition.py
+++ /dev/null
@@ -1,262 +0,0 @@
-# Copyright 2015-2019 Prometheus Python Client Developers
-# Copyright 2019 Matrix.org Foundation C.I.C.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-"""
-This code is based off `prometheus_client/exposition.py` from version 0.7.1.
-
-Due to the renaming of metrics in prometheus_client 0.4.0, this customised
-vendoring of the code will emit both the old versions that Synapse dashboards
-expect, and the newer "best practice" version of the up-to-date official client.
-"""
-
-import math
-import threading
-from http.server import BaseHTTPRequestHandler, HTTPServer
-from socketserver import ThreadingMixIn
-from typing import Any, Dict, List, Type, Union
-from urllib.parse import parse_qs, urlparse
-
-from prometheus_client import REGISTRY, CollectorRegistry
-from prometheus_client.core import Sample
-
-from twisted.web.resource import Resource
-from twisted.web.server import Request
-
-from synapse.util import caches
-
-CONTENT_TYPE_LATEST = "text/plain; version=0.0.4; charset=utf-8"
-
-
-def floatToGoString(d: Union[int, float]) -> str:
-    d = float(d)
-    if d == math.inf:
-        return "+Inf"
-    elif d == -math.inf:
-        return "-Inf"
-    elif math.isnan(d):
-        return "NaN"
-    else:
-        s = repr(d)
-        dot = s.find(".")
-        # Go switches to exponents sooner than Python.
-        # We only need to care about positive values for le/quantile.
-        if d > 0 and dot > 6:
-            mantissa = f"{s[0]}.{s[1:dot]}{s[dot + 1 :]}".rstrip("0.")
-            return f"{mantissa}e+0{dot - 1}"
-        return s
-
-
-def sample_line(line: Sample, name: str) -> str:
-    if line.labels:
-        labelstr = "{{{0}}}".format(
-            ",".join(
-                [
-                    '{}="{}"'.format(
-                        k,
-                        v.replace("\\", r"\\").replace("\n", r"\n").replace('"', r"\""),
-                    )
-                    for k, v in sorted(line.labels.items())
-                ]
-            )
-        )
-    else:
-        labelstr = ""
-    timestamp = ""
-    if line.timestamp is not None:
-        # Convert to milliseconds.
-        timestamp = f" {int(float(line.timestamp) * 1000):d}"
-    return "{}{} {}{}\n".format(name, labelstr, floatToGoString(line.value), timestamp)
-
-
-def generate_latest(registry: CollectorRegistry, emit_help: bool = False) -> bytes:
-
-    # Trigger the cache metrics to be rescraped, which updates the common
-    # metrics but do not produce metrics themselves
-    for collector in caches.collectors_by_name.values():
-        collector.collect()
-
-    output = []
-
-    for metric in registry.collect():
-        if not metric.samples:
-            # No samples, don't bother.
-            continue
-
-        mname = metric.name
-        mnewname = metric.name
-        mtype = metric.type
-
-        # OpenMetrics -> Prometheus
-        if mtype == "counter":
-            mnewname = mnewname + "_total"
-        elif mtype == "info":
-            mtype = "gauge"
-            mnewname = mnewname + "_info"
-        elif mtype == "stateset":
-            mtype = "gauge"
-        elif mtype == "gaugehistogram":
-            mtype = "histogram"
-        elif mtype == "unknown":
-            mtype = "untyped"
-
-        # Output in the old format for compatibility.
-        if emit_help:
-            output.append(
-                "# HELP {} {}\n".format(
-                    mname,
-                    metric.documentation.replace("\\", r"\\").replace("\n", r"\n"),
-                )
-            )
-        output.append(f"# TYPE {mname} {mtype}\n")
-
-        om_samples: Dict[str, List[str]] = {}
-        for s in metric.samples:
-            for suffix in ["_created", "_gsum", "_gcount"]:
-                if s.name == metric.name + suffix:
-                    # OpenMetrics specific sample, put in a gauge at the end.
-                    # (these come from gaugehistograms which don't get renamed,
-                    # so no need to faff with mnewname)
-                    om_samples.setdefault(suffix, []).append(sample_line(s, s.name))
-                    break
-            else:
-                newname = s.name.replace(mnewname, mname)
-                if ":" in newname and newname.endswith("_total"):
-                    newname = newname[: -len("_total")]
-                output.append(sample_line(s, newname))
-
-        for suffix, lines in sorted(om_samples.items()):
-            if emit_help:
-                output.append(
-                    "# HELP {}{} {}\n".format(
-                        metric.name,
-                        suffix,
-                        metric.documentation.replace("\\", r"\\").replace("\n", r"\n"),
-                    )
-                )
-            output.append(f"# TYPE {metric.name}{suffix} gauge\n")
-            output.extend(lines)
-
-        # Get rid of the weird colon things while we're at it
-        if mtype == "counter":
-            mnewname = mnewname.replace(":total", "")
-        mnewname = mnewname.replace(":", "_")
-
-        if mname == mnewname:
-            continue
-
-        # Also output in the new format, if it's different.
-        if emit_help:
-            output.append(
-                "# HELP {} {}\n".format(
-                    mnewname,
-                    metric.documentation.replace("\\", r"\\").replace("\n", r"\n"),
-                )
-            )
-        output.append(f"# TYPE {mnewname} {mtype}\n")
-
-        for s in metric.samples:
-            # Get rid of the OpenMetrics specific samples (we should already have
-            # dealt with them above anyway.)
-            for suffix in ["_created", "_gsum", "_gcount"]:
-                if s.name == metric.name + suffix:
-                    break
-            else:
-                output.append(
-                    sample_line(s, s.name.replace(":total", "").replace(":", "_"))
-                )
-
-    return "".join(output).encode("utf-8")
-
-
-class MetricsHandler(BaseHTTPRequestHandler):
-    """HTTP handler that gives metrics from ``REGISTRY``."""
-
-    registry = REGISTRY
-
-    def do_GET(self) -> None:
-        registry = self.registry
-        params = parse_qs(urlparse(self.path).query)
-
-        if "help" in params:
-            emit_help = True
-        else:
-            emit_help = False
-
-        try:
-            output = generate_latest(registry, emit_help=emit_help)
-        except Exception:
-            self.send_error(500, "error generating metric output")
-            raise
-        self.send_response(200)
-        self.send_header("Content-Type", CONTENT_TYPE_LATEST)
-        self.send_header("Content-Length", str(len(output)))
-        self.end_headers()
-        self.wfile.write(output)
-
-    def log_message(self, format: str, *args: Any) -> None:
-        """Log nothing."""
-
-    @classmethod
-    def factory(cls, registry: CollectorRegistry) -> Type:
-        """Returns a dynamic MetricsHandler class tied
-        to the passed registry.
-        """
-        # This implementation relies on MetricsHandler.registry
-        #  (defined above and defaulted to REGISTRY).
-
-        # As we have unicode_literals, we need to create a str()
-        #  object for type().
-        cls_name = str(cls.__name__)
-        MyMetricsHandler = type(cls_name, (cls, object), {"registry": registry})
-        return MyMetricsHandler
-
-
-class _ThreadingSimpleServer(ThreadingMixIn, HTTPServer):
-    """Thread per request HTTP server."""
-
-    # Make worker threads "fire and forget". Beginning with Python 3.7 this
-    # prevents a memory leak because ``ThreadingMixIn`` starts to gather all
-    # non-daemon threads in a list in order to join on them at server close.
-    # Enabling daemon threads virtually makes ``_ThreadingSimpleServer`` the
-    # same as Python 3.7's ``ThreadingHTTPServer``.
-    daemon_threads = True
-
-
-def start_http_server(
-    port: int, addr: str = "", registry: CollectorRegistry = REGISTRY
-) -> None:
-    """Starts an HTTP server for prometheus metrics as a daemon thread"""
-    CustomMetricsHandler = MetricsHandler.factory(registry)
-    httpd = _ThreadingSimpleServer((addr, port), CustomMetricsHandler)
-    t = threading.Thread(target=httpd.serve_forever)
-    t.daemon = True
-    t.start()
-
-
-class MetricsResource(Resource):
-    """
-    Twisted ``Resource`` that serves prometheus metrics.
-    """
-
-    isLeaf = True
-
-    def __init__(self, registry: CollectorRegistry = REGISTRY):
-        self.registry = registry
-
-    def render_GET(self, request: Request) -> bytes:
-        request.setHeader(b"Content-Type", CONTENT_TYPE_LATEST.encode("ascii"))
-        response = generate_latest(self.registry)
-        request.setHeader(b"Content-Length", str(len(response)))
-        return response
diff --git a/synapse/metrics/_legacy_exposition.py b/synapse/metrics/_legacy_exposition.py
new file mode 100644
index 0000000000..ff640a49af
--- /dev/null
+++ b/synapse/metrics/_legacy_exposition.py
@@ -0,0 +1,284 @@
+# Copyright 2015-2019 Prometheus Python Client Developers
+# Copyright 2019 Matrix.org Foundation C.I.C.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""
+This code is based off `prometheus_client/exposition.py` from version 0.7.1.
+
+Due to the renaming of metrics in prometheus_client 0.4.0, this customised
+vendoring of the code will emit both the old versions that Synapse dashboards
+expect, and the newer "best practice" version of the up-to-date official client.
+"""
+
+import math
+import threading
+from http.server import BaseHTTPRequestHandler, HTTPServer
+from socketserver import ThreadingMixIn
+from typing import Any, Dict, List, Type, Union
+from urllib.parse import parse_qs, urlparse
+
+from prometheus_client import REGISTRY, CollectorRegistry
+from prometheus_client.core import Sample
+
+from twisted.web.resource import Resource
+from twisted.web.server import Request
+
+from synapse.util import caches
+
+CONTENT_TYPE_LATEST = "text/plain; version=0.0.4; charset=utf-8"
+
+
+def floatToGoString(d: Union[int, float]) -> str:
+    d = float(d)
+    if d == math.inf:
+        return "+Inf"
+    elif d == -math.inf:
+        return "-Inf"
+    elif math.isnan(d):
+        return "NaN"
+    else:
+        s = repr(d)
+        dot = s.find(".")
+        # Go switches to exponents sooner than Python.
+        # We only need to care about positive values for le/quantile.
+        if d > 0 and dot > 6:
+            mantissa = f"{s[0]}.{s[1:dot]}{s[dot + 1 :]}".rstrip("0.")
+            return f"{mantissa}e+0{dot - 1}"
+        return s
+
+
+def sample_line(line: Sample, name: str) -> str:
+    if line.labels:
+        labelstr = "{{{0}}}".format(
+            ",".join(
+                [
+                    '{}="{}"'.format(
+                        k,
+                        v.replace("\\", r"\\").replace("\n", r"\n").replace('"', r"\""),
+                    )
+                    for k, v in sorted(line.labels.items())
+                ]
+            )
+        )
+    else:
+        labelstr = ""
+    timestamp = ""
+    if line.timestamp is not None:
+        # Convert to milliseconds.
+        timestamp = f" {int(float(line.timestamp) * 1000):d}"
+    return "{}{} {}{}\n".format(name, labelstr, floatToGoString(line.value), timestamp)
+
+
+# Mapping from new metric names to legacy metric names.
+# We translate these back to their old names when exposing them through our
+# legacy vendored exporter.
+# Only this legacy exposition module applies these name changes.
+LEGACY_METRIC_NAMES = {
+    "synapse_util_caches_cache_hits": "synapse_util_caches_cache:hits",
+    "synapse_util_caches_cache_size": "synapse_util_caches_cache:size",
+    "synapse_util_caches_cache_evicted_size": "synapse_util_caches_cache:evicted_size",
+    "synapse_util_caches_cache_total": "synapse_util_caches_cache:total",
+    "synapse_util_caches_response_cache_size": "synapse_util_caches_response_cache:size",
+    "synapse_util_caches_response_cache_hits": "synapse_util_caches_response_cache:hits",
+    "synapse_util_caches_response_cache_evicted_size": "synapse_util_caches_response_cache:evicted_size",
+    "synapse_util_caches_response_cache_total": "synapse_util_caches_response_cache:total",
+}
+
+
+def generate_latest(registry: CollectorRegistry, emit_help: bool = False) -> bytes:
+    """
+    Generate metrics in legacy format. Modern metrics are generated directly
+    by prometheus-client.
+    """
+
+    # Trigger the cache metrics to be rescraped, which updates the common
+    # metrics but do not produce metrics themselves
+    for collector in caches.collectors_by_name.values():
+        collector.collect()
+
+    output = []
+
+    for metric in registry.collect():
+        if not metric.samples:
+            # No samples, don't bother.
+            continue
+
+        # Translate to legacy metric name if it has one.
+        mname = LEGACY_METRIC_NAMES.get(metric.name, metric.name)
+        mnewname = metric.name
+        mtype = metric.type
+
+        # OpenMetrics -> Prometheus
+        if mtype == "counter":
+            mnewname = mnewname + "_total"
+        elif mtype == "info":
+            mtype = "gauge"
+            mnewname = mnewname + "_info"
+        elif mtype == "stateset":
+            mtype = "gauge"
+        elif mtype == "gaugehistogram":
+            mtype = "histogram"
+        elif mtype == "unknown":
+            mtype = "untyped"
+
+        # Output in the old format for compatibility.
+        if emit_help:
+            output.append(
+                "# HELP {} {}\n".format(
+                    mname,
+                    metric.documentation.replace("\\", r"\\").replace("\n", r"\n"),
+                )
+            )
+        output.append(f"# TYPE {mname} {mtype}\n")
+
+        om_samples: Dict[str, List[str]] = {}
+        for s in metric.samples:
+            for suffix in ["_created", "_gsum", "_gcount"]:
+                if s.name == mname + suffix:
+                    # OpenMetrics specific sample, put in a gauge at the end.
+                    # (these come from gaugehistograms which don't get renamed,
+                    # so no need to faff with mnewname)
+                    om_samples.setdefault(suffix, []).append(sample_line(s, s.name))
+                    break
+            else:
+                newname = s.name.replace(mnewname, mname)
+                if ":" in newname and newname.endswith("_total"):
+                    newname = newname[: -len("_total")]
+                output.append(sample_line(s, newname))
+
+        for suffix, lines in sorted(om_samples.items()):
+            if emit_help:
+                output.append(
+                    "# HELP {}{} {}\n".format(
+                        mname,
+                        suffix,
+                        metric.documentation.replace("\\", r"\\").replace("\n", r"\n"),
+                    )
+                )
+            output.append(f"# TYPE {mname}{suffix} gauge\n")
+            output.extend(lines)
+
+        # Get rid of the weird colon things while we're at it
+        if mtype == "counter":
+            mnewname = mnewname.replace(":total", "")
+        mnewname = mnewname.replace(":", "_")
+
+        if mname == mnewname:
+            continue
+
+        # Also output in the new format, if it's different.
+        if emit_help:
+            output.append(
+                "# HELP {} {}\n".format(
+                    mnewname,
+                    metric.documentation.replace("\\", r"\\").replace("\n", r"\n"),
+                )
+            )
+        output.append(f"# TYPE {mnewname} {mtype}\n")
+
+        for s in metric.samples:
+            # Get rid of the OpenMetrics specific samples (we should already have
+            # dealt with them above anyway.)
+            for suffix in ["_created", "_gsum", "_gcount"]:
+                if s.name == mname + suffix:
+                    break
+            else:
+                sample_name = LEGACY_METRIC_NAMES.get(s.name, s.name)
+                output.append(
+                    sample_line(s, sample_name.replace(":total", "").replace(":", "_"))
+                )
+
+    return "".join(output).encode("utf-8")
+
+
+class MetricsHandler(BaseHTTPRequestHandler):
+    """HTTP handler that gives metrics from ``REGISTRY``."""
+
+    registry = REGISTRY
+
+    def do_GET(self) -> None:
+        registry = self.registry
+        params = parse_qs(urlparse(self.path).query)
+
+        if "help" in params:
+            emit_help = True
+        else:
+            emit_help = False
+
+        try:
+            output = generate_latest(registry, emit_help=emit_help)
+        except Exception:
+            self.send_error(500, "error generating metric output")
+            raise
+        self.send_response(200)
+        self.send_header("Content-Type", CONTENT_TYPE_LATEST)
+        self.send_header("Content-Length", str(len(output)))
+        self.end_headers()
+        self.wfile.write(output)
+
+    def log_message(self, format: str, *args: Any) -> None:
+        """Log nothing."""
+
+    @classmethod
+    def factory(cls, registry: CollectorRegistry) -> Type:
+        """Returns a dynamic MetricsHandler class tied
+        to the passed registry.
+        """
+        # This implementation relies on MetricsHandler.registry
+        #  (defined above and defaulted to REGISTRY).
+
+        # As we have unicode_literals, we need to create a str()
+        #  object for type().
+        cls_name = str(cls.__name__)
+        MyMetricsHandler = type(cls_name, (cls, object), {"registry": registry})
+        return MyMetricsHandler
+
+
+class _ThreadingSimpleServer(ThreadingMixIn, HTTPServer):
+    """Thread per request HTTP server."""
+
+    # Make worker threads "fire and forget". Beginning with Python 3.7 this
+    # prevents a memory leak because ``ThreadingMixIn`` starts to gather all
+    # non-daemon threads in a list in order to join on them at server close.
+    # Enabling daemon threads virtually makes ``_ThreadingSimpleServer`` the
+    # same as Python 3.7's ``ThreadingHTTPServer``.
+    daemon_threads = True
+
+
+def start_http_server(
+    port: int, addr: str = "", registry: CollectorRegistry = REGISTRY
+) -> None:
+    """Starts an HTTP server for prometheus metrics as a daemon thread"""
+    CustomMetricsHandler = MetricsHandler.factory(registry)
+    httpd = _ThreadingSimpleServer((addr, port), CustomMetricsHandler)
+    t = threading.Thread(target=httpd.serve_forever)
+    t.daemon = True
+    t.start()
+
+
+class MetricsResource(Resource):
+    """
+    Twisted ``Resource`` that serves prometheus metrics.
+    """
+
+    isLeaf = True
+
+    def __init__(self, registry: CollectorRegistry = REGISTRY):
+        self.registry = registry
+
+    def render_GET(self, request: Request) -> bytes:
+        request.setHeader(b"Content-Type", CONTENT_TYPE_LATEST.encode("ascii"))
+        response = generate_latest(self.registry)
+        request.setHeader(b"Content-Length", str(len(response)))
+        return response
-- 
cgit 1.5.1


From 898fef2789c9b1a20ef53c7d588f536f51f0fe2f Mon Sep 17 00:00:00 2001
From: Brendan Abolivier <babolivier@matrix.org>
Date: Mon, 5 Sep 2022 12:26:43 +0200
Subject: Share some metrics between the Prometheus exporter and the phone home
 stats (#13671)

---
 changelog.d/13671.misc                  |  1 +
 synapse/app/_base.py                    |  5 ++-
 synapse/app/phone_stats_home.py         | 13 +++++-
 synapse/metrics/common_usage_metrics.py | 79 +++++++++++++++++++++++++++++++++
 synapse/server.py                       |  6 +++
 tests/test_phone_home.py                | 46 ++++++++++++++++++-
 6 files changed, 146 insertions(+), 4 deletions(-)
 create mode 100644 changelog.d/13671.misc
 create mode 100644 synapse/metrics/common_usage_metrics.py

(limited to 'synapse/metrics')

diff --git a/changelog.d/13671.misc b/changelog.d/13671.misc
new file mode 100644
index 0000000000..f1c62b5b1e
--- /dev/null
+++ b/changelog.d/13671.misc
@@ -0,0 +1 @@
+Introduce a `CommonUsageMetrics` class to share some usage metrics between the Prometheus exporter and the phone home stats.
diff --git a/synapse/app/_base.py b/synapse/app/_base.py
index 4742435d3b..9a24bed0a0 100644
--- a/synapse/app/_base.py
+++ b/synapse/app/_base.py
@@ -511,9 +511,10 @@ async def start(hs: "HomeServer") -> None:
     setup_sentry(hs)
     setup_sdnotify(hs)
 
-    # If background tasks are running on the main process, start collecting the
-    # phone home stats.
+    # If background tasks are running on the main process or this is the worker in
+    # charge of them, start collecting the phone home stats and shared usage metrics.
     if hs.config.worker.run_background_tasks:
+        await hs.get_common_usage_metrics_manager().setup()
         start_phone_stats_home(hs)
 
     # We now freeze all allocated objects in the hopes that (almost)
diff --git a/synapse/app/phone_stats_home.py b/synapse/app/phone_stats_home.py
index 40dbdace8e..51c8d15711 100644
--- a/synapse/app/phone_stats_home.py
+++ b/synapse/app/phone_stats_home.py
@@ -51,6 +51,16 @@ async def phone_stats_home(
     stats: JsonDict,
     stats_process: List[Tuple[int, "resource.struct_rusage"]] = _stats_process,
 ) -> None:
+    """Collect usage statistics and send them to the configured endpoint.
+
+    Args:
+        hs: the HomeServer object to use for gathering usage data.
+        stats: the dict in which to store the statistics sent to the configured
+            endpoint. Mostly used in tests to figure out the data that is supposed to
+            be sent.
+        stats_process: statistics about resource usage of the process.
+    """
+
     logger.info("Gathering stats for reporting")
     now = int(hs.get_clock().time())
     # Ensure the homeserver has started.
@@ -83,6 +93,7 @@ async def phone_stats_home(
     #
 
     store = hs.get_datastores().main
+    common_metrics = await hs.get_common_usage_metrics_manager().get_metrics()
 
     stats["homeserver"] = hs.config.server.server_name
     stats["server_context"] = hs.config.server.server_context
@@ -104,7 +115,7 @@ async def phone_stats_home(
     room_count = await store.get_room_count()
     stats["total_room_count"] = room_count
 
-    stats["daily_active_users"] = await store.count_daily_users()
+    stats["daily_active_users"] = common_metrics.daily_active_users
     stats["monthly_active_users"] = await store.count_monthly_users()
     daily_active_e2ee_rooms = await store.count_daily_active_e2ee_rooms()
     stats["daily_active_e2ee_rooms"] = daily_active_e2ee_rooms
diff --git a/synapse/metrics/common_usage_metrics.py b/synapse/metrics/common_usage_metrics.py
new file mode 100644
index 0000000000..0a22ea3d92
--- /dev/null
+++ b/synapse/metrics/common_usage_metrics.py
@@ -0,0 +1,79 @@
+# Copyright 2022 The Matrix.org Foundation C.I.C
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import TYPE_CHECKING
+
+import attr
+
+from synapse.metrics.background_process_metrics import run_as_background_process
+
+if TYPE_CHECKING:
+    from synapse.server import HomeServer
+
+from prometheus_client import Gauge
+
+# Gauge to expose daily active users metrics
+current_dau_gauge = Gauge(
+    "synapse_admin_daily_active_users",
+    "Current daily active users count",
+)
+
+
+@attr.s(auto_attribs=True)
+class CommonUsageMetrics:
+    """Usage metrics shared between the phone home stats and the prometheus exporter."""
+
+    daily_active_users: int
+
+
+class CommonUsageMetricsManager:
+    """Collects common usage metrics."""
+
+    def __init__(self, hs: "HomeServer") -> None:
+        self._store = hs.get_datastores().main
+        self._clock = hs.get_clock()
+
+    async def get_metrics(self) -> CommonUsageMetrics:
+        """Get the CommonUsageMetrics object. If no collection has happened yet, do it
+        before returning the metrics.
+
+        Returns:
+            The CommonUsageMetrics object to read common metrics from.
+        """
+        return await self._collect()
+
+    async def setup(self) -> None:
+        """Keep the gauges for common usage metrics up to date."""
+        await self._update_gauges()
+        self._clock.looping_call(
+            run_as_background_process,
+            5 * 60 * 1000,
+            desc="common_usage_metrics_update_gauges",
+            func=self._update_gauges,
+        )
+
+    async def _collect(self) -> CommonUsageMetrics:
+        """Collect the common metrics and either create the CommonUsageMetrics object to
+        use if it doesn't exist yet, or update it.
+        """
+        dau_count = await self._store.count_daily_users()
+
+        return CommonUsageMetrics(
+            daily_active_users=dau_count,
+        )
+
+    async def _update_gauges(self) -> None:
+        """Update the Prometheus gauges."""
+        metrics = await self._collect()
+
+        current_dau_gauge.set(float(metrics.daily_active_users))
diff --git a/synapse/server.py b/synapse/server.py
index c2e55bf0b1..5a99c0b344 100644
--- a/synapse/server.py
+++ b/synapse/server.py
@@ -105,6 +105,7 @@ from synapse.handlers.typing import FollowerTypingHandler, TypingWriterHandler
 from synapse.handlers.user_directory import UserDirectoryHandler
 from synapse.http.client import InsecureInterceptableContextFactory, SimpleHttpClient
 from synapse.http.matrixfederationclient import MatrixFederationHttpClient
+from synapse.metrics.common_usage_metrics import CommonUsageMetricsManager
 from synapse.module_api import ModuleApi
 from synapse.notifier import Notifier
 from synapse.push.bulk_push_rule_evaluator import BulkPushRuleEvaluator
@@ -829,3 +830,8 @@ class HomeServer(metaclass=abc.ABCMeta):
             self.config.ratelimiting.rc_message,
             self.config.ratelimiting.rc_admin_redaction,
         )
+
+    @cache_in_self
+    def get_common_usage_metrics_manager(self) -> CommonUsageMetricsManager:
+        """Usage metrics shared between phone home stats and the prometheus exporter."""
+        return CommonUsageMetricsManager(self)
diff --git a/tests/test_phone_home.py b/tests/test_phone_home.py
index b01cae6e5d..cc1a98f1c4 100644
--- a/tests/test_phone_home.py
+++ b/tests/test_phone_home.py
@@ -15,8 +15,14 @@
 import resource
 from unittest import mock
 
+from twisted.test.proto_helpers import MemoryReactor
+
 from synapse.app.phone_stats_home import phone_stats_home
+from synapse.rest import admin
+from synapse.rest.client import login, sync
+from synapse.server import HomeServer
 from synapse.types import JsonDict
+from synapse.util import Clock
 
 from tests.unittest import HomeserverTestCase
 
@@ -47,5 +53,43 @@ class PhoneHomeStatsTestCase(HomeserverTestCase):
         stats: JsonDict = {}
         self.reactor.advance(1)
         # `old_resource` has type `Mock` instead of `struct_rusage`
-        self.get_success(phone_stats_home(self.hs, stats, past_stats))  # type: ignore[arg-type]
+        self.get_success(
+            phone_stats_home(self.hs, stats, past_stats)  # type: ignore[arg-type]
+        )
         self.assertApproximates(stats["cpu_average"], 100, tolerance=2.5)
+
+
+class CommonMetricsTestCase(HomeserverTestCase):
+    servlets = [
+        admin.register_servlets,
+        login.register_servlets,
+        sync.register_servlets,
+    ]
+
+    def prepare(self, reactor: MemoryReactor, clock: Clock, hs: HomeServer) -> None:
+        self.metrics_manager = hs.get_common_usage_metrics_manager()
+        self.get_success(self.metrics_manager.setup())
+
+    def test_dau(self) -> None:
+        """Tests that the daily active users count is correctly updated."""
+        self._assert_metric_value("daily_active_users", 0)
+
+        self.register_user("user", "password")
+        tok = self.login("user", "password")
+        self.make_request("GET", "/sync", access_token=tok)
+
+        self.pump(1)
+
+        self._assert_metric_value("daily_active_users", 1)
+
+    def _assert_metric_value(self, metric_name: str, expected: int) -> None:
+        """Compare the given value to the current value of the common usage metric with
+        the given name.
+
+        Args:
+            metric_name: The metric to look up.
+            expected: Expected value for this metric.
+        """
+        metrics = self.get_success(self.metrics_manager.get_metrics())
+        value = getattr(metrics, metric_name)
+        self.assertEqual(value, expected)
-- 
cgit 1.5.1


From b455c2a5ec4874a6897c0448631d8ab8f36f9339 Mon Sep 17 00:00:00 2001
From: reivilibre <oliverw@matrix.org>
Date: Tue, 6 Sep 2022 11:21:21 +0000
Subject: Update Grafana dashboard to not use legacy metric names. (#13714)

---
 changelog.d/13714.misc                |   1 +
 contrib/grafana/synapse.json          | 138 +++++++++++++++++-----------------
 synapse/metrics/_legacy_exposition.py |   4 +-
 synapse/util/caches/__init__.py       |   4 +-
 4 files changed, 74 insertions(+), 73 deletions(-)
 create mode 100644 changelog.d/13714.misc

(limited to 'synapse/metrics')

diff --git a/changelog.d/13714.misc b/changelog.d/13714.misc
new file mode 100644
index 0000000000..07ace50b12
--- /dev/null
+++ b/changelog.d/13714.misc
@@ -0,0 +1 @@
+Add experimental configuration option to allow disabling legacy Prometheus metric names.
\ No newline at end of file
diff --git a/contrib/grafana/synapse.json b/contrib/grafana/synapse.json
index 248cd6d9ad..58061e2fce 100644
--- a/contrib/grafana/synapse.json
+++ b/contrib/grafana/synapse.json
@@ -335,7 +335,7 @@
           "datasource": {
             "uid": "$datasource"
           },
-          "expr": "sum(rate(synapse_storage_events_persisted_events{instance=\"$instance\"}[$bucket_size]))",
+          "expr": "sum(rate(synapse_storage_events_persisted_events_total{instance=\"$instance\"}[$bucket_size]))",
           "hide": false,
           "instant": false,
           "legendFormat": "Events",
@@ -1423,7 +1423,7 @@
               "datasource": {
                 "uid": "$datasource"
               },
-              "expr": "rate(synapse_background_process_ru_utime_seconds{instance=\"$instance\",job=~\"$job\",index=~\"$index\"}[$bucket_size])+rate(synapse_background_process_ru_stime_seconds{instance=\"$instance\",job=~\"$job\",index=~\"$index\"}[$bucket_size])",
+              "expr": "rate(synapse_background_process_ru_utime_seconds_total{instance=\"$instance\",job=~\"$job\",index=~\"$index\"}[$bucket_size])+rate(synapse_background_process_ru_stime_seconds_total{instance=\"$instance\",job=~\"$job\",index=~\"$index\"}[$bucket_size])",
               "format": "time_series",
               "hide": false,
               "instant": false,
@@ -1804,7 +1804,7 @@
               "datasource": {
                 "uid": "$datasource"
               },
-              "expr": "sum(rate(synapse_storage_events_persisted_events{instance=\"$instance\"}[$bucket_size])) without (job,index)",
+              "expr": "sum(rate(synapse_storage_events_persisted_events_total{instance=\"$instance\"}[$bucket_size])) without (job,index)",
               "format": "time_series",
               "interval": "",
               "intervalFactor": 2,
@@ -2437,7 +2437,7 @@
                 "uid": "$datasource"
               },
               "exemplar": false,
-              "expr": "sum(rate(synapse_state_res_db_for_biggest_room_seconds{instance=\"$instance\"}[1m]))",
+              "expr": "sum(rate(synapse_state_res_db_for_biggest_room_seconds_total{instance=\"$instance\"}[1m]))",
               "format": "time_series",
               "hide": false,
               "instant": false,
@@ -2451,7 +2451,7 @@
                 "uid": "$datasource"
               },
               "exemplar": false,
-              "expr": "sum(rate(synapse_state_res_cpu_for_biggest_room_seconds{instance=\"$instance\"}[1m]))",
+              "expr": "sum(rate(synapse_state_res_cpu_for_biggest_room_seconds_total{instance=\"$instance\"}[1m]))",
               "format": "time_series",
               "hide": false,
               "instant": false,
@@ -3425,7 +3425,7 @@
               "datasource": {
                 "uid": "$datasource"
               },
-              "expr": "rate(synapse_background_process_ru_utime_seconds{instance=\"$instance\",job=~\"$job\",index=~\"$index\"}[$bucket_size])+rate(synapse_background_process_ru_stime_seconds{instance=\"$instance\",job=~\"$job\",index=~\"$index\"}[$bucket_size])",
+              "expr": "rate(synapse_background_process_ru_utime_seconds_total{instance=\"$instance\",job=~\"$job\",index=~\"$index\"}[$bucket_size])+rate(synapse_background_process_ru_stime_seconds_total{instance=\"$instance\",job=~\"$job\",index=~\"$index\"}[$bucket_size])",
               "format": "time_series",
               "interval": "",
               "intervalFactor": 1,
@@ -3518,7 +3518,7 @@
               "datasource": {
                 "uid": "$datasource"
               },
-              "expr": "rate(synapse_background_process_db_txn_duration_seconds{instance=\"$instance\",job=~\"$job\",index=~\"$index\"}[$bucket_size]) +  rate(synapse_background_process_db_sched_duration_seconds{instance=\"$instance\",job=~\"$job\",index=~\"$index\"}[$bucket_size])",
+              "expr": "rate(synapse_background_process_db_txn_duration_seconds_total{instance=\"$instance\",job=~\"$job\",index=~\"$index\"}[$bucket_size]) +  rate(synapse_background_process_db_sched_duration_seconds_total{instance=\"$instance\",job=~\"$job\",index=~\"$index\"}[$bucket_size])",
               "format": "time_series",
               "hide": false,
               "intervalFactor": 1,
@@ -3726,7 +3726,7 @@
               "datasource": {
                 "uid": "$datasource"
               },
-              "expr": "sum(rate(synapse_federation_client_sent_transactions{instance=\"$instance\"}[$bucket_size]))",
+              "expr": "sum(rate(synapse_federation_client_sent_transactions_total{instance=\"$instance\"}[$bucket_size]))",
               "format": "time_series",
               "intervalFactor": 1,
               "legendFormat": "successful txn rate",
@@ -3736,7 +3736,7 @@
               "datasource": {
                 "uid": "$datasource"
               },
-              "expr": "sum(rate(synapse_util_metrics_block_count{block_name=\"_send_new_transaction\",instance=\"$instance\"}[$bucket_size]) - ignoring (block_name) rate(synapse_federation_client_sent_transactions{instance=\"$instance\"}[$bucket_size]))",
+              "expr": "sum(rate(synapse_util_metrics_block_count_total{block_name=\"_send_new_transaction\",instance=\"$instance\"}[$bucket_size]) - ignoring (block_name) rate(synapse_federation_client_sent_transactions_total{instance=\"$instance\"}[$bucket_size]))",
               "legendFormat": "failed txn rate",
               "refId": "B"
             }
@@ -3826,7 +3826,7 @@
               "datasource": {
                 "uid": "$datasource"
               },
-              "expr": "sum(rate(synapse_federation_server_received_pdus{instance=~\"$instance\"}[$bucket_size]))",
+              "expr": "sum(rate(synapse_federation_server_received_pdus_total{instance=~\"$instance\"}[$bucket_size]))",
               "format": "time_series",
               "intervalFactor": 1,
               "legendFormat": "pdus",
@@ -3836,7 +3836,7 @@
               "datasource": {
                 "uid": "$datasource"
               },
-              "expr": "sum(rate(synapse_federation_server_received_edus{instance=~\"$instance\"}[$bucket_size]))",
+              "expr": "sum(rate(synapse_federation_server_received_edus_total{instance=~\"$instance\"}[$bucket_size]))",
               "format": "time_series",
               "intervalFactor": 1,
               "legendFormat": "edus",
@@ -3928,7 +3928,7 @@
               "datasource": {
                 "uid": "$datasource"
               },
-              "expr": "sum(rate(synapse_federation_client_sent_pdu_destinations:total{instance=\"$instance\"}[$bucket_size]))",
+              "expr": "sum(rate(synapse_federation_client_sent_pdu_destinations:total_total{instance=\"$instance\"}[$bucket_size]))",
               "format": "time_series",
               "interval": "",
               "intervalFactor": 1,
@@ -3939,7 +3939,7 @@
               "datasource": {
                 "uid": "$datasource"
               },
-              "expr": "sum(rate(synapse_federation_client_sent_edus{instance=\"$instance\"}[$bucket_size]))",
+              "expr": "sum(rate(synapse_federation_client_sent_edus_total{instance=\"$instance\"}[$bucket_size]))",
               "format": "time_series",
               "intervalFactor": 1,
               "legendFormat": "edus",
@@ -5042,7 +5042,7 @@
               "datasource": {
                 "uid": "$datasource"
               },
-              "expr": "rate(synapse_http_httppusher_http_pushes_processed{instance=\"$instance\",job=~\"$job\",index=~\"$index\"}[$bucket_size]) and on (instance, job, index) (synapse_http_httppusher_http_pushes_failed + synapse_http_httppusher_http_pushes_processed) > 0",
+              "expr": "rate(synapse_http_httppusher_http_pushes_processed_total{instance=\"$instance\",job=~\"$job\",index=~\"$index\"}[$bucket_size]) and on (instance, job, index) (synapse_http_httppusher_http_pushes_failed_total + synapse_http_httppusher_http_pushes_processed_total) > 0",
               "format": "time_series",
               "interval": "",
               "intervalFactor": 2,
@@ -5054,7 +5054,7 @@
               "datasource": {
                 "uid": "$datasource"
               },
-              "expr": "rate(synapse_http_httppusher_http_pushes_failed{instance=\"$instance\",job=~\"$job\",index=~\"$index\"}[$bucket_size]) and on (instance, job, index) (synapse_http_httppusher_http_pushes_failed + synapse_http_httppusher_http_pushes_processed) > 0",
+              "expr": "rate(synapse_http_httppusher_http_pushes_failed_total{instance=\"$instance\",job=~\"$job\",index=~\"$index\"}[$bucket_size]) and on (instance, job, index) (synapse_http_httppusher_http_pushes_failed_total + synapse_http_httppusher_http_pushes_processed_total) > 0",
               "format": "time_series",
               "intervalFactor": 2,
               "legendFormat": "failed {{job}}",
@@ -5268,12 +5268,12 @@
                 "uid": "${DS_PROMETHEUS}"
               },
               "exemplar": true,
-              "expr": "sum(rate(synapse_push_bulk_push_rule_evaluator_push_rules_state_size_counter{job=\"$job\",index=~\"$index\",instance=\"$instance\"}[$bucket_size]))",
+              "expr": "sum(rate(synapse_push_bulk_push_rule_evaluator_push_rules_state_size_counter_total{job=\"$job\",index=~\"$index\",instance=\"$instance\"}[$bucket_size]))",
               "format": "time_series",
               "interval": "",
               "intervalFactor": 2,
               "legendFormat": "{{index}}",
-              "metric": "synapse_push_bulk_push_rule_evaluator_push_rules_state_size_counter",
+              "metric": "synapse_push_bulk_push_rule_evaluator_push_rules_state_size_counter_total",
               "refId": "A",
               "step": 2
             }
@@ -5369,12 +5369,12 @@
                 "uid": "$datasource"
               },
               "exemplar": true,
-              "expr": "sum(rate(synapse_push_bulk_push_rule_evaluator_push_rules_invalidation_counter{job=\"$job\",index=~\"$index\",instance=\"$instance\"}[$bucket_size]))",
+              "expr": "sum(rate(synapse_push_bulk_push_rule_evaluator_push_rules_invalidation_counter_total{job=\"$job\",index=~\"$index\",instance=\"$instance\"}[$bucket_size]))",
               "format": "time_series",
               "interval": "",
               "intervalFactor": 2,
               "legendFormat": "{{index}}",
-              "metric": "synapse_push_bulk_push_rule_evaluator_push_rules_invalidation_counter",
+              "metric": "synapse_push_bulk_push_rule_evaluator_push_rules_invalidation_counter_total",
               "refId": "A",
               "step": 2
             }
@@ -5475,12 +5475,12 @@
                 "uid": "${DS_PROMETHEUS}"
               },
               "exemplar": true,
-              "expr": "sum(rate(synapse_util_caches_cache:hits{job=\"$job\",index=~\"$index\",name=\"push_rules_delta_state_cache_metric\",instance=\"$instance\"}[$bucket_size]))/sum(rate(synapse_util_caches_cache:total{job=\"$job\",index=~\"$index\", name=\"push_rules_delta_state_cache_metric\",instance=\"$instance\"}[$bucket_size]))",
+              "expr": "sum(rate(synapse_util_caches_cache_hits{job=\"$job\",index=~\"$index\",name=\"push_rules_delta_state_cache_metric\",instance=\"$instance\"}[$bucket_size]))/sum(rate(synapse_util_caches_cache{job=\"$job\",index=~\"$index\", name=\"push_rules_delta_state_cache_metric\",instance=\"$instance\"}[$bucket_size]))",
               "format": "time_series",
               "interval": "",
               "intervalFactor": 2,
               "legendFormat": "Hit Rate",
-              "metric": "synapse_push_bulk_push_rule_evaluator_push_rules_invalidation_counter",
+              "metric": "synapse_push_bulk_push_rule_evaluator_push_rules_invalidation_counter_total",
               "refId": "A",
               "step": 2
             },
@@ -5490,7 +5490,7 @@
                 "uid": "${DS_PROMETHEUS}"
               },
               "exemplar": true,
-              "expr": "sum(rate(synapse_util_caches_cache:total{job=\"$job\",index=~\"$index\", name=\"push_rules_delta_state_cache_metric\",instance=\"$instance\"}[$bucket_size]))",
+              "expr": "sum(rate(synapse_util_caches_cache{job=\"$job\",index=~\"$index\", name=\"push_rules_delta_state_cache_metric\",instance=\"$instance\"}[$bucket_size]))",
               "format": "time_series",
               "interval": "",
               "intervalFactor": 2,
@@ -5598,12 +5598,12 @@
                 "uid": "${DS_PROMETHEUS}"
               },
               "exemplar": true,
-              "expr": "sum(rate(synapse_util_caches_cache:hits{job=\"$job\",index=~\"$index\",name=\"room_push_rule_cache\",instance=\"$instance\"}[$bucket_size]))/sum(rate(synapse_util_caches_cache:total{job=\"$job\",index=~\"$index\", name=\"room_push_rule_cache\",instance=\"$instance\"}[$bucket_size]))",
+              "expr": "sum(rate(synapse_util_caches_cache_hits{job=\"$job\",index=~\"$index\",name=\"room_push_rule_cache\",instance=\"$instance\"}[$bucket_size]))/sum(rate(synapse_util_caches_cache{job=\"$job\",index=~\"$index\", name=\"room_push_rule_cache\",instance=\"$instance\"}[$bucket_size]))",
               "format": "time_series",
               "interval": "",
               "intervalFactor": 2,
               "legendFormat": "Hit Rate",
-              "metric": "synapse_push_bulk_push_rule_evaluator_push_rules_invalidation_counter",
+              "metric": "synapse_push_bulk_push_rule_evaluator_push_rules_invalidation_counter_total",
               "refId": "A",
               "step": 2
             },
@@ -5613,7 +5613,7 @@
                 "uid": "${DS_PROMETHEUS}"
               },
               "exemplar": true,
-              "expr": "sum(rate(synapse_util_caches_cache:total{job=\"$job\",index=~\"$index\", name=\"room_push_rule_cache\",instance=\"$instance\"}[$bucket_size]))",
+              "expr": "sum(rate(synapse_util_caches_cache{job=\"$job\",index=~\"$index\", name=\"room_push_rule_cache\",instance=\"$instance\"}[$bucket_size]))",
               "format": "time_series",
               "interval": "",
               "intervalFactor": 2,
@@ -5719,12 +5719,12 @@
                 "uid": "${DS_PROMETHEUS}"
               },
               "exemplar": true,
-              "expr": "sum(rate(synapse_util_caches_cache:hits{job=\"$job\",index=~\"$index\",name=\"_get_rules_for_room\",instance=\"$instance\"}[$bucket_size]))/sum(rate(synapse_util_caches_cache:total{job=\"$job\",index=~\"$index\", name=\"_get_rules_for_room\",instance=\"$instance\"}[$bucket_size]))",
+              "expr": "sum(rate(synapse_util_caches_cache_hits{job=\"$job\",index=~\"$index\",name=\"_get_rules_for_room\",instance=\"$instance\"}[$bucket_size]))/sum(rate(synapse_util_caches_cache{job=\"$job\",index=~\"$index\", name=\"_get_rules_for_room\",instance=\"$instance\"}[$bucket_size]))",
               "format": "time_series",
               "interval": "",
               "intervalFactor": 2,
               "legendFormat": "Hit Rate",
-              "metric": "synapse_push_bulk_push_rule_evaluator_push_rules_invalidation_counter",
+              "metric": "synapse_push_bulk_push_rule_evaluator_push_rules_invalidation_counter_total",
               "refId": "A",
               "step": 2
             },
@@ -5734,7 +5734,7 @@
                 "uid": "${DS_PROMETHEUS}"
               },
               "exemplar": true,
-              "expr": "sum(rate(synapse_util_caches_cache:total{job=\"$job\",index=~\"$index\", name=\"_get_rules_for_room\",instance=\"$instance\"}[$bucket_size]))",
+              "expr": "sum(rate(synapse_util_caches_cache{job=\"$job\",index=~\"$index\", name=\"_get_rules_for_room\",instance=\"$instance\"}[$bucket_size]))",
               "format": "time_series",
               "interval": "",
               "intervalFactor": 2,
@@ -6087,7 +6087,7 @@
               "datasource": {
                 "uid": "$datasource"
               },
-              "expr": "topk(10, rate(synapse_storage_transaction_time_count{instance=\"$instance\",job=~\"$job\",index=~\"$index\"}[$bucket_size]))",
+              "expr": "topk(10, rate(synapse_storage_transaction_time_count_total{instance=\"$instance\",job=~\"$job\",index=~\"$index\"}[$bucket_size]))",
               "format": "time_series",
               "interval": "",
               "intervalFactor": 2,
@@ -6187,7 +6187,7 @@
               "datasource": {
                 "uid": "$datasource"
               },
-              "expr": "rate(synapse_storage_transaction_time_sum{instance=\"$instance\",job=~\"$job\",index=~\"$index\"}[$bucket_size])",
+              "expr": "rate(synapse_storage_transaction_time_sum_total{instance=\"$instance\",job=~\"$job\",index=~\"$index\"}[$bucket_size])",
               "format": "time_series",
               "instant": false,
               "interval": "",
@@ -6287,7 +6287,7 @@
               "datasource": {
                 "uid": "$datasource"
               },
-              "expr": "rate(synapse_storage_transaction_time_sum{instance=\"$instance\",job=~\"$job\",index=~\"$index\"}[$bucket_size])/rate(synapse_storage_transaction_time_count{instance=\"$instance\",job=~\"$job\",index=~\"$index\"}[$bucket_size])",
+              "expr": "rate(synapse_storage_transaction_time_sum_total{instance=\"$instance\",job=~\"$job\",index=~\"$index\"}[$bucket_size])/rate(synapse_storage_transaction_time_count_total{instance=\"$instance\",job=~\"$job\",index=~\"$index\"}[$bucket_size])",
               "format": "time_series",
               "instant": false,
               "interval": "",
@@ -6538,7 +6538,7 @@
               "datasource": {
                 "uid": "$datasource"
               },
-              "expr": "rate(synapse_util_metrics_block_ru_utime_seconds{instance=\"$instance\",job=~\"$job\",index=~\"$index\",block_name!=\"wrapped_request_handler\"}[$bucket_size]) + rate(synapse_util_metrics_block_ru_stime_seconds[$bucket_size])",
+              "expr": "rate(synapse_util_metrics_block_ru_utime_seconds_total{instance=\"$instance\",job=~\"$job\",index=~\"$index\",block_name!=\"wrapped_request_handler\"}[$bucket_size]) + rate(synapse_util_metrics_block_ru_stime_seconds_total[$bucket_size])",
               "format": "time_series",
               "interval": "",
               "intervalFactor": 2,
@@ -6636,7 +6636,7 @@
               "datasource": {
                 "uid": "$datasource"
               },
-              "expr": "(rate(synapse_util_metrics_block_ru_utime_seconds{instance=\"$instance\",job=~\"$job\",index=~\"$index\"}[$bucket_size]) + rate(synapse_util_metrics_block_ru_stime_seconds[$bucket_size])) / rate(synapse_util_metrics_block_count[$bucket_size])",
+              "expr": "(rate(synapse_util_metrics_block_ru_utime_seconds_total{instance=\"$instance\",job=~\"$job\",index=~\"$index\"}[$bucket_size]) + rate(synapse_util_metrics_block_ru_stime_seconds_total[$bucket_size])) / rate(synapse_util_metrics_block_count_total[$bucket_size])",
               "format": "time_series",
               "interval": "",
               "intervalFactor": 2,
@@ -6737,7 +6737,7 @@
                 "uid": "${DS_PROMETHEUS}"
               },
               "exemplar": true,
-              "expr": "rate(synapse_util_metrics_block_db_txn_duration_seconds{instance=\"$instance\",job=~\"$job\",index=~\"$index\"}[$bucket_size])",
+              "expr": "rate(synapse_util_metrics_block_db_txn_duration_seconds_total{instance=\"$instance\",job=~\"$job\",index=~\"$index\"}[$bucket_size])",
               "format": "time_series",
               "interval": "",
               "intervalFactor": 2,
@@ -6839,7 +6839,7 @@
               "datasource": {
                 "uid": "$datasource"
               },
-              "expr": "rate(synapse_util_metrics_block_db_txn_duration_seconds{instance=\"$instance\",job=~\"$job\",index=~\"$index\"}[$bucket_size]) / rate(synapse_util_metrics_block_db_txn_count{instance=\"$instance\",job=~\"$job\",index=~\"$index\"}[$bucket_size])",
+              "expr": "rate(synapse_util_metrics_block_db_txn_duration_seconds_total{instance=\"$instance\",job=~\"$job\",index=~\"$index\"}[$bucket_size]) / rate(synapse_util_metrics_block_db_txn_count_total{instance=\"$instance\",job=~\"$job\",index=~\"$index\"}[$bucket_size])",
               "format": "time_series",
               "interval": "",
               "intervalFactor": 2,
@@ -6936,7 +6936,7 @@
               "datasource": {
                 "uid": "$datasource"
               },
-              "expr": "rate(synapse_util_metrics_block_db_txn_duration_seconds{instance=\"$instance\",job=~\"$job\",index=~\"$index\"}[$bucket_size]) / rate(synapse_util_metrics_block_db_txn_count{instance=\"$instance\",job=~\"$job\",index=~\"$index\"}[$bucket_size])",
+              "expr": "rate(synapse_util_metrics_block_db_txn_duration_seconds_total{instance=\"$instance\",job=~\"$job\",index=~\"$index\"}[$bucket_size]) / rate(synapse_util_metrics_block_db_txn_count_total{instance=\"$instance\",job=~\"$job\",index=~\"$index\"}[$bucket_size])",
               "format": "time_series",
               "interval": "",
               "intervalFactor": 2,
@@ -7033,7 +7033,7 @@
               "datasource": {
                 "uid": "$datasource"
               },
-              "expr": "rate(synapse_util_metrics_block_time_seconds{instance=\"$instance\",job=~\"$job\",index=~\"$index\"}[$bucket_size]) / rate(synapse_util_metrics_block_count{instance=\"$instance\",job=~\"$job\",index=~\"$index\"}[$bucket_size])",
+              "expr": "rate(synapse_util_metrics_block_time_seconds_total{instance=\"$instance\",job=~\"$job\",index=~\"$index\"}[$bucket_size]) / rate(synapse_util_metrics_block_count_total{instance=\"$instance\",job=~\"$job\",index=~\"$index\"}[$bucket_size])",
               "format": "time_series",
               "interval": "",
               "intervalFactor": 2,
@@ -7122,7 +7122,7 @@
               "datasource": {
                 "uid": "$datasource"
               },
-              "expr": "rate(synapse_util_metrics_block_count{instance=\"$instance\",job=~\"$job\",index=~\"$index\"}[$bucket_size])",
+              "expr": "rate(synapse_util_metrics_block_count_total{instance=\"$instance\",job=~\"$job\",index=~\"$index\"}[$bucket_size])",
               "interval": "",
               "legendFormat": "{{job}}-{{index}} {{block_name}}",
               "refId": "A"
@@ -7246,7 +7246,7 @@
               "datasource": {
                 "uid": "$datasource"
               },
-              "expr": "rate(synapse_util_caches_cache:hits{job=~\"$job\",index=~\"$index\",instance=\"$instance\"}[$bucket_size])/rate(synapse_util_caches_cache:total{job=~\"$job\",index=~\"$index\",instance=\"$instance\"}[$bucket_size])",
+              "expr": "rate(synapse_util_caches_cache_hits{job=~\"$job\",index=~\"$index\",instance=\"$instance\"}[$bucket_size])/rate(synapse_util_caches_cache{job=~\"$job\",index=~\"$index\",instance=\"$instance\"}[$bucket_size])",
               "format": "time_series",
               "intervalFactor": 2,
               "legendFormat": "{{name}} {{job}}-{{index}}",
@@ -7347,7 +7347,7 @@
               "datasource": {
                 "uid": "$datasource"
               },
-              "expr": "synapse_util_caches_cache:size{instance=\"$instance\",job=~\"$job\",index=~\"$index\"}",
+              "expr": "synapse_util_caches_cache_size{instance=\"$instance\",job=~\"$job\",index=~\"$index\"}",
               "format": "time_series",
               "hide": false,
               "interval": "",
@@ -7447,7 +7447,7 @@
               "datasource": {
                 "uid": "$datasource"
               },
-              "expr": "rate(synapse_util_caches_cache:total{job=~\"$job\",index=~\"$index\",instance=\"$instance\"}[$bucket_size])",
+              "expr": "rate(synapse_util_caches_cache{job=~\"$job\",index=~\"$index\",instance=\"$instance\"}[$bucket_size])",
               "format": "time_series",
               "interval": "",
               "intervalFactor": 2,
@@ -7547,7 +7547,7 @@
               "datasource": {
                 "uid": "$datasource"
               },
-              "expr": "topk(10, rate(synapse_util_caches_cache:total{job=~\"$job\",index=~\"$index\",instance=\"$instance\"}[$bucket_size]) - rate(synapse_util_caches_cache:hits{job=~\"$job\",index=~\"$index\",instance=\"$instance\"}[$bucket_size]))",
+              "expr": "topk(10, rate(synapse_util_caches_cache{job=~\"$job\",index=~\"$index\",instance=\"$instance\"}[$bucket_size]) - rate(synapse_util_caches_cache_hits{job=~\"$job\",index=~\"$index\",instance=\"$instance\"}[$bucket_size]))",
               "format": "time_series",
               "interval": "",
               "intervalFactor": 2,
@@ -7643,7 +7643,7 @@
               "datasource": {
                 "uid": "$datasource"
               },
-              "expr": "rate(synapse_util_caches_cache:evicted_size{instance=\"$instance\",job=~\"$job\",index=~\"$index\"}[$bucket_size])",
+              "expr": "rate(synapse_util_caches_cache_evicted_size{instance=\"$instance\",job=~\"$job\",index=~\"$index\"}[$bucket_size])",
               "format": "time_series",
               "interval": "",
               "intervalFactor": 1,
@@ -7763,7 +7763,7 @@
               "datasource": {
                 "uid": "$datasource"
               },
-              "expr": "synapse_util_caches_response_cache:size{instance=\"$instance\",job=~\"$job\",index=~\"$index\"}",
+              "expr": "synapse_util_caches_response_cache_size{instance=\"$instance\",job=~\"$job\",index=~\"$index\"}",
               "interval": "",
               "legendFormat": "{{name}} {{job}}-{{index}}",
               "refId": "A"
@@ -7853,7 +7853,7 @@
               "datasource": {
                 "uid": "$datasource"
               },
-              "expr": "rate(synapse_util_caches_response_cache:hits{instance=\"$instance\", job=~\"$job\", index=~\"$index\"}[$bucket_size])/rate(synapse_util_caches_response_cache:total{instance=\"$instance\", job=~\"$job\", index=~\"$index\"}[$bucket_size])",
+              "expr": "rate(synapse_util_caches_response_cache_hits{instance=\"$instance\", job=~\"$job\", index=~\"$index\"}[$bucket_size])/rate(synapse_util_caches_response_cache{instance=\"$instance\", job=~\"$job\", index=~\"$index\"}[$bucket_size])",
               "interval": "",
               "legendFormat": "{{name}} {{job}}-{{index}}",
               "refId": "A"
@@ -9556,7 +9556,7 @@
               "datasource": {
                 "uid": "$datasource"
               },
-              "expr": "synapse_forward_extremities_bucket{instance=\"$instance\"} and on (index, instance, job) (synapse_storage_events_persisted_events > 0)",
+              "expr": "synapse_forward_extremities_bucket{instance=\"$instance\"} and on (index, instance, job) (synapse_storage_events_persisted_events_total > 0)",
               "format": "heatmap",
               "intervalFactor": 1,
               "legendFormat": "{{le}}",
@@ -9716,7 +9716,7 @@
               "datasource": {
                 "uid": "$datasource"
               },
-              "expr": "rate(synapse_storage_events_forward_extremities_persisted_bucket{instance=\"$instance\"}[$bucket_size]) and on (index, instance, job) (synapse_storage_events_persisted_events > 0)",
+              "expr": "rate(synapse_storage_events_forward_extremities_persisted_bucket{instance=\"$instance\"}[$bucket_size]) and on (index, instance, job) (synapse_storage_events_persisted_events_total > 0)",
               "format": "heatmap",
               "intervalFactor": 1,
               "legendFormat": "{{le}}",
@@ -9793,7 +9793,7 @@
               "datasource": {
                 "uid": "$datasource"
               },
-              "expr": "histogram_quantile(0.5, rate(synapse_storage_events_forward_extremities_persisted_bucket{instance=\"$instance\"}[$bucket_size]) and on (index, instance, job) (synapse_storage_events_persisted_events > 0))",
+              "expr": "histogram_quantile(0.5, rate(synapse_storage_events_forward_extremities_persisted_bucket{instance=\"$instance\"}[$bucket_size]) and on (index, instance, job) (synapse_storage_events_persisted_events_total > 0))",
               "format": "time_series",
               "intervalFactor": 1,
               "legendFormat": "50%",
@@ -9803,7 +9803,7 @@
               "datasource": {
                 "uid": "$datasource"
               },
-              "expr": "histogram_quantile(0.75, rate(synapse_storage_events_forward_extremities_persisted_bucket{instance=\"$instance\"}[$bucket_size]) and on (index, instance, job) (synapse_storage_events_persisted_events > 0))",
+              "expr": "histogram_quantile(0.75, rate(synapse_storage_events_forward_extremities_persisted_bucket{instance=\"$instance\"}[$bucket_size]) and on (index, instance, job) (synapse_storage_events_persisted_events_total > 0))",
               "format": "time_series",
               "intervalFactor": 1,
               "legendFormat": "75%",
@@ -9813,7 +9813,7 @@
               "datasource": {
                 "uid": "$datasource"
               },
-              "expr": "histogram_quantile(0.90, rate(synapse_storage_events_forward_extremities_persisted_bucket{instance=\"$instance\"}[$bucket_size]) and on (index, instance, job) (synapse_storage_events_persisted_events > 0))",
+              "expr": "histogram_quantile(0.90, rate(synapse_storage_events_forward_extremities_persisted_bucket{instance=\"$instance\"}[$bucket_size]) and on (index, instance, job) (synapse_storage_events_persisted_events_total > 0))",
               "format": "time_series",
               "intervalFactor": 1,
               "legendFormat": "90%",
@@ -9823,7 +9823,7 @@
               "datasource": {
                 "uid": "$datasource"
               },
-              "expr": "histogram_quantile(0.99, rate(synapse_storage_events_forward_extremities_persisted_bucket{instance=\"$instance\"}[$bucket_size]) and on (index, instance, job) (synapse_storage_events_persisted_events > 0))",
+              "expr": "histogram_quantile(0.99, rate(synapse_storage_events_forward_extremities_persisted_bucket{instance=\"$instance\"}[$bucket_size]) and on (index, instance, job) (synapse_storage_events_persisted_events_total > 0))",
               "format": "time_series",
               "intervalFactor": 1,
               "legendFormat": "99%",
@@ -9905,7 +9905,7 @@
               "datasource": {
                 "uid": "$datasource"
               },
-              "expr": "rate(synapse_storage_events_stale_forward_extremities_persisted_bucket{instance=\"$instance\"}[$bucket_size]) and on (index, instance, job) (synapse_storage_events_persisted_events > 0)",
+              "expr": "rate(synapse_storage_events_stale_forward_extremities_persisted_bucket{instance=\"$instance\"}[$bucket_size]) and on (index, instance, job) (synapse_storage_events_persisted_events_total > 0)",
               "format": "heatmap",
               "intervalFactor": 1,
               "legendFormat": "{{le}}",
@@ -9982,7 +9982,7 @@
               "datasource": {
                 "uid": "$datasource"
               },
-              "expr": "histogram_quantile(0.5, rate(synapse_storage_events_stale_forward_extremities_persisted_bucket{instance=\"$instance\"}[$bucket_size]) and on (index, instance, job) (synapse_storage_events_persisted_events > 0))",
+              "expr": "histogram_quantile(0.5, rate(synapse_storage_events_stale_forward_extremities_persisted_bucket{instance=\"$instance\"}[$bucket_size]) and on (index, instance, job) (synapse_storage_events_persisted_events_total > 0))",
               "format": "time_series",
               "intervalFactor": 1,
               "legendFormat": "50%",
@@ -9992,7 +9992,7 @@
               "datasource": {
                 "uid": "$datasource"
               },
-              "expr": "histogram_quantile(0.75, rate(synapse_storage_events_stale_forward_extremities_persisted_bucket{instance=\"$instance\"}[$bucket_size]) and on (index, instance, job) (synapse_storage_events_persisted_events > 0))",
+              "expr": "histogram_quantile(0.75, rate(synapse_storage_events_stale_forward_extremities_persisted_bucket{instance=\"$instance\"}[$bucket_size]) and on (index, instance, job) (synapse_storage_events_persisted_events_total > 0))",
               "format": "time_series",
               "intervalFactor": 1,
               "legendFormat": "75%",
@@ -10002,7 +10002,7 @@
               "datasource": {
                 "uid": "$datasource"
               },
-              "expr": "histogram_quantile(0.90, rate(synapse_storage_events_stale_forward_extremities_persisted_bucket{instance=\"$instance\"}[$bucket_size]) and on (index, instance, job) (synapse_storage_events_persisted_events > 0))",
+              "expr": "histogram_quantile(0.90, rate(synapse_storage_events_stale_forward_extremities_persisted_bucket{instance=\"$instance\"}[$bucket_size]) and on (index, instance, job) (synapse_storage_events_persisted_events_total > 0))",
               "format": "time_series",
               "intervalFactor": 1,
               "legendFormat": "90%",
@@ -10012,7 +10012,7 @@
               "datasource": {
                 "uid": "$datasource"
               },
-              "expr": "histogram_quantile(0.99, rate(synapse_storage_events_stale_forward_extremities_persisted_bucket{instance=\"$instance\"}[$bucket_size]) and on (index, instance, job) (synapse_storage_events_persisted_events > 0))",
+              "expr": "histogram_quantile(0.99, rate(synapse_storage_events_stale_forward_extremities_persisted_bucket{instance=\"$instance\"}[$bucket_size]) and on (index, instance, job) (synapse_storage_events_persisted_events_total > 0))",
               "format": "time_series",
               "intervalFactor": 1,
               "legendFormat": "99%",
@@ -10297,7 +10297,7 @@
               "datasource": {
                 "uid": "$datasource"
               },
-              "expr": "sum(rate(synapse_storage_events_state_resolutions_during_persistence{instance=\"$instance\",job=~\"$job\",index=~\"$index\"}[$bucket_size]))",
+              "expr": "sum(rate(synapse_storage_events_state_resolutions_during_persistence_total{instance=\"$instance\",job=~\"$job\",index=~\"$index\"}[$bucket_size]))",
               "interval": "",
               "legendFormat": "State res ",
               "refId": "A"
@@ -10306,7 +10306,7 @@
               "datasource": {
                 "uid": "$datasource"
               },
-              "expr": "sum(rate(synapse_storage_events_potential_times_prune_extremities{instance=\"$instance\",job=~\"$job\",index=~\"$index\"}[$bucket_size]))",
+              "expr": "sum(rate(synapse_storage_events_potential_times_prune_extremities_total{instance=\"$instance\",job=~\"$job\",index=~\"$index\"}[$bucket_size]))",
               "interval": "",
               "legendFormat": "Potential to prune",
               "refId": "B"
@@ -10315,7 +10315,7 @@
               "datasource": {
                 "uid": "$datasource"
               },
-              "expr": "sum(rate(synapse_storage_events_times_pruned_extremities{instance=\"$instance\",job=~\"$job\",index=~\"$index\"}[$bucket_size]))",
+              "expr": "sum(rate(synapse_storage_events_times_pruned_extremities_total{instance=\"$instance\",job=~\"$job\",index=~\"$index\"}[$bucket_size]))",
               "interval": "",
               "legendFormat": "Pruned",
               "refId": "C"
@@ -11069,7 +11069,7 @@
               "datasource": {
                 "uid": "$datasource"
               },
-              "expr": "rate(synapse_handler_presence_notified_presence{job=\"$job\",index=~\"$index\",instance=\"$instance\"}[$bucket_size])",
+              "expr": "rate(synapse_handler_presence_notified_presence_total{job=\"$job\",index=~\"$index\",instance=\"$instance\"}[$bucket_size])",
               "interval": "",
               "legendFormat": "Notified",
               "refId": "A"
@@ -11078,7 +11078,7 @@
               "datasource": {
                 "uid": "$datasource"
               },
-              "expr": "rate(synapse_handler_presence_federation_presence_out{job=\"$job\",index=~\"$index\",instance=\"$instance\"}[$bucket_size])",
+              "expr": "rate(synapse_handler_presence_federation_presence_out_total{job=\"$job\",index=~\"$index\",instance=\"$instance\"}[$bucket_size])",
               "interval": "",
               "legendFormat": "Remote ping",
               "refId": "B"
@@ -11087,7 +11087,7 @@
               "datasource": {
                 "uid": "$datasource"
               },
-              "expr": "rate(synapse_handler_presence_presence_updates{job=\"$job\",index=~\"$index\",instance=\"$instance\"}[$bucket_size])",
+              "expr": "rate(synapse_handler_presence_presence_updates_total{job=\"$job\",index=~\"$index\",instance=\"$instance\"}[$bucket_size])",
               "interval": "",
               "legendFormat": "Total updates",
               "refId": "C"
@@ -11096,7 +11096,7 @@
               "datasource": {
                 "uid": "$datasource"
               },
-              "expr": "rate(synapse_handler_presence_federation_presence{job=\"$job\",index=~\"$index\",instance=\"$instance\"}[$bucket_size])",
+              "expr": "rate(synapse_handler_presence_federation_presence_total{job=\"$job\",index=~\"$index\",instance=\"$instance\"}[$bucket_size])",
               "interval": "",
               "legendFormat": "Remote updates",
               "refId": "D"
@@ -11105,7 +11105,7 @@
               "datasource": {
                 "uid": "$datasource"
               },
-              "expr": "rate(synapse_handler_presence_bump_active_time{job=\"$job\",index=~\"$index\",instance=\"$instance\"}[$bucket_size])",
+              "expr": "rate(synapse_handler_presence_bump_active_time_total{job=\"$job\",index=~\"$index\",instance=\"$instance\"}[$bucket_size])",
               "interval": "",
               "legendFormat": "Bump active time",
               "refId": "E"
@@ -11789,7 +11789,7 @@
         "name": "instance",
         "options": [],
         "query": {
-          "query": "label_values(synapse_util_metrics_block_ru_utime_seconds, instance)",
+          "query": "label_values(synapse_util_metrics_block_ru_utime_seconds_total, instance)",
           "refId": "Prometheus-instance-Variable-Query"
         },
         "refresh": 2,
@@ -11818,7 +11818,7 @@
         "name": "job",
         "options": [],
         "query": {
-          "query": "label_values(synapse_util_metrics_block_ru_utime_seconds, job)",
+          "query": "label_values(synapse_util_metrics_block_ru_utime_seconds_total, job)",
           "refId": "Prometheus-job-Variable-Query"
         },
         "refresh": 2,
@@ -11848,7 +11848,7 @@
         "name": "index",
         "options": [],
         "query": {
-          "query": "label_values(synapse_util_metrics_block_ru_utime_seconds, index)",
+          "query": "label_values(synapse_util_metrics_block_ru_utime_seconds_total, index)",
           "refId": "Prometheus-index-Variable-Query"
         },
         "refresh": 2,
@@ -11896,6 +11896,6 @@
   "timezone": "",
   "title": "Synapse",
   "uid": "000000012",
-  "version": 132,
+  "version": 133,
   "weekStart": ""
-}
\ No newline at end of file
+}
diff --git a/synapse/metrics/_legacy_exposition.py b/synapse/metrics/_legacy_exposition.py
index ff640a49af..6f00ff2a47 100644
--- a/synapse/metrics/_legacy_exposition.py
+++ b/synapse/metrics/_legacy_exposition.py
@@ -88,11 +88,11 @@ LEGACY_METRIC_NAMES = {
     "synapse_util_caches_cache_hits": "synapse_util_caches_cache:hits",
     "synapse_util_caches_cache_size": "synapse_util_caches_cache:size",
     "synapse_util_caches_cache_evicted_size": "synapse_util_caches_cache:evicted_size",
-    "synapse_util_caches_cache_total": "synapse_util_caches_cache:total",
+    "synapse_util_caches_cache": "synapse_util_caches_cache:total",
     "synapse_util_caches_response_cache_size": "synapse_util_caches_response_cache:size",
     "synapse_util_caches_response_cache_hits": "synapse_util_caches_response_cache:hits",
     "synapse_util_caches_response_cache_evicted_size": "synapse_util_caches_response_cache:evicted_size",
-    "synapse_util_caches_response_cache_total": "synapse_util_caches_response_cache:total",
+    "synapse_util_caches_response_cache": "synapse_util_caches_response_cache:total",
 }
 
 
diff --git a/synapse/util/caches/__init__.py b/synapse/util/caches/__init__.py
index bdf9b0dc8c..d4a2b77c29 100644
--- a/synapse/util/caches/__init__.py
+++ b/synapse/util/caches/__init__.py
@@ -37,7 +37,7 @@ collectors_by_name: Dict[str, "CacheMetric"] = {}
 cache_size = Gauge("synapse_util_caches_cache_size", "", ["name"])
 cache_hits = Gauge("synapse_util_caches_cache_hits", "", ["name"])
 cache_evicted = Gauge("synapse_util_caches_cache_evicted_size", "", ["name", "reason"])
-cache_total = Gauge("synapse_util_caches_cache_total", "", ["name"])
+cache_total = Gauge("synapse_util_caches_cache", "", ["name"])
 cache_max_size = Gauge("synapse_util_caches_cache_max_size", "", ["name"])
 cache_memory_usage = Gauge(
     "synapse_util_caches_cache_size_bytes",
@@ -50,7 +50,7 @@ response_cache_hits = Gauge("synapse_util_caches_response_cache_hits", "", ["nam
 response_cache_evicted = Gauge(
     "synapse_util_caches_response_cache_evicted_size", "", ["name", "reason"]
 )
-response_cache_total = Gauge("synapse_util_caches_response_cache_total", "", ["name"])
+response_cache_total = Gauge("synapse_util_caches_response_cache", "", ["name"])
 
 
 class EvictionReason(Enum):
-- 
cgit 1.5.1


From 526f84bc2ea78a44ef612bf79caf5cfb7966c1ed Mon Sep 17 00:00:00 2001
From: reivilibre <oliverw@matrix.org>
Date: Thu, 8 Sep 2022 14:01:42 +0000
Subject: Fix Prometheus recording rules to not use legacy metric names.
 (#13718)

---
 changelog.d/13718.misc                |  1 +
 contrib/prometheus/synapse-v1.rules   | 21 ---------------------
 contrib/prometheus/synapse-v2.rules   | 22 +++++++++++-----------
 synapse/app/phone_stats_home.py       |  6 +++---
 synapse/federation/sender/__init__.py |  4 ++--
 synapse/metrics/_legacy_exposition.py |  5 +++++
 6 files changed, 22 insertions(+), 37 deletions(-)
 create mode 100644 changelog.d/13718.misc
 delete mode 100644 contrib/prometheus/synapse-v1.rules

(limited to 'synapse/metrics')

diff --git a/changelog.d/13718.misc b/changelog.d/13718.misc
new file mode 100644
index 0000000000..07ace50b12
--- /dev/null
+++ b/changelog.d/13718.misc
@@ -0,0 +1 @@
+Add experimental configuration option to allow disabling legacy Prometheus metric names.
\ No newline at end of file
diff --git a/contrib/prometheus/synapse-v1.rules b/contrib/prometheus/synapse-v1.rules
deleted file mode 100644
index 4c900ba537..0000000000
--- a/contrib/prometheus/synapse-v1.rules
+++ /dev/null
@@ -1,21 +0,0 @@
-synapse_federation_transaction_queue_pendingEdus:total = sum(synapse_federation_transaction_queue_pendingEdus or absent(synapse_federation_transaction_queue_pendingEdus)*0)
-synapse_federation_transaction_queue_pendingPdus:total = sum(synapse_federation_transaction_queue_pendingPdus or absent(synapse_federation_transaction_queue_pendingPdus)*0)
-
-synapse_http_server_request_count:method{servlet=""} = sum(synapse_http_server_request_count) by (method)
-synapse_http_server_request_count:servlet{method=""} = sum(synapse_http_server_request_count) by (servlet)
-
-synapse_http_server_request_count:total{servlet=""} = sum(synapse_http_server_request_count:by_method) by (servlet)
-
-synapse_cache:hit_ratio_5m = rate(synapse_util_caches_cache:hits[5m]) / rate(synapse_util_caches_cache:total[5m])
-synapse_cache:hit_ratio_30s = rate(synapse_util_caches_cache:hits[30s]) / rate(synapse_util_caches_cache:total[30s])
-
-synapse_federation_client_sent{type="EDU"} = synapse_federation_client_sent_edus + 0
-synapse_federation_client_sent{type="PDU"} = synapse_federation_client_sent_pdu_destinations:count + 0
-synapse_federation_client_sent{type="Query"} = sum(synapse_federation_client_sent_queries) by (job)
-
-synapse_federation_server_received{type="EDU"} = synapse_federation_server_received_edus + 0
-synapse_federation_server_received{type="PDU"} = synapse_federation_server_received_pdus + 0
-synapse_federation_server_received{type="Query"} = sum(synapse_federation_server_received_queries) by (job)
-
-synapse_federation_transaction_queue_pending{type="EDU"} = synapse_federation_transaction_queue_pending_edus + 0
-synapse_federation_transaction_queue_pending{type="PDU"} = synapse_federation_transaction_queue_pending_pdus + 0
diff --git a/contrib/prometheus/synapse-v2.rules b/contrib/prometheus/synapse-v2.rules
index 7e405bf7f0..a5e6a735cd 100644
--- a/contrib/prometheus/synapse-v2.rules
+++ b/contrib/prometheus/synapse-v2.rules
@@ -20,18 +20,18 @@ groups:
     expr: 'sum(synapse_http_server_request_count:by_method) by (servlet)'
 
   - record: 'synapse_cache:hit_ratio_5m'
-    expr: 'rate(synapse_util_caches_cache:hits[5m]) / rate(synapse_util_caches_cache:total[5m])'
+    expr: 'rate(synapse_util_caches_cache_hits[5m]) / rate(synapse_util_caches_cache[5m])'
   - record: 'synapse_cache:hit_ratio_30s'
-    expr: 'rate(synapse_util_caches_cache:hits[30s]) / rate(synapse_util_caches_cache:total[30s])'
+    expr: 'rate(synapse_util_caches_cache_hits[30s]) / rate(synapse_util_caches_cache[30s])'
 
   - record: 'synapse_federation_client_sent'
     labels:
       type: "EDU"
-    expr: 'synapse_federation_client_sent_edus + 0'
+    expr: 'synapse_federation_client_sent_edus_total + 0'
   - record: 'synapse_federation_client_sent'
     labels:
       type: "PDU"
-    expr: 'synapse_federation_client_sent_pdu_destinations:count + 0'
+    expr: 'synapse_federation_client_sent_pdu_destinations_count_total + 0'
   - record: 'synapse_federation_client_sent'
     labels:
       type: "Query"
@@ -40,11 +40,11 @@ groups:
   - record: 'synapse_federation_server_received'
     labels:
       type: "EDU"
-    expr: 'synapse_federation_server_received_edus + 0'
+    expr: 'synapse_federation_server_received_edus_total + 0'
   - record: 'synapse_federation_server_received'
     labels:
       type: "PDU"
-    expr: 'synapse_federation_server_received_pdus + 0'
+    expr: 'synapse_federation_server_received_pdus_total + 0'
   - record: 'synapse_federation_server_received'
     labels:
       type: "Query"
@@ -60,19 +60,19 @@ groups:
     expr: 'synapse_federation_transaction_queue_pending_pdus + 0'
 
   - record: synapse_storage_events_persisted_by_source_type
-    expr: sum without(type, origin_type, origin_entity) (synapse_storage_events_persisted_events_sep{origin_type="remote"})
+    expr: sum without(type, origin_type, origin_entity) (synapse_storage_events_persisted_events_sep_total{origin_type="remote"})
     labels:
       type: remote
   - record: synapse_storage_events_persisted_by_source_type
-    expr: sum without(type, origin_type, origin_entity) (synapse_storage_events_persisted_events_sep{origin_entity="*client*",origin_type="local"})
+    expr: sum without(type, origin_type, origin_entity) (synapse_storage_events_persisted_events_sep_total{origin_entity="*client*",origin_type="local"})
     labels:
       type: local
   - record: synapse_storage_events_persisted_by_source_type
-    expr: sum without(type, origin_type, origin_entity) (synapse_storage_events_persisted_events_sep{origin_entity!="*client*",origin_type="local"})
+    expr: sum without(type, origin_type, origin_entity) (synapse_storage_events_persisted_events_sep_total{origin_entity!="*client*",origin_type="local"})
     labels:
       type: bridges
   - record: synapse_storage_events_persisted_by_event_type
-    expr: sum without(origin_entity, origin_type) (synapse_storage_events_persisted_events_sep)
+    expr: sum without(origin_entity, origin_type) (synapse_storage_events_persisted_events_sep_total)
   - record: synapse_storage_events_persisted_by_origin
-    expr: sum without(type) (synapse_storage_events_persisted_events_sep)
+    expr: sum without(type) (synapse_storage_events_persisted_events_sep_total)
 
diff --git a/synapse/app/phone_stats_home.py b/synapse/app/phone_stats_home.py
index 51c8d15711..53db1e85b3 100644
--- a/synapse/app/phone_stats_home.py
+++ b/synapse/app/phone_stats_home.py
@@ -32,15 +32,15 @@ logger = logging.getLogger("synapse.app.homeserver")
 _stats_process: List[Tuple[int, "resource.struct_rusage"]] = []
 
 # Gauges to expose monthly active user control metrics
-current_mau_gauge = Gauge("synapse_admin_mau:current", "Current MAU")
+current_mau_gauge = Gauge("synapse_admin_mau_current", "Current MAU")
 current_mau_by_service_gauge = Gauge(
     "synapse_admin_mau_current_mau_by_service",
     "Current MAU by service",
     ["app_service"],
 )
-max_mau_gauge = Gauge("synapse_admin_mau:max", "MAU Limit")
+max_mau_gauge = Gauge("synapse_admin_mau_max", "MAU Limit")
 registered_reserved_users_mau_gauge = Gauge(
-    "synapse_admin_mau:registered_reserved_users",
+    "synapse_admin_mau_registered_reserved_users",
     "Registered users with reserved threepids",
 )
 
diff --git a/synapse/federation/sender/__init__.py b/synapse/federation/sender/__init__.py
index 8bc60e3e3e..a6cb3ba58f 100644
--- a/synapse/federation/sender/__init__.py
+++ b/synapse/federation/sender/__init__.py
@@ -62,12 +62,12 @@ if TYPE_CHECKING:
 logger = logging.getLogger(__name__)
 
 sent_pdus_destination_dist_count = Counter(
-    "synapse_federation_client_sent_pdu_destinations:count",
+    "synapse_federation_client_sent_pdu_destinations_count",
     "Number of PDUs queued for sending to one or more destinations",
 )
 
 sent_pdus_destination_dist_total = Counter(
-    "synapse_federation_client_sent_pdu_destinations:total",
+    "synapse_federation_client_sent_pdu_destinations",
     "Total number of PDUs queued for sending across all destinations",
 )
 
diff --git a/synapse/metrics/_legacy_exposition.py b/synapse/metrics/_legacy_exposition.py
index 6f00ff2a47..133f1603dd 100644
--- a/synapse/metrics/_legacy_exposition.py
+++ b/synapse/metrics/_legacy_exposition.py
@@ -93,6 +93,11 @@ LEGACY_METRIC_NAMES = {
     "synapse_util_caches_response_cache_hits": "synapse_util_caches_response_cache:hits",
     "synapse_util_caches_response_cache_evicted_size": "synapse_util_caches_response_cache:evicted_size",
     "synapse_util_caches_response_cache": "synapse_util_caches_response_cache:total",
+    "synapse_federation_client_sent_pdu_destinations": "synapse_federation_client_sent_pdu_destinations:total",
+    "synapse_federation_client_sent_pdu_destinations_count": "synapse_federation_client_sent_pdu_destinations:count",
+    "synapse_admin_mau_current": "synapse_admin_mau:current",
+    "synapse_admin_mau_max": "synapse_admin_mau:max",
+    "synapse_admin_mau_registered_reserved_users": "synapse_admin_mau:registered_reserved_users",
 }
 
 
-- 
cgit 1.5.1


From cf11919ddd4f48b2f59062542ba62969042f80aa Mon Sep 17 00:00:00 2001
From: reivilibre <oliverw@matrix.org>
Date: Thu, 8 Sep 2022 14:30:48 +0000
Subject: Fix cache metrics not being updated when not using the legacy
 exposition module. (#13717)

---
 changelog.d/13717.misc                |  1 +
 synapse/metrics/_legacy_exposition.py |  7 ----
 synapse/util/caches/__init__.py       | 60 ++++++++++++++++++++++++++++-------
 synapse/util/metrics.py               | 34 ++++++++++++++++++--
 4 files changed, 81 insertions(+), 21 deletions(-)
 create mode 100644 changelog.d/13717.misc

(limited to 'synapse/metrics')

diff --git a/changelog.d/13717.misc b/changelog.d/13717.misc
new file mode 100644
index 0000000000..07ace50b12
--- /dev/null
+++ b/changelog.d/13717.misc
@@ -0,0 +1 @@
+Add experimental configuration option to allow disabling legacy Prometheus metric names.
\ No newline at end of file
diff --git a/synapse/metrics/_legacy_exposition.py b/synapse/metrics/_legacy_exposition.py
index 133f1603dd..563d8cc2c6 100644
--- a/synapse/metrics/_legacy_exposition.py
+++ b/synapse/metrics/_legacy_exposition.py
@@ -34,8 +34,6 @@ from prometheus_client.core import Sample
 from twisted.web.resource import Resource
 from twisted.web.server import Request
 
-from synapse.util import caches
-
 CONTENT_TYPE_LATEST = "text/plain; version=0.0.4; charset=utf-8"
 
 
@@ -107,11 +105,6 @@ def generate_latest(registry: CollectorRegistry, emit_help: bool = False) -> byt
     by prometheus-client.
     """
 
-    # Trigger the cache metrics to be rescraped, which updates the common
-    # metrics but do not produce metrics themselves
-    for collector in caches.collectors_by_name.values():
-        collector.collect()
-
     output = []
 
     for metric in registry.collect():
diff --git a/synapse/util/caches/__init__.py b/synapse/util/caches/__init__.py
index d4a2b77c29..35c0be08b0 100644
--- a/synapse/util/caches/__init__.py
+++ b/synapse/util/caches/__init__.py
@@ -20,9 +20,11 @@ from sys import intern
 from typing import Any, Callable, Dict, List, Optional, Sized, TypeVar
 
 import attr
+from prometheus_client import REGISTRY
 from prometheus_client.core import Gauge
 
 from synapse.config.cache import add_resizable_cache
+from synapse.util.metrics import DynamicCollectorRegistry
 
 logger = logging.getLogger(__name__)
 
@@ -30,27 +32,62 @@ logger = logging.getLogger(__name__)
 # Whether to track estimated memory usage of the LruCaches.
 TRACK_MEMORY_USAGE = False
 
+# We track cache metrics in a special registry that lets us update the metrics
+# just before they are returned from the scrape endpoint.
+CACHE_METRIC_REGISTRY = DynamicCollectorRegistry()
 
 caches_by_name: Dict[str, Sized] = {}
-collectors_by_name: Dict[str, "CacheMetric"] = {}
 
-cache_size = Gauge("synapse_util_caches_cache_size", "", ["name"])
-cache_hits = Gauge("synapse_util_caches_cache_hits", "", ["name"])
-cache_evicted = Gauge("synapse_util_caches_cache_evicted_size", "", ["name", "reason"])
-cache_total = Gauge("synapse_util_caches_cache", "", ["name"])
-cache_max_size = Gauge("synapse_util_caches_cache_max_size", "", ["name"])
+cache_size = Gauge(
+    "synapse_util_caches_cache_size", "", ["name"], registry=CACHE_METRIC_REGISTRY
+)
+cache_hits = Gauge(
+    "synapse_util_caches_cache_hits", "", ["name"], registry=CACHE_METRIC_REGISTRY
+)
+cache_evicted = Gauge(
+    "synapse_util_caches_cache_evicted_size",
+    "",
+    ["name", "reason"],
+    registry=CACHE_METRIC_REGISTRY,
+)
+cache_total = Gauge(
+    "synapse_util_caches_cache", "", ["name"], registry=CACHE_METRIC_REGISTRY
+)
+cache_max_size = Gauge(
+    "synapse_util_caches_cache_max_size", "", ["name"], registry=CACHE_METRIC_REGISTRY
+)
 cache_memory_usage = Gauge(
     "synapse_util_caches_cache_size_bytes",
     "Estimated memory usage of the caches",
     ["name"],
+    registry=CACHE_METRIC_REGISTRY,
 )
 
-response_cache_size = Gauge("synapse_util_caches_response_cache_size", "", ["name"])
-response_cache_hits = Gauge("synapse_util_caches_response_cache_hits", "", ["name"])
+response_cache_size = Gauge(
+    "synapse_util_caches_response_cache_size",
+    "",
+    ["name"],
+    registry=CACHE_METRIC_REGISTRY,
+)
+response_cache_hits = Gauge(
+    "synapse_util_caches_response_cache_hits",
+    "",
+    ["name"],
+    registry=CACHE_METRIC_REGISTRY,
+)
 response_cache_evicted = Gauge(
-    "synapse_util_caches_response_cache_evicted_size", "", ["name", "reason"]
+    "synapse_util_caches_response_cache_evicted_size",
+    "",
+    ["name", "reason"],
+    registry=CACHE_METRIC_REGISTRY,
 )
-response_cache_total = Gauge("synapse_util_caches_response_cache", "", ["name"])
+response_cache_total = Gauge(
+    "synapse_util_caches_response_cache", "", ["name"], registry=CACHE_METRIC_REGISTRY
+)
+
+
+# Register our custom cache metrics registry with the global registry
+REGISTRY.register(CACHE_METRIC_REGISTRY)
 
 
 class EvictionReason(Enum):
@@ -168,9 +205,8 @@ def register_cache(
         add_resizable_cache(cache_name, resize_callback)
 
     metric = CacheMetric(cache, cache_type, cache_name, collect_callback)
-    metric_name = "cache_%s_%s" % (cache_type, cache_name)
     caches_by_name[cache_name] = cache
-    collectors_by_name[metric_name] = metric
+    CACHE_METRIC_REGISTRY.register_hook(metric.collect)
     return metric
 
 
diff --git a/synapse/util/metrics.py b/synapse/util/metrics.py
index bc3b4938ea..9687120ebf 100644
--- a/synapse/util/metrics.py
+++ b/synapse/util/metrics.py
@@ -15,9 +15,9 @@
 import logging
 from functools import wraps
 from types import TracebackType
-from typing import Awaitable, Callable, Optional, Type, TypeVar
+from typing import Awaitable, Callable, Generator, List, Optional, Type, TypeVar
 
-from prometheus_client import Counter
+from prometheus_client import CollectorRegistry, Counter, Metric
 from typing_extensions import Concatenate, ParamSpec, Protocol
 
 from synapse.logging.context import (
@@ -208,3 +208,33 @@ class Measure:
         metrics.real_time_sum += duration
 
         # TODO: Add other in flight metrics.
+
+
+class DynamicCollectorRegistry(CollectorRegistry):
+    """
+    Custom Prometheus Collector registry that calls a hook first, allowing you
+    to update metrics on-demand.
+
+    Don't forget to register this registry with the main registry!
+    """
+
+    def __init__(self) -> None:
+        super().__init__()
+        self._pre_update_hooks: List[Callable[[], None]] = []
+
+    def collect(self) -> Generator[Metric, None, None]:
+        """
+        Collects metrics, calling pre-update hooks first.
+        """
+
+        for pre_update_hook in self._pre_update_hooks:
+            pre_update_hook()
+
+        yield from super().collect()
+
+    def register_hook(self, hook: Callable[[], None]) -> None:
+        """
+        Registers a hook that is called before metric collection.
+        """
+
+        self._pre_update_hooks.append(hook)
-- 
cgit 1.5.1


From 1fa2e58772620199075a36c237dd83cd989c0e91 Mon Sep 17 00:00:00 2001
From: David Robertson <davidr@element.io>
Date: Fri, 7 Oct 2022 13:35:44 +0100
Subject: Catch BrokenPipeError from metrics server, and log as a warning
 (#14072)

---
 changelog.d/14072.misc                |  1 +
 synapse/metrics/_legacy_exposition.py | 18 ++++++++++++------
 2 files changed, 13 insertions(+), 6 deletions(-)
 create mode 100644 changelog.d/14072.misc

(limited to 'synapse/metrics')

diff --git a/changelog.d/14072.misc b/changelog.d/14072.misc
new file mode 100644
index 0000000000..3070c756d5
--- /dev/null
+++ b/changelog.d/14072.misc
@@ -0,0 +1 @@
+Don't create noisy Sentry events when a requester drops connection to the metrics server mid-request.
diff --git a/synapse/metrics/_legacy_exposition.py b/synapse/metrics/_legacy_exposition.py
index 563d8cc2c6..1459f9d224 100644
--- a/synapse/metrics/_legacy_exposition.py
+++ b/synapse/metrics/_legacy_exposition.py
@@ -20,7 +20,7 @@ Due to the renaming of metrics in prometheus_client 0.4.0, this customised
 vendoring of the code will emit both the old versions that Synapse dashboards
 expect, and the newer "best practice" version of the up-to-date official client.
 """
-
+import logging
 import math
 import threading
 from http.server import BaseHTTPRequestHandler, HTTPServer
@@ -34,6 +34,7 @@ from prometheus_client.core import Sample
 from twisted.web.resource import Resource
 from twisted.web.server import Request
 
+logger = logging.getLogger(__name__)
 CONTENT_TYPE_LATEST = "text/plain; version=0.0.4; charset=utf-8"
 
 
@@ -219,11 +220,16 @@ class MetricsHandler(BaseHTTPRequestHandler):
         except Exception:
             self.send_error(500, "error generating metric output")
             raise
-        self.send_response(200)
-        self.send_header("Content-Type", CONTENT_TYPE_LATEST)
-        self.send_header("Content-Length", str(len(output)))
-        self.end_headers()
-        self.wfile.write(output)
+        try:
+            self.send_response(200)
+            self.send_header("Content-Type", CONTENT_TYPE_LATEST)
+            self.send_header("Content-Length", str(len(output)))
+            self.end_headers()
+            self.wfile.write(output)
+        except BrokenPipeError as e:
+            logger.warning(
+                "BrokenPipeError when serving metrics (%s). Did Prometheus restart?", e
+            )
 
     def log_message(self, format: str, *args: Any) -> None:
         """Log nothing."""
-- 
cgit 1.5.1


From 2bb2c32e8ed5642a5bf3ba1e8c49e10cecc88905 Mon Sep 17 00:00:00 2001
From: David Robertson <davidr@element.io>
Date: Mon, 31 Oct 2022 13:02:07 +0000
Subject: Avoid incrementing bg process utime/stime counters by negative
 durations (#14323)

---
 changelog.d/14323.bugfix                         |   1 +
 mypy.ini                                         |   4 +-
 synapse/metrics/background_process_metrics.py    |   6 +-
 tests/metrics/__init__.py                        |   0
 tests/metrics/test_background_process_metrics.py |  19 +++
 tests/metrics/test_metrics.py                    | 206 +++++++++++++++++++++++
 tests/test_metrics.py                            | 200 ----------------------
 7 files changed, 233 insertions(+), 203 deletions(-)
 create mode 100644 changelog.d/14323.bugfix
 create mode 100644 tests/metrics/__init__.py
 create mode 100644 tests/metrics/test_background_process_metrics.py
 create mode 100644 tests/metrics/test_metrics.py
 delete mode 100644 tests/test_metrics.py

(limited to 'synapse/metrics')

diff --git a/changelog.d/14323.bugfix b/changelog.d/14323.bugfix
new file mode 100644
index 0000000000..da39bc020c
--- /dev/null
+++ b/changelog.d/14323.bugfix
@@ -0,0 +1 @@
+Fix a bug introduced in Synapse 0.34.0rc2 where logs could include error spam when background processes are measured as taking a negative amount of time.
diff --git a/mypy.ini b/mypy.ini
index 34b4523e00..8f1141a239 100644
--- a/mypy.ini
+++ b/mypy.ini
@@ -56,7 +56,6 @@ exclude = (?x)
    |tests/rest/media/v1/test_media_storage.py
    |tests/server.py
    |tests/server_notices/test_resource_limits_server_notices.py
-   |tests/test_metrics.py
    |tests/test_state.py
    |tests/test_terms_auth.py
    |tests/util/caches/test_cached_call.py
@@ -106,6 +105,9 @@ disallow_untyped_defs = False
 [mypy-tests.handlers.test_user_directory]
 disallow_untyped_defs = True
 
+[mypy-tests.metrics.test_background_process_metrics]
+disallow_untyped_defs = True
+
 [mypy-tests.push.test_bulk_push_rule_evaluator]
 disallow_untyped_defs = True
 
diff --git a/synapse/metrics/background_process_metrics.py b/synapse/metrics/background_process_metrics.py
index 7a1516d3a8..9ea4e23b31 100644
--- a/synapse/metrics/background_process_metrics.py
+++ b/synapse/metrics/background_process_metrics.py
@@ -174,8 +174,10 @@ class _BackgroundProcess:
             diff = new_stats - self._reported_stats
         self._reported_stats = new_stats
 
-        _background_process_ru_utime.labels(self.desc).inc(diff.ru_utime)
-        _background_process_ru_stime.labels(self.desc).inc(diff.ru_stime)
+        # For unknown reasons, the difference in times can be negative. See comment in
+        # synapse.http.request_metrics.RequestMetrics.update_metrics.
+        _background_process_ru_utime.labels(self.desc).inc(max(diff.ru_utime, 0))
+        _background_process_ru_stime.labels(self.desc).inc(max(diff.ru_stime, 0))
         _background_process_db_txn_count.labels(self.desc).inc(diff.db_txn_count)
         _background_process_db_txn_duration.labels(self.desc).inc(
             diff.db_txn_duration_sec
diff --git a/tests/metrics/__init__.py b/tests/metrics/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/tests/metrics/test_background_process_metrics.py b/tests/metrics/test_background_process_metrics.py
new file mode 100644
index 0000000000..f0f6cb2912
--- /dev/null
+++ b/tests/metrics/test_background_process_metrics.py
@@ -0,0 +1,19 @@
+from unittest import TestCase as StdlibTestCase
+from unittest.mock import Mock
+
+from synapse.logging.context import ContextResourceUsage, LoggingContext
+from synapse.metrics.background_process_metrics import _BackgroundProcess
+
+
+class TestBackgroundProcessMetrics(StdlibTestCase):
+    def test_update_metrics_with_negative_time_diff(self) -> None:
+        """We should ignore negative reported utime and stime differences"""
+        usage = ContextResourceUsage()
+        usage.ru_stime = usage.ru_utime = -1.0
+
+        mock_logging_context = Mock(spec=LoggingContext)
+        mock_logging_context.get_resource_usage.return_value = usage
+
+        process = _BackgroundProcess("test process", mock_logging_context)
+        # Should not raise
+        process.update_metrics()
diff --git a/tests/metrics/test_metrics.py b/tests/metrics/test_metrics.py
new file mode 100644
index 0000000000..bddc4228bc
--- /dev/null
+++ b/tests/metrics/test_metrics.py
@@ -0,0 +1,206 @@
+# Copyright 2018 New Vector Ltd
+# Copyright 2019 Matrix.org Foundation C.I.C.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing_extensions import Protocol
+
+try:
+    from importlib import metadata
+except ImportError:
+    import importlib_metadata as metadata  # type: ignore[no-redef]
+
+from unittest.mock import patch
+
+from pkg_resources import parse_version
+
+from synapse.app._base import _set_prometheus_client_use_created_metrics
+from synapse.metrics import REGISTRY, InFlightGauge, generate_latest
+from synapse.util.caches.deferred_cache import DeferredCache
+
+from tests import unittest
+
+
+def get_sample_labels_value(sample):
+    """Extract the labels and values of a sample.
+
+    prometheus_client 0.5 changed the sample type to a named tuple with more
+    members than the plain tuple had in 0.4 and earlier. This function can
+    extract the labels and value from the sample for both sample types.
+
+    Args:
+        sample: The sample to get the labels and value from.
+    Returns:
+        A tuple of (labels, value) from the sample.
+    """
+
+    # If the sample has a labels and value attribute, use those.
+    if hasattr(sample, "labels") and hasattr(sample, "value"):
+        return sample.labels, sample.value
+    # Otherwise fall back to treating it as a plain 3 tuple.
+    else:
+        _, labels, value = sample
+        return labels, value
+
+
+class TestMauLimit(unittest.TestCase):
+    def test_basic(self):
+        class MetricEntry(Protocol):
+            foo: int
+            bar: int
+
+        gauge: InFlightGauge[MetricEntry] = InFlightGauge(
+            "test1", "", labels=["test_label"], sub_metrics=["foo", "bar"]
+        )
+
+        def handle1(metrics):
+            metrics.foo += 2
+            metrics.bar = max(metrics.bar, 5)
+
+        def handle2(metrics):
+            metrics.foo += 3
+            metrics.bar = max(metrics.bar, 7)
+
+        gauge.register(("key1",), handle1)
+
+        self.assert_dict(
+            {
+                "test1_total": {("key1",): 1},
+                "test1_foo": {("key1",): 2},
+                "test1_bar": {("key1",): 5},
+            },
+            self.get_metrics_from_gauge(gauge),
+        )
+
+        gauge.unregister(("key1",), handle1)
+
+        self.assert_dict(
+            {
+                "test1_total": {("key1",): 0},
+                "test1_foo": {("key1",): 0},
+                "test1_bar": {("key1",): 0},
+            },
+            self.get_metrics_from_gauge(gauge),
+        )
+
+        gauge.register(("key1",), handle1)
+        gauge.register(("key2",), handle2)
+
+        self.assert_dict(
+            {
+                "test1_total": {("key1",): 1, ("key2",): 1},
+                "test1_foo": {("key1",): 2, ("key2",): 3},
+                "test1_bar": {("key1",): 5, ("key2",): 7},
+            },
+            self.get_metrics_from_gauge(gauge),
+        )
+
+        gauge.unregister(("key2",), handle2)
+        gauge.register(("key1",), handle2)
+
+        self.assert_dict(
+            {
+                "test1_total": {("key1",): 2, ("key2",): 0},
+                "test1_foo": {("key1",): 5, ("key2",): 0},
+                "test1_bar": {("key1",): 7, ("key2",): 0},
+            },
+            self.get_metrics_from_gauge(gauge),
+        )
+
+    def get_metrics_from_gauge(self, gauge):
+        results = {}
+
+        for r in gauge.collect():
+            results[r.name] = {
+                tuple(labels[x] for x in gauge.labels): value
+                for labels, value in map(get_sample_labels_value, r.samples)
+            }
+
+        return results
+
+
+class BuildInfoTests(unittest.TestCase):
+    def test_get_build(self):
+        """
+        The synapse_build_info metric reports the OS version, Python version,
+        and Synapse version.
+        """
+        items = list(
+            filter(
+                lambda x: b"synapse_build_info{" in x,
+                generate_latest(REGISTRY).split(b"\n"),
+            )
+        )
+        self.assertEqual(len(items), 1)
+        self.assertTrue(b"osversion=" in items[0])
+        self.assertTrue(b"pythonversion=" in items[0])
+        self.assertTrue(b"version=" in items[0])
+
+
+class CacheMetricsTests(unittest.HomeserverTestCase):
+    def test_cache_metric(self):
+        """
+        Caches produce metrics reflecting their state when scraped.
+        """
+        CACHE_NAME = "cache_metrics_test_fgjkbdfg"
+        cache: DeferredCache[str, str] = DeferredCache(CACHE_NAME, max_entries=777)
+
+        items = {
+            x.split(b"{")[0].decode("ascii"): x.split(b" ")[1].decode("ascii")
+            for x in filter(
+                lambda x: b"cache_metrics_test_fgjkbdfg" in x,
+                generate_latest(REGISTRY).split(b"\n"),
+            )
+        }
+
+        self.assertEqual(items["synapse_util_caches_cache_size"], "0.0")
+        self.assertEqual(items["synapse_util_caches_cache_max_size"], "777.0")
+
+        cache.prefill("1", "hi")
+
+        items = {
+            x.split(b"{")[0].decode("ascii"): x.split(b" ")[1].decode("ascii")
+            for x in filter(
+                lambda x: b"cache_metrics_test_fgjkbdfg" in x,
+                generate_latest(REGISTRY).split(b"\n"),
+            )
+        }
+
+        self.assertEqual(items["synapse_util_caches_cache_size"], "1.0")
+        self.assertEqual(items["synapse_util_caches_cache_max_size"], "777.0")
+
+
+class PrometheusMetricsHackTestCase(unittest.HomeserverTestCase):
+    if parse_version(metadata.version("prometheus_client")) < parse_version("0.14.0"):
+        skip = "prometheus-client too old"
+
+    def test_created_metrics_disabled(self) -> None:
+        """
+        Tests that a brittle hack, to disable `_created` metrics, works.
+        This involves poking at the internals of prometheus-client.
+        It's not the end of the world if this doesn't work.
+
+        This test gives us a way to notice if prometheus-client changes
+        their internals.
+        """
+        import prometheus_client.metrics
+
+        PRIVATE_FLAG_NAME = "_use_created"
+
+        # By default, the pesky `_created` metrics are enabled.
+        # Check this assumption is still valid.
+        self.assertTrue(getattr(prometheus_client.metrics, PRIVATE_FLAG_NAME))
+
+        with patch("prometheus_client.metrics") as mock:
+            setattr(mock, PRIVATE_FLAG_NAME, True)
+            _set_prometheus_client_use_created_metrics(False)
+            self.assertFalse(getattr(mock, PRIVATE_FLAG_NAME, False))
diff --git a/tests/test_metrics.py b/tests/test_metrics.py
deleted file mode 100644
index 1a70eddc9b..0000000000
--- a/tests/test_metrics.py
+++ /dev/null
@@ -1,200 +0,0 @@
-# Copyright 2018 New Vector Ltd
-# Copyright 2019 Matrix.org Foundation C.I.C.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-try:
-    from importlib import metadata
-except ImportError:
-    import importlib_metadata as metadata  # type: ignore[no-redef]
-
-from unittest.mock import patch
-
-from pkg_resources import parse_version
-
-from synapse.app._base import _set_prometheus_client_use_created_metrics
-from synapse.metrics import REGISTRY, InFlightGauge, generate_latest
-from synapse.util.caches.deferred_cache import DeferredCache
-
-from tests import unittest
-
-
-def get_sample_labels_value(sample):
-    """Extract the labels and values of a sample.
-
-    prometheus_client 0.5 changed the sample type to a named tuple with more
-    members than the plain tuple had in 0.4 and earlier. This function can
-    extract the labels and value from the sample for both sample types.
-
-    Args:
-        sample: The sample to get the labels and value from.
-    Returns:
-        A tuple of (labels, value) from the sample.
-    """
-
-    # If the sample has a labels and value attribute, use those.
-    if hasattr(sample, "labels") and hasattr(sample, "value"):
-        return sample.labels, sample.value
-    # Otherwise fall back to treating it as a plain 3 tuple.
-    else:
-        _, labels, value = sample
-        return labels, value
-
-
-class TestMauLimit(unittest.TestCase):
-    def test_basic(self):
-        gauge = InFlightGauge(
-            "test1", "", labels=["test_label"], sub_metrics=["foo", "bar"]
-        )
-
-        def handle1(metrics):
-            metrics.foo += 2
-            metrics.bar = max(metrics.bar, 5)
-
-        def handle2(metrics):
-            metrics.foo += 3
-            metrics.bar = max(metrics.bar, 7)
-
-        gauge.register(("key1",), handle1)
-
-        self.assert_dict(
-            {
-                "test1_total": {("key1",): 1},
-                "test1_foo": {("key1",): 2},
-                "test1_bar": {("key1",): 5},
-            },
-            self.get_metrics_from_gauge(gauge),
-        )
-
-        gauge.unregister(("key1",), handle1)
-
-        self.assert_dict(
-            {
-                "test1_total": {("key1",): 0},
-                "test1_foo": {("key1",): 0},
-                "test1_bar": {("key1",): 0},
-            },
-            self.get_metrics_from_gauge(gauge),
-        )
-
-        gauge.register(("key1",), handle1)
-        gauge.register(("key2",), handle2)
-
-        self.assert_dict(
-            {
-                "test1_total": {("key1",): 1, ("key2",): 1},
-                "test1_foo": {("key1",): 2, ("key2",): 3},
-                "test1_bar": {("key1",): 5, ("key2",): 7},
-            },
-            self.get_metrics_from_gauge(gauge),
-        )
-
-        gauge.unregister(("key2",), handle2)
-        gauge.register(("key1",), handle2)
-
-        self.assert_dict(
-            {
-                "test1_total": {("key1",): 2, ("key2",): 0},
-                "test1_foo": {("key1",): 5, ("key2",): 0},
-                "test1_bar": {("key1",): 7, ("key2",): 0},
-            },
-            self.get_metrics_from_gauge(gauge),
-        )
-
-    def get_metrics_from_gauge(self, gauge):
-        results = {}
-
-        for r in gauge.collect():
-            results[r.name] = {
-                tuple(labels[x] for x in gauge.labels): value
-                for labels, value in map(get_sample_labels_value, r.samples)
-            }
-
-        return results
-
-
-class BuildInfoTests(unittest.TestCase):
-    def test_get_build(self):
-        """
-        The synapse_build_info metric reports the OS version, Python version,
-        and Synapse version.
-        """
-        items = list(
-            filter(
-                lambda x: b"synapse_build_info{" in x,
-                generate_latest(REGISTRY).split(b"\n"),
-            )
-        )
-        self.assertEqual(len(items), 1)
-        self.assertTrue(b"osversion=" in items[0])
-        self.assertTrue(b"pythonversion=" in items[0])
-        self.assertTrue(b"version=" in items[0])
-
-
-class CacheMetricsTests(unittest.HomeserverTestCase):
-    def test_cache_metric(self):
-        """
-        Caches produce metrics reflecting their state when scraped.
-        """
-        CACHE_NAME = "cache_metrics_test_fgjkbdfg"
-        cache = DeferredCache(CACHE_NAME, max_entries=777)
-
-        items = {
-            x.split(b"{")[0].decode("ascii"): x.split(b" ")[1].decode("ascii")
-            for x in filter(
-                lambda x: b"cache_metrics_test_fgjkbdfg" in x,
-                generate_latest(REGISTRY).split(b"\n"),
-            )
-        }
-
-        self.assertEqual(items["synapse_util_caches_cache_size"], "0.0")
-        self.assertEqual(items["synapse_util_caches_cache_max_size"], "777.0")
-
-        cache.prefill("1", "hi")
-
-        items = {
-            x.split(b"{")[0].decode("ascii"): x.split(b" ")[1].decode("ascii")
-            for x in filter(
-                lambda x: b"cache_metrics_test_fgjkbdfg" in x,
-                generate_latest(REGISTRY).split(b"\n"),
-            )
-        }
-
-        self.assertEqual(items["synapse_util_caches_cache_size"], "1.0")
-        self.assertEqual(items["synapse_util_caches_cache_max_size"], "777.0")
-
-
-class PrometheusMetricsHackTestCase(unittest.HomeserverTestCase):
-    if parse_version(metadata.version("prometheus_client")) < parse_version("0.14.0"):
-        skip = "prometheus-client too old"
-
-    def test_created_metrics_disabled(self) -> None:
-        """
-        Tests that a brittle hack, to disable `_created` metrics, works.
-        This involves poking at the internals of prometheus-client.
-        It's not the end of the world if this doesn't work.
-
-        This test gives us a way to notice if prometheus-client changes
-        their internals.
-        """
-        import prometheus_client.metrics
-
-        PRIVATE_FLAG_NAME = "_use_created"
-
-        # By default, the pesky `_created` metrics are enabled.
-        # Check this assumption is still valid.
-        self.assertTrue(getattr(prometheus_client.metrics, PRIVATE_FLAG_NAME))
-
-        with patch("prometheus_client.metrics") as mock:
-            setattr(mock, PRIVATE_FLAG_NAME, True)
-            _set_prometheus_client_use_created_metrics(False)
-            self.assertFalse(getattr(mock, PRIVATE_FLAG_NAME, False))
-- 
cgit 1.5.1


From 9af2be192a759c22d189b72cc0a7580cd9de8a37 Mon Sep 17 00:00:00 2001
From: reivilibre <oliverw@matrix.org>
Date: Thu, 24 Nov 2022 09:09:17 +0000
Subject: Remove legacy Prometheus metrics names. They were deprecated in
 Synapse v1.69.0 and disabled by default in Synapse v1.71.0. (#14538)

---
 changelog.d/14538.removal                        |   1 +
 docs/upgrade.md                                  |  22 ++
 docs/usage/configuration/config_documentation.md |  25 --
 synapse/app/_base.py                             |  16 +-
 synapse/app/generic_worker.py                    |   1 -
 synapse/app/homeserver.py                        |   1 -
 synapse/config/metrics.py                        |   2 -
 synapse/metrics/__init__.py                      |   7 +-
 synapse/metrics/_legacy_exposition.py            | 288 -----------------------
 synapse/metrics/_twisted_exposition.py           |  38 +++
 tests/storage/test_event_metrics.py              |   7 +-
 11 files changed, 70 insertions(+), 338 deletions(-)
 create mode 100644 changelog.d/14538.removal
 delete mode 100644 synapse/metrics/_legacy_exposition.py
 create mode 100644 synapse/metrics/_twisted_exposition.py

(limited to 'synapse/metrics')

diff --git a/changelog.d/14538.removal b/changelog.d/14538.removal
new file mode 100644
index 0000000000..d2035ce82a
--- /dev/null
+++ b/changelog.d/14538.removal
@@ -0,0 +1 @@
+Remove legacy Prometheus metrics names. They were deprecated in Synapse v1.69.0 and disabled by default in Synapse v1.71.0.
\ No newline at end of file
diff --git a/docs/upgrade.md b/docs/upgrade.md
index 2aa353e496..4fe9e4f02e 100644
--- a/docs/upgrade.md
+++ b/docs/upgrade.md
@@ -88,6 +88,28 @@ process, for example:
     dpkg -i matrix-synapse-py3_1.3.0+stretch1_amd64.deb
     ```
 
+# Upgrading to v1.73.0
+
+## Legacy Prometheus metric names have now been removed
+
+Synapse v1.69.0 included the deprecation of legacy Prometheus metric names
+and offered an option to disable them.
+Synapse v1.71.0 disabled legacy Prometheus metric names by default.
+
+This version, v1.73.0, removes those legacy Prometheus metric names entirely.
+This also means that the `enable_legacy_metrics` configuration option has been
+removed; it will no longer be possible to re-enable the legacy metric names.
+
+If you use metrics and have not yet updated your Grafana dashboard(s),
+Prometheus console(s) or alerting rule(s), please consider doing so when upgrading
+to this version.
+Note that the included Grafana dashboard was updated in v1.72.0 to correct some
+metric names which were missed when legacy metrics were disabled by default.
+
+See [v1.69.0: Deprecation of legacy Prometheus metric names](#deprecation-of-legacy-prometheus-metric-names)
+for more context.
+
+
 # Upgrading to v1.72.0
 
 ## Dropping support for PostgreSQL 10
diff --git a/docs/usage/configuration/config_documentation.md b/docs/usage/configuration/config_documentation.md
index f5937dd902..fae2771fad 100644
--- a/docs/usage/configuration/config_documentation.md
+++ b/docs/usage/configuration/config_documentation.md
@@ -2437,31 +2437,6 @@ Example configuration:
 enable_metrics: true
 ```
 ---
-### `enable_legacy_metrics`
-
-Set to `true` to publish both legacy and non-legacy Prometheus metric names,
-or to `false` to only publish non-legacy Prometheus metric names.
-Defaults to `false`. Has no effect if `enable_metrics` is `false`.
-**In Synapse v1.67.0 up to and including Synapse v1.70.1, this defaulted to `true`.**
-
-Legacy metric names include:
-- metrics containing colons in the name, such as `synapse_util_caches_response_cache:hits`, because colons are supposed to be reserved for user-defined recording rules;
-- counters that don't end with the `_total` suffix, such as `synapse_federation_client_sent_edus`, therefore not adhering to the OpenMetrics standard.
-
-These legacy metric names are unconventional and not compliant with OpenMetrics standards.
-They are included for backwards compatibility.
-
-Example configuration:
-```yaml
-enable_legacy_metrics: false
-```
-
-See https://github.com/matrix-org/synapse/issues/11106 for context.
-
-*Since v1.67.0.*
-
-**Will be removed in v1.73.0.**
----
 ### `sentry`
 
 Use this option to enable sentry integration. Provide the DSN assigned to you by sentry
diff --git a/synapse/app/_base.py b/synapse/app/_base.py
index 41d2732ef9..a5aa2185a2 100644
--- a/synapse/app/_base.py
+++ b/synapse/app/_base.py
@@ -266,26 +266,18 @@ def register_start(
     reactor.callWhenRunning(lambda: defer.ensureDeferred(wrapper()))
 
 
-def listen_metrics(
-    bind_addresses: Iterable[str], port: int, enable_legacy_metric_names: bool
-) -> None:
+def listen_metrics(bind_addresses: Iterable[str], port: int) -> None:
     """
     Start Prometheus metrics server.
     """
     from prometheus_client import start_http_server as start_http_server_prometheus
 
-    from synapse.metrics import (
-        RegistryProxy,
-        start_http_server as start_http_server_legacy,
-    )
+    from synapse.metrics import RegistryProxy
 
     for host in bind_addresses:
         logger.info("Starting metrics listener on %s:%d", host, port)
-        if enable_legacy_metric_names:
-            start_http_server_legacy(port, addr=host, registry=RegistryProxy)
-        else:
-            _set_prometheus_client_use_created_metrics(False)
-            start_http_server_prometheus(port, addr=host, registry=RegistryProxy)
+        _set_prometheus_client_use_created_metrics(False)
+        start_http_server_prometheus(port, addr=host, registry=RegistryProxy)
 
 
 def _set_prometheus_client_use_created_metrics(new_value: bool) -> None:
diff --git a/synapse/app/generic_worker.py b/synapse/app/generic_worker.py
index 74909b7d4a..46dc731696 100644
--- a/synapse/app/generic_worker.py
+++ b/synapse/app/generic_worker.py
@@ -320,7 +320,6 @@ class GenericWorkerServer(HomeServer):
                     _base.listen_metrics(
                         listener.bind_addresses,
                         listener.port,
-                        enable_legacy_metric_names=self.config.metrics.enable_legacy_metrics,
                     )
             else:
                 logger.warning("Unsupported listener type: %s", listener.type)
diff --git a/synapse/app/homeserver.py b/synapse/app/homeserver.py
index 4f4fee4782..b9be558c7e 100644
--- a/synapse/app/homeserver.py
+++ b/synapse/app/homeserver.py
@@ -265,7 +265,6 @@ class SynapseHomeServer(HomeServer):
                     _base.listen_metrics(
                         listener.bind_addresses,
                         listener.port,
-                        enable_legacy_metric_names=self.config.metrics.enable_legacy_metrics,
                     )
             else:
                 # this shouldn't happen, as the listener type should have been checked
diff --git a/synapse/config/metrics.py b/synapse/config/metrics.py
index 6034a0346e..8c1c9bd12d 100644
--- a/synapse/config/metrics.py
+++ b/synapse/config/metrics.py
@@ -43,8 +43,6 @@ class MetricsConfig(Config):
     def read_config(self, config: JsonDict, **kwargs: Any) -> None:
         self.enable_metrics = config.get("enable_metrics", False)
 
-        self.enable_legacy_metrics = config.get("enable_legacy_metrics", False)
-
         self.report_stats = config.get("report_stats", None)
         self.report_stats_endpoint = config.get(
             "report_stats_endpoint", "https://matrix.org/report-usage-stats/push"
diff --git a/synapse/metrics/__init__.py b/synapse/metrics/__init__.py
index c3d3daf877..b01372565d 100644
--- a/synapse/metrics/__init__.py
+++ b/synapse/metrics/__init__.py
@@ -47,11 +47,7 @@ from twisted.python.threadpool import ThreadPool
 # This module is imported for its side effects; flake8 needn't warn that it's unused.
 import synapse.metrics._reactor_metrics  # noqa: F401
 from synapse.metrics._gc import MIN_TIME_BETWEEN_GCS, install_gc_manager
-from synapse.metrics._legacy_exposition import (
-    MetricsResource,
-    generate_latest,
-    start_http_server,
-)
+from synapse.metrics._twisted_exposition import MetricsResource, generate_latest
 from synapse.metrics._types import Collector
 from synapse.util import SYNAPSE_VERSION
 
@@ -474,7 +470,6 @@ __all__ = [
     "Collector",
     "MetricsResource",
     "generate_latest",
-    "start_http_server",
     "LaterGauge",
     "InFlightGauge",
     "GaugeBucketCollector",
diff --git a/synapse/metrics/_legacy_exposition.py b/synapse/metrics/_legacy_exposition.py
deleted file mode 100644
index 1459f9d224..0000000000
--- a/synapse/metrics/_legacy_exposition.py
+++ /dev/null
@@ -1,288 +0,0 @@
-# Copyright 2015-2019 Prometheus Python Client Developers
-# Copyright 2019 Matrix.org Foundation C.I.C.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-"""
-This code is based off `prometheus_client/exposition.py` from version 0.7.1.
-
-Due to the renaming of metrics in prometheus_client 0.4.0, this customised
-vendoring of the code will emit both the old versions that Synapse dashboards
-expect, and the newer "best practice" version of the up-to-date official client.
-"""
-import logging
-import math
-import threading
-from http.server import BaseHTTPRequestHandler, HTTPServer
-from socketserver import ThreadingMixIn
-from typing import Any, Dict, List, Type, Union
-from urllib.parse import parse_qs, urlparse
-
-from prometheus_client import REGISTRY, CollectorRegistry
-from prometheus_client.core import Sample
-
-from twisted.web.resource import Resource
-from twisted.web.server import Request
-
-logger = logging.getLogger(__name__)
-CONTENT_TYPE_LATEST = "text/plain; version=0.0.4; charset=utf-8"
-
-
-def floatToGoString(d: Union[int, float]) -> str:
-    d = float(d)
-    if d == math.inf:
-        return "+Inf"
-    elif d == -math.inf:
-        return "-Inf"
-    elif math.isnan(d):
-        return "NaN"
-    else:
-        s = repr(d)
-        dot = s.find(".")
-        # Go switches to exponents sooner than Python.
-        # We only need to care about positive values for le/quantile.
-        if d > 0 and dot > 6:
-            mantissa = f"{s[0]}.{s[1:dot]}{s[dot + 1 :]}".rstrip("0.")
-            return f"{mantissa}e+0{dot - 1}"
-        return s
-
-
-def sample_line(line: Sample, name: str) -> str:
-    if line.labels:
-        labelstr = "{{{0}}}".format(
-            ",".join(
-                [
-                    '{}="{}"'.format(
-                        k,
-                        v.replace("\\", r"\\").replace("\n", r"\n").replace('"', r"\""),
-                    )
-                    for k, v in sorted(line.labels.items())
-                ]
-            )
-        )
-    else:
-        labelstr = ""
-    timestamp = ""
-    if line.timestamp is not None:
-        # Convert to milliseconds.
-        timestamp = f" {int(float(line.timestamp) * 1000):d}"
-    return "{}{} {}{}\n".format(name, labelstr, floatToGoString(line.value), timestamp)
-
-
-# Mapping from new metric names to legacy metric names.
-# We translate these back to their old names when exposing them through our
-# legacy vendored exporter.
-# Only this legacy exposition module applies these name changes.
-LEGACY_METRIC_NAMES = {
-    "synapse_util_caches_cache_hits": "synapse_util_caches_cache:hits",
-    "synapse_util_caches_cache_size": "synapse_util_caches_cache:size",
-    "synapse_util_caches_cache_evicted_size": "synapse_util_caches_cache:evicted_size",
-    "synapse_util_caches_cache": "synapse_util_caches_cache:total",
-    "synapse_util_caches_response_cache_size": "synapse_util_caches_response_cache:size",
-    "synapse_util_caches_response_cache_hits": "synapse_util_caches_response_cache:hits",
-    "synapse_util_caches_response_cache_evicted_size": "synapse_util_caches_response_cache:evicted_size",
-    "synapse_util_caches_response_cache": "synapse_util_caches_response_cache:total",
-    "synapse_federation_client_sent_pdu_destinations": "synapse_federation_client_sent_pdu_destinations:total",
-    "synapse_federation_client_sent_pdu_destinations_count": "synapse_federation_client_sent_pdu_destinations:count",
-    "synapse_admin_mau_current": "synapse_admin_mau:current",
-    "synapse_admin_mau_max": "synapse_admin_mau:max",
-    "synapse_admin_mau_registered_reserved_users": "synapse_admin_mau:registered_reserved_users",
-}
-
-
-def generate_latest(registry: CollectorRegistry, emit_help: bool = False) -> bytes:
-    """
-    Generate metrics in legacy format. Modern metrics are generated directly
-    by prometheus-client.
-    """
-
-    output = []
-
-    for metric in registry.collect():
-        if not metric.samples:
-            # No samples, don't bother.
-            continue
-
-        # Translate to legacy metric name if it has one.
-        mname = LEGACY_METRIC_NAMES.get(metric.name, metric.name)
-        mnewname = metric.name
-        mtype = metric.type
-
-        # OpenMetrics -> Prometheus
-        if mtype == "counter":
-            mnewname = mnewname + "_total"
-        elif mtype == "info":
-            mtype = "gauge"
-            mnewname = mnewname + "_info"
-        elif mtype == "stateset":
-            mtype = "gauge"
-        elif mtype == "gaugehistogram":
-            mtype = "histogram"
-        elif mtype == "unknown":
-            mtype = "untyped"
-
-        # Output in the old format for compatibility.
-        if emit_help:
-            output.append(
-                "# HELP {} {}\n".format(
-                    mname,
-                    metric.documentation.replace("\\", r"\\").replace("\n", r"\n"),
-                )
-            )
-        output.append(f"# TYPE {mname} {mtype}\n")
-
-        om_samples: Dict[str, List[str]] = {}
-        for s in metric.samples:
-            for suffix in ["_created", "_gsum", "_gcount"]:
-                if s.name == mname + suffix:
-                    # OpenMetrics specific sample, put in a gauge at the end.
-                    # (these come from gaugehistograms which don't get renamed,
-                    # so no need to faff with mnewname)
-                    om_samples.setdefault(suffix, []).append(sample_line(s, s.name))
-                    break
-            else:
-                newname = s.name.replace(mnewname, mname)
-                if ":" in newname and newname.endswith("_total"):
-                    newname = newname[: -len("_total")]
-                output.append(sample_line(s, newname))
-
-        for suffix, lines in sorted(om_samples.items()):
-            if emit_help:
-                output.append(
-                    "# HELP {}{} {}\n".format(
-                        mname,
-                        suffix,
-                        metric.documentation.replace("\\", r"\\").replace("\n", r"\n"),
-                    )
-                )
-            output.append(f"# TYPE {mname}{suffix} gauge\n")
-            output.extend(lines)
-
-        # Get rid of the weird colon things while we're at it
-        if mtype == "counter":
-            mnewname = mnewname.replace(":total", "")
-        mnewname = mnewname.replace(":", "_")
-
-        if mname == mnewname:
-            continue
-
-        # Also output in the new format, if it's different.
-        if emit_help:
-            output.append(
-                "# HELP {} {}\n".format(
-                    mnewname,
-                    metric.documentation.replace("\\", r"\\").replace("\n", r"\n"),
-                )
-            )
-        output.append(f"# TYPE {mnewname} {mtype}\n")
-
-        for s in metric.samples:
-            # Get rid of the OpenMetrics specific samples (we should already have
-            # dealt with them above anyway.)
-            for suffix in ["_created", "_gsum", "_gcount"]:
-                if s.name == mname + suffix:
-                    break
-            else:
-                sample_name = LEGACY_METRIC_NAMES.get(s.name, s.name)
-                output.append(
-                    sample_line(s, sample_name.replace(":total", "").replace(":", "_"))
-                )
-
-    return "".join(output).encode("utf-8")
-
-
-class MetricsHandler(BaseHTTPRequestHandler):
-    """HTTP handler that gives metrics from ``REGISTRY``."""
-
-    registry = REGISTRY
-
-    def do_GET(self) -> None:
-        registry = self.registry
-        params = parse_qs(urlparse(self.path).query)
-
-        if "help" in params:
-            emit_help = True
-        else:
-            emit_help = False
-
-        try:
-            output = generate_latest(registry, emit_help=emit_help)
-        except Exception:
-            self.send_error(500, "error generating metric output")
-            raise
-        try:
-            self.send_response(200)
-            self.send_header("Content-Type", CONTENT_TYPE_LATEST)
-            self.send_header("Content-Length", str(len(output)))
-            self.end_headers()
-            self.wfile.write(output)
-        except BrokenPipeError as e:
-            logger.warning(
-                "BrokenPipeError when serving metrics (%s). Did Prometheus restart?", e
-            )
-
-    def log_message(self, format: str, *args: Any) -> None:
-        """Log nothing."""
-
-    @classmethod
-    def factory(cls, registry: CollectorRegistry) -> Type:
-        """Returns a dynamic MetricsHandler class tied
-        to the passed registry.
-        """
-        # This implementation relies on MetricsHandler.registry
-        #  (defined above and defaulted to REGISTRY).
-
-        # As we have unicode_literals, we need to create a str()
-        #  object for type().
-        cls_name = str(cls.__name__)
-        MyMetricsHandler = type(cls_name, (cls, object), {"registry": registry})
-        return MyMetricsHandler
-
-
-class _ThreadingSimpleServer(ThreadingMixIn, HTTPServer):
-    """Thread per request HTTP server."""
-
-    # Make worker threads "fire and forget". Beginning with Python 3.7 this
-    # prevents a memory leak because ``ThreadingMixIn`` starts to gather all
-    # non-daemon threads in a list in order to join on them at server close.
-    # Enabling daemon threads virtually makes ``_ThreadingSimpleServer`` the
-    # same as Python 3.7's ``ThreadingHTTPServer``.
-    daemon_threads = True
-
-
-def start_http_server(
-    port: int, addr: str = "", registry: CollectorRegistry = REGISTRY
-) -> None:
-    """Starts an HTTP server for prometheus metrics as a daemon thread"""
-    CustomMetricsHandler = MetricsHandler.factory(registry)
-    httpd = _ThreadingSimpleServer((addr, port), CustomMetricsHandler)
-    t = threading.Thread(target=httpd.serve_forever)
-    t.daemon = True
-    t.start()
-
-
-class MetricsResource(Resource):
-    """
-    Twisted ``Resource`` that serves prometheus metrics.
-    """
-
-    isLeaf = True
-
-    def __init__(self, registry: CollectorRegistry = REGISTRY):
-        self.registry = registry
-
-    def render_GET(self, request: Request) -> bytes:
-        request.setHeader(b"Content-Type", CONTENT_TYPE_LATEST.encode("ascii"))
-        response = generate_latest(self.registry)
-        request.setHeader(b"Content-Length", str(len(response)))
-        return response
diff --git a/synapse/metrics/_twisted_exposition.py b/synapse/metrics/_twisted_exposition.py
new file mode 100644
index 0000000000..0abcd14953
--- /dev/null
+++ b/synapse/metrics/_twisted_exposition.py
@@ -0,0 +1,38 @@
+# Copyright 2015-2019 Prometheus Python Client Developers
+# Copyright 2019 Matrix.org Foundation C.I.C.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from prometheus_client import REGISTRY, CollectorRegistry, generate_latest
+
+from twisted.web.resource import Resource
+from twisted.web.server import Request
+
+CONTENT_TYPE_LATEST = "text/plain; version=0.0.4; charset=utf-8"
+
+
+class MetricsResource(Resource):
+    """
+    Twisted ``Resource`` that serves prometheus metrics.
+    """
+
+    isLeaf = True
+
+    def __init__(self, registry: CollectorRegistry = REGISTRY):
+        self.registry = registry
+
+    def render_GET(self, request: Request) -> bytes:
+        request.setHeader(b"Content-Type", CONTENT_TYPE_LATEST.encode("ascii"))
+        response = generate_latest(self.registry)
+        request.setHeader(b"Content-Length", str(len(response)))
+        return response
diff --git a/tests/storage/test_event_metrics.py b/tests/storage/test_event_metrics.py
index 088fbb247b..6f1135eef4 100644
--- a/tests/storage/test_event_metrics.py
+++ b/tests/storage/test_event_metrics.py
@@ -11,8 +11,9 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+from prometheus_client import generate_latest
 
-from synapse.metrics import REGISTRY, generate_latest
+from synapse.metrics import REGISTRY
 from synapse.types import UserID, create_requester
 
 from tests.unittest import HomeserverTestCase
@@ -53,8 +54,8 @@ class ExtremStatisticsTestCase(HomeserverTestCase):
 
         items = list(
             filter(
-                lambda x: b"synapse_forward_extremities_" in x,
-                generate_latest(REGISTRY, emit_help=False).split(b"\n"),
+                lambda x: b"synapse_forward_extremities_" in x and b"# HELP" not in x,
+                generate_latest(REGISTRY).split(b"\n"),
             )
         )
 
-- 
cgit 1.5.1