diff --git a/changelog.d/13540.misc b/changelog.d/13540.misc
new file mode 100644
index 0000000000..07ace50b12
--- /dev/null
+++ b/changelog.d/13540.misc
@@ -0,0 +1 @@
+Add experimental configuration option to allow disabling legacy Prometheus metric names.
\ No newline at end of file
diff --git a/synapse/app/_base.py b/synapse/app/_base.py
index 923891ae0d..4742435d3b 100644
--- a/synapse/app/_base.py
+++ b/synapse/app/_base.py
@@ -266,15 +266,48 @@ def register_start(
reactor.callWhenRunning(lambda: defer.ensureDeferred(wrapper()))
-def listen_metrics(bind_addresses: Iterable[str], port: int) -> None:
+def listen_metrics(
+ bind_addresses: Iterable[str], port: int, enable_legacy_metric_names: bool
+) -> None:
"""
Start Prometheus metrics server.
"""
- from synapse.metrics import RegistryProxy, start_http_server
+ from prometheus_client import start_http_server as start_http_server_prometheus
+
+ from synapse.metrics import (
+ RegistryProxy,
+ start_http_server as start_http_server_legacy,
+ )
for host in bind_addresses:
logger.info("Starting metrics listener on %s:%d", host, port)
- start_http_server(port, addr=host, registry=RegistryProxy)
+ if enable_legacy_metric_names:
+ start_http_server_legacy(port, addr=host, registry=RegistryProxy)
+ else:
+ _set_prometheus_client_use_created_metrics(False)
+ start_http_server_prometheus(port, addr=host, registry=RegistryProxy)
+
+
+def _set_prometheus_client_use_created_metrics(new_value: bool) -> None:
+ """
+ Sets whether prometheus_client should expose `_created`-suffixed metrics for
+ all gauges, histograms and summaries.
+ There is no programmatic way to disable this without poking at internals;
+ the proper way is to use an environment variable which prometheus_client
+ loads at import time.
+
+ The motivation for disabling these `_created` metrics is that they're
+ a waste of space as they're not useful but they take up space in Prometheus.
+ """
+
+ import prometheus_client.metrics
+
+ if hasattr(prometheus_client.metrics, "_use_created"):
+ prometheus_client.metrics._use_created = new_value
+ else:
+ logger.error(
+ "Can't disable `_created` metrics in prometheus_client (brittle hack broken?)"
+ )
def listen_manhole(
diff --git a/synapse/app/generic_worker.py b/synapse/app/generic_worker.py
index 30e21d9707..5e3825fca6 100644
--- a/synapse/app/generic_worker.py
+++ b/synapse/app/generic_worker.py
@@ -412,7 +412,11 @@ class GenericWorkerServer(HomeServer):
"enable_metrics is not True!"
)
else:
- _base.listen_metrics(listener.bind_addresses, listener.port)
+ _base.listen_metrics(
+ listener.bind_addresses,
+ listener.port,
+ enable_legacy_metric_names=self.config.metrics.enable_legacy_metrics,
+ )
else:
logger.warning("Unsupported listener type: %s", listener.type)
diff --git a/synapse/app/homeserver.py b/synapse/app/homeserver.py
index 68993d91a9..e57a926032 100644
--- a/synapse/app/homeserver.py
+++ b/synapse/app/homeserver.py
@@ -307,7 +307,11 @@ class SynapseHomeServer(HomeServer):
"enable_metrics is not True!"
)
else:
- _base.listen_metrics(listener.bind_addresses, listener.port)
+ _base.listen_metrics(
+ listener.bind_addresses,
+ listener.port,
+ enable_legacy_metric_names=self.config.metrics.enable_legacy_metrics,
+ )
else:
# this shouldn't happen, as the listener type should have been checked
# during parsing
diff --git a/synapse/config/metrics.py b/synapse/config/metrics.py
index 3b42be5b5b..f3134834e5 100644
--- a/synapse/config/metrics.py
+++ b/synapse/config/metrics.py
@@ -42,6 +42,35 @@ class MetricsConfig(Config):
def read_config(self, config: JsonDict, **kwargs: Any) -> None:
self.enable_metrics = config.get("enable_metrics", False)
+
+ """
+ ### `enable_legacy_metrics` (experimental)
+
+ **Experimental: this option may be removed or have its behaviour
+ changed at any time, with no notice.**
+
+ Set to `true` to publish both legacy and non-legacy Prometheus metric names,
+ or to `false` to only publish non-legacy Prometheus metric names.
+ Defaults to `true`. Has no effect if `enable_metrics` is `false`.
+
+ Legacy metric names include:
+ - metrics containing colons in the name, such as `synapse_util_caches_response_cache:hits`, because colons are supposed to be reserved for user-defined recording rules;
+ - counters that don't end with the `_total` suffix, such as `synapse_federation_client_sent_edus`, therefore not adhering to the OpenMetrics standard.
+
+ These legacy metric names are unconventional and not compliant with OpenMetrics standards.
+ They are included for backwards compatibility.
+
+ Example configuration:
+ ```yaml
+ enable_legacy_metrics: false
+ ```
+
+ See https://github.com/matrix-org/synapse/issues/11106 for context.
+
+ *Since v1.67.0.*
+ """
+ self.enable_legacy_metrics = config.get("enable_legacy_metrics", True)
+
self.report_stats = config.get("report_stats", None)
self.report_stats_endpoint = config.get(
"report_stats_endpoint", "https://matrix.org/report-usage-stats/push"
diff --git a/synapse/metrics/__init__.py b/synapse/metrics/__init__.py
index 496fce2ecc..c3d3daf877 100644
--- a/synapse/metrics/__init__.py
+++ b/synapse/metrics/__init__.py
@@ -46,12 +46,12 @@ from twisted.python.threadpool import ThreadPool
# This module is imported for its side effects; flake8 needn't warn that it's unused.
import synapse.metrics._reactor_metrics # noqa: F401
-from synapse.metrics._exposition import (
+from synapse.metrics._gc import MIN_TIME_BETWEEN_GCS, install_gc_manager
+from synapse.metrics._legacy_exposition import (
MetricsResource,
generate_latest,
start_http_server,
)
-from synapse.metrics._gc import MIN_TIME_BETWEEN_GCS, install_gc_manager
from synapse.metrics._types import Collector
from synapse.util import SYNAPSE_VERSION
diff --git a/synapse/metrics/_exposition.py b/synapse/metrics/_legacy_exposition.py
index 353d0a63b6..ff640a49af 100644
--- a/synapse/metrics/_exposition.py
+++ b/synapse/metrics/_legacy_exposition.py
@@ -80,7 +80,27 @@ def sample_line(line: Sample, name: str) -> str:
return "{}{} {}{}\n".format(name, labelstr, floatToGoString(line.value), timestamp)
+# Mapping from new metric names to legacy metric names.
+# We translate these back to their old names when exposing them through our
+# legacy vendored exporter.
+# Only this legacy exposition module applies these name changes.
+LEGACY_METRIC_NAMES = {
+ "synapse_util_caches_cache_hits": "synapse_util_caches_cache:hits",
+ "synapse_util_caches_cache_size": "synapse_util_caches_cache:size",
+ "synapse_util_caches_cache_evicted_size": "synapse_util_caches_cache:evicted_size",
+ "synapse_util_caches_cache_total": "synapse_util_caches_cache:total",
+ "synapse_util_caches_response_cache_size": "synapse_util_caches_response_cache:size",
+ "synapse_util_caches_response_cache_hits": "synapse_util_caches_response_cache:hits",
+ "synapse_util_caches_response_cache_evicted_size": "synapse_util_caches_response_cache:evicted_size",
+ "synapse_util_caches_response_cache_total": "synapse_util_caches_response_cache:total",
+}
+
+
def generate_latest(registry: CollectorRegistry, emit_help: bool = False) -> bytes:
+ """
+ Generate metrics in legacy format. Modern metrics are generated directly
+ by prometheus-client.
+ """
# Trigger the cache metrics to be rescraped, which updates the common
# metrics but do not produce metrics themselves
@@ -94,7 +114,8 @@ def generate_latest(registry: CollectorRegistry, emit_help: bool = False) -> byt
# No samples, don't bother.
continue
- mname = metric.name
+ # Translate to legacy metric name if it has one.
+ mname = LEGACY_METRIC_NAMES.get(metric.name, metric.name)
mnewname = metric.name
mtype = metric.type
@@ -124,7 +145,7 @@ def generate_latest(registry: CollectorRegistry, emit_help: bool = False) -> byt
om_samples: Dict[str, List[str]] = {}
for s in metric.samples:
for suffix in ["_created", "_gsum", "_gcount"]:
- if s.name == metric.name + suffix:
+ if s.name == mname + suffix:
# OpenMetrics specific sample, put in a gauge at the end.
# (these come from gaugehistograms which don't get renamed,
# so no need to faff with mnewname)
@@ -140,12 +161,12 @@ def generate_latest(registry: CollectorRegistry, emit_help: bool = False) -> byt
if emit_help:
output.append(
"# HELP {}{} {}\n".format(
- metric.name,
+ mname,
suffix,
metric.documentation.replace("\\", r"\\").replace("\n", r"\n"),
)
)
- output.append(f"# TYPE {metric.name}{suffix} gauge\n")
+ output.append(f"# TYPE {mname}{suffix} gauge\n")
output.extend(lines)
# Get rid of the weird colon things while we're at it
@@ -170,11 +191,12 @@ def generate_latest(registry: CollectorRegistry, emit_help: bool = False) -> byt
# Get rid of the OpenMetrics specific samples (we should already have
# dealt with them above anyway.)
for suffix in ["_created", "_gsum", "_gcount"]:
- if s.name == metric.name + suffix:
+ if s.name == mname + suffix:
break
else:
+ sample_name = LEGACY_METRIC_NAMES.get(s.name, s.name)
output.append(
- sample_line(s, s.name.replace(":total", "").replace(":", "_"))
+ sample_line(s, sample_name.replace(":total", "").replace(":", "_"))
)
return "".join(output).encode("utf-8")
diff --git a/synapse/util/caches/__init__.py b/synapse/util/caches/__init__.py
index 42f6abb5e1..bdf9b0dc8c 100644
--- a/synapse/util/caches/__init__.py
+++ b/synapse/util/caches/__init__.py
@@ -34,10 +34,10 @@ TRACK_MEMORY_USAGE = False
caches_by_name: Dict[str, Sized] = {}
collectors_by_name: Dict[str, "CacheMetric"] = {}
-cache_size = Gauge("synapse_util_caches_cache:size", "", ["name"])
-cache_hits = Gauge("synapse_util_caches_cache:hits", "", ["name"])
-cache_evicted = Gauge("synapse_util_caches_cache:evicted_size", "", ["name", "reason"])
-cache_total = Gauge("synapse_util_caches_cache:total", "", ["name"])
+cache_size = Gauge("synapse_util_caches_cache_size", "", ["name"])
+cache_hits = Gauge("synapse_util_caches_cache_hits", "", ["name"])
+cache_evicted = Gauge("synapse_util_caches_cache_evicted_size", "", ["name", "reason"])
+cache_total = Gauge("synapse_util_caches_cache_total", "", ["name"])
cache_max_size = Gauge("synapse_util_caches_cache_max_size", "", ["name"])
cache_memory_usage = Gauge(
"synapse_util_caches_cache_size_bytes",
@@ -45,12 +45,12 @@ cache_memory_usage = Gauge(
["name"],
)
-response_cache_size = Gauge("synapse_util_caches_response_cache:size", "", ["name"])
-response_cache_hits = Gauge("synapse_util_caches_response_cache:hits", "", ["name"])
+response_cache_size = Gauge("synapse_util_caches_response_cache_size", "", ["name"])
+response_cache_hits = Gauge("synapse_util_caches_response_cache_hits", "", ["name"])
response_cache_evicted = Gauge(
- "synapse_util_caches_response_cache:evicted_size", "", ["name", "reason"]
+ "synapse_util_caches_response_cache_evicted_size", "", ["name", "reason"]
)
-response_cache_total = Gauge("synapse_util_caches_response_cache:total", "", ["name"])
+response_cache_total = Gauge("synapse_util_caches_response_cache_total", "", ["name"])
class EvictionReason(Enum):
diff --git a/tests/test_metrics.py b/tests/test_metrics.py
index b4574b2ffe..1a70eddc9b 100644
--- a/tests/test_metrics.py
+++ b/tests/test_metrics.py
@@ -12,7 +12,16 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
+try:
+ from importlib import metadata
+except ImportError:
+ import importlib_metadata as metadata # type: ignore[no-redef]
+from unittest.mock import patch
+
+from pkg_resources import parse_version
+
+from synapse.app._base import _set_prometheus_client_use_created_metrics
from synapse.metrics import REGISTRY, InFlightGauge, generate_latest
from synapse.util.caches.deferred_cache import DeferredCache
@@ -162,3 +171,30 @@ class CacheMetricsTests(unittest.HomeserverTestCase):
self.assertEqual(items["synapse_util_caches_cache_size"], "1.0")
self.assertEqual(items["synapse_util_caches_cache_max_size"], "777.0")
+
+
+class PrometheusMetricsHackTestCase(unittest.HomeserverTestCase):
+ if parse_version(metadata.version("prometheus_client")) < parse_version("0.14.0"):
+ skip = "prometheus-client too old"
+
+ def test_created_metrics_disabled(self) -> None:
+ """
+ Tests that a brittle hack, to disable `_created` metrics, works.
+ This involves poking at the internals of prometheus-client.
+ It's not the end of the world if this doesn't work.
+
+ This test gives us a way to notice if prometheus-client changes
+ their internals.
+ """
+ import prometheus_client.metrics
+
+ PRIVATE_FLAG_NAME = "_use_created"
+
+ # By default, the pesky `_created` metrics are enabled.
+ # Check this assumption is still valid.
+ self.assertTrue(getattr(prometheus_client.metrics, PRIVATE_FLAG_NAME))
+
+ with patch("prometheus_client.metrics") as mock:
+ setattr(mock, PRIVATE_FLAG_NAME, True)
+ _set_prometheus_client_use_created_metrics(False)
+ self.assertFalse(getattr(mock, PRIVATE_FLAG_NAME, False))
|