From 1eea73b4133da3afc2361592f7b2fa4a4125249d Mon Sep 17 00:00:00 2001
From: Eric Eastwood <erice@element.io>
Date: Tue, 30 Aug 2022 06:08:29 -0500
Subject: Fix rate limit metrics registering twice and misreporting (#13649)

* Fix rate limit metrics registering twice and misreporting

Fix https://github.com/matrix-org/synapse/issues/13641

* Fix lints

* Add changelog

* Document `metrics_name=None`.
---
 changelog.d/13649.bugfix       |   1 +
 synapse/server.py              |   4 +-
 synapse/util/ratelimitutils.py | 155 ++++++++++++++++++++++++++++++++---------
 3 files changed, 128 insertions(+), 32 deletions(-)
 create mode 100644 changelog.d/13649.bugfix

diff --git a/changelog.d/13649.bugfix b/changelog.d/13649.bugfix
new file mode 100644
index 0000000000..e6513a585a
--- /dev/null
+++ b/changelog.d/13649.bugfix
@@ -0,0 +1 @@
+Fix rate limit gauge metrics registering twice and misreporting (`synapse_rate_limit_sleep_affected_hosts`, `synapse_rate_limit_reject_affected_hosts`).
diff --git a/synapse/server.py b/synapse/server.py
index 181984a1a4..c2e55bf0b1 100644
--- a/synapse/server.py
+++ b/synapse/server.py
@@ -756,7 +756,9 @@ class HomeServer(metaclass=abc.ABCMeta):
     @cache_in_self
     def get_federation_ratelimiter(self) -> FederationRateLimiter:
         return FederationRateLimiter(
-            self.get_clock(), config=self.config.ratelimiting.rc_federation
+            self.get_clock(),
+            config=self.config.ratelimiting.rc_federation,
+            metrics_name="federation_servlets",
         )
 
     @cache_in_self
diff --git a/synapse/util/ratelimitutils.py b/synapse/util/ratelimitutils.py
index f678b52cb4..9f64fed0d7 100644
--- a/synapse/util/ratelimitutils.py
+++ b/synapse/util/ratelimitutils.py
@@ -15,10 +15,23 @@
 import collections
 import contextlib
 import logging
+import threading
 import typing
-from typing import Any, DefaultDict, Iterator, List, Set
+from typing import (
+    Any,
+    Callable,
+    DefaultDict,
+    Dict,
+    Iterator,
+    List,
+    Mapping,
+    Optional,
+    Set,
+    Tuple,
+)
 
 from prometheus_client.core import Counter
+from typing_extensions import ContextManager
 
 from twisted.internet import defer
 
@@ -40,12 +53,20 @@ logger = logging.getLogger(__name__)
 
 
 # Track how much the ratelimiter is affecting requests
-rate_limit_sleep_counter = Counter("synapse_rate_limit_sleep", "")
-rate_limit_reject_counter = Counter("synapse_rate_limit_reject", "")
+rate_limit_sleep_counter = Counter(
+    "synapse_rate_limit_sleep",
+    "Number of requests slept by the rate limiter",
+    ["rate_limiter_name"],
+)
+rate_limit_reject_counter = Counter(
+    "synapse_rate_limit_reject",
+    "Number of requests rejected by the rate limiter",
+    ["rate_limiter_name"],
+)
 queue_wait_timer = Histogram(
     "synapse_rate_limit_queue_wait_time_seconds",
-    "sec",
-    [],
+    "Amount of time spent waiting for the rate limiter to let our request through.",
+    ["rate_limiter_name"],
     buckets=(
         0.005,
         0.01,
@@ -65,35 +86,92 @@ queue_wait_timer = Histogram(
 )
 
 
+_rate_limiter_instances: Set["FederationRateLimiter"] = set()
+# Protects the _rate_limiter_instances set from concurrent access
+_rate_limiter_instances_lock = threading.Lock()
+
+
+def _get_counts_from_rate_limiter_instance(
+    count_func: Callable[["FederationRateLimiter"], int]
+) -> Mapping[Tuple[str, ...], int]:
+    """Returns a count of something (slept/rejected hosts) by (metrics_name)"""
+    # Cast to a list to prevent it changing while the Prometheus
+    # thread is collecting metrics
+    with _rate_limiter_instances_lock:
+        rate_limiter_instances = list(_rate_limiter_instances)
+
+    # Map from (metrics_name,) -> int, the number of something like slept hosts
+    # or rejected hosts. The key type is Tuple[str], but we leave the length
+    # unspecified for compatability with LaterGauge's annotations.
+    counts: Dict[Tuple[str, ...], int] = {}
+    for rate_limiter_instance in rate_limiter_instances:
+        # Only track metrics if they provided a `metrics_name` to
+        # differentiate this instance of the rate limiter.
+        if rate_limiter_instance.metrics_name:
+            key = (rate_limiter_instance.metrics_name,)
+            counts[key] = count_func(rate_limiter_instance)
+
+    return counts
+
+
+# We track the number of affected hosts per time-period so we can
+# differentiate one really noisy homeserver from a general
+# ratelimit tuning problem across the federation.
+LaterGauge(
+    "synapse_rate_limit_sleep_affected_hosts",
+    "Number of hosts that had requests put to sleep",
+    ["rate_limiter_name"],
+    lambda: _get_counts_from_rate_limiter_instance(
+        lambda rate_limiter_instance: sum(
+            ratelimiter.should_sleep()
+            for ratelimiter in rate_limiter_instance.ratelimiters.values()
+        )
+    ),
+)
+LaterGauge(
+    "synapse_rate_limit_reject_affected_hosts",
+    "Number of hosts that had requests rejected",
+    ["rate_limiter_name"],
+    lambda: _get_counts_from_rate_limiter_instance(
+        lambda rate_limiter_instance: sum(
+            ratelimiter.should_reject()
+            for ratelimiter in rate_limiter_instance.ratelimiters.values()
+        )
+    ),
+)
+
+
 class FederationRateLimiter:
-    def __init__(self, clock: Clock, config: FederationRatelimitSettings):
+    """Used to rate limit request per-host."""
+
+    def __init__(
+        self,
+        clock: Clock,
+        config: FederationRatelimitSettings,
+        metrics_name: Optional[str] = None,
+    ):
+        """
+        Args:
+            clock
+            config
+            metrics_name: The name of the rate limiter so we can differentiate it
+                from the rest in the metrics. If `None`, we don't track metrics
+                for this rate limiter.
+
+        """
+        self.metrics_name = metrics_name
+
         def new_limiter() -> "_PerHostRatelimiter":
-            return _PerHostRatelimiter(clock=clock, config=config)
+            return _PerHostRatelimiter(
+                clock=clock, config=config, metrics_name=metrics_name
+            )
 
         self.ratelimiters: DefaultDict[
             str, "_PerHostRatelimiter"
         ] = collections.defaultdict(new_limiter)
 
-        # We track the number of affected hosts per time-period so we can
-        # differentiate one really noisy homeserver from a general
-        # ratelimit tuning problem across the federation.
-        LaterGauge(
-            "synapse_rate_limit_sleep_affected_hosts",
-            "Number of hosts that had requests put to sleep",
-            [],
-            lambda: sum(
-                ratelimiter.should_sleep() for ratelimiter in self.ratelimiters.values()
-            ),
-        )
-        LaterGauge(
-            "synapse_rate_limit_reject_affected_hosts",
-            "Number of hosts that had requests rejected",
-            [],
-            lambda: sum(
-                ratelimiter.should_reject()
-                for ratelimiter in self.ratelimiters.values()
-            ),
-        )
+        with _rate_limiter_instances_lock:
+            _rate_limiter_instances.add(self)
 
     def ratelimit(self, host: str) -> "_GeneratorContextManager[defer.Deferred[None]]":
         """Used to ratelimit an incoming request from a given host
@@ -114,13 +192,23 @@ class FederationRateLimiter:
 
 
 class _PerHostRatelimiter:
-    def __init__(self, clock: Clock, config: FederationRatelimitSettings):
+    def __init__(
+        self,
+        clock: Clock,
+        config: FederationRatelimitSettings,
+        metrics_name: Optional[str] = None,
+    ):
         """
         Args:
             clock
             config
+            metrics_name: The name of the rate limiter so we can differentiate it
+                from the rest in the metrics. If `None`, we don't track metrics
+                for this rate limiter.
+                from the rest in the metrics
         """
         self.clock = clock
+        self.metrics_name = metrics_name
 
         self.window_size = config.window_size
         self.sleep_limit = config.sleep_limit
@@ -178,7 +266,10 @@ class _PerHostRatelimiter:
         return len(self.request_times) > self.sleep_limit
 
     async def _on_enter_with_tracing(self, request_id: object) -> None:
-        with start_active_span("ratelimit wait"), queue_wait_timer.time():
+        maybe_metrics_cm: ContextManager = contextlib.nullcontext()
+        if self.metrics_name:
+            maybe_metrics_cm = queue_wait_timer.labels(self.metrics_name).time()
+        with start_active_span("ratelimit wait"), maybe_metrics_cm:
             await self._on_enter(request_id)
 
     def _on_enter(self, request_id: object) -> "defer.Deferred[None]":
@@ -193,7 +284,8 @@ class _PerHostRatelimiter:
         # sleeping or in the ready queue).
         if self.should_reject():
             logger.debug("Ratelimiter(%s): rejecting request", self.host)
-            rate_limit_reject_counter.inc()
+            if self.metrics_name:
+                rate_limit_reject_counter.labels(self.metrics_name).inc()
             raise LimitExceededError(
                 retry_after_ms=int(self.window_size / self.sleep_limit)
             )
@@ -228,7 +320,8 @@ class _PerHostRatelimiter:
                 id(request_id),
                 self.sleep_sec,
             )
-            rate_limit_sleep_counter.inc()
+            if self.metrics_name:
+                rate_limit_sleep_counter.labels(self.metrics_name).inc()
             ret_defer = run_in_background(self.clock.sleep, self.sleep_sec)
 
             self.sleeping_requests.add(request_id)
-- 
cgit 1.5.1


From 8f6aa015a80d1e34a25d100c5fd6ec4dde1fb728 Mon Sep 17 00:00:00 2001
From: David Robertson <davidr@element.io>
Date: Tue, 30 Aug 2022 12:25:44 +0100
Subject: 1.66.0rc2

---
 CHANGES.md               | 9 +++++++++
 changelog.d/13649.bugfix | 1 -
 debian/changelog         | 6 ++++++
 pyproject.toml           | 2 +-
 4 files changed, 16 insertions(+), 2 deletions(-)
 delete mode 100644 changelog.d/13649.bugfix

diff --git a/CHANGES.md b/CHANGES.md
index 14fafc260d..8790812f8e 100644
--- a/CHANGES.md
+++ b/CHANGES.md
@@ -1,3 +1,12 @@
+Synapse 1.66.0rc2 (2022-08-30)
+==============================
+
+Bugfixes
+--------
+
+- Fix rate limit gauge metrics registering twice and misreporting (`synapse_rate_limit_sleep_affected_hosts`, `synapse_rate_limit_reject_affected_hosts`). ([\#13649](https://github.com/matrix-org/synapse/issues/13649))
+
+
 Synapse 1.66.0rc1 (2022-08-23)
 ==============================
 
diff --git a/changelog.d/13649.bugfix b/changelog.d/13649.bugfix
deleted file mode 100644
index e6513a585a..0000000000
--- a/changelog.d/13649.bugfix
+++ /dev/null
@@ -1 +0,0 @@
-Fix rate limit gauge metrics registering twice and misreporting (`synapse_rate_limit_sleep_affected_hosts`, `synapse_rate_limit_reject_affected_hosts`).
diff --git a/debian/changelog b/debian/changelog
index c3974261a9..b42c99d81a 100644
--- a/debian/changelog
+++ b/debian/changelog
@@ -1,3 +1,9 @@
+matrix-synapse-py3 (1.66.0~rc2) stable; urgency=medium
+
+  * New Synapse release 1.66.0rc2.
+
+ -- Synapse Packaging team <packages@matrix.org>  Tue, 30 Aug 2022 12:25:19 +0100
+
 matrix-synapse-py3 (1.66.0~rc1) stable; urgency=medium
 
   * New Synapse release 1.66.0rc1.
diff --git a/pyproject.toml b/pyproject.toml
index 745b6067aa..7146897837 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -54,7 +54,7 @@ skip_gitignore = true
 
 [tool.poetry]
 name = "matrix-synapse"
-version = "1.66.0rc1"
+version = "1.66.0rc2"
 description = "Homeserver for the Matrix decentralised comms protocol"
 authors = ["Matrix.org Team and Contributors <packages@matrix.org>"]
 license = "Apache-2.0"
-- 
cgit 1.5.1


From 31f2a3fbc3f46b88cb400f69cd59f3551c1401f5 Mon Sep 17 00:00:00 2001
From: David Robertson <davidr@element.io>
Date: Tue, 30 Aug 2022 14:19:52 +0100
Subject: Update changes

---
 CHANGES.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/CHANGES.md b/CHANGES.md
index 8790812f8e..a39fe661ce 100644
--- a/CHANGES.md
+++ b/CHANGES.md
@@ -4,7 +4,7 @@ Synapse 1.66.0rc2 (2022-08-30)
 Bugfixes
 --------
 
-- Fix rate limit gauge metrics registering twice and misreporting (`synapse_rate_limit_sleep_affected_hosts`, `synapse_rate_limit_reject_affected_hosts`). ([\#13649](https://github.com/matrix-org/synapse/issues/13649))
+- Fix a bug introduced in Synapse 1.66.0rc1 where the new rate limit metrics were misreported (`synapse_rate_limit_sleep_affected_hosts`, `synapse_rate_limit_reject_affected_hosts`). ([\#13649](https://github.com/matrix-org/synapse/issues/13649))
 
 
 Synapse 1.66.0rc1 (2022-08-23)
-- 
cgit 1.5.1