diff --git a/synapse/util/ratelimitutils.py b/synapse/util/ratelimitutils.py
index 434b02b97b..724d39b92f 100644
--- a/synapse/util/ratelimitutils.py
+++ b/synapse/util/ratelimitutils.py
@@ -30,7 +30,7 @@ from synapse.logging.context import (
run_in_background,
)
from synapse.logging.opentracing import start_active_span
-from synapse.metrics import Histogram
+from synapse.metrics import Histogram, LaterGauge
from synapse.util import Clock
if typing.TYPE_CHECKING:
@@ -74,6 +74,27 @@ class FederationRateLimiter:
str, "_PerHostRatelimiter"
] = collections.defaultdict(new_limiter)
+ # We track the number of affected hosts per time-period so we can
+ # differentiate one really noisy homeserver from a general
+ # ratelimit tuning problem across the federation.
+ LaterGauge(
+ "synapse_rate_limit_sleep_affected_hosts",
+ "Number of hosts that had requests put to sleep",
+ [],
+ lambda: sum(
+ ratelimiter.should_sleep() for ratelimiter in self.ratelimiters.values()
+ ),
+ )
+ LaterGauge(
+ "synapse_rate_limit_reject_affected_hosts",
+ "Number of hosts that had requests rejected",
+ [],
+ lambda: sum(
+ ratelimiter.should_reject()
+ for ratelimiter in self.ratelimiters.values()
+ ),
+ )
+
def ratelimit(self, host: str) -> "_GeneratorContextManager[defer.Deferred[None]]":
"""Used to ratelimit an incoming request from a given host
@@ -139,6 +160,21 @@ class _PerHostRatelimiter:
finally:
self._on_exit(request_id)
+ def should_reject(self) -> bool:
+ """
+ Whether to reject the request if we already have too many queued up
+ (either sleeping or in the ready queue).
+ """
+ queue_size = len(self.ready_request_queue) + len(self.sleeping_requests)
+ return queue_size > self.reject_limit
+
+ def should_sleep(self) -> bool:
+ """
+ Whether to sleep the request if we already have too many requests coming
+ through within the window.
+ """
+ return len(self.request_times) > self.sleep_limit
+
def _on_enter(self, request_id: object) -> "defer.Deferred[None]":
time_now = self.clock.time_msec()
@@ -149,8 +185,7 @@ class _PerHostRatelimiter:
# reject the request if we already have too many queued up (either
# sleeping or in the ready queue).
- queue_size = len(self.ready_request_queue) + len(self.sleeping_requests)
- if queue_size > self.reject_limit:
+ if self.should_reject():
logger.debug("Ratelimiter(%s): rejecting request", self.host)
rate_limit_reject_counter.inc()
raise LimitExceededError(
@@ -180,7 +215,7 @@ class _PerHostRatelimiter:
len(self.request_times),
)
- if len(self.request_times) > self.sleep_limit:
+ if self.should_sleep():
logger.debug(
"Ratelimiter(%s) [%s]: sleeping request for %f sec",
self.host,
|