diff options
author | Eric Eastwood <erice@element.io> | 2022-08-18 10:05:07 -0500 |
---|---|---|
committer | GitHub <noreply@github.com> | 2022-08-18 10:05:07 -0500 |
commit | d64653d062a7fc27782e70c1ca581e85b7730e72 (patch) | |
tree | fad14f84578004d4c6245da8e523b363107eb02e /synapse/util/ratelimitutils.py | |
parent | Add support for compression to federation responses (#13537) (diff) | |
download | synapse-d64653d062a7fc27782e70c1ca581e85b7730e72.tar.xz |
Track number of hosts affected by the rate limiter (#13541)
Track number of hosts affected by the rate limiter so we can differentiate one really noisy homeserver from a general ratelimit tuning problem across the federation. Follow-up to https://github.com/matrix-org/synapse/pull/13534 Part of https://github.com/matrix-org/synapse/issues/13356
Diffstat (limited to 'synapse/util/ratelimitutils.py')
-rw-r--r-- | synapse/util/ratelimitutils.py | 43 |
1 files changed, 39 insertions, 4 deletions
diff --git a/synapse/util/ratelimitutils.py b/synapse/util/ratelimitutils.py index 434b02b97b..724d39b92f 100644 --- a/synapse/util/ratelimitutils.py +++ b/synapse/util/ratelimitutils.py @@ -30,7 +30,7 @@ from synapse.logging.context import ( run_in_background, ) from synapse.logging.opentracing import start_active_span -from synapse.metrics import Histogram +from synapse.metrics import Histogram, LaterGauge from synapse.util import Clock if typing.TYPE_CHECKING: @@ -74,6 +74,27 @@ class FederationRateLimiter: str, "_PerHostRatelimiter" ] = collections.defaultdict(new_limiter) + # We track the number of affected hosts per time-period so we can + # differentiate one really noisy homeserver from a general + # ratelimit tuning problem across the federation. + LaterGauge( + "synapse_rate_limit_sleep_affected_hosts", + "Number of hosts that had requests put to sleep", + [], + lambda: sum( + ratelimiter.should_sleep() for ratelimiter in self.ratelimiters.values() + ), + ) + LaterGauge( + "synapse_rate_limit_reject_affected_hosts", + "Number of hosts that had requests rejected", + [], + lambda: sum( + ratelimiter.should_reject() + for ratelimiter in self.ratelimiters.values() + ), + ) + def ratelimit(self, host: str) -> "_GeneratorContextManager[defer.Deferred[None]]": """Used to ratelimit an incoming request from a given host @@ -139,6 +160,21 @@ class _PerHostRatelimiter: finally: self._on_exit(request_id) + def should_reject(self) -> bool: + """ + Whether to reject the request if we already have too many queued up + (either sleeping or in the ready queue). + """ + queue_size = len(self.ready_request_queue) + len(self.sleeping_requests) + return queue_size > self.reject_limit + + def should_sleep(self) -> bool: + """ + Whether to sleep the request if we already have too many requests coming + through within the window. + """ + return len(self.request_times) > self.sleep_limit + def _on_enter(self, request_id: object) -> "defer.Deferred[None]": time_now = self.clock.time_msec() @@ -149,8 +185,7 @@ class _PerHostRatelimiter: # reject the request if we already have too many queued up (either # sleeping or in the ready queue). - queue_size = len(self.ready_request_queue) + len(self.sleeping_requests) - if queue_size > self.reject_limit: + if self.should_reject(): logger.debug("Ratelimiter(%s): rejecting request", self.host) rate_limit_reject_counter.inc() raise LimitExceededError( @@ -180,7 +215,7 @@ class _PerHostRatelimiter: len(self.request_times), ) - if len(self.request_times) > self.sleep_limit: + if self.should_sleep(): logger.debug( "Ratelimiter(%s) [%s]: sleeping request for %f sec", self.host, |