diff --git a/synapse/util/ratelimitutils.py b/synapse/util/ratelimitutils.py
index f678b52cb4..9f64fed0d7 100644
--- a/synapse/util/ratelimitutils.py
+++ b/synapse/util/ratelimitutils.py
@@ -15,10 +15,23 @@
import collections
import contextlib
import logging
+import threading
import typing
-from typing import Any, DefaultDict, Iterator, List, Set
+from typing import (
+ Any,
+ Callable,
+ DefaultDict,
+ Dict,
+ Iterator,
+ List,
+ Mapping,
+ Optional,
+ Set,
+ Tuple,
+)
from prometheus_client.core import Counter
+from typing_extensions import ContextManager
from twisted.internet import defer
@@ -40,12 +53,20 @@ logger = logging.getLogger(__name__)
# Track how much the ratelimiter is affecting requests
-rate_limit_sleep_counter = Counter("synapse_rate_limit_sleep", "")
-rate_limit_reject_counter = Counter("synapse_rate_limit_reject", "")
+rate_limit_sleep_counter = Counter(
+ "synapse_rate_limit_sleep",
+ "Number of requests slept by the rate limiter",
+ ["rate_limiter_name"],
+)
+rate_limit_reject_counter = Counter(
+ "synapse_rate_limit_reject",
+ "Number of requests rejected by the rate limiter",
+ ["rate_limiter_name"],
+)
queue_wait_timer = Histogram(
"synapse_rate_limit_queue_wait_time_seconds",
- "sec",
- [],
+ "Amount of time spent waiting for the rate limiter to let our request through.",
+ ["rate_limiter_name"],
buckets=(
0.005,
0.01,
@@ -65,35 +86,92 @@ queue_wait_timer = Histogram(
)
+_rate_limiter_instances: Set["FederationRateLimiter"] = set()
+# Protects the _rate_limiter_instances set from concurrent access
+_rate_limiter_instances_lock = threading.Lock()
+
+
+def _get_counts_from_rate_limiter_instance(
+ count_func: Callable[["FederationRateLimiter"], int]
+) -> Mapping[Tuple[str, ...], int]:
+ """Returns a count of something (slept/rejected hosts) by (metrics_name)"""
+ # Cast to a list to prevent it changing while the Prometheus
+ # thread is collecting metrics
+ with _rate_limiter_instances_lock:
+ rate_limiter_instances = list(_rate_limiter_instances)
+
+ # Map from (metrics_name,) -> int, the number of something like slept hosts
+ # or rejected hosts. The key type is Tuple[str], but we leave the length
+ # unspecified for compatability with LaterGauge's annotations.
+ counts: Dict[Tuple[str, ...], int] = {}
+ for rate_limiter_instance in rate_limiter_instances:
+ # Only track metrics if they provided a `metrics_name` to
+ # differentiate this instance of the rate limiter.
+ if rate_limiter_instance.metrics_name:
+ key = (rate_limiter_instance.metrics_name,)
+ counts[key] = count_func(rate_limiter_instance)
+
+ return counts
+
+
+# We track the number of affected hosts per time-period so we can
+# differentiate one really noisy homeserver from a general
+# ratelimit tuning problem across the federation.
+LaterGauge(
+ "synapse_rate_limit_sleep_affected_hosts",
+ "Number of hosts that had requests put to sleep",
+ ["rate_limiter_name"],
+ lambda: _get_counts_from_rate_limiter_instance(
+ lambda rate_limiter_instance: sum(
+ ratelimiter.should_sleep()
+ for ratelimiter in rate_limiter_instance.ratelimiters.values()
+ )
+ ),
+)
+LaterGauge(
+ "synapse_rate_limit_reject_affected_hosts",
+ "Number of hosts that had requests rejected",
+ ["rate_limiter_name"],
+ lambda: _get_counts_from_rate_limiter_instance(
+ lambda rate_limiter_instance: sum(
+ ratelimiter.should_reject()
+ for ratelimiter in rate_limiter_instance.ratelimiters.values()
+ )
+ ),
+)
+
+
class FederationRateLimiter:
- def __init__(self, clock: Clock, config: FederationRatelimitSettings):
+ """Used to rate limit request per-host."""
+
+ def __init__(
+ self,
+ clock: Clock,
+ config: FederationRatelimitSettings,
+ metrics_name: Optional[str] = None,
+ ):
+ """
+ Args:
+ clock
+ config
+ metrics_name: The name of the rate limiter so we can differentiate it
+ from the rest in the metrics. If `None`, we don't track metrics
+ for this rate limiter.
+
+ """
+ self.metrics_name = metrics_name
+
def new_limiter() -> "_PerHostRatelimiter":
- return _PerHostRatelimiter(clock=clock, config=config)
+ return _PerHostRatelimiter(
+ clock=clock, config=config, metrics_name=metrics_name
+ )
self.ratelimiters: DefaultDict[
str, "_PerHostRatelimiter"
] = collections.defaultdict(new_limiter)
- # We track the number of affected hosts per time-period so we can
- # differentiate one really noisy homeserver from a general
- # ratelimit tuning problem across the federation.
- LaterGauge(
- "synapse_rate_limit_sleep_affected_hosts",
- "Number of hosts that had requests put to sleep",
- [],
- lambda: sum(
- ratelimiter.should_sleep() for ratelimiter in self.ratelimiters.values()
- ),
- )
- LaterGauge(
- "synapse_rate_limit_reject_affected_hosts",
- "Number of hosts that had requests rejected",
- [],
- lambda: sum(
- ratelimiter.should_reject()
- for ratelimiter in self.ratelimiters.values()
- ),
- )
+ with _rate_limiter_instances_lock:
+ _rate_limiter_instances.add(self)
def ratelimit(self, host: str) -> "_GeneratorContextManager[defer.Deferred[None]]":
"""Used to ratelimit an incoming request from a given host
@@ -114,13 +192,23 @@ class FederationRateLimiter:
class _PerHostRatelimiter:
- def __init__(self, clock: Clock, config: FederationRatelimitSettings):
+ def __init__(
+ self,
+ clock: Clock,
+ config: FederationRatelimitSettings,
+ metrics_name: Optional[str] = None,
+ ):
"""
Args:
clock
config
+ metrics_name: The name of the rate limiter so we can differentiate it
+ from the rest in the metrics. If `None`, we don't track metrics
+ for this rate limiter.
+ from the rest in the metrics
"""
self.clock = clock
+ self.metrics_name = metrics_name
self.window_size = config.window_size
self.sleep_limit = config.sleep_limit
@@ -178,7 +266,10 @@ class _PerHostRatelimiter:
return len(self.request_times) > self.sleep_limit
async def _on_enter_with_tracing(self, request_id: object) -> None:
- with start_active_span("ratelimit wait"), queue_wait_timer.time():
+ maybe_metrics_cm: ContextManager = contextlib.nullcontext()
+ if self.metrics_name:
+ maybe_metrics_cm = queue_wait_timer.labels(self.metrics_name).time()
+ with start_active_span("ratelimit wait"), maybe_metrics_cm:
await self._on_enter(request_id)
def _on_enter(self, request_id: object) -> "defer.Deferred[None]":
@@ -193,7 +284,8 @@ class _PerHostRatelimiter:
# sleeping or in the ready queue).
if self.should_reject():
logger.debug("Ratelimiter(%s): rejecting request", self.host)
- rate_limit_reject_counter.inc()
+ if self.metrics_name:
+ rate_limit_reject_counter.labels(self.metrics_name).inc()
raise LimitExceededError(
retry_after_ms=int(self.window_size / self.sleep_limit)
)
@@ -228,7 +320,8 @@ class _PerHostRatelimiter:
id(request_id),
self.sleep_sec,
)
- rate_limit_sleep_counter.inc()
+ if self.metrics_name:
+ rate_limit_sleep_counter.labels(self.metrics_name).inc()
ret_defer = run_in_background(self.clock.sleep, self.sleep_sec)
self.sleeping_requests.add(request_id)
|