diff --git a/CHANGES.md b/CHANGES.md
index 14fafc260d..a39fe661ce 100644
--- a/CHANGES.md
+++ b/CHANGES.md
@@ -1,3 +1,12 @@
+Synapse 1.66.0rc2 (2022-08-30)
+==============================
+
+Bugfixes
+--------
+
+- Fix a bug introduced in Synapse 1.66.0rc1 where the new rate limit metrics were misreported (`synapse_rate_limit_sleep_affected_hosts`, `synapse_rate_limit_reject_affected_hosts`). ([\#13649](https://github.com/matrix-org/synapse/issues/13649))
+
+
Synapse 1.66.0rc1 (2022-08-23)
==============================
diff --git a/debian/changelog b/debian/changelog
index 3257adef77..0709ae24e9 100644
--- a/debian/changelog
+++ b/debian/changelog
@@ -1,4 +1,4 @@
-matrix-synapse-py3 (1.66.0~rc1+nmu1) UNRELEASED; urgency=medium
+matrix-synapse-py3 (1.66.0~rc2+nmu1) UNRELEASED; urgency=medium
* Update debhelper to compatibility level 12.
* Drop the preinst script stopping synapse.
@@ -6,6 +6,12 @@ matrix-synapse-py3 (1.66.0~rc1+nmu1) UNRELEASED; urgency=medium
-- Jörg Behrmann <behrmann@physik.fu-berlin.de> Tue, 23 Aug 2022 17:17:00 +0100
+matrix-synapse-py3 (1.66.0~rc2) stable; urgency=medium
+
+ * New Synapse release 1.66.0rc2.
+
+ -- Synapse Packaging team <packages@matrix.org> Tue, 30 Aug 2022 12:25:19 +0100
+
matrix-synapse-py3 (1.66.0~rc1) stable; urgency=medium
* New Synapse release 1.66.0rc1.
diff --git a/pyproject.toml b/pyproject.toml
index 745b6067aa..7146897837 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -54,7 +54,7 @@ skip_gitignore = true
[tool.poetry]
name = "matrix-synapse"
-version = "1.66.0rc1"
+version = "1.66.0rc2"
description = "Homeserver for the Matrix decentralised comms protocol"
authors = ["Matrix.org Team and Contributors <packages@matrix.org>"]
license = "Apache-2.0"
diff --git a/synapse/server.py b/synapse/server.py
index 181984a1a4..c2e55bf0b1 100644
--- a/synapse/server.py
+++ b/synapse/server.py
@@ -756,7 +756,9 @@ class HomeServer(metaclass=abc.ABCMeta):
@cache_in_self
def get_federation_ratelimiter(self) -> FederationRateLimiter:
return FederationRateLimiter(
- self.get_clock(), config=self.config.ratelimiting.rc_federation
+ self.get_clock(),
+ config=self.config.ratelimiting.rc_federation,
+ metrics_name="federation_servlets",
)
@cache_in_self
diff --git a/synapse/util/ratelimitutils.py b/synapse/util/ratelimitutils.py
index f678b52cb4..9f64fed0d7 100644
--- a/synapse/util/ratelimitutils.py
+++ b/synapse/util/ratelimitutils.py
@@ -15,10 +15,23 @@
import collections
import contextlib
import logging
+import threading
import typing
-from typing import Any, DefaultDict, Iterator, List, Set
+from typing import (
+ Any,
+ Callable,
+ DefaultDict,
+ Dict,
+ Iterator,
+ List,
+ Mapping,
+ Optional,
+ Set,
+ Tuple,
+)
from prometheus_client.core import Counter
+from typing_extensions import ContextManager
from twisted.internet import defer
@@ -40,12 +53,20 @@ logger = logging.getLogger(__name__)
# Track how much the ratelimiter is affecting requests
-rate_limit_sleep_counter = Counter("synapse_rate_limit_sleep", "")
-rate_limit_reject_counter = Counter("synapse_rate_limit_reject", "")
+rate_limit_sleep_counter = Counter(
+ "synapse_rate_limit_sleep",
+ "Number of requests slept by the rate limiter",
+ ["rate_limiter_name"],
+)
+rate_limit_reject_counter = Counter(
+ "synapse_rate_limit_reject",
+ "Number of requests rejected by the rate limiter",
+ ["rate_limiter_name"],
+)
queue_wait_timer = Histogram(
"synapse_rate_limit_queue_wait_time_seconds",
- "sec",
- [],
+ "Amount of time spent waiting for the rate limiter to let our request through.",
+ ["rate_limiter_name"],
buckets=(
0.005,
0.01,
@@ -65,35 +86,92 @@ queue_wait_timer = Histogram(
)
+_rate_limiter_instances: Set["FederationRateLimiter"] = set()
+# Protects the _rate_limiter_instances set from concurrent access
+_rate_limiter_instances_lock = threading.Lock()
+
+
+def _get_counts_from_rate_limiter_instance(
+ count_func: Callable[["FederationRateLimiter"], int]
+) -> Mapping[Tuple[str, ...], int]:
+ """Returns a count of something (slept/rejected hosts) by (metrics_name)"""
+ # Cast to a list to prevent it changing while the Prometheus
+ # thread is collecting metrics
+ with _rate_limiter_instances_lock:
+ rate_limiter_instances = list(_rate_limiter_instances)
+
+ # Map from (metrics_name,) -> int, the number of something like slept hosts
+ # or rejected hosts. The key type is Tuple[str], but we leave the length
+ # unspecified for compatability with LaterGauge's annotations.
+ counts: Dict[Tuple[str, ...], int] = {}
+ for rate_limiter_instance in rate_limiter_instances:
+ # Only track metrics if they provided a `metrics_name` to
+ # differentiate this instance of the rate limiter.
+ if rate_limiter_instance.metrics_name:
+ key = (rate_limiter_instance.metrics_name,)
+ counts[key] = count_func(rate_limiter_instance)
+
+ return counts
+
+
+# We track the number of affected hosts per time-period so we can
+# differentiate one really noisy homeserver from a general
+# ratelimit tuning problem across the federation.
+LaterGauge(
+ "synapse_rate_limit_sleep_affected_hosts",
+ "Number of hosts that had requests put to sleep",
+ ["rate_limiter_name"],
+ lambda: _get_counts_from_rate_limiter_instance(
+ lambda rate_limiter_instance: sum(
+ ratelimiter.should_sleep()
+ for ratelimiter in rate_limiter_instance.ratelimiters.values()
+ )
+ ),
+)
+LaterGauge(
+ "synapse_rate_limit_reject_affected_hosts",
+ "Number of hosts that had requests rejected",
+ ["rate_limiter_name"],
+ lambda: _get_counts_from_rate_limiter_instance(
+ lambda rate_limiter_instance: sum(
+ ratelimiter.should_reject()
+ for ratelimiter in rate_limiter_instance.ratelimiters.values()
+ )
+ ),
+)
+
+
class FederationRateLimiter:
- def __init__(self, clock: Clock, config: FederationRatelimitSettings):
+ """Used to rate limit request per-host."""
+
+ def __init__(
+ self,
+ clock: Clock,
+ config: FederationRatelimitSettings,
+ metrics_name: Optional[str] = None,
+ ):
+ """
+ Args:
+ clock
+ config
+ metrics_name: The name of the rate limiter so we can differentiate it
+ from the rest in the metrics. If `None`, we don't track metrics
+ for this rate limiter.
+
+ """
+ self.metrics_name = metrics_name
+
def new_limiter() -> "_PerHostRatelimiter":
- return _PerHostRatelimiter(clock=clock, config=config)
+ return _PerHostRatelimiter(
+ clock=clock, config=config, metrics_name=metrics_name
+ )
self.ratelimiters: DefaultDict[
str, "_PerHostRatelimiter"
] = collections.defaultdict(new_limiter)
- # We track the number of affected hosts per time-period so we can
- # differentiate one really noisy homeserver from a general
- # ratelimit tuning problem across the federation.
- LaterGauge(
- "synapse_rate_limit_sleep_affected_hosts",
- "Number of hosts that had requests put to sleep",
- [],
- lambda: sum(
- ratelimiter.should_sleep() for ratelimiter in self.ratelimiters.values()
- ),
- )
- LaterGauge(
- "synapse_rate_limit_reject_affected_hosts",
- "Number of hosts that had requests rejected",
- [],
- lambda: sum(
- ratelimiter.should_reject()
- for ratelimiter in self.ratelimiters.values()
- ),
- )
+ with _rate_limiter_instances_lock:
+ _rate_limiter_instances.add(self)
def ratelimit(self, host: str) -> "_GeneratorContextManager[defer.Deferred[None]]":
"""Used to ratelimit an incoming request from a given host
@@ -114,13 +192,23 @@ class FederationRateLimiter:
class _PerHostRatelimiter:
- def __init__(self, clock: Clock, config: FederationRatelimitSettings):
+ def __init__(
+ self,
+ clock: Clock,
+ config: FederationRatelimitSettings,
+ metrics_name: Optional[str] = None,
+ ):
"""
Args:
clock
config
+ metrics_name: The name of the rate limiter so we can differentiate it
+ from the rest in the metrics. If `None`, we don't track metrics
+ for this rate limiter.
+ from the rest in the metrics
"""
self.clock = clock
+ self.metrics_name = metrics_name
self.window_size = config.window_size
self.sleep_limit = config.sleep_limit
@@ -178,7 +266,10 @@ class _PerHostRatelimiter:
return len(self.request_times) > self.sleep_limit
async def _on_enter_with_tracing(self, request_id: object) -> None:
- with start_active_span("ratelimit wait"), queue_wait_timer.time():
+ maybe_metrics_cm: ContextManager = contextlib.nullcontext()
+ if self.metrics_name:
+ maybe_metrics_cm = queue_wait_timer.labels(self.metrics_name).time()
+ with start_active_span("ratelimit wait"), maybe_metrics_cm:
await self._on_enter(request_id)
def _on_enter(self, request_id: object) -> "defer.Deferred[None]":
@@ -193,7 +284,8 @@ class _PerHostRatelimiter:
# sleeping or in the ready queue).
if self.should_reject():
logger.debug("Ratelimiter(%s): rejecting request", self.host)
- rate_limit_reject_counter.inc()
+ if self.metrics_name:
+ rate_limit_reject_counter.labels(self.metrics_name).inc()
raise LimitExceededError(
retry_after_ms=int(self.window_size / self.sleep_limit)
)
@@ -228,7 +320,8 @@ class _PerHostRatelimiter:
id(request_id),
self.sleep_sec,
)
- rate_limit_sleep_counter.inc()
+ if self.metrics_name:
+ rate_limit_sleep_counter.labels(self.metrics_name).inc()
ret_defer = run_in_background(self.clock.sleep, self.sleep_sec)
self.sleeping_requests.add(request_id)
|