From d6e94ad9d9b22efe0f5eb64d946c10a542068f7c Mon Sep 17 00:00:00 2001 From: Dirk Klimpel <5740567+dklimpel@users.noreply.github.com> Date: Wed, 3 Aug 2022 11:40:20 +0200 Subject: Rename `RateLimitConfig` to `RatelimitSettings` (#13442) --- synapse/util/ratelimitutils.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'synapse/util/ratelimitutils.py') diff --git a/synapse/util/ratelimitutils.py b/synapse/util/ratelimitutils.py index dfe628c97e..6394cc39ac 100644 --- a/synapse/util/ratelimitutils.py +++ b/synapse/util/ratelimitutils.py @@ -21,7 +21,7 @@ from typing import Any, DefaultDict, Iterator, List, Set from twisted.internet import defer from synapse.api.errors import LimitExceededError -from synapse.config.ratelimiting import FederationRateLimitConfig +from synapse.config.ratelimiting import FederationRatelimitSettings from synapse.logging.context import ( PreserveLoggingContext, make_deferred_yieldable, @@ -36,7 +36,7 @@ logger = logging.getLogger(__name__) class FederationRateLimiter: - def __init__(self, clock: Clock, config: FederationRateLimitConfig): + def __init__(self, clock: Clock, config: FederationRatelimitSettings): def new_limiter() -> "_PerHostRatelimiter": return _PerHostRatelimiter(clock=clock, config=config) @@ -63,7 +63,7 @@ class FederationRateLimiter: class _PerHostRatelimiter: - def __init__(self, clock: Clock, config: FederationRateLimitConfig): + def __init__(self, clock: Clock, config: FederationRatelimitSettings): """ Args: clock -- cgit 1.5.1 From 344a2f767c636259412f7fc2914c1554a5c4dc1d Mon Sep 17 00:00:00 2001 From: Eric Eastwood Date: Mon, 15 Aug 2022 13:41:23 -0500 Subject: Instrument `FederationStateIdsServlet` - `/state_ids` (#13499) Instrument FederationStateIdsServlet - `/state_ids` so it's easier to follow what's going on in Jaeger when viewing a trace. --- changelog.d/13499.misc | 1 + synapse/federation/federation_server.py | 11 ++++++++++- synapse/handlers/federation.py | 4 +++- synapse/storage/databases/main/event_federation.py | 3 +++ synapse/util/ratelimitutils.py | 4 ++++ 5 files changed, 21 insertions(+), 2 deletions(-) create mode 100644 changelog.d/13499.misc (limited to 'synapse/util/ratelimitutils.py') diff --git a/changelog.d/13499.misc b/changelog.d/13499.misc new file mode 100644 index 0000000000..99dbcebec8 --- /dev/null +++ b/changelog.d/13499.misc @@ -0,0 +1 @@ +Instrument `FederationStateIdsServlet` (`/state_ids`) for understandable traces in Jaeger. 
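The instrumentation in this patch amounts to stacking two decorators from synapse.logging.opentracing, @trace and @tag_args, on the async methods that serve /state_ids, so each call shows up as a tagged span in Jaeger. Below is a rough, self-contained sketch of what that decorator pattern does; the stand-in implementations and the simplified handler are illustrative assumptions, not Synapse's actual helpers (the real ones also cooperate with Synapse's logging contexts and become no-ops when tracing is not configured).

    # Minimal sketch of the @trace / @tag_args pattern, assuming helpers that
    # behave roughly like Synapse's (the real ones live in
    # synapse.logging.opentracing and are considerably more careful).
    import asyncio
    import functools
    import inspect

    import opentracing  # the module-level tracer is a no-op unless configured


    def trace(func):
        """Run the wrapped async function inside a span named after it."""

        @functools.wraps(func)
        async def wrapper(*args, **kwargs):
            with opentracing.tracer.start_active_span(func.__name__):
                return await func(*args, **kwargs)

        return wrapper


    def tag_args(func):
        """Tag the active span with the wrapped function's arguments."""

        @functools.wraps(func)
        async def wrapper(*args, **kwargs):
            scope = opentracing.tracer.scope_manager.active
            if scope is not None:
                bound = inspect.signature(func).bind(*args, **kwargs)
                for name, value in bound.arguments.items():
                    # Tag naming here is illustrative, not Synapse's exact scheme.
                    scope.span.set_tag(f"ARG_{name}", str(value))
            return await func(*args, **kwargs)

        return wrapper


    @trace
    @tag_args
    async def on_state_ids_request(origin: str, room_id: str, event_id: str) -> dict:
        # Stand-in for the real servlet method: just echo something back.
        return {"pdu_ids": [], "auth_chain_ids": []}


    print(asyncio.run(on_state_ids_request("example.org", "!room:example.org", "$event")))

Because decorators apply bottom-up, @trace ends up outermost: it opens the span first, so @tag_args finds an active span to attach the argument tags to.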
diff --git a/synapse/federation/federation_server.py b/synapse/federation/federation_server.py index db4b83a505..75fbc6073d 100644 --- a/synapse/federation/federation_server.py +++ b/synapse/federation/federation_server.py @@ -61,7 +61,12 @@ from synapse.logging.context import ( nested_logging_context, run_in_background, ) -from synapse.logging.opentracing import log_kv, start_active_span_from_edu, trace +from synapse.logging.opentracing import ( + log_kv, + start_active_span_from_edu, + tag_args, + trace, +) from synapse.metrics.background_process_metrics import wrap_as_background_process from synapse.replication.http.federation import ( ReplicationFederationSendEduRestServlet, @@ -547,6 +552,8 @@ class FederationServer(FederationBase): return 200, resp + @trace + @tag_args async def on_state_ids_request( self, origin: str, room_id: str, event_id: str ) -> Tuple[int, JsonDict]: @@ -569,6 +576,8 @@ class FederationServer(FederationBase): return 200, resp + @trace + @tag_args async def _on_state_ids_request_compute( self, room_id: str, event_id: str ) -> JsonDict: diff --git a/synapse/handlers/federation.py b/synapse/handlers/federation.py index 5042236742..6f5ab86ac4 100644 --- a/synapse/handlers/federation.py +++ b/synapse/handlers/federation.py @@ -59,7 +59,7 @@ from synapse.events.validator import EventValidator from synapse.federation.federation_client import InvalidResponseError from synapse.http.servlet import assert_params_in_dict from synapse.logging.context import nested_logging_context -from synapse.logging.opentracing import trace +from synapse.logging.opentracing import tag_args, trace from synapse.metrics.background_process_metrics import run_as_background_process from synapse.module_api import NOT_SPAM from synapse.replication.http.federation import ( @@ -1081,6 +1081,8 @@ class FederationHandler: return event + @trace + @tag_args async def get_state_ids_for_pdu(self, room_id: str, event_id: str) -> List[str]: """Returns the state at the event. i.e. 
not including said event.""" event = await self.store.get_event(event_id, check_room_id=room_id) diff --git a/synapse/storage/databases/main/event_federation.py b/synapse/storage/databases/main/event_federation.py index eec55b6478..0bc8401f2b 100644 --- a/synapse/storage/databases/main/event_federation.py +++ b/synapse/storage/databases/main/event_federation.py @@ -33,6 +33,7 @@ from synapse.api.constants import MAX_DEPTH, EventTypes from synapse.api.errors import StoreError from synapse.api.room_versions import EventFormatVersions, RoomVersion from synapse.events import EventBase, make_event_from_dict +from synapse.logging.opentracing import tag_args, trace from synapse.metrics.background_process_metrics import wrap_as_background_process from synapse.storage._base import SQLBaseStore, db_to_json, make_in_list_sql_clause from synapse.storage.database import ( @@ -126,6 +127,8 @@ class EventFederationWorkerStore(SignatureWorkerStore, EventsWorkerStore, SQLBas ) return await self.get_events_as_list(event_ids) + @trace + @tag_args async def get_auth_chain_ids( self, room_id: str, diff --git a/synapse/util/ratelimitutils.py b/synapse/util/ratelimitutils.py index 6394cc39ac..e1beaec5a3 100644 --- a/synapse/util/ratelimitutils.py +++ b/synapse/util/ratelimitutils.py @@ -27,6 +27,7 @@ from synapse.logging.context import ( make_deferred_yieldable, run_in_background, ) +from synapse.logging.opentracing import start_active_span from synapse.util import Clock if typing.TYPE_CHECKING: @@ -176,8 +177,11 @@ class _PerHostRatelimiter: # Ensure that we've properly cleaned up. self.sleeping_requests.discard(request_id) self.ready_request_queue.pop(request_id, None) + wait_span_scope.__exit__(None, None, None) return r + wait_span_scope = start_active_span("ratelimit wait") + wait_span_scope.__enter__() ret_defer.addCallbacks(on_start, on_err) ret_defer.addBoth(on_both) return make_deferred_yieldable(ret_defer) -- cgit 1.5.1 From c6ee9c0ee40803a9e3673c2833e5a40032e86f5a Mon Sep 17 00:00:00 2001 From: Eric Eastwood Date: Wed, 17 Aug 2022 04:38:05 -0500 Subject: Add metrics to track rate limiter queue timing (#13544) --- changelog.d/13544.misc | 1 + synapse/util/ratelimitutils.py | 30 ++++++++++++++++++++++++++++++ 2 files changed, 31 insertions(+) create mode 100644 changelog.d/13544.misc (limited to 'synapse/util/ratelimitutils.py') diff --git a/changelog.d/13544.misc b/changelog.d/13544.misc new file mode 100644 index 0000000000..d84ba3f076 --- /dev/null +++ b/changelog.d/13544.misc @@ -0,0 +1 @@ +Add metrics to track rate limiter queue timing (`synapse_rate_limit_queue_wait_time_seconds`). 
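The new queue_wait_timer is a Prometheus histogram with an explicit bucket layout, and the rate limiter records how long each request waits by entering the timer returned by its .time() method. A minimal standalone sketch of that usage follows; it uses the prometheus_client class directly rather than Synapse's synapse.metrics import path, and wait_for_slot plus its sleep are invented stand-ins for queueing behind the rate limiter.

    # Observe how long a block of code takes using a Prometheus histogram.
    import time

    from prometheus_client import Histogram, generate_latest

    queue_wait_timer = Histogram(
        "synapse_rate_limit_queue_wait_time_seconds",
        "sec",
        [],
        buckets=(0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 0.75,
                 1.0, 2.5, 5.0, 10.0, 20.0, "+Inf"),
    )


    def wait_for_slot() -> None:
        # Everything inside the `with` block is timed and recorded into the
        # matching bucket, plus the histogram's _sum and _count series.
        with queue_wait_timer.time():
            time.sleep(0.05)  # stand-in for waiting in the ready/sleep queues


    wait_for_slot()
    print(generate_latest().decode())

The explicit buckets mostly mirror the client defaults but add a 20-second bucket, since a queued federation request can plausibly wait far longer than the default 10-second ceiling.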
diff --git a/synapse/util/ratelimitutils.py b/synapse/util/ratelimitutils.py index e1beaec5a3..e48324d926 100644 --- a/synapse/util/ratelimitutils.py +++ b/synapse/util/ratelimitutils.py @@ -28,6 +28,7 @@ from synapse.logging.context import ( run_in_background, ) from synapse.logging.opentracing import start_active_span +from synapse.metrics import Histogram from synapse.util import Clock if typing.TYPE_CHECKING: @@ -36,6 +37,29 @@ if typing.TYPE_CHECKING: logger = logging.getLogger(__name__) +queue_wait_timer = Histogram( + "synapse_rate_limit_queue_wait_time_seconds", + "sec", + [], + buckets=( + 0.005, + 0.01, + 0.025, + 0.05, + 0.1, + 0.25, + 0.5, + 0.75, + 1.0, + 2.5, + 5.0, + 10.0, + 20.0, + "+Inf", + ), +) + + class FederationRateLimiter: def __init__(self, clock: Clock, config: FederationRatelimitSettings): def new_limiter() -> "_PerHostRatelimiter": @@ -178,10 +202,16 @@ class _PerHostRatelimiter: self.sleeping_requests.discard(request_id) self.ready_request_queue.pop(request_id, None) wait_span_scope.__exit__(None, None, None) + wait_timer_cm.__exit__(None, None, None) return r + # Tracing wait_span_scope = start_active_span("ratelimit wait") wait_span_scope.__enter__() + # Metrics + wait_timer_cm = queue_wait_timer.time() + wait_timer_cm.__enter__() + ret_defer.addCallbacks(on_start, on_err) ret_defer.addBoth(on_both) return make_deferred_yieldable(ret_defer) -- cgit 1.5.1 From 49d04e43dfa0551f52c1f1872b6f311efa756ca8 Mon Sep 17 00:00:00 2001 From: Eric Eastwood Date: Wed, 17 Aug 2022 16:10:07 -0500 Subject: Add metrics to track how the rate limiter is affecting requests (sleep/reject) (#13534) Related to https://github.com/matrix-org/synapse/pull/13499 Part of https://github.com/matrix-org/synapse/issues/13356 --- changelog.d/13534.misc | 1 + synapse/util/ratelimitutils.py | 37 +++++++++++++++++++++++++++++-------- 2 files changed, 30 insertions(+), 8 deletions(-) create mode 100644 changelog.d/13534.misc (limited to 'synapse/util/ratelimitutils.py') diff --git a/changelog.d/13534.misc b/changelog.d/13534.misc new file mode 100644 index 0000000000..b488bf74c3 --- /dev/null +++ b/changelog.d/13534.misc @@ -0,0 +1 @@ +Add metrics to track how the rate limiter is affecting requests (sleep/reject). diff --git a/synapse/util/ratelimitutils.py b/synapse/util/ratelimitutils.py index e48324d926..434b02b97b 100644 --- a/synapse/util/ratelimitutils.py +++ b/synapse/util/ratelimitutils.py @@ -18,6 +18,8 @@ import logging import typing from typing import Any, DefaultDict, Iterator, List, Set +from prometheus_client.core import Counter + from twisted.internet import defer from synapse.api.errors import LimitExceededError @@ -37,6 +39,9 @@ if typing.TYPE_CHECKING: logger = logging.getLogger(__name__) +# Track how much the ratelimiter is affecting requests +rate_limit_sleep_counter = Counter("synapse_rate_limit_sleep", "") +rate_limit_reject_counter = Counter("synapse_rate_limit_reject", "") queue_wait_timer = Histogram( "synapse_rate_limit_queue_wait_time_seconds", "sec", @@ -84,7 +89,7 @@ class FederationRateLimiter: Returns: context manager which returns a deferred. 
""" - return self.ratelimiters[host].ratelimit() + return self.ratelimiters[host].ratelimit(host) class _PerHostRatelimiter: @@ -119,12 +124,14 @@ class _PerHostRatelimiter: self.request_times: List[int] = [] @contextlib.contextmanager - def ratelimit(self) -> "Iterator[defer.Deferred[None]]": + def ratelimit(self, host: str) -> "Iterator[defer.Deferred[None]]": # `contextlib.contextmanager` takes a generator and turns it into a # context manager. The generator should only yield once with a value # to be returned by manager. # Exceptions will be reraised at the yield. + self.host = host + request_id = object() ret = self._on_enter(request_id) try: @@ -144,6 +151,8 @@ class _PerHostRatelimiter: # sleeping or in the ready queue). queue_size = len(self.ready_request_queue) + len(self.sleeping_requests) if queue_size > self.reject_limit: + logger.debug("Ratelimiter(%s): rejecting request", self.host) + rate_limit_reject_counter.inc() raise LimitExceededError( retry_after_ms=int(self.window_size / self.sleep_limit) ) @@ -155,7 +164,8 @@ class _PerHostRatelimiter: queue_defer: defer.Deferred[None] = defer.Deferred() self.ready_request_queue[request_id] = queue_defer logger.info( - "Ratelimiter: queueing request (queue now %i items)", + "Ratelimiter(%s): queueing request (queue now %i items)", + self.host, len(self.ready_request_queue), ) @@ -164,19 +174,28 @@ class _PerHostRatelimiter: return defer.succeed(None) logger.debug( - "Ratelimit [%s]: len(self.request_times)=%d", + "Ratelimit(%s) [%s]: len(self.request_times)=%d", + self.host, id(request_id), len(self.request_times), ) if len(self.request_times) > self.sleep_limit: - logger.debug("Ratelimiter: sleeping request for %f sec", self.sleep_sec) + logger.debug( + "Ratelimiter(%s) [%s]: sleeping request for %f sec", + self.host, + id(request_id), + self.sleep_sec, + ) + rate_limit_sleep_counter.inc() ret_defer = run_in_background(self.clock.sleep, self.sleep_sec) self.sleeping_requests.add(request_id) def on_wait_finished(_: Any) -> "defer.Deferred[None]": - logger.debug("Ratelimit [%s]: Finished sleeping", id(request_id)) + logger.debug( + "Ratelimit(%s) [%s]: Finished sleeping", self.host, id(request_id) + ) self.sleeping_requests.discard(request_id) queue_defer = queue_request() return queue_defer @@ -186,7 +205,9 @@ class _PerHostRatelimiter: ret_defer = queue_request() def on_start(r: object) -> object: - logger.debug("Ratelimit [%s]: Processing req", id(request_id)) + logger.debug( + "Ratelimit(%s) [%s]: Processing req", self.host, id(request_id) + ) self.current_processing.add(request_id) return r @@ -217,7 +238,7 @@ class _PerHostRatelimiter: return make_deferred_yieldable(ret_defer) def _on_exit(self, request_id: object) -> None: - logger.debug("Ratelimit [%s]: Processed req", id(request_id)) + logger.debug("Ratelimit(%s) [%s]: Processed req", self.host, id(request_id)) self.current_processing.discard(request_id) try: # start processing the next item on the queue. -- cgit 1.5.1 From d64653d062a7fc27782e70c1ca581e85b7730e72 Mon Sep 17 00:00:00 2001 From: Eric Eastwood Date: Thu, 18 Aug 2022 10:05:07 -0500 Subject: Track number of hosts affected by the rate limiter (#13541) Track number of hosts affected by the rate limiter so we can differentiate one really noisy homeserver from a general ratelimit tuning problem across the federation. 
Follow-up to https://github.com/matrix-org/synapse/pull/13534 Part of https://github.com/matrix-org/synapse/issues/13356 --- changelog.d/13541.misc | 1 + synapse/util/ratelimitutils.py | 43 ++++++++++++++++++++++++++++++++++++++---- 2 files changed, 40 insertions(+), 4 deletions(-) create mode 100644 changelog.d/13541.misc (limited to 'synapse/util/ratelimitutils.py') diff --git a/changelog.d/13541.misc b/changelog.d/13541.misc new file mode 100644 index 0000000000..b488bf74c3 --- /dev/null +++ b/changelog.d/13541.misc @@ -0,0 +1 @@ +Add metrics to track how the rate limiter is affecting requests (sleep/reject). diff --git a/synapse/util/ratelimitutils.py b/synapse/util/ratelimitutils.py index 434b02b97b..724d39b92f 100644 --- a/synapse/util/ratelimitutils.py +++ b/synapse/util/ratelimitutils.py @@ -30,7 +30,7 @@ from synapse.logging.context import ( run_in_background, ) from synapse.logging.opentracing import start_active_span -from synapse.metrics import Histogram +from synapse.metrics import Histogram, LaterGauge from synapse.util import Clock if typing.TYPE_CHECKING: @@ -74,6 +74,27 @@ class FederationRateLimiter: str, "_PerHostRatelimiter" ] = collections.defaultdict(new_limiter) + # We track the number of affected hosts per time-period so we can + # differentiate one really noisy homeserver from a general + # ratelimit tuning problem across the federation. + LaterGauge( + "synapse_rate_limit_sleep_affected_hosts", + "Number of hosts that had requests put to sleep", + [], + lambda: sum( + ratelimiter.should_sleep() for ratelimiter in self.ratelimiters.values() + ), + ) + LaterGauge( + "synapse_rate_limit_reject_affected_hosts", + "Number of hosts that had requests rejected", + [], + lambda: sum( + ratelimiter.should_reject() + for ratelimiter in self.ratelimiters.values() + ), + ) + def ratelimit(self, host: str) -> "_GeneratorContextManager[defer.Deferred[None]]": """Used to ratelimit an incoming request from a given host @@ -139,6 +160,21 @@ class _PerHostRatelimiter: finally: self._on_exit(request_id) + def should_reject(self) -> bool: + """ + Whether to reject the request if we already have too many queued up + (either sleeping or in the ready queue). + """ + queue_size = len(self.ready_request_queue) + len(self.sleeping_requests) + return queue_size > self.reject_limit + + def should_sleep(self) -> bool: + """ + Whether to sleep the request if we already have too many requests coming + through within the window. + """ + return len(self.request_times) > self.sleep_limit + def _on_enter(self, request_id: object) -> "defer.Deferred[None]": time_now = self.clock.time_msec() @@ -149,8 +185,7 @@ class _PerHostRatelimiter: # reject the request if we already have too many queued up (either # sleeping or in the ready queue). 
- queue_size = len(self.ready_request_queue) + len(self.sleeping_requests) - if queue_size > self.reject_limit: + if self.should_reject(): logger.debug("Ratelimiter(%s): rejecting request", self.host) rate_limit_reject_counter.inc() raise LimitExceededError( @@ -180,7 +215,7 @@ class _PerHostRatelimiter: len(self.request_times), ) - if len(self.request_times) > self.sleep_limit: + if self.should_sleep(): logger.debug( "Ratelimiter(%s) [%s]: sleeping request for %f sec", self.host, -- cgit 1.5.1 From b251cff8196e4130b2a6951c8fe569ed46779443 Mon Sep 17 00:00:00 2001 From: Sean Quah <8349537+squahtx@users.noreply.github.com> Date: Thu, 18 Aug 2022 16:26:26 +0100 Subject: Fix incorrect juggling of logging contexts in `_PerHostRatelimiter` (#13554) Signed-off-by: Sean Quah Co-authored-by: Richard van der Hoff <1389908+richvdh@users.noreply.github.com> --- changelog.d/13554.misc | 1 + synapse/util/ratelimitutils.py | 17 +++++++---------- 2 files changed, 8 insertions(+), 10 deletions(-) create mode 100644 changelog.d/13554.misc (limited to 'synapse/util/ratelimitutils.py') diff --git a/changelog.d/13554.misc b/changelog.d/13554.misc new file mode 100644 index 0000000000..99dbcebec8 --- /dev/null +++ b/changelog.d/13554.misc @@ -0,0 +1 @@ +Instrument `FederationStateIdsServlet` (`/state_ids`) for understandable traces in Jaeger. diff --git a/synapse/util/ratelimitutils.py b/synapse/util/ratelimitutils.py index 724d39b92f..f678b52cb4 100644 --- a/synapse/util/ratelimitutils.py +++ b/synapse/util/ratelimitutils.py @@ -154,7 +154,9 @@ class _PerHostRatelimiter: self.host = host request_id = object() - ret = self._on_enter(request_id) + # Ideally we'd use `Deferred.fromCoroutine()` here, to save on redundant + # type-checking, but we'd need Twisted >= 21.2. + ret = defer.ensureDeferred(self._on_enter_with_tracing(request_id)) try: yield ret finally: @@ -175,6 +177,10 @@ class _PerHostRatelimiter: """ return len(self.request_times) > self.sleep_limit + async def _on_enter_with_tracing(self, request_id: object) -> None: + with start_active_span("ratelimit wait"), queue_wait_timer.time(): + await self._on_enter(request_id) + def _on_enter(self, request_id: object) -> "defer.Deferred[None]": time_now = self.clock.time_msec() @@ -257,17 +263,8 @@ class _PerHostRatelimiter: # Ensure that we've properly cleaned up. self.sleeping_requests.discard(request_id) self.ready_request_queue.pop(request_id, None) - wait_span_scope.__exit__(None, None, None) - wait_timer_cm.__exit__(None, None, None) return r - # Tracing - wait_span_scope = start_active_span("ratelimit wait") - wait_span_scope.__enter__() - # Metrics - wait_timer_cm = queue_wait_timer.time() - wait_timer_cm.__enter__() - ret_defer.addCallbacks(on_start, on_err) ret_defer.addBoth(on_both) return make_deferred_yieldable(ret_defer) -- cgit 1.5.1
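The last patch is the payoff of the two tracing/metrics commits before it: instead of entering the span and the timer by hand (wait_span_scope.__enter__() / wait_timer_cm.__enter__()) and exiting them from a Deferred callback, which is what the commit message calls incorrect juggling of logging contexts, both context managers now live inside a single async helper that is converted back into a Deferred with defer.ensureDeferred. Here is a stripped-down sketch of that final shape; TinyLimiter and its stubbed _on_enter are illustrative placeholders, not Synapse's real _PerHostRatelimiter, and the no-op OpenTracing tracer stands in for start_active_span.

    # The wait is wrapped in one coroutine so the tracing scope and the
    # histogram timer open and close around the same await.
    import opentracing  # no-op global tracer by default
    from prometheus_client import Histogram
    from twisted.internet import defer

    queue_wait_timer = Histogram("demo_rate_limit_queue_wait_time_seconds", "sec")


    class TinyLimiter:
        def _on_enter(self, request_id: object) -> "defer.Deferred[None]":
            # Stand-in for the real queueing/sleeping logic: immediately ready.
            return defer.succeed(None)

        async def _on_enter_with_tracing(self, request_id: object) -> None:
            # Both context managers are entered and exited inside this one
            # coroutine, even if the wait fails, so nothing needs cleaning up
            # from addCallbacks/addBoth handlers.
            with opentracing.tracer.start_active_span("ratelimit wait"), queue_wait_timer.time():
                await self._on_enter(request_id)

        def ratelimit(self) -> "defer.Deferred[None]":
            request_id = object()
            return defer.ensureDeferred(self._on_enter_with_tracing(request_id))


    d = TinyLimiter().ratelimit()
    d.addCallback(lambda _: print("request admitted"))

Keeping the with statements inside the coroutine also means the on_both cleanup callback in the real code no longer needs to know about tracing or metrics at all, which is exactly what the final diff removes.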