From d9f694932c64d68e791ecb4c860e911e21a0baeb Mon Sep 17 00:00:00 2001 From: Sean Quah <8349537+squahtx@users.noreply.github.com> Date: Thu, 30 Mar 2023 13:36:41 +0100 Subject: Fix spinloop during partial state sync when a prev event is in backoff (#15351) Previously, we would spin in a tight loop until `update_state_for_partial_state_event` stopped raising `FederationPullAttemptBackoffError`s. Replace the spinloop with a wait until the backoff period has expired. Signed-off-by: Sean Quah --- synapse/handlers/federation.py | 36 ++++++++++++++++++------------------ synapse/handlers/federation_event.py | 24 +++++++++++++++++------- 2 files changed, 35 insertions(+), 25 deletions(-) (limited to 'synapse/handlers') diff --git a/synapse/handlers/federation.py b/synapse/handlers/federation.py index 80156ef343..65461a0787 100644 --- a/synapse/handlers/federation.py +++ b/synapse/handlers/federation.py @@ -1949,27 +1949,25 @@ class FederationHandler: ) for event in events: for attempt in itertools.count(): + # We try a new destination on every iteration. try: - await self._federation_event_handler.update_state_for_partial_state_event( - destination, event - ) + while True: + try: + await self._federation_event_handler.update_state_for_partial_state_event( + destination, event + ) + break + except FederationPullAttemptBackoffError as e: + # We are in the backoff period for one of the event's + # prev_events. Wait it out and try again after. + logger.warning( + "%s; waiting for %d ms...", e, e.retry_after_ms + ) + await self.clock.sleep(e.retry_after_ms / 1000) + + # Success, no need to try the rest of the destinations. break - except FederationPullAttemptBackoffError as exc: - # Log a warning about why we failed to process the event (the error message - # for `FederationPullAttemptBackoffError` is pretty good) - logger.warning("_sync_partial_state_room: %s", exc) - # We do not record a failed pull attempt when we backoff fetching a missing - # `prev_event` because not being able to fetch the `prev_events` just means - # we won't be able to de-outlier the pulled event. But we can still use an - # `outlier` in the state/auth chain for another event. So we shouldn't stop - # a downstream event from trying to pull it. - # - # This avoids a cascade of backoff for all events in the DAG downstream from - # one event backoff upstream. except FederationError as e: - # TODO: We should `record_event_failed_pull_attempt` here, - # see https://github.com/matrix-org/synapse/issues/13700 - if attempt == len(destinations) - 1: # We have tried every remote server for this event. Give up. # TODO(faster_joins) giving up isn't the right thing to do @@ -1986,6 +1984,8 @@ class FederationHandler: destination, e, ) + # TODO: We should `record_event_failed_pull_attempt` here, + # see https://github.com/matrix-org/synapse/issues/13700 raise # Try the next remote server. diff --git a/synapse/handlers/federation_event.py b/synapse/handlers/federation_event.py index 648843cdbe..982c8d3b2f 100644 --- a/synapse/handlers/federation_event.py +++ b/synapse/handlers/federation_event.py @@ -140,6 +140,7 @@ class FederationEventHandler: """ def __init__(self, hs: "HomeServer"): + self._clock = hs.get_clock() self._store = hs.get_datastores().main self._storage_controllers = hs.get_storage_controllers() self._state_storage_controller = self._storage_controllers.state @@ -1038,8 +1039,8 @@ class FederationEventHandler: Raises: FederationPullAttemptBackoffError if we are are deliberately not attempting - to pull the given event over federation because we've already done so - recently and are backing off. + to pull one of the given event's `prev_event`s over federation because + we've already done so recently and are backing off. FederationError if we fail to get the state from the remote server after any missing `prev_event`s. """ @@ -1053,13 +1054,22 @@ class FederationEventHandler: # If we've already recently attempted to pull this missing event, don't # try it again so soon. Since we have to fetch all of the prev_events, we can # bail early here if we find any to ignore. - prevs_to_ignore = await self._store.get_event_ids_to_not_pull_from_backoff( - room_id, missing_prevs + prevs_with_pull_backoff = ( + await self._store.get_event_ids_to_not_pull_from_backoff( + room_id, missing_prevs + ) ) - if len(prevs_to_ignore) > 0: + if len(prevs_with_pull_backoff) > 0: raise FederationPullAttemptBackoffError( - event_ids=prevs_to_ignore, - message=f"While computing context for event={event_id}, not attempting to pull missing prev_event={prevs_to_ignore[0]} because we already tried to pull recently (backing off).", + event_ids=prevs_with_pull_backoff.keys(), + message=( + f"While computing context for event={event_id}, not attempting to " + f"pull missing prev_events={list(prevs_with_pull_backoff.keys())} " + "because we already tried to pull recently (backing off)." + ), + retry_after_ms=( + max(prevs_with_pull_backoff.values()) - self._clock.time_msec() + ), ) if not missing_prevs: -- cgit 1.4.1