diff options
Diffstat (limited to 'synapse/util/retryutils.py')
-rw-r--r-- | synapse/util/retryutils.py | 43 |
1 files changed, 41 insertions, 2 deletions
diff --git a/synapse/util/retryutils.py b/synapse/util/retryutils.py index 27e9fc976c..0e1f907667 100644 --- a/synapse/util/retryutils.py +++ b/synapse/util/retryutils.py @@ -19,6 +19,7 @@ from typing import TYPE_CHECKING, Any, Optional, Type from synapse.api.errors import CodeMessageException from synapse.metrics.background_process_metrics import run_as_background_process from synapse.storage import DataStore +from synapse.types import StrCollection from synapse.util import Clock if TYPE_CHECKING: @@ -116,6 +117,30 @@ async def get_retry_limiter( ) +async def filter_destinations_by_retry_limiter( + destinations: StrCollection, + clock: Clock, + store: DataStore, + retry_due_within_ms: int = 0, +) -> StrCollection: + """Filter down the list of destinations to only those that will are either + alive or due for a retry (within `retry_due_within_ms`) + """ + if not destinations: + return destinations + + retry_timings = await store.get_destination_retry_timings_batch(destinations) + + now = int(clock.time_msec()) + + return [ + destination + for destination, timings in retry_timings.items() + if timings is None + or timings.retry_last_ts + timings.retry_interval <= now + retry_due_within_ms + ] + + class RetryDestinationLimiter: def __init__( self, @@ -128,6 +153,7 @@ class RetryDestinationLimiter: backoff_on_failure: bool = True, notifier: Optional["Notifier"] = None, replication_client: Optional["ReplicationCommandHandler"] = None, + backoff_on_all_error_codes: bool = False, ): """Marks the destination as "down" if an exception is thrown in the context, except for CodeMessageException with code < 500. @@ -147,6 +173,9 @@ class RetryDestinationLimiter: backoff_on_failure: set to False if we should not increase the retry interval on a failure. + + backoff_on_all_error_codes: Whether we should back off on any + error code. """ self.clock = clock self.store = store @@ -156,6 +185,7 @@ class RetryDestinationLimiter: self.retry_interval = retry_interval self.backoff_on_404 = backoff_on_404 self.backoff_on_failure = backoff_on_failure + self.backoff_on_all_error_codes = backoff_on_all_error_codes self.notifier = notifier self.replication_client = replication_client @@ -179,6 +209,7 @@ class RetryDestinationLimiter: exc_val: Optional[BaseException], exc_tb: Optional[TracebackType], ) -> None: + success = exc_type is None valid_err_code = False if exc_type is None: valid_err_code = True @@ -195,7 +226,9 @@ class RetryDestinationLimiter: # won't accept our requests for at least a while. # 429 is us being aggressively rate limited, so lets rate limit # ourselves. - if exc_val.code == 404 and self.backoff_on_404: + if self.backoff_on_all_error_codes: + valid_err_code = False + elif exc_val.code == 404 and self.backoff_on_404: valid_err_code = False elif exc_val.code in (401, 429): valid_err_code = False @@ -204,7 +237,7 @@ class RetryDestinationLimiter: else: valid_err_code = False - if valid_err_code: + if success: # We connected successfully. if not self.retry_interval: return @@ -215,6 +248,12 @@ class RetryDestinationLimiter: self.failure_ts = None retry_last_ts = 0 self.retry_interval = 0 + elif valid_err_code: + # We got a potentially valid error code back. We don't reset the + # timers though, as the other side might actually be down anyway + # (e.g. some deprovisioned servers will always return a 404 or 403, + # and we don't want to keep resetting the retry timers for them). + return elif not self.backoff_on_failure: return else: |