diff --git a/synapse/util/retryutils.py b/synapse/util/retryutils.py
index 27e9fc976c..9d2065372c 100644
--- a/synapse/util/retryutils.py
+++ b/synapse/util/retryutils.py
@@ -128,6 +128,7 @@ class RetryDestinationLimiter:
backoff_on_failure: bool = True,
notifier: Optional["Notifier"] = None,
replication_client: Optional["ReplicationCommandHandler"] = None,
+ backoff_on_all_error_codes: bool = False,
):
"""Marks the destination as "down" if an exception is thrown in the
context, except for CodeMessageException with code < 500.
@@ -147,6 +148,9 @@ class RetryDestinationLimiter:
backoff_on_failure: set to False if we should not increase the
retry interval on a failure.
+
+ backoff_on_all_error_codes: Whether we should back off on any
+ error code.
"""
self.clock = clock
self.store = store
@@ -156,6 +160,7 @@ class RetryDestinationLimiter:
self.retry_interval = retry_interval
self.backoff_on_404 = backoff_on_404
self.backoff_on_failure = backoff_on_failure
+ self.backoff_on_all_error_codes = backoff_on_all_error_codes
self.notifier = notifier
self.replication_client = replication_client
@@ -179,6 +184,7 @@ class RetryDestinationLimiter:
exc_val: Optional[BaseException],
exc_tb: Optional[TracebackType],
) -> None:
+ success = exc_type is None
valid_err_code = False
if exc_type is None:
valid_err_code = True
@@ -195,7 +201,9 @@ class RetryDestinationLimiter:
# won't accept our requests for at least a while.
# 429 is us being aggressively rate limited, so lets rate limit
# ourselves.
- if exc_val.code == 404 and self.backoff_on_404:
+ if self.backoff_on_all_error_codes:
+ valid_err_code = False
+ elif exc_val.code == 404 and self.backoff_on_404:
valid_err_code = False
elif exc_val.code in (401, 429):
valid_err_code = False
@@ -204,7 +212,7 @@ class RetryDestinationLimiter:
else:
valid_err_code = False
- if valid_err_code:
+ if success:
# We connected successfully.
if not self.retry_interval:
return
@@ -215,6 +223,12 @@ class RetryDestinationLimiter:
self.failure_ts = None
retry_last_ts = 0
self.retry_interval = 0
+ elif valid_err_code:
+ # We got a potentially valid error code back. We don't reset the
+ # timers though, as the other side might actually be down anyway
+ # (e.g. some deprovisioned servers will always return a 404 or 403,
+ # and we don't want to keep resetting the retry timers for them).
+ return
elif not self.backoff_on_failure:
return
else:
|