1 files changed, 41 insertions, 2 deletions
diff --git a/synapse/util/retryutils.py b/synapse/util/retryutils.py
index 27e9fc976c..0e1f907667 100644
--- a/synapse/util/retryutils.py
+++ b/synapse/util/retryutils.py
@@ -19,6 +19,7 @@ from typing import TYPE_CHECKING, Any, Optional, Type
 from synapse.api.errors import CodeMessageException
 from synapse.metrics.background_process_metrics import run_as_background_process
 from synapse.storage import DataStore
+from synapse.types import StrCollection
 from synapse.util import Clock
 
 if TYPE_CHECKING:
@@ -116,6 +117,30 @@ async def get_retry_limiter(
     )
 
 
+async def filter_destinations_by_retry_limiter(
+    destinations: StrCollection,
+    clock: Clock,
+    store: DataStore,
+    retry_due_within_ms: int = 0,
+) -> StrCollection:
+    """Filter down the list of destinations to only those that will are either
+    alive or due for a retry (within `retry_due_within_ms`)
+    """
+    if not destinations:
+        return destinations
+
+    retry_timings = await store.get_destination_retry_timings_batch(destinations)
+
+    now = int(clock.time_msec())
+
+    return [
+        destination
+        for destination, timings in retry_timings.items()
+        if timings is None
+        or timings.retry_last_ts + timings.retry_interval <= now + retry_due_within_ms
+    ]
+
+
 class RetryDestinationLimiter:
     def __init__(
         self,
@@ -128,6 +153,7 @@ class RetryDestinationLimiter:
         backoff_on_failure: bool = True,
         notifier: Optional["Notifier"] = None,
         replication_client: Optional["ReplicationCommandHandler"] = None,
+        backoff_on_all_error_codes: bool = False,
     ):
         """Marks the destination as "down" if an exception is thrown in the
         context, except for CodeMessageException with code < 500.
@@ -147,6 +173,9 @@ class RetryDestinationLimiter:
 
             backoff_on_failure: set to False if we should not increase the
                 retry interval on a failure.
+
+            backoff_on_all_error_codes: Whether we should back off on any
+                error code.
         """
         self.clock = clock
         self.store = store
@@ -156,6 +185,7 @@ class RetryDestinationLimiter:
         self.retry_interval = retry_interval
         self.backoff_on_404 = backoff_on_404
         self.backoff_on_failure = backoff_on_failure
+        self.backoff_on_all_error_codes = backoff_on_all_error_codes
 
         self.notifier = notifier
         self.replication_client = replication_client
@@ -179,6 +209,7 @@ class RetryDestinationLimiter:
         exc_val: Optional[BaseException],
         exc_tb: Optional[TracebackType],
     ) -> None:
+        success = exc_type is None
         valid_err_code = False
         if exc_type is None:
             valid_err_code = True
@@ -195,7 +226,9 @@ class RetryDestinationLimiter:
             # won't accept our requests for at least a while.
             # 429 is us being aggressively rate limited, so lets rate limit
             # ourselves.
-            if exc_val.code == 404 and self.backoff_on_404:
+            if self.backoff_on_all_error_codes:
+                valid_err_code = False
+            elif exc_val.code == 404 and self.backoff_on_404:
                 valid_err_code = False
             elif exc_val.code in (401, 429):
                 valid_err_code = False
@@ -204,7 +237,7 @@ class RetryDestinationLimiter:
             else:
                 valid_err_code = False
 
-        if valid_err_code:
+        if success:
             # We connected successfully.
             if not self.retry_interval:
                 return
@@ -215,6 +248,12 @@ class RetryDestinationLimiter:
             self.failure_ts = None
             retry_last_ts = 0
             self.retry_interval = 0
+        elif valid_err_code:
+            # We got a potentially valid error code back. We don't reset the
+            # timers though, as the other side might actually be down anyway
+            # (e.g. some deprovisioned servers will always return a 404 or 403,
+            # and we don't want to keep resetting the retry timers for them).
+            return
         elif not self.backoff_on_failure:
             return
         else: