summary refs log tree commit diff
path: root/synapse/util
diff options
context:
space:
mode:
authorErik Johnston <erikj@matrix.org>2023-09-04 14:04:43 +0100
committerGitHub <noreply@github.com>2023-09-04 14:04:43 +0100
commitf84baecb6f57b5ddb570c43574f774fae5e8afed (patch)
tree2caa954af94803230112bdc64f885588afa82027 /synapse/util
parentBump furo from 2023.7.26 to 2023.8.19 (#16238) (diff)
downloadsynapse-f84baecb6f57b5ddb570c43574f774fae5e8afed.tar.xz
Don't reset retry timers on "valid" error codes (#16221)
Diffstat (limited to 'synapse/util')
-rw-r--r--synapse/util/retryutils.py18
1 files changed, 16 insertions, 2 deletions
diff --git a/synapse/util/retryutils.py b/synapse/util/retryutils.py
index 27e9fc976c..9d2065372c 100644
--- a/synapse/util/retryutils.py
+++ b/synapse/util/retryutils.py
@@ -128,6 +128,7 @@ class RetryDestinationLimiter:
         backoff_on_failure: bool = True,
         notifier: Optional["Notifier"] = None,
         replication_client: Optional["ReplicationCommandHandler"] = None,
+        backoff_on_all_error_codes: bool = False,
     ):
         """Marks the destination as "down" if an exception is thrown in the
         context, except for CodeMessageException with code < 500.
@@ -147,6 +148,9 @@ class RetryDestinationLimiter:
 
             backoff_on_failure: set to False if we should not increase the
                 retry interval on a failure.
+
+            backoff_on_all_error_codes: Whether we should back off on any
+                error code.
         """
         self.clock = clock
         self.store = store
@@ -156,6 +160,7 @@ class RetryDestinationLimiter:
         self.retry_interval = retry_interval
         self.backoff_on_404 = backoff_on_404
         self.backoff_on_failure = backoff_on_failure
+        self.backoff_on_all_error_codes = backoff_on_all_error_codes
 
         self.notifier = notifier
         self.replication_client = replication_client
@@ -179,6 +184,7 @@ class RetryDestinationLimiter:
         exc_val: Optional[BaseException],
         exc_tb: Optional[TracebackType],
     ) -> None:
+        success = exc_type is None
         valid_err_code = False
         if exc_type is None:
             valid_err_code = True
@@ -195,7 +201,9 @@ class RetryDestinationLimiter:
             # won't accept our requests for at least a while.
             # 429 is us being aggressively rate limited, so lets rate limit
             # ourselves.
-            if exc_val.code == 404 and self.backoff_on_404:
+            if self.backoff_on_all_error_codes:
+                valid_err_code = False
+            elif exc_val.code == 404 and self.backoff_on_404:
                 valid_err_code = False
             elif exc_val.code in (401, 429):
                 valid_err_code = False
@@ -204,7 +212,7 @@ class RetryDestinationLimiter:
             else:
                 valid_err_code = False
 
-        if valid_err_code:
+        if success:
             # We connected successfully.
             if not self.retry_interval:
                 return
@@ -215,6 +223,12 @@ class RetryDestinationLimiter:
             self.failure_ts = None
             retry_last_ts = 0
             self.retry_interval = 0
+        elif valid_err_code:
+            # We got a potentially valid error code back. We don't reset the
+            # timers though, as the other side might actually be down anyway
+            # (e.g. some deprovisioned servers will always return a 404 or 403,
+            # and we don't want to keep resetting the retry timers for them).
+            return
         elif not self.backoff_on_failure:
             return
         else: