diff --git a/synapse/util/retryutils.py b/synapse/util/retryutils.py
index e2de7fce91..4fa9d1a03c 100644
--- a/synapse/util/retryutils.py
+++ b/synapse/util/retryutils.py
@@ -12,7 +12,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-
+import synapse.util.logcontext
 from twisted.internet import defer
 
 from synapse.api.errors import CodeMessageException
@@ -35,7 +35,8 @@ class NotRetryingDestination(Exception):
 
 
 @defer.inlineCallbacks
-def get_retry_limiter(destination, clock, store, **kwargs):
+def get_retry_limiter(destination, clock, store, ignore_backoff=False,
+                      **kwargs):
     """For a given destination check if we have previously failed to
     send a request there and are waiting before retrying the destination.
     If we are not ready to retry the destination, this will raise a
@@ -43,6 +44,14 @@ def get_retry_limiter(destination, clock, store, **kwargs):
     that will mark the destination as down if an exception is thrown (excluding
     CodeMessageException with code < 500)
 
+    Args:
+        destination (str): name of homeserver
+        clock (synapse.util.Clock): timing source
+        store (synapse.storage.transactions.TransactionStore): datastore
+        ignore_backoff (bool): true to ignore the historical backoff data and
+            try the request anyway. We will still update the next
+            retry_interval on success/failure.
+
     Example usage:
 
         try:
@@ -66,7 +75,7 @@ def get_retry_limiter(destination, clock, store, **kwargs):
 
         now = int(clock.time_msec())
 
-        if retry_last_ts + retry_interval > now:
+        if not ignore_backoff and retry_last_ts + retry_interval > now:
             raise NotRetryingDestination(
                 retry_last_ts=retry_last_ts,
                 retry_interval=retry_interval,
@@ -88,7 +97,7 @@ class RetryDestinationLimiter(object):
     def __init__(self, destination, clock, store, retry_interval,
                  min_retry_interval=10 * 60 * 1000,
                  max_retry_interval=24 * 60 * 60 * 1000,
-                 multiplier_retry_interval=5,):
+                 multiplier_retry_interval=5, backoff_on_404=False):
         """Marks the destination as "down" if an exception is thrown in the
         context, except for CodeMessageException with code < 500.
 
@@ -107,6 +116,7 @@ class RetryDestinationLimiter(object):
                 a failed request, in milliseconds.
             multiplier_retry_interval (int): The multiplier to use to increase
                 the retry interval after a failed request.
+            backoff_on_404 (bool): Back off if we get a 404
         """
         self.clock = clock
         self.store = store
@@ -116,20 +126,44 @@ class RetryDestinationLimiter(object):
         self.min_retry_interval = min_retry_interval
         self.max_retry_interval = max_retry_interval
         self.multiplier_retry_interval = multiplier_retry_interval
+        self.backoff_on_404 = backoff_on_404
 
     def __enter__(self):
         pass
 
     def __exit__(self, exc_type, exc_val, exc_tb):
         valid_err_code = False
-        if exc_type is not None and issubclass(exc_type, CodeMessageException):
-            valid_err_code = exc_val.code != 429 and 0 <= exc_val.code < 500
+        if exc_type is None:
+            valid_err_code = True
+        elif not issubclass(exc_type, Exception):
+            # avoid treating exceptions which don't derive from Exception as
+            # failures; this is mostly so as not to catch defer._DefGen_Return.
+            valid_err_code = True
+        elif issubclass(exc_type, CodeMessageException):
+            # Some error codes are perfectly fine for some APIs, whereas other
+            # APIs may expect never to receive e.g. a 404. It's important to
+            # handle 404 as some remote servers will return a 404 when the HS
+            # has been decommissioned.
+            # If we get a 401, then we should probably back off since they
+            # won't accept our requests for at least a while.
+            # 429 is us being aggressively rate limited, so let's rate limit
+            # ourselves.
+            if exc_val.code == 404 and self.backoff_on_404:
+                valid_err_code = False
+            elif exc_val.code in (401, 429):
+                valid_err_code = False
+            elif exc_val.code < 500:
+                valid_err_code = True
+            else:
+                valid_err_code = False
 
-        if exc_type is None or valid_err_code:
+        if valid_err_code:
             # We connected successfully.
             if not self.retry_interval:
                 return
 
+            logger.debug("Connection to %s was successful; clearing backoff",
+                         self.destination)
             retry_last_ts = 0
             self.retry_interval = 0
         else:
@@ -143,6 +177,10 @@ class RetryDestinationLimiter(object):
             else:
                 self.retry_interval = self.min_retry_interval
 
+            logger.debug(
+                "Connection to %s was unsuccessful (%s(%s)); backoff now %i",
+                self.destination, exc_type, exc_val, self.retry_interval
+            )
             retry_last_ts = int(self.clock.time_msec())
 
         @defer.inlineCallbacks
@@ -156,4 +194,5 @@ class RetryDestinationLimiter(object):
                     "Failed to store set_destination_retry_timings",
                 )
 
-        store_retry_timings()
+        # we deliberately do this in the background.
+        synapse.util.logcontext.preserve_fn(store_retry_timings)()
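For reference, a minimal usage sketch of the new ignore_backoff flag, following the pattern in the docstring's "Example usage" block; do_request here is a hypothetical placeholder for whatever function actually performs the federation request:

    from twisted.internet import defer

    from synapse.util.retryutils import get_retry_limiter

    @defer.inlineCallbacks
    def request_ignoring_backoff(destination, clock, store, do_request):
        # ignore_backoff=True skips the NotRetryingDestination check, so the
        # request is attempted even if the destination is marked as down, but
        # the retry_interval is still updated when the limiter exits.
        limiter = yield get_retry_limiter(
            destination, clock, store, ignore_backoff=True,
        )
        with limiter:
            response = yield do_request(destination)
        defer.returnValue(response)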
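The status-code handling added to __exit__ can be read as a small decision function. This is an illustrative restatement only (the helper is not part of the patch) and covers just the CodeMessageException branch:

    def is_non_failure_code(code, backoff_on_404=False):
        # True: treat the response as a success and clear the backoff.
        # False: treat it as a failure and grow the retry_interval.
        if code == 404:
            # 404 only counts as a failure when the caller asked for it,
            # e.g. because a 404 may mean the homeserver was decommissioned.
            return not backoff_on_404
        if code in (401, 429):
            # not authorised, or aggressively rate limited: back off.
            return False
        # other non-5xx codes mean the remote is working; 5xx backs off.
        return code < 500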