From dce6e9e0c11fc5d99b2da6698aed04e9f525f242 Mon Sep 17 00:00:00 2001 From: Richard van der Hoff Date: Mon, 3 Jun 2019 23:50:43 +0100 Subject: Avoid rapidly backing-off a server if we ignore the retry interval --- synapse/util/retryutils.py | 60 ++++++++++++++++++++++++++++------------------ 1 file changed, 37 insertions(+), 23 deletions(-) (limited to 'synapse/util/retryutils.py') diff --git a/synapse/util/retryutils.py b/synapse/util/retryutils.py index 26cce7d197..f6dfa77d8f 100644 --- a/synapse/util/retryutils.py +++ b/synapse/util/retryutils.py @@ -46,8 +46,7 @@ class NotRetryingDestination(Exception): @defer.inlineCallbacks -def get_retry_limiter(destination, clock, store, ignore_backoff=False, - **kwargs): +def get_retry_limiter(destination, clock, store, ignore_backoff=False, **kwargs): """For a given destination check if we have previously failed to send a request there and are waiting before retrying the destination. If we are not ready to retry the destination, this will raise a @@ -60,8 +59,7 @@ def get_retry_limiter(destination, clock, store, ignore_backoff=False, clock (synapse.util.clock): timing source store (synapse.storage.transactions.TransactionStore): datastore ignore_backoff (bool): true to ignore the historical backoff data and - try the request anyway. We will still update the next - retry_interval on success/failure. + try the request anyway. We will still reset the retry_interval on success. Example usage: @@ -75,13 +73,12 @@ def get_retry_limiter(destination, clock, store, ignore_backoff=False, """ retry_last_ts, retry_interval = (0, 0) - retry_timings = yield store.get_destination_retry_timings( - destination - ) + retry_timings = yield store.get_destination_retry_timings(destination) if retry_timings: retry_last_ts, retry_interval = ( - retry_timings["retry_last_ts"], retry_timings["retry_interval"] + retry_timings["retry_last_ts"], + retry_timings["retry_interval"], ) now = int(clock.time_msec()) @@ -93,22 +90,31 @@ def get_retry_limiter(destination, clock, store, ignore_backoff=False, destination=destination, ) + # if we are ignoring the backoff data, we should also not increment the backoff + # when we get another failure - otherwise a server can very quickly reach the + # maximum backoff even though it might only have been down briefly + backoff_on_failure = not ignore_backoff + defer.returnValue( RetryDestinationLimiter( - destination, - clock, - store, - retry_interval, - **kwargs + destination, clock, store, retry_interval, backoff_on_failure, **kwargs ) ) class RetryDestinationLimiter(object): - def __init__(self, destination, clock, store, retry_interval, - min_retry_interval=10 * 60 * 1000, - max_retry_interval=24 * 60 * 60 * 1000, - multiplier_retry_interval=5, backoff_on_404=False): + def __init__( + self, + destination, + clock, + store, + retry_interval, + min_retry_interval=10 * 60 * 1000, + max_retry_interval=24 * 60 * 60 * 1000, + multiplier_retry_interval=5, + backoff_on_404=False, + backoff_on_failure=True, + ): """Marks the destination as "down" if an exception is thrown in the context, except for CodeMessageException with code < 500. @@ -128,6 +134,9 @@ class RetryDestinationLimiter(object): multiplier_retry_interval (int): The multiplier to use to increase the retry interval after a failed request. backoff_on_404 (bool): Back off if we get a 404 + + backoff_on_failure (bool): set to False if we should not increase the + retry interval on a failure. """ self.clock = clock self.store = store @@ -138,6 +147,7 @@ class RetryDestinationLimiter(object): self.max_retry_interval = max_retry_interval self.multiplier_retry_interval = multiplier_retry_interval self.backoff_on_404 = backoff_on_404 + self.backoff_on_failure = backoff_on_failure def __enter__(self): pass @@ -173,10 +183,13 @@ class RetryDestinationLimiter(object): if not self.retry_interval: return - logger.debug("Connection to %s was successful; clearing backoff", - self.destination) + logger.debug( + "Connection to %s was successful; clearing backoff", self.destination + ) retry_last_ts = 0 self.retry_interval = 0 + elif not self.backoff_on_failure: + return else: # We couldn't connect. if self.retry_interval: @@ -190,7 +203,10 @@ class RetryDestinationLimiter(object): logger.info( "Connection to %s was unsuccessful (%s(%s)); backoff now %i", - self.destination, exc_type, exc_val, self.retry_interval + self.destination, + exc_type, + exc_val, + self.retry_interval, ) retry_last_ts = int(self.clock.time_msec()) @@ -201,9 +217,7 @@ class RetryDestinationLimiter(object): self.destination, retry_last_ts, self.retry_interval ) except Exception: - logger.exception( - "Failed to store destination_retry_timings", - ) + logger.exception("Failed to store destination_retry_timings") # we deliberately do this in the background. synapse.util.logcontext.run_in_background(store_retry_timings) -- cgit 1.4.1 From aa530e68005d041ad037b081fe748d301a11291a Mon Sep 17 00:00:00 2001 From: Richard van der Hoff <1389908+richvdh@users.noreply.github.com> Date: Tue, 4 Jun 2019 22:02:53 +0100 Subject: Call RetryLimiter correctly (#5340) Fixes a regression introduced in #5335. --- changelog.d/5340.bugfix | 2 ++ synapse/util/retryutils.py | 7 ++++++- 2 files changed, 8 insertions(+), 1 deletion(-) create mode 100644 changelog.d/5340.bugfix (limited to 'synapse/util/retryutils.py') diff --git a/changelog.d/5340.bugfix b/changelog.d/5340.bugfix new file mode 100644 index 0000000000..931ee904e1 --- /dev/null +++ b/changelog.d/5340.bugfix @@ -0,0 +1,2 @@ +Fix a bug where we could rapidly mark a server as unreachable even though it was only down for a few minutes. + diff --git a/synapse/util/retryutils.py b/synapse/util/retryutils.py index f6dfa77d8f..1a77456498 100644 --- a/synapse/util/retryutils.py +++ b/synapse/util/retryutils.py @@ -97,7 +97,12 @@ def get_retry_limiter(destination, clock, store, ignore_backoff=False, **kwargs) defer.returnValue( RetryDestinationLimiter( - destination, clock, store, retry_interval, backoff_on_failure, **kwargs + destination, + clock, + store, + retry_interval, + backoff_on_failure=backoff_on_failure, + **kwargs ) ) -- cgit 1.4.1