diff options
author | Patrick Cloke <clokep@users.noreply.github.com> | 2023-10-17 07:32:40 -0400 |
---|---|---|
committer | GitHub <noreply@github.com> | 2023-10-17 07:32:40 -0400 |
commit | 77dfc1f93967f4157ba961c3b5201206c1bbf797 (patch) | |
tree | 4c894e88055e182e93614323c68764017b2ddbb1 /synapse/util/retryutils.py | |
parent | Update the release script to remind releaser to check for special release not... (diff) | |
download | synapse-77dfc1f93967f4157ba961c3b5201206c1bbf797.tar.xz |
Fix a bug where servers could be marked as up when they were failing (#16506)
After this change a server will only be reported as back online if they were previously having requests fail.
Diffstat (limited to 'synapse/util/retryutils.py')
-rw-r--r-- | synapse/util/retryutils.py | 30 |
1 files changed, 17 insertions, 13 deletions
diff --git a/synapse/util/retryutils.py b/synapse/util/retryutils.py index 0e1f907667..547202c96b 100644 --- a/synapse/util/retryutils.py +++ b/synapse/util/retryutils.py @@ -170,10 +170,10 @@ class RetryDestinationLimiter: database in milliseconds, or zero if the last request was successful. backoff_on_404: Back off if we get a 404 - backoff_on_failure: set to False if we should not increase the retry interval on a failure. - + notifier: A notifier used to mark servers as up. + replication_client A replication client used to mark servers as up. backoff_on_all_error_codes: Whether we should back off on any error code. """ @@ -237,6 +237,9 @@ class RetryDestinationLimiter: else: valid_err_code = False + # Whether previous requests to the destination had been failing. + previously_failing = bool(self.failure_ts) + if success: # We connected successfully. if not self.retry_interval: @@ -282,6 +285,9 @@ class RetryDestinationLimiter: if self.failure_ts is None: self.failure_ts = retry_last_ts + # Whether the current request to the destination had been failing. + currently_failing = bool(self.failure_ts) + async def store_retry_timings() -> None: try: await self.store.set_destination_retry_timings( @@ -291,17 +297,15 @@ class RetryDestinationLimiter: self.retry_interval, ) - if self.notifier: - # Inform the relevant places that the remote server is back up. - self.notifier.notify_remote_server_up(self.destination) - - if self.replication_client: - # If we're on a worker we try and inform master about this. The - # replication client doesn't hook into the notifier to avoid - # infinite loops where we send a `REMOTE_SERVER_UP` command to - # master, which then echoes it back to us which in turn pokes - # the notifier. - self.replication_client.send_remote_server_up(self.destination) + # If the server was previously failing, but is no longer. + if previously_failing and not currently_failing: + if self.notifier: + # Inform the relevant places that the remote server is back up. + self.notifier.notify_remote_server_up(self.destination) + + if self.replication_client: + # Inform other workers that the remote server is up. + self.replication_client.send_remote_server_up(self.destination) except Exception: logger.exception("Failed to store destination_retry_timings") |