From 77dfc1f93967f4157ba961c3b5201206c1bbf797 Mon Sep 17 00:00:00 2001 From: Patrick Cloke Date: Tue, 17 Oct 2023 07:32:40 -0400 Subject: Fix a bug where servers could be marked as up when they were failing (#16506) After this change a server will only be reported as back online if they were previously having requests fail. --- synapse/util/retryutils.py | 30 +++++++++++++++++------------- 1 file changed, 17 insertions(+), 13 deletions(-) (limited to 'synapse') diff --git a/synapse/util/retryutils.py b/synapse/util/retryutils.py index 0e1f907667..547202c96b 100644 --- a/synapse/util/retryutils.py +++ b/synapse/util/retryutils.py @@ -170,10 +170,10 @@ class RetryDestinationLimiter: database in milliseconds, or zero if the last request was successful. backoff_on_404: Back off if we get a 404 - backoff_on_failure: set to False if we should not increase the retry interval on a failure. - + notifier: A notifier used to mark servers as up. + replication_client A replication client used to mark servers as up. backoff_on_all_error_codes: Whether we should back off on any error code. """ @@ -237,6 +237,9 @@ class RetryDestinationLimiter: else: valid_err_code = False + # Whether previous requests to the destination had been failing. + previously_failing = bool(self.failure_ts) + if success: # We connected successfully. if not self.retry_interval: @@ -282,6 +285,9 @@ class RetryDestinationLimiter: if self.failure_ts is None: self.failure_ts = retry_last_ts + # Whether the current request to the destination had been failing. + currently_failing = bool(self.failure_ts) + async def store_retry_timings() -> None: try: await self.store.set_destination_retry_timings( @@ -291,17 +297,15 @@ class RetryDestinationLimiter: self.retry_interval, ) - if self.notifier: - # Inform the relevant places that the remote server is back up. - self.notifier.notify_remote_server_up(self.destination) - - if self.replication_client: - # If we're on a worker we try and inform master about this. The - # replication client doesn't hook into the notifier to avoid - # infinite loops where we send a `REMOTE_SERVER_UP` command to - # master, which then echoes it back to us which in turn pokes - # the notifier. - self.replication_client.send_remote_server_up(self.destination) + # If the server was previously failing, but is no longer. + if previously_failing and not currently_failing: + if self.notifier: + # Inform the relevant places that the remote server is back up. + self.notifier.notify_remote_server_up(self.destination) + + if self.replication_client: + # Inform other workers that the remote server is up. + self.replication_client.send_remote_server_up(self.destination) except Exception: logger.exception("Failed to store destination_retry_timings") -- cgit 1.4.1