Fix a bug where servers could be marked as up when they were failing (#16506)
After this change a server will only be reported as back online
if they were previously having requests fail.
1 files changed, 17 insertions, 13 deletions
diff --git a/synapse/util/retryutils.py b/synapse/util/retryutils.py
index 0e1f907667..547202c96b 100644
--- a/synapse/util/retryutils.py
+++ b/synapse/util/retryutils.py
@@ -170,10 +170,10 @@ class RetryDestinationLimiter:
database in milliseconds, or zero if the last request was
successful.
backoff_on_404: Back off if we get a 404
-
backoff_on_failure: set to False if we should not increase the
retry interval on a failure.
-
+ notifier: A notifier used to mark servers as up.
+ replication_client A replication client used to mark servers as up.
backoff_on_all_error_codes: Whether we should back off on any
error code.
"""
@@ -237,6 +237,9 @@ class RetryDestinationLimiter:
else:
valid_err_code = False
+ # Whether previous requests to the destination had been failing.
+ previously_failing = bool(self.failure_ts)
+
if success:
# We connected successfully.
if not self.retry_interval:
@@ -282,6 +285,9 @@ class RetryDestinationLimiter:
if self.failure_ts is None:
self.failure_ts = retry_last_ts
+ # Whether the current request to the destination had been failing.
+ currently_failing = bool(self.failure_ts)
+
async def store_retry_timings() -> None:
try:
await self.store.set_destination_retry_timings(
@@ -291,17 +297,15 @@ class RetryDestinationLimiter:
self.retry_interval,
)
- if self.notifier:
- # Inform the relevant places that the remote server is back up.
- self.notifier.notify_remote_server_up(self.destination)
-
- if self.replication_client:
- # If we're on a worker we try and inform master about this. The
- # replication client doesn't hook into the notifier to avoid
- # infinite loops where we send a `REMOTE_SERVER_UP` command to
- # master, which then echoes it back to us which in turn pokes
- # the notifier.
- self.replication_client.send_remote_server_up(self.destination)
+ # If the server was previously failing, but is no longer.
+ if previously_failing and not currently_failing:
+ if self.notifier:
+ # Inform the relevant places that the remote server is back up.
+ self.notifier.notify_remote_server_up(self.destination)
+
+ if self.replication_client:
+ # Inform other workers that the remote server is up.
+ self.replication_client.send_remote_server_up(self.destination)
except Exception:
logger.exception("Failed to store destination_retry_timings")
|