From 77dfc1f93967f4157ba961c3b5201206c1bbf797 Mon Sep 17 00:00:00 2001
From: Patrick Cloke <clokep@users.noreply.github.com>
Date: Tue, 17 Oct 2023 07:32:40 -0400
Subject: Fix a bug where servers could be marked as up when they were failing
 (#16506)

After this change a server will only be reported as back online
if they were previously having requests fail.
---
 synapse/util/retryutils.py | 30 +++++++++++++++++-------------
 1 file changed, 17 insertions(+), 13 deletions(-)

(limited to 'synapse')

diff --git a/synapse/util/retryutils.py b/synapse/util/retryutils.py
index 0e1f907667..547202c96b 100644
--- a/synapse/util/retryutils.py
+++ b/synapse/util/retryutils.py
@@ -170,10 +170,10 @@ class RetryDestinationLimiter:
                 database in milliseconds, or zero if the last request was
                 successful.
             backoff_on_404: Back off if we get a 404
-
             backoff_on_failure: set to False if we should not increase the
                 retry interval on a failure.
-
+            notifier: A notifier used to mark servers as up.
+            replication_client A replication client used to mark servers as up.
             backoff_on_all_error_codes: Whether we should back off on any
                 error code.
         """
@@ -237,6 +237,9 @@ class RetryDestinationLimiter:
             else:
                 valid_err_code = False
 
+        # Whether previous requests to the destination had been failing.
+        previously_failing = bool(self.failure_ts)
+
         if success:
             # We connected successfully.
             if not self.retry_interval:
@@ -282,6 +285,9 @@ class RetryDestinationLimiter:
             if self.failure_ts is None:
                 self.failure_ts = retry_last_ts
 
+        # Whether the current request to the destination had been failing.
+        currently_failing = bool(self.failure_ts)
+
         async def store_retry_timings() -> None:
             try:
                 await self.store.set_destination_retry_timings(
@@ -291,17 +297,15 @@ class RetryDestinationLimiter:
                     self.retry_interval,
                 )
 
-                if self.notifier:
-                    # Inform the relevant places that the remote server is back up.
-                    self.notifier.notify_remote_server_up(self.destination)
-
-                if self.replication_client:
-                    # If we're on a worker we try and inform master about this. The
-                    # replication client doesn't hook into the notifier to avoid
-                    # infinite loops where we send a `REMOTE_SERVER_UP` command to
-                    # master, which then echoes it back to us which in turn pokes
-                    # the notifier.
-                    self.replication_client.send_remote_server_up(self.destination)
+                # If the server was previously failing, but is no longer.
+                if previously_failing and not currently_failing:
+                    if self.notifier:
+                        # Inform the relevant places that the remote server is back up.
+                        self.notifier.notify_remote_server_up(self.destination)
+
+                    if self.replication_client:
+                        # Inform other workers that the remote server is up.
+                        self.replication_client.send_remote_server_up(self.destination)
 
             except Exception:
                 logger.exception("Failed to store destination_retry_timings")
-- 
cgit 1.5.1