summary refs log tree commit diff
path: root/synapse
diff options
context:
space:
mode:
authorMathieu Velten <mathieuv@matrix.org>2023-08-03 20:36:55 +0200
committerGitHub <noreply@github.com>2023-08-03 14:36:55 -0400
commitf0a860908ba0309c89c9dba452d99b4f9c6928f7 (patch)
tree0246657c2956d574bbd57dcaae6965f41f9ac3de /synapse
parentAllow modules to check whether the current worker is configured to run backgr... (diff)
downloadsynapse-f0a860908ba0309c89c9dba452d99b4f9c6928f7.tar.xz
Allow config of the backoff algorithm for the federation client. (#15754)
Adds three new configuration variables:

* destination_min_retry_interval is identical to before (10mn).
* destination_retry_multiplier is now 2 instead of 5, the maximum value will
  be reached slower.
* destination_max_retry_interval is one day instead of (essentially) infinity.

Capping this will cause destinations to continue to be retried sometimes instead
of being lost forever. The previous value was 2 ^ 62 milliseconds.
Diffstat (limited to 'synapse')
-rw-r--r--synapse/config/federation.py18
-rw-r--r--synapse/util/retryutils.py29
2 files changed, 34 insertions, 13 deletions
diff --git a/synapse/config/federation.py b/synapse/config/federation.py
index 0e1cb8b6e3..97636039b8 100644
--- a/synapse/config/federation.py
+++ b/synapse/config/federation.py
@@ -65,5 +65,23 @@ class FederationConfig(Config):
         self.max_long_retries = federation_config.get("max_long_retries", 10)
         self.max_short_retries = federation_config.get("max_short_retries", 3)
 
+        # Allow for the configuration of the backoff algorithm used
+        # when trying to reach an unavailable destination.
+        # Unlike previous configuration those values applies across
+        # multiple requests and the state of the backoff is stored on DB.
+        self.destination_min_retry_interval_ms = Config.parse_duration(
+            federation_config.get("destination_min_retry_interval", "10m")
+        )
+        self.destination_retry_multiplier = federation_config.get(
+            "destination_retry_multiplier", 2
+        )
+        self.destination_max_retry_interval_ms = min(
+            Config.parse_duration(
+                federation_config.get("destination_max_retry_interval", "7d")
+            ),
+            # Set a hard-limit to not overflow the database column.
+            2**62,
+        )
+
 
 _METRICS_FOR_DOMAINS_SCHEMA = {"type": "array", "items": {"type": "string"}}
diff --git a/synapse/util/retryutils.py b/synapse/util/retryutils.py
index dcc037b982..27e9fc976c 100644
--- a/synapse/util/retryutils.py
+++ b/synapse/util/retryutils.py
@@ -27,15 +27,6 @@ if TYPE_CHECKING:
 
 logger = logging.getLogger(__name__)
 
-# the initial backoff, after the first transaction fails
-MIN_RETRY_INTERVAL = 10 * 60 * 1000
-
-# how much we multiply the backoff by after each subsequent fail
-RETRY_MULTIPLIER = 5
-
-# a cap on the backoff. (Essentially none)
-MAX_RETRY_INTERVAL = 2**62
-
 
 class NotRetryingDestination(Exception):
     def __init__(self, retry_last_ts: int, retry_interval: int, destination: str):
@@ -169,6 +160,16 @@ class RetryDestinationLimiter:
         self.notifier = notifier
         self.replication_client = replication_client
 
+        self.destination_min_retry_interval_ms = (
+            self.store.hs.config.federation.destination_min_retry_interval_ms
+        )
+        self.destination_retry_multiplier = (
+            self.store.hs.config.federation.destination_retry_multiplier
+        )
+        self.destination_max_retry_interval_ms = (
+            self.store.hs.config.federation.destination_max_retry_interval_ms
+        )
+
     def __enter__(self) -> None:
         pass
 
@@ -220,13 +221,15 @@ class RetryDestinationLimiter:
             # We couldn't connect.
             if self.retry_interval:
                 self.retry_interval = int(
-                    self.retry_interval * RETRY_MULTIPLIER * random.uniform(0.8, 1.4)
+                    self.retry_interval
+                    * self.destination_retry_multiplier
+                    * random.uniform(0.8, 1.4)
                 )
 
-                if self.retry_interval >= MAX_RETRY_INTERVAL:
-                    self.retry_interval = MAX_RETRY_INTERVAL
+                if self.retry_interval >= self.destination_max_retry_interval_ms:
+                    self.retry_interval = self.destination_max_retry_interval_ms
             else:
-                self.retry_interval = MIN_RETRY_INTERVAL
+                self.retry_interval = self.destination_min_retry_interval_ms
 
             logger.info(
                 "Connection to %s was unsuccessful (%s(%s)); backoff now %i",