diff --git a/changelog.d/15754.misc b/changelog.d/15754.misc
new file mode 100644
index 0000000000..4314d415a3
--- /dev/null
+++ b/changelog.d/15754.misc
@@ -0,0 +1 @@
+Allow for the configuration of the backoff algorithm for federation destinations.
diff --git a/docs/usage/configuration/config_documentation.md b/docs/usage/configuration/config_documentation.md
index 4e6fcd085a..c32608da2b 100644
--- a/docs/usage/configuration/config_documentation.md
+++ b/docs/usage/configuration/config_documentation.md
@@ -1242,6 +1242,14 @@ like sending a federation transaction.
* `max_short_retries`: maximum number of retries for the short retry algo. Default to 3 attempts.
* `max_long_retries`: maximum number of retries for the long retry algo. Default to 10 attempts.
+The following options control the retry logic when communicating with a specific homeserver destination.
+Unlike the previous configuration options, these values apply across all requests
+for a given destination and the state of the backoff is stored in the database.
+
+* `destination_min_retry_interval`: the initial backoff, after the first request fails. Defaults to 10m.
+* `destination_retry_multiplier`: how much we multiply the backoff by after each subsequent failure. Defaults to 2.
+* `destination_max_retry_interval`: a cap on the backoff. Defaults to a week.
+
Example configuration:
```yaml
federation:
@@ -1250,6 +1258,9 @@ federation:
  max_long_retry_delay: 100s
  max_short_retries: 5
  max_long_retries: 20
+  destination_min_retry_interval: 30s
+  destination_retry_multiplier: 5
+  destination_max_retry_interval: 12h
```
---
## Caching
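As a rough illustration of the documented defaults (10m initial interval, multiplier of 2, cap of one week), here is a minimal standalone sketch of how the per-destination backoff would grow over consecutive failures. The `random.uniform(0.8, 1.4)` jitter that the patch applies is deliberately omitted, and every name in the snippet is illustrative rather than part of the patch.

```python
MINUTE_MS = 60 * 1000
WEEK_MS = 7 * 24 * 60 * MINUTE_MS

def backoff_progression(
    min_interval_ms: int = 10 * MINUTE_MS,  # destination_min_retry_interval (10m)
    multiplier: int = 2,                    # destination_retry_multiplier
    max_interval_ms: int = WEEK_MS,         # destination_max_retry_interval (7d)
    failures: int = 12,
):
    """Return the jitter-free backoff interval after each consecutive failure."""
    intervals = []
    interval = 0
    for _ in range(failures):
        interval = min_interval_ms if interval == 0 else min(interval * multiplier, max_interval_ms)
        intervals.append(interval)
    return intervals

# 10m, 20m, 40m, ... doubling until the one-week cap (10080 minutes) is hit.
print([i // MINUTE_MS for i in backoff_progression()])
```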
diff --git a/synapse/config/federation.py b/synapse/config/federation.py
index 0e1cb8b6e3..97636039b8 100644
--- a/synapse/config/federation.py
+++ b/synapse/config/federation.py
@@ -65,5 +65,23 @@ class FederationConfig(Config):
        self.max_long_retries = federation_config.get("max_long_retries", 10)
        self.max_short_retries = federation_config.get("max_short_retries", 3)
+        # Allow for the configuration of the backoff algorithm used
+        # when trying to reach an unavailable destination.
+        # Unlike the previous options, these values apply across multiple
+        # requests and the state of the backoff is stored in the database.
+        self.destination_min_retry_interval_ms = Config.parse_duration(
+            federation_config.get("destination_min_retry_interval", "10m")
+        )
+        self.destination_retry_multiplier = federation_config.get(
+            "destination_retry_multiplier", 2
+        )
+        self.destination_max_retry_interval_ms = min(
+            Config.parse_duration(
+                federation_config.get("destination_max_retry_interval", "7d")
+            ),
+            # Set a hard limit to avoid overflowing the database column.
+            2**62,
+        )
+
_METRICS_FOR_DOMAINS_SCHEMA = {"type": "array", "items": {"type": "string"}}
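To make the unit handling above concrete, the following sketch mimics (but does not use) `Config.parse_duration` for a handful of suffixes and shows why the result is clamped to `2**62`: the retry interval is persisted in what is presumably a signed 64-bit integer column, given that limit. The helper name and the supported units below are assumptions for illustration only.

```python
# Illustrative stand-in for duration parsing: "10m" -> 600000 ms, "7d" -> 604800000 ms.
_UNIT_MS = {"s": 1_000, "m": 60_000, "h": 3_600_000, "d": 86_400_000}

def parse_duration_ms(value: str) -> int:
    return int(value[:-1]) * _UNIT_MS[value[-1]]

# Clamp to 2**62 so the stored interval cannot overflow a signed 64-bit column
# (max 2**63 - 1); 2**62 ms is roughly 146 million years, i.e. effectively no cap.
destination_max_retry_interval_ms = min(parse_duration_ms("7d"), 2**62)
print(destination_max_retry_interval_ms)  # 604800000
```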
diff --git a/synapse/util/retryutils.py b/synapse/util/retryutils.py
index dcc037b982..27e9fc976c 100644
--- a/synapse/util/retryutils.py
+++ b/synapse/util/retryutils.py
@@ -27,15 +27,6 @@ if TYPE_CHECKING:
logger = logging.getLogger(__name__)
-# the initial backoff, after the first transaction fails
-MIN_RETRY_INTERVAL = 10 * 60 * 1000
-
-# how much we multiply the backoff by after each subsequent fail
-RETRY_MULTIPLIER = 5
-
-# a cap on the backoff. (Essentially none)
-MAX_RETRY_INTERVAL = 2**62
-
class NotRetryingDestination(Exception):
def __init__(self, retry_last_ts: int, retry_interval: int, destination: str):
@@ -169,6 +160,16 @@ class RetryDestinationLimiter:
        self.notifier = notifier
        self.replication_client = replication_client
+        self.destination_min_retry_interval_ms = (
+            self.store.hs.config.federation.destination_min_retry_interval_ms
+        )
+        self.destination_retry_multiplier = (
+            self.store.hs.config.federation.destination_retry_multiplier
+        )
+        self.destination_max_retry_interval_ms = (
+            self.store.hs.config.federation.destination_max_retry_interval_ms
+        )
+
    def __enter__(self) -> None:
        pass
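The next hunk replaces the module-level constants with these per-destination config values when growing the backoff. As a standalone sketch of the resulting update rule (illustrative function name and signature, not Synapse's actual API): the first failure starts at the configured minimum, and each later failure multiplies the previous interval, applies the existing `random.uniform(0.8, 1.4)` jitter, and clamps to the configured maximum.

```python
import random

def next_retry_interval_ms(
    current_interval_ms: int,
    min_interval_ms: int,
    multiplier: float,
    max_interval_ms: int,
) -> int:
    """Next per-destination backoff interval after a failed request (sketch only)."""
    if current_interval_ms:
        # Grow the previous interval, apply +/- jitter, then cap it.
        grown = int(current_interval_ms * multiplier * random.uniform(0.8, 1.4))
        return min(grown, max_interval_ms)
    # First failure for this destination: start at the configured minimum.
    return min_interval_ms

# Second failure under the defaults (10m minimum, multiplier 2, 7d cap):
print(next_retry_interval_ms(600_000, 600_000, 2, 604_800_000))
```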
@@ -220,13 +221,15 @@ class RetryDestinationLimiter:
            # We couldn't connect.
            if self.retry_interval:
                self.retry_interval = int(
-                    self.retry_interval * RETRY_MULTIPLIER * random.uniform(0.8, 1.4)
+                    self.retry_interval
+                    * self.destination_retry_multiplier
+                    * random.uniform(0.8, 1.4)
                )
-                if self.retry_interval >= MAX_RETRY_INTERVAL:
-                    self.retry_interval = MAX_RETRY_INTERVAL
+                if self.retry_interval >= self.destination_max_retry_interval_ms:
+                    self.retry_interval = self.destination_max_retry_interval_ms
            else:
-                self.retry_interval = MIN_RETRY_INTERVAL
+                self.retry_interval = self.destination_min_retry_interval_ms
            logger.info(
                "Connection to %s was unsuccessful (%s(%s)); backoff now %i",
diff --git a/tests/storage/test_transactions.py b/tests/storage/test_transactions.py
index 2fab84a529..ef06b50dbb 100644
--- a/tests/storage/test_transactions.py
+++ b/tests/storage/test_transactions.py
@@ -17,7 +17,6 @@ from twisted.test.proto_helpers import MemoryReactor
from synapse.server import HomeServer
from synapse.storage.databases.main.transactions import DestinationRetryTimings
from synapse.util import Clock
-from synapse.util.retryutils import MAX_RETRY_INTERVAL
from tests.unittest import HomeserverTestCase
@@ -57,8 +56,14 @@ class TransactionStoreTestCase(HomeserverTestCase):
        self.get_success(d)
    def test_large_destination_retry(self) -> None:
+        max_retry_interval_ms = (
+            self.hs.config.federation.destination_max_retry_interval_ms
+        )
        d = self.store.set_destination_retry_timings(
-            "example.com", MAX_RETRY_INTERVAL, MAX_RETRY_INTERVAL, MAX_RETRY_INTERVAL
+            "example.com",
+            max_retry_interval_ms,
+            max_retry_interval_ms,
+            max_retry_interval_ms,
        )
        self.get_success(d)
diff --git a/tests/util/test_retryutils.py b/tests/util/test_retryutils.py
index 5f8f4e76b5..1277e1a865 100644
--- a/tests/util/test_retryutils.py
+++ b/tests/util/test_retryutils.py
@@ -11,12 +11,7 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
-from synapse.util.retryutils import (
- MIN_RETRY_INTERVAL,
- RETRY_MULTIPLIER,
- NotRetryingDestination,
- get_retry_limiter,
-)
+from synapse.util.retryutils import NotRetryingDestination, get_retry_limiter
from tests.unittest import HomeserverTestCase
@@ -42,6 +37,11 @@ class RetryLimiterTestCase(HomeserverTestCase):
        limiter = self.get_success(get_retry_limiter("test_dest", self.clock, store))
+        min_retry_interval_ms = (
+            self.hs.config.federation.destination_min_retry_interval_ms
+        )
+        retry_multiplier = self.hs.config.federation.destination_retry_multiplier
+
        self.pump(1)
        try:
            with limiter:
@@ -57,7 +57,7 @@ class RetryLimiterTestCase(HomeserverTestCase):
        assert new_timings is not None
        self.assertEqual(new_timings.failure_ts, failure_ts)
        self.assertEqual(new_timings.retry_last_ts, failure_ts)
-        self.assertEqual(new_timings.retry_interval, MIN_RETRY_INTERVAL)
+        self.assertEqual(new_timings.retry_interval, min_retry_interval_ms)
        # now if we try again we should get a failure
        self.get_failure(
@@ -68,7 +68,7 @@ class RetryLimiterTestCase(HomeserverTestCase):
        # advance the clock and try again
        #
-        self.pump(MIN_RETRY_INTERVAL)
+        self.pump(min_retry_interval_ms)
        limiter = self.get_success(get_retry_limiter("test_dest", self.clock, store))
        self.pump(1)
@@ -87,16 +87,16 @@ class RetryLimiterTestCase(HomeserverTestCase):
        self.assertEqual(new_timings.failure_ts, failure_ts)
        self.assertEqual(new_timings.retry_last_ts, retry_ts)
        self.assertGreaterEqual(
-            new_timings.retry_interval, MIN_RETRY_INTERVAL * RETRY_MULTIPLIER * 0.5
+            new_timings.retry_interval, min_retry_interval_ms * retry_multiplier * 0.5
        )
        self.assertLessEqual(
-            new_timings.retry_interval, MIN_RETRY_INTERVAL * RETRY_MULTIPLIER * 2.0
+            new_timings.retry_interval, min_retry_interval_ms * retry_multiplier * 2.0
        )
        #
        # one more go, with success
        #
-        self.reactor.advance(min_retry_interval_ms * retry_multiplier * 2.0)
+        self.reactor.advance(min_retry_interval_ms * retry_multiplier * 2.0)
        limiter = self.get_success(get_retry_limiter("test_dest", self.clock, store))
        self.pump(1)
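If a test wanted to exercise non-default values rather than reading them back from `hs.config`, something along these lines should work with the existing `override_config` helper from `tests.unittest`, assuming the `federation` section merges like other config sections; the test class and assertions below are illustrative and not part of the patch.

```python
from tests.unittest import HomeserverTestCase, override_config

class CustomDestinationBackoffTestCase(HomeserverTestCase):
    @override_config(
        {
            "federation": {
                "destination_min_retry_interval": "30s",
                "destination_retry_multiplier": 5,
                "destination_max_retry_interval": "12h",
            }
        }
    )
    def test_configured_values(self) -> None:
        # Durations are parsed into milliseconds by the config layer.
        self.assertEqual(
            self.hs.config.federation.destination_min_retry_interval_ms, 30_000
        )
        self.assertEqual(self.hs.config.federation.destination_retry_multiplier, 5)
        self.assertEqual(
            self.hs.config.federation.destination_max_retry_interval_ms,
            12 * 60 * 60 * 1000,
        )
```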
|