diff --git a/changelog.d/16156.bugfix b/changelog.d/16156.bugfix
new file mode 100644
index 0000000000..17284297cf
--- /dev/null
+++ b/changelog.d/16156.bugfix
@@ -0,0 +1 @@
+Fix a bug introduced in Synapse 1.87 where Synapse would send an excessive number of federation requests to servers that have been offline for a long time. Contributed by Nico.
diff --git a/synapse/storage/databases/main/transactions.py b/synapse/storage/databases/main/transactions.py
index c3bd36efc9..48e4b0ba3c 100644
--- a/synapse/storage/databases/main/transactions.py
+++ b/synapse/storage/databases/main/transactions.py
@@ -242,6 +242,8 @@ class TransactionWorkerStore(CacheInvalidationWorkerStore):
) -> None:
# Upsert retry time interval if retry_interval is zero (i.e. we're
# resetting it) or greater than the existing retry interval.
+ # We also upsert when the new retry interval is the same as the existing one,
+ # which will be the case once `destination_max_retry_interval` has been reached.
#
# WARNING: This is executed in autocommit, so we shouldn't add any more
# SQL calls in here (without being very careful).
@@ -257,7 +259,7 @@ class TransactionWorkerStore(CacheInvalidationWorkerStore):
WHERE
EXCLUDED.retry_interval = 0
OR destinations.retry_interval IS NULL
- OR destinations.retry_interval < EXCLUDED.retry_interval
+ OR destinations.retry_interval <= EXCLUDED.retry_interval
"""
txn.execute(sql, (destination, failure_ts, retry_last_ts, retry_interval))
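For context, here is a minimal sketch, not Synapse's actual code, of why relaxing the comparison from `<` to `<=` matters. Once the backoff has been capped at `destination_max_retry_interval`, every further failure computes the same interval again, so a strict `<` comparison stops the row (and in particular `retry_last_ts`) from being refreshed, and the destination ends up being retried far more often than intended. The helper name `should_persist` and the cap value below are illustrative only.

# Illustrative sketch only -- not Synapse's implementation. The name
# `should_persist` and MAX_RETRY_INTERVAL_MS are made up for this example.
from typing import Optional

MAX_RETRY_INTERVAL_MS = 24 * 60 * 60 * 1000  # assumed cap, for illustration

def should_persist(
    existing_interval: Optional[int], new_interval: int, strict: bool
) -> bool:
    """Mirror the upsert's WHERE clause: persist on reset (0), on first
    failure (no existing row), or when the interval has grown (or, with
    strict=False, stayed the same)."""
    if new_interval == 0 or existing_interval is None:
        return True
    if strict:
        return existing_interval < new_interval  # old behaviour
    return existing_interval <= new_interval  # fixed behaviour

# Once the backoff has hit the cap, the newly computed interval equals the
# stored one, so the strict comparison never refreshes retry_last_ts:
assert not should_persist(MAX_RETRY_INTERVAL_MS, MAX_RETRY_INTERVAL_MS, strict=True)
assert should_persist(MAX_RETRY_INTERVAL_MS, MAX_RETRY_INTERVAL_MS, strict=False)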
diff --git a/tests/util/test_retryutils.py b/tests/util/test_retryutils.py
index 1277e1a865..4bcd17a6fc 100644
--- a/tests/util/test_retryutils.py
+++ b/tests/util/test_retryutils.py
@@ -108,3 +108,54 @@ class RetryLimiterTestCase(HomeserverTestCase):
new_timings = self.get_success(store.get_destination_retry_timings("test_dest"))
self.assertIsNone(new_timings)
+
+ def test_max_retry_interval(self) -> None:
+ """Test that `destination_max_retry_interval` setting works as expected"""
+ store = self.hs.get_datastores().main
+
+ destination_max_retry_interval_ms = (
+ self.hs.config.federation.destination_max_retry_interval_ms
+ )
+
+ self.get_success(get_retry_limiter("test_dest", self.clock, store))
+ self.pump(1)
+
+ failure_ts = self.clock.time_msec()
+
+ # Simulate reaching destination_max_retry_interval
+ self.get_success(
+ store.set_destination_retry_timings(
+ "test_dest",
+ failure_ts=failure_ts,
+ retry_last_ts=failure_ts,
+ retry_interval=destination_max_retry_interval_ms,
+ )
+ )
+
+ # Check it fails
+ self.get_failure(
+ get_retry_limiter("test_dest", self.clock, store), NotRetryingDestination
+ )
+
+ # Advance past the retry interval so we can try again, then raise an error so that the backoff continues
+ self.reactor.advance(destination_max_retry_interval_ms / 1000 + 1)
+ limiter = self.get_success(get_retry_limiter("test_dest", self.clock, store))
+ self.pump(1)
+ try:
+ with limiter:
+ self.pump(1)
+ raise AssertionError("argh")
+ except AssertionError:
+ pass
+
+ self.pump()
+
+ # retry_interval does not increase and stays at destination_max_retry_interval_ms
+ new_timings = self.get_success(store.get_destination_retry_timings("test_dest"))
+ assert new_timings is not None
+ self.assertEqual(new_timings.retry_interval, destination_max_retry_interval_ms)
+
+ # Check it fails
+ self.get_failure(
+ get_retry_limiter("test_dest", self.clock, store), NotRetryingDestination
+ )
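The test above simulates having already reached the cap by writing `destination_max_retry_interval_ms` into the store directly. As a rough illustration of how a destination gets into that state, a capped exponential backoff looks like the following sketch; the minimum interval, multiplier, and cap used here are example values, not Synapse's configured defaults.

# Illustrative capped exponential backoff -- constants are examples only.
MIN_RETRY_INTERVAL_MS = 10 * 60 * 1000        # e.g. 10 minutes
RETRY_MULTIPLIER = 2                          # growth factor per failure
MAX_RETRY_INTERVAL_MS = 24 * 60 * 60 * 1000   # e.g. 1 day

def next_retry_interval(current_ms: int) -> int:
    """Grow the interval after another failed attempt, capped at the maximum."""
    if current_ms == 0:
        return MIN_RETRY_INTERVAL_MS
    return min(current_ms * RETRY_MULTIPLIER, MAX_RETRY_INTERVAL_MS)

# After enough consecutive failures the interval sticks at the cap, which is
# exactly the situation where the "<=" comparison above is needed.
interval = 0
for _ in range(20):
    interval = next_retry_interval(interval)
assert interval == MAX_RETRY_INTERVAL_MS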