From f0a860908ba0309c89c9dba452d99b4f9c6928f7 Mon Sep 17 00:00:00 2001 From: Mathieu Velten Date: Thu, 3 Aug 2023 20:36:55 +0200 Subject: Allow config of the backoff algorithm for the federation client. (#15754) Adds three new configuration variables: * destination_min_retry_interval is identical to before (10mn). * destination_retry_multiplier is now 2 instead of 5, the maximum value will be reached slower. * destination_max_retry_interval is one day instead of (essentially) infinity. Capping this will cause destinations to continue to be retried sometimes instead of being lost forever. The previous value was 2 ^ 62 milliseconds. --- changelog.d/15754.misc | 1 + docs/usage/configuration/config_documentation.md | 11 +++++++++ synapse/config/federation.py | 18 +++++++++++++++ synapse/util/retryutils.py | 29 +++++++++++++----------- tests/storage/test_transactions.py | 9 ++++++-- tests/util/test_retryutils.py | 22 +++++++++--------- 6 files changed, 64 insertions(+), 26 deletions(-) create mode 100644 changelog.d/15754.misc diff --git a/changelog.d/15754.misc b/changelog.d/15754.misc new file mode 100644 index 0000000000..4314d415a3 --- /dev/null +++ b/changelog.d/15754.misc @@ -0,0 +1 @@ +Allow for the configuration of the backoff algorithm for federation destinations. diff --git a/docs/usage/configuration/config_documentation.md b/docs/usage/configuration/config_documentation.md index 4e6fcd085a..c32608da2b 100644 --- a/docs/usage/configuration/config_documentation.md +++ b/docs/usage/configuration/config_documentation.md @@ -1242,6 +1242,14 @@ like sending a federation transaction. * `max_short_retries`: maximum number of retries for the short retry algo. Default to 3 attempts. * `max_long_retries`: maximum number of retries for the long retry algo. Default to 10 attempts. +The following options control the retry logic when communicating with a specific homeserver destination. +Unlike the previous configuration options, these values apply across all requests +for a given destination and the state of the backoff is stored in the database. + +* `destination_min_retry_interval`: the initial backoff, after the first request fails. Defaults to 10m. +* `destination_retry_multiplier`: how much we multiply the backoff by after each subsequent fail. Defaults to 2. +* `destination_max_retry_interval`: a cap on the backoff. Defaults to a week. + Example configuration: ```yaml federation: @@ -1250,6 +1258,9 @@ federation: max_long_retry_delay: 100s max_short_retries: 5 max_long_retries: 20 + destination_min_retry_interval: 30s + destination_retry_multiplier: 5 + destination_max_retry_interval: 12h ``` --- ## Caching diff --git a/synapse/config/federation.py b/synapse/config/federation.py index 0e1cb8b6e3..97636039b8 100644 --- a/synapse/config/federation.py +++ b/synapse/config/federation.py @@ -65,5 +65,23 @@ class FederationConfig(Config): self.max_long_retries = federation_config.get("max_long_retries", 10) self.max_short_retries = federation_config.get("max_short_retries", 3) + # Allow for the configuration of the backoff algorithm used + # when trying to reach an unavailable destination. + # Unlike previous configuration those values applies across + # multiple requests and the state of the backoff is stored on DB. + self.destination_min_retry_interval_ms = Config.parse_duration( + federation_config.get("destination_min_retry_interval", "10m") + ) + self.destination_retry_multiplier = federation_config.get( + "destination_retry_multiplier", 2 + ) + self.destination_max_retry_interval_ms = min( + Config.parse_duration( + federation_config.get("destination_max_retry_interval", "7d") + ), + # Set a hard-limit to not overflow the database column. + 2**62, + ) + _METRICS_FOR_DOMAINS_SCHEMA = {"type": "array", "items": {"type": "string"}} diff --git a/synapse/util/retryutils.py b/synapse/util/retryutils.py index dcc037b982..27e9fc976c 100644 --- a/synapse/util/retryutils.py +++ b/synapse/util/retryutils.py @@ -27,15 +27,6 @@ if TYPE_CHECKING: logger = logging.getLogger(__name__) -# the initial backoff, after the first transaction fails -MIN_RETRY_INTERVAL = 10 * 60 * 1000 - -# how much we multiply the backoff by after each subsequent fail -RETRY_MULTIPLIER = 5 - -# a cap on the backoff. (Essentially none) -MAX_RETRY_INTERVAL = 2**62 - class NotRetryingDestination(Exception): def __init__(self, retry_last_ts: int, retry_interval: int, destination: str): @@ -169,6 +160,16 @@ class RetryDestinationLimiter: self.notifier = notifier self.replication_client = replication_client + self.destination_min_retry_interval_ms = ( + self.store.hs.config.federation.destination_min_retry_interval_ms + ) + self.destination_retry_multiplier = ( + self.store.hs.config.federation.destination_retry_multiplier + ) + self.destination_max_retry_interval_ms = ( + self.store.hs.config.federation.destination_max_retry_interval_ms + ) + def __enter__(self) -> None: pass @@ -220,13 +221,15 @@ class RetryDestinationLimiter: # We couldn't connect. if self.retry_interval: self.retry_interval = int( - self.retry_interval * RETRY_MULTIPLIER * random.uniform(0.8, 1.4) + self.retry_interval + * self.destination_retry_multiplier + * random.uniform(0.8, 1.4) ) - if self.retry_interval >= MAX_RETRY_INTERVAL: - self.retry_interval = MAX_RETRY_INTERVAL + if self.retry_interval >= self.destination_max_retry_interval_ms: + self.retry_interval = self.destination_max_retry_interval_ms else: - self.retry_interval = MIN_RETRY_INTERVAL + self.retry_interval = self.destination_min_retry_interval_ms logger.info( "Connection to %s was unsuccessful (%s(%s)); backoff now %i", diff --git a/tests/storage/test_transactions.py b/tests/storage/test_transactions.py index 2fab84a529..ef06b50dbb 100644 --- a/tests/storage/test_transactions.py +++ b/tests/storage/test_transactions.py @@ -17,7 +17,6 @@ from twisted.test.proto_helpers import MemoryReactor from synapse.server import HomeServer from synapse.storage.databases.main.transactions import DestinationRetryTimings from synapse.util import Clock -from synapse.util.retryutils import MAX_RETRY_INTERVAL from tests.unittest import HomeserverTestCase @@ -57,8 +56,14 @@ class TransactionStoreTestCase(HomeserverTestCase): self.get_success(d) def test_large_destination_retry(self) -> None: + max_retry_interval_ms = ( + self.hs.config.federation.destination_max_retry_interval_ms + ) d = self.store.set_destination_retry_timings( - "example.com", MAX_RETRY_INTERVAL, MAX_RETRY_INTERVAL, MAX_RETRY_INTERVAL + "example.com", + max_retry_interval_ms, + max_retry_interval_ms, + max_retry_interval_ms, ) self.get_success(d) diff --git a/tests/util/test_retryutils.py b/tests/util/test_retryutils.py index 5f8f4e76b5..1277e1a865 100644 --- a/tests/util/test_retryutils.py +++ b/tests/util/test_retryutils.py @@ -11,12 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -from synapse.util.retryutils import ( - MIN_RETRY_INTERVAL, - RETRY_MULTIPLIER, - NotRetryingDestination, - get_retry_limiter, -) +from synapse.util.retryutils import NotRetryingDestination, get_retry_limiter from tests.unittest import HomeserverTestCase @@ -42,6 +37,11 @@ class RetryLimiterTestCase(HomeserverTestCase): limiter = self.get_success(get_retry_limiter("test_dest", self.clock, store)) + min_retry_interval_ms = ( + self.hs.config.federation.destination_min_retry_interval_ms + ) + retry_multiplier = self.hs.config.federation.destination_retry_multiplier + self.pump(1) try: with limiter: @@ -57,7 +57,7 @@ class RetryLimiterTestCase(HomeserverTestCase): assert new_timings is not None self.assertEqual(new_timings.failure_ts, failure_ts) self.assertEqual(new_timings.retry_last_ts, failure_ts) - self.assertEqual(new_timings.retry_interval, MIN_RETRY_INTERVAL) + self.assertEqual(new_timings.retry_interval, min_retry_interval_ms) # now if we try again we should get a failure self.get_failure( @@ -68,7 +68,7 @@ class RetryLimiterTestCase(HomeserverTestCase): # advance the clock and try again # - self.pump(MIN_RETRY_INTERVAL) + self.pump(min_retry_interval_ms) limiter = self.get_success(get_retry_limiter("test_dest", self.clock, store)) self.pump(1) @@ -87,16 +87,16 @@ class RetryLimiterTestCase(HomeserverTestCase): self.assertEqual(new_timings.failure_ts, failure_ts) self.assertEqual(new_timings.retry_last_ts, retry_ts) self.assertGreaterEqual( - new_timings.retry_interval, MIN_RETRY_INTERVAL * RETRY_MULTIPLIER * 0.5 + new_timings.retry_interval, min_retry_interval_ms * retry_multiplier * 0.5 ) self.assertLessEqual( - new_timings.retry_interval, MIN_RETRY_INTERVAL * RETRY_MULTIPLIER * 2.0 + new_timings.retry_interval, min_retry_interval_ms * retry_multiplier * 2.0 ) # # one more go, with success # - self.reactor.advance(MIN_RETRY_INTERVAL * RETRY_MULTIPLIER * 2.0) + self.reactor.advance(min_retry_interval_ms * retry_multiplier * 2.0) limiter = self.get_success(get_retry_limiter("test_dest", self.clock, store)) self.pump(1) -- cgit 1.4.1