summary refs log tree commit diff
diff options
context:
space:
mode:
author Erik Johnston <erik@matrix.org> 2015-11-05 16:15:50 +0000
committer Erik Johnston <erik@matrix.org> 2015-11-05 16:15:50 +0000
commit 5bc690408d8b6bf409e4fbb2b4b8df07562d1c10 (patch)
tree 671d8157a06a9507af3ce271eecef314c141a0ff
parent Merge branch 'develop' of github.com:matrix-org/synapse into develop (diff)
parent Retry dead servers a lot less often (diff)
download synapse-5bc690408d8b6bf409e4fbb2b4b8df07562d1c10.tar.xz
Merge pull request #340 from matrix-org/erikj/server_retries
Retry dead servers a lot less often
Diffstat (limited to '')
-rw-r--r-- synapse/http/matrixfederationclient.py 10
-rw-r--r-- synapse/util/retryutils.py 7
2 files changed, 13 insertions, 4 deletions
diff --git a/synapse/http/matrixfederationclient.py b/synapse/http/matrixfederationclient.py
index b50a0c445c..6e53538a52 100644
--- a/synapse/http/matrixfederationclient.py
+++ b/synapse/http/matrixfederationclient.py
@@ -35,6 +35,7 @@ from signedjson.sign import sign_json
 
 import simplejson as json
 import logging
+import random
 import sys
 import urllib
 import urlparse
@@ -55,6 +56,9 @@ incoming_responses_counter = metrics.register_counter(
 )
 
 
+MAX_RETRIES = 4
+
+
 class MatrixFederationEndpointFactory(object):
     def __init__(self, hs):
         self.tls_server_context_factory = hs.tls_server_context_factory
@@ -119,7 +123,7 @@ class MatrixFederationHttpClient(object):
 
         # XXX: Would be much nicer to retry only at the transaction-layer
         # (once we have reliable transactions in place)
-        retries_left = 5
+        retries_left = MAX_RETRIES
 
         http_url_bytes = urlparse.urlunparse(
             ("", "", path_bytes, param_bytes, query_bytes, "")
@@ -180,7 +184,9 @@ class MatrixFederationHttpClient(object):
                     )
 
                     if retries_left and not timeout:
-                        yield sleep(2 ** (5 - retries_left))
+                        delay = 5 ** (MAX_RETRIES + 1 - retries_left)
+                        delay *= random.uniform(0.8, 1.4)
+                        yield sleep(delay)
                         retries_left -= 1
                     else:
                         raise
diff --git a/synapse/util/retryutils.py b/synapse/util/retryutils.py
index a42138f556..2fe6814807 100644
--- a/synapse/util/retryutils.py
+++ b/synapse/util/retryutils.py
@@ -18,6 +18,7 @@ from twisted.internet import defer
 from synapse.api.errors import CodeMessageException
 
 import logging
+import random
 
 
 logger = logging.getLogger(__name__)
@@ -85,8 +86,9 @@ def get_retry_limiter(destination, clock, store, **kwargs):
 
 class RetryDestinationLimiter(object):
     def __init__(self, destination, clock, store, retry_interval,
-                 min_retry_interval=5000, max_retry_interval=60 * 60 * 1000,
-                 multiplier_retry_interval=2,):
+                 min_retry_interval=10 * 60 * 1000,
+                 max_retry_interval=24 * 60 * 60 * 1000,
+                 multiplier_retry_interval=5,):
         """Marks the destination as "down" if an exception is thrown in the
         context, except for CodeMessageException with code < 500.
 
@@ -140,6 +142,7 @@ class RetryDestinationLimiter(object):
             # We couldn't connect.
             if self.retry_interval:
                 self.retry_interval *= self.multiplier_retry_interval
+                self.retry_interval *= int(random.uniform(0.8, 1.4))
 
                 if self.retry_interval >= self.max_retry_interval:
                     self.retry_interval = self.max_retry_interval