summary refs log tree commit diff
diff options
context:
space:
mode:
-rw-r--r-- changelog.d/3875.bugfix 1
-rw-r--r-- synapse/http/matrixfederationclient.py 11
-rw-r--r-- synapse/util/async_helpers.py 51
3 files changed, 63 insertions, 0 deletions
diff --git a/changelog.d/3875.bugfix b/changelog.d/3875.bugfix
new file mode 100644
index 0000000000..2d2147dd4b
--- /dev/null
+++ b/changelog.d/3875.bugfix
@@ -0,0 +1 @@
+Mitigate outbound federation randomly becoming wedged
diff --git a/synapse/http/matrixfederationclient.py b/synapse/http/matrixfederationclient.py
index da16b5dd8c..13b19f7626 100644
--- a/synapse/http/matrixfederationclient.py
+++ b/synapse/http/matrixfederationclient.py
@@ -42,6 +42,7 @@ from synapse.api.errors import (
 )
 from synapse.http.endpoint import matrix_federation_endpoint
 from synapse.util import logcontext
+from synapse.util.async_helpers import timeout_no_seriously
 from synapse.util.logcontext import make_deferred_yieldable
 from synapse.util.metrics import Measure
 
@@ -228,6 +229,16 @@ class MatrixFederationHttpClient(object):
                     )
                     request_deferred.addTimeout(_sec_timeout, self.hs.get_reactor())
 
+                    # Sometimes the timeout above doesn't work, so let's hack yet
+                    # another layer of timeouts in, in the vain hope that at some
+                    # point the world made sense and this really really really
+                    # should work.
+                    request_deferred = timeout_no_seriously(
+                        request_deferred,
+                        timeout=_sec_timeout * 2,
+                        reactor=self.hs.get_reactor(),
+                    )
+
                     with Measure(self.clock, "outbound_request"):
                         response = yield make_deferred_yieldable(
                             request_deferred,
diff --git a/synapse/util/async_helpers.py b/synapse/util/async_helpers.py
index 9b3f2f4b96..083e4f4128 100644
--- a/synapse/util/async_helpers.py
+++ b/synapse/util/async_helpers.py
@@ -438,3 +438,54 @@ def _cancelled_to_timed_out_error(value, timeout):
         value.trap(CancelledError)
         raise DeferredTimeoutError(timeout, "Deferred")
     return value
+
+
+def timeout_no_seriously(deferred, timeout, reactor):
+    """The built-in Twisted deferred addTimeout (and the method above)
+    completely fail to time things out under some unknown circumstances.
+
+    Let's try a different way of timing things out and maybe that will make
+    things work?!
+
+    TODO: Kill this with fire.
+    """
+
+    new_d = defer.Deferred()
+
+    timed_out = [False]
+
+    def time_it_out():
+        timed_out[0] = True
+        deferred.cancel()
+
+        if not new_d.called:
+            new_d.errback(DeferredTimeoutError(timeout, "Deferred"))
+
+    delayed_call = reactor.callLater(timeout, time_it_out)
+
+    def convert_cancelled(value):
+        if timed_out[0]:
+            return _cancelled_to_timed_out_error(value, timeout)
+        return value
+
+    deferred.addBoth(convert_cancelled)
+
+    def cancel_timeout(result):
+        # stop the pending call to cancel the deferred if it's been fired
+        if delayed_call.active():
+            delayed_call.cancel()
+        return result
+
+    deferred.addBoth(cancel_timeout)
+
+    def success_cb(val):
+        if not new_d.called:
+            new_d.callback(val)
+
+    def failure_cb(val):
+        if not new_d.called:
+            new_d.errback(val)
+
+    deferred.addCallbacks(success_cb, failure_cb)
+
+    return new_d