summary refs log tree commit diff
diff options
context:
space:
mode:
authorRichard van der Hoff <richard@matrix.org>2019-02-01 12:30:22 +0000
committerRichard van der Hoff <richard@matrix.org>2019-02-01 12:30:22 +0000
commit9763a73af0b192ebd043dc29103921c4daef3b26 (patch)
treef89f74c3f1122aee8592f01af60d88758590ef57
parentMerge pull request #4530 from matrix-org/anoa/room_upgrade_federatable (diff)
parentMerge pull request #4542 from matrix-org/rav/cache_for_bad_well_known (diff)
downloadsynapse-9763a73af0b192ebd043dc29103921c4daef3b26.tar.xz
Merge branch 'release-v0.99.0' into develop
-rw-r--r--changelog.d/4539.misc1
-rw-r--r--changelog.d/4542.misc1
-rw-r--r--synapse/http/federation/matrix_federation_agent.py85
-rw-r--r--tests/http/federation/test_matrix_federation_agent.py51
4 files changed, 86 insertions, 52 deletions
diff --git a/changelog.d/4539.misc b/changelog.d/4539.misc
new file mode 100644
index 0000000000..b222c8d23f
--- /dev/null
+++ b/changelog.d/4539.misc
@@ -0,0 +1 @@
+Update federation routing logic to check .well-known before SRV
diff --git a/changelog.d/4542.misc b/changelog.d/4542.misc
new file mode 100644
index 0000000000..74c84e0209
--- /dev/null
+++ b/changelog.d/4542.misc
@@ -0,0 +1 @@
+Improve performance of handling servers with invalid .well-known
diff --git a/synapse/http/federation/matrix_federation_agent.py b/synapse/http/federation/matrix_federation_agent.py
index 26649e70be..50baa8bf0a 100644
--- a/synapse/http/federation/matrix_federation_agent.py
+++ b/synapse/http/federation/matrix_federation_agent.py
@@ -23,14 +23,17 @@ from zope.interface import implementer
 
 from twisted.internet import defer
 from twisted.internet.endpoints import HostnameEndpoint, wrapClientTLS
+from twisted.internet.interfaces import IStreamClientEndpoint
 from twisted.web.client import URI, Agent, HTTPConnectionPool, RedirectAgent, readBody
 from twisted.web.http import stringToDatetime
 from twisted.web.http_headers import Headers
 from twisted.web.iweb import IAgent
 
 from synapse.http.federation.srv_resolver import SrvResolver, pick_server_from_list
+from synapse.util import Clock
 from synapse.util.caches.ttlcache import TTLCache
 from synapse.util.logcontext import make_deferred_yieldable
+from synapse.util.metrics import Measure
 
 # period to cache .well-known results for by default
 WELL_KNOWN_DEFAULT_CACHE_PERIOD = 24 * 3600
@@ -44,6 +47,8 @@ WELL_KNOWN_INVALID_CACHE_PERIOD = 1 * 3600
 # cap for .well-known cache period
 WELL_KNOWN_MAX_CACHE_PERIOD = 48 * 3600
 
+# magic value to mark an invalid well-known
+INVALID_WELL_KNOWN = object()
 
 logger = logging.getLogger(__name__)
 well_known_cache = TTLCache('well-known')
@@ -78,6 +83,8 @@ class MatrixFederationAgent(object):
         _well_known_cache=well_known_cache,
     ):
         self._reactor = reactor
+        self._clock = Clock(reactor)
+
         self._tls_client_options_factory = tls_client_options_factory
         if _srv_resolver is None:
             _srv_resolver = SrvResolver()
@@ -98,6 +105,11 @@ class MatrixFederationAgent(object):
         )
         self._well_known_agent = _well_known_agent
 
+        # our cache of .well-known lookup results, mapping from server name
+        # to delegated name. The values can be:
+        #   `bytes`:     a valid server-name
+        #   `None`:      there is no .well-known here
+        #   INVALID_WELL_KNWOWN: the .well-known here is invalid
         self._well_known_cache = _well_known_cache
 
     @defer.inlineCallbacks
@@ -152,12 +164,9 @@ class MatrixFederationAgent(object):
         class EndpointFactory(object):
             @staticmethod
             def endpointForURI(_uri):
-                logger.info(
-                    "Connecting to %s:%i",
-                    res.target_host.decode("ascii"),
-                    res.target_port,
+                ep = LoggingHostnameEndpoint(
+                    self._reactor, res.target_host, res.target_port,
                 )
-                ep = HostnameEndpoint(self._reactor, res.target_host, res.target_port)
                 if tls_options is not None:
                     ep = wrapClientTLS(tls_options, ep)
                 return ep
@@ -210,11 +219,7 @@ class MatrixFederationAgent(object):
                 target_port=parsed_uri.port,
             ))
 
-        # try a SRV lookup
-        service_name = b"_matrix._tcp.%s" % (parsed_uri.host,)
-        server_list = yield self._srv_resolver.resolve_service(service_name)
-
-        if not server_list and lookup_well_known:
+        if lookup_well_known:
             # try a .well-known lookup
             well_known_server = yield self._get_well_known(parsed_uri.host)
 
@@ -250,6 +255,10 @@ class MatrixFederationAgent(object):
                 res = yield self._route_matrix_uri(new_uri, lookup_well_known=False)
                 defer.returnValue(res)
 
+        # try a SRV lookup
+        service_name = b"_matrix._tcp.%s" % (parsed_uri.host,)
+        server_list = yield self._srv_resolver.resolve_service(service_name)
+
         if not server_list:
             target_host = parsed_uri.host
             port = 8448
@@ -283,14 +292,35 @@ class MatrixFederationAgent(object):
                 None if there was no .well-known file.
         """
         try:
-            cached = self._well_known_cache[server_name]
-            defer.returnValue(cached)
+            result = self._well_known_cache[server_name]
         except KeyError:
-            pass
+            # TODO: should we linearise so that we don't end up doing two .well-known
+            # requests for the same server in parallel?
+            with Measure(self._clock, "get_well_known"):
+                result, cache_period = yield self._do_get_well_known(server_name)
+
+            if cache_period > 0:
+                self._well_known_cache.set(server_name, result, cache_period)
+
+        if result == INVALID_WELL_KNOWN:
+            raise Exception("invalid .well-known on this server")
+
+        defer.returnValue(result)
 
-        # TODO: should we linearise so that we don't end up doing two .well-known requests
-        # for the same server in parallel?
+    @defer.inlineCallbacks
+    def _do_get_well_known(self, server_name):
+        """Actually fetch and parse a .well-known, without checking the cache
 
+        Args:
+            server_name (bytes): name of the server, from the requested url
+
+        Returns:
+            Deferred[Tuple[bytes|None|object],int]:
+                result, cache period, where result is one of:
+                 - the new server name from the .well-known (as a `bytes`)
+                 - None if there was no .well-known file.
+                 - INVALID_WELL_KNOWN if the .well-known was invalid
+        """
         uri = b"https://%s/.well-known/matrix/server" % (server_name, )
         uri_str = uri.decode("ascii")
         logger.info("Fetching %s", uri_str)
@@ -308,9 +338,7 @@ class MatrixFederationAgent(object):
             # after startup
             cache_period = WELL_KNOWN_INVALID_CACHE_PERIOD
             cache_period += random.uniform(0, WELL_KNOWN_DEFAULT_CACHE_PERIOD_JITTER)
-
-            self._well_known_cache.set(server_name, None, cache_period)
-            defer.returnValue(None)
+            defer.returnValue((None, cache_period))
 
         try:
             parsed_body = json.loads(body.decode('utf-8'))
@@ -320,7 +348,10 @@ class MatrixFederationAgent(object):
             if "m.server" not in parsed_body:
                 raise Exception("Missing key 'm.server'")
         except Exception as e:
-            raise Exception("invalid .well-known response from %s: %s" % (uri_str, e,))
+            logger.info("invalid .well-known response from %s: %s", uri_str, e)
+            cache_period = WELL_KNOWN_INVALID_CACHE_PERIOD
+            cache_period += random.uniform(0, WELL_KNOWN_DEFAULT_CACHE_PERIOD_JITTER)
+            defer.returnValue((INVALID_WELL_KNOWN, cache_period))
 
         result = parsed_body["m.server"].encode("ascii")
 
@@ -336,10 +367,20 @@ class MatrixFederationAgent(object):
         else:
             cache_period = min(cache_period, WELL_KNOWN_MAX_CACHE_PERIOD)
 
-        if cache_period > 0:
-            self._well_known_cache.set(server_name, result, cache_period)
+        defer.returnValue((result, cache_period))
 
-        defer.returnValue(result)
+
+@implementer(IStreamClientEndpoint)
+class LoggingHostnameEndpoint(object):
+    """A wrapper for HostnameEndpint which logs when it connects"""
+    def __init__(self, reactor, host, port, *args, **kwargs):
+        self.host = host
+        self.port = port
+        self.ep = HostnameEndpoint(reactor, host, port, *args, **kwargs)
+
+    def connect(self, protocol_factory):
+        logger.info("Connecting to %s:%i", self.host, self.port)
+        return self.ep.connect(protocol_factory)
 
 
 def _cache_period_from_headers(headers, time_now=time.time):
diff --git a/tests/http/federation/test_matrix_federation_agent.py b/tests/http/federation/test_matrix_federation_agent.py
index 7b2800021f..369e63ecc6 100644
--- a/tests/http/federation/test_matrix_federation_agent.py
+++ b/tests/http/federation/test_matrix_federation_agent.py
@@ -358,9 +358,8 @@ class MatrixFederationAgentTests(TestCase):
         # Nothing happened yet
         self.assertNoResult(test_d)
 
-        self.mock_resolver.resolve_service.assert_called_once_with(
-            b"_matrix._tcp.testserv",
-        )
+        # No SRV record lookup yet
+        self.mock_resolver.resolve_service.assert_not_called()
 
         # there should be an attempt to connect on port 443 for the .well-known
         clients = self.reactor.tcpClients
@@ -376,6 +375,11 @@ class MatrixFederationAgentTests(TestCase):
         # .well-known request fails.
         self.reactor.pump((0.4,))
 
+        # now there should be a SRV lookup
+        self.mock_resolver.resolve_service.assert_called_once_with(
+            b"_matrix._tcp.testserv",
+        )
+
         # we should fall back to a direct connection
         self.assertEqual(len(clients), 2)
         (host, port, client_factory, _timeout, _bindAddress) = clients[1]
@@ -403,8 +407,7 @@ class MatrixFederationAgentTests(TestCase):
         self.successResultOf(test_d)
 
     def test_get_well_known(self):
-        """Test the behaviour when the server name has no port and no SRV record, but
-        the .well-known redirects elsewhere
+        """Test the behaviour when the .well-known redirects elsewhere
         """
 
         self.mock_resolver.resolve_service.side_effect = lambda _: []
@@ -416,11 +419,6 @@ class MatrixFederationAgentTests(TestCase):
         # Nothing happened yet
         self.assertNoResult(test_d)
 
-        self.mock_resolver.resolve_service.assert_called_once_with(
-            b"_matrix._tcp.testserv",
-        )
-        self.mock_resolver.resolve_service.reset_mock()
-
         # there should be an attempt to connect on port 443 for the .well-known
         clients = self.reactor.tcpClients
         self.assertEqual(len(clients), 1)
@@ -432,7 +430,7 @@ class MatrixFederationAgentTests(TestCase):
             client_factory, expected_sni=b"testserv", target_server=b"target-server",
         )
 
-        # there should be another SRV lookup
+        # there should be a SRV lookup
         self.mock_resolver.resolve_service.assert_called_once_with(
             b"_matrix._tcp.target-server",
         )
@@ -483,11 +481,6 @@ class MatrixFederationAgentTests(TestCase):
         # Nothing happened yet
         self.assertNoResult(test_d)
 
-        self.mock_resolver.resolve_service.assert_called_once_with(
-            b"_matrix._tcp.testserv",
-        )
-        self.mock_resolver.resolve_service.reset_mock()
-
         # there should be an attempt to connect on port 443 for the .well-known
         clients = self.reactor.tcpClients
         self.assertEqual(len(clients), 1)
@@ -529,7 +522,7 @@ class MatrixFederationAgentTests(TestCase):
 
         self.reactor.pump((0.1, ))
 
-        # there should be another SRV lookup
+        # there should be a SRV lookup
         self.mock_resolver.resolve_service.assert_called_once_with(
             b"_matrix._tcp.target-server",
         )
@@ -581,6 +574,7 @@ class MatrixFederationAgentTests(TestCase):
         # Nothing happened yet
         self.assertNoResult(test_d)
 
+        # the request for a .well-known will have failed with a DNS lookup error.
         self.mock_resolver.resolve_service.assert_called_once_with(
             b"_matrix._tcp.testserv",
         )
@@ -613,11 +607,9 @@ class MatrixFederationAgentTests(TestCase):
         self.successResultOf(test_d)
 
     def test_get_well_known_srv(self):
-        """Test the behaviour when the server name has no port and no SRV record, but
-        the .well-known redirects to a place where there is a SRV.
+        """Test the behaviour when the .well-known redirects to a place where there
+        is a SRV.
         """
-
-        self.mock_resolver.resolve_service.side_effect = lambda _: []
         self.reactor.lookups["testserv"] = "1.2.3.4"
         self.reactor.lookups["srvtarget"] = "5.6.7.8"
 
@@ -626,11 +618,6 @@ class MatrixFederationAgentTests(TestCase):
         # Nothing happened yet
         self.assertNoResult(test_d)
 
-        self.mock_resolver.resolve_service.assert_called_once_with(
-            b"_matrix._tcp.testserv",
-        )
-        self.mock_resolver.resolve_service.reset_mock()
-
         # there should be an attempt to connect on port 443 for the .well-known
         clients = self.reactor.tcpClients
         self.assertEqual(len(clients), 1)
@@ -646,7 +633,7 @@ class MatrixFederationAgentTests(TestCase):
             client_factory, expected_sni=b"testserv", target_server=b"target-server",
         )
 
-        # there should be another SRV lookup
+        # there should be a SRV lookup
         self.mock_resolver.resolve_service.assert_called_once_with(
             b"_matrix._tcp.target-server",
         )
@@ -691,9 +678,8 @@ class MatrixFederationAgentTests(TestCase):
         # Nothing happened yet
         self.assertNoResult(test_d)
 
-        self.mock_resolver.resolve_service.assert_called_once_with(
-            b"_matrix._tcp.xn--bcher-kva.com",
-        )
+        # No SRV record lookup yet
+        self.mock_resolver.resolve_service.assert_not_called()
 
         # there should be an attempt to connect on port 443 for the .well-known
         clients = self.reactor.tcpClients
@@ -709,6 +695,11 @@ class MatrixFederationAgentTests(TestCase):
         # .well-known request fails.
         self.reactor.pump((0.4,))
 
+        # now there should have been a SRV lookup
+        self.mock_resolver.resolve_service.assert_called_once_with(
+            b"_matrix._tcp.xn--bcher-kva.com",
+        )
+
         # We should fall back to port 8448
         clients = self.reactor.tcpClients
         self.assertEqual(len(clients), 2)