Cache failures to parse .well-known

Also add a Measure block around the .well-known fetch
author: Richard van der Hoff <richard@matrix.org> 2019-02-01 00:36:24 +0000
committer: Richard van der Hoff <richard@matrix.org> 2019-02-01 00:37:52 +0000
commit: 3c8a41140e883d0be60d4ec469c753d81775d375 (patch)
tree: 9f37c34c0eccbc099647d5ccca6b97aaaca85d39 /synapse/http
parent: better logging for federation connections (diff)
download: synapse-3c8a41140e883d0be60d4ec469c753d81775d375.tar.xz
1 files changed, 43 insertions, 13 deletions
diff --git a/synapse/http/federation/matrix_federation_agent.py b/synapse/http/federation/matrix_federation_agent.py
index 3a0525e365..50baa8bf0a 100644
--- a/synapse/http/federation/matrix_federation_agent.py
+++ b/synapse/http/federation/matrix_federation_agent.py
@@ -30,8 +30,10 @@ from twisted.web.http_headers import Headers
 from twisted.web.iweb import IAgent
 
 from synapse.http.federation.srv_resolver import SrvResolver, pick_server_from_list
+from synapse.util import Clock
 from synapse.util.caches.ttlcache import TTLCache
 from synapse.util.logcontext import make_deferred_yieldable
+from synapse.util.metrics import Measure
 
 # period to cache .well-known results for by default
 WELL_KNOWN_DEFAULT_CACHE_PERIOD = 24 * 3600
@@ -45,6 +47,8 @@ WELL_KNOWN_INVALID_CACHE_PERIOD = 1 * 3600
 # cap for .well-known cache period
 WELL_KNOWN_MAX_CACHE_PERIOD = 48 * 3600
 
+# magic value to mark an invalid well-known
+INVALID_WELL_KNOWN = object()
 
 logger = logging.getLogger(__name__)
 well_known_cache = TTLCache('well-known')
@@ -79,6 +83,8 @@ class MatrixFederationAgent(object):
         _well_known_cache=well_known_cache,
     ):
         self._reactor = reactor
+        self._clock = Clock(reactor)
+
         self._tls_client_options_factory = tls_client_options_factory
         if _srv_resolver is None:
             _srv_resolver = SrvResolver()
@@ -99,6 +105,11 @@ class MatrixFederationAgent(object):
         )
         self._well_known_agent = _well_known_agent
 
+        # our cache of .well-known lookup results, mapping from server name
+        # to delegated name. The values can be:
+        #   `bytes`:     a valid server-name
+        #   `None`:      there is no .well-known here
+        #   INVALID_WELL_KNWOWN: the .well-known here is invalid
         self._well_known_cache = _well_known_cache
 
     @defer.inlineCallbacks
@@ -281,14 +292,35 @@ class MatrixFederationAgent(object):
                 None if there was no .well-known file.
         """
         try:
-            cached = self._well_known_cache[server_name]
-            defer.returnValue(cached)
+            result = self._well_known_cache[server_name]
         except KeyError:
-            pass
+            # TODO: should we linearise so that we don't end up doing two .well-known
+            # requests for the same server in parallel?
+            with Measure(self._clock, "get_well_known"):
+                result, cache_period = yield self._do_get_well_known(server_name)
+
+            if cache_period > 0:
+                self._well_known_cache.set(server_name, result, cache_period)
 
-        # TODO: should we linearise so that we don't end up doing two .well-known requests
-        # for the same server in parallel?
+        if result == INVALID_WELL_KNOWN:
+            raise Exception("invalid .well-known on this server")
+
+        defer.returnValue(result)
 
+    @defer.inlineCallbacks
+    def _do_get_well_known(self, server_name):
+        """Actually fetch and parse a .well-known, without checking the cache
+
+        Args:
+            server_name (bytes): name of the server, from the requested url
+
+        Returns:
+            Deferred[Tuple[bytes|None|object],int]:
+                result, cache period, where result is one of:
+                 - the new server name from the .well-known (as a `bytes`)
+                 - None if there was no .well-known file.
+                 - INVALID_WELL_KNOWN if the .well-known was invalid
+        """
         uri = b"https://%s/.well-known/matrix/server" % (server_name, )
         uri_str = uri.decode("ascii")
         logger.info("Fetching %s", uri_str)
@@ -306,9 +338,7 @@ class MatrixFederationAgent(object):
             # after startup
             cache_period = WELL_KNOWN_INVALID_CACHE_PERIOD
             cache_period += random.uniform(0, WELL_KNOWN_DEFAULT_CACHE_PERIOD_JITTER)
-
-            self._well_known_cache.set(server_name, None, cache_period)
-            defer.returnValue(None)
+            defer.returnValue((None, cache_period))
 
         try:
             parsed_body = json.loads(body.decode('utf-8'))
@@ -318,7 +348,10 @@ class MatrixFederationAgent(object):
             if "m.server" not in parsed_body:
                 raise Exception("Missing key 'm.server'")
         except Exception as e:
-            raise Exception("invalid .well-known response from %s: %s" % (uri_str, e,))
+            logger.info("invalid .well-known response from %s: %s", uri_str, e)
+            cache_period = WELL_KNOWN_INVALID_CACHE_PERIOD
+            cache_period += random.uniform(0, WELL_KNOWN_DEFAULT_CACHE_PERIOD_JITTER)
+            defer.returnValue((INVALID_WELL_KNOWN, cache_period))
 
         result = parsed_body["m.server"].encode("ascii")
 
@@ -334,10 +367,7 @@ class MatrixFederationAgent(object):
         else:
             cache_period = min(cache_period, WELL_KNOWN_MAX_CACHE_PERIOD)
 
-        if cache_period > 0:
-            self._well_known_cache.set(server_name, result, cache_period)
-
-        defer.returnValue(result)
+        defer.returnValue((result, cache_period))
 
 
 @implementer(IStreamClientEndpoint)
author	Richard van der Hoff <richard@matrix.org>	2019-02-01 00:36:24 +0000
committer	Richard van der Hoff <richard@matrix.org>	2019-02-01 00:37:52 +0000
commit	3c8a41140e883d0be60d4ec469c753d81775d375 (patch)
tree	9f37c34c0eccbc099647d5ccca6b97aaaca85d39 /synapse/http
parent	better logging for federation connections (diff)
download	synapse-3c8a41140e883d0be60d4ec469c753d81775d375.tar.xz