diff options
Diffstat (limited to 'synapse/http/federation/well_known_resolver.py')
-rw-r--r-- | synapse/http/federation/well_known_resolver.py | 122 |
1 files changed, 100 insertions, 22 deletions
diff --git a/synapse/http/federation/well_known_resolver.py b/synapse/http/federation/well_known_resolver.py index bb250c6922..d59864e298 100644 --- a/synapse/http/federation/well_known_resolver.py +++ b/synapse/http/federation/well_known_resolver.py @@ -38,6 +38,13 @@ WELL_KNOWN_DEFAULT_CACHE_PERIOD_JITTER = 10 * 60 # period to cache failure to fetch .well-known for WELL_KNOWN_INVALID_CACHE_PERIOD = 1 * 3600 +# period to cache failure to fetch .well-known if there has recently been a +# valid well-known for that domain. +WELL_KNOWN_DOWN_CACHE_PERIOD = 2 * 60 + +# period to remember there was a valid well-known after valid record expires +WELL_KNOWN_REMEMBER_DOMAIN_HAD_VALID = 2 * 3600 + # cap for .well-known cache period WELL_KNOWN_MAX_CACHE_PERIOD = 48 * 3600 @@ -49,11 +56,16 @@ WELL_KNOWN_MIN_CACHE_PERIOD = 5 * 60 # we'll start trying to refetch 1 minute before it expires. WELL_KNOWN_GRACE_PERIOD_FACTOR = 0.2 +# Number of times we retry fetching a well-known for a domain we know recently +# had a valid entry. +WELL_KNOWN_RETRY_ATTEMPTS = 3 + logger = logging.getLogger(__name__) _well_known_cache = TTLCache("well-known") +_had_valid_well_known_cache = TTLCache("had-valid-well-known") @attr.s(slots=True, frozen=True) @@ -65,14 +77,20 @@ class WellKnownResolver(object): """Handles well-known lookups for matrix servers. """ - def __init__(self, reactor, agent, well_known_cache=None): + def __init__( + self, reactor, agent, well_known_cache=None, had_well_known_cache=None + ): self._reactor = reactor self._clock = Clock(reactor) if well_known_cache is None: well_known_cache = _well_known_cache + if had_well_known_cache is None: + had_well_known_cache = _had_valid_well_known_cache + self._well_known_cache = well_known_cache + self._had_valid_well_known_cache = had_well_known_cache self._well_known_agent = RedirectAgent(agent) @defer.inlineCallbacks @@ -100,7 +118,7 @@ class WellKnownResolver(object): # requests for the same server in parallel? try: with Measure(self._clock, "get_well_known"): - result, cache_period = yield self._do_get_well_known(server_name) + result, cache_period = yield self._fetch_well_known(server_name) except _FetchWellKnownFailure as e: if prev_result and e.temporary: @@ -111,10 +129,20 @@ class WellKnownResolver(object): result = None - # add some randomness to the TTL to avoid a stampeding herd every hour - # after startup - cache_period = WELL_KNOWN_INVALID_CACHE_PERIOD - cache_period += random.uniform(0, WELL_KNOWN_DEFAULT_CACHE_PERIOD_JITTER) + if self._had_valid_well_known_cache.get(server_name, False): + # We have recently seen a valid well-known record for this + # server, so we cache the lack of well-known for a shorter time. + cache_period = WELL_KNOWN_DOWN_CACHE_PERIOD + cache_period += random.uniform( + 0, WELL_KNOWN_DEFAULT_CACHE_PERIOD_JITTER + ) + else: + # add some randomness to the TTL to avoid a stampeding herd every hour + # after startup + cache_period = WELL_KNOWN_INVALID_CACHE_PERIOD + cache_period += random.uniform( + 0, WELL_KNOWN_DEFAULT_CACHE_PERIOD_JITTER + ) if cache_period > 0: self._well_known_cache.set(server_name, result, cache_period) @@ -122,7 +150,7 @@ class WellKnownResolver(object): return WellKnownLookupResult(delegated_server=result) @defer.inlineCallbacks - def _do_get_well_known(self, server_name): + def _fetch_well_known(self, server_name): """Actually fetch and parse a .well-known, without checking the cache Args: @@ -134,24 +162,17 @@ class WellKnownResolver(object): Returns: Deferred[Tuple[bytes,int]]: The lookup result and cache period. """ - uri = b"https://%s/.well-known/matrix/server" % (server_name,) - uri_str = uri.decode("ascii") - logger.info("Fetching %s", uri_str) + + had_valid_well_known = bool( + self._had_valid_well_known_cache.get(server_name, False) + ) # We do this in two steps to differentiate between possibly transient # errors (e.g. can't connect to host, 503 response) and more permenant # errors (such as getting a 404 response). - try: - response = yield make_deferred_yieldable( - self._well_known_agent.request(b"GET", uri) - ) - body = yield make_deferred_yieldable(readBody(response)) - - if 500 <= response.code < 600: - raise Exception("Non-200 response %s" % (response.code,)) - except Exception as e: - logger.info("Error fetching %s: %s", uri_str, e) - raise _FetchWellKnownFailure(temporary=True) + response, body = yield self._make_well_known_request( + server_name, retry=had_valid_well_known + ) try: if response.code != 200: @@ -161,8 +182,11 @@ class WellKnownResolver(object): logger.info("Response from .well-known: %s", parsed_body) result = parsed_body["m.server"].encode("ascii") + except defer.CancelledError: + # Bail if we've been cancelled + raise except Exception as e: - logger.info("Error fetching %s: %s", uri_str, e) + logger.info("Error parsing well-known for %s: %s", server_name, e) raise _FetchWellKnownFailure(temporary=False) cache_period = _cache_period_from_headers( @@ -177,8 +201,62 @@ class WellKnownResolver(object): cache_period = min(cache_period, WELL_KNOWN_MAX_CACHE_PERIOD) cache_period = max(cache_period, WELL_KNOWN_MIN_CACHE_PERIOD) + # We got a success, mark as such in the cache + self._had_valid_well_known_cache.set( + server_name, + bool(result), + cache_period + WELL_KNOWN_REMEMBER_DOMAIN_HAD_VALID, + ) + return (result, cache_period) + @defer.inlineCallbacks + def _make_well_known_request(self, server_name, retry): + """Make the well known request. + + This will retry the request if requested and it fails (with unable + to connect or receives a 5xx error). + + Args: + server_name (bytes) + retry (bool): Whether to retry the request if it fails. + + Returns: + Deferred[tuple[IResponse, bytes]] Returns the response object and + body. Response may be a non-200 response. + """ + uri = b"https://%s/.well-known/matrix/server" % (server_name,) + uri_str = uri.decode("ascii") + + i = 0 + while True: + i += 1 + + logger.info("Fetching %s", uri_str) + try: + response = yield make_deferred_yieldable( + self._well_known_agent.request(b"GET", uri) + ) + body = yield make_deferred_yieldable(readBody(response)) + + if 500 <= response.code < 600: + raise Exception("Non-200 response %s" % (response.code,)) + + return response, body + except defer.CancelledError: + # Bail if we've been cancelled + raise + except Exception as e: + logger.info("Retry: %s", retry) + if not retry or i >= WELL_KNOWN_RETRY_ATTEMPTS: + logger.info("Error fetching %s: %s", uri_str, e) + raise _FetchWellKnownFailure(temporary=True) + + logger.info("Error fetching %s: %s. Retrying", uri_str, e) + + # Sleep briefly in the hopes that they come back up + yield self._clock.sleep(0.5) + def _cache_period_from_headers(headers, time_now=time.time): cache_controls = _parse_cache_control(headers) |