From 4bd597d9fcb8e6c6888ee3e8fa683ba812272997 Mon Sep 17 00:00:00 2001 From: Richard van der Hoff Date: Thu, 23 Mar 2017 00:12:21 +0000 Subject: push federation retry limiter down to matrixfederationclient rather than having to instrument everywhere we make a federation call, make the MatrixFederationHttpClient manage the retry limiter. --- synapse/federation/federation_client.py | 33 ++--- synapse/federation/transaction_queue.py | 216 ++++++++++++++------------------ synapse/federation/transport/client.py | 1 + 3 files changed, 109 insertions(+), 141 deletions(-) (limited to 'synapse/federation') diff --git a/synapse/federation/federation_client.py b/synapse/federation/federation_client.py index 5dcd4eecce..dc44727b36 100644 --- a/synapse/federation/federation_client.py +++ b/synapse/federation/federation_client.py @@ -29,7 +29,7 @@ from synapse.util.logcontext import preserve_fn, preserve_context_over_deferred from synapse.events import FrozenEvent, builder import synapse.metrics -from synapse.util.retryutils import get_retry_limiter, NotRetryingDestination +from synapse.util.retryutils import NotRetryingDestination import copy import itertools @@ -234,31 +234,24 @@ class FederationClient(FederationBase): continue try: - limiter = yield get_retry_limiter( - destination, - self._clock, - self.store, + transaction_data = yield self.transport_layer.get_event( + destination, event_id, timeout=timeout, ) - with limiter: - transaction_data = yield self.transport_layer.get_event( - destination, event_id, timeout=timeout, - ) - - logger.debug("transaction_data %r", transaction_data) + logger.debug("transaction_data %r", transaction_data) - pdu_list = [ - self.event_from_pdu_json(p, outlier=outlier) - for p in transaction_data["pdus"] - ] + pdu_list = [ + self.event_from_pdu_json(p, outlier=outlier) + for p in transaction_data["pdus"] + ] - if pdu_list and pdu_list[0]: - pdu = pdu_list[0] + if pdu_list and pdu_list[0]: + pdu = pdu_list[0] - # Check signatures are correct. - signed_pdu = yield self._check_sigs_and_hashes([pdu])[0] + # Check signatures are correct. + signed_pdu = yield self._check_sigs_and_hashes([pdu])[0] - break + break pdu_attempts[destination] = now diff --git a/synapse/federation/transaction_queue.py b/synapse/federation/transaction_queue.py index c802dd67a3..d7ecefcc64 100644 --- a/synapse/federation/transaction_queue.py +++ b/synapse/federation/transaction_queue.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - +import datetime from twisted.internet import defer @@ -22,9 +22,7 @@ from .units import Transaction, Edu from synapse.api.errors import HttpResponseException from synapse.util.async import run_on_reactor from synapse.util.logcontext import preserve_context_over_fn -from synapse.util.retryutils import ( - get_retry_limiter, NotRetryingDestination, -) +from synapse.util.retryutils import NotRetryingDestination from synapse.util.metrics import measure_func from synapse.types import get_domain_from_id from synapse.handlers.presence import format_user_presence_state @@ -312,13 +310,6 @@ class TransactionQueue(object): yield run_on_reactor() while True: - limiter = yield get_retry_limiter( - destination, - self.clock, - self.store, - backoff_on_404=True, # If we get a 404 the other side has gone - ) - device_message_edus, device_stream_id, dev_list_id = ( yield self._get_new_device_messages(destination) ) @@ -374,7 +365,6 @@ class TransactionQueue(object): success = yield self._send_new_transaction( destination, pending_pdus, pending_edus, pending_failures, - limiter=limiter, ) if success: # Remove the acknowledged device messages from the database @@ -392,12 +382,24 @@ class TransactionQueue(object): self.last_device_list_stream_id_by_dest[destination] = dev_list_id else: break - except NotRetryingDestination: + except NotRetryingDestination as e: logger.debug( - "TX [%s] not ready for retry yet - " + "TX [%s] not ready for retry yet (next retry at %s) - " "dropping transaction for now", destination, + datetime.datetime.fromtimestamp( + (e.retry_last_ts + e.retry_interval) / 1000.0 + ), + ) + except Exception as e: + logger.warn( + "TX [%s] Failed to send transaction: %s", + destination, + e, ) + for p in pending_pdus: + logger.info("Failed to send event %s to %s", p.event_id, + destination) finally: # We want to be *very* sure we delete this after we stop processing self.pending_transactions.pop(destination, None) @@ -437,7 +439,7 @@ class TransactionQueue(object): @measure_func("_send_new_transaction") @defer.inlineCallbacks def _send_new_transaction(self, destination, pending_pdus, pending_edus, - pending_failures, limiter): + pending_failures): # Sort based on the order field pending_pdus.sort(key=lambda t: t[1]) @@ -447,132 +449,104 @@ class TransactionQueue(object): success = True - try: - logger.debug("TX [%s] _attempt_new_transaction", destination) + logger.debug("TX [%s] _attempt_new_transaction", destination) - txn_id = str(self._next_txn_id) + txn_id = str(self._next_txn_id) - logger.debug( - "TX [%s] {%s} Attempting new transaction" - " (pdus: %d, edus: %d, failures: %d)", - destination, txn_id, - len(pdus), - len(edus), - len(failures) - ) + logger.debug( + "TX [%s] {%s} Attempting new transaction" + " (pdus: %d, edus: %d, failures: %d)", + destination, txn_id, + len(pdus), + len(edus), + len(failures) + ) - logger.debug("TX [%s] Persisting transaction...", destination) + logger.debug("TX [%s] Persisting transaction...", destination) - transaction = Transaction.create_new( - origin_server_ts=int(self.clock.time_msec()), - transaction_id=txn_id, - origin=self.server_name, - destination=destination, - pdus=pdus, - edus=edus, - pdu_failures=failures, - ) + transaction = Transaction.create_new( + origin_server_ts=int(self.clock.time_msec()), + transaction_id=txn_id, + origin=self.server_name, + destination=destination, + pdus=pdus, + edus=edus, + pdu_failures=failures, + ) - self._next_txn_id += 1 + self._next_txn_id += 1 - yield self.transaction_actions.prepare_to_send(transaction) + yield self.transaction_actions.prepare_to_send(transaction) - logger.debug("TX [%s] Persisted transaction", destination) - logger.info( - "TX [%s] {%s} Sending transaction [%s]," - " (PDUs: %d, EDUs: %d, failures: %d)", - destination, txn_id, - transaction.transaction_id, - len(pdus), - len(edus), - len(failures), - ) + logger.debug("TX [%s] Persisted transaction", destination) + logger.info( + "TX [%s] {%s} Sending transaction [%s]," + " (PDUs: %d, EDUs: %d, failures: %d)", + destination, txn_id, + transaction.transaction_id, + len(pdus), + len(edus), + len(failures), + ) - with limiter: - # Actually send the transaction - - # FIXME (erikj): This is a bit of a hack to make the Pdu age - # keys work - def json_data_cb(): - data = transaction.get_dict() - now = int(self.clock.time_msec()) - if "pdus" in data: - for p in data["pdus"]: - if "age_ts" in p: - unsigned = p.setdefault("unsigned", {}) - unsigned["age"] = now - int(p["age_ts"]) - del p["age_ts"] - return data - - try: - response = yield self.transport_layer.send_transaction( - transaction, json_data_cb - ) - code = 200 - - if response: - for e_id, r in response.get("pdus", {}).items(): - if "error" in r: - logger.warn( - "Transaction returned error for %s: %s", - e_id, r, - ) - except HttpResponseException as e: - code = e.code - response = e.response - - if e.code in (401, 404, 429) or 500 <= e.code: - logger.info( - "TX [%s] {%s} got %d response", - destination, txn_id, code + # Actually send the transaction + + # FIXME (erikj): This is a bit of a hack to make the Pdu age + # keys work + def json_data_cb(): + data = transaction.get_dict() + now = int(self.clock.time_msec()) + if "pdus" in data: + for p in data["pdus"]: + if "age_ts" in p: + unsigned = p.setdefault("unsigned", {}) + unsigned["age"] = now - int(p["age_ts"]) + del p["age_ts"] + return data + + try: + response = yield self.transport_layer.send_transaction( + transaction, json_data_cb + ) + code = 200 + + if response: + for e_id, r in response.get("pdus", {}).items(): + if "error" in r: + logger.warn( + "Transaction returned error for %s: %s", + e_id, r, ) - raise e + except HttpResponseException as e: + code = e.code + response = e.response + if e.code in (401, 404, 429) or 500 <= e.code: logger.info( "TX [%s] {%s} got %d response", destination, txn_id, code ) + raise e - logger.debug("TX [%s] Sent transaction", destination) - logger.debug("TX [%s] Marking as delivered...", destination) - - yield self.transaction_actions.delivered( - transaction, code, response - ) + logger.info( + "TX [%s] {%s} got %d response", + destination, txn_id, code + ) - logger.debug("TX [%s] Marked as delivered", destination) + logger.debug("TX [%s] Sent transaction", destination) + logger.debug("TX [%s] Marking as delivered...", destination) - if code != 200: - for p in pdus: - logger.info( - "Failed to send event %s to %s", p.event_id, destination - ) - success = False - except RuntimeError as e: - # We capture this here as there as nothing actually listens - # for this finishing functions deferred. - logger.warn( - "TX [%s] Problem in _attempt_transaction: %s", - destination, - e, - ) + yield self.transaction_actions.delivered( + transaction, code, response + ) - success = False + logger.debug("TX [%s] Marked as delivered", destination) + if code != 200: for p in pdus: - logger.info("Failed to send event %s to %s", p.event_id, destination) - except Exception as e: - # We capture this here as there as nothing actually listens - # for this finishing functions deferred. - logger.warn( - "TX [%s] Problem in _attempt_transaction: %s", - destination, - e, - ) - + logger.info( + "Failed to send event %s to %s", p.event_id, destination + ) success = False - for p in pdus: - logger.info("Failed to send event %s to %s", p.event_id, destination) - defer.returnValue(success) diff --git a/synapse/federation/transport/client.py b/synapse/federation/transport/client.py index f49e8a2cc4..cc9bc7f14b 100644 --- a/synapse/federation/transport/client.py +++ b/synapse/federation/transport/client.py @@ -163,6 +163,7 @@ class TransportLayerClient(object): data=json_data, json_data_callback=json_data_callback, long_retries=True, + backoff_on_404=True, # If we get a 404 the other side has gone ) logger.debug( -- cgit 1.5.1 From 5a16cb4bf036c6b1914d6c6248ed640c289b59e3 Mon Sep 17 00:00:00 2001 From: Richard van der Hoff Date: Thu, 23 Mar 2017 11:10:36 +0000 Subject: Ignore backoff history for invites, aliases, and roomdirs Add a param to the federation client which lets us ignore historical backoff data for federation queries, and set it for a handful of operations. --- synapse/federation/federation_client.py | 7 +++++-- synapse/federation/transport/client.py | 6 +++++- synapse/handlers/directory.py | 1 + synapse/handlers/profile.py | 6 ++++-- synapse/http/matrixfederationclient.py | 33 ++++++++++++++++++++++++++------- synapse/util/retryutils.py | 13 +++++++++++-- 6 files changed, 52 insertions(+), 14 deletions(-) (limited to 'synapse/federation') diff --git a/synapse/federation/federation_client.py b/synapse/federation/federation_client.py index dc44727b36..deee0f4904 100644 --- a/synapse/federation/federation_client.py +++ b/synapse/federation/federation_client.py @@ -88,7 +88,7 @@ class FederationClient(FederationBase): @log_function def make_query(self, destination, query_type, args, - retry_on_dns_fail=False): + retry_on_dns_fail=False, ignore_backoff=False): """Sends a federation Query to a remote homeserver of the given type and arguments. @@ -98,6 +98,8 @@ class FederationClient(FederationBase): handler name used in register_query_handler(). args (dict): Mapping of strings to strings containing the details of the query request. + ignore_backoff (bool): true to ignore the historical backoff data + and try the request anyway. Returns: a Deferred which will eventually yield a JSON object from the @@ -106,7 +108,8 @@ class FederationClient(FederationBase): sent_queries_counter.inc(query_type) return self.transport_layer.make_query( - destination, query_type, args, retry_on_dns_fail=retry_on_dns_fail + destination, query_type, args, retry_on_dns_fail=retry_on_dns_fail, + ignore_backoff=ignore_backoff, ) @log_function diff --git a/synapse/federation/transport/client.py b/synapse/federation/transport/client.py index cc9bc7f14b..15a03378f5 100644 --- a/synapse/federation/transport/client.py +++ b/synapse/federation/transport/client.py @@ -175,7 +175,8 @@ class TransportLayerClient(object): @defer.inlineCallbacks @log_function - def make_query(self, destination, query_type, args, retry_on_dns_fail): + def make_query(self, destination, query_type, args, retry_on_dns_fail, + ignore_backoff=False): path = PREFIX + "/query/%s" % query_type content = yield self.client.get_json( @@ -184,6 +185,7 @@ class TransportLayerClient(object): args=args, retry_on_dns_fail=retry_on_dns_fail, timeout=10000, + ignore_backoff=ignore_backoff, ) defer.returnValue(content) @@ -243,6 +245,7 @@ class TransportLayerClient(object): destination=destination, path=path, data=content, + ignore_backoff=True, ) defer.returnValue(response) @@ -270,6 +273,7 @@ class TransportLayerClient(object): destination=remote_server, path=path, args=args, + ignore_backoff=True, ) defer.returnValue(response) diff --git a/synapse/handlers/directory.py b/synapse/handlers/directory.py index 1b5317edf5..943554ce98 100644 --- a/synapse/handlers/directory.py +++ b/synapse/handlers/directory.py @@ -175,6 +175,7 @@ class DirectoryHandler(BaseHandler): "room_alias": room_alias.to_string(), }, retry_on_dns_fail=False, + ignore_backoff=True, ) except CodeMessageException as e: logging.warn("Error retrieving alias") diff --git a/synapse/handlers/profile.py b/synapse/handlers/profile.py index abd1fb28cb..9bf638f818 100644 --- a/synapse/handlers/profile.py +++ b/synapse/handlers/profile.py @@ -52,7 +52,8 @@ class ProfileHandler(BaseHandler): args={ "user_id": target_user.to_string(), "field": "displayname", - } + }, + ignore_backoff=True, ) except CodeMessageException as e: if e.code != 404: @@ -99,7 +100,8 @@ class ProfileHandler(BaseHandler): args={ "user_id": target_user.to_string(), "field": "avatar_url", - } + }, + ignore_backoff=True, ) except CodeMessageException as e: if e.code != 404: diff --git a/synapse/http/matrixfederationclient.py b/synapse/http/matrixfederationclient.py index b0885dc979..f9e32ef03d 100644 --- a/synapse/http/matrixfederationclient.py +++ b/synapse/http/matrixfederationclient.py @@ -106,12 +106,16 @@ class MatrixFederationHttpClient(object): def _request(self, destination, method, path, body_callback, headers_dict={}, param_bytes=b"", query_bytes=b"", retry_on_dns_fail=True, - timeout=None, long_retries=False, backoff_on_404=False): + timeout=None, long_retries=False, + ignore_backoff=False, + backoff_on_404=False): """ Creates and sends a request to the given server Args: destination (str): The remote server to send the HTTP request to. method (str): HTTP method path (str): The HTTP path + ignore_backoff (bool): true to ignore the historical backoff data + and try the request anyway. backoff_on_404 (bool): Back off if we get a 404 Returns: @@ -127,6 +131,7 @@ class MatrixFederationHttpClient(object): self.clock, self._store, backoff_on_404=backoff_on_404, + ignore_backoff=ignore_backoff, ) destination = destination.encode("ascii") @@ -271,7 +276,9 @@ class MatrixFederationHttpClient(object): @defer.inlineCallbacks def put_json(self, destination, path, data={}, json_data_callback=None, - long_retries=False, timeout=None, backoff_on_404=False): + long_retries=False, timeout=None, + ignore_backoff=False, + backoff_on_404=False): """ Sends the specifed json data using PUT Args: @@ -286,6 +293,8 @@ class MatrixFederationHttpClient(object): retry for a short or long time. timeout(int): How long to try (in ms) the destination for before giving up. None indicates no timeout. + ignore_backoff (bool): true to ignore the historical backoff data + and try the request anyway. backoff_on_404 (bool): True if we should count a 404 response as a failure of the server (and should therefore back off future requests) @@ -319,6 +328,7 @@ class MatrixFederationHttpClient(object): headers_dict={"Content-Type": ["application/json"]}, long_retries=long_retries, timeout=timeout, + ignore_backoff=ignore_backoff, backoff_on_404=backoff_on_404, ) @@ -331,7 +341,7 @@ class MatrixFederationHttpClient(object): @defer.inlineCallbacks def post_json(self, destination, path, data={}, long_retries=False, - timeout=None): + timeout=None, ignore_backoff=False): """ Sends the specifed json data using POST Args: @@ -344,7 +354,8 @@ class MatrixFederationHttpClient(object): retry for a short or long time. timeout(int): How long to try (in ms) the destination for before giving up. None indicates no timeout. - + ignore_backoff (bool): true to ignore the historical backoff data and + try the request anyway. Returns: Deferred: Succeeds when we get a 2xx HTTP response. The result will be the decoded JSON body. On a 4xx or 5xx error response a @@ -368,6 +379,7 @@ class MatrixFederationHttpClient(object): headers_dict={"Content-Type": ["application/json"]}, long_retries=long_retries, timeout=timeout, + ignore_backoff=ignore_backoff, ) if 200 <= response.code < 300: @@ -380,7 +392,7 @@ class MatrixFederationHttpClient(object): @defer.inlineCallbacks def get_json(self, destination, path, args={}, retry_on_dns_fail=True, - timeout=None): + timeout=None, ignore_backoff=False): """ GETs some json from the given host homeserver and path Args: @@ -392,6 +404,8 @@ class MatrixFederationHttpClient(object): timeout (int): How long to try (in ms) the destination for before giving up. None indicates no timeout and that the request will be retried. + ignore_backoff (bool): true to ignore the historical backoff data + and try the request anyway. Returns: Deferred: Succeeds when we get *any* HTTP response. @@ -424,6 +438,7 @@ class MatrixFederationHttpClient(object): body_callback=body_callback, retry_on_dns_fail=retry_on_dns_fail, timeout=timeout, + ignore_backoff=ignore_backoff, ) if 200 <= response.code < 300: @@ -436,13 +451,16 @@ class MatrixFederationHttpClient(object): @defer.inlineCallbacks def get_file(self, destination, path, output_stream, args={}, - retry_on_dns_fail=True, max_size=None): + retry_on_dns_fail=True, max_size=None, + ignore_backoff=False): """GETs a file from a given homeserver Args: destination (str): The remote server to send the HTTP request to. path (str): The HTTP path to GET. output_stream (file): File to write the response body to. args (dict): Optional dictionary used to create the query string. + ignore_backoff (bool): true to ignore the historical backoff data + and try the request anyway. Returns: Deferred: resolves with an (int,dict) tuple of the file length and a dict of the response headers. @@ -473,7 +491,8 @@ class MatrixFederationHttpClient(object): path, query_bytes=query_bytes, body_callback=body_callback, - retry_on_dns_fail=retry_on_dns_fail + retry_on_dns_fail=retry_on_dns_fail, + ignore_backoff=ignore_backoff, ) headers = dict(response.headers.getAllRawHeaders()) diff --git a/synapse/util/retryutils.py b/synapse/util/retryutils.py index 7e5a952584..7f5299bd32 100644 --- a/synapse/util/retryutils.py +++ b/synapse/util/retryutils.py @@ -35,7 +35,8 @@ class NotRetryingDestination(Exception): @defer.inlineCallbacks -def get_retry_limiter(destination, clock, store, **kwargs): +def get_retry_limiter(destination, clock, store, ignore_backoff=False, + **kwargs): """For a given destination check if we have previously failed to send a request there and are waiting before retrying the destination. If we are not ready to retry the destination, this will raise a @@ -43,6 +44,14 @@ def get_retry_limiter(destination, clock, store, **kwargs): that will mark the destination as down if an exception is thrown (excluding CodeMessageException with code < 500) + Args: + destination (str): name of homeserver + clock (synapse.util.clock): timing source + store (synapse.storage.transactions.TransactionStore): datastore + ignore_backoff (bool): true to ignore the historical backoff data and + try the request anyway. We will still update the next + retry_interval on success/failure. + Example usage: try: @@ -66,7 +75,7 @@ def get_retry_limiter(destination, clock, store, **kwargs): now = int(clock.time_msec()) - if retry_last_ts + retry_interval > now: + if not ignore_backoff and retry_last_ts + retry_interval > now: raise NotRetryingDestination( retry_last_ts=retry_last_ts, retry_interval=retry_interval, -- cgit 1.5.1