From aed62a35832a3ec1c7425ecc99cab06a781263ba Mon Sep 17 00:00:00 2001 From: Matthew Hodgson Date: Sun, 7 Dec 2014 02:26:07 +0000 Subject: track replication destination health, and perform exponential back-off when sending transactions. does *not* yet retry transactions, but drops them on the floor if waiting for a server to recover. --- synapse/http/matrixfederationclient.py | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) (limited to 'synapse/http') diff --git a/synapse/http/matrixfederationclient.py b/synapse/http/matrixfederationclient.py index 510f07dd7b..3edc59dbab 100644 --- a/synapse/http/matrixfederationclient.py +++ b/synapse/http/matrixfederationclient.py @@ -89,7 +89,7 @@ class MatrixFederationHttpClient(object): ("", "", path_bytes, param_bytes, query_bytes, "",) ) - logger.debug("Sending request to %s: %s %s", + logger.info("Sending request to %s: %s %s", destination, method, url_bytes) logger.debug( @@ -101,7 +101,10 @@ class MatrixFederationHttpClient(object): ] ) - retries_left = 5 + # was 5; for now, let's only try once at the HTTP layer and then + # rely on transaction-layer retries for exponential backoff and + # getting the message through. + retries_left = 0 endpoint = self._getEndpoint(reactor, destination) @@ -131,7 +134,8 @@ class MatrixFederationHttpClient(object): e) raise SynapseError(400, "Domain specified not found.") - logger.exception("Got error in _create_request") + logger.exception("Sending request failed to %s: %s %s : %s", + destination, method, url_bytes, e) _print_ex(e) if retries_left: @@ -140,15 +144,15 @@ class MatrixFederationHttpClient(object): else: raise + logger.info("Received response %d %s for %s: %s %s", + response.code, response.phrase, destination, method, url_bytes) + if 200 <= response.code < 300: # We need to update the transactions table to say it was sent? pass else: # :'( # Update transactions table? - logger.error( - "Got response %d %s", response.code, response.phrase - ) raise CodeMessageException( response.code, response.phrase ) -- cgit 1.4.1 From 2b1acb7671e33baeb01be2f0facd20cd6ea7e3b5 Mon Sep 17 00:00:00 2001 From: Matthew Hodgson Date: Wed, 10 Dec 2014 00:03:55 +0000 Subject: squidge to 79 columns as per pep8 --- synapse/federation/replication.py | 30 ++++++++++++++++++++---------- synapse/http/matrixfederationclient.py | 7 ++++--- synapse/storage/transactions.py | 18 ++++++++++++------ 3 files changed, 36 insertions(+), 19 deletions(-) (limited to 'synapse/http') diff --git a/synapse/federation/replication.py b/synapse/federation/replication.py index c4c6667b62..c242488483 100644 --- a/synapse/federation/replication.py +++ b/synapse/federation/replication.py @@ -778,21 +778,25 @@ class _TransactionQueue(object): def _attempt_new_transaction(self, destination): (retry_last_ts, retry_interval) = (0, 0) - retry_timings = yield self.store.get_destination_retry_timings(destination) + retry_timings = yield self.store.get_destination_retry_timings( + destination + ) if retry_timings: (retry_last_ts, retry_interval) = ( retry_timings.retry_last_ts, retry_timings.retry_interval ) if retry_last_ts + retry_interval > int(self._clock.time_msec()): - logger.info("TX [%s] not ready for retry yet - dropping transaction for now", destination) + logger.info("TX [%s] not ready for retry yet - " + "dropping transaction for now", destination) return else: logger.info("TX [%s] is ready for retry", destination) if destination in self.pending_transactions: - # XXX: pending_transactions can get stuck on by a never-ending request - # at which point pending_pdus_by_dest just keeps growing. - # we need application-layer timeouts of some flavour of these requests + # XXX: pending_transactions can get stuck on by a never-ending + # request at which point pending_pdus_by_dest just keeps growing. + # we need application-layer timeouts of some flavour of these + # requests return # list of (pending_pdu, deferred, order) @@ -803,8 +807,10 @@ class _TransactionQueue(object): if not pending_pdus and not pending_edus and not pending_failures: return - logger.debug("TX [%s] Attempting new transaction (pdus: %d, edus: %d, failures: %d)", - destination, len(pending_pdus), len(pending_edus), len(pending_failures)) + logger.debug("TX [%s] Attempting new transaction " + "(pdus: %d, edus: %d, failures: %d)", + destination, + len(pending_pdus), len(pending_edus), len(pending_failures)) # Sort based on the order field pending_pdus.sort(key=lambda t: t[2]) @@ -837,7 +843,8 @@ class _TransactionQueue(object): yield self.transaction_actions.prepare_to_send(transaction) logger.debug("TX [%s] Persisted transaction", destination) - logger.info("TX [%s] Sending transaction [%s]", destination, transaction.transaction_id) + logger.info("TX [%s] Sending transaction [%s]", destination, + transaction.transaction_id) # Actually send the transaction @@ -874,7 +881,9 @@ class _TransactionQueue(object): if code == 200: if retry_last_ts: # this host is alive! reset retry schedule - yield self.store.set_destination_retry_timings(destination, 0, 0) + yield self.store.set_destination_retry_timings( + destination, 0, 0 + ) deferred.callback(None) else: self.start_retrying(destination, retry_interval) @@ -892,7 +901,8 @@ class _TransactionQueue(object): except Exception as e: # We capture this here as there as nothing actually listens # for this finishing functions deferred. - logger.exception("TX [%s] Problem in _attempt_transaction: %s", destination, e) + logger.exception("TX [%s] Problem in _attempt_transaction: %s", + destination, e) self.start_retrying(destination, retry_interval) diff --git a/synapse/http/matrixfederationclient.py b/synapse/http/matrixfederationclient.py index 3edc59dbab..c76990904d 100644 --- a/synapse/http/matrixfederationclient.py +++ b/synapse/http/matrixfederationclient.py @@ -90,7 +90,7 @@ class MatrixFederationHttpClient(object): ) logger.info("Sending request to %s: %s %s", - destination, method, url_bytes) + destination, method, url_bytes) logger.debug( "Types: %s", @@ -135,7 +135,7 @@ class MatrixFederationHttpClient(object): raise SynapseError(400, "Domain specified not found.") logger.exception("Sending request failed to %s: %s %s : %s", - destination, method, url_bytes, e) + destination, method, url_bytes, e) _print_ex(e) if retries_left: @@ -145,7 +145,8 @@ class MatrixFederationHttpClient(object): raise logger.info("Received response %d %s for %s: %s %s", - response.code, response.phrase, destination, method, url_bytes) + response.code, response.phrase, + destination, method, url_bytes) if 200 <= response.code < 300: # We need to update the transactions table to say it was sent? diff --git a/synapse/storage/transactions.py b/synapse/storage/transactions.py index 237b024451..2b16787695 100644 --- a/synapse/storage/transactions.py +++ b/synapse/storage/transactions.py @@ -28,7 +28,8 @@ class TransactionStore(SQLBaseStore): """A collection of queries for handling PDUs. """ - # a write-through cache of DestinationsTable.EntryType indexed by destination string + # a write-through cache of DestinationsTable.EntryType indexed by + # destination string destination_retry_cache = {} def get_received_txn_response(self, transaction_id, origin): @@ -238,7 +239,8 @@ class TransactionStore(SQLBaseStore): else: return None - def set_destination_retry_timings(self, destination, retry_last_ts, retry_interval): + def set_destination_retry_timings(self, destination, + retry_last_ts, retry_interval): """Sets the current retry timings for a given destination. Both timings should be zero if retrying is no longer occuring. @@ -249,15 +251,19 @@ class TransactionStore(SQLBaseStore): """ self.destination_retry_cache[destination] = ( - DestinationsTable.EntryType(destination, retry_last_ts, retry_interval) + DestinationsTable.EntryType(destination, + retry_last_ts, retry_interval) ) - # xxx: we could chose to not bother persisting this if our cache things this is a NOOP + # XXX: we could chose to not bother persisting this if our cache thinks + # this is a NOOP return self.runInteraction( "set_destination_retry_timings", - self._set_destination_retry_timings, destination, retry_last_ts, retry_interval) + self._set_destination_retry_timings, destination, + retry_last_ts, retry_interval) - def _set_destination_retry_timings(cls, txn, destination, retry_last_ts, retry_interval): + def _set_destination_retry_timings(cls, txn, destination, + retry_last_ts, retry_interval): query = ( "INSERT OR REPLACE INTO %s " -- cgit 1.4.1 From faf12b64f81627d92cb1ac49b6eb58f9d3f4837d Mon Sep 17 00:00:00 2001 From: Matthew Hodgson Date: Wed, 10 Dec 2014 00:12:51 +0000 Subject: add errbacks to enqueue_pdu deferreds; change logging for failed federation sends to warn rather than exception --- synapse/federation/replication.py | 16 ++++++++++------ synapse/http/matrixfederationclient.py | 4 ++-- 2 files changed, 12 insertions(+), 8 deletions(-) (limited to 'synapse/http') diff --git a/synapse/federation/replication.py b/synapse/federation/replication.py index c242488483..346b5f04c0 100644 --- a/synapse/federation/replication.py +++ b/synapse/federation/replication.py @@ -334,7 +334,7 @@ class ReplicationLayer(object): defer.returnValue(response) return - logger.debug("[%s] Transacition is new", transaction.transaction_id) + logger.debug("[%s] Transaction is new", transaction.transaction_id) with PreserveLoggingContext(): dl = [] @@ -724,15 +724,19 @@ class _TransactionQueue(object): deferreds = [] for destination in destinations: - # XXX: why don't we specify an errback for this deferred - # like we do for EDUs? --matthew deferred = defer.Deferred() self.pending_pdus_by_dest.setdefault(destination, []).append( (pdu, deferred, order) ) + + def eb(failure): + if not deferred.called: + deferred.errback(failure) + else: + logger.warn("Failed to send pdu", failure) with PreserveLoggingContext(): - self._attempt_new_transaction(destination) + self._attempt_new_transaction(destination).addErrback(eb) deferreds.append(deferred) @@ -754,7 +758,7 @@ class _TransactionQueue(object): if not deferred.called: deferred.errback(failure) else: - logger.exception("Failed to send edu", failure) + logger.warn("Failed to send edu", failure) with PreserveLoggingContext(): self._attempt_new_transaction(destination).addErrback(eb) @@ -901,7 +905,7 @@ class _TransactionQueue(object): except Exception as e: # We capture this here as there as nothing actually listens # for this finishing functions deferred. - logger.exception("TX [%s] Problem in _attempt_transaction: %s", + logger.warn("TX [%s] Problem in _attempt_transaction: %s", destination, e) self.start_retrying(destination, retry_interval) diff --git a/synapse/http/matrixfederationclient.py b/synapse/http/matrixfederationclient.py index c76990904d..8fc6bf8f97 100644 --- a/synapse/http/matrixfederationclient.py +++ b/synapse/http/matrixfederationclient.py @@ -134,7 +134,7 @@ class MatrixFederationHttpClient(object): e) raise SynapseError(400, "Domain specified not found.") - logger.exception("Sending request failed to %s: %s %s : %s", + logger.warn("Sending request failed to %s: %s %s : %s", destination, method, url_bytes, e) _print_ex(e) @@ -289,7 +289,7 @@ def _print_ex(e): for ex in e.reasons: _print_ex(ex) else: - logger.exception(e) + logger.warn(e) class _JsonProducer(object): -- cgit 1.4.1 From b8d30899b1296347a75d5a59e32d73a5236e6ea2 Mon Sep 17 00:00:00 2001 From: Erik Johnston Date: Wed, 10 Dec 2014 10:16:09 +0000 Subject: Code style. --- synapse/federation/replication.py | 52 +++++++++++++++++++++++----------- synapse/http/matrixfederationclient.py | 29 +++++++++++++------ synapse/storage/transactions.py | 50 ++++++++++++++++++-------------- 3 files changed, 85 insertions(+), 46 deletions(-) (limited to 'synapse/http') diff --git a/synapse/federation/replication.py b/synapse/federation/replication.py index 589a3f581b..0cb632fb08 100644 --- a/synapse/federation/replication.py +++ b/synapse/federation/replication.py @@ -728,7 +728,7 @@ class _TransactionQueue(object): self.pending_pdus_by_dest.setdefault(destination, []).append( (pdu, deferred, order) ) - + def eb(failure): if not deferred.called: deferred.errback(failure) @@ -745,7 +745,7 @@ class _TransactionQueue(object): # NO inlineCallbacks def enqueue_edu(self, edu): destination = edu.destination - + if destination == self.server_name: return @@ -776,7 +776,7 @@ class _TransactionQueue(object): ) yield deferred - + @defer.inlineCallbacks @log_function def _attempt_new_transaction(self, destination): @@ -790,12 +790,15 @@ class _TransactionQueue(object): retry_timings.retry_last_ts, retry_timings.retry_interval ) if retry_last_ts + retry_interval > int(self._clock.time_msec()): - logger.info("TX [%s] not ready for retry yet - " - "dropping transaction for now", destination) + logger.info( + "TX [%s] not ready for retry yet - " + "dropping transaction for now", + destination, + ) return else: logger.info("TX [%s] is ready for retry", destination) - + if destination in self.pending_transactions: # XXX: pending_transactions can get stuck on by a never-ending # request at which point pending_pdus_by_dest just keeps growing. @@ -811,10 +814,14 @@ class _TransactionQueue(object): if not pending_pdus and not pending_edus and not pending_failures: return - logger.debug("TX [%s] Attempting new transaction " - "(pdus: %d, edus: %d, failures: %d)", + logger.debug( + "TX [%s] Attempting new transaction " + "(pdus: %d, edus: %d, failures: %d)", destination, - len(pending_pdus), len(pending_edus), len(pending_failures)) + len(pending_pdus), + len(pending_edus), + len(pending_failures) + ) # Sort based on the order field pending_pdus.sort(key=lambda t: t[2]) @@ -847,8 +854,11 @@ class _TransactionQueue(object): yield self.transaction_actions.prepare_to_send(transaction) logger.debug("TX [%s] Persisted transaction", destination) - logger.info("TX [%s] Sending transaction [%s]", destination, - transaction.transaction_id) + logger.info( + "TX [%s] Sending transaction [%s]", + destination, + transaction.transaction_id, + ) # Actually send the transaction @@ -905,11 +915,14 @@ class _TransactionQueue(object): except Exception as e: # We capture this here as there as nothing actually listens # for this finishing functions deferred. - logger.warn("TX [%s] Problem in _attempt_transaction: %s", - destination, e) + logger.warn( + "TX [%s] Problem in _attempt_transaction: %s", + destination, + e, + ) self.set_retrying(destination, retry_interval) - + for deferred in deferreds: if not deferred.called: deferred.errback(e) @@ -925,12 +938,17 @@ class _TransactionQueue(object): def set_retrying(self, destination, retry_interval): # track that this destination is having problems and we should # give it a chance to recover before trying it again + if retry_interval: retry_interval *= 2 # plateau at hourly retries for now if retry_interval >= 60 * 60 * 1000: retry_interval = 60 * 60 * 1000 else: - retry_interval = 2000 # try again at first after 2 seconds - yield self.store.set_destination_retry_timings(destination, - int(self._clock.time_msec()), retry_interval) + retry_interval = 2000 # try again at first after 2 seconds + + yield self.store.set_destination_retry_timings( + destination, + int(self._clock.time_msec()), + retry_interval + ) diff --git a/synapse/http/matrixfederationclient.py b/synapse/http/matrixfederationclient.py index 8fc6bf8f97..16fb2adab5 100644 --- a/synapse/http/matrixfederationclient.py +++ b/synapse/http/matrixfederationclient.py @@ -130,12 +130,20 @@ class MatrixFederationHttpClient(object): break except Exception as e: if not retry_on_dns_fail and isinstance(e, DNSLookupError): - logger.warn("DNS Lookup failed to %s with %s", destination, - e) + logger.warn( + "DNS Lookup failed to %s with %s", + destination, + e + ) raise SynapseError(400, "Domain specified not found.") - logger.warn("Sending request failed to %s: %s %s : %s", - destination, method, url_bytes, e) + logger.warn( + "Sending request failed to %s: %s %s : %s", + destination, + method, + url_bytes, + e + ) _print_ex(e) if retries_left: @@ -144,10 +152,15 @@ class MatrixFederationHttpClient(object): else: raise - logger.info("Received response %d %s for %s: %s %s", - response.code, response.phrase, - destination, method, url_bytes) - + logger.info( + "Received response %d %s for %s: %s %s", + response.code, + response.phrase, + destination, + method, + url_bytes + ) + if 200 <= response.code < 300: # We need to update the transactions table to say it was sent? pass diff --git a/synapse/storage/transactions.py b/synapse/storage/transactions.py index 2b16787695..423cc3f02a 100644 --- a/synapse/storage/transactions.py +++ b/synapse/storage/transactions.py @@ -27,7 +27,7 @@ logger = logging.getLogger(__name__) class TransactionStore(SQLBaseStore): """A collection of queries for handling PDUs. """ - + # a write-through cache of DestinationsTable.EntryType indexed by # destination string destination_retry_cache = {} @@ -213,21 +213,21 @@ class TransactionStore(SQLBaseStore): def get_destination_retry_timings(self, destination): """Gets the current retry timings (if any) for a given destination. - + Args: destination (str) - + Returns: None if not retrying Otherwise a DestinationsTable.EntryType for the retry scheme """ if destination in self.destination_retry_cache: return defer.succeed(self.destination_retry_cache[destination]) - + return self.runInteraction( "get_destination_retry_timings", self._get_destination_retry_timings, destination) - + def _get_destination_retry_timings(cls, txn, destination): query = DestinationsTable.select_statement("destination = ?") txn.execute(query, (destination,)) @@ -238,30 +238,36 @@ class TransactionStore(SQLBaseStore): return result else: return None - + def set_destination_retry_timings(self, destination, retry_last_ts, retry_interval): """Sets the current retry timings for a given destination. Both timings should be zero if retrying is no longer occuring. - + Args: destination (str) retry_last_ts (int) - time of last retry attempt in unix epoch ms retry_interval (int) - how long until next retry in ms """ - + self.destination_retry_cache[destination] = ( - DestinationsTable.EntryType(destination, - retry_last_ts, retry_interval) + DestinationsTable.EntryType( + destination, + retry_last_ts, + retry_interval + ) ) - + # XXX: we could chose to not bother persisting this if our cache thinks # this is a NOOP return self.runInteraction( "set_destination_retry_timings", - self._set_destination_retry_timings, destination, - retry_last_ts, retry_interval) - + self._set_destination_retry_timings, + destination, + retry_last_ts, + retry_interval, + ) + def _set_destination_retry_timings(cls, txn, destination, retry_last_ts, retry_interval): @@ -275,21 +281,22 @@ class TransactionStore(SQLBaseStore): def get_destinations_needing_retry(self): """Get all destinations which are due a retry for sending a transaction. - + Returns: list: A list of `DestinationsTable.EntryType` """ - + return self.runInteraction( "get_destinations_needing_retry", self._get_destinations_needing_retry ) - + def _get_destinations_needing_retry(cls, txn): where = "retry_last_ts > 0 and retry_next_ts < now()" query = DestinationsTable.select_statement(where) txn.execute(query) - return DestinationsTable.decode_results(txn.fetchall()) + return DestinationsTable.decode_results(txn.fetchall()) + class ReceivedTransactionsTable(Table): table_name = "received_transactions" @@ -332,14 +339,15 @@ class TransactionsToPduTable(Table): ] EntryType = namedtuple("TransactionsToPduEntry", fields) - + + class DestinationsTable(Table): table_name = "destinations" - + fields = [ "destination", "retry_last_ts", "retry_interval", ] - EntryType = namedtuple("DestinationsEntry", fields) \ No newline at end of file + EntryType = namedtuple("DestinationsEntry", fields) -- cgit 1.4.1 From 0f4dcab238b029407080cb02cc2cf14e22d8fe89 Mon Sep 17 00:00:00 2001 From: Matthew Hodgson Date: Wed, 10 Dec 2014 10:28:27 +0000 Subject: turn back on per-request transaction retries, so that every time we try to hit a dead server we actually end up hammering 5 times :| --- synapse/http/matrixfederationclient.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'synapse/http') diff --git a/synapse/http/matrixfederationclient.py b/synapse/http/matrixfederationclient.py index 16fb2adab5..fc5b5ab809 100644 --- a/synapse/http/matrixfederationclient.py +++ b/synapse/http/matrixfederationclient.py @@ -101,10 +101,9 @@ class MatrixFederationHttpClient(object): ] ) - # was 5; for now, let's only try once at the HTTP layer and then - # rely on transaction-layer retries for exponential backoff and - # getting the message through. - retries_left = 0 + # XXX: Would be much nicer to retry only at the transaction-layer + # (once we have reliable transactions in place) + retries_left = 5 endpoint = self._getEndpoint(reactor, destination) -- cgit 1.4.1