diff options
Diffstat (limited to 'synapse/federation/replication.py')
-rw-r--r-- | synapse/federation/replication.py | 54 |
1 files changed, 47 insertions, 7 deletions
diff --git a/synapse/federation/replication.py b/synapse/federation/replication.py index 01f87fe423..c4c6667b62 100644 --- a/synapse/federation/replication.py +++ b/synapse/federation/replication.py @@ -685,6 +685,7 @@ class _TransactionQueue(object): self.transport_layer = transport_layer self._clock = hs.get_clock() + self.store = hs.get_datastore() # Is a mapping from destinations -> deferreds. Used to keep track # of which destinations have transactions in flight and when they are @@ -723,6 +724,8 @@ class _TransactionQueue(object): deferreds = [] for destination in destinations: + # XXX: why don't we specify an errback for this deferred + # like we do for EDUs? --matthew deferred = defer.Deferred() self.pending_pdus_by_dest.setdefault(destination, []).append( (pdu, deferred, order) @@ -738,6 +741,9 @@ class _TransactionQueue(object): # NO inlineCallbacks def enqueue_edu(self, edu): destination = edu.destination + + if destination == self.server_name: + return deferred = defer.Deferred() self.pending_edus_by_dest.setdefault(destination, []).append( @@ -766,14 +772,30 @@ class _TransactionQueue(object): ) yield deferred - + @defer.inlineCallbacks @log_function def _attempt_new_transaction(self, destination): + + (retry_last_ts, retry_interval) = (0, 0) + retry_timings = yield self.store.get_destination_retry_timings(destination) + if retry_timings: + (retry_last_ts, retry_interval) = ( + retry_timings.retry_last_ts, retry_timings.retry_interval + ) + if retry_last_ts + retry_interval > int(self._clock.time_msec()): + logger.info("TX [%s] not ready for retry yet - dropping transaction for now", destination) + return + else: + logger.info("TX [%s] is ready for retry", destination) + if destination in self.pending_transactions: + # XXX: pending_transactions can get stuck on by a never-ending request + # at which point pending_pdus_by_dest just keeps growing. + # we need application-layer timeouts of some flavour of these requests return - # list of (pending_pdu, deferred, order) + # list of (pending_pdu, deferred, order) pending_pdus = self.pending_pdus_by_dest.pop(destination, []) pending_edus = self.pending_edus_by_dest.pop(destination, []) pending_failures = self.pending_failures_by_dest.pop(destination, []) @@ -781,7 +803,8 @@ class _TransactionQueue(object): if not pending_pdus and not pending_edus and not pending_failures: return - logger.debug("TX [%s] Attempting new transaction", destination) + logger.debug("TX [%s] Attempting new transaction (pdus: %d, edus: %d, failures: %d)", + destination, len(pending_pdus), len(pending_edus), len(pending_failures)) # Sort based on the order field pending_pdus.sort(key=lambda t: t[2]) @@ -814,7 +837,7 @@ class _TransactionQueue(object): yield self.transaction_actions.prepare_to_send(transaction) logger.debug("TX [%s] Persisted transaction", destination) - logger.debug("TX [%s] Sending transaction...", destination) + logger.info("TX [%s] Sending transaction [%s]", destination, transaction.transaction_id) # Actually send the transaction @@ -835,6 +858,8 @@ class _TransactionQueue(object): transaction, json_data_cb ) + logger.info("TX [%s] got %d response", destination, code) + logger.debug("TX [%s] Sent transaction", destination) logger.debug("TX [%s] Marking as delivered...", destination) @@ -847,8 +872,12 @@ class _TransactionQueue(object): for deferred in deferreds: if code == 200: + if retry_last_ts: + # this host is alive! reset retry schedule + yield self.store.set_destination_retry_timings(destination, 0, 0) deferred.callback(None) else: + self.start_retrying(destination, retry_interval) deferred.errback(RuntimeError("Got status %d" % code)) # Ensures we don't continue until all callbacks on that @@ -861,12 +890,12 @@ class _TransactionQueue(object): logger.debug("TX [%s] Yielded to callbacks", destination) except Exception as e: - logger.error("TX Problem in _attempt_transaction") - # We capture this here as there as nothing actually listens # for this finishing functions deferred. - logger.exception(e) + logger.exception("TX [%s] Problem in _attempt_transaction: %s", destination, e) + self.start_retrying(destination, retry_interval) + for deferred in deferreds: if not deferred.called: deferred.errback(e) @@ -877,3 +906,14 @@ class _TransactionQueue(object): # Check to see if there is anything else to send. self._attempt_new_transaction(destination) + + @defer.inlineCallbacks + def start_retrying(self, destination, retry_interval): + # track that this destination is having problems and we should + # give it a chance to recover before trying it again + if retry_interval: + retry_interval *= 2 + else: + retry_interval = 2000 # try again at first after 2 seconds + yield self.store.set_destination_retry_timings(destination, + int(self._clock.time_msec()), retry_interval) |