Avoid deep recursion in appservice recovery (#5885)
Hopefully, this will fix a stack overflow when recovering an appservice.
The recursion here leads to a huge chain of deferred callbacks, which then
overflows the stack when the chain completes. `inlineCallbacks` does a better
job of this if we use iteration instead.
Clean up the code a bit too, while we're there.
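For context, a minimal sketch of the two shapes (not Synapse code; `fetch_next_txn`
and `send_txn` are hypothetical helpers returning Deferreds):

    from twisted.internet import defer

    @defer.inlineCallbacks
    def retry_recursive(fetch_next_txn, send_txn):
        txn = yield fetch_next_txn()
        if txn is None:
            return
        yield send_txn(txn)
        # each completed send re-enters retry(), chaining another Deferred
        # callback; a long backlog builds a deep chain that unwinds all at once
        yield retry_recursive(fetch_next_txn, send_txn)

    @defer.inlineCallbacks
    def retry_iterative(fetch_next_txn, send_txn):
        while True:
            txn = yield fetch_next_txn()
            if txn is None:
                return
            # the loop resumes the same generator instead of nesting new ones,
            # so the callback chain stays short
            yield send_txn(txn)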
2 files changed, 26 insertions, 18 deletions
diff --git a/changelog.d/5885.bugfix b/changelog.d/5885.bugfix
new file mode 100644
index 0000000000..411d925fd4
--- /dev/null
+++ b/changelog.d/5885.bugfix
@@ -0,0 +1 @@
+Fix stack overflow when recovering an appservice which had an outage.
diff --git a/synapse/appservice/scheduler.py b/synapse/appservice/scheduler.py
index 42a350bff8..0ae12cbac9 100644
--- a/synapse/appservice/scheduler.py
+++ b/synapse/appservice/scheduler.py
@@ -224,7 +224,9 @@ class _Recoverer(object):
"as-recoverer-%s" % (self.service.id,), self.retry
)
- self.clock.call_later((2 ** self.backoff_counter), _retry)
+ delay = 2 ** self.backoff_counter
+ logger.info("Scheduling retries on %s in %fs", self.service.id, delay)
+ self.clock.call_later(delay, _retry)
def _backoff(self):
# cap the backoff to be around 8.5min => (2^9) = 512 secs
@@ -234,25 +236,30 @@ class _Recoverer(object):
@defer.inlineCallbacks
def retry(self):
+ logger.info("Starting retries on %s", self.service.id)
try:
- txn = yield self.store.get_oldest_unsent_txn(self.service)
- if txn:
+ while True:
+ txn = yield self.store.get_oldest_unsent_txn(self.service)
+ if not txn:
+ # nothing left: we're done!
+ self.callback(self)
+ return
+
logger.info(
"Retrying transaction %s for AS ID %s", txn.id, txn.service.id
)
sent = yield txn.send(self.as_api)
- if sent:
- yield txn.complete(self.store)
- # reset the backoff counter and retry immediately
- self.backoff_counter = 1
- yield self.retry()
- else:
- self._backoff()
- else:
- self._set_service_recovered()
- except Exception as e:
- logger.exception(e)
- self._backoff()
-
- def _set_service_recovered(self):
- self.callback(self)
+ if not sent:
+ break
+
+ yield txn.complete(self.store)
+
+ # reset the backoff counter and then process the next transaction
+ self.backoff_counter = 1
+
+ except Exception:
+ logger.exception("Unexpected error running retries")
+
+ # we didn't manage to send all of the transactions before we got an error of
+ # some flavour: reschedule the next retry.
+ self._backoff()