From 98c8fc6ce82d9d6b1bd21bf70df6a0e1ce91c1dc Mon Sep 17 00:00:00 2001
From: Erik Johnston <erik@matrix.org>
Date: Mon, 8 Nov 2021 09:54:47 +0000
Subject: Handle federation inbound instances being killed more gracefully
 (#11262)

* Make lock better handle process being killed

If the process gets killed and restarted (so that it didn't have a
chance to drop its locks gracefully) then there may still be locks in
the DB that are for the same instance that haven't yet timed out but are
safe to delete.

We handle this case by a) checking if the current instance already has
taken out the lock, and b) if not then ignoring locks that are for the
same instance.

* Periodically check for old staged events

This is to protect against other instances dying and their locks timing
out.
---
 synapse/federation/federation_server.py | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'synapse/federation')

diff --git a/synapse/federation/federation_server.py b/synapse/federation/federation_server.py
index 42e3acecb4..9a8758e9a6 100644
--- a/synapse/federation/federation_server.py
+++ b/synapse/federation/federation_server.py
@@ -213,6 +213,11 @@ class FederationServer(FederationBase):
             self._started_handling_of_staged_events = True
             self._handle_old_staged_events()
 
+            # Start a periodic check for old staged events. This is to handle
+            # the case where locks time out, e.g. if another process gets killed
+            # without dropping its locks.
+            self._clock.looping_call(self._handle_old_staged_events, 60 * 1000)
+
         # keep this as early as possible to make the calculated origin ts as
         # accurate as possible.
         request_time = self._clock.time_msec()
-- 
cgit 1.5.1