From 58f61f10f780a5f9e6be99f4072c24442594d597 Mon Sep 17 00:00:00 2001 From: reivilibre <38398653+reivilibre@users.noreply.github.com> Date: Fri, 4 Sep 2020 12:22:23 +0100 Subject: Catch-up after Federation Outage (split, 1) (#8230) Signed-off-by: Olivier Wilkinson (reivilibre) --- tests/federation/test_federation_catch_up.py | 82 ++++++++++++++++++++++++++++ 1 file changed, 82 insertions(+) create mode 100644 tests/federation/test_federation_catch_up.py (limited to 'tests/federation/test_federation_catch_up.py') diff --git a/tests/federation/test_federation_catch_up.py b/tests/federation/test_federation_catch_up.py new file mode 100644 index 0000000000..73c51c9d6c --- /dev/null +++ b/tests/federation/test_federation_catch_up.py @@ -0,0 +1,82 @@ +from mock import Mock + +from synapse.rest import admin +from synapse.rest.client.v1 import login, room + +from tests.test_utils import event_injection, make_awaitable +from tests.unittest import FederatingHomeserverTestCase, override_config + + +class FederationCatchUpTestCases(FederatingHomeserverTestCase): + servlets = [ + admin.register_servlets, + room.register_servlets, + login.register_servlets, + ] + + def make_homeserver(self, reactor, clock): + return self.setup_test_homeserver( + federation_transport_client=Mock(spec=["send_transaction"]), + ) + + def prepare(self, reactor, clock, hs): + # stub out get_current_hosts_in_room + state_handler = hs.get_state_handler() + + # This mock is crucial for destination_rooms to be populated. + state_handler.get_current_hosts_in_room = Mock( + return_value=make_awaitable(["test", "host2"]) + ) + + def get_destination_room(self, room: str, destination: str = "host2") -> dict: + """ + Gets the destination_rooms entry for a (destination, room_id) pair. + + Args: + room: room ID + destination: what destination, default is "host2" + + Returns: + Dictionary of { event_id: str, stream_ordering: int } + """ + event_id, stream_ordering = self.get_success( + self.hs.get_datastore().db_pool.execute( + "test:get_destination_rooms", + None, + """ + SELECT event_id, stream_ordering + FROM destination_rooms dr + JOIN events USING (stream_ordering) + WHERE dr.destination = ? AND dr.room_id = ? + """, + destination, + room, + ) + )[0] + return {"event_id": event_id, "stream_ordering": stream_ordering} + + @override_config({"send_federation": True}) + def test_catch_up_destination_rooms_tracking(self): + """ + Tests that we populate the `destination_rooms` table as needed. + """ + self.register_user("u1", "you the one") + u1_token = self.login("u1", "you the one") + room = self.helper.create_room_as("u1", tok=u1_token) + + self.get_success( + event_injection.inject_member_event(self.hs, room, "@user:host2", "join") + ) + + event_id_1 = self.helper.send(room, "wombats!", tok=u1_token)["event_id"] + + row_1 = self.get_destination_room(room) + + event_id_2 = self.helper.send(room, "rabbits!", tok=u1_token)["event_id"] + + row_2 = self.get_destination_room(room) + + # check: events correctly registered in order + self.assertEqual(row_1["event_id"], event_id_1) + self.assertEqual(row_2["event_id"], event_id_2) + self.assertEqual(row_1["stream_ordering"], row_2["stream_ordering"] - 1) -- cgit 1.5.1 From 765437df54a0e74f37a2c65515f54755096eeecd Mon Sep 17 00:00:00 2001 From: reivilibre <38398653+reivilibre@users.noreply.github.com> Date: Mon, 7 Sep 2020 10:11:38 +0100 Subject: Add tests for `last_successful_stream_ordering` (#8258) --- changelog.d/8258.misc | 1 + tests/federation/test_federation_catch_up.py | 76 ++++++++++++++++++++++++++++ 2 files changed, 77 insertions(+) create mode 100644 changelog.d/8258.misc (limited to 'tests/federation/test_federation_catch_up.py') diff --git a/changelog.d/8258.misc b/changelog.d/8258.misc new file mode 100644 index 0000000000..3c27803be4 --- /dev/null +++ b/changelog.d/8258.misc @@ -0,0 +1 @@ +Track the `stream_ordering` of the last successfully-sent event to every destination, so we can use this information to 'catch up' a remote server after an outage. diff --git a/tests/federation/test_federation_catch_up.py b/tests/federation/test_federation_catch_up.py index 73c51c9d6c..6cdcc378f0 100644 --- a/tests/federation/test_federation_catch_up.py +++ b/tests/federation/test_federation_catch_up.py @@ -28,6 +28,24 @@ class FederationCatchUpTestCases(FederatingHomeserverTestCase): return_value=make_awaitable(["test", "host2"]) ) + # whenever send_transaction is called, record the pdu data + self.pdus = [] + self.failed_pdus = [] + self.is_online = True + self.hs.get_federation_transport_client().send_transaction.side_effect = ( + self.record_transaction + ) + + async def record_transaction(self, txn, json_cb): + if self.is_online: + data = json_cb() + self.pdus.extend(data["pdus"]) + return {} + else: + data = json_cb() + self.failed_pdus.extend(data["pdus"]) + raise IOError("Failed to connect because this is a test!") + def get_destination_room(self, room: str, destination: str = "host2") -> dict: """ Gets the destination_rooms entry for a (destination, room_id) pair. @@ -80,3 +98,61 @@ class FederationCatchUpTestCases(FederatingHomeserverTestCase): self.assertEqual(row_1["event_id"], event_id_1) self.assertEqual(row_2["event_id"], event_id_2) self.assertEqual(row_1["stream_ordering"], row_2["stream_ordering"] - 1) + + @override_config({"send_federation": True}) + def test_catch_up_last_successful_stream_ordering_tracking(self): + """ + Tests that we populate the `destination_rooms` table as needed. + """ + self.register_user("u1", "you the one") + u1_token = self.login("u1", "you the one") + room = self.helper.create_room_as("u1", tok=u1_token) + + # take the remote offline + self.is_online = False + + self.get_success( + event_injection.inject_member_event(self.hs, room, "@user:host2", "join") + ) + + self.helper.send(room, "wombats!", tok=u1_token) + self.pump() + + lsso_1 = self.get_success( + self.hs.get_datastore().get_destination_last_successful_stream_ordering( + "host2" + ) + ) + + self.assertIsNone( + lsso_1, + "There should be no last successful stream ordering for an always-offline destination", + ) + + # bring the remote online + self.is_online = True + + event_id_2 = self.helper.send(room, "rabbits!", tok=u1_token)["event_id"] + + lsso_2 = self.get_success( + self.hs.get_datastore().get_destination_last_successful_stream_ordering( + "host2" + ) + ) + row_2 = self.get_destination_room(room) + + self.assertEqual( + self.pdus[0]["content"]["body"], + "rabbits!", + "Test fault: didn't receive the right PDU", + ) + self.assertEqual( + row_2["event_id"], + event_id_2, + "Test fault: destination_rooms not updated correctly", + ) + self.assertEqual( + lsso_2, + row_2["stream_ordering"], + "Send succeeded but not marked as last_successful_stream_ordering", + ) -- cgit 1.5.1 From 576bc37d318f866f11f71e34ce7190aa45b74780 Mon Sep 17 00:00:00 2001 From: reivilibre <38398653+reivilibre@users.noreply.github.com> Date: Tue, 15 Sep 2020 09:07:19 +0100 Subject: Catch-up after Federation Outage (split, 4): catch-up loop (#8272) --- changelog.d/8272.bugfix | 1 + synapse/federation/sender/per_destination_queue.py | 129 +++++++++++++++- synapse/storage/databases/main/transactions.py | 43 +++++- tests/federation/test_federation_catch_up.py | 165 +++++++++++++++++++++ tests/handlers/test_typing.py | 5 + 5 files changed, 338 insertions(+), 5 deletions(-) create mode 100644 changelog.d/8272.bugfix (limited to 'tests/federation/test_federation_catch_up.py') diff --git a/changelog.d/8272.bugfix b/changelog.d/8272.bugfix new file mode 100644 index 0000000000..532d0e22fe --- /dev/null +++ b/changelog.d/8272.bugfix @@ -0,0 +1 @@ +Fix messages over federation being lost until an event is sent into the same room. diff --git a/synapse/federation/sender/per_destination_queue.py b/synapse/federation/sender/per_destination_queue.py index 9f0852b4a2..2657767fd1 100644 --- a/synapse/federation/sender/per_destination_queue.py +++ b/synapse/federation/sender/per_destination_queue.py @@ -15,7 +15,7 @@ # limitations under the License. import datetime import logging -from typing import TYPE_CHECKING, Dict, Hashable, Iterable, List, Tuple +from typing import TYPE_CHECKING, Dict, Hashable, Iterable, List, Optional, Tuple, cast from prometheus_client import Counter @@ -92,6 +92,21 @@ class PerDestinationQueue: self._destination = destination self.transmission_loop_running = False + # True whilst we are sending events that the remote homeserver missed + # because it was unreachable. We start in this state so we can perform + # catch-up at startup. + # New events will only be sent once this is finished, at which point + # _catching_up is flipped to False. + self._catching_up = True # type: bool + + # The stream_ordering of the most recent PDU that was discarded due to + # being in catch-up mode. + self._catchup_last_skipped = 0 # type: int + + # Cache of the last successfully-transmitted stream ordering for this + # destination (we are the only updater so this is safe) + self._last_successful_stream_ordering = None # type: Optional[int] + # a list of pending PDUs self._pending_pdus = [] # type: List[EventBase] @@ -138,7 +153,13 @@ class PerDestinationQueue: Args: pdu: pdu to send """ - self._pending_pdus.append(pdu) + if not self._catching_up or self._last_successful_stream_ordering is None: + # only enqueue the PDU if we are not catching up (False) or do not + # yet know if we have anything to catch up (None) + self._pending_pdus.append(pdu) + else: + self._catchup_last_skipped = pdu.internal_metadata.stream_ordering + self.attempt_new_transaction() def send_presence(self, states: Iterable[UserPresenceState]) -> None: @@ -218,6 +239,13 @@ class PerDestinationQueue: # hence why we throw the result away. await get_retry_limiter(self._destination, self._clock, self._store) + if self._catching_up: + # we potentially need to catch-up first + await self._catch_up_transmission_loop() + if self._catching_up: + # not caught up yet + return + pending_pdus = [] while True: # We have to keep 2 free slots for presence and rr_edus @@ -351,8 +379,9 @@ class PerDestinationQueue: if e.retry_interval > 60 * 60 * 1000: # we won't retry for another hour! # (this suggests a significant outage) - # We drop pending PDUs and EDUs because otherwise they will + # We drop pending EDUs because otherwise they will # rack up indefinitely. + # (Dropping PDUs is already performed by `_start_catching_up`.) # Note that: # - the EDUs that are being dropped here are those that we can # afford to drop (specifically, only typing notifications, @@ -364,11 +393,12 @@ class PerDestinationQueue: # dropping read receipts is a bit sad but should be solved # through another mechanism, because this is all volatile! - self._pending_pdus = [] self._pending_edus = [] self._pending_edus_keyed = {} self._pending_presence = {} self._pending_rrs = {} + + self._start_catching_up() except FederationDeniedError as e: logger.info(e) except HttpResponseException as e: @@ -378,6 +408,8 @@ class PerDestinationQueue: e.code, e, ) + + self._start_catching_up() except RequestSendFailed as e: logger.warning( "TX [%s] Failed to send transaction: %s", self._destination, e @@ -387,16 +419,96 @@ class PerDestinationQueue: logger.info( "Failed to send event %s to %s", p.event_id, self._destination ) + + self._start_catching_up() except Exception: logger.exception("TX [%s] Failed to send transaction", self._destination) for p in pending_pdus: logger.info( "Failed to send event %s to %s", p.event_id, self._destination ) + + self._start_catching_up() finally: # We want to be *very* sure we clear this after we stop processing self.transmission_loop_running = False + async def _catch_up_transmission_loop(self) -> None: + first_catch_up_check = self._last_successful_stream_ordering is None + + if first_catch_up_check: + # first catchup so get last_successful_stream_ordering from database + self._last_successful_stream_ordering = await self._store.get_destination_last_successful_stream_ordering( + self._destination + ) + + if self._last_successful_stream_ordering is None: + # if it's still None, then this means we don't have the information + # in our database ­ we haven't successfully sent a PDU to this server + # (at least since the introduction of the feature tracking + # last_successful_stream_ordering). + # Sadly, this means we can't do anything here as we don't know what + # needs catching up — so catching up is futile; let's stop. + self._catching_up = False + return + + # get at most 50 catchup room/PDUs + while True: + event_ids = await self._store.get_catch_up_room_event_ids( + self._destination, self._last_successful_stream_ordering, + ) + + if not event_ids: + # No more events to catch up on, but we can't ignore the chance + # of a race condition, so we check that no new events have been + # skipped due to us being in catch-up mode + + if self._catchup_last_skipped > self._last_successful_stream_ordering: + # another event has been skipped because we were in catch-up mode + continue + + # we are done catching up! + self._catching_up = False + break + + if first_catch_up_check: + # as this is our check for needing catch-up, we may have PDUs in + # the queue from before we *knew* we had to do catch-up, so + # clear those out now. + self._start_catching_up() + + # fetch the relevant events from the event store + # - redacted behaviour of REDACT is fine, since we only send metadata + # of redacted events to the destination. + # - don't need to worry about rejected events as we do not actively + # forward received events over federation. + catchup_pdus = await self._store.get_events_as_list(event_ids) + if not catchup_pdus: + raise AssertionError( + "No events retrieved when we asked for %r. " + "This should not happen." % event_ids + ) + + if logger.isEnabledFor(logging.INFO): + rooms = (p.room_id for p in catchup_pdus) + logger.info("Catching up rooms to %s: %r", self._destination, rooms) + + success = await self._transaction_manager.send_new_transaction( + self._destination, catchup_pdus, [] + ) + + if not success: + return + + sent_transactions_counter.inc() + final_pdu = catchup_pdus[-1] + self._last_successful_stream_ordering = cast( + int, final_pdu.internal_metadata.stream_ordering + ) + await self._store.set_destination_last_successful_stream_ordering( + self._destination, self._last_successful_stream_ordering + ) + def _get_rr_edus(self, force_flush: bool) -> Iterable[Edu]: if not self._pending_rrs: return @@ -457,3 +569,12 @@ class PerDestinationQueue: ] return (edus, stream_id) + + def _start_catching_up(self) -> None: + """ + Marks this destination as being in catch-up mode. + + This throws away the PDU queue. + """ + self._catching_up = True + self._pending_pdus = [] diff --git a/synapse/storage/databases/main/transactions.py b/synapse/storage/databases/main/transactions.py index c0a958252e..091367006e 100644 --- a/synapse/storage/databases/main/transactions.py +++ b/synapse/storage/databases/main/transactions.py @@ -15,7 +15,7 @@ import logging from collections import namedtuple -from typing import Iterable, Optional, Tuple +from typing import Iterable, List, Optional, Tuple from canonicaljson import encode_canonical_json @@ -371,3 +371,44 @@ class TransactionStore(SQLBaseStore): values={"last_successful_stream_ordering": last_successful_stream_ordering}, desc="set_last_successful_stream_ordering", ) + + async def get_catch_up_room_event_ids( + self, destination: str, last_successful_stream_ordering: int, + ) -> List[str]: + """ + Returns at most 50 event IDs and their corresponding stream_orderings + that correspond to the oldest events that have not yet been sent to + the destination. + + Args: + destination: the destination in question + last_successful_stream_ordering: the stream_ordering of the + most-recently successfully-transmitted event to the destination + + Returns: + list of event_ids + """ + return await self.db_pool.runInteraction( + "get_catch_up_room_event_ids", + self._get_catch_up_room_event_ids_txn, + destination, + last_successful_stream_ordering, + ) + + @staticmethod + def _get_catch_up_room_event_ids_txn( + txn, destination: str, last_successful_stream_ordering: int, + ) -> List[str]: + q = """ + SELECT event_id FROM destination_rooms + JOIN events USING (stream_ordering) + WHERE destination = ? + AND stream_ordering > ? + ORDER BY stream_ordering + LIMIT 50 + """ + txn.execute( + q, (destination, last_successful_stream_ordering), + ) + event_ids = [row[0] for row in txn] + return event_ids diff --git a/tests/federation/test_federation_catch_up.py b/tests/federation/test_federation_catch_up.py index 6cdcc378f0..cc52c3dfac 100644 --- a/tests/federation/test_federation_catch_up.py +++ b/tests/federation/test_federation_catch_up.py @@ -1,5 +1,10 @@ +from typing import List, Tuple + from mock import Mock +from synapse.events import EventBase +from synapse.federation.sender import PerDestinationQueue, TransactionManager +from synapse.federation.units import Edu from synapse.rest import admin from synapse.rest.client.v1 import login, room @@ -156,3 +161,163 @@ class FederationCatchUpTestCases(FederatingHomeserverTestCase): row_2["stream_ordering"], "Send succeeded but not marked as last_successful_stream_ordering", ) + + @override_config({"send_federation": True}) # critical to federate + def test_catch_up_from_blank_state(self): + """ + Runs an overall test of federation catch-up from scratch. + Further tests will focus on more narrow aspects and edge-cases, but I + hope to provide an overall view with this test. + """ + # bring the other server online + self.is_online = True + + # let's make some events for the other server to receive + self.register_user("u1", "you the one") + u1_token = self.login("u1", "you the one") + room_1 = self.helper.create_room_as("u1", tok=u1_token) + room_2 = self.helper.create_room_as("u1", tok=u1_token) + + # also critical to federate + self.get_success( + event_injection.inject_member_event(self.hs, room_1, "@user:host2", "join") + ) + self.get_success( + event_injection.inject_member_event(self.hs, room_2, "@user:host2", "join") + ) + + self.helper.send_state( + room_1, event_type="m.room.topic", body={"topic": "wombat"}, tok=u1_token + ) + + # check: PDU received for topic event + self.assertEqual(len(self.pdus), 1) + self.assertEqual(self.pdus[0]["type"], "m.room.topic") + + # take the remote offline + self.is_online = False + + # send another event + self.helper.send(room_1, "hi user!", tok=u1_token) + + # check: things didn't go well since the remote is down + self.assertEqual(len(self.failed_pdus), 1) + self.assertEqual(self.failed_pdus[0]["content"]["body"], "hi user!") + + # let's delete the federation transmission queue + # (this pretends we are starting up fresh.) + self.assertFalse( + self.hs.get_federation_sender() + ._per_destination_queues["host2"] + .transmission_loop_running + ) + del self.hs.get_federation_sender()._per_destination_queues["host2"] + + # let's also clear any backoffs + self.get_success( + self.hs.get_datastore().set_destination_retry_timings("host2", None, 0, 0) + ) + + # bring the remote online and clear the received pdu list + self.is_online = True + self.pdus = [] + + # now we need to initiate a federation transaction somehow… + # to do that, let's send another event (because it's simple to do) + # (do it to another room otherwise the catch-up logic decides it doesn't + # need to catch up room_1 — something I overlooked when first writing + # this test) + self.helper.send(room_2, "wombats!", tok=u1_token) + + # we should now have received both PDUs + self.assertEqual(len(self.pdus), 2) + self.assertEqual(self.pdus[0]["content"]["body"], "hi user!") + self.assertEqual(self.pdus[1]["content"]["body"], "wombats!") + + def make_fake_destination_queue( + self, destination: str = "host2" + ) -> Tuple[PerDestinationQueue, List[EventBase]]: + """ + Makes a fake per-destination queue. + """ + transaction_manager = TransactionManager(self.hs) + per_dest_queue = PerDestinationQueue(self.hs, transaction_manager, destination) + results_list = [] + + async def fake_send( + destination_tm: str, + pending_pdus: List[EventBase], + _pending_edus: List[Edu], + ) -> bool: + assert destination == destination_tm + results_list.extend(pending_pdus) + return True # success! + + transaction_manager.send_new_transaction = fake_send + + return per_dest_queue, results_list + + @override_config({"send_federation": True}) + def test_catch_up_loop(self): + """ + Tests the behaviour of _catch_up_transmission_loop. + """ + + # ARRANGE: + # - a local user (u1) + # - 3 rooms which u1 is joined to (and remote user @user:host2 is + # joined to) + # - some events (1 to 5) in those rooms + # we have 'already sent' events 1 and 2 to host2 + per_dest_queue, sent_pdus = self.make_fake_destination_queue() + + self.register_user("u1", "you the one") + u1_token = self.login("u1", "you the one") + room_1 = self.helper.create_room_as("u1", tok=u1_token) + room_2 = self.helper.create_room_as("u1", tok=u1_token) + room_3 = self.helper.create_room_as("u1", tok=u1_token) + self.get_success( + event_injection.inject_member_event(self.hs, room_1, "@user:host2", "join") + ) + self.get_success( + event_injection.inject_member_event(self.hs, room_2, "@user:host2", "join") + ) + self.get_success( + event_injection.inject_member_event(self.hs, room_3, "@user:host2", "join") + ) + + # create some events + self.helper.send(room_1, "you hear me!!", tok=u1_token) + event_id_2 = self.helper.send(room_2, "wombats!", tok=u1_token)["event_id"] + self.helper.send(room_3, "Matrix!", tok=u1_token) + event_id_4 = self.helper.send(room_2, "rabbits!", tok=u1_token)["event_id"] + event_id_5 = self.helper.send(room_3, "Synapse!", tok=u1_token)["event_id"] + + # destination_rooms should already be populated, but let us pretend that we already + # sent (successfully) up to and including event id 2 + event_2 = self.get_success(self.hs.get_datastore().get_event(event_id_2)) + + # also fetch event 5 so we know its last_successful_stream_ordering later + event_5 = self.get_success(self.hs.get_datastore().get_event(event_id_5)) + + self.get_success( + self.hs.get_datastore().set_destination_last_successful_stream_ordering( + "host2", event_2.internal_metadata.stream_ordering + ) + ) + + # ACT + self.get_success(per_dest_queue._catch_up_transmission_loop()) + + # ASSERT, noticing in particular: + # - event 3 not sent out, because event 5 replaces it + # - order is least recent first, so event 5 comes after event 4 + # - catch-up is completed + self.assertEqual(len(sent_pdus), 2) + self.assertEqual(sent_pdus[0].event_id, event_id_4) + self.assertEqual(sent_pdus[1].event_id, event_id_5) + self.assertFalse(per_dest_queue._catching_up) + self.assertEqual( + per_dest_queue._last_successful_stream_ordering, + event_5.internal_metadata.stream_ordering, + ) diff --git a/tests/handlers/test_typing.py b/tests/handlers/test_typing.py index f306a09bfa..3fec09ea8a 100644 --- a/tests/handlers/test_typing.py +++ b/tests/handlers/test_typing.py @@ -73,6 +73,7 @@ class TypingNotificationsTestCase(unittest.HomeserverTestCase): "delivered_txn", "get_received_txn_response", "set_received_txn_response", + "get_destination_last_successful_stream_ordering", "get_destination_retry_timings", "get_devices_by_remote", "maybe_store_room_on_invite", @@ -121,6 +122,10 @@ class TypingNotificationsTestCase(unittest.HomeserverTestCase): (0, []) ) + self.datastore.get_destination_last_successful_stream_ordering.return_value = make_awaitable( + None + ) + def get_received_txn_response(*args): return defer.succeed(None) -- cgit 1.5.1 From 36efbcaf511790d6f1dd7df2260900f07489bda6 Mon Sep 17 00:00:00 2001 From: reivilibre <38398653+reivilibre@users.noreply.github.com> Date: Fri, 18 Sep 2020 14:59:13 +0100 Subject: Catch-up after Federation Outage (bonus): Catch-up on Synapse Startup (#8322) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Olivier Wilkinson (reivilibre) Co-authored-by: Patrick Cloke * Fix _set_destination_retry_timings This came about because the code assumed that retry_interval could not be NULL — which has been challenged by catch-up. --- changelog.d/8230.bugfix | 1 + changelog.d/8230.misc | 1 - changelog.d/8247.bugfix | 1 + changelog.d/8247.misc | 1 - changelog.d/8258.bugfix | 1 + changelog.d/8258.misc | 1 - changelog.d/8322.bugfix | 1 + synapse/federation/sender/__init__.py | 51 +++++++++++++ synapse/storage/databases/main/transactions.py | 66 ++++++++++++++++- tests/federation/test_federation_catch_up.py | 99 ++++++++++++++++++++++++++ 10 files changed, 218 insertions(+), 5 deletions(-) create mode 100644 changelog.d/8230.bugfix delete mode 100644 changelog.d/8230.misc create mode 100644 changelog.d/8247.bugfix delete mode 100644 changelog.d/8247.misc create mode 100644 changelog.d/8258.bugfix delete mode 100644 changelog.d/8258.misc create mode 100644 changelog.d/8322.bugfix (limited to 'tests/federation/test_federation_catch_up.py') diff --git a/changelog.d/8230.bugfix b/changelog.d/8230.bugfix new file mode 100644 index 0000000000..532d0e22fe --- /dev/null +++ b/changelog.d/8230.bugfix @@ -0,0 +1 @@ +Fix messages over federation being lost until an event is sent into the same room. diff --git a/changelog.d/8230.misc b/changelog.d/8230.misc deleted file mode 100644 index bf0ba76730..0000000000 --- a/changelog.d/8230.misc +++ /dev/null @@ -1 +0,0 @@ -Track the latest event for every destination and room for catch-up after federation outage. diff --git a/changelog.d/8247.bugfix b/changelog.d/8247.bugfix new file mode 100644 index 0000000000..532d0e22fe --- /dev/null +++ b/changelog.d/8247.bugfix @@ -0,0 +1 @@ +Fix messages over federation being lost until an event is sent into the same room. diff --git a/changelog.d/8247.misc b/changelog.d/8247.misc deleted file mode 100644 index 3c27803be4..0000000000 --- a/changelog.d/8247.misc +++ /dev/null @@ -1 +0,0 @@ -Track the `stream_ordering` of the last successfully-sent event to every destination, so we can use this information to 'catch up' a remote server after an outage. diff --git a/changelog.d/8258.bugfix b/changelog.d/8258.bugfix new file mode 100644 index 0000000000..532d0e22fe --- /dev/null +++ b/changelog.d/8258.bugfix @@ -0,0 +1 @@ +Fix messages over federation being lost until an event is sent into the same room. diff --git a/changelog.d/8258.misc b/changelog.d/8258.misc deleted file mode 100644 index 3c27803be4..0000000000 --- a/changelog.d/8258.misc +++ /dev/null @@ -1 +0,0 @@ -Track the `stream_ordering` of the last successfully-sent event to every destination, so we can use this information to 'catch up' a remote server after an outage. diff --git a/changelog.d/8322.bugfix b/changelog.d/8322.bugfix new file mode 100644 index 0000000000..532d0e22fe --- /dev/null +++ b/changelog.d/8322.bugfix @@ -0,0 +1 @@ +Fix messages over federation being lost until an event is sent into the same room. diff --git a/synapse/federation/sender/__init__.py b/synapse/federation/sender/__init__.py index 41a726878d..8bb17b3a05 100644 --- a/synapse/federation/sender/__init__.py +++ b/synapse/federation/sender/__init__.py @@ -55,6 +55,15 @@ sent_pdus_destination_dist_total = Counter( "Total number of PDUs queued for sending across all destinations", ) +# Time (in s) after Synapse's startup that we will begin to wake up destinations +# that have catch-up outstanding. +CATCH_UP_STARTUP_DELAY_SEC = 15 + +# Time (in s) to wait in between waking up each destination, i.e. one destination +# will be woken up every seconds after Synapse's startup until we have woken +# every destination has outstanding catch-up. +CATCH_UP_STARTUP_INTERVAL_SEC = 5 + class FederationSender: def __init__(self, hs: "synapse.server.HomeServer"): @@ -125,6 +134,14 @@ class FederationSender: 1000.0 / hs.config.federation_rr_transactions_per_room_per_second ) + # wake up destinations that have outstanding PDUs to be caught up + self._catchup_after_startup_timer = self.clock.call_later( + CATCH_UP_STARTUP_DELAY_SEC, + run_as_background_process, + "wake_destinations_needing_catchup", + self._wake_destinations_needing_catchup, + ) + def _get_per_destination_queue(self, destination: str) -> PerDestinationQueue: """Get or create a PerDestinationQueue for the given destination @@ -560,3 +577,37 @@ class FederationSender: # Dummy implementation for case where federation sender isn't offloaded # to a worker. return [], 0, False + + async def _wake_destinations_needing_catchup(self): + """ + Wakes up destinations that need catch-up and are not currently being + backed off from. + + In order to reduce load spikes, adds a delay between each destination. + """ + + last_processed = None # type: Optional[str] + + while True: + destinations_to_wake = await self.store.get_catch_up_outstanding_destinations( + last_processed + ) + + if not destinations_to_wake: + # finished waking all destinations! + self._catchup_after_startup_timer = None + break + + destinations_to_wake = [ + d + for d in destinations_to_wake + if self._federation_shard_config.should_handle(self._instance_name, d) + ] + + for last_processed in destinations_to_wake: + logger.info( + "Destination %s has outstanding catch-up, waking up.", + last_processed, + ) + self.wake_destination(last_processed) + await self.clock.sleep(CATCH_UP_STARTUP_INTERVAL_SEC) diff --git a/synapse/storage/databases/main/transactions.py b/synapse/storage/databases/main/transactions.py index 99cffff50c..97aed1500e 100644 --- a/synapse/storage/databases/main/transactions.py +++ b/synapse/storage/databases/main/transactions.py @@ -218,6 +218,7 @@ class TransactionStore(SQLBaseStore): retry_interval = EXCLUDED.retry_interval WHERE EXCLUDED.retry_interval = 0 + OR destinations.retry_interval IS NULL OR destinations.retry_interval < EXCLUDED.retry_interval """ @@ -249,7 +250,11 @@ class TransactionStore(SQLBaseStore): "retry_interval": retry_interval, }, ) - elif retry_interval == 0 or prev_row["retry_interval"] < retry_interval: + elif ( + retry_interval == 0 + or prev_row["retry_interval"] is None + or prev_row["retry_interval"] < retry_interval + ): self.db_pool.simple_update_one_txn( txn, "destinations", @@ -397,7 +402,7 @@ class TransactionStore(SQLBaseStore): @staticmethod def _get_catch_up_room_event_ids_txn( - txn, destination: str, last_successful_stream_ordering: int, + txn: LoggingTransaction, destination: str, last_successful_stream_ordering: int, ) -> List[str]: q = """ SELECT event_id FROM destination_rooms @@ -412,3 +417,60 @@ class TransactionStore(SQLBaseStore): ) event_ids = [row[0] for row in txn] return event_ids + + async def get_catch_up_outstanding_destinations( + self, after_destination: Optional[str] + ) -> List[str]: + """ + Gets at most 25 destinations which have outstanding PDUs to be caught up, + and are not being backed off from + Args: + after_destination: + If provided, all destinations must be lexicographically greater + than this one. + + Returns: + list of up to 25 destinations with outstanding catch-up. + These are the lexicographically first destinations which are + lexicographically greater than after_destination (if provided). + """ + time = self.hs.get_clock().time_msec() + + return await self.db_pool.runInteraction( + "get_catch_up_outstanding_destinations", + self._get_catch_up_outstanding_destinations_txn, + time, + after_destination, + ) + + @staticmethod + def _get_catch_up_outstanding_destinations_txn( + txn: LoggingTransaction, now_time_ms: int, after_destination: Optional[str] + ) -> List[str]: + q = """ + SELECT destination FROM destinations + WHERE destination IN ( + SELECT destination FROM destination_rooms + WHERE destination_rooms.stream_ordering > + destinations.last_successful_stream_ordering + ) + AND destination > ? + AND ( + retry_last_ts IS NULL OR + retry_last_ts + retry_interval < ? + ) + ORDER BY destination + LIMIT 25 + """ + txn.execute( + q, + ( + # everything is lexicographically greater than "" so this gives + # us the first batch of up to 25. + after_destination or "", + now_time_ms, + ), + ) + + destinations = [row[0] for row in txn] + return destinations diff --git a/tests/federation/test_federation_catch_up.py b/tests/federation/test_federation_catch_up.py index cc52c3dfac..1a3ccb263d 100644 --- a/tests/federation/test_federation_catch_up.py +++ b/tests/federation/test_federation_catch_up.py @@ -321,3 +321,102 @@ class FederationCatchUpTestCases(FederatingHomeserverTestCase): per_dest_queue._last_successful_stream_ordering, event_5.internal_metadata.stream_ordering, ) + + @override_config({"send_federation": True}) + def test_catch_up_on_synapse_startup(self): + """ + Tests the behaviour of get_catch_up_outstanding_destinations and + _wake_destinations_needing_catchup. + """ + + # list of sorted server names (note that there are more servers than the batch + # size used in get_catch_up_outstanding_destinations). + server_names = ["server%02d" % number for number in range(42)] + ["zzzerver"] + + # ARRANGE: + # - a local user (u1) + # - a room which u1 is joined to (and remote users @user:serverXX are + # joined to) + + # mark the remotes as online + self.is_online = True + + self.register_user("u1", "you the one") + u1_token = self.login("u1", "you the one") + room_id = self.helper.create_room_as("u1", tok=u1_token) + + for server_name in server_names: + self.get_success( + event_injection.inject_member_event( + self.hs, room_id, "@user:%s" % server_name, "join" + ) + ) + + # create an event + self.helper.send(room_id, "deary me!", tok=u1_token) + + # ASSERT: + # - All servers are up to date so none should have outstanding catch-up + outstanding_when_successful = self.get_success( + self.hs.get_datastore().get_catch_up_outstanding_destinations(None) + ) + self.assertEqual(outstanding_when_successful, []) + + # ACT: + # - Make the remote servers unreachable + self.is_online = False + + # - Mark zzzerver as being backed-off from + now = self.clock.time_msec() + self.get_success( + self.hs.get_datastore().set_destination_retry_timings( + "zzzerver", now, now, 24 * 60 * 60 * 1000 # retry in 1 day + ) + ) + + # - Send an event + self.helper.send(room_id, "can anyone hear me?", tok=u1_token) + + # ASSERT (get_catch_up_outstanding_destinations): + # - all remotes are outstanding + # - they are returned in batches of 25, in order + outstanding_1 = self.get_success( + self.hs.get_datastore().get_catch_up_outstanding_destinations(None) + ) + + self.assertEqual(len(outstanding_1), 25) + self.assertEqual(outstanding_1, server_names[0:25]) + + outstanding_2 = self.get_success( + self.hs.get_datastore().get_catch_up_outstanding_destinations( + outstanding_1[-1] + ) + ) + self.assertNotIn("zzzerver", outstanding_2) + self.assertEqual(len(outstanding_2), 17) + self.assertEqual(outstanding_2, server_names[25:-1]) + + # ACT: call _wake_destinations_needing_catchup + + # patch wake_destination to just count the destinations instead + woken = [] + + def wake_destination_track(destination): + woken.append(destination) + + self.hs.get_federation_sender().wake_destination = wake_destination_track + + # cancel the pre-existing timer for _wake_destinations_needing_catchup + # this is because we are calling it manually rather than waiting for it + # to be called automatically + self.hs.get_federation_sender()._catchup_after_startup_timer.cancel() + + self.get_success( + self.hs.get_federation_sender()._wake_destinations_needing_catchup(), by=5.0 + ) + + # ASSERT (_wake_destinations_needing_catchup): + # - all remotes are woken up, save for zzzerver + self.assertNotIn("zzzerver", woken) + # - all destinations are woken exactly once; they appear once in woken. + self.assertCountEqual(woken, server_names[:-1]) -- cgit 1.5.1