summary refs log tree commit diff
diff options
context:
space:
mode:
-rw-r--r--changelog.d/12040.feature1
-rw-r--r--synapse/handlers/federation_event.py43
-rw-r--r--synapse/storage/databases/main/events_worker.py8
-rw-r--r--tests/handlers/test_federation_event.py225
4 files changed, 268 insertions, 9 deletions
diff --git a/changelog.d/12040.feature b/changelog.d/12040.feature
new file mode 100644
index 0000000000..0a88c6e80c
--- /dev/null
+++ b/changelog.d/12040.feature
@@ -0,0 +1 @@
+Optimise fetching large quantities of missing room state over federation.
diff --git a/synapse/handlers/federation_event.py b/synapse/handlers/federation_event.py
index 567afc910f..e7b9f15e13 100644
--- a/synapse/handlers/federation_event.py
+++ b/synapse/handlers/federation_event.py
@@ -897,10 +897,24 @@ class FederationEventHandler:
         logger.debug("We are also missing %i auth events", len(missing_auth_events))
 
         missing_events = missing_desired_events | missing_auth_events
-        logger.debug("Fetching %i events from remote", len(missing_events))
-        await self._get_events_and_persist(
-            destination=destination, room_id=room_id, event_ids=missing_events
-        )
+
+        # Making an individual request for each of 1000s of events has a lot of
+        # overhead. On the other hand, we don't really want to fetch all of the events
+        # if we already have most of them.
+        #
+        # As an arbitrary heuristic, if we are missing more than 10% of the events, then
+        # we fetch the whole state.
+        #
+        # TODO: might it be better to have an API which lets us do an aggregate event
+        #   request
+        if (len(missing_events) * 10) >= len(auth_event_ids) + len(state_event_ids):
+            logger.debug("Requesting complete state from remote")
+            await self._get_state_and_persist(destination, room_id, event_id)
+        else:
+            logger.debug("Fetching %i events from remote", len(missing_events))
+            await self._get_events_and_persist(
+                destination=destination, room_id=room_id, event_ids=missing_events
+            )
 
         # we need to make sure we re-load from the database to get the rejected
         # state correct.
@@ -959,6 +973,27 @@ class FederationEventHandler:
 
         return remote_state
 
+    async def _get_state_and_persist(
+        self, destination: str, room_id: str, event_id: str
+    ) -> None:
+        """Get the complete room state at a given event, and persist any new events
+        as outliers"""
+        room_version = await self._store.get_room_version(room_id)
+        auth_events, state_events = await self._federation_client.get_room_state(
+            destination, room_id, event_id=event_id, room_version=room_version
+        )
+        logger.info("/state returned %i events", len(auth_events) + len(state_events))
+
+        await self._auth_and_persist_outliers(
+            room_id, itertools.chain(auth_events, state_events)
+        )
+
+        # we also need the event itself.
+        if not await self._store.have_seen_event(room_id, event_id):
+            await self._get_events_and_persist(
+                destination=destination, room_id=room_id, event_ids=(event_id,)
+            )
+
     async def _process_received_pdu(
         self,
         origin: str,
diff --git a/synapse/storage/databases/main/events_worker.py b/synapse/storage/databases/main/events_worker.py
index 59454a47df..a60e3f4fdd 100644
--- a/synapse/storage/databases/main/events_worker.py
+++ b/synapse/storage/databases/main/events_worker.py
@@ -22,7 +22,6 @@ from typing import (
     Dict,
     Iterable,
     List,
-    NoReturn,
     Optional,
     Set,
     Tuple,
@@ -1330,10 +1329,9 @@ class EventsWorkerStore(SQLBaseStore):
         return results
 
     @cached(max_entries=100000, tree=True)
-    async def have_seen_event(self, room_id: str, event_id: str) -> NoReturn:
-        # this only exists for the benefit of the @cachedList descriptor on
-        # _have_seen_events_dict
-        raise NotImplementedError()
+    async def have_seen_event(self, room_id: str, event_id: str) -> bool:
+        res = await self._have_seen_events_dict(((room_id, event_id),))
+        return res[(room_id, event_id)]
 
     def _get_current_state_event_counts_txn(
         self, txn: LoggingTransaction, room_id: str
diff --git a/tests/handlers/test_federation_event.py b/tests/handlers/test_federation_event.py
new file mode 100644
index 0000000000..489ba57736
--- /dev/null
+++ b/tests/handlers/test_federation_event.py
@@ -0,0 +1,225 @@
+# Copyright 2022 The Matrix.org Foundation C.I.C.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from unittest import mock
+
+from synapse.events import make_event_from_dict
+from synapse.events.snapshot import EventContext
+from synapse.federation.transport.client import StateRequestResponse
+from synapse.logging.context import LoggingContext
+from synapse.rest import admin
+from synapse.rest.client import login, room
+
+from tests import unittest
+from tests.test_utils import event_injection, make_awaitable
+
+
+class FederationEventHandlerTests(unittest.FederatingHomeserverTestCase):
+    servlets = [
+        admin.register_servlets,
+        login.register_servlets,
+        room.register_servlets,
+    ]
+
+    def make_homeserver(self, reactor, clock):
+        # mock out the federation transport client
+        self.mock_federation_transport_client = mock.Mock(
+            spec=["get_room_state_ids", "get_room_state", "get_event"]
+        )
+        return super().setup_test_homeserver(
+            federation_transport_client=self.mock_federation_transport_client
+        )
+
+    def test_process_pulled_event_with_missing_state(self) -> None:
+        """Ensure that we correctly handle pulled events with lots of missing state
+
+        In this test, we pretend we are processing a "pulled" event (eg, via backfill
+        or get_missing_events). The pulled event has a prev_event we haven't previously
+        seen, so the server requests the state at that prev_event. There is a lot
+        of state we don't have, so we expect the server to make a /state request.
+
+        We check that the pulled event is correctly persisted, and that the state is
+        as we expect.
+        """
+        return self._test_process_pulled_event_with_missing_state(False)
+
+    def test_process_pulled_event_with_missing_state_where_prev_is_outlier(
+        self,
+    ) -> None:
+        """Ensure that we correctly handle pulled events with lots of missing state
+
+        A slight modification to test_process_pulled_event_with_missing_state. Again
+        we have a "pulled" event which refers to a prev_event with lots of state,
+        but in this case we already have the prev_event (as an outlier, obviously -
+        if it were a regular event, we wouldn't need to request the state).
+        """
+        return self._test_process_pulled_event_with_missing_state(True)
+
+    def _test_process_pulled_event_with_missing_state(
+        self, prev_exists_as_outlier: bool
+    ) -> None:
+        OTHER_USER = f"@user:{self.OTHER_SERVER_NAME}"
+        main_store = self.hs.get_datastores().main
+        state_storage = self.hs.get_storage().state
+
+        # create the room
+        user_id = self.register_user("kermit", "test")
+        tok = self.login("kermit", "test")
+        room_id = self.helper.create_room_as(room_creator=user_id, tok=tok)
+        room_version = self.get_success(main_store.get_room_version(room_id))
+
+        # allow the remote user to send state events
+        self.helper.send_state(
+            room_id,
+            "m.room.power_levels",
+            {"events_default": 0, "state_default": 0},
+            tok=tok,
+        )
+
+        # add the remote user to the room
+        member_event = self.get_success(
+            event_injection.inject_member_event(self.hs, room_id, OTHER_USER, "join")
+        )
+
+        initial_state_map = self.get_success(main_store.get_current_state_ids(room_id))
+
+        auth_event_ids = [
+            initial_state_map[("m.room.create", "")],
+            initial_state_map[("m.room.power_levels", "")],
+            initial_state_map[("m.room.join_rules", "")],
+            member_event.event_id,
+        ]
+
+        # mock up a load of state events which we are missing
+        state_events = [
+            make_event_from_dict(
+                self.add_hashes_and_signatures(
+                    {
+                        "type": "test_state_type",
+                        "state_key": f"state_{i}",
+                        "room_id": room_id,
+                        "sender": OTHER_USER,
+                        "prev_events": [member_event.event_id],
+                        "auth_events": auth_event_ids,
+                        "origin_server_ts": 1,
+                        "depth": 10,
+                        "content": {"body": f"state_{i}"},
+                    }
+                ),
+                room_version,
+            )
+            for i in range(1, 10)
+        ]
+
+        # this is the state that we are going to claim is active at the prev_event.
+        state_at_prev_event = state_events + self.get_success(
+            main_store.get_events_as_list(initial_state_map.values())
+        )
+
+        # mock up a prev event.
+        # Depending on the test, we either persist this upfront (as an outlier),
+        # or let the server request it.
+        prev_event = make_event_from_dict(
+            self.add_hashes_and_signatures(
+                {
+                    "type": "test_regular_type",
+                    "room_id": room_id,
+                    "sender": OTHER_USER,
+                    "prev_events": [],
+                    "auth_events": auth_event_ids,
+                    "origin_server_ts": 1,
+                    "depth": 11,
+                    "content": {"body": "missing_prev"},
+                }
+            ),
+            room_version,
+        )
+        if prev_exists_as_outlier:
+            prev_event.internal_metadata.outlier = True
+            persistence = self.hs.get_storage().persistence
+            self.get_success(
+                persistence.persist_event(prev_event, EventContext.for_outlier())
+            )
+        else:
+
+            async def get_event(destination: str, event_id: str, timeout=None):
+                self.assertEqual(destination, self.OTHER_SERVER_NAME)
+                self.assertEqual(event_id, prev_event.event_id)
+                return {"pdus": [prev_event.get_pdu_json()]}
+
+            self.mock_federation_transport_client.get_event.side_effect = get_event
+
+        # mock up a regular event to pass into _process_pulled_event
+        pulled_event = make_event_from_dict(
+            self.add_hashes_and_signatures(
+                {
+                    "type": "test_regular_type",
+                    "room_id": room_id,
+                    "sender": OTHER_USER,
+                    "prev_events": [prev_event.event_id],
+                    "auth_events": auth_event_ids,
+                    "origin_server_ts": 1,
+                    "depth": 12,
+                    "content": {"body": "pulled"},
+                }
+            ),
+            room_version,
+        )
+
+        # we expect an outbound request to /state_ids, so stub that out
+        self.mock_federation_transport_client.get_room_state_ids.return_value = (
+            make_awaitable(
+                {
+                    "pdu_ids": [e.event_id for e in state_at_prev_event],
+                    "auth_chain_ids": [],
+                }
+            )
+        )
+
+        # we also expect an outbound request to /state
+        self.mock_federation_transport_client.get_room_state.return_value = (
+            make_awaitable(
+                StateRequestResponse(auth_events=[], state=state_at_prev_event)
+            )
+        )
+
+        # we have to bump the clock a bit, to keep the retry logic in
+        # FederationClient.get_pdu happy
+        self.reactor.advance(60000)
+
+        # Finally, the call under test: send the pulled event into _process_pulled_event
+        with LoggingContext("test"):
+            self.get_success(
+                self.hs.get_federation_event_handler()._process_pulled_event(
+                    self.OTHER_SERVER_NAME, pulled_event, backfilled=False
+                )
+            )
+
+        # check that the event is correctly persisted
+        persisted = self.get_success(main_store.get_event(pulled_event.event_id))
+        self.assertIsNotNone(persisted, "pulled event was not persisted at all")
+        self.assertFalse(
+            persisted.internal_metadata.is_outlier(), "pulled event was an outlier"
+        )
+
+        # check that the state at that event is as expected
+        state = self.get_success(
+            state_storage.get_state_ids_for_event(pulled_event.event_id)
+        )
+        expected_state = {
+            (e.type, e.state_key): e.event_id for e in state_at_prev_event
+        }
+        self.assertEqual(state, expected_state)
+
+        if prev_exists_as_outlier:
+            self.mock_federation_transport_client.get_event.assert_not_called()