From fef2e792beec9c126953b4f6b6d2d9f6e31ed96f Mon Sep 17 00:00:00 2001 From: Eric Eastwood Date: Mon, 7 Feb 2022 15:54:13 -0600 Subject: Fix historical messages backfilling in random order on remote homeservers (MSC2716) (#11114) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fix https://github.com/matrix-org/synapse/issues/11091 Fix https://github.com/matrix-org/synapse/issues/10764 (side-stepping the issue because we no longer have to deal with `fake_prev_event_id`) 1. Made the `/backfill` response return messages in `(depth, stream_ordering)` order (previously only sorted by `depth`) - Technically, it shouldn't really matter how `/backfill` returns things but I'm just trying to make the `stream_ordering` a little more consistent from the origin to the remote homeservers in order to get the order of messages from `/messages` consistent ([sorted by `(topological_ordering, stream_ordering)`](https://github.com/matrix-org/synapse/blob/develop/docs/development/room-dag-concepts.md#depth-and-stream-ordering)). - Even now that we return backfilled messages in order, it still doesn't guarantee the same `stream_ordering` (and more importantly the [`/messages` order](https://github.com/matrix-org/synapse/blob/develop/docs/development/room-dag-concepts.md#depth-and-stream-ordering)) on the other server. For example, if a room has a bunch of history imported and someone visits a permalink to a historical message back in time, their homeserver will skip over the historical messages in between and insert the permalink as the next message in the `stream_order` and totally throw off the sort. - This will be even more the case when we add the [MSC3030 jump to date API endpoint](https://github.com/matrix-org/matrix-doc/pull/3030) so the static archives can navigate and jump to a certain date. - We're solving this in the future by switching to [online topological ordering](https://github.com/matrix-org/gomatrixserverlib/issues/187) and [chunking](https://github.com/matrix-org/synapse/issues/3785) which by its nature will apply retroactively to fix any inconsistencies introduced by people permalinking 2. As we're navigating `prev_events` to return in `/backfill`, we order by `depth` first (newest -> oldest) and now also tie-break based on the `stream_ordering` (newest -> oldest). This is technically important because MSC2716 inserts a bunch of historical messages at the same `depth` so it's best to be prescriptive about which ones we should process first. In reality, I think the code already looped over the historical messages as expected because the database is already in order. 3. Making the historical state chain and historical event chain float on their own by having no `prev_events` instead of a fake `prev_event` which caused backfill to get clogged with an unresolvable event. Fixes https://github.com/matrix-org/synapse/issues/11091 and https://github.com/matrix-org/synapse/issues/10764 4. We no longer find connected insertion events by finding a potential `prev_event` connection to the current event we're iterating over. We now solely rely on marker events which when processed, add the insertion event as an extremity and the federating homeserver can ask about it when time calls. - Related discussion, https://github.com/matrix-org/synapse/pull/11114#discussion_r741514793 Before | After --- | --- ![](https://user-images.githubusercontent.com/558581/139218681-b465c862-5c49-4702-a59e-466733b0cf45.png) | ![](https://user-images.githubusercontent.com/558581/146453159-a1609e0a-8324-439d-ae44-e4bce43ac6d1.png) #### Why aren't we sorting topologically when receiving backfill events? > The main reason we're going to opt to not sort topologically when receiving backfill events is because it's probably best to do whatever is easiest to make it just work. People will probably have opinions once they look at [MSC2716](https://github.com/matrix-org/matrix-doc/pull/2716) which could change whatever implementation anyway. > > As mentioned, ideally we would do this but code necessary to make the fake edges but it gets confusing and gives an impression of “just whyyyy” (feels icky). This problem also dissolves with online topological ordering. > > -- https://github.com/matrix-org/synapse/pull/11114#discussion_r741517138 See https://github.com/matrix-org/synapse/pull/11114#discussion_r739610091 for the technical difficulties --- synapse/handlers/federation.py | 27 ++++++++++++++++++---- synapse/handlers/federation_event.py | 34 ++++++++++++++++++++++++---- synapse/handlers/message.py | 20 ++++++++++++---- synapse/handlers/room_batch.py | 44 ++++++++++++++++++------------------ synapse/handlers/room_member.py | 22 ++++++++++++++++-- 5 files changed, 110 insertions(+), 37 deletions(-) (limited to 'synapse/handlers') diff --git a/synapse/handlers/federation.py b/synapse/handlers/federation.py index a37ae0ca09..c0f642005f 100644 --- a/synapse/handlers/federation.py +++ b/synapse/handlers/federation.py @@ -166,9 +166,14 @@ class FederationHandler: oldest_events_with_depth = ( await self.store.get_oldest_event_ids_with_depth_in_room(room_id) ) - insertion_events_to_be_backfilled = ( - await self.store.get_insertion_event_backwards_extremities_in_room(room_id) - ) + + insertion_events_to_be_backfilled: Dict[str, int] = {} + if self.hs.config.experimental.msc2716_enabled: + insertion_events_to_be_backfilled = ( + await self.store.get_insertion_event_backward_extremities_in_room( + room_id + ) + ) logger.debug( "_maybe_backfill_inner: extremities oldest_events_with_depth=%s insertion_events_to_be_backfilled=%s", oldest_events_with_depth, @@ -271,11 +276,12 @@ class FederationHandler: ] logger.debug( - "room_id: %s, backfill: current_depth: %s, limit: %s, max_depth: %s, extrems: %s filtered_sorted_extremeties_tuple: %s", + "room_id: %s, backfill: current_depth: %s, limit: %s, max_depth: %s, extrems (%d): %s filtered_sorted_extremeties_tuple: %s", room_id, current_depth, limit, max_depth, + len(sorted_extremeties_tuple), sorted_extremeties_tuple, filtered_sorted_extremeties_tuple, ) @@ -1047,6 +1053,19 @@ class FederationHandler: limit = min(limit, 100) events = await self.store.get_backfill_events(room_id, pdu_list, limit) + logger.debug( + "on_backfill_request: backfill events=%s", + [ + "event_id=%s,depth=%d,body=%s,prevs=%s\n" + % ( + event.event_id, + event.depth, + event.content.get("body", event.type), + event.prev_event_ids(), + ) + for event in events + ], + ) events = await filter_events_for_server(self.storage, origin, events) diff --git a/synapse/handlers/federation_event.py b/synapse/handlers/federation_event.py index 3905f60b3a..9edc7369d6 100644 --- a/synapse/handlers/federation_event.py +++ b/synapse/handlers/federation_event.py @@ -508,7 +508,11 @@ class FederationEventHandler: f"room {ev.room_id}, when we were backfilling in {room_id}" ) - await self._process_pulled_events(dest, events, backfilled=True) + await self._process_pulled_events( + dest, + events, + backfilled=True, + ) async def _get_missing_events_for_pdu( self, origin: str, pdu: EventBase, prevs: Set[str], min_depth: int @@ -626,11 +630,24 @@ class FederationEventHandler: backfilled: True if this is part of a historical batch of events (inhibits notification to clients, and validation of device keys.) """ + logger.debug( + "processing pulled backfilled=%s events=%s", + backfilled, + [ + "event_id=%s,depth=%d,body=%s,prevs=%s\n" + % ( + event.event_id, + event.depth, + event.content.get("body", event.type), + event.prev_event_ids(), + ) + for event in events + ], + ) # We want to sort these by depth so we process them and # tell clients about them in order. sorted_events = sorted(events, key=lambda x: x.depth) - for ev in sorted_events: with nested_logging_context(ev.event_id): await self._process_pulled_event(origin, ev, backfilled=backfilled) @@ -992,6 +1009,8 @@ class FederationEventHandler: await self._run_push_actions_and_persist_event(event, context, backfilled) + await self._handle_marker_event(origin, event) + if backfilled or context.rejected: return @@ -1071,8 +1090,6 @@ class FederationEventHandler: event.sender, ) - await self._handle_marker_event(origin, event) - async def _resync_device(self, sender: str) -> None: """We have detected that the device list for the given user may be out of sync, so we try and resync them. @@ -1323,7 +1340,14 @@ class FederationEventHandler: return event, context events_to_persist = (x for x in (prep(event) for event in fetched_events) if x) - await self.persist_events_and_notify(room_id, tuple(events_to_persist)) + await self.persist_events_and_notify( + room_id, + tuple(events_to_persist), + # Mark these events backfilled as they're historic events that will + # eventually be backfilled. For example, missing events we fetch + # during backfill should be marked as backfilled as well. + backfilled=True, + ) async def _check_event_auth( self, diff --git a/synapse/handlers/message.py b/synapse/handlers/message.py index b37250aa38..9267e586a8 100644 --- a/synapse/handlers/message.py +++ b/synapse/handlers/message.py @@ -490,12 +490,12 @@ class EventCreationHandler: requester: Requester, event_dict: dict, txn_id: Optional[str] = None, + allow_no_prev_events: bool = False, prev_event_ids: Optional[List[str]] = None, auth_event_ids: Optional[List[str]] = None, require_consent: bool = True, outlier: bool = False, historical: bool = False, - allow_no_prev_events: bool = False, depth: Optional[int] = None, ) -> Tuple[EventBase, EventContext]: """ @@ -510,6 +510,10 @@ class EventCreationHandler: requester event_dict: An entire event txn_id + allow_no_prev_events: Whether to allow this event to be created an empty + list of prev_events. Normally this is prohibited just because most + events should have a prev_event and we should only use this in special + cases like MSC2716. prev_event_ids: the forward extremities to use as the prev_events for the new event. @@ -604,10 +608,10 @@ class EventCreationHandler: event, context = await self.create_new_client_event( builder=builder, requester=requester, + allow_no_prev_events=allow_no_prev_events, prev_event_ids=prev_event_ids, auth_event_ids=auth_event_ids, depth=depth, - allow_no_prev_events=allow_no_prev_events, ) # In an ideal world we wouldn't need the second part of this condition. However, @@ -764,6 +768,7 @@ class EventCreationHandler: self, requester: Requester, event_dict: dict, + allow_no_prev_events: bool = False, prev_event_ids: Optional[List[str]] = None, auth_event_ids: Optional[List[str]] = None, ratelimit: bool = True, @@ -781,6 +786,10 @@ class EventCreationHandler: Args: requester: The requester sending the event. event_dict: An entire event. + allow_no_prev_events: Whether to allow this event to be created an empty + list of prev_events. Normally this is prohibited just because most + events should have a prev_event and we should only use this in special + cases like MSC2716. prev_event_ids: The event IDs to use as the prev events. Should normally be left as None to automatically request them @@ -880,16 +889,20 @@ class EventCreationHandler: self, builder: EventBuilder, requester: Optional[Requester] = None, + allow_no_prev_events: bool = False, prev_event_ids: Optional[List[str]] = None, auth_event_ids: Optional[List[str]] = None, depth: Optional[int] = None, - allow_no_prev_events: bool = False, ) -> Tuple[EventBase, EventContext]: """Create a new event for a local client Args: builder: requester: + allow_no_prev_events: Whether to allow this event to be created an empty + list of prev_events. Normally this is prohibited just because most + events should have a prev_event and we should only use this in special + cases like MSC2716. prev_event_ids: the forward extremities to use as the prev_events for the new event. @@ -908,7 +921,6 @@ class EventCreationHandler: Returns: Tuple of created event, context """ - # Strip down the auth_event_ids to only what we need to auth the event. # For example, we don't need extra m.room.member that don't match event.sender full_state_ids_at_event = None diff --git a/synapse/handlers/room_batch.py b/synapse/handlers/room_batch.py index f880aa93d2..f8137ec04c 100644 --- a/synapse/handlers/room_batch.py +++ b/synapse/handlers/room_batch.py @@ -13,10 +13,6 @@ if TYPE_CHECKING: logger = logging.getLogger(__name__) -def generate_fake_event_id() -> str: - return "$fake_" + random_string(43) - - class RoomBatchHandler: def __init__(self, hs: "HomeServer"): self.hs = hs @@ -182,11 +178,12 @@ class RoomBatchHandler: state_event_ids_at_start = [] auth_event_ids = initial_auth_event_ids.copy() - # Make the state events float off on their own so we don't have a - # bunch of `@mxid joined the room` noise between each batch - prev_event_id_for_state_chain = generate_fake_event_id() + # Make the state events float off on their own by specifying no + # prev_events for the first one in the chain so we don't have a bunch of + # `@mxid joined the room` noise between each batch. + prev_event_ids_for_state_chain: List[str] = [] - for state_event in state_events_at_start: + for index, state_event in enumerate(state_events_at_start): assert_params_in_dict( state_event, ["type", "origin_server_ts", "content", "sender"] ) @@ -222,7 +219,10 @@ class RoomBatchHandler: content=event_dict["content"], outlier=True, historical=True, - prev_event_ids=[prev_event_id_for_state_chain], + # Only the first event in the chain should be floating. + # The rest should hang off each other in a chain. + allow_no_prev_events=index == 0, + prev_event_ids=prev_event_ids_for_state_chain, # Make sure to use a copy of this list because we modify it # later in the loop here. Otherwise it will be the same # reference and also update in the event when we append later. @@ -242,7 +242,10 @@ class RoomBatchHandler: event_dict, outlier=True, historical=True, - prev_event_ids=[prev_event_id_for_state_chain], + # Only the first event in the chain should be floating. + # The rest should hang off each other in a chain. + allow_no_prev_events=index == 0, + prev_event_ids=prev_event_ids_for_state_chain, # Make sure to use a copy of this list because we modify it # later in the loop here. Otherwise it will be the same # reference and also update in the event when we append later. @@ -253,7 +256,7 @@ class RoomBatchHandler: state_event_ids_at_start.append(event_id) auth_event_ids.append(event_id) # Connect all the state in a floating chain - prev_event_id_for_state_chain = event_id + prev_event_ids_for_state_chain = [event_id] return state_event_ids_at_start @@ -261,7 +264,6 @@ class RoomBatchHandler: self, events_to_create: List[JsonDict], room_id: str, - initial_prev_event_ids: List[str], inherited_depth: int, auth_event_ids: List[str], app_service_requester: Requester, @@ -277,9 +279,6 @@ class RoomBatchHandler: events_to_create: List of historical events to create in JSON dictionary format. room_id: Room where you want the events persisted in. - initial_prev_event_ids: These will be the prev_events for the first - event created. Each event created afterwards will point to the - previous event created. inherited_depth: The depth to create the events at (you will probably by calling inherit_depth_from_prev_ids(...)). auth_event_ids: Define which events allow you to create the given @@ -291,11 +290,14 @@ class RoomBatchHandler: """ assert app_service_requester.app_service - prev_event_ids = initial_prev_event_ids.copy() + # Make the historical event chain float off on its own by specifying no + # prev_events for the first event in the chain which causes the HS to + # ask for the state at the start of the batch later. + prev_event_ids: List[str] = [] event_ids = [] events_to_persist = [] - for ev in events_to_create: + for index, ev in enumerate(events_to_create): assert_params_in_dict(ev, ["type", "origin_server_ts", "content", "sender"]) assert self.hs.is_mine_id(ev["sender"]), "User must be our own: %s" % ( @@ -319,6 +321,9 @@ class RoomBatchHandler: ev["sender"], app_service_requester.app_service ), event_dict, + # Only the first event in the chain should be floating. + # The rest should hang off each other in a chain. + allow_no_prev_events=index == 0, prev_event_ids=event_dict.get("prev_events"), auth_event_ids=auth_event_ids, historical=True, @@ -370,7 +375,6 @@ class RoomBatchHandler: events_to_create: List[JsonDict], room_id: str, batch_id_to_connect_to: str, - initial_prev_event_ids: List[str], inherited_depth: int, auth_event_ids: List[str], app_service_requester: Requester, @@ -385,9 +389,6 @@ class RoomBatchHandler: room_id: Room where you want the events created in. batch_id_to_connect_to: The batch_id from the insertion event you want this batch to connect to. - initial_prev_event_ids: These will be the prev_events for the first - event created. Each event created afterwards will point to the - previous event created. inherited_depth: The depth to create the events at (you will probably by calling inherit_depth_from_prev_ids(...)). auth_event_ids: Define which events allow you to create the given @@ -436,7 +437,6 @@ class RoomBatchHandler: event_ids = await self.persist_historical_events( events_to_create=events_to_create, room_id=room_id, - initial_prev_event_ids=initial_prev_event_ids, inherited_depth=inherited_depth, auth_event_ids=auth_event_ids, app_service_requester=app_service_requester, diff --git a/synapse/handlers/room_member.py b/synapse/handlers/room_member.py index efe6b4c9aa..bf1a47efb0 100644 --- a/synapse/handlers/room_member.py +++ b/synapse/handlers/room_member.py @@ -268,7 +268,8 @@ class RoomMemberHandler(metaclass=abc.ABCMeta): target: UserID, room_id: str, membership: str, - prev_event_ids: List[str], + allow_no_prev_events: bool = False, + prev_event_ids: Optional[List[str]] = None, auth_event_ids: Optional[List[str]] = None, txn_id: Optional[str] = None, ratelimit: bool = True, @@ -286,8 +287,12 @@ class RoomMemberHandler(metaclass=abc.ABCMeta): target: room_id: membership: - prev_event_ids: The event IDs to use as the prev events + allow_no_prev_events: Whether to allow this event to be created an empty + list of prev_events. Normally this is prohibited just because most + events should have a prev_event and we should only use this in special + cases like MSC2716. + prev_event_ids: The event IDs to use as the prev events auth_event_ids: The event ids to use as the auth_events for the new event. Should normally be left as None, which will cause them to be calculated @@ -344,6 +349,7 @@ class RoomMemberHandler(metaclass=abc.ABCMeta): "membership": membership, }, txn_id=txn_id, + allow_no_prev_events=allow_no_prev_events, prev_event_ids=prev_event_ids, auth_event_ids=auth_event_ids, require_consent=require_consent, @@ -446,6 +452,7 @@ class RoomMemberHandler(metaclass=abc.ABCMeta): require_consent: bool = True, outlier: bool = False, historical: bool = False, + allow_no_prev_events: bool = False, prev_event_ids: Optional[List[str]] = None, auth_event_ids: Optional[List[str]] = None, ) -> Tuple[str, int]: @@ -470,6 +477,10 @@ class RoomMemberHandler(metaclass=abc.ABCMeta): historical: Indicates whether the message is being inserted back in time around some existing events. This is used to skip a few checks and mark the event as backfilled. + allow_no_prev_events: Whether to allow this event to be created an empty + list of prev_events. Normally this is prohibited just because most + events should have a prev_event and we should only use this in special + cases like MSC2716. prev_event_ids: The event IDs to use as the prev events auth_event_ids: The event ids to use as the auth_events for the new event. @@ -504,6 +515,7 @@ class RoomMemberHandler(metaclass=abc.ABCMeta): require_consent=require_consent, outlier=outlier, historical=historical, + allow_no_prev_events=allow_no_prev_events, prev_event_ids=prev_event_ids, auth_event_ids=auth_event_ids, ) @@ -525,6 +537,7 @@ class RoomMemberHandler(metaclass=abc.ABCMeta): require_consent: bool = True, outlier: bool = False, historical: bool = False, + allow_no_prev_events: bool = False, prev_event_ids: Optional[List[str]] = None, auth_event_ids: Optional[List[str]] = None, ) -> Tuple[str, int]: @@ -551,6 +564,10 @@ class RoomMemberHandler(metaclass=abc.ABCMeta): historical: Indicates whether the message is being inserted back in time around some existing events. This is used to skip a few checks and mark the event as backfilled. + allow_no_prev_events: Whether to allow this event to be created an empty + list of prev_events. Normally this is prohibited just because most + events should have a prev_event and we should only use this in special + cases like MSC2716. prev_event_ids: The event IDs to use as the prev events auth_event_ids: The event ids to use as the auth_events for the new event. @@ -680,6 +697,7 @@ class RoomMemberHandler(metaclass=abc.ABCMeta): membership=effective_membership_state, txn_id=txn_id, ratelimit=ratelimit, + allow_no_prev_events=allow_no_prev_events, prev_event_ids=prev_event_ids, auth_event_ids=auth_event_ids, content=content, -- cgit 1.4.1