summary refs log tree commit diff
diff options
context:
space:
mode:
authorSean Quah <8349537+squahtx@users.noreply.github.com>2023-04-04 13:10:25 +0100
committerGitHub <noreply@github.com>2023-04-04 13:10:25 +0100
commit89a71e73905ffa1c97ae8be27d521cd2ef3f3a0c (patch)
tree540d0433b738deb35820da6eb412eed359225512
parentNote that Synapse 1.74 queued a user dir rebuild (#15386) (diff)
downloadsynapse-89a71e73905ffa1c97ae8be27d521cd2ef3f3a0c.tar.xz
Fix a rare bug where initial /syncs would fail (#15383)
This change fixes a rare bug where initial /syncs would fail with a
`KeyError` under the following circumstances:
 1. A user fast joins a remote room.
 2. The user is kicked from the room before the room's full state has
    been synced.
 3. A second local user fast joins the room.
 4. Events are backfilled into the room with a higher topological
    ordering than the original user's leave. They are assigned a
    negative stream ordering. It's not clear how backfill happened here,
    since it is expected to be equivalent to syncing the full state.
 5. The second local user leaves the room before the room's full state
    has been synced. The homeserver does not complete the sync.
 6. The original user performs an initial /sync with lazy_load_members
    enabled.
     * Because they were kicked from the room, the room is included in
       the /sync response even though the include_leave option is not
       specified.
     * To populate the room's timeline, `_load_filtered_recents` /
       `get_recent_events_for_room` fetches events with a lower stream
       ordering than the leave event and picks the ones with the highest
       topological orderings (which are most recent). This captures the
       backfilled events after the leave, since they have a negative
       stream ordering. These events are filtered out of the timeline,
       since the user was not in the room at the time and cannot view
       them. The sync code ends up with an empty timeline for the room
       that notably does not include the user's leave event.
       This seems buggy, but at least we don't disclose events the user
       isn't allowed to see.
     * Normally, `compute_state_delta` would fetch the state at the
       start and end of the room's timeline to generate the sync
       response. Since the timeline is empty, it fetches the state at
       `min(now, last event in the room)`, which corresponds with the
       second user's leave. The state during the entirety of the second
       user's membership does not include the membership for the first
       user because of partial state.
       This part is also questionable, since we are fetching state from
       outside the bounds of the user's membership.
     * `compute_state_delta` then tries and fails to find the user's
       membership in the auth events of timeline events. Because there
       is no timeline event whose auth events are expected to contain
       the user's membership, a `KeyError` is raised.

Also contains a drive-by fix for a separate unlikely race condition.

Signed-off-by: Sean Quah <seanq@matrix.org>
Diffstat (limited to '')
-rw-r--r--changelog.d/15383.bugfix1
-rw-r--r--synapse/handlers/sync.py24
2 files changed, 20 insertions, 5 deletions
diff --git a/changelog.d/15383.bugfix b/changelog.d/15383.bugfix
new file mode 100644
index 0000000000..28c66ef454
--- /dev/null
+++ b/changelog.d/15383.bugfix
@@ -0,0 +1 @@
+Fix a rare bug introduced in Synapse 1.66.0 where initial syncs would fail when the user had been kicked from a faster joined room that had not finished syncing.
diff --git a/synapse/handlers/sync.py b/synapse/handlers/sync.py
index 9f5b83ed54..64d298408d 100644
--- a/synapse/handlers/sync.py
+++ b/synapse/handlers/sync.py
@@ -943,6 +943,8 @@ class SyncHandler:
 
                 timeline_state = {}
 
+                # Membership events to fetch that can be found in the room state, or in
+                # the case of partial state rooms, the auth events of timeline events.
                 members_to_fetch = set()
                 first_event_by_sender_map = {}
                 for event in batch.events:
@@ -964,9 +966,19 @@ class SyncHandler:
                     # (if we are) to fix https://github.com/vector-im/riot-web/issues/7209
                     # We only need apply this on full state syncs given we disabled
                     # LL for incr syncs in #3840.
-                    members_to_fetch.add(sync_config.user.to_string())
-
-                state_filter = StateFilter.from_lazy_load_member_list(members_to_fetch)
+                    # We don't insert ourselves into `members_to_fetch`, because in some
+                    # rare cases (an empty event batch with a now_token after the user's
+                    # leave in a partial state room which another local user has
+                    # joined), the room state will be missing our membership and there
+                    # is no guarantee that our membership will be in the auth events of
+                    # timeline events when the room is partial stated.
+                    state_filter = StateFilter.from_lazy_load_member_list(
+                        members_to_fetch.union((sync_config.user.to_string(),))
+                    )
+                else:
+                    state_filter = StateFilter.from_lazy_load_member_list(
+                        members_to_fetch
+                    )
 
                 # We are happy to use partial state to compute the `/sync` response.
                 # Since partial state may not include the lazy-loaded memberships we
@@ -988,7 +1000,9 @@ class SyncHandler:
             # sync's timeline and the start of the current sync's timeline.
             # See the docstring above for details.
             state_ids: StateMap[str]
-
+            # We need to know whether the state we fetch may be partial, so check
+            # whether the room is partial stated *before* fetching it.
+            is_partial_state_room = await self.store.is_partial_state_room(room_id)
             if full_state:
                 if batch:
                     state_at_timeline_end = (
@@ -1119,7 +1133,7 @@ class SyncHandler:
             # If we only have partial state for the room, `state_ids` may be missing the
             # memberships we wanted. We attempt to find some by digging through the auth
             # events of timeline events.
-            if lazy_load_members and await self.store.is_partial_state_room(room_id):
+            if lazy_load_members and is_partial_state_room:
                 assert members_to_fetch is not None
                 assert first_event_by_sender_map is not None