summary refs log tree commit diff
diff options
context:
space:
mode:
authorRichard van der Hoff <richard@matrix.org>2024-02-22 15:54:32 +0000
committerRichard van der Hoff <richard@matrix.org>2024-04-11 13:09:45 +0100
commitbc435f7d9d830c7b8f241b749f4b9a95f9ffbeba (patch)
treedbaf3e1d98607ebf087a2f47c59cca15e83be29d
parentStabilize support for MSC4010: push rules & account data. (#17022) (diff)
downloadsynapse-github/rav/log_state_desync.tar.xz
Add some debug to keep track of client state desynchronisation github/rav/log_state_desync rav/log_state_desync
Fixes https://github.com/element-hq/crypto-internal/issues/179
-rw-r--r--synapse/handlers/sync.py138
1 files changed, 138 insertions, 0 deletions
diff --git a/synapse/handlers/sync.py b/synapse/handlers/sync.py
index a6d54ee4b8..dafcfa89fb 100644
--- a/synapse/handlers/sync.py
+++ b/synapse/handlers/sync.py
@@ -88,6 +88,10 @@ if TYPE_CHECKING:
 
 logger = logging.getLogger(__name__)
 
+# Logging for https://github.com/matrix-org/matrix-spec/issues/1209 and
+# https://github.com/element-hq/synapse/issues/16940
+client_state_desync_logger = logging.getLogger("synapse.client_state_desync_debug")
+
 # Counts the number of times we returned a non-empty sync. `type` is one of
 # "initial_sync", "full_state_sync" or "incremental_sync", `lazy_loaded` is
 # "true" or "false" depending on if the request asked for lazy loaded members or
@@ -1214,6 +1218,12 @@ class SyncHandler:
             previous_timeline_end={},
             lazy_load_members=lazy_load_members,
         )
+
+        if client_state_desync_logger.isEnabledFor(logging.DEBUG):
+            await self._log_client_state_desync(
+                room_id, None, state_ids, timeline_state, lazy_load_members
+            )
+
         return state_ids
 
     async def _compute_state_delta_for_incremental_sync(
@@ -1359,6 +1369,15 @@ class SyncHandler:
             lazy_load_members=lazy_load_members,
         )
 
+        if client_state_desync_logger.isEnabledFor(logging.DEBUG):
+            await self._log_client_state_desync(
+                room_id,
+                since_token,
+                state_ids,
+                timeline_state,
+                lazy_load_members,
+            )
+
         return state_ids
 
     async def _find_missing_partial_state_memberships(
@@ -1475,6 +1494,125 @@ class SyncHandler:
 
         return additional_state_ids
 
+    async def _log_client_state_desync(
+        self,
+        room_id: str,
+        since_token: Optional[StreamToken],
+        sync_response_state_state: StateMap[str],
+        sync_response_timeline_state: StateMap[str],
+        lazy_load_members: bool,
+    ) -> None:
+        """
+        Logging to see how often the client's state gets out of sync with the
+        actual current state of the room.
+
+        There are few different potential failure modes here:
+
+         * State resolution can cause changes in the state of the room that don't
+           directly correspond to events with the corresponding (type, state_key).
+           https://github.com/matrix-org/matrix-spec/issues/1209 discusses this in
+           more detail.
+
+         * Even where there is an event that causes a given state change, Synapse
+           may not serve it to the client, since it works on state at specific points
+           in the DAG, rather than "current state".
+           See https://github.com/element-hq/synapse/issues/16940.
+
+         * Lazy-loading adds more complexity, as it means that events that would
+           normally be served via the `state` part of an incremental sync are filtered
+           out.
+
+        To try to get a handle on this, let's put ourselves in the shoes of a client,
+        and compare the state they will calculate against the actual current state.
+        """
+        # We only care about membership events.
+        state_filter = StateFilter.from_types(types=(("m.room.member", None),))
+
+        if since_token is None:
+            if lazy_load_members:
+                # For initial syncs with lazy-loading enabled, there's not too much
+                # concern here. We know the client will do a `/members` query before
+                # doing any encryption, so what sync returns isn't too important.
+                #
+                # (Of course, then `/members` might also return an incomplete list, but
+                # that's a separate problem.)
+                return
+
+            # For regular initial syncs, compare the returned response with the actual
+            # current state.
+            client_calculated_state = {}
+            client_calculated_state.update(sync_response_state_state)
+            client_calculated_state.update(sync_response_timeline_state)
+        else:
+            # For an incremental (gappy or otherwise) sync, let's assume the client has
+            # a complete membership list as of the last sync (or rather, at
+            # `since_token`, which is the closest approximation we have to it
+            # right now), and see what they would calculate as the current state given
+            # this sync update.
+
+            client_calculated_state = dict(
+                await self.get_state_at(
+                    room_id,
+                    stream_position=since_token,
+                    state_filter=state_filter,
+                    await_full_state=False,
+                )
+            )
+            client_calculated_state.update(sync_response_state_state)
+            client_calculated_state.update(sync_response_timeline_state)
+
+        current_state = await self._state_storage_controller.get_current_state_ids(
+            room_id, state_filter=state_filter, await_full_state=False
+        )
+        missing_users = await self._calculate_missing_members(
+            current_state, client_calculated_state
+        )
+        if missing_users:
+            client_state_desync_logger.debug(
+                "client state discrepancy in incremental sync in room %s: missing users %s",
+                room_id,
+                missing_users,
+            )
+
+    async def _calculate_missing_members(
+        self,
+        actual_state: StateMap[str],
+        client_calculated_state: StateMap[str],
+    ) -> List[str]:
+        """Helper for `_log_client_state_desync`: calculates the difference in
+        joined members between two state maps.
+
+        Returns:
+            A list of user IDs
+        """
+        missing_users = []
+
+        async def event_id_to_membership(event_id: Optional[str]) -> Optional[str]:
+            if event_id is None:
+                return None
+            event = await self.store.get_event(event_id, allow_none=True)
+            if event is None:
+                return "MISSING_EVENT"
+            return event.membership
+
+        # Check for joined members in the actual state that are missing or have a
+        # different membership in the actual state.
+        for (event_type, state_key), actual_event_id in actual_state.items():
+            if event_type != EventTypes.Member:
+                continue
+
+            calculated_event_id = client_calculated_state.get((event_type, state_key))
+            if calculated_event_id != actual_event_id:
+                actual_membership = event_id_to_membership(actual_event_id)
+                calculated_membership = event_id_to_membership(calculated_event_id)
+                if (
+                    actual_membership == Membership.JOIN
+                    and calculated_membership != Membership.JOIN
+                ):
+                    missing_users.append(state_key)
+
+        return missing_users
+
     async def unread_notifs_for_room_id(
         self, room_id: str, sync_config: SyncConfig
     ) -> RoomNotifCounts: