diff --git a/changelog.d/13749.bugfix b/changelog.d/13749.bugfix
new file mode 100644
index 0000000000..8ffafec07b
--- /dev/null
+++ b/changelog.d/13749.bugfix
@@ -0,0 +1 @@
+Fix a long-standing bug where device lists would remain cached when remote users left and rejoined the last room shared with the local homeserver.
diff --git a/synapse/handlers/device.py b/synapse/handlers/device.py
index c5ac169644..901e2310b7 100644
--- a/synapse/handlers/device.py
+++ b/synapse/handlers/device.py
@@ -45,7 +45,6 @@ from synapse.types import (
JsonDict,
StreamKeyType,
StreamToken,
- UserID,
get_domain_from_id,
get_verify_key_from_cross_signing_key,
)
@@ -324,8 +323,6 @@ class DeviceHandler(DeviceWorkerHandler):
self.device_list_updater.incoming_device_list_update,
)
- hs.get_distributor().observe("user_left_room", self.user_left_room)
-
# Whether `_handle_new_device_update_async` is currently processing.
self._handle_new_device_update_is_processing = False
@@ -569,14 +566,6 @@ class DeviceHandler(DeviceWorkerHandler):
StreamKeyType.DEVICE_LIST, position, users=[from_user_id]
)
- async def user_left_room(self, user: UserID, room_id: str) -> None:
- user_id = user.to_string()
- room_ids = await self.store.get_rooms_for_user(user_id)
- if not room_ids:
- # We no longer share rooms with this user, so we'll no longer
- # receive device updates. Mark this in DB.
- await self.store.mark_remote_user_device_list_as_unsubscribed(user_id)
-
async def store_dehydrated_device(
self,
user_id: str,
diff --git a/synapse/handlers/e2e_keys.py b/synapse/handlers/e2e_keys.py
index ec81639c78..8eed63ccf3 100644
--- a/synapse/handlers/e2e_keys.py
+++ b/synapse/handlers/e2e_keys.py
@@ -175,6 +175,32 @@ class E2eKeysHandler:
user_ids_not_in_cache,
remote_results,
) = await self.store.get_user_devices_from_cache(query_list)
+
+ # Check that the homeserver still shares a room with all cached users.
+ # Note that this check may be slightly racy when a remote user leaves a
+ # room after we have fetched their cached device list. In the worst case
+ # we will do extra federation queries for devices that we had cached.
+ cached_users = set(remote_results.keys())
+ valid_cached_users = (
+ await self.store.get_users_server_still_shares_room_with(
+ remote_results.keys()
+ )
+ )
+ invalid_cached_users = cached_users - valid_cached_users
+ if invalid_cached_users:
+ # Fix up results. If we get here, there is either a bug in device
+ # list tracking, or we hit the race mentioned above.
+ user_ids_not_in_cache.update(invalid_cached_users)
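+ # Drop the stale cached entries; these users will be re-queried over federation along with the other cache misses.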
+ for invalid_user_id in invalid_cached_users:
+ remote_results.pop(invalid_user_id)
+ # This log message may be removed if it turns out it's almost
+ # entirely triggered by races.
+ logger.error(
+ "Devices for %s were cached, but the server no longer shares "
+ "any rooms with them. The cached device lists are stale.",
+ invalid_cached_users,
+ )
+
for user_id, devices in remote_results.items():
user_devices = results.setdefault(user_id, {})
for device_id, device in devices.items():
diff --git a/synapse/storage/controllers/persist_events.py b/synapse/storage/controllers/persist_events.py
index dad3731b9b..501dbbc990 100644
--- a/synapse/storage/controllers/persist_events.py
+++ b/synapse/storage/controllers/persist_events.py
@@ -598,9 +598,9 @@ class EventsPersistenceStorageController:
# room
state_delta_for_room: Dict[str, DeltaState] = {}
- # Set of remote users which were in rooms the server has left. We
- # should check if we still share any rooms and if not we mark their
- # device lists as stale.
+ # Set of remote users which were in rooms the server has left or who may
+ # have left rooms the server is in. We should check if we still share any
+ # rooms with them and, if not, mark their device lists as stale.
potentially_left_users: Set[str] = set()
if not backfilled:
@@ -725,6 +725,20 @@ class EventsPersistenceStorageController:
current_state = {}
delta.no_longer_in_room = True
+ # Add all remote users that might have left rooms.
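+ # Membership rows in `to_delete` are being removed from the current state, so the affected remote users may no longer be in the room.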
+ potentially_left_users.update(
+ user_id
+ for event_type, user_id in delta.to_delete
+ if event_type == EventTypes.Member
+ and not self.is_mine_id(user_id)
+ )
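+ # Newly inserted membership state may be a leave or ban, so check those remote users as well.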
+ potentially_left_users.update(
+ user_id
+ for event_type, user_id in delta.to_insert.keys()
+ if event_type == EventTypes.Member
+ and not self.is_mine_id(user_id)
+ )
+
state_delta_for_room[room_id] = delta
await self.persist_events_store._persist_events_and_state_updates(
diff --git a/tests/handlers/test_e2e_keys.py b/tests/handlers/test_e2e_keys.py
index 1e6ad4b663..95698bc275 100644
--- a/tests/handlers/test_e2e_keys.py
+++ b/tests/handlers/test_e2e_keys.py
@@ -891,6 +891,12 @@ class E2eKeysHandlerTestCase(unittest.HomeserverTestCase):
new_callable=mock.MagicMock,
return_value=make_awaitable(["some_room_id"]),
)
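+ # Mock the room-sharing check so the cached remote user counts as still sharing a room with us.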
+ mock_get_users = mock.patch.object(
+ self.store,
+ "get_users_server_still_shares_room_with",
+ new_callable=mock.MagicMock,
+ return_value=make_awaitable({remote_user_id}),
+ )
mock_request = mock.patch.object(
self.hs.get_federation_client(),
"query_user_devices",
@@ -898,7 +904,7 @@ class E2eKeysHandlerTestCase(unittest.HomeserverTestCase):
return_value=make_awaitable(response_body),
)
- with mock_get_rooms, mock_request as mocked_federation_request:
+ with mock_get_rooms, mock_get_users, mock_request as mocked_federation_request:
# Make the first query and sanity check it succeeds.
response_1 = self.get_success(
e2e_handler.query_devices(