summary refs log tree commit diff
diff options
context:
space:
mode:
authorErik Johnston <erik@matrix.org>2024-07-30 10:58:17 +0100
committerErik Johnston <erik@matrix.org>2024-07-30 10:58:17 +0100
commit3f6a71bd7105b7f86062235d9900b25c8fc0e1df (patch)
treec37982783876ff48018c39fe1d44c5d8a57eb367
parentMerge remote-tracking branch 'origin/develop' into matrix-org-hotfixes (diff)
parentAdd some more opentracing to sliding sync (#17501) (diff)
downloadsynapse-3f6a71bd7105b7f86062235d9900b25c8fc0e1df.tar.xz
Merge remote-tracking branch 'origin/develop' into matrix-org-hotfixes
-rw-r--r--changelog.d/17499.bugfix1
-rw-r--r--changelog.d/17501.misc1
-rw-r--r--synapse/handlers/e2e_keys.py26
-rw-r--r--synapse/handlers/sliding_sync.py18
-rw-r--r--synapse/rest/client/sync.py20
-rw-r--r--tests/handlers/test_e2e_keys.py59
6 files changed, 109 insertions, 16 deletions
diff --git a/changelog.d/17499.bugfix b/changelog.d/17499.bugfix
new file mode 100644
index 0000000000..5cb7b3c30e
--- /dev/null
+++ b/changelog.d/17499.bugfix
@@ -0,0 +1 @@
+Fix a bug introduced in v1.110.0 which caused `/keys/query` to return incomplete results, leading to high network activity and CPU usage on Matrix clients.
diff --git a/changelog.d/17501.misc b/changelog.d/17501.misc
new file mode 100644
index 0000000000..ba96472acb
--- /dev/null
+++ b/changelog.d/17501.misc
@@ -0,0 +1 @@
+Add some opentracing tags and logging to the experimental sliding sync implementation.
diff --git a/synapse/handlers/e2e_keys.py b/synapse/handlers/e2e_keys.py
index 668cec513b..f78e66ad0a 100644
--- a/synapse/handlers/e2e_keys.py
+++ b/synapse/handlers/e2e_keys.py
@@ -291,13 +291,20 @@ class E2eKeysHandler:
 
             # Only try and fetch keys for destinations that are not marked as
             # down.
-            filtered_destinations = await filter_destinations_by_retry_limiter(
-                remote_queries_not_in_cache.keys(),
-                self.clock,
-                self.store,
-                # Let's give an arbitrary grace period for those hosts that are
-                # only recently down
-                retry_due_within_ms=60 * 1000,
+            unfiltered_destinations = remote_queries_not_in_cache.keys()
+            filtered_destinations = set(
+                await filter_destinations_by_retry_limiter(
+                    unfiltered_destinations,
+                    self.clock,
+                    self.store,
+                    # Let's give an arbitrary grace period for those hosts that are
+                    # only recently down
+                    retry_due_within_ms=60 * 1000,
+                )
+            )
+            failures.update(
+                (dest, _NOT_READY_FOR_RETRY_FAILURE)
+                for dest in (unfiltered_destinations - filtered_destinations)
             )
 
             await concurrently_execute(
@@ -1641,6 +1648,9 @@ def _check_device_signature(
         raise SynapseError(400, "Invalid signature", Codes.INVALID_SIGNATURE)
 
 
+_NOT_READY_FOR_RETRY_FAILURE = {"status": 503, "message": "Not ready for retry"}
+
+
 def _exception_to_failure(e: Exception) -> JsonDict:
     if isinstance(e, SynapseError):
         return {"status": e.code, "errcode": e.errcode, "message": str(e)}
@@ -1649,7 +1659,7 @@ def _exception_to_failure(e: Exception) -> JsonDict:
         return {"status": e.code, "message": str(e)}
 
     if isinstance(e, NotRetryingDestination):
-        return {"status": 503, "message": "Not ready for retry"}
+        return _NOT_READY_FOR_RETRY_FAILURE
 
     # include ConnectionRefused and other errors
     #
diff --git a/synapse/handlers/sliding_sync.py b/synapse/handlers/sliding_sync.py
index 84f2fa18ff..73414dbf69 100644
--- a/synapse/handlers/sliding_sync.py
+++ b/synapse/handlers/sliding_sync.py
@@ -41,7 +41,7 @@ from synapse.api.constants import AccountDataTypes, Direction, EventTypes, Membe
 from synapse.events import EventBase
 from synapse.events.utils import strip_event
 from synapse.handlers.relations import BundledAggregations
-from synapse.logging.opentracing import start_active_span, tag_args, trace
+from synapse.logging.opentracing import log_kv, start_active_span, tag_args, trace
 from synapse.storage.databases.main.roommember import extract_heroes_from_room_summary
 from synapse.storage.databases.main.stream import CurrentStateDeltaMembership
 from synapse.storage.roommember import MemberSummary
@@ -444,6 +444,7 @@ class SlidingSyncHandler:
 
         return result
 
+    @trace
     async def current_sync_for_user(
         self,
         sync_config: SlidingSyncConfig,
@@ -682,8 +683,9 @@ class SlidingSyncHandler:
             if room_sync_result or not from_token:
                 rooms[room_id] = room_sync_result
 
-        with start_active_span("sliding_sync.generate_room_entries"):
-            await concurrently_execute(handle_room, relevant_room_map, 10)
+        if relevant_room_map:
+            with start_active_span("sliding_sync.generate_room_entries"):
+                await concurrently_execute(handle_room, relevant_room_map, 10)
 
         extensions = await self.get_extensions_response(
             sync_config=sync_config,
@@ -1161,6 +1163,7 @@ class SlidingSyncHandler:
 
         # return None
 
+    @trace
     async def filter_rooms(
         self,
         user: UserID,
@@ -1284,6 +1287,7 @@ class SlidingSyncHandler:
         # Assemble a new sync room map but only with the `filtered_room_id_set`
         return {room_id: sync_room_map[room_id] for room_id in filtered_room_id_set}
 
+    @trace
     async def sort_rooms(
         self,
         sync_room_map: Dict[str, _RoomMembershipForUser],
@@ -1492,6 +1496,10 @@ class SlidingSyncHandler:
             else:
                 assert_never(room_status.status)
 
+            log_kv({"sliding_sync.room_status": room_status})
+
+        log_kv({"sliding_sync.from_bound": from_bound, "sliding_sync.initial": initial})
+
         # Assemble the list of timeline events
         #
         # FIXME: It would be nice to make the `rooms` response more uniform regardless of
@@ -1890,6 +1898,7 @@ class SlidingSyncHandler:
             highlight_count=0,
         )
 
+    @trace
     async def get_extensions_response(
         self,
         sync_config: SlidingSyncConfig,
@@ -1942,6 +1951,7 @@ class SlidingSyncHandler:
             account_data=account_data_response,
         )
 
+    @trace
     async def get_to_device_extension_response(
         self,
         sync_config: SlidingSyncConfig,
@@ -2016,6 +2026,7 @@ class SlidingSyncHandler:
             events=messages,
         )
 
+    @trace
     async def get_e2ee_extension_response(
         self,
         sync_config: SlidingSyncConfig,
@@ -2066,6 +2077,7 @@ class SlidingSyncHandler:
             device_unused_fallback_key_types=device_unused_fallback_key_types,
         )
 
+    @trace
     async def get_account_data_extension_response(
         self,
         sync_config: SlidingSyncConfig,
diff --git a/synapse/rest/client/sync.py b/synapse/rest/client/sync.py
index bf3ac8d483..ccfce6bd53 100644
--- a/synapse/rest/client/sync.py
+++ b/synapse/rest/client/sync.py
@@ -52,7 +52,7 @@ from synapse.http.servlet import (
     parse_string,
 )
 from synapse.http.site import SynapseRequest
-from synapse.logging.opentracing import trace_with_opname
+from synapse.logging.opentracing import log_kv, set_tag, trace_with_opname
 from synapse.rest.admin.experimental_features import ExperimentalFeature
 from synapse.types import JsonDict, Requester, SlidingSyncStreamToken, StreamToken
 from synapse.types.rest.client import SlidingSyncBody
@@ -897,7 +897,23 @@ class SlidingSyncRestServlet(RestServlet):
         # by filter ID. For now, we will just prototype with always passing everything
         # in.
         body = parse_and_validate_json_object_from_request(request, SlidingSyncBody)
-        logger.info("Sliding sync request: %r", body)
+
+        # Tag and log useful data to differentiate requests.
+        set_tag("sliding_sync.conn_id", body.conn_id or "")
+        log_kv(
+            {
+                "sliding_sync.lists": {
+                    list_name: {
+                        "ranges": list_config.ranges,
+                        "timeline_limit": list_config.timeline_limit,
+                    }
+                    for list_name, list_config in (body.lists or {}).items()
+                },
+                "sliding_sync.room_subscriptions": list(
+                    (body.room_subscriptions or {}).keys()
+                ),
+            }
+        )
 
         sync_config = SlidingSyncConfig(
             user=user,
diff --git a/tests/handlers/test_e2e_keys.py b/tests/handlers/test_e2e_keys.py
index 0e6352ff4b..8a3dfdcf75 100644
--- a/tests/handlers/test_e2e_keys.py
+++ b/tests/handlers/test_e2e_keys.py
@@ -43,9 +43,7 @@ from tests.unittest import override_config
 class E2eKeysHandlerTestCase(unittest.HomeserverTestCase):
     def make_homeserver(self, reactor: MemoryReactor, clock: Clock) -> HomeServer:
         self.appservice_api = mock.AsyncMock()
-        return self.setup_test_homeserver(
-            federation_client=mock.Mock(), application_service_api=self.appservice_api
-        )
+        return self.setup_test_homeserver(application_service_api=self.appservice_api)
 
     def prepare(self, reactor: MemoryReactor, clock: Clock, hs: HomeServer) -> None:
         self.handler = hs.get_e2e_keys_handler()
@@ -1224,6 +1222,61 @@ class E2eKeysHandlerTestCase(unittest.HomeserverTestCase):
             },
         )
 
+    def test_query_devices_remote_down(self) -> None:
+        """Tests that querying keys for a remote user on an unreachable server returns
+        results in the "failures" property
+        """
+
+        remote_user_id = "@test:other"
+        local_user_id = "@test:test"
+
+        # The backoff code treats time zero as special
+        self.reactor.advance(5)
+
+        self.hs.get_federation_http_client().agent.request = mock.AsyncMock(  # type: ignore[method-assign]
+            side_effect=Exception("boop")
+        )
+
+        e2e_handler = self.hs.get_e2e_keys_handler()
+
+        query_result = self.get_success(
+            e2e_handler.query_devices(
+                {
+                    "device_keys": {remote_user_id: []},
+                },
+                timeout=10,
+                from_user_id=local_user_id,
+                from_device_id="some_device_id",
+            )
+        )
+
+        self.assertEqual(
+            query_result["failures"],
+            {
+                "other": {
+                    "message": "Failed to send request: Exception: boop",
+                    "status": 503,
+                }
+            },
+        )
+
+        # Do it again: we should hit the backoff
+        query_result = self.get_success(
+            e2e_handler.query_devices(
+                {
+                    "device_keys": {remote_user_id: []},
+                },
+                timeout=10,
+                from_user_id=local_user_id,
+                from_device_id="some_device_id",
+            )
+        )
+
+        self.assertEqual(
+            query_result["failures"],
+            {"other": {"message": "Not ready for retry", "status": 503}},
+        )
+
     @parameterized.expand(
         [
             # The remote homeserver's response indicates that this user has 0/1/2 devices.