diff --git a/changelog.d/17499.bugfix b/changelog.d/17499.bugfix
new file mode 100644
index 0000000000..5cb7b3c30e
--- /dev/null
+++ b/changelog.d/17499.bugfix
@@ -0,0 +1 @@
+Fix a bug introduced in v1.110.0 which caused `/keys/query` to return incomplete results, leading to high network activity and CPU usage on Matrix clients.
diff --git a/synapse/handlers/e2e_keys.py b/synapse/handlers/e2e_keys.py
index 668cec513b..f78e66ad0a 100644
--- a/synapse/handlers/e2e_keys.py
+++ b/synapse/handlers/e2e_keys.py
@@ -291,13 +291,20 @@ class E2eKeysHandler:
# Only try and fetch keys for destinations that are not marked as
# down.
- filtered_destinations = await filter_destinations_by_retry_limiter(
- remote_queries_not_in_cache.keys(),
- self.clock,
- self.store,
- # Let's give an arbitrary grace period for those hosts that are
- # only recently down
- retry_due_within_ms=60 * 1000,
+ unfiltered_destinations = remote_queries_not_in_cache.keys()
+ filtered_destinations = set(
+ await filter_destinations_by_retry_limiter(
+ unfiltered_destinations,
+ self.clock,
+ self.store,
+ # Let's give an arbitrary grace period for those hosts that are
+ # only recently down
+ retry_due_within_ms=60 * 1000,
+ )
+ )
+ failures.update(
+ (dest, _NOT_READY_FOR_RETRY_FAILURE)
+ for dest in (unfiltered_destinations - filtered_destinations)
)
await concurrently_execute(
@@ -1641,6 +1648,9 @@ def _check_device_signature(
raise SynapseError(400, "Invalid signature", Codes.INVALID_SIGNATURE)
+_NOT_READY_FOR_RETRY_FAILURE = {"status": 503, "message": "Not ready for retry"}
+
+
def _exception_to_failure(e: Exception) -> JsonDict:
if isinstance(e, SynapseError):
return {"status": e.code, "errcode": e.errcode, "message": str(e)}
@@ -1649,7 +1659,7 @@ def _exception_to_failure(e: Exception) -> JsonDict:
return {"status": e.code, "message": str(e)}
if isinstance(e, NotRetryingDestination):
- return {"status": 503, "message": "Not ready for retry"}
+ return _NOT_READY_FOR_RETRY_FAILURE
# include ConnectionRefused and other errors
#
diff --git a/tests/handlers/test_e2e_keys.py b/tests/handlers/test_e2e_keys.py
index 0e6352ff4b..8a3dfdcf75 100644
--- a/tests/handlers/test_e2e_keys.py
+++ b/tests/handlers/test_e2e_keys.py
@@ -43,9 +43,7 @@ from tests.unittest import override_config
class E2eKeysHandlerTestCase(unittest.HomeserverTestCase):
def make_homeserver(self, reactor: MemoryReactor, clock: Clock) -> HomeServer:
self.appservice_api = mock.AsyncMock()
- return self.setup_test_homeserver(
- federation_client=mock.Mock(), application_service_api=self.appservice_api
- )
+ return self.setup_test_homeserver(application_service_api=self.appservice_api)
def prepare(self, reactor: MemoryReactor, clock: Clock, hs: HomeServer) -> None:
self.handler = hs.get_e2e_keys_handler()
@@ -1224,6 +1222,61 @@ class E2eKeysHandlerTestCase(unittest.HomeserverTestCase):
},
)
+ def test_query_devices_remote_down(self) -> None:
+ """Tests that querying keys for a remote user on an unreachable server returns
+ results in the "failures" property
+ """
+
+ remote_user_id = "@test:other"
+ local_user_id = "@test:test"
+
+ # The backoff code treats time zero as special
+ self.reactor.advance(5)
+
+ self.hs.get_federation_http_client().agent.request = mock.AsyncMock( # type: ignore[method-assign]
+ side_effect=Exception("boop")
+ )
+
+ e2e_handler = self.hs.get_e2e_keys_handler()
+
+ query_result = self.get_success(
+ e2e_handler.query_devices(
+ {
+ "device_keys": {remote_user_id: []},
+ },
+ timeout=10,
+ from_user_id=local_user_id,
+ from_device_id="some_device_id",
+ )
+ )
+
+ self.assertEqual(
+ query_result["failures"],
+ {
+ "other": {
+ "message": "Failed to send request: Exception: boop",
+ "status": 503,
+ }
+ },
+ )
+
+ # Do it again: we should hit the backoff
+ query_result = self.get_success(
+ e2e_handler.query_devices(
+ {
+ "device_keys": {remote_user_id: []},
+ },
+ timeout=10,
+ from_user_id=local_user_id,
+ from_device_id="some_device_id",
+ )
+ )
+
+ self.assertEqual(
+ query_result["failures"],
+ {"other": {"message": "Not ready for retry", "status": 503}},
+ )
+
@parameterized.expand(
[
# The remote homeserver's response indicates that this user has 0/1/2 devices.
|