From 1bf143699c0ac8dd53111bfca4628f126d65210d Mon Sep 17 00:00:00 2001 From: Patrick Cloke Date: Mon, 28 Aug 2023 11:03:23 -0400 Subject: Combine logic about not overriding BUSY presence. (#16170) Simplify some of the presence code by reducing duplicated code between worker & non-worker modes. The main change is to push some of the logic from `user_syncing` into `set_state`. This is done by passing whether the user is setting the presence via a `/sync` with a new `is_sync` flag to `set_state`. If this is `true` some additional logic is performed: * Don't override `busy` presence. * Update the `last_user_sync_ts`. * Never update the status message. --- tests/handlers/test_presence.py | 37 ++++++++++++++++++++++++++++++------- 1 file changed, 30 insertions(+), 7 deletions(-) (limited to 'tests/handlers/test_presence.py') diff --git a/tests/handlers/test_presence.py b/tests/handlers/test_presence.py index 1aebcc16ad..a3fdcf7f93 100644 --- a/tests/handlers/test_presence.py +++ b/tests/handlers/test_presence.py @@ -641,13 +641,20 @@ class PresenceHandlerTestCase(BaseMultiWorkerStreamTestCase): """Test that if an external process doesn't update the records for a while we time out their syncing users presence. """ - process_id = "1" - # Notify handler that a user is now syncing. + # Create a worker and use it to handle /sync traffic instead. + # This is used to test that presence changes get replicated from workers + # to the main process correctly. + worker_to_sync_against = self.make_worker_hs( + "synapse.app.generic_worker", {"worker_name": "synchrotron"} + ) + worker_presence_handler = worker_to_sync_against.get_presence_handler() + self.get_success( - self.presence_handler.update_external_syncs_row( - process_id, self.user_id, True, self.clock.time_msec() - ) + worker_presence_handler.user_syncing( + self.user_id, True, PresenceState.ONLINE + ), + by=0.1, ) # Check that if we wait a while without telling the handler the user has @@ -820,7 +827,7 @@ class PresenceHandlerTestCase(BaseMultiWorkerStreamTestCase): # This is used to test that presence changes get replicated from workers # to the main process correctly. worker_to_sync_against = self.make_worker_hs( - "synapse.app.generic_worker", {"worker_name": "presence_writer"} + "synapse.app.generic_worker", {"worker_name": "synchrotron"} ) # Set presence to BUSY @@ -832,7 +839,8 @@ class PresenceHandlerTestCase(BaseMultiWorkerStreamTestCase): self.get_success( worker_to_sync_against.get_presence_handler().user_syncing( self.user_id, True, PresenceState.ONLINE - ) + ), + by=0.1, ) # Check against the main process that the user's presence did not change. @@ -840,6 +848,21 @@ class PresenceHandlerTestCase(BaseMultiWorkerStreamTestCase): # we should still be busy self.assertEqual(state.state, PresenceState.BUSY) + # Advance such that the device would be discarded if it was not busy, + # then pump so _handle_timeouts function to called. + self.reactor.advance(IDLE_TIMER / 1000) + self.reactor.pump([5]) + + # The account should still be busy. + state = self.get_success(self.presence_handler.get_state(self.user_id_obj)) + self.assertEqual(state.state, PresenceState.BUSY) + + # Ensure that a /presence call can set the user *off* busy. + self._set_presencestate_with_status_msg(PresenceState.ONLINE, status_msg) + + state = self.get_success(self.presence_handler.get_state(self.user_id_obj)) + self.assertEqual(state.state, PresenceState.ONLINE) + def _set_presencestate_with_status_msg( self, state: str, status_msg: Optional[str] ) -> None: -- cgit 1.5.1 From 40901af5e096cb10ab69141875b071b4ea4ed1e0 Mon Sep 17 00:00:00 2001 From: Patrick Cloke Date: Mon, 28 Aug 2023 13:08:49 -0400 Subject: Pass the device ID around in the presence handler (#16171) Refactoring to pass the device ID (in addition to the user ID) through the presence handler (specifically the `user_syncing`, `set_state`, and `bump_presence_active_time` methods and their replication versions). --- changelog.d/16171.misc | 1 + synapse/handlers/events.py | 1 + synapse/handlers/message.py | 9 ++++--- synapse/handlers/presence.py | 46 +++++++++++++++++++++++++++++------- synapse/replication/http/presence.py | 11 +++++---- synapse/rest/client/presence.py | 2 +- synapse/rest/client/read_marker.py | 4 +++- synapse/rest/client/receipts.py | 4 +++- synapse/rest/client/room.py | 4 +++- synapse/rest/client/sync.py | 1 + tests/handlers/test_presence.py | 38 ++++++++++++++++++++--------- 11 files changed, 91 insertions(+), 30 deletions(-) create mode 100644 changelog.d/16171.misc (limited to 'tests/handlers/test_presence.py') diff --git a/changelog.d/16171.misc b/changelog.d/16171.misc new file mode 100644 index 0000000000..4d709cb56e --- /dev/null +++ b/changelog.d/16171.misc @@ -0,0 +1 @@ +Track per-device information in the presence code. diff --git a/synapse/handlers/events.py b/synapse/handlers/events.py index 33359f6ed7..d12803bf0f 100644 --- a/synapse/handlers/events.py +++ b/synapse/handlers/events.py @@ -67,6 +67,7 @@ class EventStreamHandler: context = await presence_handler.user_syncing( requester.user.to_string(), + requester.device_id, affect_presence=affect_presence, presence_state=PresenceState.ONLINE, ) diff --git a/synapse/handlers/message.py b/synapse/handlers/message.py index 3184bfb047..4a15c76a7b 100644 --- a/synapse/handlers/message.py +++ b/synapse/handlers/message.py @@ -1921,7 +1921,10 @@ class EventCreationHandler: # We don't want to block sending messages on any presence code. This # matters as sometimes presence code can take a while. run_as_background_process( - "bump_presence_active_time", self._bump_active_time, requester.user + "bump_presence_active_time", + self._bump_active_time, + requester.user, + requester.device_id, ) async def _notify() -> None: @@ -1958,10 +1961,10 @@ class EventCreationHandler: logger.info("maybe_kick_guest_users %r", current_state) await self.hs.get_room_member_handler().kick_guest_users(current_state) - async def _bump_active_time(self, user: UserID) -> None: + async def _bump_active_time(self, user: UserID, device_id: Optional[str]) -> None: try: presence = self.hs.get_presence_handler() - await presence.bump_presence_active_time(user) + await presence.bump_presence_active_time(user, device_id) except Exception: logger.exception("Error bumping presence active time") diff --git a/synapse/handlers/presence.py b/synapse/handlers/presence.py index c395dcdb43..50c68c86ce 100644 --- a/synapse/handlers/presence.py +++ b/synapse/handlers/presence.py @@ -165,7 +165,11 @@ class BasePresenceHandler(abc.ABC): @abc.abstractmethod async def user_syncing( - self, user_id: str, affect_presence: bool, presence_state: str + self, + user_id: str, + device_id: Optional[str], + affect_presence: bool, + presence_state: str, ) -> ContextManager[None]: """Returns a context manager that should surround any stream requests from the user. @@ -176,6 +180,7 @@ class BasePresenceHandler(abc.ABC): Args: user_id: the user that is starting a sync + device_id: the user's device that is starting a sync affect_presence: If false this function will be a no-op. Useful for streams that are not associated with an actual client that is being used by a user. @@ -252,6 +257,7 @@ class BasePresenceHandler(abc.ABC): async def set_state( self, target_user: UserID, + device_id: Optional[str], state: JsonDict, force_notify: bool = False, is_sync: bool = False, @@ -260,6 +266,7 @@ class BasePresenceHandler(abc.ABC): Args: target_user: The ID of the user to set the presence state of. + device_id: the device that the user is setting the presence state of. state: The presence state as a JSON dictionary. force_notify: Whether to force notification of the update to clients. is_sync: True if this update was from a sync, which results in @@ -269,7 +276,9 @@ class BasePresenceHandler(abc.ABC): """ @abc.abstractmethod - async def bump_presence_active_time(self, user: UserID) -> None: + async def bump_presence_active_time( + self, user: UserID, device_id: Optional[str] + ) -> None: """We've seen the user do something that indicates they're interacting with the app. """ @@ -381,7 +390,9 @@ class BasePresenceHandler(abc.ABC): # We set force_notify=True here so that this presence update is guaranteed to # increment the presence stream ID (which resending the current user's presence # otherwise would not do). - await self.set_state(UserID.from_string(user_id), state, force_notify=True) + await self.set_state( + UserID.from_string(user_id), None, state, force_notify=True + ) async def is_visible(self, observed_user: UserID, observer_user: UserID) -> bool: raise NotImplementedError( @@ -481,7 +492,11 @@ class WorkerPresenceHandler(BasePresenceHandler): self.send_user_sync(user_id, False, last_sync_ms) async def user_syncing( - self, user_id: str, affect_presence: bool, presence_state: str + self, + user_id: str, + device_id: Optional[str], + affect_presence: bool, + presence_state: str, ) -> ContextManager[None]: """Record that a user is syncing. @@ -495,6 +510,7 @@ class WorkerPresenceHandler(BasePresenceHandler): # what the spec wants. await self.set_state( UserID.from_string(user_id), + device_id, state={"presence": presence_state}, is_sync=True, ) @@ -592,6 +608,7 @@ class WorkerPresenceHandler(BasePresenceHandler): async def set_state( self, target_user: UserID, + device_id: Optional[str], state: JsonDict, force_notify: bool = False, is_sync: bool = False, @@ -600,6 +617,7 @@ class WorkerPresenceHandler(BasePresenceHandler): Args: target_user: The ID of the user to set the presence state of. + device_id: the device that the user is setting the presence state of. state: The presence state as a JSON dictionary. force_notify: Whether to force notification of the update to clients. is_sync: True if this update was from a sync, which results in @@ -622,12 +640,15 @@ class WorkerPresenceHandler(BasePresenceHandler): await self._set_state_client( instance_name=self._presence_writer_instance, user_id=user_id, + device_id=device_id, state=state, force_notify=force_notify, is_sync=is_sync, ) - async def bump_presence_active_time(self, user: UserID) -> None: + async def bump_presence_active_time( + self, user: UserID, device_id: Optional[str] + ) -> None: """We've seen the user do something that indicates they're interacting with the app. """ @@ -638,7 +659,9 @@ class WorkerPresenceHandler(BasePresenceHandler): # Proxy request to instance that writes presence user_id = user.to_string() await self._bump_active_client( - instance_name=self._presence_writer_instance, user_id=user_id + instance_name=self._presence_writer_instance, + user_id=user_id, + device_id=device_id, ) @@ -943,7 +966,9 @@ class PresenceHandler(BasePresenceHandler): return await self._update_states(changes) - async def bump_presence_active_time(self, user: UserID) -> None: + async def bump_presence_active_time( + self, user: UserID, device_id: Optional[str] + ) -> None: """We've seen the user do something that indicates they're interacting with the app. """ @@ -966,6 +991,7 @@ class PresenceHandler(BasePresenceHandler): async def user_syncing( self, user_id: str, + device_id: Optional[str], affect_presence: bool = True, presence_state: str = PresenceState.ONLINE, ) -> ContextManager[None]: @@ -977,7 +1003,8 @@ class PresenceHandler(BasePresenceHandler): when users disconnect/reconnect. Args: - user_id + user_id: the user that is starting a sync + device_id: the user's device that is starting a sync affect_presence: If false this function will be a no-op. Useful for streams that are not associated with an actual client that is being used by a user. @@ -993,6 +1020,7 @@ class PresenceHandler(BasePresenceHandler): # what the spec wants. await self.set_state( UserID.from_string(user_id), + device_id, state={"presence": presence_state}, is_sync=True, ) @@ -1163,6 +1191,7 @@ class PresenceHandler(BasePresenceHandler): async def set_state( self, target_user: UserID, + device_id: Optional[str], state: JsonDict, force_notify: bool = False, is_sync: bool = False, @@ -1171,6 +1200,7 @@ class PresenceHandler(BasePresenceHandler): Args: target_user: The ID of the user to set the presence state of. + device_id: the device that the user is setting the presence state of. state: The presence state as a JSON dictionary. force_notify: Whether to force notification of the update to clients. is_sync: True if this update was from a sync, which results in diff --git a/synapse/replication/http/presence.py b/synapse/replication/http/presence.py index a24fb9310b..6c9e79fb07 100644 --- a/synapse/replication/http/presence.py +++ b/synapse/replication/http/presence.py @@ -13,7 +13,7 @@ # limitations under the License. import logging -from typing import TYPE_CHECKING, Tuple +from typing import TYPE_CHECKING, Optional, Tuple from twisted.web.server import Request @@ -51,14 +51,14 @@ class ReplicationBumpPresenceActiveTime(ReplicationEndpoint): self._presence_handler = hs.get_presence_handler() @staticmethod - async def _serialize_payload(user_id: str) -> JsonDict: # type: ignore[override] - return {} + async def _serialize_payload(user_id: str, device_id: Optional[str]) -> JsonDict: # type: ignore[override] + return {"device_id": device_id} async def _handle_request( # type: ignore[override] self, request: Request, content: JsonDict, user_id: str ) -> Tuple[int, JsonDict]: await self._presence_handler.bump_presence_active_time( - UserID.from_string(user_id) + UserID.from_string(user_id), content.get("device_id") ) return (200, {}) @@ -95,11 +95,13 @@ class ReplicationPresenceSetState(ReplicationEndpoint): @staticmethod async def _serialize_payload( # type: ignore[override] user_id: str, + device_id: Optional[str], state: JsonDict, force_notify: bool = False, is_sync: bool = False, ) -> JsonDict: return { + "device_id": device_id, "state": state, "force_notify": force_notify, "is_sync": is_sync, @@ -110,6 +112,7 @@ class ReplicationPresenceSetState(ReplicationEndpoint): ) -> Tuple[int, JsonDict]: await self._presence_handler.set_state( UserID.from_string(user_id), + content.get("device_id"), content["state"], content["force_notify"], content.get("is_sync", False), diff --git a/synapse/rest/client/presence.py b/synapse/rest/client/presence.py index 8e193330f8..d578faa969 100644 --- a/synapse/rest/client/presence.py +++ b/synapse/rest/client/presence.py @@ -97,7 +97,7 @@ class PresenceStatusRestServlet(RestServlet): raise SynapseError(400, "Unable to parse state") if self._use_presence: - await self.presence_handler.set_state(user, state) + await self.presence_handler.set_state(user, requester.device_id, state) return 200, {} diff --git a/synapse/rest/client/read_marker.py b/synapse/rest/client/read_marker.py index 4f96e51eeb..1707e51972 100644 --- a/synapse/rest/client/read_marker.py +++ b/synapse/rest/client/read_marker.py @@ -52,7 +52,9 @@ class ReadMarkerRestServlet(RestServlet): ) -> Tuple[int, JsonDict]: requester = await self.auth.get_user_by_req(request) - await self.presence_handler.bump_presence_active_time(requester.user) + await self.presence_handler.bump_presence_active_time( + requester.user, requester.device_id + ) body = parse_json_object_from_request(request) diff --git a/synapse/rest/client/receipts.py b/synapse/rest/client/receipts.py index 316e7b9982..869a374459 100644 --- a/synapse/rest/client/receipts.py +++ b/synapse/rest/client/receipts.py @@ -94,7 +94,9 @@ class ReceiptRestServlet(RestServlet): Codes.INVALID_PARAM, ) - await self.presence_handler.bump_presence_active_time(requester.user) + await self.presence_handler.bump_presence_active_time( + requester.user, requester.device_id + ) if receipt_type == ReceiptTypes.FULLY_READ: await self.read_marker_handler.received_client_read_marker( diff --git a/synapse/rest/client/room.py b/synapse/rest/client/room.py index dc498001e4..553938ce9d 100644 --- a/synapse/rest/client/room.py +++ b/synapse/rest/client/room.py @@ -1229,7 +1229,9 @@ class RoomTypingRestServlet(RestServlet): content = parse_json_object_from_request(request) - await self.presence_handler.bump_presence_active_time(requester.user) + await self.presence_handler.bump_presence_active_time( + requester.user, requester.device_id + ) # Limit timeout to stop people from setting silly typing timeouts. timeout = min(content.get("timeout", 30000), 120000) diff --git a/synapse/rest/client/sync.py b/synapse/rest/client/sync.py index d7854ed4fd..42bdd3bb10 100644 --- a/synapse/rest/client/sync.py +++ b/synapse/rest/client/sync.py @@ -205,6 +205,7 @@ class SyncRestServlet(RestServlet): context = await self.presence_handler.user_syncing( user.to_string(), + requester.device_id, affect_presence=affect_presence, presence_state=set_presence, ) diff --git a/tests/handlers/test_presence.py b/tests/handlers/test_presence.py index a3fdcf7f93..a987267308 100644 --- a/tests/handlers/test_presence.py +++ b/tests/handlers/test_presence.py @@ -524,6 +524,7 @@ class PresenceHandlerInitTestCase(unittest.HomeserverTestCase): def prepare(self, reactor: MemoryReactor, clock: Clock, hs: HomeServer) -> None: self.user_id = f"@test:{self.hs.config.server.server_name}" + self.device_id = "dev-1" # Move the reactor to the initial time. self.reactor.advance(1000) @@ -608,7 +609,10 @@ class PresenceHandlerInitTestCase(unittest.HomeserverTestCase): self.reactor.advance(SYNC_ONLINE_TIMEOUT / 1000 / 2) self.get_success( presence_handler.user_syncing( - self.user_id, sync_state != PresenceState.OFFLINE, sync_state + self.user_id, + self.device_id, + sync_state != PresenceState.OFFLINE, + sync_state, ) ) @@ -632,6 +636,7 @@ class PresenceHandlerInitTestCase(unittest.HomeserverTestCase): class PresenceHandlerTestCase(BaseMultiWorkerStreamTestCase): user_id = "@test:server" user_id_obj = UserID.from_string(user_id) + device_id = "dev-1" def prepare(self, reactor: MemoryReactor, clock: Clock, hs: HomeServer) -> None: self.presence_handler = hs.get_presence_handler() @@ -652,7 +657,7 @@ class PresenceHandlerTestCase(BaseMultiWorkerStreamTestCase): self.get_success( worker_presence_handler.user_syncing( - self.user_id, True, PresenceState.ONLINE + self.user_id, self.device_id, True, PresenceState.ONLINE ), by=0.1, ) @@ -708,7 +713,7 @@ class PresenceHandlerTestCase(BaseMultiWorkerStreamTestCase): # Mark user as offline self.get_success( self.presence_handler.set_state( - self.user_id_obj, {"presence": PresenceState.OFFLINE} + self.user_id_obj, self.device_id, {"presence": PresenceState.OFFLINE} ) ) @@ -740,7 +745,7 @@ class PresenceHandlerTestCase(BaseMultiWorkerStreamTestCase): # Mark user as online again self.get_success( self.presence_handler.set_state( - self.user_id_obj, {"presence": PresenceState.ONLINE} + self.user_id_obj, self.device_id, {"presence": PresenceState.ONLINE} ) ) @@ -769,7 +774,7 @@ class PresenceHandlerTestCase(BaseMultiWorkerStreamTestCase): self.get_success( self.presence_handler.user_syncing( - self.user_id, False, PresenceState.ONLINE + self.user_id, self.device_id, False, PresenceState.ONLINE ) ) @@ -786,7 +791,9 @@ class PresenceHandlerTestCase(BaseMultiWorkerStreamTestCase): self._set_presencestate_with_status_msg(PresenceState.UNAVAILABLE, status_msg) self.get_success( - self.presence_handler.user_syncing(self.user_id, True, PresenceState.ONLINE) + self.presence_handler.user_syncing( + self.user_id, self.device_id, True, PresenceState.ONLINE + ) ) state = self.get_success(self.presence_handler.get_state(self.user_id_obj)) @@ -800,7 +807,9 @@ class PresenceHandlerTestCase(BaseMultiWorkerStreamTestCase): self._set_presencestate_with_status_msg(PresenceState.UNAVAILABLE, status_msg) self.get_success( - self.presence_handler.user_syncing(self.user_id, True, PresenceState.ONLINE) + self.presence_handler.user_syncing( + self.user_id, self.device_id, True, PresenceState.ONLINE + ) ) state = self.get_success(self.presence_handler.get_state(self.user_id_obj)) @@ -838,7 +847,7 @@ class PresenceHandlerTestCase(BaseMultiWorkerStreamTestCase): # /presence/*. self.get_success( worker_to_sync_against.get_presence_handler().user_syncing( - self.user_id, True, PresenceState.ONLINE + self.user_id, self.device_id, True, PresenceState.ONLINE ), by=0.1, ) @@ -875,6 +884,7 @@ class PresenceHandlerTestCase(BaseMultiWorkerStreamTestCase): self.get_success( self.presence_handler.set_state( self.user_id_obj, + self.device_id, {"presence": state, "status_msg": status_msg}, ) ) @@ -1116,7 +1126,9 @@ class PresenceJoinTestCase(unittest.HomeserverTestCase): # Mark test2 as online, test will be offline with a last_active of 0 self.get_success( self.presence_handler.set_state( - UserID.from_string("@test2:server"), {"presence": PresenceState.ONLINE} + UserID.from_string("@test2:server"), + "dev-1", + {"presence": PresenceState.ONLINE}, ) ) self.reactor.pump([0]) # Wait for presence updates to be handled @@ -1163,7 +1175,9 @@ class PresenceJoinTestCase(unittest.HomeserverTestCase): # Mark test as online self.get_success( self.presence_handler.set_state( - UserID.from_string("@test:server"), {"presence": PresenceState.ONLINE} + UserID.from_string("@test:server"), + "dev-1", + {"presence": PresenceState.ONLINE}, ) ) @@ -1171,7 +1185,9 @@ class PresenceJoinTestCase(unittest.HomeserverTestCase): # Note we don't join them to the room yet self.get_success( self.presence_handler.set_state( - UserID.from_string("@test2:server"), {"presence": PresenceState.ONLINE} + UserID.from_string("@test2:server"), + "dev-1", + {"presence": PresenceState.ONLINE}, ) ) -- cgit 1.5.1 From d35bed8369514fe727b4fe1afb68f48cc8b2655a Mon Sep 17 00:00:00 2001 From: Erik Johnston Date: Mon, 4 Sep 2023 17:14:09 +0100 Subject: Don't wake up destination transaction queue if they're not due for retry. (#16223) --- changelog.d/16223.feature | 1 + synapse/federation/send_queue.py | 12 +-- synapse/federation/sender/__init__.py | 86 +++++++++++++++------- synapse/federation/sender/per_destination_queue.py | 6 +- synapse/handlers/device.py | 26 +++---- synapse/handlers/devicemessage.py | 7 +- synapse/handlers/presence.py | 16 ++-- synapse/handlers/typing.py | 14 +++- synapse/module_api/__init__.py | 2 +- synapse/replication/tcp/client.py | 8 +- synapse/storage/databases/main/transactions.py | 26 ++++++- synapse/util/retryutils.py | 25 +++++++ tests/federation/test_federation_sender.py | 27 ++++--- tests/handlers/test_presence.py | 60 ++++++++++++--- tests/handlers/test_typing.py | 2 - 15 files changed, 228 insertions(+), 90 deletions(-) create mode 100644 changelog.d/16223.feature (limited to 'tests/handlers/test_presence.py') diff --git a/changelog.d/16223.feature b/changelog.d/16223.feature new file mode 100644 index 0000000000..a52d66658b --- /dev/null +++ b/changelog.d/16223.feature @@ -0,0 +1 @@ +Improve resource usage when sending data to a large number of remote hosts that are marked as "down". diff --git a/synapse/federation/send_queue.py b/synapse/federation/send_queue.py index fb448f2155..6520795635 100644 --- a/synapse/federation/send_queue.py +++ b/synapse/federation/send_queue.py @@ -49,7 +49,7 @@ from synapse.api.presence import UserPresenceState from synapse.federation.sender import AbstractFederationSender, FederationSender from synapse.metrics import LaterGauge from synapse.replication.tcp.streams.federation import FederationStream -from synapse.types import JsonDict, ReadReceipt, RoomStreamToken +from synapse.types import JsonDict, ReadReceipt, RoomStreamToken, StrCollection from synapse.util.metrics import Measure from .units import Edu @@ -229,7 +229,7 @@ class FederationRemoteSendQueue(AbstractFederationSender): """ # nothing to do here: the replication listener will handle it. - def send_presence_to_destinations( + async def send_presence_to_destinations( self, states: Iterable[UserPresenceState], destinations: Iterable[str] ) -> None: """As per FederationSender @@ -245,7 +245,9 @@ class FederationRemoteSendQueue(AbstractFederationSender): self.notifier.on_new_replication_data() - def send_device_messages(self, destination: str, immediate: bool = True) -> None: + async def send_device_messages( + self, destinations: StrCollection, immediate: bool = True + ) -> None: """As per FederationSender""" # We don't need to replicate this as it gets sent down a different # stream. @@ -463,7 +465,7 @@ class ParsedFederationStreamData: edus: Dict[str, List[Edu]] -def process_rows_for_federation( +async def process_rows_for_federation( transaction_queue: FederationSender, rows: List[FederationStream.FederationStreamRow], ) -> None: @@ -496,7 +498,7 @@ def process_rows_for_federation( parsed_row.add_to_buffer(buff) for state, destinations in buff.presence_destinations: - transaction_queue.send_presence_to_destinations( + await transaction_queue.send_presence_to_destinations( states=[state], destinations=destinations ) diff --git a/synapse/federation/sender/__init__.py b/synapse/federation/sender/__init__.py index 97abbdee18..fb20fd8a10 100644 --- a/synapse/federation/sender/__init__.py +++ b/synapse/federation/sender/__init__.py @@ -147,7 +147,10 @@ from twisted.internet import defer import synapse.metrics from synapse.api.presence import UserPresenceState from synapse.events import EventBase -from synapse.federation.sender.per_destination_queue import PerDestinationQueue +from synapse.federation.sender.per_destination_queue import ( + CATCHUP_RETRY_INTERVAL, + PerDestinationQueue, +) from synapse.federation.sender.transaction_manager import TransactionManager from synapse.federation.units import Edu from synapse.logging.context import make_deferred_yieldable, run_in_background @@ -161,9 +164,10 @@ from synapse.metrics.background_process_metrics import ( run_as_background_process, wrap_as_background_process, ) -from synapse.types import JsonDict, ReadReceipt, RoomStreamToken +from synapse.types import JsonDict, ReadReceipt, RoomStreamToken, StrCollection from synapse.util import Clock from synapse.util.metrics import Measure +from synapse.util.retryutils import filter_destinations_by_retry_limiter if TYPE_CHECKING: from synapse.events.presence_router import PresenceRouter @@ -213,7 +217,7 @@ class AbstractFederationSender(metaclass=abc.ABCMeta): raise NotImplementedError() @abc.abstractmethod - def send_presence_to_destinations( + async def send_presence_to_destinations( self, states: Iterable[UserPresenceState], destinations: Iterable[str] ) -> None: """Send the given presence states to the given destinations. @@ -242,9 +246,11 @@ class AbstractFederationSender(metaclass=abc.ABCMeta): raise NotImplementedError() @abc.abstractmethod - def send_device_messages(self, destination: str, immediate: bool = True) -> None: + async def send_device_messages( + self, destinations: StrCollection, immediate: bool = True + ) -> None: """Tells the sender that a new device message is ready to be sent to the - destination. The `immediate` flag specifies whether the messages should + destinations. The `immediate` flag specifies whether the messages should be tried to be sent immediately, or whether it can be delayed for a short while (to aid performance). """ @@ -716,6 +722,13 @@ class FederationSender(AbstractFederationSender): pdu.internal_metadata.stream_ordering, ) + destinations = await filter_destinations_by_retry_limiter( + destinations, + clock=self.clock, + store=self.store, + retry_due_within_ms=CATCHUP_RETRY_INTERVAL, + ) + for destination in destinations: self._get_per_destination_queue(destination).send_pdu(pdu) @@ -763,12 +776,20 @@ class FederationSender(AbstractFederationSender): domains_set = await self._storage_controllers.state.get_current_hosts_in_room_or_partial_state_approximation( room_id ) - domains = [ + domains: StrCollection = [ d for d in domains_set if not self.is_mine_server_name(d) and self._federation_shard_config.should_handle(self._instance_name, d) ] + + domains = await filter_destinations_by_retry_limiter( + domains, + clock=self.clock, + store=self.store, + retry_due_within_ms=CATCHUP_RETRY_INTERVAL, + ) + if not domains: return @@ -816,7 +837,7 @@ class FederationSender(AbstractFederationSender): for queue in queues: queue.flush_read_receipts_for_room(room_id) - def send_presence_to_destinations( + async def send_presence_to_destinations( self, states: Iterable[UserPresenceState], destinations: Iterable[str] ) -> None: """Send the given presence states to the given destinations. @@ -831,13 +852,20 @@ class FederationSender(AbstractFederationSender): for state in states: assert self.is_mine_id(state.user_id) + destinations = await filter_destinations_by_retry_limiter( + [ + d + for d in destinations + if self._federation_shard_config.should_handle(self._instance_name, d) + ], + clock=self.clock, + store=self.store, + retry_due_within_ms=CATCHUP_RETRY_INTERVAL, + ) + for destination in destinations: if self.is_mine_server_name(destination): continue - if not self._federation_shard_config.should_handle( - self._instance_name, destination - ): - continue self._get_per_destination_queue(destination).send_presence( states, start_loop=False @@ -896,21 +924,29 @@ class FederationSender(AbstractFederationSender): else: queue.send_edu(edu) - def send_device_messages(self, destination: str, immediate: bool = True) -> None: - if self.is_mine_server_name(destination): - logger.warning("Not sending device update to ourselves") - return - - if not self._federation_shard_config.should_handle( - self._instance_name, destination - ): - return + async def send_device_messages( + self, destinations: StrCollection, immediate: bool = True + ) -> None: + destinations = await filter_destinations_by_retry_limiter( + [ + destination + for destination in destinations + if self._federation_shard_config.should_handle( + self._instance_name, destination + ) + and not self.is_mine_server_name(destination) + ], + clock=self.clock, + store=self.store, + retry_due_within_ms=CATCHUP_RETRY_INTERVAL, + ) - if immediate: - self._get_per_destination_queue(destination).attempt_new_transaction() - else: - self._get_per_destination_queue(destination).mark_new_data() - self._destination_wakeup_queue.add_to_queue(destination) + for destination in destinations: + if immediate: + self._get_per_destination_queue(destination).attempt_new_transaction() + else: + self._get_per_destination_queue(destination).mark_new_data() + self._destination_wakeup_queue.add_to_queue(destination) def wake_destination(self, destination: str) -> None: """Called when we want to retry sending transactions to a remote. diff --git a/synapse/federation/sender/per_destination_queue.py b/synapse/federation/sender/per_destination_queue.py index 31c5c2b7de..9105ba664c 100644 --- a/synapse/federation/sender/per_destination_queue.py +++ b/synapse/federation/sender/per_destination_queue.py @@ -59,6 +59,10 @@ sent_edus_by_type = Counter( ) +# If the retry interval is larger than this then we enter "catchup" mode +CATCHUP_RETRY_INTERVAL = 60 * 60 * 1000 + + class PerDestinationQueue: """ Manages the per-destination transmission queues. @@ -370,7 +374,7 @@ class PerDestinationQueue: ), ) - if e.retry_interval > 60 * 60 * 1000: + if e.retry_interval > CATCHUP_RETRY_INTERVAL: # we won't retry for another hour! # (this suggests a significant outage) # We drop pending EDUs because otherwise they will diff --git a/synapse/handlers/device.py b/synapse/handlers/device.py index 5ae427d52c..763f56dfc1 100644 --- a/synapse/handlers/device.py +++ b/synapse/handlers/device.py @@ -836,17 +836,16 @@ class DeviceHandler(DeviceWorkerHandler): user_id, hosts, ) - for host in hosts: - self.federation_sender.send_device_messages( - host, immediate=False - ) - # TODO: when called, this isn't in a logging context. - # This leads to log spam, sentry event spam, and massive - # memory usage. - # See https://github.com/matrix-org/synapse/issues/12552. - # log_kv( - # {"message": "sent device update to host", "host": host} - # ) + await self.federation_sender.send_device_messages( + hosts, immediate=False + ) + # TODO: when called, this isn't in a logging context. + # This leads to log spam, sentry event spam, and massive + # memory usage. + # See https://github.com/matrix-org/synapse/issues/12552. + # log_kv( + # {"message": "sent device update to host", "host": host} + # ) if current_stream_id != stream_id: # Clear the set of hosts we've already sent to as we're @@ -951,8 +950,9 @@ class DeviceHandler(DeviceWorkerHandler): # Notify things that device lists need to be sent out. self.notifier.notify_replication() - for host in potentially_changed_hosts: - self.federation_sender.send_device_messages(host, immediate=False) + await self.federation_sender.send_device_messages( + potentially_changed_hosts, immediate=False + ) def _update_device_from_client_ips( diff --git a/synapse/handlers/devicemessage.py b/synapse/handlers/devicemessage.py index 798c7039f9..1c79f7a61e 100644 --- a/synapse/handlers/devicemessage.py +++ b/synapse/handlers/devicemessage.py @@ -302,10 +302,9 @@ class DeviceMessageHandler: ) if self.federation_sender: - for destination in remote_messages.keys(): - # Enqueue a new federation transaction to send the new - # device messages to each remote destination. - self.federation_sender.send_device_messages(destination) + # Enqueue a new federation transaction to send the new + # device messages to each remote destination. + await self.federation_sender.send_device_messages(remote_messages.keys()) async def get_events_for_dehydrated_device( self, diff --git a/synapse/handlers/presence.py b/synapse/handlers/presence.py index 2f841863ae..f31e18328b 100644 --- a/synapse/handlers/presence.py +++ b/synapse/handlers/presence.py @@ -354,7 +354,9 @@ class BasePresenceHandler(abc.ABC): ) for destination, host_states in hosts_to_states.items(): - self._federation.send_presence_to_destinations(host_states, [destination]) + await self._federation.send_presence_to_destinations( + host_states, [destination] + ) async def send_full_presence_to_users(self, user_ids: StrCollection) -> None: """ @@ -936,7 +938,7 @@ class PresenceHandler(BasePresenceHandler): ) for destination, states in hosts_to_states.items(): - self._federation_queue.send_presence_to_destinations( + await self._federation_queue.send_presence_to_destinations( states, [destination] ) @@ -1508,7 +1510,7 @@ class PresenceHandler(BasePresenceHandler): or state.status_msg is not None ] - self._federation_queue.send_presence_to_destinations( + await self._federation_queue.send_presence_to_destinations( destinations=newly_joined_remote_hosts, states=states, ) @@ -1519,7 +1521,7 @@ class PresenceHandler(BasePresenceHandler): prev_remote_hosts or newly_joined_remote_hosts ): local_states = await self.current_state_for_users(newly_joined_local_users) - self._federation_queue.send_presence_to_destinations( + await self._federation_queue.send_presence_to_destinations( destinations=prev_remote_hosts | newly_joined_remote_hosts, states=list(local_states.values()), ) @@ -2182,7 +2184,7 @@ class PresenceFederationQueue: index = bisect(self._queue, (clear_before,)) self._queue = self._queue[index:] - def send_presence_to_destinations( + async def send_presence_to_destinations( self, states: Collection[UserPresenceState], destinations: StrCollection ) -> None: """Send the presence states to the given destinations. @@ -2202,7 +2204,7 @@ class PresenceFederationQueue: return if self._federation: - self._federation.send_presence_to_destinations( + await self._federation.send_presence_to_destinations( states=states, destinations=destinations, ) @@ -2325,7 +2327,7 @@ class PresenceFederationQueue: for host, user_ids in hosts_to_users.items(): states = await self._presence_handler.current_state_for_users(user_ids) - self._federation.send_presence_to_destinations( + await self._federation.send_presence_to_destinations( states=states.values(), destinations=[host], ) diff --git a/synapse/handlers/typing.py b/synapse/handlers/typing.py index 7aeae5319c..4b4227003d 100644 --- a/synapse/handlers/typing.py +++ b/synapse/handlers/typing.py @@ -26,9 +26,10 @@ from synapse.metrics.background_process_metrics import ( ) from synapse.replication.tcp.streams import TypingStream from synapse.streams import EventSource -from synapse.types import JsonDict, Requester, StreamKeyType, UserID +from synapse.types import JsonDict, Requester, StrCollection, StreamKeyType, UserID from synapse.util.caches.stream_change_cache import StreamChangeCache from synapse.util.metrics import Measure +from synapse.util.retryutils import filter_destinations_by_retry_limiter from synapse.util.wheel_timer import WheelTimer if TYPE_CHECKING: @@ -150,8 +151,15 @@ class FollowerTypingHandler: now=now, obj=member, then=now + FEDERATION_PING_INTERVAL ) - hosts = await self._storage_controllers.state.get_current_hosts_in_room( - member.room_id + hosts: StrCollection = ( + await self._storage_controllers.state.get_current_hosts_in_room( + member.room_id + ) + ) + hosts = await filter_destinations_by_retry_limiter( + hosts, + clock=self.clock, + store=self.store, ) for domain in hosts: if not self.is_mine_server_name(domain): diff --git a/synapse/module_api/__init__.py b/synapse/module_api/__init__.py index 9ad8e038ae..2f00a7ba20 100644 --- a/synapse/module_api/__init__.py +++ b/synapse/module_api/__init__.py @@ -1180,7 +1180,7 @@ class ModuleApi: # Send to remote destinations. destination = UserID.from_string(user).domain - presence_handler.get_federation_queue().send_presence_to_destinations( + await presence_handler.get_federation_queue().send_presence_to_destinations( presence_events, [destination] ) diff --git a/synapse/replication/tcp/client.py b/synapse/replication/tcp/client.py index 3b88dc68ea..51285e6d33 100644 --- a/synapse/replication/tcp/client.py +++ b/synapse/replication/tcp/client.py @@ -422,7 +422,7 @@ class FederationSenderHandler: # The federation stream contains things that we want to send out, e.g. # presence, typing, etc. if stream_name == "federation": - send_queue.process_rows_for_federation(self.federation_sender, rows) + await send_queue.process_rows_for_federation(self.federation_sender, rows) await self.update_token(token) # ... and when new receipts happen @@ -439,16 +439,14 @@ class FederationSenderHandler: for row in rows if not row.entity.startswith("@") and not row.is_signature } - for host in hosts: - self.federation_sender.send_device_messages(host, immediate=False) + await self.federation_sender.send_device_messages(hosts, immediate=False) elif stream_name == ToDeviceStream.NAME: # The to_device stream includes stuff to be pushed to both local # clients and remote servers, so we ignore entities that start with # '@' (since they'll be local users rather than destinations). hosts = {row.entity for row in rows if not row.entity.startswith("@")} - for host in hosts: - self.federation_sender.send_device_messages(host) + await self.federation_sender.send_device_messages(hosts) async def _on_new_receipts( self, rows: Iterable[ReceiptsStream.ReceiptsStreamRow] diff --git a/synapse/storage/databases/main/transactions.py b/synapse/storage/databases/main/transactions.py index 860bbf7c0f..efd21b5bfc 100644 --- a/synapse/storage/databases/main/transactions.py +++ b/synapse/storage/databases/main/transactions.py @@ -14,7 +14,7 @@ import logging from enum import Enum -from typing import TYPE_CHECKING, Iterable, List, Optional, Tuple, cast +from typing import TYPE_CHECKING, Dict, Iterable, List, Optional, Tuple, cast import attr from canonicaljson import encode_canonical_json @@ -28,8 +28,8 @@ from synapse.storage.database import ( LoggingTransaction, ) from synapse.storage.databases.main.cache import CacheInvalidationWorkerStore -from synapse.types import JsonDict -from synapse.util.caches.descriptors import cached +from synapse.types import JsonDict, StrCollection +from synapse.util.caches.descriptors import cached, cachedList if TYPE_CHECKING: from synapse.server import HomeServer @@ -205,6 +205,26 @@ class TransactionWorkerStore(CacheInvalidationWorkerStore): else: return None + @cachedList( + cached_method_name="get_destination_retry_timings", list_name="destinations" + ) + async def get_destination_retry_timings_batch( + self, destinations: StrCollection + ) -> Dict[str, Optional[DestinationRetryTimings]]: + rows = await self.db_pool.simple_select_many_batch( + table="destinations", + iterable=destinations, + column="destination", + retcols=("destination", "failure_ts", "retry_last_ts", "retry_interval"), + desc="get_destination_retry_timings_batch", + ) + + return { + row.pop("destination"): DestinationRetryTimings(**row) + for row in rows + if row["retry_last_ts"] and row["failure_ts"] and row["retry_interval"] + } + async def set_destination_retry_timings( self, destination: str, diff --git a/synapse/util/retryutils.py b/synapse/util/retryutils.py index 9d2065372c..0e1f907667 100644 --- a/synapse/util/retryutils.py +++ b/synapse/util/retryutils.py @@ -19,6 +19,7 @@ from typing import TYPE_CHECKING, Any, Optional, Type from synapse.api.errors import CodeMessageException from synapse.metrics.background_process_metrics import run_as_background_process from synapse.storage import DataStore +from synapse.types import StrCollection from synapse.util import Clock if TYPE_CHECKING: @@ -116,6 +117,30 @@ async def get_retry_limiter( ) +async def filter_destinations_by_retry_limiter( + destinations: StrCollection, + clock: Clock, + store: DataStore, + retry_due_within_ms: int = 0, +) -> StrCollection: + """Filter down the list of destinations to only those that will are either + alive or due for a retry (within `retry_due_within_ms`) + """ + if not destinations: + return destinations + + retry_timings = await store.get_destination_retry_timings_batch(destinations) + + now = int(clock.time_msec()) + + return [ + destination + for destination, timings in retry_timings.items() + if timings is None + or timings.retry_last_ts + timings.retry_interval <= now + retry_due_within_ms + ] + + class RetryDestinationLimiter: def __init__( self, diff --git a/tests/federation/test_federation_sender.py b/tests/federation/test_federation_sender.py index 7bd3d06859..caf04b54cb 100644 --- a/tests/federation/test_federation_sender.py +++ b/tests/federation/test_federation_sender.py @@ -75,7 +75,7 @@ class FederationSenderReceiptsTestCases(HomeserverTestCase): thread_id=None, data={"ts": 1234}, ) - self.successResultOf(defer.ensureDeferred(sender.send_read_receipt(receipt))) + self.get_success(sender.send_read_receipt(receipt)) self.pump() @@ -111,6 +111,9 @@ class FederationSenderReceiptsTestCases(HomeserverTestCase): # * The same room / user on multiple threads. # * A different user in the same room. sender = self.hs.get_federation_sender() + # Hack so that we have a txn in-flight so we batch up read receipts + # below + sender.wake_destination("host2") for user, thread in ( ("alice", None), ("alice", "thread"), @@ -125,9 +128,7 @@ class FederationSenderReceiptsTestCases(HomeserverTestCase): thread_id=thread, data={"ts": 1234}, ) - self.successResultOf( - defer.ensureDeferred(sender.send_read_receipt(receipt)) - ) + defer.ensureDeferred(sender.send_read_receipt(receipt)) self.pump() @@ -191,7 +192,7 @@ class FederationSenderReceiptsTestCases(HomeserverTestCase): thread_id=None, data={"ts": 1234}, ) - self.successResultOf(defer.ensureDeferred(sender.send_read_receipt(receipt))) + self.get_success(sender.send_read_receipt(receipt)) self.pump() @@ -342,7 +343,9 @@ class FederationSenderDevicesTestCases(HomeserverTestCase): self.reactor.advance(1) # a second call should produce no new device EDUs - self.hs.get_federation_sender().send_device_messages("host2") + self.get_success( + self.hs.get_federation_sender().send_device_messages(["host2"]) + ) self.assertEqual(self.edus, []) # a second device @@ -550,7 +553,9 @@ class FederationSenderDevicesTestCases(HomeserverTestCase): # recover the server mock_send_txn.side_effect = self.record_transaction - self.hs.get_federation_sender().send_device_messages("host2") + self.get_success( + self.hs.get_federation_sender().send_device_messages(["host2"]) + ) # We queue up device list updates to be sent over federation, so we # advance to clear the queue. @@ -601,7 +606,9 @@ class FederationSenderDevicesTestCases(HomeserverTestCase): # recover the server mock_send_txn.side_effect = self.record_transaction - self.hs.get_federation_sender().send_device_messages("host2") + self.get_success( + self.hs.get_federation_sender().send_device_messages(["host2"]) + ) # We queue up device list updates to be sent over federation, so we # advance to clear the queue. @@ -656,7 +663,9 @@ class FederationSenderDevicesTestCases(HomeserverTestCase): # recover the server mock_send_txn.side_effect = self.record_transaction - self.hs.get_federation_sender().send_device_messages("host2") + self.get_success( + self.hs.get_federation_sender().send_device_messages(["host2"]) + ) # We queue up device list updates to be sent over federation, so we # advance to clear the queue. diff --git a/tests/handlers/test_presence.py b/tests/handlers/test_presence.py index a987267308..88a16193a3 100644 --- a/tests/handlers/test_presence.py +++ b/tests/handlers/test_presence.py @@ -909,8 +909,14 @@ class PresenceFederationQueueTestCase(unittest.HomeserverTestCase): prev_token = self.queue.get_current_token(self.instance_name) - self.queue.send_presence_to_destinations((state1, state2), ("dest1", "dest2")) - self.queue.send_presence_to_destinations((state3,), ("dest3",)) + self.get_success( + self.queue.send_presence_to_destinations( + (state1, state2), ("dest1", "dest2") + ) + ) + self.get_success( + self.queue.send_presence_to_destinations((state3,), ("dest3",)) + ) now_token = self.queue.get_current_token(self.instance_name) @@ -946,11 +952,17 @@ class PresenceFederationQueueTestCase(unittest.HomeserverTestCase): prev_token = self.queue.get_current_token(self.instance_name) - self.queue.send_presence_to_destinations((state1, state2), ("dest1", "dest2")) + self.get_success( + self.queue.send_presence_to_destinations( + (state1, state2), ("dest1", "dest2") + ) + ) now_token = self.queue.get_current_token(self.instance_name) - self.queue.send_presence_to_destinations((state3,), ("dest3",)) + self.get_success( + self.queue.send_presence_to_destinations((state3,), ("dest3",)) + ) rows, upto_token, limited = self.get_success( self.queue.get_replication_rows("master", prev_token, now_token, 10) @@ -989,8 +1001,14 @@ class PresenceFederationQueueTestCase(unittest.HomeserverTestCase): prev_token = self.queue.get_current_token(self.instance_name) - self.queue.send_presence_to_destinations((state1, state2), ("dest1", "dest2")) - self.queue.send_presence_to_destinations((state3,), ("dest3",)) + self.get_success( + self.queue.send_presence_to_destinations( + (state1, state2), ("dest1", "dest2") + ) + ) + self.get_success( + self.queue.send_presence_to_destinations((state3,), ("dest3",)) + ) self.reactor.advance(10 * 60 * 1000) @@ -1005,8 +1023,14 @@ class PresenceFederationQueueTestCase(unittest.HomeserverTestCase): prev_token = self.queue.get_current_token(self.instance_name) - self.queue.send_presence_to_destinations((state1, state2), ("dest1", "dest2")) - self.queue.send_presence_to_destinations((state3,), ("dest3",)) + self.get_success( + self.queue.send_presence_to_destinations( + (state1, state2), ("dest1", "dest2") + ) + ) + self.get_success( + self.queue.send_presence_to_destinations((state3,), ("dest3",)) + ) now_token = self.queue.get_current_token(self.instance_name) @@ -1033,11 +1057,17 @@ class PresenceFederationQueueTestCase(unittest.HomeserverTestCase): prev_token = self.queue.get_current_token(self.instance_name) - self.queue.send_presence_to_destinations((state1, state2), ("dest1", "dest2")) + self.get_success( + self.queue.send_presence_to_destinations( + (state1, state2), ("dest1", "dest2") + ) + ) self.reactor.advance(2 * 60 * 1000) - self.queue.send_presence_to_destinations((state3,), ("dest3",)) + self.get_success( + self.queue.send_presence_to_destinations((state3,), ("dest3",)) + ) self.reactor.advance(4 * 60 * 1000) @@ -1053,8 +1083,14 @@ class PresenceFederationQueueTestCase(unittest.HomeserverTestCase): prev_token = self.queue.get_current_token(self.instance_name) - self.queue.send_presence_to_destinations((state1, state2), ("dest1", "dest2")) - self.queue.send_presence_to_destinations((state3,), ("dest3",)) + self.get_success( + self.queue.send_presence_to_destinations( + (state1, state2), ("dest1", "dest2") + ) + ) + self.get_success( + self.queue.send_presence_to_destinations((state3,), ("dest3",)) + ) now_token = self.queue.get_current_token(self.instance_name) diff --git a/tests/handlers/test_typing.py b/tests/handlers/test_typing.py index 43c513b157..95106ec8f3 100644 --- a/tests/handlers/test_typing.py +++ b/tests/handlers/test_typing.py @@ -120,8 +120,6 @@ class TypingNotificationsTestCase(unittest.HomeserverTestCase): self.datastore = hs.get_datastores().main - self.datastore.get_destination_retry_timings = AsyncMock(return_value=None) - self.datastore.get_device_updates_by_remote = AsyncMock( # type: ignore[method-assign] return_value=(0, []) ) -- cgit 1.5.1