From 92d21faf12c982a8d27ad465eb94f2fed0e8b32f Mon Sep 17 00:00:00 2001 From: Eric Eastwood Date: Wed, 3 Aug 2022 10:57:38 -0500 Subject: Instrument `/messages` for understandable traces in Jaeger (#13368) In Jaeger: - Before: huge list of uncategorized database calls - After: nice and collapsible into units of work --- synapse/storage/controllers/state.py | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'synapse/storage/controllers/state.py') diff --git a/synapse/storage/controllers/state.py b/synapse/storage/controllers/state.py index 1e35046e07..0d480f1014 100644 --- a/synapse/storage/controllers/state.py +++ b/synapse/storage/controllers/state.py @@ -29,6 +29,7 @@ from typing import ( from synapse.api.constants import EventTypes from synapse.events import EventBase +from synapse.logging.opentracing import trace from synapse.storage.state import StateFilter from synapse.storage.util.partial_state_events_tracker import ( PartialCurrentStateTracker, @@ -179,6 +180,7 @@ class StateStorageController: return self.stores.state._get_state_groups_from_groups(groups, state_filter) + @trace async def get_state_for_events( self, event_ids: Collection[str], state_filter: Optional[StateFilter] = None ) -> Dict[str, StateMap[EventBase]]: @@ -225,6 +227,7 @@ class StateStorageController: return {event: event_to_state[event] for event in event_ids} + @trace async def get_state_ids_for_events( self, event_ids: Collection[str], @@ -287,6 +290,7 @@ class StateStorageController: ) return state_map[event_id] + @trace async def get_state_ids_for_event( self, event_id: str, state_filter: Optional[StateFilter] = None ) -> StateMap[str]: @@ -327,6 +331,7 @@ class StateStorageController: groups, state_filter or StateFilter.all() ) + @trace async def get_state_group_for_events( self, event_ids: Collection[str], -- cgit 1.5.1 From c3516e9decc355b75a297d72a13b98a43d312e66 Mon Sep 17 00:00:00 2001 From: reivilibre Date: Tue, 16 Aug 2022 12:16:56 +0000 Subject: Faster room joins: make `/joined_members` block whilst the room is partial stated. (#13514) --- changelog.d/13514.bugfix | 1 + synapse/handlers/message.py | 6 +++++- synapse/storage/controllers/state.py | 13 +++++++++++++ synapse/storage/databases/main/roommember.py | 3 +++ 4 files changed, 22 insertions(+), 1 deletion(-) create mode 100644 changelog.d/13514.bugfix (limited to 'synapse/storage/controllers/state.py') diff --git a/changelog.d/13514.bugfix b/changelog.d/13514.bugfix new file mode 100644 index 0000000000..7498af0e47 --- /dev/null +++ b/changelog.d/13514.bugfix @@ -0,0 +1 @@ +Faster room joins: make `/joined_members` block whilst the room is partial stated. \ No newline at end of file diff --git a/synapse/handlers/message.py b/synapse/handlers/message.py index 6b03603598..8f29ee9a87 100644 --- a/synapse/handlers/message.py +++ b/synapse/handlers/message.py @@ -331,7 +331,11 @@ class MessageHandler: msg="Getting joined members while not being a current member of the room is forbidden.", ) - users_with_profile = await self.store.get_users_in_room_with_profiles(room_id) + users_with_profile = ( + await self._state_storage_controller.get_users_in_room_with_profiles( + room_id + ) + ) # If this is an AS, double check that they are allowed to see the members. # This can either be because the AS user is in the room or because there diff --git a/synapse/storage/controllers/state.py b/synapse/storage/controllers/state.py index 0d480f1014..0c78eb735e 100644 --- a/synapse/storage/controllers/state.py +++ b/synapse/storage/controllers/state.py @@ -30,6 +30,7 @@ from typing import ( from synapse.api.constants import EventTypes from synapse.events import EventBase from synapse.logging.opentracing import trace +from synapse.storage.roommember import ProfileInfo from synapse.storage.state import StateFilter from synapse.storage.util.partial_state_events_tracker import ( PartialCurrentStateTracker, @@ -506,3 +507,15 @@ class StateStorageController: await self._partial_state_room_tracker.await_full_state(room_id) return await self.stores.main.get_current_hosts_in_room(room_id) + + async def get_users_in_room_with_profiles( + self, room_id: str + ) -> Dict[str, ProfileInfo]: + """ + Get the current users in the room with their profiles. + If the room is currently partial-stated, this will block until the room has + full state. + """ + await self._partial_state_room_tracker.await_full_state(room_id) + + return await self.stores.main.get_users_in_room_with_profiles(room_id) diff --git a/synapse/storage/databases/main/roommember.py b/synapse/storage/databases/main/roommember.py index 93ff4816c8..5e5f607a14 100644 --- a/synapse/storage/databases/main/roommember.py +++ b/synapse/storage/databases/main/roommember.py @@ -283,6 +283,9 @@ class RoomMemberWorkerStore(EventsWorkerStore): Returns: A mapping from user ID to ProfileInfo. + + Preconditions: + - There is full state available for the room (it is not partial-stated). """ def _get_users_in_room_with_profiles( -- cgit 1.5.1 From 0a4efbc1ddc3a58a6d75ad5d4d960b9ed367481e Mon Sep 17 00:00:00 2001 From: Eric Eastwood Date: Tue, 16 Aug 2022 12:39:40 -0500 Subject: Instrument the federation/backfill part of `/messages` (#13489) Instrument the federation/backfill part of `/messages` so it's easier to follow what's going on in Jaeger when viewing a trace. Split out from https://github.com/matrix-org/synapse/pull/13440 Follow-up from https://github.com/matrix-org/synapse/pull/13368 Part of https://github.com/matrix-org/synapse/issues/13356 --- changelog.d/13489.misc | 1 + synapse/federation/federation_client.py | 27 ++++- synapse/handlers/federation.py | 10 +- synapse/handlers/federation_event.py | 112 ++++++++++++++++++--- synapse/logging/opentracing.py | 19 +++- synapse/storage/controllers/persist_events.py | 30 ++++-- synapse/storage/controllers/state.py | 5 +- synapse/storage/databases/main/event_federation.py | 6 ++ synapse/storage/databases/main/events.py | 2 + synapse/storage/databases/main/events_worker.py | 38 +++++-- .../storage/util/partial_state_events_tracker.py | 3 + 11 files changed, 220 insertions(+), 33 deletions(-) create mode 100644 changelog.d/13489.misc (limited to 'synapse/storage/controllers/state.py') diff --git a/changelog.d/13489.misc b/changelog.d/13489.misc new file mode 100644 index 0000000000..5e4853860e --- /dev/null +++ b/changelog.d/13489.misc @@ -0,0 +1 @@ +Instrument the federation/backfill part of `/messages` for understandable traces in Jaeger. diff --git a/synapse/federation/federation_client.py b/synapse/federation/federation_client.py index 54ffbd8170..987f6dad46 100644 --- a/synapse/federation/federation_client.py +++ b/synapse/federation/federation_client.py @@ -61,7 +61,7 @@ from synapse.federation.federation_base import ( ) from synapse.federation.transport.client import SendJoinResponse from synapse.http.types import QueryParams -from synapse.logging.opentracing import trace +from synapse.logging.opentracing import SynapseTags, set_tag, tag_args, trace from synapse.types import JsonDict, UserID, get_domain_from_id from synapse.util.async_helpers import concurrently_execute from synapse.util.caches.expiringcache import ExpiringCache @@ -235,6 +235,7 @@ class FederationClient(FederationBase): ) @trace + @tag_args async def backfill( self, dest: str, room_id: str, limit: int, extremities: Collection[str] ) -> Optional[List[EventBase]]: @@ -337,6 +338,8 @@ class FederationClient(FederationBase): return None + @trace + @tag_args async def get_pdu( self, destinations: Iterable[str], @@ -448,6 +451,8 @@ class FederationClient(FederationBase): return event_copy + @trace + @tag_args async def get_room_state_ids( self, destination: str, room_id: str, event_id: str ) -> Tuple[List[str], List[str]]: @@ -467,6 +472,23 @@ class FederationClient(FederationBase): state_event_ids = result["pdu_ids"] auth_event_ids = result.get("auth_chain_ids", []) + set_tag( + SynapseTags.RESULT_PREFIX + "state_event_ids", + str(state_event_ids), + ) + set_tag( + SynapseTags.RESULT_PREFIX + "state_event_ids.length", + str(len(state_event_ids)), + ) + set_tag( + SynapseTags.RESULT_PREFIX + "auth_event_ids", + str(auth_event_ids), + ) + set_tag( + SynapseTags.RESULT_PREFIX + "auth_event_ids.length", + str(len(auth_event_ids)), + ) + if not isinstance(state_event_ids, list) or not isinstance( auth_event_ids, list ): @@ -474,6 +496,8 @@ class FederationClient(FederationBase): return state_event_ids, auth_event_ids + @trace + @tag_args async def get_room_state( self, destination: str, @@ -533,6 +557,7 @@ class FederationClient(FederationBase): return valid_state_events, valid_auth_events + @trace async def _check_sigs_and_hash_and_fetch( self, origin: str, diff --git a/synapse/handlers/federation.py b/synapse/handlers/federation.py index 6f5ab86ac4..d13011d138 100644 --- a/synapse/handlers/federation.py +++ b/synapse/handlers/federation.py @@ -59,7 +59,7 @@ from synapse.events.validator import EventValidator from synapse.federation.federation_client import InvalidResponseError from synapse.http.servlet import assert_params_in_dict from synapse.logging.context import nested_logging_context -from synapse.logging.opentracing import tag_args, trace +from synapse.logging.opentracing import SynapseTags, set_tag, tag_args, trace from synapse.metrics.background_process_metrics import run_as_background_process from synapse.module_api import NOT_SPAM from synapse.replication.http.federation import ( @@ -370,6 +370,14 @@ class FederationHandler: logger.debug( "_maybe_backfill_inner: extremities_to_request %s", extremities_to_request ) + set_tag( + SynapseTags.RESULT_PREFIX + "extremities_to_request", + str(extremities_to_request), + ) + set_tag( + SynapseTags.RESULT_PREFIX + "extremities_to_request.length", + str(len(extremities_to_request)), + ) # Now we need to decide which hosts to hit first. diff --git a/synapse/handlers/federation_event.py b/synapse/handlers/federation_event.py index 8968b705d4..dd0d610fe9 100644 --- a/synapse/handlers/federation_event.py +++ b/synapse/handlers/federation_event.py @@ -59,7 +59,13 @@ from synapse.events import EventBase from synapse.events.snapshot import EventContext from synapse.federation.federation_client import InvalidResponseError from synapse.logging.context import nested_logging_context -from synapse.logging.opentracing import trace +from synapse.logging.opentracing import ( + SynapseTags, + set_tag, + start_active_span, + tag_args, + trace, +) from synapse.metrics.background_process_metrics import run_as_background_process from synapse.replication.http.devices import ReplicationUserDevicesResyncRestServlet from synapse.replication.http.federation import ( @@ -410,6 +416,7 @@ class FederationEventHandler: prev_member_event, ) + @trace async def process_remote_join( self, origin: str, @@ -715,7 +722,7 @@ class FederationEventHandler: @trace async def _process_pulled_events( - self, origin: str, events: Iterable[EventBase], backfilled: bool + self, origin: str, events: Collection[EventBase], backfilled: bool ) -> None: """Process a batch of events we have pulled from a remote server @@ -730,6 +737,15 @@ class FederationEventHandler: backfilled: True if this is part of a historical batch of events (inhibits notification to clients, and validation of device keys.) """ + set_tag( + SynapseTags.FUNC_ARG_PREFIX + "event_ids", + str([event.event_id for event in events]), + ) + set_tag( + SynapseTags.FUNC_ARG_PREFIX + "event_ids.length", + str(len(events)), + ) + set_tag(SynapseTags.FUNC_ARG_PREFIX + "backfilled", str(backfilled)) logger.debug( "processing pulled backfilled=%s events=%s", backfilled, @@ -753,6 +769,7 @@ class FederationEventHandler: await self._process_pulled_event(origin, ev, backfilled=backfilled) @trace + @tag_args async def _process_pulled_event( self, origin: str, event: EventBase, backfilled: bool ) -> None: @@ -854,6 +871,7 @@ class FederationEventHandler: else: raise + @trace async def _compute_event_context_with_maybe_missing_prevs( self, dest: str, event: EventBase ) -> EventContext: @@ -970,6 +988,8 @@ class FederationEventHandler: event, state_ids_before_event=state_map, partial_state=partial_state ) + @trace + @tag_args async def _get_state_ids_after_missing_prev_event( self, destination: str, @@ -1009,10 +1029,10 @@ class FederationEventHandler: logger.debug("Fetching %i events from cache/store", len(desired_events)) have_events = await self._store.have_seen_events(room_id, desired_events) - missing_desired_events = desired_events - have_events + missing_desired_event_ids = desired_events - have_events logger.debug( "We are missing %i events (got %i)", - len(missing_desired_events), + len(missing_desired_event_ids), len(have_events), ) @@ -1024,13 +1044,30 @@ class FederationEventHandler: # already have a bunch of the state events. It would be nice if the # federation api gave us a way of finding out which we actually need. - missing_auth_events = set(auth_event_ids) - have_events - missing_auth_events.difference_update( - await self._store.have_seen_events(room_id, missing_auth_events) + missing_auth_event_ids = set(auth_event_ids) - have_events + missing_auth_event_ids.difference_update( + await self._store.have_seen_events(room_id, missing_auth_event_ids) ) - logger.debug("We are also missing %i auth events", len(missing_auth_events)) + logger.debug("We are also missing %i auth events", len(missing_auth_event_ids)) - missing_events = missing_desired_events | missing_auth_events + missing_event_ids = missing_desired_event_ids | missing_auth_event_ids + + set_tag( + SynapseTags.RESULT_PREFIX + "missing_auth_event_ids", + str(missing_auth_event_ids), + ) + set_tag( + SynapseTags.RESULT_PREFIX + "missing_auth_event_ids.length", + str(len(missing_auth_event_ids)), + ) + set_tag( + SynapseTags.RESULT_PREFIX + "missing_desired_event_ids", + str(missing_desired_event_ids), + ) + set_tag( + SynapseTags.RESULT_PREFIX + "missing_desired_event_ids.length", + str(len(missing_desired_event_ids)), + ) # Making an individual request for each of 1000s of events has a lot of # overhead. On the other hand, we don't really want to fetch all of the events @@ -1041,13 +1078,13 @@ class FederationEventHandler: # # TODO: might it be better to have an API which lets us do an aggregate event # request - if (len(missing_events) * 10) >= len(auth_event_ids) + len(state_event_ids): + if (len(missing_event_ids) * 10) >= len(auth_event_ids) + len(state_event_ids): logger.debug("Requesting complete state from remote") await self._get_state_and_persist(destination, room_id, event_id) else: - logger.debug("Fetching %i events from remote", len(missing_events)) + logger.debug("Fetching %i events from remote", len(missing_event_ids)) await self._get_events_and_persist( - destination=destination, room_id=room_id, event_ids=missing_events + destination=destination, room_id=room_id, event_ids=missing_event_ids ) # We now need to fill out the state map, which involves fetching the @@ -1104,6 +1141,14 @@ class FederationEventHandler: event_id, failed_to_fetch, ) + set_tag( + SynapseTags.RESULT_PREFIX + "failed_to_fetch", + str(failed_to_fetch), + ) + set_tag( + SynapseTags.RESULT_PREFIX + "failed_to_fetch.length", + str(len(failed_to_fetch)), + ) if remote_event.is_state() and remote_event.rejected_reason is None: state_map[ @@ -1112,6 +1157,8 @@ class FederationEventHandler: return state_map + @trace + @tag_args async def _get_state_and_persist( self, destination: str, room_id: str, event_id: str ) -> None: @@ -1133,6 +1180,7 @@ class FederationEventHandler: destination=destination, room_id=room_id, event_ids=(event_id,) ) + @trace async def _process_received_pdu( self, origin: str, @@ -1283,6 +1331,7 @@ class FederationEventHandler: except Exception: logger.exception("Failed to resync device for %s", sender) + @trace async def _handle_marker_event(self, origin: str, marker_event: EventBase) -> None: """Handles backfilling the insertion event when we receive a marker event that points to one. @@ -1414,6 +1463,8 @@ class FederationEventHandler: return event_from_response + @trace + @tag_args async def _get_events_and_persist( self, destination: str, room_id: str, event_ids: Collection[str] ) -> None: @@ -1459,6 +1510,7 @@ class FederationEventHandler: logger.info("Fetched %i events of %i requested", len(events), len(event_ids)) await self._auth_and_persist_outliers(room_id, events) + @trace async def _auth_and_persist_outliers( self, room_id: str, events: Iterable[EventBase] ) -> None: @@ -1477,6 +1529,16 @@ class FederationEventHandler: """ event_map = {event.event_id: event for event in events} + event_ids = event_map.keys() + set_tag( + SynapseTags.FUNC_ARG_PREFIX + "event_ids", + str(event_ids), + ) + set_tag( + SynapseTags.FUNC_ARG_PREFIX + "event_ids.length", + str(len(event_ids)), + ) + # filter out any events we have already seen. This might happen because # the events were eagerly pushed to us (eg, during a room join), or because # another thread has raced against us since we decided to request the event. @@ -1593,6 +1655,7 @@ class FederationEventHandler: backfilled=True, ) + @trace async def _check_event_auth( self, origin: Optional[str], event: EventBase, context: EventContext ) -> None: @@ -1631,6 +1694,14 @@ class FederationEventHandler: claimed_auth_events = await self._load_or_fetch_auth_events_for_event( origin, event ) + set_tag( + SynapseTags.RESULT_PREFIX + "claimed_auth_events", + str([ev.event_id for ev in claimed_auth_events]), + ) + set_tag( + SynapseTags.RESULT_PREFIX + "claimed_auth_events.length", + str(len(claimed_auth_events)), + ) # ... and check that the event passes auth at those auth events. # https://spec.matrix.org/v1.3/server-server-api/#checks-performed-on-receipt-of-a-pdu: @@ -1728,6 +1799,7 @@ class FederationEventHandler: ) context.rejected = RejectedReason.AUTH_ERROR + @trace async def _maybe_kick_guest_users(self, event: EventBase) -> None: if event.type != EventTypes.GuestAccess: return @@ -1935,6 +2007,8 @@ class FederationEventHandler: # instead we raise an AuthError, which will make the caller ignore it. raise AuthError(code=HTTPStatus.FORBIDDEN, msg="Auth events could not be found") + @trace + @tag_args async def _get_remote_auth_chain_for_event( self, destination: str, room_id: str, event_id: str ) -> None: @@ -1963,6 +2037,7 @@ class FederationEventHandler: await self._auth_and_persist_outliers(room_id, remote_auth_events) + @trace async def _run_push_actions_and_persist_event( self, event: EventBase, context: EventContext, backfilled: bool = False ) -> None: @@ -2071,8 +2146,17 @@ class FederationEventHandler: self._message_handler.maybe_schedule_expiry(event) if not backfilled: # Never notify for backfilled events - for event in events: - await self._notify_persisted_event(event, max_stream_token) + with start_active_span("notify_persisted_events"): + set_tag( + SynapseTags.RESULT_PREFIX + "event_ids", + str([ev.event_id for ev in events]), + ) + set_tag( + SynapseTags.RESULT_PREFIX + "event_ids.length", + str(len(events)), + ) + for event in events: + await self._notify_persisted_event(event, max_stream_token) return max_stream_token.stream diff --git a/synapse/logging/opentracing.py b/synapse/logging/opentracing.py index d1fa2cf8ae..482316a1ff 100644 --- a/synapse/logging/opentracing.py +++ b/synapse/logging/opentracing.py @@ -310,6 +310,19 @@ class SynapseTags: # The name of the external cache CACHE_NAME = "cache.name" + # Used to tag function arguments + # + # Tag a named arg. The name of the argument should be appended to this prefix. + FUNC_ARG_PREFIX = "ARG." + # Tag extra variadic number of positional arguments (`def foo(first, second, *extras)`) + FUNC_ARGS = "args" + # Tag keyword args + FUNC_KWARGS = "kwargs" + + # Some intermediate result that's interesting to the function. The label for + # the result should be appended to this prefix. + RESULT_PREFIX = "RESULT." + class SynapseBaggage: FORCE_TRACING = "synapse-force-tracing" @@ -967,9 +980,9 @@ def tag_args(func: Callable[P, R]) -> Callable[P, R]: # first argument only if it's named `self` or `cls`. This isn't fool-proof # but handles the idiomatic cases. for i, arg in enumerate(args[1:], start=1): # type: ignore[index] - set_tag("ARG_" + argspec.args[i], str(arg)) - set_tag("args", str(args[len(argspec.args) :])) # type: ignore[index] - set_tag("kwargs", str(kwargs)) + set_tag(SynapseTags.FUNC_ARG_PREFIX + argspec.args[i], str(arg)) + set_tag(SynapseTags.FUNC_ARGS, str(args[len(argspec.args) :])) # type: ignore[index] + set_tag(SynapseTags.FUNC_KWARGS, str(kwargs)) yield return _custom_sync_async_decorator(func, _wrapping_logic) diff --git a/synapse/storage/controllers/persist_events.py b/synapse/storage/controllers/persist_events.py index cf98b0ab48..dad3731b9b 100644 --- a/synapse/storage/controllers/persist_events.py +++ b/synapse/storage/controllers/persist_events.py @@ -45,8 +45,14 @@ from twisted.internet import defer from synapse.api.constants import EventTypes, Membership from synapse.events import EventBase from synapse.events.snapshot import EventContext -from synapse.logging import opentracing from synapse.logging.context import PreserveLoggingContext, make_deferred_yieldable +from synapse.logging.opentracing import ( + SynapseTags, + active_span, + set_tag, + start_active_span_follows_from, + trace, +) from synapse.metrics.background_process_metrics import run_as_background_process from synapse.storage.controllers.state import StateStorageController from synapse.storage.databases import Databases @@ -223,7 +229,7 @@ class _EventPeristenceQueue(Generic[_PersistResult]): queue.append(end_item) # also add our active opentracing span to the item so that we get a link back - span = opentracing.active_span() + span = active_span() if span: end_item.parent_opentracing_span_contexts.append(span.context) @@ -234,7 +240,7 @@ class _EventPeristenceQueue(Generic[_PersistResult]): res = await make_deferred_yieldable(end_item.deferred.observe()) # add another opentracing span which links to the persist trace. - with opentracing.start_active_span_follows_from( + with start_active_span_follows_from( f"{task.name}_complete", (end_item.opentracing_span_context,) ): pass @@ -266,7 +272,7 @@ class _EventPeristenceQueue(Generic[_PersistResult]): queue = self._get_drainining_queue(room_id) for item in queue: try: - with opentracing.start_active_span_follows_from( + with start_active_span_follows_from( item.task.name, item.parent_opentracing_span_contexts, inherit_force_tracing=True, @@ -355,7 +361,7 @@ class EventsPersistenceStorageController: f"Found an unexpected task type in event persistence queue: {task}" ) - @opentracing.trace + @trace async def persist_events( self, events_and_contexts: Iterable[Tuple[EventBase, EventContext]], @@ -380,9 +386,21 @@ class EventsPersistenceStorageController: PartialStateConflictError: if attempting to persist a partial state event in a room that has been un-partial stated. """ + event_ids: List[str] = [] partitioned: Dict[str, List[Tuple[EventBase, EventContext]]] = {} for event, ctx in events_and_contexts: partitioned.setdefault(event.room_id, []).append((event, ctx)) + event_ids.append(event.event_id) + + set_tag( + SynapseTags.FUNC_ARG_PREFIX + "event_ids", + str(event_ids), + ) + set_tag( + SynapseTags.FUNC_ARG_PREFIX + "event_ids.length", + str(len(event_ids)), + ) + set_tag(SynapseTags.FUNC_ARG_PREFIX + "backfilled", str(backfilled)) async def enqueue( item: Tuple[str, List[Tuple[EventBase, EventContext]]] @@ -418,7 +436,7 @@ class EventsPersistenceStorageController: self.main_store.get_room_max_token(), ) - @opentracing.trace + @trace async def persist_event( self, event: EventBase, context: EventContext, backfilled: bool = False ) -> Tuple[EventBase, PersistedEventPosition, RoomStreamToken]: diff --git a/synapse/storage/controllers/state.py b/synapse/storage/controllers/state.py index 0c78eb735e..1ad002f57b 100644 --- a/synapse/storage/controllers/state.py +++ b/synapse/storage/controllers/state.py @@ -29,7 +29,7 @@ from typing import ( from synapse.api.constants import EventTypes from synapse.events import EventBase -from synapse.logging.opentracing import trace +from synapse.logging.opentracing import tag_args, trace from synapse.storage.roommember import ProfileInfo from synapse.storage.state import StateFilter from synapse.storage.util.partial_state_events_tracker import ( @@ -229,6 +229,7 @@ class StateStorageController: return {event: event_to_state[event] for event in event_ids} @trace + @tag_args async def get_state_ids_for_events( self, event_ids: Collection[str], @@ -333,6 +334,7 @@ class StateStorageController: ) @trace + @tag_args async def get_state_group_for_events( self, event_ids: Collection[str], @@ -474,6 +476,7 @@ class StateStorageController: prev_stream_id, max_stream_id ) + @trace async def get_current_state( self, room_id: str, state_filter: Optional[StateFilter] = None ) -> StateMap[EventBase]: diff --git a/synapse/storage/databases/main/event_federation.py b/synapse/storage/databases/main/event_federation.py index 0bc8401f2b..c836078da6 100644 --- a/synapse/storage/databases/main/event_federation.py +++ b/synapse/storage/databases/main/event_federation.py @@ -712,6 +712,8 @@ class EventFederationWorkerStore(SignatureWorkerStore, EventsWorkerStore, SQLBas # Return all events where not all sets can reach them. return {eid for eid, n in event_to_missing_sets.items() if n} + @trace + @tag_args async def get_oldest_event_ids_with_depth_in_room( self, room_id: str ) -> List[Tuple[str, int]]: @@ -770,6 +772,7 @@ class EventFederationWorkerStore(SignatureWorkerStore, EventsWorkerStore, SQLBas room_id, ) + @trace async def get_insertion_event_backward_extremities_in_room( self, room_id: str ) -> List[Tuple[str, int]]: @@ -1342,6 +1345,8 @@ class EventFederationWorkerStore(SignatureWorkerStore, EventsWorkerStore, SQLBas event_results.reverse() return event_results + @trace + @tag_args async def get_successor_events(self, event_id: str) -> List[str]: """Fetch all events that have the given event as a prev event @@ -1378,6 +1383,7 @@ class EventFederationWorkerStore(SignatureWorkerStore, EventsWorkerStore, SQLBas _delete_old_forward_extrem_cache_txn, ) + @trace async def insert_insertion_extremity(self, event_id: str, room_id: str) -> None: await self.db_pool.simple_upsert( table="insertion_event_extremities", diff --git a/synapse/storage/databases/main/events.py b/synapse/storage/databases/main/events.py index 5560b38a48..a4010ee28d 100644 --- a/synapse/storage/databases/main/events.py +++ b/synapse/storage/databases/main/events.py @@ -40,6 +40,7 @@ from synapse.api.errors import Codes, SynapseError from synapse.api.room_versions import RoomVersions from synapse.events import EventBase, relation_from_event from synapse.events.snapshot import EventContext +from synapse.logging.opentracing import trace from synapse.storage._base import db_to_json, make_in_list_sql_clause from synapse.storage.database import ( DatabasePool, @@ -145,6 +146,7 @@ class PersistEventsStore: self._backfill_id_gen: AbstractStreamIdGenerator = self.store._backfill_id_gen self._stream_id_gen: AbstractStreamIdGenerator = self.store._stream_id_gen + @trace async def _persist_events_and_state_updates( self, events_and_contexts: List[Tuple[EventBase, EventContext]], diff --git a/synapse/storage/databases/main/events_worker.py b/synapse/storage/databases/main/events_worker.py index b07d812ae2..8a7cdb024d 100644 --- a/synapse/storage/databases/main/events_worker.py +++ b/synapse/storage/databases/main/events_worker.py @@ -54,6 +54,7 @@ from synapse.logging.context import ( current_context, make_deferred_yieldable, ) +from synapse.logging.opentracing import start_active_span, tag_args, trace from synapse.metrics.background_process_metrics import ( run_as_background_process, wrap_as_background_process, @@ -430,6 +431,8 @@ class EventsWorkerStore(SQLBaseStore): return {e.event_id: e for e in events} + @trace + @tag_args async def get_events_as_list( self, event_ids: Collection[str], @@ -1090,23 +1093,42 @@ class EventsWorkerStore(SQLBaseStore): """ fetched_event_ids: Set[str] = set() fetched_events: Dict[str, _EventRow] = {} - events_to_fetch = event_ids - while events_to_fetch: - row_map = await self._enqueue_events(events_to_fetch) + async def _fetch_event_ids_and_get_outstanding_redactions( + event_ids_to_fetch: Collection[str], + ) -> Collection[str]: + """ + Fetch all of the given event_ids and return any associated redaction event_ids + that we still need to fetch in the next iteration. + """ + row_map = await self._enqueue_events(event_ids_to_fetch) # we need to recursively fetch any redactions of those events redaction_ids: Set[str] = set() - for event_id in events_to_fetch: + for event_id in event_ids_to_fetch: row = row_map.get(event_id) fetched_event_ids.add(event_id) if row: fetched_events[event_id] = row redaction_ids.update(row.redactions) - events_to_fetch = redaction_ids.difference(fetched_event_ids) - if events_to_fetch: - logger.debug("Also fetching redaction events %s", events_to_fetch) + event_ids_to_fetch = redaction_ids.difference(fetched_event_ids) + return event_ids_to_fetch + + # Grab the initial list of events requested + event_ids_to_fetch = await _fetch_event_ids_and_get_outstanding_redactions( + event_ids + ) + # Then go and recursively find all of the associated redactions + with start_active_span("recursively fetching redactions"): + while event_ids_to_fetch: + logger.debug("Also fetching redaction events %s", event_ids_to_fetch) + + event_ids_to_fetch = ( + await _fetch_event_ids_and_get_outstanding_redactions( + event_ids_to_fetch + ) + ) # build a map from event_id to EventBase event_map: Dict[str, EventBase] = {} @@ -1424,6 +1446,8 @@ class EventsWorkerStore(SQLBaseStore): return {r["event_id"] for r in rows} + @trace + @tag_args async def have_seen_events( self, room_id: str, event_ids: Iterable[str] ) -> Set[str]: diff --git a/synapse/storage/util/partial_state_events_tracker.py b/synapse/storage/util/partial_state_events_tracker.py index 466e5137f2..b4bf49dace 100644 --- a/synapse/storage/util/partial_state_events_tracker.py +++ b/synapse/storage/util/partial_state_events_tracker.py @@ -20,6 +20,7 @@ from twisted.internet import defer from twisted.internet.defer import Deferred from synapse.logging.context import PreserveLoggingContext, make_deferred_yieldable +from synapse.logging.opentracing import trace_with_opname from synapse.storage.databases.main.events_worker import EventsWorkerStore from synapse.storage.databases.main.room import RoomWorkerStore from synapse.util import unwrapFirstError @@ -58,6 +59,7 @@ class PartialStateEventsTracker: for o in observers: o.callback(None) + @trace_with_opname("PartialStateEventsTracker.await_full_state") async def await_full_state(self, event_ids: Collection[str]) -> None: """Wait for all the given events to have full state. @@ -151,6 +153,7 @@ class PartialCurrentStateTracker: for o in observers: o.callback(None) + @trace_with_opname("PartialCurrentStateTracker.await_full_state") async def await_full_state(self, room_id: str) -> None: # We add the deferred immediately so that the DB call to check for # partial state doesn't race when we unpartial the room. -- cgit 1.5.1 From 84169a82dcf7dfb6eb7d307ea7f5e33cb57f6e3f Mon Sep 17 00:00:00 2001 From: Sean Quah <8349537+squahtx@users.noreply.github.com> Date: Thu, 18 Aug 2022 11:53:02 +0100 Subject: Avoid blocking lazy-loading `/sync`s during partial joins (#13477) Use a state filter or accept partial state in a few places where we request state, to avoid blocking. To make lazy-loading `/sync`s work, we need to provide the memberships of event senders, which are not guaranteed to be in the room state. Instead we dig through auth events for memberships to present to clients. The auth events of an event are guaranteed to contain a passable membership event, otherwise the event would have been rejected. Note that this only covers the common code paths encountered during testing. There has been no exhaustive checking of all sync code paths. Fixes #13146. Signed-off-by: Sean Quah --- changelog.d/13477.misc | 1 + synapse/handlers/sync.py | 253 ++++++++++++++++++++++++++++++----- synapse/storage/controllers/state.py | 24 +++- 3 files changed, 244 insertions(+), 34 deletions(-) create mode 100644 changelog.d/13477.misc (limited to 'synapse/storage/controllers/state.py') diff --git a/changelog.d/13477.misc b/changelog.d/13477.misc new file mode 100644 index 0000000000..5d21ae9d7a --- /dev/null +++ b/changelog.d/13477.misc @@ -0,0 +1 @@ +Faster room joins: Avoid blocking lazy-loading `/sync`s during partial joins due to remote memberships. Pull remote memberships from auth events instead of the room state. diff --git a/synapse/handlers/sync.py b/synapse/handlers/sync.py index 3ca01391c9..b4d3f3958c 100644 --- a/synapse/handlers/sync.py +++ b/synapse/handlers/sync.py @@ -16,9 +16,11 @@ import logging from typing import ( TYPE_CHECKING, Any, + Collection, Dict, FrozenSet, List, + Mapping, Optional, Sequence, Set, @@ -517,10 +519,17 @@ class SyncHandler: # ensure that we always include current state in the timeline current_state_ids: FrozenSet[str] = frozenset() if any(e.is_state() for e in recents): + # FIXME(faster_joins): We use the partial state here as + # we don't want to block `/sync` on finishing a lazy join. + # Which should be fine once + # https://github.com/matrix-org/synapse/issues/12989 is resolved, + # since we shouldn't reach here anymore? + # Note that we use the current state as a whitelist for filtering + # `recents`, so partial state is only a problem when a membership + # event turns up in `recents` but has not made it into the current + # state. current_state_ids_map = ( - await self._state_storage_controller.get_current_state_ids( - room_id - ) + await self.store.get_partial_current_state_ids(room_id) ) current_state_ids = frozenset(current_state_ids_map.values()) @@ -589,7 +598,13 @@ class SyncHandler: if any(e.is_state() for e in loaded_recents): # FIXME(faster_joins): We use the partial state here as # we don't want to block `/sync` on finishing a lazy join. - # Is this the correct way of doing it? + # Which should be fine once + # https://github.com/matrix-org/synapse/issues/12989 is resolved, + # since we shouldn't reach here anymore? + # Note that we use the current state as a whitelist for filtering + # `loaded_recents`, so partial state is only a problem when a + # membership event turns up in `loaded_recents` but has not made it + # into the current state. current_state_ids_map = ( await self.store.get_partial_current_state_ids(room_id) ) @@ -637,7 +652,10 @@ class SyncHandler: ) async def get_state_after_event( - self, event_id: str, state_filter: Optional[StateFilter] = None + self, + event_id: str, + state_filter: Optional[StateFilter] = None, + await_full_state: bool = True, ) -> StateMap[str]: """ Get the room state after the given event @@ -645,9 +663,14 @@ class SyncHandler: Args: event_id: event of interest state_filter: The state filter used to fetch state from the database. + await_full_state: if `True`, will block if we do not yet have complete state + at the event and `state_filter` is not satisfied by partial state. + Defaults to `True`. """ state_ids = await self._state_storage_controller.get_state_ids_for_event( - event_id, state_filter=state_filter or StateFilter.all() + event_id, + state_filter=state_filter or StateFilter.all(), + await_full_state=await_full_state, ) # using get_metadata_for_events here (instead of get_event) sidesteps an issue @@ -670,6 +693,7 @@ class SyncHandler: room_id: str, stream_position: StreamToken, state_filter: Optional[StateFilter] = None, + await_full_state: bool = True, ) -> StateMap[str]: """Get the room state at a particular stream position @@ -677,6 +701,9 @@ class SyncHandler: room_id: room for which to get state stream_position: point at which to get state state_filter: The state filter used to fetch state from the database. + await_full_state: if `True`, will block if we do not yet have complete state + at the last event in the room before `stream_position` and + `state_filter` is not satisfied by partial state. Defaults to `True`. """ # FIXME: This gets the state at the latest event before the stream ordering, # which might not be the same as the "current state" of the room at the time @@ -688,7 +715,9 @@ class SyncHandler: if last_event_id: state = await self.get_state_after_event( - last_event_id, state_filter=state_filter or StateFilter.all() + last_event_id, + state_filter=state_filter or StateFilter.all(), + await_full_state=await_full_state, ) else: @@ -891,7 +920,15 @@ class SyncHandler: with Measure(self.clock, "compute_state_delta"): # The memberships needed for events in the timeline. # Only calculated when `lazy_load_members` is on. - members_to_fetch = None + members_to_fetch: Optional[Set[str]] = None + + # A dictionary mapping user IDs to the first event in the timeline sent by + # them. Only calculated when `lazy_load_members` is on. + first_event_by_sender_map: Optional[Dict[str, EventBase]] = None + + # The contribution to the room state from state events in the timeline. + # Only contains the last event for any given state key. + timeline_state: StateMap[str] lazy_load_members = sync_config.filter_collection.lazy_load_members() include_redundant_members = ( @@ -902,10 +939,23 @@ class SyncHandler: # We only request state for the members needed to display the # timeline: - members_to_fetch = { - event.sender # FIXME: we also care about invite targets etc. - for event in batch.events - } + timeline_state = {} + + members_to_fetch = set() + first_event_by_sender_map = {} + for event in batch.events: + # Build the map from user IDs to the first timeline event they sent. + if event.sender not in first_event_by_sender_map: + first_event_by_sender_map[event.sender] = event + + # We need the event's sender, unless their membership was in a + # previous timeline event. + if (EventTypes.Member, event.sender) not in timeline_state: + members_to_fetch.add(event.sender) + # FIXME: we also care about invite targets etc. + + if event.is_state(): + timeline_state[(event.type, event.state_key)] = event.event_id if full_state: # always make sure we LL ourselves so we know we're in the room @@ -915,16 +965,21 @@ class SyncHandler: members_to_fetch.add(sync_config.user.to_string()) state_filter = StateFilter.from_lazy_load_member_list(members_to_fetch) + + # We are happy to use partial state to compute the `/sync` response. + # Since partial state may not include the lazy-loaded memberships we + # require, we fix up the state response afterwards with memberships from + # auth events. + await_full_state = False else: - state_filter = StateFilter.all() + timeline_state = { + (event.type, event.state_key): event.event_id + for event in batch.events + if event.is_state() + } - # The contribution to the room state from state events in the timeline. - # Only contains the last event for any given state key. - timeline_state = { - (event.type, event.state_key): event.event_id - for event in batch.events - if event.is_state() - } + state_filter = StateFilter.all() + await_full_state = True # Now calculate the state to return in the sync response for the room. # This is more or less the change in state between the end of the previous @@ -936,19 +991,26 @@ class SyncHandler: if batch: state_at_timeline_end = ( await self._state_storage_controller.get_state_ids_for_event( - batch.events[-1].event_id, state_filter=state_filter + batch.events[-1].event_id, + state_filter=state_filter, + await_full_state=await_full_state, ) ) state_at_timeline_start = ( await self._state_storage_controller.get_state_ids_for_event( - batch.events[0].event_id, state_filter=state_filter + batch.events[0].event_id, + state_filter=state_filter, + await_full_state=await_full_state, ) ) else: state_at_timeline_end = await self.get_state_at( - room_id, stream_position=now_token, state_filter=state_filter + room_id, + stream_position=now_token, + state_filter=state_filter, + await_full_state=await_full_state, ) state_at_timeline_start = state_at_timeline_end @@ -964,14 +1026,19 @@ class SyncHandler: if batch: state_at_timeline_start = ( await self._state_storage_controller.get_state_ids_for_event( - batch.events[0].event_id, state_filter=state_filter + batch.events[0].event_id, + state_filter=state_filter, + await_full_state=await_full_state, ) ) else: # We can get here if the user has ignored the senders of all # the recent events. state_at_timeline_start = await self.get_state_at( - room_id, stream_position=now_token, state_filter=state_filter + room_id, + stream_position=now_token, + state_filter=state_filter, + await_full_state=await_full_state, ) # for now, we disable LL for gappy syncs - see @@ -993,20 +1060,28 @@ class SyncHandler: # is indeed the case. assert since_token is not None state_at_previous_sync = await self.get_state_at( - room_id, stream_position=since_token, state_filter=state_filter + room_id, + stream_position=since_token, + state_filter=state_filter, + await_full_state=await_full_state, ) if batch: state_at_timeline_end = ( await self._state_storage_controller.get_state_ids_for_event( - batch.events[-1].event_id, state_filter=state_filter + batch.events[-1].event_id, + state_filter=state_filter, + await_full_state=await_full_state, ) ) else: # We can get here if the user has ignored the senders of all # the recent events. state_at_timeline_end = await self.get_state_at( - room_id, stream_position=now_token, state_filter=state_filter + room_id, + stream_position=now_token, + state_filter=state_filter, + await_full_state=await_full_state, ) state_ids = _calculate_state( @@ -1036,8 +1111,23 @@ class SyncHandler: (EventTypes.Member, member) for member in members_to_fetch ), + await_full_state=False, ) + # If we only have partial state for the room, `state_ids` may be missing the + # memberships we wanted. We attempt to find some by digging through the auth + # events of timeline events. + if lazy_load_members and await self.store.is_partial_state_room(room_id): + assert members_to_fetch is not None + assert first_event_by_sender_map is not None + + additional_state_ids = ( + await self._find_missing_partial_state_memberships( + room_id, members_to_fetch, first_event_by_sender_map, state_ids + ) + ) + state_ids = {**state_ids, **additional_state_ids} + # At this point, if `lazy_load_members` is enabled, `state_ids` includes # the memberships of all event senders in the timeline. This is because we # may not have sent the memberships in a previous sync. @@ -1086,6 +1176,99 @@ class SyncHandler: if e.type != EventTypes.Aliases # until MSC2261 or alternative solution } + async def _find_missing_partial_state_memberships( + self, + room_id: str, + members_to_fetch: Collection[str], + events_with_membership_auth: Mapping[str, EventBase], + found_state_ids: StateMap[str], + ) -> StateMap[str]: + """Finds missing memberships from a set of auth events and returns them as a + state map. + + Args: + room_id: The partial state room to find the remaining memberships for. + members_to_fetch: The memberships to find. + events_with_membership_auth: A mapping from user IDs to events whose auth + events are known to contain their membership. + found_state_ids: A dict from (type, state_key) -> state_event_id, containing + memberships that have been previously found. Entries in + `members_to_fetch` that have a membership in `found_state_ids` are + ignored. + + Returns: + A dict from ("m.room.member", state_key) -> state_event_id, containing the + memberships missing from `found_state_ids`. + + Raises: + KeyError: if `events_with_membership_auth` does not have an entry for a + missing membership. Memberships in `found_state_ids` do not need an + entry in `events_with_membership_auth`. + """ + additional_state_ids: MutableStateMap[str] = {} + + # Tracks the missing members for logging purposes. + missing_members = set() + + # Identify memberships missing from `found_state_ids` and pick out the auth + # events in which to look for them. + auth_event_ids: Set[str] = set() + for member in members_to_fetch: + if (EventTypes.Member, member) in found_state_ids: + continue + + missing_members.add(member) + event_with_membership_auth = events_with_membership_auth[member] + auth_event_ids.update(event_with_membership_auth.auth_event_ids()) + + auth_events = await self.store.get_events(auth_event_ids) + + # Run through the missing memberships once more, picking out the memberships + # from the pile of auth events we have just fetched. + for member in members_to_fetch: + if (EventTypes.Member, member) in found_state_ids: + continue + + event_with_membership_auth = events_with_membership_auth[member] + + # Dig through the auth events to find the desired membership. + for auth_event_id in event_with_membership_auth.auth_event_ids(): + # We only store events once we have all their auth events, + # so the auth event must be in the pile we have just + # fetched. + auth_event = auth_events[auth_event_id] + + if ( + auth_event.type == EventTypes.Member + and auth_event.state_key == member + ): + missing_members.remove(member) + additional_state_ids[ + (EventTypes.Member, member) + ] = auth_event.event_id + break + + if missing_members: + # There really shouldn't be any missing memberships now. Either: + # * we couldn't find an auth event, which shouldn't happen because we do + # not persist events with persisting their auth events first, or + # * the set of auth events did not contain a membership we wanted, which + # means our caller didn't compute the events in `members_to_fetch` + # correctly, or we somehow accepted an event whose auth events were + # dodgy. + logger.error( + "Failed to find memberships for %s in partial state room " + "%s in the auth events of %s.", + missing_members, + room_id, + [ + events_with_membership_auth[member].event_id + for member in missing_members + ], + ) + + return additional_state_ids + async def unread_notifs_for_room_id( self, room_id: str, sync_config: SyncConfig ) -> NotifCounts: @@ -1730,7 +1913,11 @@ class SyncHandler: continue if room_id in sync_result_builder.joined_room_ids or has_join: - old_state_ids = await self.get_state_at(room_id, since_token) + old_state_ids = await self.get_state_at( + room_id, + since_token, + state_filter=StateFilter.from_types([(EventTypes.Member, user_id)]), + ) old_mem_ev_id = old_state_ids.get((EventTypes.Member, user_id), None) old_mem_ev = None if old_mem_ev_id: @@ -1756,7 +1943,13 @@ class SyncHandler: newly_left_rooms.append(room_id) else: if not old_state_ids: - old_state_ids = await self.get_state_at(room_id, since_token) + old_state_ids = await self.get_state_at( + room_id, + since_token, + state_filter=StateFilter.from_types( + [(EventTypes.Member, user_id)] + ), + ) old_mem_ev_id = old_state_ids.get( (EventTypes.Member, user_id), None ) diff --git a/synapse/storage/controllers/state.py b/synapse/storage/controllers/state.py index 1ad002f57b..f9ffd0e29e 100644 --- a/synapse/storage/controllers/state.py +++ b/synapse/storage/controllers/state.py @@ -234,6 +234,7 @@ class StateStorageController: self, event_ids: Collection[str], state_filter: Optional[StateFilter] = None, + await_full_state: bool = True, ) -> Dict[str, StateMap[str]]: """ Get the state dicts corresponding to a list of events, containing the event_ids @@ -242,6 +243,9 @@ class StateStorageController: Args: event_ids: events whose state should be returned state_filter: The state filter used to fetch state from the database. + await_full_state: if `True`, will block if we do not yet have complete state + at these events and `state_filter` is not satisfied by partial state. + Defaults to `True`. Returns: A dict from event_id -> (type, state_key) -> event_id @@ -250,8 +254,12 @@ class StateStorageController: RuntimeError if we don't have a state group for one or more of the events (ie they are outliers or unknown) """ - await_full_state = True - if state_filter and not state_filter.must_await_full_state(self._is_mine_id): + if ( + await_full_state + and state_filter + and not state_filter.must_await_full_state(self._is_mine_id) + ): + # Full state is not required if the state filter is restrictive enough. await_full_state = False event_to_groups = await self.get_state_group_for_events( @@ -294,7 +302,10 @@ class StateStorageController: @trace async def get_state_ids_for_event( - self, event_id: str, state_filter: Optional[StateFilter] = None + self, + event_id: str, + state_filter: Optional[StateFilter] = None, + await_full_state: bool = True, ) -> StateMap[str]: """ Get the state dict corresponding to a particular event @@ -302,6 +313,9 @@ class StateStorageController: Args: event_id: event whose state should be returned state_filter: The state filter used to fetch state from the database. + await_full_state: if `True`, will block if we do not yet have complete state + at the event and `state_filter` is not satisfied by partial state. + Defaults to `True`. Returns: A dict from (type, state_key) -> state_event_id @@ -311,7 +325,9 @@ class StateStorageController: outlier or is unknown) """ state_map = await self.get_state_ids_for_events( - [event_id], state_filter or StateFilter.all() + [event_id], + state_filter or StateFilter.all(), + await_full_state=await_full_state, ) return state_map[event_id] -- cgit 1.5.1 From 51d732db3b4ab13eb58e937a546abce7968112ef Mon Sep 17 00:00:00 2001 From: Eric Eastwood Date: Tue, 30 Aug 2022 01:38:14 -0500 Subject: Optimize how we calculate `likely_domains` during backfill (#13575) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Optimize how we calculate `likely_domains` during backfill because I've seen this take 17s in production just to `get_current_state` which is used to `get_domains_from_state` (see case [*2. Loading tons of events* in the `/messages` investigation issue](https://github.com/matrix-org/synapse/issues/13356)). There are 3 ways we currently calculate hosts that are in the room: 1. `get_current_state` -> `get_domains_from_state` - Used in `backfill` to calculate `likely_domains` and `/timestamp_to_event` because it was cargo-culted from `backfill` - This one is being eliminated in favor of `get_current_hosts_in_room` in this PR 🕳 1. `get_current_hosts_in_room` - Used for other federation things like sending read receipts and typing indicators 1. `get_hosts_in_room_at_events` - Used when pushing out events over federation to other servers in the `_process_event_queue_loop` Fix https://github.com/matrix-org/synapse/issues/13626 Part of https://github.com/matrix-org/synapse/issues/13356 Mentioned in [internal doc](https://docs.google.com/document/d/1lvUoVfYUiy6UaHB6Rb4HicjaJAU40-APue9Q4vzuW3c/edit#bookmark=id.2tvwz3yhcafh) ### Query performance #### Before The query from `get_current_state` sucks just because we have to get all 80k events. And we see almost the exact same performance locally trying to get all of these events (16s vs 17s): ``` synapse=# SELECT type, state_key, event_id FROM current_state_events WHERE room_id = '!OGEhHVWSdvArJzumhm:matrix.org'; Time: 16035.612 ms (00:16.036) synapse=# SELECT type, state_key, event_id FROM current_state_events WHERE room_id = '!OGEhHVWSdvArJzumhm:matrix.org'; Time: 4243.237 ms (00:04.243) ``` But what about `get_current_hosts_in_room`: When there is 8M rows in the `current_state_events` table, the previous query in `get_current_hosts_in_room` took 13s from complete freshness (when the events were first added). But takes 930ms after a Postgres restart or 390ms if running back to back to back. ```sh $ psql synapse synapse=# \timing on synapse=# SELECT COUNT(DISTINCT substring(state_key FROM '@[^:]*:(.*)$')) FROM current_state_events WHERE type = 'm.room.member' AND membership = 'join' AND room_id = '!OGEhHVWSdvArJzumhm:matrix.org'; count ------- 4130 (1 row) Time: 13181.598 ms (00:13.182) synapse=# SELECT COUNT(*) from current_state_events where room_id = '!OGEhHVWSdvArJzumhm:matrix.org'; count ------- 80814 synapse=# SELECT COUNT(*) from current_state_events; count --------- 8162847 synapse=# SELECT pg_size_pretty( pg_total_relation_size('current_state_events') ); pg_size_pretty ---------------- 4702 MB ``` #### After I'm not sure how long it takes from complete freshness as I only really get that opportunity once (maybe restarting computer but that's cumbersome) and it's not really relevant to normal operating times. Maybe you get closer to the fresh times the more access variability there is so that Postgres caches aren't as exact. Update: The longest I've seen this run for is 6.4s and 4.5s after a computer restart. After a Postgres restart, it takes 330ms and running back to back takes 260ms. ```sh $ psql synapse synapse=# \timing on Timing is on. synapse=# SELECT substring(c.state_key FROM '@[^:]*:(.*)$') as host FROM current_state_events c /* Get the depth of the event from the events table */ INNER JOIN events AS e USING (event_id) WHERE c.type = 'm.room.member' AND c.membership = 'join' AND c.room_id = '!OGEhHVWSdvArJzumhm:matrix.org' GROUP BY host ORDER BY min(e.depth) ASC; Time: 333.800 ms ``` #### Going further To improve things further we could add a `limit` parameter to `get_current_hosts_in_room`. Realistically, we don't need 4k domains to choose from because there is no way we're going to query that many before we a) probably get an answer or b) we give up. Another thing we can do is optimize the query to use a index skip scan: - https://wiki.postgresql.org/wiki/Loose_indexscan - Index Skip Scan, https://commitfest.postgresql.org/37/1741/ - https://www.timescale.com/blog/how-we-made-distinct-queries-up-to-8000x-faster-on-postgresql/ --- changelog.d/13575.misc | 1 + synapse/handlers/federation.py | 53 ++++------------- synapse/handlers/room.py | 14 ++--- synapse/storage/controllers/state.py | 3 +- synapse/storage/databases/main/roommember.py | 88 ++++++++++++++++++++++------ 5 files changed, 89 insertions(+), 70 deletions(-) create mode 100644 changelog.d/13575.misc (limited to 'synapse/storage/controllers/state.py') diff --git a/changelog.d/13575.misc b/changelog.d/13575.misc new file mode 100644 index 0000000000..3841472617 --- /dev/null +++ b/changelog.d/13575.misc @@ -0,0 +1 @@ +Optimize how Synapse calculates domains to fetch from during backfill. diff --git a/synapse/handlers/federation.py b/synapse/handlers/federation.py index e151962055..dd4b9f66d1 100644 --- a/synapse/handlers/federation.py +++ b/synapse/handlers/federation.py @@ -70,7 +70,7 @@ from synapse.replication.http.federation import ( from synapse.storage.databases.main.events import PartialStateConflictError from synapse.storage.databases.main.events_worker import EventRedactBehaviour from synapse.storage.state import StateFilter -from synapse.types import JsonDict, StateMap, get_domain_from_id +from synapse.types import JsonDict, get_domain_from_id from synapse.util.async_helpers import Linearizer from synapse.util.retryutils import NotRetryingDestination from synapse.visibility import filter_events_for_server @@ -104,37 +104,6 @@ backfill_processing_before_timer = Histogram( ) -def get_domains_from_state(state: StateMap[EventBase]) -> List[Tuple[str, int]]: - """Get joined domains from state - - Args: - state: State map from type/state key to event. - - Returns: - Returns a list of servers with the lowest depth of their joins. - Sorted by lowest depth first. - """ - joined_users = [ - (state_key, int(event.depth)) - for (e_type, state_key), event in state.items() - if e_type == EventTypes.Member and event.membership == Membership.JOIN - ] - - joined_domains: Dict[str, int] = {} - for u, d in joined_users: - try: - dom = get_domain_from_id(u) - old_d = joined_domains.get(dom) - if old_d: - joined_domains[dom] = min(d, old_d) - else: - joined_domains[dom] = d - except Exception: - pass - - return sorted(joined_domains.items(), key=lambda d: d[1]) - - class _BackfillPointType(Enum): # a regular backwards extremity (ie, an event which we don't yet have, but which # is referred to by other events in the DAG) @@ -432,21 +401,19 @@ class FederationHandler: ) # Now we need to decide which hosts to hit first. - - # First we try hosts that are already in the room + # First we try hosts that are already in the room. # TODO: HEURISTIC ALERT. + likely_domains = ( + await self._storage_controllers.state.get_current_hosts_in_room(room_id) + ) - curr_state = await self._storage_controllers.state.get_current_state(room_id) - - curr_domains = get_domains_from_state(curr_state) - - likely_domains = [ - domain for domain, depth in curr_domains if domain != self.server_name - ] - - async def try_backfill(domains: List[str]) -> bool: + async def try_backfill(domains: Collection[str]) -> bool: # TODO: Should we try multiple of these at a time? for dom in domains: + # We don't want to ask our own server for information we don't have + if dom == self.server_name: + continue + try: await self._federation_event_handler.backfill( dom, room_id, limit=100, extremities=extremities_to_request diff --git a/synapse/handlers/room.py b/synapse/handlers/room.py index 2fc8264858..f64a8690a5 100644 --- a/synapse/handlers/room.py +++ b/synapse/handlers/room.py @@ -60,7 +60,6 @@ from synapse.event_auth import validate_event_for_room_version from synapse.events import EventBase from synapse.events.utils import copy_and_fixup_power_levels_contents from synapse.federation.federation_client import InvalidResponseError -from synapse.handlers.federation import get_domains_from_state from synapse.handlers.relations import BundledAggregations from synapse.module_api import NOT_SPAM from synapse.rest.admin._base import assert_user_is_admin @@ -1462,17 +1461,16 @@ class TimestampLookupHandler: timestamp, ) - # Find other homeservers from the given state in the room - curr_state = await self._storage_controllers.state.get_current_state( - room_id + likely_domains = ( + await self._storage_controllers.state.get_current_hosts_in_room(room_id) ) - curr_domains = get_domains_from_state(curr_state) - likely_domains = [ - domain for domain, depth in curr_domains if domain != self.server_name - ] # Loop through each homeserver candidate until we get a succesful response for domain in likely_domains: + # We don't want to ask our own server for information we don't have + if domain == self.server_name: + continue + try: remote_response = await self.federation_client.timestamp_to_event( domain, room_id, timestamp, direction diff --git a/synapse/storage/controllers/state.py b/synapse/storage/controllers/state.py index f9ffd0e29e..ba5380ce3e 100644 --- a/synapse/storage/controllers/state.py +++ b/synapse/storage/controllers/state.py @@ -23,7 +23,6 @@ from typing import ( List, Mapping, Optional, - Set, Tuple, ) @@ -520,7 +519,7 @@ class StateStorageController: ) return state_map.get(key) - async def get_current_hosts_in_room(self, room_id: str) -> Set[str]: + async def get_current_hosts_in_room(self, room_id: str) -> List[str]: """Get current hosts in room based on current state.""" await self._partial_state_room_tracker.await_full_state(room_id) diff --git a/synapse/storage/databases/main/roommember.py b/synapse/storage/databases/main/roommember.py index 9e5034b401..06500457bd 100644 --- a/synapse/storage/databases/main/roommember.py +++ b/synapse/storage/databases/main/roommember.py @@ -187,27 +187,48 @@ class RoomMemberWorkerStore(EventsWorkerStore): @cached(max_entries=100000, iterable=True) async def get_users_in_room(self, room_id: str) -> List[str]: + """ + Returns a list of users in the room sorted by longest in the room first + (aka. with the lowest depth). This is done to match the sort in + `get_current_hosts_in_room()` and so we can re-use the cache but it's + not horrible to have here either. + """ + return await self.db_pool.runInteraction( "get_users_in_room", self.get_users_in_room_txn, room_id ) def get_users_in_room_txn(self, txn: LoggingTransaction, room_id: str) -> List[str]: + """ + Returns a list of users in the room sorted by longest in the room first + (aka. with the lowest depth). This is done to match the sort in + `get_current_hosts_in_room()` and so we can re-use the cache but it's + not horrible to have here either. + """ # If we can assume current_state_events.membership is up to date # then we can avoid a join, which is a Very Good Thing given how # frequently this function gets called. if self._current_state_events_membership_up_to_date: sql = """ - SELECT state_key FROM current_state_events - WHERE type = 'm.room.member' AND room_id = ? AND membership = ? + SELECT c.state_key FROM current_state_events as c + /* Get the depth of the event from the events table */ + INNER JOIN events AS e USING (event_id) + WHERE c.type = 'm.room.member' AND c.room_id = ? AND membership = ? + /* Sorted by lowest depth first */ + ORDER BY e.depth ASC; """ else: sql = """ - SELECT state_key FROM room_memberships as m + SELECT c.state_key FROM room_memberships as m + /* Get the depth of the event from the events table */ + INNER JOIN events AS e USING (event_id) INNER JOIN current_state_events as c ON m.event_id = c.event_id AND m.room_id = c.room_id AND m.user_id = c.state_key WHERE c.type = 'm.room.member' AND c.room_id = ? AND m.membership = ? + /* Sorted by lowest depth first */ + ORDER BY e.depth ASC; """ txn.execute(sql, (room_id, Membership.JOIN)) @@ -1037,37 +1058,70 @@ class RoomMemberWorkerStore(EventsWorkerStore): return True @cached(iterable=True, max_entries=10000) - async def get_current_hosts_in_room(self, room_id: str) -> Set[str]: - """Get current hosts in room based on current state.""" + async def get_current_hosts_in_room(self, room_id: str) -> List[str]: + """ + Get current hosts in room based on current state. + + The heuristic of sorting by servers who have been in the room the + longest is good because they're most likely to have anything we ask + about. + + Returns: + Returns a list of servers sorted by longest in the room first. (aka. + sorted by join with the lowest depth first). + """ # First we check if we already have `get_users_in_room` in the cache, as # we can just calculate result from that users = self.get_users_in_room.cache.get_immediate( (room_id,), None, update_metrics=False ) - if users is not None: - return {get_domain_from_id(u) for u in users} - - if isinstance(self.database_engine, Sqlite3Engine): + if users is None and isinstance(self.database_engine, Sqlite3Engine): # If we're using SQLite then let's just always use # `get_users_in_room` rather than funky SQL. users = await self.get_users_in_room(room_id) - return {get_domain_from_id(u) for u in users} + + if users is not None: + # Because `users` is sorted from lowest -> highest depth, the list + # of domains will also be sorted that way. + domains: List[str] = [] + # We use a `Set` just for fast lookups + domain_set: Set[str] = set() + for u in users: + domain = get_domain_from_id(u) + if domain not in domain_set: + domain_set.add(domain) + domains.append(domain) + return domains # For PostgreSQL we can use a regex to pull out the domains from the # joined users in `current_state_events` via regex. - def get_current_hosts_in_room_txn(txn: LoggingTransaction) -> Set[str]: + def get_current_hosts_in_room_txn(txn: LoggingTransaction) -> List[str]: + # Returns a list of servers currently joined in the room sorted by + # longest in the room first (aka. with the lowest depth). The + # heuristic of sorting by servers who have been in the room the + # longest is good because they're most likely to have anything we + # ask about. sql = """ - SELECT DISTINCT substring(state_key FROM '@[^:]*:(.*)$') - FROM current_state_events + SELECT + /* Match the domain part of the MXID */ + substring(c.state_key FROM '@[^:]*:(.*)$') as server_domain + FROM current_state_events c + /* Get the depth of the event from the events table */ + INNER JOIN events AS e USING (event_id) WHERE - type = 'm.room.member' - AND membership = 'join' - AND room_id = ? + /* Find any join state events in the room */ + c.type = 'm.room.member' + AND c.membership = 'join' + AND c.room_id = ? + /* Group all state events from the same domain into their own buckets (groups) */ + GROUP BY server_domain + /* Sorted by lowest depth first */ + ORDER BY min(e.depth) ASC; """ txn.execute(sql, (room_id,)) - return {d for d, in txn} + return [d for d, in txn] return await self.db_pool.runInteraction( "get_current_hosts_in_room", get_current_hosts_in_room_txn -- cgit 1.5.1 From d3d9ca156e323fe194b1bcb1af1628f65a2f3c1c Mon Sep 17 00:00:00 2001 From: reivilibre Date: Wed, 7 Sep 2022 11:03:32 +0000 Subject: Cancel the processing of key query requests when they time out. (#13680) --- changelog.d/13680.feature | 1 + synapse/api/auth.py | 5 +++ synapse/handlers/device.py | 3 ++ synapse/handlers/e2e_keys.py | 40 +++++++++++++--------- synapse/rest/client/keys.py | 6 ++-- synapse/storage/controllers/state.py | 4 +++ synapse/storage/databases/main/devices.py | 4 +++ synapse/storage/databases/main/end_to_end_keys.py | 5 ++- synapse/storage/databases/main/event_federation.py | 2 ++ synapse/storage/databases/main/events_worker.py | 4 +++ synapse/storage/databases/main/roommember.py | 2 ++ synapse/storage/databases/main/state.py | 2 ++ synapse/storage/databases/main/stream.py | 2 ++ synapse/storage/databases/state/store.py | 3 ++ .../storage/util/partial_state_events_tracker.py | 3 ++ synapse/types.py | 5 +++ tests/http/server/_base.py | 10 +++++- tests/rest/client/test_keys.py | 29 ++++++++++++++++ 18 files changed, 110 insertions(+), 20 deletions(-) create mode 100644 changelog.d/13680.feature (limited to 'synapse/storage/controllers/state.py') diff --git a/changelog.d/13680.feature b/changelog.d/13680.feature new file mode 100644 index 0000000000..4234c7e082 --- /dev/null +++ b/changelog.d/13680.feature @@ -0,0 +1 @@ +Cancel the processing of key query requests when they time out. \ No newline at end of file diff --git a/synapse/api/auth.py b/synapse/api/auth.py index 9a1aea083f..8e54ef84b2 100644 --- a/synapse/api/auth.py +++ b/synapse/api/auth.py @@ -38,6 +38,7 @@ from synapse.logging.opentracing import ( trace, ) from synapse.types import Requester, create_requester +from synapse.util.cancellation import cancellable if TYPE_CHECKING: from synapse.server import HomeServer @@ -118,6 +119,7 @@ class Auth: errcode=Codes.NOT_JOINED, ) + @cancellable async def get_user_by_req( self, request: SynapseRequest, @@ -166,6 +168,7 @@ class Auth: parent_span.set_tag("appservice_id", requester.app_service.id) return requester + @cancellable async def _wrapped_get_user_by_req( self, request: SynapseRequest, @@ -281,6 +284,7 @@ class Auth: 403, "Application service has not registered this user (%s)" % user_id ) + @cancellable async def _get_appservice_user(self, request: Request) -> Optional[Requester]: """ Given a request, reads the request parameters to determine: @@ -523,6 +527,7 @@ class Auth: return bool(query_params) or bool(auth_headers) @staticmethod + @cancellable def get_access_token_from_request(request: Request) -> str: """Extracts the access_token from the request. diff --git a/synapse/handlers/device.py b/synapse/handlers/device.py index 9c2c3a0e68..c5ac169644 100644 --- a/synapse/handlers/device.py +++ b/synapse/handlers/device.py @@ -52,6 +52,7 @@ from synapse.types import ( from synapse.util import stringutils from synapse.util.async_helpers import Linearizer from synapse.util.caches.expiringcache import ExpiringCache +from synapse.util.cancellation import cancellable from synapse.util.metrics import measure_func from synapse.util.retryutils import NotRetryingDestination @@ -124,6 +125,7 @@ class DeviceWorkerHandler: return device + @cancellable async def get_device_changes_in_shared_rooms( self, user_id: str, room_ids: Collection[str], from_token: StreamToken ) -> Collection[str]: @@ -163,6 +165,7 @@ class DeviceWorkerHandler: @trace @measure_func("device.get_user_ids_changed") + @cancellable async def get_user_ids_changed( self, user_id: str, from_token: StreamToken ) -> JsonDict: diff --git a/synapse/handlers/e2e_keys.py b/synapse/handlers/e2e_keys.py index c938339ddd..ec81639c78 100644 --- a/synapse/handlers/e2e_keys.py +++ b/synapse/handlers/e2e_keys.py @@ -37,7 +37,8 @@ from synapse.types import ( get_verify_key_from_cross_signing_key, ) from synapse.util import json_decoder, unwrapFirstError -from synapse.util.async_helpers import Linearizer +from synapse.util.async_helpers import Linearizer, delay_cancellation +from synapse.util.cancellation import cancellable from synapse.util.retryutils import NotRetryingDestination if TYPE_CHECKING: @@ -91,6 +92,7 @@ class E2eKeysHandler: ) @trace + @cancellable async def query_devices( self, query_body: JsonDict, @@ -208,22 +210,26 @@ class E2eKeysHandler: r[user_id] = remote_queries[user_id] # Now fetch any devices that we don't have in our cache + # TODO It might make sense to propagate cancellations into the + # deferreds which are querying remote homeservers. await make_deferred_yieldable( - defer.gatherResults( - [ - run_in_background( - self._query_devices_for_destination, - results, - cross_signing_keys, - failures, - destination, - queries, - timeout, - ) - for destination, queries in remote_queries_not_in_cache.items() - ], - consumeErrors=True, - ).addErrback(unwrapFirstError) + delay_cancellation( + defer.gatherResults( + [ + run_in_background( + self._query_devices_for_destination, + results, + cross_signing_keys, + failures, + destination, + queries, + timeout, + ) + for destination, queries in remote_queries_not_in_cache.items() + ], + consumeErrors=True, + ).addErrback(unwrapFirstError) + ) ) ret = {"device_keys": results, "failures": failures} @@ -347,6 +353,7 @@ class E2eKeysHandler: return + @cancellable async def get_cross_signing_keys_from_cache( self, query: Iterable[str], from_user_id: Optional[str] ) -> Dict[str, Dict[str, dict]]: @@ -393,6 +400,7 @@ class E2eKeysHandler: } @trace + @cancellable async def query_local_devices( self, query: Mapping[str, Optional[List[str]]] ) -> Dict[str, Dict[str, dict]]: diff --git a/synapse/rest/client/keys.py b/synapse/rest/client/keys.py index a395694fa5..f653d2a3e1 100644 --- a/synapse/rest/client/keys.py +++ b/synapse/rest/client/keys.py @@ -27,9 +27,9 @@ from synapse.http.servlet import ( ) from synapse.http.site import SynapseRequest from synapse.logging.opentracing import log_kv, set_tag +from synapse.rest.client._base import client_patterns, interactive_auth_handler from synapse.types import JsonDict, StreamToken - -from ._base import client_patterns, interactive_auth_handler +from synapse.util.cancellation import cancellable if TYPE_CHECKING: from synapse.server import HomeServer @@ -156,6 +156,7 @@ class KeyQueryServlet(RestServlet): self.auth = hs.get_auth() self.e2e_keys_handler = hs.get_e2e_keys_handler() + @cancellable async def on_POST(self, request: SynapseRequest) -> Tuple[int, JsonDict]: requester = await self.auth.get_user_by_req(request, allow_guest=True) user_id = requester.user.to_string() @@ -199,6 +200,7 @@ class KeyChangesServlet(RestServlet): self.device_handler = hs.get_device_handler() self.store = hs.get_datastores().main + @cancellable async def on_GET(self, request: SynapseRequest) -> Tuple[int, JsonDict]: requester = await self.auth.get_user_by_req(request, allow_guest=True) diff --git a/synapse/storage/controllers/state.py b/synapse/storage/controllers/state.py index ba5380ce3e..bbe568bf05 100644 --- a/synapse/storage/controllers/state.py +++ b/synapse/storage/controllers/state.py @@ -36,6 +36,7 @@ from synapse.storage.util.partial_state_events_tracker import ( PartialStateEventsTracker, ) from synapse.types import MutableStateMap, StateMap +from synapse.util.cancellation import cancellable if TYPE_CHECKING: from synapse.server import HomeServer @@ -229,6 +230,7 @@ class StateStorageController: @trace @tag_args + @cancellable async def get_state_ids_for_events( self, event_ids: Collection[str], @@ -350,6 +352,7 @@ class StateStorageController: @trace @tag_args + @cancellable async def get_state_group_for_events( self, event_ids: Collection[str], @@ -398,6 +401,7 @@ class StateStorageController: event_id, room_id, prev_group, delta_ids, current_state_ids ) + @cancellable async def get_current_state_ids( self, room_id: str, diff --git a/synapse/storage/databases/main/devices.py b/synapse/storage/databases/main/devices.py index ca0fe8c4be..5d700ca6c3 100644 --- a/synapse/storage/databases/main/devices.py +++ b/synapse/storage/databases/main/devices.py @@ -53,6 +53,7 @@ from synapse.util import json_decoder, json_encoder from synapse.util.caches.descriptors import cached, cachedList from synapse.util.caches.lrucache import LruCache from synapse.util.caches.stream_change_cache import StreamChangeCache +from synapse.util.cancellation import cancellable from synapse.util.iterutils import batch_iter from synapse.util.stringutils import shortstr @@ -668,6 +669,7 @@ class DeviceWorkerStore(EndToEndKeyWorkerStore): ... @trace + @cancellable async def get_user_devices_from_cache( self, query_list: List[Tuple[str, Optional[str]]] ) -> Tuple[Set[str], Dict[str, Dict[str, JsonDict]]]: @@ -743,6 +745,7 @@ class DeviceWorkerStore(EndToEndKeyWorkerStore): return self._device_list_stream_cache.get_all_entities_changed(from_key) + @cancellable async def get_users_whose_devices_changed( self, from_key: int, @@ -1221,6 +1224,7 @@ class DeviceWorkerStore(EndToEndKeyWorkerStore): desc="get_min_device_lists_changes_in_room", ) + @cancellable async def get_device_list_changes_in_rooms( self, room_ids: Collection[str], from_id: int ) -> Optional[Set[str]]: diff --git a/synapse/storage/databases/main/end_to_end_keys.py b/synapse/storage/databases/main/end_to_end_keys.py index 46c0d06157..8e9e1b0b4b 100644 --- a/synapse/storage/databases/main/end_to_end_keys.py +++ b/synapse/storage/databases/main/end_to_end_keys.py @@ -50,6 +50,7 @@ from synapse.storage.util.id_generators import StreamIdGenerator from synapse.types import JsonDict from synapse.util import json_encoder from synapse.util.caches.descriptors import cached, cachedList +from synapse.util.cancellation import cancellable from synapse.util.iterutils import batch_iter if TYPE_CHECKING: @@ -135,6 +136,7 @@ class EndToEndKeyWorkerStore(EndToEndKeyBackgroundStore, CacheInvalidationWorker return now_stream_id, [] @trace + @cancellable async def get_e2e_device_keys_for_cs_api( self, query_list: List[Tuple[str, Optional[str]]] ) -> Dict[str, Dict[str, JsonDict]]: @@ -197,6 +199,7 @@ class EndToEndKeyWorkerStore(EndToEndKeyBackgroundStore, CacheInvalidationWorker ... @trace + @cancellable async def get_e2e_device_keys_and_signatures( self, query_list: Collection[Tuple[str, Optional[str]]], @@ -887,6 +890,7 @@ class EndToEndKeyWorkerStore(EndToEndKeyBackgroundStore, CacheInvalidationWorker return keys + @cancellable async def get_e2e_cross_signing_keys_bulk( self, user_ids: List[str], from_user_id: Optional[str] = None ) -> Dict[str, Optional[Dict[str, JsonDict]]]: @@ -902,7 +906,6 @@ class EndToEndKeyWorkerStore(EndToEndKeyBackgroundStore, CacheInvalidationWorker keys were not found, either their user ID will not be in the dict, or their user ID will map to None. """ - result = await self._get_bare_e2e_cross_signing_keys_bulk(user_ids) if from_user_id: diff --git a/synapse/storage/databases/main/event_federation.py b/synapse/storage/databases/main/event_federation.py index e687f87eca..ca47a22bf1 100644 --- a/synapse/storage/databases/main/event_federation.py +++ b/synapse/storage/databases/main/event_federation.py @@ -48,6 +48,7 @@ from synapse.types import JsonDict from synapse.util import json_encoder from synapse.util.caches.descriptors import cached from synapse.util.caches.lrucache import LruCache +from synapse.util.cancellation import cancellable from synapse.util.iterutils import batch_iter if TYPE_CHECKING: @@ -976,6 +977,7 @@ class EventFederationWorkerStore(SignatureWorkerStore, EventsWorkerStore, SQLBas return int(min_depth) if min_depth is not None else None + @cancellable async def get_forward_extremities_for_room_at_stream_ordering( self, room_id: str, stream_ordering: int ) -> List[str]: diff --git a/synapse/storage/databases/main/events_worker.py b/synapse/storage/databases/main/events_worker.py index 84f17a9945..52914febf9 100644 --- a/synapse/storage/databases/main/events_worker.py +++ b/synapse/storage/databases/main/events_worker.py @@ -81,6 +81,7 @@ from synapse.util import unwrapFirstError from synapse.util.async_helpers import ObservableDeferred, delay_cancellation from synapse.util.caches.descriptors import cached, cachedList from synapse.util.caches.lrucache import AsyncLruCache +from synapse.util.cancellation import cancellable from synapse.util.iterutils import batch_iter from synapse.util.metrics import Measure @@ -339,6 +340,7 @@ class EventsWorkerStore(SQLBaseStore): ) -> Optional[EventBase]: ... + @cancellable async def get_event( self, event_id: str, @@ -433,6 +435,7 @@ class EventsWorkerStore(SQLBaseStore): @trace @tag_args + @cancellable async def get_events_as_list( self, event_ids: Collection[str], @@ -584,6 +587,7 @@ class EventsWorkerStore(SQLBaseStore): return events + @cancellable async def _get_events_from_cache_or_db( self, event_ids: Iterable[str], allow_rejected: bool = False ) -> Dict[str, EventCacheEntry]: diff --git a/synapse/storage/databases/main/roommember.py b/synapse/storage/databases/main/roommember.py index 4f0adb136a..a77e49dc66 100644 --- a/synapse/storage/databases/main/roommember.py +++ b/synapse/storage/databases/main/roommember.py @@ -55,6 +55,7 @@ from synapse.types import JsonDict, PersistedEventPosition, StateMap, get_domain from synapse.util.async_helpers import Linearizer from synapse.util.caches import intern_string from synapse.util.caches.descriptors import _CacheContext, cached, cachedList +from synapse.util.cancellation import cancellable from synapse.util.iterutils import batch_iter from synapse.util.metrics import Measure @@ -770,6 +771,7 @@ class RoomMemberWorkerStore(EventsWorkerStore): _get_users_server_still_shares_room_with_txn, ) + @cancellable async def get_rooms_for_user( self, user_id: str, on_invalidate: Optional[Callable[[], None]] = None ) -> FrozenSet[str]: diff --git a/synapse/storage/databases/main/state.py b/synapse/storage/databases/main/state.py index 0b10af0e58..e607ccfdc9 100644 --- a/synapse/storage/databases/main/state.py +++ b/synapse/storage/databases/main/state.py @@ -36,6 +36,7 @@ from synapse.storage.state import StateFilter from synapse.types import JsonDict, JsonMapping, StateMap from synapse.util.caches import intern_string from synapse.util.caches.descriptors import cached, cachedList +from synapse.util.cancellation import cancellable from synapse.util.iterutils import batch_iter if TYPE_CHECKING: @@ -281,6 +282,7 @@ class StateGroupWorkerStore(EventsWorkerStore, SQLBaseStore): ) # FIXME: how should this be cached? + @cancellable async def get_partial_filtered_current_state_ids( self, room_id: str, state_filter: Optional[StateFilter] = None ) -> StateMap[str]: diff --git a/synapse/storage/databases/main/stream.py b/synapse/storage/databases/main/stream.py index a347430aa7..3f9bfaeac5 100644 --- a/synapse/storage/databases/main/stream.py +++ b/synapse/storage/databases/main/stream.py @@ -72,6 +72,7 @@ from synapse.storage.util.id_generators import MultiWriterIdGenerator from synapse.types import PersistedEventPosition, RoomStreamToken from synapse.util.caches.descriptors import cached from synapse.util.caches.stream_change_cache import StreamChangeCache +from synapse.util.cancellation import cancellable if TYPE_CHECKING: from synapse.server import HomeServer @@ -597,6 +598,7 @@ class StreamWorkerStore(EventsWorkerStore, SQLBaseStore): return ret, key + @cancellable async def get_membership_changes_for_user( self, user_id: str, diff --git a/synapse/storage/databases/state/store.py b/synapse/storage/databases/state/store.py index bb64543c1f..f8cfcaca83 100644 --- a/synapse/storage/databases/state/store.py +++ b/synapse/storage/databases/state/store.py @@ -31,6 +31,7 @@ from synapse.storage.util.sequence import build_sequence_generator from synapse.types import MutableStateMap, StateKey, StateMap from synapse.util.caches.descriptors import cached from synapse.util.caches.dictionary_cache import DictionaryCache +from synapse.util.cancellation import cancellable if TYPE_CHECKING: from synapse.server import HomeServer @@ -156,6 +157,7 @@ class StateGroupDataStore(StateBackgroundUpdateStore, SQLBaseStore): "get_state_group_delta", _get_state_group_delta_txn ) + @cancellable async def _get_state_groups_from_groups( self, groups: List[int], state_filter: StateFilter ) -> Dict[int, StateMap[str]]: @@ -235,6 +237,7 @@ class StateGroupDataStore(StateBackgroundUpdateStore, SQLBaseStore): return state_filter.filter_state(state_dict_ids), not missing_types + @cancellable async def _get_state_for_groups( self, groups: Iterable[int], state_filter: Optional[StateFilter] = None ) -> Dict[int, MutableStateMap[str]]: diff --git a/synapse/storage/util/partial_state_events_tracker.py b/synapse/storage/util/partial_state_events_tracker.py index b4bf49dace..8d8894d1d5 100644 --- a/synapse/storage/util/partial_state_events_tracker.py +++ b/synapse/storage/util/partial_state_events_tracker.py @@ -24,6 +24,7 @@ from synapse.logging.opentracing import trace_with_opname from synapse.storage.databases.main.events_worker import EventsWorkerStore from synapse.storage.databases.main.room import RoomWorkerStore from synapse.util import unwrapFirstError +from synapse.util.cancellation import cancellable logger = logging.getLogger(__name__) @@ -60,6 +61,7 @@ class PartialStateEventsTracker: o.callback(None) @trace_with_opname("PartialStateEventsTracker.await_full_state") + @cancellable async def await_full_state(self, event_ids: Collection[str]) -> None: """Wait for all the given events to have full state. @@ -154,6 +156,7 @@ class PartialCurrentStateTracker: o.callback(None) @trace_with_opname("PartialCurrentStateTracker.await_full_state") + @cancellable async def await_full_state(self, room_id: str) -> None: # We add the deferred immediately so that the DB call to check for # partial state doesn't race when we unpartial the room. diff --git a/synapse/types.py b/synapse/types.py index 668d48d646..ec44601f54 100644 --- a/synapse/types.py +++ b/synapse/types.py @@ -52,6 +52,7 @@ from twisted.internet.interfaces import ( ) from synapse.api.errors import Codes, SynapseError +from synapse.util.cancellation import cancellable from synapse.util.stringutils import parse_and_validate_server_name if TYPE_CHECKING: @@ -699,7 +700,11 @@ class StreamToken: START: ClassVar["StreamToken"] @classmethod + @cancellable async def from_string(cls, store: "DataStore", string: str) -> "StreamToken": + """ + Creates a RoomStreamToken from its textual representation. + """ try: keys = string.split(cls._SEPARATOR) while len(keys) < len(attr.fields(cls)): diff --git a/tests/http/server/_base.py b/tests/http/server/_base.py index 5726e60cee..5071f83574 100644 --- a/tests/http/server/_base.py +++ b/tests/http/server/_base.py @@ -140,6 +140,8 @@ def make_request_with_cancellation_test( method: str, path: str, content: Union[bytes, str, JsonDict] = b"", + *, + token: Optional[str] = None, ) -> FakeChannel: """Performs a request repeatedly, disconnecting at successive `await`s, until one completes. @@ -211,7 +213,13 @@ def make_request_with_cancellation_test( with deferred_patch.patch(): # Start the request. channel = make_request( - reactor, site, method, path, content, await_result=False + reactor, + site, + method, + path, + content, + await_result=False, + access_token=token, ) request = channel.request diff --git a/tests/rest/client/test_keys.py b/tests/rest/client/test_keys.py index bbc8e74243..741fecea77 100644 --- a/tests/rest/client/test_keys.py +++ b/tests/rest/client/test_keys.py @@ -19,6 +19,7 @@ from synapse.rest import admin from synapse.rest.client import keys, login from tests import unittest +from tests.http.server._base import make_request_with_cancellation_test class KeyQueryTestCase(unittest.HomeserverTestCase): @@ -89,3 +90,31 @@ class KeyQueryTestCase(unittest.HomeserverTestCase): Codes.BAD_JSON, channel.result, ) + + def test_key_query_cancellation(self) -> None: + """ + Tests that /keys/query is cancellable and does not swallow the + CancelledError. + """ + self.register_user("alice", "wonderland") + alice_token = self.login("alice", "wonderland") + + bob = self.register_user("bob", "uncle") + + channel = make_request_with_cancellation_test( + "test_key_query_cancellation", + self.reactor, + self.site, + "POST", + "/_matrix/client/r0/keys/query", + { + "device_keys": { + # Empty list means we request keys for all bob's devices + bob: [], + }, + }, + token=alice_token, + ) + + self.assertEqual(200, channel.code, msg=channel.result["body"]) + self.assertIn(bob, channel.json_body["device_keys"]) -- cgit 1.5.1 From 03c2bfb7f89d637930da52723161ce74d4f89233 Mon Sep 17 00:00:00 2001 From: Sean Quah <8349537+squahtx@users.noreply.github.com> Date: Fri, 23 Sep 2022 13:44:03 +0100 Subject: Send device list updates out to servers in partially joined rooms (#13874) Use the provided list of servers in the room from the `/send_join` response, since we will not know which users are in the room. This isn't sufficient to ensure that all remote servers receive the right device list updates, since the `/send_join` response may be inaccurate or we may calculate the membership state of new users in the room incorrectly. Signed-off-by: Sean Quah --- changelog.d/13874.misc | 1 + synapse/handlers/device.py | 6 ++++- synapse/storage/controllers/state.py | 44 +++++++++++++++++++++++++++++++++- synapse/storage/databases/main/room.py | 17 +++++++++++++ 4 files changed, 66 insertions(+), 2 deletions(-) create mode 100644 changelog.d/13874.misc (limited to 'synapse/storage/controllers/state.py') diff --git a/changelog.d/13874.misc b/changelog.d/13874.misc new file mode 100644 index 0000000000..499e488c35 --- /dev/null +++ b/changelog.d/13874.misc @@ -0,0 +1 @@ +Faster room joins: Send device list updates to most servers in rooms with partial state. diff --git a/synapse/handlers/device.py b/synapse/handlers/device.py index 901e2310b7..6566b3bf3d 100644 --- a/synapse/handlers/device.py +++ b/synapse/handlers/device.py @@ -688,11 +688,15 @@ class DeviceHandler(DeviceWorkerHandler): # Ignore any users that aren't ours if self.hs.is_mine_id(user_id): hosts = set( - await self._storage_controllers.state.get_current_hosts_in_room( + await self._storage_controllers.state.get_current_hosts_in_room_or_partial_state_approximation( room_id ) ) hosts.discard(self.server_name) + # For rooms with partial state, `hosts` is merely an + # approximation. When we transition to a full state room, we + # will have to send out device list updates to any servers we + # missed. # Check if we've already sent this update to some hosts if current_stream_id == stream_id: diff --git a/synapse/storage/controllers/state.py b/synapse/storage/controllers/state.py index bbe568bf05..b1aa17047c 100644 --- a/synapse/storage/controllers/state.py +++ b/synapse/storage/controllers/state.py @@ -23,6 +23,7 @@ from typing import ( List, Mapping, Optional, + Sequence, Tuple, ) @@ -524,12 +525,53 @@ class StateStorageController: return state_map.get(key) async def get_current_hosts_in_room(self, room_id: str) -> List[str]: - """Get current hosts in room based on current state.""" + """Get current hosts in room based on current state. + + Blocks until we have full state for the given room. This only happens for rooms + with partial state. + + Returns: + A list of hosts in the room, sorted by longest in the room first. (aka. + sorted by join with the lowest depth first). + """ await self._partial_state_room_tracker.await_full_state(room_id) return await self.stores.main.get_current_hosts_in_room(room_id) + async def get_current_hosts_in_room_or_partial_state_approximation( + self, room_id: str + ) -> Sequence[str]: + """Get approximation of current hosts in room based on current state. + + For rooms with full state, this is equivalent to `get_current_hosts_in_room`, + with the same order of results. + + For rooms with partial state, no blocking occurs. Instead, the list of hosts + in the room at the time of joining is combined with the list of hosts which + joined the room afterwards. The returned list may include hosts that are not + actually in the room and exclude hosts that are in the room, since we may + calculate state incorrectly during the partial state phase. The order of results + is arbitrary for rooms with partial state. + """ + # We have to read this list first to mitigate races with un-partial stating. + # This will be empty for rooms with full state. + hosts_at_join = await self.stores.main.get_partial_state_servers_at_join( + room_id + ) + + hosts_from_state = await self.stores.main.get_current_hosts_in_room(room_id) + hosts_from_state_set = set(hosts_from_state) + + # First take the list of hosts based on the current state. + # For rooms with partial state, this will be missing most hosts. + hosts = list(hosts_from_state) + # Then add in the list of hosts in the room at the time we joined. + # This will be an empty list for rooms with full state. + hosts.extend(host for host in hosts_at_join if host not in hosts_from_state_set) + + return hosts + async def get_users_in_room_with_profiles( self, room_id: str ) -> Dict[str, ProfileInfo]: diff --git a/synapse/storage/databases/main/room.py b/synapse/storage/databases/main/room.py index bef66f1992..5dd116d766 100644 --- a/synapse/storage/databases/main/room.py +++ b/synapse/storage/databases/main/room.py @@ -25,6 +25,7 @@ from typing import ( List, Mapping, Optional, + Sequence, Tuple, Union, cast, @@ -1133,6 +1134,22 @@ class RoomWorkerStore(CacheInvalidationWorkerStore): get_rooms_for_retention_period_in_range_txn, ) + async def get_partial_state_servers_at_join(self, room_id: str) -> Sequence[str]: + """Gets the list of servers in a partial state room at the time we joined it. + + Returns: + The `servers_in_room` list from the `/send_join` response for partial state + rooms. May not be accurate or complete, as it comes from a remote + homeserver. + An empty list for full state rooms. + """ + return await self.db_pool.simple_select_onecol( + "partial_state_rooms_servers", + keyvalues={"room_id": room_id}, + retcol="server_name", + desc="get_partial_state_servers_at_join", + ) + async def get_partial_state_rooms_and_servers( self, ) -> Mapping[str, Collection[str]]: -- cgit 1.5.1 From f49f73c0da5502792c65d3de1ffd352ceb6af562 Mon Sep 17 00:00:00 2001 From: Sean Quah <8349537+squahtx@users.noreply.github.com> Date: Fri, 23 Sep 2022 17:55:15 +0100 Subject: Faster room joins: Avoid blocking `/keys/changes` (#13888) Part of the work for #12993. Once #12993 is fully resolved, we expect `/keys/changes` to behave sensibly when joined to a room with partial state. Signed-off-by: Sean Quah --- changelog.d/13888.misc | 1 + synapse/handlers/device.py | 7 +++++-- synapse/storage/controllers/state.py | 7 ++++++- 3 files changed, 12 insertions(+), 3 deletions(-) create mode 100644 changelog.d/13888.misc (limited to 'synapse/storage/controllers/state.py') diff --git a/changelog.d/13888.misc b/changelog.d/13888.misc new file mode 100644 index 0000000000..4ffd9bcede --- /dev/null +++ b/changelog.d/13888.misc @@ -0,0 +1 @@ +Faster room joins: Avoid waiting for full state when processing `/keys/changes` requests. diff --git a/synapse/handlers/device.py b/synapse/handlers/device.py index 6566b3bf3d..bad262731c 100644 --- a/synapse/handlers/device.py +++ b/synapse/handlers/device.py @@ -195,7 +195,9 @@ class DeviceWorkerHandler: possibly_changed = set(changed) possibly_left = set() for room_id in rooms_changed: - current_state_ids = await self._state_storage.get_current_state_ids(room_id) + current_state_ids = await self._state_storage.get_current_state_ids( + room_id, await_full_state=False + ) # The user may have left the room # TODO: Check if they actually did or if we were just invited. @@ -234,7 +236,8 @@ class DeviceWorkerHandler: # mapping from event_id -> state_dict prev_state_ids = await self._state_storage.get_state_ids_for_events( - event_ids + event_ids, + await_full_state=False, ) # Check if we've joined the room? If so we just blindly add all the users to diff --git a/synapse/storage/controllers/state.py b/synapse/storage/controllers/state.py index b1aa17047c..bb60130afe 100644 --- a/synapse/storage/controllers/state.py +++ b/synapse/storage/controllers/state.py @@ -407,6 +407,7 @@ class StateStorageController: self, room_id: str, state_filter: Optional[StateFilter] = None, + await_full_state: bool = True, on_invalidate: Optional[Callable[[], None]] = None, ) -> StateMap[str]: """Get the current state event ids for a room based on the @@ -419,13 +420,17 @@ class StateStorageController: room_id: The room to get the state IDs of. state_filter: The state filter used to fetch state from the database. + await_full_state: if true, will block if we do not yet have complete + state for the room. on_invalidate: Callback for when the `get_current_state_ids` cache for the room gets invalidated. Returns: The current state of the room. """ - if not state_filter or state_filter.must_await_full_state(self._is_mine_id): + if await_full_state and ( + not state_filter or state_filter.must_await_full_state(self._is_mine_id) + ): await self._partial_state_room_tracker.await_full_state(room_id) if state_filter and not state_filter.is_full(): -- cgit 1.5.1 From 3dfc4a08dc2e77178f2c2af68dc14b32da2d8b8f Mon Sep 17 00:00:00 2001 From: Erik Johnston Date: Fri, 30 Sep 2022 13:15:32 +0100 Subject: Fix performance regression in `get_users_in_room` (#13972) Fixes #13942. Introduced in #13575. Basically, let's only get the ordered set of hosts out of the DB if we need an ordered set of hosts. Since we split the function up the caching won't be as good, but I think it will still be fine as e.g. multiple backfill requests for the same room will hit the cache. --- changelog.d/13972.bugfix | 1 + synapse/handlers/federation.py | 4 +- synapse/handlers/room.py | 4 +- synapse/storage/controllers/state.py | 30 ++++--- synapse/storage/databases/main/roommember.py | 129 +++++++++++++++------------ 5 files changed, 98 insertions(+), 70 deletions(-) create mode 100644 changelog.d/13972.bugfix (limited to 'synapse/storage/controllers/state.py') diff --git a/changelog.d/13972.bugfix b/changelog.d/13972.bugfix new file mode 100644 index 0000000000..4c1e19ef8c --- /dev/null +++ b/changelog.d/13972.bugfix @@ -0,0 +1 @@ +Fix a performance regression in the `get_users_in_room` database query. Introduced in v1.67.0. diff --git a/synapse/handlers/federation.py b/synapse/handlers/federation.py index b866258298..986ffed3d5 100644 --- a/synapse/handlers/federation.py +++ b/synapse/handlers/federation.py @@ -412,7 +412,9 @@ class FederationHandler: # First we try hosts that are already in the room. # TODO: HEURISTIC ALERT. likely_domains = ( - await self._storage_controllers.state.get_current_hosts_in_room(room_id) + await self._storage_controllers.state.get_current_hosts_in_room_ordered( + room_id + ) ) async def try_backfill(domains: Collection[str]) -> bool: diff --git a/synapse/handlers/room.py b/synapse/handlers/room.py index b220238e55..57ab05ad25 100644 --- a/synapse/handlers/room.py +++ b/synapse/handlers/room.py @@ -1540,7 +1540,9 @@ class TimestampLookupHandler: ) likely_domains = ( - await self._storage_controllers.state.get_current_hosts_in_room(room_id) + await self._storage_controllers.state.get_current_hosts_in_room_ordered( + room_id + ) ) # Loop through each homeserver candidate until we get a succesful response diff --git a/synapse/storage/controllers/state.py b/synapse/storage/controllers/state.py index bb60130afe..2b31ce54bb 100644 --- a/synapse/storage/controllers/state.py +++ b/synapse/storage/controllers/state.py @@ -23,7 +23,7 @@ from typing import ( List, Mapping, Optional, - Sequence, + Set, Tuple, ) @@ -529,7 +529,18 @@ class StateStorageController: ) return state_map.get(key) - async def get_current_hosts_in_room(self, room_id: str) -> List[str]: + async def get_current_hosts_in_room(self, room_id: str) -> Set[str]: + """Get current hosts in room based on current state. + + Blocks until we have full state for the given room. This only happens for rooms + with partial state. + """ + + await self._partial_state_room_tracker.await_full_state(room_id) + + return await self.stores.main.get_current_hosts_in_room(room_id) + + async def get_current_hosts_in_room_ordered(self, room_id: str) -> List[str]: """Get current hosts in room based on current state. Blocks until we have full state for the given room. This only happens for rooms @@ -542,11 +553,11 @@ class StateStorageController: await self._partial_state_room_tracker.await_full_state(room_id) - return await self.stores.main.get_current_hosts_in_room(room_id) + return await self.stores.main.get_current_hosts_in_room_ordered(room_id) async def get_current_hosts_in_room_or_partial_state_approximation( self, room_id: str - ) -> Sequence[str]: + ) -> Collection[str]: """Get approximation of current hosts in room based on current state. For rooms with full state, this is equivalent to `get_current_hosts_in_room`, @@ -566,14 +577,9 @@ class StateStorageController: ) hosts_from_state = await self.stores.main.get_current_hosts_in_room(room_id) - hosts_from_state_set = set(hosts_from_state) - - # First take the list of hosts based on the current state. - # For rooms with partial state, this will be missing most hosts. - hosts = list(hosts_from_state) - # Then add in the list of hosts in the room at the time we joined. - # This will be an empty list for rooms with full state. - hosts.extend(host for host in hosts_at_join if host not in hosts_from_state_set) + + hosts = set(hosts_at_join) + hosts.update(hosts_from_state) return hosts diff --git a/synapse/storage/databases/main/roommember.py b/synapse/storage/databases/main/roommember.py index 982e1f08e3..2337289d88 100644 --- a/synapse/storage/databases/main/roommember.py +++ b/synapse/storage/databases/main/roommember.py @@ -146,42 +146,37 @@ class RoomMemberWorkerStore(EventsWorkerStore): @cached(max_entries=100000, iterable=True) async def get_users_in_room(self, room_id: str) -> List[str]: - """ - Returns a list of users in the room sorted by longest in the room first - (aka. with the lowest depth). This is done to match the sort in - `get_current_hosts_in_room()` and so we can re-use the cache but it's - not horrible to have here either. - - Uses `m.room.member`s in the room state at the current forward extremities to - determine which users are in the room. + """Returns a list of users in the room. Will return inaccurate results for rooms with partial state, since the state for the forward extremities of those rooms will exclude most members. We may also calculate room state incorrectly for such rooms and believe that a member is or is not in the room when the opposite is true. """ - return await self.db_pool.runInteraction( - "get_users_in_room", self.get_users_in_room_txn, room_id + return await self.db_pool.simple_select_onecol( + table="current_state_events", + keyvalues={ + "type": EventTypes.Member, + "room_id": room_id, + "membership": Membership.JOIN, + }, + retcol="state_key", + desc="get_users_in_room", ) def get_users_in_room_txn(self, txn: LoggingTransaction, room_id: str) -> List[str]: - """ - Returns a list of users in the room sorted by longest in the room first - (aka. with the lowest depth). This is done to match the sort in - `get_current_hosts_in_room()` and so we can re-use the cache but it's - not horrible to have here either. - """ - sql = """ - SELECT c.state_key FROM current_state_events as c - /* Get the depth of the event from the events table */ - INNER JOIN events AS e USING (event_id) - WHERE c.type = 'm.room.member' AND c.room_id = ? AND membership = ? - /* Sorted by lowest depth first */ - ORDER BY e.depth ASC; - """ + """Returns a list of users in the room.""" - txn.execute(sql, (room_id, Membership.JOIN)) - return [r[0] for r in txn] + return self.db_pool.simple_select_onecol_txn( + txn, + table="current_state_events", + keyvalues={ + "type": EventTypes.Member, + "room_id": room_id, + "membership": Membership.JOIN, + }, + retcol="state_key", + ) @cached() def get_user_in_room_with_profile( @@ -931,7 +926,44 @@ class RoomMemberWorkerStore(EventsWorkerStore): return True @cached(iterable=True, max_entries=10000) - async def get_current_hosts_in_room(self, room_id: str) -> List[str]: + async def get_current_hosts_in_room(self, room_id: str) -> Set[str]: + """Get current hosts in room based on current state.""" + + # First we check if we already have `get_users_in_room` in the cache, as + # we can just calculate result from that + users = self.get_users_in_room.cache.get_immediate( + (room_id,), None, update_metrics=False + ) + if users is not None: + return {get_domain_from_id(u) for u in users} + + if isinstance(self.database_engine, Sqlite3Engine): + # If we're using SQLite then let's just always use + # `get_users_in_room` rather than funky SQL. + users = await self.get_users_in_room(room_id) + return {get_domain_from_id(u) for u in users} + + # For PostgreSQL we can use a regex to pull out the domains from the + # joined users in `current_state_events` via regex. + + def get_current_hosts_in_room_txn(txn: LoggingTransaction) -> Set[str]: + sql = """ + SELECT DISTINCT substring(state_key FROM '@[^:]*:(.*)$') + FROM current_state_events + WHERE + type = 'm.room.member' + AND membership = 'join' + AND room_id = ? + """ + txn.execute(sql, (room_id,)) + return {d for d, in txn} + + return await self.db_pool.runInteraction( + "get_current_hosts_in_room", get_current_hosts_in_room_txn + ) + + @cached(iterable=True, max_entries=10000) + async def get_current_hosts_in_room_ordered(self, room_id: str) -> List[str]: """ Get current hosts in room based on current state. @@ -939,48 +971,33 @@ class RoomMemberWorkerStore(EventsWorkerStore): longest is good because they're most likely to have anything we ask about. - Uses `m.room.member`s in the room state at the current forward extremities to - determine which hosts are in the room. + For SQLite the returned list is not ordered, as SQLite doesn't support + the appropriate SQL. - Will return inaccurate results for rooms with partial state, since the state for - the forward extremities of those rooms will exclude most members. We may also - calculate room state incorrectly for such rooms and believe that a host is or - is not in the room when the opposite is true. + Uses `m.room.member`s in the room state at the current forward + extremities to determine which hosts are in the room. + + Will return inaccurate results for rooms with partial state, since the + state for the forward extremities of those rooms will exclude most + members. We may also calculate room state incorrectly for such rooms and + believe that a host is or is not in the room when the opposite is true. Returns: Returns a list of servers sorted by longest in the room first. (aka. sorted by join with the lowest depth first). """ - # First we check if we already have `get_users_in_room` in the cache, as - # we can just calculate result from that - users = self.get_users_in_room.cache.get_immediate( - (room_id,), None, update_metrics=False - ) - if users is None and isinstance(self.database_engine, Sqlite3Engine): + if isinstance(self.database_engine, Sqlite3Engine): # If we're using SQLite then let's just always use # `get_users_in_room` rather than funky SQL. - users = await self.get_users_in_room(room_id) - if users is not None: - # Because `users` is sorted from lowest -> highest depth, the list - # of domains will also be sorted that way. - domains: List[str] = [] - # We use a `Set` just for fast lookups - domain_set: Set[str] = set() - for u in users: - if ":" not in u: - continue - domain = get_domain_from_id(u) - if domain not in domain_set: - domain_set.add(domain) - domains.append(domain) - return domains + domains = await self.get_current_hosts_in_room(room_id) + return list(domains) # For PostgreSQL we can use a regex to pull out the domains from the # joined users in `current_state_events` via regex. - def get_current_hosts_in_room_txn(txn: LoggingTransaction) -> List[str]: + def get_current_hosts_in_room_ordered_txn(txn: LoggingTransaction) -> List[str]: # Returns a list of servers currently joined in the room sorted by # longest in the room first (aka. with the lowest depth). The # heuristic of sorting by servers who have been in the room the @@ -1008,7 +1025,7 @@ class RoomMemberWorkerStore(EventsWorkerStore): return [d for d, in txn if d is not None] return await self.db_pool.runInteraction( - "get_current_hosts_in_room", get_current_hosts_in_room_txn + "get_current_hosts_in_room_ordered", get_current_hosts_in_room_ordered_txn ) async def get_joined_hosts( -- cgit 1.5.1