Diffstat (limited to 'synapse/handlers/federation_event.py')
-rw-r--r-- | synapse/handlers/federation_event.py | 446 |
1 file changed, 277 insertions, 169 deletions
diff --git a/synapse/handlers/federation_event.py b/synapse/handlers/federation_event.py index ace7adcffb..0cc8e990d9 100644 --- a/synapse/handlers/federation_event.py +++ b/synapse/handlers/federation_event.py @@ -43,8 +43,11 @@ from synapse.api.constants import ( from synapse.api.errors import ( AuthError, Codes, + EventSizeError, FederationError, + FederationPullAttemptBackoffError, HttpResponseException, + PartialStateConflictError, RequestSendFailed, SynapseError, ) @@ -56,8 +59,8 @@ from synapse.event_auth import ( validate_event_for_room_version, ) from synapse.events import EventBase -from synapse.events.snapshot import EventContext -from synapse.federation.federation_client import InvalidResponseError +from synapse.events.snapshot import EventContext, UnpersistedEventContextBase +from synapse.federation.federation_client import InvalidResponseError, PulledPduInfo from synapse.logging.context import nested_logging_context from synapse.logging.opentracing import ( SynapseTags, @@ -67,23 +70,25 @@ from synapse.logging.opentracing import ( trace, ) from synapse.metrics.background_process_metrics import run_as_background_process -from synapse.replication.http.devices import ReplicationUserDevicesResyncRestServlet +from synapse.replication.http.devices import ( + ReplicationMultiUserDevicesResyncRestServlet, +) from synapse.replication.http.federation import ( ReplicationFederationSendEventsRestServlet, ) from synapse.state import StateResolutionStore -from synapse.storage.databases.main.events import PartialStateConflictError from synapse.storage.databases.main.events_worker import EventRedactBehaviour -from synapse.storage.state import StateFilter from synapse.types import ( PersistedEventPosition, RoomStreamToken, StateMap, + StrCollection, UserID, get_domain_from_id, ) +from synapse.types.state import StateFilter from synapse.util.async_helpers import Linearizer, concurrently_execute -from synapse.util.iterutils import batch_iter +from synapse.util.iterutils import batch_iter, partition from synapse.util.retryutils import NotRetryingDestination from synapse.util.stringutils import shortstr @@ -137,6 +142,7 @@ class FederationEventHandler: """ def __init__(self, hs: "HomeServer"): + self._clock = hs.get_clock() self._store = hs.get_datastores().main self._storage_controllers = hs.get_storage_controllers() self._state_storage_controller = self._storage_controllers.state @@ -151,10 +157,13 @@ class FederationEventHandler: self._get_room_member_handler = hs.get_room_member_handler self._federation_client = hs.get_federation_client() - self._third_party_event_rules = hs.get_third_party_event_rules() + self._third_party_event_rules = ( + hs.get_module_api_callbacks().third_party_event_rules + ) self._notifier = hs.get_notifier() self._is_mine_id = hs.is_mine_id + self._is_mine_server_name = hs.is_mine_server_name self._server_name = hs.hostname self._instance_name = hs.get_instance_name() @@ -163,8 +172,8 @@ class FederationEventHandler: self._send_events = ReplicationFederationSendEventsRestServlet.make_client(hs) if hs.config.worker.worker_app: - self._user_device_resync = ( - ReplicationUserDevicesResyncRestServlet.make_client(hs) + self._multi_user_device_resync = ( + ReplicationMultiUserDevicesResyncRestServlet.make_client(hs) ) else: self._device_list_updater = hs.get_device_handler().device_list_updater @@ -238,7 +247,7 @@ class FederationEventHandler: # # Note that if we were never in the room then we would have already # dropped the event, since we wouldn't know the room version. 
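The constructor now routes device resyncs through the multi-user API in both deployment modes: workers go via a replication client, while the main process calls the device list updater directly (the `_resync_device` hunk later in this diff uses whichever was set up here). A minimal sketch of that routing, using stand-in classes rather than Synapse's real ones:

# Sketch only: stand-in classes showing how a worker resyncs devices over
# replication while the main process calls the device list updater directly.
import asyncio
from typing import List


class StubReplicationResyncClient:
    """Stands in for the ReplicationMultiUserDevicesResyncRestServlet client."""

    async def __call__(self, user_ids: List[str]) -> None:
        print("replication request: resync devices for", user_ids)


class StubDeviceListUpdater:
    """Stands in for the main process's device list updater."""

    async def multi_user_device_resync(self, user_ids: List[str]) -> None:
        print("direct resync for", user_ids)


class ResyncRouter:
    def __init__(self, is_worker: bool) -> None:
        # Mirrors the constructor change: workers get a replication client,
        # the main process keeps a direct handle on the updater.
        self._multi_user_device_resync = (
            StubReplicationResyncClient() if is_worker else None
        )
        self._device_list_updater = None if is_worker else StubDeviceListUpdater()

    async def resync(self, sender: str) -> None:
        # The real handler first marks the sender's device cache as stale via
        # mark_remote_users_device_caches_as_stale((sender,)).
        if self._multi_user_device_resync is not None:
            await self._multi_user_device_resync(user_ids=[sender])
        else:
            await self._device_list_updater.multi_user_device_resync(user_ids=[sender])


asyncio.run(ResyncRouter(is_worker=True).resync("@alice:example.org"))
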
- is_in_room = await self._event_auth_handler.check_host_in_room( + is_in_room = await self._event_auth_handler.is_host_in_room( room_id, self._server_name ) if not is_in_room: @@ -414,14 +423,18 @@ class FederationEventHandler: # First, precalculate the joined hosts so that the federation sender doesn't # need to. - await self._event_creation_handler.cache_joined_hosts_for_event(event, context) + await self._event_creation_handler.cache_joined_hosts_for_events( + [(event, context)] + ) await self._check_for_soft_fail(event, context=context, origin=origin) await self._run_push_actions_and_persist_event(event, context) return event, context async def check_join_restrictions( - self, context: EventContext, event: EventBase + self, + context: UnpersistedEventContextBase, + event: EventBase, ) -> None: """Check that restrictions in restricted join rules are matched @@ -434,16 +447,17 @@ class FederationEventHandler: # Check if the user is already in the room or invited to the room. user_id = event.state_key prev_member_event_id = prev_state_ids.get((EventTypes.Member, user_id), None) - prev_member_event = None + prev_membership = None if prev_member_event_id: prev_member_event = await self._store.get_event(prev_member_event_id) + prev_membership = prev_member_event.membership # Check if the member should be allowed access via membership in a space. await self._event_auth_handler.check_restricted_join_rules( prev_state_ids, event.room_version, user_id, - prev_member_event, + prev_membership, ) @trace @@ -519,17 +533,63 @@ class FederationEventHandler: "Peristing join-via-remote %s (partial_state: %s)", event, partial_state ) with nested_logging_context(suffix=event.event_id): + if partial_state: + # When handling a second partial state join into a partial state room, + # the returned state will exclude the membership from the first join. To + # preserve prior memberships, we try to compute the partial state before + # the event ourselves if we know about any of the prev events. + # + # When we don't know about any of the prev events, it's fine to just use + # the returned state, since the new join will create a new forward + # extremity, and leave the forward extremity containing our prior + # memberships alone. + prev_event_ids = set(event.prev_event_ids()) + seen_event_ids = await self._store.have_events_in_timeline( + prev_event_ids + ) + missing_event_ids = prev_event_ids - seen_event_ids + + state_maps_to_resolve: List[StateMap[str]] = [] + + # Fetch the state after the prev events that we know about. + state_maps_to_resolve.extend( + ( + await self._state_storage_controller.get_state_groups_ids( + room_id, seen_event_ids, await_full_state=False + ) + ).values() + ) + + # When there are prev events we do not have the state for, we state + # resolve with the state returned by the remote homeserver. 
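Throughout the partial-state join hunk, room state is handled as a `StateMap`, a mapping from `(event type, state key)` to event ID, and the code below only mixes in the remote server's returned state when some prev events are unknown locally. A toy sketch of how those resolution inputs are assembled, with a stand-in event class and plain dicts in place of the real storage and resolver:

# Sketch only: the {(type, state_key): event_id} shape used by this hunk, and
# how the list of state maps handed to the resolver is put together for a
# second partial-state join.
from dataclasses import dataclass
from typing import Dict, List, Tuple

StateMap = Dict[Tuple[str, str], str]


@dataclass
class StubEvent:
    type: str
    state_key: str
    event_id: str


def assemble_state_maps_to_resolve(
    remote_state: List[StubEvent],
    state_after_seen_prev_events: List[StateMap],
    have_missing_prev_events: bool,
) -> List[StateMap]:
    state_maps_to_resolve: List[StateMap] = list(state_after_seen_prev_events)
    # Only fall back to the remote server's view when some prev events are
    # unknown locally (or we know nothing at all), as in the surrounding hunk.
    if have_missing_prev_events or not state_maps_to_resolve:
        state_maps_to_resolve.append(
            {(e.type, e.state_key): e.event_id for e in remote_state}
        )
    return state_maps_to_resolve


# The real code hands this list to resolve_events_with_store(); here we only
# show the inputs for a second join where the first join's membership exists
# solely in the locally known state map.
local = {("m.room.member", "@first:a"): "$join1", ("m.room.create", ""): "$create"}
remote = [StubEvent("m.room.create", "", "$create")]
print(assemble_state_maps_to_resolve(remote, [local], have_missing_prev_events=False))
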
+ if missing_event_ids or len(state_maps_to_resolve) == 0: + state_maps_to_resolve.append( + {(e.type, e.state_key): e.event_id for e in state} + ) + + state_ids_before_event = ( + await self._state_resolution_handler.resolve_events_with_store( + event.room_id, + room_version.identifier, + state_maps_to_resolve, + event_map=None, + state_res_store=StateResolutionStore(self._store), + ) + ) + else: + state_ids_before_event = { + (e.type, e.state_key): e.event_id for e in state + } + context = await self._state_handler.compute_event_context( event, - state_ids_before_event={ - (e.type, e.state_key): e.event_id for e in state - }, + state_ids_before_event=state_ids_before_event, partial_state=partial_state, ) await self._check_event_auth(origin, event, context) if context.rejected: - raise SynapseError(400, "Join event was rejected") + raise SynapseError(403, "Join event was rejected") # the remote server is responsible for sending our join event to the rest # of the federation. Indeed, attempting to do so will result in problems @@ -541,18 +601,6 @@ class FederationEventHandler: room_id, [(event, context)] ) - # If we're joining the room again, check if there is new marker - # state indicating that there is new history imported somewhere in - # the DAG. Multiple markers can exist in the current state with - # unique state_keys. - # - # Do this after the state from the remote join was persisted (via - # `persist_events_and_notify`). Otherwise we can run into a - # situation where the create event doesn't exist yet in the - # `current_state_events` - for e in state: - await self._handle_marker_event(origin, e) - return stream_id_after_persist async def update_state_for_partial_state_event( @@ -565,6 +613,9 @@ class FederationEventHandler: event: partial-state event to be de-partial-stated Raises: + FederationPullAttemptBackoffError if we are deliberately not attempting + to pull the given event over federation because we've already done so + recently and are backing off. FederationError if we fail to request state from the remote server. """ logger.info("Updating state for %s", event.event_id) @@ -603,10 +654,12 @@ class FederationEventHandler: self._state_storage_controller.notify_event_un_partial_stated( event.event_id ) + # Notify that there's a new row in the un_partial_stated_events stream. + self._notifier.notify_replication() @trace async def backfill( - self, dest: str, room_id: str, limit: int, extremities: Collection[str] + self, dest: str, room_id: str, limit: int, extremities: StrCollection ) -> None: """Trigger a backfill request to `dest` for the given `room_id` @@ -624,7 +677,7 @@ class FederationEventHandler: server from invalid events (there is probably no point in trying to re-fetch invalid events from every other HS in the room.)
""" - if dest == self._server_name: + if self._is_mine_server_name(dest): raise SynapseError(400, "Can't backfill from self.") events = await self._federation_client.backfill( @@ -670,12 +723,11 @@ class FederationEventHandler: if not prevs - seen: return - latest_list = await self._store.get_latest_event_ids_in_room(room_id) + latest_frozen = await self._store.get_latest_event_ids_in_room(room_id) # We add the prev events that we have seen to the latest # list to ensure the remote server doesn't give them to us - latest = set(latest_list) - latest |= seen + latest = seen | latest_frozen logger.info( "Requesting missing events between %s and %s", @@ -792,12 +844,97 @@ class FederationEventHandler: ], ) - # We want to sort these by depth so we process them and - # tell clients about them in order. - sorted_events = sorted(events, key=lambda x: x.depth) - for ev in sorted_events: - with nested_logging_context(ev.event_id): - await self._process_pulled_event(origin, ev, backfilled=backfilled) + # Check if we already any of these have these events. + # Note: we currently make a lookup in the database directly here rather than + # checking the event cache, due to: + # https://github.com/matrix-org/synapse/issues/13476 + existing_events_map = await self._store._get_events_from_db( + [event.event_id for event in events] + ) + + new_events: List[EventBase] = [] + for event in events: + event_id = event.event_id + + # If we've already seen this event ID... + if event_id in existing_events_map: + existing_event = existing_events_map[event_id] + + # ...and the event itself was not previously stored as an outlier... + if not existing_event.event.internal_metadata.is_outlier(): + # ...then there's no need to persist it. We have it already. + logger.info( + "_process_pulled_event: Ignoring received event %s which we " + "have already seen", + event.event_id, + ) + continue + + # While we have seen this event before, it was stored as an outlier. + # We'll now persist it as a non-outlier. + logger.info("De-outliering event %s", event_id) + + # Continue on with the events that are new to us. + new_events.append(event) + + set_tag( + SynapseTags.RESULT_PREFIX + "new_events.length", + str(len(new_events)), + ) + + @trace + async def _process_new_pulled_events(new_events: Collection[EventBase]) -> None: + # We want to sort these by depth so we process them and tell clients about + # them in order. It's also more efficient to backfill this way (`depth` + # ascending) because one backfill event is likely to be the `prev_event` of + # the next event we're going to process. + sorted_events = sorted(new_events, key=lambda x: x.depth) + for ev in sorted_events: + with nested_logging_context(ev.event_id): + await self._process_pulled_event(origin, ev, backfilled=backfilled) + + # Check if we've already tried to process these events at some point in the + # past. We aren't concerned with the expontntial backoff here, just whether it + # has failed to be processed before. 
+ event_ids_with_failed_pull_attempts = ( + await self._store.get_event_ids_with_failed_pull_attempts( + [event.event_id for event in new_events] + ) + ) + + events_with_failed_pull_attempts, fresh_events = partition( + new_events, lambda e: e.event_id in event_ids_with_failed_pull_attempts + ) + set_tag( + SynapseTags.FUNC_ARG_PREFIX + "events_with_failed_pull_attempts", + str(event_ids_with_failed_pull_attempts), + ) + set_tag( + SynapseTags.RESULT_PREFIX + "events_with_failed_pull_attempts.length", + str(len(events_with_failed_pull_attempts)), + ) + set_tag( + SynapseTags.FUNC_ARG_PREFIX + "fresh_events", + str([event.event_id for event in fresh_events]), + ) + set_tag( + SynapseTags.RESULT_PREFIX + "fresh_events.length", + str(len(fresh_events)), + ) + + # Process previously failed backfill events in the background to not waste + # time on something that is likely to fail again. + if len(events_with_failed_pull_attempts) > 0: + run_as_background_process( + "_process_new_pulled_events_with_failed_pull_attempts", + _process_new_pulled_events, + events_with_failed_pull_attempts, + ) + + # We can optimistically try to process and wait for the event to be fully + # persisted if we've never tried before. + if len(fresh_events) > 0: + await _process_new_pulled_events(fresh_events) @trace @tag_args @@ -846,22 +983,13 @@ class FederationEventHandler: event_id = event.event_id - existing = await self._store.get_event( - event_id, allow_none=True, allow_rejected=True - ) - if existing: - if not existing.internal_metadata.is_outlier(): - logger.info( - "_process_pulled_event: Ignoring received event %s which we have already seen", - event_id, - ) - return - logger.info("De-outliering event %s", event_id) - try: self._sanity_check_event(event) except SynapseError as err: logger.warning("Event %s failed sanity check: %s", event_id, err) + await self._store.record_event_failed_pull_attempt( + event.room_id, event_id, str(err) + ) return try: @@ -896,7 +1024,23 @@ class FederationEventHandler: context, backfilled=backfilled, ) + except FederationPullAttemptBackoffError as exc: + # Log a warning about why we failed to process the event (the error message + # for `FederationPullAttemptBackoffError` is pretty good) + logger.warning("_process_pulled_event: %s", exc) + # We do not record a failed pull attempt when we back off fetching a missing + # `prev_event` because not being able to fetch the `prev_events` just means + # we won't be able to de-outlier the pulled event. But we can still use an + # `outlier` in the state/auth chain for another event. So we shouldn't stop + # a downstream event from trying to pull it. + # + # This avoids a cascade of backoff for all events in the DAG downstream from + # one event backing off upstream. except FederationError as e: + await self._store.record_event_failed_pull_attempt( + event.room_id, event_id, str(e) + ) + if e.code == 403: logger.warning("Pulled event %s failed history check.", event_id) else: @@ -938,6 +1082,9 @@ class FederationEventHandler: The event context. Raises: + FederationPullAttemptBackoffError if we are deliberately not attempting + to pull one of the given event's `prev_event`s over federation because + we've already done so recently and are backing off. FederationError if we fail to get the state from the remote server after any missing `prev_event`s.
""" @@ -948,6 +1095,27 @@ class FederationEventHandler: seen = await self._store.have_events_in_timeline(prevs) missing_prevs = prevs - seen + # If we've already recently attempted to pull this missing event, don't + # try it again so soon. Since we have to fetch all of the prev_events, we can + # bail early here if we find any to ignore. + prevs_with_pull_backoff = ( + await self._store.get_event_ids_to_not_pull_from_backoff( + room_id, missing_prevs + ) + ) + if len(prevs_with_pull_backoff) > 0: + raise FederationPullAttemptBackoffError( + event_ids=prevs_with_pull_backoff.keys(), + message=( + f"While computing context for event={event_id}, not attempting to " + f"pull missing prev_events={list(prevs_with_pull_backoff.keys())} " + "because we already tried to pull recently (backing off)." + ), + retry_after_ms=( + max(prevs_with_pull_backoff.values()) - self._clock.time_msec() + ), + ) + if not missing_prevs: return await self._state_handler.compute_event_context(event) @@ -1004,10 +1172,9 @@ class FederationEventHandler: state_res_store=StateResolutionStore(self._store), ) - except Exception: + except Exception as e: logger.warning( - "Error attempting to resolve state at missing prev_events", - exc_info=True, + "Error attempting to resolve state at missing prev_events: %s", e ) raise FederationError( "ERROR", @@ -1273,8 +1440,6 @@ class FederationEventHandler: await self._run_push_actions_and_persist_event(event, context, backfilled) - await self._handle_marker_event(origin, event) - if backfilled or context.rejected: return @@ -1360,104 +1525,21 @@ class FederationEventHandler: """ try: - await self._store.mark_remote_user_device_cache_as_stale(sender) + await self._store.mark_remote_users_device_caches_as_stale((sender,)) # Immediately attempt a resync in the background if self._config.worker.worker_app: - await self._user_device_resync(user_id=sender) + await self._multi_user_device_resync(user_ids=[sender]) else: - await self._device_list_updater.user_device_resync(sender) + await self._device_list_updater.multi_user_device_resync( + user_ids=[sender] + ) except Exception: logger.exception("Failed to resync device for %s", sender) - @trace - async def _handle_marker_event(self, origin: str, marker_event: EventBase) -> None: - """Handles backfilling the insertion event when we receive a marker - event that points to one. - - Args: - origin: Origin of the event. Will be called to get the insertion event - marker_event: The event to process - """ - - if marker_event.type != EventTypes.MSC2716_MARKER: - # Not a marker event - return - - if marker_event.rejected_reason is not None: - # Rejected event - return - - # Skip processing a marker event if the room version doesn't - # support it or the event is not from the room creator. 
- room_version = await self._store.get_room_version(marker_event.room_id) - create_event = await self._store.get_create_event_for_room(marker_event.room_id) - room_creator = create_event.content.get(EventContentFields.ROOM_CREATOR) - if not room_version.msc2716_historical and ( - not self._config.experimental.msc2716_enabled - or marker_event.sender != room_creator - ): - return - - logger.debug("_handle_marker_event: received %s", marker_event) - - insertion_event_id = marker_event.content.get( - EventContentFields.MSC2716_INSERTION_EVENT_REFERENCE - ) - - if insertion_event_id is None: - # Nothing to retrieve then (invalid marker) - return - - already_seen_insertion_event = await self._store.have_seen_event( - marker_event.room_id, insertion_event_id - ) - if already_seen_insertion_event: - # No need to process a marker again if we have already seen the - # insertion event that it was pointing to - return - - logger.debug( - "_handle_marker_event: backfilling insertion event %s", insertion_event_id - ) - - await self._get_events_and_persist( - origin, - marker_event.room_id, - [insertion_event_id], - ) - - insertion_event = await self._store.get_event( - insertion_event_id, allow_none=True - ) - if insertion_event is None: - logger.warning( - "_handle_marker_event: server %s didn't return insertion event %s for marker %s", - origin, - insertion_event_id, - marker_event.event_id, - ) - return - - logger.debug( - "_handle_marker_event: succesfully backfilled insertion event %s from marker event %s", - insertion_event, - marker_event, - ) - - await self._store.insert_insertion_extremity( - insertion_event_id, marker_event.room_id - ) - - logger.debug( - "_handle_marker_event: insertion extremity added for %s from marker event %s", - insertion_event, - marker_event, - ) - async def backfill_event_id( - self, destination: str, room_id: str, event_id: str - ) -> EventBase: + self, destinations: StrCollection, room_id: str, event_id: str + ) -> PulledPduInfo: """Backfill a single event and persist it as a non-outlier which means we also pull in all of the state and auth events necessary for it. @@ -1469,24 +1551,21 @@ class FederationEventHandler: Raises: FederationError if we are unable to find the event from the destination """ - logger.info( - "backfill_event_id: event_id=%s from destination=%s", event_id, destination - ) + logger.info("backfill_event_id: event_id=%s", event_id) room_version = await self._store.get_room_version(room_id) - event_from_response = await self._federation_client.get_pdu( - [destination], + pulled_pdu_info = await self._federation_client.get_pdu( + destinations, event_id, room_version, ) - if not event_from_response: + if not pulled_pdu_info: raise FederationError( "ERROR", 404, - "Unable to find event_id=%s from destination=%s to backfill." - % (event_id, destination), + f"Unable to find event_id={event_id} from remote servers to backfill.", affected=event_id, ) @@ -1494,18 +1573,18 @@ class FederationEventHandler: # and auth events to de-outlier it. This also sets up the necessary # `state_groups` for the event. 
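`get_pdu` now takes several candidate destinations and reports which server actually supplied the PDU, which `backfill_event_id` passes straight into `_process_pulled_events` below. A toy sketch of consuming that kind of result, with a stand-in container in place of the real `PulledPduInfo`:

# Sketch only: a stand-in result type showing how a caller uses both the
# pulled PDU and the origin it was pulled from.
from dataclasses import dataclass
from typing import Optional, Sequence


@dataclass
class StubPulledPduInfo:
    pdu: dict          # the event, as a plain dict here
    pull_origin: str   # which server actually returned it


def get_pdu_from_first_responder(
    destinations: Sequence[str], event_id: str
) -> Optional[StubPulledPduInfo]:
    # Stand-in for FederationClient.get_pdu: try each server in turn and
    # remember which one answered.
    fake_responses = {
        "s2.example.org": {"event_id": event_id, "type": "m.room.message"},
    }
    for destination in destinations:
        pdu = fake_responses.get(destination)
        if pdu is not None:
            return StubPulledPduInfo(pdu=pdu, pull_origin=destination)
    return None


info = get_pdu_from_first_responder(["s1.example.org", "s2.example.org"], "$abc")
if info is None:
    raise RuntimeError("Unable to find the event on any remote server")
print(info.pull_origin, info.pdu)
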
await self._process_pulled_events( - destination, - [event_from_response], + pulled_pdu_info.pull_origin, + [pulled_pdu_info.pdu], # Prevent notifications going to clients backfilled=True, ) - return event_from_response + return pulled_pdu_info @trace @tag_args async def _get_events_and_persist( - self, destination: str, room_id: str, event_ids: Collection[str] + self, destination: str, room_id: str, event_ids: StrCollection ) -> None: """Fetch the given events from a server, and persist them as outliers. @@ -1523,19 +1602,19 @@ class FederationEventHandler: async def get_event(event_id: str) -> None: with nested_logging_context(event_id): try: - event = await self._federation_client.get_pdu( + pulled_pdu_info = await self._federation_client.get_pdu( [destination], event_id, room_version, ) - if event is None: + if pulled_pdu_info is None: logger.warning( "Server %s didn't return event %s", destination, event_id, ) return - events.append(event) + events.append(pulled_pdu_info.pdu) except Exception as e: logger.warning( @@ -1679,6 +1758,15 @@ class FederationEventHandler: except AuthError as e: logger.warning("Rejecting %r because %s", event, e) context.rejected = RejectedReason.AUTH_ERROR + except EventSizeError as e: + if e.unpersistable: + # This event is completely unpersistable. + raise e + # Otherwise, we are somewhat lenient and just persist the event + # as rejected, for moderate compatibility with older Synapse + # versions. + logger.warning("While validating received event %r: %s", event, e) + context.rejected = RejectedReason.OVERSIZED_EVENT events_and_contexts_to_persist.append((event, context)) @@ -1724,6 +1812,16 @@ class FederationEventHandler: # TODO: use a different rejected reason here? context.rejected = RejectedReason.AUTH_ERROR return + except EventSizeError as e: + if e.unpersistable: + # This event is completely unpersistable. + raise e + # Otherwise, we are somewhat lenient and just persist the event + # as rejected, for moderate compatibility with older Synapse + # versions. + logger.warning("While validating received event %r: %s", event, e) + context.rejected = RejectedReason.OVERSIZED_EVENT + return # next, check that we have all of the event's auth events. # @@ -1877,8 +1975,7 @@ class FederationEventHandler: # partial and full state and may not be accurate. return - extrem_ids_list = await self._store.get_latest_event_ids_in_room(event.room_id) - extrem_ids = set(extrem_ids_list) + extrem_ids = await self._store.get_latest_event_ids_in_room(event.room_id) prev_event_ids = set(event.prev_event_ids()) if extrem_ids == prev_event_ids: @@ -2110,8 +2207,8 @@ class FederationEventHandler: min_depth, ) else: - await self._bulk_push_rule_evaluator.action_for_event_by_user( - event, context + await self._bulk_push_rule_evaluator.action_for_events_by_user( + [(event, context)] ) try: @@ -2153,6 +2250,7 @@ class FederationEventHandler: if instance != self._instance_name: # Limit the number of events sent over replication. We choose 200 # here as that is what we default to in `max_request_body_size(..)` + result = {} try: for batch in batch_iter(event_and_contexts, 200): result = await self._send_events( @@ -2179,6 +2277,10 @@ class FederationEventHandler: event_and_contexts, backfilled=backfilled ) + # After persistence we always need to notify replication there may + # be new data. + self._notifier.notify_replication() + if self._ephemeral_messages_enabled: for event in events: # If there's an expiry timestamp on the event, schedule its expiry. 
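When events have to be persisted on a different worker, the hunk above sends them over replication in batches of 200 using `batch_iter`, keeping the result of the final batch. A small standalone sketch of that chunking, with a print in place of the replication call:

# Sketch only: chunk a list of (event, context) pairs into batches of 200,
# the same limit the replication send above uses.
from itertools import islice
from typing import Iterable, Iterator, Tuple, TypeVar

T = TypeVar("T")


def batch_iter(iterable: Iterable[T], size: int) -> Iterator[Tuple[T, ...]]:
    """Yield tuples of up to `size` items from `iterable`."""
    it = iter(iterable)
    return iter(lambda: tuple(islice(it, size)), ())


event_and_contexts = [(f"$event{i}", f"ctx{i}") for i in range(450)]

result = {}
for batch in batch_iter(event_and_contexts, 200):
    # Stand-in for self._send_events(...); the handler keeps whatever the
    # final batch returned, hence `result = {}` before the loop above.
    result = {"max_stream_id": len(batch)}
    print("sending batch of", len(batch), "events")

print(result)
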
@@ -2232,14 +2334,20 @@ class FederationEventHandler: event_pos = PersistedEventPosition( self._instance_name, event.internal_metadata.stream_ordering ) - await self._notifier.on_new_room_event( - event, event_pos, max_stream_token, extra_users=extra_users + await self._notifier.on_new_room_events( + [(event, event_pos)], max_stream_token, extra_users=extra_users ) if event.type == EventTypes.Member and event.membership == Membership.JOIN: # TODO retrieve the previous state, and exclude join -> join transitions self._notifier.notify_user_joined_room(event.event_id, event.room_id) + # If this is a server ACL event, clear the cache in the storage controller. + if event.type == EventTypes.ServerACL: + self._state_storage_controller.get_server_acl_for_room.invalidate( + (event.room_id,) + ) + def _sanity_check_event(self, ev: EventBase) -> None: """ Do some early sanity checks of a received event
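The oversized-event handling added in this patch follows the same pattern at both call sites: a completely unpersistable event is re-raised, while a merely oversized one is persisted but marked rejected for compatibility with older Synapse versions. A minimal sketch of that split, using stand-in error and context types with toy size thresholds:

# Sketch only: stand-ins showing the soft/hard split used for EventSizeError
# above (persist as rejected unless truly unpersistable).
from dataclasses import dataclass
from typing import Optional

OVERSIZED_EVENT = "oversized_event"  # mirrors RejectedReason.OVERSIZED_EVENT


class StubEventSizeError(Exception):
    def __init__(self, message: str, unpersistable: bool) -> None:
        super().__init__(message)
        self.unpersistable = unpersistable


@dataclass
class StubContext:
    rejected: Optional[str] = None


def validate(event_size: int) -> None:
    # Toy validation: anything over 65536 bytes is oversized, and anything
    # wildly over is treated as unpersistable. The real checks live in
    # synapse.event_auth, not here.
    if event_size > 1_000_000:
        raise StubEventSizeError("event too large to persist", unpersistable=True)
    if event_size > 65536:
        raise StubEventSizeError("event exceeds size limit", unpersistable=False)


def check_event(event_size: int) -> StubContext:
    context = StubContext()
    try:
        validate(event_size)
    except StubEventSizeError as e:
        if e.unpersistable:
            # Completely unpersistable: give up on the event entirely.
            raise
        # Otherwise be lenient and persist it as rejected.
        context.rejected = OVERSIZED_EVENT
    return context


print(check_event(100).rejected)      # None
print(check_event(70_000).rejected)   # oversized_event
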