diff options
author | Sean Quah <8349537+squahtx@users.noreply.github.com> | 2022-07-05 16:12:52 +0100 |
---|---|---|
committer | GitHub <noreply@github.com> | 2022-07-05 16:12:52 +0100 |
commit | 68db233f0cf16a20f21fd927374121966976d9c7 (patch) | |
tree | dc9054e39534b5933140d688abaa9221e3381627 /synapse/handlers/message.py | |
parent | Type `tests.utils` (#13028) (diff) | |
download | synapse-68db233f0cf16a20f21fd927374121966976d9c7.tar.xz |
Handle race between persisting an event and un-partial stating a room (#13100)
Whenever we want to persist an event, we first compute an event context, which includes the state at the event and a flag indicating whether the state is partial. After a lot of processing, we finally try to store the event in the database, which can fail for partial state events when the containing room has been un-partial stated in the meantime. We detect the race as a foreign key constraint failure in the data store layer and turn it into a special `PartialStateConflictError` exception, which makes its way up to the method in which we computed the event context. To make things difficult, the exception needs to cross a replication request: `/fed_send_events` for events coming over federation and `/send_event` for events from clients. We transport the `PartialStateConflictError` as a `409 Conflict` over replication and turn `409`s back into `PartialStateConflictError`s on the worker making the request. All client events go through `EventCreationHandler.handle_new_client_event`, which is called in *a lot* of places. Instead of trying to update all the code which creates client events, we turn the `PartialStateConflictError` into a `429 Too Many Requests` in `EventCreationHandler.handle_new_client_event` and hope that clients take it as a hint to retry their request. On the federation event side, there are 7 places which compute event contexts. 4 of them use outlier event contexts: `FederationEventHandler._auth_and_persist_outliers_inner`, `FederationHandler.do_knock`, `FederationHandler.on_invite_request` and `FederationHandler.do_remotely_reject_invite`. These events won't have the partial state flag, so we do not need to do anything for then. The remaining 3 paths which create events are `FederationEventHandler.process_remote_join`, `FederationEventHandler.on_send_membership_event` and `FederationEventHandler._process_received_pdu`. We can't experience the race in `process_remote_join`, unless we're handling an additional join into a partial state room, which currently blocks, so we make no attempt to handle it correctly. `on_send_membership_event` is only called by `FederationServer._on_send_membership_event`, so we catch the `PartialStateConflictError` there and retry just once. `_process_received_pdu` is called by `on_receive_pdu` for incoming events and `_process_pulled_event` for backfill. The latter should never try to persist partial state events, so we ignore it. We catch the `PartialStateConflictError` in `on_receive_pdu` and retry just once. Refering to the graph of code paths in https://github.com/matrix-org/synapse/issues/12988#issuecomment-1156857648 may make the above make more sense. Signed-off-by: Sean Quah <seanq@matrix.org>
Diffstat (limited to 'synapse/handlers/message.py')
-rw-r--r-- | synapse/handlers/message.py | 79 |
1 files changed, 53 insertions, 26 deletions
diff --git a/synapse/handlers/message.py b/synapse/handlers/message.py index c6b40a5b7a..1980e37dae 100644 --- a/synapse/handlers/message.py +++ b/synapse/handlers/message.py @@ -37,6 +37,7 @@ from synapse.api.errors import ( AuthError, Codes, ConsentNotGivenError, + LimitExceededError, NotFoundError, ShadowBanError, SynapseError, @@ -53,6 +54,7 @@ from synapse.handlers.directory import DirectoryHandler from synapse.logging.context import make_deferred_yieldable, run_in_background from synapse.metrics.background_process_metrics import run_as_background_process from synapse.replication.http.send_event import ReplicationSendEventRestServlet +from synapse.storage.databases.main.events import PartialStateConflictError from synapse.storage.databases.main.events_worker import EventRedactBehaviour from synapse.storage.state import StateFilter from synapse.types import ( @@ -1250,6 +1252,8 @@ class EventCreationHandler: Raises: ShadowBanError if the requester has been shadow-banned. + SynapseError(503) if attempting to persist a partial state event in + a room that has been un-partial stated. """ extra_users = extra_users or [] @@ -1300,24 +1304,35 @@ class EventCreationHandler: # We now persist the event (and update the cache in parallel, since we # don't want to block on it). - result, _ = await make_deferred_yieldable( - gather_results( - ( - run_in_background( - self._persist_event, - requester=requester, - event=event, - context=context, - ratelimit=ratelimit, - extra_users=extra_users, + try: + result, _ = await make_deferred_yieldable( + gather_results( + ( + run_in_background( + self._persist_event, + requester=requester, + event=event, + context=context, + ratelimit=ratelimit, + extra_users=extra_users, + ), + run_in_background( + self.cache_joined_hosts_for_event, event, context + ).addErrback( + log_failure, "cache_joined_hosts_for_event failed" + ), ), - run_in_background( - self.cache_joined_hosts_for_event, event, context - ).addErrback(log_failure, "cache_joined_hosts_for_event failed"), - ), - consumeErrors=True, + consumeErrors=True, + ) + ).addErrback(unwrapFirstError) + except PartialStateConflictError as e: + # The event context needs to be recomputed. + # Turn the error into a 429, as a hint to the client to try again. + logger.info( + "Room %s was un-partial stated while persisting client event.", + event.room_id, ) - ).addErrback(unwrapFirstError) + raise LimitExceededError(msg=e.msg, errcode=e.errcode, retry_after_ms=0) return result @@ -1332,6 +1347,9 @@ class EventCreationHandler: """Actually persists the event. Should only be called by `handle_new_client_event`, and see its docstring for documentation of the arguments. + + PartialStateConflictError: if attempting to persist a partial state event in + a room that has been un-partial stated. """ # Skip push notification actions for historical messages @@ -1348,16 +1366,21 @@ class EventCreationHandler: # If we're a worker we need to hit out to the master. writer_instance = self._events_shard_config.get_instance(event.room_id) if writer_instance != self._instance_name: - result = await self.send_event( - instance_name=writer_instance, - event_id=event.event_id, - store=self.store, - requester=requester, - event=event, - context=context, - ratelimit=ratelimit, - extra_users=extra_users, - ) + try: + result = await self.send_event( + instance_name=writer_instance, + event_id=event.event_id, + store=self.store, + requester=requester, + event=event, + context=context, + ratelimit=ratelimit, + extra_users=extra_users, + ) + except SynapseError as e: + if e.code == HTTPStatus.CONFLICT: + raise PartialStateConflictError() + raise stream_id = result["stream_id"] event_id = result["event_id"] if event_id != event.event_id: @@ -1485,6 +1508,10 @@ class EventCreationHandler: The persisted event. This may be different than the given event if it was de-duplicated (e.g. because we had already persisted an event with the same transaction ID.) + + Raises: + PartialStateConflictError: if attempting to persist a partial state event in + a room that has been un-partial stated. """ extra_users = extra_users or [] |