From 24a97b3e7144720545df69c321e320c9d35166a6 Mon Sep 17 00:00:00 2001
From: Patrick Cloke
Date: Wed, 14 Dec 2022 09:25:33 -0500
Subject: Delete event_push_summary_unique_index again. (#14669)

If a Synapse deployment upgraded (from < 1.62.0 to >= 1.70.0) then it is
possible for schema deltas to run before background updates, causing drift in
the database schema, due to:

1. A delta registered a background update to create an index.
2. A delta dropped the above index if it exists (but it won't yet exist,
   since the background job hasn't run).
3. The code assumed the index was dropped.

To fix this we:

1. Cancel the background update which could create the index.
2. Drop the index again.
3. Drop a related index which is dropped by the background update.
---
 synapse/storage/databases/main/event_push_actions.py | 9 ---------
 1 file changed, 9 deletions(-)

(limited to 'synapse/storage/databases')

diff --git a/synapse/storage/databases/main/event_push_actions.py b/synapse/storage/databases/main/event_push_actions.py
index 7ebe34f773..3a0c370fde 100644
--- a/synapse/storage/databases/main/event_push_actions.py
+++ b/synapse/storage/databases/main/event_push_actions.py
@@ -274,15 +274,6 @@ class EventPushActionsWorkerStore(ReceiptsWorkerStore, StreamWorkerStore, SQLBas
             self._clear_old_push_actions_staging, 30 * 60 * 1000
         )
 
-        self.db_pool.updates.register_background_index_update(
-            "event_push_summary_unique_index",
-            index_name="event_push_summary_unique_index",
-            table="event_push_summary",
-            columns=["user_id", "room_id"],
-            unique=True,
-            replaces_index="event_push_summary_user_rm",
-        )
-
        self.db_pool.updates.register_background_index_update(
             "event_push_summary_unique_index2",
             index_name="event_push_summary_unique_index2",
-- cgit 1.5.1
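Note: the delta file that implements this clean-up is outside the path filter
of this log, so it does not appear above. Purely as an illustration of what
the three steps in the commit message amount to (this is a sketch, not the
shipped migration), run against a plain DB-API cursor:

    def fix_event_push_summary_indexes(cur):
        # 1. Cancel the background update that would (re)create the index.
        cur.execute(
            "DELETE FROM background_updates"
            " WHERE update_name = 'event_push_summary_unique_index'"
        )
        # 2. Drop the index again, in case the background update already ran.
        cur.execute("DROP INDEX IF EXISTS event_push_summary_unique_index")
        # 3. Drop the index the background update would have replaced.
        cur.execute("DROP INDEX IF EXISTS event_push_summary_user_rm")

Both SQLite and Postgres accept DROP INDEX IF EXISTS, so the same statements
work on either engine.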

From fb60cb16fe3cf26fbd947eec926cb4b24b8e9fc7 Mon Sep 17 00:00:00 2001
From: reivilibre
Date: Wed, 14 Dec 2022 14:47:11 +0000
Subject: Faster remote room joins: stream the un-partial-stating of events
 over replication. [rei:frrj/streams/unpsr] (#14545)

---
 changelog.d/14545.misc                             |  1 +
 synapse/handlers/federation_event.py               |  2 +
 synapse/replication/tcp/streams/__init__.py        |  7 +-
 synapse/replication/tcp/streams/partial_state.py   | 28 +++++++
 synapse/storage/databases/main/events_worker.py    | 88 ++++++++++++++++++++++
 synapse/storage/databases/main/state.py            | 34 ++++++---
 .../delta/73/22_un_partial_stated_event_stream.sql | 34 +++++++++
 ..._un_partial_stated_room_stream_seq.sql.postgres | 20 +++++
 8 files changed, 204 insertions(+), 10 deletions(-)
 create mode 100644 changelog.d/14545.misc
 create mode 100644 synapse/storage/schema/main/delta/73/22_un_partial_stated_event_stream.sql
 create mode 100644 synapse/storage/schema/main/delta/73/23_un_partial_stated_room_stream_seq.sql.postgres

(limited to 'synapse/storage/databases')

diff --git a/changelog.d/14545.misc b/changelog.d/14545.misc
new file mode 100644
index 0000000000..60b6761a51
--- /dev/null
+++ b/changelog.d/14545.misc
@@ -0,0 +1 @@
+Faster remote room joins: stream the un-partial-stating of events over replication.
\ No newline at end of file
diff --git a/synapse/handlers/federation_event.py b/synapse/handlers/federation_event.py
index 66aca2f864..31df7f55cc 100644
--- a/synapse/handlers/federation_event.py
+++ b/synapse/handlers/federation_event.py
@@ -610,6 +610,8 @@ class FederationEventHandler:
             self._state_storage_controller.notify_event_un_partial_stated(
                 event.event_id
             )
+            # Notify that there's a new row in the un_partial_stated_events stream.
+            self._notifier.notify_replication()
 
     @trace
     async def backfill(
diff --git a/synapse/replication/tcp/streams/__init__.py b/synapse/replication/tcp/streams/__init__.py
index 8575666d9c..110f10aab9 100644
--- a/synapse/replication/tcp/streams/__init__.py
+++ b/synapse/replication/tcp/streams/__init__.py
@@ -42,7 +42,10 @@ from synapse.replication.tcp.streams._base import (
 )
 from synapse.replication.tcp.streams.events import EventsStream
 from synapse.replication.tcp.streams.federation import FederationStream
-from synapse.replication.tcp.streams.partial_state import UnPartialStatedRoomStream
+from synapse.replication.tcp.streams.partial_state import (
+    UnPartialStatedEventStream,
+    UnPartialStatedRoomStream,
+)
 
 STREAMS_MAP = {
     stream.NAME: stream
@@ -63,6 +66,7 @@ STREAMS_MAP = {
         AccountDataStream,
         UserSignatureStream,
         UnPartialStatedRoomStream,
+        UnPartialStatedEventStream,
     )
 }
 
@@ -83,4 +87,5 @@ __all__ = [
     "AccountDataStream",
     "UserSignatureStream",
     "UnPartialStatedRoomStream",
+    "UnPartialStatedEventStream",
 ]
diff --git a/synapse/replication/tcp/streams/partial_state.py b/synapse/replication/tcp/streams/partial_state.py
index 18f087ffa2..b5a2ae74b6 100644
--- a/synapse/replication/tcp/streams/partial_state.py
+++ b/synapse/replication/tcp/streams/partial_state.py
@@ -46,3 +46,31 @@ class UnPartialStatedRoomStream(Stream):
             current_token_without_instance(store.get_un_partial_stated_rooms_token),
             store.get_un_partial_stated_rooms_from_stream,
         )
+
+
+@attr.s(slots=True, frozen=True, auto_attribs=True)
+class UnPartialStatedEventStreamRow:
+    # ID of the event that has been un-partial-stated.
+    event_id: str
+
+    # True iff the rejection status of the event changed as a result of being
+    # un-partial-stated.
+    rejection_status_changed: bool
+
+
+class UnPartialStatedEventStream(Stream):
+    """
+    Stream to notify about events becoming un-partial-stated.
+    """
+
+    NAME = "un_partial_stated_event"
+    ROW_TYPE = UnPartialStatedEventStreamRow
+
+    def __init__(self, hs: "HomeServer"):
+        store = hs.get_datastores().main
+        super().__init__(
+            hs.get_instance_name(),
+            # TODO(faster_joins, multiple writers): we need to account for instance names
+            current_token_without_instance(store.get_un_partial_stated_events_token),
+            store.get_un_partial_stated_events_from_stream,
+        )
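Note: with the two changes above, the new stream is registered under its NAME
like every other replication stream. A quick sanity check one might write in a
test (illustrative, not part of the changeset):

    from synapse.replication.tcp.streams import STREAMS_MAP, UnPartialStatedEventStream

    # STREAMS_MAP is keyed on each stream's NAME attribute.
    assert UnPartialStatedEventStream.NAME == "un_partial_stated_event"
    assert STREAMS_MAP[UnPartialStatedEventStream.NAME] is UnPartialStatedEventStream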
diff --git a/synapse/storage/databases/main/events_worker.py b/synapse/storage/databases/main/events_worker.py
index 318fd7dc71..e19b16064b 100644
--- a/synapse/storage/databases/main/events_worker.py
+++ b/synapse/storage/databases/main/events_worker.py
@@ -70,6 +70,7 @@ from synapse.storage.database import (
 from synapse.storage.engines import PostgresEngine
 from synapse.storage.types import Cursor
 from synapse.storage.util.id_generators import (
+    AbstractStreamIdGenerator,
     AbstractStreamIdTracker,
     MultiWriterIdGenerator,
     StreamIdGenerator,
@@ -292,6 +293,93 @@ class EventsWorkerStore(SQLBaseStore):
             id_column="chain_id",
         )
 
+        self._un_partial_stated_events_stream_id_gen: AbstractStreamIdGenerator
+
+        if isinstance(database.engine, PostgresEngine):
+            self._un_partial_stated_events_stream_id_gen = MultiWriterIdGenerator(
+                db_conn=db_conn,
+                db=database,
+                stream_name="un_partial_stated_event_stream",
+                instance_name=hs.get_instance_name(),
+                tables=[
+                    ("un_partial_stated_event_stream", "instance_name", "stream_id")
+                ],
+                sequence_name="un_partial_stated_event_stream_sequence",
+                # TODO(faster_joins, multiple writers) Support multiple writers.
+                writers=["master"],
+            )
+        else:
+            self._un_partial_stated_events_stream_id_gen = StreamIdGenerator(
+                db_conn, "un_partial_stated_event_stream", "stream_id"
+            )
+
+    def get_un_partial_stated_events_token(self) -> int:
+        # TODO(faster_joins, multiple writers): This is inappropriate if there are multiple
+        # writers because workers that don't write often will hold all
+        # readers up.
+        return self._un_partial_stated_events_stream_id_gen.get_current_token()
+
+    async def get_un_partial_stated_events_from_stream(
+        self, instance_name: str, last_id: int, current_id: int, limit: int
+    ) -> Tuple[List[Tuple[int, Tuple[str, bool]]], int, bool]:
+        """Get updates for the un-partial-stated events replication stream.
+
+        Args:
+            instance_name: The writer we want to fetch updates from. Unused
+                here since there is only ever one writer.
+            last_id: The token to fetch updates from. Exclusive.
+            current_id: The token to fetch updates up to. Inclusive.
+            limit: The requested limit for the number of rows to return. The
+                function may return more or fewer rows.
+
+        Returns:
+            A tuple consisting of: the updates, a token to use to fetch
+            subsequent updates, and whether we returned fewer rows than exist
+            between the requested tokens due to the limit.
+
+            The token returned can be used in a subsequent call to this
+            function to get further updates.
+
+            The updates are a list of 2-tuples of stream ID and the row data.
+        """
+
+        if last_id == current_id:
+            return [], current_id, False
+
+        def get_un_partial_stated_events_from_stream_txn(
+            txn: LoggingTransaction,
+        ) -> Tuple[List[Tuple[int, Tuple[str, bool]]], int, bool]:
+            sql = """
+                SELECT stream_id, event_id, rejection_status_changed
+                FROM un_partial_stated_event_stream
+                WHERE ? < stream_id AND stream_id <= ? AND instance_name = ?
+                ORDER BY stream_id ASC
+                LIMIT ?
+            """
+            txn.execute(sql, (last_id, current_id, instance_name, limit))
+            updates = [
+                (
+                    row[0],
+                    (
+                        row[1],
+                        bool(row[2]),
+                    ),
+                )
+                for row in txn
+            ]
+            limited = False
+            upto_token = current_id
+            if len(updates) >= limit:
+                upto_token = updates[-1][0]
+                limited = True
+
+            return updates, upto_token, limited
+
+        return await self.db_pool.runInteraction(
+            "get_un_partial_stated_events_from_stream",
+            get_un_partial_stated_events_from_stream_txn,
+        )
+
     def process_replication_rows(
         self,
         stream_name: str,
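Note: the token contract documented above (last_id exclusive, current_id
inclusive, plus a "limited" flag) makes paging straightforward. A hedged
sketch of a consumer loop follows; "store" and the hard-coded "master" writer
name are assumptions, and this helper is not part of the changeset:

    async def drain_un_partial_stated_events(store, last_id: int) -> int:
        """Page through every row after last_id; returns the new token."""
        current_id = store.get_un_partial_stated_events_token()
        while True:
            updates, upto_token, limited = (
                await store.get_un_partial_stated_events_from_stream(
                    "master", last_id, current_id, limit=100
                )
            )
            for stream_id, (event_id, rejection_status_changed) in updates:
                ...  # react to each un-partial-stated event here
            last_id = upto_token
            if not limited:
                # We have caught up to current_id.
                return last_id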
+ """ + txn.execute(sql, (last_id, current_id, instance_name, limit)) + updates = [ + ( + row[0], + ( + row[1], + bool(row[2]), + ), + ) + for row in txn + ] + limited = False + upto_token = current_id + if len(updates) >= limit: + upto_token = updates[-1][0] + limited = True + + return updates, upto_token, limited + + return await self.db_pool.runInteraction( + "get_un_partial_stated_events_from_stream", + get_un_partial_stated_events_from_stream_txn, + ) + def process_replication_rows( self, stream_name: str, diff --git a/synapse/storage/databases/main/state.py b/synapse/storage/databases/main/state.py index c801a93b5b..f855903c39 100644 --- a/synapse/storage/databases/main/state.py +++ b/synapse/storage/databases/main/state.py @@ -80,6 +80,7 @@ class StateGroupWorkerStore(EventsWorkerStore, SQLBaseStore): hs: "HomeServer", ): super().__init__(database, db_conn, hs) + self._instance_name: str = hs.get_instance_name() async def get_room_version(self, room_id: str) -> RoomVersion: """Get the room_version of a given room @@ -404,18 +405,21 @@ class StateGroupWorkerStore(EventsWorkerStore, SQLBaseStore): context: EventContext, ) -> None: """Update the state group for a partial state event""" - await self.db_pool.runInteraction( - "update_state_for_partial_state_event", - self._update_state_for_partial_state_event_txn, - event, - context, - ) + async with self._un_partial_stated_events_stream_id_gen.get_next() as un_partial_state_event_stream_id: + await self.db_pool.runInteraction( + "update_state_for_partial_state_event", + self._update_state_for_partial_state_event_txn, + event, + context, + un_partial_state_event_stream_id, + ) def _update_state_for_partial_state_event_txn( self, txn: LoggingTransaction, event: EventBase, context: EventContext, + un_partial_state_event_stream_id: int, ) -> None: # we shouldn't have any outliers here assert not event.internal_metadata.is_outlier() @@ -436,7 +440,10 @@ class StateGroupWorkerStore(EventsWorkerStore, SQLBaseStore): # the event may now be rejected where it was not before, or vice versa, # in which case we need to update the rejected flags. 
diff --git a/synapse/storage/schema/main/delta/73/22_un_partial_stated_event_stream.sql b/synapse/storage/schema/main/delta/73/22_un_partial_stated_event_stream.sql
new file mode 100644
index 0000000000..0e571f78c3
--- /dev/null
+++ b/synapse/storage/schema/main/delta/73/22_un_partial_stated_event_stream.sql
@@ -0,0 +1,34 @@
+/* Copyright 2022 The Matrix.org Foundation C.I.C
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+-- Stream for notifying that an event has become un-partial-stated.
+CREATE TABLE un_partial_stated_event_stream(
+    -- Position in the stream
+    stream_id BIGINT PRIMARY KEY NOT NULL,
+
+    -- Which instance wrote this entry.
+    instance_name TEXT NOT NULL,
+
+    -- Which event has been un-partial-stated.
+    event_id TEXT NOT NULL REFERENCES events(event_id) ON DELETE CASCADE,
+
+    -- true iff the `rejected` status of the event changed when it became
+    -- un-partial-stated.
+    rejection_status_changed BOOLEAN NOT NULL
+);
+
+-- We want an index here because of the foreign key constraint:
+-- upon deleting an event, the database needs to be able to check here.
+CREATE UNIQUE INDEX un_partial_stated_event_stream_room_id ON un_partial_stated_event_stream (event_id);
diff --git a/synapse/storage/schema/main/delta/73/23_un_partial_stated_room_stream_seq.sql.postgres b/synapse/storage/schema/main/delta/73/23_un_partial_stated_room_stream_seq.sql.postgres
new file mode 100644
index 0000000000..1ec24702f3
--- /dev/null
+++ b/synapse/storage/schema/main/delta/73/23_un_partial_stated_room_stream_seq.sql.postgres
@@ -0,0 +1,20 @@
+/* Copyright 2022 The Matrix.org Foundation C.I.C
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+CREATE SEQUENCE IF NOT EXISTS un_partial_stated_event_stream_sequence;
+
+SELECT setval('un_partial_stated_event_stream_sequence', (
+    SELECT COALESCE(MAX(stream_id), 1) FROM un_partial_stated_event_stream
+));
-- cgit 1.5.1
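Note: the .sql.postgres suffix means the sequence delta runs only on Postgres;
on SQLite the StreamIdGenerator branch shown earlier derives its position from
MAX(stream_id) directly. The setval above establishes the invariant that the
sequence never trails the table. A hedged sanity-check sketch of that
invariant, against a raw cursor (not part of the changeset):

    def check_stream_sequence(cur):
        # The sequence must be at or ahead of every stream_id already in the
        # table, otherwise the ID generator could hand out a duplicate.
        cur.execute(
            "SELECT last_value FROM un_partial_stated_event_stream_sequence"
        )
        (last_value,) = cur.fetchone()
        cur.execute(
            "SELECT COALESCE(MAX(stream_id), 1) FROM un_partial_stated_event_stream"
        )
        (max_stream_id,) = cur.fetchone()
        assert last_value >= max_stream_id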

From 2888d7ec83b33b3ce848d9219c921ffe0b88ffbf Mon Sep 17 00:00:00 2001
From: reivilibre
Date: Mon, 19 Dec 2022 14:57:51 +0000
Subject: Faster remote room joins: invalidate caches and unblock requests
 when receiving un-partial-stated event notifications over replication.
 [rei:frrj/streams/unpsr] (#14546)

---
 changelog.d/14546.misc                          |  1 +
 synapse/replication/tcp/client.py               | 14 ++++++++++++-
 synapse/storage/databases/main/events_worker.py | 27 ++++++++++++++-----------
 synapse/storage/databases/main/state.py         | 18 ++++++++++++++++-
 4 files changed, 46 insertions(+), 14 deletions(-)
 create mode 100644 changelog.d/14546.misc

(limited to 'synapse/storage/databases')

diff --git a/changelog.d/14546.misc b/changelog.d/14546.misc
new file mode 100644
index 0000000000..60b6761a51
--- /dev/null
+++ b/changelog.d/14546.misc
@@ -0,0 +1 @@
+Faster remote room joins: stream the un-partial-stating of events over replication.
\ No newline at end of file
diff --git a/synapse/replication/tcp/client.py b/synapse/replication/tcp/client.py
index b4dad47b45..658d89210d 100644
--- a/synapse/replication/tcp/client.py
+++ b/synapse/replication/tcp/client.py
@@ -36,6 +36,7 @@ from synapse.replication.tcp.streams import (
     TagAccountDataStream,
     ToDeviceStream,
     TypingStream,
+    UnPartialStatedEventStream,
     UnPartialStatedRoomStream,
 )
 from synapse.replication.tcp.streams.events import (
@@ -43,7 +44,10 @@ from synapse.replication.tcp.streams.events import (
     EventsStreamEventRow,
     EventsStreamRow,
 )
-from synapse.replication.tcp.streams.partial_state import UnPartialStatedRoomStreamRow
+from synapse.replication.tcp.streams.partial_state import (
+    UnPartialStatedEventStreamRow,
+    UnPartialStatedRoomStreamRow,
+)
 from synapse.types import PersistedEventPosition, ReadReceipt, StreamKeyType, UserID
 from synapse.util.async_helpers import Linearizer, timeout_deferred
 from synapse.util.metrics import Measure
@@ -247,6 +251,14 @@ class ReplicationDataHandler:
                 self._state_storage_controller.notify_room_un_partial_stated(
                     row.room_id
                 )
+        elif stream_name == UnPartialStatedEventStream.NAME:
+            for row in rows:
+                assert isinstance(row, UnPartialStatedEventStreamRow)
+
+                # Wake up any tasks waiting for the event to be un-partial-stated.
+                self._state_storage_controller.notify_event_un_partial_stated(
+                    row.event_id
+                )
 
         await self._presence_handler.process_replication_rows(
             stream_name, instance_name, token, rows
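Note: the branch above lives in ReplicationDataHandler.on_rdata, so a worker
that receives an RDATA command for this stream wakes any tasks blocked on the
event becoming fully stated. A test-style sketch of that flow; the handler's
construction is elided and the row values are illustrative:

    from synapse.replication.tcp.streams import UnPartialStatedEventStream
    from synapse.replication.tcp.streams.partial_state import (
        UnPartialStatedEventStreamRow,
    )

    async def wake_waiters_example(data_handler) -> None:
        # data_handler is assumed to be a ReplicationDataHandler instance.
        row = UnPartialStatedEventStreamRow(
            event_id="$someevent:example.org",  # illustrative event ID
            rejection_status_changed=False,
        )
        await data_handler.on_rdata(
            UnPartialStatedEventStream.NAME, "master", token=5, rows=[row]
        )
        # notify_event_un_partial_stated("$someevent:example.org") has now
        # fired, waking anything blocked on that event.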
diff --git a/synapse/storage/databases/main/events_worker.py b/synapse/storage/databases/main/events_worker.py
index e19b16064b..761b15a815 100644
--- a/synapse/storage/databases/main/events_worker.py
+++ b/synapse/storage/databases/main/events_worker.py
@@ -59,8 +59,9 @@ from synapse.metrics.background_process_metrics import (
     run_as_background_process,
     wrap_as_background_process,
 )
-from synapse.replication.tcp.streams import BackfillStream
+from synapse.replication.tcp.streams import BackfillStream, UnPartialStatedEventStream
 from synapse.replication.tcp.streams.events import EventsStream
+from synapse.replication.tcp.streams.partial_state import UnPartialStatedEventStreamRow
 from synapse.storage._base import SQLBaseStore, db_to_json, make_in_list_sql_clause
 from synapse.storage.database import (
     DatabasePool,
@@ -391,6 +392,16 @@ class EventsWorkerStore(SQLBaseStore):
             self._stream_id_gen.advance(instance_name, token)
         elif stream_name == BackfillStream.NAME:
             self._backfill_id_gen.advance(instance_name, -token)
+        elif stream_name == UnPartialStatedEventStream.NAME:
+            for row in rows:
+                assert isinstance(row, UnPartialStatedEventStreamRow)
+
+                self.is_partial_state_event.invalidate((row.event_id,))
+
+                if row.rejection_status_changed:
+                    # If the partial-stated event became rejected or unrejected
+                    # when it wasn't before, we need to invalidate this cache.
+                    self._invalidate_local_get_event_cache(row.event_id)
 
         super().process_replication_rows(stream_name, instance_name, token, rows)
 
@@ -2380,6 +2391,9 @@ class EventsWorkerStore(SQLBaseStore):
 
         This can happen, for example, when resyncing state during a faster join.
 
+        It is the caller's responsibility to ensure that other workers are
+        sent a notification so that they call `_invalidate_local_get_event_cache()`.
+
         Args:
             txn:
             event_id: ID of event to update
@@ -2418,14 +2432,3 @@ class EventsWorkerStore(SQLBaseStore):
         )
 
         self.invalidate_get_event_cache_after_txn(txn, event_id)
-
-        # TODO(faster_joins): invalidate the cache on workers. Ideally we'd just
-        # call '_send_invalidation_to_replication', but we actually need the other
-        # end to call _invalidate_local_get_event_cache() rather than (just)
-        # _get_event_cache.invalidate().
-        #
-        # One solution might be to (somehow) get the workers to call
-        # _invalidate_caches_for_event() (though that will invalidate more than
-        # strictly necessary).
-        #
-        # https://github.com/matrix-org/synapse/issues/12994
diff --git a/synapse/storage/databases/main/state.py b/synapse/storage/databases/main/state.py
index f855903c39..f32cbb2dec 100644
--- a/synapse/storage/databases/main/state.py
+++ b/synapse/storage/databases/main/state.py
@@ -14,7 +14,7 @@
 # limitations under the License.
 import collections.abc
 import logging
-from typing import TYPE_CHECKING, Collection, Dict, Iterable, Optional, Set, Tuple
+from typing import TYPE_CHECKING, Any, Collection, Dict, Iterable, Optional, Set, Tuple
 
 import attr
 
@@ -24,6 +24,8 @@ from synapse.api.room_versions import KNOWN_ROOM_VERSIONS, RoomVersion
 from synapse.events import EventBase
 from synapse.events.snapshot import EventContext
 from synapse.logging.opentracing import trace
+from synapse.replication.tcp.streams import UnPartialStatedEventStream
+from synapse.replication.tcp.streams.partial_state import UnPartialStatedEventStreamRow
 from synapse.storage._base import SQLBaseStore
 from synapse.storage.database import (
     DatabasePool,
@@ -82,6 +84,20 @@ class StateGroupWorkerStore(EventsWorkerStore, SQLBaseStore):
         super().__init__(database, db_conn, hs)
         self._instance_name: str = hs.get_instance_name()
 
+    def process_replication_rows(
+        self,
+        stream_name: str,
+        instance_name: str,
+        token: int,
+        rows: Iterable[Any],
+    ) -> None:
+        if stream_name == UnPartialStatedEventStream.NAME:
+            for row in rows:
+                assert isinstance(row, UnPartialStatedEventStreamRow)
+                self._get_state_group_for_event.invalidate((row.event_id,))
+
+        super().process_replication_rows(stream_name, instance_name, token, rows)
+
     async def get_room_version(self, room_id: str) -> RoomVersion:
         """Get the room_version of a given room
 
         Raises:
-- cgit 1.5.1
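Note: both stores in this commit follow the same cooperative pattern: handle
the streams you care about, then call super() so the other mixins in the
datastore's MRO also see the rows. A minimal sketch of that shape, with
hypothetical class and cache names:

    from typing import Any, Iterable

    from synapse.replication.tcp.streams import UnPartialStatedEventStream
    from synapse.storage._base import SQLBaseStore

    class ExampleWorkerStore(SQLBaseStore):  # hypothetical store mixin
        def process_replication_rows(
            self, stream_name: str, instance_name: str, token: int, rows: Iterable[Any]
        ) -> None:
            if stream_name == UnPartialStatedEventStream.NAME:
                for row in rows:
                    # Drop whatever this store caches per event.
                    self._example_cache.invalidate((row.event_id,))  # hypothetical

            super().process_replication_rows(stream_name, instance_name, token, rows)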

From cbf1cebef304190fda5bf90f1268e6238eb26888 Mon Sep 17 00:00:00 2001
From: "Olivier Wilkinson (reivilibre)"
Date: Tue, 20 Dec 2022 18:30:47 +0000
Subject: Mitigate jump to date slowness by adding 30s timeout

---
 synapse/storage/databases/main/events_worker.py | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'synapse/storage/databases')

diff --git a/synapse/storage/databases/main/events_worker.py b/synapse/storage/databases/main/events_worker.py
index 761b15a815..f80b494edb 100644
--- a/synapse/storage/databases/main/events_worker.py
+++ b/synapse/storage/databases/main/events_worker.py
@@ -2276,6 +2276,10 @@ class EventsWorkerStore(SQLBaseStore):
         """
 
         def get_event_id_for_timestamp_txn(txn: LoggingTransaction) -> Optional[str]:
+            if isinstance(self.database_engine, PostgresEngine):
+                # Temporary: make sure these queries can't last more than 30s
+                txn.execute("SET LOCAL statement_timeout = 30000")
+
             txn.execute(
                 sql_template,
                 (room_id, timestamp),
-- cgit 1.5.1
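Note: SET LOCAL statement_timeout = 30000 caps each query at 30,000 ms (30 s)
and, being LOCAL, lasts only for the enclosing transaction, so nothing needs
to reset it afterwards. A standalone demonstration with plain psycopg2; the
connection string is hypothetical:

    import psycopg2

    conn = psycopg2.connect("dbname=synapse")  # hypothetical DSN
    with conn, conn.cursor() as cur:
        cur.execute("SET LOCAL statement_timeout = 30000")  # this txn only
        cur.execute("SHOW statement_timeout")
        print(cur.fetchone())  # -> ('30s',)
    # The `with conn` block committed the transaction; new transactions on
    # this connection are back to the server's default statement_timeout.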