From 49c9745b4516dec8728c260f1a6784f2c510110c Mon Sep 17 00:00:00 2001 From: Patrick Cloke Date: Wed, 18 Oct 2023 12:26:01 -0400 Subject: Avoid sending massive replication updates when purging a room. (#16510) --- synapse/replication/tcp/streams/events.py | 45 ++++++++++++++++++++++++++++++- 1 file changed, 44 insertions(+), 1 deletion(-) (limited to 'synapse/replication') diff --git a/synapse/replication/tcp/streams/events.py b/synapse/replication/tcp/streams/events.py index ad9b760713..da6d948e1b 100644 --- a/synapse/replication/tcp/streams/events.py +++ b/synapse/replication/tcp/streams/events.py @@ -13,6 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. import heapq +from collections import defaultdict from typing import TYPE_CHECKING, Iterable, Optional, Tuple, Type, TypeVar, cast import attr @@ -51,8 +52,19 @@ data part are: * The state_key of the state which has changed * The event id of the new state +A "state-all" row is sent whenever the "current state" in a room changes, but there are +too many state updates for a particular room in the same update. This replaces any +"state" rows on a per-room basis. The fields in the data part are: + +* The room id for the state changes + """ +# Any room with more than _MAX_STATE_UPDATES_PER_ROOM will send a EventsStreamAllStateRow +# instead of individual EventsStreamEventRow. This is predominantly useful when +# purging large rooms. +_MAX_STATE_UPDATES_PER_ROOM = 150 + @attr.s(slots=True, frozen=True, auto_attribs=True) class EventsStreamRow: @@ -111,9 +123,17 @@ class EventsStreamCurrentStateRow(BaseEventsStreamRow): event_id: Optional[str] +@attr.s(slots=True, frozen=True, auto_attribs=True) +class EventsStreamAllStateRow(BaseEventsStreamRow): + TypeId = "state-all" + + room_id: str + + _EventRows: Tuple[Type[BaseEventsStreamRow], ...] = ( EventsStreamEventRow, EventsStreamCurrentStateRow, + EventsStreamAllStateRow, ) TypeToRow = {Row.TypeId: Row for Row in _EventRows} @@ -213,9 +233,28 @@ class EventsStream(Stream): if stream_id <= upper_limit ) + # Separate out rooms that have many state updates, listeners should clear + # all state for those rooms. + state_updates_by_room = defaultdict(list) + for stream_id, room_id, _type, _state_key, _event_id in state_rows: + state_updates_by_room[room_id].append(stream_id) + + state_all_rows = [ + (stream_ids[-1], room_id) + for room_id, stream_ids in state_updates_by_room.items() + if len(stream_ids) >= _MAX_STATE_UPDATES_PER_ROOM + ] + state_all_updates: Iterable[Tuple[int, Tuple]] = ( + (max_stream_id, (EventsStreamAllStateRow.TypeId, (room_id,))) + for (max_stream_id, room_id) in state_all_rows + ) + + # Any remaining state updates are sent individually. + state_all_rooms = {room_id for _, room_id in state_all_rows} state_updates: Iterable[Tuple[int, Tuple]] = ( (stream_id, (EventsStreamCurrentStateRow.TypeId, rest)) for (stream_id, *rest) in state_rows + if rest[0] not in state_all_rooms ) ex_outliers_updates: Iterable[Tuple[int, Tuple]] = ( @@ -224,7 +263,11 @@ class EventsStream(Stream): ) # we need to return a sorted list, so merge them together. - updates = list(heapq.merge(event_updates, state_updates, ex_outliers_updates)) + updates = list( + heapq.merge( + event_updates, state_all_updates, state_updates, ex_outliers_updates + ) + ) return updates, upper_limit, limited @classmethod -- cgit 1.4.1