Merge branch 'release-v1.13.0' of github.com:matrix-org/synapse into dinsic-release-v1.14.x

* 'release-v1.13.0' of github.com:matrix-org/synapse: (257 commits) Update changelog based on feedback. Move warnings in the changelog and re-iterate changes to branches. 1.13.0 update dh-virtualenv (#7526) 1.13.0rc3 Hash passwords earlier in the registration process (#7523) 1.13.0rc2 1.13.0rc2 Stop `get_joined_users` corruption from custom statuses (#7376) Do not validate that the client dict is stable during UI Auth. (#7483) Fix new flake8 errors (#7489) Don't UPGRADE database rows RST indenting Put rollback instructions in upgrade notes Fix changelog typo Oh yeah, RST Absolute URL it is then Fix upgrade notes link Provide summary of upgrade issues in changelog. Fix ) Move next version notes from changelog to upgrade notes ...
author: Andrew Morgan <andrew@amorgan.xyz> 2020-06-10 16:58:10 +0100
committer: Andrew Morgan <andrew@amorgan.xyz> 2020-06-10 16:58:10 +0100
commit: e8a7a853f8e73b6ea27a1e02bd9786114bfcec3b (patch)
tree: 711dab9b030223706473bf0b8279b0eb86bca317 /synapse/replication/tcp/streams/events.py
parent: Merge pull request #39 from matrix-org/dinsic-release-v1.12.x (diff)
parent: Update changelog based on feedback. (diff)
download: synapse-e8a7a853f8e73b6ea27a1e02bd9786114bfcec3b.tar.xz
1 files changed, 97 insertions, 17 deletions
diff --git a/synapse/replication/tcp/streams/events.py b/synapse/replication/tcp/streams/events.py
index b3afabb8cd..890e75d827 100644
--- a/synapse/replication/tcp/streams/events.py
+++ b/synapse/replication/tcp/streams/events.py
@@ -15,11 +15,12 @@
 # limitations under the License.
 
 import heapq
-from typing import Tuple, Type
+from collections import Iterable
+from typing import List, Tuple, Type
 
 import attr
 
-from ._base import Stream
+from ._base import Stream, StreamUpdateResult, Token
 
 
 """Handling of the 'events' replication stream
@@ -116,28 +117,107 @@ class EventsStream(Stream):
 
     def __init__(self, hs):
         self._store = hs.get_datastore()
-        self.current_token = self._store.get_current_events_token  # type: ignore
+        super().__init__(
+            hs.get_instance_name(),
+            self._store.get_current_events_token,
+            self._update_function,
+        )
 
-        super(EventsStream, self).__init__(hs)
+    async def _update_function(
+        self,
+        instance_name: str,
+        from_token: Token,
+        current_token: Token,
+        target_row_count: int,
+    ) -> StreamUpdateResult:
+
+        # the events stream merges together three separate sources:
+        #  * new events
+        #  * current_state changes
+        #  * events which were previously outliers, but have now been de-outliered.
+        #
+        # The merge operation is complicated by the fact that we only have a single
+        # "stream token" which is supposed to indicate how far we have got through
+        # all three streams. It's therefore no good to return rows 1-1000 from the
+        # "new events" table if the state_deltas are limited to rows 1-100 by the
+        # target_row_count.
+        #
+        # In other words: we must pick a new upper limit, and must return *all* rows
+        # up to that point for each of the three sources.
+        #
+        # Start by trying to split the target_row_count up. We expect to have a
+        # negligible number of ex-outliers, and a rough approximation based on recent
+        # traffic on sw1v.org shows that there are approximately the same number of
+        # event rows between a given pair of stream ids as there are state
+        # updates, so let's split our target_row_count among those two types. The target
+        # is only an approximation - it doesn't matter if we end up going a bit over it.
+
+        target_row_count //= 2
+
+        # now we fetch up to that many rows from the events table
 
-    async def update_function(self, from_token, current_token, limit=None):
         event_rows = await self._store.get_all_new_forward_event_rows(
-            from_token, current_token, limit
-        )
-        event_updates = (
-            (row[0], EventsStreamEventRow.TypeId, row[1:]) for row in event_rows
+            from_token, current_token, target_row_count
+        )  # type: List[Tuple]
+
+        # we rely on get_all_new_forward_event_rows strictly honouring the limit, so
+        # that we know it is safe to just take upper_limit = event_rows[-1][0].
+        assert (
+            len(event_rows) <= target_row_count
+        ), "get_all_new_forward_event_rows did not honour row limit"
+
+        # if we hit the limit on event_updates, there's no point in going beyond the
+        # last stream_id in the batch for the other sources.
+
+        if len(event_rows) == target_row_count:
+            limited = True
+            upper_limit = event_rows[-1][0]  # type: int
+        else:
+            limited = False
+            upper_limit = current_token
+
+        # next up is the state delta table.
+        (
+            state_rows,
+            upper_limit,
+            state_rows_limited,
+        ) = await self._store.get_all_updated_current_state_deltas(
+            from_token, upper_limit, target_row_count
         )
 
-        state_rows = await self._store.get_all_updated_current_state_deltas(
-            from_token, current_token, limit
-        )
-        state_updates = (
-            (row[0], EventsStreamCurrentStateRow.TypeId, row[1:]) for row in state_rows
-        )
+        limited = limited or state_rows_limited
 
-        all_updates = heapq.merge(event_updates, state_updates)
+        # finally, fetch the ex-outliers rows. We assume there are few enough of these
+        # not to bother with the limit.
 
-        return all_updates
+        ex_outliers_rows = await self._store.get_ex_outlier_stream_rows(
+            from_token, upper_limit
+        )  # type: List[Tuple]
+
+        # we now need to turn the raw database rows returned into tuples suitable
+        # for the replication protocol (basically, we add an identifier to
+        # distinguish the row type). At the same time, we can limit the event_rows
+        # to the max stream_id from state_rows.
+
+        event_updates = (
+            (stream_id, (EventsStreamEventRow.TypeId, rest))
+            for (stream_id, *rest) in event_rows
+            if stream_id <= upper_limit
+        )  # type: Iterable[Tuple[int, Tuple]]
+
+        state_updates = (
+            (stream_id, (EventsStreamCurrentStateRow.TypeId, rest))
+            for (stream_id, *rest) in state_rows
+        )  # type: Iterable[Tuple[int, Tuple]]
+
+        ex_outliers_updates = (
+            (stream_id, (EventsStreamEventRow.TypeId, rest))
+            for (stream_id, *rest) in ex_outliers_rows
+        )  # type: Iterable[Tuple[int, Tuple]]
+
+        # we need to return a sorted list, so merge them together.
+        updates = list(heapq.merge(event_updates, state_updates, ex_outliers_updates))
+        return updates, upper_limit, limited
 
     @classmethod
     def parse_row(cls, row):
author	Andrew Morgan <andrew@amorgan.xyz>	2020-06-10 16:58:10 +0100
committer	Andrew Morgan <andrew@amorgan.xyz>	2020-06-10 16:58:10 +0100
commit	e8a7a853f8e73b6ea27a1e02bd9786114bfcec3b (patch)
tree	711dab9b030223706473bf0b8279b0eb86bca317 /synapse/replication/tcp/streams/events.py
parent	Merge pull request #39 from matrix-org/dinsic-release-v1.12.x (diff)
parent	Update changelog based on feedback. (diff)
download	synapse-e8a7a853f8e73b6ea27a1e02bd9786114bfcec3b.tar.xz