On catchup, process each row with its own stream id (#7286)

Other parts of the code (such as the StreamChangeCache) assume that there will not be multiple changes with the same stream id. This code was introduced in #7024, and I hope this fixes #7206.
author: Richard van der Hoff <1389908+richvdh@users.noreply.github.com> 2020-04-20 11:43:29 +0100
committer: GitHub <noreply@github.com> 2020-04-20 11:43:29 +0100
commit: 0f8f02bc39cf1879780f82bdb8dd581588d14cca (patch)
tree: 3c458ed62b2468cb45e357120fc05c64a78bcc88 /synapse
parent: Use a template for the SSO success page to allow for customization. (#7279) (diff)
download: synapse-0f8f02bc39cf1879780f82bdb8dd581588d14cca.tar.xz
2 files changed, 71 insertions, 5 deletions
diff --git a/synapse/replication/tcp/handler.py b/synapse/replication/tcp/handler.py
index 2f5a299141..e32e68e8c4 100644
--- a/synapse/replication/tcp/handler.py
+++ b/synapse/replication/tcp/handler.py
@@ -15,7 +15,18 @@
 # limitations under the License.
 
 import logging
-from typing import Any, Callable, Dict, List, Optional, Set
+from typing import (
+    Any,
+    Callable,
+    Dict,
+    Iterable,
+    Iterator,
+    List,
+    Optional,
+    Set,
+    Tuple,
+    TypeVar,
+)
 
 from prometheus_client import Counter
 
@@ -268,11 +279,14 @@ class ReplicationCommandHandler:
                     missing_updates,
                 ) = await stream.get_updates_since(current_token, cmd.token)
 
-                if updates:
+                # TODO: add some tests for this
+
+                # Some streams return multiple rows with the same stream IDs,
+                # which need to be processed in batches.
+
+                for token, rows in _batch_updates(updates):
                     await self.on_rdata(
-                        cmd.stream_name,
-                        current_token,
-                        [stream.parse_row(update[1]) for update in updates],
+                        cmd.stream_name, token, [stream.parse_row(row) for row in rows],
                     )
 
             # We've now caught up to position sent to us, notify handler.
@@ -404,3 +418,52 @@ class ReplicationCommandHandler:
         We need to check if the client is interested in the stream or not
         """
         self.send_command(RdataCommand(stream_name, token, data))
+
+
+UpdateToken = TypeVar("UpdateToken")
+UpdateRow = TypeVar("UpdateRow")
+
+
+def _batch_updates(
+    updates: Iterable[Tuple[UpdateToken, UpdateRow]]
+) -> Iterator[Tuple[UpdateToken, List[UpdateRow]]]:
+    """Collect stream updates with the same token together
+
+    Given a series of updates returned by Stream.get_updates_since(), collects
+    the updates which share the same stream_id together.
+
+    For example:
+
+        [(1, a), (1, b), (2, c), (3, d), (3, e)]
+
+    becomes:
+
+        [
+            (1, [a, b]),
+            (2, [c]),
+            (3, [d, e]),
+        ]
+    """
+
+    update_iter = iter(updates)
+
+    first_update = next(update_iter, None)
+    if first_update is None:
+        # empty input
+        return
+
+    current_batch_token = first_update[0]
+    current_batch = [first_update[1]]
+
+    for token, row in update_iter:
+        if token != current_batch_token:
+            # different token to the previous row: flush the previous
+            # batch and start anew
+            yield current_batch_token, current_batch
+            current_batch_token = token
+            current_batch = []
+
+        current_batch.append(row)
+
+    # flush the final batch
+    yield current_batch_token, current_batch
diff --git a/synapse/util/caches/stream_change_cache.py b/synapse/util/caches/stream_change_cache.py
index 235f64049c..c61d36a82e 100644
--- a/synapse/util/caches/stream_change_cache.py
+++ b/synapse/util/caches/stream_change_cache.py
@@ -126,6 +126,9 @@ class StreamChangeCache(object):
         """
         assert type(stream_pos) is int
 
+        # FIXME: add a sanity check here that we are not overwriting existing
+        # data in self._cache
+
         if stream_pos > self._earliest_known_stream_pos:
             old_pos = self._entity_to_key.get(entity, None)
             if old_pos is not None:
author	Richard van der Hoff <1389908+richvdh@users.noreply.github.com>	2020-04-20 11:43:29 +0100
committer	GitHub <noreply@github.com>	2020-04-20 11:43:29 +0100
commit	0f8f02bc39cf1879780f82bdb8dd581588d14cca (patch)
tree	3c458ed62b2468cb45e357120fc05c64a78bcc88 /synapse
parent	Use a template for the SSO success page to allow for customization. (#7279) (diff)
download	synapse-0f8f02bc39cf1879780f82bdb8dd581588d14cca.tar.xz