author     Mathieu Velten <mathieuv@matrix.org>  2023-03-30 19:41:14 +0200
committer  GitHub <noreply@github.com>  2023-03-30 19:41:14 +0200
commit     6f68e32bfbe439435410e81ac70fdca10f28fbf7 (patch)
tree       6871ba0e2d83b6057dbadec5bd2ad523e746f369
parent     Speed up SQLite unit test CI (#15334) (diff)
download   synapse-6f68e32bfbe439435410e81ac70fdca10f28fbf7.tar.xz
to_device updates could be dropped when consuming the replication stream (#15349)
Co-authored-by: reivilibre <oliverw@matrix.org>
-rw-r--r--  changelog.d/15349.bugfix                              1
-rw-r--r--  synapse/storage/databases/main/deviceinbox.py        14
-rw-r--r--  tests/replication/_base.py                            4
-rw-r--r--  tests/replication/tcp/streams/test_account_data.py    5
-rw-r--r--  tests/replication/tcp/streams/test_to_device.py      89
5 files changed, 98 insertions, 15 deletions
diff --git a/changelog.d/15349.bugfix b/changelog.d/15349.bugfix
new file mode 100644
index 0000000000..65ea7ae7eb
--- /dev/null
+++ b/changelog.d/15349.bugfix
@@ -0,0 +1 @@
+Fix a long-standing bug where some to_device messages could be dropped when using workers.
diff --git a/synapse/storage/databases/main/deviceinbox.py b/synapse/storage/databases/main/deviceinbox.py
index 0d75d9739a..b471fcb064 100644
--- a/synapse/storage/databases/main/deviceinbox.py
+++ b/synapse/storage/databases/main/deviceinbox.py
@@ -617,14 +617,14 @@ class DeviceInboxWorkerStore(SQLBaseStore):
             # We limit like this as we might have multiple rows per stream_id, and
             # we want to make sure we always get all entries for any stream_id
             # we return.
-            upper_pos = min(current_id, last_id + limit)
+            upto_token = min(current_id, last_id + limit)
             sql = (
                 "SELECT max(stream_id), user_id"
                 " FROM device_inbox"
                 " WHERE ? < stream_id AND stream_id <= ?"
                 " GROUP BY user_id"
             )
-            txn.execute(sql, (last_id, upper_pos))
+            txn.execute(sql, (last_id, upto_token))
             updates = [(row[0], row[1:]) for row in txn]
 
             sql = (
@@ -633,19 +633,13 @@ class DeviceInboxWorkerStore(SQLBaseStore):
                 " WHERE ? < stream_id AND stream_id <= ?"
                 " GROUP BY destination"
             )
-            txn.execute(sql, (last_id, upper_pos))
+            txn.execute(sql, (last_id, upto_token))
             updates.extend((row[0], row[1:]) for row in txn)
 
             # Order by ascending stream ordering
             updates.sort()
 
-            limited = False
-            upto_token = current_id
-            if len(updates) >= limit:
-                upto_token = updates[-1][0]
-                limited = True
-
-            return updates, upto_token, limited
+            return updates, upto_token, upto_token < current_id
 
         return await self.db_pool.runInteraction(
             "get_all_new_device_messages", get_all_new_device_messages_txn
diff --git a/tests/replication/_base.py b/tests/replication/_base.py
index 46a8e2013e..0f1a8a145f 100644
--- a/tests/replication/_base.py
+++ b/tests/replication/_base.py
@@ -54,6 +54,10 @@ class BaseStreamTestCase(unittest.HomeserverTestCase):
     if not hiredis:
         skip = "Requires hiredis"
 
+    if not USE_POSTGRES_FOR_TESTS:
+        # Redis replication only takes place on Postgres
+        skip = "Requires Postgres"
+
     def prepare(self, reactor: MemoryReactor, clock: Clock, hs: HomeServer) -> None:
         # build a replication server
         server_factory = ReplicationStreamProtocolFactory(hs)
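
The added skip assignment above relies on Twisted trial's convention that setting a skip attribute (a string) on a test class or test method makes the runner report it as skipped rather than execute it. A standalone sketch of that mechanism, with illustrative names:

# Sketch of trial's skip attribute: the string becomes the reported skip reason.
from twisted.trial import unittest


class ExampleSkippedCase(unittest.TestCase):
    skip = "Requires Postgres"  # skips every test in this class

    def test_never_runs(self) -> None:
        raise AssertionError("never executed when the class is skipped")
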
diff --git a/tests/replication/tcp/streams/test_account_data.py b/tests/replication/tcp/streams/test_account_data.py
index 01df1be047..b9075e3f20 100644
--- a/tests/replication/tcp/streams/test_account_data.py
+++ b/tests/replication/tcp/streams/test_account_data.py
@@ -37,11 +37,6 @@ class AccountDataStreamTestCase(BaseStreamTestCase):
         # also one global update
         self.get_success(store.add_account_data_for_user("test_user", "m.global", {}))
 
-        # tell the notifier to catch up to avoid duplicate rows.
-        # workaround for https://github.com/matrix-org/synapse/issues/7360
-        # FIXME remove this when the above is fixed
-        self.replicate()
-
         # check we're testing what we think we are: no rows should yet have been
         # received
         self.assertEqual([], self.test_handler.received_rdata_rows)
diff --git a/tests/replication/tcp/streams/test_to_device.py b/tests/replication/tcp/streams/test_to_device.py
new file mode 100644
index 0000000000..fb9eac668f
--- /dev/null
+++ b/tests/replication/tcp/streams/test_to_device.py
@@ -0,0 +1,89 @@
+# Copyright 2023 The Matrix.org Foundation C.I.C.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import logging
+
+import synapse
+from synapse.replication.tcp.streams._base import _STREAM_UPDATE_TARGET_ROW_COUNT
+from synapse.types import JsonDict
+
+from tests.replication._base import BaseStreamTestCase
+
+logger = logging.getLogger(__name__)
+
+
+class ToDeviceStreamTestCase(BaseStreamTestCase):
+    servlets = [
+        synapse.rest.admin.register_servlets,
+        synapse.rest.client.login.register_servlets,
+    ]
+
+    def test_to_device_stream(self) -> None:
+        store = self.hs.get_datastores().main
+
+        user1 = self.register_user("user1", "pass")
+        self.login("user1", "pass", "device")
+        user2 = self.register_user("user2", "pass")
+        self.login("user2", "pass", "device")
+
+        # connect to pull the updates related to users creation/login
+        self.reconnect()
+        self.replicate()
+        self.test_handler.received_rdata_rows.clear()
+        # disconnect so we can accumulate the updates without pulling them
+        self.disconnect()
+
+        msg: JsonDict = {}
+        msg["sender"] = "@sender:example.org"
+        msg["type"] = "m.new_device"
+
+        # add messages to the device inbox for user1 up until the
+        # limit defined for a stream update batch
+        for i in range(0, _STREAM_UPDATE_TARGET_ROW_COUNT):
+            msg["content"] = {"device": {}}
+            messages = {user1: {"device": msg}}
+
+            self.get_success(
+                store.add_messages_from_remote_to_device_inbox(
+                    "example.org",
+                    f"{i}",
+                    messages,
+                )
+            )
+
+        # add one more message, for user2 this time
+        # this message would be dropped before fixing #15335
+        msg["content"] = {"device": {}}
+        messages = {user2: {"device": msg}}
+
+        self.get_success(
+            store.add_messages_from_remote_to_device_inbox(
+                "example.org",
+                f"{_STREAM_UPDATE_TARGET_ROW_COUNT}",
+                messages,
+            )
+        )
+
+        # replication is disconnected so we shouldn't get any updates yet
+        self.assertEqual([], self.test_handler.received_rdata_rows)
+
+        # now reconnect to pull the updates
+        self.reconnect()
+        self.replicate()
+
+        # we should receive the fact that we have to_device updates
+        # for user1 and user2
+        received_rows = self.test_handler.received_rdata_rows
+        self.assertEqual(len(received_rows), 2)
+        self.assertEqual(received_rows[0][2].entity, user1)
+        self.assertEqual(received_rows[1][2].entity, user2)
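
The final assertions expect exactly two rows even though _STREAM_UPDATE_TARGET_ROW_COUNT + 1 messages were written, because the store query shown earlier groups pending device_inbox rows by recipient: every message for user1 collapses into a single update row, and the lone user2 message forms the second. A small standalone sketch of that grouping (illustrative only, not Synapse code; the user IDs and the count of 100 are example values):

# Per-recipient grouping that makes the test above see exactly two rows.
from typing import Dict, List, Tuple


def group_updates(rows: List[Tuple[int, str]]) -> List[Tuple[int, str]]:
    """Collapse raw (stream_id, user_id) rows to one row per user, keeping the
    highest stream_id, mirroring the GROUP BY user_id in the store query."""
    latest: Dict[str, int] = {}
    for stream_id, user_id in rows:
        latest[user_id] = max(stream_id, latest.get(user_id, 0))
    # Order by ascending stream id, as the store does before returning.
    return sorted((stream_id, user_id) for user_id, stream_id in latest.items())


# Many messages for @user1 plus one for @user2 collapse into just two updates.
rows = [(i, "@user1:test") for i in range(1, 101)] + [(101, "@user2:test")]
assert len(group_updates(rows)) == 2
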