From 1ce59d7ba002a869ee94fbe375898cc79c6eb4d1 Mon Sep 17 00:00:00 2001
From: Erik Johnston <erikj@element.io>
Date: Tue, 2 Jul 2024 12:39:49 +0100
Subject: Fix sync waiting for an invalid token from the "future" (#17386)

Fixes https://github.com/element-hq/synapse/issues/17274, hopefully.

Basically, old versions of Synapse could advance streams without
persisting anything in the DB (fixed in #17229). On restart those
updates would get lost, and so the position of the stream would revert
to an older position. If this happened across an upgrade to a later
Synapse version which included #17215, then sync could get blocked
indefinitely (until the stream advanced to the position in the token).

We fix this by bounding the stream positions we'll wait for to the
maximum position of the underlying stream ID generator.
---
 tests/handlers/test_sync.py    | 73 +++++++++++++++++++++++++++++++++++++++++-
 tests/rest/client/test_sync.py |  4 ++-
 2 files changed, 75 insertions(+), 2 deletions(-)

(limited to 'tests')

diff --git a/tests/handlers/test_sync.py b/tests/handlers/test_sync.py
index 02371ce724..5319928c28 100644
--- a/tests/handlers/test_sync.py
+++ b/tests/handlers/test_sync.py
@@ -22,6 +22,7 @@ from unittest.mock import AsyncMock, Mock, patch
 
 from parameterized import parameterized
 
+from twisted.internet import defer
 from twisted.test.proto_helpers import MemoryReactor
 
 from synapse.api.constants import AccountDataTypes, EventTypes, JoinRules
@@ -35,7 +36,7 @@ from synapse.handlers.sync import SyncConfig, SyncRequestKey, SyncResult, SyncVe
 from synapse.rest import admin
 from synapse.rest.client import knock, login, room
 from synapse.server import HomeServer
-from synapse.types import JsonDict, UserID, create_requester
+from synapse.types import JsonDict, StreamKeyType, UserID, create_requester
 from synapse.util import Clock
 
 import tests.unittest
@@ -959,6 +960,76 @@ class SyncTestCase(tests.unittest.HomeserverTestCase):
 
         self.fail("No push rules found")
 
+    def test_wait_for_future_sync_token(self) -> None:
+        """Test that if we receive a token that is ahead of our current token,
+        we'll wait until the stream position advances.
+
+        This can happen if replication streams start lagging, and the client's
+        previous sync request was serviced by a worker ahead of ours.
+        """
+        user = self.register_user("alice", "password")
+
+        # We simulate a lagging stream by getting a stream ID from the ID gen
+        # and then waiting to mark it as "persisted".
+        presence_id_gen = self.store.get_presence_stream_id_gen()
+        ctx_mgr = presence_id_gen.get_next()
+        stream_id = self.get_success(ctx_mgr.__aenter__())
+
+        # Create the new token based on the stream ID above.
+        current_token = self.hs.get_event_sources().get_current_token()
+        since_token = current_token.copy_and_advance(StreamKeyType.PRESENCE, stream_id)
+
+        sync_d = defer.ensureDeferred(
+            self.sync_handler.wait_for_sync_for_user(
+                create_requester(user),
+                generate_sync_config(user),
+                sync_version=SyncVersion.SYNC_V2,
+                request_key=generate_request_key(),
+                since_token=since_token,
+                timeout=0,
+            )
+        )
+
+        # This should block waiting for the presence stream to update
+        self.pump()
+        self.assertFalse(sync_d.called)
+
+        # Marking the stream ID as persisted should unblock the request.
+        self.get_success(ctx_mgr.__aexit__(None, None, None))
+
+        self.get_success(sync_d, by=1.0)
+
+    def test_wait_for_invalid_future_sync_token(self) -> None:
+        """Like the previous test, except we give a token that has a stream
+        position ahead of what is in the DB, i.e. its invalid and we shouldn't
+        wait for the stream to advance (as it may never do so).
+
+        This can happen due to older versions of Synapse giving out stream
+        positions without persisting them in the DB, and so on restart the
+        stream would get reset back to an older position.
+        """
+        user = self.register_user("alice", "password")
+
+        # Create a token and arbitrarily advance one of the streams.
+        current_token = self.hs.get_event_sources().get_current_token()
+        since_token = current_token.copy_and_advance(
+            StreamKeyType.PRESENCE, current_token.presence_key + 1
+        )
+
+        sync_d = defer.ensureDeferred(
+            self.sync_handler.wait_for_sync_for_user(
+                create_requester(user),
+                generate_sync_config(user),
+                sync_version=SyncVersion.SYNC_V2,
+                request_key=generate_request_key(),
+                since_token=since_token,
+                timeout=0,
+            )
+        )
+
+        # We should return without waiting for the presence stream to advance.
+        self.get_success(sync_d)
+
 
 def generate_sync_config(
     user_id: str,
diff --git a/tests/rest/client/test_sync.py b/tests/rest/client/test_sync.py
index bfb26139d3..12c11f342c 100644
--- a/tests/rest/client/test_sync.py
+++ b/tests/rest/client/test_sync.py
@@ -1386,10 +1386,12 @@ class SlidingSyncTestCase(unittest.HomeserverTestCase):
         # Create a future token that will cause us to wait. Since we never send a new
         # event to reach that future stream_ordering, the worker will wait until the
         # full timeout.
+        stream_id_gen = self.store.get_events_stream_id_generator()
+        stream_id = self.get_success(stream_id_gen.get_next().__aenter__())
         current_token = self.event_sources.get_current_token()
         future_position_token = current_token.copy_and_replace(
             StreamKeyType.ROOM,
-            RoomStreamToken(stream=current_token.room_key.stream + 1),
+            RoomStreamToken(stream=stream_id),
         )
 
         future_position_token_serialized = self.get_success(
-- 
cgit 1.5.1


From b905ae27caac4bb27262d9d7ac6e834de5694f10 Mon Sep 17 00:00:00 2001
From: Erik Johnston <erikj@element.io>
Date: Tue, 2 Jul 2024 14:06:36 +0100
Subject: Fix regression when bounding future tokens (#17391)

Fix bug added in #17386, where we accidentally used `room_key` for the
receipts stream. See first commit.

Reviewable commit-by-commit
---
 changelog.d/17391.bugfix    |  1 +
 synapse/streams/events.py   | 26 ++++++++++++++++++++++----
 tests/handlers/test_sync.py | 37 +++++++++++++++++++++++++++++++------
 3 files changed, 54 insertions(+), 10 deletions(-)
 create mode 100644 changelog.d/17391.bugfix

(limited to 'tests')

diff --git a/changelog.d/17391.bugfix b/changelog.d/17391.bugfix
new file mode 100644
index 0000000000..9686b5c276
--- /dev/null
+++ b/changelog.d/17391.bugfix
@@ -0,0 +1 @@
+Fix bug where `/sync` requests could get blocked indefinitely after an upgrade from Synapse versions before v1.109.0.
diff --git a/synapse/streams/events.py b/synapse/streams/events.py
index 93d5ae1a55..856f646795 100644
--- a/synapse/streams/events.py
+++ b/synapse/streams/events.py
@@ -19,6 +19,7 @@
 #
 #
 
+import logging
 from typing import TYPE_CHECKING, Sequence, Tuple
 
 import attr
@@ -41,6 +42,9 @@ if TYPE_CHECKING:
     from synapse.server import HomeServer
 
 
+logger = logging.getLogger(__name__)
+
+
 @attr.s(frozen=True, slots=True, auto_attribs=True)
 class _EventSourcesInner:
     room: RoomEventSource
@@ -139,9 +143,16 @@ class EventSources:
                         key
                     ].get_max_allocated_token()
 
-                    token = token.copy_and_replace(
-                        key, token.room_key.bound_stream_token(max_token)
-                    )
+                    if max_token < token_value.get_max_stream_pos():
+                        logger.error(
+                            "Bounding token from the future '%s': token: %s, bound: %s",
+                            key,
+                            token_value,
+                            max_token,
+                        )
+                        token = token.copy_and_replace(
+                            key, token_value.bound_stream_token(max_token)
+                        )
             else:
                 assert isinstance(current_value, int)
                 if current_value < token_value:
@@ -149,7 +160,14 @@ class EventSources:
                         key
                     ].get_max_allocated_token()
 
-                    token = token.copy_and_replace(key, min(token_value, max_token))
+                    if max_token < token_value:
+                        logger.error(
+                            "Bounding token from the future '%s': token: %s, bound: %s",
+                            key,
+                            token_value,
+                            max_token,
+                        )
+                        token = token.copy_and_replace(key, max_token)
 
         return token
 
diff --git a/tests/handlers/test_sync.py b/tests/handlers/test_sync.py
index 5319928c28..674dd4fb54 100644
--- a/tests/handlers/test_sync.py
+++ b/tests/handlers/test_sync.py
@@ -36,7 +36,14 @@ from synapse.handlers.sync import SyncConfig, SyncRequestKey, SyncResult, SyncVe
 from synapse.rest import admin
 from synapse.rest.client import knock, login, room
 from synapse.server import HomeServer
-from synapse.types import JsonDict, StreamKeyType, UserID, create_requester
+from synapse.types import (
+    JsonDict,
+    MultiWriterStreamToken,
+    RoomStreamToken,
+    StreamKeyType,
+    UserID,
+    create_requester,
+)
 from synapse.util import Clock
 
 import tests.unittest
@@ -999,7 +1006,13 @@ class SyncTestCase(tests.unittest.HomeserverTestCase):
 
         self.get_success(sync_d, by=1.0)
 
-    def test_wait_for_invalid_future_sync_token(self) -> None:
+    @parameterized.expand(
+        [(key,) for key in StreamKeyType.__members__.values()],
+        name_func=lambda func, _, param: f"{func.__name__}_{param.args[0].name}",
+    )
+    def test_wait_for_invalid_future_sync_token(
+        self, stream_key: StreamKeyType
+    ) -> None:
         """Like the previous test, except we give a token that has a stream
         position ahead of what is in the DB, i.e. its invalid and we shouldn't
         wait for the stream to advance (as it may never do so).
@@ -1010,11 +1023,23 @@ class SyncTestCase(tests.unittest.HomeserverTestCase):
         """
         user = self.register_user("alice", "password")
 
-        # Create a token and arbitrarily advance one of the streams.
+        # Create a token and advance one of the streams.
         current_token = self.hs.get_event_sources().get_current_token()
-        since_token = current_token.copy_and_advance(
-            StreamKeyType.PRESENCE, current_token.presence_key + 1
-        )
+        token_value = current_token.get_field(stream_key)
+
+        # How we advance the streams depends on the type.
+        if isinstance(token_value, int):
+            since_token = current_token.copy_and_advance(stream_key, token_value + 1)
+        elif isinstance(token_value, MultiWriterStreamToken):
+            since_token = current_token.copy_and_advance(
+                stream_key, MultiWriterStreamToken(stream=token_value.stream + 1)
+            )
+        elif isinstance(token_value, RoomStreamToken):
+            since_token = current_token.copy_and_advance(
+                stream_key, RoomStreamToken(stream=token_value.stream + 1)
+            )
+        else:
+            raise Exception("Unreachable")
 
         sync_d = defer.ensureDeferred(
             self.sync_handler.wait_for_sync_for_user(
-- 
cgit 1.5.1


From 1609855ff8322e3d4d91f8aea322f9750ac24ba2 Mon Sep 17 00:00:00 2001
From: Erik Johnston <erikj@element.io>
Date: Mon, 1 Jul 2024 12:48:36 +0100
Subject: Limit size of presence EDUs (#17371)

Otherwise they are unbounded.

---------

Co-authored-by: Andrew Morgan <1342360+anoadragon453@users.noreply.github.com>
---
 changelog.d/17371.misc                             |   1 +
 synapse/federation/sender/per_destination_queue.py |  31 ++++--
 tests/federation/test_federation_sender.py         | 119 +++++++++++++++++++++
 3 files changed, 140 insertions(+), 11 deletions(-)
 create mode 100644 changelog.d/17371.misc

(limited to 'tests')

diff --git a/changelog.d/17371.misc b/changelog.d/17371.misc
new file mode 100644
index 0000000000..0fbf19f4fb
--- /dev/null
+++ b/changelog.d/17371.misc
@@ -0,0 +1 @@
+Limit size of presence EDUs to 50 entries.
diff --git a/synapse/federation/sender/per_destination_queue.py b/synapse/federation/sender/per_destination_queue.py
index d9f2f017ed..9f1c2fe22a 100644
--- a/synapse/federation/sender/per_destination_queue.py
+++ b/synapse/federation/sender/per_destination_queue.py
@@ -21,6 +21,7 @@
 #
 import datetime
 import logging
+from collections import OrderedDict
 from types import TracebackType
 from typing import TYPE_CHECKING, Dict, Hashable, Iterable, List, Optional, Tuple, Type
 
@@ -68,6 +69,10 @@ sent_edus_by_type = Counter(
 # If the retry interval is larger than this then we enter "catchup" mode
 CATCHUP_RETRY_INTERVAL = 60 * 60 * 1000
 
+# Limit how many presence states we add to each presence EDU, to ensure that
+# they are bounded in size.
+MAX_PRESENCE_STATES_PER_EDU = 50
+
 
 class PerDestinationQueue:
     """
@@ -144,7 +149,7 @@ class PerDestinationQueue:
 
         # Map of user_id -> UserPresenceState of pending presence to be sent to this
         # destination
-        self._pending_presence: Dict[str, UserPresenceState] = {}
+        self._pending_presence: OrderedDict[str, UserPresenceState] = OrderedDict()
 
         # List of room_id -> receipt_type -> user_id -> receipt_dict,
         #
@@ -399,7 +404,7 @@ class PerDestinationQueue:
                 # through another mechanism, because this is all volatile!
                 self._pending_edus = []
                 self._pending_edus_keyed = {}
-                self._pending_presence = {}
+                self._pending_presence.clear()
                 self._pending_receipt_edus = []
 
                 self._start_catching_up()
@@ -721,22 +726,26 @@ class _TransactionQueueManager:
 
         # Add presence EDU.
         if self.queue._pending_presence:
+            # Only send max 50 presence entries in the EDU, to bound the amount
+            # of data we're sending.
+            presence_to_add: List[JsonDict] = []
+            while (
+                self.queue._pending_presence
+                and len(presence_to_add) < MAX_PRESENCE_STATES_PER_EDU
+            ):
+                _, presence = self.queue._pending_presence.popitem(last=False)
+                presence_to_add.append(
+                    format_user_presence_state(presence, self.queue._clock.time_msec())
+                )
+
             pending_edus.append(
                 Edu(
                     origin=self.queue._server_name,
                     destination=self.queue._destination,
                     edu_type=EduTypes.PRESENCE,
-                    content={
-                        "push": [
-                            format_user_presence_state(
-                                presence, self.queue._clock.time_msec()
-                            )
-                            for presence in self.queue._pending_presence.values()
-                        ]
-                    },
+                    content={"push": presence_to_add},
                 )
             )
-            self.queue._pending_presence = {}
 
         # Add read receipt EDUs.
         pending_edus.extend(self.queue._get_receipt_edus(force_flush=False, limit=5))
diff --git a/tests/federation/test_federation_sender.py b/tests/federation/test_federation_sender.py
index 9073afc70e..6a8887fe74 100644
--- a/tests/federation/test_federation_sender.py
+++ b/tests/federation/test_federation_sender.py
@@ -27,6 +27,8 @@ from twisted.internet import defer
 from twisted.test.proto_helpers import MemoryReactor
 
 from synapse.api.constants import EduTypes, RoomEncryptionAlgorithms
+from synapse.api.presence import UserPresenceState
+from synapse.federation.sender.per_destination_queue import MAX_PRESENCE_STATES_PER_EDU
 from synapse.federation.units import Transaction
 from synapse.handlers.device import DeviceHandler
 from synapse.rest import admin
@@ -266,6 +268,123 @@ class FederationSenderReceiptsTestCases(HomeserverTestCase):
         )
 
 
+class FederationSenderPresenceTestCases(HomeserverTestCase):
+    """
+    Test federation sending for presence updates.
+    """
+
+    def make_homeserver(self, reactor: MemoryReactor, clock: Clock) -> HomeServer:
+        self.federation_transport_client = Mock(spec=["send_transaction"])
+        self.federation_transport_client.send_transaction = AsyncMock()
+        hs = self.setup_test_homeserver(
+            federation_transport_client=self.federation_transport_client,
+        )
+
+        return hs
+
+    def default_config(self) -> JsonDict:
+        config = super().default_config()
+        config["federation_sender_instances"] = None
+        return config
+
+    def test_presence_simple(self) -> None:
+        "Test that sending a single presence update works"
+
+        mock_send_transaction: AsyncMock = (
+            self.federation_transport_client.send_transaction
+        )
+        mock_send_transaction.return_value = {}
+
+        sender = self.hs.get_federation_sender()
+        self.get_success(
+            sender.send_presence_to_destinations(
+                [UserPresenceState.default("@user:test")],
+                ["server"],
+            )
+        )
+
+        self.pump()
+
+        # expect a call to send_transaction
+        mock_send_transaction.assert_awaited_once()
+
+        json_cb = mock_send_transaction.call_args[0][1]
+        data = json_cb()
+        self.assertEqual(
+            data["edus"],
+            [
+                {
+                    "edu_type": EduTypes.PRESENCE,
+                    "content": {
+                        "push": [
+                            {
+                                "presence": "offline",
+                                "user_id": "@user:test",
+                            }
+                        ]
+                    },
+                }
+            ],
+        )
+
+    def test_presence_batched(self) -> None:
+        """Test that sending lots of presence updates to a destination are
+        batched, rather than having them all sent in one EDU."""
+
+        mock_send_transaction: AsyncMock = (
+            self.federation_transport_client.send_transaction
+        )
+        mock_send_transaction.return_value = {}
+
+        sender = self.hs.get_federation_sender()
+
+        # We now send lots of presence updates to force the federation sender to
+        # batch the mup.
+        number_presence_updates_to_send = MAX_PRESENCE_STATES_PER_EDU * 2
+        self.get_success(
+            sender.send_presence_to_destinations(
+                [
+                    UserPresenceState.default(f"@user{i}:test")
+                    for i in range(number_presence_updates_to_send)
+                ],
+                ["server"],
+            )
+        )
+
+        self.pump()
+
+        # We should have seen at least one transcation be sent by now.
+        mock_send_transaction.assert_called()
+
+        # We don't want to specify exactly how the presence EDUs get sent out,
+        # could be one per transaction or multiple per transaction. We just want
+        # to assert that a) each presence EDU has bounded number of updates, and
+        # b) that all updates get sent out.
+        presence_edus = []
+        for transaction_call in mock_send_transaction.call_args_list:
+            json_cb = transaction_call[0][1]
+            data = json_cb()
+
+            for edu in data["edus"]:
+                self.assertEqual(edu.get("edu_type"), EduTypes.PRESENCE)
+                presence_edus.append(edu)
+
+        # A set of all user presence we see, this should end up matching the
+        # number we sent out above.
+        seen_users: Set[str] = set()
+
+        for edu in presence_edus:
+            presence_states = edu["content"]["push"]
+
+            # This is where we actually check that the number of presence
+            # updates is bounded.
+            self.assertLessEqual(len(presence_states), MAX_PRESENCE_STATES_PER_EDU)
+
+            seen_users.update(p["user_id"] for p in presence_states)
+
+        self.assertEqual(len(seen_users), number_presence_updates_to_send)
+
+
 class FederationSenderDevicesTestCases(HomeserverTestCase):
     """
     Test federation sending to update devices.
-- 
cgit 1.5.1