From 58f830511486271da72543dd20676b702bc52b2f Mon Sep 17 00:00:00 2001
From: Anshul Madnawat <100751856+anshulm333@users.noreply.github.com>
Date: Thu, 27 Jul 2023 00:15:47 +0530
Subject: Inline SQL queries using boolean parameters (#15525)

SQLite now supports TRUE and FALSE constants, simplify some
queries by inlining those instead of passing them as arguments.
---
 synapse/storage/databases/main/event_federation.py |  3 +--
 synapse/storage/databases/main/events.py           | 12 ++++++------
 synapse/storage/databases/main/purge_events.py     |  9 ++++-----
 synapse/storage/databases/main/push_rule.py        |  6 +++---
 synapse/storage/databases/main/registration.py     |  4 ++--
 synapse/storage/databases/main/room.py             | 10 +++++-----
 synapse/storage/databases/main/stream.py           |  4 ++--
 7 files changed, 23 insertions(+), 25 deletions(-)

(limited to 'synapse')

diff --git a/synapse/storage/databases/main/event_federation.py b/synapse/storage/databases/main/event_federation.py
index b2cda52ce5..534dc32413 100644
--- a/synapse/storage/databases/main/event_federation.py
+++ b/synapse/storage/databases/main/event_federation.py
@@ -843,7 +843,7 @@ class EventFederationWorkerStore(SignatureWorkerStore, EventsWorkerStore, SQLBas
                      * because the schema change is in a background update, it's not
                      * necessarily safe to assume that it will have been completed.
                      */
-                    AND edge.is_state is ? /* False */
+                    AND edge.is_state is FALSE
                     /**
                      * We only want backwards extremities that are older than or at
                      * the same position of the given `current_depth` (where older
@@ -886,7 +886,6 @@ class EventFederationWorkerStore(SignatureWorkerStore, EventsWorkerStore, SQLBas
                 sql,
                 (
                     room_id,
-                    False,
                     current_depth,
                     self._clock.time_msec(),
                     BACKFILL_EVENT_EXPONENTIAL_BACKOFF_MAXIMUM_DOUBLING_STEPS,
diff --git a/synapse/storage/databases/main/events.py b/synapse/storage/databases/main/events.py
index 2b83a69426..bd3f14fb71 100644
--- a/synapse/storage/databases/main/events.py
+++ b/synapse/storage/databases/main/events.py
@@ -1455,8 +1455,8 @@ class PersistEventsStore:
                     },
                 )
 
-                sql = "UPDATE events SET outlier = ? WHERE event_id = ?"
-                txn.execute(sql, (False, event.event_id))
+                sql = "UPDATE events SET outlier = FALSE WHERE event_id = ?"
+                txn.execute(sql, (event.event_id,))
 
                 # Update the event_backward_extremities table now that this
                 # event isn't an outlier any more.
@@ -1549,13 +1549,13 @@ class PersistEventsStore:
             for event, _ in events_and_contexts
             if not event.internal_metadata.is_redacted()
         ]
-        sql = "UPDATE redactions SET have_censored = ? WHERE "
+        sql = "UPDATE redactions SET have_censored = FALSE WHERE "
         clause, args = make_in_list_sql_clause(
             self.database_engine,
             "redacts",
             unredacted_events,
         )
-        txn.execute(sql + clause, [False] + args)
+        txn.execute(sql + clause, args)
 
         self.db_pool.simple_insert_many_txn(
             txn,
@@ -2318,14 +2318,14 @@ class PersistEventsStore:
             "   SELECT 1 FROM events"
             "   LEFT JOIN event_edges edge"
             "   ON edge.event_id = events.event_id"
-            "   WHERE events.event_id = ? AND events.room_id = ? AND (events.outlier = ? OR edge.event_id IS NULL)"
+            "   WHERE events.event_id = ? AND events.room_id = ? AND (events.outlier = FALSE OR edge.event_id IS NULL)"
             " )"
         )
 
         txn.execute_batch(
             query,
             [
-                (e_id, ev.room_id, e_id, ev.room_id, e_id, ev.room_id, False)
+                (e_id, ev.room_id, e_id, ev.room_id, e_id, ev.room_id)
                 for ev in events
                 for e_id in ev.prev_event_ids()
                 if not ev.internal_metadata.is_outlier()
diff --git a/synapse/storage/databases/main/purge_events.py b/synapse/storage/databases/main/purge_events.py
index 9773c1fcd2..b52f48cf04 100644
--- a/synapse/storage/databases/main/purge_events.py
+++ b/synapse/storage/databases/main/purge_events.py
@@ -249,12 +249,11 @@ class PurgeEventsStore(StateGroupWorkerStore, CacheInvalidationWorkerStore):
         # Mark all state and own events as outliers
         logger.info("[purge] marking remaining events as outliers")
         txn.execute(
-            "UPDATE events SET outlier = ?"
+            "UPDATE events SET outlier = TRUE"
             " WHERE event_id IN ("
-            "    SELECT event_id FROM events_to_purge "
-            "    WHERE NOT should_delete"
-            ")",
-            (True,),
+            "   SELECT event_id FROM events_to_purge "
+            "   WHERE NOT should_delete"
+            ")"
         )
 
         # synapse tries to take out an exclusive lock on room_depth whenever it
diff --git a/synapse/storage/databases/main/push_rule.py b/synapse/storage/databases/main/push_rule.py
index e098ceea3c..c13c0bc7d7 100644
--- a/synapse/storage/databases/main/push_rule.py
+++ b/synapse/storage/databases/main/push_rule.py
@@ -560,19 +560,19 @@ class PushRuleStore(PushRulesWorkerStore):
         if isinstance(self.database_engine, PostgresEngine):
             sql = """
                 INSERT INTO push_rules_enable (id, user_name, rule_id, enabled)
-                VALUES (?, ?, ?, ?)
+                VALUES (?, ?, ?, 1)
                 ON CONFLICT DO NOTHING
             """
         elif isinstance(self.database_engine, Sqlite3Engine):
             sql = """
                 INSERT OR IGNORE INTO push_rules_enable (id, user_name, rule_id, enabled)
-                VALUES (?, ?, ?, ?)
+                VALUES (?, ?, ?, 1)
             """
         else:
             raise RuntimeError("Unknown database engine")
 
         new_enable_id = self._push_rules_enable_id_gen.get_next()
-        txn.execute(sql, (new_enable_id, user_id, rule_id, 1))
+        txn.execute(sql, (new_enable_id, user_id, rule_id))
 
     async def delete_push_rule(self, user_id: str, rule_id: str) -> None:
         """
diff --git a/synapse/storage/databases/main/registration.py b/synapse/storage/databases/main/registration.py
index 676d03bb7e..c582cf0573 100644
--- a/synapse/storage/databases/main/registration.py
+++ b/synapse/storage/databases/main/registration.py
@@ -454,9 +454,9 @@ class RegistrationWorkerStore(CacheInvalidationWorkerStore):
         ) -> List[Tuple[str, int]]:
             sql = (
                 "SELECT user_id, expiration_ts_ms FROM account_validity"
-                " WHERE email_sent = ? AND (expiration_ts_ms - ?) <= ?"
+                " WHERE email_sent = FALSE AND (expiration_ts_ms - ?) <= ?"
             )
-            values = [False, now_ms, renew_at]
+            values = [now_ms, renew_at]
             txn.execute(sql, values)
             return cast(List[Tuple[str, int]], txn.fetchall())
 
diff --git a/synapse/storage/databases/main/room.py b/synapse/storage/databases/main/room.py
index 830658f328..719e11aea6 100644
--- a/synapse/storage/databases/main/room.py
+++ b/synapse/storage/databases/main/room.py
@@ -936,11 +936,11 @@ class RoomWorkerStore(CacheInvalidationWorkerStore):
             JOIN event_json USING (room_id, event_id)
             WHERE room_id = ?
                 %(where_clause)s
-                AND contains_url = ? AND outlier = ?
+                AND contains_url = TRUE AND outlier = FALSE
             ORDER BY stream_ordering DESC
             LIMIT ?
         """
-        txn.execute(sql % {"where_clause": ""}, (room_id, True, False, 100))
+        txn.execute(sql % {"where_clause": ""}, (room_id, 100))
 
         local_media_mxcs = []
         remote_media_mxcs = []
@@ -976,7 +976,7 @@ class RoomWorkerStore(CacheInvalidationWorkerStore):
 
             txn.execute(
                 sql % {"where_clause": "AND stream_ordering < ?"},
-                (room_id, next_token, True, False, 100),
+                (room_id, next_token, 100),
             )
 
         return local_media_mxcs, remote_media_mxcs
@@ -1086,9 +1086,9 @@ class RoomWorkerStore(CacheInvalidationWorkerStore):
 
         # set quarantine
         if quarantined_by is not None:
-            sql += "AND safe_from_quarantine = ?"
+            sql += "AND safe_from_quarantine = FALSE"
             txn.executemany(
-                sql, [(quarantined_by, media_id, False) for media_id in local_mxcs]
+                sql, [(quarantined_by, media_id) for media_id in local_mxcs]
             )
         # remove from quarantine
         else:
diff --git a/synapse/storage/databases/main/stream.py b/synapse/storage/databases/main/stream.py
index 92cbe262a6..5a3611c415 100644
--- a/synapse/storage/databases/main/stream.py
+++ b/synapse/storage/databases/main/stream.py
@@ -1401,7 +1401,7 @@ class StreamWorkerStore(EventsWorkerStore, SQLBaseStore):
             `to_token`), or `limit` is zero.
         """
 
-        args = [False, room_id]
+        args: List[Any] = [room_id]
 
         order, from_bound, to_bound = generate_pagination_bounds(
             direction, from_token, to_token
@@ -1475,7 +1475,7 @@ class StreamWorkerStore(EventsWorkerStore, SQLBaseStore):
                 event.topological_ordering, event.stream_ordering
             FROM events AS event
             %(join_clause)s
-            WHERE event.outlier = ? AND event.room_id = ? AND %(bounds)s
+            WHERE event.outlier = FALSE AND event.room_id = ? AND %(bounds)s
             ORDER BY event.topological_ordering %(order)s,
             event.stream_ordering %(order)s LIMIT ?
         """ % {
-- 
cgit 1.5.1


From f98f4f2e16a01928e0d442fef4669a1e3fca9b0f Mon Sep 17 00:00:00 2001
From: Shay <hillerys@element.io>
Date: Wed, 26 Jul 2023 12:59:47 -0700
Subject: Remove support for legacy application service paths (#15964)

---
 changelog.d/15964.removal    |  1 +
 synapse/appservice/api.py    | 82 ++++++--------------------------------------
 tests/appservice/test_api.py | 53 ----------------------------
 3 files changed, 12 insertions(+), 124 deletions(-)
 create mode 100644 changelog.d/15964.removal

(limited to 'synapse')

diff --git a/changelog.d/15964.removal b/changelog.d/15964.removal
new file mode 100644
index 0000000000..7613afe505
--- /dev/null
+++ b/changelog.d/15964.removal
@@ -0,0 +1 @@
+Remove support for legacy application service paths.
diff --git a/synapse/appservice/api.py b/synapse/appservice/api.py
index 5fb3d5083d..359999f680 100644
--- a/synapse/appservice/api.py
+++ b/synapse/appservice/api.py
@@ -17,8 +17,6 @@ import urllib.parse
 from typing import (
     TYPE_CHECKING,
     Any,
-    Awaitable,
-    Callable,
     Dict,
     Iterable,
     List,
@@ -30,7 +28,7 @@ from typing import (
 )
 
 from prometheus_client import Counter
-from typing_extensions import Concatenate, ParamSpec, TypeGuard
+from typing_extensions import ParamSpec, TypeGuard
 
 from synapse.api.constants import EventTypes, Membership, ThirdPartyEntityKind
 from synapse.api.errors import CodeMessageException, HttpResponseException
@@ -80,9 +78,7 @@ sent_todevice_counter = Counter(
 
 HOUR_IN_MS = 60 * 60 * 1000
 
-
 APP_SERVICE_PREFIX = "/_matrix/app/v1"
-APP_SERVICE_UNSTABLE_PREFIX = "/_matrix/app/unstable"
 
 P = ParamSpec("P")
 R = TypeVar("R")
@@ -128,47 +124,6 @@ class ApplicationServiceApi(SimpleHttpClient):
             hs.get_clock(), "as_protocol_meta", timeout_ms=HOUR_IN_MS
         )
 
-    async def _send_with_fallbacks(
-        self,
-        service: "ApplicationService",
-        prefixes: List[str],
-        path: str,
-        func: Callable[Concatenate[str, P], Awaitable[R]],
-        *args: P.args,
-        **kwargs: P.kwargs,
-    ) -> R:
-        """
-        Attempt to call an application service with multiple paths, falling back
-        until one succeeds.
-
-        Args:
-            service: The appliacation service, this provides the base URL.
-            prefixes: A last of paths to try in order for the requests.
-            path: A suffix to append to each prefix.
-            func: The function to call, the first argument will be the full
-                endpoint to fetch. Other arguments are provided by args/kwargs.
-
-        Returns:
-            The return value of func.
-        """
-        for i, prefix in enumerate(prefixes, start=1):
-            uri = f"{service.url}{prefix}{path}"
-            try:
-                return await func(uri, *args, **kwargs)
-            except HttpResponseException as e:
-                # If an error is received that is due to an unrecognised path,
-                # fallback to next path (if one exists). Otherwise, consider it
-                # a legitimate error and raise.
-                if i < len(prefixes) and is_unknown_endpoint(e):
-                    continue
-                raise
-            except Exception:
-                # Unexpected exceptions get sent to the caller.
-                raise
-
-        # The function should always exit via the return or raise above this.
-        raise RuntimeError("Unexpected fallback behaviour. This should never be seen.")
-
     async def query_user(self, service: "ApplicationService", user_id: str) -> bool:
         if service.url is None:
             return False
@@ -177,11 +132,8 @@ class ApplicationServiceApi(SimpleHttpClient):
         assert service.hs_token is not None
 
         try:
-            response = await self._send_with_fallbacks(
-                service,
-                [APP_SERVICE_PREFIX, ""],
-                f"/users/{urllib.parse.quote(user_id)}",
-                self.get_json,
+            response = await self.get_json(
+                f"{service.url}{APP_SERVICE_PREFIX}/users/{urllib.parse.quote(user_id)}",
                 {"access_token": service.hs_token},
                 headers={"Authorization": [f"Bearer {service.hs_token}"]},
             )
@@ -203,11 +155,8 @@ class ApplicationServiceApi(SimpleHttpClient):
         assert service.hs_token is not None
 
         try:
-            response = await self._send_with_fallbacks(
-                service,
-                [APP_SERVICE_PREFIX, ""],
-                f"/rooms/{urllib.parse.quote(alias)}",
-                self.get_json,
+            response = await self.get_json(
+                f"{service.url}{APP_SERVICE_PREFIX}/rooms/{urllib.parse.quote(alias)}",
                 {"access_token": service.hs_token},
                 headers={"Authorization": [f"Bearer {service.hs_token}"]},
             )
@@ -245,11 +194,8 @@ class ApplicationServiceApi(SimpleHttpClient):
                 **fields,
                 b"access_token": service.hs_token,
             }
-            response = await self._send_with_fallbacks(
-                service,
-                [APP_SERVICE_PREFIX, APP_SERVICE_UNSTABLE_PREFIX],
-                f"/thirdparty/{kind}/{urllib.parse.quote(protocol)}",
-                self.get_json,
+            response = await self.get_json(
+                f"{service.url}{APP_SERVICE_PREFIX}/thirdparty/{kind}/{urllib.parse.quote(protocol)}",
                 args=args,
                 headers={"Authorization": [f"Bearer {service.hs_token}"]},
             )
@@ -285,11 +231,8 @@ class ApplicationServiceApi(SimpleHttpClient):
             # This is required by the configuration.
             assert service.hs_token is not None
             try:
-                info = await self._send_with_fallbacks(
-                    service,
-                    [APP_SERVICE_PREFIX, APP_SERVICE_UNSTABLE_PREFIX],
-                    f"/thirdparty/protocol/{urllib.parse.quote(protocol)}",
-                    self.get_json,
+                info = await self.get_json(
+                    f"{service.url}{APP_SERVICE_PREFIX}/thirdparty/protocol/{urllib.parse.quote(protocol)}",
                     {"access_token": service.hs_token},
                     headers={"Authorization": [f"Bearer {service.hs_token}"]},
                 )
@@ -401,11 +344,8 @@ class ApplicationServiceApi(SimpleHttpClient):
                 }
 
         try:
-            await self._send_with_fallbacks(
-                service,
-                [APP_SERVICE_PREFIX, ""],
-                f"/transactions/{urllib.parse.quote(str(txn_id))}",
-                self.put_json,
+            await self.put_json(
+                f"{service.url}{APP_SERVICE_PREFIX}/transactions/{urllib.parse.quote(str(txn_id))}",
                 json_body=body,
                 args={"access_token": service.hs_token},
                 headers={"Authorization": [f"Bearer {service.hs_token}"]},
diff --git a/tests/appservice/test_api.py b/tests/appservice/test_api.py
index 15fce165b6..807dc2f21c 100644
--- a/tests/appservice/test_api.py
+++ b/tests/appservice/test_api.py
@@ -16,7 +16,6 @@ from unittest.mock import Mock
 
 from twisted.test.proto_helpers import MemoryReactor
 
-from synapse.api.errors import HttpResponseException
 from synapse.appservice import ApplicationService
 from synapse.server import HomeServer
 from synapse.types import JsonDict
@@ -107,58 +106,6 @@ class ApplicationServiceApiTestCase(unittest.HomeserverTestCase):
         self.assertEqual(self.request_url, URL_LOCATION)
         self.assertEqual(result, SUCCESS_RESULT_LOCATION)
 
-    def test_fallback(self) -> None:
-        """
-        Tests that the fallback to legacy URLs works.
-        """
-        SUCCESS_RESULT_USER = [
-            {
-                "protocol": PROTOCOL,
-                "userid": "@a:user",
-                "fields": {
-                    "more": "fields",
-                },
-            }
-        ]
-
-        URL_USER = f"{URL}/_matrix/app/v1/thirdparty/user/{PROTOCOL}"
-        FALLBACK_URL_USER = f"{URL}/_matrix/app/unstable/thirdparty/user/{PROTOCOL}"
-
-        self.request_url = None
-        self.v1_seen = False
-
-        async def get_json(
-            url: str,
-            args: Mapping[Any, Any],
-            headers: Mapping[Union[str, bytes], Sequence[Union[str, bytes]]],
-        ) -> List[JsonDict]:
-            # Ensure the access token is passed as both a header and query arg.
-            if not headers.get("Authorization") or not args.get(b"access_token"):
-                raise RuntimeError("Access token not provided")
-
-            self.assertEqual(headers.get("Authorization"), [f"Bearer {TOKEN}"])
-            self.assertEqual(args.get(b"access_token"), TOKEN)
-            self.request_url = url
-            if url == URL_USER:
-                self.v1_seen = True
-                raise HttpResponseException(404, "NOT_FOUND", b"NOT_FOUND")
-            elif url == FALLBACK_URL_USER:
-                return SUCCESS_RESULT_USER
-            else:
-                raise RuntimeError(
-                    "URL provided was invalid. This should never be seen."
-                )
-
-        # We assign to a method, which mypy doesn't like.
-        self.api.get_json = Mock(side_effect=get_json)  # type: ignore[assignment]
-
-        result = self.get_success(
-            self.api.query_3pe(self.service, "user", PROTOCOL, {b"some": [b"field"]})
-        )
-        self.assertTrue(self.v1_seen)
-        self.assertEqual(self.request_url, FALLBACK_URL_USER)
-        self.assertEqual(result, SUCCESS_RESULT_USER)
-
     def test_claim_keys(self) -> None:
         """
         Tests that the /keys/claim response is properly parsed for missing
-- 
cgit 1.5.1


From a719b703d9bd0dade2565ddcad0e2f3a7a9d4c37 Mon Sep 17 00:00:00 2001
From: Mathieu Velten <mathieuv@matrix.org>
Date: Thu, 27 Jul 2023 15:45:05 +0200
Subject: Fix 404 on /profile when the display name is empty but not the avatar
 (#16012)

---
 changelog.d/16012.bugfix       |  1 +
 synapse/handlers/profile.py    |  2 +-
 tests/handlers/test_profile.py | 10 ++++++++++
 3 files changed, 12 insertions(+), 1 deletion(-)
 create mode 100644 changelog.d/16012.bugfix

(limited to 'synapse')

diff --git a/changelog.d/16012.bugfix b/changelog.d/16012.bugfix
new file mode 100644
index 0000000000..44ca9377ff
--- /dev/null
+++ b/changelog.d/16012.bugfix
@@ -0,0 +1 @@
+Fix 404 not found code returned on profile endpoint when the display name is empty but not the avatar URL.
diff --git a/synapse/handlers/profile.py b/synapse/handlers/profile.py
index a7f8c5e636..c7fe101cd9 100644
--- a/synapse/handlers/profile.py
+++ b/synapse/handlers/profile.py
@@ -68,7 +68,7 @@ class ProfileHandler:
 
         if self.hs.is_mine(target_user):
             profileinfo = await self.store.get_profileinfo(target_user)
-            if profileinfo.display_name is None:
+            if profileinfo.display_name is None and profileinfo.avatar_url is None:
                 raise SynapseError(404, "Profile was not found", Codes.NOT_FOUND)
 
             return {
diff --git a/tests/handlers/test_profile.py b/tests/handlers/test_profile.py
index 196ceb0b82..ec2f5d30be 100644
--- a/tests/handlers/test_profile.py
+++ b/tests/handlers/test_profile.py
@@ -179,6 +179,16 @@ class ProfileTestCase(unittest.HomeserverTestCase):
 
         self.assertEqual("http://my.server/me.png", avatar_url)
 
+    def test_get_profile_empty_displayname(self) -> None:
+        self.get_success(self.store.set_profile_displayname(self.frank, None))
+        self.get_success(
+            self.store.set_profile_avatar_url(self.frank, "http://my.server/me.png")
+        )
+
+        profile = self.get_success(self.handler.get_profile(self.frank.to_string()))
+
+        self.assertEqual("http://my.server/me.png", profile["avatar_url"])
+
     def test_set_my_avatar(self) -> None:
         self.get_success(
             self.handler.set_avatar_url(
-- 
cgit 1.5.1


From 68b2611783ab00fdad567654a95492442722c106 Mon Sep 17 00:00:00 2001
From: Shay <hillerys@element.io>
Date: Thu, 27 Jul 2023 15:08:46 -0700
Subject: Clarify comment on key uploads over replication (#16016)

---
 changelog.d/16016.doc               | 2 ++
 synapse/replication/http/devices.py | 4 ++--
 2 files changed, 4 insertions(+), 2 deletions(-)
 create mode 100644 changelog.d/16016.doc

(limited to 'synapse')

diff --git a/changelog.d/16016.doc b/changelog.d/16016.doc
new file mode 100644
index 0000000000..e677058c2d
--- /dev/null
+++ b/changelog.d/16016.doc
@@ -0,0 +1,2 @@
+Clarify comment on the keys/upload over replication enpoint.
+
diff --git a/synapse/replication/http/devices.py b/synapse/replication/http/devices.py
index f874f072f9..73f3de3642 100644
--- a/synapse/replication/http/devices.py
+++ b/synapse/replication/http/devices.py
@@ -107,8 +107,7 @@ class ReplicationUploadKeysForUserRestServlet(ReplicationEndpoint):
     Calls to e2e_keys_handler.upload_keys_for_user(user_id, device_id, keys) on
     the main process to accomplish this.
 
-    Defined in https://spec.matrix.org/v1.4/client-server-api/#post_matrixclientv3keysupload
-    Request format(borrowed and expanded from KeyUploadServlet):
+    Request format for this endpoint (borrowed and expanded from KeyUploadServlet):
 
         POST /_synapse/replication/upload_keys_for_user
 
@@ -117,6 +116,7 @@ class ReplicationUploadKeysForUserRestServlet(ReplicationEndpoint):
         "device_id": "<device_id>",
         "keys": {
             ....this part can be found in KeyUploadServlet in rest/client/keys.py....
+            or as defined in https://spec.matrix.org/v1.4/client-server-api/#post_matrixclientv3keysupload
         }
     }
 
-- 
cgit 1.5.1


From ae55cc1e6bc6527d0e359a823c474f5c9ed4382e Mon Sep 17 00:00:00 2001
From: Erik Johnston <erik@matrix.org>
Date: Mon, 31 Jul 2023 10:58:03 +0100
Subject: Add ability to wait for locks and add locks to purge history / room
 deletion (#15791)

c.f. #13476
---
 changelog.d/15791.bugfix                         |   1 +
 synapse/federation/federation_server.py          |  17 +-
 synapse/handlers/message.py                      |  38 ++-
 synapse/handlers/pagination.py                   |  23 +-
 synapse/handlers/room_member.py                  |  45 +--
 synapse/handlers/worker_lock.py                  | 333 +++++++++++++++++++++++
 synapse/notifier.py                              |  16 ++
 synapse/replication/tcp/commands.py              |  33 +++
 synapse/replication/tcp/handler.py               |  22 ++
 synapse/rest/client/room_upgrade_rest_servlet.py |  11 +-
 synapse/server.py                                |   5 +
 synapse/storage/controllers/persist_events.py    |  27 +-
 synapse/storage/databases/main/lock.py           | 190 ++++++++-----
 tests/handlers/test_worker_lock.py               |  74 +++++
 tests/rest/client/test_rooms.py                  |   4 +-
 tests/storage/databases/main/test_lock.py        |  52 ++++
 16 files changed, 783 insertions(+), 108 deletions(-)
 create mode 100644 changelog.d/15791.bugfix
 create mode 100644 synapse/handlers/worker_lock.py
 create mode 100644 tests/handlers/test_worker_lock.py

(limited to 'synapse')

diff --git a/changelog.d/15791.bugfix b/changelog.d/15791.bugfix
new file mode 100644
index 0000000000..182634b62f
--- /dev/null
+++ b/changelog.d/15791.bugfix
@@ -0,0 +1 @@
+Fix bug where purging history and paginating simultaneously could lead to database corruption when using workers.
diff --git a/synapse/federation/federation_server.py b/synapse/federation/federation_server.py
index fa61dd8c10..a90d99c4d6 100644
--- a/synapse/federation/federation_server.py
+++ b/synapse/federation/federation_server.py
@@ -63,6 +63,7 @@ from synapse.federation.federation_base import (
 )
 from synapse.federation.persistence import TransactionActions
 from synapse.federation.units import Edu, Transaction
+from synapse.handlers.worker_lock import DELETE_ROOM_LOCK_NAME
 from synapse.http.servlet import assert_params_in_dict
 from synapse.logging.context import (
     make_deferred_yieldable,
@@ -137,6 +138,7 @@ class FederationServer(FederationBase):
         self._event_auth_handler = hs.get_event_auth_handler()
         self._room_member_handler = hs.get_room_member_handler()
         self._e2e_keys_handler = hs.get_e2e_keys_handler()
+        self._worker_lock_handler = hs.get_worker_locks_handler()
 
         self._state_storage_controller = hs.get_storage_controllers().state
 
@@ -1236,9 +1238,18 @@ class FederationServer(FederationBase):
                 logger.info("handling received PDU in room %s: %s", room_id, event)
                 try:
                     with nested_logging_context(event.event_id):
-                        await self._federation_event_handler.on_receive_pdu(
-                            origin, event
-                        )
+                        # We're taking out a lock within a lock, which could
+                        # lead to deadlocks if we're not careful. However, it is
+                        # safe on this occasion as we only ever take a write
+                        # lock when deleting a room, which we would never do
+                        # while holding the `_INBOUND_EVENT_HANDLING_LOCK_NAME`
+                        # lock.
+                        async with self._worker_lock_handler.acquire_read_write_lock(
+                            DELETE_ROOM_LOCK_NAME, room_id, write=False
+                        ):
+                            await self._federation_event_handler.on_receive_pdu(
+                                origin, event
+                            )
                 except FederationError as e:
                     # XXX: Ideally we'd inform the remote we failed to process
                     # the event, but we can't return an error in the transaction
diff --git a/synapse/handlers/message.py b/synapse/handlers/message.py
index fff0b5fa12..187dedae7d 100644
--- a/synapse/handlers/message.py
+++ b/synapse/handlers/message.py
@@ -53,6 +53,7 @@ from synapse.events.snapshot import EventContext, UnpersistedEventContextBase
 from synapse.events.utils import SerializeEventConfig, maybe_upsert_event_field
 from synapse.events.validator import EventValidator
 from synapse.handlers.directory import DirectoryHandler
+from synapse.handlers.worker_lock import DELETE_ROOM_LOCK_NAME
 from synapse.logging import opentracing
 from synapse.logging.context import make_deferred_yieldable, run_in_background
 from synapse.metrics.background_process_metrics import run_as_background_process
@@ -485,6 +486,7 @@ class EventCreationHandler:
         self._events_shard_config = self.config.worker.events_shard_config
         self._instance_name = hs.get_instance_name()
         self._notifier = hs.get_notifier()
+        self._worker_lock_handler = hs.get_worker_locks_handler()
 
         self.room_prejoin_state_types = self.hs.config.api.room_prejoin_state
 
@@ -1010,6 +1012,37 @@ class EventCreationHandler:
                         event.internal_metadata.stream_ordering,
                     )
 
+        async with self._worker_lock_handler.acquire_read_write_lock(
+            DELETE_ROOM_LOCK_NAME, room_id, write=False
+        ):
+            return await self._create_and_send_nonmember_event_locked(
+                requester=requester,
+                event_dict=event_dict,
+                allow_no_prev_events=allow_no_prev_events,
+                prev_event_ids=prev_event_ids,
+                state_event_ids=state_event_ids,
+                ratelimit=ratelimit,
+                txn_id=txn_id,
+                ignore_shadow_ban=ignore_shadow_ban,
+                outlier=outlier,
+                depth=depth,
+            )
+
+    async def _create_and_send_nonmember_event_locked(
+        self,
+        requester: Requester,
+        event_dict: dict,
+        allow_no_prev_events: bool = False,
+        prev_event_ids: Optional[List[str]] = None,
+        state_event_ids: Optional[List[str]] = None,
+        ratelimit: bool = True,
+        txn_id: Optional[str] = None,
+        ignore_shadow_ban: bool = False,
+        outlier: bool = False,
+        depth: Optional[int] = None,
+    ) -> Tuple[EventBase, int]:
+        room_id = event_dict["room_id"]
+
         # If we don't have any prev event IDs specified then we need to
         # check that the host is in the room (as otherwise populating the
         # prev events will fail), at which point we may as well check the
@@ -1923,7 +1956,10 @@ class EventCreationHandler:
         )
 
         for room_id in room_ids:
-            dummy_event_sent = await self._send_dummy_event_for_room(room_id)
+            async with self._worker_lock_handler.acquire_read_write_lock(
+                DELETE_ROOM_LOCK_NAME, room_id, write=False
+            ):
+                dummy_event_sent = await self._send_dummy_event_for_room(room_id)
 
             if not dummy_event_sent:
                 # Did not find a valid user in the room, so remove from future attempts
diff --git a/synapse/handlers/pagination.py b/synapse/handlers/pagination.py
index 19b8728db9..da34658470 100644
--- a/synapse/handlers/pagination.py
+++ b/synapse/handlers/pagination.py
@@ -46,6 +46,11 @@ logger = logging.getLogger(__name__)
 BACKFILL_BECAUSE_TOO_MANY_GAPS_THRESHOLD = 3
 
 
+PURGE_HISTORY_LOCK_NAME = "purge_history_lock"
+
+DELETE_ROOM_LOCK_NAME = "delete_room_lock"
+
+
 @attr.s(slots=True, auto_attribs=True)
 class PurgeStatus:
     """Object tracking the status of a purge request
@@ -142,6 +147,7 @@ class PaginationHandler:
         self._server_name = hs.hostname
         self._room_shutdown_handler = hs.get_room_shutdown_handler()
         self._relations_handler = hs.get_relations_handler()
+        self._worker_locks = hs.get_worker_locks_handler()
 
         self.pagination_lock = ReadWriteLock()
         # IDs of rooms in which there currently an active purge *or delete* operation.
@@ -356,7 +362,9 @@ class PaginationHandler:
         """
         self._purges_in_progress_by_room.add(room_id)
         try:
-            async with self.pagination_lock.write(room_id):
+            async with self._worker_locks.acquire_read_write_lock(
+                PURGE_HISTORY_LOCK_NAME, room_id, write=True
+            ):
                 await self._storage_controllers.purge_events.purge_history(
                     room_id, token, delete_local_events
                 )
@@ -412,7 +420,10 @@ class PaginationHandler:
             room_id: room to be purged
             force: set true to skip checking for joined users.
         """
-        async with self.pagination_lock.write(room_id):
+        async with self._worker_locks.acquire_multi_read_write_lock(
+            [(PURGE_HISTORY_LOCK_NAME, room_id), (DELETE_ROOM_LOCK_NAME, room_id)],
+            write=True,
+        ):
             # first check that we have no users in this room
             if not force:
                 joined = await self.store.is_host_joined(room_id, self._server_name)
@@ -471,7 +482,9 @@ class PaginationHandler:
 
         room_token = from_token.room_key
 
-        async with self.pagination_lock.read(room_id):
+        async with self._worker_locks.acquire_read_write_lock(
+            PURGE_HISTORY_LOCK_NAME, room_id, write=False
+        ):
             (membership, member_event_id) = (None, None)
             if not use_admin_priviledge:
                 (
@@ -747,7 +760,9 @@ class PaginationHandler:
 
         self._purges_in_progress_by_room.add(room_id)
         try:
-            async with self.pagination_lock.write(room_id):
+            async with self._worker_locks.acquire_read_write_lock(
+                PURGE_HISTORY_LOCK_NAME, room_id, write=True
+            ):
                 self._delete_by_id[delete_id].status = DeleteStatus.STATUS_SHUTTING_DOWN
                 self._delete_by_id[
                     delete_id
diff --git a/synapse/handlers/room_member.py b/synapse/handlers/room_member.py
index 496e701f13..6cca2ec344 100644
--- a/synapse/handlers/room_member.py
+++ b/synapse/handlers/room_member.py
@@ -39,6 +39,7 @@ from synapse.events import EventBase
 from synapse.events.snapshot import EventContext
 from synapse.handlers.profile import MAX_AVATAR_URL_LEN, MAX_DISPLAYNAME_LEN
 from synapse.handlers.state_deltas import MatchChange, StateDeltasHandler
+from synapse.handlers.worker_lock import DELETE_ROOM_LOCK_NAME
 from synapse.logging import opentracing
 from synapse.metrics import event_processing_positions
 from synapse.metrics.background_process_metrics import run_as_background_process
@@ -94,6 +95,7 @@ class RoomMemberHandler(metaclass=abc.ABCMeta):
         self.event_creation_handler = hs.get_event_creation_handler()
         self.account_data_handler = hs.get_account_data_handler()
         self.event_auth_handler = hs.get_event_auth_handler()
+        self._worker_lock_handler = hs.get_worker_locks_handler()
 
         self.member_linearizer: Linearizer = Linearizer(name="member")
         self.member_as_limiter = Linearizer(max_count=10, name="member_as_limiter")
@@ -638,26 +640,29 @@ class RoomMemberHandler(metaclass=abc.ABCMeta):
         # by application services), and then by room ID.
         async with self.member_as_limiter.queue(as_id):
             async with self.member_linearizer.queue(key):
-                with opentracing.start_active_span("update_membership_locked"):
-                    result = await self.update_membership_locked(
-                        requester,
-                        target,
-                        room_id,
-                        action,
-                        txn_id=txn_id,
-                        remote_room_hosts=remote_room_hosts,
-                        third_party_signed=third_party_signed,
-                        ratelimit=ratelimit,
-                        content=content,
-                        new_room=new_room,
-                        require_consent=require_consent,
-                        outlier=outlier,
-                        allow_no_prev_events=allow_no_prev_events,
-                        prev_event_ids=prev_event_ids,
-                        state_event_ids=state_event_ids,
-                        depth=depth,
-                        origin_server_ts=origin_server_ts,
-                    )
+                async with self._worker_lock_handler.acquire_read_write_lock(
+                    DELETE_ROOM_LOCK_NAME, room_id, write=False
+                ):
+                    with opentracing.start_active_span("update_membership_locked"):
+                        result = await self.update_membership_locked(
+                            requester,
+                            target,
+                            room_id,
+                            action,
+                            txn_id=txn_id,
+                            remote_room_hosts=remote_room_hosts,
+                            third_party_signed=third_party_signed,
+                            ratelimit=ratelimit,
+                            content=content,
+                            new_room=new_room,
+                            require_consent=require_consent,
+                            outlier=outlier,
+                            allow_no_prev_events=allow_no_prev_events,
+                            prev_event_ids=prev_event_ids,
+                            state_event_ids=state_event_ids,
+                            depth=depth,
+                            origin_server_ts=origin_server_ts,
+                        )
 
         return result
 
diff --git a/synapse/handlers/worker_lock.py b/synapse/handlers/worker_lock.py
new file mode 100644
index 0000000000..72df773a86
--- /dev/null
+++ b/synapse/handlers/worker_lock.py
@@ -0,0 +1,333 @@
+# Copyright 2023 The Matrix.org Foundation C.I.C.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import random
+from types import TracebackType
+from typing import (
+    TYPE_CHECKING,
+    AsyncContextManager,
+    Collection,
+    Dict,
+    Optional,
+    Tuple,
+    Type,
+    Union,
+)
+from weakref import WeakSet
+
+import attr
+
+from twisted.internet import defer
+from twisted.internet.interfaces import IReactorTime
+
+from synapse.logging.context import PreserveLoggingContext
+from synapse.logging.opentracing import start_active_span
+from synapse.metrics.background_process_metrics import wrap_as_background_process
+from synapse.storage.databases.main.lock import Lock, LockStore
+from synapse.util.async_helpers import timeout_deferred
+
+if TYPE_CHECKING:
+    from synapse.logging.opentracing import opentracing
+    from synapse.server import HomeServer
+
+
+DELETE_ROOM_LOCK_NAME = "delete_room_lock"
+
+
+class WorkerLocksHandler:
+    """A class for waiting on taking out locks, rather than using the storage
+    functions directly (which don't support awaiting).
+    """
+
+    def __init__(self, hs: "HomeServer") -> None:
+        self._reactor = hs.get_reactor()
+        self._store = hs.get_datastores().main
+        self._clock = hs.get_clock()
+        self._notifier = hs.get_notifier()
+        self._instance_name = hs.get_instance_name()
+
+        # Map from lock name/key to set of `WaitingLock` that are active for
+        # that lock.
+        self._locks: Dict[
+            Tuple[str, str], WeakSet[Union[WaitingLock, WaitingMultiLock]]
+        ] = {}
+
+        self._clock.looping_call(self._cleanup_locks, 30_000)
+
+        self._notifier.add_lock_released_callback(self._on_lock_released)
+
+    def acquire_lock(self, lock_name: str, lock_key: str) -> "WaitingLock":
+        """Acquire a standard lock, returns a context manager that will block
+        until the lock is acquired.
+
+        Note: Care must be taken to avoid deadlocks. In particular, this
+        function does *not* timeout.
+
+        Usage:
+            async with handler.acquire_lock(name, key):
+                # Do work while holding the lock...
+        """
+
+        lock = WaitingLock(
+            reactor=self._reactor,
+            store=self._store,
+            handler=self,
+            lock_name=lock_name,
+            lock_key=lock_key,
+            write=None,
+        )
+
+        self._locks.setdefault((lock_name, lock_key), WeakSet()).add(lock)
+
+        return lock
+
+    def acquire_read_write_lock(
+        self,
+        lock_name: str,
+        lock_key: str,
+        *,
+        write: bool,
+    ) -> "WaitingLock":
+        """Acquire a read/write lock, returns a context manager that will block
+        until the lock is acquired.
+
+        Note: Care must be taken to avoid deadlocks. In particular, this
+        function does *not* timeout.
+
+        Usage:
+            async with handler.acquire_read_write_lock(name, key, write=True):
+                # Do work while holding the lock...
+        """
+
+        lock = WaitingLock(
+            reactor=self._reactor,
+            store=self._store,
+            handler=self,
+            lock_name=lock_name,
+            lock_key=lock_key,
+            write=write,
+        )
+
+        self._locks.setdefault((lock_name, lock_key), WeakSet()).add(lock)
+
+        return lock
+
+    def acquire_multi_read_write_lock(
+        self,
+        lock_names: Collection[Tuple[str, str]],
+        *,
+        write: bool,
+    ) -> "WaitingMultiLock":
+        """Acquires multi read/write locks at once, returns a context manager
+        that will block until all the locks are acquired.
+
+        This will try and acquire all locks at once, and will never hold on to a
+        subset of the locks. (This avoids accidentally creating deadlocks).
+
+        Note: Care must be taken to avoid deadlocks. In particular, this
+        function does *not* timeout.
+        """
+
+        lock = WaitingMultiLock(
+            lock_names=lock_names,
+            write=write,
+            reactor=self._reactor,
+            store=self._store,
+            handler=self,
+        )
+
+        for lock_name, lock_key in lock_names:
+            self._locks.setdefault((lock_name, lock_key), WeakSet()).add(lock)
+
+        return lock
+
+    def notify_lock_released(self, lock_name: str, lock_key: str) -> None:
+        """Notify that a lock has been released.
+
+        Pokes both the notifier and replication.
+        """
+
+        self._notifier.notify_lock_released(self._instance_name, lock_name, lock_key)
+
+    def _on_lock_released(
+        self, instance_name: str, lock_name: str, lock_key: str
+    ) -> None:
+        """Called when a lock has been released.
+
+        Wakes up any locks that might be waiting on this.
+        """
+        locks = self._locks.get((lock_name, lock_key))
+        if not locks:
+            return
+
+        def _wake_deferred(deferred: defer.Deferred) -> None:
+            if not deferred.called:
+                deferred.callback(None)
+
+        for lock in locks:
+            self._clock.call_later(0, _wake_deferred, lock.deferred)
+
+    @wrap_as_background_process("_cleanup_locks")
+    async def _cleanup_locks(self) -> None:
+        """Periodically cleans out stale entries in the locks map"""
+        self._locks = {key: value for key, value in self._locks.items() if value}
+
+
+@attr.s(auto_attribs=True, eq=False)
+class WaitingLock:
+    reactor: IReactorTime
+    store: LockStore
+    handler: WorkerLocksHandler
+    lock_name: str
+    lock_key: str
+    write: Optional[bool]
+    deferred: "defer.Deferred[None]" = attr.Factory(defer.Deferred)
+    _inner_lock: Optional[Lock] = None
+    _retry_interval: float = 0.1
+    _lock_span: "opentracing.Scope" = attr.Factory(
+        lambda: start_active_span("WaitingLock.lock")
+    )
+
+    async def __aenter__(self) -> None:
+        self._lock_span.__enter__()
+
+        with start_active_span("WaitingLock.waiting_for_lock"):
+            while self._inner_lock is None:
+                self.deferred = defer.Deferred()
+
+                if self.write is not None:
+                    lock = await self.store.try_acquire_read_write_lock(
+                        self.lock_name, self.lock_key, write=self.write
+                    )
+                else:
+                    lock = await self.store.try_acquire_lock(
+                        self.lock_name, self.lock_key
+                    )
+
+                if lock:
+                    self._inner_lock = lock
+                    break
+
+                try:
+                    # Wait until the we get notified the lock might have been
+                    # released (by the deferred being resolved). We also
+                    # periodically wake up in case the lock was released but we
+                    # weren't notified.
+                    with PreserveLoggingContext():
+                        await timeout_deferred(
+                            deferred=self.deferred,
+                            timeout=self._get_next_retry_interval(),
+                            reactor=self.reactor,
+                        )
+                except Exception:
+                    pass
+
+        return await self._inner_lock.__aenter__()
+
+    async def __aexit__(
+        self,
+        exc_type: Optional[Type[BaseException]],
+        exc: Optional[BaseException],
+        tb: Optional[TracebackType],
+    ) -> Optional[bool]:
+        assert self._inner_lock
+
+        self.handler.notify_lock_released(self.lock_name, self.lock_key)
+
+        try:
+            r = await self._inner_lock.__aexit__(exc_type, exc, tb)
+        finally:
+            self._lock_span.__exit__(exc_type, exc, tb)
+
+        return r
+
+    def _get_next_retry_interval(self) -> float:
+        next = self._retry_interval
+        self._retry_interval = max(5, next * 2)
+        return next * random.uniform(0.9, 1.1)
+
+
+@attr.s(auto_attribs=True, eq=False)
+class WaitingMultiLock:
+    lock_names: Collection[Tuple[str, str]]
+
+    write: bool
+
+    reactor: IReactorTime
+    store: LockStore
+    handler: WorkerLocksHandler
+
+    deferred: "defer.Deferred[None]" = attr.Factory(defer.Deferred)
+
+    _inner_lock_cm: Optional[AsyncContextManager] = None
+    _retry_interval: float = 0.1
+    _lock_span: "opentracing.Scope" = attr.Factory(
+        lambda: start_active_span("WaitingLock.lock")
+    )
+
+    async def __aenter__(self) -> None:
+        self._lock_span.__enter__()
+
+        with start_active_span("WaitingLock.waiting_for_lock"):
+            while self._inner_lock_cm is None:
+                self.deferred = defer.Deferred()
+
+                lock_cm = await self.store.try_acquire_multi_read_write_lock(
+                    self.lock_names, write=self.write
+                )
+
+                if lock_cm:
+                    self._inner_lock_cm = lock_cm
+                    break
+
+                try:
+                    # Wait until the we get notified the lock might have been
+                    # released (by the deferred being resolved). We also
+                    # periodically wake up in case the lock was released but we
+                    # weren't notified.
+                    with PreserveLoggingContext():
+                        await timeout_deferred(
+                            deferred=self.deferred,
+                            timeout=self._get_next_retry_interval(),
+                            reactor=self.reactor,
+                        )
+                except Exception:
+                    pass
+
+        assert self._inner_lock_cm
+        await self._inner_lock_cm.__aenter__()
+        return
+
+    async def __aexit__(
+        self,
+        exc_type: Optional[Type[BaseException]],
+        exc: Optional[BaseException],
+        tb: Optional[TracebackType],
+    ) -> Optional[bool]:
+        assert self._inner_lock_cm
+
+        for lock_name, lock_key in self.lock_names:
+            self.handler.notify_lock_released(lock_name, lock_key)
+
+        try:
+            r = await self._inner_lock_cm.__aexit__(exc_type, exc, tb)
+        finally:
+            self._lock_span.__exit__(exc_type, exc, tb)
+
+        return r
+
+    def _get_next_retry_interval(self) -> float:
+        next = self._retry_interval
+        self._retry_interval = max(5, next * 2)
+        return next * random.uniform(0.9, 1.1)
diff --git a/synapse/notifier.py b/synapse/notifier.py
index 897272ad5b..68115bca70 100644
--- a/synapse/notifier.py
+++ b/synapse/notifier.py
@@ -234,6 +234,9 @@ class Notifier:
 
         self._third_party_rules = hs.get_module_api_callbacks().third_party_event_rules
 
+        # List of callbacks to be notified when a lock is released
+        self._lock_released_callback: List[Callable[[str, str, str], None]] = []
+
         self.clock = hs.get_clock()
         self.appservice_handler = hs.get_application_service_handler()
         self._pusher_pool = hs.get_pusherpool()
@@ -785,6 +788,19 @@ class Notifier:
         # that any in flight requests can be immediately retried.
         self._federation_client.wake_destination(server)
 
+    def add_lock_released_callback(
+        self, callback: Callable[[str, str, str], None]
+    ) -> None:
+        """Add a function to be called whenever we are notified about a released lock."""
+        self._lock_released_callback.append(callback)
+
+    def notify_lock_released(
+        self, instance_name: str, lock_name: str, lock_key: str
+    ) -> None:
+        """Notify the callbacks that a lock has been released."""
+        for cb in self._lock_released_callback:
+            cb(instance_name, lock_name, lock_key)
+
 
 @attr.s(auto_attribs=True)
 class ReplicationNotifier:
diff --git a/synapse/replication/tcp/commands.py b/synapse/replication/tcp/commands.py
index 32f52e54d8..10f5c98ff8 100644
--- a/synapse/replication/tcp/commands.py
+++ b/synapse/replication/tcp/commands.py
@@ -422,6 +422,36 @@ class RemoteServerUpCommand(_SimpleCommand):
     NAME = "REMOTE_SERVER_UP"
 
 
+class LockReleasedCommand(Command):
+    """Sent to inform other instances that a given lock has been dropped.
+
+    Format::
+
+        LOCK_RELEASED ["<instance_name>", "<lock_name>", "<lock_key>"]
+    """
+
+    NAME = "LOCK_RELEASED"
+
+    def __init__(
+        self,
+        instance_name: str,
+        lock_name: str,
+        lock_key: str,
+    ):
+        self.instance_name = instance_name
+        self.lock_name = lock_name
+        self.lock_key = lock_key
+
+    @classmethod
+    def from_line(cls: Type["LockReleasedCommand"], line: str) -> "LockReleasedCommand":
+        instance_name, lock_name, lock_key = json_decoder.decode(line)
+
+        return cls(instance_name, lock_name, lock_key)
+
+    def to_line(self) -> str:
+        return json_encoder.encode([self.instance_name, self.lock_name, self.lock_key])
+
+
 _COMMANDS: Tuple[Type[Command], ...] = (
     ServerCommand,
     RdataCommand,
@@ -435,6 +465,7 @@ _COMMANDS: Tuple[Type[Command], ...] = (
     UserIpCommand,
     RemoteServerUpCommand,
     ClearUserSyncsCommand,
+    LockReleasedCommand,
 )
 
 # Map of command name to command type.
@@ -448,6 +479,7 @@ VALID_SERVER_COMMANDS = (
     ErrorCommand.NAME,
     PingCommand.NAME,
     RemoteServerUpCommand.NAME,
+    LockReleasedCommand.NAME,
 )
 
 # The commands the client is allowed to send
@@ -461,6 +493,7 @@ VALID_CLIENT_COMMANDS = (
     UserIpCommand.NAME,
     ErrorCommand.NAME,
     RemoteServerUpCommand.NAME,
+    LockReleasedCommand.NAME,
 )
 
 
diff --git a/synapse/replication/tcp/handler.py b/synapse/replication/tcp/handler.py
index 5d108fe11b..a2cabba7b1 100644
--- a/synapse/replication/tcp/handler.py
+++ b/synapse/replication/tcp/handler.py
@@ -39,6 +39,7 @@ from synapse.replication.tcp.commands import (
     ClearUserSyncsCommand,
     Command,
     FederationAckCommand,
+    LockReleasedCommand,
     PositionCommand,
     RdataCommand,
     RemoteServerUpCommand,
@@ -248,6 +249,9 @@ class ReplicationCommandHandler:
         if self._is_master or self._should_insert_client_ips:
             self.subscribe_to_channel("USER_IP")
 
+        if hs.config.redis.redis_enabled:
+            self._notifier.add_lock_released_callback(self.on_lock_released)
+
     def subscribe_to_channel(self, channel_name: str) -> None:
         """
         Indicates that we wish to subscribe to a Redis channel by name.
@@ -648,6 +652,17 @@ class ReplicationCommandHandler:
 
         self._notifier.notify_remote_server_up(cmd.data)
 
+    def on_LOCK_RELEASED(
+        self, conn: IReplicationConnection, cmd: LockReleasedCommand
+    ) -> None:
+        """Called when we get a new LOCK_RELEASED command."""
+        if cmd.instance_name == self._instance_name:
+            return
+
+        self._notifier.notify_lock_released(
+            cmd.instance_name, cmd.lock_name, cmd.lock_key
+        )
+
     def new_connection(self, connection: IReplicationConnection) -> None:
         """Called when we have a new connection."""
         self._connections.append(connection)
@@ -754,6 +769,13 @@ class ReplicationCommandHandler:
         """
         self.send_command(RdataCommand(stream_name, self._instance_name, token, data))
 
+    def on_lock_released(
+        self, instance_name: str, lock_name: str, lock_key: str
+    ) -> None:
+        """Called when we released a lock and should notify other instances."""
+        if instance_name == self._instance_name:
+            self.send_command(LockReleasedCommand(instance_name, lock_name, lock_key))
+
 
 UpdateToken = TypeVar("UpdateToken")
 UpdateRow = TypeVar("UpdateRow")
diff --git a/synapse/rest/client/room_upgrade_rest_servlet.py b/synapse/rest/client/room_upgrade_rest_servlet.py
index 6a7792e18b..4a5d9e13e7 100644
--- a/synapse/rest/client/room_upgrade_rest_servlet.py
+++ b/synapse/rest/client/room_upgrade_rest_servlet.py
@@ -17,6 +17,7 @@ from typing import TYPE_CHECKING, Tuple
 
 from synapse.api.errors import Codes, ShadowBanError, SynapseError
 from synapse.api.room_versions import KNOWN_ROOM_VERSIONS
+from synapse.handlers.worker_lock import DELETE_ROOM_LOCK_NAME
 from synapse.http.server import HttpServer
 from synapse.http.servlet import (
     RestServlet,
@@ -60,6 +61,7 @@ class RoomUpgradeRestServlet(RestServlet):
         self._hs = hs
         self._room_creation_handler = hs.get_room_creation_handler()
         self._auth = hs.get_auth()
+        self._worker_lock_handler = hs.get_worker_locks_handler()
 
     async def on_POST(
         self, request: SynapseRequest, room_id: str
@@ -78,9 +80,12 @@ class RoomUpgradeRestServlet(RestServlet):
             )
 
         try:
-            new_room_id = await self._room_creation_handler.upgrade_room(
-                requester, room_id, new_version
-            )
+            async with self._worker_lock_handler.acquire_read_write_lock(
+                DELETE_ROOM_LOCK_NAME, room_id, write=False
+            ):
+                new_room_id = await self._room_creation_handler.upgrade_room(
+                    requester, room_id, new_version
+                )
         except ShadowBanError:
             # Generate a random room ID.
             new_room_id = stringutils.random_string(18)
diff --git a/synapse/server.py b/synapse/server.py
index b72b76a38b..8430f99ef2 100644
--- a/synapse/server.py
+++ b/synapse/server.py
@@ -107,6 +107,7 @@ from synapse.handlers.stats import StatsHandler
 from synapse.handlers.sync import SyncHandler
 from synapse.handlers.typing import FollowerTypingHandler, TypingWriterHandler
 from synapse.handlers.user_directory import UserDirectoryHandler
+from synapse.handlers.worker_lock import WorkerLocksHandler
 from synapse.http.client import (
     InsecureInterceptableContextFactory,
     ReplicationClient,
@@ -912,3 +913,7 @@ class HomeServer(metaclass=abc.ABCMeta):
     def get_common_usage_metrics_manager(self) -> CommonUsageMetricsManager:
         """Usage metrics shared between phone home stats and the prometheus exporter."""
         return CommonUsageMetricsManager(self)
+
+    @cache_in_self
+    def get_worker_locks_handler(self) -> WorkerLocksHandler:
+        return WorkerLocksHandler(self)
diff --git a/synapse/storage/controllers/persist_events.py b/synapse/storage/controllers/persist_events.py
index 35c0680365..35cd1089d6 100644
--- a/synapse/storage/controllers/persist_events.py
+++ b/synapse/storage/controllers/persist_events.py
@@ -45,6 +45,7 @@ from twisted.internet import defer
 from synapse.api.constants import EventTypes, Membership
 from synapse.events import EventBase
 from synapse.events.snapshot import EventContext
+from synapse.handlers.worker_lock import DELETE_ROOM_LOCK_NAME
 from synapse.logging.context import PreserveLoggingContext, make_deferred_yieldable
 from synapse.logging.opentracing import (
     SynapseTags,
@@ -338,6 +339,7 @@ class EventsPersistenceStorageController:
         )
         self._state_resolution_handler = hs.get_state_resolution_handler()
         self._state_controller = state_controller
+        self.hs = hs
 
     async def _process_event_persist_queue_task(
         self,
@@ -350,15 +352,22 @@ class EventsPersistenceStorageController:
             A dictionary of event ID to event ID we didn't persist as we already
             had another event persisted with the same TXN ID.
         """
-        if isinstance(task, _PersistEventsTask):
-            return await self._persist_event_batch(room_id, task)
-        elif isinstance(task, _UpdateCurrentStateTask):
-            await self._update_current_state(room_id, task)
-            return {}
-        else:
-            raise AssertionError(
-                f"Found an unexpected task type in event persistence queue: {task}"
-            )
+
+        # Ensure that the room can't be deleted while we're persisting events to
+        # it. We might already have taken out the lock, but since this is just a
+        # "read" lock its inherently reentrant.
+        async with self.hs.get_worker_locks_handler().acquire_read_write_lock(
+            DELETE_ROOM_LOCK_NAME, room_id, write=False
+        ):
+            if isinstance(task, _PersistEventsTask):
+                return await self._persist_event_batch(room_id, task)
+            elif isinstance(task, _UpdateCurrentStateTask):
+                await self._update_current_state(room_id, task)
+                return {}
+            else:
+                raise AssertionError(
+                    f"Found an unexpected task type in event persistence queue: {task}"
+                )
 
     @trace
     async def persist_events(
diff --git a/synapse/storage/databases/main/lock.py b/synapse/storage/databases/main/lock.py
index c89b4f7919..1680bf6168 100644
--- a/synapse/storage/databases/main/lock.py
+++ b/synapse/storage/databases/main/lock.py
@@ -12,8 +12,9 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import logging
+from contextlib import AsyncExitStack
 from types import TracebackType
-from typing import TYPE_CHECKING, Optional, Set, Tuple, Type
+from typing import TYPE_CHECKING, Collection, Optional, Set, Tuple, Type
 from weakref import WeakValueDictionary
 
 from twisted.internet.interfaces import IReactorCore
@@ -208,76 +209,85 @@ class LockStore(SQLBaseStore):
         used (otherwise the lock will leak).
         """
 
+        try:
+            lock = await self.db_pool.runInteraction(
+                "try_acquire_read_write_lock",
+                self._try_acquire_read_write_lock_txn,
+                lock_name,
+                lock_key,
+                write,
+            )
+        except self.database_engine.module.IntegrityError:
+            return None
+
+        return lock
+
+    def _try_acquire_read_write_lock_txn(
+        self,
+        txn: LoggingTransaction,
+        lock_name: str,
+        lock_key: str,
+        write: bool,
+    ) -> "Lock":
+        # We attempt to acquire the lock by inserting into
+        # `worker_read_write_locks` and seeing if that fails any
+        # constraints. If it doesn't then we have acquired the lock,
+        # otherwise we haven't.
+        #
+        # Before that though we clear the table of any stale locks.
+
         now = self._clock.time_msec()
         token = random_string(6)
 
-        def _try_acquire_read_write_lock_txn(txn: LoggingTransaction) -> None:
-            # We attempt to acquire the lock by inserting into
-            # `worker_read_write_locks` and seeing if that fails any
-            # constraints. If it doesn't then we have acquired the lock,
-            # otherwise we haven't.
-            #
-            # Before that though we clear the table of any stale locks.
-
-            delete_sql = """
-                DELETE FROM worker_read_write_locks
-                    WHERE last_renewed_ts < ? AND lock_name = ? AND lock_key = ?;
-            """
-
-            insert_sql = """
-                INSERT INTO worker_read_write_locks (lock_name, lock_key, write_lock, instance_name, token, last_renewed_ts)
-                VALUES (?, ?, ?, ?, ?, ?)
-            """
-
-            if isinstance(self.database_engine, PostgresEngine):
-                # For Postgres we can send these queries at the same time.
-                txn.execute(
-                    delete_sql + ";" + insert_sql,
-                    (
-                        # DELETE args
-                        now - _LOCK_TIMEOUT_MS,
-                        lock_name,
-                        lock_key,
-                        # UPSERT args
-                        lock_name,
-                        lock_key,
-                        write,
-                        self._instance_name,
-                        token,
-                        now,
-                    ),
-                )
-            else:
-                # For SQLite these need to be two queries.
-                txn.execute(
-                    delete_sql,
-                    (
-                        now - _LOCK_TIMEOUT_MS,
-                        lock_name,
-                        lock_key,
-                    ),
-                )
-                txn.execute(
-                    insert_sql,
-                    (
-                        lock_name,
-                        lock_key,
-                        write,
-                        self._instance_name,
-                        token,
-                        now,
-                    ),
-                )
+        delete_sql = """
+            DELETE FROM worker_read_write_locks
+                WHERE last_renewed_ts < ? AND lock_name = ? AND lock_key = ?;
+        """
 
-            return
+        insert_sql = """
+            INSERT INTO worker_read_write_locks (lock_name, lock_key, write_lock, instance_name, token, last_renewed_ts)
+            VALUES (?, ?, ?, ?, ?, ?)
+        """
 
-        try:
-            await self.db_pool.runInteraction(
-                "try_acquire_read_write_lock",
-                _try_acquire_read_write_lock_txn,
+        if isinstance(self.database_engine, PostgresEngine):
+            # For Postgres we can send these queries at the same time.
+            txn.execute(
+                delete_sql + ";" + insert_sql,
+                (
+                    # DELETE args
+                    now - _LOCK_TIMEOUT_MS,
+                    lock_name,
+                    lock_key,
+                    # UPSERT args
+                    lock_name,
+                    lock_key,
+                    write,
+                    self._instance_name,
+                    token,
+                    now,
+                ),
+            )
+        else:
+            # For SQLite these need to be two queries.
+            txn.execute(
+                delete_sql,
+                (
+                    now - _LOCK_TIMEOUT_MS,
+                    lock_name,
+                    lock_key,
+                ),
+            )
+            txn.execute(
+                insert_sql,
+                (
+                    lock_name,
+                    lock_key,
+                    write,
+                    self._instance_name,
+                    token,
+                    now,
+                ),
             )
-        except self.database_engine.module.IntegrityError:
-            return None
 
         lock = Lock(
             self._reactor,
@@ -289,10 +299,58 @@ class LockStore(SQLBaseStore):
             token=token,
         )
 
-        self._live_read_write_lock_tokens[(lock_name, lock_key, token)] = lock
+        def set_lock() -> None:
+            self._live_read_write_lock_tokens[(lock_name, lock_key, token)] = lock
+
+        txn.call_after(set_lock)
 
         return lock
 
+    async def try_acquire_multi_read_write_lock(
+        self,
+        lock_names: Collection[Tuple[str, str]],
+        write: bool,
+    ) -> Optional[AsyncExitStack]:
+        """Try to acquire multiple locks for the given names/keys. Will return
+        an async context manager if the locks are successfully acquired, which
+        *must* be used (otherwise the lock will leak).
+
+        If only a subset of the locks can be acquired then it will immediately
+        drop them and return `None`.
+        """
+        try:
+            locks = await self.db_pool.runInteraction(
+                "try_acquire_multi_read_write_lock",
+                self._try_acquire_multi_read_write_lock_txn,
+                lock_names,
+                write,
+            )
+        except self.database_engine.module.IntegrityError:
+            return None
+
+        stack = AsyncExitStack()
+
+        for lock in locks:
+            await stack.enter_async_context(lock)
+
+        return stack
+
+    def _try_acquire_multi_read_write_lock_txn(
+        self,
+        txn: LoggingTransaction,
+        lock_names: Collection[Tuple[str, str]],
+        write: bool,
+    ) -> Collection["Lock"]:
+        locks = []
+
+        for lock_name, lock_key in lock_names:
+            lock = self._try_acquire_read_write_lock_txn(
+                txn, lock_name, lock_key, write
+            )
+            locks.append(lock)
+
+        return locks
+
 
 class Lock:
     """An async context manager that manages an acquired lock, ensuring it is
diff --git a/tests/handlers/test_worker_lock.py b/tests/handlers/test_worker_lock.py
new file mode 100644
index 0000000000..73e548726c
--- /dev/null
+++ b/tests/handlers/test_worker_lock.py
@@ -0,0 +1,74 @@
+# Copyright 2023 The Matrix.org Foundation C.I.C.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from twisted.internet import defer
+from twisted.test.proto_helpers import MemoryReactor
+
+from synapse.server import HomeServer
+from synapse.util import Clock
+
+from tests import unittest
+from tests.replication._base import BaseMultiWorkerStreamTestCase
+
+
+class WorkerLockTestCase(unittest.HomeserverTestCase):
+    def prepare(
+        self, reactor: MemoryReactor, clock: Clock, homeserver: HomeServer
+    ) -> None:
+        self.worker_lock_handler = self.hs.get_worker_locks_handler()
+
+    def test_wait_for_lock_locally(self) -> None:
+        """Test waiting for a lock on a single worker"""
+
+        lock1 = self.worker_lock_handler.acquire_lock("name", "key")
+        self.get_success(lock1.__aenter__())
+
+        lock2 = self.worker_lock_handler.acquire_lock("name", "key")
+        d2 = defer.ensureDeferred(lock2.__aenter__())
+        self.assertNoResult(d2)
+
+        self.get_success(lock1.__aexit__(None, None, None))
+
+        self.get_success(d2)
+        self.get_success(lock2.__aexit__(None, None, None))
+
+
+class WorkerLockWorkersTestCase(BaseMultiWorkerStreamTestCase):
+    def prepare(
+        self, reactor: MemoryReactor, clock: Clock, homeserver: HomeServer
+    ) -> None:
+        self.main_worker_lock_handler = self.hs.get_worker_locks_handler()
+
+    def test_wait_for_lock_worker(self) -> None:
+        """Test waiting for a lock on another worker"""
+
+        worker = self.make_worker_hs(
+            "synapse.app.generic_worker",
+            extra_config={
+                "redis": {"enabled": True},
+            },
+        )
+        worker_lock_handler = worker.get_worker_locks_handler()
+
+        lock1 = self.main_worker_lock_handler.acquire_lock("name", "key")
+        self.get_success(lock1.__aenter__())
+
+        lock2 = worker_lock_handler.acquire_lock("name", "key")
+        d2 = defer.ensureDeferred(lock2.__aenter__())
+        self.assertNoResult(d2)
+
+        self.get_success(lock1.__aexit__(None, None, None))
+
+        self.get_success(d2)
+        self.get_success(lock2.__aexit__(None, None, None))
diff --git a/tests/rest/client/test_rooms.py b/tests/rest/client/test_rooms.py
index d013e75d55..4f6347be15 100644
--- a/tests/rest/client/test_rooms.py
+++ b/tests/rest/client/test_rooms.py
@@ -711,7 +711,7 @@ class RoomsCreateTestCase(RoomBase):
         self.assertEqual(HTTPStatus.OK, channel.code, channel.result)
         self.assertTrue("room_id" in channel.json_body)
         assert channel.resource_usage is not None
-        self.assertEqual(30, channel.resource_usage.db_txn_count)
+        self.assertEqual(32, channel.resource_usage.db_txn_count)
 
     def test_post_room_initial_state(self) -> None:
         # POST with initial_state config key, expect new room id
@@ -724,7 +724,7 @@ class RoomsCreateTestCase(RoomBase):
         self.assertEqual(HTTPStatus.OK, channel.code, channel.result)
         self.assertTrue("room_id" in channel.json_body)
         assert channel.resource_usage is not None
-        self.assertEqual(32, channel.resource_usage.db_txn_count)
+        self.assertEqual(34, channel.resource_usage.db_txn_count)
 
     def test_post_room_visibility_key(self) -> None:
         # POST with visibility config key, expect new room id
diff --git a/tests/storage/databases/main/test_lock.py b/tests/storage/databases/main/test_lock.py
index ad454f6dd8..383da83dfb 100644
--- a/tests/storage/databases/main/test_lock.py
+++ b/tests/storage/databases/main/test_lock.py
@@ -448,3 +448,55 @@ class ReadWriteLockTestCase(unittest.HomeserverTestCase):
         self.get_success(self.store._on_shutdown())
 
         self.assertEqual(self.store._live_read_write_lock_tokens, {})
+
+    def test_acquire_multiple_locks(self) -> None:
+        """Tests that acquiring multiple locks at once works."""
+
+        # Take out multiple locks and ensure that we can't get those locks out
+        # again.
+        lock = self.get_success(
+            self.store.try_acquire_multi_read_write_lock(
+                [("name1", "key1"), ("name2", "key2")], write=True
+            )
+        )
+        self.assertIsNotNone(lock)
+
+        assert lock is not None
+        self.get_success(lock.__aenter__())
+
+        lock2 = self.get_success(
+            self.store.try_acquire_read_write_lock("name1", "key1", write=True)
+        )
+        self.assertIsNone(lock2)
+
+        lock3 = self.get_success(
+            self.store.try_acquire_read_write_lock("name2", "key2", write=False)
+        )
+        self.assertIsNone(lock3)
+
+        # Overlapping locks attempts will fail, and won't lock any locks.
+        lock4 = self.get_success(
+            self.store.try_acquire_multi_read_write_lock(
+                [("name1", "key1"), ("name3", "key3")], write=True
+            )
+        )
+        self.assertIsNone(lock4)
+
+        lock5 = self.get_success(
+            self.store.try_acquire_read_write_lock("name3", "key3", write=True)
+        )
+        self.assertIsNotNone(lock5)
+        assert lock5 is not None
+        self.get_success(lock5.__aenter__())
+        self.get_success(lock5.__aexit__(None, None, None))
+
+        # Once we release the lock we can take out the locks again.
+        self.get_success(lock.__aexit__(None, None, None))
+
+        lock6 = self.get_success(
+            self.store.try_acquire_read_write_lock("name1", "key1", write=True)
+        )
+        self.assertIsNotNone(lock6)
+        assert lock6 is not None
+        self.get_success(lock6.__aenter__())
+        self.get_success(lock6.__aexit__(None, None, None))
-- 
cgit 1.5.1


From b7695ac38843d679b7121495729e0d433c37688e Mon Sep 17 00:00:00 2001
From: Patrick Cloke <clokep@users.noreply.github.com>
Date: Mon, 31 Jul 2023 08:44:45 -0400
Subject: Combine duplicated code for calculating an event ID from a txn ID
 (#16023)

Refactoring related to stabilization of MSC3970, refactor to combine
code which has the same logic.
---
 changelog.d/16023.misc          |  1 +
 synapse/handlers/message.py     | 39 +++++++++++++++++++++++++++++++--------
 synapse/handlers/room_member.py | 28 ++++------------------------
 3 files changed, 36 insertions(+), 32 deletions(-)
 create mode 100644 changelog.d/16023.misc

(limited to 'synapse')

diff --git a/changelog.d/16023.misc b/changelog.d/16023.misc
new file mode 100644
index 0000000000..ee732318e4
--- /dev/null
+++ b/changelog.d/16023.misc
@@ -0,0 +1 @@
+Combine duplicated code.
diff --git a/synapse/handlers/message.py b/synapse/handlers/message.py
index 187dedae7d..c656e07d37 100644
--- a/synapse/handlers/message.py
+++ b/synapse/handlers/message.py
@@ -878,14 +878,13 @@ class EventCreationHandler:
                 return prev_event
         return None
 
-    async def get_event_from_transaction(
+    async def get_event_id_from_transaction(
         self,
         requester: Requester,
         txn_id: str,
         room_id: str,
-    ) -> Optional[EventBase]:
-        """For the given transaction ID and room ID, check if there is a matching event.
-        If so, fetch it and return it.
+    ) -> Optional[str]:
+        """For the given transaction ID and room ID, check if there is a matching event ID.
 
         Args:
             requester: The requester making the request in the context of which we want
@@ -894,8 +893,9 @@ class EventCreationHandler:
             room_id: The room ID.
 
         Returns:
-            An event if one could be found, None otherwise.
+            An event ID if one could be found, None otherwise.
         """
+        existing_event_id = None
 
         if self._msc3970_enabled and requester.device_id:
             # When MSC3970 is enabled, we lookup for events sent by the same device first,
@@ -909,7 +909,7 @@ class EventCreationHandler:
                 )
             )
             if existing_event_id:
-                return await self.store.get_event(existing_event_id)
+                return existing_event_id
 
         # Pre-MSC3970, we looked up for events that were sent by the same session by
         # using the access token ID.
@@ -922,9 +922,32 @@ class EventCreationHandler:
                     txn_id,
                 )
             )
-            if existing_event_id:
-                return await self.store.get_event(existing_event_id)
 
+        return existing_event_id
+
+    async def get_event_from_transaction(
+        self,
+        requester: Requester,
+        txn_id: str,
+        room_id: str,
+    ) -> Optional[EventBase]:
+        """For the given transaction ID and room ID, check if there is a matching event.
+        If so, fetch it and return it.
+
+        Args:
+            requester: The requester making the request in the context of which we want
+                to fetch the event.
+            txn_id: The transaction ID.
+            room_id: The room ID.
+
+        Returns:
+            An event if one could be found, None otherwise.
+        """
+        existing_event_id = await self.get_event_id_from_transaction(
+            requester, txn_id, room_id
+        )
+        if existing_event_id:
+            return await self.store.get_event(existing_event_id)
         return None
 
     async def create_and_send_nonmember_event(
diff --git a/synapse/handlers/room_member.py b/synapse/handlers/room_member.py
index 6cca2ec344..e3cdf2bc61 100644
--- a/synapse/handlers/room_member.py
+++ b/synapse/handlers/room_member.py
@@ -176,8 +176,6 @@ class RoomMemberHandler(metaclass=abc.ABCMeta):
         self.request_ratelimiter = hs.get_request_ratelimiter()
         hs.get_notifier().add_new_join_in_room_callback(self._on_user_joined_room)
 
-        self._msc3970_enabled = hs.config.experimental.msc3970_enabled
-
     def _on_user_joined_room(self, event_id: str, room_id: str) -> None:
         """Notify the rate limiter that a room join has occurred.
 
@@ -418,29 +416,11 @@ class RoomMemberHandler(metaclass=abc.ABCMeta):
         # do this check just before we persist an event as well, but may as well
         # do it up front for efficiency.)
         if txn_id:
-            existing_event_id = None
-            if self._msc3970_enabled and requester.device_id:
-                # When MSC3970 is enabled, we lookup for events sent by the same device
-                # first, and fallback to the old behaviour if none were found.
-                existing_event_id = (
-                    await self.store.get_event_id_from_transaction_id_and_device_id(
-                        room_id,
-                        requester.user.to_string(),
-                        requester.device_id,
-                        txn_id,
-                    )
+            existing_event_id = (
+                await self.event_creation_handler.get_event_id_from_transaction(
+                    requester, txn_id, room_id
                 )
-
-            if requester.access_token_id and not existing_event_id:
-                existing_event_id = (
-                    await self.store.get_event_id_from_transaction_id_and_token_id(
-                        room_id,
-                        requester.user.to_string(),
-                        requester.access_token_id,
-                        txn_id,
-                    )
-                )
-
+            )
             if existing_event_id:
                 event_pos = await self.store.get_position_for_event(existing_event_id)
                 return existing_event_id, event_pos.stream
-- 
cgit 1.5.1


From 7cbb2a00d1ed07d42c6fa1fb226db512cd2a6b90 Mon Sep 17 00:00:00 2001
From: Jason Little <realtyem@gmail.com>
Date: Tue, 1 Aug 2023 07:10:49 -0500
Subject: Add metrics tracking for eviction to ResponseCache (#16028)

Track whether the ResponseCache is evicting due to invalidation
or due to time.
---
 changelog.d/16028.misc                |  1 +
 synapse/util/caches/response_cache.py | 10 ++++++++--
 2 files changed, 9 insertions(+), 2 deletions(-)
 create mode 100644 changelog.d/16028.misc

(limited to 'synapse')

diff --git a/changelog.d/16028.misc b/changelog.d/16028.misc
new file mode 100644
index 0000000000..3a1e9fef09
--- /dev/null
+++ b/changelog.d/16028.misc
@@ -0,0 +1 @@
+Collect additional metrics from `ResponseCache` for eviction.
diff --git a/synapse/util/caches/response_cache.py b/synapse/util/caches/response_cache.py
index 340e5e9145..0cb46700a9 100644
--- a/synapse/util/caches/response_cache.py
+++ b/synapse/util/caches/response_cache.py
@@ -36,7 +36,7 @@ from synapse.logging.opentracing import (
 )
 from synapse.util import Clock
 from synapse.util.async_helpers import AbstractObservableDeferred, ObservableDeferred
-from synapse.util.caches import register_cache
+from synapse.util.caches import EvictionReason, register_cache
 
 logger = logging.getLogger(__name__)
 
@@ -167,7 +167,7 @@ class ResponseCache(Generic[KV]):
             # the should_cache bit, we leave it in the cache for now and schedule
             # its removal later.
             if self.timeout_sec and context.should_cache:
-                self.clock.call_later(self.timeout_sec, self.unset, key)
+                self.clock.call_later(self.timeout_sec, self._entry_timeout, key)
             else:
                 # otherwise, remove the result immediately.
                 self.unset(key)
@@ -185,6 +185,12 @@ class ResponseCache(Generic[KV]):
         Args:
             key: key used to remove the cached value
         """
+        self._metrics.inc_evictions(EvictionReason.invalidation)
+        self._result_cache.pop(key, None)
+
+    def _entry_timeout(self, key: KV) -> None:
+        """For the call_later to remove from the cache"""
+        self._metrics.inc_evictions(EvictionReason.time)
         self._result_cache.pop(key, None)
 
     async def wrap(
-- 
cgit 1.5.1


From 5eb3fd785bdbf2ae07031f13a6ac5fb578adc338 Mon Sep 17 00:00:00 2001
From: Mohit Rathee <mohit.rathee2505@gmail.com>
Date: Tue, 1 Aug 2023 18:44:02 +0530
Subject: Trim whitespace when setting display names (#16031)

---
 changelog.d/16031.bugfix          |  1 +
 synapse/handlers/profile.py       |  2 +-
 tests/rest/client/test_profile.py | 12 ++++++++++++
 3 files changed, 14 insertions(+), 1 deletion(-)
 create mode 100644 changelog.d/16031.bugfix

(limited to 'synapse')

diff --git a/changelog.d/16031.bugfix b/changelog.d/16031.bugfix
new file mode 100644
index 0000000000..e48bf3975c
--- /dev/null
+++ b/changelog.d/16031.bugfix
@@ -0,0 +1 @@
+Remove leading and trailing spaces when setting a display name.
diff --git a/synapse/handlers/profile.py b/synapse/handlers/profile.py
index c7fe101cd9..c2109036ec 100644
--- a/synapse/handlers/profile.py
+++ b/synapse/handlers/profile.py
@@ -163,7 +163,7 @@ class ProfileHandler:
                 400, "Displayname is too long (max %i)" % (MAX_DISPLAYNAME_LEN,)
             )
 
-        displayname_to_set: Optional[str] = new_displayname
+        displayname_to_set: Optional[str] = new_displayname.strip()
         if new_displayname == "":
             displayname_to_set = None
 
diff --git a/tests/rest/client/test_profile.py b/tests/rest/client/test_profile.py
index 27c93ad761..ecae092b47 100644
--- a/tests/rest/client/test_profile.py
+++ b/tests/rest/client/test_profile.py
@@ -68,6 +68,18 @@ class ProfileTestCase(unittest.HomeserverTestCase):
         res = self._get_displayname()
         self.assertEqual(res, "test")
 
+    def test_set_displayname_with_extra_spaces(self) -> None:
+        channel = self.make_request(
+            "PUT",
+            "/profile/%s/displayname" % (self.owner,),
+            content={"displayname": "  test  "},
+            access_token=self.owner_tok,
+        )
+        self.assertEqual(channel.code, 200, channel.result)
+
+        res = self._get_displayname()
+        self.assertEqual(res, "test")
+
     def test_set_displayname_noauth(self) -> None:
         channel = self.make_request(
             "PUT",
-- 
cgit 1.5.1


From 90ad836ed8f4b701580213a89f2befb742c88b5e Mon Sep 17 00:00:00 2001
From: Patrick Cloke <clokep@users.noreply.github.com>
Date: Tue, 1 Aug 2023 10:36:33 -0400
Subject: Properly setup the additional sequences in the portdb script.
 (#16043)

The un_partial_stated_event_stream_sequence and
application_services_txn_id_seq were never properly configured
in the portdb script, resulting in an error on start-up.
---
 changelog.d/16043.bugfix            |  1 +
 synapse/_scripts/synapse_port_db.py | 18 +++++++++++++++---
 2 files changed, 16 insertions(+), 3 deletions(-)
 create mode 100644 changelog.d/16043.bugfix

(limited to 'synapse')

diff --git a/changelog.d/16043.bugfix b/changelog.d/16043.bugfix
new file mode 100644
index 0000000000..78c0f3455a
--- /dev/null
+++ b/changelog.d/16043.bugfix
@@ -0,0 +1 @@
+Fix a long-standing bug where the `synapse_port_db` failed to configure sequences for application services and partial stated rooms.
diff --git a/synapse/_scripts/synapse_port_db.py b/synapse/_scripts/synapse_port_db.py
index 7c4aa0afa2..22c84fbd5b 100755
--- a/synapse/_scripts/synapse_port_db.py
+++ b/synapse/_scripts/synapse_port_db.py
@@ -761,7 +761,7 @@ class Porter:
 
             # Step 2. Set up sequences
             #
-            # We do this before porting the tables so that event if we fail half
+            # We do this before porting the tables so that even if we fail half
             # way through the postgres DB always have sequences that are greater
             # than their respective tables. If we don't then creating the
             # `DataStore` object will fail due to the inconsistency.
@@ -769,6 +769,10 @@ class Porter:
             await self._setup_state_group_id_seq()
             await self._setup_user_id_seq()
             await self._setup_events_stream_seqs()
+            await self._setup_sequence(
+                "un_partial_stated_event_stream_sequence",
+                ("un_partial_stated_event_stream",),
+            )
             await self._setup_sequence(
                 "device_inbox_sequence", ("device_inbox", "device_federation_outbox")
             )
@@ -779,6 +783,11 @@ class Porter:
             await self._setup_sequence("receipts_sequence", ("receipts_linearized",))
             await self._setup_sequence("presence_stream_sequence", ("presence_stream",))
             await self._setup_auth_chain_sequence()
+            await self._setup_sequence(
+                "application_services_txn_id_seq",
+                ("application_services_txns",),
+                "txn_id",
+            )
 
             # Step 3. Get tables.
             self.progress.set_state("Fetching tables")
@@ -1083,7 +1092,10 @@ class Porter:
         )
 
     async def _setup_sequence(
-        self, sequence_name: str, stream_id_tables: Iterable[str]
+        self,
+        sequence_name: str,
+        stream_id_tables: Iterable[str],
+        column_name: str = "stream_id",
     ) -> None:
         """Set a sequence to the correct value."""
         current_stream_ids = []
@@ -1093,7 +1105,7 @@ class Porter:
                 await self.sqlite_store.db_pool.simple_select_one_onecol(
                     table=stream_id_table,
                     keyvalues={},
-                    retcol="COALESCE(MAX(stream_id), 1)",
+                    retcol=f"COALESCE(MAX({column_name}), 1)",
                     allow_none=True,
                 ),
             )
-- 
cgit 1.5.1


From 4f5bccbbba13ba10412497cb92a1460535cf7a25 Mon Sep 17 00:00:00 2001
From: Patrick Cloke <clokep@users.noreply.github.com>
Date: Wed, 2 Aug 2023 11:35:54 -0400
Subject: Add forward-compatibility for the redacts property (MSC2174).
 (#16013)

The location of the redacts field changes in room version 11. Ensure
it is copied to the *new* location for *old* room versions for
forwards-compatibility with clients.

Note that copying it to the *old* location for the *new* room version
was previously handled.
---
 changelog.d/16013.misc               |  1 +
 synapse/events/utils.py              | 18 +++++-----
 tests/rest/client/test_redactions.py | 67 +++++++++++++++++++++++++++---------
 3 files changed, 61 insertions(+), 25 deletions(-)
 create mode 100644 changelog.d/16013.misc

(limited to 'synapse')

diff --git a/changelog.d/16013.misc b/changelog.d/16013.misc
new file mode 100644
index 0000000000..bd161e13ed
--- /dev/null
+++ b/changelog.d/16013.misc
@@ -0,0 +1 @@
+Properly overwrite the `redacts` content-property for forwards-compatibility with room versions 1 through 10.
diff --git a/synapse/events/utils.py b/synapse/events/utils.py
index c890833b1d..967a6c245b 100644
--- a/synapse/events/utils.py
+++ b/synapse/events/utils.py
@@ -475,14 +475,16 @@ def serialize_event(
     if config.as_client_event:
         d = config.event_format(d)
 
-    # If the event is a redaction, copy the redacts field from the content to
-    # top-level for backwards compatibility.
-    if (
-        e.type == EventTypes.Redaction
-        and e.room_version.updated_redaction_rules
-        and e.redacts is not None
-    ):
-        d["redacts"] = e.redacts
+    # If the event is a redaction, the field with the redacted event ID appears
+    # in a different location depending on the room version. e.redacts handles
+    # fetching from the proper location; copy it to the other location for forwards-
+    # and backwards-compatibility with clients.
+    if e.type == EventTypes.Redaction and e.redacts is not None:
+        if e.room_version.updated_redaction_rules:
+            d["redacts"] = e.redacts
+        else:
+            d["content"] = dict(d["content"])
+            d["content"]["redacts"] = e.redacts
 
     only_event_fields = config.only_event_fields
     if only_event_fields:
diff --git a/tests/rest/client/test_redactions.py b/tests/rest/client/test_redactions.py
index 6028886bd6..180b635ea6 100644
--- a/tests/rest/client/test_redactions.py
+++ b/tests/rest/client/test_redactions.py
@@ -13,10 +13,12 @@
 # limitations under the License.
 from typing import List, Optional
 
+from parameterized import parameterized
+
 from twisted.test.proto_helpers import MemoryReactor
 
 from synapse.api.constants import EventTypes, RelationTypes
-from synapse.api.room_versions import RoomVersions
+from synapse.api.room_versions import RoomVersion, RoomVersions
 from synapse.rest import admin
 from synapse.rest.client import login, room, sync
 from synapse.server import HomeServer
@@ -569,50 +571,81 @@ class RedactionsTestCase(HomeserverTestCase):
         self.assertIn("body", event_dict["content"], event_dict)
         self.assertEqual("I'm in a thread!", event_dict["content"]["body"])
 
-    def test_content_redaction(self) -> None:
-        """MSC2174 moved the redacts property to the content."""
+    @parameterized.expand(
+        [
+            # Tuples of:
+            #   Room version
+            #   Boolean: True if the redaction event content should include the event ID.
+            #   Boolean: true if the resulting redaction event is expected to include the
+            #            event ID in the content.
+            (RoomVersions.V10, False, False),
+            (RoomVersions.V11, True, True),
+            (RoomVersions.V11, False, True),
+        ]
+    )
+    def test_redaction_content(
+        self, room_version: RoomVersion, include_content: bool, expect_content: bool
+    ) -> None:
+        """
+        Room version 11 moved the redacts property to the content.
+
+        Ensure that the event gets created properly and that the Client-Server
+        API servers the proper backwards-compatible version.
+        """
         # Create a room with the newer room version.
         room_id = self.helper.create_room_as(
             self.mod_user_id,
             tok=self.mod_access_token,
-            room_version=RoomVersions.V11.identifier,
+            room_version=room_version.identifier,
         )
 
         # Create an event.
         b = self.helper.send(room_id=room_id, tok=self.mod_access_token)
         event_id = b["event_id"]
 
-        # Attempt to redact it with a bogus event ID.
-        self._redact_event(
+        # Ensure the event ID in the URL and the content must match.
+        if include_content:
+            self._redact_event(
+                self.mod_access_token,
+                room_id,
+                event_id,
+                expect_code=400,
+                content={"redacts": "foo"},
+            )
+
+        # Redact it for real.
+        result = self._redact_event(
             self.mod_access_token,
             room_id,
             event_id,
-            expect_code=400,
-            content={"redacts": "foo"},
+            content={"redacts": event_id} if include_content else {},
         )
-
-        # Redact it for real.
-        self._redact_event(self.mod_access_token, room_id, event_id)
+        redaction_event_id = result["event_id"]
 
         # Sync the room, to get the id of the create event
         timeline = self._sync_room_timeline(self.mod_access_token, room_id)
         redact_event = timeline[-1]
         self.assertEqual(redact_event["type"], EventTypes.Redaction)
-        # The redacts key should be in the content.
+        # The redacts key should be in the content and the redacts keys.
         self.assertEquals(redact_event["content"]["redacts"], event_id)
-
-        # It should also be copied as the top-level redacts field for backwards
-        # compatibility.
         self.assertEquals(redact_event["redacts"], event_id)
 
         # But it isn't actually part of the event.
         def get_event(txn: LoggingTransaction) -> JsonDict:
             return db_to_json(
-                main_datastore._fetch_event_rows(txn, [event_id])[event_id].json
+                main_datastore._fetch_event_rows(txn, [redaction_event_id])[
+                    redaction_event_id
+                ].json
             )
 
         main_datastore = self.hs.get_datastores().main
         event_json = self.get_success(
             main_datastore.db_pool.runInteraction("get_event", get_event)
         )
-        self.assertNotIn("redacts", event_json)
+        self.assertEquals(event_json["type"], EventTypes.Redaction)
+        if expect_content:
+            self.assertNotIn("redacts", event_json)
+            self.assertEquals(event_json["content"]["redacts"], event_id)
+        else:
+            self.assertEquals(event_json["redacts"], event_id)
+            self.assertNotIn("redacts", event_json["content"])
-- 
cgit 1.5.1


From 9c462f18a4b6969f627349d956b9161968ab8252 Mon Sep 17 00:00:00 2001
From: reivilibre <oliverw@matrix.org>
Date: Thu, 3 Aug 2023 12:42:19 +0000
Subject: Allow modules to check whether the current worker is configured to
 run background tasks. (#15991)

---
 changelog.d/15991.misc         |  1 +
 synapse/module_api/__init__.py | 12 ++++++++++++
 2 files changed, 13 insertions(+)
 create mode 100644 changelog.d/15991.misc

(limited to 'synapse')

diff --git a/changelog.d/15991.misc b/changelog.d/15991.misc
new file mode 100644
index 0000000000..18f388cff8
--- /dev/null
+++ b/changelog.d/15991.misc
@@ -0,0 +1 @@
+Allow modules to check whether the current worker is configured to run background tasks.
\ No newline at end of file
diff --git a/synapse/module_api/__init__.py b/synapse/module_api/__init__.py
index 95f7800111..ba1a925003 100644
--- a/synapse/module_api/__init__.py
+++ b/synapse/module_api/__init__.py
@@ -1230,6 +1230,18 @@ class ModuleApi:
                 f,
             )
 
+    def should_run_background_tasks(self) -> bool:
+        """
+        Return true if and only if the current worker is configured to run
+        background tasks.
+        There should only be one worker configured to run background tasks, so
+        this is helpful when you need to only run a task on one worker but don't
+        have any other good way to choose which one.
+
+        Added in Synapse v1.89.0.
+        """
+        return self._hs.config.worker.run_background_tasks
+
     async def sleep(self, seconds: float) -> None:
         """Sleeps for the given number of seconds.
 
-- 
cgit 1.5.1


From f0a860908ba0309c89c9dba452d99b4f9c6928f7 Mon Sep 17 00:00:00 2001
From: Mathieu Velten <mathieuv@matrix.org>
Date: Thu, 3 Aug 2023 20:36:55 +0200
Subject: Allow config of the backoff algorithm for the federation client.
 (#15754)

Adds three new configuration variables:

* destination_min_retry_interval is identical to before (10mn).
* destination_retry_multiplier is now 2 instead of 5, the maximum value will
  be reached slower.
* destination_max_retry_interval is one day instead of (essentially) infinity.

Capping this will cause destinations to continue to be retried sometimes instead
of being lost forever. The previous value was 2 ^ 62 milliseconds.
---
 changelog.d/15754.misc                           |  1 +
 docs/usage/configuration/config_documentation.md | 11 +++++++++
 synapse/config/federation.py                     | 18 +++++++++++++++
 synapse/util/retryutils.py                       | 29 +++++++++++++-----------
 tests/storage/test_transactions.py               |  9 ++++++--
 tests/util/test_retryutils.py                    | 22 +++++++++---------
 6 files changed, 64 insertions(+), 26 deletions(-)
 create mode 100644 changelog.d/15754.misc

(limited to 'synapse')

diff --git a/changelog.d/15754.misc b/changelog.d/15754.misc
new file mode 100644
index 0000000000..4314d415a3
--- /dev/null
+++ b/changelog.d/15754.misc
@@ -0,0 +1 @@
+Allow for the configuration of the backoff algorithm for federation destinations.
diff --git a/docs/usage/configuration/config_documentation.md b/docs/usage/configuration/config_documentation.md
index 4e6fcd085a..c32608da2b 100644
--- a/docs/usage/configuration/config_documentation.md
+++ b/docs/usage/configuration/config_documentation.md
@@ -1242,6 +1242,14 @@ like sending a federation transaction.
 * `max_short_retries`: maximum number of retries for the short retry algo. Default to 3 attempts.
 * `max_long_retries`: maximum number of retries for the long retry algo. Default to 10 attempts.
 
+The following options control the retry logic when communicating with a specific homeserver destination.
+Unlike the previous configuration options, these values apply across all requests
+for a given destination and the state of the backoff is stored in the database.
+
+* `destination_min_retry_interval`: the initial backoff, after the first request fails. Defaults to 10m.
+* `destination_retry_multiplier`: how much we multiply the backoff by after each subsequent fail. Defaults to 2.
+* `destination_max_retry_interval`: a cap on the backoff. Defaults to a week.
+
 Example configuration:
 ```yaml
 federation:
@@ -1250,6 +1258,9 @@ federation:
   max_long_retry_delay: 100s
   max_short_retries: 5
   max_long_retries: 20
+  destination_min_retry_interval: 30s
+  destination_retry_multiplier: 5
+  destination_max_retry_interval: 12h
 ```
 ---
 ## Caching
diff --git a/synapse/config/federation.py b/synapse/config/federation.py
index 0e1cb8b6e3..97636039b8 100644
--- a/synapse/config/federation.py
+++ b/synapse/config/federation.py
@@ -65,5 +65,23 @@ class FederationConfig(Config):
         self.max_long_retries = federation_config.get("max_long_retries", 10)
         self.max_short_retries = federation_config.get("max_short_retries", 3)
 
+        # Allow for the configuration of the backoff algorithm used
+        # when trying to reach an unavailable destination.
+        # Unlike previous configuration those values applies across
+        # multiple requests and the state of the backoff is stored on DB.
+        self.destination_min_retry_interval_ms = Config.parse_duration(
+            federation_config.get("destination_min_retry_interval", "10m")
+        )
+        self.destination_retry_multiplier = federation_config.get(
+            "destination_retry_multiplier", 2
+        )
+        self.destination_max_retry_interval_ms = min(
+            Config.parse_duration(
+                federation_config.get("destination_max_retry_interval", "7d")
+            ),
+            # Set a hard-limit to not overflow the database column.
+            2**62,
+        )
+
 
 _METRICS_FOR_DOMAINS_SCHEMA = {"type": "array", "items": {"type": "string"}}
diff --git a/synapse/util/retryutils.py b/synapse/util/retryutils.py
index dcc037b982..27e9fc976c 100644
--- a/synapse/util/retryutils.py
+++ b/synapse/util/retryutils.py
@@ -27,15 +27,6 @@ if TYPE_CHECKING:
 
 logger = logging.getLogger(__name__)
 
-# the initial backoff, after the first transaction fails
-MIN_RETRY_INTERVAL = 10 * 60 * 1000
-
-# how much we multiply the backoff by after each subsequent fail
-RETRY_MULTIPLIER = 5
-
-# a cap on the backoff. (Essentially none)
-MAX_RETRY_INTERVAL = 2**62
-
 
 class NotRetryingDestination(Exception):
     def __init__(self, retry_last_ts: int, retry_interval: int, destination: str):
@@ -169,6 +160,16 @@ class RetryDestinationLimiter:
         self.notifier = notifier
         self.replication_client = replication_client
 
+        self.destination_min_retry_interval_ms = (
+            self.store.hs.config.federation.destination_min_retry_interval_ms
+        )
+        self.destination_retry_multiplier = (
+            self.store.hs.config.federation.destination_retry_multiplier
+        )
+        self.destination_max_retry_interval_ms = (
+            self.store.hs.config.federation.destination_max_retry_interval_ms
+        )
+
     def __enter__(self) -> None:
         pass
 
@@ -220,13 +221,15 @@ class RetryDestinationLimiter:
             # We couldn't connect.
             if self.retry_interval:
                 self.retry_interval = int(
-                    self.retry_interval * RETRY_MULTIPLIER * random.uniform(0.8, 1.4)
+                    self.retry_interval
+                    * self.destination_retry_multiplier
+                    * random.uniform(0.8, 1.4)
                 )
 
-                if self.retry_interval >= MAX_RETRY_INTERVAL:
-                    self.retry_interval = MAX_RETRY_INTERVAL
+                if self.retry_interval >= self.destination_max_retry_interval_ms:
+                    self.retry_interval = self.destination_max_retry_interval_ms
             else:
-                self.retry_interval = MIN_RETRY_INTERVAL
+                self.retry_interval = self.destination_min_retry_interval_ms
 
             logger.info(
                 "Connection to %s was unsuccessful (%s(%s)); backoff now %i",
diff --git a/tests/storage/test_transactions.py b/tests/storage/test_transactions.py
index 2fab84a529..ef06b50dbb 100644
--- a/tests/storage/test_transactions.py
+++ b/tests/storage/test_transactions.py
@@ -17,7 +17,6 @@ from twisted.test.proto_helpers import MemoryReactor
 from synapse.server import HomeServer
 from synapse.storage.databases.main.transactions import DestinationRetryTimings
 from synapse.util import Clock
-from synapse.util.retryutils import MAX_RETRY_INTERVAL
 
 from tests.unittest import HomeserverTestCase
 
@@ -57,8 +56,14 @@ class TransactionStoreTestCase(HomeserverTestCase):
         self.get_success(d)
 
     def test_large_destination_retry(self) -> None:
+        max_retry_interval_ms = (
+            self.hs.config.federation.destination_max_retry_interval_ms
+        )
         d = self.store.set_destination_retry_timings(
-            "example.com", MAX_RETRY_INTERVAL, MAX_RETRY_INTERVAL, MAX_RETRY_INTERVAL
+            "example.com",
+            max_retry_interval_ms,
+            max_retry_interval_ms,
+            max_retry_interval_ms,
         )
         self.get_success(d)
 
diff --git a/tests/util/test_retryutils.py b/tests/util/test_retryutils.py
index 5f8f4e76b5..1277e1a865 100644
--- a/tests/util/test_retryutils.py
+++ b/tests/util/test_retryutils.py
@@ -11,12 +11,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-from synapse.util.retryutils import (
-    MIN_RETRY_INTERVAL,
-    RETRY_MULTIPLIER,
-    NotRetryingDestination,
-    get_retry_limiter,
-)
+from synapse.util.retryutils import NotRetryingDestination, get_retry_limiter
 
 from tests.unittest import HomeserverTestCase
 
@@ -42,6 +37,11 @@ class RetryLimiterTestCase(HomeserverTestCase):
 
         limiter = self.get_success(get_retry_limiter("test_dest", self.clock, store))
 
+        min_retry_interval_ms = (
+            self.hs.config.federation.destination_min_retry_interval_ms
+        )
+        retry_multiplier = self.hs.config.federation.destination_retry_multiplier
+
         self.pump(1)
         try:
             with limiter:
@@ -57,7 +57,7 @@ class RetryLimiterTestCase(HomeserverTestCase):
         assert new_timings is not None
         self.assertEqual(new_timings.failure_ts, failure_ts)
         self.assertEqual(new_timings.retry_last_ts, failure_ts)
-        self.assertEqual(new_timings.retry_interval, MIN_RETRY_INTERVAL)
+        self.assertEqual(new_timings.retry_interval, min_retry_interval_ms)
 
         # now if we try again we should get a failure
         self.get_failure(
@@ -68,7 +68,7 @@ class RetryLimiterTestCase(HomeserverTestCase):
         # advance the clock and try again
         #
 
-        self.pump(MIN_RETRY_INTERVAL)
+        self.pump(min_retry_interval_ms)
         limiter = self.get_success(get_retry_limiter("test_dest", self.clock, store))
 
         self.pump(1)
@@ -87,16 +87,16 @@ class RetryLimiterTestCase(HomeserverTestCase):
         self.assertEqual(new_timings.failure_ts, failure_ts)
         self.assertEqual(new_timings.retry_last_ts, retry_ts)
         self.assertGreaterEqual(
-            new_timings.retry_interval, MIN_RETRY_INTERVAL * RETRY_MULTIPLIER * 0.5
+            new_timings.retry_interval, min_retry_interval_ms * retry_multiplier * 0.5
         )
         self.assertLessEqual(
-            new_timings.retry_interval, MIN_RETRY_INTERVAL * RETRY_MULTIPLIER * 2.0
+            new_timings.retry_interval, min_retry_interval_ms * retry_multiplier * 2.0
         )
 
         #
         # one more go, with success
         #
-        self.reactor.advance(MIN_RETRY_INTERVAL * RETRY_MULTIPLIER * 2.0)
+        self.reactor.advance(min_retry_interval_ms * retry_multiplier * 2.0)
         limiter = self.get_success(get_retry_limiter("test_dest", self.clock, store))
 
         self.pump(1)
-- 
cgit 1.5.1


From 0a5f4f766514b84aff84ff17dffd5301a437c797 Mon Sep 17 00:00:00 2001
From: Shay <hillerys@element.io>
Date: Thu, 3 Aug 2023 11:43:51 -0700
Subject: Move support for application service query parameter authorization
 behind a configuration option (#16017)

---
 changelog.d/16017.removal                        |  1 +
 docs/upgrade.md                                  | 16 ++++-
 docs/usage/configuration/config_documentation.md | 14 ++++
 synapse/appservice/api.py                        | 34 +++++++---
 synapse/config/appservice.py                     |  8 +++
 tests/appservice/test_api.py                     | 85 ++++++++++++++++++++++--
 6 files changed, 144 insertions(+), 14 deletions(-)
 create mode 100644 changelog.d/16017.removal

(limited to 'synapse')

diff --git a/changelog.d/16017.removal b/changelog.d/16017.removal
new file mode 100644
index 0000000000..6b72442892
--- /dev/null
+++ b/changelog.d/16017.removal
@@ -0,0 +1 @@
+Move support for application service query parameter authorization behind a configuration option.
diff --git a/docs/upgrade.md b/docs/upgrade.md
index 5dde6c769e..f50a279e98 100644
--- a/docs/upgrade.md
+++ b/docs/upgrade.md
@@ -88,6 +88,21 @@ process, for example:
     dpkg -i matrix-synapse-py3_1.3.0+stretch1_amd64.deb
     ```
 
+# Upgrading to v1.90.0
+
+## App service query parameter authorization is now a configuration option
+
+Synapse v1.81.0 deprecated application service authorization via query parameters as this is
+considered insecure - and from Synapse v1.71.0 forwards the application service token has also been sent via 
+[the `Authorization` header](https://spec.matrix.org/v1.6/application-service-api/#authorization)], making the insecure
+query parameter authorization redundant. Since removing the ability to continue to use query parameters could break 
+backwards compatibility it has now been put behind a configuration option, `use_appservice_legacy_authorization`.  
+This option defaults to false, but can be activated by adding 
+```yaml
+use_appservice_legacy_authorization: true 
+```
+to your configuration.
+
 # Upgrading to v1.89.0
 
 ## Removal of unspecced `user` property for `/register`
@@ -97,7 +112,6 @@ The standard `username` property should be used instead. See the
 [Application Service specification](https://spec.matrix.org/v1.7/application-service-api/#server-admin-style-permissions)
 for more information.
 
-
 # Upgrading to v1.88.0
 
 ## Minimum supported Python version
diff --git a/docs/usage/configuration/config_documentation.md b/docs/usage/configuration/config_documentation.md
index c32608da2b..2987c9332d 100644
--- a/docs/usage/configuration/config_documentation.md
+++ b/docs/usage/configuration/config_documentation.md
@@ -2848,6 +2848,20 @@ Example configuration:
 ```yaml
 track_appservice_user_ips: true
 ```
+---
+### `use_appservice_legacy_authorization`
+
+Whether to send the application service access tokens via the `access_token` query parameter
+per older versions of the Matrix specification. Defaults to false. Set to true to enable sending
+access tokens via a query parameter.
+
+**Enabling this option is considered insecure and is not recommended. **
+
+Example configuration:
+```yaml
+use_appservice_legacy_authorization: true 
+```
+
 ---
 ### `macaroon_secret_key`
 
diff --git a/synapse/appservice/api.py b/synapse/appservice/api.py
index 359999f680..de7a94bf26 100644
--- a/synapse/appservice/api.py
+++ b/synapse/appservice/api.py
@@ -16,7 +16,6 @@ import logging
 import urllib.parse
 from typing import (
     TYPE_CHECKING,
-    Any,
     Dict,
     Iterable,
     List,
@@ -25,6 +24,7 @@ from typing import (
     Sequence,
     Tuple,
     TypeVar,
+    Union,
 )
 
 from prometheus_client import Counter
@@ -119,6 +119,7 @@ class ApplicationServiceApi(SimpleHttpClient):
     def __init__(self, hs: "HomeServer"):
         super().__init__(hs)
         self.clock = hs.get_clock()
+        self.config = hs.config.appservice
 
         self.protocol_meta_cache: ResponseCache[Tuple[str, str]] = ResponseCache(
             hs.get_clock(), "as_protocol_meta", timeout_ms=HOUR_IN_MS
@@ -132,9 +133,12 @@ class ApplicationServiceApi(SimpleHttpClient):
         assert service.hs_token is not None
 
         try:
+            args = None
+            if self.config.use_appservice_legacy_authorization:
+                args = {"access_token": service.hs_token}
             response = await self.get_json(
                 f"{service.url}{APP_SERVICE_PREFIX}/users/{urllib.parse.quote(user_id)}",
-                {"access_token": service.hs_token},
+                args,
                 headers={"Authorization": [f"Bearer {service.hs_token}"]},
             )
             if response is not None:  # just an empty json object
@@ -155,9 +159,12 @@ class ApplicationServiceApi(SimpleHttpClient):
         assert service.hs_token is not None
 
         try:
+            args = None
+            if self.config.use_appservice_legacy_authorization:
+                args = {"access_token": service.hs_token}
             response = await self.get_json(
                 f"{service.url}{APP_SERVICE_PREFIX}/rooms/{urllib.parse.quote(alias)}",
-                {"access_token": service.hs_token},
+                args,
                 headers={"Authorization": [f"Bearer {service.hs_token}"]},
             )
             if response is not None:  # just an empty json object
@@ -190,10 +197,12 @@ class ApplicationServiceApi(SimpleHttpClient):
         assert service.hs_token is not None
 
         try:
-            args: Mapping[Any, Any] = {
-                **fields,
-                b"access_token": service.hs_token,
-            }
+            args: Mapping[bytes, Union[List[bytes], str]] = fields
+            if self.config.use_appservice_legacy_authorization:
+                args = {
+                    **fields,
+                    b"access_token": service.hs_token,
+                }
             response = await self.get_json(
                 f"{service.url}{APP_SERVICE_PREFIX}/thirdparty/{kind}/{urllib.parse.quote(protocol)}",
                 args=args,
@@ -231,9 +240,12 @@ class ApplicationServiceApi(SimpleHttpClient):
             # This is required by the configuration.
             assert service.hs_token is not None
             try:
+                args = None
+                if self.config.use_appservice_legacy_authorization:
+                    args = {"access_token": service.hs_token}
                 info = await self.get_json(
                     f"{service.url}{APP_SERVICE_PREFIX}/thirdparty/protocol/{urllib.parse.quote(protocol)}",
-                    {"access_token": service.hs_token},
+                    args,
                     headers={"Authorization": [f"Bearer {service.hs_token}"]},
                 )
 
@@ -344,10 +356,14 @@ class ApplicationServiceApi(SimpleHttpClient):
                 }
 
         try:
+            args = None
+            if self.config.use_appservice_legacy_authorization:
+                args = {"access_token": service.hs_token}
+
             await self.put_json(
                 f"{service.url}{APP_SERVICE_PREFIX}/transactions/{urllib.parse.quote(str(txn_id))}",
                 json_body=body,
-                args={"access_token": service.hs_token},
+                args=args,
                 headers={"Authorization": [f"Bearer {service.hs_token}"]},
             )
             if logger.isEnabledFor(logging.DEBUG):
diff --git a/synapse/config/appservice.py b/synapse/config/appservice.py
index c2710fdf04..919f81a9b7 100644
--- a/synapse/config/appservice.py
+++ b/synapse/config/appservice.py
@@ -43,6 +43,14 @@ class AppServiceConfig(Config):
             )
 
         self.track_appservice_user_ips = config.get("track_appservice_user_ips", False)
+        self.use_appservice_legacy_authorization = config.get(
+            "use_appservice_legacy_authorization", False
+        )
+        if self.use_appservice_legacy_authorization:
+            logger.warning(
+                "The use of appservice legacy authorization via query params is deprecated"
+                " and should be considered insecure."
+            )
 
 
 def load_appservices(
diff --git a/tests/appservice/test_api.py b/tests/appservice/test_api.py
index 807dc2f21c..3c635e3dcb 100644
--- a/tests/appservice/test_api.py
+++ b/tests/appservice/test_api.py
@@ -11,7 +11,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-from typing import Any, List, Mapping, Sequence, Union
+from typing import Any, List, Mapping, Optional, Sequence, Union
 from unittest.mock import Mock
 
 from twisted.test.proto_helpers import MemoryReactor
@@ -22,6 +22,7 @@ from synapse.types import JsonDict
 from synapse.util import Clock
 
 from tests import unittest
+from tests.unittest import override_config
 
 PROTOCOL = "myproto"
 TOKEN = "myastoken"
@@ -39,7 +40,7 @@ class ApplicationServiceApiTestCase(unittest.HomeserverTestCase):
             hs_token=TOKEN,
         )
 
-    def test_query_3pe_authenticates_token(self) -> None:
+    def test_query_3pe_authenticates_token_via_header(self) -> None:
         """
         Tests that 3pe queries to the appservice are authenticated
         with the appservice's token.
@@ -74,12 +75,88 @@ class ApplicationServiceApiTestCase(unittest.HomeserverTestCase):
             args: Mapping[Any, Any],
             headers: Mapping[Union[str, bytes], Sequence[Union[str, bytes]]],
         ) -> List[JsonDict]:
-            # Ensure the access token is passed as both a header and query arg.
-            if not headers.get("Authorization") or not args.get(b"access_token"):
+            # Ensure the access token is passed as a header.
+            if not headers or not headers.get("Authorization"):
                 raise RuntimeError("Access token not provided")
+            # ... and not as a query param
+            if b"access_token" in args:
+                raise RuntimeError(
+                    "Access token should not be passed as a query param."
+                )
 
             self.assertEqual(headers.get("Authorization"), [f"Bearer {TOKEN}"])
+            self.request_url = url
+            if url == URL_USER:
+                return SUCCESS_RESULT_USER
+            elif url == URL_LOCATION:
+                return SUCCESS_RESULT_LOCATION
+            else:
+                raise RuntimeError(
+                    "URL provided was invalid. This should never be seen."
+                )
+
+        # We assign to a method, which mypy doesn't like.
+        self.api.get_json = Mock(side_effect=get_json)  # type: ignore[assignment]
+
+        result = self.get_success(
+            self.api.query_3pe(self.service, "user", PROTOCOL, {b"some": [b"field"]})
+        )
+        self.assertEqual(self.request_url, URL_USER)
+        self.assertEqual(result, SUCCESS_RESULT_USER)
+        result = self.get_success(
+            self.api.query_3pe(
+                self.service, "location", PROTOCOL, {b"some": [b"field"]}
+            )
+        )
+        self.assertEqual(self.request_url, URL_LOCATION)
+        self.assertEqual(result, SUCCESS_RESULT_LOCATION)
+
+    @override_config({"use_appservice_legacy_authorization": True})
+    def test_query_3pe_authenticates_token_via_param(self) -> None:
+        """
+        Tests that 3pe queries to the appservice are authenticated
+        with the appservice's token.
+        """
+
+        SUCCESS_RESULT_USER = [
+            {
+                "protocol": PROTOCOL,
+                "userid": "@a:user",
+                "fields": {
+                    "more": "fields",
+                },
+            }
+        ]
+        SUCCESS_RESULT_LOCATION = [
+            {
+                "protocol": PROTOCOL,
+                "alias": "#a:room",
+                "fields": {
+                    "more": "fields",
+                },
+            }
+        ]
+
+        URL_USER = f"{URL}/_matrix/app/v1/thirdparty/user/{PROTOCOL}"
+        URL_LOCATION = f"{URL}/_matrix/app/v1/thirdparty/location/{PROTOCOL}"
+
+        self.request_url = None
+
+        async def get_json(
+            url: str,
+            args: Mapping[Any, Any],
+            headers: Optional[
+                Mapping[Union[str, bytes], Sequence[Union[str, bytes]]]
+            ] = None,
+        ) -> List[JsonDict]:
+            # Ensure the access token is passed as a both a query param and in the headers.
+            if not args.get(b"access_token"):
+                raise RuntimeError("Access token should be provided in query params.")
+            if not headers or not headers.get("Authorization"):
+                raise RuntimeError("Access token should be provided in auth headers.")
+
             self.assertEqual(args.get(b"access_token"), TOKEN)
+            self.assertEqual(headers.get("Authorization"), [f"Bearer {TOKEN}"])
             self.request_url = url
             if url == URL_USER:
                 return SUCCESS_RESULT_USER
-- 
cgit 1.5.1


From d98a43d9226cbb4b9ab5ad3abd9b630548c2f09f Mon Sep 17 00:00:00 2001
From: Patrick Cloke <clokep@users.noreply.github.com>
Date: Fri, 4 Aug 2023 07:47:18 -0400
Subject: Stabilize support for MSC3970: updated transaction semantics (scope
 to `device_id`) (#15629)

For now this maintains compatible with old Synapses by falling back
to using transaction semantics on a per-access token. A future version
of Synapse will drop support for this.
---
 changelog.d/15629.feature                |  1 +
 synapse/config/experimental.py           |  9 -------
 synapse/events/utils.py                  | 42 ++++++++++++++++----------------
 synapse/handlers/message.py              | 12 ++++-----
 synapse/rest/client/transactions.py      | 12 ++++-----
 synapse/server.py                        |  4 +--
 synapse/storage/databases/main/events.py | 15 +++++-------
 synapse/storage/schema/__init__.py       |  5 +++-
 synapse/types/__init__.py                |  7 +++---
 9 files changed, 48 insertions(+), 59 deletions(-)
 create mode 100644 changelog.d/15629.feature

(limited to 'synapse')

diff --git a/changelog.d/15629.feature b/changelog.d/15629.feature
new file mode 100644
index 0000000000..16264effca
--- /dev/null
+++ b/changelog.d/15629.feature
@@ -0,0 +1 @@
+Scope transaction IDs to devices (implement [MSC3970](https://github.com/matrix-org/matrix-spec-proposals/pull/3970)).
diff --git a/synapse/config/experimental.py b/synapse/config/experimental.py
index 1695ed8ca3..ac9449b18f 100644
--- a/synapse/config/experimental.py
+++ b/synapse/config/experimental.py
@@ -216,12 +216,6 @@ class MSC3861:
                 ("session_lifetime",),
             )
 
-        if not root.experimental.msc3970_enabled:
-            raise ConfigError(
-                "experimental_features.msc3970_enabled must be 'true' when OAuth delegation is enabled",
-                ("experimental_features", "msc3970_enabled"),
-            )
-
 
 @attr.s(auto_attribs=True, frozen=True, slots=True)
 class MSC3866Config:
@@ -397,9 +391,6 @@ class ExperimentalConfig(Config):
                 "Invalid MSC3861 configuration", ("experimental", "msc3861")
             ) from exc
 
-        # MSC3970: Scope transaction IDs to devices
-        self.msc3970_enabled = experimental.get("msc3970_enabled", self.msc3861.enabled)
-
         # Check that none of the other config options conflict with MSC3861 when enabled
         self.msc3861.check_config_conflicts(self.root)
 
diff --git a/synapse/events/utils.py b/synapse/events/utils.py
index 967a6c245b..52acb21955 100644
--- a/synapse/events/utils.py
+++ b/synapse/events/utils.py
@@ -394,7 +394,6 @@ def serialize_event(
     time_now_ms: int,
     *,
     config: SerializeEventConfig = _DEFAULT_SERIALIZE_EVENT_CONFIG,
-    msc3970_enabled: bool = False,
 ) -> JsonDict:
     """Serialize event for clients
 
@@ -402,8 +401,6 @@ def serialize_event(
         e
         time_now_ms
         config: Event serialization config
-        msc3970_enabled: Whether MSC3970 is enabled. It changes whether we should
-            include the `transaction_id` in the event's `unsigned` section.
 
     Returns:
         The serialized event dictionary.
@@ -429,38 +426,46 @@ def serialize_event(
             e.unsigned["redacted_because"],
             time_now_ms,
             config=config,
-            msc3970_enabled=msc3970_enabled,
         )
 
     # If we have a txn_id saved in the internal_metadata, we should include it in the
     # unsigned section of the event if it was sent by the same session as the one
     # requesting the event.
     txn_id: Optional[str] = getattr(e.internal_metadata, "txn_id", None)
-    if txn_id is not None and config.requester is not None:
-        # For the MSC3970 rules to be applied, we *need* to have the device ID in the
-        # event internal metadata. Since we were not recording them before, if it hasn't
-        # been recorded, we fallback to the old behaviour.
+    if (
+        txn_id is not None
+        and config.requester is not None
+        and config.requester.user.to_string() == e.sender
+    ):
+        # Some events do not have the device ID stored in the internal metadata,
+        # this includes old events as well as those created by appservice, guests,
+        # or with tokens minted with the admin API. For those events, fallback
+        # to using the access token instead.
         event_device_id: Optional[str] = getattr(e.internal_metadata, "device_id", None)
-        if msc3970_enabled and event_device_id is not None:
+        if event_device_id is not None:
             if event_device_id == config.requester.device_id:
                 d["unsigned"]["transaction_id"] = txn_id
 
         else:
-            # The pre-MSC3970 behaviour is to only include the transaction ID if the
-            # event was sent from the same access token. For regular users, we can use
-            # the access token ID to determine this. For guests, we can't, but since
-            # each guest only has one access token, we can just check that the event was
-            # sent by the same user as the one requesting the event.
+            # Fallback behaviour: only include the transaction ID if the event
+            # was sent from the same access token.
+            #
+            # For regular users, the access token ID can be used to determine this.
+            # This includes access tokens minted with the admin API.
+            #
+            # For guests and appservice users, we can't check the access token ID
+            # so assume it is the same session.
             event_token_id: Optional[int] = getattr(
                 e.internal_metadata, "token_id", None
             )
-            if config.requester.user.to_string() == e.sender and (
+            if (
                 (
                     event_token_id is not None
                     and config.requester.access_token_id is not None
                     and event_token_id == config.requester.access_token_id
                 )
                 or config.requester.is_guest
+                or config.requester.app_service
             ):
                 d["unsigned"]["transaction_id"] = txn_id
 
@@ -504,9 +509,6 @@ class EventClientSerializer:
     clients.
     """
 
-    def __init__(self, *, msc3970_enabled: bool = False):
-        self._msc3970_enabled = msc3970_enabled
-
     def serialize_event(
         self,
         event: Union[JsonDict, EventBase],
@@ -531,9 +533,7 @@ class EventClientSerializer:
         if not isinstance(event, EventBase):
             return event
 
-        serialized_event = serialize_event(
-            event, time_now, config=config, msc3970_enabled=self._msc3970_enabled
-        )
+        serialized_event = serialize_event(event, time_now, config=config)
 
         # Check if there are any bundled aggregations to include with the event.
         if bundle_aggregations:
diff --git a/synapse/handlers/message.py b/synapse/handlers/message.py
index c656e07d37..d485f21e49 100644
--- a/synapse/handlers/message.py
+++ b/synapse/handlers/message.py
@@ -561,8 +561,6 @@ class EventCreationHandler:
                 expiry_ms=30 * 60 * 1000,
             )
 
-        self._msc3970_enabled = hs.config.experimental.msc3970_enabled
-
     async def create_event(
         self,
         requester: Requester,
@@ -897,9 +895,8 @@ class EventCreationHandler:
         """
         existing_event_id = None
 
-        if self._msc3970_enabled and requester.device_id:
-            # When MSC3970 is enabled, we lookup for events sent by the same device first,
-            # and fallback to the old behaviour if none were found.
+        # According to the spec, transactions are scoped to a user's device ID.
+        if requester.device_id:
             existing_event_id = (
                 await self.store.get_event_id_from_transaction_id_and_device_id(
                     room_id,
@@ -911,8 +908,9 @@ class EventCreationHandler:
             if existing_event_id:
                 return existing_event_id
 
-        # Pre-MSC3970, we looked up for events that were sent by the same session by
-        # using the access token ID.
+        # Some requsters don't have device IDs (appservice, guests, and access
+        # tokens minted with the admin API), fallback to checking the access token
+        # ID, which should be close enough.
         if requester.access_token_id:
             existing_event_id = (
                 await self.store.get_event_id_from_transaction_id_and_token_id(
diff --git a/synapse/rest/client/transactions.py b/synapse/rest/client/transactions.py
index 0d8a63d8be..3d814c404d 100644
--- a/synapse/rest/client/transactions.py
+++ b/synapse/rest/client/transactions.py
@@ -50,8 +50,6 @@ class HttpTransactionCache:
         # for at *LEAST* 30 mins, and at *MOST* 60 mins.
         self.cleaner = self.clock.looping_call(self._cleanup, CLEANUP_PERIOD_MS)
 
-        self._msc3970_enabled = hs.config.experimental.msc3970_enabled
-
     def _get_transaction_key(self, request: IRequest, requester: Requester) -> Hashable:
         """A helper function which returns a transaction key that can be used
         with TransactionCache for idempotent requests.
@@ -78,18 +76,20 @@ class HttpTransactionCache:
         elif requester.app_service is not None:
             return (path, "appservice", requester.app_service.id)
 
-        # With MSC3970, we use the user ID and device ID as the transaction key
-        elif self._msc3970_enabled:
+        # Use the user ID and device ID as the transaction key.
+        elif requester.device_id:
             assert requester.user, "Requester must have a user"
             assert requester.device_id, "Requester must have a device_id"
             return (path, "user", requester.user, requester.device_id)
 
-        # Otherwise, the pre-MSC3970 behaviour is to use the access token ID
+        # Some requsters don't have device IDs, these are mostly handled above
+        # (appservice and guest users), but does not cover access tokens minted
+        # by the admin API. Use the access token ID instead.
         else:
             assert (
                 requester.access_token_id is not None
             ), "Requester must have an access_token_id"
-            return (path, "user", requester.access_token_id)
+            return (path, "user_admin", requester.access_token_id)
 
     def fetch_or_execute_request(
         self,
diff --git a/synapse/server.py b/synapse/server.py
index 8430f99ef2..e753ff0377 100644
--- a/synapse/server.py
+++ b/synapse/server.py
@@ -785,9 +785,7 @@ class HomeServer(metaclass=abc.ABCMeta):
 
     @cache_in_self
     def get_event_client_serializer(self) -> EventClientSerializer:
-        return EventClientSerializer(
-            msc3970_enabled=self.config.experimental.msc3970_enabled
-        )
+        return EventClientSerializer()
 
     @cache_in_self
     def get_password_policy_handler(self) -> PasswordPolicyHandler:
diff --git a/synapse/storage/databases/main/events.py b/synapse/storage/databases/main/events.py
index bd3f14fb71..c1353b18c1 100644
--- a/synapse/storage/databases/main/events.py
+++ b/synapse/storage/databases/main/events.py
@@ -127,8 +127,6 @@ class PersistEventsStore:
         self._backfill_id_gen: AbstractStreamIdGenerator = self.store._backfill_id_gen
         self._stream_id_gen: AbstractStreamIdGenerator = self.store._stream_id_gen
 
-        self._msc3970_enabled = hs.config.experimental.msc3970_enabled
-
     @trace
     async def _persist_events_and_state_updates(
         self,
@@ -1012,9 +1010,11 @@ class PersistEventsStore:
                         )
                     )
 
-        # Pre-MSC3970, we rely on the access_token_id to scope the txn_id for events.
-        # Since this is an experimental flag, we still store the mapping even if the
-        # flag is disabled.
+        # Synapse usually relies on the device_id to scope transactions for events,
+        # except for users without device IDs (appservice, guests, and access
+        # tokens minted with the admin API) which use the access token ID instead.
+        #
+        # TODO https://github.com/matrix-org/synapse/issues/16042
         if to_insert_token_id:
             self.db_pool.simple_insert_many_txn(
                 txn,
@@ -1030,10 +1030,7 @@ class PersistEventsStore:
                 values=to_insert_token_id,
             )
 
-        # With MSC3970, we rely on the device_id instead to scope the txn_id for events.
-        # We're only inserting if MSC3970 is *enabled*, because else the pre-MSC3970
-        # behaviour would allow for a UNIQUE constraint violation on this table
-        if to_insert_device_id and self._msc3970_enabled:
+        if to_insert_device_id:
             self.db_pool.simple_insert_many_txn(
                 txn,
                 table="event_txn_id_device_id",
diff --git a/synapse/storage/schema/__init__.py b/synapse/storage/schema/__init__.py
index d3ec648f6d..7de9949a5b 100644
--- a/synapse/storage/schema/__init__.py
+++ b/synapse/storage/schema/__init__.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-SCHEMA_VERSION = 79  # remember to update the list below when updating
+SCHEMA_VERSION = 80  # remember to update the list below when updating
 """Represents the expectations made by the codebase about the database schema
 
 This should be incremented whenever the codebase changes its requirements on the
@@ -110,6 +110,9 @@ Changes in SCHEMA_VERSION = 78
 Changes in SCHEMA_VERSION = 79
     - Add tables to handle in DB read-write locks.
     - Add some mitigations for a painful race between foreground and background updates, cf #15677.
+
+Changes in SCHEMA_VERSION = 80
+    - The event_txn_id_device_id is always written to for new events.
 """
 
 
diff --git a/synapse/types/__init__.py b/synapse/types/__init__.py
index fdfd465c8d..39a1ae4ac3 100644
--- a/synapse/types/__init__.py
+++ b/synapse/types/__init__.py
@@ -117,11 +117,12 @@ class Requester:
 
     Attributes:
         user:  id of the user making the request
-        access_token_id:  *ID* of the access token used for this
-            request, or None if it came via the appservice API or similar
+        access_token_id:  *ID* of the access token used for this request, or
+            None for appservices, guests, and tokens generated by the admin API
         is_guest:  True if the user making this request is a guest user
         shadow_banned:  True if the user making this request has been shadow-banned.
-        device_id:  device_id which was set at authentication time
+        device_id:  device_id which was set at authentication time, or
+            None for appservices, guests, and tokens generated by the admin API
         app_service:  the AS requesting on behalf of the user
         authenticated_entity: The entity that authenticated when making the request.
             This is different to the user_id when an admin user or the server is
-- 
cgit 1.5.1


From 84ae2e3f6fb86115df767bb2f1fb16ac2fbaa7c3 Mon Sep 17 00:00:00 2001
From: Shay <hillerys@element.io>
Date: Fri, 4 Aug 2023 10:49:54 -0700
Subject: Fix deletion for Dehydrated Devices (#16046)

---
 changelog.d/16046.bugfix          |   1 +
 synapse/handlers/device.py        |  16 +++++
 synapse/rest/client/devices.py    |  14 ++--
 tests/rest/client/test_devices.py | 139 +++++++++++++++++++++++++++++++++++++-
 4 files changed, 165 insertions(+), 5 deletions(-)
 create mode 100644 changelog.d/16046.bugfix

(limited to 'synapse')

diff --git a/changelog.d/16046.bugfix b/changelog.d/16046.bugfix
new file mode 100644
index 0000000000..ce5a9ae4b5
--- /dev/null
+++ b/changelog.d/16046.bugfix
@@ -0,0 +1 @@
+Fix deletion in dehydrated devices v2.
diff --git a/synapse/handlers/device.py b/synapse/handlers/device.py
index f3a713f5fa..b7bf70a72d 100644
--- a/synapse/handlers/device.py
+++ b/synapse/handlers/device.py
@@ -722,6 +722,22 @@ class DeviceHandler(DeviceWorkerHandler):
 
         return {"success": True}
 
+    async def delete_dehydrated_device(self, user_id: str, device_id: str) -> None:
+        """
+        Delete a stored dehydrated device.
+
+        Args:
+            user_id: the user_id to delete the device from
+            device_id: id of the dehydrated device to delete
+        """
+        success = await self.store.remove_dehydrated_device(user_id, device_id)
+
+        if not success:
+            raise errors.NotFoundError()
+
+        await self.delete_devices(user_id, [device_id])
+        await self.store.delete_e2e_keys_by_device(user_id=user_id, device_id=device_id)
+
     @wrap_as_background_process("_handle_new_device_update_async")
     async def _handle_new_device_update_async(self) -> None:
         """Called when we have a new local device list update that we need to
diff --git a/synapse/rest/client/devices.py b/synapse/rest/client/devices.py
index 690d2ec406..dd3f7fd666 100644
--- a/synapse/rest/client/devices.py
+++ b/synapse/rest/client/devices.py
@@ -513,10 +513,8 @@ class DehydratedDeviceV2Servlet(RestServlet):
         if dehydrated_device is not None:
             (device_id, device_data) = dehydrated_device
 
-            result = await self.device_handler.rehydrate_device(
-                requester.user.to_string(),
-                self.auth.get_access_token_from_request(request),
-                device_id,
+            await self.device_handler.delete_dehydrated_device(
+                requester.user.to_string(), device_id
             )
 
             result = {"device_id": device_id}
@@ -538,6 +536,14 @@ class DehydratedDeviceV2Servlet(RestServlet):
         requester = await self.auth.get_user_by_req(request)
         user_id = requester.user.to_string()
 
+        old_dehydrated_device = await self.device_handler.get_dehydrated_device(user_id)
+
+        # if an old device exists, delete it before creating a new one
+        if old_dehydrated_device:
+            await self.device_handler.delete_dehydrated_device(
+                user_id, old_dehydrated_device[0]
+            )
+
         device_info = submission.dict()
         if "device_keys" not in device_info.keys():
             raise SynapseError(
diff --git a/tests/rest/client/test_devices.py b/tests/rest/client/test_devices.py
index b7d420cfec..3cf29c10ea 100644
--- a/tests/rest/client/test_devices.py
+++ b/tests/rest/client/test_devices.py
@@ -379,4 +379,141 @@ class DehydratedDeviceTestCase(unittest.HomeserverTestCase):
             access_token=token,
             shorthand=False,
         )
-        self.assertEqual(channel.code, 404)
+        self.assertEqual(channel.code, 401)
+
+    @unittest.override_config(
+        {"experimental_features": {"msc2697_enabled": False, "msc3814_enabled": True}}
+    )
+    def test_msc3814_dehydrated_device_delete_works(self) -> None:
+        user = self.register_user("mikey", "pass")
+        token = self.login(user, "pass", device_id="device1")
+        content: JsonDict = {
+            "device_data": {
+                "algorithm": "m.dehydration.v1.olm",
+            },
+            "device_id": "device2",
+            "initial_device_display_name": "foo bar",
+            "device_keys": {
+                "user_id": "@mikey:test",
+                "device_id": "device2",
+                "valid_until_ts": "80",
+                "algorithms": [
+                    "m.olm.curve25519-aes-sha2",
+                ],
+                "keys": {
+                    "<algorithm>:<device_id>": "<key_base64>",
+                },
+                "signatures": {
+                    "<user_id>": {"<algorithm>:<device_id>": "<signature_base64>"}
+                },
+            },
+        }
+        channel = self.make_request(
+            "PUT",
+            "_matrix/client/unstable/org.matrix.msc3814.v1/dehydrated_device",
+            content=content,
+            access_token=token,
+            shorthand=False,
+        )
+        self.assertEqual(channel.code, 200)
+        device_id = channel.json_body.get("device_id")
+        assert device_id is not None
+        self.assertIsInstance(device_id, str)
+        self.assertEqual("device2", device_id)
+
+        # ensure that keys were uploaded and available
+        channel = self.make_request(
+            "POST",
+            "/_matrix/client/r0/keys/query",
+            {
+                "device_keys": {
+                    user: ["device2"],
+                },
+            },
+            token,
+        )
+        self.assertEqual(
+            channel.json_body["device_keys"][user]["device2"]["keys"],
+            {
+                "<algorithm>:<device_id>": "<key_base64>",
+            },
+        )
+
+        # delete the dehydrated device
+        channel = self.make_request(
+            "DELETE",
+            "_matrix/client/unstable/org.matrix.msc3814.v1/dehydrated_device",
+            access_token=token,
+            shorthand=False,
+        )
+        self.assertEqual(channel.code, 200)
+
+        # ensure that keys are no longer available for deleted device
+        channel = self.make_request(
+            "POST",
+            "/_matrix/client/r0/keys/query",
+            {
+                "device_keys": {
+                    user: ["device2"],
+                },
+            },
+            token,
+        )
+        self.assertEqual(channel.json_body["device_keys"], {"@mikey:test": {}})
+
+        # check that an old device is deleted when user PUTs a new device
+        # First, create a device
+        content["device_id"] = "device3"
+        content["device_keys"]["device_id"] = "device3"
+        channel = self.make_request(
+            "PUT",
+            "_matrix/client/unstable/org.matrix.msc3814.v1/dehydrated_device",
+            content=content,
+            access_token=token,
+            shorthand=False,
+        )
+        self.assertEqual(channel.code, 200)
+        device_id = channel.json_body.get("device_id")
+        assert device_id is not None
+        self.assertIsInstance(device_id, str)
+        self.assertEqual("device3", device_id)
+
+        # create a second device without deleting first device
+        content["device_id"] = "device4"
+        content["device_keys"]["device_id"] = "device4"
+        channel = self.make_request(
+            "PUT",
+            "_matrix/client/unstable/org.matrix.msc3814.v1/dehydrated_device",
+            content=content,
+            access_token=token,
+            shorthand=False,
+        )
+        self.assertEqual(channel.code, 200)
+        device_id = channel.json_body.get("device_id")
+        assert device_id is not None
+        self.assertIsInstance(device_id, str)
+        self.assertEqual("device4", device_id)
+
+        # check that the second device that was created is what is returned when we GET
+        channel = self.make_request(
+            "GET",
+            "_matrix/client/unstable/org.matrix.msc3814.v1/dehydrated_device",
+            access_token=token,
+            shorthand=False,
+        )
+        self.assertEqual(channel.code, 200)
+        returned_device_id = channel.json_body["device_id"]
+        self.assertEqual(returned_device_id, "device4")
+
+        # and that if we query the keys for the first device they are not there
+        channel = self.make_request(
+            "POST",
+            "/_matrix/client/r0/keys/query",
+            {
+                "device_keys": {
+                    user: ["device3"],
+                },
+            },
+            token,
+        )
+        self.assertEqual(channel.json_body["device_keys"], {"@mikey:test": {}})
-- 
cgit 1.5.1


From 81a6f8c9ae0241afa9973da2f53efc2467e61c6b Mon Sep 17 00:00:00 2001
From: Shay <hillerys@element.io>
Date: Mon, 7 Aug 2023 10:37:08 -0700
Subject: Drop backwards compat hack for event serialization (#16069)

---
 changelog.d/16069.misc     |  1 +
 synapse/events/snapshot.py | 12 +-----------
 2 files changed, 2 insertions(+), 11 deletions(-)
 create mode 100644 changelog.d/16069.misc

(limited to 'synapse')

diff --git a/changelog.d/16069.misc b/changelog.d/16069.misc
new file mode 100644
index 0000000000..f59ead8638
--- /dev/null
+++ b/changelog.d/16069.misc
@@ -0,0 +1 @@
+Drop backwards compat hack for event serialization.
diff --git a/synapse/events/snapshot.py b/synapse/events/snapshot.py
index a43498ed4d..a9e3d4e556 100644
--- a/synapse/events/snapshot.py
+++ b/synapse/events/snapshot.py
@@ -186,9 +186,6 @@ class EventContext(UnpersistedEventContextBase):
             ),
             "app_service_id": self.app_service.id if self.app_service else None,
             "partial_state": self.partial_state,
-            # add dummy delta_ids and prev_group for backwards compatibility
-            "delta_ids": None,
-            "prev_group": None,
         }
 
     @staticmethod
@@ -203,13 +200,6 @@ class EventContext(UnpersistedEventContextBase):
         Returns:
             The event context.
         """
-        # workaround for backwards/forwards compatibility: if the input doesn't have a value
-        # for "state_group_deltas" just assign an empty dict
-        state_group_deltas = input.get("state_group_deltas", None)
-        if state_group_deltas:
-            state_group_deltas = _decode_state_group_delta(state_group_deltas)
-        else:
-            state_group_deltas = {}
 
         context = EventContext(
             # We use the state_group and prev_state_id stuff to pull the
@@ -217,7 +207,7 @@ class EventContext(UnpersistedEventContextBase):
             storage=storage,
             state_group=input["state_group"],
             state_group_before_event=input["state_group_before_event"],
-            state_group_deltas=state_group_deltas,
+            state_group_deltas=_decode_state_group_delta(input["state_group_deltas"]),
             state_delta_due_to_event=_decode_state_dict(
                 input["state_delta_due_to_event"]
             ),
-- 
cgit 1.5.1


From 8af3f33d84b0f63cb9baab6c8616983222d75307 Mon Sep 17 00:00:00 2001
From: Shay <hillerys@element.io>
Date: Mon, 7 Aug 2023 10:52:15 -0700
Subject: Fix endpoint improperly declaring support for MSC3814 (#16068)

---
 changelog.d/16068.misc         |  1 +
 synapse/rest/client/devices.py | 18 ++++++++----------
 2 files changed, 9 insertions(+), 10 deletions(-)
 create mode 100644 changelog.d/16068.misc

(limited to 'synapse')

diff --git a/changelog.d/16068.misc b/changelog.d/16068.misc
new file mode 100644
index 0000000000..341426a746
--- /dev/null
+++ b/changelog.d/16068.misc
@@ -0,0 +1 @@
+Fix endpoint improperly declaring support for MSC3814.
diff --git a/synapse/rest/client/devices.py b/synapse/rest/client/devices.py
index dd3f7fd666..51f17f80da 100644
--- a/synapse/rest/client/devices.py
+++ b/synapse/rest/client/devices.py
@@ -232,7 +232,7 @@ class DehydratedDeviceDataModel(RequestBodyModel):
 class DehydratedDeviceServlet(RestServlet):
     """Retrieve or store a dehydrated device.
 
-    Implements either MSC2697 or MSC3814.
+    Implements MSC2697.
 
     GET /org.matrix.msc2697.v2/dehydrated_device
 
@@ -266,7 +266,12 @@ class DehydratedDeviceServlet(RestServlet):
 
     """
 
-    def __init__(self, hs: "HomeServer", msc2697: bool = True):
+    PATTERNS = client_patterns(
+        "/org.matrix.msc2697.v2/dehydrated_device$",
+        releases=(),
+    )
+
+    def __init__(self, hs: "HomeServer"):
         super().__init__()
         self.hs = hs
         self.auth = hs.get_auth()
@@ -274,13 +279,6 @@ class DehydratedDeviceServlet(RestServlet):
         assert isinstance(handler, DeviceHandler)
         self.device_handler = handler
 
-        self.PATTERNS = client_patterns(
-            "/org.matrix.msc2697.v2/dehydrated_device$"
-            if msc2697
-            else "/org.matrix.msc3814.v1/dehydrated_device$",
-            releases=(),
-        )
-
     async def on_GET(self, request: SynapseRequest) -> Tuple[int, JsonDict]:
         requester = await self.auth.get_user_by_req(request)
         dehydrated_device = await self.device_handler.get_dehydrated_device(
@@ -579,7 +577,7 @@ def register_servlets(hs: "HomeServer", http_server: HttpServer) -> None:
     if hs.config.worker.worker_app is None:
         DeviceRestServlet(hs).register(http_server)
         if hs.config.experimental.msc2697_enabled:
-            DehydratedDeviceServlet(hs, msc2697=True).register(http_server)
+            DehydratedDeviceServlet(hs).register(http_server)
             ClaimDehydratedDeviceServlet(hs).register(http_server)
         if hs.config.experimental.msc3814_enabled:
             DehydratedDeviceV2Servlet(hs).register(http_server)
-- 
cgit 1.5.1


From f3dc6dc19f902b638c164097342136010f0769d1 Mon Sep 17 00:00:00 2001
From: reivilibre <oliverw@matrix.org>
Date: Tue, 8 Aug 2023 10:10:07 +0000
Subject: Remove old rows from the `cache_invalidation_stream_by_instance`
 table automatically. (This table is not used when Synapse is configured to
 use SQLite.) (#15868)

* Add a cache invalidation clean-up task

* Run the cache invalidation stream clean-up on the background worker

* Tune down

* call_later is in millis!

* Newsfile

Signed-off-by: Olivier Wilkinson (reivilibre) <oliverw@matrix.org>

* fixup! Add a cache invalidation clean-up task

* Update synapse/storage/databases/main/cache.py

Co-authored-by: Eric Eastwood <erice@element.io>

* Update synapse/storage/databases/main/cache.py

Co-authored-by: Eric Eastwood <erice@element.io>

* MILLISEC -> MS

* Expand on comment

* Move and tweak comment about Postgres

* Use `wrap_as_background_process`

---------

Signed-off-by: Olivier Wilkinson (reivilibre) <oliverw@matrix.org>
Co-authored-by: Eric Eastwood <erice@element.io>
---
 changelog.d/15868.feature               |   1 +
 synapse/storage/databases/main/cache.py | 130 ++++++++++++++++++++++++++++++++
 2 files changed, 131 insertions(+)
 create mode 100644 changelog.d/15868.feature

(limited to 'synapse')

diff --git a/changelog.d/15868.feature b/changelog.d/15868.feature
new file mode 100644
index 0000000000..a866bf5774
--- /dev/null
+++ b/changelog.d/15868.feature
@@ -0,0 +1 @@
+Remove old rows from the `cache_invalidation_stream_by_instance` table automatically (this table is unused in SQLite).
diff --git a/synapse/storage/databases/main/cache.py b/synapse/storage/databases/main/cache.py
index c940f864d1..2fbd389c71 100644
--- a/synapse/storage/databases/main/cache.py
+++ b/synapse/storage/databases/main/cache.py
@@ -18,6 +18,8 @@ import logging
 from typing import TYPE_CHECKING, Any, Collection, Iterable, List, Optional, Tuple
 
 from synapse.api.constants import EventTypes
+from synapse.config._base import Config
+from synapse.metrics.background_process_metrics import wrap_as_background_process
 from synapse.replication.tcp.streams import BackfillStream, CachesStream
 from synapse.replication.tcp.streams.events import (
     EventsStream,
@@ -52,6 +54,21 @@ PURGE_HISTORY_CACHE_NAME = "ph_cache_fake"
 # As above, but for invalidating room caches on room deletion
 DELETE_ROOM_CACHE_NAME = "dr_cache_fake"
 
+# How long between cache invalidation table cleanups, once we have caught up
+# with the backlog.
+REGULAR_CLEANUP_INTERVAL_MS = Config.parse_duration("1h")
+
+# How long between cache invalidation table cleanups, before we have caught
+# up with the backlog.
+CATCH_UP_CLEANUP_INTERVAL_MS = Config.parse_duration("1m")
+
+# Maximum number of cache invalidation rows to delete at once.
+CLEAN_UP_MAX_BATCH_SIZE = 20_000
+
+# Keep cache invalidations for 7 days
+# (This is likely to be quite excessive.)
+RETENTION_PERIOD_OF_CACHE_INVALIDATIONS_MS = Config.parse_duration("7d")
+
 
 class CacheInvalidationWorkerStore(SQLBaseStore):
     def __init__(
@@ -98,6 +115,18 @@ class CacheInvalidationWorkerStore(SQLBaseStore):
         else:
             self._cache_id_gen = None
 
+        # Occasionally clean up the cache invalidations stream table by deleting
+        # old rows.
+        # This is only applicable when Postgres is in use; this table is unused
+        # and not populated at all when SQLite is the active database engine.
+        if hs.config.worker.run_background_tasks and isinstance(
+            self.database_engine, PostgresEngine
+        ):
+            self.hs.get_clock().call_later(
+                CATCH_UP_CLEANUP_INTERVAL_MS / 1000,
+                self._clean_up_cache_invalidation_wrapper,
+            )
+
     async def get_all_updated_caches(
         self, instance_name: str, last_id: int, current_id: int, limit: int
     ) -> Tuple[List[Tuple[int, tuple]], int, bool]:
@@ -554,3 +583,104 @@ class CacheInvalidationWorkerStore(SQLBaseStore):
             return self._cache_id_gen.get_current_token_for_writer(instance_name)
         else:
             return 0
+
+    @wrap_as_background_process("clean_up_old_cache_invalidations")
+    async def _clean_up_cache_invalidation_wrapper(self) -> None:
+        """
+        Clean up cache invalidation stream table entries occasionally.
+        If we are behind (i.e. there are entries old enough to
+        be deleted but too many of them to be deleted in one go),
+        then we run slightly more frequently.
+        """
+        delete_up_to: int = (
+            self.hs.get_clock().time_msec() - RETENTION_PERIOD_OF_CACHE_INVALIDATIONS_MS
+        )
+
+        in_backlog = await self._clean_up_batch_of_old_cache_invalidations(delete_up_to)
+
+        # Vary how long we wait before calling again depending on whether we
+        # are still sifting through backlog or we have caught up.
+        if in_backlog:
+            next_interval = CATCH_UP_CLEANUP_INTERVAL_MS
+        else:
+            next_interval = REGULAR_CLEANUP_INTERVAL_MS
+
+        self.hs.get_clock().call_later(
+            next_interval / 1000, self._clean_up_cache_invalidation_wrapper
+        )
+
+    async def _clean_up_batch_of_old_cache_invalidations(
+        self, delete_up_to_millisec: int
+    ) -> bool:
+        """
+        Remove old rows from the `cache_invalidation_stream_by_instance` table automatically (this table is unused in SQLite).
+
+        Up to `CLEAN_UP_BATCH_SIZE` rows will be deleted at once.
+
+        Returns true if and only if we were limited by batch size (i.e. we are in backlog:
+        there are more things to clean up).
+        """
+
+        def _clean_up_batch_of_old_cache_invalidations_txn(
+            txn: LoggingTransaction,
+        ) -> bool:
+            # First get the earliest stream ID
+            txn.execute(
+                """
+                SELECT stream_id FROM cache_invalidation_stream_by_instance
+                ORDER BY stream_id ASC
+                LIMIT 1
+                """
+            )
+            row = txn.fetchone()
+            if row is None:
+                return False
+            earliest_stream_id: int = row[0]
+
+            # Then find the last stream ID of the range we will delete
+            txn.execute(
+                """
+                SELECT stream_id FROM cache_invalidation_stream_by_instance
+                WHERE stream_id <= ? AND invalidation_ts <= ?
+                ORDER BY stream_id DESC
+                LIMIT 1
+                """,
+                (earliest_stream_id + CLEAN_UP_MAX_BATCH_SIZE, delete_up_to_millisec),
+            )
+            row = txn.fetchone()
+            if row is None:
+                return False
+            cutoff_stream_id: int = row[0]
+
+            # Determine whether we are caught up or still catching up
+            txn.execute(
+                """
+                SELECT invalidation_ts FROM cache_invalidation_stream_by_instance
+                WHERE stream_id > ?
+                ORDER BY stream_id ASC
+                LIMIT 1
+                """,
+                (cutoff_stream_id,),
+            )
+            row = txn.fetchone()
+            if row is None:
+                in_backlog = False
+            else:
+                # We are in backlog if the next row could have been deleted
+                # if we didn't have such a small batch size
+                in_backlog = row[0] <= delete_up_to_millisec
+
+            txn.execute(
+                """
+                DELETE FROM cache_invalidation_stream_by_instance
+                WHERE ? <= stream_id AND stream_id <= ?
+                """,
+                (earliest_stream_id, cutoff_stream_id),
+            )
+
+            return in_backlog
+
+        return await self.db_pool.runInteraction(
+            "clean_up_old_cache_invalidations",
+            _clean_up_batch_of_old_cache_invalidations_txn,
+        )
-- 
cgit 1.5.1


From a476d5048b96d6f9422f3d31d3c14a5247855715 Mon Sep 17 00:00:00 2001
From: reivilibre <oliverw@matrix.org>
Date: Tue, 8 Aug 2023 10:53:49 +0000
Subject: Allow modules to schedule delayed background calls. (#15993)

* Add a module API function to provide `call_later`

* Newsfile

Signed-off-by: Olivier Wilkinson (reivilibre) <oliverw@matrix.org>

* Add comments

* Update version number

---------

Signed-off-by: Olivier Wilkinson (reivilibre) <oliverw@matrix.org>
---
 changelog.d/15993.misc         |  1 +
 synapse/module_api/__init__.py | 41 +++++++++++++++++++++++++++++++++++++++++
 2 files changed, 42 insertions(+)
 create mode 100644 changelog.d/15993.misc

(limited to 'synapse')

diff --git a/changelog.d/15993.misc b/changelog.d/15993.misc
new file mode 100644
index 0000000000..35ead05157
--- /dev/null
+++ b/changelog.d/15993.misc
@@ -0,0 +1 @@
+Allow modules to schedule delayed background calls.
\ No newline at end of file
diff --git a/synapse/module_api/__init__.py b/synapse/module_api/__init__.py
index ba1a925003..acee1dafd3 100644
--- a/synapse/module_api/__init__.py
+++ b/synapse/module_api/__init__.py
@@ -34,6 +34,7 @@ import jinja2
 from typing_extensions import ParamSpec
 
 from twisted.internet import defer
+from twisted.internet.interfaces import IDelayedCall
 from twisted.web.resource import Resource
 
 from synapse.api import errors
@@ -1242,6 +1243,46 @@ class ModuleApi:
         """
         return self._hs.config.worker.run_background_tasks
 
+    def delayed_background_call(
+        self,
+        msec: float,
+        f: Callable,
+        *args: object,
+        desc: Optional[str] = None,
+        **kwargs: object,
+    ) -> IDelayedCall:
+        """Wraps a function as a background process and calls it in a given number of milliseconds.
+
+        The scheduled call is not persistent: if the current Synapse instance is
+        restarted before the call is made, the call will not be made.
+
+        Added in Synapse v1.90.0.
+
+        Args:
+            msec: How long to wait before calling, in milliseconds.
+            f: The function to call once. f can be either synchronous or
+                asynchronous, and must follow Synapse's logcontext rules.
+                More info about logcontexts is available at
+                https://matrix-org.github.io/synapse/latest/log_contexts.html
+            *args: Positional arguments to pass to function.
+            desc: The background task's description. Default to the function's name.
+            **kwargs: Keyword arguments to pass to function.
+
+        Returns:
+            IDelayedCall handle from twisted, which allows to cancel the delayed call if desired.
+        """
+
+        if desc is None:
+            desc = f.__name__
+
+        return self._clock.call_later(
+            # convert ms to seconds as needed by call_later.
+            msec * 0.001,
+            run_as_background_process,
+            desc,
+            lambda: maybe_awaitable(f(*args, **kwargs)),
+        )
+
     async def sleep(self, seconds: float) -> None:
         """Sleeps for the given number of seconds.
 
-- 
cgit 1.5.1