From b596a1eb80be4b24142d131238dd5402db6bb855 Mon Sep 17 00:00:00 2001
From: Dirk Klimpel <5740567+dklimpel@users.noreply.github.com>
Date: Thu, 11 Nov 2021 16:01:13 +0100
Subject: Move sql file for `remove_deleted_devices_from_device_inbox` into v65
 (#11303)

---
 .../02remove_deleted_devices_from_device_inbox.sql | 22 ----------------------
 .../05remove_deleted_devices_from_device_inbox.sql | 22 ++++++++++++++++++++++
 2 files changed, 22 insertions(+), 22 deletions(-)
 delete mode 100644 synapse/storage/schema/main/delta/64/02remove_deleted_devices_from_device_inbox.sql
 create mode 100644 synapse/storage/schema/main/delta/65/05remove_deleted_devices_from_device_inbox.sql

(limited to 'synapse/storage')

diff --git a/synapse/storage/schema/main/delta/64/02remove_deleted_devices_from_device_inbox.sql b/synapse/storage/schema/main/delta/64/02remove_deleted_devices_from_device_inbox.sql
deleted file mode 100644
index fca7290741..0000000000
--- a/synapse/storage/schema/main/delta/64/02remove_deleted_devices_from_device_inbox.sql
+++ /dev/null
@@ -1,22 +0,0 @@
-/* Copyright 2021 The Matrix.org Foundation C.I.C
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *    http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-
--- Remove messages from the device_inbox table which were orphaned
--- when a device was deleted using Synapse earlier than 1.47.0.
--- This runs as background task, but may take a bit to finish.
-
-INSERT INTO background_updates (ordering, update_name, progress_json) VALUES
-  (6402, 'remove_deleted_devices_from_device_inbox', '{}');
diff --git a/synapse/storage/schema/main/delta/65/05remove_deleted_devices_from_device_inbox.sql b/synapse/storage/schema/main/delta/65/05remove_deleted_devices_from_device_inbox.sql
new file mode 100644
index 0000000000..076179123d
--- /dev/null
+++ b/synapse/storage/schema/main/delta/65/05remove_deleted_devices_from_device_inbox.sql
@@ -0,0 +1,22 @@
+/* Copyright 2021 The Matrix.org Foundation C.I.C
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+-- Remove messages from the device_inbox table which were orphaned
+-- when a device was deleted using Synapse earlier than 1.47.0.
+-- This runs as background task, but may take a bit to finish.
+
+INSERT INTO background_updates (ordering, update_name, progress_json) VALUES
+  (6505, 'remove_deleted_devices_from_device_inbox', '{}');
-- 
cgit 1.5.1


From 9c59e117db6b448a1e930365014b043fa7ef26b6 Mon Sep 17 00:00:00 2001
From: Andrew Morgan <1342360+anoadragon453@users.noreply.github.com>
Date: Mon, 15 Nov 2021 17:34:15 +0000
Subject: Run _upgrade_existing_database on workers if at current
 schema_version (#11346)

Co-authored-by: Richard van der Hoff <1389908+richvdh@users.noreply.github.com>
---
 changelog.d/11346.bugfix              |  1 +
 synapse/storage/prepare_database.py   | 40 +++++++++++++++------------
 tests/storage/test_rollback_worker.py | 52 ++++++++++++++++++++++++++++++++++-
 3 files changed, 74 insertions(+), 19 deletions(-)
 create mode 100644 changelog.d/11346.bugfix

(limited to 'synapse/storage')

diff --git a/changelog.d/11346.bugfix b/changelog.d/11346.bugfix
new file mode 100644
index 0000000000..1fe8020eab
--- /dev/null
+++ b/changelog.d/11346.bugfix
@@ -0,0 +1 @@
+Fix a bug introduced in v1.47.0rc1 which caused worker processes to not halt startup in the presence of outstanding database migrations.
\ No newline at end of file
diff --git a/synapse/storage/prepare_database.py b/synapse/storage/prepare_database.py
index 8b9c6adae2..e45adfcb55 100644
--- a/synapse/storage/prepare_database.py
+++ b/synapse/storage/prepare_database.py
@@ -131,24 +131,16 @@ def prepare_database(
                     "config==None in prepare_database, but database is not empty"
                 )
 
-            # if it's a worker app, refuse to upgrade the database, to avoid multiple
-            # workers doing it at once.
-            if config.worker.worker_app is None:
-                _upgrade_existing_database(
-                    cur,
-                    version_info,
-                    database_engine,
-                    config,
-                    databases=databases,
-                )
-            elif version_info.current_version < SCHEMA_VERSION:
-                # If the DB is on an older version than we expect then we refuse
-                # to start the worker (as the main process needs to run first to
-                # update the schema).
-                raise UpgradeDatabaseException(
-                    OUTDATED_SCHEMA_ON_WORKER_ERROR
-                    % (SCHEMA_VERSION, version_info.current_version)
-                )
+            # This should be run on all processes, master or worker. The master will
+            # apply the deltas, while workers will check if any outstanding deltas
+            # exist and raise an PrepareDatabaseException if they do.
+            _upgrade_existing_database(
+                cur,
+                version_info,
+                database_engine,
+                config,
+                databases=databases,
+            )
 
         else:
             logger.info("%r: Initialising new database", databases)
@@ -358,6 +350,18 @@ def _upgrade_existing_database(
 
     is_worker = config and config.worker.worker_app is not None
 
+    # If the schema version needs to be updated, and we are on a worker, we immediately
+    # know to bail out as workers cannot update the database schema. Only one process
+    # must update the database at the time, therefore we delegate this task to the master.
+    if is_worker and current_schema_state.current_version < SCHEMA_VERSION:
+        # If the DB is on an older version than we expect then we refuse
+        # to start the worker (as the main process needs to run first to
+        # update the schema).
+        raise UpgradeDatabaseException(
+            OUTDATED_SCHEMA_ON_WORKER_ERROR
+            % (SCHEMA_VERSION, current_schema_state.current_version)
+        )
+
     if (
         current_schema_state.compat_version is not None
         and current_schema_state.compat_version > SCHEMA_VERSION
diff --git a/tests/storage/test_rollback_worker.py b/tests/storage/test_rollback_worker.py
index a6be9a1bb1..0ce0892165 100644
--- a/tests/storage/test_rollback_worker.py
+++ b/tests/storage/test_rollback_worker.py
@@ -11,6 +11,9 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+from typing import List
+from unittest import mock
+
 from synapse.app.generic_worker import GenericWorkerServer
 from synapse.storage.database import LoggingDatabaseConnection
 from synapse.storage.prepare_database import PrepareDatabaseException, prepare_database
@@ -19,6 +22,22 @@ from synapse.storage.schema import SCHEMA_VERSION
 from tests.unittest import HomeserverTestCase
 
 
+def fake_listdir(filepath: str) -> List[str]:
+    """
+    A fake implementation of os.listdir which we can use to mock out the filesystem.
+
+    Args:
+        filepath: The directory to list files for.
+
+    Returns:
+        A list of files and folders in the directory.
+    """
+    if filepath.endswith("full_schemas"):
+        return [SCHEMA_VERSION]
+
+    return ["99_add_unicorn_to_database.sql"]
+
+
 class WorkerSchemaTests(HomeserverTestCase):
     def make_homeserver(self, reactor, clock):
         hs = self.setup_test_homeserver(
@@ -51,7 +70,7 @@ class WorkerSchemaTests(HomeserverTestCase):
 
         prepare_database(db_conn, db_pool.engine, self.hs.config)
 
-    def test_not_upgraded(self):
+    def test_not_upgraded_old_schema_version(self):
         """Test that workers don't start if the DB has an older schema version"""
         db_pool = self.hs.get_datastore().db_pool
         db_conn = LoggingDatabaseConnection(
@@ -67,3 +86,34 @@ class WorkerSchemaTests(HomeserverTestCase):
 
         with self.assertRaises(PrepareDatabaseException):
             prepare_database(db_conn, db_pool.engine, self.hs.config)
+
+    def test_not_upgraded_current_schema_version_with_outstanding_deltas(self):
+        """
+        Test that workers don't start if the DB is on the current schema version,
+        but there are still outstanding delta migrations to run.
+        """
+        db_pool = self.hs.get_datastore().db_pool
+        db_conn = LoggingDatabaseConnection(
+            db_pool._db_pool.connect(),
+            db_pool.engine,
+            "tests",
+        )
+
+        # Set the schema version of the database to the current version
+        cur = db_conn.cursor()
+        cur.execute("UPDATE schema_version SET version = ?", (SCHEMA_VERSION,))
+
+        db_conn.commit()
+
+        # Path `os.listdir` here to make synapse think that there is a migration
+        # file ready to be run.
+        # Note that we can't patch this function for the whole method, else Synapse
+        # will try to find the file when building the database initially.
+        with mock.patch("os.listdir", mock.Mock(side_effect=fake_listdir)):
+            with self.assertRaises(PrepareDatabaseException):
+                # Synapse should think that there is an outstanding migration file due to
+                # patching 'os.listdir' in the function decorator.
+                #
+                # We expect Synapse to raise an exception to indicate the master process
+                # needs to apply this migration file.
+                prepare_database(db_conn, db_pool.engine, self.hs.config)
-- 
cgit 1.5.1


From 6e084b62b88cf3d18646a036b7079c8a899349ab Mon Sep 17 00:00:00 2001
From: Andrew Morgan <1342360+anoadragon453@users.noreply.github.com>
Date: Tue, 16 Nov 2021 13:16:43 +0000
Subject: Rename `remove_deleted_devices_from_device_inbox` to ensure it is
 always run (#11353)

Co-authored-by: reivilibre <oliverw@matrix.org>
---
 changelog.d/11353.misc                             |  1 +
 .../05remove_deleted_devices_from_device_inbox.sql | 22 --------------
 .../06remove_deleted_devices_from_device_inbox.sql | 34 ++++++++++++++++++++++
 3 files changed, 35 insertions(+), 22 deletions(-)
 create mode 100644 changelog.d/11353.misc
 delete mode 100644 synapse/storage/schema/main/delta/65/05remove_deleted_devices_from_device_inbox.sql
 create mode 100644 synapse/storage/schema/main/delta/65/06remove_deleted_devices_from_device_inbox.sql

(limited to 'synapse/storage')

diff --git a/changelog.d/11353.misc b/changelog.d/11353.misc
new file mode 100644
index 0000000000..fa96dae919
--- /dev/null
+++ b/changelog.d/11353.misc
@@ -0,0 +1 @@
+Fix an issue which prevented the 'remove deleted devices from `device_inbox` column' background process from running when updating from a recent Synapse version.
\ No newline at end of file
diff --git a/synapse/storage/schema/main/delta/65/05remove_deleted_devices_from_device_inbox.sql b/synapse/storage/schema/main/delta/65/05remove_deleted_devices_from_device_inbox.sql
deleted file mode 100644
index 076179123d..0000000000
--- a/synapse/storage/schema/main/delta/65/05remove_deleted_devices_from_device_inbox.sql
+++ /dev/null
@@ -1,22 +0,0 @@
-/* Copyright 2021 The Matrix.org Foundation C.I.C
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *    http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-
--- Remove messages from the device_inbox table which were orphaned
--- when a device was deleted using Synapse earlier than 1.47.0.
--- This runs as background task, but may take a bit to finish.
-
-INSERT INTO background_updates (ordering, update_name, progress_json) VALUES
-  (6505, 'remove_deleted_devices_from_device_inbox', '{}');
diff --git a/synapse/storage/schema/main/delta/65/06remove_deleted_devices_from_device_inbox.sql b/synapse/storage/schema/main/delta/65/06remove_deleted_devices_from_device_inbox.sql
new file mode 100644
index 0000000000..82f6408b36
--- /dev/null
+++ b/synapse/storage/schema/main/delta/65/06remove_deleted_devices_from_device_inbox.sql
@@ -0,0 +1,34 @@
+/* Copyright 2021 The Matrix.org Foundation C.I.C
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+-- Remove messages from the device_inbox table which were orphaned
+-- when a device was deleted using Synapse earlier than 1.47.0.
+-- This runs as background task, but may take a bit to finish.
+
+-- Remove any existing instances of this job running. It's OK to stop and restart this job,
+-- as it's just deleting entries from a table - no progress will be lost.
+--
+-- This is necessary due a similar migration running the job accidentally
+-- being included in schema version 64 during v1.47.0rc1,rc2. If a
+-- homeserver had updated from Synapse <=v1.45.0 (schema version <=64),
+-- then they would have started running this background update already.
+-- If that update was still running, then simply inserting it again would
+-- cause an SQL failure. So we effectively do an "upsert" here instead.
+
+DELETE FROM background_updates WHERE update_name = 'remove_deleted_devices_from_device_inbox';
+
+INSERT INTO background_updates (ordering, update_name, progress_json) VALUES
+  (6506, 'remove_deleted_devices_from_device_inbox', '{}');
-- 
cgit 1.5.1


From 84fac0f814f69645ff1ad564ef8294b31203dc95 Mon Sep 17 00:00:00 2001
From: Sean Quah <8349537+squahtx@users.noreply.github.com>
Date: Wed, 17 Nov 2021 19:07:02 +0000
Subject: Add type annotations to `synapse.metrics` (#10847)

---
 changelog.d/10847.misc                        |   1 +
 mypy.ini                                      |   3 +
 synapse/app/_base.py                          |   2 +-
 synapse/groups/attestations.py                |   4 +-
 synapse/handlers/typing.py                    |   2 +-
 synapse/metrics/__init__.py                   | 101 ++++++++++++++++++--------
 synapse/metrics/_exposition.py                |  34 ++++-----
 synapse/metrics/background_process_metrics.py |  78 +++++++++++++++-----
 synapse/metrics/jemalloc.py                   |  10 ++-
 synapse/storage/database.py                   |   6 +-
 synapse/util/caches/expiringcache.py          |   2 +-
 synapse/util/metrics.py                       |  15 ++--
 12 files changed, 173 insertions(+), 85 deletions(-)
 create mode 100644 changelog.d/10847.misc

(limited to 'synapse/storage')

diff --git a/changelog.d/10847.misc b/changelog.d/10847.misc
new file mode 100644
index 0000000000..7933a38dca
--- /dev/null
+++ b/changelog.d/10847.misc
@@ -0,0 +1 @@
+Add type annotations to `synapse.metrics`.
diff --git a/mypy.ini b/mypy.ini
index f32c6c41a3..308cfd95d8 100644
--- a/mypy.ini
+++ b/mypy.ini
@@ -160,6 +160,9 @@ disallow_untyped_defs = True
 [mypy-synapse.handlers.*]
 disallow_untyped_defs = True
 
+[mypy-synapse.metrics.*]
+disallow_untyped_defs = True
+
 [mypy-synapse.push.*]
 disallow_untyped_defs = True
 
diff --git a/synapse/app/_base.py b/synapse/app/_base.py
index 573bb487b2..807ee3d46e 100644
--- a/synapse/app/_base.py
+++ b/synapse/app/_base.py
@@ -402,7 +402,7 @@ async def start(hs: "HomeServer") -> None:
     if hasattr(signal, "SIGHUP"):
 
         @wrap_as_background_process("sighup")
-        def handle_sighup(*args: Any, **kwargs: Any) -> None:
+        async def handle_sighup(*args: Any, **kwargs: Any) -> None:
             # Tell systemd our state, if we're using it. This will silently fail if
             # we're not using systemd.
             sdnotify(b"RELOADING=1")
diff --git a/synapse/groups/attestations.py b/synapse/groups/attestations.py
index 53f99031b1..a87896e538 100644
--- a/synapse/groups/attestations.py
+++ b/synapse/groups/attestations.py
@@ -40,6 +40,8 @@ from typing import TYPE_CHECKING, Optional, Tuple
 
 from signedjson.sign import sign_json
 
+from twisted.internet.defer import Deferred
+
 from synapse.api.errors import HttpResponseException, RequestSendFailed, SynapseError
 from synapse.metrics.background_process_metrics import run_as_background_process
 from synapse.types import JsonDict, get_domain_from_id
@@ -166,7 +168,7 @@ class GroupAttestionRenewer:
 
         return {}
 
-    def _start_renew_attestations(self) -> None:
+    def _start_renew_attestations(self) -> "Deferred[None]":
         return run_as_background_process("renew_attestations", self._renew_attestations)
 
     async def _renew_attestations(self) -> None:
diff --git a/synapse/handlers/typing.py b/synapse/handlers/typing.py
index 22c6174821..1676ebd057 100644
--- a/synapse/handlers/typing.py
+++ b/synapse/handlers/typing.py
@@ -90,7 +90,7 @@ class FollowerTypingHandler:
         self.wheel_timer = WheelTimer(bucket_size=5000)
 
     @wrap_as_background_process("typing._handle_timeouts")
-    def _handle_timeouts(self) -> None:
+    async def _handle_timeouts(self) -> None:
         logger.debug("Checking for typing timeouts")
 
         now = self.clock.time_msec()
diff --git a/synapse/metrics/__init__.py b/synapse/metrics/__init__.py
index 91ee5c8193..ceef57ad88 100644
--- a/synapse/metrics/__init__.py
+++ b/synapse/metrics/__init__.py
@@ -20,10 +20,25 @@ import os
 import platform
 import threading
 import time
-from typing import Callable, Dict, Iterable, Mapping, Optional, Tuple, Union
+from typing import (
+    Any,
+    Callable,
+    Dict,
+    Generic,
+    Iterable,
+    Mapping,
+    Optional,
+    Sequence,
+    Set,
+    Tuple,
+    Type,
+    TypeVar,
+    Union,
+    cast,
+)
 
 import attr
-from prometheus_client import Counter, Gauge, Histogram
+from prometheus_client import CollectorRegistry, Counter, Gauge, Histogram, Metric
 from prometheus_client.core import (
     REGISTRY,
     CounterMetricFamily,
@@ -32,6 +47,7 @@ from prometheus_client.core import (
 )
 
 from twisted.internet import reactor
+from twisted.internet.base import ReactorBase
 from twisted.python.threadpool import ThreadPool
 
 import synapse
@@ -54,7 +70,7 @@ HAVE_PROC_SELF_STAT = os.path.exists("/proc/self/stat")
 
 class RegistryProxy:
     @staticmethod
-    def collect():
+    def collect() -> Iterable[Metric]:
         for metric in REGISTRY.collect():
             if not metric.name.startswith("__"):
                 yield metric
@@ -74,7 +90,7 @@ class LaterGauge:
         ]
     )
 
-    def collect(self):
+    def collect(self) -> Iterable[Metric]:
 
         g = GaugeMetricFamily(self.name, self.desc, labels=self.labels)
 
@@ -93,10 +109,10 @@ class LaterGauge:
 
         yield g
 
-    def __attrs_post_init__(self):
+    def __attrs_post_init__(self) -> None:
         self._register()
 
-    def _register(self):
+    def _register(self) -> None:
         if self.name in all_gauges.keys():
             logger.warning("%s already registered, reregistering" % (self.name,))
             REGISTRY.unregister(all_gauges.pop(self.name))
@@ -105,7 +121,12 @@ class LaterGauge:
         all_gauges[self.name] = self
 
 
-class InFlightGauge:
+# `MetricsEntry` only makes sense when it is a `Protocol`,
+# but `Protocol` can't be used as a `TypeVar` bound.
+MetricsEntry = TypeVar("MetricsEntry")
+
+
+class InFlightGauge(Generic[MetricsEntry]):
     """Tracks number of things (e.g. requests, Measure blocks, etc) in flight
     at any given time.
 
@@ -115,14 +136,19 @@ class InFlightGauge:
     callbacks.
 
     Args:
-        name (str)
-        desc (str)
-        labels (list[str])
-        sub_metrics (list[str]): A list of sub metrics that the callbacks
-            will update.
+        name
+        desc
+        labels
+        sub_metrics: A list of sub metrics that the callbacks will update.
     """
 
-    def __init__(self, name, desc, labels, sub_metrics):
+    def __init__(
+        self,
+        name: str,
+        desc: str,
+        labels: Sequence[str],
+        sub_metrics: Sequence[str],
+    ):
         self.name = name
         self.desc = desc
         self.labels = labels
@@ -130,19 +156,25 @@ class InFlightGauge:
 
         # Create a class which have the sub_metrics values as attributes, which
         # default to 0 on initialization. Used to pass to registered callbacks.
-        self._metrics_class = attr.make_class(
+        self._metrics_class: Type[MetricsEntry] = attr.make_class(
             "_MetricsEntry", attrs={x: attr.ib(0) for x in sub_metrics}, slots=True
         )
 
         # Counts number of in flight blocks for a given set of label values
-        self._registrations: Dict = {}
+        self._registrations: Dict[
+            Tuple[str, ...], Set[Callable[[MetricsEntry], None]]
+        ] = {}
 
         # Protects access to _registrations
         self._lock = threading.Lock()
 
         self._register_with_collector()
 
-    def register(self, key, callback):
+    def register(
+        self,
+        key: Tuple[str, ...],
+        callback: Callable[[MetricsEntry], None],
+    ) -> None:
         """Registers that we've entered a new block with labels `key`.
 
         `callback` gets called each time the metrics are collected. The same
@@ -158,13 +190,17 @@ class InFlightGauge:
         with self._lock:
             self._registrations.setdefault(key, set()).add(callback)
 
-    def unregister(self, key, callback):
+    def unregister(
+        self,
+        key: Tuple[str, ...],
+        callback: Callable[[MetricsEntry], None],
+    ) -> None:
         """Registers that we've exited a block with labels `key`."""
 
         with self._lock:
             self._registrations.setdefault(key, set()).discard(callback)
 
-    def collect(self):
+    def collect(self) -> Iterable[Metric]:
         """Called by prometheus client when it reads metrics.
 
         Note: may be called by a separate thread.
@@ -200,7 +236,7 @@ class InFlightGauge:
                 gauge.add_metric(key, getattr(metrics, name))
             yield gauge
 
-    def _register_with_collector(self):
+    def _register_with_collector(self) -> None:
         if self.name in all_gauges.keys():
             logger.warning("%s already registered, reregistering" % (self.name,))
             REGISTRY.unregister(all_gauges.pop(self.name))
@@ -230,7 +266,7 @@ class GaugeBucketCollector:
         name: str,
         documentation: str,
         buckets: Iterable[float],
-        registry=REGISTRY,
+        registry: CollectorRegistry = REGISTRY,
     ):
         """
         Args:
@@ -257,12 +293,12 @@ class GaugeBucketCollector:
 
         registry.register(self)
 
-    def collect(self):
+    def collect(self) -> Iterable[Metric]:
         # Don't report metrics unless we've already collected some data
         if self._metric is not None:
             yield self._metric
 
-    def update_data(self, values: Iterable[float]):
+    def update_data(self, values: Iterable[float]) -> None:
         """Update the data to be reported by the metric
 
         The existing data is cleared, and each measurement in the input is assigned
@@ -304,7 +340,7 @@ class GaugeBucketCollector:
 
 
 class CPUMetrics:
-    def __init__(self):
+    def __init__(self) -> None:
         ticks_per_sec = 100
         try:
             # Try and get the system config
@@ -314,7 +350,7 @@ class CPUMetrics:
 
         self.ticks_per_sec = ticks_per_sec
 
-    def collect(self):
+    def collect(self) -> Iterable[Metric]:
         if not HAVE_PROC_SELF_STAT:
             return
 
@@ -364,7 +400,7 @@ gc_time = Histogram(
 
 
 class GCCounts:
-    def collect(self):
+    def collect(self) -> Iterable[Metric]:
         cm = GaugeMetricFamily("python_gc_counts", "GC object counts", labels=["gen"])
         for n, m in enumerate(gc.get_count()):
             cm.add_metric([str(n)], m)
@@ -382,7 +418,7 @@ if not running_on_pypy:
 
 
 class PyPyGCStats:
-    def collect(self):
+    def collect(self) -> Iterable[Metric]:
 
         # @stats is a pretty-printer object with __str__() returning a nice table,
         # plus some fields that contain data from that table.
@@ -565,7 +601,7 @@ def register_threadpool(name: str, threadpool: ThreadPool) -> None:
 
 
 class ReactorLastSeenMetric:
-    def collect(self):
+    def collect(self) -> Iterable[Metric]:
         cm = GaugeMetricFamily(
             "python_twisted_reactor_last_seen",
             "Seconds since the Twisted reactor was last seen",
@@ -584,9 +620,12 @@ MIN_TIME_BETWEEN_GCS = (1.0, 10.0, 30.0)
 _last_gc = [0.0, 0.0, 0.0]
 
 
-def runUntilCurrentTimer(reactor, func):
+F = TypeVar("F", bound=Callable[..., Any])
+
+
+def runUntilCurrentTimer(reactor: ReactorBase, func: F) -> F:
     @functools.wraps(func)
-    def f(*args, **kwargs):
+    def f(*args: Any, **kwargs: Any) -> Any:
         now = reactor.seconds()
         num_pending = 0
 
@@ -649,7 +688,7 @@ def runUntilCurrentTimer(reactor, func):
 
         return ret
 
-    return f
+    return cast(F, f)
 
 
 try:
@@ -677,5 +716,5 @@ __all__ = [
     "start_http_server",
     "LaterGauge",
     "InFlightGauge",
-    "BucketCollector",
+    "GaugeBucketCollector",
 ]
diff --git a/synapse/metrics/_exposition.py b/synapse/metrics/_exposition.py
index bb9bcb5592..353d0a63b6 100644
--- a/synapse/metrics/_exposition.py
+++ b/synapse/metrics/_exposition.py
@@ -25,27 +25,25 @@ import math
 import threading
 from http.server import BaseHTTPRequestHandler, HTTPServer
 from socketserver import ThreadingMixIn
-from typing import Dict, List
+from typing import Any, Dict, List, Type, Union
 from urllib.parse import parse_qs, urlparse
 
-from prometheus_client import REGISTRY
+from prometheus_client import REGISTRY, CollectorRegistry
+from prometheus_client.core import Sample
 
 from twisted.web.resource import Resource
+from twisted.web.server import Request
 
 from synapse.util import caches
 
 CONTENT_TYPE_LATEST = "text/plain; version=0.0.4; charset=utf-8"
 
 
-INF = float("inf")
-MINUS_INF = float("-inf")
-
-
-def floatToGoString(d):
+def floatToGoString(d: Union[int, float]) -> str:
     d = float(d)
-    if d == INF:
+    if d == math.inf:
         return "+Inf"
-    elif d == MINUS_INF:
+    elif d == -math.inf:
         return "-Inf"
     elif math.isnan(d):
         return "NaN"
@@ -60,7 +58,7 @@ def floatToGoString(d):
         return s
 
 
-def sample_line(line, name):
+def sample_line(line: Sample, name: str) -> str:
     if line.labels:
         labelstr = "{{{0}}}".format(
             ",".join(
@@ -82,7 +80,7 @@ def sample_line(line, name):
     return "{}{} {}{}\n".format(name, labelstr, floatToGoString(line.value), timestamp)
 
 
-def generate_latest(registry, emit_help=False):
+def generate_latest(registry: CollectorRegistry, emit_help: bool = False) -> bytes:
 
     # Trigger the cache metrics to be rescraped, which updates the common
     # metrics but do not produce metrics themselves
@@ -187,7 +185,7 @@ class MetricsHandler(BaseHTTPRequestHandler):
 
     registry = REGISTRY
 
-    def do_GET(self):
+    def do_GET(self) -> None:
         registry = self.registry
         params = parse_qs(urlparse(self.path).query)
 
@@ -207,11 +205,11 @@ class MetricsHandler(BaseHTTPRequestHandler):
         self.end_headers()
         self.wfile.write(output)
 
-    def log_message(self, format, *args):
+    def log_message(self, format: str, *args: Any) -> None:
         """Log nothing."""
 
     @classmethod
-    def factory(cls, registry):
+    def factory(cls, registry: CollectorRegistry) -> Type:
         """Returns a dynamic MetricsHandler class tied
         to the passed registry.
         """
@@ -236,7 +234,9 @@ class _ThreadingSimpleServer(ThreadingMixIn, HTTPServer):
     daemon_threads = True
 
 
-def start_http_server(port, addr="", registry=REGISTRY):
+def start_http_server(
+    port: int, addr: str = "", registry: CollectorRegistry = REGISTRY
+) -> None:
     """Starts an HTTP server for prometheus metrics as a daemon thread"""
     CustomMetricsHandler = MetricsHandler.factory(registry)
     httpd = _ThreadingSimpleServer((addr, port), CustomMetricsHandler)
@@ -252,10 +252,10 @@ class MetricsResource(Resource):
 
     isLeaf = True
 
-    def __init__(self, registry=REGISTRY):
+    def __init__(self, registry: CollectorRegistry = REGISTRY):
         self.registry = registry
 
-    def render_GET(self, request):
+    def render_GET(self, request: Request) -> bytes:
         request.setHeader(b"Content-Type", CONTENT_TYPE_LATEST.encode("ascii"))
         response = generate_latest(self.registry)
         request.setHeader(b"Content-Length", str(len(response)))
diff --git a/synapse/metrics/background_process_metrics.py b/synapse/metrics/background_process_metrics.py
index 2ab599a334..53c508af91 100644
--- a/synapse/metrics/background_process_metrics.py
+++ b/synapse/metrics/background_process_metrics.py
@@ -15,19 +15,37 @@
 import logging
 import threading
 from functools import wraps
-from typing import TYPE_CHECKING, Dict, Optional, Set, Union
+from types import TracebackType
+from typing import (
+    TYPE_CHECKING,
+    Any,
+    Awaitable,
+    Callable,
+    Dict,
+    Iterable,
+    Optional,
+    Set,
+    Type,
+    TypeVar,
+    Union,
+    cast,
+)
 
+from prometheus_client import Metric
 from prometheus_client.core import REGISTRY, Counter, Gauge
 
 from twisted.internet import defer
 
-from synapse.logging.context import LoggingContext, PreserveLoggingContext
+from synapse.logging.context import (
+    ContextResourceUsage,
+    LoggingContext,
+    PreserveLoggingContext,
+)
 from synapse.logging.opentracing import (
     SynapseTags,
     noop_context_manager,
     start_active_span,
 )
-from synapse.util.async_helpers import maybe_awaitable
 
 if TYPE_CHECKING:
     import resource
@@ -116,7 +134,7 @@ class _Collector:
     before they are returned.
     """
 
-    def collect(self):
+    def collect(self) -> Iterable[Metric]:
         global _background_processes_active_since_last_scrape
 
         # We swap out the _background_processes set with an empty one so that
@@ -144,12 +162,12 @@ REGISTRY.register(_Collector())
 
 
 class _BackgroundProcess:
-    def __init__(self, desc, ctx):
+    def __init__(self, desc: str, ctx: LoggingContext):
         self.desc = desc
         self._context = ctx
-        self._reported_stats = None
+        self._reported_stats: Optional[ContextResourceUsage] = None
 
-    def update_metrics(self):
+    def update_metrics(self) -> None:
         """Updates the metrics with values from this process."""
         new_stats = self._context.get_resource_usage()
         if self._reported_stats is None:
@@ -169,7 +187,16 @@ class _BackgroundProcess:
         )
 
 
-def run_as_background_process(desc: str, func, *args, bg_start_span=True, **kwargs):
+R = TypeVar("R")
+
+
+def run_as_background_process(
+    desc: str,
+    func: Callable[..., Awaitable[Optional[R]]],
+    *args: Any,
+    bg_start_span: bool = True,
+    **kwargs: Any,
+) -> "defer.Deferred[Optional[R]]":
     """Run the given function in its own logcontext, with resource metrics
 
     This should be used to wrap processes which are fired off to run in the
@@ -189,11 +216,13 @@ def run_as_background_process(desc: str, func, *args, bg_start_span=True, **kwar
         args: positional args for func
         kwargs: keyword args for func
 
-    Returns: Deferred which returns the result of func, but note that it does not
-        follow the synapse logcontext rules.
+    Returns:
+        Deferred which returns the result of func, or `None` if func raises.
+        Note that the returned Deferred does not follow the synapse logcontext
+        rules.
     """
 
-    async def run():
+    async def run() -> Optional[R]:
         with _bg_metrics_lock:
             count = _background_process_counts.get(desc, 0)
             _background_process_counts[desc] = count + 1
@@ -210,12 +239,13 @@ def run_as_background_process(desc: str, func, *args, bg_start_span=True, **kwar
                 else:
                     ctx = noop_context_manager()
                 with ctx:
-                    return await maybe_awaitable(func(*args, **kwargs))
+                    return await func(*args, **kwargs)
             except Exception:
                 logger.exception(
                     "Background process '%s' threw an exception",
                     desc,
                 )
+                return None
             finally:
                 _background_process_in_flight_count.labels(desc).dec()
 
@@ -225,19 +255,24 @@ def run_as_background_process(desc: str, func, *args, bg_start_span=True, **kwar
         return defer.ensureDeferred(run())
 
 
-def wrap_as_background_process(desc):
+F = TypeVar("F", bound=Callable[..., Awaitable[Optional[Any]]])
+
+
+def wrap_as_background_process(desc: str) -> Callable[[F], F]:
     """Decorator that wraps a function that gets called as a background
     process.
 
-    Equivalent of calling the function with `run_as_background_process`
+    Equivalent to calling the function with `run_as_background_process`
     """
 
-    def wrap_as_background_process_inner(func):
+    def wrap_as_background_process_inner(func: F) -> F:
         @wraps(func)
-        def wrap_as_background_process_inner_2(*args, **kwargs):
+        def wrap_as_background_process_inner_2(
+            *args: Any, **kwargs: Any
+        ) -> "defer.Deferred[Optional[R]]":
             return run_as_background_process(desc, func, *args, **kwargs)
 
-        return wrap_as_background_process_inner_2
+        return cast(F, wrap_as_background_process_inner_2)
 
     return wrap_as_background_process_inner
 
@@ -265,7 +300,7 @@ class BackgroundProcessLoggingContext(LoggingContext):
         super().__init__("%s-%s" % (name, instance_id))
         self._proc = _BackgroundProcess(name, self)
 
-    def start(self, rusage: "Optional[resource.struct_rusage]"):
+    def start(self, rusage: "Optional[resource.struct_rusage]") -> None:
         """Log context has started running (again)."""
 
         super().start(rusage)
@@ -276,7 +311,12 @@ class BackgroundProcessLoggingContext(LoggingContext):
         with _bg_metrics_lock:
             _background_processes_active_since_last_scrape.add(self._proc)
 
-    def __exit__(self, type, value, traceback) -> None:
+    def __exit__(
+        self,
+        type: Optional[Type[BaseException]],
+        value: Optional[BaseException],
+        traceback: Optional[TracebackType],
+    ) -> None:
         """Log context has finished."""
 
         super().__exit__(type, value, traceback)
diff --git a/synapse/metrics/jemalloc.py b/synapse/metrics/jemalloc.py
index 29ab6c0229..98ed9c0829 100644
--- a/synapse/metrics/jemalloc.py
+++ b/synapse/metrics/jemalloc.py
@@ -16,14 +16,16 @@ import ctypes
 import logging
 import os
 import re
-from typing import Optional
+from typing import Iterable, Optional
+
+from prometheus_client import Metric
 
 from synapse.metrics import REGISTRY, GaugeMetricFamily
 
 logger = logging.getLogger(__name__)
 
 
-def _setup_jemalloc_stats():
+def _setup_jemalloc_stats() -> None:
     """Checks to see if jemalloc is loaded, and hooks up a collector to record
     statistics exposed by jemalloc.
     """
@@ -135,7 +137,7 @@ def _setup_jemalloc_stats():
     class JemallocCollector:
         """Metrics for internal jemalloc stats."""
 
-        def collect(self):
+        def collect(self) -> Iterable[Metric]:
             _jemalloc_refresh_stats()
 
             g = GaugeMetricFamily(
@@ -185,7 +187,7 @@ def _setup_jemalloc_stats():
     logger.debug("Added jemalloc stats")
 
 
-def setup_jemalloc_stats():
+def setup_jemalloc_stats() -> None:
     """Try to setup jemalloc stats, if jemalloc is loaded."""
 
     try:
diff --git a/synapse/storage/database.py b/synapse/storage/database.py
index d4cab69ebf..0693d39006 100644
--- a/synapse/storage/database.py
+++ b/synapse/storage/database.py
@@ -188,7 +188,7 @@ class LoggingDatabaseConnection:
 
 
 # The type of entry which goes on our after_callbacks and exception_callbacks lists.
-_CallbackListEntry = Tuple[Callable[..., None], Iterable[Any], Dict[str, Any]]
+_CallbackListEntry = Tuple[Callable[..., object], Iterable[Any], Dict[str, Any]]
 
 
 R = TypeVar("R")
@@ -235,7 +235,7 @@ class LoggingTransaction:
         self.after_callbacks = after_callbacks
         self.exception_callbacks = exception_callbacks
 
-    def call_after(self, callback: Callable[..., None], *args: Any, **kwargs: Any):
+    def call_after(self, callback: Callable[..., object], *args: Any, **kwargs: Any):
         """Call the given callback on the main twisted thread after the
         transaction has finished. Used to invalidate the caches on the
         correct thread.
@@ -247,7 +247,7 @@ class LoggingTransaction:
         self.after_callbacks.append((callback, args, kwargs))
 
     def call_on_exception(
-        self, callback: Callable[..., None], *args: Any, **kwargs: Any
+        self, callback: Callable[..., object], *args: Any, **kwargs: Any
     ):
         # if self.exception_callbacks is None, that means that whatever constructed the
         # LoggingTransaction isn't expecting there to be any callbacks; assert that
diff --git a/synapse/util/caches/expiringcache.py b/synapse/util/caches/expiringcache.py
index 6a7e534576..67ee4c693b 100644
--- a/synapse/util/caches/expiringcache.py
+++ b/synapse/util/caches/expiringcache.py
@@ -159,7 +159,7 @@ class ExpiringCache(Generic[KT, VT]):
             self[key] = value
             return value
 
-    def _prune_cache(self) -> None:
+    async def _prune_cache(self) -> None:
         if not self._expiry_ms:
             # zero expiry time means don't expire. This should never get called
             # since we have this check in start too.
diff --git a/synapse/util/metrics.py b/synapse/util/metrics.py
index ad775dfc7d..98ee49af6e 100644
--- a/synapse/util/metrics.py
+++ b/synapse/util/metrics.py
@@ -56,8 +56,15 @@ block_db_sched_duration = Counter(
     "synapse_util_metrics_block_db_sched_duration_seconds", "", ["block_name"]
 )
 
+
+# This is dynamically created in InFlightGauge.__init__.
+class _InFlightMetric(Protocol):
+    real_time_max: float
+    real_time_sum: float
+
+
 # Tracks the number of blocks currently active
-in_flight = InFlightGauge(
+in_flight: InFlightGauge[_InFlightMetric] = InFlightGauge(
     "synapse_util_metrics_block_in_flight",
     "",
     labels=["block_name"],
@@ -65,12 +72,6 @@ in_flight = InFlightGauge(
 )
 
 
-# This is dynamically created in InFlightGauge.__init__.
-class _InFlightMetric(Protocol):
-    real_time_max: float
-    real_time_sum: float
-
-
 T = TypeVar("T", bound=Callable[..., Any])
 
 
-- 
cgit 1.5.1


From 4bd54b263ef7e2ac29acdc85e0c6392684c44281 Mon Sep 17 00:00:00 2001
From: Patrick Cloke <clokep@users.noreply.github.com>
Date: Thu, 18 Nov 2021 08:43:09 -0500
Subject: Do not allow MSC3440 threads to fork threads (#11161)

Adds validation to the Client-Server API to ensure that
the potential thread head does not relate to another event
already. This results in not allowing a thread to "fork" into
other threads.

If the target event is unknown for some reason (maybe it isn't
visible to your homeserver), but is the target of other events
it is assumed that the thread can be created from it. Otherwise,
it is rejected as an unknown event.
---
 changelog.d/11161.feature                   |  1 +
 synapse/handlers/message.py                 | 54 ++++++++++++++++++++---
 synapse/storage/databases/main/relations.py | 67 ++++++++++++++++++++++++++++-
 tests/rest/client/test_relations.py         | 62 ++++++++++++++++++++++++++
 4 files changed, 176 insertions(+), 8 deletions(-)
 create mode 100644 changelog.d/11161.feature

(limited to 'synapse/storage')

diff --git a/changelog.d/11161.feature b/changelog.d/11161.feature
new file mode 100644
index 0000000000..76b0d28084
--- /dev/null
+++ b/changelog.d/11161.feature
@@ -0,0 +1 @@
+Experimental support for the thread relation defined in [MSC3440](https://github.com/matrix-org/matrix-doc/pull/3440).
diff --git a/synapse/handlers/message.py b/synapse/handlers/message.py
index d4c2a6ab7a..22dd4cf5fd 100644
--- a/synapse/handlers/message.py
+++ b/synapse/handlers/message.py
@@ -1001,13 +1001,52 @@ class EventCreationHandler:
             )
 
         self.validator.validate_new(event, self.config)
+        await self._validate_event_relation(event)
+        logger.debug("Created event %s", event.event_id)
+
+        return event, context
+
+    async def _validate_event_relation(self, event: EventBase) -> None:
+        """
+        Ensure the relation data on a new event is not bogus.
+
+        Args:
+            event: The event being created.
+
+        Raises:
+            SynapseError if the event is invalid.
+        """
+
+        relation = event.content.get("m.relates_to")
+        if not relation:
+            return
+
+        relation_type = relation.get("rel_type")
+        if not relation_type:
+            return
+
+        # Ensure the parent is real.
+        relates_to = relation.get("event_id")
+        if not relates_to:
+            return
+
+        parent_event = await self.store.get_event(relates_to, allow_none=True)
+        if parent_event:
+            # And in the same room.
+            if parent_event.room_id != event.room_id:
+                raise SynapseError(400, "Relations must be in the same room")
+
+        else:
+            # There must be some reason that the client knows the event exists,
+            # see if there are existing relations. If so, assume everything is fine.
+            if not await self.store.event_is_target_of_relation(relates_to):
+                # Otherwise, the client can't know about the parent event!
+                raise SynapseError(400, "Can't send relation to unknown event")
 
         # If this event is an annotation then we check that that the sender
         # can't annotate the same way twice (e.g. stops users from liking an
         # event multiple times).
-        relation = event.content.get("m.relates_to", {})
-        if relation.get("rel_type") == RelationTypes.ANNOTATION:
-            relates_to = relation["event_id"]
+        if relation_type == RelationTypes.ANNOTATION:
             aggregation_key = relation["key"]
 
             already_exists = await self.store.has_user_annotated_event(
@@ -1016,9 +1055,12 @@ class EventCreationHandler:
             if already_exists:
                 raise SynapseError(400, "Can't send same reaction twice")
 
-        logger.debug("Created event %s", event.event_id)
-
-        return event, context
+        # Don't attempt to start a thread if the parent event is a relation.
+        elif relation_type == RelationTypes.THREAD:
+            if await self.store.event_includes_relation(relates_to):
+                raise SynapseError(
+                    400, "Cannot start threads from an event with a relation"
+                )
 
     @measure_func("handle_new_client_event")
     async def handle_new_client_event(
diff --git a/synapse/storage/databases/main/relations.py b/synapse/storage/databases/main/relations.py
index 907af10995..0a43acda07 100644
--- a/synapse/storage/databases/main/relations.py
+++ b/synapse/storage/databases/main/relations.py
@@ -132,6 +132,69 @@ class RelationsWorkerStore(SQLBaseStore):
             "get_recent_references_for_event", _get_recent_references_for_event_txn
         )
 
+    async def event_includes_relation(self, event_id: str) -> bool:
+        """Check if the given event relates to another event.
+
+        An event has a relation if it has a valid m.relates_to with a rel_type
+        and event_id in the content:
+
+        {
+            "content": {
+                "m.relates_to": {
+                    "rel_type": "m.replace",
+                    "event_id": "$other_event_id"
+                }
+            }
+        }
+
+        Args:
+            event_id: The event to check.
+
+        Returns:
+            True if the event includes a valid relation.
+        """
+
+        result = await self.db_pool.simple_select_one_onecol(
+            table="event_relations",
+            keyvalues={"event_id": event_id},
+            retcol="event_id",
+            allow_none=True,
+            desc="event_includes_relation",
+        )
+        return result is not None
+
+    async def event_is_target_of_relation(self, parent_id: str) -> bool:
+        """Check if the given event is the target of another event's relation.
+
+        An event is the target of an event relation if it has a valid
+        m.relates_to with a rel_type and event_id pointing to parent_id in the
+        content:
+
+        {
+            "content": {
+                "m.relates_to": {
+                    "rel_type": "m.replace",
+                    "event_id": "$parent_id"
+                }
+            }
+        }
+
+        Args:
+            parent_id: The event to check.
+
+        Returns:
+            True if the event is the target of another event's relation.
+        """
+
+        result = await self.db_pool.simple_select_one_onecol(
+            table="event_relations",
+            keyvalues={"relates_to_id": parent_id},
+            retcol="event_id",
+            allow_none=True,
+            desc="event_is_target_of_relation",
+        )
+        return result is not None
+
     @cached(tree=True)
     async def get_aggregation_groups_for_event(
         self,
@@ -362,7 +425,7 @@ class RelationsWorkerStore(SQLBaseStore):
                 %s;
         """
 
-        def _get_if_event_has_relations(txn) -> List[str]:
+        def _get_if_events_have_relations(txn) -> List[str]:
             clauses: List[str] = []
             clause, args = make_in_list_sql_clause(
                 txn.database_engine, "relates_to_id", parent_ids
@@ -387,7 +450,7 @@ class RelationsWorkerStore(SQLBaseStore):
             return [row[0] for row in txn]
 
         return await self.db_pool.runInteraction(
-            "get_if_event_has_relations", _get_if_event_has_relations
+            "get_if_events_have_relations", _get_if_events_have_relations
         )
 
     async def has_user_annotated_event(
diff --git a/tests/rest/client/test_relations.py b/tests/rest/client/test_relations.py
index 78c2fb86b9..b8a1b92a89 100644
--- a/tests/rest/client/test_relations.py
+++ b/tests/rest/client/test_relations.py
@@ -91,6 +91,49 @@ class RelationsTestCase(unittest.HomeserverTestCase):
         channel = self._send_relation(RelationTypes.ANNOTATION, EventTypes.Member)
         self.assertEquals(400, channel.code, channel.json_body)
 
+    def test_deny_invalid_event(self):
+        """Test that we deny relations on non-existant events"""
+        channel = self._send_relation(
+            RelationTypes.ANNOTATION,
+            EventTypes.Message,
+            parent_id="foo",
+            content={"body": "foo", "msgtype": "m.text"},
+        )
+        self.assertEquals(400, channel.code, channel.json_body)
+
+        # Unless that event is referenced from another event!
+        self.get_success(
+            self.hs.get_datastore().db_pool.simple_insert(
+                table="event_relations",
+                values={
+                    "event_id": "bar",
+                    "relates_to_id": "foo",
+                    "relation_type": RelationTypes.THREAD,
+                },
+                desc="test_deny_invalid_event",
+            )
+        )
+        channel = self._send_relation(
+            RelationTypes.THREAD,
+            EventTypes.Message,
+            parent_id="foo",
+            content={"body": "foo", "msgtype": "m.text"},
+        )
+        self.assertEquals(200, channel.code, channel.json_body)
+
+    def test_deny_invalid_room(self):
+        """Test that we deny relations on non-existant events"""
+        # Create another room and send a message in it.
+        room2 = self.helper.create_room_as(self.user_id, tok=self.user_token)
+        res = self.helper.send(room2, body="Hi!", tok=self.user_token)
+        parent_id = res["event_id"]
+
+        # Attempt to send an annotation to that event.
+        channel = self._send_relation(
+            RelationTypes.ANNOTATION, "m.reaction", parent_id=parent_id, key="A"
+        )
+        self.assertEquals(400, channel.code, channel.json_body)
+
     def test_deny_double_react(self):
         """Test that we deny relations on membership events"""
         channel = self._send_relation(RelationTypes.ANNOTATION, "m.reaction", key="a")
@@ -99,6 +142,25 @@ class RelationsTestCase(unittest.HomeserverTestCase):
         channel = self._send_relation(RelationTypes.ANNOTATION, "m.reaction", "a")
         self.assertEquals(400, channel.code, channel.json_body)
 
+    def test_deny_forked_thread(self):
+        """It is invalid to start a thread off a thread."""
+        channel = self._send_relation(
+            RelationTypes.THREAD,
+            "m.room.message",
+            content={"msgtype": "m.text", "body": "foo"},
+            parent_id=self.parent_id,
+        )
+        self.assertEquals(200, channel.code, channel.json_body)
+        parent_id = channel.json_body["event_id"]
+
+        channel = self._send_relation(
+            RelationTypes.THREAD,
+            "m.room.message",
+            content={"msgtype": "m.text", "body": "foo"},
+            parent_id=parent_id,
+        )
+        self.assertEquals(400, channel.code, channel.json_body)
+
     def test_basic_paginate_relations(self):
         """Tests that calling pagination API correctly the latest relations."""
         channel = self._send_relation(RelationTypes.ANNOTATION, "m.reaction", "a")
-- 
cgit 1.5.1


From 539e44139911dc95c34784f3df2b3706c00b7db9 Mon Sep 17 00:00:00 2001
From: reivilibre <oliverw@matrix.org>
Date: Thu, 18 Nov 2021 14:40:26 +0000
Subject: Use auto_attribs for RefreshTokenLookupResult (#11386)

---
 changelog.d/11386.misc                         |  1 +
 synapse/storage/databases/main/registration.py | 14 +++++++-------
 2 files changed, 8 insertions(+), 7 deletions(-)
 create mode 100644 changelog.d/11386.misc

(limited to 'synapse/storage')

diff --git a/changelog.d/11386.misc b/changelog.d/11386.misc
new file mode 100644
index 0000000000..3178d53f05
--- /dev/null
+++ b/changelog.d/11386.misc
@@ -0,0 +1 @@
+Use `auto_attribs` on the `attrs` class `RefreshTokenLookupResult`.
diff --git a/synapse/storage/databases/main/registration.py b/synapse/storage/databases/main/registration.py
index 5e55440570..7ee699f981 100644
--- a/synapse/storage/databases/main/registration.py
+++ b/synapse/storage/databases/main/registration.py
@@ -84,26 +84,26 @@ class TokenLookupResult:
         return self.user_id
 
 
-@attr.s(frozen=True, slots=True)
+@attr.s(auto_attribs=True, frozen=True, slots=True)
 class RefreshTokenLookupResult:
     """Result of looking up a refresh token."""
 
-    user_id = attr.ib(type=str)
+    user_id: str
     """The user this token belongs to."""
 
-    device_id = attr.ib(type=str)
+    device_id: str
     """The device associated with this refresh token."""
 
-    token_id = attr.ib(type=int)
+    token_id: int
     """The ID of this refresh token."""
 
-    next_token_id = attr.ib(type=Optional[int])
+    next_token_id: Optional[int]
     """The ID of the refresh token which replaced this one."""
 
-    has_next_refresh_token_been_refreshed = attr.ib(type=bool)
+    has_next_refresh_token_been_refreshed: bool
     """True if the next refresh token was used for another refresh."""
 
-    has_next_access_token_been_used = attr.ib(type=bool)
+    has_next_access_token_been_used: bool
     """True if the next access token was already used at least once."""
 
 
-- 
cgit 1.5.1


From 81b18fe5c060a0532ab64b9575d54b84ddbad278 Mon Sep 17 00:00:00 2001
From: Dirk Klimpel <5740567+dklimpel@users.noreply.github.com>
Date: Thu, 18 Nov 2021 18:43:49 +0100
Subject: Add dedicated admin API for blocking a room (#11324)

---
 changelog.d/11324.feature              |   1 +
 docs/admin_api/rooms.md                |  78 +++++++++++
 synapse/rest/admin/__init__.py         |   2 +
 synapse/rest/admin/rooms.py            |  63 +++++++++
 synapse/storage/databases/main/room.py |  32 +++++
 tests/rest/admin/test_room.py          | 228 +++++++++++++++++++++++++++++++++
 6 files changed, 404 insertions(+)
 create mode 100644 changelog.d/11324.feature

(limited to 'synapse/storage')

diff --git a/changelog.d/11324.feature b/changelog.d/11324.feature
new file mode 100644
index 0000000000..55494358bb
--- /dev/null
+++ b/changelog.d/11324.feature
@@ -0,0 +1 @@
+Add dedicated admin API for blocking a room.
\ No newline at end of file
diff --git a/docs/admin_api/rooms.md b/docs/admin_api/rooms.md
index 6a6ae92d66..0f1a74134f 100644
--- a/docs/admin_api/rooms.md
+++ b/docs/admin_api/rooms.md
@@ -3,6 +3,7 @@
 - [Room Details API](#room-details-api)
 - [Room Members API](#room-members-api)
 - [Room State API](#room-state-api)
+- [Block Room API](#block-room-api)
 - [Delete Room API](#delete-room-api)
   * [Version 1 (old version)](#version-1-old-version)
   * [Version 2 (new version)](#version-2-new-version)
@@ -386,6 +387,83 @@ A response body like the following is returned:
 }
 ```
 
+# Block Room API
+The Block Room admin API allows server admins to block and unblock rooms,
+and query to see if a given room is blocked.
+This API can be used to pre-emptively block a room, even if it's unknown to this
+homeserver. Users will be prevented from joining a blocked room.
+
+## Block or unblock a room
+
+The API is:
+
+```
+PUT /_synapse/admin/v1/rooms/<room_id>/block
+```
+
+with a body of:
+
+```json
+{
+    "block": true
+}
+```
+
+A response body like the following is returned:
+
+```json
+{
+    "block": true
+}
+```
+
+**Parameters**
+
+The following parameters should be set in the URL:
+
+- `room_id` - The ID of the room.
+
+The following JSON body parameters are available:
+
+- `block` - If `true` the room will be blocked and if `false` the room will be unblocked.
+
+**Response**
+
+The following fields are possible in the JSON response body:
+
+- `block` - A boolean. `true` if the room is blocked, otherwise `false`
+
+## Get block status
+
+The API is:
+
+```
+GET /_synapse/admin/v1/rooms/<room_id>/block
+```
+
+A response body like the following is returned:
+
+```json
+{
+    "block": true,
+    "user_id": "<user_id>"
+}
+```
+
+**Parameters**
+
+The following parameters should be set in the URL:
+
+- `room_id` - The ID of the room.
+
+**Response**
+
+The following fields are possible in the JSON response body:
+
+- `block` - A boolean. `true` if the room is blocked, otherwise `false`
+- `user_id` - An optional string. If the room is blocked (`block` is `true`) shows
+  the user who has add the room to blocking list. Otherwise it is not displayed.
+
 # Delete Room API
 
 The Delete Room admin API allows server admins to remove rooms from the server
diff --git a/synapse/rest/admin/__init__.py b/synapse/rest/admin/__init__.py
index d78fe406c4..65b76fa10c 100644
--- a/synapse/rest/admin/__init__.py
+++ b/synapse/rest/admin/__init__.py
@@ -46,6 +46,7 @@ from synapse.rest.admin.registration_tokens import (
     RegistrationTokenRestServlet,
 )
 from synapse.rest.admin.rooms import (
+    BlockRoomRestServlet,
     DeleteRoomStatusByDeleteIdRestServlet,
     DeleteRoomStatusByRoomIdRestServlet,
     ForwardExtremitiesRestServlet,
@@ -223,6 +224,7 @@ def register_servlets(hs: "HomeServer", http_server: HttpServer) -> None:
     Register all the admin servlets.
     """
     register_servlets_for_client_rest_resource(hs, http_server)
+    BlockRoomRestServlet(hs).register(http_server)
     ListRoomRestServlet(hs).register(http_server)
     RoomStateRestServlet(hs).register(http_server)
     RoomRestServlet(hs).register(http_server)
diff --git a/synapse/rest/admin/rooms.py b/synapse/rest/admin/rooms.py
index 37cb4d0796..5b8ec1e5ca 100644
--- a/synapse/rest/admin/rooms.py
+++ b/synapse/rest/admin/rooms.py
@@ -782,3 +782,66 @@ class RoomEventContextServlet(RestServlet):
         )
 
         return 200, results
+
+
+class BlockRoomRestServlet(RestServlet):
+    """
+    Manage blocking of rooms.
+    On PUT: Add or remove a room from blocking list.
+    On GET: Get blocking status of room and user who has blocked this room.
+    """
+
+    PATTERNS = admin_patterns("/rooms/(?P<room_id>[^/]+)/block$")
+
+    def __init__(self, hs: "HomeServer"):
+        self._auth = hs.get_auth()
+        self._store = hs.get_datastore()
+
+    async def on_GET(
+        self, request: SynapseRequest, room_id: str
+    ) -> Tuple[int, JsonDict]:
+        await assert_requester_is_admin(self._auth, request)
+
+        if not RoomID.is_valid(room_id):
+            raise SynapseError(
+                HTTPStatus.BAD_REQUEST, "%s is not a legal room ID" % (room_id,)
+            )
+
+        blocked_by = await self._store.room_is_blocked_by(room_id)
+        # Test `not None` if `user_id` is an empty string
+        # if someone add manually an entry in database
+        if blocked_by is not None:
+            response = {"block": True, "user_id": blocked_by}
+        else:
+            response = {"block": False}
+
+        return HTTPStatus.OK, response
+
+    async def on_PUT(
+        self, request: SynapseRequest, room_id: str
+    ) -> Tuple[int, JsonDict]:
+        requester = await self._auth.get_user_by_req(request)
+        await assert_user_is_admin(self._auth, requester.user)
+
+        content = parse_json_object_from_request(request)
+
+        if not RoomID.is_valid(room_id):
+            raise SynapseError(
+                HTTPStatus.BAD_REQUEST, "%s is not a legal room ID" % (room_id,)
+            )
+
+        assert_params_in_dict(content, ["block"])
+        block = content.get("block")
+        if not isinstance(block, bool):
+            raise SynapseError(
+                HTTPStatus.BAD_REQUEST,
+                "Param 'block' must be a boolean.",
+                Codes.BAD_JSON,
+            )
+
+        if block:
+            await self._store.block_room(room_id, requester.user.to_string())
+        else:
+            await self._store.unblock_room(room_id)
+
+        return HTTPStatus.OK, {"block": block}
diff --git a/synapse/storage/databases/main/room.py b/synapse/storage/databases/main/room.py
index 17b398bb69..7d694d852d 100644
--- a/synapse/storage/databases/main/room.py
+++ b/synapse/storage/databases/main/room.py
@@ -397,6 +397,20 @@ class RoomWorkerStore(SQLBaseStore):
             desc="is_room_blocked",
         )
 
+    async def room_is_blocked_by(self, room_id: str) -> Optional[str]:
+        """
+        Function to retrieve user who has blocked the room.
+        user_id is non-nullable
+        It returns None if the room is not blocked.
+        """
+        return await self.db_pool.simple_select_one_onecol(
+            table="blocked_rooms",
+            keyvalues={"room_id": room_id},
+            retcol="user_id",
+            allow_none=True,
+            desc="room_is_blocked_by",
+        )
+
     async def get_rooms_paginate(
         self,
         start: int,
@@ -1775,3 +1789,21 @@ class RoomStore(RoomBackgroundUpdateStore, RoomWorkerStore, SearchStore):
             self.is_room_blocked,
             (room_id,),
         )
+
+    async def unblock_room(self, room_id: str) -> None:
+        """Remove the room from blocking list.
+
+        Args:
+            room_id: Room to unblock
+        """
+        await self.db_pool.simple_delete(
+            table="blocked_rooms",
+            keyvalues={"room_id": room_id},
+            desc="unblock_room",
+        )
+        await self.db_pool.runInteraction(
+            "block_room_invalidation",
+            self._invalidate_cache_and_stream,
+            self.is_room_blocked,
+            (room_id,),
+        )
diff --git a/tests/rest/admin/test_room.py b/tests/rest/admin/test_room.py
index b48fc12e5f..07077aff78 100644
--- a/tests/rest/admin/test_room.py
+++ b/tests/rest/admin/test_room.py
@@ -2226,6 +2226,234 @@ class MakeRoomAdminTestCase(unittest.HomeserverTestCase):
         )
 
 
+class BlockRoomTestCase(unittest.HomeserverTestCase):
+    servlets = [
+        synapse.rest.admin.register_servlets,
+        room.register_servlets,
+        login.register_servlets,
+    ]
+
+    def prepare(self, reactor, clock, hs):
+        self._store = hs.get_datastore()
+
+        self.admin_user = self.register_user("admin", "pass", admin=True)
+        self.admin_user_tok = self.login("admin", "pass")
+
+        self.other_user = self.register_user("user", "pass")
+        self.other_user_tok = self.login("user", "pass")
+
+        self.room_id = self.helper.create_room_as(
+            self.other_user, tok=self.other_user_tok
+        )
+        self.url = "/_synapse/admin/v1/rooms/%s/block"
+
+    @parameterized.expand([("PUT",), ("GET",)])
+    def test_requester_is_no_admin(self, method: str):
+        """If the user is not a server admin, an error 403 is returned."""
+
+        channel = self.make_request(
+            method,
+            self.url % self.room_id,
+            content={},
+            access_token=self.other_user_tok,
+        )
+
+        self.assertEqual(HTTPStatus.FORBIDDEN, channel.code, msg=channel.json_body)
+        self.assertEqual(Codes.FORBIDDEN, channel.json_body["errcode"])
+
+    @parameterized.expand([("PUT",), ("GET",)])
+    def test_room_is_not_valid(self, method: str):
+        """Check that invalid room names, return an error 400."""
+
+        channel = self.make_request(
+            method,
+            self.url % "invalidroom",
+            content={},
+            access_token=self.admin_user_tok,
+        )
+
+        self.assertEqual(HTTPStatus.BAD_REQUEST, channel.code, msg=channel.json_body)
+        self.assertEqual(
+            "invalidroom is not a legal room ID",
+            channel.json_body["error"],
+        )
+
+    def test_block_is_not_valid(self):
+        """If parameter `block` is not valid, return an error."""
+
+        # `block` is not valid
+        channel = self.make_request(
+            "PUT",
+            self.url % self.room_id,
+            content={"block": "NotBool"},
+            access_token=self.admin_user_tok,
+        )
+
+        self.assertEqual(HTTPStatus.BAD_REQUEST, channel.code, msg=channel.json_body)
+        self.assertEqual(Codes.BAD_JSON, channel.json_body["errcode"])
+
+        # `block` is not set
+        channel = self.make_request(
+            "PUT",
+            self.url % self.room_id,
+            content={},
+            access_token=self.admin_user_tok,
+        )
+
+        self.assertEqual(HTTPStatus.BAD_REQUEST, channel.code, msg=channel.json_body)
+        self.assertEqual(Codes.MISSING_PARAM, channel.json_body["errcode"])
+
+        # no content is send
+        channel = self.make_request(
+            "PUT",
+            self.url % self.room_id,
+            access_token=self.admin_user_tok,
+        )
+
+        self.assertEqual(HTTPStatus.BAD_REQUEST, channel.code, msg=channel.json_body)
+        self.assertEqual(Codes.NOT_JSON, channel.json_body["errcode"])
+
+    def test_block_room(self):
+        """Test that block a room is successful."""
+
+        def _request_and_test_block_room(room_id: str) -> None:
+            self._is_blocked(room_id, expect=False)
+            channel = self.make_request(
+                "PUT",
+                self.url % room_id,
+                content={"block": True},
+                access_token=self.admin_user_tok,
+            )
+            self.assertEqual(HTTPStatus.OK, channel.code, msg=channel.json_body)
+            self.assertTrue(channel.json_body["block"])
+            self._is_blocked(room_id, expect=True)
+
+        # known internal room
+        _request_and_test_block_room(self.room_id)
+
+        # unknown internal room
+        _request_and_test_block_room("!unknown:test")
+
+        # unknown remote room
+        _request_and_test_block_room("!unknown:remote")
+
+    def test_block_room_twice(self):
+        """Test that block a room that is already blocked is successful."""
+
+        self._is_blocked(self.room_id, expect=False)
+        for _ in range(2):
+            channel = self.make_request(
+                "PUT",
+                self.url % self.room_id,
+                content={"block": True},
+                access_token=self.admin_user_tok,
+            )
+            self.assertEqual(HTTPStatus.OK, channel.code, msg=channel.json_body)
+            self.assertTrue(channel.json_body["block"])
+            self._is_blocked(self.room_id, expect=True)
+
+    def test_unblock_room(self):
+        """Test that unblock a room is successful."""
+
+        def _request_and_test_unblock_room(room_id: str) -> None:
+            self._block_room(room_id)
+
+            channel = self.make_request(
+                "PUT",
+                self.url % room_id,
+                content={"block": False},
+                access_token=self.admin_user_tok,
+            )
+            self.assertEqual(HTTPStatus.OK, channel.code, msg=channel.json_body)
+            self.assertFalse(channel.json_body["block"])
+            self._is_blocked(room_id, expect=False)
+
+        # known internal room
+        _request_and_test_unblock_room(self.room_id)
+
+        # unknown internal room
+        _request_and_test_unblock_room("!unknown:test")
+
+        # unknown remote room
+        _request_and_test_unblock_room("!unknown:remote")
+
+    def test_unblock_room_twice(self):
+        """Test that unblock a room that is not blocked is successful."""
+
+        self._block_room(self.room_id)
+        for _ in range(2):
+            channel = self.make_request(
+                "PUT",
+                self.url % self.room_id,
+                content={"block": False},
+                access_token=self.admin_user_tok,
+            )
+            self.assertEqual(HTTPStatus.OK, channel.code, msg=channel.json_body)
+            self.assertFalse(channel.json_body["block"])
+            self._is_blocked(self.room_id, expect=False)
+
+    def test_get_blocked_room(self):
+        """Test get status of a blocked room"""
+
+        def _request_blocked_room(room_id: str) -> None:
+            self._block_room(room_id)
+
+            channel = self.make_request(
+                "GET",
+                self.url % room_id,
+                access_token=self.admin_user_tok,
+            )
+            self.assertEqual(HTTPStatus.OK, channel.code, msg=channel.json_body)
+            self.assertTrue(channel.json_body["block"])
+            self.assertEqual(self.other_user, channel.json_body["user_id"])
+
+        # known internal room
+        _request_blocked_room(self.room_id)
+
+        # unknown internal room
+        _request_blocked_room("!unknown:test")
+
+        # unknown remote room
+        _request_blocked_room("!unknown:remote")
+
+    def test_get_unblocked_room(self):
+        """Test get status of a unblocked room"""
+
+        def _request_unblocked_room(room_id: str) -> None:
+            self._is_blocked(room_id, expect=False)
+
+            channel = self.make_request(
+                "GET",
+                self.url % room_id,
+                access_token=self.admin_user_tok,
+            )
+            self.assertEqual(HTTPStatus.OK, channel.code, msg=channel.json_body)
+            self.assertFalse(channel.json_body["block"])
+            self.assertNotIn("user_id", channel.json_body)
+
+        # known internal room
+        _request_unblocked_room(self.room_id)
+
+        # unknown internal room
+        _request_unblocked_room("!unknown:test")
+
+        # unknown remote room
+        _request_unblocked_room("!unknown:remote")
+
+    def _is_blocked(self, room_id: str, expect: bool = True) -> None:
+        """Assert that the room is blocked or not"""
+        d = self._store.is_room_blocked(room_id)
+        if expect:
+            self.assertTrue(self.get_success(d))
+        else:
+            self.assertIsNone(self.get_success(d))
+
+    def _block_room(self, room_id: str) -> None:
+        """Block a room in database"""
+        self.get_success(self._store.block_room(room_id, self.other_user))
+        self._is_blocked(room_id, expect=True)
+
+
 PURGE_TABLES = [
     "current_state_events",
     "event_backward_extremities",
-- 
cgit 1.5.1


From eca7cffb73fce77d025a0d7a08badb855b6df133 Mon Sep 17 00:00:00 2001
From: Hubert Chathi <hubert@uhoreg.ca>
Date: Fri, 19 Nov 2021 06:40:12 -0500
Subject: Keep fallback key marked as used if it's re-uploaded (#11382)

---
 changelog.d/11382.misc                            |  1 +
 synapse/storage/databases/main/end_to_end_keys.py | 51 ++++++++++++++++++-----
 tests/handlers/test_e2e_keys.py                   | 32 +++++++++++++-
 3 files changed, 72 insertions(+), 12 deletions(-)
 create mode 100644 changelog.d/11382.misc

(limited to 'synapse/storage')

diff --git a/changelog.d/11382.misc b/changelog.d/11382.misc
new file mode 100644
index 0000000000..d812ef309e
--- /dev/null
+++ b/changelog.d/11382.misc
@@ -0,0 +1 @@
+Keep fallback key marked as used if it's re-uploaded.
diff --git a/synapse/storage/databases/main/end_to_end_keys.py b/synapse/storage/databases/main/end_to_end_keys.py
index a95ac34f09..b06c1dc45b 100644
--- a/synapse/storage/databases/main/end_to_end_keys.py
+++ b/synapse/storage/databases/main/end_to_end_keys.py
@@ -408,29 +408,58 @@ class EndToEndKeyWorkerStore(EndToEndKeyBackgroundStore):
             fallback_keys: the keys to set.  This is a map from key ID (which is
                 of the form "algorithm:id") to key data.
         """
+        await self.db_pool.runInteraction(
+            "set_e2e_fallback_keys_txn",
+            self._set_e2e_fallback_keys_txn,
+            user_id,
+            device_id,
+            fallback_keys,
+        )
+
+        await self.invalidate_cache_and_stream(
+            "get_e2e_unused_fallback_key_types", (user_id, device_id)
+        )
+
+    def _set_e2e_fallback_keys_txn(
+        self, txn: Connection, user_id: str, device_id: str, fallback_keys: JsonDict
+    ) -> None:
         # fallback_keys will usually only have one item in it, so using a for
         # loop (as opposed to calling simple_upsert_many_txn) won't be too bad
         # FIXME: make sure that only one key per algorithm is uploaded
         for key_id, fallback_key in fallback_keys.items():
             algorithm, key_id = key_id.split(":", 1)
-            await self.db_pool.simple_upsert(
-                "e2e_fallback_keys_json",
+            old_key_json = self.db_pool.simple_select_one_onecol_txn(
+                txn,
+                table="e2e_fallback_keys_json",
                 keyvalues={
                     "user_id": user_id,
                     "device_id": device_id,
                     "algorithm": algorithm,
                 },
-                values={
-                    "key_id": key_id,
-                    "key_json": json_encoder.encode(fallback_key),
-                    "used": False,
-                },
-                desc="set_e2e_fallback_key",
+                retcol="key_json",
+                allow_none=True,
             )
 
-        await self.invalidate_cache_and_stream(
-            "get_e2e_unused_fallback_key_types", (user_id, device_id)
-        )
+            new_key_json = encode_canonical_json(fallback_key).decode("utf-8")
+
+            # If the uploaded key is the same as the current fallback key,
+            # don't do anything.  This prevents marking the key as unused if it
+            # was already used.
+            if old_key_json != new_key_json:
+                self.db_pool.simple_upsert_txn(
+                    txn,
+                    table="e2e_fallback_keys_json",
+                    keyvalues={
+                        "user_id": user_id,
+                        "device_id": device_id,
+                        "algorithm": algorithm,
+                    },
+                    values={
+                        "key_id": key_id,
+                        "key_json": json_encoder.encode(fallback_key),
+                        "used": False,
+                    },
+                )
 
     @cached(max_entries=10000)
     async def get_e2e_unused_fallback_key_types(
diff --git a/tests/handlers/test_e2e_keys.py b/tests/handlers/test_e2e_keys.py
index 0c3b86fda9..f0723892e4 100644
--- a/tests/handlers/test_e2e_keys.py
+++ b/tests/handlers/test_e2e_keys.py
@@ -162,6 +162,7 @@ class E2eKeysHandlerTestCase(unittest.HomeserverTestCase):
         local_user = "@boris:" + self.hs.hostname
         device_id = "xyz"
         fallback_key = {"alg1:k1": "key1"}
+        fallback_key2 = {"alg1:k2": "key2"}
         otk = {"alg1:k2": "key2"}
 
         # we shouldn't have any unused fallback keys yet
@@ -213,6 +214,35 @@ class E2eKeysHandlerTestCase(unittest.HomeserverTestCase):
             {"failures": {}, "one_time_keys": {local_user: {device_id: fallback_key}}},
         )
 
+        # re-uploading the same fallback key should still result in no unused fallback
+        # keys
+        self.get_success(
+            self.handler.upload_keys_for_user(
+                local_user,
+                device_id,
+                {"org.matrix.msc2732.fallback_keys": fallback_key},
+            )
+        )
+
+        res = self.get_success(
+            self.store.get_e2e_unused_fallback_key_types(local_user, device_id)
+        )
+        self.assertEqual(res, [])
+
+        # uploading a new fallback key should result in an unused fallback key
+        self.get_success(
+            self.handler.upload_keys_for_user(
+                local_user,
+                device_id,
+                {"org.matrix.msc2732.fallback_keys": fallback_key2},
+            )
+        )
+
+        res = self.get_success(
+            self.store.get_e2e_unused_fallback_key_types(local_user, device_id)
+        )
+        self.assertEqual(res, ["alg1"])
+
         # if the user uploads a one-time key, the next claim should fetch the
         # one-time key, and then go back to the fallback
         self.get_success(
@@ -238,7 +268,7 @@ class E2eKeysHandlerTestCase(unittest.HomeserverTestCase):
         )
         self.assertEqual(
             res,
-            {"failures": {}, "one_time_keys": {local_user: {device_id: fallback_key}}},
+            {"failures": {}, "one_time_keys": {local_user: {device_id: fallback_key2}}},
         )
 
     def test_replace_master_key(self):
-- 
cgit 1.5.1


From ea20937084903864865f76e22f67d27729f2d6dc Mon Sep 17 00:00:00 2001
From: Dirk Klimpel <5740567+dklimpel@users.noreply.github.com>
Date: Fri, 19 Nov 2021 20:39:46 +0100
Subject: Add an admin API to run background jobs. (#11352)

Instead of having admins poke into the database directly.

Can currently run jobs to populate stats and to populate
the user directory.
---
 changelog.d/11352.feature                          |   1 +
 docs/sample_config.yaml                            |   4 +-
 .../administration/admin_api/background_updates.md |  27 +++-
 docs/user_directory.md                             |   6 +-
 synapse/config/user_directory.py                   |   4 +-
 synapse/rest/admin/__init__.py                     |   2 +
 synapse/rest/admin/background_updates.py           | 123 ++++++++++++----
 synapse/storage/background_updates.py              |   2 +
 tests/rest/admin/test_background_updates.py        | 154 +++++++++++++++++++--
 9 files changed, 280 insertions(+), 43 deletions(-)
 create mode 100644 changelog.d/11352.feature

(limited to 'synapse/storage')

diff --git a/changelog.d/11352.feature b/changelog.d/11352.feature
new file mode 100644
index 0000000000..a4d01b3549
--- /dev/null
+++ b/changelog.d/11352.feature
@@ -0,0 +1 @@
+Add admin API to run background jobs.
\ No newline at end of file
diff --git a/docs/sample_config.yaml b/docs/sample_config.yaml
index 3c931468aa..aee300013f 100644
--- a/docs/sample_config.yaml
+++ b/docs/sample_config.yaml
@@ -2360,8 +2360,8 @@ user_directory:
     # indexes were (re)built was before Synapse 1.44, you'll have to
     # rebuild the indexes in order to search through all known users.
     # These indexes are built the first time Synapse starts; admins can
-    # manually trigger a rebuild following the instructions at
-    #     https://matrix-org.github.io/synapse/latest/user_directory.html
+    # manually trigger a rebuild via API following the instructions at
+    #     https://matrix-org.github.io/synapse/latest/usage/administration/admin_api/background_updates.html#run
     #
     # Uncomment to return search results containing all known users, even if that
     # user does not share a room with the requester.
diff --git a/docs/usage/administration/admin_api/background_updates.md b/docs/usage/administration/admin_api/background_updates.md
index b36d7fe398..9f6ac7d567 100644
--- a/docs/usage/administration/admin_api/background_updates.md
+++ b/docs/usage/administration/admin_api/background_updates.md
@@ -42,7 +42,6 @@ For each update:
 `average_items_per_ms` how many items are processed per millisecond based on an exponential average.
 
 
-
 ## Enabled
 
 This API allow pausing background updates.
@@ -82,3 +81,29 @@ The API returns the `enabled` param.
 ```
 
 There is also a `GET` version which returns the `enabled` state.
+
+
+## Run
+
+This API schedules a specific background update to run. The job starts immediately after calling the API.
+
+
+The API is:
+
+```
+POST /_synapse/admin/v1/background_updates/start_job
+```
+
+with the following body:
+
+```json
+{
+    "job_name": "populate_stats_process_rooms"
+}
+```
+
+The following JSON body parameters are available:
+
+- `job_name` - A string which job to run. Valid values are:
+  - `populate_stats_process_rooms` - Recalculate the stats for all rooms.
+  - `regenerate_directory` - Recalculate the [user directory](../../../user_directory.md) if it is stale or out of sync.
diff --git a/docs/user_directory.md b/docs/user_directory.md
index 07fe954891..c4794b04cf 100644
--- a/docs/user_directory.md
+++ b/docs/user_directory.md
@@ -6,9 +6,9 @@ on this particular server - i.e. ones which your account shares a room with, or
 who are present in a publicly viewable room present on the server.
 
 The directory info is stored in various tables, which can (typically after
-DB corruption) get stale or out of sync.  If this happens, for now the
-solution to fix it is to execute the SQL [here](https://github.com/matrix-org/synapse/blob/master/synapse/storage/schema/main/delta/53/user_dir_populate.sql)
-and then restart synapse. This should then start a background task to
+DB corruption) get stale or out of sync. If this happens, for now the
+solution to fix it is to use the [admin API](usage/administration/admin_api/background_updates.md#run)
+and execute the job `regenerate_directory`. This should then start a background task to
 flush the current tables and regenerate the directory.
 
 Data model
diff --git a/synapse/config/user_directory.py b/synapse/config/user_directory.py
index 2552f688d0..6d6678c7e4 100644
--- a/synapse/config/user_directory.py
+++ b/synapse/config/user_directory.py
@@ -53,8 +53,8 @@ class UserDirectoryConfig(Config):
             # indexes were (re)built was before Synapse 1.44, you'll have to
             # rebuild the indexes in order to search through all known users.
             # These indexes are built the first time Synapse starts; admins can
-            # manually trigger a rebuild following the instructions at
-            #     https://matrix-org.github.io/synapse/latest/user_directory.html
+            # manually trigger a rebuild via API following the instructions at
+            #     https://matrix-org.github.io/synapse/latest/usage/administration/admin_api/background_updates.html#run
             #
             # Uncomment to return search results containing all known users, even if that
             # user does not share a room with the requester.
diff --git a/synapse/rest/admin/__init__.py b/synapse/rest/admin/__init__.py
index 65b76fa10c..ee4a5e481b 100644
--- a/synapse/rest/admin/__init__.py
+++ b/synapse/rest/admin/__init__.py
@@ -28,6 +28,7 @@ from synapse.rest.admin._base import admin_patterns, assert_requester_is_admin
 from synapse.rest.admin.background_updates import (
     BackgroundUpdateEnabledRestServlet,
     BackgroundUpdateRestServlet,
+    BackgroundUpdateStartJobRestServlet,
 )
 from synapse.rest.admin.devices import (
     DeleteDevicesRestServlet,
@@ -261,6 +262,7 @@ def register_servlets(hs: "HomeServer", http_server: HttpServer) -> None:
         SendServerNoticeServlet(hs).register(http_server)
         BackgroundUpdateEnabledRestServlet(hs).register(http_server)
         BackgroundUpdateRestServlet(hs).register(http_server)
+        BackgroundUpdateStartJobRestServlet(hs).register(http_server)
 
 
 def register_servlets_for_client_rest_resource(
diff --git a/synapse/rest/admin/background_updates.py b/synapse/rest/admin/background_updates.py
index 0d0183bf20..479672d4d5 100644
--- a/synapse/rest/admin/background_updates.py
+++ b/synapse/rest/admin/background_updates.py
@@ -12,10 +12,15 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import logging
+from http import HTTPStatus
 from typing import TYPE_CHECKING, Tuple
 
 from synapse.api.errors import SynapseError
-from synapse.http.servlet import RestServlet, parse_json_object_from_request
+from synapse.http.servlet import (
+    RestServlet,
+    assert_params_in_dict,
+    parse_json_object_from_request,
+)
 from synapse.http.site import SynapseRequest
 from synapse.rest.admin._base import admin_patterns, assert_user_is_admin
 from synapse.types import JsonDict
@@ -29,37 +34,36 @@ logger = logging.getLogger(__name__)
 class BackgroundUpdateEnabledRestServlet(RestServlet):
     """Allows temporarily disabling background updates"""
 
-    PATTERNS = admin_patterns("/background_updates/enabled")
+    PATTERNS = admin_patterns("/background_updates/enabled$")
 
     def __init__(self, hs: "HomeServer"):
-        self.group_server = hs.get_groups_server_handler()
-        self.is_mine_id = hs.is_mine_id
-        self.auth = hs.get_auth()
-
-        self.data_stores = hs.get_datastores()
+        self._auth = hs.get_auth()
+        self._data_stores = hs.get_datastores()
 
     async def on_GET(self, request: SynapseRequest) -> Tuple[int, JsonDict]:
-        requester = await self.auth.get_user_by_req(request)
-        await assert_user_is_admin(self.auth, requester.user)
+        requester = await self._auth.get_user_by_req(request)
+        await assert_user_is_admin(self._auth, requester.user)
 
         # We need to check that all configured databases have updates enabled.
         # (They *should* all be in sync.)
-        enabled = all(db.updates.enabled for db in self.data_stores.databases)
+        enabled = all(db.updates.enabled for db in self._data_stores.databases)
 
-        return 200, {"enabled": enabled}
+        return HTTPStatus.OK, {"enabled": enabled}
 
     async def on_POST(self, request: SynapseRequest) -> Tuple[int, JsonDict]:
-        requester = await self.auth.get_user_by_req(request)
-        await assert_user_is_admin(self.auth, requester.user)
+        requester = await self._auth.get_user_by_req(request)
+        await assert_user_is_admin(self._auth, requester.user)
 
         body = parse_json_object_from_request(request)
 
         enabled = body.get("enabled", True)
 
         if not isinstance(enabled, bool):
-            raise SynapseError(400, "'enabled' parameter must be a boolean")
+            raise SynapseError(
+                HTTPStatus.BAD_REQUEST, "'enabled' parameter must be a boolean"
+            )
 
-        for db in self.data_stores.databases:
+        for db in self._data_stores.databases:
             db.updates.enabled = enabled
 
             # If we're re-enabling them ensure that we start the background
@@ -67,32 +71,29 @@ class BackgroundUpdateEnabledRestServlet(RestServlet):
             if enabled:
                 db.updates.start_doing_background_updates()
 
-        return 200, {"enabled": enabled}
+        return HTTPStatus.OK, {"enabled": enabled}
 
 
 class BackgroundUpdateRestServlet(RestServlet):
     """Fetch information about background updates"""
 
-    PATTERNS = admin_patterns("/background_updates/status")
+    PATTERNS = admin_patterns("/background_updates/status$")
 
     def __init__(self, hs: "HomeServer"):
-        self.group_server = hs.get_groups_server_handler()
-        self.is_mine_id = hs.is_mine_id
-        self.auth = hs.get_auth()
-
-        self.data_stores = hs.get_datastores()
+        self._auth = hs.get_auth()
+        self._data_stores = hs.get_datastores()
 
     async def on_GET(self, request: SynapseRequest) -> Tuple[int, JsonDict]:
-        requester = await self.auth.get_user_by_req(request)
-        await assert_user_is_admin(self.auth, requester.user)
+        requester = await self._auth.get_user_by_req(request)
+        await assert_user_is_admin(self._auth, requester.user)
 
         # We need to check that all configured databases have updates enabled.
         # (They *should* all be in sync.)
-        enabled = all(db.updates.enabled for db in self.data_stores.databases)
+        enabled = all(db.updates.enabled for db in self._data_stores.databases)
 
         current_updates = {}
 
-        for db in self.data_stores.databases:
+        for db in self._data_stores.databases:
             update = db.updates.get_current_update()
             if not update:
                 continue
@@ -104,4 +105,72 @@ class BackgroundUpdateRestServlet(RestServlet):
                 "average_items_per_ms": update.average_items_per_ms(),
             }
 
-        return 200, {"enabled": enabled, "current_updates": current_updates}
+        return HTTPStatus.OK, {"enabled": enabled, "current_updates": current_updates}
+
+
+class BackgroundUpdateStartJobRestServlet(RestServlet):
+    """Allows to start specific background updates"""
+
+    PATTERNS = admin_patterns("/background_updates/start_job")
+
+    def __init__(self, hs: "HomeServer"):
+        self._auth = hs.get_auth()
+        self._store = hs.get_datastore()
+
+    async def on_POST(self, request: SynapseRequest) -> Tuple[int, JsonDict]:
+        requester = await self._auth.get_user_by_req(request)
+        await assert_user_is_admin(self._auth, requester.user)
+
+        body = parse_json_object_from_request(request)
+        assert_params_in_dict(body, ["job_name"])
+
+        job_name = body["job_name"]
+
+        if job_name == "populate_stats_process_rooms":
+            jobs = [
+                {
+                    "update_name": "populate_stats_process_rooms",
+                    "progress_json": "{}",
+                },
+            ]
+        elif job_name == "regenerate_directory":
+            jobs = [
+                {
+                    "update_name": "populate_user_directory_createtables",
+                    "progress_json": "{}",
+                    "depends_on": "",
+                },
+                {
+                    "update_name": "populate_user_directory_process_rooms",
+                    "progress_json": "{}",
+                    "depends_on": "populate_user_directory_createtables",
+                },
+                {
+                    "update_name": "populate_user_directory_process_users",
+                    "progress_json": "{}",
+                    "depends_on": "populate_user_directory_process_rooms",
+                },
+                {
+                    "update_name": "populate_user_directory_cleanup",
+                    "progress_json": "{}",
+                    "depends_on": "populate_user_directory_process_users",
+                },
+            ]
+        else:
+            raise SynapseError(HTTPStatus.BAD_REQUEST, "Invalid job_name")
+
+        try:
+            await self._store.db_pool.simple_insert_many(
+                table="background_updates",
+                values=jobs,
+                desc=f"admin_api_run_{job_name}",
+            )
+        except self._store.db_pool.engine.module.IntegrityError:
+            raise SynapseError(
+                HTTPStatus.BAD_REQUEST,
+                "Job %s is already in queue of background updates." % (job_name,),
+            )
+
+        self._store.db_pool.updates.start_doing_background_updates()
+
+        return HTTPStatus.OK, {}
diff --git a/synapse/storage/background_updates.py b/synapse/storage/background_updates.py
index b9a8ca997e..b104f9032c 100644
--- a/synapse/storage/background_updates.py
+++ b/synapse/storage/background_updates.py
@@ -122,6 +122,8 @@ class BackgroundUpdater:
 
     def start_doing_background_updates(self) -> None:
         if self.enabled:
+            # if we start a new background update, not all updates are done.
+            self._all_done = False
             run_as_background_process("background_updates", self.run_background_updates)
 
     async def run_background_updates(self, sleep: bool = True) -> None:
diff --git a/tests/rest/admin/test_background_updates.py b/tests/rest/admin/test_background_updates.py
index 78c48db552..1786316763 100644
--- a/tests/rest/admin/test_background_updates.py
+++ b/tests/rest/admin/test_background_updates.py
@@ -11,8 +11,13 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+from http import HTTPStatus
+from typing import Collection
+
+from parameterized import parameterized
 
 import synapse.rest.admin
+from synapse.api.errors import Codes
 from synapse.rest.client import login
 from synapse.server import HomeServer
 
@@ -30,6 +35,60 @@ class BackgroundUpdatesTestCase(unittest.HomeserverTestCase):
         self.admin_user = self.register_user("admin", "pass", admin=True)
         self.admin_user_tok = self.login("admin", "pass")
 
+    @parameterized.expand(
+        [
+            ("GET", "/_synapse/admin/v1/background_updates/enabled"),
+            ("POST", "/_synapse/admin/v1/background_updates/enabled"),
+            ("GET", "/_synapse/admin/v1/background_updates/status"),
+            ("POST", "/_synapse/admin/v1/background_updates/start_job"),
+        ]
+    )
+    def test_requester_is_no_admin(self, method: str, url: str):
+        """
+        If the user is not a server admin, an error 403 is returned.
+        """
+
+        self.register_user("user", "pass", admin=False)
+        other_user_tok = self.login("user", "pass")
+
+        channel = self.make_request(
+            method,
+            url,
+            content={},
+            access_token=other_user_tok,
+        )
+
+        self.assertEqual(HTTPStatus.FORBIDDEN, channel.code, msg=channel.json_body)
+        self.assertEqual(Codes.FORBIDDEN, channel.json_body["errcode"])
+
+    def test_invalid_parameter(self):
+        """
+        If parameters are invalid, an error is returned.
+        """
+        url = "/_synapse/admin/v1/background_updates/start_job"
+
+        # empty content
+        channel = self.make_request(
+            "POST",
+            url,
+            content={},
+            access_token=self.admin_user_tok,
+        )
+
+        self.assertEqual(HTTPStatus.BAD_REQUEST, channel.code, msg=channel.json_body)
+        self.assertEqual(Codes.MISSING_PARAM, channel.json_body["errcode"])
+
+        # job_name invalid
+        channel = self.make_request(
+            "POST",
+            url,
+            content={"job_name": "unknown"},
+            access_token=self.admin_user_tok,
+        )
+
+        self.assertEqual(HTTPStatus.BAD_REQUEST, channel.code, msg=channel.json_body)
+        self.assertEqual(Codes.UNKNOWN, channel.json_body["errcode"])
+
     def _register_bg_update(self):
         "Adds a bg update but doesn't start it"
 
@@ -60,7 +119,7 @@ class BackgroundUpdatesTestCase(unittest.HomeserverTestCase):
             "/_synapse/admin/v1/background_updates/status",
             access_token=self.admin_user_tok,
         )
-        self.assertEqual(200, int(channel.result["code"]), msg=channel.result["body"])
+        self.assertEqual(HTTPStatus.OK, channel.code, msg=channel.json_body)
 
         # Background updates should be enabled, but none should be running.
         self.assertDictEqual(
@@ -82,7 +141,7 @@ class BackgroundUpdatesTestCase(unittest.HomeserverTestCase):
             "/_synapse/admin/v1/background_updates/status",
             access_token=self.admin_user_tok,
         )
-        self.assertEqual(200, int(channel.result["code"]), msg=channel.result["body"])
+        self.assertEqual(HTTPStatus.OK, channel.code, msg=channel.json_body)
 
         # Background updates should be enabled, and one should be running.
         self.assertDictEqual(
@@ -114,7 +173,7 @@ class BackgroundUpdatesTestCase(unittest.HomeserverTestCase):
             "/_synapse/admin/v1/background_updates/enabled",
             access_token=self.admin_user_tok,
         )
-        self.assertEqual(200, int(channel.result["code"]), msg=channel.result["body"])
+        self.assertEqual(HTTPStatus.OK, channel.code, msg=channel.json_body)
         self.assertDictEqual(channel.json_body, {"enabled": True})
 
         # Disable the BG updates
@@ -124,7 +183,7 @@ class BackgroundUpdatesTestCase(unittest.HomeserverTestCase):
             content={"enabled": False},
             access_token=self.admin_user_tok,
         )
-        self.assertEqual(200, int(channel.result["code"]), msg=channel.result["body"])
+        self.assertEqual(HTTPStatus.OK, channel.code, msg=channel.json_body)
         self.assertDictEqual(channel.json_body, {"enabled": False})
 
         # Advance a bit and get the current status, note this will finish the in
@@ -137,7 +196,7 @@ class BackgroundUpdatesTestCase(unittest.HomeserverTestCase):
             "/_synapse/admin/v1/background_updates/status",
             access_token=self.admin_user_tok,
         )
-        self.assertEqual(200, int(channel.result["code"]), msg=channel.result["body"])
+        self.assertEqual(HTTPStatus.OK, channel.code, msg=channel.json_body)
         self.assertDictEqual(
             channel.json_body,
             {
@@ -162,7 +221,7 @@ class BackgroundUpdatesTestCase(unittest.HomeserverTestCase):
             "/_synapse/admin/v1/background_updates/status",
             access_token=self.admin_user_tok,
         )
-        self.assertEqual(200, int(channel.result["code"]), msg=channel.result["body"])
+        self.assertEqual(HTTPStatus.OK, channel.code, msg=channel.json_body)
 
         # There should be no change from the previous /status response.
         self.assertDictEqual(
@@ -188,7 +247,7 @@ class BackgroundUpdatesTestCase(unittest.HomeserverTestCase):
             content={"enabled": True},
             access_token=self.admin_user_tok,
         )
-        self.assertEqual(200, int(channel.result["code"]), msg=channel.result["body"])
+        self.assertEqual(HTTPStatus.OK, channel.code, msg=channel.json_body)
 
         self.assertDictEqual(channel.json_body, {"enabled": True})
 
@@ -199,7 +258,7 @@ class BackgroundUpdatesTestCase(unittest.HomeserverTestCase):
             "/_synapse/admin/v1/background_updates/status",
             access_token=self.admin_user_tok,
         )
-        self.assertEqual(200, int(channel.result["code"]), msg=channel.result["body"])
+        self.assertEqual(HTTPStatus.OK, channel.code, msg=channel.json_body)
 
         # Background updates should be enabled and making progress.
         self.assertDictEqual(
@@ -216,3 +275,82 @@ class BackgroundUpdatesTestCase(unittest.HomeserverTestCase):
                 "enabled": True,
             },
         )
+
+    @parameterized.expand(
+        [
+            ("populate_stats_process_rooms", ["populate_stats_process_rooms"]),
+            (
+                "regenerate_directory",
+                [
+                    "populate_user_directory_createtables",
+                    "populate_user_directory_process_rooms",
+                    "populate_user_directory_process_users",
+                    "populate_user_directory_cleanup",
+                ],
+            ),
+        ]
+    )
+    def test_start_backround_job(self, job_name: str, updates: Collection[str]):
+        """
+        Test that background updates add to database and be processed.
+
+        Args:
+            job_name: name of the job to call with API
+            updates: collection of background updates to be started
+        """
+
+        # no background update is waiting
+        self.assertTrue(
+            self.get_success(
+                self.store.db_pool.updates.has_completed_background_updates()
+            )
+        )
+
+        channel = self.make_request(
+            "POST",
+            "/_synapse/admin/v1/background_updates/start_job",
+            content={"job_name": job_name},
+            access_token=self.admin_user_tok,
+        )
+
+        self.assertEqual(HTTPStatus.OK, channel.code, msg=channel.json_body)
+
+        # test that each background update is waiting now
+        for update in updates:
+            self.assertFalse(
+                self.get_success(
+                    self.store.db_pool.updates.has_completed_background_update(update)
+                )
+            )
+
+        self.wait_for_background_updates()
+
+        # background updates are done
+        self.assertTrue(
+            self.get_success(
+                self.store.db_pool.updates.has_completed_background_updates()
+            )
+        )
+
+    def test_start_backround_job_twice(self):
+        """Test that add a background update twice return an error."""
+
+        # add job to database
+        self.get_success(
+            self.store.db_pool.simple_insert(
+                table="background_updates",
+                values={
+                    "update_name": "populate_stats_process_rooms",
+                    "progress_json": "{}",
+                },
+            )
+        )
+
+        channel = self.make_request(
+            "POST",
+            "/_synapse/admin/v1/background_updates/start_job",
+            content={"job_name": "populate_stats_process_rooms"},
+            access_token=self.admin_user_tok,
+        )
+
+        self.assertEqual(HTTPStatus.BAD_REQUEST, channel.code, msg=channel.json_body)
-- 
cgit 1.5.1


From 3d893b8cf2358f947678dfb995b73f426200b099 Mon Sep 17 00:00:00 2001
From: Patrick Cloke <clokep@users.noreply.github.com>
Date: Mon, 22 Nov 2021 12:01:47 -0500
Subject: Store arbitrary relations from events. (#11391)

Instead of only known relation types. This also reworks the background
update for thread relations to crawl events and search for any relation
type, not just threaded relations.
---
 changelog.d/11391.feature                          |   1 +
 synapse/storage/databases/main/events.py           |  29 +++---
 .../storage/databases/main/events_bg_updates.py    |  88 ++++++++++------
 .../schema/main/delta/65/02_thread_relations.sql   |  18 ----
 .../main/delta/65/07_arbitrary_relations.sql       |  18 ++++
 tests/rest/client/test_relations.py                | 111 +++++++++++++++++++++
 tests/unittest.py                                  |   7 +-
 7 files changed, 210 insertions(+), 62 deletions(-)
 create mode 100644 changelog.d/11391.feature
 delete mode 100644 synapse/storage/schema/main/delta/65/02_thread_relations.sql
 create mode 100644 synapse/storage/schema/main/delta/65/07_arbitrary_relations.sql

(limited to 'synapse/storage')

diff --git a/changelog.d/11391.feature b/changelog.d/11391.feature
new file mode 100644
index 0000000000..4f696285a7
--- /dev/null
+++ b/changelog.d/11391.feature
@@ -0,0 +1 @@
+Store and allow querying of arbitrary event relations.
diff --git a/synapse/storage/databases/main/events.py b/synapse/storage/databases/main/events.py
index 120e4807d1..06832221ad 100644
--- a/synapse/storage/databases/main/events.py
+++ b/synapse/storage/databases/main/events.py
@@ -1,6 +1,6 @@
 # Copyright 2014-2016 OpenMarket Ltd
 # Copyright 2018-2019 New Vector Ltd
-# Copyright 2019 The Matrix.org Foundation C.I.C.
+# Copyright 2019-2021 The Matrix.org Foundation C.I.C.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -1696,34 +1696,33 @@ class PersistEventsStore:
                     },
                 )
 
-    def _handle_event_relations(self, txn, event):
-        """Handles inserting relation data during peristence of events
+    def _handle_event_relations(
+        self, txn: LoggingTransaction, event: EventBase
+    ) -> None:
+        """Handles inserting relation data during persistence of events
 
         Args:
-            txn
-            event (EventBase)
+            txn: The current database transaction.
+            event: The event which might have relations.
         """
         relation = event.content.get("m.relates_to")
         if not relation:
             # No relations
             return
 
+        # Relations must have a type and parent event ID.
         rel_type = relation.get("rel_type")
-        if rel_type not in (
-            RelationTypes.ANNOTATION,
-            RelationTypes.REFERENCE,
-            RelationTypes.REPLACE,
-            RelationTypes.THREAD,
-        ):
-            # Unknown relation type
+        if not isinstance(rel_type, str):
             return
 
         parent_id = relation.get("event_id")
-        if not parent_id:
-            # Invalid relation
+        if not isinstance(parent_id, str):
             return
 
-        aggregation_key = relation.get("key")
+        # Annotations have a key field.
+        aggregation_key = None
+        if rel_type == RelationTypes.ANNOTATION:
+            aggregation_key = relation.get("key")
 
         self.db_pool.simple_insert_txn(
             txn,
diff --git a/synapse/storage/databases/main/events_bg_updates.py b/synapse/storage/databases/main/events_bg_updates.py
index ae3a8a63e4..c88fd35e7f 100644
--- a/synapse/storage/databases/main/events_bg_updates.py
+++ b/synapse/storage/databases/main/events_bg_updates.py
@@ -1,4 +1,4 @@
-# Copyright 2019 The Matrix.org Foundation C.I.C.
+# Copyright 2019-2021 The Matrix.org Foundation C.I.C.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -171,8 +171,14 @@ class EventsBackgroundUpdatesStore(SQLBaseStore):
             self._purged_chain_cover_index,
         )
 
+        # The event_thread_relation background update was replaced with the
+        # event_arbitrary_relations one, which handles any relation to avoid
+        # needed to potentially crawl the entire events table in the future.
+        self.db_pool.updates.register_noop_background_update("event_thread_relation")
+
         self.db_pool.updates.register_background_update_handler(
-            "event_thread_relation", self._event_thread_relation
+            "event_arbitrary_relations",
+            self._event_arbitrary_relations,
         )
 
         ################################################################################
@@ -1099,23 +1105,27 @@ class EventsBackgroundUpdatesStore(SQLBaseStore):
 
         return result
 
-    async def _event_thread_relation(self, progress: JsonDict, batch_size: int) -> int:
-        """Background update handler which will store thread relations for existing events."""
+    async def _event_arbitrary_relations(
+        self, progress: JsonDict, batch_size: int
+    ) -> int:
+        """Background update handler which will store previously unknown relations for existing events."""
         last_event_id = progress.get("last_event_id", "")
 
-        def _event_thread_relation_txn(txn: LoggingTransaction) -> int:
+        def _event_arbitrary_relations_txn(txn: LoggingTransaction) -> int:
+            # Fetch events and then filter based on whether the event has a
+            # relation or not.
             txn.execute(
                 """
                 SELECT event_id, json FROM event_json
-                LEFT JOIN event_relations USING (event_id)
-                WHERE event_id > ? AND event_relations.event_id IS NULL
+                WHERE event_id > ?
                 ORDER BY event_id LIMIT ?
                 """,
                 (last_event_id, batch_size),
             )
 
             results = list(txn)
-            missing_thread_relations = []
+            # (event_id, parent_id, rel_type) for each relation
+            relations_to_insert: List[Tuple[str, str, str]] = []
             for (event_id, event_json_raw) in results:
                 try:
                     event_json = db_to_json(event_json_raw)
@@ -1127,48 +1137,70 @@ class EventsBackgroundUpdatesStore(SQLBaseStore):
                     )
                     continue
 
-                # If there's no relation (or it is not a thread), skip!
+                # If there's no relation, skip!
                 relates_to = event_json["content"].get("m.relates_to")
                 if not relates_to or not isinstance(relates_to, dict):
                     continue
-                if relates_to.get("rel_type") != RelationTypes.THREAD:
+
+                # If the relation type or parent event ID is not a string, skip it.
+                #
+                # Do not consider relation types that have existed for a long time,
+                # since they will already be listed in the `event_relations` table.
+                rel_type = relates_to.get("rel_type")
+                if not isinstance(rel_type, str) or rel_type in (
+                    RelationTypes.ANNOTATION,
+                    RelationTypes.REFERENCE,
+                    RelationTypes.REPLACE,
+                ):
                     continue
 
-                # Get the parent ID.
                 parent_id = relates_to.get("event_id")
                 if not isinstance(parent_id, str):
                     continue
 
-                missing_thread_relations.append((event_id, parent_id))
+                relations_to_insert.append((event_id, parent_id, rel_type))
+
+            # Insert the missing data, note that we upsert here in case the event
+            # has already been processed.
+            if relations_to_insert:
+                self.db_pool.simple_upsert_many_txn(
+                    txn=txn,
+                    table="event_relations",
+                    key_names=("event_id",),
+                    key_values=[(r[0],) for r in relations_to_insert],
+                    value_names=("relates_to_id", "relation_type"),
+                    value_values=[r[1:] for r in relations_to_insert],
+                )
 
-            # Insert the missing data.
-            self.db_pool.simple_insert_many_txn(
-                txn=txn,
-                table="event_relations",
-                values=[
-                    {
-                        "event_id": event_id,
-                        "relates_to_Id": parent_id,
-                        "relation_type": RelationTypes.THREAD,
-                    }
-                    for event_id, parent_id in missing_thread_relations
-                ],
-            )
+                # Iterate the parent IDs and invalidate caches.
+                for parent_id in {r[1] for r in relations_to_insert}:
+                    cache_tuple = (parent_id,)
+                    self._invalidate_cache_and_stream(
+                        txn, self.get_relations_for_event, cache_tuple
+                    )
+                    self._invalidate_cache_and_stream(
+                        txn, self.get_aggregation_groups_for_event, cache_tuple
+                    )
+                    self._invalidate_cache_and_stream(
+                        txn, self.get_thread_summary, cache_tuple
+                    )
 
             if results:
                 latest_event_id = results[-1][0]
                 self.db_pool.updates._background_update_progress_txn(
-                    txn, "event_thread_relation", {"last_event_id": latest_event_id}
+                    txn, "event_arbitrary_relations", {"last_event_id": latest_event_id}
                 )
 
             return len(results)
 
         num_rows = await self.db_pool.runInteraction(
-            desc="event_thread_relation", func=_event_thread_relation_txn
+            desc="event_arbitrary_relations", func=_event_arbitrary_relations_txn
         )
 
         if not num_rows:
-            await self.db_pool.updates._end_background_update("event_thread_relation")
+            await self.db_pool.updates._end_background_update(
+                "event_arbitrary_relations"
+            )
 
         return num_rows
 
diff --git a/synapse/storage/schema/main/delta/65/02_thread_relations.sql b/synapse/storage/schema/main/delta/65/02_thread_relations.sql
deleted file mode 100644
index d60517f7b4..0000000000
--- a/synapse/storage/schema/main/delta/65/02_thread_relations.sql
+++ /dev/null
@@ -1,18 +0,0 @@
-/* Copyright 2021 The Matrix.org Foundation C.I.C
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *    http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
--- Check old events for thread relations.
-INSERT INTO background_updates (ordering, update_name, progress_json) VALUES
-  (6502, 'event_thread_relation', '{}');
diff --git a/synapse/storage/schema/main/delta/65/07_arbitrary_relations.sql b/synapse/storage/schema/main/delta/65/07_arbitrary_relations.sql
new file mode 100644
index 0000000000..267b2cb539
--- /dev/null
+++ b/synapse/storage/schema/main/delta/65/07_arbitrary_relations.sql
@@ -0,0 +1,18 @@
+/* Copyright 2021 The Matrix.org Foundation C.I.C
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+-- Check old events for thread relations.
+INSERT INTO background_updates (ordering, update_name, progress_json) VALUES
+  (6507, 'event_arbitrary_relations', '{}');
diff --git a/tests/rest/client/test_relations.py b/tests/rest/client/test_relations.py
index b8a1b92a89..eb10d43217 100644
--- a/tests/rest/client/test_relations.py
+++ b/tests/rest/client/test_relations.py
@@ -1,4 +1,5 @@
 # Copyright 2019 New Vector Ltd
+# Copyright 2021 The Matrix.org Foundation C.I.C.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -46,6 +47,8 @@ class RelationsTestCase(unittest.HomeserverTestCase):
         return config
 
     def prepare(self, reactor, clock, hs):
+        self.store = hs.get_datastore()
+
         self.user_id, self.user_token = self._create_user("alice")
         self.user2_id, self.user2_token = self._create_user("bob")
 
@@ -765,6 +768,52 @@ class RelationsTestCase(unittest.HomeserverTestCase):
         self.assertIn("chunk", channel.json_body)
         self.assertEquals(channel.json_body["chunk"], [])
 
+    def test_unknown_relations(self):
+        """Unknown relations should be accepted."""
+        channel = self._send_relation("m.relation.test", "m.room.test")
+        self.assertEquals(200, channel.code, channel.json_body)
+        event_id = channel.json_body["event_id"]
+
+        channel = self.make_request(
+            "GET",
+            "/_matrix/client/unstable/rooms/%s/relations/%s?limit=1"
+            % (self.room, self.parent_id),
+            access_token=self.user_token,
+        )
+        self.assertEquals(200, channel.code, channel.json_body)
+
+        # We expect to get back a single pagination result, which is the full
+        # relation event we sent above.
+        self.assertEquals(len(channel.json_body["chunk"]), 1, channel.json_body)
+        self.assert_dict(
+            {"event_id": event_id, "sender": self.user_id, "type": "m.room.test"},
+            channel.json_body["chunk"][0],
+        )
+
+        # We also expect to get the original event (the id of which is self.parent_id)
+        self.assertEquals(
+            channel.json_body["original_event"]["event_id"], self.parent_id
+        )
+
+        # When bundling the unknown relation is not included.
+        channel = self.make_request(
+            "GET",
+            "/rooms/%s/event/%s" % (self.room, self.parent_id),
+            access_token=self.user_token,
+        )
+        self.assertEquals(200, channel.code, channel.json_body)
+        self.assertNotIn("m.relations", channel.json_body["unsigned"])
+
+        # But unknown relations can be directly queried.
+        channel = self.make_request(
+            "GET",
+            "/_matrix/client/unstable/rooms/%s/aggregations/%s?limit=1"
+            % (self.room, self.parent_id),
+            access_token=self.user_token,
+        )
+        self.assertEquals(200, channel.code, channel.json_body)
+        self.assertEquals(channel.json_body["chunk"], [])
+
     def _send_relation(
         self,
         relation_type: str,
@@ -811,3 +860,65 @@ class RelationsTestCase(unittest.HomeserverTestCase):
         access_token = self.login(localpart, "abc123")
 
         return user_id, access_token
+
+    def test_background_update(self):
+        """Test the event_arbitrary_relations background update."""
+        channel = self._send_relation(RelationTypes.ANNOTATION, "m.reaction", key="👍")
+        self.assertEquals(200, channel.code, channel.json_body)
+        annotation_event_id_good = channel.json_body["event_id"]
+
+        channel = self._send_relation(RelationTypes.ANNOTATION, "m.reaction", key="A")
+        self.assertEquals(200, channel.code, channel.json_body)
+        annotation_event_id_bad = channel.json_body["event_id"]
+
+        channel = self._send_relation(RelationTypes.THREAD, "m.room.test")
+        self.assertEquals(200, channel.code, channel.json_body)
+        thread_event_id = channel.json_body["event_id"]
+
+        # Clean-up the table as if the inserts did not happen during event creation.
+        self.get_success(
+            self.store.db_pool.simple_delete_many(
+                table="event_relations",
+                column="event_id",
+                iterable=(annotation_event_id_bad, thread_event_id),
+                keyvalues={},
+                desc="RelationsTestCase.test_background_update",
+            )
+        )
+
+        # Only the "good" annotation should be found.
+        channel = self.make_request(
+            "GET",
+            f"/_matrix/client/unstable/rooms/{self.room}/relations/{self.parent_id}?limit=10",
+            access_token=self.user_token,
+        )
+        self.assertEquals(200, channel.code, channel.json_body)
+        self.assertEquals(
+            [ev["event_id"] for ev in channel.json_body["chunk"]],
+            [annotation_event_id_good],
+        )
+
+        # Insert and run the background update.
+        self.get_success(
+            self.store.db_pool.simple_insert(
+                "background_updates",
+                {"update_name": "event_arbitrary_relations", "progress_json": "{}"},
+            )
+        )
+
+        # Ugh, have to reset this flag
+        self.store.db_pool.updates._all_done = False
+        self.wait_for_background_updates()
+
+        # The "good" annotation and the thread should be found, but not the "bad"
+        # annotation.
+        channel = self.make_request(
+            "GET",
+            f"/_matrix/client/unstable/rooms/{self.room}/relations/{self.parent_id}?limit=10",
+            access_token=self.user_token,
+        )
+        self.assertEquals(200, channel.code, channel.json_body)
+        self.assertCountEqual(
+            [ev["event_id"] for ev in channel.json_body["chunk"]],
+            [annotation_event_id_good, thread_event_id],
+        )
diff --git a/tests/unittest.py b/tests/unittest.py
index c9a08a3420..165aafc574 100644
--- a/tests/unittest.py
+++ b/tests/unittest.py
@@ -331,7 +331,12 @@ class HomeserverTestCase(TestCase):
             time.sleep(0.01)
 
     def wait_for_background_updates(self) -> None:
-        """Block until all background database updates have completed."""
+        """
+        Block until all background database updates have completed.
+
+        Note that callers must ensure that's a store property created on the
+        testcase.
+        """
         while not self.get_success(
             self.store.db_pool.updates.has_completed_background_updates()
         ):
-- 
cgit 1.5.1


From 7cebaf96447a8ff50c4525ba7667f58127876c5e Mon Sep 17 00:00:00 2001
From: Shay <hillerys@element.io>
Date: Tue, 23 Nov 2021 06:46:40 -0800
Subject: Remove code invalidated by deprecated config flag
 'trust_identity_servers_for_password_resets' (#11395)

* remove background update code related to deprecated config flag

* changelog entry

* update changelog

* Delete 11394.removal

Duplicate, wrong number

* add no-op background update and change newfragment so it will be consolidated with associated work

* remove unused code

* Remove code associated with deprecated flag from legacy docker dynamic config file

Co-authored-by: reivilibre <oliverw@matrix.org>
---
 changelog.d/11395.removal                      |  1 +
 docker/conf/homeserver.yaml                    |  8 ------
 synapse/storage/databases/main/registration.py | 35 +++-----------------------
 tests/utils.py                                 |  1 -
 4 files changed, 4 insertions(+), 41 deletions(-)
 create mode 100644 changelog.d/11395.removal

(limited to 'synapse/storage')

diff --git a/changelog.d/11395.removal b/changelog.d/11395.removal
new file mode 100644
index 0000000000..6c1fd560ad
--- /dev/null
+++ b/changelog.d/11395.removal
@@ -0,0 +1 @@
+Remove deprecated `trust_identity_server_for_password_resets` configuration flag.
\ No newline at end of file
diff --git a/docker/conf/homeserver.yaml b/docker/conf/homeserver.yaml
index 3cba594d02..f10f78a48c 100644
--- a/docker/conf/homeserver.yaml
+++ b/docker/conf/homeserver.yaml
@@ -148,14 +148,6 @@ bcrypt_rounds: 12
 allow_guest_access: {{ "True" if SYNAPSE_ALLOW_GUEST else "False" }}
 enable_group_creation: true
 
-# The list of identity servers trusted to verify third party
-# identifiers by this server.
-#
-# Also defines the ID server which will be called when an account is
-# deactivated (one will be picked arbitrarily).
-trusted_third_party_id_servers:
-    - matrix.org
-    - vector.im
 
 ## Metrics ###
 
diff --git a/synapse/storage/databases/main/registration.py b/synapse/storage/databases/main/registration.py
index 7ee699f981..8478463a2a 100644
--- a/synapse/storage/databases/main/registration.py
+++ b/synapse/storage/databases/main/registration.py
@@ -1728,11 +1728,11 @@ class RegistrationBackgroundUpdateStore(RegistrationWorkerStore):
         )
 
         self.db_pool.updates.register_background_update_handler(
-            "user_threepids_grandfather", self._bg_user_threepids_grandfather
+            "users_set_deactivated_flag", self._background_update_set_deactivated_flag
         )
 
-        self.db_pool.updates.register_background_update_handler(
-            "users_set_deactivated_flag", self._background_update_set_deactivated_flag
+        self.db_pool.updates.register_noop_background_update(
+            "user_threepids_grandfather"
         )
 
         self.db_pool.updates.register_background_index_update(
@@ -1805,35 +1805,6 @@ class RegistrationBackgroundUpdateStore(RegistrationWorkerStore):
 
         return nb_processed
 
-    async def _bg_user_threepids_grandfather(self, progress, batch_size):
-        """We now track which identity servers a user binds their 3PID to, so
-        we need to handle the case of existing bindings where we didn't track
-        this.
-
-        We do this by grandfathering in existing user threepids assuming that
-        they used one of the server configured trusted identity servers.
-        """
-        id_servers = set(self.config.registration.trusted_third_party_id_servers)
-
-        def _bg_user_threepids_grandfather_txn(txn):
-            sql = """
-                INSERT INTO user_threepid_id_server
-                    (user_id, medium, address, id_server)
-                SELECT user_id, medium, address, ?
-                FROM user_threepids
-            """
-
-            txn.execute_batch(sql, [(id_server,) for id_server in id_servers])
-
-        if id_servers:
-            await self.db_pool.runInteraction(
-                "_bg_user_threepids_grandfather", _bg_user_threepids_grandfather_txn
-            )
-
-        await self.db_pool.updates._end_background_update("user_threepids_grandfather")
-
-        return 1
-
     async def set_user_deactivated_status(
         self, user_id: str, deactivated: bool
     ) -> None:
diff --git a/tests/utils.py b/tests/utils.py
index cf8ba5c5db..983859120f 100644
--- a/tests/utils.py
+++ b/tests/utils.py
@@ -119,7 +119,6 @@ def default_config(name, parse=False):
         "enable_registration": True,
         "enable_registration_captcha": False,
         "macaroon_secret_key": "not even a little secret",
-        "trusted_third_party_id_servers": [],
         "password_providers": [],
         "worker_replication_url": "",
         "worker_app": None,
-- 
cgit 1.5.1


From 55669bd3de7137553085e9f1c16a686ff657108c Mon Sep 17 00:00:00 2001
From: Patrick Cloke <clokep@users.noreply.github.com>
Date: Tue, 23 Nov 2021 10:21:19 -0500
Subject: Add missing type hints to config base classes (#11377)

---
 changelog.d/11377.bugfix                       |   1 +
 changelog.d/11377.misc                         |   1 +
 mypy.ini                                       |   3 +
 synapse/config/_base.py                        | 157 +++++++++++++++----------
 synapse/config/_base.pyi                       |  87 +++++++++-----
 synapse/config/cache.py                        |   4 +-
 synapse/config/key.py                          |   3 +-
 synapse/config/logger.py                       |   4 +-
 synapse/config/server.py                       |   4 +-
 synapse/config/tls.py                          |   2 +-
 synapse/module_api/__init__.py                 |   2 +-
 synapse/storage/databases/main/registration.py |   3 +-
 tests/config/test_load.py                      |  22 ++--
 13 files changed, 184 insertions(+), 109 deletions(-)
 create mode 100644 changelog.d/11377.bugfix
 create mode 100644 changelog.d/11377.misc

(limited to 'synapse/storage')

diff --git a/changelog.d/11377.bugfix b/changelog.d/11377.bugfix
new file mode 100644
index 0000000000..9831fb7bbe
--- /dev/null
+++ b/changelog.d/11377.bugfix
@@ -0,0 +1 @@
+Fix a bug introduced in v1.45.0 where the `read_templates` method of the module API would error.
diff --git a/changelog.d/11377.misc b/changelog.d/11377.misc
new file mode 100644
index 0000000000..3dac625576
--- /dev/null
+++ b/changelog.d/11377.misc
@@ -0,0 +1 @@
+Add type hints to configuration classes.
diff --git a/mypy.ini b/mypy.ini
index 308cfd95d8..bc4f59154d 100644
--- a/mypy.ini
+++ b/mypy.ini
@@ -151,6 +151,9 @@ disallow_untyped_defs = True
 [mypy-synapse.app.*]
 disallow_untyped_defs = True
 
+[mypy-synapse.config._base]
+disallow_untyped_defs = True
+
 [mypy-synapse.crypto.*]
 disallow_untyped_defs = True
 
diff --git a/synapse/config/_base.py b/synapse/config/_base.py
index 7c4428a138..1265738dc1 100644
--- a/synapse/config/_base.py
+++ b/synapse/config/_base.py
@@ -20,7 +20,18 @@ import os
 from collections import OrderedDict
 from hashlib import sha256
 from textwrap import dedent
-from typing import Any, Iterable, List, MutableMapping, Optional, Union
+from typing import (
+    Any,
+    Dict,
+    Iterable,
+    List,
+    MutableMapping,
+    Optional,
+    Tuple,
+    Type,
+    TypeVar,
+    Union,
+)
 
 import attr
 import jinja2
@@ -78,7 +89,7 @@ CONFIG_FILE_HEADER = """\
 """
 
 
-def path_exists(file_path):
+def path_exists(file_path: str) -> bool:
     """Check if a file exists
 
     Unlike os.path.exists, this throws an exception if there is an error
@@ -86,7 +97,7 @@ def path_exists(file_path):
     the parent dir).
 
     Returns:
-        bool: True if the file exists; False if not.
+        True if the file exists; False if not.
     """
     try:
         os.stat(file_path)
@@ -102,15 +113,15 @@ class Config:
     A configuration section, containing configuration keys and values.
 
     Attributes:
-        section (str): The section title of this config object, such as
+        section: The section title of this config object, such as
             "tls" or "logger". This is used to refer to it on the root
             logger (for example, `config.tls.some_option`). Must be
             defined in subclasses.
     """
 
-    section = None
+    section: str
 
-    def __init__(self, root_config=None):
+    def __init__(self, root_config: "RootConfig" = None):
         self.root = root_config
 
         # Get the path to the default Synapse template directory
@@ -119,7 +130,7 @@ class Config:
         )
 
     @staticmethod
-    def parse_size(value):
+    def parse_size(value: Union[str, int]) -> int:
         if isinstance(value, int):
             return value
         sizes = {"K": 1024, "M": 1024 * 1024}
@@ -162,15 +173,15 @@ class Config:
         return int(value) * size
 
     @staticmethod
-    def abspath(file_path):
+    def abspath(file_path: str) -> str:
         return os.path.abspath(file_path) if file_path else file_path
 
     @classmethod
-    def path_exists(cls, file_path):
+    def path_exists(cls, file_path: str) -> bool:
         return path_exists(file_path)
 
     @classmethod
-    def check_file(cls, file_path, config_name):
+    def check_file(cls, file_path: Optional[str], config_name: str) -> str:
         if file_path is None:
             raise ConfigError("Missing config for %s." % (config_name,))
         try:
@@ -183,7 +194,7 @@ class Config:
         return cls.abspath(file_path)
 
     @classmethod
-    def ensure_directory(cls, dir_path):
+    def ensure_directory(cls, dir_path: str) -> str:
         dir_path = cls.abspath(dir_path)
         os.makedirs(dir_path, exist_ok=True)
         if not os.path.isdir(dir_path):
@@ -191,7 +202,7 @@ class Config:
         return dir_path
 
     @classmethod
-    def read_file(cls, file_path, config_name):
+    def read_file(cls, file_path: Any, config_name: str) -> str:
         """Deprecated: call read_file directly"""
         return read_file(file_path, (config_name,))
 
@@ -284,6 +295,9 @@ class Config:
         return [env.get_template(filename) for filename in filenames]
 
 
+TRootConfig = TypeVar("TRootConfig", bound="RootConfig")
+
+
 class RootConfig:
     """
     Holder of an application's configuration.
@@ -308,7 +322,9 @@ class RootConfig:
                 raise Exception("Failed making %s: %r" % (config_class.section, e))
             setattr(self, config_class.section, conf)
 
-    def invoke_all(self, func_name: str, *args, **kwargs) -> MutableMapping[str, Any]:
+    def invoke_all(
+        self, func_name: str, *args: Any, **kwargs: Any
+    ) -> MutableMapping[str, Any]:
         """
         Invoke a function on all instantiated config objects this RootConfig is
         configured to use.
@@ -317,6 +333,7 @@ class RootConfig:
             func_name: Name of function to invoke
             *args
             **kwargs
+
         Returns:
             ordered dictionary of config section name and the result of the
             function from it.
@@ -332,7 +349,7 @@ class RootConfig:
         return res
 
     @classmethod
-    def invoke_all_static(cls, func_name: str, *args, **kwargs):
+    def invoke_all_static(cls, func_name: str, *args: Any, **kwargs: any) -> None:
         """
         Invoke a static function on config objects this RootConfig is
         configured to use.
@@ -341,6 +358,7 @@ class RootConfig:
             func_name: Name of function to invoke
             *args
             **kwargs
+
         Returns:
             ordered dictionary of config section name and the result of the
             function from it.
@@ -351,16 +369,16 @@ class RootConfig:
 
     def generate_config(
         self,
-        config_dir_path,
-        data_dir_path,
-        server_name,
-        generate_secrets=False,
-        report_stats=None,
-        open_private_ports=False,
-        listeners=None,
-        tls_certificate_path=None,
-        tls_private_key_path=None,
-    ):
+        config_dir_path: str,
+        data_dir_path: str,
+        server_name: str,
+        generate_secrets: bool = False,
+        report_stats: Optional[bool] = None,
+        open_private_ports: bool = False,
+        listeners: Optional[List[dict]] = None,
+        tls_certificate_path: Optional[str] = None,
+        tls_private_key_path: Optional[str] = None,
+    ) -> str:
         """
         Build a default configuration file
 
@@ -368,27 +386,27 @@ class RootConfig:
         (eg with --generate_config).
 
         Args:
-            config_dir_path (str): The path where the config files are kept. Used to
+            config_dir_path: The path where the config files are kept. Used to
                 create filenames for things like the log config and the signing key.
 
-            data_dir_path (str): The path where the data files are kept. Used to create
+            data_dir_path: The path where the data files are kept. Used to create
                 filenames for things like the database and media store.
 
-            server_name (str): The server name. Used to initialise the server_name
+            server_name: The server name. Used to initialise the server_name
                 config param, but also used in the names of some of the config files.
 
-            generate_secrets (bool): True if we should generate new secrets for things
+            generate_secrets: True if we should generate new secrets for things
                 like the macaroon_secret_key. If False, these parameters will be left
                 unset.
 
-            report_stats (bool|None): Initial setting for the report_stats setting.
+            report_stats: Initial setting for the report_stats setting.
                 If None, report_stats will be left unset.
 
-            open_private_ports (bool): True to leave private ports (such as the non-TLS
+            open_private_ports: True to leave private ports (such as the non-TLS
                 HTTP listener) open to the internet.
 
-            listeners (list(dict)|None): A list of descriptions of the listeners
-                synapse should start with each of which specifies a port (str), a list of
+            listeners: A list of descriptions of the listeners synapse should
+                start with each of which specifies a port (int), a list of
                 resources (list(str)), tls (bool) and type (str). For example:
                 [{
                     "port": 8448,
@@ -403,16 +421,12 @@ class RootConfig:
                     "type": "http",
                 }],
 
+            tls_certificate_path: The path to the tls certificate.
 
-            database (str|None): The database type to configure, either `psycog2`
-                or `sqlite3`.
-
-            tls_certificate_path (str|None): The path to the tls certificate.
-
-            tls_private_key_path (str|None): The path to the tls private key.
+            tls_private_key_path: The path to the tls private key.
 
         Returns:
-            str: the yaml config file
+            The yaml config file
         """
 
         return CONFIG_FILE_HEADER + "\n\n".join(
@@ -432,12 +446,15 @@ class RootConfig:
         )
 
     @classmethod
-    def load_config(cls, description, argv):
+    def load_config(
+        cls: Type[TRootConfig], description: str, argv: List[str]
+    ) -> TRootConfig:
         """Parse the commandline and config files
 
         Doesn't support config-file-generation: used by the worker apps.
 
-        Returns: Config object.
+        Returns:
+            Config object.
         """
         config_parser = argparse.ArgumentParser(description=description)
         cls.add_arguments_to_parser(config_parser)
@@ -446,7 +463,7 @@ class RootConfig:
         return obj
 
     @classmethod
-    def add_arguments_to_parser(cls, config_parser):
+    def add_arguments_to_parser(cls, config_parser: argparse.ArgumentParser) -> None:
         """Adds all the config flags to an ArgumentParser.
 
         Doesn't support config-file-generation: used by the worker apps.
@@ -454,7 +471,7 @@ class RootConfig:
         Used for workers where we want to add extra flags/subcommands.
 
         Args:
-            config_parser (ArgumentParser): App description
+            config_parser: App description
         """
 
         config_parser.add_argument(
@@ -477,7 +494,9 @@ class RootConfig:
         cls.invoke_all_static("add_arguments", config_parser)
 
     @classmethod
-    def load_config_with_parser(cls, parser, argv):
+    def load_config_with_parser(
+        cls: Type[TRootConfig], parser: argparse.ArgumentParser, argv: List[str]
+    ) -> Tuple[TRootConfig, argparse.Namespace]:
         """Parse the commandline and config files with the given parser
 
         Doesn't support config-file-generation: used by the worker apps.
@@ -485,13 +504,12 @@ class RootConfig:
         Used for workers where we want to add extra flags/subcommands.
 
         Args:
-            parser (ArgumentParser)
-            argv (list[str])
+            parser
+            argv
 
         Returns:
-            tuple[HomeServerConfig, argparse.Namespace]: Returns the parsed
-            config object and the parsed argparse.Namespace object from
-            `parser.parse_args(..)`
+            Returns the parsed config object and the parsed argparse.Namespace
+            object from parser.parse_args(..)`
         """
 
         obj = cls()
@@ -520,12 +538,15 @@ class RootConfig:
         return obj, config_args
 
     @classmethod
-    def load_or_generate_config(cls, description, argv):
+    def load_or_generate_config(
+        cls: Type[TRootConfig], description: str, argv: List[str]
+    ) -> Optional[TRootConfig]:
         """Parse the commandline and config files
 
         Supports generation of config files, so is used for the main homeserver app.
 
-        Returns: Config object, or None if --generate-config or --generate-keys was set
+        Returns:
+            Config object, or None if --generate-config or --generate-keys was set
         """
         parser = argparse.ArgumentParser(description=description)
         parser.add_argument(
@@ -680,16 +701,21 @@ class RootConfig:
 
         return obj
 
-    def parse_config_dict(self, config_dict, config_dir_path=None, data_dir_path=None):
+    def parse_config_dict(
+        self,
+        config_dict: Dict[str, Any],
+        config_dir_path: Optional[str] = None,
+        data_dir_path: Optional[str] = None,
+    ) -> None:
         """Read the information from the config dict into this Config object.
 
         Args:
-            config_dict (dict): Configuration data, as read from the yaml
+            config_dict: Configuration data, as read from the yaml
 
-            config_dir_path (str): The path where the config files are kept. Used to
+            config_dir_path: The path where the config files are kept. Used to
                 create filenames for things like the log config and the signing key.
 
-            data_dir_path (str): The path where the data files are kept. Used to create
+            data_dir_path: The path where the data files are kept. Used to create
                 filenames for things like the database and media store.
         """
         self.invoke_all(
@@ -699,17 +725,20 @@ class RootConfig:
             data_dir_path=data_dir_path,
         )
 
-    def generate_missing_files(self, config_dict, config_dir_path):
+    def generate_missing_files(
+        self, config_dict: Dict[str, Any], config_dir_path: str
+    ) -> None:
         self.invoke_all("generate_files", config_dict, config_dir_path)
 
 
-def read_config_files(config_files):
+def read_config_files(config_files: Iterable[str]) -> Dict[str, Any]:
     """Read the config files into a dict
 
     Args:
-        config_files (iterable[str]): A list of the config files to read
+        config_files: A list of the config files to read
 
-    Returns: dict
+    Returns:
+        The configuration dictionary.
     """
     specified_config = {}
     for config_file in config_files:
@@ -733,17 +762,17 @@ def read_config_files(config_files):
     return specified_config
 
 
-def find_config_files(search_paths):
+def find_config_files(search_paths: List[str]) -> List[str]:
     """Finds config files using a list of search paths. If a path is a file
     then that file path is added to the list. If a search path is a directory
     then all the "*.yaml" files in that directory are added to the list in
     sorted order.
 
     Args:
-        search_paths(list(str)): A list of paths to search.
+        search_paths: A list of paths to search.
 
     Returns:
-        list(str): A list of file paths.
+        A list of file paths.
     """
 
     config_files = []
@@ -777,7 +806,7 @@ def find_config_files(search_paths):
     return config_files
 
 
-@attr.s
+@attr.s(auto_attribs=True)
 class ShardedWorkerHandlingConfig:
     """Algorithm for choosing which instance is responsible for handling some
     sharded work.
@@ -787,7 +816,7 @@ class ShardedWorkerHandlingConfig:
     below).
     """
 
-    instances = attr.ib(type=List[str])
+    instances: List[str]
 
     def should_handle(self, instance_name: str, key: str) -> bool:
         """Whether this instance is responsible for handling the given key."""
diff --git a/synapse/config/_base.pyi b/synapse/config/_base.pyi
index c1d9069798..1eb5f5a68c 100644
--- a/synapse/config/_base.pyi
+++ b/synapse/config/_base.pyi
@@ -1,4 +1,18 @@
-from typing import Any, Iterable, List, Optional
+import argparse
+from typing import (
+    Any,
+    Dict,
+    Iterable,
+    List,
+    MutableMapping,
+    Optional,
+    Tuple,
+    Type,
+    TypeVar,
+    Union,
+)
+
+import jinja2
 
 from synapse.config import (
     account_validity,
@@ -19,6 +33,7 @@ from synapse.config import (
     logger,
     metrics,
     modules,
+    oembed,
     oidc,
     password_auth_providers,
     push,
@@ -27,6 +42,7 @@ from synapse.config import (
     registration,
     repository,
     retention,
+    room,
     room_directory,
     saml2,
     server,
@@ -51,7 +67,9 @@ MISSING_REPORT_STATS_CONFIG_INSTRUCTIONS: str
 MISSING_REPORT_STATS_SPIEL: str
 MISSING_SERVER_NAME: str
 
-def path_exists(file_path: str): ...
+def path_exists(file_path: str) -> bool: ...
+
+TRootConfig = TypeVar("TRootConfig", bound="RootConfig")
 
 class RootConfig:
     server: server.ServerConfig
@@ -61,6 +79,7 @@ class RootConfig:
     logging: logger.LoggingConfig
     ratelimiting: ratelimiting.RatelimitConfig
     media: repository.ContentRepositoryConfig
+    oembed: oembed.OembedConfig
     captcha: captcha.CaptchaConfig
     voip: voip.VoipConfig
     registration: registration.RegistrationConfig
@@ -80,6 +99,7 @@ class RootConfig:
     authproviders: password_auth_providers.PasswordAuthProviderConfig
     push: push.PushConfig
     spamchecker: spam_checker.SpamCheckerConfig
+    room: room.RoomConfig
     groups: groups.GroupsConfig
     userdirectory: user_directory.UserDirectoryConfig
     consent: consent.ConsentConfig
@@ -87,72 +107,85 @@ class RootConfig:
     servernotices: server_notices.ServerNoticesConfig
     roomdirectory: room_directory.RoomDirectoryConfig
     thirdpartyrules: third_party_event_rules.ThirdPartyRulesConfig
-    tracer: tracer.TracerConfig
+    tracing: tracer.TracerConfig
     redis: redis.RedisConfig
     modules: modules.ModulesConfig
     caches: cache.CacheConfig
     federation: federation.FederationConfig
     retention: retention.RetentionConfig
 
-    config_classes: List = ...
+    config_classes: List[Type["Config"]] = ...
     def __init__(self) -> None: ...
-    def invoke_all(self, func_name: str, *args: Any, **kwargs: Any): ...
+    def invoke_all(
+        self, func_name: str, *args: Any, **kwargs: Any
+    ) -> MutableMapping[str, Any]: ...
     @classmethod
     def invoke_all_static(cls, func_name: str, *args: Any, **kwargs: Any) -> None: ...
-    def __getattr__(self, item: str): ...
     def parse_config_dict(
         self,
-        config_dict: Any,
-        config_dir_path: Optional[Any] = ...,
-        data_dir_path: Optional[Any] = ...,
+        config_dict: Dict[str, Any],
+        config_dir_path: Optional[str] = ...,
+        data_dir_path: Optional[str] = ...,
     ) -> None: ...
-    read_config: Any = ...
     def generate_config(
         self,
         config_dir_path: str,
         data_dir_path: str,
         server_name: str,
         generate_secrets: bool = ...,
-        report_stats: Optional[str] = ...,
+        report_stats: Optional[bool] = ...,
         open_private_ports: bool = ...,
         listeners: Optional[Any] = ...,
-        database_conf: Optional[Any] = ...,
         tls_certificate_path: Optional[str] = ...,
         tls_private_key_path: Optional[str] = ...,
-    ): ...
+    ) -> str: ...
     @classmethod
-    def load_or_generate_config(cls, description: Any, argv: Any): ...
+    def load_or_generate_config(
+        cls: Type[TRootConfig], description: str, argv: List[str]
+    ) -> Optional[TRootConfig]: ...
     @classmethod
-    def load_config(cls, description: Any, argv: Any): ...
+    def load_config(
+        cls: Type[TRootConfig], description: str, argv: List[str]
+    ) -> TRootConfig: ...
     @classmethod
-    def add_arguments_to_parser(cls, config_parser: Any) -> None: ...
+    def add_arguments_to_parser(
+        cls, config_parser: argparse.ArgumentParser
+    ) -> None: ...
     @classmethod
-    def load_config_with_parser(cls, parser: Any, argv: Any): ...
+    def load_config_with_parser(
+        cls: Type[TRootConfig], parser: argparse.ArgumentParser, argv: List[str]
+    ) -> Tuple[TRootConfig, argparse.Namespace]: ...
     def generate_missing_files(
         self, config_dict: dict, config_dir_path: str
     ) -> None: ...
 
 class Config:
     root: RootConfig
+    default_template_dir: str
     def __init__(self, root_config: Optional[RootConfig] = ...) -> None: ...
-    def __getattr__(self, item: str, from_root: bool = ...): ...
     @staticmethod
-    def parse_size(value: Any): ...
+    def parse_size(value: Union[str, int]) -> int: ...
     @staticmethod
-    def parse_duration(value: Any): ...
+    def parse_duration(value: Union[str, int]) -> int: ...
     @staticmethod
-    def abspath(file_path: Optional[str]): ...
+    def abspath(file_path: Optional[str]) -> str: ...
     @classmethod
-    def path_exists(cls, file_path: str): ...
+    def path_exists(cls, file_path: str) -> bool: ...
     @classmethod
-    def check_file(cls, file_path: str, config_name: str): ...
+    def check_file(cls, file_path: str, config_name: str) -> str: ...
     @classmethod
-    def ensure_directory(cls, dir_path: str): ...
+    def ensure_directory(cls, dir_path: str) -> str: ...
     @classmethod
-    def read_file(cls, file_path: str, config_name: str): ...
+    def read_file(cls, file_path: str, config_name: str) -> str: ...
+    def read_template(self, filenames: str) -> jinja2.Template: ...
+    def read_templates(
+        self,
+        filenames: List[str],
+        custom_template_directories: Optional[Iterable[str]] = None,
+    ) -> List[jinja2.Template]: ...
 
-def read_config_files(config_files: List[str]): ...
-def find_config_files(search_paths: List[str]): ...
+def read_config_files(config_files: Iterable[str]) -> Dict[str, Any]: ...
+def find_config_files(search_paths: List[str]) -> List[str]: ...
 
 class ShardedWorkerHandlingConfig:
     instances: List[str]
diff --git a/synapse/config/cache.py b/synapse/config/cache.py
index d119427ad8..f054455534 100644
--- a/synapse/config/cache.py
+++ b/synapse/config/cache.py
@@ -15,7 +15,7 @@
 import os
 import re
 import threading
-from typing import Callable, Dict
+from typing import Callable, Dict, Optional
 
 from synapse.python_dependencies import DependencyException, check_requirements
 
@@ -217,7 +217,7 @@ class CacheConfig(Config):
 
         expiry_time = cache_config.get("expiry_time")
         if expiry_time:
-            self.expiry_time_msec = self.parse_duration(expiry_time)
+            self.expiry_time_msec: Optional[int] = self.parse_duration(expiry_time)
         else:
             self.expiry_time_msec = None
 
diff --git a/synapse/config/key.py b/synapse/config/key.py
index 015dbb8a67..035ee2416b 100644
--- a/synapse/config/key.py
+++ b/synapse/config/key.py
@@ -16,6 +16,7 @@
 import hashlib
 import logging
 import os
+from typing import Any, Dict
 
 import attr
 import jsonschema
@@ -312,7 +313,7 @@ class KeyConfig(Config):
                 )
         return keys
 
-    def generate_files(self, config, config_dir_path):
+    def generate_files(self, config: Dict[str, Any], config_dir_path: str) -> None:
         if "signing_key" in config:
             return
 
diff --git a/synapse/config/logger.py b/synapse/config/logger.py
index 5252e61a99..63aab0babe 100644
--- a/synapse/config/logger.py
+++ b/synapse/config/logger.py
@@ -18,7 +18,7 @@ import os
 import sys
 import threading
 from string import Template
-from typing import TYPE_CHECKING
+from typing import TYPE_CHECKING, Any, Dict
 
 import yaml
 from zope.interface import implementer
@@ -185,7 +185,7 @@ class LoggingConfig(Config):
             help=argparse.SUPPRESS,
         )
 
-    def generate_files(self, config, config_dir_path):
+    def generate_files(self, config: Dict[str, Any], config_dir_path: str) -> None:
         log_config = config.get("log_config")
         if log_config and not os.path.exists(log_config):
             log_file = self.abspath("homeserver.log")
diff --git a/synapse/config/server.py b/synapse/config/server.py
index 7bc0030a9e..8445e9dd05 100644
--- a/synapse/config/server.py
+++ b/synapse/config/server.py
@@ -421,7 +421,7 @@ class ServerConfig(Config):
         # before redacting them.
         redaction_retention_period = config.get("redaction_retention_period", "7d")
         if redaction_retention_period is not None:
-            self.redaction_retention_period = self.parse_duration(
+            self.redaction_retention_period: Optional[int] = self.parse_duration(
                 redaction_retention_period
             )
         else:
@@ -430,7 +430,7 @@ class ServerConfig(Config):
         # How long to keep entries in the `users_ips` table.
         user_ips_max_age = config.get("user_ips_max_age", "28d")
         if user_ips_max_age is not None:
-            self.user_ips_max_age = self.parse_duration(user_ips_max_age)
+            self.user_ips_max_age: Optional[int] = self.parse_duration(user_ips_max_age)
         else:
             self.user_ips_max_age = None
 
diff --git a/synapse/config/tls.py b/synapse/config/tls.py
index 21e5ddd15f..4ca111618f 100644
--- a/synapse/config/tls.py
+++ b/synapse/config/tls.py
@@ -245,7 +245,7 @@ class TlsConfig(Config):
         cert_path = self.tls_certificate_file
         logger.info("Loading TLS certificate from %s", cert_path)
         cert_pem = self.read_file(cert_path, "tls_certificate_path")
-        cert = crypto.load_certificate(crypto.FILETYPE_PEM, cert_pem)
+        cert = crypto.load_certificate(crypto.FILETYPE_PEM, cert_pem.encode())
 
         return cert
 
diff --git a/synapse/module_api/__init__.py b/synapse/module_api/__init__.py
index ac8e8142f1..96d7a8f2a9 100644
--- a/synapse/module_api/__init__.py
+++ b/synapse/module_api/__init__.py
@@ -1014,7 +1014,7 @@ class ModuleApi:
             A list containing the loaded templates, with the orders matching the one of
             the filenames parameter.
         """
-        return self._hs.config.read_templates(
+        return self._hs.config.server.read_templates(
             filenames,
             (td for td in (self.custom_template_dir, custom_template_directory) if td),
         )
diff --git a/synapse/storage/databases/main/registration.py b/synapse/storage/databases/main/registration.py
index 8478463a2a..0e8c168667 100644
--- a/synapse/storage/databases/main/registration.py
+++ b/synapse/storage/databases/main/registration.py
@@ -1198,8 +1198,9 @@ class RegistrationWorkerStore(CacheInvalidationWorkerStore):
         expiration_ts = now_ms + self._account_validity_period
 
         if use_delta:
+            assert self._account_validity_startup_job_max_delta is not None
             expiration_ts = random.randrange(
-                expiration_ts - self._account_validity_startup_job_max_delta,
+                int(expiration_ts - self._account_validity_startup_job_max_delta),
                 expiration_ts,
             )
 
diff --git a/tests/config/test_load.py b/tests/config/test_load.py
index d8668d56b2..69a4e9413b 100644
--- a/tests/config/test_load.py
+++ b/tests/config/test_load.py
@@ -46,15 +46,16 @@ class ConfigLoadingFileTestCase(ConfigFileTestCase):
                 "was: %r" % (config.key.macaroon_secret_key,)
             )
 
-        config = HomeServerConfig.load_or_generate_config("", ["-c", self.config_file])
+        config2 = HomeServerConfig.load_or_generate_config("", ["-c", self.config_file])
+        assert config2 is not None
         self.assertTrue(
-            hasattr(config.key, "macaroon_secret_key"),
+            hasattr(config2.key, "macaroon_secret_key"),
             "Want config to have attr macaroon_secret_key",
         )
-        if len(config.key.macaroon_secret_key) < 5:
+        if len(config2.key.macaroon_secret_key) < 5:
             self.fail(
                 "Want macaroon secret key to be string of at least length 5,"
-                "was: %r" % (config.key.macaroon_secret_key,)
+                "was: %r" % (config2.key.macaroon_secret_key,)
             )
 
     def test_load_succeeds_if_macaroon_secret_key_missing(self):
@@ -62,6 +63,9 @@ class ConfigLoadingFileTestCase(ConfigFileTestCase):
         config1 = HomeServerConfig.load_config("", ["-c", self.config_file])
         config2 = HomeServerConfig.load_config("", ["-c", self.config_file])
         config3 = HomeServerConfig.load_or_generate_config("", ["-c", self.config_file])
+        assert config1 is not None
+        assert config2 is not None
+        assert config3 is not None
         self.assertEqual(
             config1.key.macaroon_secret_key, config2.key.macaroon_secret_key
         )
@@ -78,14 +82,16 @@ class ConfigLoadingFileTestCase(ConfigFileTestCase):
         config = HomeServerConfig.load_config("", ["-c", self.config_file])
         self.assertFalse(config.registration.enable_registration)
 
-        config = HomeServerConfig.load_or_generate_config("", ["-c", self.config_file])
-        self.assertFalse(config.registration.enable_registration)
+        config2 = HomeServerConfig.load_or_generate_config("", ["-c", self.config_file])
+        assert config2 is not None
+        self.assertFalse(config2.registration.enable_registration)
 
         # Check that either config value is clobbered by the command line.
-        config = HomeServerConfig.load_or_generate_config(
+        config3 = HomeServerConfig.load_or_generate_config(
             "", ["-c", self.config_file, "--enable-registration"]
         )
-        self.assertTrue(config.registration.enable_registration)
+        assert config3 is not None
+        self.assertTrue(config3.registration.enable_registration)
 
     def test_stats_enabled(self):
         self.generate_config_and_remove_lines_containing("enable_metrics")
-- 
cgit 1.5.1


From 7f9841bdec349075eb424c943db5569439acb83c Mon Sep 17 00:00:00 2001
From: Brendan Abolivier <babolivier@matrix.org>
Date: Wed, 24 Nov 2021 20:21:44 +0100
Subject: Lower minumum batch size to 1 for background updates (#11422)

Co-authored-by: Richard van der Hoff <1389908+richvdh@users.noreply.github.com>
---
 changelog.d/11422.bugfix                    |  1 +
 synapse/storage/background_updates.py       |  2 +-
 tests/rest/admin/test_background_updates.py | 25 +++++++++++++++++--------
 tests/storage/test_background_update.py     |  8 ++++----
 4 files changed, 23 insertions(+), 13 deletions(-)
 create mode 100644 changelog.d/11422.bugfix

(limited to 'synapse/storage')

diff --git a/changelog.d/11422.bugfix b/changelog.d/11422.bugfix
new file mode 100644
index 0000000000..28ac65ea7c
--- /dev/null
+++ b/changelog.d/11422.bugfix
@@ -0,0 +1 @@
+Improve performance of various background database schema updates.
diff --git a/synapse/storage/background_updates.py b/synapse/storage/background_updates.py
index b104f9032c..bc8364400d 100644
--- a/synapse/storage/background_updates.py
+++ b/synapse/storage/background_updates.py
@@ -82,7 +82,7 @@ class BackgroundUpdater:
     process and autotuning the batch size.
     """
 
-    MINIMUM_BACKGROUND_BATCH_SIZE = 100
+    MINIMUM_BACKGROUND_BATCH_SIZE = 1
     DEFAULT_BACKGROUND_BATCH_SIZE = 100
     BACKGROUND_UPDATE_INTERVAL_MS = 1000
     BACKGROUND_UPDATE_DURATION_MS = 100
diff --git a/tests/rest/admin/test_background_updates.py b/tests/rest/admin/test_background_updates.py
index 1786316763..cd5c60b65c 100644
--- a/tests/rest/admin/test_background_updates.py
+++ b/tests/rest/admin/test_background_updates.py
@@ -20,6 +20,7 @@ import synapse.rest.admin
 from synapse.api.errors import Codes
 from synapse.rest.client import login
 from synapse.server import HomeServer
+from synapse.storage.background_updates import BackgroundUpdater
 
 from tests import unittest
 
@@ -150,9 +151,11 @@ class BackgroundUpdatesTestCase(unittest.HomeserverTestCase):
                 "current_updates": {
                     "master": {
                         "name": "test_update",
-                        "average_items_per_ms": 0.1,
+                        "average_items_per_ms": 0.001,
                         "total_duration_ms": 1000.0,
-                        "total_item_count": 100,
+                        "total_item_count": (
+                            BackgroundUpdater.MINIMUM_BACKGROUND_BATCH_SIZE
+                        ),
                     }
                 },
                 "enabled": True,
@@ -203,9 +206,11 @@ class BackgroundUpdatesTestCase(unittest.HomeserverTestCase):
                 "current_updates": {
                     "master": {
                         "name": "test_update",
-                        "average_items_per_ms": 0.1,
+                        "average_items_per_ms": 0.001,
                         "total_duration_ms": 1000.0,
-                        "total_item_count": 100,
+                        "total_item_count": (
+                            BackgroundUpdater.MINIMUM_BACKGROUND_BATCH_SIZE
+                        ),
                     }
                 },
                 "enabled": False,
@@ -230,9 +235,11 @@ class BackgroundUpdatesTestCase(unittest.HomeserverTestCase):
                 "current_updates": {
                     "master": {
                         "name": "test_update",
-                        "average_items_per_ms": 0.1,
+                        "average_items_per_ms": 0.001,
                         "total_duration_ms": 1000.0,
-                        "total_item_count": 100,
+                        "total_item_count": (
+                            BackgroundUpdater.MINIMUM_BACKGROUND_BATCH_SIZE
+                        ),
                     }
                 },
                 "enabled": False,
@@ -267,9 +274,11 @@ class BackgroundUpdatesTestCase(unittest.HomeserverTestCase):
                 "current_updates": {
                     "master": {
                         "name": "test_update",
-                        "average_items_per_ms": 0.1,
+                        "average_items_per_ms": 0.001,
                         "total_duration_ms": 2000.0,
-                        "total_item_count": 200,
+                        "total_item_count": (
+                            2 * BackgroundUpdater.MINIMUM_BACKGROUND_BATCH_SIZE
+                        ),
                     }
                 },
                 "enabled": True,
diff --git a/tests/storage/test_background_update.py b/tests/storage/test_background_update.py
index 0da42b5ac5..a5f5ebad41 100644
--- a/tests/storage/test_background_update.py
+++ b/tests/storage/test_background_update.py
@@ -19,11 +19,11 @@ class BackgroundUpdateTestCase(unittest.HomeserverTestCase):
         )
 
     def test_do_background_update(self):
-        # the time we claim each update takes
-        duration_ms = 42
+        # the time we claim it takes to update one item when running the update
+        duration_ms = 4200
 
         # the target runtime for each bg update
-        target_background_update_duration_ms = 50000
+        target_background_update_duration_ms = 5000000
 
         store = self.hs.get_datastore()
         self.get_success(
@@ -57,7 +57,7 @@ class BackgroundUpdateTestCase(unittest.HomeserverTestCase):
 
         # on the first call, we should get run with the default background update size
         self.update_handler.assert_called_once_with(
-            {"my_key": 1}, self.updates.DEFAULT_BACKGROUND_BATCH_SIZE
+            {"my_key": 1}, self.updates.MINIMUM_BACKGROUND_BATCH_SIZE
         )
 
         # second step: complete the update
-- 
cgit 1.5.1


From 0d88c4f9030c50bb9e016d9a28f6db27f7913d0b Mon Sep 17 00:00:00 2001
From: Brendan Abolivier <babolivier@matrix.org>
Date: Thu, 25 Nov 2021 16:14:54 +0100
Subject: Improve performance of
 `remove_{hidden,deleted}_devices_from_device_inbox` (#11421)

Co-authored-by: Richard van der Hoff <1389908+richvdh@users.noreply.github.com>
---
 changelog.d/11421.bugfix                           |   1 +
 synapse/storage/databases/main/deviceinbox.py      | 209 +++++++--------------
 .../65/08_device_inbox_background_updates.sql      |  18 ++
 tests/storage/databases/main/test_deviceinbox.py   |   4 +-
 4 files changed, 84 insertions(+), 148 deletions(-)
 create mode 100644 changelog.d/11421.bugfix
 create mode 100644 synapse/storage/schema/main/delta/65/08_device_inbox_background_updates.sql

(limited to 'synapse/storage')

diff --git a/changelog.d/11421.bugfix b/changelog.d/11421.bugfix
new file mode 100644
index 0000000000..28ac65ea7c
--- /dev/null
+++ b/changelog.d/11421.bugfix
@@ -0,0 +1 @@
+Improve performance of various background database schema updates.
diff --git a/synapse/storage/databases/main/deviceinbox.py b/synapse/storage/databases/main/deviceinbox.py
index 7c0f953365..ab8766c75b 100644
--- a/synapse/storage/databases/main/deviceinbox.py
+++ b/synapse/storage/databases/main/deviceinbox.py
@@ -599,6 +599,7 @@ class DeviceInboxBackgroundUpdateStore(SQLBaseStore):
     DEVICE_INBOX_STREAM_ID = "device_inbox_stream_drop"
     REMOVE_DELETED_DEVICES = "remove_deleted_devices_from_device_inbox"
     REMOVE_HIDDEN_DEVICES = "remove_hidden_devices_from_device_inbox"
+    REMOVE_DEAD_DEVICES_FROM_INBOX = "remove_dead_devices_from_device_inbox"
 
     def __init__(self, database: DatabasePool, db_conn, hs: "HomeServer"):
         super().__init__(database, db_conn, hs)
@@ -614,14 +615,18 @@ class DeviceInboxBackgroundUpdateStore(SQLBaseStore):
             self.DEVICE_INBOX_STREAM_ID, self._background_drop_index_device_inbox
         )
 
-        self.db_pool.updates.register_background_update_handler(
-            self.REMOVE_DELETED_DEVICES,
-            self._remove_deleted_devices_from_device_inbox,
+        # Used to be a background update that deletes all device_inboxes for deleted
+        # devices.
+        self.db_pool.updates.register_noop_background_update(
+            self.REMOVE_DELETED_DEVICES
         )
+        # Used to be a background update that deletes all device_inboxes for hidden
+        # devices.
+        self.db_pool.updates.register_noop_background_update(self.REMOVE_HIDDEN_DEVICES)
 
         self.db_pool.updates.register_background_update_handler(
-            self.REMOVE_HIDDEN_DEVICES,
-            self._remove_hidden_devices_from_device_inbox,
+            self.REMOVE_DEAD_DEVICES_FROM_INBOX,
+            self._remove_dead_devices_from_device_inbox,
         )
 
     async def _background_drop_index_device_inbox(self, progress, batch_size):
@@ -636,171 +641,83 @@ class DeviceInboxBackgroundUpdateStore(SQLBaseStore):
 
         return 1
 
-    async def _remove_deleted_devices_from_device_inbox(
-        self, progress: JsonDict, batch_size: int
+    async def _remove_dead_devices_from_device_inbox(
+        self,
+        progress: JsonDict,
+        batch_size: int,
     ) -> int:
-        """A background update that deletes all device_inboxes for deleted devices.
-
-        This should only need to be run once (when users upgrade to v1.47.0)
+        """A background update to remove devices that were either deleted or hidden from
+        the device_inbox table.
 
         Args:
-            progress: JsonDict used to store progress of this background update
-            batch_size: the maximum number of rows to retrieve in a single select query
+            progress: The update's progress dict.
+            batch_size: The batch size for this update.
 
         Returns:
-            The number of deleted rows
+            The number of rows deleted.
         """
 
-        def _remove_deleted_devices_from_device_inbox_txn(
+        def _remove_dead_devices_from_device_inbox_txn(
             txn: LoggingTransaction,
-        ) -> int:
-            """stream_id is not unique
-            we need to use an inclusive `stream_id >= ?` clause,
-            since we might not have deleted all dead device messages for the stream_id
-            returned from the previous query
+        ) -> Tuple[int, bool]:
 
-            Then delete only rows matching the `(user_id, device_id, stream_id)` tuple,
-            to avoid problems of deleting a large number of rows all at once
-            due to a single device having lots of device messages.
-            """
+            if "max_stream_id" in progress:
+                max_stream_id = progress["max_stream_id"]
+            else:
+                txn.execute("SELECT max(stream_id) FROM device_inbox")
+                # There's a type mismatch here between how we want to type the row and
+                # what fetchone says it returns, but we silence it because we know that
+                # res can't be None.
+                res: Tuple[Optional[int]] = txn.fetchone()  # type: ignore[assignment]
+                if res[0] is None:
+                    # this can only happen if the `device_inbox` table is empty, in which
+                    # case we have no work to do.
+                    return 0, True
+                else:
+                    max_stream_id = res[0]
 
-            last_stream_id = progress.get("stream_id", 0)
+            start = progress.get("stream_id", 0)
+            stop = start + batch_size
 
+            # delete rows in `device_inbox` which do *not* correspond to a known,
+            # unhidden device.
             sql = """
-                SELECT device_id, user_id, stream_id
-                FROM device_inbox
+                DELETE FROM device_inbox
                 WHERE
-                    stream_id >= ?
-                    AND (device_id, user_id) NOT IN (
-                        SELECT device_id, user_id FROM devices
+                    stream_id >= ? AND stream_id < ?
+                    AND NOT EXISTS (
+                        SELECT * FROM devices d
+                        WHERE
+                            d.device_id=device_inbox.device_id
+                            AND d.user_id=device_inbox.user_id
+                            AND NOT hidden
                     )
-                ORDER BY stream_id
-                LIMIT ?
-            """
-
-            txn.execute(sql, (last_stream_id, batch_size))
-            rows = txn.fetchall()
+                """
 
-            num_deleted = 0
-            for row in rows:
-                num_deleted += self.db_pool.simple_delete_txn(
-                    txn,
-                    "device_inbox",
-                    {"device_id": row[0], "user_id": row[1], "stream_id": row[2]},
-                )
+            txn.execute(sql, (start, stop))
 
-            if rows:
-                # send more than stream_id to progress
-                # otherwise it can happen in large deployments that
-                # no change of status is visible in the log file
-                # it may be that the stream_id does not change in several runs
-                self.db_pool.updates._background_update_progress_txn(
-                    txn,
-                    self.REMOVE_DELETED_DEVICES,
-                    {
-                        "device_id": rows[-1][0],
-                        "user_id": rows[-1][1],
-                        "stream_id": rows[-1][2],
-                    },
-                )
-
-            return num_deleted
-
-        number_deleted = await self.db_pool.runInteraction(
-            "_remove_deleted_devices_from_device_inbox",
-            _remove_deleted_devices_from_device_inbox_txn,
-        )
-
-        # The task is finished when no more lines are deleted.
-        if not number_deleted:
-            await self.db_pool.updates._end_background_update(
-                self.REMOVE_DELETED_DEVICES
+            self.db_pool.updates._background_update_progress_txn(
+                txn,
+                self.REMOVE_DEAD_DEVICES_FROM_INBOX,
+                {
+                    "stream_id": stop,
+                    "max_stream_id": max_stream_id,
+                },
             )
 
-        return number_deleted
-
-    async def _remove_hidden_devices_from_device_inbox(
-        self, progress: JsonDict, batch_size: int
-    ) -> int:
-        """A background update that deletes all device_inboxes for hidden devices.
-
-        This should only need to be run once (when users upgrade to v1.47.0)
-
-        Args:
-            progress: JsonDict used to store progress of this background update
-            batch_size: the maximum number of rows to retrieve in a single select query
-
-        Returns:
-            The number of deleted rows
-        """
-
-        def _remove_hidden_devices_from_device_inbox_txn(
-            txn: LoggingTransaction,
-        ) -> int:
-            """stream_id is not unique
-            we need to use an inclusive `stream_id >= ?` clause,
-            since we might not have deleted all hidden device messages for the stream_id
-            returned from the previous query
-
-            Then delete only rows matching the `(user_id, device_id, stream_id)` tuple,
-            to avoid problems of deleting a large number of rows all at once
-            due to a single device having lots of device messages.
-            """
-
-            last_stream_id = progress.get("stream_id", 0)
-
-            sql = """
-                SELECT device_id, user_id, stream_id
-                FROM device_inbox
-                WHERE
-                    stream_id >= ?
-                    AND (device_id, user_id) IN (
-                        SELECT device_id, user_id FROM devices WHERE hidden = ?
-                    )
-                ORDER BY stream_id
-                LIMIT ?
-            """
-
-            txn.execute(sql, (last_stream_id, True, batch_size))
-            rows = txn.fetchall()
-
-            num_deleted = 0
-            for row in rows:
-                num_deleted += self.db_pool.simple_delete_txn(
-                    txn,
-                    "device_inbox",
-                    {"device_id": row[0], "user_id": row[1], "stream_id": row[2]},
-                )
-
-            if rows:
-                # We don't just save the `stream_id` in progress as
-                # otherwise it can happen in large deployments that
-                # no change of status is visible in the log file, as
-                # it may be that the stream_id does not change in several runs
-                self.db_pool.updates._background_update_progress_txn(
-                    txn,
-                    self.REMOVE_HIDDEN_DEVICES,
-                    {
-                        "device_id": rows[-1][0],
-                        "user_id": rows[-1][1],
-                        "stream_id": rows[-1][2],
-                    },
-                )
-
-            return num_deleted
+            return stop > max_stream_id
 
-        number_deleted = await self.db_pool.runInteraction(
-            "_remove_hidden_devices_from_device_inbox",
-            _remove_hidden_devices_from_device_inbox_txn,
+        finished = await self.db_pool.runInteraction(
+            "_remove_devices_from_device_inbox_txn",
+            _remove_dead_devices_from_device_inbox_txn,
         )
 
-        # The task is finished when no more lines are deleted.
-        if not number_deleted:
+        if finished:
             await self.db_pool.updates._end_background_update(
-                self.REMOVE_HIDDEN_DEVICES
+                self.REMOVE_DEAD_DEVICES_FROM_INBOX,
             )
 
-        return number_deleted
+        return batch_size
 
 
 class DeviceInboxStore(DeviceInboxWorkerStore, DeviceInboxBackgroundUpdateStore):
diff --git a/synapse/storage/schema/main/delta/65/08_device_inbox_background_updates.sql b/synapse/storage/schema/main/delta/65/08_device_inbox_background_updates.sql
new file mode 100644
index 0000000000..d79455c2ce
--- /dev/null
+++ b/synapse/storage/schema/main/delta/65/08_device_inbox_background_updates.sql
@@ -0,0 +1,18 @@
+/* Copyright 2021 The Matrix.org Foundation C.I.C
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+-- Background update to clear the inboxes of hidden and deleted devices.
+INSERT INTO background_updates (ordering, update_name, progress_json) VALUES
+  (6508, 'remove_dead_devices_from_device_inbox', '{}');
diff --git a/tests/storage/databases/main/test_deviceinbox.py b/tests/storage/databases/main/test_deviceinbox.py
index 4b67bd15b7..36c933b9e9 100644
--- a/tests/storage/databases/main/test_deviceinbox.py
+++ b/tests/storage/databases/main/test_deviceinbox.py
@@ -66,7 +66,7 @@ class DeviceInboxBackgroundUpdateStoreTestCase(HomeserverTestCase):
             self.store.db_pool.simple_insert(
                 "background_updates",
                 {
-                    "update_name": "remove_deleted_devices_from_device_inbox",
+                    "update_name": "remove_dead_devices_from_device_inbox",
                     "progress_json": "{}",
                 },
             )
@@ -140,7 +140,7 @@ class DeviceInboxBackgroundUpdateStoreTestCase(HomeserverTestCase):
             self.store.db_pool.simple_insert(
                 "background_updates",
                 {
-                    "update_name": "remove_hidden_devices_from_device_inbox",
+                    "update_name": "remove_dead_devices_from_device_inbox",
                     "progress_json": "{}",
                 },
             )
-- 
cgit 1.5.1


From c675a18071ea86cb20bdc426a4c4c34bc629b6af Mon Sep 17 00:00:00 2001
From: Sean Quah <8349537+squahtx@users.noreply.github.com>
Date: Fri, 26 Nov 2021 13:47:24 +0000
Subject: Track ongoing event fetches correctly (again) (#11376)

The previous fix for the ongoing event fetches counter
(8eec25a1d9d656905db18a2c62a5552e63db2667) was both insufficient and
incorrect.

When the database is unreachable, `_do_fetch` never gets run and so
`_event_fetch_ongoing` is never decremented.

The previous fix also moved the `_event_fetch_ongoing` decrement outside
of the `_event_fetch_lock` which allowed race conditions to corrupt the
counter.
---
 changelog.d/11376.bugfix                           |   1 +
 synapse/storage/databases/main/events_worker.py    | 154 +++++++++++++++------
 tests/storage/databases/main/test_events_worker.py | 139 ++++++++++++++++++-
 3 files changed, 251 insertions(+), 43 deletions(-)
 create mode 100644 changelog.d/11376.bugfix

(limited to 'synapse/storage')

diff --git a/changelog.d/11376.bugfix b/changelog.d/11376.bugfix
new file mode 100644
index 0000000000..639e48b59b
--- /dev/null
+++ b/changelog.d/11376.bugfix
@@ -0,0 +1 @@
+Fix a long-standing bug where all requests that read events from the database could get stuck as a result of losing the database connection, for real this time. Also fix a race condition introduced in the previous insufficient fix in 1.47.0.
diff --git a/synapse/storage/databases/main/events_worker.py b/synapse/storage/databases/main/events_worker.py
index c6bf316d5b..c6bcfe1c32 100644
--- a/synapse/storage/databases/main/events_worker.py
+++ b/synapse/storage/databases/main/events_worker.py
@@ -72,7 +72,7 @@ from synapse.util.metrics import Measure
 logger = logging.getLogger(__name__)
 
 
-# These values are used in the `enqueus_event` and `_do_fetch` methods to
+# These values are used in the `enqueue_event` and `_fetch_loop` methods to
 # control how we batch/bulk fetch events from the database.
 # The values are plucked out of thing air to make initial sync run faster
 # on jki.re
@@ -602,7 +602,7 @@ class EventsWorkerStore(SQLBaseStore):
             # already due to `_get_events_from_db`).
             fetching_deferred: ObservableDeferred[
                 Dict[str, _EventCacheEntry]
-            ] = ObservableDeferred(defer.Deferred())
+            ] = ObservableDeferred(defer.Deferred(), consumeErrors=True)
             for event_id in missing_events_ids:
                 self._current_event_fetches[event_id] = fetching_deferred
 
@@ -736,35 +736,118 @@ class EventsWorkerStore(SQLBaseStore):
             for e in state_to_include.values()
         ]
 
-    def _do_fetch(self, conn: Connection) -> None:
+    def _maybe_start_fetch_thread(self) -> None:
+        """Starts an event fetch thread if we are not yet at the maximum number."""
+        with self._event_fetch_lock:
+            if (
+                self._event_fetch_list
+                and self._event_fetch_ongoing < EVENT_QUEUE_THREADS
+            ):
+                self._event_fetch_ongoing += 1
+                event_fetch_ongoing_gauge.set(self._event_fetch_ongoing)
+                # `_event_fetch_ongoing` is decremented in `_fetch_thread`.
+                should_start = True
+            else:
+                should_start = False
+
+        if should_start:
+            run_as_background_process("fetch_events", self._fetch_thread)
+
+    async def _fetch_thread(self) -> None:
+        """Services requests for events from `_event_fetch_list`."""
+        exc = None
+        try:
+            await self.db_pool.runWithConnection(self._fetch_loop)
+        except BaseException as e:
+            exc = e
+            raise
+        finally:
+            should_restart = False
+            event_fetches_to_fail = []
+            with self._event_fetch_lock:
+                self._event_fetch_ongoing -= 1
+                event_fetch_ongoing_gauge.set(self._event_fetch_ongoing)
+
+                # There may still be work remaining in `_event_fetch_list` if we
+                # failed, or it was added in between us deciding to exit and
+                # decrementing `_event_fetch_ongoing`.
+                if self._event_fetch_list:
+                    if exc is None:
+                        # We decided to exit, but then some more work was added
+                        # before `_event_fetch_ongoing` was decremented.
+                        # If a new event fetch thread was not started, we should
+                        # restart ourselves since the remaining event fetch threads
+                        # may take a while to get around to the new work.
+                        #
+                        # Unfortunately it is not possible to tell whether a new
+                        # event fetch thread was started, so we restart
+                        # unconditionally. If we are unlucky, we will end up with
+                        # an idle fetch thread, but it will time out after
+                        # `EVENT_QUEUE_ITERATIONS * EVENT_QUEUE_TIMEOUT_S` seconds
+                        # in any case.
+                        #
+                        # Note that multiple fetch threads may run down this path at
+                        # the same time.
+                        should_restart = True
+                    elif isinstance(exc, Exception):
+                        if self._event_fetch_ongoing == 0:
+                            # We were the last remaining fetcher and failed.
+                            # Fail any outstanding fetches since no one else will
+                            # handle them.
+                            event_fetches_to_fail = self._event_fetch_list
+                            self._event_fetch_list = []
+                        else:
+                            # We weren't the last remaining fetcher, so another
+                            # fetcher will pick up the work. This will either happen
+                            # after their existing work, however long that takes,
+                            # or after at most `EVENT_QUEUE_TIMEOUT_S` seconds if
+                            # they are idle.
+                            pass
+                    else:
+                        # The exception is a `SystemExit`, `KeyboardInterrupt` or
+                        # `GeneratorExit`. Don't try to do anything clever here.
+                        pass
+
+            if should_restart:
+                # We exited cleanly but noticed more work.
+                self._maybe_start_fetch_thread()
+
+            if event_fetches_to_fail:
+                # We were the last remaining fetcher and failed.
+                # Fail any outstanding fetches since no one else will handle them.
+                assert exc is not None
+                with PreserveLoggingContext():
+                    for _, deferred in event_fetches_to_fail:
+                        deferred.errback(exc)
+
+    def _fetch_loop(self, conn: Connection) -> None:
         """Takes a database connection and waits for requests for events from
         the _event_fetch_list queue.
         """
-        try:
-            i = 0
-            while True:
-                with self._event_fetch_lock:
-                    event_list = self._event_fetch_list
-                    self._event_fetch_list = []
-
-                    if not event_list:
-                        single_threaded = self.database_engine.single_threaded
-                        if (
-                            not self.USE_DEDICATED_DB_THREADS_FOR_EVENT_FETCHING
-                            or single_threaded
-                            or i > EVENT_QUEUE_ITERATIONS
-                        ):
-                            break
-                        else:
-                            self._event_fetch_lock.wait(EVENT_QUEUE_TIMEOUT_S)
-                            i += 1
-                            continue
-                    i = 0
+        i = 0
+        while True:
+            with self._event_fetch_lock:
+                event_list = self._event_fetch_list
+                self._event_fetch_list = []
+
+                if not event_list:
+                    # There are no requests waiting. If we haven't yet reached the
+                    # maximum iteration limit, wait for some more requests to turn up.
+                    # Otherwise, bail out.
+                    single_threaded = self.database_engine.single_threaded
+                    if (
+                        not self.USE_DEDICATED_DB_THREADS_FOR_EVENT_FETCHING
+                        or single_threaded
+                        or i > EVENT_QUEUE_ITERATIONS
+                    ):
+                        return
+
+                    self._event_fetch_lock.wait(EVENT_QUEUE_TIMEOUT_S)
+                    i += 1
+                    continue
+                i = 0
 
-                self._fetch_event_list(conn, event_list)
-        finally:
-            self._event_fetch_ongoing -= 1
-            event_fetch_ongoing_gauge.set(self._event_fetch_ongoing)
+            self._fetch_event_list(conn, event_list)
 
     def _fetch_event_list(
         self, conn: Connection, event_list: List[Tuple[List[str], defer.Deferred]]
@@ -806,9 +889,7 @@ class EventsWorkerStore(SQLBaseStore):
                 # We only want to resolve deferreds from the main thread
                 def fire(evs, exc):
                     for _, d in evs:
-                        if not d.called:
-                            with PreserveLoggingContext():
-                                d.errback(exc)
+                        d.errback(exc)
 
                 with PreserveLoggingContext():
                     self.hs.get_reactor().callFromThread(fire, event_list, e)
@@ -983,20 +1064,9 @@ class EventsWorkerStore(SQLBaseStore):
         events_d = defer.Deferred()
         with self._event_fetch_lock:
             self._event_fetch_list.append((events, events_d))
-
             self._event_fetch_lock.notify()
 
-            if self._event_fetch_ongoing < EVENT_QUEUE_THREADS:
-                self._event_fetch_ongoing += 1
-                event_fetch_ongoing_gauge.set(self._event_fetch_ongoing)
-                should_start = True
-            else:
-                should_start = False
-
-        if should_start:
-            run_as_background_process(
-                "fetch_events", self.db_pool.runWithConnection, self._do_fetch
-            )
+        self._maybe_start_fetch_thread()
 
         logger.debug("Loading %d events: %s", len(events), events)
         with PreserveLoggingContext():
diff --git a/tests/storage/databases/main/test_events_worker.py b/tests/storage/databases/main/test_events_worker.py
index a649e8c618..5ae491ff5a 100644
--- a/tests/storage/databases/main/test_events_worker.py
+++ b/tests/storage/databases/main/test_events_worker.py
@@ -12,11 +12,24 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import json
+from contextlib import contextmanager
+from typing import Generator
 
+from twisted.enterprise.adbapi import ConnectionPool
+from twisted.internet.defer import ensureDeferred
+from twisted.test.proto_helpers import MemoryReactor
+
+from synapse.api.room_versions import EventFormatVersions, RoomVersions
 from synapse.logging.context import LoggingContext
 from synapse.rest import admin
 from synapse.rest.client import login, room
-from synapse.storage.databases.main.events_worker import EventsWorkerStore
+from synapse.server import HomeServer
+from synapse.storage.databases.main.events_worker import (
+    EVENT_QUEUE_THREADS,
+    EventsWorkerStore,
+)
+from synapse.storage.types import Connection
+from synapse.util import Clock
 from synapse.util.async_helpers import yieldable_gather_results
 
 from tests import unittest
@@ -144,3 +157,127 @@ class EventCacheTestCase(unittest.HomeserverTestCase):
 
             # We should have fetched the event from the DB
             self.assertEqual(ctx.get_resource_usage().evt_db_fetch_count, 1)
+
+
+class DatabaseOutageTestCase(unittest.HomeserverTestCase):
+    """Test event fetching during a database outage."""
+
+    def prepare(self, reactor: MemoryReactor, clock: Clock, hs: HomeServer):
+        self.store: EventsWorkerStore = hs.get_datastore()
+
+        self.room_id = f"!room:{hs.hostname}"
+        self.event_ids = [f"event{i}" for i in range(20)]
+
+        self._populate_events()
+
+    def _populate_events(self) -> None:
+        """Ensure that there are test events in the database.
+
+        When testing with the in-memory SQLite database, all the events are lost during
+        the simulated outage.
+
+        To ensure consistency between `room_id`s and `event_id`s before and after the
+        outage, rows are built and inserted manually.
+
+        Upserts are used to handle the non-SQLite case where events are not lost.
+        """
+        self.get_success(
+            self.store.db_pool.simple_upsert(
+                "rooms",
+                {"room_id": self.room_id},
+                {"room_version": RoomVersions.V4.identifier},
+            )
+        )
+
+        self.event_ids = [f"event{i}" for i in range(20)]
+        for idx, event_id in enumerate(self.event_ids):
+            self.get_success(
+                self.store.db_pool.simple_upsert(
+                    "events",
+                    {"event_id": event_id},
+                    {
+                        "event_id": event_id,
+                        "room_id": self.room_id,
+                        "topological_ordering": idx,
+                        "stream_ordering": idx,
+                        "type": "test",
+                        "processed": True,
+                        "outlier": False,
+                    },
+                )
+            )
+            self.get_success(
+                self.store.db_pool.simple_upsert(
+                    "event_json",
+                    {"event_id": event_id},
+                    {
+                        "room_id": self.room_id,
+                        "json": json.dumps({"type": "test", "room_id": self.room_id}),
+                        "internal_metadata": "{}",
+                        "format_version": EventFormatVersions.V3,
+                    },
+                )
+            )
+
+    @contextmanager
+    def _outage(self) -> Generator[None, None, None]:
+        """Simulate a database outage.
+
+        Returns:
+            A context manager. While the context is active, any attempts to connect to
+            the database will fail.
+        """
+        connection_pool = self.store.db_pool._db_pool
+
+        # Close all connections and shut down the database `ThreadPool`.
+        connection_pool.close()
+
+        # Restart the database `ThreadPool`.
+        connection_pool.start()
+
+        original_connection_factory = connection_pool.connectionFactory
+
+        def connection_factory(_pool: ConnectionPool) -> Connection:
+            raise Exception("Could not connect to the database.")
+
+        connection_pool.connectionFactory = connection_factory  # type: ignore[assignment]
+        try:
+            yield
+        finally:
+            connection_pool.connectionFactory = original_connection_factory
+
+            # If the in-memory SQLite database is being used, all the events are gone.
+            # Restore the test data.
+            self._populate_events()
+
+    def test_failure(self) -> None:
+        """Test that event fetches do not get stuck during a database outage."""
+        with self._outage():
+            failure = self.get_failure(
+                self.store.get_event(self.event_ids[0]), Exception
+            )
+            self.assertEqual(str(failure.value), "Could not connect to the database.")
+
+    def test_recovery(self) -> None:
+        """Test that event fetchers recover after a database outage."""
+        with self._outage():
+            # Kick off a bunch of event fetches but do not pump the reactor
+            event_deferreds = []
+            for event_id in self.event_ids:
+                event_deferreds.append(ensureDeferred(self.store.get_event(event_id)))
+
+            # We should have maxed out on event fetcher threads
+            self.assertEqual(self.store._event_fetch_ongoing, EVENT_QUEUE_THREADS)
+
+            # All the event fetchers will fail
+            self.pump()
+            self.assertEqual(self.store._event_fetch_ongoing, 0)
+
+            for event_deferred in event_deferreds:
+                failure = self.get_failure(event_deferred, Exception)
+                self.assertEqual(
+                    str(failure.value), "Could not connect to the database."
+                )
+
+        # This next event fetch should succeed
+        self.get_success(self.store.get_event(self.event_ids[0]))
-- 
cgit 1.5.1


From 1d8b80b3346b31a297668e093fb813d9ce7a1b48 Mon Sep 17 00:00:00 2001
From: reivilibre <oliverw@matrix.org>
Date: Fri, 26 Nov 2021 14:27:14 +0000
Subject: Support expiry of refresh tokens and expiry of the overall session
 when refresh tokens are in use. (#11425)

---
 changelog.d/11425.feature                          |   1 +
 synapse/config/registration.py                     |  24 ++--
 synapse/handlers/auth.py                           |  90 +++++++++++++--
 synapse/handlers/register.py                       |  44 ++++++--
 synapse/rest/client/login.py                       |  52 ++++++---
 synapse/storage/databases/main/registration.py     |  28 ++++-
 .../main/delta/65/10_expirable_refresh_tokens.sql  |  28 +++++
 tests/rest/client/test_auth.py                     | 125 ++++++++++++++++++++-
 8 files changed, 338 insertions(+), 54 deletions(-)
 create mode 100644 changelog.d/11425.feature
 create mode 100644 synapse/storage/schema/main/delta/65/10_expirable_refresh_tokens.sql

(limited to 'synapse/storage')

diff --git a/changelog.d/11425.feature b/changelog.d/11425.feature
new file mode 100644
index 0000000000..806dd5d91c
--- /dev/null
+++ b/changelog.d/11425.feature
@@ -0,0 +1 @@
+Support expiry of refresh tokens and expiry of the overall session when refresh tokens are in use.
\ No newline at end of file
diff --git a/synapse/config/registration.py b/synapse/config/registration.py
index 61e569d412..5e21548060 100644
--- a/synapse/config/registration.py
+++ b/synapse/config/registration.py
@@ -113,14 +113,11 @@ class RegistrationConfig(Config):
         self.session_lifetime = session_lifetime
 
         # The `refreshable_access_token_lifetime` applies for tokens that can be renewed
-        # using a refresh token, as per MSC2918. If it is `None`, the refresh
-        # token mechanism is disabled.
-        #
-        # Since it is incompatible with the `session_lifetime` mechanism, it is set to
-        # `None` by default if a `session_lifetime` is set.
+        # using a refresh token, as per MSC2918.
+        # If it is `None`, the refresh token mechanism is disabled.
         refreshable_access_token_lifetime = config.get(
             "refreshable_access_token_lifetime",
-            "5m" if session_lifetime is None else None,
+            "5m",
         )
         if refreshable_access_token_lifetime is not None:
             refreshable_access_token_lifetime = self.parse_duration(
@@ -128,17 +125,10 @@ class RegistrationConfig(Config):
             )
         self.refreshable_access_token_lifetime = refreshable_access_token_lifetime
 
-        if (
-            session_lifetime is not None
-            and refreshable_access_token_lifetime is not None
-        ):
-            raise ConfigError(
-                "The refresh token mechanism is incompatible with the "
-                "`session_lifetime` option. Consider disabling the "
-                "`session_lifetime` option or disabling the refresh token "
-                "mechanism by removing the `refreshable_access_token_lifetime` "
-                "option."
-            )
+        refresh_token_lifetime = config.get("refresh_token_lifetime")
+        if refresh_token_lifetime is not None:
+            refresh_token_lifetime = self.parse_duration(refresh_token_lifetime)
+        self.refresh_token_lifetime = refresh_token_lifetime
 
         # The fallback template used for authenticating using a registration token
         self.registration_token_template = self.read_template("registration_token.html")
diff --git a/synapse/handlers/auth.py b/synapse/handlers/auth.py
index 4b66a9862f..4d9c4e5834 100644
--- a/synapse/handlers/auth.py
+++ b/synapse/handlers/auth.py
@@ -18,6 +18,7 @@ import time
 import unicodedata
 import urllib.parse
 from binascii import crc32
+from http import HTTPStatus
 from typing import (
     TYPE_CHECKING,
     Any,
@@ -756,53 +757,109 @@ class AuthHandler:
     async def refresh_token(
         self,
         refresh_token: str,
-        valid_until_ms: Optional[int],
-    ) -> Tuple[str, str]:
+        access_token_valid_until_ms: Optional[int],
+        refresh_token_valid_until_ms: Optional[int],
+    ) -> Tuple[str, str, Optional[int]]:
         """
         Consumes a refresh token and generate both a new access token and a new refresh token from it.
 
         The consumed refresh token is considered invalid after the first use of the new access token or the new refresh token.
 
+        The lifetime of both the access token and refresh token will be capped so that they
+        do not exceed the session's ultimate expiry time, if applicable.
+
         Args:
             refresh_token: The token to consume.
-            valid_until_ms: The expiration timestamp of the new access token.
-
+            access_token_valid_until_ms: The expiration timestamp of the new access token.
+                None if the access token does not expire.
+            refresh_token_valid_until_ms: The expiration timestamp of the new refresh token.
+                None if the refresh token does not expire.
         Returns:
-            A tuple containing the new access token and refresh token
+            A tuple containing:
+              - the new access token
+              - the new refresh token
+              - the actual expiry time of the access token, which may be earlier than
+                `access_token_valid_until_ms`.
         """
 
         # Verify the token signature first before looking up the token
         if not self._verify_refresh_token(refresh_token):
-            raise SynapseError(401, "invalid refresh token", Codes.UNKNOWN_TOKEN)
+            raise SynapseError(
+                HTTPStatus.UNAUTHORIZED, "invalid refresh token", Codes.UNKNOWN_TOKEN
+            )
 
         existing_token = await self.store.lookup_refresh_token(refresh_token)
         if existing_token is None:
-            raise SynapseError(401, "refresh token does not exist", Codes.UNKNOWN_TOKEN)
+            raise SynapseError(
+                HTTPStatus.UNAUTHORIZED,
+                "refresh token does not exist",
+                Codes.UNKNOWN_TOKEN,
+            )
 
         if (
             existing_token.has_next_access_token_been_used
             or existing_token.has_next_refresh_token_been_refreshed
         ):
             raise SynapseError(
-                403, "refresh token isn't valid anymore", Codes.FORBIDDEN
+                HTTPStatus.FORBIDDEN,
+                "refresh token isn't valid anymore",
+                Codes.FORBIDDEN,
+            )
+
+        now_ms = self._clock.time_msec()
+
+        if existing_token.expiry_ts is not None and existing_token.expiry_ts < now_ms:
+
+            raise SynapseError(
+                HTTPStatus.FORBIDDEN,
+                "The supplied refresh token has expired",
+                Codes.FORBIDDEN,
             )
 
+        if existing_token.ultimate_session_expiry_ts is not None:
+            # This session has a bounded lifetime, even across refreshes.
+
+            if access_token_valid_until_ms is not None:
+                access_token_valid_until_ms = min(
+                    access_token_valid_until_ms,
+                    existing_token.ultimate_session_expiry_ts,
+                )
+            else:
+                access_token_valid_until_ms = existing_token.ultimate_session_expiry_ts
+
+            if refresh_token_valid_until_ms is not None:
+                refresh_token_valid_until_ms = min(
+                    refresh_token_valid_until_ms,
+                    existing_token.ultimate_session_expiry_ts,
+                )
+            else:
+                refresh_token_valid_until_ms = existing_token.ultimate_session_expiry_ts
+            if existing_token.ultimate_session_expiry_ts < now_ms:
+                raise SynapseError(
+                    HTTPStatus.FORBIDDEN,
+                    "The session has expired and can no longer be refreshed",
+                    Codes.FORBIDDEN,
+                )
+
         (
             new_refresh_token,
             new_refresh_token_id,
         ) = await self.create_refresh_token_for_user_id(
-            user_id=existing_token.user_id, device_id=existing_token.device_id
+            user_id=existing_token.user_id,
+            device_id=existing_token.device_id,
+            expiry_ts=refresh_token_valid_until_ms,
+            ultimate_session_expiry_ts=existing_token.ultimate_session_expiry_ts,
         )
         access_token = await self.create_access_token_for_user_id(
             user_id=existing_token.user_id,
             device_id=existing_token.device_id,
-            valid_until_ms=valid_until_ms,
+            valid_until_ms=access_token_valid_until_ms,
             refresh_token_id=new_refresh_token_id,
         )
         await self.store.replace_refresh_token(
             existing_token.token_id, new_refresh_token_id
         )
-        return access_token, new_refresh_token
+        return access_token, new_refresh_token, access_token_valid_until_ms
 
     def _verify_refresh_token(self, token: str) -> bool:
         """
@@ -836,6 +893,8 @@ class AuthHandler:
         self,
         user_id: str,
         device_id: str,
+        expiry_ts: Optional[int],
+        ultimate_session_expiry_ts: Optional[int],
     ) -> Tuple[str, int]:
         """
         Creates a new refresh token for the user with the given user ID.
@@ -843,6 +902,13 @@ class AuthHandler:
         Args:
             user_id: canonical user ID
             device_id: the device ID to associate with the token.
+            expiry_ts (milliseconds since the epoch): Time after which the
+                refresh token cannot be used.
+                If None, the refresh token never expires until it has been used.
+            ultimate_session_expiry_ts (milliseconds since the epoch):
+                Time at which the session will end and can not be extended any
+                further.
+                If None, the session can be refreshed indefinitely.
 
         Returns:
             The newly created refresh token and its ID in the database
@@ -852,6 +918,8 @@ class AuthHandler:
             user_id=user_id,
             token=refresh_token,
             device_id=device_id,
+            expiry_ts=expiry_ts,
+            ultimate_session_expiry_ts=ultimate_session_expiry_ts,
         )
         return refresh_token, refresh_token_id
 
diff --git a/synapse/handlers/register.py b/synapse/handlers/register.py
index 448a36108e..8136ae264d 100644
--- a/synapse/handlers/register.py
+++ b/synapse/handlers/register.py
@@ -119,6 +119,7 @@ class RegistrationHandler:
         self.refreshable_access_token_lifetime = (
             hs.config.registration.refreshable_access_token_lifetime
         )
+        self.refresh_token_lifetime = hs.config.registration.refresh_token_lifetime
 
         init_counters_for_auth_provider("")
 
@@ -793,13 +794,13 @@ class RegistrationHandler:
         class and RegisterDeviceReplicationServlet.
         """
         assert not self.hs.config.worker.worker_app
-        valid_until_ms = None
+        access_token_expiry = None
         if self.session_lifetime is not None:
             if is_guest:
                 raise Exception(
                     "session_lifetime is not currently implemented for guest access"
                 )
-            valid_until_ms = self.clock.time_msec() + self.session_lifetime
+            access_token_expiry = self.clock.time_msec() + self.session_lifetime
 
         refresh_token = None
         refresh_token_id = None
@@ -808,25 +809,52 @@ class RegistrationHandler:
             user_id, device_id, initial_display_name
         )
         if is_guest:
-            assert valid_until_ms is None
+            assert access_token_expiry is None
             access_token = self.macaroon_gen.generate_guest_access_token(user_id)
         else:
             if should_issue_refresh_token:
+                now_ms = self.clock.time_msec()
+
+                # Set the expiry time of the refreshable access token
+                access_token_expiry = now_ms + self.refreshable_access_token_lifetime
+
+                # Set the refresh token expiry time (if configured)
+                refresh_token_expiry = None
+                if self.refresh_token_lifetime is not None:
+                    refresh_token_expiry = now_ms + self.refresh_token_lifetime
+
+                # Set an ultimate session expiry time (if configured)
+                ultimate_session_expiry_ts = None
+                if self.session_lifetime is not None:
+                    ultimate_session_expiry_ts = now_ms + self.session_lifetime
+
+                    # Also ensure that the issued tokens don't outlive the
+                    # session.
+                    # (It would be weird to configure a homeserver with a shorter
+                    # session lifetime than token lifetime, but may as well handle
+                    # it.)
+                    access_token_expiry = min(
+                        access_token_expiry, ultimate_session_expiry_ts
+                    )
+                    if refresh_token_expiry is not None:
+                        refresh_token_expiry = min(
+                            refresh_token_expiry, ultimate_session_expiry_ts
+                        )
+
                 (
                     refresh_token,
                     refresh_token_id,
                 ) = await self._auth_handler.create_refresh_token_for_user_id(
                     user_id,
                     device_id=registered_device_id,
-                )
-                valid_until_ms = (
-                    self.clock.time_msec() + self.refreshable_access_token_lifetime
+                    expiry_ts=refresh_token_expiry,
+                    ultimate_session_expiry_ts=ultimate_session_expiry_ts,
                 )
 
             access_token = await self._auth_handler.create_access_token_for_user_id(
                 user_id,
                 device_id=registered_device_id,
-                valid_until_ms=valid_until_ms,
+                valid_until_ms=access_token_expiry,
                 is_appservice_ghost=is_appservice_ghost,
                 refresh_token_id=refresh_token_id,
             )
@@ -834,7 +862,7 @@ class RegistrationHandler:
         return {
             "device_id": registered_device_id,
             "access_token": access_token,
-            "valid_until_ms": valid_until_ms,
+            "valid_until_ms": access_token_expiry,
             "refresh_token": refresh_token,
         }
 
diff --git a/synapse/rest/client/login.py b/synapse/rest/client/login.py
index 67e03dca04..c982e54156 100644
--- a/synapse/rest/client/login.py
+++ b/synapse/rest/client/login.py
@@ -14,7 +14,17 @@
 
 import logging
 import re
-from typing import TYPE_CHECKING, Any, Awaitable, Callable, Dict, List, Optional, Tuple
+from typing import (
+    TYPE_CHECKING,
+    Any,
+    Awaitable,
+    Callable,
+    Dict,
+    List,
+    Optional,
+    Tuple,
+    Union,
+)
 
 from typing_extensions import TypedDict
 
@@ -458,6 +468,7 @@ class RefreshTokenServlet(RestServlet):
         self.refreshable_access_token_lifetime = (
             hs.config.registration.refreshable_access_token_lifetime
         )
+        self.refresh_token_lifetime = hs.config.registration.refresh_token_lifetime
 
     async def on_POST(self, request: SynapseRequest) -> Tuple[int, JsonDict]:
         refresh_submission = parse_json_object_from_request(request)
@@ -467,22 +478,33 @@ class RefreshTokenServlet(RestServlet):
         if not isinstance(token, str):
             raise SynapseError(400, "Invalid param: refresh_token", Codes.INVALID_PARAM)
 
-        valid_until_ms = (
-            self._clock.time_msec() + self.refreshable_access_token_lifetime
-        )
-        access_token, refresh_token = await self._auth_handler.refresh_token(
-            token, valid_until_ms
-        )
-        expires_in_ms = valid_until_ms - self._clock.time_msec()
-        return (
-            200,
-            {
-                "access_token": access_token,
-                "refresh_token": refresh_token,
-                "expires_in_ms": expires_in_ms,
-            },
+        now = self._clock.time_msec()
+        access_valid_until_ms = None
+        if self.refreshable_access_token_lifetime is not None:
+            access_valid_until_ms = now + self.refreshable_access_token_lifetime
+        refresh_valid_until_ms = None
+        if self.refresh_token_lifetime is not None:
+            refresh_valid_until_ms = now + self.refresh_token_lifetime
+
+        (
+            access_token,
+            refresh_token,
+            actual_access_token_expiry,
+        ) = await self._auth_handler.refresh_token(
+            token, access_valid_until_ms, refresh_valid_until_ms
         )
 
+        response: Dict[str, Union[str, int]] = {
+            "access_token": access_token,
+            "refresh_token": refresh_token,
+        }
+
+        # expires_in_ms is only present if the token expires
+        if actual_access_token_expiry is not None:
+            response["expires_in_ms"] = actual_access_token_expiry - now
+
+        return 200, response
+
 
 class SsoRedirectServlet(RestServlet):
     PATTERNS = list(client_patterns("/login/(cas|sso)/redirect$", v1=True)) + [
diff --git a/synapse/storage/databases/main/registration.py b/synapse/storage/databases/main/registration.py
index 0e8c168667..e1ddf06916 100644
--- a/synapse/storage/databases/main/registration.py
+++ b/synapse/storage/databases/main/registration.py
@@ -106,6 +106,15 @@ class RefreshTokenLookupResult:
     has_next_access_token_been_used: bool
     """True if the next access token was already used at least once."""
 
+    expiry_ts: Optional[int]
+    """The time at which the refresh token expires and can not be used.
+    If None, the refresh token doesn't expire."""
+
+    ultimate_session_expiry_ts: Optional[int]
+    """The time at which the session comes to an end and can no longer be
+    refreshed.
+    If None, the session can be refreshed indefinitely."""
+
 
 class RegistrationWorkerStore(CacheInvalidationWorkerStore):
     def __init__(
@@ -1626,8 +1635,10 @@ class RegistrationWorkerStore(CacheInvalidationWorkerStore):
                     rt.user_id,
                     rt.device_id,
                     rt.next_token_id,
-                    (nrt.next_token_id IS NOT NULL) has_next_refresh_token_been_refreshed,
-                    at.used has_next_access_token_been_used
+                    (nrt.next_token_id IS NOT NULL) AS has_next_refresh_token_been_refreshed,
+                    at.used AS has_next_access_token_been_used,
+                    rt.expiry_ts,
+                    rt.ultimate_session_expiry_ts
                 FROM refresh_tokens rt
                 LEFT JOIN refresh_tokens nrt ON rt.next_token_id = nrt.id
                 LEFT JOIN access_tokens at ON at.refresh_token_id = nrt.id
@@ -1648,6 +1659,8 @@ class RegistrationWorkerStore(CacheInvalidationWorkerStore):
                 has_next_refresh_token_been_refreshed=row[4],
                 # This column is nullable, ensure it's a boolean
                 has_next_access_token_been_used=(row[5] or False),
+                expiry_ts=row[6],
+                ultimate_session_expiry_ts=row[7],
             )
 
         return await self.db_pool.runInteraction(
@@ -1915,6 +1928,8 @@ class RegistrationStore(StatsStore, RegistrationBackgroundUpdateStore):
         user_id: str,
         token: str,
         device_id: Optional[str],
+        expiry_ts: Optional[int],
+        ultimate_session_expiry_ts: Optional[int],
     ) -> int:
         """Adds a refresh token for the given user.
 
@@ -1922,6 +1937,13 @@ class RegistrationStore(StatsStore, RegistrationBackgroundUpdateStore):
             user_id: The user ID.
             token: The new access token to add.
             device_id: ID of the device to associate with the refresh token.
+            expiry_ts (milliseconds since the epoch): Time after which the
+                refresh token cannot be used.
+                If None, the refresh token never expires until it has been used.
+            ultimate_session_expiry_ts (milliseconds since the epoch):
+                Time at which the session will end and can not be extended any
+                further.
+                If None, the session can be refreshed indefinitely.
         Raises:
             StoreError if there was a problem adding this.
         Returns:
@@ -1937,6 +1959,8 @@ class RegistrationStore(StatsStore, RegistrationBackgroundUpdateStore):
                 "device_id": device_id,
                 "token": token,
                 "next_token_id": None,
+                "expiry_ts": expiry_ts,
+                "ultimate_session_expiry_ts": ultimate_session_expiry_ts,
             },
             desc="add_refresh_token_to_user",
         )
diff --git a/synapse/storage/schema/main/delta/65/10_expirable_refresh_tokens.sql b/synapse/storage/schema/main/delta/65/10_expirable_refresh_tokens.sql
new file mode 100644
index 0000000000..bdc491c817
--- /dev/null
+++ b/synapse/storage/schema/main/delta/65/10_expirable_refresh_tokens.sql
@@ -0,0 +1,28 @@
+/* Copyright 2021 The Matrix.org Foundation C.I.C
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+ALTER TABLE refresh_tokens
+  -- We add an expiry_ts column (in milliseconds since the Epoch) to refresh tokens.
+  -- They may not be used after they have expired.
+  -- If null, then the refresh token's lifetime is unlimited.
+  ADD COLUMN expiry_ts BIGINT DEFAULT NULL;
+
+ALTER TABLE refresh_tokens
+  -- We also add an ultimate session expiry time (in milliseconds since the Epoch).
+  -- No matter how much the access and refresh tokens are refreshed, they cannot
+  -- be extended past this time.
+  -- If null, then the session length is unlimited.
+  ADD COLUMN ultimate_session_expiry_ts BIGINT DEFAULT NULL;
diff --git a/tests/rest/client/test_auth.py b/tests/rest/client/test_auth.py
index 8552671431..8045b7f76a 100644
--- a/tests/rest/client/test_auth.py
+++ b/tests/rest/client/test_auth.py
@@ -12,6 +12,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+from http import HTTPStatus
 from typing import Optional, Union
 
 from twisted.internet.defer import succeed
@@ -513,6 +514,16 @@ class RefreshAuthTests(unittest.HomeserverTestCase):
         self.user_pass = "pass"
         self.user = self.register_user("test", self.user_pass)
 
+    def use_refresh_token(self, refresh_token: str) -> FakeChannel:
+        """
+        Helper that makes a request to use a refresh token.
+        """
+        return self.make_request(
+            "POST",
+            "/_matrix/client/unstable/org.matrix.msc2918.refresh_token/refresh",
+            {"refresh_token": refresh_token},
+        )
+
     def test_login_issue_refresh_token(self):
         """
         A login response should include a refresh_token only if asked.
@@ -599,7 +610,7 @@ class RefreshAuthTests(unittest.HomeserverTestCase):
         )
 
     @override_config({"refreshable_access_token_lifetime": "1m"})
-    def test_refresh_token_expiration(self):
+    def test_refreshable_access_token_expiration(self):
         """
         The access token should have some time as specified in the config.
         """
@@ -623,6 +634,118 @@ class RefreshAuthTests(unittest.HomeserverTestCase):
         self.assertApproximates(
             refresh_response.json_body["expires_in_ms"], 60 * 1000, 100
         )
+        access_token = refresh_response.json_body["access_token"]
+
+        # Advance 59 seconds in the future (just shy of 1 minute, the time of expiry)
+        self.reactor.advance(59.0)
+        # Check that our token is valid
+        self.assertEqual(
+            self.make_request(
+                "GET", "/_matrix/client/v3/account/whoami", access_token=access_token
+            ).code,
+            HTTPStatus.OK,
+        )
+
+        # Advance 2 more seconds (just past the time of expiry)
+        self.reactor.advance(2.0)
+        # Check that our token is invalid
+        self.assertEqual(
+            self.make_request(
+                "GET", "/_matrix/client/v3/account/whoami", access_token=access_token
+            ).code,
+            HTTPStatus.UNAUTHORIZED,
+        )
+
+    @override_config(
+        {"refreshable_access_token_lifetime": "1m", "refresh_token_lifetime": "2m"}
+    )
+    def test_refresh_token_expiry(self):
+        """
+        The refresh token can be configured to have a limited lifetime.
+        When that lifetime has ended, the refresh token can no longer be used to
+        refresh the session.
+        """
+
+        body = {"type": "m.login.password", "user": "test", "password": self.user_pass}
+        login_response = self.make_request(
+            "POST",
+            "/_matrix/client/r0/login?org.matrix.msc2918.refresh_token=true",
+            body,
+        )
+        self.assertEqual(login_response.code, HTTPStatus.OK, login_response.result)
+        refresh_token1 = login_response.json_body["refresh_token"]
+
+        # Advance 119 seconds in the future (just shy of 2 minutes)
+        self.reactor.advance(119.0)
+
+        # Refresh our session. The refresh token should still JUST be valid right now.
+        # By doing so, we get a new access token and a new refresh token.
+        refresh_response = self.use_refresh_token(refresh_token1)
+        self.assertEqual(refresh_response.code, HTTPStatus.OK, refresh_response.result)
+        self.assertIn(
+            "refresh_token",
+            refresh_response.json_body,
+            "No new refresh token returned after refresh.",
+        )
+        refresh_token2 = refresh_response.json_body["refresh_token"]
+
+        # Advance 121 seconds in the future (just a bit more than 2 minutes)
+        self.reactor.advance(121.0)
+
+        # Try to refresh our session, but instead notice that the refresh token is
+        # not valid (it just expired).
+        refresh_response = self.use_refresh_token(refresh_token2)
+        self.assertEqual(
+            refresh_response.code, HTTPStatus.FORBIDDEN, refresh_response.result
+        )
+
+    @override_config(
+        {
+            "refreshable_access_token_lifetime": "2m",
+            "refresh_token_lifetime": "2m",
+            "session_lifetime": "3m",
+        }
+    )
+    def test_ultimate_session_expiry(self):
+        """
+        The session can be configured to have an ultimate, limited lifetime.
+        """
+
+        body = {"type": "m.login.password", "user": "test", "password": self.user_pass}
+        login_response = self.make_request(
+            "POST",
+            "/_matrix/client/r0/login?org.matrix.msc2918.refresh_token=true",
+            body,
+        )
+        self.assertEqual(login_response.code, 200, login_response.result)
+        refresh_token = login_response.json_body["refresh_token"]
+
+        # Advance shy of 2 minutes into the future
+        self.reactor.advance(119.0)
+
+        # Refresh our session. The refresh token should still be valid right now.
+        refresh_response = self.use_refresh_token(refresh_token)
+        self.assertEqual(refresh_response.code, 200, refresh_response.result)
+        self.assertIn(
+            "refresh_token",
+            refresh_response.json_body,
+            "No new refresh token returned after refresh.",
+        )
+        # Notice that our access token lifetime has been diminished to match the
+        # session lifetime.
+        # 3 minutes - 119 seconds = 61 seconds.
+        self.assertEqual(refresh_response.json_body["expires_in_ms"], 61_000)
+        refresh_token = refresh_response.json_body["refresh_token"]
+
+        # Advance 61 seconds into the future. Our session should have expired
+        # now, because we've had our 3 minutes.
+        self.reactor.advance(61.0)
+
+        # Try to issue a new, refreshed, access token.
+        # This should fail because the refresh token's lifetime has also been
+        # diminished as our session expired.
+        refresh_response = self.use_refresh_token(refresh_token)
+        self.assertEqual(refresh_response.code, 403, refresh_response.result)
 
     def test_refresh_token_invalidation(self):
         """Refresh tokens are invalidated after first use of the next token.
-- 
cgit 1.5.1


From ffd858aa68239aeaf06591d94c0ab1b3c185440f Mon Sep 17 00:00:00 2001
From: Sean Quah <8349537+squahtx@users.noreply.github.com>
Date: Fri, 26 Nov 2021 18:41:31 +0000
Subject: Add type hints to `synapse/storage/databases/main/events_worker.py`
 (#11411)

Also refactor the stream ID trackers/generators a bit and try to
document them better.
---
 changelog.d/11411.misc                             |   1 +
 mypy.ini                                           |   4 +-
 .../slave/storage/_slaved_id_tracker.py            |  22 +--
 synapse/replication/slave/storage/push_rule.py     |   4 -
 synapse/replication/tcp/streams/events.py          |   6 +-
 synapse/state/__init__.py                          |   2 +-
 synapse/state/v1.py                                |   3 +-
 synapse/storage/_base.py                           |   4 +-
 synapse/storage/databases/main/events.py           |  29 +--
 synapse/storage/databases/main/events_worker.py    | 218 ++++++++++++++-------
 synapse/storage/databases/main/push_rule.py        |  11 +-
 synapse/storage/util/id_generators.py              | 116 ++++++-----
 tests/replication/test_sharded_event_persister.py  |   6 +-
 13 files changed, 255 insertions(+), 171 deletions(-)
 create mode 100644 changelog.d/11411.misc

(limited to 'synapse/storage')

diff --git a/changelog.d/11411.misc b/changelog.d/11411.misc
new file mode 100644
index 0000000000..86594a332d
--- /dev/null
+++ b/changelog.d/11411.misc
@@ -0,0 +1 @@
+Add type hints to storage classes.
diff --git a/mypy.ini b/mypy.ini
index bc4f59154d..eb3976e74c 100644
--- a/mypy.ini
+++ b/mypy.ini
@@ -33,7 +33,6 @@ exclude = (?x)
    |synapse/storage/databases/main/event_federation.py
    |synapse/storage/databases/main/event_push_actions.py
    |synapse/storage/databases/main/events_bg_updates.py
-   |synapse/storage/databases/main/events_worker.py
    |synapse/storage/databases/main/group_server.py
    |synapse/storage/databases/main/metrics.py
    |synapse/storage/databases/main/monthly_active_users.py
@@ -184,6 +183,9 @@ disallow_untyped_defs = True
 [mypy-synapse.storage.databases.main.directory]
 disallow_untyped_defs = True
 
+[mypy-synapse.storage.databases.main.events_worker]
+disallow_untyped_defs = True
+
 [mypy-synapse.storage.databases.main.room_batch]
 disallow_untyped_defs = True
 
diff --git a/synapse/replication/slave/storage/_slaved_id_tracker.py b/synapse/replication/slave/storage/_slaved_id_tracker.py
index 8c1bf9227a..fa132d10b4 100644
--- a/synapse/replication/slave/storage/_slaved_id_tracker.py
+++ b/synapse/replication/slave/storage/_slaved_id_tracker.py
@@ -14,10 +14,18 @@
 from typing import List, Optional, Tuple
 
 from synapse.storage.database import LoggingDatabaseConnection
-from synapse.storage.util.id_generators import _load_current_id
+from synapse.storage.util.id_generators import AbstractStreamIdTracker, _load_current_id
 
 
-class SlavedIdTracker:
+class SlavedIdTracker(AbstractStreamIdTracker):
+    """Tracks the "current" stream ID of a stream with a single writer.
+
+    See `AbstractStreamIdTracker` for more details.
+
+    Note that this class does not work correctly when there are multiple
+    writers.
+    """
+
     def __init__(
         self,
         db_conn: LoggingDatabaseConnection,
@@ -36,17 +44,7 @@ class SlavedIdTracker:
         self._current = (max if self.step > 0 else min)(self._current, new_id)
 
     def get_current_token(self) -> int:
-        """
-
-        Returns:
-            int
-        """
         return self._current
 
     def get_current_token_for_writer(self, instance_name: str) -> int:
-        """Returns the position of the given writer.
-
-        For streams with single writers this is equivalent to
-        `get_current_token`.
-        """
         return self.get_current_token()
diff --git a/synapse/replication/slave/storage/push_rule.py b/synapse/replication/slave/storage/push_rule.py
index 4d5f862862..7541e21de9 100644
--- a/synapse/replication/slave/storage/push_rule.py
+++ b/synapse/replication/slave/storage/push_rule.py
@@ -13,7 +13,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from synapse.replication.slave.storage._slaved_id_tracker import SlavedIdTracker
 from synapse.replication.tcp.streams import PushRulesStream
 from synapse.storage.databases.main.push_rule import PushRulesWorkerStore
 
@@ -25,9 +24,6 @@ class SlavedPushRuleStore(SlavedEventStore, PushRulesWorkerStore):
         return self._push_rules_stream_id_gen.get_current_token()
 
     def process_replication_rows(self, stream_name, instance_name, token, rows):
-        # We assert this for the benefit of mypy
-        assert isinstance(self._push_rules_stream_id_gen, SlavedIdTracker)
-
         if stream_name == PushRulesStream.NAME:
             self._push_rules_stream_id_gen.advance(instance_name, token)
             for row in rows:
diff --git a/synapse/replication/tcp/streams/events.py b/synapse/replication/tcp/streams/events.py
index a030e9299e..a390cfcb74 100644
--- a/synapse/replication/tcp/streams/events.py
+++ b/synapse/replication/tcp/streams/events.py
@@ -14,7 +14,7 @@
 # limitations under the License.
 import heapq
 from collections.abc import Iterable
-from typing import TYPE_CHECKING, List, Optional, Tuple, Type
+from typing import TYPE_CHECKING, Optional, Tuple, Type
 
 import attr
 
@@ -157,7 +157,7 @@ class EventsStream(Stream):
 
         # now we fetch up to that many rows from the events table
 
-        event_rows: List[Tuple] = await self._store.get_all_new_forward_event_rows(
+        event_rows = await self._store.get_all_new_forward_event_rows(
             instance_name, from_token, current_token, target_row_count
         )
 
@@ -191,7 +191,7 @@ class EventsStream(Stream):
         # finally, fetch the ex-outliers rows. We assume there are few enough of these
         # not to bother with the limit.
 
-        ex_outliers_rows: List[Tuple] = await self._store.get_ex_outlier_stream_rows(
+        ex_outliers_rows = await self._store.get_ex_outlier_stream_rows(
             instance_name, from_token, upper_limit
         )
 
diff --git a/synapse/state/__init__.py b/synapse/state/__init__.py
index 1605411b00..446204dbe5 100644
--- a/synapse/state/__init__.py
+++ b/synapse/state/__init__.py
@@ -764,7 +764,7 @@ class StateResolutionStore:
     store: "DataStore"
 
     def get_events(
-        self, event_ids: Iterable[str], allow_rejected: bool = False
+        self, event_ids: Collection[str], allow_rejected: bool = False
     ) -> Awaitable[Dict[str, EventBase]]:
         """Get events from the database
 
diff --git a/synapse/state/v1.py b/synapse/state/v1.py
index 6edadea550..499a328201 100644
--- a/synapse/state/v1.py
+++ b/synapse/state/v1.py
@@ -17,6 +17,7 @@ import logging
 from typing import (
     Awaitable,
     Callable,
+    Collection,
     Dict,
     Iterable,
     List,
@@ -44,7 +45,7 @@ async def resolve_events_with_store(
     room_version: RoomVersion,
     state_sets: Sequence[StateMap[str]],
     event_map: Optional[Dict[str, EventBase]],
-    state_map_factory: Callable[[Iterable[str]], Awaitable[Dict[str, EventBase]]],
+    state_map_factory: Callable[[Collection[str]], Awaitable[Dict[str, EventBase]]],
 ) -> StateMap[str]:
     """
     Args:
diff --git a/synapse/storage/_base.py b/synapse/storage/_base.py
index 0623da9aa1..3056e64ff5 100644
--- a/synapse/storage/_base.py
+++ b/synapse/storage/_base.py
@@ -21,7 +21,7 @@ from synapse.storage.database import LoggingTransaction  # noqa: F401
 from synapse.storage.database import make_in_list_sql_clause  # noqa: F401
 from synapse.storage.database import DatabasePool
 from synapse.storage.types import Connection
-from synapse.types import StreamToken, get_domain_from_id
+from synapse.types import get_domain_from_id
 from synapse.util import json_decoder
 
 if TYPE_CHECKING:
@@ -48,7 +48,7 @@ class SQLBaseStore(metaclass=ABCMeta):
         self,
         stream_name: str,
         instance_name: str,
-        token: StreamToken,
+        token: int,
         rows: Iterable[Any],
     ) -> None:
         pass
diff --git a/synapse/storage/databases/main/events.py b/synapse/storage/databases/main/events.py
index 06832221ad..c3440de2cb 100644
--- a/synapse/storage/databases/main/events.py
+++ b/synapse/storage/databases/main/events.py
@@ -15,7 +15,7 @@
 # limitations under the License.
 import itertools
 import logging
-from collections import OrderedDict, namedtuple
+from collections import OrderedDict
 from typing import (
     TYPE_CHECKING,
     Any,
@@ -41,9 +41,10 @@ from synapse.events.snapshot import EventContext  # noqa: F401
 from synapse.logging.utils import log_function
 from synapse.storage._base import db_to_json, make_in_list_sql_clause
 from synapse.storage.database import DatabasePool, LoggingTransaction
+from synapse.storage.databases.main.events_worker import EventCacheEntry
 from synapse.storage.databases.main.search import SearchEntry
 from synapse.storage.types import Connection
-from synapse.storage.util.id_generators import MultiWriterIdGenerator
+from synapse.storage.util.id_generators import AbstractStreamIdGenerator
 from synapse.storage.util.sequence import SequenceGenerator
 from synapse.types import StateMap, get_domain_from_id
 from synapse.util import json_encoder
@@ -64,9 +65,6 @@ event_counter = Counter(
 )
 
 
-_EventCacheEntry = namedtuple("_EventCacheEntry", ("event", "redacted_event"))
-
-
 @attr.s(slots=True)
 class DeltaState:
     """Deltas to use to update the `current_state_events` table.
@@ -108,16 +106,21 @@ class PersistEventsStore:
         self._ephemeral_messages_enabled = hs.config.server.enable_ephemeral_messages
         self.is_mine_id = hs.is_mine_id
 
-        # Ideally we'd move these ID gens here, unfortunately some other ID
-        # generators are chained off them so doing so is a bit of a PITA.
-        self._backfill_id_gen: MultiWriterIdGenerator = self.store._backfill_id_gen
-        self._stream_id_gen: MultiWriterIdGenerator = self.store._stream_id_gen
-
         # This should only exist on instances that are configured to write
         assert (
             hs.get_instance_name() in hs.config.worker.writers.events
         ), "Can only instantiate EventsStore on master"
 
+        # Since we have been configured to write, we ought to have id generators,
+        # rather than id trackers.
+        assert isinstance(self.store._backfill_id_gen, AbstractStreamIdGenerator)
+        assert isinstance(self.store._stream_id_gen, AbstractStreamIdGenerator)
+
+        # Ideally we'd move these ID gens here, unfortunately some other ID
+        # generators are chained off them so doing so is a bit of a PITA.
+        self._backfill_id_gen: AbstractStreamIdGenerator = self.store._backfill_id_gen
+        self._stream_id_gen: AbstractStreamIdGenerator = self.store._stream_id_gen
+
     async def _persist_events_and_state_updates(
         self,
         events_and_contexts: List[Tuple[EventBase, EventContext]],
@@ -1553,11 +1556,13 @@ class PersistEventsStore:
         for row in rows:
             event = ev_map[row["event_id"]]
             if not row["rejects"] and not row["redacts"]:
-                to_prefill.append(_EventCacheEntry(event=event, redacted_event=None))
+                to_prefill.append(EventCacheEntry(event=event, redacted_event=None))
 
         def prefill():
             for cache_entry in to_prefill:
-                self.store._get_event_cache.set((cache_entry[0].event_id,), cache_entry)
+                self.store._get_event_cache.set(
+                    (cache_entry.event.event_id,), cache_entry
+                )
 
         txn.call_after(prefill)
 
diff --git a/synapse/storage/databases/main/events_worker.py b/synapse/storage/databases/main/events_worker.py
index c6bcfe1c32..4cefc0a07e 100644
--- a/synapse/storage/databases/main/events_worker.py
+++ b/synapse/storage/databases/main/events_worker.py
@@ -15,14 +15,18 @@
 import logging
 import threading
 from typing import (
+    TYPE_CHECKING,
+    Any,
     Collection,
     Container,
     Dict,
     Iterable,
     List,
+    NoReturn,
     Optional,
     Set,
     Tuple,
+    cast,
     overload,
 )
 
@@ -38,6 +42,7 @@ from synapse.api.errors import NotFoundError, SynapseError
 from synapse.api.room_versions import (
     KNOWN_ROOM_VERSIONS,
     EventFormatVersions,
+    RoomVersion,
     RoomVersions,
 )
 from synapse.events import EventBase, make_event_from_dict
@@ -56,10 +61,18 @@ from synapse.replication.slave.storage._slaved_id_tracker import SlavedIdTracker
 from synapse.replication.tcp.streams import BackfillStream
 from synapse.replication.tcp.streams.events import EventsStream
 from synapse.storage._base import SQLBaseStore, db_to_json, make_in_list_sql_clause
-from synapse.storage.database import DatabasePool, LoggingTransaction
+from synapse.storage.database import (
+    DatabasePool,
+    LoggingDatabaseConnection,
+    LoggingTransaction,
+)
 from synapse.storage.engines import PostgresEngine
-from synapse.storage.types import Connection
-from synapse.storage.util.id_generators import MultiWriterIdGenerator, StreamIdGenerator
+from synapse.storage.types import Cursor
+from synapse.storage.util.id_generators import (
+    AbstractStreamIdTracker,
+    MultiWriterIdGenerator,
+    StreamIdGenerator,
+)
 from synapse.storage.util.sequence import build_sequence_generator
 from synapse.types import JsonDict, get_domain_from_id
 from synapse.util import unwrapFirstError
@@ -69,6 +82,9 @@ from synapse.util.caches.lrucache import LruCache
 from synapse.util.iterutils import batch_iter
 from synapse.util.metrics import Measure
 
+if TYPE_CHECKING:
+    from synapse.server import HomeServer
+
 logger = logging.getLogger(__name__)
 
 
@@ -89,7 +105,7 @@ event_fetch_ongoing_gauge = Gauge(
 
 
 @attr.s(slots=True, auto_attribs=True)
-class _EventCacheEntry:
+class EventCacheEntry:
     event: EventBase
     redacted_event: Optional[EventBase]
 
@@ -129,7 +145,7 @@ class _EventRow:
     json: str
     internal_metadata: str
     format_version: Optional[int]
-    room_version_id: Optional[int]
+    room_version_id: Optional[str]
     rejected_reason: Optional[str]
     redactions: List[str]
     outlier: bool
@@ -153,9 +169,16 @@ class EventsWorkerStore(SQLBaseStore):
     # options controlling this.
     USE_DEDICATED_DB_THREADS_FOR_EVENT_FETCHING = True
 
-    def __init__(self, database: DatabasePool, db_conn, hs):
+    def __init__(
+        self,
+        database: DatabasePool,
+        db_conn: LoggingDatabaseConnection,
+        hs: "HomeServer",
+    ):
         super().__init__(database, db_conn, hs)
 
+        self._stream_id_gen: AbstractStreamIdTracker
+        self._backfill_id_gen: AbstractStreamIdTracker
         if isinstance(database.engine, PostgresEngine):
             # If we're using Postgres than we can use `MultiWriterIdGenerator`
             # regardless of whether this process writes to the streams or not.
@@ -214,7 +237,7 @@ class EventsWorkerStore(SQLBaseStore):
                 5 * 60 * 1000,
             )
 
-        self._get_event_cache = LruCache(
+        self._get_event_cache: LruCache[Tuple[str], EventCacheEntry] = LruCache(
             cache_name="*getEvent*",
             max_size=hs.config.caches.event_cache_size,
         )
@@ -223,19 +246,21 @@ class EventsWorkerStore(SQLBaseStore):
         # ID to cache entry. Note that the returned dict may not have the
         # requested event in it if the event isn't in the DB.
         self._current_event_fetches: Dict[
-            str, ObservableDeferred[Dict[str, _EventCacheEntry]]
+            str, ObservableDeferred[Dict[str, EventCacheEntry]]
         ] = {}
 
         self._event_fetch_lock = threading.Condition()
-        self._event_fetch_list = []
+        self._event_fetch_list: List[
+            Tuple[Iterable[str], "defer.Deferred[Dict[str, _EventRow]]"]
+        ] = []
         self._event_fetch_ongoing = 0
         event_fetch_ongoing_gauge.set(self._event_fetch_ongoing)
 
         # We define this sequence here so that it can be referenced from both
         # the DataStore and PersistEventStore.
-        def get_chain_id_txn(txn):
+        def get_chain_id_txn(txn: Cursor) -> int:
             txn.execute("SELECT COALESCE(max(chain_id), 0) FROM event_auth_chains")
-            return txn.fetchone()[0]
+            return cast(Tuple[int], txn.fetchone())[0]
 
         self.event_chain_id_gen = build_sequence_generator(
             db_conn,
@@ -246,7 +271,13 @@ class EventsWorkerStore(SQLBaseStore):
             id_column="chain_id",
         )
 
-    def process_replication_rows(self, stream_name, instance_name, token, rows):
+    def process_replication_rows(
+        self,
+        stream_name: str,
+        instance_name: str,
+        token: int,
+        rows: Iterable[Any],
+    ) -> None:
         if stream_name == EventsStream.NAME:
             self._stream_id_gen.advance(instance_name, token)
         elif stream_name == BackfillStream.NAME:
@@ -280,10 +311,10 @@ class EventsWorkerStore(SQLBaseStore):
         self,
         event_id: str,
         redact_behaviour: EventRedactBehaviour = EventRedactBehaviour.REDACT,
-        get_prev_content: bool = False,
-        allow_rejected: bool = False,
-        allow_none: Literal[False] = False,
-        check_room_id: Optional[str] = None,
+        get_prev_content: bool = ...,
+        allow_rejected: bool = ...,
+        allow_none: Literal[False] = ...,
+        check_room_id: Optional[str] = ...,
     ) -> EventBase:
         ...
 
@@ -292,10 +323,10 @@ class EventsWorkerStore(SQLBaseStore):
         self,
         event_id: str,
         redact_behaviour: EventRedactBehaviour = EventRedactBehaviour.REDACT,
-        get_prev_content: bool = False,
-        allow_rejected: bool = False,
-        allow_none: Literal[True] = False,
-        check_room_id: Optional[str] = None,
+        get_prev_content: bool = ...,
+        allow_rejected: bool = ...,
+        allow_none: Literal[True] = ...,
+        check_room_id: Optional[str] = ...,
     ) -> Optional[EventBase]:
         ...
 
@@ -357,7 +388,7 @@ class EventsWorkerStore(SQLBaseStore):
 
     async def get_events(
         self,
-        event_ids: Iterable[str],
+        event_ids: Collection[str],
         redact_behaviour: EventRedactBehaviour = EventRedactBehaviour.REDACT,
         get_prev_content: bool = False,
         allow_rejected: bool = False,
@@ -544,7 +575,7 @@ class EventsWorkerStore(SQLBaseStore):
 
     async def _get_events_from_cache_or_db(
         self, event_ids: Iterable[str], allow_rejected: bool = False
-    ) -> Dict[str, _EventCacheEntry]:
+    ) -> Dict[str, EventCacheEntry]:
         """Fetch a bunch of events from the cache or the database.
 
         If events are pulled from the database, they will be cached for future lookups.
@@ -578,7 +609,7 @@ class EventsWorkerStore(SQLBaseStore):
         # same dict into itself N times).
         already_fetching_ids: Set[str] = set()
         already_fetching_deferreds: Set[
-            ObservableDeferred[Dict[str, _EventCacheEntry]]
+            ObservableDeferred[Dict[str, EventCacheEntry]]
         ] = set()
 
         for event_id in missing_events_ids:
@@ -601,7 +632,7 @@ class EventsWorkerStore(SQLBaseStore):
             # function returning more events than requested, but that can happen
             # already due to `_get_events_from_db`).
             fetching_deferred: ObservableDeferred[
-                Dict[str, _EventCacheEntry]
+                Dict[str, EventCacheEntry]
             ] = ObservableDeferred(defer.Deferred(), consumeErrors=True)
             for event_id in missing_events_ids:
                 self._current_event_fetches[event_id] = fetching_deferred
@@ -658,12 +689,12 @@ class EventsWorkerStore(SQLBaseStore):
 
         return event_entry_map
 
-    def _invalidate_get_event_cache(self, event_id):
+    def _invalidate_get_event_cache(self, event_id: str) -> None:
         self._get_event_cache.invalidate((event_id,))
 
     def _get_events_from_cache(
         self, events: Iterable[str], update_metrics: bool = True
-    ) -> Dict[str, _EventCacheEntry]:
+    ) -> Dict[str, EventCacheEntry]:
         """Fetch events from the caches.
 
         May return rejected events.
@@ -820,7 +851,7 @@ class EventsWorkerStore(SQLBaseStore):
                     for _, deferred in event_fetches_to_fail:
                         deferred.errback(exc)
 
-    def _fetch_loop(self, conn: Connection) -> None:
+    def _fetch_loop(self, conn: LoggingDatabaseConnection) -> None:
         """Takes a database connection and waits for requests for events from
         the _event_fetch_list queue.
         """
@@ -850,7 +881,9 @@ class EventsWorkerStore(SQLBaseStore):
             self._fetch_event_list(conn, event_list)
 
     def _fetch_event_list(
-        self, conn: Connection, event_list: List[Tuple[List[str], defer.Deferred]]
+        self,
+        conn: LoggingDatabaseConnection,
+        event_list: List[Tuple[Iterable[str], "defer.Deferred[Dict[str, _EventRow]]"]],
     ) -> None:
         """Handle a load of requests from the _event_fetch_list queue
 
@@ -877,7 +910,7 @@ class EventsWorkerStore(SQLBaseStore):
                 )
 
                 # We only want to resolve deferreds from the main thread
-                def fire():
+                def fire() -> None:
                     for _, d in event_list:
                         d.callback(row_dict)
 
@@ -887,16 +920,16 @@ class EventsWorkerStore(SQLBaseStore):
                 logger.exception("do_fetch")
 
                 # We only want to resolve deferreds from the main thread
-                def fire(evs, exc):
-                    for _, d in evs:
+                def fire_errback(exc: Exception) -> None:
+                    for _, d in event_list:
                         d.errback(exc)
 
                 with PreserveLoggingContext():
-                    self.hs.get_reactor().callFromThread(fire, event_list, e)
+                    self.hs.get_reactor().callFromThread(fire_errback, e)
 
     async def _get_events_from_db(
-        self, event_ids: Iterable[str]
-    ) -> Dict[str, _EventCacheEntry]:
+        self, event_ids: Collection[str]
+    ) -> Dict[str, EventCacheEntry]:
         """Fetch a bunch of events from the database.
 
         May return rejected events.
@@ -912,29 +945,29 @@ class EventsWorkerStore(SQLBaseStore):
             map from event id to result. May return extra events which
             weren't asked for.
         """
-        fetched_events = {}
+        fetched_event_ids: Set[str] = set()
+        fetched_events: Dict[str, _EventRow] = {}
         events_to_fetch = event_ids
 
         while events_to_fetch:
             row_map = await self._enqueue_events(events_to_fetch)
 
             # we need to recursively fetch any redactions of those events
-            redaction_ids = set()
+            redaction_ids: Set[str] = set()
             for event_id in events_to_fetch:
                 row = row_map.get(event_id)
-                fetched_events[event_id] = row
+                fetched_event_ids.add(event_id)
                 if row:
+                    fetched_events[event_id] = row
                     redaction_ids.update(row.redactions)
 
-            events_to_fetch = redaction_ids.difference(fetched_events.keys())
+            events_to_fetch = redaction_ids.difference(fetched_event_ids)
             if events_to_fetch:
                 logger.debug("Also fetching redaction events %s", events_to_fetch)
 
         # build a map from event_id to EventBase
-        event_map = {}
+        event_map: Dict[str, EventBase] = {}
         for event_id, row in fetched_events.items():
-            if not row:
-                continue
             assert row.event_id == event_id
 
             rejected_reason = row.rejected_reason
@@ -962,6 +995,7 @@ class EventsWorkerStore(SQLBaseStore):
 
             room_version_id = row.room_version_id
 
+            room_version: Optional[RoomVersion]
             if not room_version_id:
                 # this should only happen for out-of-band membership events which
                 # arrived before #6983 landed. For all other events, we should have
@@ -1032,14 +1066,14 @@ class EventsWorkerStore(SQLBaseStore):
 
         # finally, we can decide whether each one needs redacting, and build
         # the cache entries.
-        result_map = {}
+        result_map: Dict[str, EventCacheEntry] = {}
         for event_id, original_ev in event_map.items():
             redactions = fetched_events[event_id].redactions
             redacted_event = self._maybe_redact_event_row(
                 original_ev, redactions, event_map
             )
 
-            cache_entry = _EventCacheEntry(
+            cache_entry = EventCacheEntry(
                 event=original_ev, redacted_event=redacted_event
             )
 
@@ -1048,7 +1082,7 @@ class EventsWorkerStore(SQLBaseStore):
 
         return result_map
 
-    async def _enqueue_events(self, events: Iterable[str]) -> Dict[str, _EventRow]:
+    async def _enqueue_events(self, events: Collection[str]) -> Dict[str, _EventRow]:
         """Fetches events from the database using the _event_fetch_list. This
         allows batch and bulk fetching of events - it allows us to fetch events
         without having to create a new transaction for each request for events.
@@ -1061,7 +1095,7 @@ class EventsWorkerStore(SQLBaseStore):
             that weren't requested.
         """
 
-        events_d = defer.Deferred()
+        events_d: "defer.Deferred[Dict[str, _EventRow]]" = defer.Deferred()
         with self._event_fetch_lock:
             self._event_fetch_list.append((events, events_d))
             self._event_fetch_lock.notify()
@@ -1216,7 +1250,7 @@ class EventsWorkerStore(SQLBaseStore):
         # no valid redaction found for this event
         return None
 
-    async def have_events_in_timeline(self, event_ids):
+    async def have_events_in_timeline(self, event_ids: Iterable[str]) -> Set[str]:
         """Given a list of event ids, check if we have already processed and
         stored them as non outliers.
         """
@@ -1245,7 +1279,7 @@ class EventsWorkerStore(SQLBaseStore):
             event_ids: events we are looking for
 
         Returns:
-            set[str]: The events we have already seen.
+            The set of events we have already seen.
         """
         res = await self._have_seen_events_dict(
             (room_id, event_id) for event_id in event_ids
@@ -1268,7 +1302,9 @@ class EventsWorkerStore(SQLBaseStore):
         }
         results = {x: True for x in cache_results}
 
-        def have_seen_events_txn(txn, chunk: Tuple[Tuple[str, str], ...]):
+        def have_seen_events_txn(
+            txn: LoggingTransaction, chunk: Tuple[Tuple[str, str], ...]
+        ) -> None:
             # we deliberately do *not* query the database for room_id, to make the
             # query an index-only lookup on `events_event_id_key`.
             #
@@ -1294,12 +1330,14 @@ class EventsWorkerStore(SQLBaseStore):
         return results
 
     @cached(max_entries=100000, tree=True)
-    async def have_seen_event(self, room_id: str, event_id: str):
+    async def have_seen_event(self, room_id: str, event_id: str) -> NoReturn:
         # this only exists for the benefit of the @cachedList descriptor on
         # _have_seen_events_dict
         raise NotImplementedError()
 
-    def _get_current_state_event_counts_txn(self, txn, room_id):
+    def _get_current_state_event_counts_txn(
+        self, txn: LoggingTransaction, room_id: str
+    ) -> int:
         """
         See get_current_state_event_counts.
         """
@@ -1324,7 +1362,7 @@ class EventsWorkerStore(SQLBaseStore):
             room_id,
         )
 
-    async def get_room_complexity(self, room_id):
+    async def get_room_complexity(self, room_id: str) -> Dict[str, float]:
         """
         Get a rough approximation of the complexity of the room. This is used by
         remote servers to decide whether they wish to join the room or not.
@@ -1332,10 +1370,10 @@ class EventsWorkerStore(SQLBaseStore):
         more resources.
 
         Args:
-            room_id (str)
+            room_id: The room ID to query.
 
         Returns:
-            dict[str:int] of complexity version to complexity.
+            dict[str:float] of complexity version to complexity.
         """
         state_events = await self.get_current_state_event_counts(room_id)
 
@@ -1345,13 +1383,13 @@ class EventsWorkerStore(SQLBaseStore):
 
         return {"v1": complexity_v1}
 
-    def get_current_events_token(self):
+    def get_current_events_token(self) -> int:
         """The current maximum token that events have reached"""
         return self._stream_id_gen.get_current_token()
 
     async def get_all_new_forward_event_rows(
         self, instance_name: str, last_id: int, current_id: int, limit: int
-    ) -> List[Tuple]:
+    ) -> List[Tuple[int, str, str, str, str, str, str, str, str]]:
         """Returns new events, for the Events replication stream
 
         Args:
@@ -1365,7 +1403,9 @@ class EventsWorkerStore(SQLBaseStore):
             EventsStreamRow.
         """
 
-        def get_all_new_forward_event_rows(txn):
+        def get_all_new_forward_event_rows(
+            txn: LoggingTransaction,
+        ) -> List[Tuple[int, str, str, str, str, str, str, str, str]]:
             sql = (
                 "SELECT e.stream_ordering, e.event_id, e.room_id, e.type,"
                 " state_key, redacts, relates_to_id, membership, rejections.reason IS NOT NULL"
@@ -1381,7 +1421,9 @@ class EventsWorkerStore(SQLBaseStore):
                 " LIMIT ?"
             )
             txn.execute(sql, (last_id, current_id, instance_name, limit))
-            return txn.fetchall()
+            return cast(
+                List[Tuple[int, str, str, str, str, str, str, str, str]], txn.fetchall()
+            )
 
         return await self.db_pool.runInteraction(
             "get_all_new_forward_event_rows", get_all_new_forward_event_rows
@@ -1389,7 +1431,7 @@ class EventsWorkerStore(SQLBaseStore):
 
     async def get_ex_outlier_stream_rows(
         self, instance_name: str, last_id: int, current_id: int
-    ) -> List[Tuple]:
+    ) -> List[Tuple[int, str, str, str, str, str, str, str, str]]:
         """Returns de-outliered events, for the Events replication stream
 
         Args:
@@ -1402,7 +1444,9 @@ class EventsWorkerStore(SQLBaseStore):
             EventsStreamRow.
         """
 
-        def get_ex_outlier_stream_rows_txn(txn):
+        def get_ex_outlier_stream_rows_txn(
+            txn: LoggingTransaction,
+        ) -> List[Tuple[int, str, str, str, str, str, str, str, str]]:
             sql = (
                 "SELECT event_stream_ordering, e.event_id, e.room_id, e.type,"
                 " state_key, redacts, relates_to_id, membership, rejections.reason IS NOT NULL"
@@ -1420,7 +1464,9 @@ class EventsWorkerStore(SQLBaseStore):
             )
 
             txn.execute(sql, (last_id, current_id, instance_name))
-            return txn.fetchall()
+            return cast(
+                List[Tuple[int, str, str, str, str, str, str, str, str]], txn.fetchall()
+            )
 
         return await self.db_pool.runInteraction(
             "get_ex_outlier_stream_rows", get_ex_outlier_stream_rows_txn
@@ -1428,7 +1474,7 @@ class EventsWorkerStore(SQLBaseStore):
 
     async def get_all_new_backfill_event_rows(
         self, instance_name: str, last_id: int, current_id: int, limit: int
-    ) -> Tuple[List[Tuple[int, list]], int, bool]:
+    ) -> Tuple[List[Tuple[int, Tuple[str, str, str, str, str, str]]], int, bool]:
         """Get updates for backfill replication stream, including all new
         backfilled events and events that have gone from being outliers to not.
 
@@ -1456,7 +1502,9 @@ class EventsWorkerStore(SQLBaseStore):
         if last_id == current_id:
             return [], current_id, False
 
-        def get_all_new_backfill_event_rows(txn):
+        def get_all_new_backfill_event_rows(
+            txn: LoggingTransaction,
+        ) -> Tuple[List[Tuple[int, Tuple[str, str, str, str, str, str]]], int, bool]:
             sql = (
                 "SELECT -e.stream_ordering, e.event_id, e.room_id, e.type,"
                 " state_key, redacts, relates_to_id"
@@ -1470,7 +1518,15 @@ class EventsWorkerStore(SQLBaseStore):
                 " LIMIT ?"
             )
             txn.execute(sql, (-last_id, -current_id, instance_name, limit))
-            new_event_updates = [(row[0], row[1:]) for row in txn]
+            new_event_updates: List[
+                Tuple[int, Tuple[str, str, str, str, str, str]]
+            ] = []
+            row: Tuple[int, str, str, str, str, str, str]
+            # Type safety: iterating over `txn` yields `Tuple`, i.e.
+            # `Tuple[Any, ...]` of arbitrary length. Mypy detects assigning a
+            # variadic tuple to a fixed length tuple and flags it up as an error.
+            for row in txn:  # type: ignore[assignment]
+                new_event_updates.append((row[0], row[1:]))
 
             limited = False
             if len(new_event_updates) == limit:
@@ -1493,7 +1549,11 @@ class EventsWorkerStore(SQLBaseStore):
                 " ORDER BY event_stream_ordering DESC"
             )
             txn.execute(sql, (-last_id, -upper_bound, instance_name))
-            new_event_updates.extend((row[0], row[1:]) for row in txn)
+            # Type safety: iterating over `txn` yields `Tuple`, i.e.
+            # `Tuple[Any, ...]` of arbitrary length. Mypy detects assigning a
+            # variadic tuple to a fixed length tuple and flags it up as an error.
+            for row in txn:  # type: ignore[assignment]
+                new_event_updates.append((row[0], row[1:]))
 
             if len(new_event_updates) >= limit:
                 upper_bound = new_event_updates[-1][0]
@@ -1507,7 +1567,7 @@ class EventsWorkerStore(SQLBaseStore):
 
     async def get_all_updated_current_state_deltas(
         self, instance_name: str, from_token: int, to_token: int, target_row_count: int
-    ) -> Tuple[List[Tuple], int, bool]:
+    ) -> Tuple[List[Tuple[int, str, str, str, str]], int, bool]:
         """Fetch updates from current_state_delta_stream
 
         Args:
@@ -1527,7 +1587,9 @@ class EventsWorkerStore(SQLBaseStore):
                * `limited` is whether there are more updates to fetch.
         """
 
-        def get_all_updated_current_state_deltas_txn(txn):
+        def get_all_updated_current_state_deltas_txn(
+            txn: LoggingTransaction,
+        ) -> List[Tuple[int, str, str, str, str]]:
             sql = """
                 SELECT stream_id, room_id, type, state_key, event_id
                 FROM current_state_delta_stream
@@ -1536,21 +1598,23 @@ class EventsWorkerStore(SQLBaseStore):
                 ORDER BY stream_id ASC LIMIT ?
             """
             txn.execute(sql, (from_token, to_token, instance_name, target_row_count))
-            return txn.fetchall()
+            return cast(List[Tuple[int, str, str, str, str]], txn.fetchall())
 
-        def get_deltas_for_stream_id_txn(txn, stream_id):
+        def get_deltas_for_stream_id_txn(
+            txn: LoggingTransaction, stream_id: int
+        ) -> List[Tuple[int, str, str, str, str]]:
             sql = """
                 SELECT stream_id, room_id, type, state_key, event_id
                 FROM current_state_delta_stream
                 WHERE stream_id = ?
             """
             txn.execute(sql, [stream_id])
-            return txn.fetchall()
+            return cast(List[Tuple[int, str, str, str, str]], txn.fetchall())
 
         # we need to make sure that, for every stream id in the results, we get *all*
         # the rows with that stream id.
 
-        rows: List[Tuple] = await self.db_pool.runInteraction(
+        rows: List[Tuple[int, str, str, str, str]] = await self.db_pool.runInteraction(
             "get_all_updated_current_state_deltas",
             get_all_updated_current_state_deltas_txn,
         )
@@ -1579,14 +1643,14 @@ class EventsWorkerStore(SQLBaseStore):
 
         return rows, to_token, True
 
-    async def is_event_after(self, event_id1, event_id2):
+    async def is_event_after(self, event_id1: str, event_id2: str) -> bool:
         """Returns True if event_id1 is after event_id2 in the stream"""
         to_1, so_1 = await self.get_event_ordering(event_id1)
         to_2, so_2 = await self.get_event_ordering(event_id2)
         return (to_1, so_1) > (to_2, so_2)
 
     @cached(max_entries=5000)
-    async def get_event_ordering(self, event_id):
+    async def get_event_ordering(self, event_id: str) -> Tuple[int, int]:
         res = await self.db_pool.simple_select_one(
             table="events",
             retcols=["topological_ordering", "stream_ordering"],
@@ -1609,7 +1673,9 @@ class EventsWorkerStore(SQLBaseStore):
             None otherwise.
         """
 
-        def get_next_event_to_expire_txn(txn):
+        def get_next_event_to_expire_txn(
+            txn: LoggingTransaction,
+        ) -> Optional[Tuple[str, int]]:
             txn.execute(
                 """
                 SELECT event_id, expiry_ts FROM event_expiry
@@ -1617,7 +1683,7 @@ class EventsWorkerStore(SQLBaseStore):
                 """
             )
 
-            return txn.fetchone()
+            return cast(Optional[Tuple[str, int]], txn.fetchone())
 
         return await self.db_pool.runInteraction(
             desc="get_next_event_to_expire", func=get_next_event_to_expire_txn
@@ -1681,10 +1747,10 @@ class EventsWorkerStore(SQLBaseStore):
         return mapping
 
     @wrap_as_background_process("_cleanup_old_transaction_ids")
-    async def _cleanup_old_transaction_ids(self):
+    async def _cleanup_old_transaction_ids(self) -> None:
         """Cleans out transaction id mappings older than 24hrs."""
 
-        def _cleanup_old_transaction_ids_txn(txn):
+        def _cleanup_old_transaction_ids_txn(txn: LoggingTransaction) -> None:
             sql = """
                 DELETE FROM event_txn_id
                 WHERE inserted_ts < ?
diff --git a/synapse/storage/databases/main/push_rule.py b/synapse/storage/databases/main/push_rule.py
index fa782023d4..3b63267395 100644
--- a/synapse/storage/databases/main/push_rule.py
+++ b/synapse/storage/databases/main/push_rule.py
@@ -28,7 +28,10 @@ from synapse.storage.databases.main.receipts import ReceiptsWorkerStore
 from synapse.storage.databases.main.roommember import RoomMemberWorkerStore
 from synapse.storage.engines import PostgresEngine, Sqlite3Engine
 from synapse.storage.push_rule import InconsistentRuleException, RuleNotFoundException
-from synapse.storage.util.id_generators import StreamIdGenerator
+from synapse.storage.util.id_generators import (
+    AbstractStreamIdTracker,
+    StreamIdGenerator,
+)
 from synapse.util import json_encoder
 from synapse.util.caches.descriptors import cached, cachedList
 from synapse.util.caches.stream_change_cache import StreamChangeCache
@@ -82,9 +85,9 @@ class PushRulesWorkerStore(
         super().__init__(database, db_conn, hs)
 
         if hs.config.worker.worker_app is None:
-            self._push_rules_stream_id_gen: Union[
-                StreamIdGenerator, SlavedIdTracker
-            ] = StreamIdGenerator(db_conn, "push_rules_stream", "stream_id")
+            self._push_rules_stream_id_gen: AbstractStreamIdTracker = StreamIdGenerator(
+                db_conn, "push_rules_stream", "stream_id"
+            )
         else:
             self._push_rules_stream_id_gen = SlavedIdTracker(
                 db_conn, "push_rules_stream", "stream_id"
diff --git a/synapse/storage/util/id_generators.py b/synapse/storage/util/id_generators.py
index ac56bc9a05..4ff3013908 100644
--- a/synapse/storage/util/id_generators.py
+++ b/synapse/storage/util/id_generators.py
@@ -89,31 +89,77 @@ def _load_current_id(
     return (max if step > 0 else min)(current_id, step)
 
 
-class AbstractStreamIdGenerator(metaclass=abc.ABCMeta):
-    @abc.abstractmethod
-    def get_next(self) -> AsyncContextManager[int]:
-        raise NotImplementedError()
+class AbstractStreamIdTracker(metaclass=abc.ABCMeta):
+    """Tracks the "current" stream ID of a stream that may have multiple writers.
+
+    Stream IDs are monotonically increasing or decreasing integers representing write
+    transactions. The "current" stream ID is the stream ID such that all transactions
+    with equal or smaller stream IDs have completed. Since transactions may complete out
+    of order, this is not the same as the stream ID of the last completed transaction.
+
+    Completed transactions include both committed transactions and transactions that
+    have been rolled back.
+    """
 
     @abc.abstractmethod
-    def get_next_mult(self, n: int) -> AsyncContextManager[Sequence[int]]:
+    def advance(self, instance_name: str, new_id: int) -> None:
+        """Advance the position of the named writer to the given ID, if greater
+        than existing entry.
+        """
         raise NotImplementedError()
 
     @abc.abstractmethod
     def get_current_token(self) -> int:
+        """Returns the maximum stream id such that all stream ids less than or
+        equal to it have been successfully persisted.
+
+        Returns:
+            The maximum stream id.
+        """
         raise NotImplementedError()
 
     @abc.abstractmethod
     def get_current_token_for_writer(self, instance_name: str) -> int:
+        """Returns the position of the given writer.
+
+        For streams with single writers this is equivalent to `get_current_token`.
+        """
+        raise NotImplementedError()
+
+
+class AbstractStreamIdGenerator(AbstractStreamIdTracker):
+    """Generates stream IDs for a stream that may have multiple writers.
+
+    Each stream ID represents a write transaction, whose completion is tracked
+    so that the "current" stream ID of the stream can be determined.
+
+    See `AbstractStreamIdTracker` for more details.
+    """
+
+    @abc.abstractmethod
+    def get_next(self) -> AsyncContextManager[int]:
+        """
+        Usage:
+            async with stream_id_gen.get_next() as stream_id:
+                # ... persist event ...
+        """
+        raise NotImplementedError()
+
+    @abc.abstractmethod
+    def get_next_mult(self, n: int) -> AsyncContextManager[Sequence[int]]:
+        """
+        Usage:
+            async with stream_id_gen.get_next(n) as stream_ids:
+                # ... persist events ...
+        """
         raise NotImplementedError()
 
 
 class StreamIdGenerator(AbstractStreamIdGenerator):
-    """Used to generate new stream ids when persisting events while keeping
-    track of which transactions have been completed.
+    """Generates and tracks stream IDs for a stream with a single writer.
 
-    This allows us to get the "current" stream id, i.e. the stream id such that
-    all ids less than or equal to it have completed. This handles the fact that
-    persistence of events can complete out of order.
+    This class must only be used when the current Synapse process is the sole
+    writer for a stream.
 
     Args:
         db_conn(connection):  A database connection to use to fetch the
@@ -157,12 +203,12 @@ class StreamIdGenerator(AbstractStreamIdGenerator):
         # The key and values are the same, but we never look at the values.
         self._unfinished_ids: OrderedDict[int, int] = OrderedDict()
 
+    def advance(self, instance_name: str, new_id: int) -> None:
+        # `StreamIdGenerator` should only be used when there is a single writer,
+        # so replication should never happen.
+        raise Exception("Replication is not supported by StreamIdGenerator")
+
     def get_next(self) -> AsyncContextManager[int]:
-        """
-        Usage:
-            async with stream_id_gen.get_next() as stream_id:
-                # ... persist event ...
-        """
         with self._lock:
             self._current += self._step
             next_id = self._current
@@ -180,11 +226,6 @@ class StreamIdGenerator(AbstractStreamIdGenerator):
         return _AsyncCtxManagerWrapper(manager())
 
     def get_next_mult(self, n: int) -> AsyncContextManager[Sequence[int]]:
-        """
-        Usage:
-            async with stream_id_gen.get_next(n) as stream_ids:
-                # ... persist events ...
-        """
         with self._lock:
             next_ids = range(
                 self._current + self._step,
@@ -208,12 +249,6 @@ class StreamIdGenerator(AbstractStreamIdGenerator):
         return _AsyncCtxManagerWrapper(manager())
 
     def get_current_token(self) -> int:
-        """Returns the maximum stream id such that all stream ids less than or
-        equal to it have been successfully persisted.
-
-        Returns:
-            The maximum stream id.
-        """
         with self._lock:
             if self._unfinished_ids:
                 return next(iter(self._unfinished_ids)) - self._step
@@ -221,16 +256,11 @@ class StreamIdGenerator(AbstractStreamIdGenerator):
             return self._current
 
     def get_current_token_for_writer(self, instance_name: str) -> int:
-        """Returns the position of the given writer.
-
-        For streams with single writers this is equivalent to
-        `get_current_token`.
-        """
         return self.get_current_token()
 
 
 class MultiWriterIdGenerator(AbstractStreamIdGenerator):
-    """An ID generator that tracks a stream that can have multiple writers.
+    """Generates and tracks stream IDs for a stream with multiple writers.
 
     Uses a Postgres sequence to coordinate ID assignment, but positions of other
     writers will only get updated when `advance` is called (by replication).
@@ -475,12 +505,6 @@ class MultiWriterIdGenerator(AbstractStreamIdGenerator):
         return stream_ids
 
     def get_next(self) -> AsyncContextManager[int]:
-        """
-        Usage:
-            async with stream_id_gen.get_next() as stream_id:
-                # ... persist event ...
-        """
-
         # If we have a list of instances that are allowed to write to this
         # stream, make sure we're in it.
         if self._writers and self._instance_name not in self._writers:
@@ -492,12 +516,6 @@ class MultiWriterIdGenerator(AbstractStreamIdGenerator):
         return cast(AsyncContextManager[int], _MultiWriterCtxManager(self))
 
     def get_next_mult(self, n: int) -> AsyncContextManager[List[int]]:
-        """
-        Usage:
-            async with stream_id_gen.get_next_mult(5) as stream_ids:
-                # ... persist events ...
-        """
-
         # If we have a list of instances that are allowed to write to this
         # stream, make sure we're in it.
         if self._writers and self._instance_name not in self._writers:
@@ -597,15 +615,9 @@ class MultiWriterIdGenerator(AbstractStreamIdGenerator):
             self._add_persisted_position(next_id)
 
     def get_current_token(self) -> int:
-        """Returns the maximum stream id such that all stream ids less than or
-        equal to it have been successfully persisted.
-        """
-
         return self.get_persisted_upto_position()
 
     def get_current_token_for_writer(self, instance_name: str) -> int:
-        """Returns the position of the given writer."""
-
         # If we don't have an entry for the given instance name, we assume it's a
         # new writer.
         #
@@ -631,10 +643,6 @@ class MultiWriterIdGenerator(AbstractStreamIdGenerator):
             }
 
     def advance(self, instance_name: str, new_id: int) -> None:
-        """Advance the position of the named writer to the given ID, if greater
-        than existing entry.
-        """
-
         new_id *= self._return_factor
 
         with self._lock:
diff --git a/tests/replication/test_sharded_event_persister.py b/tests/replication/test_sharded_event_persister.py
index 0a6e4795ee..596ba5a0c9 100644
--- a/tests/replication/test_sharded_event_persister.py
+++ b/tests/replication/test_sharded_event_persister.py
@@ -17,6 +17,7 @@ from unittest.mock import patch
 from synapse.api.room_versions import RoomVersion
 from synapse.rest import admin
 from synapse.rest.client import login, room, sync
+from synapse.storage.util.id_generators import MultiWriterIdGenerator
 
 from tests.replication._base import BaseMultiWorkerStreamTestCase
 from tests.server import make_request
@@ -193,7 +194,10 @@ class EventPersisterShardTestCase(BaseMultiWorkerStreamTestCase):
         #
         # Worker2's event stream position will not advance until we call
         # __aexit__ again.
-        actx = worker_hs2.get_datastore()._stream_id_gen.get_next()
+        worker_store2 = worker_hs2.get_datastore()
+        assert isinstance(worker_store2._stream_id_gen, MultiWriterIdGenerator)
+
+        actx = worker_store2._stream_id_gen.get_next()
         self.get_success(actx.__aenter__())
 
         response = self.helper.send(room_id1, body="Hi!", tok=self.other_access_token)
-- 
cgit 1.5.1


From d08ef6f155971717d2c6dbd78c89312afd4d84fa Mon Sep 17 00:00:00 2001
From: Erik Johnston <erik@matrix.org>
Date: Mon, 29 Nov 2021 16:57:06 +0000
Subject: Make background updates controllable via a plugin (#11306)

Co-authored-by: Brendan Abolivier <babolivier@matrix.org>
---
 changelog.d/11306.feature                          |   1 +
 .../background_update_controller_callbacks.md      |  71 ++++++++
 docs/modules/writing_a_module.md                   |  12 +-
 setup.py                                           |   4 +-
 synapse/module_api/__init__.py                     |  54 +++++-
 synapse/storage/background_updates.py              | 192 ++++++++++++++++++---
 tests/push/test_email.py                           |   9 +-
 tests/rest/admin/test_background_updates.py        |   2 +-
 tests/storage/test_background_update.py            | 104 +++++++++--
 tests/storage/test_event_chain.py                  |   4 +-
 tests/storage/test_user_directory.py               |   5 +-
 tests/unittest.py                                  |  10 +-
 12 files changed, 407 insertions(+), 61 deletions(-)
 create mode 100644 changelog.d/11306.feature
 create mode 100644 docs/modules/background_update_controller_callbacks.md

(limited to 'synapse/storage')

diff --git a/changelog.d/11306.feature b/changelog.d/11306.feature
new file mode 100644
index 0000000000..aba3292015
--- /dev/null
+++ b/changelog.d/11306.feature
@@ -0,0 +1 @@
+Add plugin support for controlling database background updates.
diff --git a/docs/modules/background_update_controller_callbacks.md b/docs/modules/background_update_controller_callbacks.md
new file mode 100644
index 0000000000..b3e7c259f4
--- /dev/null
+++ b/docs/modules/background_update_controller_callbacks.md
@@ -0,0 +1,71 @@
+# Background update controller callbacks
+
+Background update controller callbacks allow module developers to control (e.g. rate-limit)
+how database background updates are run. A database background update is an operation
+Synapse runs on its database in the background after it starts. It's usually used to run
+database operations that would take too long if they were run at the same time as schema
+updates (which are run on startup) and delay Synapse's startup too much: populating a
+table with a big amount of data, adding an index on a big table, deleting superfluous data,
+etc.
+
+Background update controller callbacks can be registered using the module API's
+`register_background_update_controller_callbacks` method. Only the first module (in order
+of appearance in Synapse's configuration file) calling this method can register background
+update controller callbacks, subsequent calls are ignored.
+
+The available background update controller callbacks are:
+
+### `on_update`
+
+_First introduced in Synapse v1.49.0_
+
+```python
+def on_update(update_name: str, database_name: str, one_shot: bool) -> AsyncContextManager[int]
+```
+
+Called when about to do an iteration of a background update. The module is given the name
+of the update, the name of the database, and a flag to indicate whether the background
+update will happen in one go and may take a long time (e.g. creating indices). If this last
+argument is set to `False`, the update will be run in batches.
+
+The module must return an async context manager. It will be entered before Synapse runs a 
+background update; this should return the desired duration of the iteration, in
+milliseconds.
+
+The context manager will be exited when the iteration completes. Note that the duration
+returned by the context manager is a target, and an iteration may take substantially longer
+or shorter. If the `one_shot` flag is set to `True`, the duration returned is ignored.
+
+__Note__: Unlike most module callbacks in Synapse, this one is _synchronous_. This is
+because asynchronous operations are expected to be run by the async context manager.
+
+This callback is required when registering any other background update controller callback.
+
+### `default_batch_size`
+
+_First introduced in Synapse v1.49.0_
+
+```python
+async def default_batch_size(update_name: str, database_name: str) -> int
+```
+
+Called before the first iteration of a background update, with the name of the update and
+of the database. The module must return the number of elements to process in this first
+iteration.
+
+If this callback is not defined, Synapse will use a default value of 100.
+
+### `min_batch_size`
+
+_First introduced in Synapse v1.49.0_
+
+```python
+async def min_batch_size(update_name: str, database_name: str) -> int
+```
+
+Called before running a new batch for a background update, with the name of the update and
+of the database. The module must return an integer representing the minimum number of
+elements to process in this iteration. This number must be at least 1, and is used to
+ensure that progress is always made.
+
+If this callback is not defined, Synapse will use a default value of 100.
diff --git a/docs/modules/writing_a_module.md b/docs/modules/writing_a_module.md
index 7764e06692..e7c0ffad58 100644
--- a/docs/modules/writing_a_module.md
+++ b/docs/modules/writing_a_module.md
@@ -71,15 +71,15 @@ Modules **must** register their web resources in their `__init__` method.
 ## Registering a callback
 
 Modules can use Synapse's module API to register callbacks. Callbacks are functions that
-Synapse will call when performing specific actions. Callbacks must be asynchronous, and
-are split in categories. A single module may implement callbacks from multiple categories,
-and is under no obligation to implement all callbacks from the categories it registers
-callbacks for.
+Synapse will call when performing specific actions. Callbacks must be asynchronous (unless
+specified otherwise), and are split in categories. A single module may implement callbacks
+from multiple categories, and is under no obligation to implement all callbacks from the
+categories it registers callbacks for.
 
 Modules can register callbacks using one of the module API's `register_[...]_callbacks`
 methods. The callback functions are passed to these methods as keyword arguments, with
-the callback name as the argument name and the function as its value. This is demonstrated
-in the example below. A `register_[...]_callbacks` method exists for each category.
+the callback name as the argument name and the function as its value. A
+`register_[...]_callbacks` method exists for each category.
 
 Callbacks for each category can be found on their respective page of the
 [Synapse documentation website](https://matrix-org.github.io/synapse).
\ No newline at end of file
diff --git a/setup.py b/setup.py
index 0ce8beb004..ad99b3bd2c 100755
--- a/setup.py
+++ b/setup.py
@@ -119,7 +119,9 @@ CONDITIONAL_REQUIREMENTS["mypy"] = [
 # Tests assume that all optional dependencies are installed.
 #
 # parameterized_class decorator was introduced in parameterized 0.7.0
-CONDITIONAL_REQUIREMENTS["test"] = ["parameterized>=0.7.0"]
+#
+# We use `mock` library as that backports `AsyncMock` to Python 3.6
+CONDITIONAL_REQUIREMENTS["test"] = ["parameterized>=0.7.0", "mock>=4.0.0"]
 
 CONDITIONAL_REQUIREMENTS["dev"] = (
     CONDITIONAL_REQUIREMENTS["lint"]
diff --git a/synapse/module_api/__init__.py b/synapse/module_api/__init__.py
index 19e570ede2..a8154168be 100644
--- a/synapse/module_api/__init__.py
+++ b/synapse/module_api/__init__.py
@@ -82,10 +82,19 @@ from synapse.http.server import (
 )
 from synapse.http.servlet import parse_json_object_from_request
 from synapse.http.site import SynapseRequest
-from synapse.logging.context import make_deferred_yieldable, run_in_background
+from synapse.logging.context import (
+    defer_to_thread,
+    make_deferred_yieldable,
+    run_in_background,
+)
 from synapse.metrics.background_process_metrics import run_as_background_process
 from synapse.rest.client.login import LoginResponse
 from synapse.storage import DataStore
+from synapse.storage.background_updates import (
+    DEFAULT_BATCH_SIZE_CALLBACK,
+    MIN_BATCH_SIZE_CALLBACK,
+    ON_UPDATE_CALLBACK,
+)
 from synapse.storage.database import DatabasePool, LoggingTransaction
 from synapse.storage.databases.main.roommember import ProfileInfo
 from synapse.storage.state import StateFilter
@@ -311,6 +320,24 @@ class ModuleApi:
             auth_checkers=auth_checkers,
         )
 
+    def register_background_update_controller_callbacks(
+        self,
+        on_update: ON_UPDATE_CALLBACK,
+        default_batch_size: Optional[DEFAULT_BATCH_SIZE_CALLBACK] = None,
+        min_batch_size: Optional[MIN_BATCH_SIZE_CALLBACK] = None,
+    ) -> None:
+        """Registers background update controller callbacks.
+
+        Added in Synapse v1.49.0.
+        """
+
+        for db in self._hs.get_datastores().databases:
+            db.updates.register_update_controller_callbacks(
+                on_update=on_update,
+                default_batch_size=default_batch_size,
+                min_batch_size=min_batch_size,
+            )
+
     def register_web_resource(self, path: str, resource: Resource) -> None:
         """Registers a web resource to be served at the given path.
 
@@ -995,6 +1022,11 @@ class ModuleApi:
                 f,
             )
 
+    async def sleep(self, seconds: float) -> None:
+        """Sleeps for the given number of seconds."""
+
+        await self._clock.sleep(seconds)
+
     async def send_mail(
         self,
         recipient: str,
@@ -1149,6 +1181,26 @@ class ModuleApi:
 
         return {key: state_events[event_id] for key, event_id in state_ids.items()}
 
+    async def defer_to_thread(
+        self,
+        f: Callable[..., T],
+        *args: Any,
+        **kwargs: Any,
+    ) -> T:
+        """Runs the given function in a separate thread from Synapse's thread pool.
+
+        Added in Synapse v1.49.0.
+
+        Args:
+            f: The function to run.
+            args: The function's arguments.
+            kwargs: The function's keyword arguments.
+
+        Returns:
+            The return value of the function once ran in a thread.
+        """
+        return await defer_to_thread(self._hs.get_reactor(), f, *args, **kwargs)
+
 
 class PublicRoomListManager:
     """Contains methods for adding to, removing from and querying whether a room
diff --git a/synapse/storage/background_updates.py b/synapse/storage/background_updates.py
index bc8364400d..d64910aded 100644
--- a/synapse/storage/background_updates.py
+++ b/synapse/storage/background_updates.py
@@ -12,12 +12,22 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import logging
-from typing import TYPE_CHECKING, Awaitable, Callable, Dict, Iterable, Optional
+from typing import (
+    TYPE_CHECKING,
+    AsyncContextManager,
+    Awaitable,
+    Callable,
+    Dict,
+    Iterable,
+    Optional,
+)
+
+import attr
 
 from synapse.metrics.background_process_metrics import run_as_background_process
 from synapse.storage.types import Connection
 from synapse.types import JsonDict
-from synapse.util import json_encoder
+from synapse.util import Clock, json_encoder
 
 from . import engines
 
@@ -28,6 +38,45 @@ if TYPE_CHECKING:
 logger = logging.getLogger(__name__)
 
 
+ON_UPDATE_CALLBACK = Callable[[str, str, bool], AsyncContextManager[int]]
+DEFAULT_BATCH_SIZE_CALLBACK = Callable[[str, str], Awaitable[int]]
+MIN_BATCH_SIZE_CALLBACK = Callable[[str, str], Awaitable[int]]
+
+
+@attr.s(slots=True, frozen=True, auto_attribs=True)
+class _BackgroundUpdateHandler:
+    """A handler for a given background update.
+
+    Attributes:
+        callback: The function to call to make progress on the background
+            update.
+        oneshot: Wether the update is likely to happen all in one go, ignoring
+            the supplied target duration, e.g. index creation. This is used by
+            the update controller to help correctly schedule the update.
+    """
+
+    callback: Callable[[JsonDict, int], Awaitable[int]]
+    oneshot: bool = False
+
+
+class _BackgroundUpdateContextManager:
+    BACKGROUND_UPDATE_INTERVAL_MS = 1000
+    BACKGROUND_UPDATE_DURATION_MS = 100
+
+    def __init__(self, sleep: bool, clock: Clock):
+        self._sleep = sleep
+        self._clock = clock
+
+    async def __aenter__(self) -> int:
+        if self._sleep:
+            await self._clock.sleep(self.BACKGROUND_UPDATE_INTERVAL_MS / 1000)
+
+        return self.BACKGROUND_UPDATE_DURATION_MS
+
+    async def __aexit__(self, *exc) -> None:
+        pass
+
+
 class BackgroundUpdatePerformance:
     """Tracks the how long a background update is taking to update its items"""
 
@@ -84,20 +133,22 @@ class BackgroundUpdater:
 
     MINIMUM_BACKGROUND_BATCH_SIZE = 1
     DEFAULT_BACKGROUND_BATCH_SIZE = 100
-    BACKGROUND_UPDATE_INTERVAL_MS = 1000
-    BACKGROUND_UPDATE_DURATION_MS = 100
 
     def __init__(self, hs: "HomeServer", database: "DatabasePool"):
         self._clock = hs.get_clock()
         self.db_pool = database
 
+        self._database_name = database.name()
+
         # if a background update is currently running, its name.
         self._current_background_update: Optional[str] = None
 
+        self._on_update_callback: Optional[ON_UPDATE_CALLBACK] = None
+        self._default_batch_size_callback: Optional[DEFAULT_BATCH_SIZE_CALLBACK] = None
+        self._min_batch_size_callback: Optional[MIN_BATCH_SIZE_CALLBACK] = None
+
         self._background_update_performance: Dict[str, BackgroundUpdatePerformance] = {}
-        self._background_update_handlers: Dict[
-            str, Callable[[JsonDict, int], Awaitable[int]]
-        ] = {}
+        self._background_update_handlers: Dict[str, _BackgroundUpdateHandler] = {}
         self._all_done = False
 
         # Whether we're currently running updates
@@ -107,6 +158,83 @@ class BackgroundUpdater:
         # enable/disable background updates via the admin API.
         self.enabled = True
 
+    def register_update_controller_callbacks(
+        self,
+        on_update: ON_UPDATE_CALLBACK,
+        default_batch_size: Optional[DEFAULT_BATCH_SIZE_CALLBACK] = None,
+        min_batch_size: Optional[DEFAULT_BATCH_SIZE_CALLBACK] = None,
+    ) -> None:
+        """Register callbacks from a module for each hook."""
+        if self._on_update_callback is not None:
+            logger.warning(
+                "More than one module tried to register callbacks for controlling"
+                " background updates. Only the callbacks registered by the first module"
+                " (in order of appearance in Synapse's configuration file) that tried to"
+                " do so will be called."
+            )
+
+            return
+
+        self._on_update_callback = on_update
+
+        if default_batch_size is not None:
+            self._default_batch_size_callback = default_batch_size
+
+        if min_batch_size is not None:
+            self._min_batch_size_callback = min_batch_size
+
+    def _get_context_manager_for_update(
+        self,
+        sleep: bool,
+        update_name: str,
+        database_name: str,
+        oneshot: bool,
+    ) -> AsyncContextManager[int]:
+        """Get a context manager to run a background update with.
+
+        If a module has registered a `update_handler` callback, use the context manager
+        it returns.
+
+        Otherwise, returns a context manager that will return a default value, optionally
+        sleeping if needed.
+
+        Args:
+            sleep: Whether we can sleep between updates.
+            update_name: The name of the update.
+            database_name: The name of the database the update is being run on.
+            oneshot: Whether the update will complete all in one go, e.g. index creation.
+                In such cases the returned target duration is ignored.
+
+        Returns:
+            The target duration in milliseconds that the background update should run for.
+
+            Note: this is a *target*, and an iteration may take substantially longer or
+            shorter.
+        """
+        if self._on_update_callback is not None:
+            return self._on_update_callback(update_name, database_name, oneshot)
+
+        return _BackgroundUpdateContextManager(sleep, self._clock)
+
+    async def _default_batch_size(self, update_name: str, database_name: str) -> int:
+        """The batch size to use for the first iteration of a new background
+        update.
+        """
+        if self._default_batch_size_callback is not None:
+            return await self._default_batch_size_callback(update_name, database_name)
+
+        return self.DEFAULT_BACKGROUND_BATCH_SIZE
+
+    async def _min_batch_size(self, update_name: str, database_name: str) -> int:
+        """A lower bound on the batch size of a new background update.
+
+        Used to ensure that progress is always made. Must be greater than 0.
+        """
+        if self._min_batch_size_callback is not None:
+            return await self._min_batch_size_callback(update_name, database_name)
+
+        return self.MINIMUM_BACKGROUND_BATCH_SIZE
+
     def get_current_update(self) -> Optional[BackgroundUpdatePerformance]:
         """Returns the current background update, if any."""
 
@@ -135,13 +263,8 @@ class BackgroundUpdater:
         try:
             logger.info("Starting background schema updates")
             while self.enabled:
-                if sleep:
-                    await self._clock.sleep(self.BACKGROUND_UPDATE_INTERVAL_MS / 1000.0)
-
                 try:
-                    result = await self.do_next_background_update(
-                        self.BACKGROUND_UPDATE_DURATION_MS
-                    )
+                    result = await self.do_next_background_update(sleep)
                 except Exception:
                     logger.exception("Error doing update")
                 else:
@@ -203,13 +326,15 @@ class BackgroundUpdater:
 
         return not update_exists
 
-    async def do_next_background_update(self, desired_duration_ms: float) -> bool:
+    async def do_next_background_update(self, sleep: bool = True) -> bool:
         """Does some amount of work on the next queued background update
 
         Returns once some amount of work is done.
 
         Args:
-            desired_duration_ms: How long we want to spend updating.
+            sleep: Whether to limit how quickly we run background updates or
+                not.
+
         Returns:
             True if we have finished running all the background updates, otherwise False
         """
@@ -252,7 +377,19 @@ class BackgroundUpdater:
 
             self._current_background_update = upd["update_name"]
 
-        await self._do_background_update(desired_duration_ms)
+        # We have a background update to run, otherwise we would have returned
+        # early.
+        assert self._current_background_update is not None
+        update_info = self._background_update_handlers[self._current_background_update]
+
+        async with self._get_context_manager_for_update(
+            sleep=sleep,
+            update_name=self._current_background_update,
+            database_name=self._database_name,
+            oneshot=update_info.oneshot,
+        ) as desired_duration_ms:
+            await self._do_background_update(desired_duration_ms)
+
         return False
 
     async def _do_background_update(self, desired_duration_ms: float) -> int:
@@ -260,7 +397,7 @@ class BackgroundUpdater:
         update_name = self._current_background_update
         logger.info("Starting update batch on background update '%s'", update_name)
 
-        update_handler = self._background_update_handlers[update_name]
+        update_handler = self._background_update_handlers[update_name].callback
 
         performance = self._background_update_performance.get(update_name)
 
@@ -273,9 +410,14 @@ class BackgroundUpdater:
         if items_per_ms is not None:
             batch_size = int(desired_duration_ms * items_per_ms)
             # Clamp the batch size so that we always make progress
-            batch_size = max(batch_size, self.MINIMUM_BACKGROUND_BATCH_SIZE)
+            batch_size = max(
+                batch_size,
+                await self._min_batch_size(update_name, self._database_name),
+            )
         else:
-            batch_size = self.DEFAULT_BACKGROUND_BATCH_SIZE
+            batch_size = await self._default_batch_size(
+                update_name, self._database_name
+            )
 
         progress_json = await self.db_pool.simple_select_one_onecol(
             "background_updates",
@@ -294,6 +436,8 @@ class BackgroundUpdater:
 
         duration_ms = time_stop - time_start
 
+        performance.update(items_updated, duration_ms)
+
         logger.info(
             "Running background update %r. Processed %r items in %rms."
             " (total_rate=%r/ms, current_rate=%r/ms, total_updated=%r, batch_size=%r)",
@@ -306,8 +450,6 @@ class BackgroundUpdater:
             batch_size,
         )
 
-        performance.update(items_updated, duration_ms)
-
         return len(self._background_update_performance)
 
     def register_background_update_handler(
@@ -331,7 +473,9 @@ class BackgroundUpdater:
             update_name: The name of the update that this code handles.
             update_handler: The function that does the update.
         """
-        self._background_update_handlers[update_name] = update_handler
+        self._background_update_handlers[update_name] = _BackgroundUpdateHandler(
+            update_handler
+        )
 
     def register_noop_background_update(self, update_name: str) -> None:
         """Register a noop handler for a background update.
@@ -453,7 +597,9 @@ class BackgroundUpdater:
             await self._end_background_update(update_name)
             return 1
 
-        self.register_background_update_handler(update_name, updater)
+        self._background_update_handlers[update_name] = _BackgroundUpdateHandler(
+            updater, oneshot=True
+        )
 
     async def _end_background_update(self, update_name: str) -> None:
         """Removes a completed background update task from the queue.
diff --git a/tests/push/test_email.py b/tests/push/test_email.py
index 90f800e564..f8cba7b645 100644
--- a/tests/push/test_email.py
+++ b/tests/push/test_email.py
@@ -128,6 +128,7 @@ class EmailPusherTests(HomeserverTestCase):
         )
 
         self.auth_handler = hs.get_auth_handler()
+        self.store = hs.get_datastore()
 
     def test_need_validated_email(self):
         """Test that we can only add an email pusher if the user has validated
@@ -408,13 +409,7 @@ class EmailPusherTests(HomeserverTestCase):
         self.hs.get_datastore().db_pool.updates._all_done = False
 
         # Now let's actually drive the updates to completion
-        while not self.get_success(
-            self.hs.get_datastore().db_pool.updates.has_completed_background_updates()
-        ):
-            self.get_success(
-                self.hs.get_datastore().db_pool.updates.do_next_background_update(100),
-                by=0.1,
-            )
+        self.wait_for_background_updates()
 
         # Check that all pushers with unlinked addresses were deleted
         pushers = self.get_success(
diff --git a/tests/rest/admin/test_background_updates.py b/tests/rest/admin/test_background_updates.py
index cd5c60b65c..62f242baf6 100644
--- a/tests/rest/admin/test_background_updates.py
+++ b/tests/rest/admin/test_background_updates.py
@@ -135,7 +135,7 @@ class BackgroundUpdatesTestCase(unittest.HomeserverTestCase):
         self._register_bg_update()
 
         self.store.db_pool.updates.start_doing_background_updates()
-        self.reactor.pump([1.0, 1.0])
+        self.reactor.pump([1.0, 1.0, 1.0])
 
         channel = self.make_request(
             "GET",
diff --git a/tests/storage/test_background_update.py b/tests/storage/test_background_update.py
index a5f5ebad41..216d816d56 100644
--- a/tests/storage/test_background_update.py
+++ b/tests/storage/test_background_update.py
@@ -1,8 +1,11 @@
-from unittest.mock import Mock
+from mock import Mock
+
+from twisted.internet.defer import Deferred, ensureDeferred
 
 from synapse.storage.background_updates import BackgroundUpdater
 
 from tests import unittest
+from tests.test_utils import make_awaitable
 
 
 class BackgroundUpdateTestCase(unittest.HomeserverTestCase):
@@ -20,10 +23,10 @@ class BackgroundUpdateTestCase(unittest.HomeserverTestCase):
 
     def test_do_background_update(self):
         # the time we claim it takes to update one item when running the update
-        duration_ms = 4200
+        duration_ms = 10
 
         # the target runtime for each bg update
-        target_background_update_duration_ms = 5000000
+        target_background_update_duration_ms = 100
 
         store = self.hs.get_datastore()
         self.get_success(
@@ -48,10 +51,8 @@ class BackgroundUpdateTestCase(unittest.HomeserverTestCase):
         self.update_handler.side_effect = update
         self.update_handler.reset_mock()
         res = self.get_success(
-            self.updates.do_next_background_update(
-                target_background_update_duration_ms
-            ),
-            by=0.1,
+            self.updates.do_next_background_update(False),
+            by=0.01,
         )
         self.assertFalse(res)
 
@@ -74,16 +75,93 @@ class BackgroundUpdateTestCase(unittest.HomeserverTestCase):
 
         self.update_handler.side_effect = update
         self.update_handler.reset_mock()
-        result = self.get_success(
-            self.updates.do_next_background_update(target_background_update_duration_ms)
-        )
+        result = self.get_success(self.updates.do_next_background_update(False))
         self.assertFalse(result)
         self.update_handler.assert_called_once()
 
         # third step: we don't expect to be called any more
         self.update_handler.reset_mock()
-        result = self.get_success(
-            self.updates.do_next_background_update(target_background_update_duration_ms)
-        )
+        result = self.get_success(self.updates.do_next_background_update(False))
         self.assertTrue(result)
         self.assertFalse(self.update_handler.called)
+
+
+class BackgroundUpdateControllerTestCase(unittest.HomeserverTestCase):
+    def prepare(self, reactor, clock, homeserver):
+        self.updates: BackgroundUpdater = self.hs.get_datastore().db_pool.updates
+        # the base test class should have run the real bg updates for us
+        self.assertTrue(
+            self.get_success(self.updates.has_completed_background_updates())
+        )
+
+        self.update_deferred = Deferred()
+        self.update_handler = Mock(return_value=self.update_deferred)
+        self.updates.register_background_update_handler(
+            "test_update", self.update_handler
+        )
+
+        # Mock out the AsyncContextManager
+        self._update_ctx_manager = Mock(spec=["__aenter__", "__aexit__"])
+        self._update_ctx_manager.__aenter__ = Mock(
+            return_value=make_awaitable(None),
+        )
+        self._update_ctx_manager.__aexit__ = Mock(return_value=make_awaitable(None))
+
+        # Mock out the `update_handler` callback
+        self._on_update = Mock(return_value=self._update_ctx_manager)
+
+        # Define a default batch size value that's not the same as the internal default
+        # value (100).
+        self._default_batch_size = 500
+
+        # Register the callbacks with more mocks
+        self.hs.get_module_api().register_background_update_controller_callbacks(
+            on_update=self._on_update,
+            min_batch_size=Mock(return_value=make_awaitable(self._default_batch_size)),
+            default_batch_size=Mock(
+                return_value=make_awaitable(self._default_batch_size),
+            ),
+        )
+
+    def test_controller(self):
+        store = self.hs.get_datastore()
+        self.get_success(
+            store.db_pool.simple_insert(
+                "background_updates",
+                values={"update_name": "test_update", "progress_json": "{}"},
+            )
+        )
+
+        # Set the return value for the context manager.
+        enter_defer = Deferred()
+        self._update_ctx_manager.__aenter__ = Mock(return_value=enter_defer)
+
+        # Start the background update.
+        do_update_d = ensureDeferred(self.updates.do_next_background_update(True))
+
+        self.pump()
+
+        # `run_update` should have been called, but the update handler won't be
+        # called until the `enter_defer` (returned by `__aenter__`) is resolved.
+        self._on_update.assert_called_once_with(
+            "test_update",
+            "master",
+            False,
+        )
+        self.assertFalse(do_update_d.called)
+        self.assertFalse(self.update_deferred.called)
+
+        # Resolving the `enter_defer` should call the update handler, which then
+        # blocks.
+        enter_defer.callback(100)
+        self.pump()
+        self.update_handler.assert_called_once_with({}, self._default_batch_size)
+        self.assertFalse(self.update_deferred.called)
+        self._update_ctx_manager.__aexit__.assert_not_called()
+
+        # Resolving the update handler deferred should cause the
+        # `do_next_background_update` to finish and return
+        self.update_deferred.callback(100)
+        self.pump()
+        self._update_ctx_manager.__aexit__.assert_called()
+        self.get_success(do_update_d)
diff --git a/tests/storage/test_event_chain.py b/tests/storage/test_event_chain.py
index b31c5eb5ec..7b7f6c349e 100644
--- a/tests/storage/test_event_chain.py
+++ b/tests/storage/test_event_chain.py
@@ -664,7 +664,7 @@ class EventChainBackgroundUpdateTestCase(HomeserverTestCase):
         ):
             iterations += 1
             self.get_success(
-                self.store.db_pool.updates.do_next_background_update(100), by=0.1
+                self.store.db_pool.updates.do_next_background_update(False), by=0.1
             )
 
         # Ensure that we did actually take multiple iterations to process the
@@ -723,7 +723,7 @@ class EventChainBackgroundUpdateTestCase(HomeserverTestCase):
         ):
             iterations += 1
             self.get_success(
-                self.store.db_pool.updates.do_next_background_update(100), by=0.1
+                self.store.db_pool.updates.do_next_background_update(False), by=0.1
             )
 
         # Ensure that we did actually take multiple iterations to process the
diff --git a/tests/storage/test_user_directory.py b/tests/storage/test_user_directory.py
index 37cf7bb232..7f5b28aed8 100644
--- a/tests/storage/test_user_directory.py
+++ b/tests/storage/test_user_directory.py
@@ -23,6 +23,7 @@ from synapse.rest import admin
 from synapse.rest.client import login, register, room
 from synapse.server import HomeServer
 from synapse.storage import DataStore
+from synapse.storage.background_updates import _BackgroundUpdateHandler
 from synapse.storage.roommember import ProfileInfo
 from synapse.util import Clock
 
@@ -391,7 +392,9 @@ class UserDirectoryInitialPopulationTestcase(HomeserverTestCase):
 
         with mock.patch.dict(
             self.store.db_pool.updates._background_update_handlers,
-            populate_user_directory_process_users=mocked_process_users,
+            populate_user_directory_process_users=_BackgroundUpdateHandler(
+                mocked_process_users,
+            ),
         ):
             self._purge_and_rebuild_user_dir()
 
diff --git a/tests/unittest.py b/tests/unittest.py
index 165aafc574..eea0903f05 100644
--- a/tests/unittest.py
+++ b/tests/unittest.py
@@ -331,17 +331,16 @@ class HomeserverTestCase(TestCase):
             time.sleep(0.01)
 
     def wait_for_background_updates(self) -> None:
-        """
-        Block until all background database updates have completed.
+        """Block until all background database updates have completed.
 
-        Note that callers must ensure that's a store property created on the
+        Note that callers must ensure there's a store property created on the
         testcase.
         """
         while not self.get_success(
             self.store.db_pool.updates.has_completed_background_updates()
         ):
             self.get_success(
-                self.store.db_pool.updates.do_next_background_update(100), by=0.1
+                self.store.db_pool.updates.do_next_background_update(False), by=0.1
             )
 
     def make_homeserver(self, reactor, clock):
@@ -500,8 +499,7 @@ class HomeserverTestCase(TestCase):
 
         async def run_bg_updates():
             with LoggingContext("run_bg_updates"):
-                while not await stor.db_pool.updates.has_completed_background_updates():
-                    await stor.db_pool.updates.do_next_background_update(1)
+                self.get_success(stor.db_pool.updates.run_background_updates(False))
 
         hs = setup_test_homeserver(self.addCleanup, *args, **kwargs)
         stor = hs.get_datastore()
-- 
cgit 1.5.1