summary refs log tree commit diff
diff options
context:
space:
mode:
-rw-r--r--CHANGES.md9
-rw-r--r--INSTALL.md2
-rw-r--r--changelog.d/6213.bugfix1
-rw-r--r--changelog.d/6235.bugfix1
-rw-r--r--changelog.d/6295.misc1
-rw-r--r--changelog.d/6310.feature1
-rw-r--r--changelog.d/6331.feature1
-rw-r--r--changelog.d/6334.feature1
-rw-r--r--changelog.d/6338.bugfix1
-rw-r--r--changelog.d/6341.misc1
-rw-r--r--debian/changelog6
-rwxr-xr-xscripts-dev/build_debian_packages2
-rw-r--r--synapse/__init__.py2
-rw-r--r--synapse/handlers/pagination.py6
-rw-r--r--synapse/handlers/register.py50
-rw-r--r--synapse/handlers/room_member.py9
-rw-r--r--synapse/replication/http/register.py2
-rw-r--r--synapse/storage/__init__.py2
-rw-r--r--synapse/storage/data_stores/main/events.py349
-rw-r--r--synapse/storage/data_stores/main/events_bg_updates.py63
-rw-r--r--synapse/storage/data_stores/main/group_server.py15
-rw-r--r--synapse/storage/data_stores/main/registration.py8
-rw-r--r--synapse/storage/data_stores/main/schema/delta/56/event_labels_background_update.sql17
-rw-r--r--synapse/storage/data_stores/main/state.py29
-rw-r--r--synapse/storage/purge_events.py117
-rw-r--r--synapse/util/caches/descriptors.py48
-rw-r--r--tests/rest/admin/test_admin.py4
-rw-r--r--tests/storage/test_purge.py15
-rw-r--r--tox.ini2
29 files changed, 545 insertions, 220 deletions
diff --git a/CHANGES.md b/CHANGES.md
index 6faa4b8dce..9312dc2941 100644
--- a/CHANGES.md
+++ b/CHANGES.md
@@ -1,3 +1,12 @@
+Synapse 1.5.1 (2019-11-06)
+==========================
+
+Features
+--------
+
+- Limit the length of data returned by url previews, to prevent DoS attacks. ([\#6331](https://github.com/matrix-org/synapse/issues/6331), [\#6334](https://github.com/matrix-org/synapse/issues/6334))
+
+
 Synapse 1.5.0 (2019-10-29)
 ==========================
 
diff --git a/INSTALL.md b/INSTALL.md
index e7b429c05d..29e0abafd3 100644
--- a/INSTALL.md
+++ b/INSTALL.md
@@ -36,7 +36,7 @@ that your email address is probably `user@example.com` rather than
 System requirements:
 
 - POSIX-compliant system (tested on Linux & OS X)
-- Python 3.5, 3.6, or 3.7
+- Python 3.5, 3.6, 3.7 or 3.8.
 - At least 1GB of free RAM if you want to join large public rooms like #matrix:matrix.org
 
 Synapse is written in Python but some of the libraries it uses are written in
diff --git a/changelog.d/6213.bugfix b/changelog.d/6213.bugfix
new file mode 100644
index 0000000000..2bb2d08851
--- /dev/null
+++ b/changelog.d/6213.bugfix
@@ -0,0 +1 @@
+Fix LruCache callback deduplication for Python 3.8. Contributed by @V02460.
diff --git a/changelog.d/6235.bugfix b/changelog.d/6235.bugfix
new file mode 100644
index 0000000000..12718ba934
--- /dev/null
+++ b/changelog.d/6235.bugfix
@@ -0,0 +1 @@
+Remove a room from a server's public rooms list on room upgrade.
\ No newline at end of file
diff --git a/changelog.d/6295.misc b/changelog.d/6295.misc
new file mode 100644
index 0000000000..a3e6b8296e
--- /dev/null
+++ b/changelog.d/6295.misc
@@ -0,0 +1 @@
+Split out state storage into separate data store.
diff --git a/changelog.d/6310.feature b/changelog.d/6310.feature
new file mode 100644
index 0000000000..78a187a1dc
--- /dev/null
+++ b/changelog.d/6310.feature
@@ -0,0 +1 @@
+Implement label-based filtering on `/sync` and `/messages` ([MSC2326](https://github.com/matrix-org/matrix-doc/pull/2326)).
diff --git a/changelog.d/6331.feature b/changelog.d/6331.feature
deleted file mode 100644
index eaf69ef3f6..0000000000
--- a/changelog.d/6331.feature
+++ /dev/null
@@ -1 +0,0 @@
-Limit the length of data returned by url previews, to prevent DoS attacks.
diff --git a/changelog.d/6334.feature b/changelog.d/6334.feature
deleted file mode 100644
index eaf69ef3f6..0000000000
--- a/changelog.d/6334.feature
+++ /dev/null
@@ -1 +0,0 @@
-Limit the length of data returned by url previews, to prevent DoS attacks.
diff --git a/changelog.d/6338.bugfix b/changelog.d/6338.bugfix
new file mode 100644
index 0000000000..8e469f0fb6
--- /dev/null
+++ b/changelog.d/6338.bugfix
@@ -0,0 +1 @@
+Prevent the server taking a long time to start up when guest registration is enabled.
\ No newline at end of file
diff --git a/changelog.d/6341.misc b/changelog.d/6341.misc
new file mode 100644
index 0000000000..359b9bf1d7
--- /dev/null
+++ b/changelog.d/6341.misc
@@ -0,0 +1 @@
+Add continuous integration for Python 3.8.
\ No newline at end of file
diff --git a/debian/changelog b/debian/changelog
index acda7e5c63..c4415f460a 100644
--- a/debian/changelog
+++ b/debian/changelog
@@ -1,3 +1,9 @@
+matrix-synapse-py3 (1.5.1) stable; urgency=medium
+
+  * New synapse release 1.5.1.
+
+ -- Synapse Packaging team <packages@matrix.org>  Wed, 06 Nov 2019 10:02:14 +0000
+
 matrix-synapse-py3 (1.5.0) stable; urgency=medium
 
   * New synapse release 1.5.0.
diff --git a/scripts-dev/build_debian_packages b/scripts-dev/build_debian_packages
index 93305ee9b1..84eaec6a95 100755
--- a/scripts-dev/build_debian_packages
+++ b/scripts-dev/build_debian_packages
@@ -20,11 +20,13 @@ from concurrent.futures import ThreadPoolExecutor
 DISTS = (
     "debian:stretch",
     "debian:buster",
+    "debian:bullseye",
     "debian:sid",
     "ubuntu:xenial",
     "ubuntu:bionic",
     "ubuntu:cosmic",
     "ubuntu:disco",
+    "ubuntu:eoan",
 )
 
 DESC = '''\
diff --git a/synapse/__init__.py b/synapse/__init__.py
index 8587ffa76f..ec16f54a49 100644
--- a/synapse/__init__.py
+++ b/synapse/__init__.py
@@ -36,7 +36,7 @@ try:
 except ImportError:
     pass
 
-__version__ = "1.5.0"
+__version__ = "1.5.1"
 
 if bool(os.environ.get("SYNAPSE_TEST_PATCH_LOG_CONTEXTS", False)):
     # We import here so that we don't have to install a bunch of deps when
diff --git a/synapse/handlers/pagination.py b/synapse/handlers/pagination.py
index 97f15a1c32..260a4351ca 100644
--- a/synapse/handlers/pagination.py
+++ b/synapse/handlers/pagination.py
@@ -127,7 +127,9 @@ class PaginationHandler(object):
         self._purges_in_progress_by_room.add(room_id)
         try:
             with (yield self.pagination_lock.write(room_id)):
-                yield self.store.purge_history(room_id, token, delete_local_events)
+                yield self.storage.purge_events.purge_history(
+                    room_id, token, delete_local_events
+                )
             logger.info("[purge] complete")
             self._purges_by_id[purge_id].status = PurgeStatus.STATUS_COMPLETE
         except Exception:
@@ -170,7 +172,7 @@ class PaginationHandler(object):
             if joined:
                 raise SynapseError(400, "Users are still joined to this room")
 
-            await self.store.purge_room(room_id)
+            await self.storage.purge_events.purge_room(room_id)
 
     @defer.inlineCallbacks
     def get_messages(
diff --git a/synapse/handlers/register.py b/synapse/handlers/register.py
index cff6b0d375..235f11c322 100644
--- a/synapse/handlers/register.py
+++ b/synapse/handlers/register.py
@@ -24,7 +24,6 @@ from synapse.api.errors import (
     AuthError,
     Codes,
     ConsentNotGivenError,
-    LimitExceededError,
     RegistrationError,
     SynapseError,
 )
@@ -168,6 +167,7 @@ class RegistrationHandler(BaseHandler):
         Raises:
             RegistrationError if there was a problem registering.
         """
+        yield self.check_registration_ratelimit(address)
 
         yield self.auth.check_auth_blocking(threepid=threepid)
         password_hash = None
@@ -217,8 +217,13 @@ class RegistrationHandler(BaseHandler):
 
         else:
             # autogen a sequential user ID
+            fail_count = 0
             user = None
             while not user:
+                # Fail after being unable to find a suitable ID a few times
+                if fail_count > 10:
+                    raise SynapseError(500, "Unable to find a suitable guest user ID")
+
                 localpart = yield self._generate_user_id()
                 user = UserID(localpart, self.hs.hostname)
                 user_id = user.to_string()
@@ -233,10 +238,14 @@ class RegistrationHandler(BaseHandler):
                         create_profile_with_displayname=default_display_name,
                         address=address,
                     )
+
+                    # Successfully registered
+                    break
                 except SynapseError:
                     # if user id is taken, just generate another
                     user = None
                     user_id = None
+                    fail_count += 1
 
         if not self.hs.config.user_consent_at_registration:
             yield self._auto_join_rooms(user_id)
@@ -414,6 +423,29 @@ class RegistrationHandler(BaseHandler):
             ratelimit=False,
         )
 
+    def check_registration_ratelimit(self, address):
+        """A simple helper method to check whether the registration rate limit has been hit
+        for a given IP address
+
+        Args:
+            address (str|None): the IP address used to perform the registration. If this is
+                None, no ratelimiting will be performed.
+
+        Raises:
+            LimitExceededError: If the rate limit has been exceeded.
+        """
+        if not address:
+            return
+
+        time_now = self.clock.time()
+
+        self.ratelimiter.ratelimit(
+            address,
+            time_now_s=time_now,
+            rate_hz=self.hs.config.rc_registration.per_second,
+            burst_count=self.hs.config.rc_registration.burst_count,
+        )
+
     def register_with_store(
         self,
         user_id,
@@ -446,22 +478,6 @@ class RegistrationHandler(BaseHandler):
         Returns:
             Deferred
         """
-        # Don't rate limit for app services
-        if appservice_id is None and address is not None:
-            time_now = self.clock.time()
-
-            allowed, time_allowed = self.ratelimiter.can_do_action(
-                address,
-                time_now_s=time_now,
-                rate_hz=self.hs.config.rc_registration.per_second,
-                burst_count=self.hs.config.rc_registration.burst_count,
-            )
-
-            if not allowed:
-                raise LimitExceededError(
-                    retry_after_ms=int(1000 * (time_allowed - time_now))
-                )
-
         if self.hs.config.worker_app:
             return self._register_client(
                 user_id=user_id,
diff --git a/synapse/handlers/room_member.py b/synapse/handlers/room_member.py
index 06d09c2947..6cfee4b361 100644
--- a/synapse/handlers/room_member.py
+++ b/synapse/handlers/room_member.py
@@ -515,6 +515,15 @@ class RoomMemberHandler(object):
             yield self.store.set_room_is_public(old_room_id, False)
             yield self.store.set_room_is_public(room_id, True)
 
+        # Check if any groups we own contain the predecessor room
+        local_group_ids = yield self.store.get_local_groups_for_room(old_room_id)
+        for group_id in local_group_ids:
+            # Add the new room to those groups
+            yield self.store.add_room_to_group(group_id, room_id, old_room["is_public"])
+
+            # Remove the old room from those groups
+            yield self.store.remove_room_from_group(group_id, old_room_id)
+
     @defer.inlineCallbacks
     def copy_user_state_on_room_upgrade(self, old_room_id, new_room_id, user_ids):
         """Copy user-specific information when they join a new room when that new room is the
diff --git a/synapse/replication/http/register.py b/synapse/replication/http/register.py
index 915cfb9430..0c4aca1291 100644
--- a/synapse/replication/http/register.py
+++ b/synapse/replication/http/register.py
@@ -75,6 +75,8 @@ class ReplicationRegisterServlet(ReplicationEndpoint):
     async def _handle_request(self, request, user_id):
         content = parse_json_object_from_request(request)
 
+        self.registration_handler.check_registration_ratelimit(content["address"])
+
         await self.registration_handler.register_with_store(
             user_id=user_id,
             password_hash=content["password_hash"],
diff --git a/synapse/storage/__init__.py b/synapse/storage/__init__.py
index 0a1a8cc1e5..0460fe8cc9 100644
--- a/synapse/storage/__init__.py
+++ b/synapse/storage/__init__.py
@@ -30,6 +30,7 @@ stored in `synapse.storage.schema`.
 from synapse.storage.data_stores import DataStores
 from synapse.storage.data_stores.main import DataStore
 from synapse.storage.persist_events import EventsPersistenceStorage
+from synapse.storage.purge_events import PurgeEventsStorage
 from synapse.storage.state import StateGroupStorage
 
 __all__ = ["DataStores", "DataStore"]
@@ -46,6 +47,7 @@ class Storage(object):
         self.main = stores.main
 
         self.persistence = EventsPersistenceStorage(hs, stores)
+        self.purge_events = PurgeEventsStorage(hs, stores)
         self.state = StateGroupStorage(hs, stores)
 
 
diff --git a/synapse/storage/data_stores/main/events.py b/synapse/storage/data_stores/main/events.py
index 301f8ea128..878f7568a6 100644
--- a/synapse/storage/data_stores/main/events.py
+++ b/synapse/storage/data_stores/main/events.py
@@ -1375,6 +1375,10 @@ class EventsStore(
                 if True, we will delete local events as well as remote ones
                 (instead of just marking them as outliers and deleting their
                 state groups).
+
+        Returns:
+            Deferred[set[int]]: The set of state groups that are referenced by
+            deleted events.
         """
 
         return self.runInteraction(
@@ -1511,11 +1515,10 @@ class EventsStore(
             [(room_id, event_id) for event_id, in new_backwards_extrems],
         )
 
-        logger.info("[purge] finding redundant state groups")
+        logger.info("[purge] finding state groups referenced by deleted events")
 
         # Get all state groups that are referenced by events that are to be
-        # deleted. We then go and check if they are referenced by other events
-        # or state groups, and if not we delete them.
+        # deleted.
         txn.execute(
             """
             SELECT DISTINCT state_group FROM events_to_purge
@@ -1528,60 +1531,6 @@ class EventsStore(
             "[purge] found %i referenced state groups", len(referenced_state_groups)
         )
 
-        logger.info("[purge] finding state groups that can be deleted")
-
-        _ = self._find_unreferenced_groups_during_purge(txn, referenced_state_groups)
-        state_groups_to_delete, remaining_state_groups = _
-
-        logger.info(
-            "[purge] found %i state groups to delete", len(state_groups_to_delete)
-        )
-
-        logger.info(
-            "[purge] de-delta-ing %i remaining state groups",
-            len(remaining_state_groups),
-        )
-
-        # Now we turn the state groups that reference to-be-deleted state
-        # groups to non delta versions.
-        for sg in remaining_state_groups:
-            logger.info("[purge] de-delta-ing remaining state group %s", sg)
-            curr_state = self._get_state_groups_from_groups_txn(txn, [sg])
-            curr_state = curr_state[sg]
-
-            self._simple_delete_txn(
-                txn, table="state_groups_state", keyvalues={"state_group": sg}
-            )
-
-            self._simple_delete_txn(
-                txn, table="state_group_edges", keyvalues={"state_group": sg}
-            )
-
-            self._simple_insert_many_txn(
-                txn,
-                table="state_groups_state",
-                values=[
-                    {
-                        "state_group": sg,
-                        "room_id": room_id,
-                        "type": key[0],
-                        "state_key": key[1],
-                        "event_id": state_id,
-                    }
-                    for key, state_id in iteritems(curr_state)
-                ],
-            )
-
-        logger.info("[purge] removing redundant state groups")
-        txn.executemany(
-            "DELETE FROM state_groups_state WHERE state_group = ?",
-            ((sg,) for sg in state_groups_to_delete),
-        )
-        txn.executemany(
-            "DELETE FROM state_groups WHERE id = ?",
-            ((sg,) for sg in state_groups_to_delete),
-        )
-
         logger.info("[purge] removing events from event_to_state_groups")
         txn.execute(
             "DELETE FROM event_to_state_groups "
@@ -1668,138 +1617,35 @@ class EventsStore(
 
         logger.info("[purge] done")
 
-    def _find_unreferenced_groups_during_purge(self, txn, state_groups):
-        """Used when purging history to figure out which state groups can be
-        deleted and which need to be de-delta'ed (due to one of its prev groups
-        being scheduled for deletion).
-
-        Args:
-            txn
-            state_groups (set[int]): Set of state groups referenced by events
-                that are going to be deleted.
-
-        Returns:
-            tuple[set[int], set[int]]: The set of state groups that can be
-            deleted and the set of state groups that need to be de-delta'ed
-        """
-        # Graph of state group -> previous group
-        graph = {}
-
-        # Set of events that we have found to be referenced by events
-        referenced_groups = set()
-
-        # Set of state groups we've already seen
-        state_groups_seen = set(state_groups)
-
-        # Set of state groups to handle next.
-        next_to_search = set(state_groups)
-        while next_to_search:
-            # We bound size of groups we're looking up at once, to stop the
-            # SQL query getting too big
-            if len(next_to_search) < 100:
-                current_search = next_to_search
-                next_to_search = set()
-            else:
-                current_search = set(itertools.islice(next_to_search, 100))
-                next_to_search -= current_search
-
-            # Check if state groups are referenced
-            sql = """
-                SELECT DISTINCT state_group FROM event_to_state_groups
-                LEFT JOIN events_to_purge AS ep USING (event_id)
-                WHERE ep.event_id IS NULL AND
-            """
-            clause, args = make_in_list_sql_clause(
-                txn.database_engine, "state_group", current_search
-            )
-            txn.execute(sql + clause, list(args))
-
-            referenced = set(sg for sg, in txn)
-            referenced_groups |= referenced
-
-            # We don't continue iterating up the state group graphs for state
-            # groups that are referenced.
-            current_search -= referenced
-
-            rows = self._simple_select_many_txn(
-                txn,
-                table="state_group_edges",
-                column="prev_state_group",
-                iterable=current_search,
-                keyvalues={},
-                retcols=("prev_state_group", "state_group"),
-            )
-
-            prevs = set(row["state_group"] for row in rows)
-            # We don't bother re-handling groups we've already seen
-            prevs -= state_groups_seen
-            next_to_search |= prevs
-            state_groups_seen |= prevs
-
-            for row in rows:
-                # Note: Each state group can have at most one prev group
-                graph[row["state_group"]] = row["prev_state_group"]
-
-        to_delete = state_groups_seen - referenced_groups
-
-        to_dedelta = set()
-        for sg in referenced_groups:
-            prev_sg = graph.get(sg)
-            if prev_sg and prev_sg in to_delete:
-                to_dedelta.add(sg)
-
-        return to_delete, to_dedelta
+        return referenced_state_groups
 
     def purge_room(self, room_id):
         """Deletes all record of a room
 
         Args:
-            room_id (str):
+            room_id (str)
+
+        Returns:
+            Deferred[List[int]]: The list of state groups to delete.
         """
 
         return self.runInteraction("purge_room", self._purge_room_txn, room_id)
 
     def _purge_room_txn(self, txn, room_id):
-        # first we have to delete the state groups states
-        logger.info("[purge] removing %s from state_groups_state", room_id)
-
+        # First we fetch all the state groups that should be deleted, before
+        # we delete that information.
         txn.execute(
             """
-            DELETE FROM state_groups_state WHERE state_group IN (
-              SELECT state_group FROM events JOIN event_to_state_groups USING(event_id)
-              WHERE events.room_id=?
-            )
+                SELECT DISTINCT state_group FROM events
+                INNER JOIN event_to_state_groups USING(event_id)
+                WHERE events.room_id = ?
             """,
             (room_id,),
         )
 
-        # ... and the state group edges
-        logger.info("[purge] removing %s from state_group_edges", room_id)
-
-        txn.execute(
-            """
-            DELETE FROM state_group_edges WHERE state_group IN (
-              SELECT state_group FROM events JOIN event_to_state_groups USING(event_id)
-              WHERE events.room_id=?
-            )
-            """,
-            (room_id,),
-        )
-
-        # ... and the state groups
-        logger.info("[purge] removing %s from state_groups", room_id)
-
-        txn.execute(
-            """
-            DELETE FROM state_groups WHERE id IN (
-              SELECT state_group FROM events JOIN event_to_state_groups USING(event_id)
-              WHERE events.room_id=?
-            )
-            """,
-            (room_id,),
-        )
+        state_groups = [row[0] for row in txn]
 
-        # and then tables which lack an index on room_id but have one on event_id
+        # Now we delete tables which lack an index on room_id but have one on event_id
         for table in (
             "event_auth",
             "event_edges",
@@ -1887,6 +1733,165 @@ class EventsStore(
 
         logger.info("[purge] done")
 
+        return state_groups
+
+    def purge_unreferenced_state_groups(
+        self, room_id: str, state_groups_to_delete
+    ) -> defer.Deferred:
+        """Deletes no longer referenced state groups and de-deltas any state
+        groups that reference them.
+
+        Args:
+            room_id: The room the state groups belong to (must all be in the
+                same room).
+            state_groups_to_delete (Collection[int]): Set of all state groups
+                to delete.
+        """
+
+        return self.runInteraction(
+            "purge_unreferenced_state_groups",
+            self._purge_unreferenced_state_groups,
+            room_id,
+            state_groups_to_delete,
+        )
+
+    def _purge_unreferenced_state_groups(self, txn, room_id, state_groups_to_delete):
+        logger.info(
+            "[purge] found %i state groups to delete", len(state_groups_to_delete)
+        )
+
+        rows = self._simple_select_many_txn(
+            txn,
+            table="state_group_edges",
+            column="prev_state_group",
+            iterable=state_groups_to_delete,
+            keyvalues={},
+            retcols=("state_group",),
+        )
+
+        remaining_state_groups = set(
+            row["state_group"]
+            for row in rows
+            if row["state_group"] not in state_groups_to_delete
+        )
+
+        logger.info(
+            "[purge] de-delta-ing %i remaining state groups",
+            len(remaining_state_groups),
+        )
+
+        # Now we turn the state groups that reference to-be-deleted state
+        # groups to non delta versions.
+        for sg in remaining_state_groups:
+            logger.info("[purge] de-delta-ing remaining state group %s", sg)
+            curr_state = self._get_state_groups_from_groups_txn(txn, [sg])
+            curr_state = curr_state[sg]
+
+            self._simple_delete_txn(
+                txn, table="state_groups_state", keyvalues={"state_group": sg}
+            )
+
+            self._simple_delete_txn(
+                txn, table="state_group_edges", keyvalues={"state_group": sg}
+            )
+
+            self._simple_insert_many_txn(
+                txn,
+                table="state_groups_state",
+                values=[
+                    {
+                        "state_group": sg,
+                        "room_id": room_id,
+                        "type": key[0],
+                        "state_key": key[1],
+                        "event_id": state_id,
+                    }
+                    for key, state_id in iteritems(curr_state)
+                ],
+            )
+
+        logger.info("[purge] removing redundant state groups")
+        txn.executemany(
+            "DELETE FROM state_groups_state WHERE state_group = ?",
+            ((sg,) for sg in state_groups_to_delete),
+        )
+        txn.executemany(
+            "DELETE FROM state_groups WHERE id = ?",
+            ((sg,) for sg in state_groups_to_delete),
+        )
+
+    @defer.inlineCallbacks
+    def get_previous_state_groups(self, state_groups):
+        """Fetch the previous groups of the given state groups.
+
+        Args:
+            state_groups (Iterable[int])
+
+        Returns:
+            Deferred[dict[int, int]]: mapping from state group to previous
+            state group.
+        """
+
+        rows = yield self._simple_select_many_batch(
+            table="state_group_edges",
+            column="prev_state_group",
+            iterable=state_groups,
+            keyvalues={},
+            retcols=("prev_state_group", "state_group"),
+            desc="get_previous_state_groups",
+        )
+
+        return {row["state_group"]: row["prev_state_group"] for row in rows}
+
+    def purge_room_state(self, room_id, state_groups_to_delete):
+        """Deletes all record of a room from state tables
+
+        Args:
+            room_id (str):
+            state_groups_to_delete (list[int]): State groups to delete
+        """
+
+        return self.runInteraction(
+            "purge_room_state",
+            self._purge_room_state_txn,
+            room_id,
+            state_groups_to_delete,
+        )
+
+    def _purge_room_state_txn(self, txn, room_id, state_groups_to_delete):
+        # first we have to delete the state groups states
+        logger.info("[purge] removing %s from state_groups_state", room_id)
+
+        self._simple_delete_many_txn(
+            txn,
+            table="state_groups_state",
+            column="state_group",
+            iterable=state_groups_to_delete,
+            keyvalues={},
+        )
+
+        # ... and the state group edges
+        logger.info("[purge] removing %s from state_group_edges", room_id)
+
+        self._simple_delete_many_txn(
+            txn,
+            table="state_group_edges",
+            column="state_group",
+            iterable=state_groups_to_delete,
+            keyvalues={},
+        )
+
+        # ... and the state groups
+        logger.info("[purge] removing %s from state_groups", room_id)
+
+        self._simple_delete_many_txn(
+            txn,
+            table="state_groups",
+            column="id",
+            iterable=state_groups_to_delete,
+            keyvalues={},
+        )
+
     async def is_event_after(self, event_id1, event_id2):
         """Returns True if event_id1 is after event_id2 in the stream
         """
diff --git a/synapse/storage/data_stores/main/events_bg_updates.py b/synapse/storage/data_stores/main/events_bg_updates.py
index 51352b9966..0ed59ef48e 100644
--- a/synapse/storage/data_stores/main/events_bg_updates.py
+++ b/synapse/storage/data_stores/main/events_bg_updates.py
@@ -21,6 +21,7 @@ from canonicaljson import json
 
 from twisted.internet import defer
 
+from synapse.api.constants import EventContentFields
 from synapse.storage._base import make_in_list_sql_clause
 from synapse.storage.background_updates import BackgroundUpdateStore
 
@@ -85,6 +86,10 @@ class EventsBackgroundUpdatesStore(BackgroundUpdateStore):
             "event_fix_redactions_bytes", self._event_fix_redactions_bytes
         )
 
+        self.register_background_update_handler(
+            "event_store_labels", self._event_store_labels
+        )
+
     @defer.inlineCallbacks
     def _background_reindex_fields_sender(self, progress, batch_size):
         target_min_stream_id = progress["target_min_stream_id_inclusive"]
@@ -503,3 +508,61 @@ class EventsBackgroundUpdatesStore(BackgroundUpdateStore):
         yield self._end_background_update("event_fix_redactions_bytes")
 
         return 1
+
+    @defer.inlineCallbacks
+    def _event_store_labels(self, progress, batch_size):
+        """Background update handler which will store labels for existing events."""
+        last_event_id = progress.get("last_event_id", "")
+
+        def _event_store_labels_txn(txn):
+            txn.execute(
+                """
+                SELECT event_id, json FROM event_json
+                LEFT JOIN event_labels USING (event_id)
+                WHERE event_id > ? AND label IS NULL
+                ORDER BY event_id LIMIT ?
+                """,
+                (last_event_id, batch_size),
+            )
+
+            results = list(txn)
+
+            nbrows = 0
+            last_row_event_id = ""
+            for (event_id, event_json_raw) in results:
+                event_json = json.loads(event_json_raw)
+
+                self._simple_insert_many_txn(
+                    txn=txn,
+                    table="event_labels",
+                    values=[
+                        {
+                            "event_id": event_id,
+                            "label": label,
+                            "room_id": event_json["room_id"],
+                            "topological_ordering": event_json["depth"],
+                        }
+                        for label in event_json["content"].get(
+                            EventContentFields.LABELS, []
+                        )
+                        if isinstance(label, str)
+                    ],
+                )
+
+                nbrows += 1
+                last_row_event_id = event_id
+
+            self._background_update_progress_txn(
+                txn, "event_store_labels", {"last_event_id": last_row_event_id}
+            )
+
+            return nbrows
+
+        num_rows = yield self.runInteraction(
+            desc="event_store_labels", func=_event_store_labels_txn
+        )
+
+        if not num_rows:
+            yield self._end_background_update("event_store_labels")
+
+        return num_rows
diff --git a/synapse/storage/data_stores/main/group_server.py b/synapse/storage/data_stores/main/group_server.py
index b3a2771f1b..5ded539af8 100644
--- a/synapse/storage/data_stores/main/group_server.py
+++ b/synapse/storage/data_stores/main/group_server.py
@@ -553,6 +553,21 @@ class GroupServerStore(SQLBaseStore):
             desc="remove_user_from_summary",
         )
 
+    def get_local_groups_for_room(self, room_id):
+        """Get all of the local groups that contain a given room
+        Args:
+            room_id (str): The ID of a room
+        Returns:
+            Deferred[list[str]]: A twisted.Deferred containing a list of group ids
+                containing this room
+        """
+        return self._simple_select_onecol(
+            table="group_rooms",
+            keyvalues={"room_id": room_id},
+            retcol="group_id",
+            desc="get_local_groups_for_room",
+        )
+
     def get_users_for_summary_by_role(self, group_id, include_private=False):
         """Get the users and roles that should be included in a summary request
 
diff --git a/synapse/storage/data_stores/main/registration.py b/synapse/storage/data_stores/main/registration.py
index f70d41ecab..ee1b2b2bbf 100644
--- a/synapse/storage/data_stores/main/registration.py
+++ b/synapse/storage/data_stores/main/registration.py
@@ -488,14 +488,14 @@ class RegistrationWorkerStore(SQLBaseStore):
         we can. Unfortunately, it's possible some of them are already taken by
         existing users, and there may be gaps in the already taken range. This
         function returns the start of the first allocatable gap. This is to
-        avoid the case of ID 10000000 being pre-allocated, so us wasting the
-        first (and shortest) many generated user IDs.
+        avoid the case of ID 1000 being pre-allocated and starting at 1001 while
+        0-999 are available.
         """
 
         def _find_next_generated_user_id(txn):
-            # We bound between '@1' and '@a' to avoid pulling the entire table
+            # We bound between '@0' and '@a' to avoid pulling the entire table
             # out.
-            txn.execute("SELECT name FROM users WHERE '@1' <= name AND name < '@a'")
+            txn.execute("SELECT name FROM users WHERE '@0' <= name AND name < '@a'")
 
             regex = re.compile(r"^@(\d+):")
 
diff --git a/synapse/storage/data_stores/main/schema/delta/56/event_labels_background_update.sql b/synapse/storage/data_stores/main/schema/delta/56/event_labels_background_update.sql
new file mode 100644
index 0000000000..5f5e0499ae
--- /dev/null
+++ b/synapse/storage/data_stores/main/schema/delta/56/event_labels_background_update.sql
@@ -0,0 +1,17 @@
+/* Copyright 2019 The Matrix.org Foundation C.I.C.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+INSERT INTO background_updates (update_name, progress_json) VALUES
+  ('event_store_labels', '{}');
diff --git a/synapse/storage/data_stores/main/state.py b/synapse/storage/data_stores/main/state.py
index 9e1541988e..6a90daea31 100644
--- a/synapse/storage/data_stores/main/state.py
+++ b/synapse/storage/data_stores/main/state.py
@@ -285,7 +285,11 @@ class StateGroupWorkerStore(
             room_id (str)
 
         Returns:
-            Deferred[unicode|None]: predecessor room id
+            Deferred[dict|None]: A dictionary containing the structure of the predecessor
+                field from the room's create event. The structure is determined by other servers,
+                but it is expected to be:
+                    * room_id (str): The room ID of the predecessor room
+                    * event_id (str): The ID of the tombstone event in the predecessor room
 
         Raises:
             NotFoundError if the room is unknown
@@ -991,6 +995,29 @@ class StateGroupWorkerStore(
 
         return self.runInteraction("store_state_group", _store_state_group_txn)
 
+    @defer.inlineCallbacks
+    def get_referenced_state_groups(self, state_groups):
+        """Check if the state groups are referenced by events.
+
+        Args:
+            state_groups (Iterable[int])
+
+        Returns:
+            Deferred[set[int]]: The subset of state groups that are
+            referenced.
+        """
+
+        rows = yield self._simple_select_many_batch(
+            table="event_to_state_groups",
+            column="state_group",
+            iterable=state_groups,
+            keyvalues={},
+            retcols=("DISTINCT state_group",),
+            desc="get_referenced_state_groups",
+        )
+
+        return set(row["state_group"] for row in rows)
+
 
 class StateBackgroundUpdateStore(
     StateGroupBackgroundUpdateStore, BackgroundUpdateStore
diff --git a/synapse/storage/purge_events.py b/synapse/storage/purge_events.py
new file mode 100644
index 0000000000..a368182034
--- /dev/null
+++ b/synapse/storage/purge_events.py
@@ -0,0 +1,117 @@
+# -*- coding: utf-8 -*-
+# Copyright 2019 The Matrix.org Foundation C.I.C.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import itertools
+import logging
+
+from twisted.internet import defer
+
+logger = logging.getLogger(__name__)
+
+
+class PurgeEventsStorage(object):
+    """High level interface for purging rooms and event history.
+    """
+
+    def __init__(self, hs, stores):
+        self.stores = stores
+
+    @defer.inlineCallbacks
+    def purge_room(self, room_id: str):
+        """Deletes all record of a room
+        """
+
+        state_groups_to_delete = yield self.stores.main.purge_room(room_id)
+        yield self.stores.main.purge_room_state(room_id, state_groups_to_delete)
+
+    @defer.inlineCallbacks
+    def purge_history(self, room_id, token, delete_local_events):
+        """Deletes room history before a certain point
+
+        Args:
+            room_id (str):
+
+            token (str): A topological token to delete events before
+
+            delete_local_events (bool):
+                if True, we will delete local events as well as remote ones
+                (instead of just marking them as outliers and deleting their
+                state groups).
+        """
+        state_groups = yield self.stores.main.purge_history(
+            room_id, token, delete_local_events
+        )
+
+        logger.info("[purge] finding state groups that can be deleted")
+
+        sg_to_delete = yield self._find_unreferenced_groups(state_groups)
+
+        yield self.stores.main.purge_unreferenced_state_groups(room_id, sg_to_delete)
+
+    @defer.inlineCallbacks
+    def _find_unreferenced_groups(self, state_groups):
+        """Used when purging history to figure out which state groups can be
+        deleted.
+
+        Args:
+            state_groups (set[int]): Set of state groups referenced by events
+                that are going to be deleted.
+
+        Returns:
+            Deferred[set[int]]: The set of state groups that can be deleted.
+        """
+        # Graph of state group -> previous group
+        graph = {}
+
+        # Set of state groups that we have found to be referenced by events
+        referenced_groups = set()
+
+        # Set of state groups we've already seen
+        state_groups_seen = set(state_groups)
+
+        # Set of state groups to handle next.
+        next_to_search = set(state_groups)
+        while next_to_search:
+            # We bound the number of groups we're looking up at once, to stop the
+            # SQL query getting too big
+            if len(next_to_search) < 100:
+                current_search = next_to_search
+                next_to_search = set()
+            else:
+                current_search = set(itertools.islice(next_to_search, 100))
+                next_to_search -= current_search
+
+            referenced = yield self.stores.main.get_referenced_state_groups(
+                current_search
+            )
+            referenced_groups |= referenced
+
+            # We don't continue iterating up the state group graphs for state
+            # groups that are referenced.
+            current_search -= referenced
+
+            edges = yield self.stores.main.get_previous_state_groups(current_search)
+
+            prevs = set(edges.values())
+            # We don't bother re-handling groups we've already seen
+            prevs -= state_groups_seen
+            next_to_search |= prevs
+            state_groups_seen |= prevs
+
+            graph.update(edges)
+
+        to_delete = state_groups_seen - referenced_groups
+
+        return to_delete
diff --git a/synapse/util/caches/descriptors.py b/synapse/util/caches/descriptors.py
index 0e8da27f53..84f5ae22c3 100644
--- a/synapse/util/caches/descriptors.py
+++ b/synapse/util/caches/descriptors.py
@@ -17,8 +17,8 @@ import functools
 import inspect
 import logging
 import threading
-from collections import namedtuple
-from typing import Any, cast
+from typing import Any, Tuple, Union, cast
+from weakref import WeakValueDictionary
 
 from six import itervalues
 
@@ -38,6 +38,8 @@ from . import register_cache
 
 logger = logging.getLogger(__name__)
 
+CacheKey = Union[Tuple, Any]
+
 
 class _CachedFunction(Protocol):
     invalidate = None  # type: Any
@@ -430,7 +432,7 @@ class CacheDescriptor(_CacheDescriptorBase):
             # Add our own `cache_context` to argument list if the wrapped function
             # has asked for one
             if self.add_cache_context:
-                kwargs["cache_context"] = _CacheContext(cache, cache_key)
+                kwargs["cache_context"] = _CacheContext.get_instance(cache, cache_key)
 
             try:
                 cached_result_d = cache.get(cache_key, callback=invalidate_callback)
@@ -624,14 +626,38 @@ class CacheListDescriptor(_CacheDescriptorBase):
         return wrapped
 
 
-class _CacheContext(namedtuple("_CacheContext", ("cache", "key"))):
-    # We rely on _CacheContext implementing __eq__ and __hash__ sensibly,
-    # which namedtuple does for us (i.e. two _CacheContext are the same if
-    # their caches and keys match). This is important in particular to
-    # dedupe when we add callbacks to lru cache nodes, otherwise the number
-    # of callbacks would grow.
-    def invalidate(self):
-        self.cache.invalidate(self.key)
+class _CacheContext:
+    """Holds cache information from the cached function higher in the calling order.
+
+    Can be used to invalidate the higher level cache entry if something changes
+    on a lower level.
+    """
+
+    _cache_context_objects = (
+        WeakValueDictionary()
+    )  # type: WeakValueDictionary[Tuple[Cache, CacheKey], _CacheContext]
+
+    def __init__(self, cache, cache_key):  # type: (Cache, CacheKey) -> None
+        self._cache = cache
+        self._cache_key = cache_key
+
+    def invalidate(self):  # type: () -> None
+        """Invalidates the cache entry referred to by the context."""
+        self._cache.invalidate(self._cache_key)
+
+    @classmethod
+    def get_instance(cls, cache, cache_key):  # type: (Cache, CacheKey) -> _CacheContext
+        """Returns an instance constructed with the given arguments.
+
+        A new instance is only created if none already exists.
+        """
+
+        # We make sure there are no identical _CacheContext instances. This is
+        # important in particular to dedupe when we add callbacks to lru cache
+        # nodes, otherwise the number of callbacks would grow.
+        return cls._cache_context_objects.setdefault(
+            (cache, cache_key), cls(cache, cache_key)
+        )
 
 
 def cached(
diff --git a/tests/rest/admin/test_admin.py b/tests/rest/admin/test_admin.py
index 8e1ca8b738..d9f1b95cb0 100644
--- a/tests/rest/admin/test_admin.py
+++ b/tests/rest/admin/test_admin.py
@@ -628,10 +628,12 @@ class PurgeRoomTestCase(unittest.HomeserverTestCase):
             "local_invites",
             "room_account_data",
             "room_tags",
+            "state_groups",
+            "state_groups_state",
         ):
             count = self.get_success(
                 self.store._simple_select_one_onecol(
-                    table="events",
+                    table=table,
                     keyvalues={"room_id": room_id},
                     retcol="COUNT(*)",
                     desc="test_purge_room",
diff --git a/tests/storage/test_purge.py b/tests/storage/test_purge.py
index f671599cb8..b9fafaa1a6 100644
--- a/tests/storage/test_purge.py
+++ b/tests/storage/test_purge.py
@@ -40,23 +40,24 @@ class PurgeTests(HomeserverTestCase):
         third = self.helper.send(self.room_id, body="test3")
         last = self.helper.send(self.room_id, body="test4")
 
-        storage = self.hs.get_datastore()
+        store = self.hs.get_datastore()
+        storage = self.hs.get_storage()
 
         # Get the topological token
-        event = storage.get_topological_token_for_event(last["event_id"])
+        event = store.get_topological_token_for_event(last["event_id"])
         self.pump()
         event = self.successResultOf(event)
 
         # Purge everything before this topological token
-        purge = storage.purge_history(self.room_id, event, True)
+        purge = storage.purge_events.purge_history(self.room_id, event, True)
         self.pump()
         self.assertEqual(self.successResultOf(purge), None)
 
         # Try and get the events
-        get_first = storage.get_event(first["event_id"])
-        get_second = storage.get_event(second["event_id"])
-        get_third = storage.get_event(third["event_id"])
-        get_last = storage.get_event(last["event_id"])
+        get_first = store.get_event(first["event_id"])
+        get_second = store.get_event(second["event_id"])
+        get_third = store.get_event(third["event_id"])
+        get_last = store.get_event(last["event_id"])
         self.pump()
 
         # 1-3 should fail and last will succeed, meaning that 1-3 are deleted
diff --git a/tox.ini b/tox.ini
index afe9bc909b..62b350ea6a 100644
--- a/tox.ini
+++ b/tox.ini
@@ -1,5 +1,5 @@
 [tox]
-envlist = packaging, py35, py36, py37, check_codestyle, check_isort
+envlist = packaging, py35, py36, py37, py38, check_codestyle, check_isort
 
 [base]
 basepython = python3.7