diff options
29 files changed, 545 insertions, 220 deletions
diff --git a/CHANGES.md b/CHANGES.md index 6faa4b8dce..9312dc2941 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -1,3 +1,12 @@ +Synapse 1.5.1 (2019-11-06) +========================== + +Features +-------- + +- Limit the length of data returned by url previews, to prevent DoS attacks. ([\#6331](https://github.com/matrix-org/synapse/issues/6331), [\#6334](https://github.com/matrix-org/synapse/issues/6334)) + + Synapse 1.5.0 (2019-10-29) ========================== diff --git a/INSTALL.md b/INSTALL.md index e7b429c05d..29e0abafd3 100644 --- a/INSTALL.md +++ b/INSTALL.md @@ -36,7 +36,7 @@ that your email address is probably `user@example.com` rather than System requirements: - POSIX-compliant system (tested on Linux & OS X) -- Python 3.5, 3.6, or 3.7 +- Python 3.5, 3.6, 3.7 or 3.8. - At least 1GB of free RAM if you want to join large public rooms like #matrix:matrix.org Synapse is written in Python but some of the libraries it uses are written in diff --git a/changelog.d/6213.bugfix b/changelog.d/6213.bugfix new file mode 100644 index 0000000000..2bb2d08851 --- /dev/null +++ b/changelog.d/6213.bugfix @@ -0,0 +1 @@ +Fix LruCache callback deduplication for Python 3.8. Contributed by @V02460. diff --git a/changelog.d/6235.bugfix b/changelog.d/6235.bugfix new file mode 100644 index 0000000000..12718ba934 --- /dev/null +++ b/changelog.d/6235.bugfix @@ -0,0 +1 @@ +Remove a room from a server's public rooms list on room upgrade. \ No newline at end of file diff --git a/changelog.d/6295.misc b/changelog.d/6295.misc new file mode 100644 index 0000000000..a3e6b8296e --- /dev/null +++ b/changelog.d/6295.misc @@ -0,0 +1 @@ +Split out state storage into separate data store. diff --git a/changelog.d/6310.feature b/changelog.d/6310.feature new file mode 100644 index 0000000000..78a187a1dc --- /dev/null +++ b/changelog.d/6310.feature @@ -0,0 +1 @@ +Implement label-based filtering on `/sync` and `/messages` ([MSC2326](https://github.com/matrix-org/matrix-doc/pull/2326)). diff --git a/changelog.d/6331.feature b/changelog.d/6331.feature deleted file mode 100644 index eaf69ef3f6..0000000000 --- a/changelog.d/6331.feature +++ /dev/null @@ -1 +0,0 @@ -Limit the length of data returned by url previews, to prevent DoS attacks. diff --git a/changelog.d/6334.feature b/changelog.d/6334.feature deleted file mode 100644 index eaf69ef3f6..0000000000 --- a/changelog.d/6334.feature +++ /dev/null @@ -1 +0,0 @@ -Limit the length of data returned by url previews, to prevent DoS attacks. diff --git a/changelog.d/6338.bugfix b/changelog.d/6338.bugfix new file mode 100644 index 0000000000..8e469f0fb6 --- /dev/null +++ b/changelog.d/6338.bugfix @@ -0,0 +1 @@ +Prevent the server taking a long time to start up when guest registration is enabled. \ No newline at end of file diff --git a/changelog.d/6341.misc b/changelog.d/6341.misc new file mode 100644 index 0000000000..359b9bf1d7 --- /dev/null +++ b/changelog.d/6341.misc @@ -0,0 +1 @@ +Add continuous integration for python 3.8. \ No newline at end of file diff --git a/debian/changelog b/debian/changelog index acda7e5c63..c4415f460a 100644 --- a/debian/changelog +++ b/debian/changelog @@ -1,3 +1,9 @@ +matrix-synapse-py3 (1.5.1) stable; urgency=medium + + * New synapse release 1.5.1. + + -- Synapse Packaging team <packages@matrix.org> Wed, 06 Nov 2019 10:02:14 +0000 + matrix-synapse-py3 (1.5.0) stable; urgency=medium * New synapse release 1.5.0. diff --git a/scripts-dev/build_debian_packages b/scripts-dev/build_debian_packages index 93305ee9b1..84eaec6a95 100755 --- a/scripts-dev/build_debian_packages +++ b/scripts-dev/build_debian_packages @@ -20,11 +20,13 @@ from concurrent.futures import ThreadPoolExecutor DISTS = ( "debian:stretch", "debian:buster", + "debian:bullseye", "debian:sid", "ubuntu:xenial", "ubuntu:bionic", "ubuntu:cosmic", "ubuntu:disco", + "ubuntu:eoan", ) DESC = '''\ diff --git a/synapse/__init__.py b/synapse/__init__.py index 8587ffa76f..ec16f54a49 100644 --- a/synapse/__init__.py +++ b/synapse/__init__.py @@ -36,7 +36,7 @@ try: except ImportError: pass -__version__ = "1.5.0" +__version__ = "1.5.1" if bool(os.environ.get("SYNAPSE_TEST_PATCH_LOG_CONTEXTS", False)): # We import here so that we don't have to install a bunch of deps when diff --git a/synapse/handlers/pagination.py b/synapse/handlers/pagination.py index 97f15a1c32..260a4351ca 100644 --- a/synapse/handlers/pagination.py +++ b/synapse/handlers/pagination.py @@ -127,7 +127,9 @@ class PaginationHandler(object): self._purges_in_progress_by_room.add(room_id) try: with (yield self.pagination_lock.write(room_id)): - yield self.store.purge_history(room_id, token, delete_local_events) + yield self.storage.purge_events.purge_history( + room_id, token, delete_local_events + ) logger.info("[purge] complete") self._purges_by_id[purge_id].status = PurgeStatus.STATUS_COMPLETE except Exception: @@ -170,7 +172,7 @@ class PaginationHandler(object): if joined: raise SynapseError(400, "Users are still joined to this room") - await self.store.purge_room(room_id) + await self.storage.purge_events.purge_room(room_id) @defer.inlineCallbacks def get_messages( diff --git a/synapse/handlers/register.py b/synapse/handlers/register.py index cff6b0d375..235f11c322 100644 --- a/synapse/handlers/register.py +++ b/synapse/handlers/register.py @@ -24,7 +24,6 @@ from synapse.api.errors import ( AuthError, Codes, ConsentNotGivenError, - LimitExceededError, RegistrationError, SynapseError, ) @@ -168,6 +167,7 @@ class RegistrationHandler(BaseHandler): Raises: RegistrationError if there was a problem registering. """ + yield self.check_registration_ratelimit(address) yield self.auth.check_auth_blocking(threepid=threepid) password_hash = None @@ -217,8 +217,13 @@ class RegistrationHandler(BaseHandler): else: # autogen a sequential user ID + fail_count = 0 user = None while not user: + # Fail after being unable to find a suitable ID a few times + if fail_count > 10: + raise SynapseError(500, "Unable to find a suitable guest user ID") + localpart = yield self._generate_user_id() user = UserID(localpart, self.hs.hostname) user_id = user.to_string() @@ -233,10 +238,14 @@ class RegistrationHandler(BaseHandler): create_profile_with_displayname=default_display_name, address=address, ) + + # Successfully registered + break except SynapseError: # if user id is taken, just generate another user = None user_id = None + fail_count += 1 if not self.hs.config.user_consent_at_registration: yield self._auto_join_rooms(user_id) @@ -414,6 +423,29 @@ class RegistrationHandler(BaseHandler): ratelimit=False, ) + def check_registration_ratelimit(self, address): + """A simple helper method to check whether the registration rate limit has been hit + for a given IP address + + Args: + address (str|None): the IP address used to perform the registration. If this is + None, no ratelimiting will be performed. + + Raises: + LimitExceededError: If the rate limit has been exceeded. + """ + if not address: + return + + time_now = self.clock.time() + + self.ratelimiter.ratelimit( + address, + time_now_s=time_now, + rate_hz=self.hs.config.rc_registration.per_second, + burst_count=self.hs.config.rc_registration.burst_count, + ) + def register_with_store( self, user_id, @@ -446,22 +478,6 @@ class RegistrationHandler(BaseHandler): Returns: Deferred """ - # Don't rate limit for app services - if appservice_id is None and address is not None: - time_now = self.clock.time() - - allowed, time_allowed = self.ratelimiter.can_do_action( - address, - time_now_s=time_now, - rate_hz=self.hs.config.rc_registration.per_second, - burst_count=self.hs.config.rc_registration.burst_count, - ) - - if not allowed: - raise LimitExceededError( - retry_after_ms=int(1000 * (time_allowed - time_now)) - ) - if self.hs.config.worker_app: return self._register_client( user_id=user_id, diff --git a/synapse/handlers/room_member.py b/synapse/handlers/room_member.py index 06d09c2947..6cfee4b361 100644 --- a/synapse/handlers/room_member.py +++ b/synapse/handlers/room_member.py @@ -515,6 +515,15 @@ class RoomMemberHandler(object): yield self.store.set_room_is_public(old_room_id, False) yield self.store.set_room_is_public(room_id, True) + # Check if any groups we own contain the predecessor room + local_group_ids = yield self.store.get_local_groups_for_room(old_room_id) + for group_id in local_group_ids: + # Add new the new room to those groups + yield self.store.add_room_to_group(group_id, room_id, old_room["is_public"]) + + # Remove the old room from those groups + yield self.store.remove_room_from_group(group_id, old_room_id) + @defer.inlineCallbacks def copy_user_state_on_room_upgrade(self, old_room_id, new_room_id, user_ids): """Copy user-specific information when they join a new room when that new room is the diff --git a/synapse/replication/http/register.py b/synapse/replication/http/register.py index 915cfb9430..0c4aca1291 100644 --- a/synapse/replication/http/register.py +++ b/synapse/replication/http/register.py @@ -75,6 +75,8 @@ class ReplicationRegisterServlet(ReplicationEndpoint): async def _handle_request(self, request, user_id): content = parse_json_object_from_request(request) + self.registration_handler.check_registration_ratelimit(content["address"]) + await self.registration_handler.register_with_store( user_id=user_id, password_hash=content["password_hash"], diff --git a/synapse/storage/__init__.py b/synapse/storage/__init__.py index 0a1a8cc1e5..0460fe8cc9 100644 --- a/synapse/storage/__init__.py +++ b/synapse/storage/__init__.py @@ -30,6 +30,7 @@ stored in `synapse.storage.schema`. from synapse.storage.data_stores import DataStores from synapse.storage.data_stores.main import DataStore from synapse.storage.persist_events import EventsPersistenceStorage +from synapse.storage.purge_events import PurgeEventsStorage from synapse.storage.state import StateGroupStorage __all__ = ["DataStores", "DataStore"] @@ -46,6 +47,7 @@ class Storage(object): self.main = stores.main self.persistence = EventsPersistenceStorage(hs, stores) + self.purge_events = PurgeEventsStorage(hs, stores) self.state = StateGroupStorage(hs, stores) diff --git a/synapse/storage/data_stores/main/events.py b/synapse/storage/data_stores/main/events.py index 301f8ea128..878f7568a6 100644 --- a/synapse/storage/data_stores/main/events.py +++ b/synapse/storage/data_stores/main/events.py @@ -1375,6 +1375,10 @@ class EventsStore( if True, we will delete local events as well as remote ones (instead of just marking them as outliers and deleting their state groups). + + Returns: + Deferred[set[int]]: The set of state groups that are referenced by + deleted events. """ return self.runInteraction( @@ -1511,11 +1515,10 @@ class EventsStore( [(room_id, event_id) for event_id, in new_backwards_extrems], ) - logger.info("[purge] finding redundant state groups") + logger.info("[purge] finding state groups referenced by deleted events") # Get all state groups that are referenced by events that are to be - # deleted. We then go and check if they are referenced by other events - # or state groups, and if not we delete them. + # deleted. txn.execute( """ SELECT DISTINCT state_group FROM events_to_purge @@ -1528,60 +1531,6 @@ class EventsStore( "[purge] found %i referenced state groups", len(referenced_state_groups) ) - logger.info("[purge] finding state groups that can be deleted") - - _ = self._find_unreferenced_groups_during_purge(txn, referenced_state_groups) - state_groups_to_delete, remaining_state_groups = _ - - logger.info( - "[purge] found %i state groups to delete", len(state_groups_to_delete) - ) - - logger.info( - "[purge] de-delta-ing %i remaining state groups", - len(remaining_state_groups), - ) - - # Now we turn the state groups that reference to-be-deleted state - # groups to non delta versions. - for sg in remaining_state_groups: - logger.info("[purge] de-delta-ing remaining state group %s", sg) - curr_state = self._get_state_groups_from_groups_txn(txn, [sg]) - curr_state = curr_state[sg] - - self._simple_delete_txn( - txn, table="state_groups_state", keyvalues={"state_group": sg} - ) - - self._simple_delete_txn( - txn, table="state_group_edges", keyvalues={"state_group": sg} - ) - - self._simple_insert_many_txn( - txn, - table="state_groups_state", - values=[ - { - "state_group": sg, - "room_id": room_id, - "type": key[0], - "state_key": key[1], - "event_id": state_id, - } - for key, state_id in iteritems(curr_state) - ], - ) - - logger.info("[purge] removing redundant state groups") - txn.executemany( - "DELETE FROM state_groups_state WHERE state_group = ?", - ((sg,) for sg in state_groups_to_delete), - ) - txn.executemany( - "DELETE FROM state_groups WHERE id = ?", - ((sg,) for sg in state_groups_to_delete), - ) - logger.info("[purge] removing events from event_to_state_groups") txn.execute( "DELETE FROM event_to_state_groups " @@ -1668,138 +1617,35 @@ class EventsStore( logger.info("[purge] done") - def _find_unreferenced_groups_during_purge(self, txn, state_groups): - """Used when purging history to figure out which state groups can be - deleted and which need to be de-delta'ed (due to one of its prev groups - being scheduled for deletion). - - Args: - txn - state_groups (set[int]): Set of state groups referenced by events - that are going to be deleted. - - Returns: - tuple[set[int], set[int]]: The set of state groups that can be - deleted and the set of state groups that need to be de-delta'ed - """ - # Graph of state group -> previous group - graph = {} - - # Set of events that we have found to be referenced by events - referenced_groups = set() - - # Set of state groups we've already seen - state_groups_seen = set(state_groups) - - # Set of state groups to handle next. - next_to_search = set(state_groups) - while next_to_search: - # We bound size of groups we're looking up at once, to stop the - # SQL query getting too big - if len(next_to_search) < 100: - current_search = next_to_search - next_to_search = set() - else: - current_search = set(itertools.islice(next_to_search, 100)) - next_to_search -= current_search - - # Check if state groups are referenced - sql = """ - SELECT DISTINCT state_group FROM event_to_state_groups - LEFT JOIN events_to_purge AS ep USING (event_id) - WHERE ep.event_id IS NULL AND - """ - clause, args = make_in_list_sql_clause( - txn.database_engine, "state_group", current_search - ) - txn.execute(sql + clause, list(args)) - - referenced = set(sg for sg, in txn) - referenced_groups |= referenced - - # We don't continue iterating up the state group graphs for state - # groups that are referenced. - current_search -= referenced - - rows = self._simple_select_many_txn( - txn, - table="state_group_edges", - column="prev_state_group", - iterable=current_search, - keyvalues={}, - retcols=("prev_state_group", "state_group"), - ) - - prevs = set(row["state_group"] for row in rows) - # We don't bother re-handling groups we've already seen - prevs -= state_groups_seen - next_to_search |= prevs - state_groups_seen |= prevs - - for row in rows: - # Note: Each state group can have at most one prev group - graph[row["state_group"]] = row["prev_state_group"] - - to_delete = state_groups_seen - referenced_groups - - to_dedelta = set() - for sg in referenced_groups: - prev_sg = graph.get(sg) - if prev_sg and prev_sg in to_delete: - to_dedelta.add(sg) - - return to_delete, to_dedelta + return referenced_state_groups def purge_room(self, room_id): """Deletes all record of a room Args: - room_id (str): + room_id (str) + + Returns: + Deferred[List[int]]: The list of state groups to delete. """ return self.runInteraction("purge_room", self._purge_room_txn, room_id) def _purge_room_txn(self, txn, room_id): - # first we have to delete the state groups states - logger.info("[purge] removing %s from state_groups_state", room_id) - + # First we fetch all the state groups that should be deleted, before + # we delete that information. txn.execute( """ - DELETE FROM state_groups_state WHERE state_group IN ( - SELECT state_group FROM events JOIN event_to_state_groups USING(event_id) - WHERE events.room_id=? - ) + SELECT DISTINCT state_group FROM events + INNER JOIN event_to_state_groups USING(event_id) + WHERE events.room_id = ? """, (room_id,), ) - # ... and the state group edges - logger.info("[purge] removing %s from state_group_edges", room_id) - - txn.execute( - """ - DELETE FROM state_group_edges WHERE state_group IN ( - SELECT state_group FROM events JOIN event_to_state_groups USING(event_id) - WHERE events.room_id=? - ) - """, - (room_id,), - ) - - # ... and the state groups - logger.info("[purge] removing %s from state_groups", room_id) - - txn.execute( - """ - DELETE FROM state_groups WHERE id IN ( - SELECT state_group FROM events JOIN event_to_state_groups USING(event_id) - WHERE events.room_id=? - ) - """, - (room_id,), - ) + state_groups = [row[0] for row in txn] - # and then tables which lack an index on room_id but have one on event_id + # Now we delete tables which lack an index on room_id but have one on event_id for table in ( "event_auth", "event_edges", @@ -1887,6 +1733,165 @@ class EventsStore( logger.info("[purge] done") + return state_groups + + def purge_unreferenced_state_groups( + self, room_id: str, state_groups_to_delete + ) -> defer.Deferred: + """Deletes no longer referenced state groups and de-deltas any state + groups that reference them. + + Args: + room_id: The room the state groups belong to (must all be in the + same room). + state_groups_to_delete (Collection[int]): Set of all state groups + to delete. + """ + + return self.runInteraction( + "purge_unreferenced_state_groups", + self._purge_unreferenced_state_groups, + room_id, + state_groups_to_delete, + ) + + def _purge_unreferenced_state_groups(self, txn, room_id, state_groups_to_delete): + logger.info( + "[purge] found %i state groups to delete", len(state_groups_to_delete) + ) + + rows = self._simple_select_many_txn( + txn, + table="state_group_edges", + column="prev_state_group", + iterable=state_groups_to_delete, + keyvalues={}, + retcols=("state_group",), + ) + + remaining_state_groups = set( + row["state_group"] + for row in rows + if row["state_group"] not in state_groups_to_delete + ) + + logger.info( + "[purge] de-delta-ing %i remaining state groups", + len(remaining_state_groups), + ) + + # Now we turn the state groups that reference to-be-deleted state + # groups to non delta versions. + for sg in remaining_state_groups: + logger.info("[purge] de-delta-ing remaining state group %s", sg) + curr_state = self._get_state_groups_from_groups_txn(txn, [sg]) + curr_state = curr_state[sg] + + self._simple_delete_txn( + txn, table="state_groups_state", keyvalues={"state_group": sg} + ) + + self._simple_delete_txn( + txn, table="state_group_edges", keyvalues={"state_group": sg} + ) + + self._simple_insert_many_txn( + txn, + table="state_groups_state", + values=[ + { + "state_group": sg, + "room_id": room_id, + "type": key[0], + "state_key": key[1], + "event_id": state_id, + } + for key, state_id in iteritems(curr_state) + ], + ) + + logger.info("[purge] removing redundant state groups") + txn.executemany( + "DELETE FROM state_groups_state WHERE state_group = ?", + ((sg,) for sg in state_groups_to_delete), + ) + txn.executemany( + "DELETE FROM state_groups WHERE id = ?", + ((sg,) for sg in state_groups_to_delete), + ) + + @defer.inlineCallbacks + def get_previous_state_groups(self, state_groups): + """Fetch the previous groups of the given state groups. + + Args: + state_groups (Iterable[int]) + + Returns: + Deferred[dict[int, int]]: mapping from state group to previous + state group. + """ + + rows = yield self._simple_select_many_batch( + table="state_group_edges", + column="prev_state_group", + iterable=state_groups, + keyvalues={}, + retcols=("prev_state_group", "state_group"), + desc="get_previous_state_groups", + ) + + return {row["state_group"]: row["prev_state_group"] for row in rows} + + def purge_room_state(self, room_id, state_groups_to_delete): + """Deletes all record of a room from state tables + + Args: + room_id (str): + state_groups_to_delete (list[int]): State groups to delete + """ + + return self.runInteraction( + "purge_room_state", + self._purge_room_state_txn, + room_id, + state_groups_to_delete, + ) + + def _purge_room_state_txn(self, txn, room_id, state_groups_to_delete): + # first we have to delete the state groups states + logger.info("[purge] removing %s from state_groups_state", room_id) + + self._simple_delete_many_txn( + txn, + table="state_groups_state", + column="state_group", + iterable=state_groups_to_delete, + keyvalues={}, + ) + + # ... and the state group edges + logger.info("[purge] removing %s from state_group_edges", room_id) + + self._simple_delete_many_txn( + txn, + table="state_group_edges", + column="state_group", + iterable=state_groups_to_delete, + keyvalues={}, + ) + + # ... and the state groups + logger.info("[purge] removing %s from state_groups", room_id) + + self._simple_delete_many_txn( + txn, + table="state_groups", + column="id", + iterable=state_groups_to_delete, + keyvalues={}, + ) + async def is_event_after(self, event_id1, event_id2): """Returns True if event_id1 is after event_id2 in the stream """ diff --git a/synapse/storage/data_stores/main/events_bg_updates.py b/synapse/storage/data_stores/main/events_bg_updates.py index 51352b9966..0ed59ef48e 100644 --- a/synapse/storage/data_stores/main/events_bg_updates.py +++ b/synapse/storage/data_stores/main/events_bg_updates.py @@ -21,6 +21,7 @@ from canonicaljson import json from twisted.internet import defer +from synapse.api.constants import EventContentFields from synapse.storage._base import make_in_list_sql_clause from synapse.storage.background_updates import BackgroundUpdateStore @@ -85,6 +86,10 @@ class EventsBackgroundUpdatesStore(BackgroundUpdateStore): "event_fix_redactions_bytes", self._event_fix_redactions_bytes ) + self.register_background_update_handler( + "event_store_labels", self._event_store_labels + ) + @defer.inlineCallbacks def _background_reindex_fields_sender(self, progress, batch_size): target_min_stream_id = progress["target_min_stream_id_inclusive"] @@ -503,3 +508,61 @@ class EventsBackgroundUpdatesStore(BackgroundUpdateStore): yield self._end_background_update("event_fix_redactions_bytes") return 1 + + @defer.inlineCallbacks + def _event_store_labels(self, progress, batch_size): + """Background update handler which will store labels for existing events.""" + last_event_id = progress.get("last_event_id", "") + + def _event_store_labels_txn(txn): + txn.execute( + """ + SELECT event_id, json FROM event_json + LEFT JOIN event_labels USING (event_id) + WHERE event_id > ? AND label IS NULL + ORDER BY event_id LIMIT ? + """, + (last_event_id, batch_size), + ) + + results = list(txn) + + nbrows = 0 + last_row_event_id = "" + for (event_id, event_json_raw) in results: + event_json = json.loads(event_json_raw) + + self._simple_insert_many_txn( + txn=txn, + table="event_labels", + values=[ + { + "event_id": event_id, + "label": label, + "room_id": event_json["room_id"], + "topological_ordering": event_json["depth"], + } + for label in event_json["content"].get( + EventContentFields.LABELS, [] + ) + if isinstance(label, str) + ], + ) + + nbrows += 1 + last_row_event_id = event_id + + self._background_update_progress_txn( + txn, "event_store_labels", {"last_event_id": last_row_event_id} + ) + + return nbrows + + num_rows = yield self.runInteraction( + desc="event_store_labels", func=_event_store_labels_txn + ) + + if not num_rows: + yield self._end_background_update("event_store_labels") + + return num_rows diff --git a/synapse/storage/data_stores/main/group_server.py b/synapse/storage/data_stores/main/group_server.py index b3a2771f1b..5ded539af8 100644 --- a/synapse/storage/data_stores/main/group_server.py +++ b/synapse/storage/data_stores/main/group_server.py @@ -553,6 +553,21 @@ class GroupServerStore(SQLBaseStore): desc="remove_user_from_summary", ) + def get_local_groups_for_room(self, room_id): + """Get all of the local group that contain a given room + Args: + room_id (str): The ID of a room + Returns: + Deferred[list[str]]: A twisted.Deferred containing a list of group ids + containing this room + """ + return self._simple_select_onecol( + table="group_rooms", + keyvalues={"room_id": room_id}, + retcol="group_id", + desc="get_local_groups_for_room", + ) + def get_users_for_summary_by_role(self, group_id, include_private=False): """Get the users and roles that should be included in a summary request diff --git a/synapse/storage/data_stores/main/registration.py b/synapse/storage/data_stores/main/registration.py index f70d41ecab..ee1b2b2bbf 100644 --- a/synapse/storage/data_stores/main/registration.py +++ b/synapse/storage/data_stores/main/registration.py @@ -488,14 +488,14 @@ class RegistrationWorkerStore(SQLBaseStore): we can. Unfortunately, it's possible some of them are already taken by existing users, and there may be gaps in the already taken range. This function returns the start of the first allocatable gap. This is to - avoid the case of ID 10000000 being pre-allocated, so us wasting the - first (and shortest) many generated user IDs. + avoid the case of ID 1000 being pre-allocated and starting at 1001 while + 0-999 are available. """ def _find_next_generated_user_id(txn): - # We bound between '@1' and '@a' to avoid pulling the entire table + # We bound between '@0' and '@a' to avoid pulling the entire table # out. - txn.execute("SELECT name FROM users WHERE '@1' <= name AND name < '@a'") + txn.execute("SELECT name FROM users WHERE '@0' <= name AND name < '@a'") regex = re.compile(r"^@(\d+):") diff --git a/synapse/storage/data_stores/main/schema/delta/56/event_labels_background_update.sql b/synapse/storage/data_stores/main/schema/delta/56/event_labels_background_update.sql new file mode 100644 index 0000000000..5f5e0499ae --- /dev/null +++ b/synapse/storage/data_stores/main/schema/delta/56/event_labels_background_update.sql @@ -0,0 +1,17 @@ +/* Copyright 2019 The Matrix.org Foundation C.I.C. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +INSERT INTO background_updates (update_name, progress_json) VALUES + ('event_store_labels', '{}'); diff --git a/synapse/storage/data_stores/main/state.py b/synapse/storage/data_stores/main/state.py index 9e1541988e..6a90daea31 100644 --- a/synapse/storage/data_stores/main/state.py +++ b/synapse/storage/data_stores/main/state.py @@ -285,7 +285,11 @@ class StateGroupWorkerStore( room_id (str) Returns: - Deferred[unicode|None]: predecessor room id + Deferred[dict|None]: A dictionary containing the structure of the predecessor + field from the room's create event. The structure is subject to other servers, + but it is expected to be: + * room_id (str): The room ID of the predecessor room + * event_id (str): The ID of the tombstone event in the predecessor room Raises: NotFoundError if the room is unknown @@ -991,6 +995,29 @@ class StateGroupWorkerStore( return self.runInteraction("store_state_group", _store_state_group_txn) + @defer.inlineCallbacks + def get_referenced_state_groups(self, state_groups): + """Check if the state groups are referenced by events. + + Args: + state_groups (Iterable[int]) + + Returns: + Deferred[set[int]]: The subset of state groups that are + referenced. + """ + + rows = yield self._simple_select_many_batch( + table="event_to_state_groups", + column="state_group", + iterable=state_groups, + keyvalues={}, + retcols=("DISTINCT state_group",), + desc="get_referenced_state_groups", + ) + + return set(row["state_group"] for row in rows) + class StateBackgroundUpdateStore( StateGroupBackgroundUpdateStore, BackgroundUpdateStore diff --git a/synapse/storage/purge_events.py b/synapse/storage/purge_events.py new file mode 100644 index 0000000000..a368182034 --- /dev/null +++ b/synapse/storage/purge_events.py @@ -0,0 +1,117 @@ +# -*- coding: utf-8 -*- +# Copyright 2019 The Matrix.org Foundation C.I.C. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import itertools +import logging + +from twisted.internet import defer + +logger = logging.getLogger(__name__) + + +class PurgeEventsStorage(object): + """High level interface for purging rooms and event history. + """ + + def __init__(self, hs, stores): + self.stores = stores + + @defer.inlineCallbacks + def purge_room(self, room_id: str): + """Deletes all record of a room + """ + + state_groups_to_delete = yield self.stores.main.purge_room(room_id) + yield self.stores.main.purge_room_state(room_id, state_groups_to_delete) + + @defer.inlineCallbacks + def purge_history(self, room_id, token, delete_local_events): + """Deletes room history before a certain point + + Args: + room_id (str): + + token (str): A topological token to delete events before + + delete_local_events (bool): + if True, we will delete local events as well as remote ones + (instead of just marking them as outliers and deleting their + state groups). + """ + state_groups = yield self.stores.main.purge_history( + room_id, token, delete_local_events + ) + + logger.info("[purge] finding state groups that can be deleted") + + sg_to_delete = yield self._find_unreferenced_groups(state_groups) + + yield self.stores.main.purge_unreferenced_state_groups(room_id, sg_to_delete) + + @defer.inlineCallbacks + def _find_unreferenced_groups(self, state_groups): + """Used when purging history to figure out which state groups can be + deleted. + + Args: + state_groups (set[int]): Set of state groups referenced by events + that are going to be deleted. + + Returns: + Deferred[set[int]] The set of state groups that can be deleted. + """ + # Graph of state group -> previous group + graph = {} + + # Set of events that we have found to be referenced by events + referenced_groups = set() + + # Set of state groups we've already seen + state_groups_seen = set(state_groups) + + # Set of state groups to handle next. + next_to_search = set(state_groups) + while next_to_search: + # We bound size of groups we're looking up at once, to stop the + # SQL query getting too big + if len(next_to_search) < 100: + current_search = next_to_search + next_to_search = set() + else: + current_search = set(itertools.islice(next_to_search, 100)) + next_to_search -= current_search + + referenced = yield self.stores.main.get_referenced_state_groups( + current_search + ) + referenced_groups |= referenced + + # We don't continue iterating up the state group graphs for state + # groups that are referenced. + current_search -= referenced + + edges = yield self.stores.main.get_previous_state_groups(current_search) + + prevs = set(edges.values()) + # We don't bother re-handling groups we've already seen + prevs -= state_groups_seen + next_to_search |= prevs + state_groups_seen |= prevs + + graph.update(edges) + + to_delete = state_groups_seen - referenced_groups + + return to_delete diff --git a/synapse/util/caches/descriptors.py b/synapse/util/caches/descriptors.py index 0e8da27f53..84f5ae22c3 100644 --- a/synapse/util/caches/descriptors.py +++ b/synapse/util/caches/descriptors.py @@ -17,8 +17,8 @@ import functools import inspect import logging import threading -from collections import namedtuple -from typing import Any, cast +from typing import Any, Tuple, Union, cast +from weakref import WeakValueDictionary from six import itervalues @@ -38,6 +38,8 @@ from . import register_cache logger = logging.getLogger(__name__) +CacheKey = Union[Tuple, Any] + class _CachedFunction(Protocol): invalidate = None # type: Any @@ -430,7 +432,7 @@ class CacheDescriptor(_CacheDescriptorBase): # Add our own `cache_context` to argument list if the wrapped function # has asked for one if self.add_cache_context: - kwargs["cache_context"] = _CacheContext(cache, cache_key) + kwargs["cache_context"] = _CacheContext.get_instance(cache, cache_key) try: cached_result_d = cache.get(cache_key, callback=invalidate_callback) @@ -624,14 +626,38 @@ class CacheListDescriptor(_CacheDescriptorBase): return wrapped -class _CacheContext(namedtuple("_CacheContext", ("cache", "key"))): - # We rely on _CacheContext implementing __eq__ and __hash__ sensibly, - # which namedtuple does for us (i.e. two _CacheContext are the same if - # their caches and keys match). This is important in particular to - # dedupe when we add callbacks to lru cache nodes, otherwise the number - # of callbacks would grow. - def invalidate(self): - self.cache.invalidate(self.key) +class _CacheContext: + """Holds cache information from the cached function higher in the calling order. + + Can be used to invalidate the higher level cache entry if something changes + on a lower level. + """ + + _cache_context_objects = ( + WeakValueDictionary() + ) # type: WeakValueDictionary[Tuple[Cache, CacheKey], _CacheContext] + + def __init__(self, cache, cache_key): # type: (Cache, CacheKey) -> None + self._cache = cache + self._cache_key = cache_key + + def invalidate(self): # type: () -> None + """Invalidates the cache entry referred to by the context.""" + self._cache.invalidate(self._cache_key) + + @classmethod + def get_instance(cls, cache, cache_key): # type: (Cache, CacheKey) -> _CacheContext + """Returns an instance constructed with the given arguments. + + A new instance is only created if none already exists. + """ + + # We make sure there are no identical _CacheContext instances. This is + # important in particular to dedupe when we add callbacks to lru cache + # nodes, otherwise the number of callbacks would grow. + return cls._cache_context_objects.setdefault( + (cache, cache_key), cls(cache, cache_key) + ) def cached( diff --git a/tests/rest/admin/test_admin.py b/tests/rest/admin/test_admin.py index 8e1ca8b738..d9f1b95cb0 100644 --- a/tests/rest/admin/test_admin.py +++ b/tests/rest/admin/test_admin.py @@ -628,10 +628,12 @@ class PurgeRoomTestCase(unittest.HomeserverTestCase): "local_invites", "room_account_data", "room_tags", + "state_groups", + "state_groups_state", ): count = self.get_success( self.store._simple_select_one_onecol( - table="events", + table=table, keyvalues={"room_id": room_id}, retcol="COUNT(*)", desc="test_purge_room", diff --git a/tests/storage/test_purge.py b/tests/storage/test_purge.py index f671599cb8..b9fafaa1a6 100644 --- a/tests/storage/test_purge.py +++ b/tests/storage/test_purge.py @@ -40,23 +40,24 @@ class PurgeTests(HomeserverTestCase): third = self.helper.send(self.room_id, body="test3") last = self.helper.send(self.room_id, body="test4") - storage = self.hs.get_datastore() + store = self.hs.get_datastore() + storage = self.hs.get_storage() # Get the topological token - event = storage.get_topological_token_for_event(last["event_id"]) + event = store.get_topological_token_for_event(last["event_id"]) self.pump() event = self.successResultOf(event) # Purge everything before this topological token - purge = storage.purge_history(self.room_id, event, True) + purge = storage.purge_events.purge_history(self.room_id, event, True) self.pump() self.assertEqual(self.successResultOf(purge), None) # Try and get the events - get_first = storage.get_event(first["event_id"]) - get_second = storage.get_event(second["event_id"]) - get_third = storage.get_event(third["event_id"]) - get_last = storage.get_event(last["event_id"]) + get_first = store.get_event(first["event_id"]) + get_second = store.get_event(second["event_id"]) + get_third = store.get_event(third["event_id"]) + get_last = store.get_event(last["event_id"]) self.pump() # 1-3 should fail and last will succeed, meaning that 1-3 are deleted diff --git a/tox.ini b/tox.ini index afe9bc909b..62b350ea6a 100644 --- a/tox.ini +++ b/tox.ini @@ -1,5 +1,5 @@ [tox] -envlist = packaging, py35, py36, py37, check_codestyle, check_isort +envlist = packaging, py35, py36, py37, py38, check_codestyle, check_isort [base] basepython = python3.7 |