diff --git a/synapse/storage/events.py b/synapse/storage/events.py
index 0fb190530a..e4d0f8b1a9 100644
--- a/synapse/storage/events.py
+++ b/synapse/storage/events.py
@@ -37,6 +37,7 @@ from synapse.metrics.background_process_metrics import run_as_background_process
from synapse.storage.background_updates import BackgroundUpdateStore
from synapse.storage.event_federation import EventFederationStore
from synapse.storage.events_worker import EventsWorkerStore
+from synapse.storage.state import StateGroupWorkerStore
from synapse.types import RoomStreamToken, get_domain_from_id
from synapse.util.async_helpers import ObservableDeferred
from synapse.util.caches.descriptors import cached, cachedInlineCallbacks
@@ -203,7 +204,8 @@ def _retry_on_integrity_error(func):
# inherits from EventFederationStore so that we can call _update_backward_extremities
# and _handle_mult_prev_events (though arguably those could both be moved in here)
-class EventsStore(EventFederationStore, EventsWorkerStore, BackgroundUpdateStore):
+class EventsStore(StateGroupWorkerStore, EventFederationStore, EventsWorkerStore,
+ BackgroundUpdateStore):
EVENT_ORIGIN_SERVER_TS_NAME = "event_origin_server_ts"
EVENT_FIELDS_SENDER_URL_UPDATE_NAME = "event_fields_sender_url"
@@ -1995,70 +1997,29 @@ class EventsStore(EventFederationStore, EventsWorkerStore, BackgroundUpdateStore
logger.info("[purge] finding redundant state groups")
- # Get all state groups that are only referenced by events that are
- # to be deleted.
- # This works by first getting state groups that we may want to delete,
- # joining against event_to_state_groups to get events that use that
- # state group, then left joining against events_to_purge again. Any
- # state group where the left join produce *no nulls* are referenced
- # only by events that are going to be purged.
+ # Get all state groups that are referenced by events that are to be
+ # deleted. We then go and check if they are referenced by other events
+ # or state groups, and if not we delete them.
txn.execute("""
- SELECT state_group FROM
- (
- SELECT DISTINCT state_group FROM events_to_purge
- INNER JOIN event_to_state_groups USING (event_id)
- ) AS sp
- INNER JOIN event_to_state_groups USING (state_group)
- LEFT JOIN events_to_purge AS ep USING (event_id)
- GROUP BY state_group
- HAVING SUM(CASE WHEN ep.event_id IS NULL THEN 1 ELSE 0 END) = 0
+ SELECT DISTINCT state_group FROM events_to_purge
+ INNER JOIN event_to_state_groups USING (event_id)
""")
- state_rows = txn.fetchall()
- logger.info("[purge] found %i redundant state groups", len(state_rows))
-
- # make a set of the redundant state groups, so that we can look them up
- # efficiently
- state_groups_to_delete = set([sg for sg, in state_rows])
-
- # Now we get all the state groups that rely on these state groups
- logger.info("[purge] finding state groups which depend on redundant"
- " state groups")
- remaining_state_groups = []
- unreferenced_state_groups = 0
- for i in range(0, len(state_rows), 100):
- chunk = [sg for sg, in state_rows[i:i + 100]]
- # look for state groups whose prev_state_group is one we are about
- # to delete
- rows = self._simple_select_many_txn(
- txn,
- table="state_group_edges",
- column="prev_state_group",
- iterable=chunk,
- retcols=["state_group"],
- keyvalues={},
- )
-
- for row in rows:
- sg = row["state_group"]
-
- if sg in state_groups_to_delete:
- # exclude state groups we are about to delete: no point in
- # updating them
- continue
+ referenced_state_groups = set(sg for sg, in txn)
+ logger.info(
+ "[purge] found %i referenced state groups",
+ len(referenced_state_groups),
+ )
- if not self._is_state_group_referenced(txn, sg):
- # Let's also delete unreferenced state groups while we're
- # here, since otherwise we'd need to de-delta them
- state_groups_to_delete.add(sg)
- unreferenced_state_groups += 1
- continue
+ logger.info("[purge] finding state groups that can be deleted")
- remaining_state_groups.append(sg)
+ state_groups_to_delete, remaining_state_groups = self._find_unreferenced_groups(
+ txn, referenced_state_groups,
+ )
logger.info(
- "[purge] found %i extra unreferenced state groups to delete",
- unreferenced_state_groups,
+ "[purge] found %i state groups to delete",
+ len(state_groups_to_delete),
)
logger.info(
@@ -2109,11 +2070,11 @@ class EventsStore(EventFederationStore, EventsWorkerStore, BackgroundUpdateStore
logger.info("[purge] removing redundant state groups")
txn.executemany(
"DELETE FROM state_groups_state WHERE state_group = ?",
- state_rows
+ ((sg,) for sg in state_groups_to_delete),
)
txn.executemany(
"DELETE FROM state_groups WHERE id = ?",
- state_rows
+ ((sg,) for sg in state_groups_to_delete),
)
logger.info("[purge] removing events from event_to_state_groups")
diff --git a/synapse/storage/state.py b/synapse/storage/state.py
index f7cf5c86c9..0f86311ed4 100644
--- a/synapse/storage/state.py
+++ b/synapse/storage/state.py
@@ -1041,55 +1041,85 @@ class StateGroupWorkerStore(EventsWorkerStore, SQLBaseStore):
return count
- def _is_state_group_referenced(self, txn, state_group):
- """Checks if a given state group is referenced, or is safe to delete.
+ def _find_unreferenced_groups(self, txn, state_groups):
+ """Used when purging history to figure out which state groups can be
+ deleted and which need to be de-delta'ed (due to one of its prev groups
+ being scheduled for deletion).
- A state group is referenced if it or any of its descendants are
- pointed at by an event. (A descendant is a state_group whose chain of
- prev_groups includes the given state_group.)
- """
-
- # We check this by doing a depth first search to look for any
- # descendant referenced by `event_to_state_groups`.
-
- # State groups we need to check, contains state groups that are
- # descendants of `state_group`
- state_groups_to_search = [state_group]
-
- # Set of state groups we've already checked
- state_groups_searched = set()
-
- while state_groups_to_search:
- state_group = state_groups_to_search.pop() # Next state group to check
+ Args:
+ txn
+ state_groups (set[int]): Set of state groups referenced by events
+ that are going to be deleted.
- is_referenced = self._simple_select_one_onecol_txn(
+ Returns:
+ tuple[set[int], set[int]]: The set of state groups that can be
+ deleted and the set of state groups that need to be de-delta'ed
+ """
+ # Graph of state group -> previous group
+ graph = {}
+
+ # Set of events that we have found to be referenced by events
+ referenced_groups = set()
+
+ # Set of state groups we've already seen
+ state_groups_seen = set(state_groups)
+
+ # Set of state groups to handle next.
+ next_to_search = set(state_groups)
+ while next_to_search:
+ # We bound size of groups we're looking up at once, to stop the
+ # SQL query getting too big
+ if len(next_to_search) < 100:
+ current_search = next_to_search
+ next_to_search = set()
+ else:
+ lst = list(next_to_search)
+ current_search = set(lst[:100])
+ next_to_search = set(lst[100:])
+
+ # Check if state groups are referenced
+ sql = """
+ SELECT state_group, count(*) FROM event_to_state_groups
+ LEFT JOIN events_to_purge AS ep USING (event_id)
+ WHERE state_group IN (%s) AND ep.event_id IS NULL
+ GROUP BY state_group
+ """ % (",".join("?" for _ in current_search),)
+ txn.execute(sql, list(current_search))
+
+ referenced = set(sg for sg, cnt in txn if cnt > 0)
+ referenced_groups |= referenced
+
+ # We don't continue iterating up the state group graphs for state
+ # groups that are referenced.
+ current_search -= referenced
+
+ rows = self._simple_select_many_txn(
txn,
- table="event_to_state_groups",
- keyvalues={"state_group": state_group},
- retcol="event_id",
- allow_none=True,
+ table="state_group_edges",
+ column="prev_state_group",
+ iterable=current_search,
+ keyvalues={},
+ retcols=("prev_state_group", "state_group",),
)
- if is_referenced:
- # A descendant is referenced by event_to_state_groups, so
- # original state group is referenced.
- return True
- state_groups_searched.add(state_group)
+ next_to_search.update(row["state_group"] for row in rows)
+ # We don't bother re-handling groups we've already seen
+ next_to_search -= state_groups_seen
+ state_groups_seen |= next_to_search
- # Find all children of current state group and add to search
- references = self._simple_select_onecol_txn(
- txn,
- table="state_group_edges",
- keyvalues={"prev_state_group": state_group},
- retcol="state_group",
- )
- state_groups_to_search.extend(references)
+ for row in rows:
+ # Note: Each state group can have at most one prev group
+ graph[row["state_group"]] = row["prev_state_group"]
+
+ to_delete = state_groups_seen - referenced_groups
- # Lets be paranoid and check for cycles
- if state_groups_searched.intersection(references):
- raise Exception("State group %s has cyclic dependency", state_group)
+ to_dedelta = set()
+ for sg in referenced_groups:
+ prev_sg = graph.get(sg)
+ if prev_sg and prev_sg in to_delete:
+ to_dedelta.add(sg)
- return False
+ return to_delete, to_dedelta
class StateStore(StateGroupWorkerStore, BackgroundUpdateStore):
|