diff --git a/synapse/storage/state.py b/synapse/storage/state.py
index ec551b0b4f..589a4fec6e 100644
--- a/synapse/storage/state.py
+++ b/synapse/storage/state.py
@@ -16,6 +16,7 @@
from ._base import SQLBaseStore
from synapse.util.caches.descriptors import cached, cachedList
from synapse.util.caches import intern_string
+from synapse.storage.engines import PostgresEngine
from twisted.internet import defer
@@ -24,6 +25,9 @@ import logging
logger = logging.getLogger(__name__)
+MAX_STATE_DELTA_HOPS = 100
+
+
class StateStore(SQLBaseStore):
""" Keeps track of the state at a given event.
@@ -43,6 +47,15 @@ class StateStore(SQLBaseStore):
* `state_groups_state`: Maps state group to state events.
"""
+ STATE_GROUP_DEDUPLICATION_UPDATE_NAME = "state_group_state_deduplication"
+
+ def __init__(self, hs):
+ super(StateStore, self).__init__(hs)
+ self.register_background_update_handler(
+ self.STATE_GROUP_DEDUPLICATION_UPDATE_NAME,
+ self._background_deduplicate_state,
+ )
+
@defer.inlineCallbacks
def get_state_groups_ids(self, room_id, event_ids):
if not event_ids:
@@ -103,11 +116,8 @@ class StateStore(SQLBaseStore):
state_groups[event.event_id] = context.state_group
if self._have_persisted_state_group_txn(txn, context.state_group):
- logger.info("Already persisted state_group: %r", context.state_group)
continue
- state_event_ids = dict(context.current_state_ids)
-
self._simple_insert_txn(
txn,
table="state_groups",
@@ -118,20 +128,51 @@ class StateStore(SQLBaseStore):
},
)
- self._simple_insert_many_txn(
- txn,
- table="state_groups_state",
- values=[
- {
+ # We persist as a delta if we can, while also ensuring the chain
+ # of deltas isn't tooo long, as otherwise read performance degrades.
+ if context.prev_group:
+ potential_hops = self._count_state_group_hops_txn(
+ txn, context.prev_group
+ )
+ if context.prev_group and potential_hops < MAX_STATE_DELTA_HOPS:
+ self._simple_insert_txn(
+ txn,
+ table="state_group_edges",
+ values={
"state_group": context.state_group,
- "room_id": event.room_id,
- "type": key[0],
- "state_key": key[1],
- "event_id": state_id,
- }
- for key, state_id in state_event_ids.items()
- ],
- )
+ "prev_state_group": context.prev_group,
+ },
+ )
+
+ self._simple_insert_many_txn(
+ txn,
+ table="state_groups_state",
+ values=[
+ {
+ "state_group": context.state_group,
+ "room_id": event.room_id,
+ "type": key[0],
+ "state_key": key[1],
+ "event_id": state_id,
+ }
+ for key, state_id in context.delta_ids.items()
+ ],
+ )
+ else:
+ self._simple_insert_many_txn(
+ txn,
+ table="state_groups_state",
+ values=[
+ {
+ "state_group": context.state_group,
+ "room_id": event.room_id,
+ "type": key[0],
+ "state_key": key[1],
+ "event_id": state_id,
+ }
+ for key, state_id in context.current_state_ids.items()
+ ],
+ )
self._simple_insert_many_txn(
txn,
@@ -145,6 +186,47 @@ class StateStore(SQLBaseStore):
],
)
+ def _count_state_group_hops_txn(self, txn, state_group):
+ """Given a state group, count how many hops there are in the tree.
+
+ This is used to ensure the delta chains don't get too long.
+ """
+ if isinstance(self.database_engine, PostgresEngine):
+ sql = ("""
+ WITH RECURSIVE state(state_group) AS (
+ VALUES(?::bigint)
+ UNION ALL
+ SELECT prev_state_group FROM state_group_edges e, state s
+ WHERE s.state_group = e.state_group
+ )
+ SELECT count(*) FROM state;
+ """)
+
+ txn.execute(sql, (state_group,))
+ row = txn.fetchone()
+ if row and row[0]:
+ return row[0]
+ else:
+ return 0
+ else:
+ # We don't use WITH RECURSIVE on sqlite3 as there are distributions
+ # that ship with an sqlite3 version that doesn't support it (e.g. wheezy)
+ next_group = state_group
+ count = 0
+
+ while next_group:
+ next_group = self._simple_select_one_onecol_txn(
+ txn,
+ table="state_group_edges",
+ keyvalues={"state_group": next_group},
+ retcol="prev_state_group",
+ allow_none=True,
+ )
+ if next_group:
+ count += 1
+
+ return count
+
@defer.inlineCallbacks
def get_current_state(self, room_id, event_type=None, state_key=""):
if event_type and state_key is not None:
@@ -206,48 +288,99 @@ class StateStore(SQLBaseStore):
def _get_state_groups_from_groups(self, groups, types):
"""Returns dictionary state_group -> (dict of (type, state_key) -> event id)
"""
- def f(txn, groups):
- if types is not None:
- where_clause = "AND (%s)" % (
- " OR ".join(["(type = ? AND state_key = ?)"] * len(types)),
- )
- else:
- where_clause = ""
-
- sql = (
- "SELECT state_group, event_id, type, state_key"
- " FROM state_groups_state WHERE"
- " state_group IN (%s) %s" % (
- ",".join("?" for _ in groups),
- where_clause,
- )
- )
-
- args = list(groups)
- if types is not None:
- args.extend([i for typ in types for i in typ])
-
- txn.execute(sql, args)
- rows = self.cursor_to_dict(txn)
-
- results = {group: {} for group in groups}
- for row in rows:
- key = (row["type"], row["state_key"])
- results[row["state_group"]][key] = row["event_id"]
- return results
-
results = {}
chunks = [groups[i:i + 100] for i in xrange(0, len(groups), 100)]
for chunk in chunks:
res = yield self.runInteraction(
"_get_state_groups_from_groups",
- f, chunk
+ self._get_state_groups_from_groups_txn, chunk, types,
)
results.update(res)
defer.returnValue(results)
+ def _get_state_groups_from_groups_txn(self, txn, groups, types=None):
+ if types is not None:
+ where_clause = "AND (%s)" % (
+ " OR ".join(["(type = ? AND state_key = ?)"] * len(types)),
+ )
+ else:
+ where_clause = ""
+
+ results = {group: {} for group in groups}
+ if isinstance(self.database_engine, PostgresEngine):
+ # The below query walks the state_group tree so that the "state"
+ # table includes all state_groups in the tree. It then joins
+ # against `state_groups_state` to fetch the latest state.
+ # It assumes that previous state groups are always numerically
+ # lesser.
+ sql = ("""
+ WITH RECURSIVE state(state_group) AS (
+ VALUES(?::bigint)
+ UNION ALL
+ SELECT prev_state_group FROM state_group_edges e, state s
+ WHERE s.state_group = e.state_group
+ )
+ SELECT type, state_key, event_id FROM state_groups_state
+ WHERE ROW(type, state_key, state_group) IN (
+ SELECT type, state_key, max(state_group) FROM state
+ INNER JOIN state_groups_state USING (state_group)
+ GROUP BY type, state_key
+ )
+ %s;
+ """) % (where_clause,)
+
+ for group in groups:
+ args = [group]
+ if types is not None:
+ args.extend([i for typ in types for i in typ])
+
+ txn.execute(sql, args)
+ rows = self.cursor_to_dict(txn)
+ for row in rows:
+ key = (row["type"], row["state_key"])
+ results[group][key] = row["event_id"]
+ else:
+ # We don't use WITH RECURSIVE on sqlite3 as there are distributions
+ # that ship with an sqlite3 version that doesn't support it (e.g. wheezy)
+ for group in groups:
+ group_tree = [group]
+ next_group = group
+
+ while next_group:
+ next_group = self._simple_select_one_onecol_txn(
+ txn,
+ table="state_group_edges",
+ keyvalues={"state_group": next_group},
+ retcol="prev_state_group",
+ allow_none=True,
+ )
+ if next_group:
+ group_tree.append(next_group)
+
+ sql = ("""
+ SELECT type, state_key, event_id FROM state_groups_state
+ INNER JOIN (
+ SELECT type, state_key, max(state_group) as state_group
+ FROM state_groups_state
+ WHERE state_group IN (%s) %s
+ GROUP BY type, state_key
+ ) USING (type, state_key, state_group);
+ """) % (",".join("?" for _ in group_tree), where_clause,)
+
+ args = list(group_tree)
+ if types is not None:
+ args.extend([i for typ in types for i in typ])
+
+ txn.execute(sql, args)
+ rows = self.cursor_to_dict(txn)
+ for row in rows:
+ key = (row["type"], row["state_key"])
+ results[group][key] = row["event_id"]
+
+ return results
+
@defer.inlineCallbacks
def get_state_for_events(self, event_ids, types):
"""Given a list of event_ids and type tuples, return a list of state
@@ -504,32 +637,127 @@ class StateStore(SQLBaseStore):
defer.returnValue(results)
- def get_all_new_state_groups(self, last_id, current_id, limit):
- def get_all_new_state_groups_txn(txn):
- sql = (
- "SELECT id, room_id, event_id FROM state_groups"
- " WHERE ? < id AND id <= ? ORDER BY id LIMIT ?"
+ def get_next_state_group(self):
+ return self._state_groups_id_gen.get_next()
+
+ @defer.inlineCallbacks
+ def _background_deduplicate_state(self, progress, batch_size):
+ """This background update will slowly deduplicate state by reencoding
+ them as deltas.
+ """
+ last_state_group = progress.get("last_state_group", 0)
+ rows_inserted = progress.get("rows_inserted", 0)
+ max_group = progress.get("max_group", None)
+
+ if max_group is None:
+ rows = yield self._execute(
+ "_background_deduplicate_state", None,
+ "SELECT coalesce(max(id), 0) FROM state_groups",
)
- txn.execute(sql, (last_id, current_id, limit))
- groups = txn.fetchall()
+ max_group = rows[0][0]
+
+ def reindex_txn(txn):
+ new_last_state_group = last_state_group
+ for count in xrange(batch_size):
+ txn.execute(
+ "SELECT id, room_id FROM state_groups"
+ " WHERE ? < id AND id <= ?"
+ " ORDER BY id ASC"
+ " LIMIT 1",
+ (new_last_state_group, max_group,)
+ )
+ row = txn.fetchone()
+ if row:
+ state_group, room_id = row
- if not groups:
- return ([], [])
+ if not row or not state_group:
+ return True, count
- lower_bound = groups[0][0]
- upper_bound = groups[-1][0]
- sql = (
- "SELECT state_group, type, state_key, event_id"
- " FROM state_groups_state"
- " WHERE ? <= state_group AND state_group <= ?"
+ txn.execute(
+ "SELECT coalesce(max(id), 0) FROM state_groups"
+ " WHERE id < ? AND room_id = ?",
+ (state_group, room_id,)
+ )
+ prev_group, = txn.fetchone()
+ new_last_state_group = state_group
+
+ if prev_group:
+ potential_hops = self._count_state_group_hops_txn(
+ txn, prev_group
+ )
+ if potential_hops >= MAX_STATE_DELTA_HOPS:
+ # We want to ensure chains are at most this long,#
+ # otherwise read performance degrades.
+ continue
+
+ prev_state = self._get_state_groups_from_groups_txn(
+ txn, [prev_group], types=None
+ )
+ prev_state = prev_state[prev_group]
+
+ curr_state = self._get_state_groups_from_groups_txn(
+ txn, [state_group], types=None
+ )
+ curr_state = curr_state[state_group]
+
+ if not set(prev_state.keys()) - set(curr_state.keys()):
+ # We can only do a delta if the current has a strict super set
+ # of keys
+
+ delta_state = {
+ key: value for key, value in curr_state.items()
+ if prev_state.get(key, None) != value
+ }
+
+ self._simple_insert_txn(
+ txn,
+ table="state_group_edges",
+ values={
+ "state_group": state_group,
+ "prev_state_group": prev_group,
+ }
+ )
+
+ self._simple_delete_txn(
+ txn,
+ table="state_groups_state",
+ keyvalues={
+ "state_group": state_group,
+ }
+ )
+
+ self._simple_insert_many_txn(
+ txn,
+ table="state_groups_state",
+ values=[
+ {
+ "state_group": state_group,
+ "room_id": room_id,
+ "type": key[0],
+ "state_key": key[1],
+ "event_id": state_id,
+ }
+ for key, state_id in delta_state.items()
+ ],
+ )
+
+ progress = {
+ "last_state_group": state_group,
+ "rows_inserted": rows_inserted + batch_size,
+ "max_group": max_group,
+ }
+
+ self._background_update_progress_txn(
+ txn, self.STATE_GROUP_DEDUPLICATION_UPDATE_NAME, progress
)
- txn.execute(sql, (lower_bound, upper_bound))
- state_group_state = txn.fetchall()
- return (groups, state_group_state)
- return self.runInteraction(
- "get_all_new_state_groups", get_all_new_state_groups_txn
+ return False, batch_size
+
+ finished, result = yield self.runInteraction(
+ self.STATE_GROUP_DEDUPLICATION_UPDATE_NAME, reindex_txn
)
- def get_next_state_group(self):
- return self._state_groups_id_gen.get_next()
+ if finished:
+ yield self._end_background_update(self.STATE_GROUP_DEDUPLICATION_UPDATE_NAME)
+
+ defer.returnValue(result)
|