diff options
author | Matthew Hodgson <matthew@matrix.org> | 2016-09-26 01:18:41 +0100 |
---|---|---|
committer | Matthew Hodgson <matthew@matrix.org> | 2016-09-26 01:18:41 +0100 |
commit | cc4944c83e25c3fd622d0d553a6d3fa585f18a47 (patch) | |
tree | 96d09e4af8f9c6de90a29e190d373b87bbb87dda | |
parent | typo (diff) | |
download | synapse-github/matthew/speed-up-dedup.tar.xz |
speed up state deduplication migrations on sqlite github/matthew/speed-up-dedup matthew/speed-up-dedup
Remove the self-inner-join and instead abuse sqlite's quirk that it allows us to mix aggregate & non-aggregate columns in a grouped select to return the event_ids of the events in a state group: see http://marc.info/?l=sqlite-users&m=141460824410980&w=2. This speeds up these queries by around 25x, drastically improving performance when sqlite is upgraded to 0.18.
-rw-r--r-- | synapse/storage/state.py | 16 |
1 files changed, 9 insertions, 7 deletions
diff --git a/synapse/storage/state.py b/synapse/storage/state.py index 7eb342674c..e1b9ceb776 100644 --- a/synapse/storage/state.py +++ b/synapse/storage/state.py @@ -389,14 +389,16 @@ class StateStore(SQLBaseStore): if next_group: group_tree.append(next_group) + # sqlite lets us mix aggregate & non-aggregate columns in + # a grouped select - for details, see: + # http://marc.info/?l=sqlite-users&m=141460824410980&w=2 + # As a result, we can avoid a self-join to populate + # event_id, which empirically speeds things up by 25x. sql = (""" - SELECT type, state_key, event_id FROM state_groups_state - INNER JOIN ( - SELECT type, state_key, max(state_group) as state_group - FROM state_groups_state - WHERE state_group IN (%s) %s - GROUP BY type, state_key - ) USING (type, state_key, state_group); + SELECT type, state_key, event_id, max(state_group) as state_group + FROM state_groups_state + WHERE state_group IN (%s) %s + GROUP BY type, state_key; """) % (",".join("?" for _ in group_tree), where_clause,) args = list(group_tree) |