summary refs log tree commit diff
diff options
context:
space:
mode:
authorMatthew Hodgson <matthew@matrix.org>2016-09-26 01:18:41 +0100
committerMatthew Hodgson <matthew@matrix.org>2016-09-26 01:18:41 +0100
commitcc4944c83e25c3fd622d0d553a6d3fa585f18a47 (patch)
tree96d09e4af8f9c6de90a29e190d373b87bbb87dda
parenttypo (diff)
downloadsynapse-github/matthew/speed-up-dedup.tar.xz
speed up state deduplication migrations on sqlite github/matthew/speed-up-dedup matthew/speed-up-dedup
Remove the self-inner-join and instead abuse sqlite's quirk that it
allows us to mix aggregate & non-aggregate columns in a grouped
select to return the event_ids of the events in a state group:
see http://marc.info/?l=sqlite-users&m=141460824410980&w=2.

This speeds up these queries by around 25x, drastically improving
performance when sqlite is upgraded to 0.18.
-rw-r--r--synapse/storage/state.py16
1 files changed, 9 insertions, 7 deletions
diff --git a/synapse/storage/state.py b/synapse/storage/state.py
index 7eb342674c..e1b9ceb776 100644
--- a/synapse/storage/state.py
+++ b/synapse/storage/state.py
@@ -389,14 +389,16 @@ class StateStore(SQLBaseStore):
                     if next_group:
                         group_tree.append(next_group)
 
+                # sqlite lets us mix aggregate & non-aggregate columns in
+                # a grouped select - for details, see:
+                # http://marc.info/?l=sqlite-users&m=141460824410980&w=2
+                # As a result, we can avoid a self-join to populate
+                # event_id, which empirically speeds things up by 25x.
                 sql = ("""
-                    SELECT type, state_key, event_id FROM state_groups_state
-                    INNER JOIN (
-                        SELECT type, state_key, max(state_group) as state_group
-                        FROM state_groups_state
-                        WHERE state_group IN (%s) %s
-                        GROUP BY type, state_key
-                    ) USING (type, state_key, state_group);
+                    SELECT type, state_key, event_id, max(state_group) as state_group
+                    FROM state_groups_state
+                    WHERE state_group IN (%s) %s
+                    GROUP BY type, state_key;
                 """) % (",".join("?" for _ in group_tree), where_clause,)
 
                 args = list(group_tree)