Merge pull request #6340 from matrix-org/babolivier/pagination_query

Fix the SQL SELECT query in _paginate_room_events_txn
author: Brendan Abolivier <babolivier@matrix.org> 2019-11-08 11:12:24 +0000
committer: GitHub <noreply@github.com> 2019-11-08 11:12:24 +0000
commit: 963ffb60b95bd5c3e08e7f031db2554706bef87c (patch)
tree: 07df6a134f8305b1ec6462f1d27ab41cd4a51bf0 /synapse/storage/data_stores
parent: Merge pull request #6295 from matrix-org/erikj/split_purge_history (diff)
parent: Incorporate review (diff)
download: synapse-963ffb60b95bd5c3e08e7f031db2554706bef87c.tar.xz
1 files changed, 32 insertions, 8 deletions
diff --git a/synapse/storage/data_stores/main/stream.py b/synapse/storage/data_stores/main/stream.py
index 616ef91d4e..8780fdd989 100644
--- a/synapse/storage/data_stores/main/stream.py
+++ b/synapse/storage/data_stores/main/stream.py
@@ -871,14 +871,38 @@ class StreamWorkerStore(EventsWorkerStore, SQLBaseStore):
 
         args.append(int(limit))
 
-        sql = (
-            "SELECT DISTINCT event_id, topological_ordering, stream_ordering"
-            " FROM events"
-            " LEFT JOIN event_labels USING (event_id, room_id, topological_ordering)"
-            " WHERE outlier = ? AND room_id = ? AND %(bounds)s"
-            " ORDER BY topological_ordering %(order)s,"
-            " stream_ordering %(order)s LIMIT ?"
-        ) % {"bounds": bounds, "order": order}
+        select_keywords = "SELECT"
+        join_clause = ""
+        if event_filter and event_filter.labels:
+            # If we're not filtering on a label, then joining on event_labels will
+            # return as many rows for a single event as the number of labels it has. To
+            # avoid this, only join if we're filtering on at least one label.
+            join_clause = """
+                LEFT JOIN event_labels
+                USING (event_id, room_id, topological_ordering)
+            """
+            if len(event_filter.labels) > 1:
+                # Using DISTINCT in this SELECT query is quite expensive, because it
+                # requires the engine to sort on the entire (not limited) result set,
+                # i.e. the entire events table. We only need to use it when we're
+                # filtering on more than one label, because that's the only scenario
+                # in which we can possibly get the same event ID multiple times in
+                # the results.
+                select_keywords += "DISTINCT"
+
+        sql = """
+            %(select_keywords)s event_id, topological_ordering, stream_ordering
+            FROM events
+            %(join_clause)s
+            WHERE outlier = ? AND room_id = ? AND %(bounds)s
+            ORDER BY topological_ordering %(order)s,
+            stream_ordering %(order)s LIMIT ?
+        """ % {
+            "select_keywords": select_keywords,
+            "join_clause": join_clause,
+            "bounds": bounds,
+            "order": order,
+        }
 
         txn.execute(sql, args)
author	Brendan Abolivier <babolivier@matrix.org>	2019-11-08 11:12:24 +0000
committer	GitHub <noreply@github.com>	2019-11-08 11:12:24 +0000
commit	963ffb60b95bd5c3e08e7f031db2554706bef87c (patch)
tree	07df6a134f8305b1ec6462f1d27ab41cd4a51bf0 /synapse/storage/data_stores
parent	Merge pull request #6295 from matrix-org/erikj/split_purge_history (diff)
parent	Incorporate review (diff)
download	synapse-963ffb60b95bd5c3e08e7f031db2554706bef87c.tar.xz