diff options
author | Erik Johnston <erik@matrix.org> | 2016-04-22 11:23:49 +0100 |
---|---|---|
committer | Erik Johnston <erik@matrix.org> | 2016-04-22 11:23:49 +0100 |
commit | 64ec3493c1203747ab1b77a1f5b477a5f897613d (patch) | |
tree | a57b369c4b857ed1ef353ba7aedfcfcb431b9ec9 /synapse/storage | |
parent | Merge pull request #687 from nikriek/jwt-fix (diff) | |
parent | Update port script (diff) | |
download | synapse-64ec3493c1203747ab1b77a1f5b477a5f897613d.tar.xz |
Merge pull request #745 from matrix-org/erikj/search-index
Optimise event_search in postgres
Diffstat (limited to 'synapse/storage')
-rw-r--r-- | synapse/storage/background_updates.py | 3 | ||||
-rw-r--r-- | synapse/storage/room.py | 16 | ||||
-rw-r--r-- | synapse/storage/schema/delta/31/search_update.py | 65 | ||||
-rw-r--r-- | synapse/storage/search.py | 92 |
4 files changed, 169 insertions, 7 deletions
diff --git a/synapse/storage/background_updates.py b/synapse/storage/background_updates.py index 49904046cf..66a995157d 100644 --- a/synapse/storage/background_updates.py +++ b/synapse/storage/background_updates.py @@ -173,11 +173,12 @@ class BackgroundUpdateStore(SQLBaseStore): logger.info( "Updating %r. Updated %r items in %rms." - " (total_rate=%r/ms, current_rate=%r/ms, total_updated=%r)", + " (total_rate=%r/ms, current_rate=%r/ms, total_updated=%r, batch_size=%r)", update_name, items_updated, duration_ms, performance.total_items_per_ms(), performance.average_items_per_ms(), performance.total_item_count, + batch_size, ) performance.update(items_updated, duration_ms) diff --git a/synapse/storage/room.py b/synapse/storage/room.py index 9be977f387..70aa64fb31 100644 --- a/synapse/storage/room.py +++ b/synapse/storage/room.py @@ -169,20 +169,28 @@ class RoomStore(SQLBaseStore): def _store_event_search_txn(self, txn, event, key, value): if isinstance(self.database_engine, PostgresEngine): sql = ( - "INSERT INTO event_search (event_id, room_id, key, vector)" - " VALUES (?,?,?,to_tsvector('english', ?))" + "INSERT INTO event_search" + " (event_id, room_id, key, vector, stream_ordering, origin_server_ts)" + " VALUES (?,?,?,to_tsvector('english', ?),?,?)" + ) + txn.execute( + sql, + ( + event.event_id, event.room_id, key, value, + event.internal_metadata.stream_ordering, + event.origin_server_ts, + ) ) elif isinstance(self.database_engine, Sqlite3Engine): sql = ( "INSERT INTO event_search (event_id, room_id, key, value)" " VALUES (?,?,?,?)" ) + txn.execute(sql, (event.event_id, event.room_id, key, value,)) else: # This should be unreachable. raise Exception("Unrecognized database engine") - txn.execute(sql, (event.event_id, event.room_id, key, value,)) - @cachedInlineCallbacks() def get_room_name_and_aliases(self, room_id): def f(txn): diff --git a/synapse/storage/schema/delta/31/search_update.py b/synapse/storage/schema/delta/31/search_update.py new file mode 100644 index 0000000000..470ae0c005 --- /dev/null +++ b/synapse/storage/schema/delta/31/search_update.py @@ -0,0 +1,65 @@ +# Copyright 2016 OpenMarket Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from synapse.storage.engines import PostgresEngine +from synapse.storage.prepare_database import get_statements + +import logging +import ujson + +logger = logging.getLogger(__name__) + + +ALTER_TABLE = """ +ALTER TABLE event_search ADD COLUMN origin_server_ts BIGINT; +ALTER TABLE event_search ADD COLUMN stream_ordering BIGINT; +""" + + +def run_create(cur, database_engine, *args, **kwargs): + if not isinstance(database_engine, PostgresEngine): + return + + for statement in get_statements(ALTER_TABLE.splitlines()): + cur.execute(statement) + + cur.execute("SELECT MIN(stream_ordering) FROM events") + rows = cur.fetchall() + min_stream_id = rows[0][0] + + cur.execute("SELECT MAX(stream_ordering) FROM events") + rows = cur.fetchall() + max_stream_id = rows[0][0] + + if min_stream_id is not None and max_stream_id is not None: + progress = { + "target_min_stream_id_inclusive": min_stream_id, + "max_stream_id_exclusive": max_stream_id + 1, + "rows_inserted": 0, + "have_added_indexes": False, + } + progress_json = ujson.dumps(progress) + + sql = ( + "INSERT into background_updates (update_name, progress_json)" + " VALUES (?, ?)" + ) + + sql = database_engine.convert_param_style(sql) + + cur.execute(sql, ("event_search_order", progress_json)) + + +def run_upgrade(cur, database_engine, *args, **kwargs): + pass diff --git a/synapse/storage/search.py b/synapse/storage/search.py index 59ac7f424c..0224299625 100644 --- a/synapse/storage/search.py +++ b/synapse/storage/search.py @@ -29,12 +29,17 @@ logger = logging.getLogger(__name__) class SearchStore(BackgroundUpdateStore): EVENT_SEARCH_UPDATE_NAME = "event_search" + EVENT_SEARCH_ORDER_UPDATE_NAME = "event_search_order" def __init__(self, hs): super(SearchStore, self).__init__(hs) self.register_background_update_handler( self.EVENT_SEARCH_UPDATE_NAME, self._background_reindex_search ) + self.register_background_update_handler( + self.EVENT_SEARCH_ORDER_UPDATE_NAME, + self._background_reindex_search_order + ) @defer.inlineCallbacks def _background_reindex_search(self, progress, batch_size): @@ -132,6 +137,82 @@ class SearchStore(BackgroundUpdateStore): defer.returnValue(result) @defer.inlineCallbacks + def _background_reindex_search_order(self, progress, batch_size): + target_min_stream_id = progress["target_min_stream_id_inclusive"] + max_stream_id = progress["max_stream_id_exclusive"] + rows_inserted = progress.get("rows_inserted", 0) + have_added_index = progress['have_added_indexes'] + + if not have_added_index: + def create_index(conn): + conn.rollback() + conn.set_session(autocommit=True) + c = conn.cursor() + + # We create with NULLS FIRST so that when we search *backwards* + # we get the ones with non null origin_server_ts *first* + c.execute( + "CREATE INDEX CONCURRENTLY event_search_room_order ON event_search(" + "room_id, origin_server_ts NULLS FIRST, stream_ordering NULLS FIRST)" + ) + c.execute( + "CREATE INDEX CONCURRENTLY event_search_order ON event_search(" + "origin_server_ts NULLS FIRST, stream_ordering NULLS FIRST)" + ) + conn.set_session(autocommit=False) + + yield self.runWithConnection(create_index) + + pg = dict(progress) + pg["have_added_indexes"] = True + + yield self.runInteraction( + self.EVENT_SEARCH_ORDER_UPDATE_NAME, + self._background_update_progress_txn, + self.EVENT_SEARCH_ORDER_UPDATE_NAME, pg, + ) + + def reindex_search_txn(txn): + sql = ( + "UPDATE event_search AS es SET stream_ordering = e.stream_ordering," + " origin_server_ts = e.origin_server_ts" + " FROM events AS e" + " WHERE e.event_id = es.event_id" + " AND ? <= e.stream_ordering AND e.stream_ordering < ?" + " RETURNING es.stream_ordering" + ) + + min_stream_id = max_stream_id - batch_size + txn.execute(sql, (min_stream_id, max_stream_id)) + rows = txn.fetchall() + + if min_stream_id < target_min_stream_id: + # We've recached the end. + return len(rows), False + + progress = { + "target_min_stream_id_inclusive": target_min_stream_id, + "max_stream_id_exclusive": min_stream_id, + "rows_inserted": rows_inserted + len(rows), + "have_added_indexes": True, + } + + self._background_update_progress_txn( + txn, self.EVENT_SEARCH_ORDER_UPDATE_NAME, progress + ) + + return len(rows), True + + num_rows, finished = yield self.runInteraction( + self.EVENT_SEARCH_ORDER_UPDATE_NAME, reindex_search_txn + ) + + if not finished: + yield self._end_background_update(self.EVENT_SEARCH_ORDER_UPDATE_NAME) + + defer.returnValue(num_rows) + + @defer.inlineCallbacks def search_msgs(self, room_ids, search_term, keys): """Performs a full text search over events with given keys. @@ -310,7 +391,6 @@ class SearchStore(BackgroundUpdateStore): "SELECT ts_rank_cd(vector, to_tsquery('english', ?)) as rank," " origin_server_ts, stream_ordering, room_id, event_id" " FROM event_search" - " NATURAL JOIN events" " WHERE vector @@ to_tsquery('english', ?) AND " ) args = [search_query, search_query] + args @@ -355,7 +435,15 @@ class SearchStore(BackgroundUpdateStore): # We add an arbitrary limit here to ensure we don't try to pull the # entire table from the database. - sql += " ORDER BY origin_server_ts DESC, stream_ordering DESC LIMIT ?" + if isinstance(self.database_engine, PostgresEngine): + sql += ( + " ORDER BY origin_server_ts DESC NULLS LAST," + " stream_ordering DESC NULLS LAST LIMIT ?" + ) + elif isinstance(self.database_engine, Sqlite3Engine): + sql += " ORDER BY origin_server_ts DESC, stream_ordering DESC LIMIT ?" + else: + raise Exception("Unrecognized database engine") args.append(limit) |