From 05c326d445fdd04ef06cb9a2a02cc0d6dde9935b Mon Sep 17 00:00:00 2001 From: Erik Johnston Date: Wed, 4 Nov 2015 17:57:44 +0000 Subject: Implement order and group by --- synapse/storage/search.py | 96 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 96 insertions(+) (limited to 'synapse/storage/search.py') diff --git a/synapse/storage/search.py b/synapse/storage/search.py index cdf003502f..e37e56c1f2 100644 --- a/synapse/storage/search.py +++ b/synapse/storage/search.py @@ -20,6 +20,12 @@ from synapse.storage.engines import PostgresEngine, Sqlite3Engine from collections import namedtuple +import logging + + +logger = logging.getLogger(__name__) + + """The result of a search. Fields: @@ -109,3 +115,93 @@ class SearchStore(SQLBaseStore): event_map, None )) + + @defer.inlineCallbacks + def search_room(self, room_id, search_term, keys, limit, pagination_token=None): + """Performs a full text search over events with given keys. + + Args: + room_id (str): The room_id to search in + search_term (str): Search term to search for + keys (list): List of keys to search in, currently supports + "content.body", "content.name", "content.topic" + pagination_token (str): A pagination token previously returned + + Returns: + SearchResult + """ + clauses = [] + args = [search_term, room_id] + + local_clauses = [] + for key in keys: + local_clauses.append("key = ?") + args.append(key) + + clauses.append( + "(%s)" % (" OR ".join(local_clauses),) + ) + + if pagination_token: + topo, stream = pagination_token.split(",") + clauses.append( + "(topological_ordering < ?" + " OR (topological_ordering = ? AND stream_ordering < ?))" + ) + args.extend([topo, topo, stream]) + + if isinstance(self.database_engine, PostgresEngine): + sql = ( + "SELECT ts_rank_cd(vector, query) as rank," + " topological_ordering, stream_ordering, room_id, event_id" + " FROM plainto_tsquery('english', ?) as query, event_search" + " NATURAL JOIN events" + " WHERE vector @@ query AND room_id = ?" + ) + elif isinstance(self.database_engine, Sqlite3Engine): + sql = ( + "SELECT rank(matchinfo(event_search)) as rank, room_id, event_id" + " topological_ordering, stream_ordering" + " FROM event_search" + " NATURAL JOIN events" + " WHERE value MATCH ? AND room_id = ?" + ) + else: + # This should be unreachable. + raise Exception("Unrecognized database engine") + + for clause in clauses: + sql += " AND " + clause + + # We add an arbitrary limit here to ensure we don't try to pull the + # entire table from the database. + sql += " ORDER BY topological_ordering DESC, stream_ordering DESC LIMIT ?" + + args.append(limit) + + results = yield self._execute( + "search_rooms", self.cursor_to_dict, sql, *args + ) + + events = yield self._get_events([r["event_id"] for r in results]) + + event_map = { + ev.event_id: ev + for ev in events + } + + pagination_token = None + if results: + topo = results[-1]["topological_ordering"] + stream = results[-1]["stream_ordering"] + pagination_token = "%s,%s" % (topo, stream) + + defer.returnValue(SearchResult( + { + r["event_id"]: r["rank"] + for r in results + if r["event_id"] in event_map + }, + event_map, + pagination_token + )) -- cgit 1.5.1 From 7301e05122e07f6513916e8a35bf05581de6521d Mon Sep 17 00:00:00 2001 From: Erik Johnston Date: Thu, 5 Nov 2015 14:34:37 +0000 Subject: Implement basic pagination for search results --- synapse/handlers/search.py | 78 +++++++++++++++++++++++++++++++++++------- synapse/rest/client/v1/room.py | 3 +- synapse/storage/search.py | 55 ++++++++++------------------- 3 files changed, 86 insertions(+), 50 deletions(-) (limited to 'synapse/storage/search.py') diff --git a/synapse/handlers/search.py b/synapse/handlers/search.py index 28f5300dc9..696780f34e 100644 --- a/synapse/handlers/search.py +++ b/synapse/handlers/search.py @@ -22,6 +22,8 @@ from synapse.api.filtering import Filter from synapse.api.errors import SynapseError from synapse.events.utils import serialize_event +from unpaddedbase64 import decode_base64, encode_base64 + import logging @@ -34,17 +36,32 @@ class SearchHandler(BaseHandler): super(SearchHandler, self).__init__(hs) @defer.inlineCallbacks - def search(self, user, content): + def search(self, user, content, batch=None): """Performs a full text search for a user. Args: user (UserID) content (dict): Search parameters + batch (str): The next_batch parameter. Used for pagination. Returns: dict to be returned to the client with results of search """ + batch_group = None + batch_group_key = None + batch_token = None + if batch: + try: + b = decode_base64(batch) + batch_group, batch_group_key, batch_token = b.split("\n") + + assert batch_group is not None + assert batch_group_key is not None + assert batch_token is not None + except: + raise SynapseError(400, "Invalid batch") + try: room_cat = content["search_categories"]["room_events"] search_term = room_cat["search_term"] @@ -91,17 +108,25 @@ class SearchHandler(BaseHandler): room_ids = search_filter.filter_rooms(room_ids) + if batch_group == "room_id": + room_ids = room_ids & {batch_group_key} + rank_map = {} allowed_events = [] room_groups = {} sender_group = {} + global_next_batch = None if order_by == "rank": - rank_map, event_map, _ = yield self.store.search_msgs( + results = yield self.store.search_msgs( room_ids, search_term, keys ) - filtered_events = search_filter.filter(event_map.values()) + results_map = {r["event"].event_id: r for r in results} + + rank_map.update({r["event"].event_id: r["rank"] for r in results}) + + filtered_events = search_filter.filter([r["event"] for r in results]) events = yield self._filter_events_for_client( user.to_string(), filtered_events @@ -126,18 +151,26 @@ class SearchHandler(BaseHandler): elif order_by == "recent": for room_id in room_ids: room_events = [] - pagination_token = None + if batch_group == "room_id" and batch_group_key == room_id: + pagination_token = batch_token + else: + pagination_token = None i = 0 while len(room_events) < search_filter.limit() and i < 5: i += 5 - r_map, event_map, pagination_token = yield self.store.search_room( + results = yield self.store.search_room( room_id, search_term, keys, search_filter.limit() * 2, pagination_token=pagination_token, ) - rank_map.update(r_map) - filtered_events = search_filter.filter(event_map.values()) + results_map = {r["event"].event_id: r for r in results} + + rank_map.update({r["event"].event_id: r["rank"] for r in results}) + + filtered_events = search_filter.filter([ + r["event"] for r in results + ]) events = yield self._filter_events_for_client( user.to_string(), filtered_events @@ -146,13 +179,26 @@ class SearchHandler(BaseHandler): room_events.extend(events) room_events = room_events[:search_filter.limit()] - if len(event_map) < search_filter.limit() * 2: + if len(results) < search_filter.limit() * 2: + pagination_token = None break + else: + pagination_token = results[-1]["pagination_token"] + + if room_events: + res = results_map[room_events[-1].event_id] + pagination_token = res["pagination_token"] if room_events: group = room_groups.setdefault(room_id, {}) if pagination_token: - group["next_batch"] = pagination_token + next_batch = encode_base64("%s\n%s\n%s" % ( + "room_id", room_id, pagination_token + )) + group["next_batch"] = next_batch + + if batch_token: + global_next_batch = next_batch group["results"] = [e.event_id for e in room_events] group["order"] = max( @@ -164,11 +210,14 @@ class SearchHandler(BaseHandler): # Normalize the group ranks if room_groups: - mx = max(g["order"] for g in room_groups.values()) - mn = min(g["order"] for g in room_groups.values()) + if len(room_groups) > 1: + mx = max(g["order"] for g in room_groups.values()) + mn = min(g["order"] for g in room_groups.values()) - for g in room_groups.values(): - g["order"] = (g["order"] - mn) * 1.0 / (mx - mn) + for g in room_groups.values(): + g["order"] = (g["order"] - mn) * 1.0 / (mx - mn) + else: + room_groups.values()[0]["order"] = 1 else: # We should never get here due to the guard earlier. @@ -239,6 +288,9 @@ class SearchHandler(BaseHandler): if sender_group and "sender" in group_keys: rooms_cat_res.setdefault("groups", {})["sender"] = sender_group + if global_next_batch: + rooms_cat_res["next_batch"] = global_next_batch + defer.returnValue({ "search_categories": { "room_events": rooms_cat_res diff --git a/synapse/rest/client/v1/room.py b/synapse/rest/client/v1/room.py index 2dcaee86cd..8e28f12d29 100644 --- a/synapse/rest/client/v1/room.py +++ b/synapse/rest/client/v1/room.py @@ -601,7 +601,8 @@ class SearchRestServlet(ClientV1RestServlet): content = _parse_json(request) - results = yield self.handlers.search_handler.search(auth_user, content) + batch = request.args.get("next_batch", [None])[0] + results = yield self.handlers.search_handler.search(auth_user, content, batch) defer.returnValue((200, results)) diff --git a/synapse/storage/search.py b/synapse/storage/search.py index e37e56c1f2..7342e7bae6 100644 --- a/synapse/storage/search.py +++ b/synapse/storage/search.py @@ -18,24 +18,12 @@ from twisted.internet import defer from _base import SQLBaseStore from synapse.storage.engines import PostgresEngine, Sqlite3Engine -from collections import namedtuple - import logging logger = logging.getLogger(__name__) -"""The result of a search. - -Fields: - rank_map (dict): Mapping event_id -> rank - event_map (dict): Mapping event_id -> event - pagination_token (str): Pagination token -""" -SearchResult = namedtuple("SearchResult", ("rank_map", "event_map", "pagination_token")) - - class SearchStore(SQLBaseStore): @defer.inlineCallbacks def search_msgs(self, room_ids, search_term, keys): @@ -48,7 +36,7 @@ class SearchStore(SQLBaseStore): "content.body", "content.name", "content.topic" Returns: - SearchResult + list of dicts """ clauses = [] args = [] @@ -106,15 +94,14 @@ class SearchStore(SQLBaseStore): for ev in events } - defer.returnValue(SearchResult( + defer.returnValue([ { - r["event_id"]: r["rank"] - for r in results - if r["event_id"] in event_map - }, - event_map, - None - )) + "event": event_map[r["event_id"]], + "rank": r["rank"], + } + for r in results + if r["event_id"] in event_map + ]) @defer.inlineCallbacks def search_room(self, room_id, search_term, keys, limit, pagination_token=None): @@ -128,7 +115,7 @@ class SearchStore(SQLBaseStore): pagination_token (str): A pagination token previously returned Returns: - SearchResult + list of dicts """ clauses = [] args = [search_term, room_id] @@ -190,18 +177,14 @@ class SearchStore(SQLBaseStore): for ev in events } - pagination_token = None - if results: - topo = results[-1]["topological_ordering"] - stream = results[-1]["stream_ordering"] - pagination_token = "%s,%s" % (topo, stream) - - defer.returnValue(SearchResult( + defer.returnValue([ { - r["event_id"]: r["rank"] - for r in results - if r["event_id"] in event_map - }, - event_map, - pagination_token - )) + "event": event_map[r["event_id"]], + "rank": r["rank"], + "pagination_token": "%s,%s" % ( + r["topological_ordering"], r["stream_ordering"] + ), + } + for r in results + if r["event_id"] in event_map + ]) -- cgit 1.5.1 From 3640ddfbf6641a79b486d3847ae739c974c62c7a Mon Sep 17 00:00:00 2001 From: Erik Johnston Date: Thu, 5 Nov 2015 16:10:54 +0000 Subject: Error handling --- synapse/storage/search.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'synapse/storage/search.py') diff --git a/synapse/storage/search.py b/synapse/storage/search.py index 7342e7bae6..3cea2011fa 100644 --- a/synapse/storage/search.py +++ b/synapse/storage/search.py @@ -16,6 +16,7 @@ from twisted.internet import defer from _base import SQLBaseStore +from synapse.api.errors import SynapseError from synapse.storage.engines import PostgresEngine, Sqlite3Engine import logging @@ -130,7 +131,13 @@ class SearchStore(SQLBaseStore): ) if pagination_token: - topo, stream = pagination_token.split(",") + try: + topo, stream = pagination_token.split(",") + topo = int(topo) + stream = int(stream) + except: + raise SynapseError(400, "Invalid pagination token") + clauses.append( "(topological_ordering < ?" " OR (topological_ordering = ? AND stream_ordering < ?))" -- cgit 1.5.1 From dd40fb68e455b9a736f1871a19119eb0391cd0d5 Mon Sep 17 00:00:00 2001 From: Matthew Hodgson Date: Sun, 8 Nov 2015 16:04:37 +0000 Subject: fix comedy important missing comma breaking recent-ordered FTS on sqlite --- synapse/storage/search.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'synapse/storage/search.py') diff --git a/synapse/storage/search.py b/synapse/storage/search.py index 3cea2011fa..91a96d2a83 100644 --- a/synapse/storage/search.py +++ b/synapse/storage/search.py @@ -154,7 +154,7 @@ class SearchStore(SQLBaseStore): ) elif isinstance(self.database_engine, Sqlite3Engine): sql = ( - "SELECT rank(matchinfo(event_search)) as rank, room_id, event_id" + "SELECT rank(matchinfo(event_search)) as rank, room_id, event_id," " topological_ordering, stream_ordering" " FROM event_search" " NATURAL JOIN events" -- cgit 1.5.1 From 2ede7aa8a1da20b735797cc9230da7bbeb55efc3 Mon Sep 17 00:00:00 2001 From: Mark Haines Date: Mon, 9 Nov 2015 19:29:32 +0000 Subject: Add background update task for reindexing event search --- synapse/storage/search.py | 98 ++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 96 insertions(+), 2 deletions(-) (limited to 'synapse/storage/search.py') diff --git a/synapse/storage/search.py b/synapse/storage/search.py index 3cea2011fa..f7c269865d 100644 --- a/synapse/storage/search.py +++ b/synapse/storage/search.py @@ -15,7 +15,7 @@ from twisted.internet import defer -from _base import SQLBaseStore +from .background_updates import BackgroundUpdateStore from synapse.api.errors import SynapseError from synapse.storage.engines import PostgresEngine, Sqlite3Engine @@ -25,7 +25,101 @@ import logging logger = logging.getLogger(__name__) -class SearchStore(SQLBaseStore): +class SearchStore(BackgroundUpdateStore): + + EVENT_SEARCH_UPDATE_NAME = "event_search" + + + @defer.inlineCallbacks + def _background_reindex_search(self, progress, batch_size): + target_min_stream_id = progress["target_min_stream_id"] + max_stream_id = progress["max_stream_id"] + rows_inserted = progress.get("rows_inserted", 0) + + INSERT_CLUMP_SIZE = 1000 + TYPES = ["m.room.name", "m.room.message", "m.room.topic"] + + def reindex_search_txn(txn): + sql = ( + "SELECT stream_id, event_id FROM events" + " WHERE ? <= stream_ordering AND stream_ordering < ?" + " AND (%s)" + " ORDER BY stream_ordering DESC" + " LIMIT ?" + ) % (" OR ".join("type = '%s'" % TYPES),) + + txn.execute(sql, target_min_stream_id, max_stream_id, batch_size) + + rows = txn.fetch_all() + if not rows: + return None + + min_stream_id = rows[-1][0] + event_ids = [row[1] for row in rows] + + events = self._get_events_txn(txn, event_ids) + + event_search_rows = [] + for event in events: + try: + event_id = event.event_id + room_id = event.room_id + content = event.content + if event.type == "m.room.message": + key = "content.body" + value = content["body"] + elif event.type == "m.room.topic": + key = "content.topic" + value = content["topic"] + elif event.type == "m.room.name": + key = "content.name" + value = content["name"] + except Exception: + # If the event is missing a necessary field then + # skip over it. + continue + + event_search_rows.append((event_id, room_id, key, value)) + + if isinstance(self.database_engine, PostgresEngine): + sql = ( + "INSERT INTO event_search (event_id, room_id, key, vector)" + " VALUES (?,?,?,to_tsvector('english', ?))" + ) + elif isinstance(self.database_engine, Sqlite3Engine): + sql = ( + "INSERT INTO event_search (event_id, room_id, key, value)" + " VALUES (?,?,?,?)" + ) + else: + # This should be unreachable. + raise Exception("Unrecognized database engine") + + for index in range(0, len(event_search_rows), INSERT_CLUMP_SIZE): + clump = event_search_rows[index : index + INSERT_CLUMP_SIZE) + txn.execute_many(sql, clump) + + progress = { + "target_max_stream_id": target_min_stream_id, + "max_stream_id": min_stream_id, + "rows_inserted": rows_inserted + len(event_search_rows) + } + + self._background_update_progress_txn( + txn, self.EVENT_SEARCH_UPDATE_NAME, progress + ) + + return len(event_search_rows) + + result = yield self.runInteration( + self.EVENT_SEARCH_UPDATE_NAME, reindex_search_txn + ) + + if result is None: + yield _end_background_update(self.EVENT_SEARCH_UPDATE_NAME) + + defer.returnValue(result) + @defer.inlineCallbacks def search_msgs(self, room_ids, search_term, keys): """Performs a full text search over events with given keys. -- cgit 1.5.1 From a412b9a465cb6a64f44e3656d8645977f43c275f Mon Sep 17 00:00:00 2001 From: Mark Haines Date: Tue, 10 Nov 2015 15:50:58 +0000 Subject: Run the background updates when starting synapse. --- synapse/app/homeserver.py | 1 + synapse/storage/background_updates.py | 57 ++++++++++++++++++++++++++++++----- synapse/storage/search.py | 11 +++++-- synapse/util/__init__.py | 8 +++++ 4 files changed, 67 insertions(+), 10 deletions(-) (limited to 'synapse/storage/search.py') diff --git a/synapse/app/homeserver.py b/synapse/app/homeserver.py index a77535a4ee..cd7a52ec07 100755 --- a/synapse/app/homeserver.py +++ b/synapse/app/homeserver.py @@ -439,6 +439,7 @@ def setup(config_options): hs.get_pusherpool().start() hs.get_state_handler().start_caching() hs.get_datastore().start_profiling() + hs.get_datastore().start_doing_background_updates() hs.get_replication_layer().start_get_pdu_cache() return hs diff --git a/synapse/storage/background_updates.py b/synapse/storage/background_updates.py index 32a233c213..b6cdc6ec68 100644 --- a/synapse/storage/background_updates.py +++ b/synapse/storage/background_updates.py @@ -43,7 +43,7 @@ class BackgroundUpdatePerformance(object): self.avg_item_count += 0.1 * (item_count - self.avg_item_count) self.avg_duration_ms += 0.1 * (duration_ms - self.avg_duration_ms) - def average_duration_ms_per_item(self): + def average_items_per_ms(self): """An estimate of how long it takes to do a single update. Returns: A duration in ms as a float @@ -53,7 +53,17 @@ class BackgroundUpdatePerformance(object): else: # Use the exponential moving average so that we can adapt to # changes in how long the update process takes. - return float(self.avg_duration_ms) / float(self.avg_item_count) + return float(self.avg_item_count) / float(self.avg_duration_ms) + + def total_items_per_ms(self): + """An estimate of how long it takes to do a single update. + Returns: + A duration in ms as a float + """ + if self.total_item_count == 0: + return None + else: + return float(self.total_item_count) / float(self.total_duration_ms) class BackgroundUpdateStore(SQLBaseStore): @@ -65,12 +75,41 @@ class BackgroundUpdateStore(SQLBaseStore): MINIMUM_BACKGROUND_BATCH_SIZE = 100 DEFAULT_BACKGROUND_BATCH_SIZE = 100 + BACKGROUND_UPDATE_INTERVAL_MS = 1000 + BACKGROUND_UPDATE_DURATION_MS = 100 def __init__(self, hs): super(BackgroundUpdateStore, self).__init__(hs) self._background_update_performance = {} self._background_update_queue = [] self._background_update_handlers = {} + self._background_update_timer = None + + @defer.inlineCallbacks + def start_doing_background_updates(self): + while True: + if self._background_update_timer is not None: + return + + sleep = defer.Deferred() + self._background_update_timer = self._clock.call_later( + self.BACKGROUND_UPDATE_INTERVAL_MS / 1000., sleep.callback + ) + try: + yield sleep + finally: + self._background_update_timer = None + + result = yield self.do_background_update( + self.BACKGROUND_UPDATE_DURATION_MS + ) + + if result is None: + logger.info( + "No more background updates to do." + " Unscheduling background update task." + ) + return @defer.inlineCallbacks def do_background_update(self, desired_duration_ms): @@ -106,10 +145,10 @@ class BackgroundUpdateStore(SQLBaseStore): performance = BackgroundUpdatePerformance(update_name) self._background_update_performance[update_name] = performance - duration_ms_per_item = performance.average_duration_ms_per_item() + items_per_ms = performance.average_items_per_ms() - if duration_ms_per_item is not None: - batch_size = int(desired_duration_ms / duration_ms_per_item) + if items_per_ms is not None: + batch_size = int(desired_duration_ms * items_per_ms) # Clamp the batch size so that we always make progress batch_size = max(batch_size, self.MINIMUM_BACKGROUND_BATCH_SIZE) else: @@ -130,8 +169,12 @@ class BackgroundUpdateStore(SQLBaseStore): duration_ms = time_stop - time_start logger.info( - "Updating %. Updated %r items in %rms", - update_name, items_updated, duration_ms + "Updating %. Updated %r items in %rms." + " (total_rate=%r/ms, current_rate=%r/ms, total_updated=%r)", + update_name, items_updated, duration_ms, + performance.total_items_per_ms(), + performance.average_items_per_ms(), + performance.total_item_count, ) performance.update(items_updated, duration_ms) diff --git a/synapse/storage/search.py b/synapse/storage/search.py index f7c269865d..d170c546b5 100644 --- a/synapse/storage/search.py +++ b/synapse/storage/search.py @@ -29,6 +29,11 @@ class SearchStore(BackgroundUpdateStore): EVENT_SEARCH_UPDATE_NAME = "event_search" + def __init__(self, hs): + super(SearchStore, self).__init__(hs) + self.register_background_update_handler( + self.EVENT_SEARCH_UPDATE_NAME, self._background_reindex_search + ) @defer.inlineCallbacks def _background_reindex_search(self, progress, batch_size): @@ -74,7 +79,7 @@ class SearchStore(BackgroundUpdateStore): elif event.type == "m.room.name": key = "content.name" value = content["name"] - except Exception: + except (KeyError, AttributeError): # If the event is missing a necessary field then # skip over it. continue @@ -96,7 +101,7 @@ class SearchStore(BackgroundUpdateStore): raise Exception("Unrecognized database engine") for index in range(0, len(event_search_rows), INSERT_CLUMP_SIZE): - clump = event_search_rows[index : index + INSERT_CLUMP_SIZE) + clump = event_search_rows[index:index + INSERT_CLUMP_SIZE] txn.execute_many(sql, clump) progress = { @@ -116,7 +121,7 @@ class SearchStore(BackgroundUpdateStore): ) if result is None: - yield _end_background_update(self.EVENT_SEARCH_UPDATE_NAME) + yield self._end_background_update(self.EVENT_SEARCH_UPDATE_NAME) defer.returnValue(result) diff --git a/synapse/util/__init__.py b/synapse/util/__init__.py index 1d123ccefc..d69c7cb991 100644 --- a/synapse/util/__init__.py +++ b/synapse/util/__init__.py @@ -53,6 +53,14 @@ class Clock(object): loop.stop() def call_later(self, delay, callback, *args, **kwargs): + """Call something later + + Args: + delay(float): How long to wait in seconds. + callback(function): Function to call + *args: Postional arguments to pass to function. + **kwargs: Key arguments to pass to function. + """ current_context = LoggingContext.current_context() def wrapped_callback(*args, **kwargs): -- cgit 1.5.1 From 90b503216c40974dc4925a9bbb673779a039143c Mon Sep 17 00:00:00 2001 From: Mark Haines Date: Tue, 10 Nov 2015 16:20:13 +0000 Subject: Use a background task to update databases to use the full text search --- synapse/storage/schema/delta/25/fts.py | 100 ++++++++------------------------- synapse/storage/search.py | 8 +-- 2 files changed, 28 insertions(+), 80 deletions(-) (limited to 'synapse/storage/search.py') diff --git a/synapse/storage/schema/delta/25/fts.py b/synapse/storage/schema/delta/25/fts.py index b7cd0ce3b8..e408d36da0 100644 --- a/synapse/storage/schema/delta/25/fts.py +++ b/synapse/storage/schema/delta/25/fts.py @@ -22,7 +22,7 @@ import ujson logger = logging.getLogger(__name__) -POSTGRES_SQL = """ +POSTGRES_TABLE = """ CREATE TABLE IF NOT EXISTS event_search ( event_id TEXT, room_id TEXT, @@ -31,22 +31,6 @@ CREATE TABLE IF NOT EXISTS event_search ( vector tsvector ); -INSERT INTO event_search SELECT - event_id, room_id, json::json->>'sender', 'content.body', - to_tsvector('english', json::json->'content'->>'body') - FROM events NATURAL JOIN event_json WHERE type = 'm.room.message'; - -INSERT INTO event_search SELECT - event_id, room_id, json::json->>'sender', 'content.name', - to_tsvector('english', json::json->'content'->>'name') - FROM events NATURAL JOIN event_json WHERE type = 'm.room.name'; - -INSERT INTO event_search SELECT - event_id, room_id, json::json->>'sender', 'content.topic', - to_tsvector('english', json::json->'content'->>'topic') - FROM events NATURAL JOIN event_json WHERE type = 'm.room.topic'; - - CREATE INDEX event_search_fts_idx ON event_search USING gin(vector); CREATE INDEX event_search_ev_idx ON event_search(event_id); CREATE INDEX event_search_ev_ridx ON event_search(room_id); @@ -61,67 +45,31 @@ SQLITE_TABLE = ( def run_upgrade(cur, database_engine, *args, **kwargs): if isinstance(database_engine, PostgresEngine): - run_postgres_upgrade(cur) + for statement in get_statements(POSTGRES_TABLE.splitlines()): + cur.execute(statement) return if isinstance(database_engine, Sqlite3Engine): - run_sqlite_upgrade(cur) - return - - -def run_postgres_upgrade(cur): - for statement in get_statements(POSTGRES_SQL.splitlines()): - cur.execute(statement) - - -def run_sqlite_upgrade(cur): cur.execute(SQLITE_TABLE) + return - rowid = -1 - while True: - cur.execute( - "SELECT rowid, json FROM event_json" - " WHERE rowid > ?" - " ORDER BY rowid ASC LIMIT 100", - (rowid,) - ) - - res = cur.fetchall() - - if not res: - break - - events = [ - ujson.loads(js) - for _, js in res - ] - - rowid = max(rid for rid, _ in res) - - rows = [] - for ev in events: - content = ev.get("content", {}) - body = content.get("body", None) - name = content.get("name", None) - topic = content.get("topic", None) - sender = ev.get("sender", None) - if ev["type"] == "m.room.message" and body: - rows.append(( - ev["event_id"], ev["room_id"], sender, "content.body", body - )) - if ev["type"] == "m.room.name" and name: - rows.append(( - ev["event_id"], ev["room_id"], sender, "content.name", name - )) - if ev["type"] == "m.room.topic" and topic: - rows.append(( - ev["event_id"], ev["room_id"], sender, "content.topic", topic - )) - - if rows: - logger.info(rows) - cur.executemany( - "INSERT INTO event_search (event_id, room_id, sender, key, value)" - " VALUES (?,?,?,?,?)", - rows - ) + cur.execute("SELECT MIN(stream_ordering) FROM events") + rows = cur.fetchall() + min_stream_id = rows[0][0] + + cur.execute("SELECT MAX(stream_ordering) FROM events") + rows = cur.fetchall() + max_stream_id = rows[0][0] + + if min_stream_id is not None and max_stream_id is not None: + progress = { + "target_min_stream_id_inclusive": min_stream_id, + "max_stream_id_exclusive": max_stream_id + 1, + "rows_inserted": 0, + } + progress_json = ujson.dumps(progress) + + cur.execute( + "INSERT into background_updates (update_name, progress_json)" + " VALUES (?, ?)", ("event_search", progress_json) + ) diff --git a/synapse/storage/search.py b/synapse/storage/search.py index d170c546b5..3b19b118eb 100644 --- a/synapse/storage/search.py +++ b/synapse/storage/search.py @@ -37,8 +37,8 @@ class SearchStore(BackgroundUpdateStore): @defer.inlineCallbacks def _background_reindex_search(self, progress, batch_size): - target_min_stream_id = progress["target_min_stream_id"] - max_stream_id = progress["max_stream_id"] + target_min_stream_id = progress["target_min_stream_id_inclusive"] + max_stream_id = progress["max_stream_id_exclusive"] rows_inserted = progress.get("rows_inserted", 0) INSERT_CLUMP_SIZE = 1000 @@ -105,8 +105,8 @@ class SearchStore(BackgroundUpdateStore): txn.execute_many(sql, clump) progress = { - "target_max_stream_id": target_min_stream_id, - "max_stream_id": min_stream_id, + "target_min_stream_id_inclusive": target_min_stream_id, + "max_stream_id_exclusive": min_stream_id, "rows_inserted": rows_inserted + len(event_search_rows) } -- cgit 1.5.1 From 940a16119205115c02a3b23e9f3c67a08486bae0 Mon Sep 17 00:00:00 2001 From: Mark Haines Date: Wed, 11 Nov 2015 13:59:40 +0000 Subject: Fix the background update --- synapse/storage/background_updates.py | 13 ++++++++----- synapse/storage/schema/delta/25/fts.py | 7 +++---- synapse/storage/search.py | 16 ++++++++-------- 3 files changed, 19 insertions(+), 17 deletions(-) (limited to 'synapse/storage/search.py') diff --git a/synapse/storage/background_updates.py b/synapse/storage/background_updates.py index b6cdc6ec68..45fccc2e5e 100644 --- a/synapse/storage/background_updates.py +++ b/synapse/storage/background_updates.py @@ -93,16 +93,19 @@ class BackgroundUpdateStore(SQLBaseStore): sleep = defer.Deferred() self._background_update_timer = self._clock.call_later( - self.BACKGROUND_UPDATE_INTERVAL_MS / 1000., sleep.callback + self.BACKGROUND_UPDATE_INTERVAL_MS / 1000., sleep.callback, None ) try: yield sleep finally: self._background_update_timer = None - result = yield self.do_background_update( - self.BACKGROUND_UPDATE_DURATION_MS - ) + try: + result = yield self.do_background_update( + self.BACKGROUND_UPDATE_DURATION_MS + ) + except: + logger.exception("Error doing update") if result is None: logger.info( @@ -169,7 +172,7 @@ class BackgroundUpdateStore(SQLBaseStore): duration_ms = time_stop - time_start logger.info( - "Updating %. Updated %r items in %rms." + "Updating %r. Updated %r items in %rms." " (total_rate=%r/ms, current_rate=%r/ms, total_updated=%r)", update_name, items_updated, duration_ms, performance.total_items_per_ms(), diff --git a/synapse/storage/schema/delta/25/fts.py b/synapse/storage/schema/delta/25/fts.py index e408d36da0..ce92f59025 100644 --- a/synapse/storage/schema/delta/25/fts.py +++ b/synapse/storage/schema/delta/25/fts.py @@ -47,11 +47,10 @@ def run_upgrade(cur, database_engine, *args, **kwargs): if isinstance(database_engine, PostgresEngine): for statement in get_statements(POSTGRES_TABLE.splitlines()): cur.execute(statement) - return - - if isinstance(database_engine, Sqlite3Engine): + elif isinstance(database_engine, Sqlite3Engine): cur.execute(SQLITE_TABLE) - return + else: + raise Exception("Unrecognized database engine") cur.execute("SELECT MIN(stream_ordering) FROM events") rows = cur.fetchall() diff --git a/synapse/storage/search.py b/synapse/storage/search.py index 3b19b118eb..3c0d671129 100644 --- a/synapse/storage/search.py +++ b/synapse/storage/search.py @@ -46,18 +46,18 @@ class SearchStore(BackgroundUpdateStore): def reindex_search_txn(txn): sql = ( - "SELECT stream_id, event_id FROM events" + "SELECT stream_ordering, event_id FROM events" " WHERE ? <= stream_ordering AND stream_ordering < ?" " AND (%s)" " ORDER BY stream_ordering DESC" " LIMIT ?" - ) % (" OR ".join("type = '%s'" % TYPES),) + ) % (" OR ".join("type = '%s'" % (t,) for t in TYPES),) - txn.execute(sql, target_min_stream_id, max_stream_id, batch_size) + txn.execute(sql, (target_min_stream_id, max_stream_id, batch_size)) - rows = txn.fetch_all() + rows = txn.fetchall() if not rows: - return None + return 0 min_stream_id = rows[-1][0] event_ids = [row[1] for row in rows] @@ -102,7 +102,7 @@ class SearchStore(BackgroundUpdateStore): for index in range(0, len(event_search_rows), INSERT_CLUMP_SIZE): clump = event_search_rows[index:index + INSERT_CLUMP_SIZE] - txn.execute_many(sql, clump) + txn.executemany(sql, clump) progress = { "target_min_stream_id_inclusive": target_min_stream_id, @@ -116,11 +116,11 @@ class SearchStore(BackgroundUpdateStore): return len(event_search_rows) - result = yield self.runInteration( + result = yield self.runInteraction( self.EVENT_SEARCH_UPDATE_NAME, reindex_search_txn ) - if result is None: + if not result: yield self._end_background_update(self.EVENT_SEARCH_UPDATE_NAME) defer.returnValue(result) -- cgit 1.5.1 From 14a9d805b959b5d2b26fea0d57794cb3ceb28958 Mon Sep 17 00:00:00 2001 From: Erik Johnston Date: Thu, 12 Nov 2015 14:07:25 +0000 Subject: Use a (hopefully) more efficient SQL query for doing recency based room search --- synapse/storage/search.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'synapse/storage/search.py') diff --git a/synapse/storage/search.py b/synapse/storage/search.py index 2e88c51ad0..eac265b543 100644 --- a/synapse/storage/search.py +++ b/synapse/storage/search.py @@ -253,11 +253,13 @@ class SearchStore(BackgroundUpdateStore): ) elif isinstance(self.database_engine, Sqlite3Engine): sql = ( - "SELECT rank(matchinfo(event_search)) as rank, room_id, event_id," + "SELECT rank(matchinfo) as rank, room_id, event_id," " topological_ordering, stream_ordering" - " FROM event_search" - " NATURAL JOIN events" - " WHERE value MATCH ? AND room_id = ?" + " FROM (SELECT event_id, matchinfo(event_search) FROM event_search" + " WHERE value MATCH" + " )" + " CROSS JOIN events USING (event_id)" + " WHERE room_id = ?" ) else: # This should be unreachable. -- cgit 1.5.1 From 320408ef47eb373c2b8edad7ab3d08c3fa0cb459 Mon Sep 17 00:00:00 2001 From: Erik Johnston Date: Thu, 12 Nov 2015 15:09:45 +0000 Subject: Fix SQL syntax --- synapse/storage/search.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'synapse/storage/search.py') diff --git a/synapse/storage/search.py b/synapse/storage/search.py index eac265b543..e1911e2480 100644 --- a/synapse/storage/search.py +++ b/synapse/storage/search.py @@ -255,8 +255,9 @@ class SearchStore(BackgroundUpdateStore): sql = ( "SELECT rank(matchinfo) as rank, room_id, event_id," " topological_ordering, stream_ordering" - " FROM (SELECT event_id, matchinfo(event_search) FROM event_search" - " WHERE value MATCH" + " FROM (SELECT key, event_id, matchinfo(event_search) as matchinfo" + " FROM event_search" + " WHERE value MATCH ?" " )" " CROSS JOIN events USING (event_id)" " WHERE room_id = ?" -- cgit 1.5.1 From 764e79d0514a0cce9dc456df0355108d40dbeda8 Mon Sep 17 00:00:00 2001 From: Erik Johnston Date: Thu, 12 Nov 2015 15:19:56 +0000 Subject: Comment --- synapse/storage/search.py | 2 ++ 1 file changed, 2 insertions(+) (limited to 'synapse/storage/search.py') diff --git a/synapse/storage/search.py b/synapse/storage/search.py index e1911e2480..0b00ddb9db 100644 --- a/synapse/storage/search.py +++ b/synapse/storage/search.py @@ -252,6 +252,8 @@ class SearchStore(BackgroundUpdateStore): " WHERE vector @@ query AND room_id = ?" ) elif isinstance(self.database_engine, Sqlite3Engine): + # We use CROSS JOIN here to ensure we use the right indexes. + # https://sqlite.org/optoverview.html#crossjoin sql = ( "SELECT rank(matchinfo) as rank, room_id, event_id," " topological_ordering, stream_ordering" -- cgit 1.5.1 From 8fd8e72cec8df94403441664d8eecc25fa8d363f Mon Sep 17 00:00:00 2001 From: Erik Johnston Date: Thu, 12 Nov 2015 15:33:47 +0000 Subject: Expand comment --- synapse/storage/search.py | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'synapse/storage/search.py') diff --git a/synapse/storage/search.py b/synapse/storage/search.py index 0b00ddb9db..dcc5ac65a3 100644 --- a/synapse/storage/search.py +++ b/synapse/storage/search.py @@ -254,6 +254,12 @@ class SearchStore(BackgroundUpdateStore): elif isinstance(self.database_engine, Sqlite3Engine): # We use CROSS JOIN here to ensure we use the right indexes. # https://sqlite.org/optoverview.html#crossjoin + # + # We want to use the full text search index on event_search to + # extract all possible matches first, then lookup those matches + # in the events table to get the topological ordering. We need + # to use the indexes in this order because sqlite refuses to + # MATCH unless it uses the full text search index sql = ( "SELECT rank(matchinfo) as rank, room_id, event_id," " topological_ordering, stream_ordering" -- cgit 1.5.1 From 3de46c7755ac5e061fc13fb916c13b841b65f2b5 Mon Sep 17 00:00:00 2001 From: Erik Johnston Date: Thu, 12 Nov 2015 15:36:43 +0000 Subject: Trailing whitespace --- synapse/storage/search.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'synapse/storage/search.py') diff --git a/synapse/storage/search.py b/synapse/storage/search.py index dcc5ac65a3..380270b009 100644 --- a/synapse/storage/search.py +++ b/synapse/storage/search.py @@ -258,7 +258,7 @@ class SearchStore(BackgroundUpdateStore): # We want to use the full text search index on event_search to # extract all possible matches first, then lookup those matches # in the events table to get the topological ordering. We need - # to use the indexes in this order because sqlite refuses to + # to use the indexes in this order because sqlite refuses to # MATCH unless it uses the full text search index sql = ( "SELECT rank(matchinfo) as rank, room_id, event_id," -- cgit 1.5.1