path: root/synapse/storage/search.py
author     Erik Johnston <erik@matrix.org>  2019-10-21 12:56:42 +0100
committer  Erik Johnston <erik@matrix.org>  2019-10-21 16:05:06 +0100
commit     c66a06ac6b69b0a03f5c6284ded980399e9df94e (patch)
tree       01dfd3b9098a9ace759403744d122c18efbd97ff /synapse/storage/search.py
parent     Merge branch 'master' into develop (diff)
download   synapse-c66a06ac6b69b0a03f5c6284ded980399e9df94e.tar.xz
Move storage classes into a main "data store".
This is in preparation for having multiple data stores that offer
different functionality, e.g. splitting out state or event storage.
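For context, the "main data store" referred to above is assembled by combining the per-feature storage classes (such as the SearchStore in this file) into a single class. A minimal illustrative sketch of that shape, using simplified, hypothetical class names rather than the actual Synapse code:

    # Illustrative sketch only: per-feature storage classes are combined
    # into one main "data store" via multiple inheritance, so each
    # feature's queries live in their own class but share one database.
    class SearchStore:
        def search_msgs(self, room_ids, search_term, keys):
            raise NotImplementedError

    class StateStore:
        def get_current_state_ids(self, room_id):
            raise NotImplementedError

    class DataStore(SearchStore, StateStore):
        """The main data store, aggregating the per-feature stores."""
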
Diffstat (limited to 'synapse/storage/search.py')
-rw-r--r--  synapse/storage/search.py  713
1 file changed, 0 insertions, 713 deletions
diff --git a/synapse/storage/search.py b/synapse/storage/search.py
deleted file mode 100644
index 7695bf09fc..0000000000
--- a/synapse/storage/search.py
+++ /dev/null
@@ -1,713 +0,0 @@
-# -*- coding: utf-8 -*-
-# Copyright 2015, 2016 OpenMarket Ltd
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import logging
-import re
-from collections import namedtuple
-
-from six import string_types
-
-from canonicaljson import json
-
-from twisted.internet import defer
-
-from synapse.api.errors import SynapseError
-from synapse.storage._base import make_in_list_sql_clause
-from synapse.storage.engines import PostgresEngine, Sqlite3Engine
-
-from .background_updates import BackgroundUpdateStore
-
-logger = logging.getLogger(__name__)
-
-SearchEntry = namedtuple(
-    "SearchEntry",
-    ["key", "value", "event_id", "room_id", "stream_ordering", "origin_server_ts"],
-)
-
-
-class SearchBackgroundUpdateStore(BackgroundUpdateStore):
-
-    EVENT_SEARCH_UPDATE_NAME = "event_search"
-    EVENT_SEARCH_ORDER_UPDATE_NAME = "event_search_order"
-    EVENT_SEARCH_USE_GIST_POSTGRES_NAME = "event_search_postgres_gist"
-    EVENT_SEARCH_USE_GIN_POSTGRES_NAME = "event_search_postgres_gin"
-
-    def __init__(self, db_conn, hs):
-        super(SearchBackgroundUpdateStore, self).__init__(db_conn, hs)
-
-        if not hs.config.enable_search:
-            return
-
-        self.register_background_update_handler(
-            self.EVENT_SEARCH_UPDATE_NAME, self._background_reindex_search
-        )
-        self.register_background_update_handler(
-            self.EVENT_SEARCH_ORDER_UPDATE_NAME, self._background_reindex_search_order
-        )
-
-        # we used to have a background update to turn the GIN index into a
-        # GIST one; we no longer do that (obviously) because we actually want
-        # a GIN index. However, it's possible that some people might still have
-        # the background update queued, so we register a handler to clear the
-        # background update.
-        self.register_noop_background_update(self.EVENT_SEARCH_USE_GIST_POSTGRES_NAME)
-
-        self.register_background_update_handler(
-            self.EVENT_SEARCH_USE_GIN_POSTGRES_NAME, self._background_reindex_gin_search
-        )
-
-    @defer.inlineCallbacks
-    def _background_reindex_search(self, progress, batch_size):
-        # we work through the events table from highest stream id to lowest
-        target_min_stream_id = progress["target_min_stream_id_inclusive"]
-        max_stream_id = progress["max_stream_id_exclusive"]
-        rows_inserted = progress.get("rows_inserted", 0)
-
-        TYPES = ["m.room.name", "m.room.message", "m.room.topic"]
-
-        def reindex_search_txn(txn):
-            sql = (
-                "SELECT stream_ordering, event_id, room_id, type, json, "
-                " origin_server_ts FROM events"
-                " JOIN event_json USING (room_id, event_id)"
-                " WHERE ? <= stream_ordering AND stream_ordering < ?"
-                " AND (%s)"
-                " ORDER BY stream_ordering DESC"
-                " LIMIT ?"
-            ) % (" OR ".join("type = '%s'" % (t,) for t in TYPES),)
-
-            txn.execute(sql, (target_min_stream_id, max_stream_id, batch_size))
-
-            # we could stream straight from the results into
-            # store_search_entries_txn with a generator function, but that
-            # would mean having two cursors open on the database at once.
-            # Instead we just build a list of results.
-            rows = self.cursor_to_dict(txn)
-            if not rows:
-                return 0
-
-            min_stream_id = rows[-1]["stream_ordering"]
-
-            event_search_rows = []
-            for row in rows:
-                try:
-                    event_id = row["event_id"]
-                    room_id = row["room_id"]
-                    etype = row["type"]
-                    stream_ordering = row["stream_ordering"]
-                    origin_server_ts = row["origin_server_ts"]
-                    try:
-                        event_json = json.loads(row["json"])
-                        content = event_json["content"]
-                    except Exception:
-                        continue
-
-                    if etype == "m.room.message":
-                        key = "content.body"
-                        value = content["body"]
-                    elif etype == "m.room.topic":
-                        key = "content.topic"
-                        value = content["topic"]
-                    elif etype == "m.room.name":
-                        key = "content.name"
-                        value = content["name"]
-                    else:
-                        raise Exception("unexpected event type %s" % etype)
-                except (KeyError, AttributeError):
-                    # If the event is missing a necessary field then
-                    # skip over it.
-                    continue
-
-                if not isinstance(value, string_types):
-                    # If the event body, name or topic isn't a string
-                    # then skip over it
-                    continue
-
-                event_search_rows.append(
-                    SearchEntry(
-                        key=key,
-                        value=value,
-                        event_id=event_id,
-                        room_id=room_id,
-                        stream_ordering=stream_ordering,
-                        origin_server_ts=origin_server_ts,
-                    )
-                )
-
-            self.store_search_entries_txn(txn, event_search_rows)
-
-            progress = {
-                "target_min_stream_id_inclusive": target_min_stream_id,
-                "max_stream_id_exclusive": min_stream_id,
-                "rows_inserted": rows_inserted + len(event_search_rows),
-            }
-
-            self._background_update_progress_txn(
-                txn, self.EVENT_SEARCH_UPDATE_NAME, progress
-            )
-
-            return len(event_search_rows)
-
-        result = yield self.runInteraction(
-            self.EVENT_SEARCH_UPDATE_NAME, reindex_search_txn
-        )
-
-        if not result:
-            yield self._end_background_update(self.EVENT_SEARCH_UPDATE_NAME)
-
-        return result
-
-    @defer.inlineCallbacks
-    def _background_reindex_gin_search(self, progress, batch_size):
-        """This handles old Synapse installs which used GIST indexes, if any,
-        converting them back to GIN indexes, as per the actual schema.
-        """
-
-        def create_index(conn):
-            conn.rollback()
-
-            # we have to set autocommit, because postgres refuses to
-            # CREATE INDEX CONCURRENTLY without it.
-            conn.set_session(autocommit=True)
-
-            try:
-                c = conn.cursor()
-
-                # if we skipped the conversion to GIST, we may already/still
-                # have an event_search_fts_idx; unfortunately postgres 9.4
-                # doesn't support CREATE INDEX IF NOT EXISTS so we just catch the
-                # exception and ignore it.
-                import psycopg2
-
-                try:
-                    c.execute(
-                        "CREATE INDEX CONCURRENTLY event_search_fts_idx"
-                        " ON event_search USING GIN (vector)"
-                    )
-                except psycopg2.ProgrammingError as e:
-                    logger.warn(
-                        "Ignoring error %r when trying to switch from GIST to GIN", e
-                    )
-
-                # we should now be able to delete the GIST index.
-                c.execute("DROP INDEX IF EXISTS event_search_fts_idx_gist")
-            finally:
-                conn.set_session(autocommit=False)
-
-        if isinstance(self.database_engine, PostgresEngine):
-            yield self.runWithConnection(create_index)
-
-        yield self._end_background_update(self.EVENT_SEARCH_USE_GIN_POSTGRES_NAME)
-        return 1
-
-    @defer.inlineCallbacks
-    def _background_reindex_search_order(self, progress, batch_size):
-        target_min_stream_id = progress["target_min_stream_id_inclusive"]
-        max_stream_id = progress["max_stream_id_exclusive"]
-        rows_inserted = progress.get("rows_inserted", 0)
-        have_added_index = progress["have_added_indexes"]
-
-        if not have_added_index:
-
-            def create_index(conn):
-                conn.rollback()
-                conn.set_session(autocommit=True)
-                c = conn.cursor()
-
-                # We create with NULLS FIRST so that when we search *backwards*
-                # we get the ones with non null origin_server_ts *first*
-                c.execute(
-                    "CREATE INDEX CONCURRENTLY event_search_room_order ON event_search("
-                    "room_id, origin_server_ts NULLS FIRST, stream_ordering NULLS FIRST)"
-                )
-                c.execute(
-                    "CREATE INDEX CONCURRENTLY event_search_order ON event_search("
-                    "origin_server_ts NULLS FIRST, stream_ordering NULLS FIRST)"
-                )
-                conn.set_session(autocommit=False)
-
-            yield self.runWithConnection(create_index)
-
-            pg = dict(progress)
-            pg["have_added_indexes"] = True
-
-            yield self.runInteraction(
-                self.EVENT_SEARCH_ORDER_UPDATE_NAME,
-                self._background_update_progress_txn,
-                self.EVENT_SEARCH_ORDER_UPDATE_NAME,
-                pg,
-            )
-
-        def reindex_search_txn(txn):
-            sql = (
-                "UPDATE event_search AS es SET stream_ordering = e.stream_ordering,"
-                " origin_server_ts = e.origin_server_ts"
-                " FROM events AS e"
-                " WHERE e.event_id = es.event_id"
-                " AND ? <= e.stream_ordering AND e.stream_ordering < ?"
-                " RETURNING es.stream_ordering"
-            )
-
-            min_stream_id = max_stream_id - batch_size
-            txn.execute(sql, (min_stream_id, max_stream_id))
-            rows = txn.fetchall()
-
-            if min_stream_id < target_min_stream_id:
-                # We've reached the end.
-                return len(rows), False
-
-            progress = {
-                "target_min_stream_id_inclusive": target_min_stream_id,
-                "max_stream_id_exclusive": min_stream_id,
-                "rows_inserted": rows_inserted + len(rows),
-                "have_added_indexes": True,
-            }
-
-            self._background_update_progress_txn(
-                txn, self.EVENT_SEARCH_ORDER_UPDATE_NAME, progress
-            )
-
-            return len(rows), True
-
-        num_rows, finished = yield self.runInteraction(
-            self.EVENT_SEARCH_ORDER_UPDATE_NAME, reindex_search_txn
-        )
-
-        if not finished:
-            yield self._end_background_update(self.EVENT_SEARCH_ORDER_UPDATE_NAME)
-
-        return num_rows
-
-    def store_search_entries_txn(self, txn, entries):
-        """Add entries to the search table
-
-        Args:
-            txn (cursor):
-            entries (iterable[SearchEntry]):
-                entries to be added to the table
-        """
-        if not self.hs.config.enable_search:
-            return
-        if isinstance(self.database_engine, PostgresEngine):
-            sql = (
-                "INSERT INTO event_search"
-                " (event_id, room_id, key, vector, stream_ordering, origin_server_ts)"
-                " VALUES (?,?,?,to_tsvector('english', ?),?,?)"
-            )
-
-            args = (
-                (
-                    entry.event_id,
-                    entry.room_id,
-                    entry.key,
-                    entry.value,
-                    entry.stream_ordering,
-                    entry.origin_server_ts,
-                )
-                for entry in entries
-            )
-
-            txn.executemany(sql, args)
-
-        elif isinstance(self.database_engine, Sqlite3Engine):
-            sql = (
-                "INSERT INTO event_search (event_id, room_id, key, value)"
-                " VALUES (?,?,?,?)"
-            )
-            args = (
-                (entry.event_id, entry.room_id, entry.key, entry.value)
-                for entry in entries
-            )
-
-            txn.executemany(sql, args)
-        else:
-            # This should be unreachable.
-            raise Exception("Unrecognized database engine")
-
-
-class SearchStore(SearchBackgroundUpdateStore):
-    def __init__(self, db_conn, hs):
-        super(SearchStore, self).__init__(db_conn, hs)
-
-    def store_event_search_txn(self, txn, event, key, value):
-        """Add event to the search table
-
-        Args:
-            txn (cursor):
-            event (EventBase):
-            key (str):
-            value (str):
-        """
-        self.store_search_entries_txn(
-            txn,
-            (
-                SearchEntry(
-                    key=key,
-                    value=value,
-                    event_id=event.event_id,
-                    room_id=event.room_id,
-                    stream_ordering=event.internal_metadata.stream_ordering,
-                    origin_server_ts=event.origin_server_ts,
-                ),
-            ),
-        )
-
-    @defer.inlineCallbacks
-    def search_msgs(self, room_ids, search_term, keys):
-        """Performs a full text search over events with given keys.
-
-        Args:
-            room_ids (list): List of room ids to search in
-            search_term (str): Search term to search for
-            keys (list): List of keys to search in, currently supports
-                "content.body", "content.name", "content.topic"
-
-        Returns:
-            list of dicts
-        """
-        clauses = []
-
-        search_query = _parse_query(self.database_engine, search_term)
-
-        args = []
-
-        # Make sure we don't explode because the person is in too many rooms.
-        # We filter the results below regardless.
-        if len(room_ids) < 500:
-            clause, args = make_in_list_sql_clause(
-                self.database_engine, "room_id", room_ids
-            )
-            clauses = [clause]
-
-        local_clauses = []
-        for key in keys:
-            local_clauses.append("key = ?")
-            args.append(key)
-
-        clauses.append("(%s)" % (" OR ".join(local_clauses),))
-
-        count_args = args
-        count_clauses = clauses
-
-        if isinstance(self.database_engine, PostgresEngine):
-            sql = (
-                "SELECT ts_rank_cd(vector, to_tsquery('english', ?)) AS rank,"
-                " room_id, event_id"
-                " FROM event_search"
-                " WHERE vector @@ to_tsquery('english', ?)"
-            )
-            args = [search_query, search_query] + args
-
-            count_sql = (
-                "SELECT room_id, count(*) as count FROM event_search"
-                " WHERE vector @@ to_tsquery('english', ?)"
-            )
-            count_args = [search_query] + count_args
-        elif isinstance(self.database_engine, Sqlite3Engine):
-            sql = (
-                "SELECT rank(matchinfo(event_search)) as rank, room_id, event_id"
-                " FROM event_search"
-                " WHERE value MATCH ?"
-            )
-            args = [search_query] + args
-
-            count_sql = (
-                "SELECT room_id, count(*) as count FROM event_search"
-                " WHERE value MATCH ?"
-            )
-            count_args = [search_term] + count_args
-        else:
-            # This should be unreachable.
-            raise Exception("Unrecognized database engine")
-
-        for clause in clauses:
-            sql += " AND " + clause
-
-        for clause in count_clauses:
-            count_sql += " AND " + clause
-
-        # We add an arbitrary limit here to ensure we don't try to pull the
-        # entire table from the database.
-        sql += " ORDER BY rank DESC LIMIT 500"
-
-        results = yield self._execute("search_msgs", self.cursor_to_dict, sql, *args)
-
-        results = list(filter(lambda row: row["room_id"] in room_ids, results))
-
-        events = yield self.get_events_as_list([r["event_id"] for r in results])
-
-        event_map = {ev.event_id: ev for ev in events}
-
-        highlights = None
-        if isinstance(self.database_engine, PostgresEngine):
-            highlights = yield self._find_highlights_in_postgres(search_query, events)
-
-        count_sql += " GROUP BY room_id"
-
-        count_results = yield self._execute(
-            "search_rooms_count", self.cursor_to_dict, count_sql, *count_args
-        )
-
-        count = sum(row["count"] for row in count_results if row["room_id"] in room_ids)
-
-        return {
-            "results": [
-                {"event": event_map[r["event_id"]], "rank": r["rank"]}
-                for r in results
-                if r["event_id"] in event_map
-            ],
-            "highlights": highlights,
-            "count": count,
-        }
-
-    @defer.inlineCallbacks
-    def search_rooms(self, room_ids, search_term, keys, limit, pagination_token=None):
-        """Performs a full text search over events with given keys.
-
-        Args:
-            room_ids (list): List of room ids to search in
-            search_term (str): Search term to search for
-            keys (list): List of keys to search in, currently supports
-                "content.body", "content.name", "content.topic"
-            pagination_token (str): A pagination token previously returned
-
-        Returns:
-            list of dicts
-        """
-        clauses = []
-
-        search_query = _parse_query(self.database_engine, search_term)
-
-        args = []
-
-        # Make sure we don't explode because the person is in too many rooms.
-        # We filter the results below regardless.
-        if len(room_ids) < 500:
-            clause, args = make_in_list_sql_clause(
-                self.database_engine, "room_id", room_ids
-            )
-            clauses = [clause]
-
-        local_clauses = []
-        for key in keys:
-            local_clauses.append("key = ?")
-            args.append(key)
-
-        clauses.append("(%s)" % (" OR ".join(local_clauses),))
-
-        # take copies of the current args and clauses lists, before adding
-        # pagination clauses to the main query.
-        count_args = list(args)
-        count_clauses = list(clauses)
-
-        if pagination_token:
-            try:
-                origin_server_ts, stream = pagination_token.split(",")
-                origin_server_ts = int(origin_server_ts)
-                stream = int(stream)
-            except Exception:
-                raise SynapseError(400, "Invalid pagination token")
-
-            clauses.append(
-                "(origin_server_ts < ?"
-                " OR (origin_server_ts = ? AND stream_ordering < ?))"
-            )
-            args.extend([origin_server_ts, origin_server_ts, stream])
-
-        if isinstance(self.database_engine, PostgresEngine):
-            sql = (
-                "SELECT ts_rank_cd(vector, to_tsquery('english', ?)) as rank,"
-                " origin_server_ts, stream_ordering, room_id, event_id"
-                " FROM event_search"
-                " WHERE vector @@ to_tsquery('english', ?) AND "
-            )
-            args = [search_query, search_query] + args
-
-            count_sql = (
-                "SELECT room_id, count(*) as count FROM event_search"
-                " WHERE vector @@ to_tsquery('english', ?) AND "
-            )
-            count_args = [search_query] + count_args
-        elif isinstance(self.database_engine, Sqlite3Engine):
-            # We use CROSS JOIN here to ensure we use the right indexes.
-            # https://sqlite.org/optoverview.html#crossjoin
-            #
-            # We want to use the full text search index on event_search to
-            # extract all possible matches first, then lookup those matches
-            # in the events table to get the topological ordering. We need
-            # to use the indexes in this order because sqlite refuses to
-            # MATCH unless it uses the full text search index
-            sql = (
-                "SELECT rank(matchinfo) as rank, room_id, event_id,"
-                " origin_server_ts, stream_ordering"
-                " FROM (SELECT key, event_id, matchinfo(event_search) as matchinfo"
-                " FROM event_search"
-                " WHERE value MATCH ?"
-                " )"
-                " CROSS JOIN events USING (event_id)"
-                " WHERE "
-            )
-            args = [search_query] + args
-
-            count_sql = (
-                "SELECT room_id, count(*) as count FROM event_search"
-                " WHERE value MATCH ? AND "
-            )
-            count_args = [search_term] + count_args
-        else:
-            # This should be unreachable.
-            raise Exception("Unrecognized database engine")
-
-        sql += " AND ".join(clauses)
-        count_sql += " AND ".join(count_clauses)
-
-        # We add an arbitrary limit here to ensure we don't try to pull the
-        # entire table from the database.
-        if isinstance(self.database_engine, PostgresEngine):
-            sql += (
-                " ORDER BY origin_server_ts DESC NULLS LAST,"
-                " stream_ordering DESC NULLS LAST LIMIT ?"
-            )
-        elif isinstance(self.database_engine, Sqlite3Engine):
-            sql += " ORDER BY origin_server_ts DESC, stream_ordering DESC LIMIT ?"
-        else:
-            raise Exception("Unrecognized database engine")
-
-        args.append(limit)
-
-        results = yield self._execute("search_rooms", self.cursor_to_dict, sql, *args)
-
-        results = list(filter(lambda row: row["room_id"] in room_ids, results))
-
-        events = yield self.get_events_as_list([r["event_id"] for r in results])
-
-        event_map = {ev.event_id: ev for ev in events}
-
-        highlights = None
-        if isinstance(self.database_engine, PostgresEngine):
-            highlights = yield self._find_highlights_in_postgres(search_query, events)
-
-        count_sql += " GROUP BY room_id"
-
-        count_results = yield self._execute(
-            "search_rooms_count", self.cursor_to_dict, count_sql, *count_args
-        )
-
-        count = sum(row["count"] for row in count_results if row["room_id"] in room_ids)
-
-        return {
-            "results": [
-                {
-                    "event": event_map[r["event_id"]],
-                    "rank": r["rank"],
-                    "pagination_token": "%s,%s"
-                    % (r["origin_server_ts"], r["stream_ordering"]),
-                }
-                for r in results
-                if r["event_id"] in event_map
-            ],
-            "highlights": highlights,
-            "count": count,
-        }
-
-    def _find_highlights_in_postgres(self, search_query, events):
-        """Given a list of events and a search term, return a list of words
-        that match from the content of the event.
-
-        This is used to give a list of words that clients can match against to
-        highlight the matching parts.
-
-        Args:
-            search_query (str)
-            events (list): A list of events
-
-        Returns:
-            deferred : A set of strings.
-        """
-
-        def f(txn):
-            highlight_words = set()
-            for event in events:
-                # As a hack we simply join values of all possible keys. This is
-                # fine since we're only using them to find possible highlights.
-                values = []
-                for key in ("body", "name", "topic"):
-                    v = event.content.get(key, None)
-                    if v:
-                        values.append(v)
-
-                if not values:
-                    continue
-
-                value = " ".join(values)
-
-                # We need to find some values for StartSel and StopSel that
-                # aren't in the value so that we can pick results out.
-                start_sel = "<"
-                stop_sel = ">"
-
-                while start_sel in value:
-                    start_sel += "<"
-                while stop_sel in value:
-                    stop_sel += ">"
-
-                query = "SELECT ts_headline(?, to_tsquery('english', ?), %s)" % (
-                    _to_postgres_options(
-                        {
-                            "StartSel": start_sel,
-                            "StopSel": stop_sel,
-                            "MaxFragments": "50",
-                        }
-                    )
-                )
-                txn.execute(query, (value, search_query))
-                headline, = txn.fetchall()[0]
-
-                # Now we need to pick the possible highlights out of the headline
-                # result.
-                matcher_regex = "%s(.*?)%s" % (
-                    re.escape(start_sel),
-                    re.escape(stop_sel),
-                )
-
-                res = re.findall(matcher_regex, headline)
-                highlight_words.update([r.lower() for r in res])
-
-            return highlight_words
-
-        return self.runInteraction("_find_highlights", f)
-
-
-def _to_postgres_options(options_dict):
-    return "'%s'" % (",".join("%s=%s" % (k, v) for k, v in options_dict.items()),)
-
-
-def _parse_query(database_engine, search_term):
-    """Takes a plain unicode string from the user and converts it into a form
-    that can be passed to the database.
-    We use this so that we can add prefix matching, which isn't something
-    that is supported by default.
-    """
-
-    # Pull out the individual words, discarding any non-word characters.
-    results = re.findall(r"([\w\-]+)", search_term, re.UNICODE)
-
-    if isinstance(database_engine, PostgresEngine):
-        return " & ".join(result + ":*" for result in results)
-    elif isinstance(database_engine, Sqlite3Engine):
-        return " & ".join(result + "*" for result in results)
-    else:
-        # This should be unreachable.
-        raise Exception("Unrecognized database engine")
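
For reference, a rough worked example of the query rewriting that _parse_query performs, derived by hand from the code above rather than from running Synapse (the helper names below are illustrative, not part of the module):

    # Standalone sketch of _parse_query's behaviour: split the user's
    # search term into words and add prefix-matching markers
    # (":*" for the Postgres tsquery form, "*" for the SQLite form).
    import re

    def to_postgres_tsquery(search_term):
        words = re.findall(r"([\w\-]+)", search_term, re.UNICODE)
        return " & ".join(word + ":*" for word in words)

    def to_sqlite_match(search_term):
        words = re.findall(r"([\w\-]+)", search_term, re.UNICODE)
        return " & ".join(word + "*" for word in words)

    assert to_postgres_tsquery("hello world") == "hello:* & world:*"
    assert to_sqlite_match("hello world") == "hello* & world*"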