author     David Robertson <davidr@element.io>    2022-11-02 12:35:00 +0000
committer  David Robertson <davidr@element.io>    2022-11-02 12:35:00 +0000
commit     37307a5ccef8e21313f0631d414806ea551e6a73 (patch)
tree       b37f2cb6b61d1538bd49b7ae532fff174a18c931
parent     Merge branch 'release-v1.71' into matrix-org-hotfixes (diff)
parent     Disable auto search for prefixes in event search (diff)
Merge branch 'dmr/revert-fts-changes-on-hotfix' into matrix-org-hotfixes
-rw-r--r--  synapse/storage/databases/main/search.py                                  | 372
-rw-r--r--  synapse/storage/engines/postgres.py                                       |  16
-rw-r--r--  synapse/storage/schema/main/delta/73/10_update_sqlite_fts4_tokenizer.py  |  62
-rw-r--r--  tests/storage/test_room_search.py                                         | 221
4 files changed, 118 insertions, 553 deletions
diff --git a/synapse/storage/databases/main/search.py b/synapse/storage/databases/main/search.py
index e9588d1755..ddcda6f1a7 100644
--- a/synapse/storage/databases/main/search.py
+++ b/synapse/storage/databases/main/search.py
@@ -11,22 +11,10 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-import enum
+
 import logging
 import re
-from collections import deque
-from dataclasses import dataclass
-from typing import (
-    TYPE_CHECKING,
-    Any,
-    Collection,
-    Iterable,
-    List,
-    Optional,
-    Set,
-    Tuple,
-    Union,
-)
+from typing import TYPE_CHECKING, Any, Collection, Iterable, List, Optional, Set, Tuple
 
 import attr
 
@@ -39,7 +27,7 @@ from synapse.storage.database import (
     LoggingTransaction,
 )
 from synapse.storage.databases.main.events_worker import EventRedactBehaviour
-from synapse.storage.engines import PostgresEngine, Sqlite3Engine
+from synapse.storage.engines import BaseDatabaseEngine, PostgresEngine, Sqlite3Engine
 from synapse.types import JsonDict
 
 if TYPE_CHECKING:
@@ -80,11 +68,11 @@ class SearchWorkerStore(SQLBaseStore):
         if not self.hs.config.server.enable_search:
             return
         if isinstance(self.database_engine, PostgresEngine):
-            sql = """
-            INSERT INTO event_search
-            (event_id, room_id, key, vector, stream_ordering, origin_server_ts)
-            VALUES (?,?,?,to_tsvector('english', ?),?,?)
-            """
+            sql = (
+                "INSERT INTO event_search"
+                " (event_id, room_id, key, vector, stream_ordering, origin_server_ts)"
+                " VALUES (?,?,?,to_tsvector('english', ?),?,?)"
+            )
 
             args1 = (
                 (
@@ -101,20 +89,20 @@ class SearchWorkerStore(SQLBaseStore):
             txn.execute_batch(sql, args1)
 
         elif isinstance(self.database_engine, Sqlite3Engine):
-            self.db_pool.simple_insert_many_txn(
-                txn,
-                table="event_search",
-                keys=("event_id", "room_id", "key", "value"),
-                values=(
-                    (
-                        entry.event_id,
-                        entry.room_id,
-                        entry.key,
-                        _clean_value_for_search(entry.value),
-                    )
-                    for entry in entries
-                ),
+            sql = (
+                "INSERT INTO event_search (event_id, room_id, key, value)"
+                " VALUES (?,?,?,?)"
             )
+            args2 = (
+                (
+                    entry.event_id,
+                    entry.room_id,
+                    entry.key,
+                    _clean_value_for_search(entry.value),
+                )
+                for entry in entries
+            )
+            txn.execute_batch(sql, args2)
 
         else:
             # This should be unreachable.
@@ -162,17 +150,15 @@ class SearchBackgroundUpdateStore(SearchWorkerStore):
         TYPES = ["m.room.name", "m.room.message", "m.room.topic"]
 
         def reindex_search_txn(txn: LoggingTransaction) -> int:
-            sql = """
-            SELECT stream_ordering, event_id, room_id, type, json, origin_server_ts
-            FROM events
-            JOIN event_json USING (room_id, event_id)
-            WHERE ? <= stream_ordering AND stream_ordering < ?
-            AND (%s)
-            ORDER BY stream_ordering DESC
-            LIMIT ?
-            """ % (
-                " OR ".join("type = '%s'" % (t,) for t in TYPES),
-            )
+            sql = (
+                "SELECT stream_ordering, event_id, room_id, type, json, "
+                " origin_server_ts FROM events"
+                " JOIN event_json USING (room_id, event_id)"
+                " WHERE ? <= stream_ordering AND stream_ordering < ?"
+                " AND (%s)"
+                " ORDER BY stream_ordering DESC"
+                " LIMIT ?"
+            ) % (" OR ".join("type = '%s'" % (t,) for t in TYPES),)
 
             txn.execute(sql, (target_min_stream_id, max_stream_id, batch_size))
 
@@ -286,10 +272,8 @@ class SearchBackgroundUpdateStore(SearchWorkerStore):
 
                 try:
                     c.execute(
-                        """
-                        CREATE INDEX CONCURRENTLY event_search_fts_idx
-                        ON event_search USING GIN (vector)
-                        """
+                        "CREATE INDEX CONCURRENTLY event_search_fts_idx"
+                        " ON event_search USING GIN (vector)"
                     )
                 except psycopg2.ProgrammingError as e:
                     logger.warning(
@@ -327,16 +311,12 @@ class SearchBackgroundUpdateStore(SearchWorkerStore):
                 # We create with NULLS FIRST so that when we search *backwards*
                 # we get the ones with non null origin_server_ts *first*
                 c.execute(
-                    """
-                    CREATE INDEX CONCURRENTLY event_search_room_order
-                    ON event_search(room_id, origin_server_ts NULLS FIRST, stream_ordering NULLS FIRST)
-                    """
+                    "CREATE INDEX CONCURRENTLY event_search_room_order ON event_search("
+                    "room_id, origin_server_ts NULLS FIRST, stream_ordering NULLS FIRST)"
                 )
                 c.execute(
-                    """
-                    CREATE INDEX CONCURRENTLY event_search_order
-                    ON event_search(origin_server_ts NULLS FIRST, stream_ordering NULLS FIRST)
-                    """
+                    "CREATE INDEX CONCURRENTLY event_search_order ON event_search("
+                    "origin_server_ts NULLS FIRST, stream_ordering NULLS FIRST)"
                 )
                 conn.set_session(autocommit=False)
 
@@ -353,14 +333,14 @@ class SearchBackgroundUpdateStore(SearchWorkerStore):
             )
 
         def reindex_search_txn(txn: LoggingTransaction) -> Tuple[int, bool]:
-            sql = """
-            UPDATE event_search AS es
-            SET stream_ordering = e.stream_ordering, origin_server_ts = e.origin_server_ts
-            FROM events AS e
-            WHERE e.event_id = es.event_id
-            AND ? <= e.stream_ordering AND e.stream_ordering < ?
-            RETURNING es.stream_ordering
-            """
+            sql = (
+                "UPDATE event_search AS es SET stream_ordering = e.stream_ordering,"
+                " origin_server_ts = e.origin_server_ts"
+                " FROM events AS e"
+                " WHERE e.event_id = es.event_id"
+                " AND ? <= e.stream_ordering AND e.stream_ordering < ?"
+                " RETURNING es.stream_ordering"
+            )
 
             min_stream_id = max_stream_id - batch_size
             txn.execute(sql, (min_stream_id, max_stream_id))
@@ -441,6 +421,8 @@ class SearchStore(SearchBackgroundUpdateStore):
         """
         clauses = []
 
+        search_query = _parse_query(self.database_engine, search_term)
+
         args: List[Any] = []
 
         # Make sure we don't explode because the person is in too many rooms.
@@ -462,36 +444,32 @@ class SearchStore(SearchBackgroundUpdateStore):
         count_clauses = clauses
 
         if isinstance(self.database_engine, PostgresEngine):
-            search_query = search_term
-            tsquery_func = self.database_engine.tsquery_func
-            sql = f"""
-            SELECT ts_rank_cd(vector, {tsquery_func}('english', ?)) AS rank,
-            room_id, event_id
-            FROM event_search
-            WHERE vector @@  {tsquery_func}('english', ?)
-            """
+            sql = (
+                "SELECT ts_rank_cd(vector, to_tsquery('english', ?)) AS rank,"
+                " room_id, event_id"
+                " FROM event_search"
+                " WHERE vector @@ to_tsquery('english', ?)"
+            )
             args = [search_query, search_query] + args
 
-            count_sql = f"""
-            SELECT room_id, count(*) as count FROM event_search
-            WHERE vector @@ {tsquery_func}('english', ?)
-            """
+            count_sql = (
+                "SELECT room_id, count(*) as count FROM event_search"
+                " WHERE vector @@ to_tsquery('english', ?)"
+            )
             count_args = [search_query] + count_args
         elif isinstance(self.database_engine, Sqlite3Engine):
-            search_query = _parse_query_for_sqlite(search_term)
-
-            sql = """
-            SELECT rank(matchinfo(event_search)) as rank, room_id, event_id
-            FROM event_search
-            WHERE value MATCH ?
-            """
+            sql = (
+                "SELECT rank(matchinfo(event_search)) as rank, room_id, event_id"
+                " FROM event_search"
+                " WHERE value MATCH ?"
+            )
             args = [search_query] + args
 
-            count_sql = """
-            SELECT room_id, count(*) as count FROM event_search
-            WHERE value MATCH ?
-            """
-            count_args = [search_query] + count_args
+            count_sql = (
+                "SELECT room_id, count(*) as count FROM event_search"
+                " WHERE value MATCH ?"
+            )
+            count_args = [search_term] + count_args
         else:
             # This should be unreachable.
             raise Exception("Unrecognized database engine")
@@ -523,9 +501,7 @@ class SearchStore(SearchBackgroundUpdateStore):
 
         highlights = None
         if isinstance(self.database_engine, PostgresEngine):
-            highlights = await self._find_highlights_in_postgres(
-                search_query, events, tsquery_func
-            )
+            highlights = await self._find_highlights_in_postgres(search_query, events)
 
         count_sql += " GROUP BY room_id"
 
@@ -534,6 +510,7 @@ class SearchStore(SearchBackgroundUpdateStore):
         )
 
         count = sum(row["count"] for row in count_results if row["room_id"] in room_ids)
+
         return {
             "results": [
                 {"event": event_map[r["event_id"]], "rank": r["rank"]}
@@ -565,6 +542,9 @@ class SearchStore(SearchBackgroundUpdateStore):
             Each match as a dictionary.
         """
         clauses = []
+
+        search_query = _parse_query(self.database_engine, search_term)
+
         args: List[Any] = []
 
         # Make sure we don't explode because the person is in too many rooms.
@@ -596,30 +576,26 @@ class SearchStore(SearchBackgroundUpdateStore):
                 raise SynapseError(400, "Invalid pagination token")
 
             clauses.append(
-                """
-                (origin_server_ts < ? OR (origin_server_ts = ? AND stream_ordering < ?))
-                """
+                "(origin_server_ts < ?"
+                " OR (origin_server_ts = ? AND stream_ordering < ?))"
             )
             args.extend([origin_server_ts, origin_server_ts, stream])
 
         if isinstance(self.database_engine, PostgresEngine):
-            search_query = search_term
-            tsquery_func = self.database_engine.tsquery_func
-            sql = f"""
-            SELECT ts_rank_cd(vector, {tsquery_func}('english', ?)) as rank,
-            origin_server_ts, stream_ordering, room_id, event_id
-            FROM event_search
-            WHERE vector @@ {tsquery_func}('english', ?) AND
-            """
+            sql = (
+                "SELECT ts_rank_cd(vector, to_tsquery('english', ?)) as rank,"
+                " origin_server_ts, stream_ordering, room_id, event_id"
+                " FROM event_search"
+                " WHERE vector @@ to_tsquery('english', ?) AND "
+            )
             args = [search_query, search_query] + args
 
-            count_sql = f"""
-            SELECT room_id, count(*) as count FROM event_search
-            WHERE vector @@ {tsquery_func}('english', ?) AND
-            """
+            count_sql = (
+                "SELECT room_id, count(*) as count FROM event_search"
+                " WHERE vector @@ to_tsquery('english', ?) AND "
+            )
             count_args = [search_query] + count_args
         elif isinstance(self.database_engine, Sqlite3Engine):
-
             # We use CROSS JOIN here to ensure we use the right indexes.
             # https://sqlite.org/optoverview.html#crossjoin
             #
@@ -628,25 +604,23 @@ class SearchStore(SearchBackgroundUpdateStore):
             # in the events table to get the topological ordering. We need
             # to use the indexes in this order because sqlite refuses to
             # MATCH unless it uses the full text search index
-            sql = """
-            SELECT
-                rank(matchinfo) as rank, room_id, event_id, origin_server_ts, stream_ordering
-            FROM (
-                SELECT key, event_id, matchinfo(event_search) as matchinfo
-                FROM event_search
-                WHERE value MATCH ?
+            sql = (
+                "SELECT rank(matchinfo) as rank, room_id, event_id,"
+                " origin_server_ts, stream_ordering"
+                " FROM (SELECT key, event_id, matchinfo(event_search) as matchinfo"
+                " FROM event_search"
+                " WHERE value MATCH ?"
+                " )"
+                " CROSS JOIN events USING (event_id)"
+                " WHERE "
             )
-            CROSS JOIN events USING (event_id)
-            WHERE
-            """
-            search_query = _parse_query_for_sqlite(search_term)
             args = [search_query] + args
 
-            count_sql = """
-            SELECT room_id, count(*) as count FROM event_search
-            WHERE value MATCH ? AND
-            """
-            count_args = [search_query] + count_args
+            count_sql = (
+                "SELECT room_id, count(*) as count FROM event_search"
+                " WHERE value MATCH ? AND "
+            )
+            count_args = [search_term] + count_args
         else:
             # This should be unreachable.
             raise Exception("Unrecognized database engine")
@@ -657,10 +631,10 @@ class SearchStore(SearchBackgroundUpdateStore):
         # We add an arbitrary limit here to ensure we don't try to pull the
         # entire table from the database.
         if isinstance(self.database_engine, PostgresEngine):
-            sql += """
-            ORDER BY origin_server_ts DESC NULLS LAST, stream_ordering DESC NULLS LAST
-            LIMIT ?
-            """
+            sql += (
+                " ORDER BY origin_server_ts DESC NULLS LAST,"
+                " stream_ordering DESC NULLS LAST LIMIT ?"
+            )
         elif isinstance(self.database_engine, Sqlite3Engine):
             sql += " ORDER BY origin_server_ts DESC, stream_ordering DESC LIMIT ?"
         else:
@@ -686,9 +660,7 @@ class SearchStore(SearchBackgroundUpdateStore):
 
         highlights = None
         if isinstance(self.database_engine, PostgresEngine):
-            highlights = await self._find_highlights_in_postgres(
-                search_query, events, tsquery_func
-            )
+            highlights = await self._find_highlights_in_postgres(search_query, events)
 
         count_sql += " GROUP BY room_id"
 
@@ -714,7 +686,7 @@ class SearchStore(SearchBackgroundUpdateStore):
         }
 
     async def _find_highlights_in_postgres(
-        self, search_query: str, events: List[EventBase], tsquery_func: str
+        self, search_query: str, events: List[EventBase]
     ) -> Set[str]:
         """Given a list of events and a search term, return a list of words
         that match from the content of the event.
@@ -725,7 +697,6 @@ class SearchStore(SearchBackgroundUpdateStore):
         Args:
             search_query
             events: A list of events
-            tsquery_func: The tsquery_* function to use when making queries
 
         Returns:
             A set of strings.
@@ -758,7 +729,7 @@ class SearchStore(SearchBackgroundUpdateStore):
                 while stop_sel in value:
                     stop_sel += ">"
 
-                query = f"SELECT ts_headline(?, {tsquery_func}('english', ?), %s)" % (
+                query = "SELECT ts_headline(?, to_tsquery('english', ?), %s)" % (
                     _to_postgres_options(
                         {
                             "StartSel": start_sel,
@@ -789,127 +760,20 @@ def _to_postgres_options(options_dict: JsonDict) -> str:
     return "'%s'" % (",".join("%s=%s" % (k, v) for k, v in options_dict.items()),)
 
 
-@dataclass
-class Phrase:
-    phrase: List[str]
-
-
-class SearchToken(enum.Enum):
-    Not = enum.auto()
-    Or = enum.auto()
-    And = enum.auto()
-
-
-Token = Union[str, Phrase, SearchToken]
-TokenList = List[Token]
-
-
-def _is_stop_word(word: str) -> bool:
-    # TODO Pull these out of the dictionary:
-    #  https://github.com/postgres/postgres/blob/master/src/backend/snowball/stopwords/english.stop
-    return word in {"the", "a", "you", "me", "and", "but"}
-
-
-def _tokenize_query(query: str) -> TokenList:
-    """
-    Convert the user-supplied `query` into a TokenList, which can be translated into
-    some DB-specific syntax.
-
-    The following constructs are supported:
-
-    - phrase queries using "double quotes"
-    - case-insensitive `or` and `and` operators
-    - negation of a keyword via unary `-`
-    - unary hyphen to denote NOT e.g. 'include -exclude'
-
-    The following differs from websearch_to_tsquery:
-
-    - Stop words are not removed.
-    - Unclosed phrases are treated differently.
-
-    """
-    tokens: TokenList = []
-
-    # Find phrases.
-    in_phrase = False
-    parts = deque(query.split('"'))
-    for i, part in enumerate(parts):
-        # The contents inside double quotes is treated as a phrase.
-        in_phrase = bool(i % 2)
-
-        # Pull out the individual words, discarding any non-word characters.
-        words = deque(re.findall(r"([\w\-]+)", part, re.UNICODE))
-
-        # Phrases have simplified handling of words.
-        if in_phrase:
-            # Skip stop words.
-            phrase = [word for word in words if not _is_stop_word(word)]
-
-            # Consecutive words are implicitly ANDed together.
-            if tokens and tokens[-1] not in (SearchToken.Not, SearchToken.Or):
-                tokens.append(SearchToken.And)
-
-            # Add the phrase.
-            tokens.append(Phrase(phrase))
-            continue
-
-        # Otherwise, not in a phrase.
-        while words:
-            word = words.popleft()
-
-            if word.startswith("-"):
-                tokens.append(SearchToken.Not)
-
-                # If there's more word, put it back to be processed again.
-                word = word[1:]
-                if word:
-                    words.appendleft(word)
-            elif word.lower() == "or":
-                tokens.append(SearchToken.Or)
-            else:
-                # Skip stop words.
-                if _is_stop_word(word):
-                    continue
-
-                # Consecutive words are implicitly ANDed together.
-                if tokens and tokens[-1] not in (SearchToken.Not, SearchToken.Or):
-                    tokens.append(SearchToken.And)
-
-                # Add the search term.
-                tokens.append(word)
-
-    return tokens
-
-
-def _tokens_to_sqlite_match_query(tokens: TokenList) -> str:
-    """
-    Convert the list of tokens to a string suitable for passing to sqlite's MATCH.
-    Assume sqlite was compiled with enhanced query syntax.
-
-    Ref: https://www.sqlite.org/fts3.html#full_text_index_queries
+def _parse_query(database_engine: BaseDatabaseEngine, search_term: str) -> str:
+    """Takes a plain unicode string from the user and converts it into a form
+    that can be passed to database.
+    We use this so that we can add prefix matching, which isn't something
+    that is supported by default.
     """
-    match_query = []
-    for token in tokens:
-        if isinstance(token, str):
-            match_query.append(token)
-        elif isinstance(token, Phrase):
-            match_query.append('"' + " ".join(token.phrase) + '"')
-        elif token == SearchToken.Not:
-            # TODO: SQLite treats NOT as a *binary* operator. Hopefully a search
-            # term has already been added before this.
-            match_query.append(" NOT ")
-        elif token == SearchToken.Or:
-            match_query.append(" OR ")
-        elif token == SearchToken.And:
-            match_query.append(" AND ")
-        else:
-            raise ValueError(f"unknown token {token}")
 
-    return "".join(match_query)
+    # Pull out the individual words, discarding any non-word characters.
+    results = re.findall(r"([\w\-]+)", search_term, re.UNICODE)
 
-
-def _parse_query_for_sqlite(search_term: str) -> str:
-    """Takes a plain unicode string from the user and converts it into a form
-    that can be passed to sqllite's matchinfo().
-    """
-    return _tokens_to_sqlite_match_query(_tokenize_query(search_term))
+    if isinstance(database_engine, PostgresEngine):
+        return " & ".join(result for result in results)
+    elif isinstance(database_engine, Sqlite3Engine):
+        return " & ".join(result + "*" for result in results)
+    else:
+        # This should be unreachable.
+        raise Exception("Unrecognized database engine")
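
For reference, the hunk above reinstates the pre-websearch behaviour: the restored _parse_query AND-joins the words of the user's query, and on SQLite appends "*" to each word so MATCH does prefix matching. A minimal standalone sketch of what the restored helper produces (the function name and inputs here are illustrative, not part of the patch):

    import re

    def parse_query_sketch(is_postgres: bool, search_term: str) -> str:
        # Pull out the individual words, discarding non-word characters,
        # exactly as the restored _parse_query does.
        words = re.findall(r"([\w\-]+)", search_term, re.UNICODE)
        if is_postgres:
            # Passed to to_tsquery('english', ...) on Postgres.
            return " & ".join(words)
        # Passed to FTS MATCH on SQLite; '*' requests prefix matching.
        return " & ".join(word + "*" for word in words)

    print(parse_query_sketch(True, "quick brown"))   # quick & brown
    print(parse_query_sketch(False, "quick brown"))  # quick* & brown*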
diff --git a/synapse/storage/engines/postgres.py b/synapse/storage/engines/postgres.py
index 9bf74bbf59..d8c0f64d9a 100644
--- a/synapse/storage/engines/postgres.py
+++ b/synapse/storage/engines/postgres.py
@@ -170,22 +170,6 @@ class PostgresEngine(
         """Do we support the `RETURNING` clause in insert/update/delete?"""
         return True
 
-    @property
-    def tsquery_func(self) -> str:
-        """
-        Selects a tsquery_* func to use.
-
-        Ref: https://www.postgresql.org/docs/current/textsearch-controls.html
-
-        Returns:
-            The function name.
-        """
-        # Postgres 11 added support for websearch_to_tsquery.
-        assert self._version is not None
-        if self._version >= 110000:
-            return "websearch_to_tsquery"
-        return "plainto_tsquery"
-
     def is_deadlock(self, error: Exception) -> bool:
         if isinstance(error, psycopg2.DatabaseError):
             # https://www.postgresql.org/docs/current/static/errcodes-appendix.html
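
The removed tsquery_func property above picked a query parser based on the Postgres server version; with this revert, search.py calls to_tsquery directly and supplies its own operators via _parse_query. A rough standalone sketch of the version check the deleted property performed (the numeric version argument is a stand-in for the value psycopg2 reports):

    def tsquery_func_sketch(pg_numeric_version: int) -> str:
        # websearch_to_tsquery was added in Postgres 11 (numeric version 110000).
        if pg_numeric_version >= 110000:
            return "websearch_to_tsquery"
        return "plainto_tsquery"

    assert tsquery_func_sketch(140005) == "websearch_to_tsquery"
    assert tsquery_func_sketch(100021) == "plainto_tsquery"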
diff --git a/synapse/storage/schema/main/delta/73/10_update_sqlite_fts4_tokenizer.py b/synapse/storage/schema/main/delta/73/10_update_sqlite_fts4_tokenizer.py
deleted file mode 100644
index 3de0a709eb..0000000000
--- a/synapse/storage/schema/main/delta/73/10_update_sqlite_fts4_tokenizer.py
+++ /dev/null
@@ -1,62 +0,0 @@
-# Copyright 2022 The Matrix.org Foundation C.I.C.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import json
-
-from synapse.storage.engines import BaseDatabaseEngine, Sqlite3Engine
-from synapse.storage.types import Cursor
-
-
-def run_create(cur: Cursor, database_engine: BaseDatabaseEngine) -> None:
-    """
-    Upgrade the event_search table to use the porter tokenizer if it isn't already
-
-    Applies only for sqlite.
-    """
-    if not isinstance(database_engine, Sqlite3Engine):
-        return
-
-    # Rebuild the table event_search table with tokenize=porter configured.
-    cur.execute("DROP TABLE event_search")
-    cur.execute(
-        """
-        CREATE VIRTUAL TABLE event_search
-        USING fts4 (tokenize=porter, event_id, room_id, sender, key, value )
-        """
-    )
-
-    # Re-run the background job to re-populate the event_search table.
-    cur.execute("SELECT MIN(stream_ordering) FROM events")
-    row = cur.fetchone()
-    min_stream_id = row[0]
-
-    # If there are not any events, nothing to do.
-    if min_stream_id is None:
-        return
-
-    cur.execute("SELECT MAX(stream_ordering) FROM events")
-    row = cur.fetchone()
-    max_stream_id = row[0]
-
-    progress = {
-        "target_min_stream_id_inclusive": min_stream_id,
-        "max_stream_id_exclusive": max_stream_id + 1,
-    }
-    progress_json = json.dumps(progress)
-
-    sql = """
-    INSERT into background_updates (ordering, update_name, progress_json)
-    VALUES (?, ?, ?)
-    """
-
-    cur.execute(sql, (7310, "event_search", progress_json))
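
The deleted migration above rebuilt event_search as an FTS4 table using the porter tokenizer and re-queued the background job that repopulates it. A self-contained illustration of the stemming behaviour that tokenizer enables (plain sqlite3, assuming the interpreter's SQLite build ships with FTS4; the table contents are made up):

    import sqlite3

    conn = sqlite3.connect(":memory:")
    # Same virtual-table definition as the deleted delta.
    conn.execute(
        "CREATE VIRTUAL TABLE event_search"
        " USING fts4 (tokenize=porter, event_id, room_id, sender, key, value)"
    )
    conn.execute(
        "INSERT INTO event_search (event_id, room_id, key, value)"
        " VALUES ('$ev1', '!room:example.org', 'content.body', 'jumping foxes')"
    )
    # Porter stemming lets the query 'jump' match the stored 'jumping'.
    print(conn.execute(
        "SELECT event_id FROM event_search WHERE value MATCH 'jump'"
    ).fetchall())  # [('$ev1',)]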
diff --git a/tests/storage/test_room_search.py b/tests/storage/test_room_search.py
index 868b5bee84..e747c6b50e 100644
--- a/tests/storage/test_room_search.py
+++ b/tests/storage/test_room_search.py
@@ -12,22 +12,11 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from typing import List, Tuple, Union
-from unittest.case import SkipTest
-from unittest.mock import PropertyMock, patch
-
-from twisted.test.proto_helpers import MemoryReactor
-
 import synapse.rest.admin
 from synapse.api.constants import EventTypes
 from synapse.api.errors import StoreError
 from synapse.rest.client import login, room
-from synapse.server import HomeServer
-from synapse.storage.databases.main import DataStore
-from synapse.storage.databases.main.search import Phrase, SearchToken, _tokenize_query
 from synapse.storage.engines import PostgresEngine
-from synapse.storage.engines.sqlite import Sqlite3Engine
-from synapse.util import Clock
 
 from tests.unittest import HomeserverTestCase, skip_unless
 from tests.utils import USE_POSTGRES_FOR_TESTS
@@ -198,213 +187,3 @@ class EventSearchInsertionTest(HomeserverTestCase):
             ),
         )
         self.assertCountEqual(values, ["hi", "2"])
-
-
-class MessageSearchTest(HomeserverTestCase):
-    """
-    Check message search.
-
-    A powerful way to check the behaviour is to run the following in Postgres >= 11:
-
-        # SELECT websearch_to_tsquery('english', <your string>);
-
-    The result can be compared to the tokenized version for SQLite and Postgres < 11.
-
-    """
-
-    servlets = [
-        synapse.rest.admin.register_servlets_for_client_rest_resource,
-        login.register_servlets,
-        room.register_servlets,
-    ]
-
-    PHRASE = "the quick brown fox jumps over the lazy dog"
-
-    # Each entry is a search query, followed by either a boolean of whether it is
-    # in the phrase OR a tuple of booleans: whether it matches using websearch
-    # and using plain search.
-    COMMON_CASES: List[Tuple[str, Union[bool, Tuple[bool, bool]]]] = [
-        ("nope", False),
-        ("brown", True),
-        ("quick brown", True),
-        ("brown quick", True),
-        ("quick \t brown", True),
-        ("jump", True),
-        ("brown nope", False),
-        ('"brown quick"', (False, True)),
-        ('"jumps over"', True),
-        ('"quick fox"', (False, True)),
-        ("nope OR doublenope", False),
-        ("furphy OR fox", (True, False)),
-        ("fox -nope", (True, False)),
-        ("fox -brown", (False, True)),
-        ('"fox" quick', True),
-        ('"quick brown', True),
-        ('" quick "', True),
-        ('" nope"', False),
-    ]
-    # TODO Test non-ASCII cases.
-
-    # Case that fail on SQLite.
-    POSTGRES_CASES: List[Tuple[str, Union[bool, Tuple[bool, bool]]]] = [
-        # SQLite treats NOT as a binary operator.
-        ("- fox", (False, True)),
-        ("- nope", (True, False)),
-        ('"-fox quick', (False, True)),
-        # PostgreSQL skips stop words.
-        ('"the quick brown"', True),
-        ('"over lazy"', True),
-    ]
-
-    def prepare(
-        self, reactor: MemoryReactor, clock: Clock, homeserver: HomeServer
-    ) -> None:
-        # Register a user and create a room, create some messages
-        self.register_user("alice", "password")
-        self.access_token = self.login("alice", "password")
-        self.room_id = self.helper.create_room_as("alice", tok=self.access_token)
-
-        # Send the phrase as a message and check it was created
-        response = self.helper.send(self.room_id, self.PHRASE, tok=self.access_token)
-        self.assertIn("event_id", response)
-
-        # The behaviour of a missing trailing double quote changed in PostgreSQL 14
-        # from ignoring the initial double quote to treating it as a phrase.
-        main_store = homeserver.get_datastores().main
-        found = False
-        if isinstance(main_store.database_engine, PostgresEngine):
-            assert main_store.database_engine._version is not None
-            found = main_store.database_engine._version < 140000
-        self.COMMON_CASES.append(('"fox quick', (found, True)))
-
-    def test_tokenize_query(self) -> None:
-        """Test the custom logic to tokenize a user's query."""
-        cases = (
-            ("brown", ["brown"]),
-            ("quick brown", ["quick", SearchToken.And, "brown"]),
-            ("quick \t brown", ["quick", SearchToken.And, "brown"]),
-            ('"brown quick"', [Phrase(["brown", "quick"])]),
-            ("furphy OR fox", ["furphy", SearchToken.Or, "fox"]),
-            ("fox -brown", ["fox", SearchToken.Not, "brown"]),
-            ("- fox", [SearchToken.Not, "fox"]),
-            ('"fox" quick', [Phrase(["fox"]), SearchToken.And, "quick"]),
-            # No trailing double quote.
-            ('"fox quick', [Phrase(["fox", "quick"])]),
-            ('"-fox quick', [Phrase(["-fox", "quick"])]),
-            ('" quick "', [Phrase(["quick"])]),
-            (
-                'q"uick brow"n',
-                [
-                    "q",
-                    SearchToken.And,
-                    Phrase(["uick", "brow"]),
-                    SearchToken.And,
-                    "n",
-                ],
-            ),
-            (
-                '-"quick brown"',
-                [SearchToken.Not, Phrase(["quick", "brown"])],
-            ),
-        )
-
-        for query, expected in cases:
-            tokenized = _tokenize_query(query)
-            self.assertEqual(
-                tokenized, expected, f"{tokenized} != {expected} for {query}"
-            )
-
-    def _check_test_cases(
-        self,
-        store: DataStore,
-        cases: List[Tuple[str, Union[bool, Tuple[bool, bool]]]],
-        index=0,
-    ) -> None:
-        # Run all the test cases versus search_msgs
-        for query, expect_to_contain in cases:
-            if isinstance(expect_to_contain, tuple):
-                expect_to_contain = expect_to_contain[index]
-
-            result = self.get_success(
-                store.search_msgs([self.room_id], query, ["content.body"])
-            )
-            self.assertEquals(
-                result["count"],
-                1 if expect_to_contain else 0,
-                f"expected '{query}' to match '{self.PHRASE}'"
-                if expect_to_contain
-                else f"'{query}' unexpectedly matched '{self.PHRASE}'",
-            )
-            self.assertEquals(
-                len(result["results"]),
-                1 if expect_to_contain else 0,
-                "results array length should match count",
-            )
-
-        # Run them again versus search_rooms
-        for query, expect_to_contain in cases:
-            if isinstance(expect_to_contain, tuple):
-                expect_to_contain = expect_to_contain[index]
-
-            result = self.get_success(
-                store.search_rooms([self.room_id], query, ["content.body"], 10)
-            )
-            self.assertEquals(
-                result["count"],
-                1 if expect_to_contain else 0,
-                f"expected '{query}' to match '{self.PHRASE}'"
-                if expect_to_contain
-                else f"'{query}' unexpectedly matched '{self.PHRASE}'",
-            )
-            self.assertEquals(
-                len(result["results"]),
-                1 if expect_to_contain else 0,
-                "results array length should match count",
-            )
-
-    def test_postgres_web_search_for_phrase(self):
-        """
-        Test searching for phrases using typical web search syntax, as per postgres' websearch_to_tsquery.
-        This test is skipped unless the postgres instance supports websearch_to_tsquery.
-        """
-
-        store = self.hs.get_datastores().main
-        if not isinstance(store.database_engine, PostgresEngine):
-            raise SkipTest("Test only applies when postgres is used as the database")
-
-        if store.database_engine.tsquery_func != "websearch_to_tsquery":
-            raise SkipTest(
-                "Test only applies when postgres supporting websearch_to_tsquery is used as the database"
-            )
-
-        self._check_test_cases(store, self.COMMON_CASES + self.POSTGRES_CASES, index=0)
-
-    def test_postgres_non_web_search_for_phrase(self):
-        """
-        Test postgres searching for phrases without using web search, which is used when websearch_to_tsquery isn't
-        supported by the current postgres version.
-        """
-
-        store = self.hs.get_datastores().main
-        if not isinstance(store.database_engine, PostgresEngine):
-            raise SkipTest("Test only applies when postgres is used as the database")
-
-        # Patch supports_websearch_to_tsquery to always return False to ensure we're testing the plainto_tsquery path.
-        with patch(
-            "synapse.storage.engines.postgres.PostgresEngine.tsquery_func",
-            new_callable=PropertyMock,
-        ) as supports_websearch_to_tsquery:
-            supports_websearch_to_tsquery.return_value = "plainto_tsquery"
-            self._check_test_cases(
-                store, self.COMMON_CASES + self.POSTGRES_CASES, index=1
-            )
-
-    def test_sqlite_search(self):
-        """
-        Test sqlite searching for phrases.
-        """
-        store = self.hs.get_datastores().main
-        if not isinstance(store.database_engine, Sqlite3Engine):
-            raise SkipTest("Test only applies when sqlite is used as the database")
-
-        self._check_test_cases(store, self.COMMON_CASES, index=0)
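
The removed MessageSearchTest docstring suggests checking behaviour by hand with SELECT websearch_to_tsquery('english', <your string>). A small hypothetical harness for that comparison (the DSN and query list are placeholders; needs psycopg2 and Postgres 11+):

    import psycopg2

    QUERIES = ['"brown quick"', "fox -nope", "nope OR doublenope"]

    with psycopg2.connect("dbname=synapse") as conn, conn.cursor() as cur:
        for query in QUERIES:
            cur.execute(
                "SELECT websearch_to_tsquery('english', %s)::text,"
                "       plainto_tsquery('english', %s)::text",
                (query, query),
            )
            web, plain = cur.fetchone()
            # websearch_to_tsquery understands quotes, '-' and OR;
            # plainto_tsquery just ANDs the remaining words together.
            print(f"{query!r}: websearch={web!r}  plain={plain!r}")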