summary refs log tree commit diff
path: root/tests
diff options
context:
space:
mode:
authorJames Salter <iteration@gmail.com>2022-10-25 19:05:22 +0100
committerGitHub <noreply@github.com>2022-10-25 14:05:22 -0400
commitd902181de98399d90c46c4e4e2cf631064757941 (patch)
treefedb176982c540a8187b3222efec08bb06dbc7ab /tests
parentMerge branch 'release-v1.70' into develop (diff)
downloadsynapse-d902181de98399d90c46c4e4e2cf631064757941.tar.xz
Unified search query syntax using the full-text search capabilities of the underlying DB. (#11635)
Support a unified search query syntax which leverages more of the full-text
search of each database supported by Synapse.

Supports, with the same syntax across Postgresql 11+ and Sqlite:

- quoted "search terms"
- `AND`, `OR`, `-` (negation) operators
- Matching words based on their stem, e.g. searches for "dog" matches
  documents containing "dogs". 

This is achieved by 

- If on postgresql 11+, pass the user input to `websearch_to_tsquery`
- If on sqlite, manually parse the query and transform it into the sqlite-specific
  query syntax.

Note that postgresql 10, which is close to end-of-life, falls back to using
`phraseto_tsquery`, which only supports a subset of the features.

Multiple terms separated by a space are implicitly ANDed.

Note that:

1. There is no escaping of full-text syntax that might be supported by the database;
  e.g. `NOT`, `NEAR`, `*` in sqlite. This runs the risk that people might discover this
  as accidental functionality and depend on something we don't guarantee.
2. English text is assumed for stemming. To support other languages, either the target
  language needs to be known at the time of indexing the message (via room metadata,
  or otherwise), or a separate index for each language supported could be created.

Sqlite docs: https://www.sqlite.org/fts3.html#full_text_index_queries
Postgres docs: https://www.postgresql.org/docs/11/textsearch-controls.html
Diffstat (limited to 'tests')
-rw-r--r--tests/storage/test_room_search.py213
1 files changed, 213 insertions, 0 deletions
diff --git a/tests/storage/test_room_search.py b/tests/storage/test_room_search.py
index e747c6b50e..9ddc19900a 100644
--- a/tests/storage/test_room_search.py
+++ b/tests/storage/test_room_search.py
@@ -12,11 +12,22 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from typing import List, Tuple, Union
+from unittest.case import SkipTest
+from unittest.mock import PropertyMock, patch
+
+from twisted.test.proto_helpers import MemoryReactor
+
 import synapse.rest.admin
 from synapse.api.constants import EventTypes
 from synapse.api.errors import StoreError
 from synapse.rest.client import login, room
+from synapse.server import HomeServer
+from synapse.storage.databases.main import DataStore
+from synapse.storage.databases.main.search import Phrase, SearchToken, _tokenize_query
 from synapse.storage.engines import PostgresEngine
+from synapse.storage.engines.sqlite import Sqlite3Engine
+from synapse.util import Clock
 
 from tests.unittest import HomeserverTestCase, skip_unless
 from tests.utils import USE_POSTGRES_FOR_TESTS
@@ -187,3 +198,205 @@ class EventSearchInsertionTest(HomeserverTestCase):
             ),
         )
         self.assertCountEqual(values, ["hi", "2"])
+
+
+class MessageSearchTest(HomeserverTestCase):
+    """
+    Check message search.
+
+    A powerful way to check the behaviour is to run the following in Postgres >= 11:
+
+        # SELECT websearch_to_tsquery('english', <your string>);
+
+    The result can be compared to the tokenized version for SQLite and Postgres < 11.
+
+    """
+
+    servlets = [
+        synapse.rest.admin.register_servlets_for_client_rest_resource,
+        login.register_servlets,
+        room.register_servlets,
+    ]
+
+    PHRASE = "the quick brown fox jumps over the lazy dog"
+
+    # Each entry is a search query, followed by either a boolean of whether it is
+    # in the phrase OR a tuple of booleans: whether it matches using websearch
+    # and using plain search.
+    COMMON_CASES: List[Tuple[str, Union[bool, Tuple[bool, bool]]]] = [
+        ("nope", False),
+        ("brown", True),
+        ("quick brown", True),
+        ("brown quick", True),
+        ("quick \t brown", True),
+        ("jump", True),
+        ("brown nope", False),
+        ('"brown quick"', (False, True)),
+        ('"jumps over"', True),
+        ('"quick fox"', (False, True)),
+        ("nope OR doublenope", False),
+        ("furphy OR fox", (True, False)),
+        ("fox -nope", (True, False)),
+        ("fox -brown", (False, True)),
+        ('"fox" quick', True),
+        ('"fox quick', True),
+        ('"quick brown', True),
+        ('" quick "', True),
+        ('" nope"', False),
+    ]
+    # TODO Test non-ASCII cases.
+
+    # Case that fail on SQLite.
+    POSTGRES_CASES: List[Tuple[str, Union[bool, Tuple[bool, bool]]]] = [
+        # SQLite treats NOT as a binary operator.
+        ("- fox", (False, True)),
+        ("- nope", (True, False)),
+        ('"-fox quick', (False, True)),
+        # PostgreSQL skips stop words.
+        ('"the quick brown"', True),
+        ('"over lazy"', True),
+    ]
+
+    def prepare(
+        self, reactor: MemoryReactor, clock: Clock, homeserver: HomeServer
+    ) -> None:
+        # Register a user and create a room, create some messages
+        self.register_user("alice", "password")
+        self.access_token = self.login("alice", "password")
+        self.room_id = self.helper.create_room_as("alice", tok=self.access_token)
+
+        # Send the phrase as a message and check it was created
+        response = self.helper.send(self.room_id, self.PHRASE, tok=self.access_token)
+        self.assertIn("event_id", response)
+
+    def test_tokenize_query(self) -> None:
+        """Test the custom logic to tokenize a user's query."""
+        cases = (
+            ("brown", ["brown"]),
+            ("quick brown", ["quick", SearchToken.And, "brown"]),
+            ("quick \t brown", ["quick", SearchToken.And, "brown"]),
+            ('"brown quick"', [Phrase(["brown", "quick"])]),
+            ("furphy OR fox", ["furphy", SearchToken.Or, "fox"]),
+            ("fox -brown", ["fox", SearchToken.Not, "brown"]),
+            ("- fox", [SearchToken.Not, "fox"]),
+            ('"fox" quick', [Phrase(["fox"]), SearchToken.And, "quick"]),
+            # No trailing double quoe.
+            ('"fox quick', ["fox", SearchToken.And, "quick"]),
+            ('"-fox quick', [SearchToken.Not, "fox", SearchToken.And, "quick"]),
+            ('" quick "', [Phrase(["quick"])]),
+            (
+                'q"uick brow"n',
+                [
+                    "q",
+                    SearchToken.And,
+                    Phrase(["uick", "brow"]),
+                    SearchToken.And,
+                    "n",
+                ],
+            ),
+            (
+                '-"quick brown"',
+                [SearchToken.Not, Phrase(["quick", "brown"])],
+            ),
+        )
+
+        for query, expected in cases:
+            tokenized = _tokenize_query(query)
+            self.assertEqual(
+                tokenized, expected, f"{tokenized} != {expected} for {query}"
+            )
+
+    def _check_test_cases(
+        self,
+        store: DataStore,
+        cases: List[Tuple[str, Union[bool, Tuple[bool, bool]]]],
+        index=0,
+    ) -> None:
+        # Run all the test cases versus search_msgs
+        for query, expect_to_contain in cases:
+            if isinstance(expect_to_contain, tuple):
+                expect_to_contain = expect_to_contain[index]
+
+            result = self.get_success(
+                store.search_msgs([self.room_id], query, ["content.body"])
+            )
+            self.assertEquals(
+                result["count"],
+                1 if expect_to_contain else 0,
+                f"expected '{query}' to match '{self.PHRASE}'"
+                if expect_to_contain
+                else f"'{query}' unexpectedly matched '{self.PHRASE}'",
+            )
+            self.assertEquals(
+                len(result["results"]),
+                1 if expect_to_contain else 0,
+                "results array length should match count",
+            )
+
+        # Run them again versus search_rooms
+        for query, expect_to_contain in cases:
+            if isinstance(expect_to_contain, tuple):
+                expect_to_contain = expect_to_contain[index]
+
+            result = self.get_success(
+                store.search_rooms([self.room_id], query, ["content.body"], 10)
+            )
+            self.assertEquals(
+                result["count"],
+                1 if expect_to_contain else 0,
+                f"expected '{query}' to match '{self.PHRASE}'"
+                if expect_to_contain
+                else f"'{query}' unexpectedly matched '{self.PHRASE}'",
+            )
+            self.assertEquals(
+                len(result["results"]),
+                1 if expect_to_contain else 0,
+                "results array length should match count",
+            )
+
+    def test_postgres_web_search_for_phrase(self):
+        """
+        Test searching for phrases using typical web search syntax, as per postgres' websearch_to_tsquery.
+        This test is skipped unless the postgres instance supports websearch_to_tsquery.
+        """
+
+        store = self.hs.get_datastores().main
+        if not isinstance(store.database_engine, PostgresEngine):
+            raise SkipTest("Test only applies when postgres is used as the database")
+
+        if store.database_engine.tsquery_func != "websearch_to_tsquery":
+            raise SkipTest(
+                "Test only applies when postgres supporting websearch_to_tsquery is used as the database"
+            )
+
+        self._check_test_cases(store, self.COMMON_CASES + self.POSTGRES_CASES, index=0)
+
+    def test_postgres_non_web_search_for_phrase(self):
+        """
+        Test postgres searching for phrases without using web search, which is used when websearch_to_tsquery isn't
+        supported by the current postgres version.
+        """
+
+        store = self.hs.get_datastores().main
+        if not isinstance(store.database_engine, PostgresEngine):
+            raise SkipTest("Test only applies when postgres is used as the database")
+
+        # Patch supports_websearch_to_tsquery to always return False to ensure we're testing the plainto_tsquery path.
+        with patch(
+            "synapse.storage.engines.postgres.PostgresEngine.tsquery_func",
+            new_callable=PropertyMock,
+        ) as supports_websearch_to_tsquery:
+            supports_websearch_to_tsquery.return_value = "plainto_tsquery"
+            self._check_test_cases(
+                store, self.COMMON_CASES + self.POSTGRES_CASES, index=1
+            )
+
+    def test_sqlite_search(self):
+        """
+        Test sqlite searching for phrases.
+        """
+        store = self.hs.get_datastores().main
+        if not isinstance(store.database_engine, Sqlite3Engine):
+            raise SkipTest("Test only applies when sqlite is used as the database")
+
+        self._check_test_cases(store, self.COMMON_CASES, index=0)