summary refs log tree commit diff
diff options
context:
space:
mode:
authorHillery Shay <shaysquared@gmail.com>2021-09-22 08:25:26 -0700
committerGitHub <noreply@github.com>2021-09-22 08:25:26 -0700
commitf78b68a96b1f179043b38b4109e09fa0a315643d (patch)
tree385b7aaa1987b730e15eb396dedaa71d4ac0557c
parentFix invalidating OTK count cache after claim (#10875) (diff)
downloadsynapse-f78b68a96b1f179043b38b4109e09fa0a315643d.tar.xz
Treat "\u0000" as "\u0020" for the purposes of message search (message indexing) (#10820)
* add test to check if null code points are being inserted

* add logic to detect and replace null code points before insertion into db

* lints

* add license to test

* change approach to null substitution

* add type hint for SearchEntry

* Add changelog entry

Signed-off-by: H.Shay <shaysquared@gmail.com>

* updated changelog

* update chanelog message

* remove duplicate changelog

* Update synapse/storage/databases/main/events.py remove extra space

Co-authored-by: Patrick Cloke <clokep@users.noreply.github.com>

* rename and move test file, update tests, delete old test file

* fix typo in comments

* update _find_highlights_in_postgres to replace null byte with space

* replace null byte in sqlite search insertion

* beef up and reorganize test for this pr

* update changelog

* add type hints and update docstring

* check db engine directly vs using env variable

* refactor tests to be less repetetive

* move rplace logic into seperate function

* requested changes

* Fix typo.

* Update synapse/storage/databases/main/search.py

Co-authored-by: reivilibre <olivier@librepush.net>

* Update changelog.d/10820.misc

Co-authored-by: Aaron Raimist <aaron@raim.ist>

Co-authored-by: Patrick Cloke <clokep@users.noreply.github.com>
Co-authored-by: reivilibre <olivier@librepush.net>
Co-authored-by: Aaron Raimist <aaron@raim.ist>
-rw-r--r--changelog.d/10820.misc1
-rw-r--r--synapse/storage/databases/main/search.py34
-rw-r--r--tests/storage/test_room_search.py74
3 files changed, 100 insertions, 9 deletions
diff --git a/changelog.d/10820.misc b/changelog.d/10820.misc
new file mode 100644
index 0000000000..4373bf6f6b
--- /dev/null
+++ b/changelog.d/10820.misc
@@ -0,0 +1 @@
+Fix a long-standing bug where an `m.room.message` event containing a null byte would cause an internal server error.
\ No newline at end of file
diff --git a/synapse/storage/databases/main/search.py b/synapse/storage/databases/main/search.py
index 6480d5a9f5..2a1e99e17a 100644
--- a/synapse/storage/databases/main/search.py
+++ b/synapse/storage/databases/main/search.py
@@ -15,12 +15,12 @@
 import logging
 import re
 from collections import namedtuple
-from typing import Collection, List, Optional, Set
+from typing import Collection, Iterable, List, Optional, Set
 
 from synapse.api.errors import SynapseError
 from synapse.events import EventBase
 from synapse.storage._base import SQLBaseStore, db_to_json, make_in_list_sql_clause
-from synapse.storage.database import DatabasePool
+from synapse.storage.database import DatabasePool, LoggingTransaction
 from synapse.storage.databases.main.events_worker import EventRedactBehaviour
 from synapse.storage.engines import PostgresEngine, Sqlite3Engine
 
@@ -32,14 +32,24 @@ SearchEntry = namedtuple(
 )
 
 
+def _clean_value_for_search(value: str) -> str:
+    """
+    Replaces any null code points in the string with spaces as
+    Postgres and SQLite do not like the insertion of strings with
+    null code points into the full-text search tables.
+    """
+    return value.replace("\u0000", " ")
+
+
 class SearchWorkerStore(SQLBaseStore):
-    def store_search_entries_txn(self, txn, entries):
+    def store_search_entries_txn(
+        self, txn: LoggingTransaction, entries: Iterable[SearchEntry]
+    ) -> None:
         """Add entries to the search table
 
         Args:
-            txn (cursor):
-            entries (iterable[SearchEntry]):
-                entries to be added to the table
+            txn:
+            entries: entries to be added to the table
         """
         if not self.hs.config.enable_search:
             return
@@ -55,7 +65,7 @@ class SearchWorkerStore(SQLBaseStore):
                     entry.event_id,
                     entry.room_id,
                     entry.key,
-                    entry.value,
+                    _clean_value_for_search(entry.value),
                     entry.stream_ordering,
                     entry.origin_server_ts,
                 )
@@ -70,11 +80,16 @@ class SearchWorkerStore(SQLBaseStore):
                 " VALUES (?,?,?,?)"
             )
             args = (
-                (entry.event_id, entry.room_id, entry.key, entry.value)
+                (
+                    entry.event_id,
+                    entry.room_id,
+                    entry.key,
+                    _clean_value_for_search(entry.value),
+                )
                 for entry in entries
             )
-
             txn.execute_batch(sql, args)
+
         else:
             # This should be unreachable.
             raise Exception("Unrecognized database engine")
@@ -646,6 +661,7 @@ class SearchStore(SearchBackgroundUpdateStore):
                 for key in ("body", "name", "topic"):
                     v = event.content.get(key, None)
                     if v:
+                        v = _clean_value_for_search(v)
                         values.append(v)
 
                 if not values:
diff --git a/tests/storage/test_room_search.py b/tests/storage/test_room_search.py
new file mode 100644
index 0000000000..8971ecccbd
--- /dev/null
+++ b/tests/storage/test_room_search.py
@@ -0,0 +1,74 @@
+# Copyright 2021 The Matrix.org Foundation C.I.C.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import synapse.rest.admin
+from synapse.rest.client import login, room
+from synapse.storage.engines import PostgresEngine
+
+from tests.unittest import HomeserverTestCase
+
+
+class NullByteInsertionTest(HomeserverTestCase):
+    servlets = [
+        synapse.rest.admin.register_servlets_for_client_rest_resource,
+        login.register_servlets,
+        room.register_servlets,
+    ]
+
+    def test_null_byte(self):
+        """
+        Postgres/SQLite don't like null bytes going into the search tables. Internally
+        we replace those with a space.
+
+        Ensure this doesn't break anything.
+        """
+
+        # Register a user and create a room, create some messages
+        self.register_user("alice", "password")
+        access_token = self.login("alice", "password")
+        room_id = self.helper.create_room_as("alice", tok=access_token)
+
+        # Send messages and ensure they don't cause an internal server
+        # error
+        for body in ["hi\u0000bob", "another message", "hi alice"]:
+            response = self.helper.send(room_id, body, tok=access_token)
+            self.assertIn("event_id", response)
+
+        # Check that search works for the message where the null byte was replaced
+        store = self.hs.get_datastore()
+        result = self.get_success(
+            store.search_msgs([room_id], "hi bob", ["content.body"])
+        )
+        self.assertEquals(result.get("count"), 1)
+        if isinstance(store.database_engine, PostgresEngine):
+            self.assertIn("hi", result.get("highlights"))
+            self.assertIn("bob", result.get("highlights"))
+
+        # Check that search works for an unrelated message
+        result = self.get_success(
+            store.search_msgs([room_id], "another", ["content.body"])
+        )
+        self.assertEquals(result.get("count"), 1)
+        if isinstance(store.database_engine, PostgresEngine):
+            self.assertIn("another", result.get("highlights"))
+
+        # Check that search works for a search term that overlaps with the message
+        # containing a null byte and an unrelated message.
+        result = self.get_success(store.search_msgs([room_id], "hi", ["content.body"]))
+        self.assertEquals(result.get("count"), 2)
+        result = self.get_success(
+            store.search_msgs([room_id], "hi alice", ["content.body"])
+        )
+        if isinstance(store.database_engine, PostgresEngine):
+            self.assertIn("alice", result.get("highlights"))