Fix a bug introduced in Synapse v1.74.0 where searching with colons when using ICU for search term tokenisation would fail with an error. (#15079)

Co-authored-by: David Robertson <davidr@element.io>
author: reivilibre <oliverw@matrix.org> 2023-02-20 12:00:18 +0000
committer: GitHub <noreply@github.com> 2023-02-20 12:00:18 +0000
commit: 1cbc3f197cc1b9732649ffb769b05d90c0e904d7 (patch)
tree: ab5ccdaf1cd2caac240bcc5370e6fa6c64b96bc8 /synapse
parent: Bump types-setuptools from 67.1.0.0 to 67.3.0.1 (#15105) (diff)
download: synapse-1cbc3f197cc1b9732649ffb769b05d90c0e904d7.tar.xz
1 file changed, 20 insertions, 4 deletions
diff --git a/synapse/storage/databases/main/user_directory.py b/synapse/storage/databases/main/user_directory.py
index f6a6fd4079..30af4b3b6c 100644
--- a/synapse/storage/databases/main/user_directory.py
+++ b/synapse/storage/databases/main/user_directory.py
@@ -918,11 +918,19 @@ def _parse_query_postgres(search_term: str) -> Tuple[str, str, str]:
     We use this so that we can add prefix matching, which isn't something
     that is supported by default.
     """
-    results = _parse_words(search_term)
+    escaped_words = []
+    for word in _parse_words(search_term):
+        # Postgres tsvector and tsquery quoting rules:
+        # words potentially containing punctuation should be quoted
+        # and then existing quotes and backslashes should be doubled
+        # See: https://www.postgresql.org/docs/current/datatype-textsearch.html#DATATYPE-TSQUERY
+
+        quoted_word = word.replace("'", "''").replace("\\", "\\\\")
+        escaped_words.append(f"'{quoted_word}'")
 
-    both = " & ".join("(%s:* | %s)" % (result, result) for result in results)
-    exact = " & ".join("%s" % (result,) for result in results)
-    prefix = " & ".join("%s:*" % (result,) for result in results)
+    both = " & ".join("(%s:* | %s)" % (word, word) for word in escaped_words)
+    exact = " & ".join("%s" % (word,) for word in escaped_words)
+    prefix = " & ".join("%s:*" % (word,) for word in escaped_words)
 
     return both, exact, prefix
 
@@ -944,6 +952,14 @@ def _parse_words(search_term: str) -> List[str]:
     if USE_ICU:
         return _parse_words_with_icu(search_term)
 
+    return _parse_words_with_regex(search_term)
+
+
+def _parse_words_with_regex(search_term: str) -> List[str]:
+    """
+    Break down search term into words, when we don't have ICU available.
+    See: `_parse_words`
+    """
     return re.findall(r"([\w\-]+)", search_term, re.UNICODE)
author	reivilibre <oliverw@matrix.org>	2023-02-20 12:00:18 +0000
committer	GitHub <noreply@github.com>	2023-02-20 12:00:18 +0000
commit	1cbc3f197cc1b9732649ffb769b05d90c0e904d7 (patch)
tree	ab5ccdaf1cd2caac240bcc5370e6fa6c64b96bc8 /synapse
parent	Bump types-setuptools from 67.1.0.0 to 67.3.0.1 (#15105) (diff)
download	synapse-1cbc3f197cc1b9732649ffb769b05d90c0e904d7.tar.xz