diff --git a/changelog.d/14464.feature b/changelog.d/14464.feature
new file mode 100644
index 0000000000..688ea32117
--- /dev/null
+++ b/changelog.d/14464.feature
@@ -0,0 +1 @@
+Improve user search for international display names.
diff --git a/debian/changelog b/debian/changelog
index 163b7210bf..5d3c4f7d6b 100644
--- a/debian/changelog
+++ b/debian/changelog
@@ -1,3 +1,10 @@
+matrix-synapse-py3 (1.74.0~rc1) UNRELEASED; urgency=medium
+
+ * New dependency on libicu-dev to provide improved results for user
+ search.
+
+ -- Synapse Packaging team <packages@matrix.org> Tue, 06 Dec 2022 15:28:10 +0000
+
matrix-synapse-py3 (1.73.0) stable; urgency=medium
* New Synapse release 1.73.0.
diff --git a/debian/control b/debian/control
index 86f5a66d02..bc628cec08 100644
--- a/debian/control
+++ b/debian/control
@@ -8,6 +8,8 @@ Build-Depends:
dh-virtualenv (>= 1.1),
libsystemd-dev,
libpq-dev,
+ libicu-dev,
+ pkg-config,
lsb-release,
python3-dev,
python3,
diff --git a/docker/Dockerfile b/docker/Dockerfile
index 185d5bc3d4..7e5123210a 100644
--- a/docker/Dockerfile
+++ b/docker/Dockerfile
@@ -97,6 +97,8 @@ RUN \
zlib1g-dev \
git \
curl \
+ libicu-dev \
+ pkg-config \
&& rm -rf /var/lib/apt/lists/*
diff --git a/docker/Dockerfile-dhvirtualenv b/docker/Dockerfile-dhvirtualenv
index 73165f6f85..f3b5b00ce6 100644
--- a/docker/Dockerfile-dhvirtualenv
+++ b/docker/Dockerfile-dhvirtualenv
@@ -84,6 +84,8 @@ RUN apt-get update -qq -o Acquire::Languages=none \
python3-venv \
sqlite3 \
libpq-dev \
+ libicu-dev \
+ pkg-config \
xmlsec1
# Install rust and ensure it's in the PATH
diff --git a/poetry.lock b/poetry.lock
index cac22e2ef0..ccda8a23fb 100644
--- a/poetry.lock
+++ b/poetry.lock
@@ -838,6 +838,14 @@ optional = false
python-versions = ">=3.5"
[[package]]
+name = "pyicu"
+version = "2.10.2"
+description = "Python extension wrapping the ICU C++ API"
+category = "main"
+optional = true
+python-versions = "*"
+
+[[package]]
name = "pyjwt"
version = "2.4.0"
description = "JSON Web Token implementation in Python"
@@ -1622,7 +1630,7 @@ docs = ["Sphinx", "repoze.sphinx.autointerface"]
test = ["zope.i18nmessageid", "zope.testing", "zope.testrunner"]
[extras]
-all = ["matrix-synapse-ldap3", "psycopg2", "psycopg2cffi", "psycopg2cffi-compat", "pysaml2", "authlib", "lxml", "sentry-sdk", "jaeger-client", "opentracing", "txredisapi", "hiredis", "Pympler"]
+all = ["matrix-synapse-ldap3", "psycopg2", "psycopg2cffi", "psycopg2cffi-compat", "pysaml2", "authlib", "lxml", "sentry-sdk", "jaeger-client", "opentracing", "txredisapi", "hiredis", "Pympler", "pyicu"]
cache-memory = ["Pympler"]
jwt = ["authlib"]
matrix-synapse-ldap3 = ["matrix-synapse-ldap3"]
@@ -1635,11 +1643,12 @@ sentry = ["sentry-sdk"]
systemd = ["systemd-python"]
test = ["parameterized", "idna"]
url-preview = ["lxml"]
+user-search = ["pyicu"]
[metadata]
lock-version = "1.1"
python-versions = "^3.7.1"
-content-hash = "8c44ceeb9df5c3ab43040400e0a6b895de49417e61293a1ba027640b34f03263"
+content-hash = "f20007013f33bc35a01e412c48adc62a936030f3074e06286674c5ad7f44d300"
[metadata.files]
attrs = [
@@ -2427,6 +2436,9 @@ pygments = [
{file = "Pygments-2.11.2-py3-none-any.whl", hash = "sha256:44238f1b60a76d78fc8ca0528ee429702aae011c265fe6a8dd8b63049ae41c65"},
{file = "Pygments-2.11.2.tar.gz", hash = "sha256:4e426f72023d88d03b2fa258de560726ce890ff3b630f88c21cbb8b2503b8c6a"},
]
+pyicu = [
+ {file = "PyICU-2.10.2.tar.gz", hash = "sha256:0c3309eea7fab6857507ace62403515b60fe096cbfb4f90d14f55ff75c5441c1"},
+]
pyjwt = [
{file = "PyJWT-2.4.0-py3-none-any.whl", hash = "sha256:72d1d253f32dbd4f5c88eaf1fdc62f3a19f676ccbadb9dbc5d07e951b2b26daf"},
{file = "PyJWT-2.4.0.tar.gz", hash = "sha256:d42908208c699b3b973cbeb01a969ba6a96c821eefb1c5bfe4c390c01d67abba"},
diff --git a/pyproject.toml b/pyproject.toml
index df59fa0562..bb383683cc 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -208,6 +208,7 @@ hiredis = { version = "*", optional = true }
Pympler = { version = "*", optional = true }
parameterized = { version = ">=0.7.4", optional = true }
idna = { version = ">=2.5", optional = true }
+pyicu = { version = ">=2.10.2", optional = true }
[tool.poetry.extras]
# NB: Packages that should be part of `pip install matrix-synapse[all]` need to be specified
@@ -230,6 +231,10 @@ redis = ["txredisapi", "hiredis"]
# Required to use experimental `caches.track_memory_usage` config option.
cache-memory = ["pympler"]
test = ["parameterized", "idna"]
+# Allows for better search for international characters in the user directory. This
+# requires libicu's development headers to be installed on the system (e.g.
+# libicu-dev on Debian-based distributions).
+user-search = ["pyicu"]
# The duplication here is awful. I hate hate hate hate hate it. However, for now I want
# to ensure you can still `pip install matrix-synapse[all]` like today. Two motivations:
@@ -261,6 +266,8 @@ all = [
"txredisapi", "hiredis",
# cache-memory
"pympler",
+ # improved user search
+ "pyicu",
# omitted:
# - test: it's useful to have this separate from dev deps in the olddeps job
# - systemd: this is a system-based requirement
diff --git a/stubs/icu.pyi b/stubs/icu.pyi
new file mode 100644
index 0000000000..efeda7938a
--- /dev/null
+++ b/stubs/icu.pyi
@@ -0,0 +1,25 @@
+# Copyright 2022 The Matrix.org Foundation C.I.C.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Stub for PyICU.
+
+class Locale:  # Stubs only the part of icu.Locale used for user directory search.
+ @staticmethod
+ def getDefault() -> Locale: ...
+
+class BreakIterator:  # Stubs only the methods used for user directory word splitting.
+ @staticmethod
+ def createWordInstance(locale: Locale) -> BreakIterator: ...
+ def setText(self, text: str) -> None: ...
+ def nextBoundary(self) -> int: ...
diff --git a/synapse/storage/databases/main/user_directory.py b/synapse/storage/databases/main/user_directory.py
index af9952f513..14ef5b040d 100644
--- a/synapse/storage/databases/main/user_directory.py
+++ b/synapse/storage/databases/main/user_directory.py
@@ -26,6 +26,14 @@ from typing import (
cast,
)
+try:
+    # ICU is optional: fall back if PyICU is missing or its library fails to load.
+    import icu
+
+    USE_ICU = True
+except ImportError:
+    USE_ICU = False
+
from typing_extensions import TypedDict
from synapse.api.errors import StoreError
@@ -900,7 +908,7 @@ def _parse_query_sqlite(search_term: str) -> str:
"""
# Pull out the individual words, discarding any non-word characters.
- results = re.findall(r"([\w\-]+)", search_term, re.UNICODE)
+ results = _parse_words(search_term)
return " & ".join("(%s* OR %s)" % (result, result) for result in results)
@@ -910,12 +918,63 @@ def _parse_query_postgres(search_term: str) -> Tuple[str, str, str]:
We use this so that we can add prefix matching, which isn't something
that is supported by default.
"""
-
- # Pull out the individual words, discarding any non-word characters.
- results = re.findall(r"([\w\-]+)", search_term, re.UNICODE)
+ results = _parse_words(search_term)
both = " & ".join("(%s:* | %s)" % (result, result) for result in results)
exact = " & ".join("%s" % (result,) for result in results)
prefix = " & ".join("%s:*" % (result,) for result in results)
return both, exact, prefix
+
+
+def _parse_words(search_term: str) -> List[str]:
+    """Tokenise a search string into the words it contains.
+
+    When PyICU was importable at module load (``USE_ICU`` is true), word
+    boundaries are found with ICU (International Components for Unicode).
+    Otherwise a regex is used, which works well enough for most Latin-based
+    languages but less well for others.
+
+    Args:
+        search_term: The search string.
+
+    Returns:
+        A list of the words in the search string.
+    """
+    if not USE_ICU:
+        return re.findall(r"([\w\-]+)", search_term, re.UNICODE)
+
+    return _parse_words_with_icu(search_term)
+
+
+def _parse_words_with_icu(search_term: str) -> List[str]:
+    """Split the search string into words at ICU word boundaries.
+
+    ICU (International Components for Unicode) is asked for a word break
+    iterator for its default locale, and each fragment between consecutive
+    boundaries is kept only if it contains a word character or a hyphen.
+
+    Args:
+        search_term: The search string.
+
+    Returns:
+        A list of the words in the search string.
+    """
+    breaker = icu.BreakIterator.createWordInstance(icu.Locale.getDefault())
+    breaker.setText(search_term)
+    words = []
+    start = 0
+    end = breaker.nextBoundary()
+    while end >= 0:
+        # NOTE(review): ICU may report offsets in UTF-16 code units; confirm
+        # this slice is right for characters outside the BMP.
+        fragment = search_term[start:end]
+        # ICU also reports the whitespace and punctuation between words as
+        # fragments; drop those, since they would produce SQL syntax errors
+        # (e.g. "foo bar" would become "foo & & bar"). Note that a fragment
+        # made up only of hyphens still passes this check.
+        if re.search(r"([\w\-]+)", fragment, re.UNICODE):
+            words.append(fragment)
+        start = end
+        end = breaker.nextBoundary()
+    return words
diff --git a/tests/storage/test_user_directory.py b/tests/storage/test_user_directory.py
index 88c7d5fec0..3ba896ecf3 100644
--- a/tests/storage/test_user_directory.py
+++ b/tests/storage/test_user_directory.py
@@ -11,6 +11,7 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
+import re
from typing import Any, Dict, Set, Tuple
from unittest import mock
from unittest.mock import Mock, patch
@@ -30,6 +31,12 @@ from synapse.util import Clock
from tests.test_utils.event_injection import inject_member_event
from tests.unittest import HomeserverTestCase, override_config
+try:
+ import icu
+except ImportError:
+ icu = None # type: ignore
+
+
ALICE = "@alice:a"
BOB = "@bob:b"
BOBBY = "@bobby:a"
@@ -467,3 +474,39 @@ class UserDirectoryStoreTestCase(HomeserverTestCase):
r["results"][0],
{"user_id": BELA, "display_name": "Bela", "avatar_url": None},
)
+
+
+class UserDirectoryICUTestCase(HomeserverTestCase):
+    if not icu:
+        skip = "Requires PyICU"
+
+    def prepare(self, reactor: MemoryReactor, clock: Clock, hs: HomeServer) -> None:
+        self.store = hs.get_datastores().main
+        self.user_dir_helper = GetUserDirectoryTables(self.store)
+
+    def test_icu_word_boundary(self) -> None:
+        """Tests that we correctly detect word boundaries when ICU (International
+        Components for Unicode) support is available.
+        """
+        # Spell the combining accent as an escape so that the literal cannot be
+        # silently normalised into the precomposed U+00E1 by an editor or tool.
+        display_name = "Ga\u0301o"
+
+        # Python's regex splits this name in two because \w does not match the
+        # U+0301 combining acute accent. This is something ICU support fixes.
+        matches = re.findall(r"([\w\-]+)", display_name, re.UNICODE)
+        self.assertEqual(len(matches), 2)
+
+        self.get_success(
+            self.store.update_profile_in_user_dir(ALICE, display_name, None)
+        )
+        self.get_success(self.store.add_users_in_public_rooms("!room:id", (ALICE,)))
+
+        # Check that searching for this user yields the correct result.
+        r = self.get_success(self.store.search_user_dir(BOB, display_name, 10))
+        self.assertFalse(r["limited"])
+        self.assertEqual(len(r["results"]), 1)
+        self.assertDictEqual(
+            r["results"][0],
+            {"user_id": ALICE, "display_name": display_name, "avatar_url": None},
+        )
|