From d8cc86eff484b6f570f55a5badb337080c6e4dcd Mon Sep 17 00:00:00 2001 From: Patrick Cloke Date: Wed, 16 Nov 2022 10:25:24 -0500 Subject: Remove redundant types from comments. (#14412) Remove type hints from comments which have been added as Python type hints. This helps avoid drift between comments and reality, as well as removing redundant information. Also adds some missing type hints which were simple to fill in. --- synapse/storage/databases/main/user_directory.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) (limited to 'synapse/storage/databases/main/user_directory.py') diff --git a/synapse/storage/databases/main/user_directory.py b/synapse/storage/databases/main/user_directory.py index ddb25b5cea..698d6f7515 100644 --- a/synapse/storage/databases/main/user_directory.py +++ b/synapse/storage/databases/main/user_directory.py @@ -185,9 +185,8 @@ class UserDirectoryBackgroundUpdateStore(StateDeltasStore): - who should be in the user_directory. Args: - progress (dict) - batch_size (int): Maximum number of state events to process - per cycle. + progress + batch_size: Maximum number of state events to process per cycle. Returns: number of events processed. @@ -708,10 +707,10 @@ class UserDirectoryStore(UserDirectoryBackgroundUpdateStore): Returns the rooms that a user is in. Args: - user_id(str): Must be a local user + user_id: Must be a local user Returns: - list: user_id + List of room IDs """ rows = await self.db_pool.simple_select_onecol( table="users_who_share_private_rooms", -- cgit 1.5.1 From f792dd74e1e6f64cb15d920d87818f47f17e7848 Mon Sep 17 00:00:00 2001 From: Sean Quah <8349537+squahtx@users.noreply.github.com> Date: Mon, 28 Nov 2022 13:42:06 +0000 Subject: Remove option to skip locking of tables during emulated upserts (#14469) To perform an emulated upsert into a table safely, we must either: * lock the table, * be the only writer upserting into the table * or rely on another unique index being present. When the 2nd or 3rd cases were applicable, we previously avoided locking the table as an optimization. However, as seen in #14406, it is easy to slip up when adding new schema deltas and corrupt the database. The only time we lock when performing emulated upserts is while waiting for background updates on postgres. On sqlite, we do no locking at all. Let's remove the option to skip locking tables, so that we don't shoot ourselves in the foot again. Signed-off-by: Sean Quah --- changelog.d/14469.misc | 1 + synapse/storage/database.py | 56 +++++++--------------- synapse/storage/databases/main/account_data.py | 8 ---- synapse/storage/databases/main/appservice.py | 2 - synapse/storage/databases/main/devices.py | 9 ---- synapse/storage/databases/main/event_federation.py | 1 - synapse/storage/databases/main/pusher.py | 6 --- synapse/storage/databases/main/room.py | 6 --- synapse/storage/databases/main/room_batch.py | 2 - synapse/storage/databases/main/user_directory.py | 2 - 10 files changed, 19 insertions(+), 74 deletions(-) create mode 100644 changelog.d/14469.misc (limited to 'synapse/storage/databases/main/user_directory.py') diff --git a/changelog.d/14469.misc b/changelog.d/14469.misc new file mode 100644 index 0000000000..a12a21e9ae --- /dev/null +++ b/changelog.d/14469.misc @@ -0,0 +1 @@ +Remove option to skip locking of tables when performing emulated upserts, to avoid a class of bugs in future. diff --git a/synapse/storage/database.py b/synapse/storage/database.py index a14b13aec8..55bcb90001 100644 --- a/synapse/storage/database.py +++ b/synapse/storage/database.py @@ -1129,7 +1129,6 @@ class DatabasePool: values: Dict[str, Any], insertion_values: Optional[Dict[str, Any]] = None, desc: str = "simple_upsert", - lock: bool = True, ) -> bool: """Insert a row with values + insertion_values; on conflict, update with values. @@ -1154,21 +1153,12 @@ class DatabasePool: requiring that a unique index exist on the column names used to detect a conflict (i.e. `keyvalues.keys()`). - If there is no such index, we can "emulate" an upsert with a SELECT followed - by either an INSERT or an UPDATE. This is unsafe: we cannot make the same - atomicity guarantees that a native upsert can and are very vulnerable to races - and crashes. Therefore if we wish to upsert without an appropriate unique index, - we must either: - - 1. Acquire a table-level lock before the emulated upsert (`lock=True`), or - 2. VERY CAREFULLY ensure that we are the only thread and worker which will be - writing to this table, in which case we can proceed without a lock - (`lock=False`). - - Generally speaking, you should use `lock=True`. If the table in question has a - unique index[*], this class will use a native upsert (which is atomic and so can - ignore the `lock` argument). Otherwise this class will use an emulated upsert, - in which case we want the safer option unless we been VERY CAREFUL. + If there is no such index yet[*], we can "emulate" an upsert with a SELECT + followed by either an INSERT or an UPDATE. This is unsafe unless *all* upserters + run at the SERIALIZABLE isolation level: we cannot make the same atomicity + guarantees that a native upsert can and are very vulnerable to races and + crashes. Therefore to upsert without an appropriate unique index, we acquire a + table-level lock before the emulated upsert. [*]: Some tables have unique indices added to them in the background. Those tables `T` are keys in the dictionary UNIQUE_INDEX_BACKGROUND_UPDATES, @@ -1189,7 +1179,6 @@ class DatabasePool: values: The nonunique columns and their new values insertion_values: additional key/values to use only when inserting desc: description of the transaction, for logging and metrics - lock: True to lock the table when doing the upsert. Returns: Returns True if a row was inserted or updated (i.e. if `values` is not empty then this always returns True) @@ -1209,7 +1198,6 @@ class DatabasePool: keyvalues, values, insertion_values, - lock=lock, db_autocommit=autocommit, ) except self.engine.module.IntegrityError as e: @@ -1232,7 +1220,6 @@ class DatabasePool: values: Dict[str, Any], insertion_values: Optional[Dict[str, Any]] = None, where_clause: Optional[str] = None, - lock: bool = True, ) -> bool: """ Pick the UPSERT method which works best on the platform. Either the @@ -1245,8 +1232,6 @@ class DatabasePool: values: The nonunique columns and their new values insertion_values: additional key/values to use only when inserting where_clause: An index predicate to apply to the upsert. - lock: True to lock the table when doing the upsert. Unused when performing - a native upsert. Returns: Returns True if a row was inserted or updated (i.e. if `values` is not empty then this always returns True) @@ -1270,7 +1255,6 @@ class DatabasePool: values, insertion_values=insertion_values, where_clause=where_clause, - lock=lock, ) def simple_upsert_txn_emulated( @@ -1291,14 +1275,15 @@ class DatabasePool: insertion_values: additional key/values to use only when inserting where_clause: An index predicate to apply to the upsert. lock: True to lock the table when doing the upsert. + Must not be False unless the table has already been locked. Returns: Returns True if a row was inserted or updated (i.e. if `values` is not empty then this always returns True) """ insertion_values = insertion_values or {} - # We need to lock the table :(, unless we're *really* careful if lock: + # We need to lock the table :( self.engine.lock_table(txn, table) def _getwhere(key: str) -> str: @@ -1406,7 +1391,6 @@ class DatabasePool: value_names: Collection[str], value_values: Collection[Collection[Any]], desc: str, - lock: bool = True, ) -> None: """ Upsert, many times. @@ -1418,8 +1402,6 @@ class DatabasePool: value_names: The value column names value_values: A list of each row's value column values. Ignored if value_names is empty. - lock: True to lock the table when doing the upsert. Unused when performing - a native upsert. """ # We can autocommit if it safe to upsert @@ -1433,7 +1415,6 @@ class DatabasePool: key_values, value_names, value_values, - lock=lock, db_autocommit=autocommit, ) @@ -1445,7 +1426,6 @@ class DatabasePool: key_values: Collection[Iterable[Any]], value_names: Collection[str], value_values: Iterable[Iterable[Any]], - lock: bool = True, ) -> None: """ Upsert, many times. @@ -1457,8 +1437,6 @@ class DatabasePool: value_names: The value column names value_values: A list of each row's value column values. Ignored if value_names is empty. - lock: True to lock the table when doing the upsert. Unused when performing - a native upsert. """ if table not in self._unsafe_to_upsert_tables: return self.simple_upsert_many_txn_native_upsert( @@ -1466,7 +1444,12 @@ class DatabasePool: ) else: return self.simple_upsert_many_txn_emulated( - txn, table, key_names, key_values, value_names, value_values, lock=lock + txn, + table, + key_names, + key_values, + value_names, + value_values, ) def simple_upsert_many_txn_emulated( @@ -1477,7 +1460,6 @@ class DatabasePool: key_values: Collection[Iterable[Any]], value_names: Collection[str], value_values: Iterable[Iterable[Any]], - lock: bool = True, ) -> None: """ Upsert, many times, but without native UPSERT support or batching. @@ -1489,18 +1471,16 @@ class DatabasePool: value_names: The value column names value_values: A list of each row's value column values. Ignored if value_names is empty. - lock: True to lock the table when doing the upsert. """ # No value columns, therefore make a blank list so that the following # zip() works correctly. if not value_names: value_values = [() for x in range(len(key_values))] - if lock: - # Lock the table just once, to prevent it being done once per row. - # Note that, according to Postgres' documentation, once obtained, - # the lock is held for the remainder of the current transaction. - self.engine.lock_table(txn, "user_ips") + # Lock the table just once, to prevent it being done once per row. + # Note that, according to Postgres' documentation, once obtained, + # the lock is held for the remainder of the current transaction. + self.engine.lock_table(txn, "user_ips") for keyv, valv in zip(key_values, value_values): _keys = {x: y for x, y in zip(key_names, keyv)} diff --git a/synapse/storage/databases/main/account_data.py b/synapse/storage/databases/main/account_data.py index 282687ebce..07908c41d9 100644 --- a/synapse/storage/databases/main/account_data.py +++ b/synapse/storage/databases/main/account_data.py @@ -449,9 +449,6 @@ class AccountDataWorkerStore(PushRulesWorkerStore, CacheInvalidationWorkerStore) content_json = json_encoder.encode(content) async with self._account_data_id_gen.get_next() as next_id: - # no need to lock here as room_account_data has a unique constraint - # on (user_id, room_id, account_data_type) so simple_upsert will - # retry if there is a conflict. await self.db_pool.simple_upsert( desc="add_room_account_data", table="room_account_data", @@ -461,7 +458,6 @@ class AccountDataWorkerStore(PushRulesWorkerStore, CacheInvalidationWorkerStore) "account_data_type": account_data_type, }, values={"stream_id": next_id, "content": content_json}, - lock=False, ) self._account_data_stream_cache.entity_has_changed(user_id, next_id) @@ -517,15 +513,11 @@ class AccountDataWorkerStore(PushRulesWorkerStore, CacheInvalidationWorkerStore) ) -> None: content_json = json_encoder.encode(content) - # no need to lock here as account_data has a unique constraint on - # (user_id, account_data_type) so simple_upsert will retry if - # there is a conflict. self.db_pool.simple_upsert_txn( txn, table="account_data", keyvalues={"user_id": user_id, "account_data_type": account_data_type}, values={"stream_id": next_id, "content": content_json}, - lock=False, ) # Ignored users get denormalized into a separate table as an optimisation. diff --git a/synapse/storage/databases/main/appservice.py b/synapse/storage/databases/main/appservice.py index 63046c0527..25da0c56c5 100644 --- a/synapse/storage/databases/main/appservice.py +++ b/synapse/storage/databases/main/appservice.py @@ -451,8 +451,6 @@ class ApplicationServiceTransactionWorkerStore( table="application_services_state", keyvalues={"as_id": service.id}, values={f"{stream_type}_stream_id": pos}, - # no need to lock when emulating upsert: as_id is a unique key - lock=False, desc="set_appservice_stream_type_pos", ) diff --git a/synapse/storage/databases/main/devices.py b/synapse/storage/databases/main/devices.py index 05a193f889..534f7fc04a 100644 --- a/synapse/storage/databases/main/devices.py +++ b/synapse/storage/databases/main/devices.py @@ -1744,9 +1744,6 @@ class DeviceStore(DeviceWorkerStore, DeviceBackgroundUpdateStore): table="device_lists_remote_cache", keyvalues={"user_id": user_id, "device_id": device_id}, values={"content": json_encoder.encode(content)}, - # we don't need to lock, because we assume we are the only thread - # updating this user's devices. - lock=False, ) txn.call_after(self._get_cached_user_device.invalidate, (user_id, device_id)) @@ -1760,9 +1757,6 @@ class DeviceStore(DeviceWorkerStore, DeviceBackgroundUpdateStore): table="device_lists_remote_extremeties", keyvalues={"user_id": user_id}, values={"stream_id": stream_id}, - # again, we can assume we are the only thread updating this user's - # extremity. - lock=False, ) async def update_remote_device_list_cache( @@ -1815,9 +1809,6 @@ class DeviceStore(DeviceWorkerStore, DeviceBackgroundUpdateStore): table="device_lists_remote_extremeties", keyvalues={"user_id": user_id}, values={"stream_id": stream_id}, - # we don't need to lock, because we can assume we are the only thread - # updating this user's extremity. - lock=False, ) async def add_device_change_to_streams( diff --git a/synapse/storage/databases/main/event_federation.py b/synapse/storage/databases/main/event_federation.py index 309a4ba664..bbee02ab18 100644 --- a/synapse/storage/databases/main/event_federation.py +++ b/synapse/storage/databases/main/event_federation.py @@ -1686,7 +1686,6 @@ class EventFederationWorkerStore(SignatureWorkerStore, EventsWorkerStore, SQLBas }, insertion_values={}, desc="insert_insertion_extremity", - lock=False, ) async def insert_received_event_to_staging( diff --git a/synapse/storage/databases/main/pusher.py b/synapse/storage/databases/main/pusher.py index fee37b9ce4..40fd781a6a 100644 --- a/synapse/storage/databases/main/pusher.py +++ b/synapse/storage/databases/main/pusher.py @@ -325,14 +325,11 @@ class PusherWorkerStore(SQLBaseStore): async def set_throttle_params( self, pusher_id: str, room_id: str, params: ThrottleParams ) -> None: - # no need to lock because `pusher_throttle` has a primary key on - # (pusher, room_id) so simple_upsert will retry await self.db_pool.simple_upsert( "pusher_throttle", {"pusher": pusher_id, "room_id": room_id}, {"last_sent_ts": params.last_sent_ts, "throttle_ms": params.throttle_ms}, desc="set_throttle_params", - lock=False, ) async def _remove_deactivated_pushers(self, progress: dict, batch_size: int) -> int: @@ -589,8 +586,6 @@ class PusherStore(PusherWorkerStore, PusherBackgroundUpdatesStore): device_id: Optional[str] = None, ) -> None: async with self._pushers_id_gen.get_next() as stream_id: - # no need to lock because `pushers` has a unique key on - # (app_id, pushkey, user_name) so simple_upsert will retry await self.db_pool.simple_upsert( table="pushers", keyvalues={"app_id": app_id, "pushkey": pushkey, "user_name": user_id}, @@ -609,7 +604,6 @@ class PusherStore(PusherWorkerStore, PusherBackgroundUpdatesStore): "device_id": device_id, }, desc="add_pusher", - lock=False, ) user_has_pusher = self.get_if_user_has_pusher.cache.get_immediate( diff --git a/synapse/storage/databases/main/room.py b/synapse/storage/databases/main/room.py index 52ad947c6c..1309bfd374 100644 --- a/synapse/storage/databases/main/room.py +++ b/synapse/storage/databases/main/room.py @@ -1847,9 +1847,6 @@ class RoomStore(RoomBackgroundUpdateStore, RoomWorkerStore): "creator": room_creator, "has_auth_chain_index": has_auth_chain_index, }, - # rooms has a unique constraint on room_id, so no need to lock when doing an - # emulated upsert. - lock=False, ) async def store_partial_state_room( @@ -1970,9 +1967,6 @@ class RoomStore(RoomBackgroundUpdateStore, RoomWorkerStore): "creator": "", "has_auth_chain_index": has_auth_chain_index, }, - # rooms has a unique constraint on room_id, so no need to lock when doing an - # emulated upsert. - lock=False, ) async def set_room_is_public(self, room_id: str, is_public: bool) -> None: diff --git a/synapse/storage/databases/main/room_batch.py b/synapse/storage/databases/main/room_batch.py index 39e80f6f5b..131f357d04 100644 --- a/synapse/storage/databases/main/room_batch.py +++ b/synapse/storage/databases/main/room_batch.py @@ -44,6 +44,4 @@ class RoomBatchStore(SQLBaseStore): table="event_to_state_groups", keyvalues={"event_id": event_id}, values={"state_group": state_group_id, "event_id": event_id}, - # Unique constraint on event_id so we don't have to lock - lock=False, ) diff --git a/synapse/storage/databases/main/user_directory.py b/synapse/storage/databases/main/user_directory.py index 698d6f7515..044435deab 100644 --- a/synapse/storage/databases/main/user_directory.py +++ b/synapse/storage/databases/main/user_directory.py @@ -481,7 +481,6 @@ class UserDirectoryBackgroundUpdateStore(StateDeltasStore): table="user_directory", keyvalues={"user_id": user_id}, values={"display_name": display_name, "avatar_url": avatar_url}, - lock=False, # We're only inserter ) if isinstance(self.database_engine, PostgresEngine): @@ -511,7 +510,6 @@ class UserDirectoryBackgroundUpdateStore(StateDeltasStore): table="user_directory_search", keyvalues={"user_id": user_id}, values={"value": value}, - lock=False, # We're only inserter ) else: # This should be unreachable. -- cgit 1.5.1 From cf1059d045640485a5a0b1e3d945b796b0e6f228 Mon Sep 17 00:00:00 2001 From: reivilibre Date: Wed, 7 Dec 2022 11:19:43 +0000 Subject: Fix a long-standing bug where the user directory would return 1 more row than requested. (#14631) --- changelog.d/14631.bugfix | 1 + synapse/rest/client/user_directory.py | 4 ++-- synapse/storage/databases/main/user_directory.py | 2 +- tests/storage/test_user_directory.py | 6 ++++++ 4 files changed, 10 insertions(+), 3 deletions(-) create mode 100644 changelog.d/14631.bugfix (limited to 'synapse/storage/databases/main/user_directory.py') diff --git a/changelog.d/14631.bugfix b/changelog.d/14631.bugfix new file mode 100644 index 0000000000..c5376bab9f --- /dev/null +++ b/changelog.d/14631.bugfix @@ -0,0 +1 @@ +Fix a long-standing bug where the user directory would return 1 more row than requested. \ No newline at end of file diff --git a/synapse/rest/client/user_directory.py b/synapse/rest/client/user_directory.py index 116c982ce6..4670fad608 100644 --- a/synapse/rest/client/user_directory.py +++ b/synapse/rest/client/user_directory.py @@ -63,8 +63,8 @@ class UserDirectorySearchRestServlet(RestServlet): body = parse_json_object_from_request(request) - limit = body.get("limit", 10) - limit = min(limit, 50) + limit = int(body.get("limit", 10)) + limit = max(min(limit, 50), 0) try: search_term = body["search_term"] diff --git a/synapse/storage/databases/main/user_directory.py b/synapse/storage/databases/main/user_directory.py index 044435deab..af9952f513 100644 --- a/synapse/storage/databases/main/user_directory.py +++ b/synapse/storage/databases/main/user_directory.py @@ -886,7 +886,7 @@ class UserDirectoryStore(UserDirectoryBackgroundUpdateStore): limited = len(results) > limit - return {"limited": limited, "results": results} + return {"limited": limited, "results": results[0:limit]} def _parse_query_sqlite(search_term: str) -> str: diff --git a/tests/storage/test_user_directory.py b/tests/storage/test_user_directory.py index 5b60cf5285..88c7d5fec0 100644 --- a/tests/storage/test_user_directory.py +++ b/tests/storage/test_user_directory.py @@ -448,6 +448,12 @@ class UserDirectoryStoreTestCase(HomeserverTestCase): {"user_id": BOBBY, "display_name": "bobby", "avatar_url": None}, ) + @override_config({"user_directory": {"search_all_users": True}}) + def test_search_user_limit_correct(self) -> None: + r = self.get_success(self.store.search_user_dir(ALICE, "bob", 1)) + self.assertTrue(r["limited"]) + self.assertEqual(1, len(r["results"])) + @override_config({"user_directory": {"search_all_users": True}}) def test_search_user_dir_stop_words(self) -> None: """Tests that a user can look up another user by searching for the start if its -- cgit 1.5.1 From 2a3cd59dd06411a79fb7500970db1b98f0d87695 Mon Sep 17 00:00:00 2001 From: Brendan Abolivier Date: Mon, 12 Dec 2022 13:21:17 +0100 Subject: Add optional ICU support for user search (#14464) Fixes #13655 This change uses ICU (International Components for Unicode) to improve boundary detection in user search. This change also adds a new dependency on libicu-dev and pkg-config for the Debian packages, which are available in all supported distros. --- changelog.d/14464.feature | 1 + debian/changelog | 7 +++ debian/control | 2 + docker/Dockerfile | 2 + docker/Dockerfile-dhvirtualenv | 2 + poetry.lock | 16 +++++- pyproject.toml | 7 +++ stubs/icu.pyi | 25 +++++++++ synapse/storage/databases/main/user_directory.py | 67 ++++++++++++++++++++++-- tests/storage/test_user_directory.py | 43 +++++++++++++++ 10 files changed, 166 insertions(+), 6 deletions(-) create mode 100644 changelog.d/14464.feature create mode 100644 stubs/icu.pyi (limited to 'synapse/storage/databases/main/user_directory.py') diff --git a/changelog.d/14464.feature b/changelog.d/14464.feature new file mode 100644 index 0000000000..688ea32117 --- /dev/null +++ b/changelog.d/14464.feature @@ -0,0 +1 @@ +Improve user search for international display names. diff --git a/debian/changelog b/debian/changelog index 163b7210bf..5d3c4f7d6b 100644 --- a/debian/changelog +++ b/debian/changelog @@ -1,3 +1,10 @@ +matrix-synapse-py3 (1.74.0~rc1) UNRELEASED; urgency=medium + + * New dependency on libicu-dev to provide improved results for user + search. + + -- Synapse Packaging team Tue, 06 Dec 2022 15:28:10 +0000 + matrix-synapse-py3 (1.73.0) stable; urgency=medium * New Synapse release 1.73.0. diff --git a/debian/control b/debian/control index 86f5a66d02..bc628cec08 100644 --- a/debian/control +++ b/debian/control @@ -8,6 +8,8 @@ Build-Depends: dh-virtualenv (>= 1.1), libsystemd-dev, libpq-dev, + libicu-dev, + pkg-config, lsb-release, python3-dev, python3, diff --git a/docker/Dockerfile b/docker/Dockerfile index 185d5bc3d4..7e5123210a 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -97,6 +97,8 @@ RUN \ zlib1g-dev \ git \ curl \ + libicu-dev \ + pkg-config \ && rm -rf /var/lib/apt/lists/* diff --git a/docker/Dockerfile-dhvirtualenv b/docker/Dockerfile-dhvirtualenv index 73165f6f85..f3b5b00ce6 100644 --- a/docker/Dockerfile-dhvirtualenv +++ b/docker/Dockerfile-dhvirtualenv @@ -84,6 +84,8 @@ RUN apt-get update -qq -o Acquire::Languages=none \ python3-venv \ sqlite3 \ libpq-dev \ + libicu-dev \ + pkg-config \ xmlsec1 # Install rust and ensure it's in the PATH diff --git a/poetry.lock b/poetry.lock index cac22e2ef0..ccda8a23fb 100644 --- a/poetry.lock +++ b/poetry.lock @@ -837,6 +837,14 @@ category = "dev" optional = false python-versions = ">=3.5" +[[package]] +name = "pyicu" +version = "2.10.2" +description = "Python extension wrapping the ICU C++ API" +category = "main" +optional = true +python-versions = "*" + [[package]] name = "pyjwt" version = "2.4.0" @@ -1622,7 +1630,7 @@ docs = ["Sphinx", "repoze.sphinx.autointerface"] test = ["zope.i18nmessageid", "zope.testing", "zope.testrunner"] [extras] -all = ["matrix-synapse-ldap3", "psycopg2", "psycopg2cffi", "psycopg2cffi-compat", "pysaml2", "authlib", "lxml", "sentry-sdk", "jaeger-client", "opentracing", "txredisapi", "hiredis", "Pympler"] +all = ["matrix-synapse-ldap3", "psycopg2", "psycopg2cffi", "psycopg2cffi-compat", "pysaml2", "authlib", "lxml", "sentry-sdk", "jaeger-client", "opentracing", "txredisapi", "hiredis", "Pympler", "pyicu"] cache-memory = ["Pympler"] jwt = ["authlib"] matrix-synapse-ldap3 = ["matrix-synapse-ldap3"] @@ -1635,11 +1643,12 @@ sentry = ["sentry-sdk"] systemd = ["systemd-python"] test = ["parameterized", "idna"] url-preview = ["lxml"] +user-search = ["pyicu"] [metadata] lock-version = "1.1" python-versions = "^3.7.1" -content-hash = "8c44ceeb9df5c3ab43040400e0a6b895de49417e61293a1ba027640b34f03263" +content-hash = "f20007013f33bc35a01e412c48adc62a936030f3074e06286674c5ad7f44d300" [metadata.files] attrs = [ @@ -2427,6 +2436,9 @@ pygments = [ {file = "Pygments-2.11.2-py3-none-any.whl", hash = "sha256:44238f1b60a76d78fc8ca0528ee429702aae011c265fe6a8dd8b63049ae41c65"}, {file = "Pygments-2.11.2.tar.gz", hash = "sha256:4e426f72023d88d03b2fa258de560726ce890ff3b630f88c21cbb8b2503b8c6a"}, ] +pyicu = [ + {file = "PyICU-2.10.2.tar.gz", hash = "sha256:0c3309eea7fab6857507ace62403515b60fe096cbfb4f90d14f55ff75c5441c1"}, +] pyjwt = [ {file = "PyJWT-2.4.0-py3-none-any.whl", hash = "sha256:72d1d253f32dbd4f5c88eaf1fdc62f3a19f676ccbadb9dbc5d07e951b2b26daf"}, {file = "PyJWT-2.4.0.tar.gz", hash = "sha256:d42908208c699b3b973cbeb01a969ba6a96c821eefb1c5bfe4c390c01d67abba"}, diff --git a/pyproject.toml b/pyproject.toml index df59fa0562..bb383683cc 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -208,6 +208,7 @@ hiredis = { version = "*", optional = true } Pympler = { version = "*", optional = true } parameterized = { version = ">=0.7.4", optional = true } idna = { version = ">=2.5", optional = true } +pyicu = { version = ">=2.10.2", optional = true } [tool.poetry.extras] # NB: Packages that should be part of `pip install matrix-synapse[all]` need to be specified @@ -230,6 +231,10 @@ redis = ["txredisapi", "hiredis"] # Required to use experimental `caches.track_memory_usage` config option. cache-memory = ["pympler"] test = ["parameterized", "idna"] +# Allows for better search for international characters in the user directory. This +# requires libicu's development headers installed on the system (e.g. libicu-dev on +# Debian-based distributions). +user-search = ["pyicu"] # The duplication here is awful. I hate hate hate hate hate it. However, for now I want # to ensure you can still `pip install matrix-synapse[all]` like today. Two motivations: @@ -261,6 +266,8 @@ all = [ "txredisapi", "hiredis", # cache-memory "pympler", + # improved user search + "pyicu", # omitted: # - test: it's useful to have this separate from dev deps in the olddeps job # - systemd: this is a system-based requirement diff --git a/stubs/icu.pyi b/stubs/icu.pyi new file mode 100644 index 0000000000..efeda7938a --- /dev/null +++ b/stubs/icu.pyi @@ -0,0 +1,25 @@ +# Copyright 2022 The Matrix.org Foundation C.I.C. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Stub for PyICU. + +class Locale: + @staticmethod + def getDefault() -> Locale: ... + +class BreakIterator: + @staticmethod + def createWordInstance(locale: Locale) -> BreakIterator: ... + def setText(self, text: str) -> None: ... + def nextBoundary(self) -> int: ... diff --git a/synapse/storage/databases/main/user_directory.py b/synapse/storage/databases/main/user_directory.py index af9952f513..14ef5b040d 100644 --- a/synapse/storage/databases/main/user_directory.py +++ b/synapse/storage/databases/main/user_directory.py @@ -26,6 +26,14 @@ from typing import ( cast, ) +try: + # Figure out if ICU support is available for searching users. + import icu + + USE_ICU = True +except ModuleNotFoundError: + USE_ICU = False + from typing_extensions import TypedDict from synapse.api.errors import StoreError @@ -900,7 +908,7 @@ def _parse_query_sqlite(search_term: str) -> str: """ # Pull out the individual words, discarding any non-word characters. - results = re.findall(r"([\w\-]+)", search_term, re.UNICODE) + results = _parse_words(search_term) return " & ".join("(%s* OR %s)" % (result, result) for result in results) @@ -910,12 +918,63 @@ def _parse_query_postgres(search_term: str) -> Tuple[str, str, str]: We use this so that we can add prefix matching, which isn't something that is supported by default. """ - - # Pull out the individual words, discarding any non-word characters. - results = re.findall(r"([\w\-]+)", search_term, re.UNICODE) + results = _parse_words(search_term) both = " & ".join("(%s:* | %s)" % (result, result) for result in results) exact = " & ".join("%s" % (result,) for result in results) prefix = " & ".join("%s:*" % (result,) for result in results) return both, exact, prefix + + +def _parse_words(search_term: str) -> List[str]: + """Split the provided search string into a list of its words. + + If support for ICU (International Components for Unicode) is available, use it. + Otherwise, fall back to using a regex to detect word boundaries. This latter + solution works well enough for most latin-based languages, but doesn't work as well + with other languages. + + Args: + search_term: The search string. + + Returns: + A list of the words in the search string. + """ + if USE_ICU: + return _parse_words_with_icu(search_term) + + return re.findall(r"([\w\-]+)", search_term, re.UNICODE) + + +def _parse_words_with_icu(search_term: str) -> List[str]: + """Break down the provided search string into its individual words using ICU + (International Components for Unicode). + + Args: + search_term: The search string. + + Returns: + A list of the words in the search string. + """ + results = [] + breaker = icu.BreakIterator.createWordInstance(icu.Locale.getDefault()) + breaker.setText(search_term) + i = 0 + while True: + j = breaker.nextBoundary() + if j < 0: + break + + result = search_term[i:j] + + # libicu considers spaces and punctuation between words as words, but we don't + # want to include those in results as they would result in syntax errors in SQL + # queries (e.g. "foo bar" would result in the search query including "foo & & + # bar"). + if len(re.findall(r"([\w\-]+)", result, re.UNICODE)): + results.append(result) + + i = j + + return results diff --git a/tests/storage/test_user_directory.py b/tests/storage/test_user_directory.py index 88c7d5fec0..3ba896ecf3 100644 --- a/tests/storage/test_user_directory.py +++ b/tests/storage/test_user_directory.py @@ -11,6 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +import re from typing import Any, Dict, Set, Tuple from unittest import mock from unittest.mock import Mock, patch @@ -30,6 +31,12 @@ from synapse.util import Clock from tests.test_utils.event_injection import inject_member_event from tests.unittest import HomeserverTestCase, override_config +try: + import icu +except ImportError: + icu = None # type: ignore + + ALICE = "@alice:a" BOB = "@bob:b" BOBBY = "@bobby:a" @@ -467,3 +474,39 @@ class UserDirectoryStoreTestCase(HomeserverTestCase): r["results"][0], {"user_id": BELA, "display_name": "Bela", "avatar_url": None}, ) + + +class UserDirectoryICUTestCase(HomeserverTestCase): + if not icu: + skip = "Requires PyICU" + + def prepare(self, reactor: MemoryReactor, clock: Clock, hs: HomeServer) -> None: + self.store = hs.get_datastores().main + self.user_dir_helper = GetUserDirectoryTables(self.store) + + def test_icu_word_boundary(self) -> None: + """Tests that we correctly detect word boundaries when ICU (International + Components for Unicode) support is available. + """ + + display_name = "Gáo" + + # This word is not broken down correctly by Python's regular expressions, + # likely because á is actually a lowercase a followed by a U+0301 combining + # acute accent. This is specifically something that ICU support fixes. + matches = re.findall(r"([\w\-]+)", display_name, re.UNICODE) + self.assertEqual(len(matches), 2) + + self.get_success( + self.store.update_profile_in_user_dir(ALICE, display_name, None) + ) + self.get_success(self.store.add_users_in_public_rooms("!room:id", (ALICE,))) + + # Check that searching for this user yields the correct result. + r = self.get_success(self.store.search_user_dir(BOB, display_name, 10)) + self.assertFalse(r["limited"]) + self.assertEqual(len(r["results"]), 1) + self.assertDictEqual( + r["results"][0], + {"user_id": ALICE, "display_name": display_name, "avatar_url": None}, + ) -- cgit 1.5.1