From 5a0b652d36ae4b6d423498c1f2c82c97a49c6f75 Mon Sep 17 00:00:00 2001 From: Sean Quah <8349537+squahtx@users.noreply.github.com> Date: Tue, 30 Nov 2021 15:39:07 +0000 Subject: Eliminate a few `Any`s in `LruCache` type hints (#11453) --- synapse/util/caches/deferred_cache.py | 9 ++++++++- synapse/util/caches/lrucache.py | 37 ++++++++++++++++++++--------------- synapse/util/linked_list.py | 4 ++-- 3 files changed, 31 insertions(+), 19 deletions(-) (limited to 'synapse/util') diff --git a/synapse/util/caches/deferred_cache.py b/synapse/util/caches/deferred_cache.py index 3c4cc093af..377c9a282a 100644 --- a/synapse/util/caches/deferred_cache.py +++ b/synapse/util/caches/deferred_cache.py @@ -22,6 +22,7 @@ from typing import ( Iterable, MutableMapping, Optional, + Sized, TypeVar, Union, cast, @@ -104,7 +105,13 @@ class DeferredCache(Generic[KT, VT]): max_size=max_entries, cache_name=name, cache_type=cache_type, - size_callback=(lambda d: len(d) or 1) if iterable else None, + size_callback=( + (lambda d: len(cast(Sized, d)) or 1) + # Argument 1 to "len" has incompatible type "VT"; expected "Sized" + # We trust that `VT` is `Sized` when `iterable` is `True` + if iterable + else None + ), metrics_collection_callback=metrics_cb, apply_cache_factor_from_config=apply_cache_factor_from_config, prune_unread_entries=prune_unread_entries, diff --git a/synapse/util/caches/lrucache.py b/synapse/util/caches/lrucache.py index a0a7a9de32..05c4dcb062 100644 --- a/synapse/util/caches/lrucache.py +++ b/synapse/util/caches/lrucache.py @@ -15,14 +15,15 @@ import logging import threading import weakref +from enum import Enum from functools import wraps from typing import ( TYPE_CHECKING, Any, Callable, Collection, + Dict, Generic, - Iterable, List, Optional, Type, @@ -190,7 +191,7 @@ class _Node(Generic[KT, VT]): root: "ListNode[_Node]", key: KT, value: VT, - cache: "weakref.ReferenceType[LruCache]", + cache: "weakref.ReferenceType[LruCache[KT, VT]]", clock: Clock, callbacks: 
Collection[Callable[[], None]] = (), prune_unread_entries: bool = True, @@ -290,6 +291,12 @@ class _Node(Generic[KT, VT]): self._global_list_node.update_last_access(clock) +class _Sentinel(Enum): + # defining a sentinel in this way allows mypy to correctly handle the + # type of a dictionary lookup. + sentinel = object() + + class LruCache(Generic[KT, VT]): """ Least-recently-used cache, supporting prometheus metrics and invalidation callbacks. @@ -302,7 +309,7 @@ class LruCache(Generic[KT, VT]): max_size: int, cache_name: Optional[str] = None, cache_type: Type[Union[dict, TreeCache]] = dict, - size_callback: Optional[Callable] = None, + size_callback: Optional[Callable[[VT], int]] = None, metrics_collection_callback: Optional[Callable[[], None]] = None, apply_cache_factor_from_config: bool = True, clock: Optional[Clock] = None, @@ -339,7 +346,7 @@ class LruCache(Generic[KT, VT]): else: real_clock = clock - cache = cache_type() + cache: Union[Dict[KT, _Node[KT, VT]], TreeCache] = cache_type() self.cache = cache # Used for introspection. self.apply_cache_factor_from_config = apply_cache_factor_from_config @@ -374,7 +381,7 @@ class LruCache(Generic[KT, VT]): # creating more each time we create a `_Node`. 
weak_ref_to_self = weakref.ref(self) - list_root = ListNode[_Node].create_root_node() + list_root = ListNode[_Node[KT, VT]].create_root_node() lock = threading.Lock() @@ -422,7 +429,7 @@ class LruCache(Generic[KT, VT]): def add_node( key: KT, value: VT, callbacks: Collection[Callable[[], None]] = () ) -> None: - node = _Node( + node: _Node[KT, VT] = _Node( list_root, key, value, @@ -439,10 +446,10 @@ class LruCache(Generic[KT, VT]): if caches.TRACK_MEMORY_USAGE and metrics: metrics.inc_memory_usage(node.memory) - def move_node_to_front(node: _Node) -> None: + def move_node_to_front(node: _Node[KT, VT]) -> None: node.move_to_front(real_clock, list_root) - def delete_node(node: _Node) -> int: + def delete_node(node: _Node[KT, VT]) -> int: node.drop_from_lists() deleted_len = 1 @@ -496,7 +503,7 @@ class LruCache(Generic[KT, VT]): @synchronized def cache_set( - key: KT, value: VT, callbacks: Iterable[Callable[[], None]] = () + key: KT, value: VT, callbacks: Collection[Callable[[], None]] = () ) -> None: node = cache.get(key, None) if node is not None: @@ -590,8 +597,6 @@ class LruCache(Generic[KT, VT]): def cache_contains(key: KT) -> bool: return key in cache - self.sentinel = object() - # make sure that we clear out any excess entries after we get resized. 
self._on_resize = evict @@ -608,18 +613,18 @@ class LruCache(Generic[KT, VT]): self.clear = cache_clear def __getitem__(self, key: KT) -> VT: - result = self.get(key, self.sentinel) - if result is self.sentinel: + result = self.get(key, _Sentinel.sentinel) + if result is _Sentinel.sentinel: raise KeyError() else: - return cast(VT, result) + return result def __setitem__(self, key: KT, value: VT) -> None: self.set(key, value) def __delitem__(self, key: KT, value: VT) -> None: - result = self.pop(key, self.sentinel) - if result is self.sentinel: + result = self.pop(key, _Sentinel.sentinel) + if result is _Sentinel.sentinel: raise KeyError() def __len__(self) -> int: diff --git a/synapse/util/linked_list.py b/synapse/util/linked_list.py index 9f4be757ba..8efbf061aa 100644 --- a/synapse/util/linked_list.py +++ b/synapse/util/linked_list.py @@ -84,7 +84,7 @@ class ListNode(Generic[P]): # immediately rather than at the next GC. self.cache_entry = None - def move_after(self, node: "ListNode") -> None: + def move_after(self, node: "ListNode[P]") -> None: """Move this node from its current location in the list to after the given node. """ @@ -122,7 +122,7 @@ class ListNode(Generic[P]): self.prev_node = None self.next_node = None - def _refs_insert_after(self, node: "ListNode") -> None: + def _refs_insert_after(self, node: "ListNode[P]") -> None: """Internal method to insert the node after the given node.""" # This method should only be called when we're not already in the list. -- cgit 1.5.1 From 7ff22d6da41cd5ca80db95c18b409aea38e49fcd Mon Sep 17 00:00:00 2001 From: Sean Quah <8349537+squahtx@users.noreply.github.com> Date: Tue, 30 Nov 2021 16:28:02 +0000 Subject: Fix `LruCache` corruption bug with a `size_callback` that can return 0 (#11454) When all entries in an `LruCache` have a size of 0 according to the provided `size_callback`, and `drop_from_cache` is called on a cache node, the node would be unlinked from the LRU linked list but remain in the cache dictionary. 
An assertion would later be tripped due to the inconsistency. Avoid unintentionally calling `__len__` and use a strict `is None` check instead when unwrapping the weak reference. --- changelog.d/11454.bugfix | 1 + synapse/util/caches/lrucache.py | 5 ++++- tests/util/test_lrucache.py | 12 ++++++++++++ 3 files changed, 17 insertions(+), 1 deletion(-) create mode 100644 changelog.d/11454.bugfix (limited to 'synapse/util') diff --git a/changelog.d/11454.bugfix b/changelog.d/11454.bugfix new file mode 100644 index 0000000000..096265cbc9 --- /dev/null +++ b/changelog.d/11454.bugfix @@ -0,0 +1 @@ +Fix an `LruCache` corruption bug, introduced in 1.38.0, that would cause certain requests to fail until the next Synapse restart. diff --git a/synapse/util/caches/lrucache.py b/synapse/util/caches/lrucache.py index 05c4dcb062..eb96f7e665 100644 --- a/synapse/util/caches/lrucache.py +++ b/synapse/util/caches/lrucache.py @@ -271,7 +271,10 @@ class _Node(Generic[KT, VT]): removed from all lists. """ cache = self._cache() - if not cache or not cache.pop(self.key, None): + if ( + cache is None + or cache.pop(self.key, _Sentinel.sentinel) is _Sentinel.sentinel + ): # `cache.pop` should call `drop_from_lists()`, unless this Node had # already been removed from the cache. self.drop_from_lists() diff --git a/tests/util/test_lrucache.py b/tests/util/test_lrucache.py index 6578f3411e..291644eb7d 100644 --- a/tests/util/test_lrucache.py +++ b/tests/util/test_lrucache.py @@ -13,6 +13,7 @@ # limitations under the License. 
+from typing import List from unittest.mock import Mock from synapse.util.caches.lrucache import LruCache, setup_expire_lru_cache_entries @@ -261,6 +262,17 @@ class LruCacheSizedTestCase(unittest.HomeserverTestCase): self.assertEquals(cache["key4"], [4]) self.assertEquals(cache["key5"], [5, 6]) + def test_zero_size_drop_from_cache(self) -> None: + """Test that `drop_from_cache` works correctly with 0-sized entries.""" + cache: LruCache[str, List[int]] = LruCache(5, size_callback=lambda x: 0) + cache["key1"] = [] + + self.assertEqual(len(cache), 0) + cache.cache["key1"].drop_from_cache() + self.assertIsNone( + cache.pop("key1"), "Cache entry should have been evicted but wasn't" + ) + class TimeEvictionTestCase(unittest.HomeserverTestCase): """Test that time based eviction works correctly.""" -- cgit 1.5.1 From 7b62791e001d6a4f8897ed48b3232d7f8fe6aa48 Mon Sep 17 00:00:00 2001 From: Patrick Cloke Date: Wed, 1 Dec 2021 12:43:32 -0500 Subject: Clean-up get_version_string (#11468) --- changelog.d/11468.misc | 1 + synapse/util/versionstring.py | 82 +++++++++++++------------------------------ 2 files changed, 26 insertions(+), 57 deletions(-) create mode 100644 changelog.d/11468.misc (limited to 'synapse/util') diff --git a/changelog.d/11468.misc b/changelog.d/11468.misc new file mode 100644 index 0000000000..6fc0b5bcab --- /dev/null +++ b/changelog.d/11468.misc @@ -0,0 +1 @@ +Refactor `get_version_string` to fix-up types and duplicated code. diff --git a/synapse/util/versionstring.py b/synapse/util/versionstring.py index 899ee0adc8..c144ff62c1 100644 --- a/synapse/util/versionstring.py +++ b/synapse/util/versionstring.py @@ -1,4 +1,5 @@ # Copyright 2016 OpenMarket Ltd +# Copyright 2021 The Matrix.org Foundation C.I.C. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
@@ -29,10 +30,11 @@ def get_version_string(module: ModuleType) -> str: If called on a module not in a git checkout will return `__version__`. Args: - module (module) + module: The module to check the version of. Must declare a __version__ + attribute. Returns: - str + The module version (as a string). """ cached_version = version_cache.get(module) @@ -44,71 +46,37 @@ def get_version_string(module: ModuleType) -> str: version_string = module.__version__ # type: ignore[attr-defined] try: - null = open(os.devnull, "w") cwd = os.path.dirname(os.path.abspath(module.__file__)) - try: - git_branch = ( - subprocess.check_output( - ["git", "rev-parse", "--abbrev-ref", "HEAD"], stderr=null, cwd=cwd + def _run_git_command(prefix: str, *params: str) -> str: + try: + result = ( + subprocess.check_output( + ["git", *params], stderr=subprocess.DEVNULL, cwd=cwd + ) + .strip() + .decode("ascii") ) - .strip() - .decode("ascii") - ) - git_branch = "b=" + git_branch - except (subprocess.CalledProcessError, FileNotFoundError): - # FileNotFoundError can arise when git is not installed - git_branch = "" - - try: - git_tag = ( - subprocess.check_output( - ["git", "describe", "--exact-match"], stderr=null, cwd=cwd - ) - .strip() - .decode("ascii") - ) - git_tag = "t=" + git_tag - except (subprocess.CalledProcessError, FileNotFoundError): - git_tag = "" - - try: - git_commit = ( - subprocess.check_output( - ["git", "rev-parse", "--short", "HEAD"], stderr=null, cwd=cwd - ) - .strip() - .decode("ascii") - ) - except (subprocess.CalledProcessError, FileNotFoundError): - git_commit = "" - - try: - dirty_string = "-this_is_a_dirty_checkout" - is_dirty = ( - subprocess.check_output( - ["git", "describe", "--dirty=" + dirty_string], stderr=null, cwd=cwd - ) - .strip() - .decode("ascii") - .endswith(dirty_string) - ) + return prefix + result + except (subprocess.CalledProcessError, FileNotFoundError): + return "" - git_dirty = "dirty" if is_dirty else "" - except (subprocess.CalledProcessError, 
FileNotFoundError): - git_dirty = "" + git_branch = _run_git_command("b=", "rev-parse", "--abbrev-ref", "HEAD") + git_tag = _run_git_command("t=", "describe", "--exact-match") + git_commit = _run_git_command("", "rev-parse", "--short", "HEAD") + + dirty_string = "-this_is_a_dirty_checkout" + is_dirty = _run_git_command("", "describe", "--dirty=" + dirty_string).endswith( + dirty_string + ) + git_dirty = "dirty" if is_dirty else "" if git_branch or git_tag or git_commit or git_dirty: git_version = ",".join( s for s in (git_branch, git_tag, git_commit, git_dirty) if s ) - version_string = "%s (%s)" % ( - # If the __version__ attribute doesn't exist, we'll have failed - # loudly above. - module.__version__, # type: ignore[attr-defined] - git_version, - ) + version_string = f"{version_string} ({git_version})" except Exception as e: logger.info("Failed to check for git repository: %s", e) -- cgit 1.5.1 From a77c36989785c0d5565ab9a1169f4f88e512ce8a Mon Sep 17 00:00:00 2001 From: Sean Quah <8349537+squahtx@users.noreply.github.com> Date: Mon, 6 Dec 2021 11:36:08 +0000 Subject: Move `glob_to_regex` and `re_word_boundary` to `matrix-python-common` (#11505) --- changelog.d/11505.misc | 1 + synapse/config/room_directory.py | 3 +- synapse/config/tls.py | 3 +- synapse/federation/federation_server.py | 3 +- synapse/push/push_rule_evaluator.py | 7 ++-- synapse/python_dependencies.py | 1 + synapse/util/__init__.py | 59 +-------------------------------- tests/util/test_glob_to_regex.py | 59 --------------------------------- 8 files changed, 13 insertions(+), 123 deletions(-) create mode 100644 changelog.d/11505.misc delete mode 100644 tests/util/test_glob_to_regex.py (limited to 'synapse/util') diff --git a/changelog.d/11505.misc b/changelog.d/11505.misc new file mode 100644 index 0000000000..926b562fad --- /dev/null +++ b/changelog.d/11505.misc @@ -0,0 +1 @@ +Move `glob_to_regex` and `re_word_boundary` to `matrix-python-common`. 
diff --git a/synapse/config/room_directory.py b/synapse/config/room_directory.py index 57316c59b6..3c5e0f7ce7 100644 --- a/synapse/config/room_directory.py +++ b/synapse/config/room_directory.py @@ -15,8 +15,9 @@ from typing import List +from matrix_common.regex import glob_to_regex + from synapse.types import JsonDict -from synapse.util import glob_to_regex from ._base import Config, ConfigError diff --git a/synapse/config/tls.py b/synapse/config/tls.py index 4ca111618f..3e235b57a7 100644 --- a/synapse/config/tls.py +++ b/synapse/config/tls.py @@ -16,11 +16,12 @@ import logging import os from typing import List, Optional, Pattern +from matrix_common.regex import glob_to_regex + from OpenSSL import SSL, crypto from twisted.internet._sslverify import Certificate, trustRootFromCertificates from synapse.config._base import Config, ConfigError -from synapse.util import glob_to_regex logger = logging.getLogger(__name__) diff --git a/synapse/federation/federation_server.py b/synapse/federation/federation_server.py index 8e37e76206..4697a62c18 100644 --- a/synapse/federation/federation_server.py +++ b/synapse/federation/federation_server.py @@ -28,6 +28,7 @@ from typing import ( Union, ) +from matrix_common.regex import glob_to_regex from prometheus_client import Counter, Gauge, Histogram from twisted.internet import defer @@ -66,7 +67,7 @@ from synapse.replication.http.federation import ( ) from synapse.storage.databases.main.lock import Lock from synapse.types import JsonDict, get_domain_from_id -from synapse.util import glob_to_regex, json_decoder, unwrapFirstError +from synapse.util import json_decoder, unwrapFirstError from synapse.util.async_helpers import Linearizer, concurrently_execute from synapse.util.caches.response_cache import ResponseCache from synapse.util.stringutils import parse_server_name diff --git a/synapse/push/push_rule_evaluator.py b/synapse/push/push_rule_evaluator.py index 7f68092ec5..659a53805d 100644 --- a/synapse/push/push_rule_evaluator.py 
+++ b/synapse/push/push_rule_evaluator.py @@ -17,9 +17,10 @@ import logging import re from typing import Any, Dict, List, Optional, Pattern, Tuple, Union +from matrix_common.regex import glob_to_regex, to_word_pattern + from synapse.events import EventBase from synapse.types import JsonDict, UserID -from synapse.util import glob_to_regex, re_word_boundary from synapse.util.caches.lrucache import LruCache logger = logging.getLogger(__name__) @@ -184,7 +185,7 @@ class PushRuleEvaluatorForEvent: r = regex_cache.get((display_name, False, True), None) if not r: r1 = re.escape(display_name) - r1 = re_word_boundary(r1) + r1 = to_word_pattern(r1) r = re.compile(r1, flags=re.IGNORECASE) regex_cache[(display_name, False, True)] = r @@ -213,7 +214,7 @@ def _glob_matches(glob: str, value: str, word_boundary: bool = False) -> bool: try: r = regex_cache.get((glob, True, word_boundary), None) if not r: - r = glob_to_regex(glob, word_boundary) + r = glob_to_regex(glob, word_boundary=word_boundary) regex_cache[(glob, True, word_boundary)] = r return bool(r.search(value)) except re.error: diff --git a/synapse/python_dependencies.py b/synapse/python_dependencies.py index 7d26954244..386debd7db 100644 --- a/synapse/python_dependencies.py +++ b/synapse/python_dependencies.py @@ -87,6 +87,7 @@ REQUIREMENTS = [ # with the latest security patches. 
"cryptography>=3.4.7", "ijson>=3.1", + "matrix-common==1.0.0", ] CONDITIONAL_REQUIREMENTS = { diff --git a/synapse/util/__init__.py b/synapse/util/__init__.py index 95f23e27b6..f157132210 100644 --- a/synapse/util/__init__.py +++ b/synapse/util/__init__.py @@ -14,9 +14,8 @@ import json import logging -import re import typing -from typing import Any, Callable, Dict, Generator, Optional, Pattern +from typing import Any, Callable, Dict, Generator, Optional import attr from frozendict import frozendict @@ -35,9 +34,6 @@ if typing.TYPE_CHECKING: logger = logging.getLogger(__name__) -_WILDCARD_RUN = re.compile(r"([\?\*]+)") - - def _reject_invalid_json(val: Any) -> None: """Do not allow Infinity, -Infinity, or NaN values in JSON.""" raise ValueError("Invalid JSON value: '%s'" % val) @@ -185,56 +181,3 @@ def log_failure( if not consumeErrors: return failure return None - - -def glob_to_regex(glob: str, word_boundary: bool = False) -> Pattern: - """Converts a glob to a compiled regex object. - - Args: - glob: pattern to match - word_boundary: If True, the pattern will be allowed to match at word boundaries - anywhere in the string. Otherwise, the pattern is anchored at the start and - end of the string. - - Returns: - compiled regex pattern - """ - - # Patterns with wildcards must be simplified to avoid performance cliffs - # - The glob `?**?**?` is equivalent to the glob `???*` - # - The glob `???*` is equivalent to the regex `.{3,}` - chunks = [] - for chunk in _WILDCARD_RUN.split(glob): - # No wildcards? re.escape() - if not _WILDCARD_RUN.match(chunk): - chunks.append(re.escape(chunk)) - continue - - # Wildcards? Simplify. 
- qmarks = chunk.count("?") - if "*" in chunk: - chunks.append(".{%d,}" % qmarks) - else: - chunks.append(".{%d}" % qmarks) - - res = "".join(chunks) - - if word_boundary: - res = re_word_boundary(res) - else: - # \A anchors at start of string, \Z at end of string - res = r"\A" + res + r"\Z" - - return re.compile(res, re.IGNORECASE) - - -def re_word_boundary(r: str) -> str: - """ - Adds word boundary characters to the start and end of an - expression to require that the match occur as a whole word, - but do so respecting the fact that strings starting or ending - with non-word characters will change word boundaries. - """ - # we can't use \b as it chokes on unicode. however \W seems to be okay - # as shorthand for [^0-9A-Za-z_]. - return r"(^|\W)%s(\W|$)" % (r,) diff --git a/tests/util/test_glob_to_regex.py b/tests/util/test_glob_to_regex.py deleted file mode 100644 index 220accb92b..0000000000 --- a/tests/util/test_glob_to_regex.py +++ /dev/null @@ -1,59 +0,0 @@ -# Copyright 2021 The Matrix.org Foundation C.I.C. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-from synapse.util import glob_to_regex - -from tests.unittest import TestCase - - -class GlobToRegexTestCase(TestCase): - def test_literal_match(self): - """patterns without wildcards should match""" - pat = glob_to_regex("foobaz") - self.assertTrue( - pat.match("FoobaZ"), "patterns should match and be case-insensitive" - ) - self.assertFalse( - pat.match("x foobaz"), "pattern should not match at word boundaries" - ) - - def test_wildcard_match(self): - pat = glob_to_regex("f?o*baz") - - self.assertTrue( - pat.match("FoobarbaZ"), - "* should match string and pattern should be case-insensitive", - ) - self.assertTrue(pat.match("foobaz"), "* should match 0 characters") - self.assertFalse(pat.match("fooxaz"), "the character after * must match") - self.assertFalse(pat.match("fobbaz"), "? should not match 0 characters") - self.assertFalse(pat.match("fiiobaz"), "? should not match 2 characters") - - def test_multi_wildcard(self): - """patterns with multiple wildcards in a row should match""" - pat = glob_to_regex("**baz") - self.assertTrue(pat.match("agsgsbaz"), "** should match any string") - self.assertTrue(pat.match("baz"), "** should match the empty string") - self.assertEqual(pat.pattern, r"\A.{0,}baz\Z") - - pat = glob_to_regex("*?baz") - self.assertTrue(pat.match("agsgsbaz"), "*? should match any string") - self.assertTrue(pat.match("abaz"), "*? should match a single char") - self.assertFalse(pat.match("baz"), "*? should not match the empty string") - self.assertEqual(pat.pattern, r"\A.{1,}baz\Z") - - pat = glob_to_regex("a?*?*?baz") - self.assertTrue(pat.match("a g baz"), "?*?*? should match 3 chars") - self.assertFalse(pat.match("a..baz"), "?*?*? should not match 2 chars") - self.assertTrue(pat.match("a.gg.baz"), "?*?*? 
should match 4 chars") - self.assertEqual(pat.pattern, r"\Aa.{3,}baz\Z") -- cgit 1.5.1 From 088d748f2cb51f03f3bcacc0fb3af1e0f9607737 Mon Sep 17 00:00:00 2001 From: Sean Quah <8349537+squahtx@users.noreply.github.com> Date: Tue, 7 Dec 2021 13:51:11 +0000 Subject: Revert "Move `glob_to_regex` and `re_word_boundary` to `matrix-python-common` (#11505) (#11527) This reverts commit a77c36989785c0d5565ab9a1169f4f88e512ce8a. --- changelog.d/11527.misc | 1 + synapse/config/room_directory.py | 3 +- synapse/config/tls.py | 3 +- synapse/federation/federation_server.py | 3 +- synapse/push/push_rule_evaluator.py | 7 ++-- synapse/python_dependencies.py | 1 - synapse/util/__init__.py | 59 ++++++++++++++++++++++++++++++++- tests/util/test_glob_to_regex.py | 59 +++++++++++++++++++++++++++++++++ 8 files changed, 124 insertions(+), 12 deletions(-) create mode 100644 changelog.d/11527.misc create mode 100644 tests/util/test_glob_to_regex.py (limited to 'synapse/util') diff --git a/changelog.d/11527.misc b/changelog.d/11527.misc new file mode 100644 index 0000000000..081eae317c --- /dev/null +++ b/changelog.d/11527.misc @@ -0,0 +1 @@ +Temporarily revert usage of `matrix-python-common`. 
diff --git a/synapse/config/room_directory.py b/synapse/config/room_directory.py index 3c5e0f7ce7..57316c59b6 100644 --- a/synapse/config/room_directory.py +++ b/synapse/config/room_directory.py @@ -15,9 +15,8 @@ from typing import List -from matrix_common.regex import glob_to_regex - from synapse.types import JsonDict +from synapse.util import glob_to_regex from ._base import Config, ConfigError diff --git a/synapse/config/tls.py b/synapse/config/tls.py index 3e235b57a7..4ca111618f 100644 --- a/synapse/config/tls.py +++ b/synapse/config/tls.py @@ -16,12 +16,11 @@ import logging import os from typing import List, Optional, Pattern -from matrix_common.regex import glob_to_regex - from OpenSSL import SSL, crypto from twisted.internet._sslverify import Certificate, trustRootFromCertificates from synapse.config._base import Config, ConfigError +from synapse.util import glob_to_regex logger = logging.getLogger(__name__) diff --git a/synapse/federation/federation_server.py b/synapse/federation/federation_server.py index 4697a62c18..8e37e76206 100644 --- a/synapse/federation/federation_server.py +++ b/synapse/federation/federation_server.py @@ -28,7 +28,6 @@ from typing import ( Union, ) -from matrix_common.regex import glob_to_regex from prometheus_client import Counter, Gauge, Histogram from twisted.internet import defer @@ -67,7 +66,7 @@ from synapse.replication.http.federation import ( ) from synapse.storage.databases.main.lock import Lock from synapse.types import JsonDict, get_domain_from_id -from synapse.util import json_decoder, unwrapFirstError +from synapse.util import glob_to_regex, json_decoder, unwrapFirstError from synapse.util.async_helpers import Linearizer, concurrently_execute from synapse.util.caches.response_cache import ResponseCache from synapse.util.stringutils import parse_server_name diff --git a/synapse/push/push_rule_evaluator.py b/synapse/push/push_rule_evaluator.py index 659a53805d..7f68092ec5 100644 --- a/synapse/push/push_rule_evaluator.py 
+++ b/synapse/push/push_rule_evaluator.py @@ -17,10 +17,9 @@ import logging import re from typing import Any, Dict, List, Optional, Pattern, Tuple, Union -from matrix_common.regex import glob_to_regex, to_word_pattern - from synapse.events import EventBase from synapse.types import JsonDict, UserID +from synapse.util import glob_to_regex, re_word_boundary from synapse.util.caches.lrucache import LruCache logger = logging.getLogger(__name__) @@ -185,7 +184,7 @@ class PushRuleEvaluatorForEvent: r = regex_cache.get((display_name, False, True), None) if not r: r1 = re.escape(display_name) - r1 = to_word_pattern(r1) + r1 = re_word_boundary(r1) r = re.compile(r1, flags=re.IGNORECASE) regex_cache[(display_name, False, True)] = r @@ -214,7 +213,7 @@ def _glob_matches(glob: str, value: str, word_boundary: bool = False) -> bool: try: r = regex_cache.get((glob, True, word_boundary), None) if not r: - r = glob_to_regex(glob, word_boundary=word_boundary) + r = glob_to_regex(glob, word_boundary) regex_cache[(glob, True, word_boundary)] = r return bool(r.search(value)) except re.error: diff --git a/synapse/python_dependencies.py b/synapse/python_dependencies.py index 386debd7db..7d26954244 100644 --- a/synapse/python_dependencies.py +++ b/synapse/python_dependencies.py @@ -87,7 +87,6 @@ REQUIREMENTS = [ # with the latest security patches. 
"cryptography>=3.4.7", "ijson>=3.1", - "matrix-common==1.0.0", ] CONDITIONAL_REQUIREMENTS = { diff --git a/synapse/util/__init__.py b/synapse/util/__init__.py index f157132210..95f23e27b6 100644 --- a/synapse/util/__init__.py +++ b/synapse/util/__init__.py @@ -14,8 +14,9 @@ import json import logging +import re import typing -from typing import Any, Callable, Dict, Generator, Optional +from typing import Any, Callable, Dict, Generator, Optional, Pattern import attr from frozendict import frozendict @@ -34,6 +35,9 @@ if typing.TYPE_CHECKING: logger = logging.getLogger(__name__) +_WILDCARD_RUN = re.compile(r"([\?\*]+)") + + def _reject_invalid_json(val: Any) -> None: """Do not allow Infinity, -Infinity, or NaN values in JSON.""" raise ValueError("Invalid JSON value: '%s'" % val) @@ -181,3 +185,56 @@ def log_failure( if not consumeErrors: return failure return None + + +def glob_to_regex(glob: str, word_boundary: bool = False) -> Pattern: + """Converts a glob to a compiled regex object. + + Args: + glob: pattern to match + word_boundary: If True, the pattern will be allowed to match at word boundaries + anywhere in the string. Otherwise, the pattern is anchored at the start and + end of the string. + + Returns: + compiled regex pattern + """ + + # Patterns with wildcards must be simplified to avoid performance cliffs + # - The glob `?**?**?` is equivalent to the glob `???*` + # - The glob `???*` is equivalent to the regex `.{3,}` + chunks = [] + for chunk in _WILDCARD_RUN.split(glob): + # No wildcards? re.escape() + if not _WILDCARD_RUN.match(chunk): + chunks.append(re.escape(chunk)) + continue + + # Wildcards? Simplify. 
+ qmarks = chunk.count("?") + if "*" in chunk: + chunks.append(".{%d,}" % qmarks) + else: + chunks.append(".{%d}" % qmarks) + + res = "".join(chunks) + + if word_boundary: + res = re_word_boundary(res) + else: + # \A anchors at start of string, \Z at end of string + res = r"\A" + res + r"\Z" + + return re.compile(res, re.IGNORECASE) + + +def re_word_boundary(r: str) -> str: + """ + Adds word boundary characters to the start and end of an + expression to require that the match occur as a whole word, + but do so respecting the fact that strings starting or ending + with non-word characters will change word boundaries. + """ + # we can't use \b as it chokes on unicode. however \W seems to be okay + # as shorthand for [^0-9A-Za-z_]. + return r"(^|\W)%s(\W|$)" % (r,) diff --git a/tests/util/test_glob_to_regex.py b/tests/util/test_glob_to_regex.py new file mode 100644 index 0000000000..220accb92b --- /dev/null +++ b/tests/util/test_glob_to_regex.py @@ -0,0 +1,59 @@ +# Copyright 2021 The Matrix.org Foundation C.I.C. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+from synapse.util import glob_to_regex + +from tests.unittest import TestCase + + +class GlobToRegexTestCase(TestCase): + def test_literal_match(self): + """patterns without wildcards should match""" + pat = glob_to_regex("foobaz") + self.assertTrue( + pat.match("FoobaZ"), "patterns should match and be case-insensitive" + ) + self.assertFalse( + pat.match("x foobaz"), "pattern should not match at word boundaries" + ) + + def test_wildcard_match(self): + pat = glob_to_regex("f?o*baz") + + self.assertTrue( + pat.match("FoobarbaZ"), + "* should match string and pattern should be case-insensitive", + ) + self.assertTrue(pat.match("foobaz"), "* should match 0 characters") + self.assertFalse(pat.match("fooxaz"), "the character after * must match") + self.assertFalse(pat.match("fobbaz"), "? should not match 0 characters") + self.assertFalse(pat.match("fiiobaz"), "? should not match 2 characters") + + def test_multi_wildcard(self): + """patterns with multiple wildcards in a row should match""" + pat = glob_to_regex("**baz") + self.assertTrue(pat.match("agsgsbaz"), "** should match any string") + self.assertTrue(pat.match("baz"), "** should match the empty string") + self.assertEqual(pat.pattern, r"\A.{0,}baz\Z") + + pat = glob_to_regex("*?baz") + self.assertTrue(pat.match("agsgsbaz"), "*? should match any string") + self.assertTrue(pat.match("abaz"), "*? should match a single char") + self.assertFalse(pat.match("baz"), "*? should not match the empty string") + self.assertEqual(pat.pattern, r"\A.{1,}baz\Z") + + pat = glob_to_regex("a?*?*?baz") + self.assertTrue(pat.match("a g baz"), "?*?*? should match 3 chars") + self.assertFalse(pat.match("a..baz"), "?*?*? should not match 2 chars") + self.assertTrue(pat.match("a.gg.baz"), "?*?*? should match 4 chars") + self.assertEqual(pat.pattern, r"\Aa.{3,}baz\Z") -- cgit 1.5.1