Merge pull request from GHSA-x345-32rc-8h85

* tests for push rule pattern matching * tests for acl pattern matching * factor out common `re.escape` * Factor out common re.compile * Factor out common anchoring code * add word_boundary support to `glob_to_regex` * Use `glob_to_regex` in push rule evaluator NB that this drops support for character classes. I don't think anyone ever used them. * Improve efficiency of globs with multiple wildcards The idea here is that we compress multiple `*` globs into a single `.*`. We also need to consider `?`, since `*?*` is as hard to implement efficiently as `**`. * add assertion on regex pattern * Fix mypy * Simplify glob_to_regex * Inline the glob_to_regex helper function Signed-off-by: Dan Callahan <danc@element.io> * Moar comments Signed-off-by: Dan Callahan <danc@element.io> Co-authored-by: Dan Callahan <danc@element.io>
author: Richard van der Hoff <1389908+richvdh@users.noreply.github.com> 2021-05-11 10:47:23 +0100
committer: GitHub <noreply@github.com> 2021-05-11 11:47:23 +0200
commit: 03318a766cac9f8b053db2214d9c332a977d226c (patch)
tree: 3444111943cbfda45609535c83e53f9adede3a90 /synapse/push/push_rule_evaluator.py
parent: Unpin attrs dep after new version has been released (#9946) (diff)
download: synapse-03318a766cac9f8b053db2214d9c332a977d226c.tar.xz
1 files changed, 3 insertions, 52 deletions
diff --git a/synapse/push/push_rule_evaluator.py b/synapse/push/push_rule_evaluator.py
index 49ecb38522..98b90a4f51 100644
--- a/synapse/push/push_rule_evaluator.py
+++ b/synapse/push/push_rule_evaluator.py
@@ -19,6 +19,7 @@ from typing import Any, Dict, List, Optional, Pattern, Tuple, Union
 
 from synapse.events import EventBase
 from synapse.types import UserID
+from synapse.util import glob_to_regex, re_word_boundary
 from synapse.util.caches.lrucache import LruCache
 
 logger = logging.getLogger(__name__)
@@ -183,7 +184,7 @@ class PushRuleEvaluatorForEvent:
         r = regex_cache.get((display_name, False, True), None)
         if not r:
             r1 = re.escape(display_name)
-            r1 = _re_word_boundary(r1)
+            r1 = re_word_boundary(r1)
             r = re.compile(r1, flags=re.IGNORECASE)
             regex_cache[(display_name, False, True)] = r
 
@@ -212,7 +213,7 @@ def _glob_matches(glob: str, value: str, word_boundary: bool = False) -> bool:
     try:
         r = regex_cache.get((glob, True, word_boundary), None)
         if not r:
-            r = _glob_to_re(glob, word_boundary)
+            r = glob_to_regex(glob, word_boundary)
             regex_cache[(glob, True, word_boundary)] = r
         return bool(r.search(value))
     except re.error:
@@ -220,56 +221,6 @@ def _glob_matches(glob: str, value: str, word_boundary: bool = False) -> bool:
         return False
 
 
-def _glob_to_re(glob: str, word_boundary: bool) -> Pattern:
-    """Generates regex for a given glob.
-
-    Args:
-        glob
-        word_boundary: Whether to match against word boundaries or entire string.
-    """
-    if IS_GLOB.search(glob):
-        r = re.escape(glob)
-
-        r = r.replace(r"\*", ".*?")
-        r = r.replace(r"\?", ".")
-
-        # handle [abc], [a-z] and [!a-z] style ranges.
-        r = GLOB_REGEX.sub(
-            lambda x: (
-                "[%s%s]" % (x.group(1) and "^" or "", x.group(2).replace(r"\\\-", "-"))
-            ),
-            r,
-        )
-        if word_boundary:
-            r = _re_word_boundary(r)
-
-            return re.compile(r, flags=re.IGNORECASE)
-        else:
-            r = "^" + r + "$"
-
-            return re.compile(r, flags=re.IGNORECASE)
-    elif word_boundary:
-        r = re.escape(glob)
-        r = _re_word_boundary(r)
-
-        return re.compile(r, flags=re.IGNORECASE)
-    else:
-        r = "^" + re.escape(glob) + "$"
-        return re.compile(r, flags=re.IGNORECASE)
-
-
-def _re_word_boundary(r: str) -> str:
-    """
-    Adds word boundary characters to the start and end of an
-    expression to require that the match occur as a whole word,
-    but do so respecting the fact that strings starting or ending
-    with non-word characters will change word boundaries.
-    """
-    # we can't use \b as it chokes on unicode. however \W seems to be okay
-    # as shorthand for [^0-9A-Za-z_].
-    return r"(^|\W)%s(\W|$)" % (r,)
-
-
 def _flatten_dict(
     d: Union[EventBase, dict],
     prefix: Optional[List[str]] = None,
author	Richard van der Hoff <1389908+richvdh@users.noreply.github.com>	2021-05-11 10:47:23 +0100
committer	GitHub <noreply@github.com>	2021-05-11 11:47:23 +0200
commit	03318a766cac9f8b053db2214d9c332a977d226c (patch)
tree	3444111943cbfda45609535c83e53f9adede3a90 /synapse/push/push_rule_evaluator.py
parent	Unpin attrs dep after new version has been released (#9946) (diff)
download	synapse-03318a766cac9f8b053db2214d9c332a977d226c.tar.xz