diff --git a/synapse/__init__.py b/synapse/__init__.py
index 441cd8b339..ce822ccb04 100644
--- a/synapse/__init__.py
+++ b/synapse/__init__.py
@@ -47,7 +47,7 @@ try:
except ImportError:
pass
-__version__ = "1.33.1"
+__version__ = "1.33.2"
if bool(os.environ.get("SYNAPSE_TEST_PATCH_LOG_CONTEXTS", False)):
# We import here so that we don't have to install a bunch of deps when
diff --git a/synapse/config/tls.py b/synapse/config/tls.py
index b041869758..7df4e4c3e6 100644
--- a/synapse/config/tls.py
+++ b/synapse/config/tls.py
@@ -17,7 +17,7 @@ import os
import warnings
from datetime import datetime
from hashlib import sha256
-from typing import List, Optional
+from typing import List, Optional, Pattern
from unpaddedbase64 import encode_base64
@@ -124,7 +124,7 @@ class TlsConfig(Config):
fed_whitelist_entries = []
# Support globs (*) in whitelist values
- self.federation_certificate_verification_whitelist = [] # type: List[str]
+ self.federation_certificate_verification_whitelist = [] # type: List[Pattern]
for entry in fed_whitelist_entries:
try:
entry_regex = glob_to_regex(entry.encode("ascii").decode("ascii"))
diff --git a/synapse/push/push_rule_evaluator.py b/synapse/push/push_rule_evaluator.py
index 49ecb38522..98b90a4f51 100644
--- a/synapse/push/push_rule_evaluator.py
+++ b/synapse/push/push_rule_evaluator.py
@@ -19,6 +19,7 @@ from typing import Any, Dict, List, Optional, Pattern, Tuple, Union
from synapse.events import EventBase
from synapse.types import UserID
+from synapse.util import glob_to_regex, re_word_boundary
from synapse.util.caches.lrucache import LruCache
logger = logging.getLogger(__name__)
@@ -183,7 +184,7 @@ class PushRuleEvaluatorForEvent:
r = regex_cache.get((display_name, False, True), None)
if not r:
r1 = re.escape(display_name)
- r1 = _re_word_boundary(r1)
+ r1 = re_word_boundary(r1)
r = re.compile(r1, flags=re.IGNORECASE)
regex_cache[(display_name, False, True)] = r
@@ -212,7 +213,7 @@ def _glob_matches(glob: str, value: str, word_boundary: bool = False) -> bool:
try:
r = regex_cache.get((glob, True, word_boundary), None)
if not r:
- r = _glob_to_re(glob, word_boundary)
+ r = glob_to_regex(glob, word_boundary)
regex_cache[(glob, True, word_boundary)] = r
return bool(r.search(value))
except re.error:
@@ -220,56 +221,6 @@ def _glob_matches(glob: str, value: str, word_boundary: bool = False) -> bool:
return False
-def _glob_to_re(glob: str, word_boundary: bool) -> Pattern:
- """Generates regex for a given glob.
-
- Args:
- glob
- word_boundary: Whether to match against word boundaries or entire string.
- """
- if IS_GLOB.search(glob):
- r = re.escape(glob)
-
- r = r.replace(r"\*", ".*?")
- r = r.replace(r"\?", ".")
-
- # handle [abc], [a-z] and [!a-z] style ranges.
- r = GLOB_REGEX.sub(
- lambda x: (
- "[%s%s]" % (x.group(1) and "^" or "", x.group(2).replace(r"\\\-", "-"))
- ),
- r,
- )
- if word_boundary:
- r = _re_word_boundary(r)
-
- return re.compile(r, flags=re.IGNORECASE)
- else:
- r = "^" + r + "$"
-
- return re.compile(r, flags=re.IGNORECASE)
- elif word_boundary:
- r = re.escape(glob)
- r = _re_word_boundary(r)
-
- return re.compile(r, flags=re.IGNORECASE)
- else:
- r = "^" + re.escape(glob) + "$"
- return re.compile(r, flags=re.IGNORECASE)
-
-
-def _re_word_boundary(r: str) -> str:
- """
- Adds word boundary characters to the start and end of an
- expression to require that the match occur as a whole word,
- but do so respecting the fact that strings starting or ending
- with non-word characters will change word boundaries.
- """
- # we can't use \b as it chokes on unicode. however \W seems to be okay
- # as shorthand for [^0-9A-Za-z_].
- return r"(^|\W)%s(\W|$)" % (r,)
-
-
def _flatten_dict(
d: Union[EventBase, dict],
prefix: Optional[List[str]] = None,
diff --git a/synapse/util/__init__.py b/synapse/util/__init__.py
index 0f84fa3f4e..b69f562ca5 100644
--- a/synapse/util/__init__.py
+++ b/synapse/util/__init__.py
@@ -15,6 +15,7 @@
import json
import logging
import re
+from typing import Pattern
import attr
from frozendict import frozendict
@@ -26,6 +27,9 @@ from synapse.logging import context
logger = logging.getLogger(__name__)
+_WILDCARD_RUN = re.compile(r"([\?\*]+)")
+
+
def _reject_invalid_json(val):
"""Do not allow Infinity, -Infinity, or NaN values in JSON."""
raise ValueError("Invalid JSON value: '%s'" % val)
@@ -158,25 +162,54 @@ def log_failure(failure, msg, consumeErrors=True):
return failure
-def glob_to_regex(glob):
+def glob_to_regex(glob: str, word_boundary: bool = False) -> Pattern:
"""Converts a glob to a compiled regex object.
- The regex is anchored at the beginning and end of the string.
-
Args:
- glob (str)
+ glob: pattern to match
+ word_boundary: If True, the pattern will be allowed to match at word boundaries
+ anywhere in the string. Otherwise, the pattern is anchored at the start and
+ end of the string.
Returns:
- re.RegexObject
+ compiled regex pattern
"""
- res = ""
- for c in glob:
- if c == "*":
- res = res + ".*"
- elif c == "?":
- res = res + "."
+
+ # Patterns with wildcards must be simplified to avoid performance cliffs
+ # - The glob `?**?**?` is equivalent to the glob `???*`
+ # - The glob `???*` is equivalent to the regex `.{3,}`
+ chunks = []
+ for chunk in _WILDCARD_RUN.split(glob):
+ # No wildcards? Escape the literal text with re.escape().
+ if not _WILDCARD_RUN.match(chunk):
+ chunks.append(re.escape(chunk))
+ continue
+
+ # Wildcards? Simplify.
+ qmarks = chunk.count("?")
+ if "*" in chunk:
+ chunks.append(".{%d,}" % qmarks)
else:
- res = res + re.escape(c)
+ chunks.append(".{%d}" % qmarks)
+
+ res = "".join(chunks)
- # \A anchors at start of string, \Z at end of string
- return re.compile(r"\A" + res + r"\Z", re.IGNORECASE)
+ if word_boundary:
+ res = re_word_boundary(res)
+ else:
+ # \A anchors at start of string, \Z at end of string
+ res = r"\A" + res + r"\Z"
+
+ return re.compile(res, re.IGNORECASE)
+
+
+def re_word_boundary(r: str) -> str:
+ """
+ Adds word boundary characters to the start and end of an
+ expression to require that the match occur as a whole word,
+ but does so respecting the fact that strings starting or ending
+ with non-word characters will change word boundaries.
+ """
+ # we can't use \b as it chokes on unicode. however \W seems to be okay; note
+ # that for str patterns it is Unicode-aware, i.e. broader than [^0-9A-Za-z_].
+ return r"(^|\W)%s(\W|$)" % (r,)
|