summary refs log tree commit diff
path: root/synapse/util/__init__.py
diff options
context:
space:
mode:
authorRichard van der Hoff <1389908+richvdh@users.noreply.github.com>2021-05-11 10:47:23 +0100
committerGitHub <noreply@github.com>2021-05-11 11:47:23 +0200
commit03318a766cac9f8b053db2214d9c332a977d226c (patch)
tree3444111943cbfda45609535c83e53f9adede3a90 /synapse/util/__init__.py
parentUnpin attrs dep after new version has been released (#9946) (diff)
downloadsynapse-03318a766cac9f8b053db2214d9c332a977d226c.tar.xz
Merge pull request from GHSA-x345-32rc-8h85
* tests for push rule pattern matching

* tests for acl pattern matching

* factor out common `re.escape`

* Factor out common re.compile

* Factor out common anchoring code

* add word_boundary support to `glob_to_regex`

* Use `glob_to_regex` in push rule evaluator

NB that this drops support for character classes. I don't think anyone ever
used them.

* Improve efficiency of globs with multiple wildcards

The idea here is that we compress multiple `*` globs into a single `.*`. We
also need to consider `?`, since `*?*` is as hard to implement efficiently as
`**`.

* add assertion on regex pattern

* Fix mypy

* Simplify glob_to_regex

* Inline the glob_to_regex helper function

Signed-off-by: Dan Callahan <danc@element.io>

* Moar comments

Signed-off-by: Dan Callahan <danc@element.io>

Co-authored-by: Dan Callahan <danc@element.io>
Diffstat (limited to 'synapse/util/__init__.py')
-rw-r--r--synapse/util/__init__.py61
1 files changed, 47 insertions, 14 deletions
diff --git a/synapse/util/__init__.py b/synapse/util/__init__.py

index 0f84fa3f4e..b69f562ca5 100644 --- a/synapse/util/__init__.py +++ b/synapse/util/__init__.py
@@ -15,6 +15,7 @@ import json import logging import re +from typing import Pattern import attr from frozendict import frozendict @@ -26,6 +27,9 @@ from synapse.logging import context logger = logging.getLogger(__name__) +_WILDCARD_RUN = re.compile(r"([\?\*]+)") + + def _reject_invalid_json(val): """Do not allow Infinity, -Infinity, or NaN values in JSON.""" raise ValueError("Invalid JSON value: '%s'" % val) @@ -158,25 +162,54 @@ def log_failure(failure, msg, consumeErrors=True): return failure -def glob_to_regex(glob): +def glob_to_regex(glob: str, word_boundary: bool = False) -> Pattern: """Converts a glob to a compiled regex object. - The regex is anchored at the beginning and end of the string. - Args: - glob (str) + glob: pattern to match + word_boundary: If True, the pattern will be allowed to match at word boundaries + anywhere in the string. Otherwise, the pattern is anchored at the start and + end of the string. Returns: - re.RegexObject + compiled regex pattern """ - res = "" - for c in glob: - if c == "*": - res = res + ".*" - elif c == "?": - res = res + "." + + # Patterns with wildcards must be simplified to avoid performance cliffs + # - The glob `?**?**?` is equivalent to the glob `???*` + # - The glob `???*` is equivalent to the regex `.{3,}` + chunks = [] + for chunk in _WILDCARD_RUN.split(glob): + # No wildcards? re.escape() + if not _WILDCARD_RUN.match(chunk): + chunks.append(re.escape(chunk)) + continue + + # Wildcards? Simplify. + qmarks = chunk.count("?") + if "*" in chunk: + chunks.append(".{%d,}" % qmarks) else: - res = res + re.escape(c) + chunks.append(".{%d}" % qmarks) + + res = "".join(chunks) - # \A anchors at start of string, \Z at end of string - return re.compile(r"\A" + res + r"\Z", re.IGNORECASE) + if word_boundary: + res = re_word_boundary(res) + else: + # \A anchors at start of string, \Z at end of string + res = r"\A" + res + r"\Z" + + return re.compile(res, re.IGNORECASE) + + +def re_word_boundary(r: str) -> str: + """ + Adds word boundary characters to the start and end of an + expression to require that the match occur as a whole word, + but do so respecting the fact that strings starting or ending + with non-word characters will change word boundaries. + """ + # we can't use \b as it chokes on unicode. however \W seems to be okay + # as shorthand for [^0-9A-Za-z_]. + return r"(^|\W)%s(\W|$)" % (r,)