diff options
author | Richard van der Hoff <1389908+richvdh@users.noreply.github.com> | 2021-05-11 10:47:23 +0100 |
---|---|---|
committer | GitHub <noreply@github.com> | 2021-05-11 11:47:23 +0200 |
commit | 03318a766cac9f8b053db2214d9c332a977d226c (patch) | |
tree | 3444111943cbfda45609535c83e53f9adede3a90 /synapse/util | |
parent | Unpin attrs dep after new version has been released (#9946) (diff) | |
download | synapse-03318a766cac9f8b053db2214d9c332a977d226c.tar.xz |
Merge pull request from GHSA-x345-32rc-8h85
* tests for push rule pattern matching * tests for acl pattern matching * factor out common `re.escape` * Factor out common re.compile * Factor out common anchoring code * add word_boundary support to `glob_to_regex` * Use `glob_to_regex` in push rule evaluator NB that this drops support for character classes. I don't think anyone ever used them. * Improve efficiency of globs with multiple wildcards The idea here is that we compress multiple `*` globs into a single `.*`. We also need to consider `?`, since `*?*` is as hard to implement efficiently as `**`. * add assertion on regex pattern * Fix mypy * Simplify glob_to_regex * Inline the glob_to_regex helper function Signed-off-by: Dan Callahan <danc@element.io> * Moar comments Signed-off-by: Dan Callahan <danc@element.io> Co-authored-by: Dan Callahan <danc@element.io>
Diffstat (limited to 'synapse/util')
-rw-r--r-- | synapse/util/__init__.py | 61 |
1 files changed, 47 insertions, 14 deletions
diff --git a/synapse/util/__init__.py b/synapse/util/__init__.py index 0f84fa3f4e..b69f562ca5 100644 --- a/synapse/util/__init__.py +++ b/synapse/util/__init__.py @@ -15,6 +15,7 @@ import json import logging import re +from typing import Pattern import attr from frozendict import frozendict @@ -26,6 +27,9 @@ from synapse.logging import context logger = logging.getLogger(__name__) +_WILDCARD_RUN = re.compile(r"([\?\*]+)") + + def _reject_invalid_json(val): """Do not allow Infinity, -Infinity, or NaN values in JSON.""" raise ValueError("Invalid JSON value: '%s'" % val) @@ -158,25 +162,54 @@ def log_failure(failure, msg, consumeErrors=True): return failure -def glob_to_regex(glob): +def glob_to_regex(glob: str, word_boundary: bool = False) -> Pattern: """Converts a glob to a compiled regex object. - The regex is anchored at the beginning and end of the string. - Args: - glob (str) + glob: pattern to match + word_boundary: If True, the pattern will be allowed to match at word boundaries + anywhere in the string. Otherwise, the pattern is anchored at the start and + end of the string. Returns: - re.RegexObject + compiled regex pattern """ - res = "" - for c in glob: - if c == "*": - res = res + ".*" - elif c == "?": - res = res + "." + + # Patterns with wildcards must be simplified to avoid performance cliffs + # - The glob `?**?**?` is equivalent to the glob `???*` + # - The glob `???*` is equivalent to the regex `.{3,}` + chunks = [] + for chunk in _WILDCARD_RUN.split(glob): + # No wildcards? re.escape() + if not _WILDCARD_RUN.match(chunk): + chunks.append(re.escape(chunk)) + continue + + # Wildcards? Simplify. + qmarks = chunk.count("?") + if "*" in chunk: + chunks.append(".{%d,}" % qmarks) else: - res = res + re.escape(c) + chunks.append(".{%d}" % qmarks) + + res = "".join(chunks) - # \A anchors at start of string, \Z at end of string - return re.compile(r"\A" + res + r"\Z", re.IGNORECASE) + if word_boundary: + res = re_word_boundary(res) + else: + # \A anchors at start of string, \Z at end of string + res = r"\A" + res + r"\Z" + + return re.compile(res, re.IGNORECASE) + + +def re_word_boundary(r: str) -> str: + """ + Adds word boundary characters to the start and end of an + expression to require that the match occur as a whole word, + but do so respecting the fact that strings starting or ending + with non-word characters will change word boundaries. + """ + # we can't use \b as it chokes on unicode. however \W seems to be okay + # as shorthand for [^0-9A-Za-z_]. + return r"(^|\W)%s(\W|$)" % (r,) |