Fix notif kws that start/end with non-word chars

Only prepend / append word boundary characters if the search expression starts or ends with a word character; otherwise they don't work, because there's no word boundary between whitespace and a non-word char.
author: David Baker <dave@matrix.org> 2017-10-05 11:33:30 +0100
committer: David Baker <dave@matrix.org> 2017-10-05 11:33:30 +0100
commit: 6748f0a57962fb9657cab60083d94b4c97a0526c (patch)
tree: 3d2231a712af6c5542778aab9be29d6de63ba716 /synapse
parent: Merge pull request #2495 from matrix-org/dbkr/spam_check_room_creation (diff)
download: synapse-6748f0a57962fb9657cab60083d94b4c97a0526c.tar.xz
1 files changed, 21 insertions, 2 deletions
diff --git a/synapse/push/push_rule_evaluator.py b/synapse/push/push_rule_evaluator.py
index 172c27c137..5a34d60abb 100644
--- a/synapse/push/push_rule_evaluator.py
+++ b/synapse/push/push_rule_evaluator.py
@@ -26,6 +26,8 @@ logger = logging.getLogger(__name__)
 GLOB_REGEX = re.compile(r'\\\[(\\\!|)(.*)\\\]')
 IS_GLOB = re.compile(r'[\?\*\[\]]')
 INEQUALITY_EXPR = re.compile("^([=<>]*)([0-9]*)$")
+STARTS_WITH_WORD_CHAR_REGEX = re.compile(r"^\w")
+ENDS_WITH_WORD_CHAR_REGEX = re.compile(r"\w$")
 
 
 def _room_member_count(ev, condition, room_member_count):
@@ -183,7 +185,7 @@ def _glob_to_re(glob, word_boundary):
             r,
         )
         if word_boundary:
-            r = r"\b%s\b" % (r,)
+            r = _re_word_boundary(r)
 
             return re.compile(r, flags=re.IGNORECASE)
         else:
@@ -192,13 +194,30 @@ def _glob_to_re(glob, word_boundary):
             return re.compile(r, flags=re.IGNORECASE)
     elif word_boundary:
         r = re.escape(glob)
-        r = r"\b%s\b" % (r,)
+        r = _re_word_boundary(r)
 
         return re.compile(r, flags=re.IGNORECASE)
     else:
         r = "^" + re.escape(glob) + "$"
         return re.compile(r, flags=re.IGNORECASE)
 
+def _re_word_boundary(r):
+    """
+    Adds word boundary characters to the start and end of an
+    expression to require that the match occur as a whole word,
+    but do so respecting the fact that strings starting or ending
+    with non-word characters will change word boundaries.
+    """
+    # Matching a regex string against a regex, since by definition
+    # \b is the boundary between a \w and a \W, so match \w at the
+    # start or end of the expression (although this will miss, e.g.
+    # "[dl]og")
+    if STARTS_WITH_WORD_CHAR_REGEX.search(r):
+        r = r"\b%s" % (r,)
+    if ENDS_WITH_WORD_CHAR_REGEX.search(r):
+        r = r"%s\b" % (r,)
+    return r
+
 
 def _flatten_dict(d, prefix=[], result=None):
     if result is None:
author	David Baker <dave@matrix.org>	2017-10-05 11:33:30 +0100
committer	David Baker <dave@matrix.org>	2017-10-05 11:33:30 +0100
commit	6748f0a57962fb9657cab60083d94b4c97a0526c (patch)
tree	3d2231a712af6c5542778aab9be29d6de63ba716 /synapse
parent	Merge pull request #2495 from matrix-org/dbkr/spam_check_room_creation (diff)
download	synapse-6748f0a57962fb9657cab60083d94b4c97a0526c.tar.xz