Merge pull request #2500 from matrix-org/dbkr/fix_word_boundary_mentions

Fix notif kws that start/end with non-word chars
author: David Baker <dbkr@users.noreply.github.com> 2017-10-05 12:27:59 +0100
committer: GitHub <noreply@github.com> 2017-10-05 12:27:59 +0100
commit: 44f8e383f36edf7ca31980a08449f2f11d107661 (patch)
tree: 11ba51e0f9f9464e9522ed0d6dff943383efd57d
parent: Merge pull request #2495 from matrix-org/dbkr/spam_check_room_creation (diff)
parent: Use better method for word boundary searching (diff)
download: synapse-44f8e383f36edf7ca31980a08449f2f11d107661.tar.xz
1 files changed, 14 insertions, 2 deletions
diff --git a/synapse/push/push_rule_evaluator.py b/synapse/push/push_rule_evaluator.py
index 172c27c137..65f9a63fd8 100644
--- a/synapse/push/push_rule_evaluator.py
+++ b/synapse/push/push_rule_evaluator.py
@@ -183,7 +183,7 @@ def _glob_to_re(glob, word_boundary):
             r,
         )
         if word_boundary:
-            r = r"\b%s\b" % (r,)
+            r = _re_word_boundary(r)
 
             return re.compile(r, flags=re.IGNORECASE)
         else:
@@ -192,7 +192,7 @@ def _glob_to_re(glob, word_boundary):
             return re.compile(r, flags=re.IGNORECASE)
     elif word_boundary:
         r = re.escape(glob)
-        r = r"\b%s\b" % (r,)
+        r = _re_word_boundary(r)
 
         return re.compile(r, flags=re.IGNORECASE)
     else:
@@ -200,6 +200,18 @@ def _glob_to_re(glob, word_boundary):
         return re.compile(r, flags=re.IGNORECASE)
 
 
+def _re_word_boundary(r):
+    """
+    Wraps the regex source `r` so that it only matches as a whole word,
+    i.e. preceded by start-of-string or a non-word character and followed
+    by a non-word character or end-of-string. Unlike a plain word-boundary
+    assertion, this also works when `r` starts or ends with a non-word char.
+    """
+    # We can't use \b as it chokes on unicode. However, \W seems to be okay
+    # as shorthand for [^0-9A-Za-z_]. Note \W consumes the adjacent character.
+    return r"(^|\W)%s(\W|$)" % (r,)
+
+
 def _flatten_dict(d, prefix=[], result=None):
     if result is None:
         result = {}
author	David Baker <dbkr@users.noreply.github.com>	2017-10-05 12:27:59 +0100
committer	GitHub <noreply@github.com>	2017-10-05 12:27:59 +0100
commit	44f8e383f36edf7ca31980a08449f2f11d107661 (patch)
tree	11ba51e0f9f9464e9522ed0d6dff943383efd57d
parent	Merge pull request #2495 from matrix-org/dbkr/spam_check_room_creation (diff)
parent	Use better method for word boundary searching (diff)
download	synapse-44f8e383f36edf7ca31980a08449f2f11d107661.tar.xz