summary refs log tree commit diff
diff options
context:
space:
mode:
author Erik Johnston <erik@matrix.org> 2015-11-27 16:40:42 +0000
committer Erik Johnston <erik@matrix.org> 2015-11-27 16:40:42 +0000
commit 76936f43ae0f88d4523fe07b7a9ccf8ddb5563ac (patch)
tree 0d5fe0c4ca6536dff2220c594427a1cb1568e713
parent Merge pull request #397 from matrix-org/erikj/redaction_inequality (diff)
download synapse-76936f43ae0f88d4523fe07b7a9ccf8ddb5563ac.tar.xz
Return words to highlight in search results
-rw-r--r-- synapse/handlers/search.py 19
-rw-r--r-- synapse/storage/search.py 123
2 files changed, 120 insertions, 22 deletions
diff --git a/synapse/handlers/search.py b/synapse/handlers/search.py
index 50688e51a8..6d2197339e 100644
--- a/synapse/handlers/search.py
+++ b/synapse/handlers/search.py
@@ -139,11 +139,18 @@ class SearchHandler(BaseHandler):
         # Holds the next_batch for the entire result set if one of those exists
         global_next_batch = None
 
+        highlights = set()
+
         if order_by == "rank":
-            results = yield self.store.search_msgs(
+            search_result = yield self.store.search_msgs(
                 room_ids, search_term, keys
             )
 
+            if search_result["highlights"]:
+                highlights.update(search_result["highlights"])
+
+            results = search_result["results"]
+
             results_map = {r["event"].event_id: r for r in results}
 
             rank_map.update({r["event"].event_id: r["rank"] for r in results})
@@ -187,11 +194,16 @@ class SearchHandler(BaseHandler):
                 # But only go around 5 times since otherwise synapse will be sad.
                 while len(room_events) < search_filter.limit() and i < 5:
                     i += 1
-                    results = yield self.store.search_room(
+                    search_result = yield self.store.search_room(
                         room_id, search_term, keys, search_filter.limit() * 2,
                         pagination_token=pagination_token,
                     )
 
+                    if search_result["highlights"]:
+                        highlights.update(search_result["highlights"])
+
+                    results = search_result["results"]
+
                     results_map = {r["event"].event_id: r for r in results}
 
                     rank_map.update({r["event"].event_id: r["rank"] for r in results})
@@ -347,7 +359,8 @@ class SearchHandler(BaseHandler):
 
         rooms_cat_res = {
             "results": results,
-            "count": len(results)
+            "count": len(results),
+            "highlights": list(highlights),
         }
 
         if state_results:
diff --git a/synapse/storage/search.py b/synapse/storage/search.py
index 380270b009..c6386642df 100644
--- a/synapse/storage/search.py
+++ b/synapse/storage/search.py
@@ -20,6 +20,7 @@ from synapse.api.errors import SynapseError
 from synapse.storage.engines import PostgresEngine, Sqlite3Engine
 
 import logging
+import re
 
 
 logger = logging.getLogger(__name__)
@@ -194,14 +195,21 @@ class SearchStore(BackgroundUpdateStore):
             for ev in events
         }
 
-        defer.returnValue([
-            {
-                "event": event_map[r["event_id"]],
-                "rank": r["rank"],
-            }
-            for r in results
-            if r["event_id"] in event_map
-        ])
+        highlights = None
+        if isinstance(self.database_engine, PostgresEngine):
+            highlights = yield self._find_highlights_in_postgres(search_term, events)
+
+        defer.returnValue({
+            "results": [
+                {
+                    "event": event_map[r["event_id"]],
+                    "rank": r["rank"],
+                }
+                for r in results
+                if r["event_id"] in event_map
+            ],
+            "highlights": highlights,
+        })
 
     @defer.inlineCallbacks
     def search_room(self, room_id, search_term, keys, limit, pagination_token=None):
@@ -294,14 +302,91 @@ class SearchStore(BackgroundUpdateStore):
             for ev in events
         }
 
-        defer.returnValue([
-            {
-                "event": event_map[r["event_id"]],
-                "rank": r["rank"],
-                "pagination_token": "%s,%s" % (
-                    r["topological_ordering"], r["stream_ordering"]
-                ),
-            }
-            for r in results
-            if r["event_id"] in event_map
-        ])
+        highlights = None
+        if isinstance(self.database_engine, PostgresEngine):
+            highlights = yield self._find_highlights_in_postgres(search_term, events)
+
+        defer.returnValue({
+            "results": [
+                {
+                    "event": event_map[r["event_id"]],
+                    "rank": r["rank"],
+                    "pagination_token": "%s,%s" % (
+                        r["topological_ordering"], r["stream_ordering"]
+                    ),
+                }
+                for r in results
+                if r["event_id"] in event_map
+            ],
+            "highlights": highlights,
+        })
+
+    def _find_highlights_in_postgres(self, search_term, events):
+        """Given a list of events and a search term, return the set of words
+        from the content of the events that match the search term.
+
+        This is used to give a list of words that clients can match against to
+        highlight the matching parts.
+
+        Args:
+            search_term (str)
+            events (list): A list of events
+
+        Returns:
+            deferred : A set of strings.
+        """
+        def f(txn):
+            highlight_words = set()
+            for event in events:
+                # As a hack we simply join values of all possible keys. This is
+                # fine since we're only using them to find possible highlights.
+                values = []
+                for key in ("body", "name", "topic"):
+                    v = event.content.get(key, None)
+                    if v:
+                        values.append(v)
+
+                if not values:
+                    continue
+
+                value = " ".join(values)
+
+                # We need to find some values for StartSel and StopSel that
+                # aren't in the value so that we can pick results out.
+                start_sel = "<"
+                stop_sel = ">"
+
+                while start_sel in value:
+                    start_sel += "<"
+                while stop_sel in value:
+                    stop_sel += ">"
+
+                query = "SELECT ts_headline(?, plainto_tsquery('english', ?), %s)" % (
+                    _to_postgres_options({
+                        "StartSel": start_sel,
+                        "StopSel": stop_sel,
+                        "MaxFragments": "50",
+                    })
+                )
+                txn.execute(query, (value, search_term,))
+                headline, = txn.fetchall()[0]
+
+                # Now we need to pick the possible highlights out of the headline
+                # result.
+                matcher_regex = "%s(.*?)%s" % (
+                    re.escape(start_sel),
+                    re.escape(stop_sel),
+                )
+
+                res = re.findall(matcher_regex, headline)
+                highlight_words.update([r.lower() for r in res])
+
+            return highlight_words
+
+        return self.runInteraction("_find_highlights", f)
+
+
+def _to_postgres_options(options_dict):
+    return "'%s'" % (
+        ",".join("%s=%s" % (k, v) for k, v in options_dict.items()),
+    )