From 4d25bc6c92c3d219786892c5d510e0c4c4eb8b96 Mon Sep 17 00:00:00 2001 From: Erik Johnston Date: Thu, 22 Oct 2015 11:12:28 +0100 Subject: Move FTS to delta 25 --- synapse/storage/schema/delta/25/fts.py | 123 +++++++++++++++++++++++++++++++++ 1 file changed, 123 insertions(+) create mode 100644 synapse/storage/schema/delta/25/fts.py (limited to 'synapse/storage/schema/delta/25/fts.py') diff --git a/synapse/storage/schema/delta/25/fts.py b/synapse/storage/schema/delta/25/fts.py new file mode 100644 index 0000000000..fd181fc630 --- /dev/null +++ b/synapse/storage/schema/delta/25/fts.py @@ -0,0 +1,123 @@ +# Copyright 2015 OpenMarket Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import logging + +from synapse.storage.prepare_database import get_statements +from synapse.storage.engines import PostgresEngine, Sqlite3Engine + +import ujson + +logger = logging.getLogger(__name__) + + +POSTGRES_SQL = """ +CREATE TABLE IF NOT EXISTS event_search ( + event_id TEXT, + room_id TEXT, + key TEXT, + vector tsvector +); + +INSERT INTO event_search SELECT + event_id, room_id, 'content.body', + to_tsvector('english', json::json->'content'->>'body') + FROM events NATURAL JOIN event_json WHERE type = 'm.room.message'; + +INSERT INTO event_search SELECT + event_id, room_id, 'content.name', + to_tsvector('english', json::json->'content'->>'name') + FROM events NATURAL JOIN event_json WHERE type = 'm.room.name'; + +INSERT INTO event_search SELECT + event_id, room_id, 'content.topic', + to_tsvector('english', json::json->'content'->>'topic') + FROM events NATURAL JOIN event_json WHERE type = 'm.room.topic'; + + +CREATE INDEX event_search_fts_idx ON event_search USING gin(vector); +CREATE INDEX event_search_ev_idx ON event_search(event_id); +CREATE INDEX event_search_ev_ridx ON event_search(room_id); +""" + + +SQLITE_TABLE = ( + "CREATE VIRTUAL TABLE IF NOT EXISTS event_search USING fts3 ( event_id, room_id, key, value)" +) + + +def run_upgrade(cur, database_engine, *args, **kwargs): + if isinstance(database_engine, PostgresEngine): + run_postgres_upgrade(cur) + return + + if isinstance(database_engine, Sqlite3Engine): + run_sqlite_upgrade(cur) + return + + +def run_postgres_upgrade(cur): + for statement in get_statements(POSTGRES_SQL.splitlines()): + cur.execute(statement) + + +def run_sqlite_upgrade(cur): + cur.execute(SQLITE_TABLE) + + rowid = -1 + while True: + cur.execute( + "SELECT rowid, json FROM event_json" + " WHERE rowid > ?" + " ORDER BY rowid ASC LIMIT 100", + (rowid,) + ) + + res = cur.fetchall() + + if not res: + break + + events = [ + ujson.loads(js) + for _, js in res + ] + + rowid = max(rid for rid, _ in res) + + rows = [] + for ev in events: + if ev["type"] == "m.room.message": + rows.append(( + ev["event_id"], ev["room_id"], "content.body", + ev["content"]["body"] + )) + if ev["type"] == "m.room.name": + rows.append(( + ev["event_id"], ev["room_id"], "content.name", + ev["content"]["name"] + )) + if ev["type"] == "m.room.topic": + rows.append(( + ev["event_id"], ev["room_id"], "content.topic", + ev["content"]["topic"] + )) + + if rows: + logger.info(rows) + cur.executemany( + "INSERT INTO event_search (event_id, room_id, key, value)" + " VALUES (?,?,?,?)", + rows + ) -- cgit 1.5.1 From f142898f529f89c5bd90710db2d0771894b0b33f Mon Sep 17 00:00:00 2001 From: Erik Johnston Date: Thu, 22 Oct 2015 11:18:01 +0100 Subject: PEP8 --- synapse/storage/schema/delta/25/fts.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'synapse/storage/schema/delta/25/fts.py') diff --git a/synapse/storage/schema/delta/25/fts.py b/synapse/storage/schema/delta/25/fts.py index fd181fc630..ed3cc06557 100644 --- a/synapse/storage/schema/delta/25/fts.py +++ b/synapse/storage/schema/delta/25/fts.py @@ -53,7 +53,8 @@ CREATE INDEX event_search_ev_ridx ON event_search(room_id); SQLITE_TABLE = ( - "CREATE VIRTUAL TABLE IF NOT EXISTS event_search USING fts3 ( event_id, room_id, key, value)" + "CREATE VIRTUAL TABLE IF NOT EXISTS event_search" + " USING fts3 ( event_id, room_id, key, value)" ) -- cgit 1.5.1 From 0c36098c1fe561629651e446589a644fed9188e4 Mon Sep 17 00:00:00 2001 From: Erik Johnston Date: Fri, 23 Oct 2015 13:23:48 +0100 Subject: Implement rank function for SQLite FTS --- synapse/storage/engines/sqlite3.py | 27 +++++++++++++++++++++++++++ synapse/storage/schema/delta/25/fts.py | 2 +- synapse/storage/search.py | 3 ++- 3 files changed, 30 insertions(+), 2 deletions(-) (limited to 'synapse/storage/schema/delta/25/fts.py') diff --git a/synapse/storage/engines/sqlite3.py b/synapse/storage/engines/sqlite3.py index bad3b5c5ac..a5a54ec011 100644 --- a/synapse/storage/engines/sqlite3.py +++ b/synapse/storage/engines/sqlite3.py @@ -17,6 +17,8 @@ from synapse.storage.prepare_database import ( prepare_database, prepare_sqlite3_database ) +import struct + class Sqlite3Engine(object): single_threaded = True @@ -32,6 +34,7 @@ class Sqlite3Engine(object): def on_new_connection(self, db_conn): self.prepare_database(db_conn) + db_conn.create_function("rank", 1, _rank) def prepare_database(self, db_conn): prepare_sqlite3_database(db_conn) @@ -45,3 +48,27 @@ class Sqlite3Engine(object): def lock_table(self, txn, table): return + + +# Following functions taken from: https://github.com/coleifer/peewee + +def _parse_match_info(buf): + bufsize = len(buf) + return [struct.unpack('@I', buf[i:i+4])[0] for i in range(0, bufsize, 4)] + + +def _rank(raw_match_info): + """Handle match_info called w/default args 'pcx' - based on the example rank + function http://sqlite.org/fts3.html#appendix_a + """ + match_info = _parse_match_info(raw_match_info) + score = 0.0 + p, c = match_info[:2] + for phrase_num in range(p): + phrase_info_idx = 2 + (phrase_num * c * 3) + for col_num in range(c): + col_idx = phrase_info_idx + (col_num * 3) + x1, x2 = match_info[col_idx:col_idx + 2] + if x1 > 0: + score += float(x1) / x2 + return score diff --git a/synapse/storage/schema/delta/25/fts.py b/synapse/storage/schema/delta/25/fts.py index ed3cc06557..3f0b02d119 100644 --- a/synapse/storage/schema/delta/25/fts.py +++ b/synapse/storage/schema/delta/25/fts.py @@ -54,7 +54,7 @@ CREATE INDEX event_search_ev_ridx ON event_search(room_id); SQLITE_TABLE = ( "CREATE VIRTUAL TABLE IF NOT EXISTS event_search" - " USING fts3 ( event_id, room_id, key, value)" + " USING fts4 ( event_id, room_id, key, value )" ) diff --git a/synapse/storage/search.py b/synapse/storage/search.py index 9608b5d6a7..cdf003502f 100644 --- a/synapse/storage/search.py +++ b/synapse/storage/search.py @@ -72,7 +72,8 @@ class SearchStore(SQLBaseStore): ) elif isinstance(self.database_engine, Sqlite3Engine): sql = ( - "SELECT 0 as rank, room_id, event_id FROM event_search" + "SELECT rank(matchinfo(event_search)) as rank, room_id, event_id" + " FROM event_search" " WHERE value MATCH ?" ) else: -- cgit 1.5.1 From 4cf633d5e9628a56cf9b400ee3e073fcdcb365f0 Mon Sep 17 00:00:00 2001 From: Erik Johnston Date: Fri, 23 Oct 2015 15:41:36 +0100 Subject: Pull out sender when computing search results --- synapse/storage/schema/delta/25/fts.py | 31 +++++++++++++++++-------------- 1 file changed, 17 insertions(+), 14 deletions(-) (limited to 'synapse/storage/schema/delta/25/fts.py') diff --git a/synapse/storage/schema/delta/25/fts.py b/synapse/storage/schema/delta/25/fts.py index 3f0b02d119..056487af30 100644 --- a/synapse/storage/schema/delta/25/fts.py +++ b/synapse/storage/schema/delta/25/fts.py @@ -26,22 +26,23 @@ POSTGRES_SQL = """ CREATE TABLE IF NOT EXISTS event_search ( event_id TEXT, room_id TEXT, + sender TEXT, key TEXT, vector tsvector ); INSERT INTO event_search SELECT - event_id, room_id, 'content.body', + event_id, room_id, json::json->>'sender', 'content.body', to_tsvector('english', json::json->'content'->>'body') FROM events NATURAL JOIN event_json WHERE type = 'm.room.message'; INSERT INTO event_search SELECT - event_id, room_id, 'content.name', + event_id, room_id, json::json->>'sender', 'content.name', to_tsvector('english', json::json->'content'->>'name') FROM events NATURAL JOIN event_json WHERE type = 'm.room.name'; INSERT INTO event_search SELECT - event_id, room_id, 'content.topic', + event_id, room_id, json::json->>'sender', 'content.topic', to_tsvector('english', json::json->'content'->>'topic') FROM events NATURAL JOIN event_json WHERE type = 'm.room.topic'; @@ -99,26 +100,28 @@ def run_sqlite_upgrade(cur): rows = [] for ev in events: - if ev["type"] == "m.room.message": + content = ev.get("content", {}) + body = content.get("body", None) + name = content.get("name", None) + topic = content.get("topic", None) + sender = ev.get("sender", None) + if ev["type"] == "m.room.message" and body: rows.append(( - ev["event_id"], ev["room_id"], "content.body", - ev["content"]["body"] + ev["event_id"], ev["room_id"], sender, "content.body", body )) - if ev["type"] == "m.room.name": + if ev["type"] == "m.room.name" and name: rows.append(( - ev["event_id"], ev["room_id"], "content.name", - ev["content"]["name"] + ev["event_id"], ev["room_id"], sender, "content.name", name )) - if ev["type"] == "m.room.topic": + if ev["type"] == "m.room.topic" and topic: rows.append(( - ev["event_id"], ev["room_id"], "content.topic", - ev["content"]["topic"] + ev["event_id"], ev["room_id"], sender, "content.topic", topic )) if rows: logger.info(rows) cur.executemany( - "INSERT INTO event_search (event_id, room_id, key, value)" - " VALUES (?,?,?,?)", + "INSERT INTO event_search (event_id, room_id, sender, key, value)" + " VALUES (?,?,?,?,?)", rows ) -- cgit 1.5.1 From 621e84d9a0f06f65bd51171079fd18eb916ab81a Mon Sep 17 00:00:00 2001 From: Daniel Wagner-Hall Date: Fri, 30 Oct 2015 16:25:53 +0000 Subject: Add missing column --- synapse/storage/schema/delta/25/fts.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'synapse/storage/schema/delta/25/fts.py') diff --git a/synapse/storage/schema/delta/25/fts.py b/synapse/storage/schema/delta/25/fts.py index 056487af30..b7cd0ce3b8 100644 --- a/synapse/storage/schema/delta/25/fts.py +++ b/synapse/storage/schema/delta/25/fts.py @@ -55,7 +55,7 @@ CREATE INDEX event_search_ev_ridx ON event_search(room_id); SQLITE_TABLE = ( "CREATE VIRTUAL TABLE IF NOT EXISTS event_search" - " USING fts4 ( event_id, room_id, key, value )" + " USING fts4 ( event_id, room_id, sender, key, value )" ) -- cgit 1.5.1