From 362d80b7706c7bcd57ea5ee3e8e38b31c7d1ad8b Mon Sep 17 00:00:00 2001 From: Erik Johnston Date: Tue, 12 Feb 2019 11:28:08 +0000 Subject: Reduce user_ips bloat during dedupe background update The background update to remove duplicate rows naively deleted and reinserted the duplicates. For large tables with a large number of duplicates this causes a lot of bloat (with postgres), as the inserted rows are appended to the table, since deleted rows will not be overwritten until a VACUUM has happened. This should hopefully also help ensure that the query in the last batch uses the correct index, as inserting a large number of new rows without analyzing will upset the query planner. --- synapse/storage/client_ips.py | 63 ++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 60 insertions(+), 3 deletions(-) (limited to 'synapse/storage/client_ips.py') diff --git a/synapse/storage/client_ips.py b/synapse/storage/client_ips.py index 091d7116c5..a20cc8231f 100644 --- a/synapse/storage/client_ips.py +++ b/synapse/storage/client_ips.py @@ -167,12 +167,16 @@ class ClientIpStore(background_updates.BackgroundUpdateStore): clause = "? <= last_seen AND last_seen < ?" args = (begin_last_seen, end_last_seen) + # (Note: The DISTINCT in the inner query is important to ensure that + # the COUNT(*) is accurate, otherwise double counting may happen due + # to the join effectively being a cross product) txn.execute( """ SELECT user_id, access_token, ip, - MAX(device_id), MAX(user_agent), MAX(last_seen) + MAX(device_id), MAX(user_agent), MAX(last_seen), + COUNT(*) FROM ( - SELECT user_id, access_token, ip + SELECT DISTINCT user_id, access_token, ip FROM user_ips WHERE {} ) c @@ -186,7 +190,60 @@ class ClientIpStore(background_updates.BackgroundUpdateStore): # We've got some duplicates for i in res: - user_id, access_token, ip, device_id, user_agent, last_seen = i + user_id, access_token, ip, device_id, user_agent, last_seen, count = i + + # We want to delete the duplicates so we end up with only a + # single row. + # + # The naive way of doing this would be just to delete all rows + # and reinsert a constructed row. However, if there are a lot of + # duplicate rows this can cause the table to grow a lot, which + # can be problematic in two ways: + # 1. If user_ips is already large then this can cause the + # table to rapidly grow, potentially filling the disk. + # 2. Reinserting a lot of rows can confuse the table + # statistics for postgres, causing it to not use the + # correct indices for the query above, resulting in a full + # table scan. This is incredibly slow for large tables and + # can kill database performance. (This seems to mainly + # happen for the last query where the clause is simply `? < + # last_seen`) + # + # So instead we want to delete all but *one* of the duplicate + # rows. That is hard to do reliably, so we cheat and do a two + # step process: + # 1. Delete all rows with a last_seen strictly less than the + # max last_seen. This hopefully results in deleting all but + # one row the majority of the time, but there may be + # duplicate last_seen + # 2. If multiple rows remain, we fall back to the naive method + # and simply delete all rows and reinsert. + # + # Note that this relies on no new duplicate rows being inserted, + # but if that is happening then this entire process is futile + # anyway. + + # Do step 1: + + txn.execute( + """ + DELETE FROM user_ips + WHERE user_id = ? AND access_token = ? AND ip = ? AND last_seen < ? + """, + (user_id, access_token, ip, last_seen) + ) + if txn.rowcount == count - 1: + # We deleted all but one of the duplicate rows, i.e. there + # is exactly one remaining and so there is nothing left to + # do. + continue + elif txn.rowcount >= count: + raise Exception( + "We deleted more duplicate rows from 'user_ips' than expected", + ) + + # The previous step didn't delete enough rows, so we fallback to + # step 2: # Drop all the duplicates txn.execute( -- cgit 1.5.1 From 483ba85c7a1a8ee9b7eebcc5c07d522c71229c9f Mon Sep 17 00:00:00 2001 From: Erik Johnston Date: Tue, 12 Feb 2019 11:55:27 +0000 Subject: Analyze user_ips before running deduplication Due to the table locks taken out by the naive upsert, the table statistics may be out of date. During deduplication it is important that the correct index is used as otherwise a full table scan may be incorrectly used, which can end up thrashing the database badly. --- synapse/storage/client_ips.py | 24 ++++++++++++++++++++++ synapse/storage/schema/delta/53/user_ips_index.sql | 10 ++++++--- 2 files changed, 31 insertions(+), 3 deletions(-) (limited to 'synapse/storage/client_ips.py') diff --git a/synapse/storage/client_ips.py b/synapse/storage/client_ips.py index 091d7116c5..6f81406269 100644 --- a/synapse/storage/client_ips.py +++ b/synapse/storage/client_ips.py @@ -65,6 +65,11 @@ class ClientIpStore(background_updates.BackgroundUpdateStore): columns=["last_seen"], ) + self.register_background_update_handler( + "user_ips_analyze", + self._analyze_user_ip, + ) + self.register_background_update_handler( "user_ips_remove_dupes", self._remove_user_ip_dupes, @@ -108,6 +113,25 @@ class ClientIpStore(background_updates.BackgroundUpdateStore): yield self._end_background_update("user_ips_drop_nonunique_index") defer.returnValue(1) + @defer.inlineCallbacks + def _analyze_user_ip(self, progress, batch_size): + # Background update to analyze user_ips table before we run the + # deduplication background update. The table may not have been analyzed + # for ages due to the table locks. + # + # This will lock out the naive upserts to user_ips while it happens, but + # the analyze should be quick (28GB table takes ~10s) + def user_ips_analyze(txn): + txn.execute("ANALYZE user_ips") + + end_last_seen = yield self.runInteraction( + "user_ips_analyze", user_ips_analyze + ) + + yield self._end_background_update("user_ips_analyze") + + defer.returnValue(1) + @defer.inlineCallbacks def _remove_user_ip_dupes(self, progress, batch_size): # This works function works by scanning the user_ips table in batches diff --git a/synapse/storage/schema/delta/53/user_ips_index.sql b/synapse/storage/schema/delta/53/user_ips_index.sql index 4ca346c111..b812c5794f 100644 --- a/synapse/storage/schema/delta/53/user_ips_index.sql +++ b/synapse/storage/schema/delta/53/user_ips_index.sql @@ -13,9 +13,13 @@ * limitations under the License. */ --- delete duplicates + -- analyze user_ips, to help ensure the correct indices are used INSERT INTO background_updates (update_name, progress_json) VALUES - ('user_ips_remove_dupes', '{}'); + ('user_ips_analyze', '{}'); + +-- delete duplicates +INSERT INTO background_updates (update_name, progress_json, depends_on) VALUES + ('user_ips_remove_dupes', '{}', 'user_ips_analyze'); -- add a new unique index to user_ips table INSERT INTO background_updates (update_name, progress_json, depends_on) VALUES @@ -23,4 +27,4 @@ INSERT INTO background_updates (update_name, progress_json, depends_on) VALUES -- drop the old original index INSERT INTO background_updates (update_name, progress_json, depends_on) VALUES - ('user_ips_drop_nonunique_index', '{}', 'user_ips_device_unique_index'); \ No newline at end of file + ('user_ips_drop_nonunique_index', '{}', 'user_ips_device_unique_index'); -- cgit 1.5.1 From 495ea92350c4a3f6bd92cded5da939326b858d9b Mon Sep 17 00:00:00 2001 From: Erik Johnston Date: Tue, 12 Feb 2019 12:40:42 +0000 Subject: Fix pep8 --- synapse/storage/client_ips.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'synapse/storage/client_ips.py') diff --git a/synapse/storage/client_ips.py b/synapse/storage/client_ips.py index 6f81406269..cc23d7cdbe 100644 --- a/synapse/storage/client_ips.py +++ b/synapse/storage/client_ips.py @@ -124,7 +124,7 @@ class ClientIpStore(background_updates.BackgroundUpdateStore): def user_ips_analyze(txn): txn.execute("ANALYZE user_ips") - end_last_seen = yield self.runInteraction( + yield self.runInteraction( "user_ips_analyze", user_ips_analyze ) -- cgit 1.5.1