diff options
author | Erik Johnston <erikj@jki.re> | 2019-02-12 13:02:58 +0000 |
---|---|---|
committer | GitHub <noreply@github.com> | 2019-02-12 13:02:58 +0000 |
commit | 3df8fcca256ab4305f2577ceb53364fbbefe2b20 (patch) | |
tree | 51921f52f2c383b2ca82a21822f6b6fe9c0c9792 /synapse | |
parent | Merge pull request #4625 from matrix-org/rav/fix_generate_config_warnings (diff) | |
parent | Newsfile (diff) | |
download | synapse-3df8fcca256ab4305f2577ceb53364fbbefe2b20.tar.xz |
Merge pull request #4626 from matrix-org/erikj/fixup_user_ips_dedupe
Reduce user_ips bloat during dedupe background update
Diffstat (limited to 'synapse')
-rw-r--r-- | synapse/storage/client_ips.py | 63 |
1 files changed, 60 insertions, 3 deletions
diff --git a/synapse/storage/client_ips.py b/synapse/storage/client_ips.py index 091d7116c5..a20cc8231f 100644 --- a/synapse/storage/client_ips.py +++ b/synapse/storage/client_ips.py @@ -167,12 +167,16 @@ class ClientIpStore(background_updates.BackgroundUpdateStore): clause = "? <= last_seen AND last_seen < ?" args = (begin_last_seen, end_last_seen) + # (Note: The DISTINCT in the inner query is important to ensure that + # the COUNT(*) is accurate, otherwise double counting may happen due + # to the join effectively being a cross product) txn.execute( """ SELECT user_id, access_token, ip, - MAX(device_id), MAX(user_agent), MAX(last_seen) + MAX(device_id), MAX(user_agent), MAX(last_seen), + COUNT(*) FROM ( - SELECT user_id, access_token, ip + SELECT DISTINCT user_id, access_token, ip FROM user_ips WHERE {} ) c @@ -186,7 +190,60 @@ class ClientIpStore(background_updates.BackgroundUpdateStore): # We've got some duplicates for i in res: - user_id, access_token, ip, device_id, user_agent, last_seen = i + user_id, access_token, ip, device_id, user_agent, last_seen, count = i + + # We want to delete the duplicates so we end up with only a + # single row. + # + # The naive way of doing this would be just to delete all rows + # and reinsert a constructed row. However, if there are a lot of + # duplicate rows this can cause the table to grow a lot, which + # can be problematic in two ways: + # 1. If user_ips is already large then this can cause the + # table to rapidly grow, potentially filling the disk. + # 2. Reinserting a lot of rows can confuse the table + # statistics for postgres, causing it to not use the + # correct indices for the query above, resulting in a full + # table scan. This is incredibly slow for large tables and + # can kill database performance. (This seems to mainly + # happen for the last query where the clause is simply `? < + # last_seen`) + # + # So instead we want to delete all but *one* of the duplicate + # rows. That is hard to do reliably, so we cheat and do a two + # step process: + # 1. Delete all rows with a last_seen strictly less than the + # max last_seen. This hopefully results in deleting all but + # one row the majority of the time, but there may be + # duplicate last_seen + # 2. If multiple rows remain, we fall back to the naive method + # and simply delete all rows and reinsert. + # + # Note that this relies on no new duplicate rows being inserted, + # but if that is happening then this entire process is futile + # anyway. + + # Do step 1: + + txn.execute( + """ + DELETE FROM user_ips + WHERE user_id = ? AND access_token = ? AND ip = ? AND last_seen < ? + """, + (user_id, access_token, ip, last_seen) + ) + if txn.rowcount == count - 1: + # We deleted all but one of the duplicate rows, i.e. there + # is exactly one remaining and so there is nothing left to + # do. + continue + elif txn.rowcount >= count: + raise Exception( + "We deleted more duplicate rows from 'user_ips' than expected", + ) + + # The previous step didn't delete enough rows, so we fallback to + # step 2: # Drop all the duplicates txn.execute( |