diff options
author | Erik Johnston <erikj@jki.re> | 2019-01-22 16:51:57 +0000 |
---|---|---|
committer | GitHub <noreply@github.com> | 2019-01-22 16:51:57 +0000 |
commit | 12699a701f47cb6243b636588fb49c0b7fda9ca3 (patch) | |
tree | 702f2b98c87ce9d5ff69a510321beb002da941fa | |
parent | Merge pull request #4423 from matrix-org/neilj/disable_msisdn_on_registration (diff) | |
parent | Refactor to rewrite the SQL instead (diff) | |
download | synapse-12699a701f47cb6243b636588fb49c0b7fda9ca3.tar.xz |
Merge pull request #4434 from matrix-org/erikj/fix_user_ips_dedup
Fix bug when removing duplicate rows from user_ips
-rw-r--r-- | changelog.d/4434.misc | 1 | ||||
-rw-r--r-- | synapse/storage/client_ips.py | 61 |
2 files changed, 36 insertions, 26 deletions
diff --git a/changelog.d/4434.misc b/changelog.d/4434.misc new file mode 100644 index 0000000000..047061ed3c --- /dev/null +++ b/changelog.d/4434.misc @@ -0,0 +1 @@ +Apply a unique index to the user_ips table, preventing duplicates. diff --git a/synapse/storage/client_ips.py b/synapse/storage/client_ips.py index 5d548f250a..78721a941a 100644 --- a/synapse/storage/client_ips.py +++ b/synapse/storage/client_ips.py @@ -110,8 +110,13 @@ class ClientIpStore(background_updates.BackgroundUpdateStore): @defer.inlineCallbacks def _remove_user_ip_dupes(self, progress, batch_size): + # This works function works by scanning the user_ips table in batches + # based on `last_seen`. For each row in a batch it searches the rest of + # the table to see if there are any duplicates, if there are then they + # are removed and replaced with a suitable row. - last_seen_progress = progress.get("last_seen", 0) + # Fetch the start of the batch + begin_last_seen = progress.get("last_seen", 0) def get_last_seen(txn): txn.execute( @@ -122,29 +127,23 @@ class ClientIpStore(background_updates.BackgroundUpdateStore): LIMIT 1 OFFSET ? """, - (last_seen_progress, batch_size) + (begin_last_seen, batch_size) ) - results = txn.fetchone() - return results - - # Get a last seen that's sufficiently far away enough from the last one - last_seen = yield self.runInteraction( + row = txn.fetchone() + if row: + return row[0] + else: + return None + + # Get a last seen that has roughly `batch_size` since `begin_last_seen` + end_last_seen = yield self.runInteraction( "user_ips_dups_get_last_seen", get_last_seen ) - if not last_seen: - # If we get a None then we're reaching the end and just need to - # delete the last batch. - last = True - - # We fake not having an upper bound by using a future date, by - # just multiplying the current time by two.... - last_seen = int(self.clock.time_msec()) * 2 - else: - last = False - last_seen = last_seen[0] + # If it returns None, then we're processing the last batch + last = end_last_seen is None - def remove(txn, last_seen_progress, last_seen): + def remove(txn): # This works by looking at all entries in the given time span, and # then for each (user_id, access_token, ip) tuple in that range # checking for any duplicates in the rest of the table (via a join). @@ -153,6 +152,16 @@ class ClientIpStore(background_updates.BackgroundUpdateStore): # all other duplicates. # It is efficient due to the existence of (user_id, access_token, # ip) and (last_seen) indices. + + # Define the search space, which requires handling the last batch in + # a different way + if last: + clause = "? <= last_seen" + args = (begin_last_seen,) + else: + clause = "? <= last_seen AND last_seen < ?" + args = (begin_last_seen, end_last_seen) + txn.execute( """ SELECT user_id, access_token, ip, @@ -160,13 +169,14 @@ class ClientIpStore(background_updates.BackgroundUpdateStore): FROM ( SELECT user_id, access_token, ip FROM user_ips - WHERE ? <= last_seen AND last_seen < ? + WHERE {} ORDER BY last_seen ) c INNER JOIN user_ips USING (user_id, access_token, ip) GROUP BY user_id, access_token, ip - HAVING count(*) > 1""", - (last_seen_progress, last_seen) + HAVING count(*) > 1 + """.format(clause), + args ) res = txn.fetchall() @@ -194,12 +204,11 @@ class ClientIpStore(background_updates.BackgroundUpdateStore): ) self._background_update_progress_txn( - txn, "user_ips_remove_dupes", {"last_seen": last_seen} + txn, "user_ips_remove_dupes", {"last_seen": end_last_seen} ) - yield self.runInteraction( - "user_ips_dups_remove", remove, last_seen_progress, last_seen - ) + yield self.runInteraction("user_ips_dups_remove", remove) + if last: yield self._end_background_update("user_ips_remove_dupes") |