summary refs log tree commit diff
diff options
context:
space:
mode:
authorErik Johnston <erik@matrix.org>2016-10-27 14:14:44 +0100
committerErik Johnston <erik@matrix.org>2016-10-27 14:14:44 +0100
commit1fc1bc2a51b025e46e3d44056ea1836dfc48cc6e (patch)
treecfe3173336cbc2d0e2440119bdc9e0eb4bbf5594
parentUpdate changelog (diff)
downloadsynapse-1fc1bc2a51b025e46e3d44056ea1836dfc48cc6e.tar.xz
Fix user_threepids schema delta
The delta `37/user_threepids.sql` aimed to update all the email
addresses to be lower case, however duplicate emails may exist in the
table already.

This commit adds a step where the delta moves the duplicate emails to a
new `medium` `email_old`. Only the most recently used account keeps the
binding intact. We move rather than delete so that we retain some record
of which emails were associated with which account.
-rw-r--r--synapse/storage/schema/delta/37/user_threepids.sql39
1 files changed, 38 insertions, 1 deletions
diff --git a/synapse/storage/schema/delta/37/user_threepids.sql b/synapse/storage/schema/delta/37/user_threepids.sql
index ef8813e72a..5f483a2ed7 100644
--- a/synapse/storage/schema/delta/37/user_threepids.sql
+++ b/synapse/storage/schema/delta/37/user_threepids.sql
@@ -17,7 +17,44 @@
  * Update any email addresses that were stored with mixed case into all
  * lowercase
  */
-UPDATE user_threepids SET address = LOWER(address) where medium = 'email';
+
+ -- There may be "duplicate" emails (with different case) already in the table,
+ -- so we find them and move all but the most recently used account.
+ UPDATE user_threepids
+ SET medium = 'email_old'
+ WHERE medium = 'email'
+    AND address IN (
+         -- `user_last_seen` maps user_ids to the last time we saw them
+         WITH user_last_seen AS (
+             SELECT user_id, max(last_seen) AS ts FROM user_ips GROUP BY user_id
+         ),
+         -- `duplicate_addresses` is a table of all the email addresses that
+         -- appear multiple times and the most recently we saw any of their users
+         duplicate_addresses AS (
+             SELECT lower(u1.address) AS address, max(ts.ts) AS max_ts
+             FROM user_threepids AS u1
+             INNER JOIN user_threepids AS u2 ON u1.medium = u2.medium AND lower(u1.address) = lower(u2.address) AND u1.address != u2.address
+             INNER JOIN user_last_seen as ts ON ts.user_id = u1.user_id
+             WHERE u1.medium = 'email' AND u2.medium = 'email'
+             GROUP BY lower(u1.address)
+         )
+         -- We select all the addresses that are linked to the user_id that is NOT
+         -- the most recently seen.
+         SELECT u.address
+         FROM
+             user_threepids AS u,
+             duplicate_addresses,
+             user_last_seen AS ts
+         WHERE
+             lower(u.address) = duplicate_addresses.address
+             AND u.user_id = ts.user_id
+             AND ts.ts != max_ts  -- NOT the most recently used
+     );
+
+
+-- This update is now safe since we've removed the duplicate addresses.
+UPDATE user_threepids SET address = LOWER(address) WHERE medium = 'email';
+
 
 /* Add an index for the select we do on passwored reset */
 CREATE INDEX user_threepids_medium_address on user_threepids (medium, address);