1 files changed, 28 insertions, 5 deletions
diff --git a/synapse/handlers/device.py b/synapse/handlers/device.py
index 9e017116a9..c708c35d4d 100644
--- a/synapse/handlers/device.py
+++ b/synapse/handlers/device.py
@@ -20,7 +20,11 @@ from twisted.internet import defer
 
 from synapse.api import errors
 from synapse.api.constants import EventTypes
-from synapse.api.errors import FederationDeniedError
+from synapse.api.errors import (
+    FederationDeniedError,
+    HttpResponseException,
+    RequestSendFailed,
+)
 from synapse.types import RoomStreamToken, get_domain_from_id
 from synapse.util import stringutils
 from synapse.util.async_helpers import Linearizer
@@ -504,13 +508,13 @@ class DeviceListEduUpdater(object):
                 origin = get_domain_from_id(user_id)
                 try:
                     result = yield self.federation.query_user_devices(origin, user_id)
-                except NotRetryingDestination:
+                except (
+                    NotRetryingDestination, RequestSendFailed, HttpResponseException,
+                ):
                     # TODO: Remember that we are now out of sync and try again
                     # later
                     logger.warn(
-                        "Failed to handle device list update for %s,"
-                        " we're not retrying the remote",
-                        user_id,
+                        "Failed to handle device list update for %s", user_id,
                     )
                     # We abort on exceptions rather than accepting the update
                     # as otherwise synapse will 'forget' that its device list
@@ -532,6 +536,25 @@ class DeviceListEduUpdater(object):
 
                 stream_id = result["stream_id"]
                 devices = result["devices"]
+
+                # If the remote server has more than ~1000 devices for this user
+                # we assume that something is going horribly wrong (e.g. a bot
+                # that logs in and creates a new device every time it tries to
+                # send a message).  Maintaining lots of devices per user in the
+                # cache can cause serious performance issues as if this request
+                # takes more than 60s to complete, internal replication from the
+                # inbound federation worker to the synapse master may time out
+                # causing the inbound federation to fail and causing the remote
+                # server to retry, causing a DoS.  So in this scenario we give
+                # up on storing the total list of devices and only handle the
+                # delta instead.
+                if len(devices) > 1000:
+                    logger.warn(
+                        "Ignoring device list snapshot for %s as it has >1K devs (%d)",
+                        user_id, len(devices)
+                    )
+                    devices = []
+
                 yield self.store.update_remote_device_list_cache(
                     user_id, devices, stream_id,
                 )