summary refs log tree commit diff
diff options
context:
space:
mode:
authorErik Johnston <erikj@jki.re>2019-02-27 11:00:59 +0000
committerGitHub <noreply@github.com>2019-02-27 11:00:59 +0000
commit7590e9fa285afac717132b16cbcba4e8b19943dd (patch)
tree40ab9e389596559b7119054ffc62e57c64bf4f90
parent0.99.2rc1 (diff)
parentMove connecting logic into ClientReplicationStreamProtocol (diff)
downloadsynapse-7590e9fa285afac717132b16cbcba4e8b19943dd.tar.xz
Merge pull request #4749 from matrix-org/erikj/replication_connection_backoff
Fix tightloop over connecting to replication server
-rw-r--r--changelog.d/4749.bugfix1
-rw-r--r--docs/tcp_replication.rst4
-rw-r--r--synapse/replication/tcp/client.py22
-rw-r--r--synapse/replication/tcp/commands.py5
-rw-r--r--synapse/replication/tcp/protocol.py17
5 files changed, 43 insertions, 6 deletions
diff --git a/changelog.d/4749.bugfix b/changelog.d/4749.bugfix
new file mode 100644
index 0000000000..174e6b4e5e
--- /dev/null
+++ b/changelog.d/4749.bugfix
@@ -0,0 +1 @@
+Fix tightloop over connecting to replication server.
diff --git a/docs/tcp_replication.rst b/docs/tcp_replication.rst
index 73436cea62..75e723484c 100644
--- a/docs/tcp_replication.rst
+++ b/docs/tcp_replication.rst
@@ -188,7 +188,9 @@ RDATA (S)
     A single update in a stream
 
 POSITION (S)
-    The position of the stream has been updated
+    The position of the stream has been updated. Sent to the client after all
+    missing updates for a stream have been sent to the client and they're now
+    up to date.
 
 ERROR (S, C)
     There was an error
diff --git a/synapse/replication/tcp/client.py b/synapse/replication/tcp/client.py
index 586dddb40b..e558f90e1a 100644
--- a/synapse/replication/tcp/client.py
+++ b/synapse/replication/tcp/client.py
@@ -39,7 +39,7 @@ class ReplicationClientFactory(ReconnectingClientFactory):
     Accepts a handler that will be called when new data is available or data
     is required.
     """
-    maxDelay = 5  # Try at least once every N seconds
+    maxDelay = 30  # Try at least once every N seconds
 
     def __init__(self, hs, client_name, handler):
         self.client_name = client_name
@@ -54,7 +54,6 @@ class ReplicationClientFactory(ReconnectingClientFactory):
 
     def buildProtocol(self, addr):
         logger.info("Connected to replication: %r", addr)
-        self.resetDelay()
         return ClientReplicationStreamProtocol(
             self.client_name, self.server_name, self._clock, self.handler
         )
@@ -90,15 +89,18 @@ class ReplicationClientHandler(object):
         # Used for tests.
         self.awaiting_syncs = {}
 
+        # The factory used to create connections.
+        self.factory = None
+
     def start_replication(self, hs):
         """Helper method to start a replication connection to the remote server
         using TCP.
         """
         client_name = hs.config.worker_name
-        factory = ReplicationClientFactory(hs, client_name, self)
+        self.factory = ReplicationClientFactory(hs, client_name, self)
         host = hs.config.worker_replication_host
         port = hs.config.worker_replication_port
-        hs.get_reactor().connectTCP(host, port, factory)
+        hs.get_reactor().connectTCP(host, port, self.factory)
 
     def on_rdata(self, stream_name, token, rows):
         """Called when we get new replication data. By default this just pokes
@@ -140,6 +142,7 @@ class ReplicationClientHandler(object):
             args["account_data"] = user_account_data
         elif room_account_data:
             args["account_data"] = room_account_data
+
         return args
 
     def get_currently_syncing_users(self):
@@ -204,3 +207,14 @@ class ReplicationClientHandler(object):
             for cmd in self.pending_commands:
                 connection.send_command(cmd)
             self.pending_commands = []
+
+    def finished_connecting(self):
+        """Called when we have successfully subscribed and caught up to all
+        streams we're interested in.
+        """
+        logger.info("Finished connecting to server")
+
+        # We don't reset the delay any earlier as otherwise if there is a
+        # problem during start up we'll end up tight looping connecting to the
+        # server.
+        self.factory.resetDelay()
diff --git a/synapse/replication/tcp/commands.py b/synapse/replication/tcp/commands.py
index 327556f6a1..2098c32a77 100644
--- a/synapse/replication/tcp/commands.py
+++ b/synapse/replication/tcp/commands.py
@@ -127,8 +127,11 @@ class RdataCommand(Command):
 
 
 class PositionCommand(Command):
-    """Sent by the client to tell the client the stream postition without
+    """Sent by the server to tell the client the stream postition without
     needing to send an RDATA.
+
+    Sent to the client after all missing updates for a stream have been sent
+    to the client and they're now up to date.
     """
     NAME = "POSITION"
 
diff --git a/synapse/replication/tcp/protocol.py b/synapse/replication/tcp/protocol.py
index 429471c345..49ae5b3355 100644
--- a/synapse/replication/tcp/protocol.py
+++ b/synapse/replication/tcp/protocol.py
@@ -526,6 +526,11 @@ class ClientReplicationStreamProtocol(BaseReplicationStreamProtocol):
         self.server_name = server_name
         self.handler = handler
 
+        # Set of stream names that have been subscribe to, but haven't yet
+        # caught up with. This is used to track when the client has been fully
+        # connected to the remote.
+        self.streams_connecting = set()
+
         # Map of stream to batched updates. See RdataCommand for info on how
         # batching works.
         self.pending_batches = {}
@@ -548,6 +553,10 @@ class ClientReplicationStreamProtocol(BaseReplicationStreamProtocol):
         # We've now finished connecting to so inform the client handler
         self.handler.update_connection(self)
 
+        # This will happen if we don't actually subscribe to any streams
+        if not self.streams_connecting:
+            self.handler.finished_connecting()
+
     def on_SERVER(self, cmd):
         if cmd.data != self.server_name:
             logger.error("[%s] Connected to wrong remote: %r", self.id(), cmd.data)
@@ -577,6 +586,12 @@ class ClientReplicationStreamProtocol(BaseReplicationStreamProtocol):
             return self.handler.on_rdata(stream_name, cmd.token, rows)
 
     def on_POSITION(self, cmd):
+        # When we get a `POSITION` command it means we've finished getting
+        # missing updates for the given stream, and are now up to date.
+        self.streams_connecting.discard(cmd.stream_name)
+        if not self.streams_connecting:
+            self.handler.finished_connecting()
+
         return self.handler.on_position(cmd.stream_name, cmd.token)
 
     def on_SYNC(self, cmd):
@@ -593,6 +608,8 @@ class ClientReplicationStreamProtocol(BaseReplicationStreamProtocol):
             self.id(), stream_name, token
         )
 
+        self.streams_connecting.add(stream_name)
+
         self.send_command(ReplicateCommand(stream_name, token))
 
     def on_connection_closed(self):