2 files changed, 3 insertions, 550 deletions
diff --git a/synapse/replication/tcp/__init__.py b/synapse/replication/tcp/__init__.py
index 2c5f5f0bf8..05ded3dc82 100644
--- a/synapse/replication/tcp/__init__.py
+++ b/synapse/replication/tcp/__init__.py
@@ -41,7 +41,7 @@ The general interaction of the classes are:
                     v |
             +-------------+
             | Protocols   |
-            | (TCP/redis) |
+            | (Redis)     |
             +-------------+
 
 Where the ReplicationDataHandler (or subclasses) handles incoming stream
diff --git a/synapse/replication/tcp/protocol.py b/synapse/replication/tcp/protocol.py
index 56a5c21910..772bb1f264 100644
--- a/synapse/replication/tcp/protocol.py
+++ b/synapse/replication/tcp/protocol.py
@@ -11,82 +11,10 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-"""This module contains the implementation of both the client and server
-protocols.
-
-The basic structure of the protocol is line based, where the initial word of
-each line specifies the command. The rest of the line is parsed based on the
-command. For example, the `RDATA` command is defined as::
-
-    RDATA <stream_name> <token> <row_json>
-
-(Note that `<row_json>` may contains spaces, but cannot contain newlines.)
-
-Blank lines are ignored.
-
-# Example
-
-An example iteraction is shown below. Each line is prefixed with '>' or '<' to
-indicate which side is sending, these are *not* included on the wire::
-
-    * connection established *
-    > SERVER localhost:8823
-    > PING 1490197665618
-    < NAME synapse.app.appservice
-    < PING 1490197665618
-    < REPLICATE
-    > POSITION events 1
-    > POSITION backfill 1
-    > POSITION caches 1
-    > RDATA caches 2 ["get_user_by_id",["@01register-user:localhost:8823"],1490197670513]
-    > RDATA events 14 ["ev", ["$149019767112vOHxz:localhost:8823",
-        "!AFDCvgApUmpdfVjIXm:localhost:8823","m.room.guest_access","",null]]
-    < PING 1490197675618
-    > ERROR server stopping
-    * connection closed by server *
-"""
-import fcntl
-import logging
-import struct
-from inspect import isawaitable
-from typing import TYPE_CHECKING, Any, Collection, List, Optional
-
 from prometheus_client import Counter
-from zope.interface import Interface, implementer
+from zope.interface import Interface
 
-from twisted.internet import task
-from twisted.internet.tcp import Connection
-from twisted.protocols.basic import LineOnlyReceiver
-from twisted.python.failure import Failure
-
-from synapse.logging.context import PreserveLoggingContext
-from synapse.metrics import LaterGauge
-from synapse.metrics.background_process_metrics import (
-    BackgroundProcessLoggingContext,
-    run_as_background_process,
-)
-from synapse.replication.tcp.commands import (
-    VALID_CLIENT_COMMANDS,
-    VALID_SERVER_COMMANDS,
-    Command,
-    ErrorCommand,
-    NameCommand,
-    PingCommand,
-    ReplicateCommand,
-    ServerCommand,
-    parse_command_from_line,
-)
-from synapse.util import Clock
-from synapse.util.stringutils import random_string
-
-if TYPE_CHECKING:
-    from synapse.replication.tcp.handler import ReplicationCommandHandler
-    from synapse.server import HomeServer
-
-
-connection_close_counter = Counter(
-    "synapse_replication_tcp_protocol_close_reason", "", ["reason_type"]
-)
+from synapse.replication.tcp.commands import Command
 
 tcp_inbound_commands_counter = Counter(
     "synapse_replication_tcp_protocol_inbound_commands",
@@ -100,484 +28,9 @@ tcp_outbound_commands_counter = Counter(
     ["command", "name"],
 )
 
-# A list of all connected protocols. This allows us to send metrics about the
-# connections.
-connected_connections: "List[BaseReplicationStreamProtocol]" = []
-
-
-logger = logging.getLogger(__name__)
-
-
-PING_TIME = 5000
-PING_TIMEOUT_MULTIPLIER = 5
-PING_TIMEOUT_MS = PING_TIME * PING_TIMEOUT_MULTIPLIER
-
-
-class ConnectionStates:
-    CONNECTING = "connecting"
-    ESTABLISHED = "established"
-    PAUSED = "paused"
-    CLOSED = "closed"
-
 
 class IReplicationConnection(Interface):
     """An interface for replication connections."""
 
     def send_command(cmd: Command) -> None:
         """Send the command down the connection"""
-
-
-@implementer(IReplicationConnection)
-class BaseReplicationStreamProtocol(LineOnlyReceiver):
-    """Base replication protocol shared between client and server.
-
-    Reads lines (ignoring blank ones) and parses them into command classes,
-    asserting that they are valid for the given direction, i.e. server commands
-    are only sent by the server.
-
-    On receiving a new command it calls `on_<COMMAND_NAME>` with the parsed
-    command before delegating to `ReplicationCommandHandler.on_<COMMAND_NAME>`.
-    `ReplicationCommandHandler.on_<COMMAND_NAME>` can optionally return a coroutine;
-    if so, that will get run as a background process.
-
-    It also sends `PING` periodically, and correctly times out remote connections
-    (if they send a `PING` command)
-    """
-
-    # The transport is going to be an ITCPTransport, but that doesn't have the
-    # (un)registerProducer methods, those are only on the implementation.
-    transport: Connection
-
-    delimiter = b"\n"
-
-    # Valid commands we expect to receive
-    VALID_INBOUND_COMMANDS: Collection[str] = []
-
-    # Valid commands we can send
-    VALID_OUTBOUND_COMMANDS: Collection[str] = []
-
-    max_line_buffer = 10000
-
-    def __init__(self, clock: Clock, handler: "ReplicationCommandHandler"):
-        self.clock = clock
-        self.command_handler = handler
-
-        self.last_received_command = self.clock.time_msec()
-        self.last_sent_command = 0
-        # When we requested the connection be closed
-        self.time_we_closed: Optional[int] = None
-
-        self.received_ping = False  # Have we received a ping from the other side
-
-        self.state = ConnectionStates.CONNECTING
-
-        self.name = "anon"  # The name sent by a client.
-        self.conn_id = random_string(5)  # To dedupe in case of name clashes.
-
-        # List of pending commands to send once we've established the connection
-        self.pending_commands: List[Command] = []
-
-        # The LoopingCall for sending pings.
-        self._send_ping_loop: Optional[task.LoopingCall] = None
-
-        # a logcontext which we use for processing incoming commands. We declare it as a
-        # background process so that the CPU stats get reported to prometheus.
-        with PreserveLoggingContext():
-            # thanks to `PreserveLoggingContext()`, the new logcontext is guaranteed to
-            # capture the sentinel context as its containing context and won't prevent
-            # GC of / unintentionally reactivate what would be the current context.
-            self._logging_context = BackgroundProcessLoggingContext(
-                "replication-conn", self.conn_id
-            )
-
-    def connectionMade(self) -> None:
-        logger.info("[%s] Connection established", self.id())
-
-        self.state = ConnectionStates.ESTABLISHED
-
-        connected_connections.append(self)  # Register connection for metrics
-
-        assert self.transport is not None
-        self.transport.registerProducer(self, True)  # For the *Producing callbacks
-
-        self._send_pending_commands()
-
-        # Starts sending pings
-        self._send_ping_loop = self.clock.looping_call(self.send_ping, 5000)
-
-        # Always send the initial PING so that the other side knows that they
-        # can time us out.
-        self.send_command(PingCommand(str(self.clock.time_msec())))
-
-        self.command_handler.new_connection(self)
-
-    def send_ping(self) -> None:
-        """Periodically sends a ping and checks if we should close the connection
-        due to the other side timing out.
-        """
-        now = self.clock.time_msec()
-
-        if self.time_we_closed:
-            if now - self.time_we_closed > PING_TIMEOUT_MS:
-                logger.info(
-                    "[%s] Failed to close connection gracefully, aborting", self.id()
-                )
-                assert self.transport is not None
-                self.transport.abortConnection()
-        else:
-            if now - self.last_sent_command >= PING_TIME:
-                self.send_command(PingCommand(str(now)))
-
-            if (
-                self.received_ping
-                and now - self.last_received_command > PING_TIMEOUT_MS
-            ):
-                logger.info(
-                    "[%s] Connection hasn't received command in %r ms. Closing.",
-                    self.id(),
-                    now - self.last_received_command,
-                )
-                self.send_error("ping timeout")
-
-    def lineReceived(self, line: bytes) -> None:
-        """Called when we've received a line"""
-        with PreserveLoggingContext(self._logging_context):
-            self._parse_and_dispatch_line(line)
-
-    def _parse_and_dispatch_line(self, line: bytes) -> None:
-        if line.strip() == b"":
-            # Ignore blank lines
-            return
-
-        linestr = line.decode("utf-8")
-
-        try:
-            cmd = parse_command_from_line(linestr)
-        except Exception as e:
-            logger.exception("[%s] failed to parse line: %r", self.id(), linestr)
-            self.send_error("failed to parse line: %r (%r):" % (e, linestr))
-            return
-
-        if cmd.NAME not in self.VALID_INBOUND_COMMANDS:
-            logger.error("[%s] invalid command %s", self.id(), cmd.NAME)
-            self.send_error("invalid command: %s", cmd.NAME)
-            return
-
-        self.last_received_command = self.clock.time_msec()
-
-        tcp_inbound_commands_counter.labels(cmd.NAME, self.name).inc()
-
-        self.handle_command(cmd)
-
-    def handle_command(self, cmd: Command) -> None:
-        """Handle a command we have received over the replication stream.
-
-        First calls `self.on_<COMMAND>` if it exists, then calls
-        `self.command_handler.on_<COMMAND>` if it exists (which can optionally
-        return an Awaitable).
-
-        This allows for protocol level handling of commands (e.g. PINGs), before
-        delegating to the handler.
-
-        Args:
-            cmd: received command
-        """
-        handled = False
-
-        # First call any command handlers on this instance. These are for TCP
-        # specific handling.
-        cmd_func = getattr(self, "on_%s" % (cmd.NAME,), None)
-        if cmd_func:
-            cmd_func(cmd)
-            handled = True
-
-        # Then call out to the handler.
-        cmd_func = getattr(self.command_handler, "on_%s" % (cmd.NAME,), None)
-        if cmd_func:
-            res = cmd_func(self, cmd)
-
-            # the handler might be a coroutine: fire it off as a background process
-            # if so.
-
-            if isawaitable(res):
-                run_as_background_process(
-                    "replication-" + cmd.get_logcontext_id(), lambda: res
-                )
-
-            handled = True
-
-        if not handled:
-            logger.warning("Unhandled command: %r", cmd)
-
-    def close(self) -> None:
-        logger.warning("[%s] Closing connection", self.id())
-        self.time_we_closed = self.clock.time_msec()
-        assert self.transport is not None
-        self.transport.loseConnection()
-        self.on_connection_closed()
-
-    def send_error(self, error_string: str, *args: Any) -> None:
-        """Send an error to remote and close the connection."""
-        self.send_command(ErrorCommand(error_string % args))
-        self.close()
-
-    def send_command(self, cmd: Command, do_buffer: bool = True) -> None:
-        """Send a command if connection has been established.
-
-        Args:
-            cmd
-            do_buffer: Whether to buffer the message or always attempt
-                to send the command. This is mostly used to send an error
-                message if we're about to close the connection due our buffers
-                becoming full.
-        """
-        if self.state == ConnectionStates.CLOSED:
-            logger.debug("[%s] Not sending, connection closed", self.id())
-            return
-
-        if do_buffer and self.state != ConnectionStates.ESTABLISHED:
-            self._queue_command(cmd)
-            return
-
-        tcp_outbound_commands_counter.labels(cmd.NAME, self.name).inc()
-
-        string = "%s %s" % (cmd.NAME, cmd.to_line())
-        if "\n" in string:
-            raise Exception("Unexpected newline in command: %r", string)
-
-        encoded_string = string.encode("utf-8")
-
-        if len(encoded_string) > self.MAX_LENGTH:
-            raise Exception(
-                "Failed to send command %s as too long (%d > %d)"
-                % (cmd.NAME, len(encoded_string), self.MAX_LENGTH)
-            )
-
-        self.sendLine(encoded_string)
-
-        self.last_sent_command = self.clock.time_msec()
-
-    def _queue_command(self, cmd: Command) -> None:
-        """Queue the command until the connection is ready to write to again."""
-        logger.debug("[%s] Queueing as conn %r, cmd: %r", self.id(), self.state, cmd)
-        self.pending_commands.append(cmd)
-
-        if len(self.pending_commands) > self.max_line_buffer:
-            # The other side is failing to keep up and out buffers are becoming
-            # full, so lets close the connection.
-            # XXX: should we squawk more loudly?
-            logger.error("[%s] Remote failed to keep up", self.id())
-            self.send_command(ErrorCommand("Failed to keep up"), do_buffer=False)
-            self.close()
-
-    def _send_pending_commands(self) -> None:
-        """Send any queued commandes"""
-        pending = self.pending_commands
-        self.pending_commands = []
-        for cmd in pending:
-            self.send_command(cmd)
-
-    def on_PING(self, cmd: PingCommand) -> None:
-        self.received_ping = True
-
-    def on_ERROR(self, cmd: ErrorCommand) -> None:
-        logger.error("[%s] Remote reported error: %r", self.id(), cmd.data)
-
-    def pauseProducing(self) -> None:
-        """This is called when both the kernel send buffer and the twisted
-        tcp connection send buffers have become full.
-
-        We don't actually have any control over those sizes, so we buffer some
-        commands ourselves before knifing the connection due to the remote
-        failing to keep up.
-        """
-        logger.info("[%s] Pause producing", self.id())
-        self.state = ConnectionStates.PAUSED
-
-    def resumeProducing(self) -> None:
-        """The remote has caught up after we started buffering!"""
-        logger.info("[%s] Resume producing", self.id())
-        self.state = ConnectionStates.ESTABLISHED
-        self._send_pending_commands()
-
-    def stopProducing(self) -> None:
-        """We're never going to send any more data (normally because either
-        we or the remote has closed the connection)
-        """
-        logger.info("[%s] Stop producing", self.id())
-        self.on_connection_closed()
-
-    def connectionLost(self, reason: Failure) -> None:  # type: ignore[override]
-        logger.info("[%s] Replication connection closed: %r", self.id(), reason)
-        if isinstance(reason, Failure):
-            assert reason.type is not None
-            connection_close_counter.labels(reason.type.__name__).inc()
-        else:
-            connection_close_counter.labels(reason.__class__.__name__).inc()  # type: ignore[unreachable]
-
-        try:
-            # Remove us from list of connections to be monitored
-            connected_connections.remove(self)
-        except ValueError:
-            pass
-
-        # Stop the looping call sending pings.
-        if self._send_ping_loop and self._send_ping_loop.running:
-            self._send_ping_loop.stop()
-
-        self.on_connection_closed()
-
-    def on_connection_closed(self) -> None:
-        logger.info("[%s] Connection was closed", self.id())
-
-        self.state = ConnectionStates.CLOSED
-        self.pending_commands = []
-
-        self.command_handler.lost_connection(self)
-
-        if self.transport:
-            self.transport.unregisterProducer()
-
-        # mark the logging context as finished by triggering `__exit__()`
-        with PreserveLoggingContext():
-            with self._logging_context:
-                pass
-            # the sentinel context is now active, which may not be correct.
-            # PreserveLoggingContext() will restore the correct logging context.
-
-    def __str__(self) -> str:
-        addr = None
-        if self.transport:
-            addr = str(self.transport.getPeer())
-        return "ReplicationConnection<name=%s,conn_id=%s,addr=%s>" % (
-            self.name,
-            self.conn_id,
-            addr,
-        )
-
-    def id(self) -> str:
-        return "%s-%s" % (self.name, self.conn_id)
-
-    def lineLengthExceeded(self, line: str) -> None:
-        """Called when we receive a line that is above the maximum line length"""
-        self.send_error("Line length exceeded")
-
-
-class ServerReplicationStreamProtocol(BaseReplicationStreamProtocol):
-    VALID_INBOUND_COMMANDS = VALID_CLIENT_COMMANDS
-    VALID_OUTBOUND_COMMANDS = VALID_SERVER_COMMANDS
-
-    def __init__(
-        self, server_name: str, clock: Clock, handler: "ReplicationCommandHandler"
-    ):
-        super().__init__(clock, handler)
-
-        self.server_name = server_name
-
-    def connectionMade(self) -> None:
-        self.send_command(ServerCommand(self.server_name))
-        super().connectionMade()
-
-    def on_NAME(self, cmd: NameCommand) -> None:
-        logger.info("[%s] Renamed to %r", self.id(), cmd.data)
-        self.name = cmd.data
-
-
-class ClientReplicationStreamProtocol(BaseReplicationStreamProtocol):
-    VALID_INBOUND_COMMANDS = VALID_SERVER_COMMANDS
-    VALID_OUTBOUND_COMMANDS = VALID_CLIENT_COMMANDS
-
-    def __init__(
-        self,
-        hs: "HomeServer",
-        client_name: str,
-        server_name: str,
-        clock: Clock,
-        command_handler: "ReplicationCommandHandler",
-    ):
-        super().__init__(clock, command_handler)
-
-        self.client_name = client_name
-        self.server_name = server_name
-
-    def connectionMade(self) -> None:
-        self.send_command(NameCommand(self.client_name))
-        super().connectionMade()
-
-        # Once we've connected subscribe to the necessary streams
-        self.replicate()
-
-    def on_SERVER(self, cmd: ServerCommand) -> None:
-        if cmd.data != self.server_name:
-            logger.error("[%s] Connected to wrong remote: %r", self.id(), cmd.data)
-            self.send_error("Wrong remote")
-
-    def replicate(self) -> None:
-        """Send the subscription request to the server"""
-        logger.info("[%s] Subscribing to replication streams", self.id())
-
-        self.send_command(ReplicateCommand())
-
-
-# The following simply registers metrics for the replication connections
-
-pending_commands = LaterGauge(
-    "synapse_replication_tcp_protocol_pending_commands",
-    "",
-    ["name"],
-    lambda: {(p.name,): len(p.pending_commands) for p in connected_connections},
-)
-
-
-def transport_buffer_size(protocol: BaseReplicationStreamProtocol) -> int:
-    if protocol.transport:
-        size = len(protocol.transport.dataBuffer) + protocol.transport._tempDataLen
-        return size
-    return 0
-
-
-transport_send_buffer = LaterGauge(
-    "synapse_replication_tcp_protocol_transport_send_buffer",
-    "",
-    ["name"],
-    lambda: {(p.name,): transport_buffer_size(p) for p in connected_connections},
-)
-
-
-def transport_kernel_read_buffer_size(
-    protocol: BaseReplicationStreamProtocol, read: bool = True
-) -> int:
-    SIOCINQ = 0x541B
-    SIOCOUTQ = 0x5411
-
-    if protocol.transport:
-        fileno = protocol.transport.getHandle().fileno()
-        if read:
-            op = SIOCINQ
-        else:
-            op = SIOCOUTQ
-        size = struct.unpack("I", fcntl.ioctl(fileno, op, b"\0\0\0\0"))[0]
-        return size
-    return 0
-
-
-tcp_transport_kernel_send_buffer = LaterGauge(
-    "synapse_replication_tcp_protocol_transport_kernel_send_buffer",
-    "",
-    ["name"],
-    lambda: {
-        (p.name,): transport_kernel_read_buffer_size(p, False)
-        for p in connected_connections
-    },
-)
-
-
-tcp_transport_kernel_read_buffer = LaterGauge(
-    "synapse_replication_tcp_protocol_transport_kernel_read_buffer",
-    "",
-    ["name"],
-    lambda: {
-        (p.name,): transport_kernel_read_buffer_size(p, True)
-        for p in connected_connections
-    },
-)