summary refs log tree commit diff
diff options
context:
space:
mode:
-rw-r--r--changelog.d/10272.bugfix1
-rw-r--r--synapse/federation/federation_server.py98
-rw-r--r--synapse/storage/databases/main/event_federation.py109
-rw-r--r--synapse/storage/schema/main/delta/59/16federation_inbound_staging.sql32
-rw-r--r--sytest-blacklist6
5 files changed, 241 insertions, 5 deletions
diff --git a/changelog.d/10272.bugfix b/changelog.d/10272.bugfix
new file mode 100644
index 0000000000..3cefa05788
--- /dev/null
+++ b/changelog.d/10272.bugfix
@@ -0,0 +1 @@
+Handle inbound events from federation asynchronously.
diff --git a/synapse/federation/federation_server.py b/synapse/federation/federation_server.py
index 2b07f18529..1d050e54e2 100644
--- a/synapse/federation/federation_server.py
+++ b/synapse/federation/federation_server.py
@@ -44,7 +44,7 @@ from synapse.api.errors import (
     SynapseError,
     UnsupportedRoomVersionError,
 )
-from synapse.api.room_versions import KNOWN_ROOM_VERSIONS
+from synapse.api.room_versions import KNOWN_ROOM_VERSIONS, RoomVersion
 from synapse.events import EventBase
 from synapse.federation.federation_base import FederationBase, event_from_pdu_json
 from synapse.federation.persistence import TransactionActions
@@ -57,10 +57,12 @@ from synapse.logging.context import (
 )
 from synapse.logging.opentracing import log_kv, start_active_span_from_edu, trace
 from synapse.logging.utils import log_function
+from synapse.metrics.background_process_metrics import wrap_as_background_process
 from synapse.replication.http.federation import (
     ReplicationFederationSendEduRestServlet,
     ReplicationGetQueryRestServlet,
 )
+from synapse.storage.databases.main.lock import Lock
 from synapse.types import JsonDict
 from synapse.util import glob_to_regex, json_decoder, unwrapFirstError
 from synapse.util.async_helpers import Linearizer, concurrently_execute
@@ -96,6 +98,11 @@ last_pdu_ts_metric = Gauge(
 )
 
 
+# The name of the lock to use when process events in a room received over
+# federation.
+_INBOUND_EVENT_HANDLING_LOCK_NAME = "federation_inbound_pdu"
+
+
 class FederationServer(FederationBase):
     def __init__(self, hs: "HomeServer"):
         super().__init__(hs)
@@ -834,7 +841,94 @@ class FederationServer(FederationBase):
         except SynapseError as e:
             raise FederationError("ERROR", e.code, e.msg, affected=pdu.event_id)
 
-        await self.handler.on_receive_pdu(origin, pdu, sent_to_us_directly=True)
+        # Add the event to our staging area
+        await self.store.insert_received_event_to_staging(origin, pdu)
+
+        # Try and acquire the processing lock for the room, if we get it start a
+        # background process for handling the events in the room.
+        lock = await self.store.try_acquire_lock(
+            _INBOUND_EVENT_HANDLING_LOCK_NAME, pdu.room_id
+        )
+        if lock:
+            self._process_incoming_pdus_in_room_inner(
+                pdu.room_id, room_version, lock, origin, pdu
+            )
+
+    @wrap_as_background_process("_process_incoming_pdus_in_room_inner")
+    async def _process_incoming_pdus_in_room_inner(
+        self,
+        room_id: str,
+        room_version: RoomVersion,
+        lock: Lock,
+        latest_origin: str,
+        latest_event: EventBase,
+    ) -> None:
+        """Process events in the staging area for the given room.
+
+        The latest_origin and latest_event args are the latest origin and event
+        received.
+        """
+
+        # The common path is for the event we just received be the only event in
+        # the room, so instead of pulling the event out of the DB and parsing
+        # the event we just pull out the next event ID and check if that matches.
+        next_origin, next_event_id = await self.store.get_next_staged_event_id_for_room(
+            room_id
+        )
+        if next_origin == latest_origin and next_event_id == latest_event.event_id:
+            origin = latest_origin
+            event = latest_event
+        else:
+            next = await self.store.get_next_staged_event_for_room(
+                room_id, room_version
+            )
+            if not next:
+                return
+
+            origin, event = next
+
+        # We loop round until there are no more events in the room in the
+        # staging area, or we fail to get the lock (which means another process
+        # has started processing).
+        while True:
+            async with lock:
+                try:
+                    await self.handler.on_receive_pdu(
+                        origin, event, sent_to_us_directly=True
+                    )
+                except FederationError as e:
+                    # XXX: Ideally we'd inform the remote we failed to process
+                    # the event, but we can't return an error in the transaction
+                    # response (as we've already responded).
+                    logger.warning("Error handling PDU %s: %s", event.event_id, e)
+                except Exception:
+                    f = failure.Failure()
+                    logger.error(
+                        "Failed to handle PDU %s",
+                        event.event_id,
+                        exc_info=(f.type, f.value, f.getTracebackObject()),  # type: ignore
+                    )
+
+                await self.store.remove_received_event_from_staging(
+                    origin, event.event_id
+                )
+
+            # We need to do this check outside the lock to avoid a race between
+            # a new event being inserted by another instance and it attempting
+            # to acquire the lock.
+            next = await self.store.get_next_staged_event_for_room(
+                room_id, room_version
+            )
+            if not next:
+                break
+
+            origin, event = next
+
+            lock = await self.store.try_acquire_lock(
+                _INBOUND_EVENT_HANDLING_LOCK_NAME, room_id
+            )
+            if not lock:
+                return
 
     def __str__(self) -> str:
         return "<ReplicationLayer(%s)>" % self.server_name
diff --git a/synapse/storage/databases/main/event_federation.py b/synapse/storage/databases/main/event_federation.py
index c0ea445550..f23f8c6ecf 100644
--- a/synapse/storage/databases/main/event_federation.py
+++ b/synapse/storage/databases/main/event_federation.py
@@ -14,18 +14,20 @@
 import itertools
 import logging
 from queue import Empty, PriorityQueue
-from typing import Collection, Dict, Iterable, List, Set, Tuple
+from typing import Collection, Dict, Iterable, List, Optional, Set, Tuple
 
 from synapse.api.constants import MAX_DEPTH
 from synapse.api.errors import StoreError
-from synapse.events import EventBase
+from synapse.api.room_versions import RoomVersion
+from synapse.events import EventBase, make_event_from_dict
 from synapse.metrics.background_process_metrics import wrap_as_background_process
-from synapse.storage._base import SQLBaseStore, make_in_list_sql_clause
+from synapse.storage._base import SQLBaseStore, db_to_json, make_in_list_sql_clause
 from synapse.storage.database import DatabasePool, LoggingTransaction
 from synapse.storage.databases.main.events_worker import EventsWorkerStore
 from synapse.storage.databases.main.signatures import SignatureWorkerStore
 from synapse.storage.engines import PostgresEngine
 from synapse.storage.types import Cursor
+from synapse.util import json_encoder
 from synapse.util.caches.descriptors import cached
 from synapse.util.caches.lrucache import LruCache
 from synapse.util.iterutils import batch_iter
@@ -1044,6 +1046,107 @@ class EventFederationWorkerStore(EventsWorkerStore, SignatureWorkerStore, SQLBas
             _delete_old_forward_extrem_cache_txn,
         )
 
+    async def insert_received_event_to_staging(
+        self, origin: str, event: EventBase
+    ) -> None:
+        """Insert a newly received event from federation into the staging area."""
+
+        # We use an upsert here to handle the case where we see the same event
+        # from the same server multiple times.
+        await self.db_pool.simple_upsert(
+            table="federation_inbound_events_staging",
+            keyvalues={
+                "origin": origin,
+                "event_id": event.event_id,
+            },
+            values={},
+            insertion_values={
+                "room_id": event.room_id,
+                "received_ts": self._clock.time_msec(),
+                "event_json": json_encoder.encode(event.get_dict()),
+                "internal_metadata": json_encoder.encode(
+                    event.internal_metadata.get_dict()
+                ),
+            },
+            desc="insert_received_event_to_staging",
+        )
+
+    async def remove_received_event_from_staging(
+        self,
+        origin: str,
+        event_id: str,
+    ) -> None:
+        """Remove the given event from the staging area"""
+        await self.db_pool.simple_delete(
+            table="federation_inbound_events_staging",
+            keyvalues={
+                "origin": origin,
+                "event_id": event_id,
+            },
+            desc="remove_received_event_from_staging",
+        )
+
+    async def get_next_staged_event_id_for_room(
+        self,
+        room_id: str,
+    ) -> Optional[Tuple[str, str]]:
+        """Get the next event ID in the staging area for the given room."""
+
+        def _get_next_staged_event_id_for_room_txn(txn):
+            sql = """
+                SELECT origin, event_id
+                FROM federation_inbound_events_staging
+                WHERE room_id = ?
+                ORDER BY received_ts ASC
+                LIMIT 1
+            """
+
+            txn.execute(sql, (room_id,))
+
+            return txn.fetchone()
+
+        return await self.db_pool.runInteraction(
+            "get_next_staged_event_id_for_room", _get_next_staged_event_id_for_room_txn
+        )
+
+    async def get_next_staged_event_for_room(
+        self,
+        room_id: str,
+        room_version: RoomVersion,
+    ) -> Optional[Tuple[str, EventBase]]:
+        """Get the next event in the staging area for the given room."""
+
+        def _get_next_staged_event_for_room_txn(txn):
+            sql = """
+                SELECT event_json, internal_metadata, origin
+                FROM federation_inbound_events_staging
+                WHERE room_id = ?
+                ORDER BY received_ts ASC
+                LIMIT 1
+            """
+            txn.execute(sql, (room_id,))
+
+            return txn.fetchone()
+
+        row = await self.db_pool.runInteraction(
+            "get_next_staged_event_for_room", _get_next_staged_event_for_room_txn
+        )
+
+        if not row:
+            return None
+
+        event_d = db_to_json(row[0])
+        internal_metadata_d = db_to_json(row[1])
+        origin = row[2]
+
+        event = make_event_from_dict(
+            event_dict=event_d,
+            room_version=room_version,
+            internal_metadata_dict=internal_metadata_d,
+        )
+
+        return origin, event
+
 
 class EventFederationStore(EventFederationWorkerStore):
     """Responsible for storing and serving up the various graphs associated
diff --git a/synapse/storage/schema/main/delta/59/16federation_inbound_staging.sql b/synapse/storage/schema/main/delta/59/16federation_inbound_staging.sql
new file mode 100644
index 0000000000..43bc5c025f
--- /dev/null
+++ b/synapse/storage/schema/main/delta/59/16federation_inbound_staging.sql
@@ -0,0 +1,32 @@
+/* Copyright 2021 The Matrix.org Foundation C.I.C
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+-- A staging area for newly received events over federation.
+--
+-- Note we may store the same event multiple times if it comes from different
+-- servers; this is to handle the case if we get a redacted and non-redacted
+-- versions of the event.
+CREATE TABLE federation_inbound_events_staging (
+    origin TEXT NOT NULL,
+    room_id TEXT NOT NULL,
+    event_id TEXT NOT NULL,
+    received_ts BIGINT NOT NULL,
+    event_json TEXT NOT NULL,
+    internal_metadata TEXT NOT NULL
+);
+
+CREATE INDEX federation_inbound_events_staging_room ON federation_inbound_events_staging(room_id, received_ts);
+CREATE UNIQUE INDEX federation_inbound_events_staging_instance_event ON federation_inbound_events_staging(origin, event_id);
diff --git a/sytest-blacklist b/sytest-blacklist
index de9986357b..89c4e828fd 100644
--- a/sytest-blacklist
+++ b/sytest-blacklist
@@ -41,3 +41,9 @@ We can't peek into rooms with invited history_visibility
 We can't peek into rooms with joined history_visibility
 Local users can peek by room alias
 Peeked rooms only turn up in the sync for the device who peeked them
+
+
+# Blacklisted due to changes made in #10272
+Outbound federation will ignore a missing event with bad JSON for room version 6
+Backfilled events whose prev_events are in a different room do not allow cross-room back-pagination
+Federation rejects inbound events where the prev_events cannot be found