diff --git a/synapse/handlers/federation_event.py b/synapse/handlers/federation_event.py
index ace7adcffb..9e065e1116 100644
--- a/synapse/handlers/federation_event.py
+++ b/synapse/handlers/federation_event.py
@@ -862,6 +862,9 @@ class FederationEventHandler:
self._sanity_check_event(event)
except SynapseError as err:
logger.warning("Event %s failed sanity check: %s", event_id, err)
+ await self._store.record_event_failed_pull_attempt(
+ event.room_id, event_id, str(err)
+ )
return
try:
@@ -897,6 +900,10 @@ class FederationEventHandler:
backfilled=backfilled,
)
except FederationError as e:
+ await self._store.record_event_failed_pull_attempt(
+ event.room_id, event_id, str(e)
+ )
+
if e.code == 403:
logger.warning("Pulled event %s failed history check.", event_id)
else:
diff --git a/synapse/storage/databases/main/event_federation.py b/synapse/storage/databases/main/event_federation.py
index ca47a22bf1..ef477978ed 100644
--- a/synapse/storage/databases/main/event_federation.py
+++ b/synapse/storage/databases/main/event_federation.py
@@ -1294,6 +1294,51 @@ class EventFederationWorkerStore(SignatureWorkerStore, EventsWorkerStore, SQLBas
return event_id_results
+ @trace
+ async def record_event_failed_pull_attempt(
+ self, room_id: str, event_id: str, cause: str
+ ) -> None:
+ """
+ Record when we fail to pull an event over federation.
+
+ This information allows us to be more intelligent when we decide to
+ retry (we don't need to fail over and over) and we can process that
+ event in the background so we don't block on it each time.
+
+ Args:
+ room_id: The room where the event failed to pull from
+ event_id: The event that failed to be fetched or processed
+ cause: The error message or reason that we failed to pull the event
+ """
+ await self.db_pool.runInteraction(
+ "record_event_failed_pull_attempt",
+ self._record_event_failed_pull_attempt_upsert_txn,
+ room_id,
+ event_id,
+ cause,
+ db_autocommit=True, # Safe as it's a single upsert
+ )
+
+ def _record_event_failed_pull_attempt_upsert_txn(
+ self,
+ txn: LoggingTransaction,
+ room_id: str,
+ event_id: str,
+ cause: str,
+ ) -> None:
+ sql = """
+ INSERT INTO event_failed_pull_attempts (
+ room_id, event_id, num_attempts, last_attempt_ts, last_cause
+ )
+ VALUES (?, ?, ?, ?, ?)
+ ON CONFLICT (room_id, event_id) DO UPDATE SET
+ num_attempts=event_failed_pull_attempts.num_attempts + 1,
+ last_attempt_ts=EXCLUDED.last_attempt_ts,
+ last_cause=EXCLUDED.last_cause;
+ """
+
+ txn.execute(sql, (room_id, event_id, 1, self._clock.time_msec(), cause))
+
async def get_missing_events(
self,
room_id: str,
diff --git a/synapse/storage/databases/main/events.py b/synapse/storage/databases/main/events.py
index c0b4080e4b..1b54a2eb57 100644
--- a/synapse/storage/databases/main/events.py
+++ b/synapse/storage/databases/main/events.py
@@ -2435,17 +2435,31 @@ class PersistEventsStore:
"DELETE FROM event_backward_extremities"
" WHERE event_id = ? AND room_id = ?"
)
+ backward_extremity_tuples_to_remove = [
+ (ev.event_id, ev.room_id)
+ for ev in events
+ if not ev.internal_metadata.is_outlier()
+ # If we encountered an event with no prev_events, then we might
+ # as well remove it now because it won't ever have anything else
+ # to backfill from.
+ or len(ev.prev_event_ids()) == 0
+ ]
txn.execute_batch(
query,
- [
- (ev.event_id, ev.room_id)
- for ev in events
- if not ev.internal_metadata.is_outlier()
- # If we encountered an event with no prev_events, then we might
- # as well remove it now because it won't ever have anything else
- # to backfill from.
- or len(ev.prev_event_ids()) == 0
- ],
+ backward_extremity_tuples_to_remove,
+ )
+
+ # Clear out the failed backfill attempts after we successfully pulled
+ # the event. Since we no longer need these events as backward
+ # extremities, it also means that they won't be backfilled from again so
+ # we no longer need to store the backfill attempts around it.
+ query = """
+ DELETE FROM event_failed_pull_attempts
+ WHERE event_id = ? and room_id = ?
+ """
+ txn.execute_batch(
+ query,
+ backward_extremity_tuples_to_remove,
)
diff --git a/synapse/storage/schema/__init__.py b/synapse/storage/schema/__init__.py
index 38c9532bfd..68e055c664 100644
--- a/synapse/storage/schema/__init__.py
+++ b/synapse/storage/schema/__init__.py
@@ -81,6 +81,8 @@ Changes in SCHEMA_VERSION = 72:
Changes in SCHEMA_VERSION = 73;
- thread_id column is added to event_push_actions, event_push_actions_staging
event_push_summary, receipts_linearized, and receipts_graph.
+ - Add table `event_failed_pull_attempts` to keep track when we fail to pull
+ events over federation.
"""
diff --git a/synapse/storage/schema/main/delta/73/01event_failed_pull_attempts.sql b/synapse/storage/schema/main/delta/73/01event_failed_pull_attempts.sql
new file mode 100644
index 0000000000..d397ee1082
--- /dev/null
+++ b/synapse/storage/schema/main/delta/73/01event_failed_pull_attempts.sql
@@ -0,0 +1,29 @@
+/* Copyright 2022 The Matrix.org Foundation C.I.C
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+-- Add a table that keeps track of when we failed to pull an event over
+-- federation (via /backfill, `/event`, `/get_missing_events`, etc). This allows
+-- us to be more intelligent when we decide to retry (we don't need to fail over
+-- and over) and we can process that event in the background so we don't block
+-- on it each time.
+CREATE TABLE IF NOT EXISTS event_failed_pull_attempts(
+ room_id TEXT NOT NULL REFERENCES rooms (room_id),
+ event_id TEXT NOT NULL,
+ num_attempts INT NOT NULL,
+ last_attempt_ts BIGINT NOT NULL,
+ last_cause TEXT NOT NULL,
+ PRIMARY KEY (room_id, event_id)
+);
|