Keep track when we try and fail to process a pulled event (#13589)

We can follow-up this PR with: 1. Only try to backfill from an event if we haven't tried recently -> https://github.com/matrix-org/synapse/issues/13622 1. When we decide to backfill that event again, process it in the background so it doesn't block and make `/messages` slow when we know it will probably fail again -> https://github.com/matrix-org/synapse/issues/13623 1. Generally track failures everywhere we try and fail to pull an event over federation -> https://github.com/matrix-org/synapse/issues/13700 Fix https://github.com/matrix-org/synapse/issues/13621 Part of https://github.com/matrix-org/synapse/issues/13356 Mentioned in [internal doc](https://docs.google.com/document/d/1lvUoVfYUiy6UaHB6Rb4HicjaJAU40-APue9Q4vzuW3c/edit#bookmark=id.qv7cj51sv9i5)
author: Eric Eastwood <erice@element.io> 2022-09-14 13:57:50 -0500
committer: GitHub <noreply@github.com> 2022-09-14 13:57:50 -0500
commit: 957e3d74fc70f92bb9ed3c709f87752bf77a8c90 (patch)
tree: cf177f3d8a17f7bade6c9da71ddc14239f02d196 /synapse/storage/databases
parent: Update event push action and receipt tables to support threads. (#13753) (diff)
download: synapse-957e3d74fc70f92bb9ed3c709f87752bf77a8c90.tar.xz
2 files changed, 68 insertions, 9 deletions
diff --git a/synapse/storage/databases/main/event_federation.py b/synapse/storage/databases/main/event_federation.py
index ca47a22bf1..ef477978ed 100644
--- a/synapse/storage/databases/main/event_federation.py
+++ b/synapse/storage/databases/main/event_federation.py
@@ -1294,6 +1294,51 @@ class EventFederationWorkerStore(SignatureWorkerStore, EventsWorkerStore, SQLBas
 
         return event_id_results
 
+    @trace
+    async def record_event_failed_pull_attempt(
+        self, room_id: str, event_id: str, cause: str
+    ) -> None:
+        """
+        Record when we fail to pull an event over federation.
+
+        This information allows us to be more intelligent when we decide to
+        retry (we don't need to fail over and over) and we can process that
+        event in the background so we don't block on it each time.
+
+        Args:
+            room_id: The room where the event failed to pull from
+            event_id: The event that failed to be fetched or processed
+            cause: The error message or reason that we failed to pull the event
+        """
+        await self.db_pool.runInteraction(
+            "record_event_failed_pull_attempt",
+            self._record_event_failed_pull_attempt_upsert_txn,
+            room_id,
+            event_id,
+            cause,
+            db_autocommit=True,  # Safe as it's a single upsert
+        )
+
+    def _record_event_failed_pull_attempt_upsert_txn(
+        self,
+        txn: LoggingTransaction,
+        room_id: str,
+        event_id: str,
+        cause: str,
+    ) -> None:
+        sql = """
+            INSERT INTO event_failed_pull_attempts (
+                room_id, event_id, num_attempts, last_attempt_ts, last_cause
+            )
+                VALUES (?, ?, ?, ?, ?)
+            ON CONFLICT (room_id, event_id) DO UPDATE SET
+                num_attempts=event_failed_pull_attempts.num_attempts + 1,
+                last_attempt_ts=EXCLUDED.last_attempt_ts,
+                last_cause=EXCLUDED.last_cause;
+        """
+
+        txn.execute(sql, (room_id, event_id, 1, self._clock.time_msec(), cause))
+
     async def get_missing_events(
         self,
         room_id: str,
diff --git a/synapse/storage/databases/main/events.py b/synapse/storage/databases/main/events.py
index c0b4080e4b..1b54a2eb57 100644
--- a/synapse/storage/databases/main/events.py
+++ b/synapse/storage/databases/main/events.py
@@ -2435,17 +2435,31 @@ class PersistEventsStore:
             "DELETE FROM event_backward_extremities"
             " WHERE event_id = ? AND room_id = ?"
         )
+        backward_extremity_tuples_to_remove = [
+            (ev.event_id, ev.room_id)
+            for ev in events
+            if not ev.internal_metadata.is_outlier()
+            # If we encountered an event with no prev_events, then we might
+            # as well remove it now because it won't ever have anything else
+            # to backfill from.
+            or len(ev.prev_event_ids()) == 0
+        ]
         txn.execute_batch(
             query,
-            [
-                (ev.event_id, ev.room_id)
-                for ev in events
-                if not ev.internal_metadata.is_outlier()
-                # If we encountered an event with no prev_events, then we might
-                # as well remove it now because it won't ever have anything else
-                # to backfill from.
-                or len(ev.prev_event_ids()) == 0
-            ],
+            backward_extremity_tuples_to_remove,
+        )
+
+        # Clear out the failed backfill attempts after we successfully pulled
+        # the event. Since we no longer need these events as backward
+        # extremities, it also means that they won't be backfilled from again so
+        # we no longer need to store the backfill attempts around it.
+        query = """
+            DELETE FROM event_failed_pull_attempts
+            WHERE event_id = ? and room_id = ?
+        """
+        txn.execute_batch(
+            query,
+            backward_extremity_tuples_to_remove,
         )
author	Eric Eastwood <erice@element.io>	2022-09-14 13:57:50 -0500
committer	GitHub <noreply@github.com>	2022-09-14 13:57:50 -0500
commit	957e3d74fc70f92bb9ed3c709f87752bf77a8c90 (patch)
tree	cf177f3d8a17f7bade6c9da71ddc14239f02d196 /synapse/storage/databases
parent	Update event push action and receipt tables to support threads. (#13753) (diff)
download	synapse-957e3d74fc70f92bb9ed3c709f87752bf77a8c90.tar.xz