summary refs log tree commit diff
diff options
context:
space:
mode:
author: David Robertson <davidr@element.io> 2023-06-07 18:12:23 +0100
committer: GitHub <noreply@github.com> 2023-06-07 17:12:23 +0000
commit: d162aecaac52fb467822e319e4c3c5b216c33ca9 (patch)
tree: 8a92ee76319fb5ed9f0c26de7df0067758b46296
parent: Remove superfluous `room_memberships` join from background update (#15733) (diff)
download: synapse-d162aecaac52fb467822e319e4c3c5b216c33ca9.tar.xz
Quick & dirty metric for background update status (#15740)
* Quick & dirty metric for background update status

* Changelog

* Remove debug

Co-authored-by: Mathieu Velten <mathieuv@matrix.org>

* Actually write to _aborted

---------

Co-authored-by: Mathieu Velten <mathieuv@matrix.org>
-rw-r--r-- changelog.d/15740.feature 1
-rw-r--r-- synapse/metrics/__init__.py 2
-rw-r--r-- synapse/storage/background_updates.py 30
-rw-r--r-- synapse/storage/database.py 8
4 files changed, 40 insertions, 1 deletions
diff --git a/changelog.d/15740.feature b/changelog.d/15740.feature
new file mode 100644
index 0000000000..fed342ea55
--- /dev/null
+++ b/changelog.d/15740.feature
@@ -0,0 +1 @@
+Expose a metric reporting the database background update status.
diff --git a/synapse/metrics/__init__.py b/synapse/metrics/__init__.py
index 8ce5887229..39fc629937 100644
--- a/synapse/metrics/__init__.py
+++ b/synapse/metrics/__init__.py
@@ -77,6 +77,8 @@ RegistryProxy = cast(CollectorRegistry, _RegistryProxy)
 
 @attr.s(slots=True, hash=True, auto_attribs=True)
 class LaterGauge(Collector):
+    """A Gauge which periodically calls a user-provided callback to produce metrics."""
+
     name: str
     desc: str
     labels: Optional[Sequence[str]] = attr.ib(hash=False)
diff --git a/synapse/storage/background_updates.py b/synapse/storage/background_updates.py
index ca085ef800..edc97a9d61 100644
--- a/synapse/storage/background_updates.py
+++ b/synapse/storage/background_updates.py
@@ -12,6 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import logging
+from enum import IntEnum
 from types import TracebackType
 from typing import (
     TYPE_CHECKING,
@@ -136,6 +137,15 @@ class BackgroundUpdatePerformance:
             return float(self.total_item_count) / float(self.total_duration_ms)
 
 
+class UpdaterStatus(IntEnum):
+    # Use negative values for error conditions.
+    ABORTED = -1
+    DISABLED = 0
+    NOT_STARTED = 1
+    RUNNING_UPDATE = 2
+    COMPLETE = 3
+
+
 class BackgroundUpdater:
     """Background updates are updates to the database that run in the
     background. Each update processes a batch of data at once. We attempt to
@@ -158,11 +168,16 @@ class BackgroundUpdater:
 
         self._background_update_performance: Dict[str, BackgroundUpdatePerformance] = {}
         self._background_update_handlers: Dict[str, _BackgroundUpdateHandler] = {}
+        # TODO: all these bool flags make me feel icky---can we combine into a status
+        # enum?
         self._all_done = False
 
         # Whether we're currently running updates
         self._running = False
 
+        # Marker to be set if we abort and halt all background updates.
+        self._aborted = False
+
         # Whether background updates are enabled. This allows us to
         # enable/disable background updates via the admin API.
         self.enabled = True
@@ -175,6 +190,20 @@ class BackgroundUpdater:
         self.sleep_duration_ms = hs.config.background_updates.sleep_duration_ms
         self.sleep_enabled = hs.config.background_updates.sleep_enabled
 
+    def get_status(self) -> UpdaterStatus:
+        """An integer summarising the updater status. Used as a metric."""
+        if self._aborted:
+            return UpdaterStatus.ABORTED
+        # TODO: a status for "have seen at least one failure, but haven't aborted yet".
+        if not self.enabled:
+            return UpdaterStatus.DISABLED
+
+        if self._all_done:
+            return UpdaterStatus.COMPLETE
+        if self._running:
+            return UpdaterStatus.RUNNING_UPDATE
+        return UpdaterStatus.NOT_STARTED
+
     def register_update_controller_callbacks(
         self,
         on_update: ON_UPDATE_CALLBACK,
@@ -296,6 +325,7 @@ class BackgroundUpdater:
                 except Exception:
                     back_to_back_failures += 1
                     if back_to_back_failures >= 5:
+                        self._aborted = True
                         raise RuntimeError(
                             "5 back-to-back background update failures; aborting."
                         )
diff --git a/synapse/storage/database.py b/synapse/storage/database.py
index bdaa508dbe..10fa6c4802 100644
--- a/synapse/storage/database.py
+++ b/synapse/storage/database.py
@@ -54,7 +54,7 @@ from synapse.logging.context import (
     current_context,
     make_deferred_yieldable,
 )
-from synapse.metrics import register_threadpool
+from synapse.metrics import LaterGauge, register_threadpool
 from synapse.metrics.background_process_metrics import run_as_background_process
 from synapse.storage.background_updates import BackgroundUpdater
 from synapse.storage.engines import BaseDatabaseEngine, PostgresEngine, Sqlite3Engine
@@ -547,6 +547,12 @@ class DatabasePool:
         self._db_pool = make_pool(hs.get_reactor(), database_config, engine)
 
         self.updates = BackgroundUpdater(hs, self)
+        LaterGauge(
+            "synapse_background_update_status",
+            "Background update status",
+            [],
+            self.updates.get_status,
+        )
 
         self._previous_txn_total_time = 0.0
         self._current_txn_total_time = 0.0