diff --git a/changelog.d/15740.feature b/changelog.d/15740.feature
new file mode 100644
index 0000000000..fed342ea55
--- /dev/null
+++ b/changelog.d/15740.feature
@@ -0,0 +1 @@
+Expose a metric reporting the database background update status.
diff --git a/synapse/metrics/__init__.py b/synapse/metrics/__init__.py
index 8ce5887229..39fc629937 100644
--- a/synapse/metrics/__init__.py
+++ b/synapse/metrics/__init__.py
@@ -77,6 +77,8 @@ RegistryProxy = cast(CollectorRegistry, _RegistryProxy)
@attr.s(slots=True, hash=True, auto_attribs=True)
class LaterGauge(Collector):
+ """A Gauge which periodically calls a user-provided callback to produce metrics."""
+
name: str
desc: str
labels: Optional[Sequence[str]] = attr.ib(hash=False)
diff --git a/synapse/storage/background_updates.py b/synapse/storage/background_updates.py
index ca085ef800..edc97a9d61 100644
--- a/synapse/storage/background_updates.py
+++ b/synapse/storage/background_updates.py
@@ -12,6 +12,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import logging
+from enum import IntEnum
from types import TracebackType
from typing import (
TYPE_CHECKING,
@@ -136,6 +137,15 @@ class BackgroundUpdatePerformance:
return float(self.total_item_count) / float(self.total_duration_ms)
+class UpdaterStatus(IntEnum):
+ """Summary states reported by `BackgroundUpdater.get_status()`.
+
+ Members are integers because the status is exposed as a metric value.
+ """
+
+ # Use negative values for error conditions.
+ # Reported once the updater has hit 5 back-to-back failures and given up.
+ ABORTED = -1
+ # Reported when the updater's `enabled` flag is False.
+ DISABLED = 0
+ # Reported when the updater is enabled but neither running nor finished.
+ NOT_STARTED = 1
+ # Reported while the updater is currently running updates.
+ RUNNING_UPDATE = 2
+ # Reported once the updater has marked itself as all done.
+ COMPLETE = 3
+
+
class BackgroundUpdater:
"""Background updates are updates to the database that run in the
background. Each update processes a batch of data at once. We attempt to
@@ -158,11 +168,16 @@ class BackgroundUpdater:
self._background_update_performance: Dict[str, BackgroundUpdatePerformance] = {}
self._background_update_handlers: Dict[str, _BackgroundUpdateHandler] = {}
+ # TODO: consider replacing these separate boolean flags with a single status
+ # enum.
self._all_done = False
# Whether we're currently running updates
self._running = False
+ # Marker to be set if we abort and halt all background updates.
+ self._aborted = False
+
# Whether background updates are enabled. This allows us to
# enable/disable background updates via the admin API.
self.enabled = True
@@ -175,6 +190,20 @@ class BackgroundUpdater:
self.sleep_duration_ms = hs.config.background_updates.sleep_duration_ms
self.sleep_enabled = hs.config.background_updates.sleep_enabled
+ def get_status(self) -> UpdaterStatus:
+     """Summarise the updater's current state as a single `UpdaterStatus`.
+
+     The result is used as a metric. The flags are consulted in a fixed
+     order: an abort takes precedence over being disabled, which takes
+     precedence over completion, which takes precedence over an
+     in-progress run.
+
+     Returns:
+         The status for the first flag that matches, or
+         `UpdaterStatus.NOT_STARTED` when none of them do.
+     """
+     # TODO: a status for "have seen at least one failure, but haven't aborted yet".
+     if self._aborted:
+         status = UpdaterStatus.ABORTED
+     elif not self.enabled:
+         status = UpdaterStatus.DISABLED
+     elif self._all_done:
+         status = UpdaterStatus.COMPLETE
+     elif self._running:
+         status = UpdaterStatus.RUNNING_UPDATE
+     else:
+         status = UpdaterStatus.NOT_STARTED
+     return status
+
def register_update_controller_callbacks(
self,
on_update: ON_UPDATE_CALLBACK,
@@ -296,6 +325,7 @@ class BackgroundUpdater:
except Exception:
back_to_back_failures += 1
if back_to_back_failures >= 5:
+ self._aborted = True
raise RuntimeError(
"5 back-to-back background update failures; aborting."
)
diff --git a/synapse/storage/database.py b/synapse/storage/database.py
index bdaa508dbe..10fa6c4802 100644
--- a/synapse/storage/database.py
+++ b/synapse/storage/database.py
@@ -54,7 +54,7 @@ from synapse.logging.context import (
current_context,
make_deferred_yieldable,
)
-from synapse.metrics import register_threadpool
+from synapse.metrics import LaterGauge, register_threadpool
from synapse.metrics.background_process_metrics import run_as_background_process
from synapse.storage.background_updates import BackgroundUpdater
from synapse.storage.engines import BaseDatabaseEngine, PostgresEngine, Sqlite3Engine
@@ -547,6 +547,12 @@ class DatabasePool:
self._db_pool = make_pool(hs.get_reactor(), database_config, engine)
self.updates = BackgroundUpdater(hs, self)
+ LaterGauge(
+ "synapse_background_update_status",
+ "Background update status",
+ [],
+ self.updates.get_status,
+ )
self._previous_txn_total_time = 0.0
self._current_txn_total_time = 0.0
|