From c1ef579b63a39d8d6fb31ddc1d3cf173eaf7e5b7 Mon Sep 17 00:00:00 2001 From: Richard van der Hoff <1389908+richvdh@users.noreply.github.com> Date: Thu, 1 Oct 2020 11:09:12 +0100 Subject: Add prometheus metrics to track federation delays (#8430) Add a pair of federation metrics to track the delays in sending PDUs to/from particular servers. --- synapse/federation/sender/transaction_manager.py | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) (limited to 'synapse/federation/sender') diff --git a/synapse/federation/sender/transaction_manager.py b/synapse/federation/sender/transaction_manager.py index c84072ab73..3e07f925e0 100644 --- a/synapse/federation/sender/transaction_manager.py +++ b/synapse/federation/sender/transaction_manager.py @@ -15,6 +15,8 @@ import logging from typing import TYPE_CHECKING, List +from prometheus_client import Gauge + from synapse.api.errors import HttpResponseException from synapse.events import EventBase from synapse.federation.persistence import TransactionActions @@ -34,6 +36,12 @@ if TYPE_CHECKING: logger = logging.getLogger(__name__) +last_pdu_age_metric = Gauge( + "synapse_federation_last_sent_pdu_age", + "The age (in seconds) of the last PDU successfully sent to the given domain", + labelnames=("server_name",), +) + class TransactionManager: """Helper class which handles building and sending transactions @@ -48,6 +56,10 @@ class TransactionManager: self._transaction_actions = TransactionActions(self._store) self._transport_layer = hs.get_federation_transport_client() + self._federation_metrics_domains = ( + hs.get_config().federation.federation_metrics_domains + ) + # HACK to get unique tx id self._next_txn_id = int(self.clock.time_msec()) @@ -119,6 +131,9 @@ class TransactionManager: # FIXME (erikj): This is a bit of a hack to make the Pdu age # keys work + # FIXME (richardv): I also believe it no longer works. We (now?) store + # "age_ts" in "unsigned" rather than at the top level. See + # https://github.com/matrix-org/synapse/issues/8429. def json_data_cb(): data = transaction.get_dict() now = int(self.clock.time_msec()) @@ -167,5 +182,12 @@ class TransactionManager: ) success = False + if success and pdus and destination in self._federation_metrics_domains: + last_pdu = pdus[-1] + last_pdu_age = self.clock.time_msec() - last_pdu.origin_server_ts + last_pdu_age_metric.labels(server_name=destination).set( + last_pdu_age / 1000 + ) + set_tag(tags.ERROR, not success) return success -- cgit 1.4.1