summary refs log tree commit diff
diff options
context:
space:
mode:
-rw-r--r--changelog.d/8430.feature1
-rw-r--r--docs/sample_config.yaml12
-rw-r--r--synapse/config/_util.py6
-rw-r--r--synapse/config/federation.py27
-rw-r--r--synapse/config/homeserver.py1
-rw-r--r--synapse/config/tls.py1
-rw-r--r--synapse/federation/federation_server.py24
-rw-r--r--synapse/federation/sender/transaction_manager.py22
8 files changed, 88 insertions, 6 deletions
diff --git a/changelog.d/8430.feature b/changelog.d/8430.feature
new file mode 100644
index 0000000000..1f31d42bc1
--- /dev/null
+++ b/changelog.d/8430.feature
@@ -0,0 +1 @@
+Add prometheus metrics to track federation delays.
diff --git a/docs/sample_config.yaml b/docs/sample_config.yaml
index 066844b5a9..8a3206e845 100644
--- a/docs/sample_config.yaml
+++ b/docs/sample_config.yaml
@@ -629,6 +629,7 @@ acme:
 #tls_fingerprints: [{"sha256": "<base64_encoded_sha256_fingerprint>"}]
 
 
+## Federation ##
 
 # Restrict federation to the following whitelist of domains.
 # N.B. we recommend also firewalling your federation listener to limit
@@ -662,6 +663,17 @@ federation_ip_range_blacklist:
   - 'fe80::/64'
   - 'fc00::/7'
 
+# Report prometheus metrics on the age of PDUs being sent to and received from
+# the following domains. This can be used to give an idea of "delay" on inbound
+# and outbound federation, though be aware that any delay can be due to problems
+# at either end or with the intermediate network.
+#
+# By default, no domains are monitored in this way.
+#
+#federation_metrics_domains:
+#  - matrix.org
+#  - example.com
+
 
 ## Caching ##
 
diff --git a/synapse/config/_util.py b/synapse/config/_util.py
index cd31b1c3c9..c74969a977 100644
--- a/synapse/config/_util.py
+++ b/synapse/config/_util.py
@@ -12,7 +12,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-from typing import Any, List
+from typing import Any, Iterable
 
 import jsonschema
 
@@ -20,7 +20,9 @@ from synapse.config._base import ConfigError
 from synapse.types import JsonDict
 
 
-def validate_config(json_schema: JsonDict, config: Any, config_path: List[str]) -> None:
+def validate_config(
+    json_schema: JsonDict, config: Any, config_path: Iterable[str]
+) -> None:
     """Validates a config setting against a JsonSchema definition
 
     This can be used to validate a section of the config file against a schema
diff --git a/synapse/config/federation.py b/synapse/config/federation.py
index 2c77d8f85b..ffd8fca54e 100644
--- a/synapse/config/federation.py
+++ b/synapse/config/federation.py
@@ -17,7 +17,8 @@ from typing import Optional
 
 from netaddr import IPSet
 
-from ._base import Config, ConfigError
+from synapse.config._base import Config, ConfigError
+from synapse.config._util import validate_config
 
 
 class FederationConfig(Config):
@@ -52,8 +53,18 @@ class FederationConfig(Config):
                 "Invalid range(s) provided in federation_ip_range_blacklist: %s" % e
             )
 
+        federation_metrics_domains = config.get("federation_metrics_domains") or []
+        validate_config(
+            _METRICS_FOR_DOMAINS_SCHEMA,
+            federation_metrics_domains,
+            ("federation_metrics_domains",),
+        )
+        self.federation_metrics_domains = set(federation_metrics_domains)
+
     def generate_config_section(self, config_dir_path, server_name, **kwargs):
         return """\
+        ## Federation ##
+
         # Restrict federation to the following whitelist of domains.
         # N.B. we recommend also firewalling your federation listener to limit
         # inbound federation traffic as early as possible, rather than relying
@@ -85,4 +96,18 @@ class FederationConfig(Config):
           - '::1/128'
           - 'fe80::/64'
           - 'fc00::/7'
+
+        # Report prometheus metrics on the age of PDUs being sent to and received from
+        # the following domains. This can be used to give an idea of "delay" on inbound
+        # and outbound federation, though be aware that any delay can be due to problems
+        # at either end or with the intermediate network.
+        #
+        # By default, no domains are monitored in this way.
+        #
+        #federation_metrics_domains:
+        #  - matrix.org
+        #  - example.com
         """
+
+
+_METRICS_FOR_DOMAINS_SCHEMA = {"type": "array", "items": {"type": "string"}}
diff --git a/synapse/config/homeserver.py b/synapse/config/homeserver.py
index 556e291495..be65554524 100644
--- a/synapse/config/homeserver.py
+++ b/synapse/config/homeserver.py
@@ -92,5 +92,4 @@ class HomeServerConfig(RootConfig):
         TracerConfig,
         WorkerConfig,
         RedisConfig,
-        FederationConfig,
     ]
diff --git a/synapse/config/tls.py b/synapse/config/tls.py
index e368ea564d..9ddb8b546b 100644
--- a/synapse/config/tls.py
+++ b/synapse/config/tls.py
@@ -471,7 +471,6 @@ class TlsConfig(Config):
         # or by checking matrix.org/federationtester/api/report?server_name=$host
         #
         #tls_fingerprints: [{"sha256": "<base64_encoded_sha256_fingerprint>"}]
-
         """
             # Lowercase the string representation of boolean values
             % {
diff --git a/synapse/federation/federation_server.py b/synapse/federation/federation_server.py
index 2dcd081cbc..24329dd0e3 100644
--- a/synapse/federation/federation_server.py
+++ b/synapse/federation/federation_server.py
@@ -28,7 +28,7 @@ from typing import (
     Union,
 )
 
-from prometheus_client import Counter, Histogram
+from prometheus_client import Counter, Gauge, Histogram
 
 from twisted.internet import defer
 from twisted.internet.abstract import isIPAddress
@@ -88,6 +88,13 @@ pdu_process_time = Histogram(
 )
 
 
+last_pdu_age_metric = Gauge(
+    "synapse_federation_last_received_pdu_age",
+    "The age (in seconds) of the last PDU successfully received from the given domain",
+    labelnames=("server_name",),
+)
+
+
 class FederationServer(FederationBase):
     def __init__(self, hs):
         super().__init__(hs)
@@ -118,6 +125,10 @@ class FederationServer(FederationBase):
             hs, "state_ids_resp", timeout_ms=30000
         )
 
+        self._federation_metrics_domains = (
+            hs.get_config().federation.federation_metrics_domains
+        )
+
     async def on_backfill_request(
         self, origin: str, room_id: str, versions: List[str], limit: int
     ) -> Tuple[int, Dict[str, Any]]:
@@ -262,7 +273,11 @@ class FederationServer(FederationBase):
 
         pdus_by_room = {}  # type: Dict[str, List[EventBase]]
 
+        newest_pdu_ts = 0
+
         for p in transaction.pdus:  # type: ignore
+            # FIXME (richardv): I don't think this works:
+            #  https://github.com/matrix-org/synapse/issues/8429
             if "unsigned" in p:
                 unsigned = p["unsigned"]
                 if "age" in unsigned:
@@ -300,6 +315,9 @@ class FederationServer(FederationBase):
             event = event_from_pdu_json(p, room_version)
             pdus_by_room.setdefault(room_id, []).append(event)
 
+            if event.origin_server_ts > newest_pdu_ts:
+                newest_pdu_ts = event.origin_server_ts
+
         pdu_results = {}
 
         # we can process different rooms in parallel (which is useful if they
@@ -340,6 +358,10 @@ class FederationServer(FederationBase):
             process_pdus_for_room, pdus_by_room.keys(), TRANSACTION_CONCURRENCY_LIMIT
         )
 
+        if newest_pdu_ts and origin in self._federation_metrics_domains:
+            newest_pdu_age = self._clock.time_msec() - newest_pdu_ts
+            last_pdu_age_metric.labels(server_name=origin).set(newest_pdu_age / 1000)
+
         return pdu_results
 
     async def _handle_edus_in_txn(self, origin: str, transaction: Transaction):
diff --git a/synapse/federation/sender/transaction_manager.py b/synapse/federation/sender/transaction_manager.py
index c84072ab73..3e07f925e0 100644
--- a/synapse/federation/sender/transaction_manager.py
+++ b/synapse/federation/sender/transaction_manager.py
@@ -15,6 +15,8 @@
 import logging
 from typing import TYPE_CHECKING, List
 
+from prometheus_client import Gauge
+
 from synapse.api.errors import HttpResponseException
 from synapse.events import EventBase
 from synapse.federation.persistence import TransactionActions
@@ -34,6 +36,12 @@ if TYPE_CHECKING:
 
 logger = logging.getLogger(__name__)
 
+last_pdu_age_metric = Gauge(
+    "synapse_federation_last_sent_pdu_age",
+    "The age (in seconds) of the last PDU successfully sent to the given domain",
+    labelnames=("server_name",),
+)
+
 
 class TransactionManager:
     """Helper class which handles building and sending transactions
@@ -48,6 +56,10 @@ class TransactionManager:
         self._transaction_actions = TransactionActions(self._store)
         self._transport_layer = hs.get_federation_transport_client()
 
+        self._federation_metrics_domains = (
+            hs.get_config().federation.federation_metrics_domains
+        )
+
         # HACK to get unique tx id
         self._next_txn_id = int(self.clock.time_msec())
 
@@ -119,6 +131,9 @@ class TransactionManager:
 
             # FIXME (erikj): This is a bit of a hack to make the Pdu age
             # keys work
+            # FIXME (richardv): I also believe it no longer works. We (now?) store
+            #  "age_ts" in "unsigned" rather than at the top level. See
+            #  https://github.com/matrix-org/synapse/issues/8429.
             def json_data_cb():
                 data = transaction.get_dict()
                 now = int(self.clock.time_msec())
@@ -167,5 +182,12 @@ class TransactionManager:
                     )
                 success = False
 
+            if success and pdus and destination in self._federation_metrics_domains:
+                last_pdu = pdus[-1]
+                last_pdu_age = self.clock.time_msec() - last_pdu.origin_server_ts
+                last_pdu_age_metric.labels(server_name=destination).set(
+                    last_pdu_age / 1000
+                )
+
             set_tag(tags.ERROR, not success)
             return success