diff --git a/synapse/api/auth.py b/synapse/api/auth.py
index f17fda6315..b052cf532b 100644
--- a/synapse/api/auth.py
+++ b/synapse/api/auth.py
@@ -57,7 +57,7 @@ class Auth(object):
self.TOKEN_NOT_FOUND_HTTP_STATUS = 401
self.token_cache = LruCache(CACHE_SIZE_FACTOR * 10000)
- register_cache("token_cache", self.token_cache)
+ register_cache("cache", "token_cache", self.token_cache)
@defer.inlineCallbacks
def check_from_context(self, event, context, do_sig_check=True):
diff --git a/synapse/app/homeserver.py b/synapse/app/homeserver.py
index caccbaa814..a5b135193f 100755
--- a/synapse/app/homeserver.py
+++ b/synapse/app/homeserver.py
@@ -34,8 +34,7 @@ from synapse.module_api import ModuleApi
from synapse.http.additional_resource import AdditionalResource
from synapse.http.server import RootRedirect
from synapse.http.site import SynapseSite
-from synapse.metrics import register_memory_metrics
-from synapse.metrics.resource import METRICS_PREFIX, MetricsResource
+from synapse.metrics.resource import METRICS_PREFIX
from synapse.python_dependencies import CONDITIONAL_REQUIREMENTS, \
check_requirements
from synapse.replication.http import ReplicationRestResource, REPLICATION_PREFIX
@@ -230,7 +229,8 @@ class SynapseHomeServer(HomeServer):
resources[WEB_CLIENT_PREFIX] = build_resource_for_web_client(self)
if name == "metrics" and self.get_config().enable_metrics:
- resources[METRICS_PREFIX] = MetricsResource(self)
+ from prometheus_client.twisted import MetricsResource
+ resources[METRICS_PREFIX] = MetricsResource()
if name == "replication":
resources[REPLICATION_PREFIX] = ReplicationRestResource(self)
@@ -362,8 +362,6 @@ def setup(config_options):
hs.get_datastore().start_doing_background_updates()
hs.get_federation_client().start_get_pdu_cache()
- register_memory_metrics(hs)
-
reactor.callWhenRunning(start)
return hs
diff --git a/synapse/federation/federation_client.py b/synapse/federation/federation_client.py
index 6163f7c466..2761ffae07 100644
--- a/synapse/federation/federation_client.py
+++ b/synapse/federation/federation_client.py
@@ -32,20 +32,17 @@ from synapse.federation.federation_base import (
FederationBase,
event_from_pdu_json,
)
-import synapse.metrics
from synapse.util import logcontext, unwrapFirstError
from synapse.util.caches.expiringcache import ExpiringCache
from synapse.util.logcontext import make_deferred_yieldable, run_in_background
from synapse.util.logutils import log_function
from synapse.util.retryutils import NotRetryingDestination
-logger = logging.getLogger(__name__)
-
+from prometheus_client import Counter
-# synapse.federation.federation_client is a silly name
-metrics = synapse.metrics.get_metrics_for("synapse.federation.client")
+logger = logging.getLogger(__name__)
-sent_queries_counter = metrics.register_counter("sent_queries", labels=["type"])
+sent_queries_counter = Counter("synapse_federation_client_sent_queries", "", ["type"])
PDU_RETRY_TIME_MS = 1 * 60 * 1000
@@ -108,7 +105,7 @@ class FederationClient(FederationBase):
a Deferred which will eventually yield a JSON object from the
response
"""
- sent_queries_counter.inc(query_type)
+ sent_queries_counter.labels(query_type).inc()
return self.transport_layer.make_query(
destination, query_type, args, retry_on_dns_fail=retry_on_dns_fail,
@@ -127,7 +124,7 @@ class FederationClient(FederationBase):
a Deferred which will eventually yield a JSON object from the
response
"""
- sent_queries_counter.inc("client_device_keys")
+ sent_queries_counter.labels("client_device_keys").inc()
return self.transport_layer.query_client_keys(
destination, content, timeout
)
@@ -137,7 +134,7 @@ class FederationClient(FederationBase):
"""Query the device keys for a list of user ids hosted on a remote
server.
"""
- sent_queries_counter.inc("user_devices")
+ sent_queries_counter.labels("user_devices").inc()
return self.transport_layer.query_user_devices(
destination, user_id, timeout
)
@@ -154,7 +151,7 @@ class FederationClient(FederationBase):
a Deferred which will eventually yield a JSON object from the
response
"""
- sent_queries_counter.inc("client_one_time_keys")
+ sent_queries_counter.labels("client_one_time_keys").inc()
return self.transport_layer.claim_client_keys(
destination, content, timeout
)
diff --git a/synapse/federation/federation_server.py b/synapse/federation/federation_server.py
index 247ddc89d5..8211273006 100644
--- a/synapse/federation/federation_server.py
+++ b/synapse/federation/federation_server.py
@@ -27,12 +27,13 @@ from synapse.federation.federation_base import (
from synapse.federation.persistence import TransactionActions
from synapse.federation.units import Edu, Transaction
-import synapse.metrics
from synapse.types import get_domain_from_id
from synapse.util import async
from synapse.util.caches.response_cache import ResponseCache
from synapse.util.logutils import log_function
+from prometheus_client import Counter
+
from six import iteritems
# when processing incoming transactions, we try to handle multiple rooms in
@@ -41,14 +42,11 @@ TRANSACTION_CONCURRENCY_LIMIT = 10
logger = logging.getLogger(__name__)
-# synapse.federation.federation_server is a silly name
-metrics = synapse.metrics.get_metrics_for("synapse.federation.server")
-
-received_pdus_counter = metrics.register_counter("received_pdus")
+received_pdus_counter = Counter("synapse_federation_server_received_pdus", "")
-received_edus_counter = metrics.register_counter("received_edus")
+received_edus_counter = Counter("synapse_federation_server_received_edus", "")
-received_queries_counter = metrics.register_counter("received_queries", labels=["type"])
+received_queries_counter = Counter("synapse_federation_server_received_queries", "", ["type"])
class FederationServer(FederationBase):
@@ -131,7 +129,7 @@ class FederationServer(FederationBase):
logger.debug("[%s] Transaction is new", transaction.transaction_id)
- received_pdus_counter.inc_by(len(transaction.pdus))
+ received_pdus_counter.inc(len(transaction.pdus))
pdus_by_room = {}
@@ -292,7 +290,7 @@ class FederationServer(FederationBase):
@defer.inlineCallbacks
def on_query_request(self, query_type, args):
- received_queries_counter.inc(query_type)
+ received_queries_counter.labels(query_type).inc()
resp = yield self.registry.on_query(query_type, args)
defer.returnValue((200, resp))
diff --git a/synapse/federation/send_queue.py b/synapse/federation/send_queue.py
index 0f0c687b37..e6e1888f3a 100644
--- a/synapse/federation/send_queue.py
+++ b/synapse/federation/send_queue.py
@@ -33,7 +33,7 @@ from .units import Edu
from synapse.storage.presence import UserPresenceState
from synapse.util.metrics import Measure
-import synapse.metrics
+from synapse.metrics import LaterGauge
from blist import sorteddict
from collections import namedtuple
@@ -45,9 +45,6 @@ from six import itervalues, iteritems
logger = logging.getLogger(__name__)
-metrics = synapse.metrics.get_metrics_for(__name__)
-
-
class FederationRemoteSendQueue(object):
"""A drop in replacement for TransactionQueue"""
@@ -77,8 +74,7 @@ class FederationRemoteSendQueue(object):
# lambda binds to the queue rather than to the name of the queue which
# changes. ARGH.
def register(name, queue):
- metrics.register_callback(
- queue_name + "_size",
+ LaterGauge("synapse_federation_send_queue_%s_size" % (queue_name,), "",
lambda: len(queue),
)
diff --git a/synapse/federation/transaction_queue.py b/synapse/federation/transaction_queue.py
index ded2b1871a..778924a13c 100644
--- a/synapse/federation/transaction_queue.py
+++ b/synapse/federation/transaction_queue.py
@@ -26,23 +26,18 @@ from synapse.util.retryutils import NotRetryingDestination, get_retry_limiter
from synapse.util.metrics import measure_func
from synapse.handlers.presence import format_user_presence_state, get_interested_remotes
import synapse.metrics
+from synapse.metrics import LaterGauge
+from synapse.metrics import (
+ sent_edus_counter, sent_transactions_counter, events_processed_counter)
+
+from prometheus_client import Counter
import logging
logger = logging.getLogger(__name__)
-metrics = synapse.metrics.get_metrics_for(__name__)
-
-client_metrics = synapse.metrics.get_metrics_for("synapse.federation.client")
-sent_pdus_destination_dist = client_metrics.register_distribution(
- "sent_pdu_destinations"
-)
-sent_edus_counter = client_metrics.register_counter("sent_edus")
-
-sent_transactions_counter = client_metrics.register_counter("sent_transactions")
-
-events_processed_counter = client_metrics.register_counter("events_processed")
+sent_pdus_destination_dist = Counter("synapse_federation_client_sent_pdu_destinations", "")
class TransactionQueue(object):
@@ -69,8 +64,7 @@ class TransactionQueue(object):
# done
self.pending_transactions = {}
- metrics.register_callback(
- "pending_destinations",
+ LaterGauge("pending_destinations", "", [],
lambda: len(self.pending_transactions),
)
@@ -94,12 +88,12 @@ class TransactionQueue(object):
# Map of destination -> (edu_type, key) -> Edu
self.pending_edus_keyed_by_dest = edus_keyed = {}
- metrics.register_callback(
- "pending_pdus",
+ LaterGauge(
+ "pending_pdus", "", [],
lambda: sum(map(len, pdus.values())),
)
- metrics.register_callback(
- "pending_edus",
+ LaterGauge(
+ "pending_edus", "", [],
lambda: (
sum(map(len, edus.values()))
+ sum(map(len, presence.values()))
@@ -241,18 +235,15 @@ class TransactionQueue(object):
now = self.clock.time_msec()
ts = yield self.store.get_received_ts(events[-1].event_id)
- synapse.metrics.event_processing_lag.set(
- now - ts, "federation_sender",
- )
- synapse.metrics.event_processing_last_ts.set(
- ts, "federation_sender",
- )
+ synapse.metrics.event_processing_lag.labels(
+ "federation_sender").set(now - ts)
+ synapse.metrics.event_processing_last_ts.labels(
+ "federation_sender").set(ts)
- events_processed_counter.inc_by(len(events))
+ events_processed_counter.inc(len(events))
- synapse.metrics.event_processing_positions.set(
- next_token, "federation_sender",
- )
+ synapse.metrics.event_processing_positions.labels(
+ "federation_sender").set(next_token)
finally:
self._is_processing = False
@@ -275,7 +266,7 @@ class TransactionQueue(object):
if not destinations:
return
- sent_pdus_destination_dist.inc_by(len(destinations))
+ sent_pdus_destination_dist.inc(len(destinations))
for destination in destinations:
self.pending_pdus_by_dest.setdefault(destination, []).append(
diff --git a/synapse/handlers/appservice.py b/synapse/handlers/appservice.py
index b596f098fd..a7345331af 100644
--- a/synapse/handlers/appservice.py
+++ b/synapse/handlers/appservice.py
@@ -21,14 +21,13 @@ from synapse.util.metrics import Measure
from synapse.util.logcontext import (
make_deferred_yieldable, run_in_background,
)
+from prometheus_client import Counter
import logging
logger = logging.getLogger(__name__)
-metrics = synapse.metrics.get_metrics_for(__name__)
-
-events_processed_counter = metrics.register_counter("events_processed")
+events_processed_counter = Counter("synapse_handlers_appservice_events_processed", "")
def log_failure(failure):
@@ -128,18 +127,12 @@ class ApplicationServicesHandler(object):
now = self.clock.time_msec()
ts = yield self.store.get_received_ts(events[-1].event_id)
- synapse.metrics.event_processing_positions.set(
- upper_bound, "appservice_sender",
- )
+ synapse.metrics.event_processing_positions.labels("appservice_sender").set(upper_bound)
- events_processed_counter.inc_by(len(events))
+ events_processed_counter.inc(len(events))
- synapse.metrics.event_processing_lag.set(
- now - ts, "appservice_sender",
- )
- synapse.metrics.event_processing_last_ts.set(
- ts, "appservice_sender",
- )
+ synapse.metrics.event_processing_lag.labels("appservice_sender").set(now - ts)
+ synapse.metrics.event_processing_last_ts.labels("appservice_sender").set(ts)
finally:
self.is_processing = False
diff --git a/synapse/handlers/presence.py b/synapse/handlers/presence.py
index 500a131874..aca822c46a 100644
--- a/synapse/handlers/presence.py
+++ b/synapse/handlers/presence.py
@@ -36,27 +36,27 @@ from synapse.util.logutils import log_function
from synapse.util.metrics import Measure
from synapse.util.wheel_timer import WheelTimer
from synapse.types import UserID, get_domain_from_id
-import synapse.metrics
+from synapse.metrics import LaterGauge
import logging
+from prometheus_client import Counter
+
logger = logging.getLogger(__name__)
-metrics = synapse.metrics.get_metrics_for(__name__)
-notified_presence_counter = metrics.register_counter("notified_presence")
-federation_presence_out_counter = metrics.register_counter("federation_presence_out")
-presence_updates_counter = metrics.register_counter("presence_updates")
-timers_fired_counter = metrics.register_counter("timers_fired")
-federation_presence_counter = metrics.register_counter("federation_presence")
-bump_active_time_counter = metrics.register_counter("bump_active_time")
+notified_presence_counter = Counter("synapse_handler_presence_notified_presence", "")
+federation_presence_out_counter = Counter("synapse_handler_presence_federation_presence_out", "")
+presence_updates_counter = Counter("synapse_handler_presence_presence_updates", "")
+timers_fired_counter = Counter("synapse_handler_presence_timers_fired", "")
+federation_presence_counter = Counter("synapse_handler_presence_federation_presence", "")
+bump_active_time_counter = Counter("synapse_handler_presence_bump_active_time", "")
-get_updates_counter = metrics.register_counter("get_updates", labels=["type"])
+get_updates_counter = Counter("synapse_handler_presence_get_updates", "", ["type"])
-notify_reason_counter = metrics.register_counter("notify_reason", labels=["reason"])
-state_transition_counter = metrics.register_counter(
- "state_transition", labels=["from", "to"]
+notify_reason_counter = Counter("synapse_handler_presence_notify_reason", "", ["reason"])
+state_transition_counter = Counter("synapse_handler_presence_state_transition", "", ["from", "to"]
)
@@ -141,9 +141,9 @@ class PresenceHandler(object):
for state in active_presence
}
- metrics.register_callback(
- "user_to_current_state_size", lambda: len(self.user_to_current_state)
- )
+ LaterGauge(
+ "user_to_current_state_size", "", [], lambda: len(self.user_to_current_state)
+ ).register()
now = self.clock.time_msec()
for state in active_presence:
@@ -212,7 +212,7 @@ class PresenceHandler(object):
60 * 1000,
)
- metrics.register_callback("wheel_timer_size", lambda: len(self.wheel_timer))
+ LaterGauge("wheel_timer_size", "", [], lambda: len(self.wheel_timer)).register()
@defer.inlineCallbacks
def _on_shutdown(self):
@@ -315,10 +315,10 @@ class PresenceHandler(object):
# TODO: We should probably ensure there are no races hereafter
- presence_updates_counter.inc_by(len(new_states))
+ presence_updates_counter.inc(len(new_states))
if to_notify:
- notified_presence_counter.inc_by(len(to_notify))
+ notified_presence_counter.inc(len(to_notify))
yield self._persist_and_notify(to_notify.values())
self.unpersisted_users_changes |= set(s.user_id for s in new_states)
@@ -329,7 +329,7 @@ class PresenceHandler(object):
if user_id not in to_notify
}
if to_federation_ping:
- federation_presence_out_counter.inc_by(len(to_federation_ping))
+ federation_presence_out_counter.inc(len(to_federation_ping))
self._push_to_remotes(to_federation_ping.values())
@@ -367,7 +367,7 @@ class PresenceHandler(object):
for user_id in users_to_check
]
- timers_fired_counter.inc_by(len(states))
+ timers_fired_counter.inc(len(states))
changes = handle_timeouts(
states,
@@ -656,7 +656,7 @@ class PresenceHandler(object):
updates.append(prev_state.copy_and_replace(**new_fields))
if updates:
- federation_presence_counter.inc_by(len(updates))
+ federation_presence_counter.inc(len(updates))
yield self._update_states(updates)
@defer.inlineCallbacks
@@ -931,28 +931,28 @@ def should_notify(old_state, new_state):
return False
if old_state.status_msg != new_state.status_msg:
- notify_reason_counter.inc("status_msg_change")
+ notify_reason_counter.labels("status_msg_change").inc()
return True
if old_state.state != new_state.state:
- notify_reason_counter.inc("state_change")
- state_transition_counter.inc(old_state.state, new_state.state)
+ notify_reason_counter.labels("state_change").inc()
+ state_transition_counter.labels(old_state.state, new_state.state).inc()
return True
if old_state.state == PresenceState.ONLINE:
if new_state.currently_active != old_state.currently_active:
- notify_reason_counter.inc("current_active_change")
+ notify_reason_counter.labels("current_active_change").inc()
return True
if new_state.last_active_ts - old_state.last_active_ts > LAST_ACTIVE_GRANULARITY:
# Only notify about last active bumps if we're not currently acive
if not new_state.currently_active:
- notify_reason_counter.inc("last_active_change_online")
+ notify_reason_counter.labels("last_active_change_online").inc()
return True
elif new_state.last_active_ts - old_state.last_active_ts > LAST_ACTIVE_GRANULARITY:
# Always notify for a transition where last active gets bumped.
- notify_reason_counter.inc("last_active_change_not_online")
+ notify_reason_counter.labels("last_active_change_not_online").inc()
return True
return False
@@ -1026,14 +1026,14 @@ class PresenceEventSource(object):
if changed is not None and len(changed) < 500:
# For small deltas, its quicker to get all changes and then
# work out if we share a room or they're in our presence list
- get_updates_counter.inc("stream")
+ get_updates_counter.labels("stream").inc()
for other_user_id in changed:
if other_user_id in users_interested_in:
user_ids_changed.add(other_user_id)
else:
# Too many possible updates. Find all users we can see and check
# if any of them have changed.
- get_updates_counter.inc("full")
+ get_updates_counter.labels("full").inc()
if from_key:
user_ids_changed = stream_change_cache.get_entities_changed(
diff --git a/synapse/http/client.py b/synapse/http/client.py
index 70a19d9b74..61a1d2e2b3 100644
--- a/synapse/http/client.py
+++ b/synapse/http/client.py
@@ -23,7 +23,6 @@ from synapse.http import cancelled_to_request_timed_out_error
from synapse.util.async import add_timeout_to_deferred
from synapse.util.caches import CACHE_SIZE_FACTOR
from synapse.util.logcontext import make_deferred_yieldable
-import synapse.metrics
from synapse.http.endpoint import SpiderEndpoint
from canonicaljson import encode_canonical_json
@@ -42,6 +41,7 @@ from twisted.web._newclient import ResponseDone
from six import StringIO
+from prometheus_client import Counter
import simplejson as json
import logging
import urllib
@@ -49,16 +49,8 @@ import urllib
logger = logging.getLogger(__name__)
-metrics = synapse.metrics.get_metrics_for(__name__)
-
-outgoing_requests_counter = metrics.register_counter(
- "requests",
- labels=["method"],
-)
-incoming_responses_counter = metrics.register_counter(
- "responses",
- labels=["method", "code"],
-)
+outgoing_requests_counter = Counter("synapse_http_client_requests", "", ["method"])
+incoming_responses_counter = Counter("synapse_http_client_responses", "", ["method", "code"])
class SimpleHttpClient(object):
@@ -95,7 +87,7 @@ class SimpleHttpClient(object):
def request(self, method, uri, *args, **kwargs):
# A small wrapper around self.agent.request() so we can easily attach
# counters to it
- outgoing_requests_counter.inc(method)
+ outgoing_requests_counter.labels(method).inc()
logger.info("Sending request %s %s", method, uri)
@@ -109,14 +101,14 @@ class SimpleHttpClient(object):
)
response = yield make_deferred_yieldable(request_deferred)
- incoming_responses_counter.inc(method, response.code)
+ incoming_responses_counter.labels(method, response.code).inc()
logger.info(
"Received response to %s %s: %s",
method, uri, response.code
)
defer.returnValue(response)
except Exception as e:
- incoming_responses_counter.inc(method, "ERR")
+ incoming_responses_counter.labels(method, "ERR").inc()
logger.info(
"Error sending request to %s %s: %s %s",
method, uri, type(e).__name__, e.message
diff --git a/synapse/http/matrixfederationclient.py b/synapse/http/matrixfederationclient.py
index 4b2b85464d..259d3884e2 100644
--- a/synapse/http/matrixfederationclient.py
+++ b/synapse/http/matrixfederationclient.py
@@ -43,19 +43,13 @@ import sys
import urllib
from six.moves.urllib import parse as urlparse
+from prometheus_client import Counter
+
logger = logging.getLogger(__name__)
outbound_logger = logging.getLogger("synapse.http.outbound")
-metrics = synapse.metrics.get_metrics_for(__name__)
-
-outgoing_requests_counter = metrics.register_counter(
- "requests",
- labels=["method"],
-)
-incoming_responses_counter = metrics.register_counter(
- "responses",
- labels=["method", "code"],
-)
+outgoing_requests_counter = Counter("synapse_http_matrixfederationclient_requests", "", ["method"])
+incoming_responses_counter = Counter("synapse_http_matrixfederationclient_responses", "", ["method", "code"])
MAX_LONG_RETRIES = 10
diff --git a/synapse/http/request_metrics.py b/synapse/http/request_metrics.py
index 5917c83958..0984870e7e 100644
--- a/synapse/http/request_metrics.py
+++ b/synapse/http/request_metrics.py
@@ -16,114 +16,56 @@
import logging
-import synapse.metrics
+from prometheus_client.core import Counter, Histogram
+from synapse.metrics import LaterGauge
+
from synapse.util.logcontext import LoggingContext
logger = logging.getLogger(__name__)
-metrics = synapse.metrics.get_metrics_for("synapse.http.server")
# total number of responses served, split by method/servlet/tag
-response_count = metrics.register_counter(
- "response_count",
- labels=["method", "servlet", "tag"],
- alternative_names=(
- # the following are all deprecated aliases for the same metric
- metrics.name_prefix + x for x in (
- "_requests",
- "_response_time:count",
- "_response_ru_utime:count",
- "_response_ru_stime:count",
- "_response_db_txn_count:count",
- "_response_db_txn_duration:count",
- )
- )
-)
+response_count = Counter("synapse_http_server_response_count", "", ["method", "servlet", "tag"])
-requests_counter = metrics.register_counter(
- "requests_received",
- labels=["method", "servlet", ],
-)
+requests_counter = Counter("synapse_http_server_requests_received", "", ["method", "servlet"])
-outgoing_responses_counter = metrics.register_counter(
- "responses",
- labels=["method", "code"],
-)
+outgoing_responses_counter = Counter("synapse_http_server_responses", "", ["method", "code"])
-response_timer = metrics.register_counter(
- "response_time_seconds",
- labels=["method", "servlet", "tag"],
- alternative_names=(
- metrics.name_prefix + "_response_time:total",
- ),
-)
+response_timer = Histogram("synapse_http_server_response_time_seconds", "", ["method", "servlet", "tag"])
-response_ru_utime = metrics.register_counter(
- "response_ru_utime_seconds", labels=["method", "servlet", "tag"],
- alternative_names=(
- metrics.name_prefix + "_response_ru_utime:total",
- ),
-)
+response_ru_utime = Counter("synapse_http_server_response_ru_utime_seconds", "", ["method", "servlet", "tag"])
-response_ru_stime = metrics.register_counter(
- "response_ru_stime_seconds", labels=["method", "servlet", "tag"],
- alternative_names=(
- metrics.name_prefix + "_response_ru_stime:total",
- ),
-)
+response_ru_stime = Counter("synapse_http_server_response_ru_stime_seconds", "", ["method", "servlet", "tag"])
-response_db_txn_count = metrics.register_counter(
- "response_db_txn_count", labels=["method", "servlet", "tag"],
- alternative_names=(
- metrics.name_prefix + "_response_db_txn_count:total",
- ),
-)
+response_db_txn_count = Counter("synapse_http_server_response_db_txn_count", "", ["method", "servlet", "tag"])
# seconds spent waiting for db txns, excluding scheduling time, when processing
# this request
-response_db_txn_duration = metrics.register_counter(
- "response_db_txn_duration_seconds", labels=["method", "servlet", "tag"],
- alternative_names=(
- metrics.name_prefix + "_response_db_txn_duration:total",
- ),
-)
+response_db_txn_duration = Counter("synapse_http_server_response_db_txn_duration_seconds", "", ["method", "servlet", "tag"])
# seconds spent waiting for a db connection, when processing this request
-response_db_sched_duration = metrics.register_counter(
- "response_db_sched_duration_seconds", labels=["method", "servlet", "tag"]
+response_db_sched_duration = Counter("synapse_http_request_response_db_sched_duration_seconds", "", ["method", "servlet", "tag"]
)
# size in bytes of the response written
-response_size = metrics.register_counter(
- "response_size", labels=["method", "servlet", "tag"]
+response_size = Counter("synapse_http_request_response_size", "", ["method", "servlet", "tag"]
)
# In flight metrics are incremented while the requests are in flight, rather
# than when the response was written.
-in_flight_requests_ru_utime = metrics.register_counter(
- "in_flight_requests_ru_utime_seconds", labels=["method", "servlet"],
-)
+in_flight_requests_ru_utime = Counter("synapse_http_request_in_flight_requests_ru_utime_seconds", "", ["method", "servlet"])
-in_flight_requests_ru_stime = metrics.register_counter(
- "in_flight_requests_ru_stime_seconds", labels=["method", "servlet"],
-)
+in_flight_requests_ru_stime = Counter("synapse_http_request_in_flight_requests_ru_stime_seconds", "", ["method", "servlet"])
-in_flight_requests_db_txn_count = metrics.register_counter(
- "in_flight_requests_db_txn_count", labels=["method", "servlet"],
-)
+in_flight_requests_db_txn_count = Counter("synapse_http_request_in_flight_requests_db_txn_count", "", ["method", "servlet"])
# seconds spent waiting for db txns, excluding scheduling time, when processing
# this request
-in_flight_requests_db_txn_duration = metrics.register_counter(
- "in_flight_requests_db_txn_duration_seconds", labels=["method", "servlet"],
-)
+in_flight_requests_db_txn_duration = Counter("synapse_http_request_in_flight_requests_db_txn_duration_seconds", "", ["method", "servlet"])
# seconds spent waiting for a db connection, when processing this request
-in_flight_requests_db_sched_duration = metrics.register_counter(
- "in_flight_requests_db_sched_duration_seconds", labels=["method", "servlet"]
-)
-
+in_flight_requests_db_sched_duration = Counter("synapse_http_request_in_flight_requests_db_sched_duration_seconds", "", ["method", "servlet"])
# The set of all in flight requests, set[RequestMetrics]
_in_flight_requests = set()
@@ -157,11 +99,10 @@ def _get_in_flight_counts():
return counts
-
-metrics.register_callback(
- "in_flight_requests_count",
- _get_in_flight_counts,
- labels=["method", "servlet"]
+LaterGauge(
+ "synapse_http_request_metrics_in_flight_requests_count", "",
+ ["method", "servlet"],
+ _get_in_flight_counts
)
@@ -192,34 +133,22 @@ class RequestMetrics(object):
)
return
- outgoing_responses_counter.inc(request.method, str(request.code))
+ outgoing_responses_counter.labels(request.method, str(request.code)).inc()
- response_count.inc(request.method, self.name, tag)
+ response_count.labels(request.method, self.name, tag).inc()
- response_timer.inc_by(
- time_msec - self.start, request.method,
- self.name, tag
- )
+ response_timer.labels(request.method, self.name, tag).observe(time_msec - self.start)
ru_utime, ru_stime = context.get_resource_usage()
- response_ru_utime.inc_by(
- ru_utime, request.method, self.name, tag
- )
- response_ru_stime.inc_by(
- ru_stime, request.method, self.name, tag
- )
- response_db_txn_count.inc_by(
- context.db_txn_count, request.method, self.name, tag
- )
- response_db_txn_duration.inc_by(
- context.db_txn_duration_ms / 1000., request.method, self.name, tag
- )
- response_db_sched_duration.inc_by(
- context.db_sched_duration_ms / 1000., request.method, self.name, tag
- )
+ response_ru_utime.labels(request.method, self.name, tag).inc(ru_utime)
+ response_ru_stime.labels(request.method, self.name, tag).inc(ru_stime)
+ response_db_txn_count.labels(request.method, self.name, tag).inc(context.db_txn_count)
+ response_db_txn_duration.labels(request.method, self.name, tag).inc(context.db_txn_duration_ms / 1000.)
+ response_db_sched_duration.labels(request.method, self.name, tag).inc(
+ context.db_sched_duration_ms / 1000.)
- response_size.inc_by(request.sentLength, request.method, self.name, tag)
+ response_size.labels(request.method, self.name, tag).inc(request.sentLength)
# We always call this at the end to ensure that we update the metrics
# regardless of whether a call to /metrics while the request was in
@@ -229,28 +158,16 @@ class RequestMetrics(object):
def update_metrics(self):
"""Updates the in flight metrics with values from this request.
"""
-
diff = self._request_stats.update(self.start_context)
- in_flight_requests_ru_utime.inc_by(
- diff.ru_utime, self.method, self.name,
- )
+ in_flight_requests_ru_utime.labels(self.method, self.name).inc(diff.ru_utime)
+ in_flight_requests_ru_stime.labels(self.method, self.name).inc(diff.ru_stime)
- in_flight_requests_ru_stime.inc_by(
- diff.ru_stime, self.method, self.name,
- )
+ in_flight_requests_db_txn_count.labels(self.method, self.name).inc(diff.db_txn_count)
- in_flight_requests_db_txn_count.inc_by(
- diff.db_txn_count, self.method, self.name,
- )
+ in_flight_requests_db_txn_duration.labels(self.method, self.name).inc(diff.db_txn_duration_ms / 1000.)
- in_flight_requests_db_txn_duration.inc_by(
- diff.db_txn_duration_ms / 1000., self.method, self.name,
- )
-
- in_flight_requests_db_sched_duration.inc_by(
- diff.db_sched_duration_ms / 1000., self.method, self.name,
- )
+ in_flight_requests_db_sched_duration.labels(self.method, self.name).inc(diff.db_sched_duration_ms / 1000.)
class _RequestStats(object):
diff --git a/synapse/http/server.py b/synapse/http/server.py
index faf700851a..bc09b8b2be 100644
--- a/synapse/http/server.py
+++ b/synapse/http/server.py
@@ -210,8 +210,8 @@ def wrap_request_handler_with_logging(h):
# dispatching to the handler, so that the handler
# can update the servlet name in the request
# metrics
- requests_counter.inc(request.method,
- request.request_metrics.name)
+ requests_counter.labels(request.method,
+ request.request_metrics.name).inc()
yield d
return wrapped_request_handler
diff --git a/synapse/metrics/__init__.py b/synapse/metrics/__init__.py
index e3b831db67..ab0b921497 100644
--- a/synapse/metrics/__init__.py
+++ b/synapse/metrics/__init__.py
@@ -18,14 +18,13 @@ import functools
import time
import gc
import platform
+import attr
-from twisted.internet import reactor
+from prometheus_client import Gauge, Histogram, Counter
+from prometheus_client.core import (
+ GaugeMetricFamily, CounterMetricFamily, REGISTRY)
-from .metric import (
- CounterMetric, CallbackMetric, DistributionMetric, CacheMetric,
- MemoryUsageMetric, GaugeMetric,
-)
-from .process_collector import register_process_collector
+from twisted.internet import reactor
logger = logging.getLogger(__name__)
@@ -34,149 +33,98 @@ logger = logging.getLogger(__name__)
running_on_pypy = platform.python_implementation() == 'PyPy'
all_metrics = []
all_collectors = []
+all_gauges = {}
+@attr.s(hash=True)
+class LaterGauge(object):
-class Metrics(object):
- """ A single Metrics object gives a (mutable) slice view of the all_metrics
- dict, allowing callers to easily register new metrics that are namespaced
- nicely."""
-
- def __init__(self, name):
- self.name_prefix = name
-
- def make_subspace(self, name):
- return Metrics("%s_%s" % (self.name_prefix, name))
-
- def register_collector(self, func):
- all_collectors.append(func)
-
- def _register(self, metric_class, name, *args, **kwargs):
- full_name = "%s_%s" % (self.name_prefix, name)
-
- metric = metric_class(full_name, *args, **kwargs)
-
- all_metrics.append(metric)
- return metric
-
- def register_counter(self, *args, **kwargs):
- """
- Returns:
- CounterMetric
- """
- return self._register(CounterMetric, *args, **kwargs)
-
- def register_gauge(self, *args, **kwargs):
- """
- Returns:
- GaugeMetric
- """
- return self._register(GaugeMetric, *args, **kwargs)
+ name = attr.ib()
+ desc = attr.ib()
+ labels = attr.ib(hash=False)
+ caller = attr.ib()
- def register_callback(self, *args, **kwargs):
- """
- Returns:
- CallbackMetric
- """
- return self._register(CallbackMetric, *args, **kwargs)
+ def collect(self):
- def register_distribution(self, *args, **kwargs):
- """
- Returns:
- DistributionMetric
- """
- return self._register(DistributionMetric, *args, **kwargs)
+ g = GaugeMetricFamily(self.name, self.desc, self.labels)
- def register_cache(self, *args, **kwargs):
- """
- Returns:
- CacheMetric
- """
- return self._register(CacheMetric, *args, **kwargs)
+ try:
+ calls = self.caller()
+ except Exception as e:
+ print(e)
+ logger.err()
+ yield g
+ if isinstance(calls, dict):
+ for k, v in calls.items():
+ g.add_metric(k, v)
+ else:
+ g.add_metric([], calls)
-def register_memory_metrics(hs):
- try:
- import psutil
- process = psutil.Process()
- process.memory_info().rss
- except (ImportError, AttributeError):
- logger.warn(
- "psutil is not installed or incorrect version."
- " Disabling memory metrics."
- )
- return
- metric = MemoryUsageMetric(hs, psutil)
- all_metrics.append(metric)
+ yield g
+ def __attrs_post_init__(self):
+ self._register()
-def get_metrics_for(pkg_name):
- """ Returns a Metrics instance for conveniently creating metrics
- namespaced with the given name prefix. """
+ def _register(self):
+ if self.name in all_gauges.keys():
+ logger.warning("%s already registered, reregistering" % (self.name,))
+ REGISTRY.unregister(all_gauges.pop(self.name))
- # Convert a "package.name" to "package_name" because Prometheus doesn't
- # let us use . in metric names
- return Metrics(pkg_name.replace(".", "_"))
+ REGISTRY.register(self)
+ all_gauges[self.name] = self
-def render_all():
- strs = []
+#
+# Python GC metrics
+#
- for collector in all_collectors:
- collector()
+gc_unreachable = Gauge("python_gc_unreachable_total", "Unreachable GC objects", ["gen"])
+gc_time = Histogram("python_gc_time", "Time taken to GC (ms)", ["gen"], buckets=[1, 2, 5, 10, 25, 50, 100, 250, 500, 1000])
- for metric in all_metrics:
- try:
- strs += metric.render()
- except Exception:
- strs += ["# FAILED to render"]
- logger.exception("Failed to render metric")
+class GCCounts(object):
+ def collect(self):
+ gc_counts = gc.get_count()
- strs.append("") # to generate a final CRLF
+ cm = GaugeMetricFamily("python_gc_counts", "GC cycle counts", labels=["gen"])
+ for n, m in enumerate(gc.get_count()):
+ cm.add_metric([str(n)], m)
- return "\n".join(strs)
+ yield cm
+REGISTRY.register(GCCounts())
-register_process_collector(get_metrics_for("process"))
+#
+# Twisted reactor metrics
+#
+tick_time = Histogram("python_twisted_reactor_tick_time", "Tick time of the Twisted reactor (ms)", buckets=[1, 2, 5, 10, 50, 100, 250, 500, 1000, 2000])
+pending_calls_metric = Histogram("python_twisted_reactor_pending_calls", "Pending calls", buckets=[1, 2, 5, 10, 25, 50, 100, 250, 500, 1000])
-python_metrics = get_metrics_for("python")
+#
+# Federation Metrics
+#
-gc_time = python_metrics.register_distribution("gc_time", labels=["gen"])
-gc_unreachable = python_metrics.register_counter("gc_unreachable_total", labels=["gen"])
-python_metrics.register_callback(
- "gc_counts", lambda: {(i,): v for i, v in enumerate(gc.get_count())}, labels=["gen"]
-)
+sent_edus_counter = Counter("synapse_federation_client_sent_edus", "")
-reactor_metrics = get_metrics_for("python.twisted.reactor")
-tick_time = reactor_metrics.register_distribution("tick_time")
-pending_calls_metric = reactor_metrics.register_distribution("pending_calls")
+sent_transactions_counter = Counter("synapse_federation_client_sent_transactions", "")
-synapse_metrics = get_metrics_for("synapse")
+events_processed_counter = Counter("synapse_federation_client_events_processed", "")
# Used to track where various components have processed in the event stream,
# e.g. federation sending, appservice sending, etc.
-event_processing_positions = synapse_metrics.register_gauge(
- "event_processing_positions", labels=["name"],
-)
+event_processing_positions = Gauge("synapse_event_processing_positions", "", ["name"])
# Used to track the current max events stream position
-event_persisted_position = synapse_metrics.register_gauge(
- "event_persisted_position",
-)
+event_persisted_position = Gauge("synapse_event_persisted_position", "")
# Used to track the received_ts of the last event processed by various
# components
-event_processing_last_ts = synapse_metrics.register_gauge(
- "event_processing_last_ts", labels=["name"],
-)
+event_processing_last_ts = Gauge("synapse_event_processing_last_ts", "", ["name"])
# Used to track the lag processing events. This is the time difference
# between the last processed event's received_ts and the time it was
# finished being processed.
-event_processing_lag = synapse_metrics.register_gauge(
- "event_processing_lag", labels=["name"],
-)
-
+event_processing_lag = Gauge("synapse_event_processing_lag", "", ["name"])
def runUntilCurrentTimer(func):
@@ -206,8 +154,8 @@ def runUntilCurrentTimer(func):
# since about 25% of time is actually spent running things triggered by
# I/O events, but that is harder to capture without rewriting half the
# reactor.
- tick_time.inc_by(end - start)
- pending_calls_metric.inc_by(num_pending)
+ tick_time.observe(end - start)
+ pending_calls_metric.observe(num_pending)
if running_on_pypy:
return ret
@@ -224,8 +172,8 @@ def runUntilCurrentTimer(func):
unreachable = gc.collect(i)
end = time.time() * 1000
- gc_time.inc_by(end - start, i)
- gc_unreachable.inc_by(unreachable, i)
+ gc_time.labels(i).observe(end - start)
+ gc_unreachable.labels(i).set(unreachable)
return ret
diff --git a/synapse/metrics/metric.py b/synapse/metrics/metric.py
deleted file mode 100644
index f421e7a93f..0000000000
--- a/synapse/metrics/metric.py
+++ /dev/null
@@ -1,328 +0,0 @@
-# -*- coding: utf-8 -*-
-# Copyright 2015, 2016 OpenMarket Ltd
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-
-from itertools import chain
-import logging
-import re
-
-logger = logging.getLogger(__name__)
-
-
-def flatten(items):
- """Flatten a list of lists
-
- Args:
- items: iterable[iterable[X]]
-
- Returns:
- list[X]: flattened list
- """
- return list(chain.from_iterable(items))
-
-
-class BaseMetric(object):
- """Base class for metrics which report a single value per label set
- """
-
- def __init__(self, name, labels=[], alternative_names=[]):
- """
- Args:
- name (str): principal name for this metric
- labels (list(str)): names of the labels which will be reported
- for this metric
- alternative_names (iterable(str)): list of alternative names for
- this metric. This can be useful to provide a migration path
- when renaming metrics.
- """
- self._names = [name] + list(alternative_names)
- self.labels = labels # OK not to clone as we never write it
-
- def dimension(self):
- return len(self.labels)
-
- def is_scalar(self):
- return not len(self.labels)
-
- def _render_labelvalue(self, value):
- return '"%s"' % (_escape_label_value(value),)
-
- def _render_key(self, values):
- if self.is_scalar():
- return ""
- return "{%s}" % (
- ",".join(["%s=%s" % (k, self._render_labelvalue(v))
- for k, v in zip(self.labels, values)])
- )
-
- def _render_for_labels(self, label_values, value):
- """Render this metric for a single set of labels
-
- Args:
- label_values (list[object]): values for each of the labels,
- (which get stringified).
- value: value of the metric at with these labels
-
- Returns:
- iterable[str]: rendered metric
- """
- rendered_labels = self._render_key(label_values)
- return (
- "%s%s %.12g" % (name, rendered_labels, value)
- for name in self._names
- )
-
- def render(self):
- """Render this metric
-
- Each metric is rendered as:
-
- name{label1="val1",label2="val2"} value
-
- https://prometheus.io/docs/instrumenting/exposition_formats/#text-format-details
-
- Returns:
- iterable[str]: rendered metrics
- """
- raise NotImplementedError()
-
-
-class CounterMetric(BaseMetric):
- """The simplest kind of metric; one that stores a monotonically-increasing
- value that counts events or running totals.
-
- Example use cases for Counters:
- - Number of requests processed
- - Number of items that were inserted into a queue
- - Total amount of data that a system has processed
- Counters can only go up (and be reset when the process restarts).
- """
-
- def __init__(self, *args, **kwargs):
- super(CounterMetric, self).__init__(*args, **kwargs)
-
- # dict[list[str]]: value for each set of label values. the keys are the
- # label values, in the same order as the labels in self.labels.
- #
- # (if the metric is a scalar, the (single) key is the empty tuple).
- self.counts = {}
-
- # Scalar metrics are never empty
- if self.is_scalar():
- self.counts[()] = 0.
-
- def inc_by(self, incr, *values):
- if len(values) != self.dimension():
- raise ValueError(
- "Expected as many values to inc() as labels (%d)" % (self.dimension())
- )
-
- # TODO: should assert that the tag values are all strings
-
- if values not in self.counts:
- self.counts[values] = incr
- else:
- self.counts[values] += incr
-
- def inc(self, *values):
- self.inc_by(1, *values)
-
- def render(self):
- return flatten(
- self._render_for_labels(k, self.counts[k])
- for k in sorted(self.counts.keys())
- )
-
-
-class GaugeMetric(BaseMetric):
- """A metric that can go up or down
- """
-
- def __init__(self, *args, **kwargs):
- super(GaugeMetric, self).__init__(*args, **kwargs)
-
- # dict[list[str]]: value for each set of label values. the keys are the
- # label values, in the same order as the labels in self.labels.
- #
- # (if the metric is a scalar, the (single) key is the empty tuple).
- self.guages = {}
-
- def set(self, v, *values):
- if len(values) != self.dimension():
- raise ValueError(
- "Expected as many values to inc() as labels (%d)" % (self.dimension())
- )
-
- # TODO: should assert that the tag values are all strings
-
- self.guages[values] = v
-
- def render(self):
- return flatten(
- self._render_for_labels(k, self.guages[k])
- for k in sorted(self.guages.keys())
- )
-
-
-class CallbackMetric(BaseMetric):
- """A metric that returns the numeric value returned by a callback whenever
- it is rendered. Typically this is used to implement gauges that yield the
- size or other state of some in-memory object by actively querying it."""
-
- def __init__(self, name, callback, labels=[]):
- super(CallbackMetric, self).__init__(name, labels=labels)
-
- self.callback = callback
-
- def render(self):
- try:
- value = self.callback()
- except Exception:
- logger.exception("Failed to render %s", self.name)
- return ["# FAILED to render " + self.name]
-
- if self.is_scalar():
- return list(self._render_for_labels([], value))
-
- return flatten(
- self._render_for_labels(k, value[k])
- for k in sorted(value.keys())
- )
-
-
-class DistributionMetric(object):
- """A combination of an event counter and an accumulator, which counts
- both the number of events and accumulates the total value. Typically this
- could be used to keep track of method-running times, or other distributions
- of values that occur in discrete occurances.
-
- TODO(paul): Try to export some heatmap-style stats?
- """
-
- def __init__(self, name, *args, **kwargs):
- self.counts = CounterMetric(name + ":count", **kwargs)
- self.totals = CounterMetric(name + ":total", **kwargs)
-
- def inc_by(self, inc, *values):
- self.counts.inc(*values)
- self.totals.inc_by(inc, *values)
-
- def render(self):
- return self.counts.render() + self.totals.render()
-
-
-class CacheMetric(object):
- __slots__ = (
- "name", "cache_name", "hits", "misses", "evicted_size", "size_callback",
- )
-
- def __init__(self, name, size_callback, cache_name):
- self.name = name
- self.cache_name = cache_name
-
- self.hits = 0
- self.misses = 0
- self.evicted_size = 0
-
- self.size_callback = size_callback
-
- def inc_hits(self):
- self.hits += 1
-
- def inc_misses(self):
- self.misses += 1
-
- def inc_evictions(self, size=1):
- self.evicted_size += size
-
- def render(self):
- size = self.size_callback()
- hits = self.hits
- total = self.misses + self.hits
-
- return [
- """%s:hits{name="%s"} %d""" % (self.name, self.cache_name, hits),
- """%s:total{name="%s"} %d""" % (self.name, self.cache_name, total),
- """%s:size{name="%s"} %d""" % (self.name, self.cache_name, size),
- """%s:evicted_size{name="%s"} %d""" % (
- self.name, self.cache_name, self.evicted_size
- ),
- ]
-
-
-class MemoryUsageMetric(object):
- """Keeps track of the current memory usage, using psutil.
-
- The class will keep the current min/max/sum/counts of rss over the last
- WINDOW_SIZE_SEC, by polling UPDATE_HZ times per second
- """
-
- UPDATE_HZ = 2 # number of times to get memory per second
- WINDOW_SIZE_SEC = 30 # the size of the window in seconds
-
- def __init__(self, hs, psutil):
- clock = hs.get_clock()
- self.memory_snapshots = []
-
- self.process = psutil.Process()
-
- clock.looping_call(self._update_curr_values, 1000 / self.UPDATE_HZ)
-
- def _update_curr_values(self):
- max_size = self.UPDATE_HZ * self.WINDOW_SIZE_SEC
- self.memory_snapshots.append(self.process.memory_info().rss)
- self.memory_snapshots[:] = self.memory_snapshots[-max_size:]
-
- def render(self):
- if not self.memory_snapshots:
- return []
-
- max_rss = max(self.memory_snapshots)
- min_rss = min(self.memory_snapshots)
- sum_rss = sum(self.memory_snapshots)
- len_rss = len(self.memory_snapshots)
-
- return [
- "process_psutil_rss:max %d" % max_rss,
- "process_psutil_rss:min %d" % min_rss,
- "process_psutil_rss:total %d" % sum_rss,
- "process_psutil_rss:count %d" % len_rss,
- ]
-
-
-def _escape_character(m):
- """Replaces a single character with its escape sequence.
-
- Args:
- m (re.MatchObject): A match object whose first group is the single
- character to replace
-
- Returns:
- str
- """
- c = m.group(1)
- if c == "\\":
- return "\\\\"
- elif c == "\"":
- return "\\\""
- elif c == "\n":
- return "\\n"
- return c
-
-
-def _escape_label_value(value):
- """Takes a label value and escapes quotes, newlines and backslashes
- """
- return re.sub(r"([\n\"\\])", _escape_character, str(value))
diff --git a/synapse/metrics/process_collector.py b/synapse/metrics/process_collector.py
deleted file mode 100644
index 6fec3de399..0000000000
--- a/synapse/metrics/process_collector.py
+++ /dev/null
@@ -1,122 +0,0 @@
-# -*- coding: utf-8 -*-
-# Copyright 2015, 2016 OpenMarket Ltd
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import os
-
-
-TICKS_PER_SEC = 100
-BYTES_PER_PAGE = 4096
-
-HAVE_PROC_STAT = os.path.exists("/proc/stat")
-HAVE_PROC_SELF_STAT = os.path.exists("/proc/self/stat")
-HAVE_PROC_SELF_LIMITS = os.path.exists("/proc/self/limits")
-HAVE_PROC_SELF_FD = os.path.exists("/proc/self/fd")
-
-# Field indexes from /proc/self/stat, taken from the proc(5) manpage
-STAT_FIELDS = {
- "utime": 14,
- "stime": 15,
- "starttime": 22,
- "vsize": 23,
- "rss": 24,
-}
-
-
-stats = {}
-
-# In order to report process_start_time_seconds we need to know the
-# machine's boot time, because the value in /proc/self/stat is relative to
-# this
-boot_time = None
-if HAVE_PROC_STAT:
- with open("/proc/stat") as _procstat:
- for line in _procstat:
- if line.startswith("btime "):
- boot_time = int(line.split()[1])
-
-
-def update_resource_metrics():
- if HAVE_PROC_SELF_STAT:
- global stats
- with open("/proc/self/stat") as s:
- line = s.read()
- # line is PID (command) more stats go here ...
- raw_stats = line.split(") ", 1)[1].split(" ")
-
- for (name, index) in STAT_FIELDS.iteritems():
- # subtract 3 from the index, because proc(5) is 1-based, and
- # we've lost the first two fields in PID and COMMAND above
- stats[name] = int(raw_stats[index - 3])
-
-
-def _count_fds():
- # Not every OS will have a /proc/self/fd directory
- if not HAVE_PROC_SELF_FD:
- return 0
-
- return len(os.listdir("/proc/self/fd"))
-
-
-def register_process_collector(process_metrics):
- process_metrics.register_collector(update_resource_metrics)
-
- if HAVE_PROC_SELF_STAT:
- process_metrics.register_callback(
- "cpu_user_seconds_total",
- lambda: float(stats["utime"]) / TICKS_PER_SEC
- )
- process_metrics.register_callback(
- "cpu_system_seconds_total",
- lambda: float(stats["stime"]) / TICKS_PER_SEC
- )
- process_metrics.register_callback(
- "cpu_seconds_total",
- lambda: (float(stats["utime"] + stats["stime"])) / TICKS_PER_SEC
- )
-
- process_metrics.register_callback(
- "virtual_memory_bytes",
- lambda: int(stats["vsize"])
- )
- process_metrics.register_callback(
- "resident_memory_bytes",
- lambda: int(stats["rss"]) * BYTES_PER_PAGE
- )
-
- process_metrics.register_callback(
- "start_time_seconds",
- lambda: boot_time + int(stats["starttime"]) / TICKS_PER_SEC
- )
-
- if HAVE_PROC_SELF_FD:
- process_metrics.register_callback(
- "open_fds",
- lambda: _count_fds()
- )
-
- if HAVE_PROC_SELF_LIMITS:
- def _get_max_fds():
- with open("/proc/self/limits") as limits:
- for line in limits:
- if not line.startswith("Max open files "):
- continue
- # Line is Max open files $SOFT $HARD
- return int(line.split()[3])
- return None
-
- process_metrics.register_callback(
- "max_fds",
- lambda: _get_max_fds()
- )
diff --git a/synapse/metrics/resource.py b/synapse/metrics/resource.py
index 870f400600..7996e6ab66 100644
--- a/synapse/metrics/resource.py
+++ b/synapse/metrics/resource.py
@@ -13,27 +13,4 @@
# See the License for the specific language governing permissions and
# limitations under the License.
-from twisted.web.resource import Resource
-
-import synapse.metrics
-
-
METRICS_PREFIX = "/_synapse/metrics"
-
-
-class MetricsResource(Resource):
- isLeaf = True
-
- def __init__(self, hs):
- Resource.__init__(self) # Resource is old-style, so no super()
-
- self.hs = hs
-
- def render_GET(self, request):
- response = synapse.metrics.render_all()
-
- request.setHeader("Content-Type", "text/plain")
- request.setHeader("Content-Length", str(len(response)))
-
- # Encode as UTF-8 (default)
- return response.encode()
diff --git a/synapse/notifier.py b/synapse/notifier.py
index 8355c7d621..123e6f1840 100644
--- a/synapse/notifier.py
+++ b/synapse/notifier.py
@@ -28,22 +28,19 @@ from synapse.util.logcontext import PreserveLoggingContext, run_in_background
from synapse.util.metrics import Measure
from synapse.types import StreamToken
from synapse.visibility import filter_events_for_client
-import synapse.metrics
+from synapse.metrics import LaterGauge
from collections import namedtuple
+from prometheus_client import Counter
import logging
logger = logging.getLogger(__name__)
-metrics = synapse.metrics.get_metrics_for(__name__)
+notified_events_counter = Counter("synapse_notifier_notified_events", "")
-notified_events_counter = metrics.register_counter("notified_events")
-
-users_woken_by_stream_counter = metrics.register_counter(
- "users_woken_by_stream", labels=["stream"]
-)
+users_woken_by_stream_counter = Counter("synapse_notifier_users_woken_by_stream", "", ["stream"])
# TODO(paul): Should be shared somewhere
@@ -108,7 +105,7 @@ class _NotifierUserStream(object):
self.last_notified_ms = time_now_ms
noify_deferred = self.notify_deferred
- users_woken_by_stream_counter.inc(stream_key)
+ users_woken_by_stream_counter.labels(stream_key).inc()
with PreserveLoggingContext():
self.notify_deferred = ObservableDeferred(defer.Deferred())
@@ -197,14 +194,14 @@ class Notifier(object):
all_user_streams.add(x)
return sum(stream.count_listeners() for stream in all_user_streams)
- metrics.register_callback("listeners", count_listeners)
+ LaterGauge("listeners", "", [], count_listeners)
- metrics.register_callback(
- "rooms",
+ LaterGauge(
+ "rooms", "", [],
lambda: count(bool, self.room_to_user_streams.values()),
)
- metrics.register_callback(
- "users",
+ LaterGauge(
+ "users", "", [],
lambda: len(self.user_to_user_stream),
)
diff --git a/synapse/push/bulk_push_rule_evaluator.py b/synapse/push/bulk_push_rule_evaluator.py
index 7c680659b6..6fcca5e260 100644
--- a/synapse/push/bulk_push_rule_evaluator.py
+++ b/synapse/push/bulk_push_rule_evaluator.py
@@ -22,35 +22,29 @@ from .push_rule_evaluator import PushRuleEvaluatorForEvent
from synapse.event_auth import get_user_power_level
from synapse.api.constants import EventTypes, Membership
-from synapse.metrics import get_metrics_for
-from synapse.util.caches import metrics as cache_metrics
+from synapse.util.caches import register_cache
from synapse.util.caches.descriptors import cached
from synapse.util.async import Linearizer
from synapse.state import POWER_KEY
from collections import namedtuple
-
+from prometheus_client import Counter
logger = logging.getLogger(__name__)
rules_by_room = {}
-push_metrics = get_metrics_for(__name__)
-push_rules_invalidation_counter = push_metrics.register_counter(
- "push_rules_invalidation_counter"
-)
-push_rules_state_size_counter = push_metrics.register_counter(
- "push_rules_state_size_counter"
-)
+push_rules_invalidation_counter = Counter("synapse_push_bulk_push_role_evaluator_push_rules_invalidation_counter", "")
+push_rules_state_size_counter = Counter("synapse_push_bulk_push_role_evaluator_push_rules_state_size_counter", "")
# Measures whether we use the fast path of using state deltas, or if we have to
# recalculate from scratch
-push_rules_delta_state_cache_metric = cache_metrics.register_cache(
+push_rules_delta_state_cache_metric = register_cache(
"cache",
- size_callback=lambda: 0, # Meaningless size, as this isn't a cache that stores values
- cache_name="push_rules_delta_state_cache_metric",
+ "push_rules_delta_state_cache_metric",
+ cache=[], # Meaningless size, as this isn't a cache that stores values
)
@@ -64,10 +58,10 @@ class BulkPushRuleEvaluator(object):
self.store = hs.get_datastore()
self.auth = hs.get_auth()
- self.room_push_rule_cache_metrics = cache_metrics.register_cache(
+ self.room_push_rule_cache_metrics = register_cache(
"cache",
- size_callback=lambda: 0, # There's not good value for this
- cache_name="room_push_rule_cache",
+ "room_push_rule_cache",
+ cache=[], # Meaningless size, as this isn't a cache that stores values
)
@defer.inlineCallbacks
@@ -309,7 +303,7 @@ class RulesForRoom(object):
current_state_ids = context.current_state_ids
push_rules_delta_state_cache_metric.inc_misses()
- push_rules_state_size_counter.inc_by(len(current_state_ids))
+ push_rules_state_size_counter.inc(len(current_state_ids))
logger.debug(
"Looking for member changes in %r %r", state_group, current_state_ids
diff --git a/synapse/push/httppusher.py b/synapse/push/httppusher.py
index b077e1a446..e22088ad6f 100644
--- a/synapse/push/httppusher.py
+++ b/synapse/push/httppusher.py
@@ -20,22 +20,17 @@ from twisted.internet.error import AlreadyCalled, AlreadyCancelled
from . import push_rule_evaluator
from . import push_tools
-import synapse
from synapse.push import PusherConfigException
from synapse.util.logcontext import LoggingContext
from synapse.util.metrics import Measure
-logger = logging.getLogger(__name__)
+from prometheus_client import Counter
-metrics = synapse.metrics.get_metrics_for(__name__)
+logger = logging.getLogger(__name__)
-http_push_processed_counter = metrics.register_counter(
- "http_pushes_processed",
-)
+http_push_processed_counter = Counter("http_pushes_processed", "")
-http_push_failed_counter = metrics.register_counter(
- "http_pushes_failed",
-)
+http_push_failed_counter = Counter("http_pushes_failed", "")
class HttpPusher(object):
diff --git a/synapse/push/push_rule_evaluator.py b/synapse/push/push_rule_evaluator.py
index 3601f2d365..c3e6c5c258 100644
--- a/synapse/push/push_rule_evaluator.py
+++ b/synapse/push/push_rule_evaluator.py
@@ -150,7 +150,7 @@ class PushRuleEvaluatorForEvent(object):
# Caches (glob, word_boundary) -> regex for push. See _glob_matches
regex_cache = LruCache(50000 * CACHE_SIZE_FACTOR)
-register_cache("regex_push_cache", regex_cache)
+register_cache("cache", "regex_push_cache", regex_cache)
def _glob_matches(glob, value, word_boundary=False):
diff --git a/synapse/python_dependencies.py b/synapse/python_dependencies.py
index 216db4d164..478c497722 100644
--- a/synapse/python_dependencies.py
+++ b/synapse/python_dependencies.py
@@ -56,6 +56,7 @@ REQUIREMENTS = {
"msgpack-python>=0.3.0": ["msgpack"],
"phonenumbers>=8.2.0": ["phonenumbers"],
"six": ["six"],
+ "prometheus_client": ["prometheus_client"],
}
CONDITIONAL_REQUIREMENTS = {
"web_client": {
diff --git a/synapse/replication/tcp/protocol.py b/synapse/replication/tcp/protocol.py
index d7d38464b2..5848f57c5e 100644
--- a/synapse/replication/tcp/protocol.py
+++ b/synapse/replication/tcp/protocol.py
@@ -60,19 +60,19 @@ from .commands import (
)
from .streams import STREAMS_MAP
+from synapse.metrics import LaterGauge
from synapse.util.stringutils import random_string
-from synapse.metrics.metric import CounterMetric
+
+from prometheus_client import Counter
+
+from collections import defaultdict
import logging
-import synapse.metrics
import struct
import fcntl
-
-metrics = synapse.metrics.get_metrics_for(__name__)
-
-connection_close_counter = metrics.register_counter(
- "close_reason", labels=["reason_type"],
+connection_close_counter = Counter(
+ "synapse_replication_tcp_protocol_close_reason", "", ["reason_type"],
)
@@ -136,12 +136,8 @@ class BaseReplicationStreamProtocol(LineOnlyReceiver):
# The LoopingCall for sending pings.
self._send_ping_loop = None
- self.inbound_commands_counter = CounterMetric(
- "inbound_commands", labels=["command"],
- )
- self.outbound_commands_counter = CounterMetric(
- "outbound_commands", labels=["command"],
- )
+ self.inbound_commands_counter = defaultdict(int)
+ self.outbound_commands_counter = defaultdict(int)
def connectionMade(self):
logger.info("[%s] Connection established", self.id())
@@ -201,7 +197,7 @@ class BaseReplicationStreamProtocol(LineOnlyReceiver):
self.last_received_command = self.clock.time_msec()
- self.inbound_commands_counter.inc(cmd_name)
+ self.inbound_commands_counter[cmd_name] = self.inbound_commands_counter[cmd_name] + 1
cmd_cls = COMMAND_MAP[cmd_name]
try:
@@ -251,8 +247,7 @@ class BaseReplicationStreamProtocol(LineOnlyReceiver):
self._queue_command(cmd)
return
- self.outbound_commands_counter.inc(cmd.NAME)
-
+ self.outbound_commands_counter[cmd.NAME] = self.outbound_commands_counter[cmd.NAME] + 1
string = "%s %s" % (cmd.NAME, cmd.to_line(),)
if "\n" in string:
raise Exception("Unexpected newline in command: %r", string)
@@ -317,9 +312,9 @@ class BaseReplicationStreamProtocol(LineOnlyReceiver):
def connectionLost(self, reason):
logger.info("[%s] Replication connection closed: %r", self.id(), reason)
if isinstance(reason, Failure):
- connection_close_counter.inc(reason.type.__name__)
+ connection_close_counter.labels(reason.type.__name__).inc()
else:
- connection_close_counter.inc(reason.__class__.__name__)
+ connection_close_counter.labels(reason.__class__.__name__).inc()
try:
# Remove us from list of connections to be monitored
@@ -566,14 +561,12 @@ class ClientReplicationStreamProtocol(BaseReplicationStreamProtocol):
# The following simply registers metrics for the replication connections
-metrics.register_callback(
- "pending_commands",
+pending_commands = LaterGauge(
+ "pending_commands", "", ["name", "conn_id"],
lambda: {
(p.name, p.conn_id): len(p.pending_commands)
for p in connected_connections
- },
- labels=["name", "conn_id"],
-)
+ })
def transport_buffer_size(protocol):
@@ -583,14 +576,12 @@ def transport_buffer_size(protocol):
return 0
-metrics.register_callback(
- "transport_send_buffer",
+transport_send_buffer = LaterGauge(
+ "synapse_replication_tcp_transport_send_buffer", "", ["name", "conn_id"],
lambda: {
(p.name, p.conn_id): transport_buffer_size(p)
for p in connected_connections
- },
- labels=["name", "conn_id"],
-)
+ })
def transport_kernel_read_buffer_size(protocol, read=True):
@@ -608,48 +599,37 @@ def transport_kernel_read_buffer_size(protocol, read=True):
return 0
-metrics.register_callback(
- "transport_kernel_send_buffer",
+tcp_transport_kernel_send_buffer = LaterGauge(
+ "synapse_replication_tcp_transport_kernel_send_buffer", "", ["name", "conn_id"],
lambda: {
(p.name, p.conn_id): transport_kernel_read_buffer_size(p, False)
for p in connected_connections
- },
- labels=["name", "conn_id"],
-)
+ })
-metrics.register_callback(
- "transport_kernel_read_buffer",
+tcp_transport_kernel_read_buffer = LaterGauge(
+ "synapse_replication_tcp_transport_kernel_read_buffer", "", ["name", "conn_id"],
lambda: {
(p.name, p.conn_id): transport_kernel_read_buffer_size(p, True)
for p in connected_connections
- },
- labels=["name", "conn_id"],
-)
+ })
-metrics.register_callback(
- "inbound_commands",
+tcp_inbound_commands = LaterGauge(
+ "synapse_replication_tcp_inbound_commands", "", ["command", "name", "conn_id"],
lambda: {
(k[0], p.name, p.conn_id): count
for p in connected_connections
- for k, count in p.inbound_commands_counter.counts.iteritems()
- },
- labels=["command", "name", "conn_id"],
-)
+ for k, count in p.inbound_commands_counter.items()
+ })
-metrics.register_callback(
- "outbound_commands",
+tcp_outbound_commands = LaterGauge(
+ "synapse_replication_tcp_outbound_commands", "", ["command", "name", "conn_id"],
lambda: {
(k[0], p.name, p.conn_id): count
for p in connected_connections
- for k, count in p.outbound_commands_counter.counts.iteritems()
- },
- labels=["command", "name", "conn_id"],
-)
+ for k, count in p.outbound_commands_counter.items()
+ })
# number of updates received for each RDATA stream
-inbound_rdata_count = metrics.register_counter(
- "inbound_rdata_count",
- labels=["stream_name"],
-)
+inbound_rdata_count = Counter("synapse_replication_tcp_inbound_rdata_count", "", ["stream_name"])
diff --git a/synapse/replication/tcp/resource.py b/synapse/replication/tcp/resource.py
index a603c520ea..ed251c79ac 100644
--- a/synapse/replication/tcp/resource.py
+++ b/synapse/replication/tcp/resource.py
@@ -22,20 +22,19 @@ from .streams import STREAMS_MAP, FederationStream
from .protocol import ServerReplicationStreamProtocol
from synapse.util.metrics import Measure, measure_func
+from synapse.metrics import LaterGauge
import logging
-import synapse.metrics
+from prometheus_client import Counter
-metrics = synapse.metrics.get_metrics_for(__name__)
-stream_updates_counter = metrics.register_counter(
- "stream_updates", labels=["stream_name"]
+stream_updates_counter = Counter("synapse_replication_tcp_resource_stream_updates", "", ["stream_name"]
)
-user_sync_counter = metrics.register_counter("user_sync")
-federation_ack_counter = metrics.register_counter("federation_ack")
-remove_pusher_counter = metrics.register_counter("remove_pusher")
-invalidate_cache_counter = metrics.register_counter("invalidate_cache")
-user_ip_cache_counter = metrics.register_counter("user_ip_cache")
+user_sync_counter = Counter("synapse_replication_tcp_resource_user_sync", "")
+federation_ack_counter = Counter("synapse_replication_tcp_resource_federation_ack", "")
+remove_pusher_counter = Counter("synapse_replication_tcp_resource_remove_pusher", "")
+invalidate_cache_counter = Counter("synapse_replication_tcp_resource_invalidate_cache", "")
+user_ip_cache_counter = Counter("synapse_replication_tcp_resource_user_ip_cache", "")
logger = logging.getLogger(__name__)
@@ -74,7 +73,8 @@ class ReplicationStreamer(object):
# Current connections.
self.connections = []
- metrics.register_callback("total_connections", lambda: len(self.connections))
+ l = LaterGauge("synapse_replication_tcp_resource_total_connections", "", [], lambda: len(self.connections))
+ l.register()
# List of streams that clients can subscribe to.
# We only support federation stream if federation sending hase been
@@ -86,17 +86,15 @@ class ReplicationStreamer(object):
self.streams_by_name = {stream.NAME: stream for stream in self.streams}
- metrics.register_callback(
- "connections_per_stream",
+ LaterGauge(
+ "synapse_replication_tcp_resource_connections_per_stream", "", ["stream_name"],
lambda: {
(stream_name,): len([
conn for conn in self.connections
if stream_name in conn.replication_streams
])
for stream_name in self.streams_by_name
- },
- labels=["stream_name"],
- )
+ }).register()
self.federation_sender = None
if not hs.config.send_federation:
@@ -176,7 +174,7 @@ class ReplicationStreamer(object):
logger.info(
"Streaming: %s -> %s", stream.NAME, updates[-1][0]
)
- stream_updates_counter.inc_by(len(updates), stream.NAME)
+ stream_updates_counter.labels(stream.NAME).inc(len(updates))
# Some streams return multiple rows with the same stream IDs,
# we need to make sure they get sent out in batches. We do
diff --git a/synapse/storage/_base.py b/synapse/storage/_base.py
index 2262776ab2..d1b625dc30 100644
--- a/synapse/storage/_base.py
+++ b/synapse/storage/_base.py
@@ -18,8 +18,8 @@ from synapse.api.errors import StoreError
from synapse.util.logcontext import LoggingContext, PreserveLoggingContext
from synapse.util.caches.descriptors import Cache
from synapse.storage.engines import PostgresEngine
-import synapse.metrics
+from prometheus_client import Histogram
from twisted.internet import defer
@@ -34,13 +34,10 @@ sql_logger = logging.getLogger("synapse.storage.SQL")
transaction_logger = logging.getLogger("synapse.storage.txn")
perf_logger = logging.getLogger("synapse.storage.TIME")
+sql_scheduling_timer = Histogram("synapse_storage_schedule_time", "")
-metrics = synapse.metrics.get_metrics_for("synapse.storage")
-
-sql_scheduling_timer = metrics.register_distribution("schedule_time")
-
-sql_query_timer = metrics.register_distribution("query_time", labels=["verb"])
-sql_txn_timer = metrics.register_distribution("transaction_time", labels=["desc"])
+sql_query_timer = Histogram("synapse_storage_query_time", "", ["verb"])
+sql_txn_timer = Histogram("synapse_storage_transaction_time", "", ["desc"])
class LoggingTransaction(object):
@@ -117,7 +114,7 @@ class LoggingTransaction(object):
finally:
msecs = (time.time() * 1000) - start
sql_logger.debug("[SQL time] {%s} %f", self.name, msecs)
- sql_query_timer.inc_by(msecs, sql.split()[0])
+ sql_query_timer.labels(sql.split()[0]).observe(msecs)
class PerformanceCounters(object):
@@ -287,7 +284,7 @@ class SQLBaseStore(object):
self._current_txn_total_time += duration
self._txn_perf_counters.update(desc, start, end)
- sql_txn_timer.inc_by(duration, desc)
+ sql_txn_timer.labels(desc).observe(duration)
@defer.inlineCallbacks
def runInteraction(self, desc, func, *args, **kwargs):
@@ -349,7 +346,7 @@ class SQLBaseStore(object):
def inner_func(conn, *args, **kwargs):
with LoggingContext("runWithConnection") as context:
sched_duration_ms = time.time() * 1000 - start_time
- sql_scheduling_timer.inc_by(sched_duration_ms)
+ sql_scheduling_timer.observe(sched_duration_ms)
current_context.add_database_scheduled(sched_duration_ms)
if self.database_engine.is_connection_closed(conn):
diff --git a/synapse/storage/events.py b/synapse/storage/events.py
index 5ebef98c4f..00d66886ad 100644
--- a/synapse/storage/events.py
+++ b/synapse/storage/events.py
@@ -40,30 +40,24 @@ import synapse.metrics
from synapse.events import EventBase # noqa: F401
from synapse.events.snapshot import EventContext # noqa: F401
-logger = logging.getLogger(__name__)
+from prometheus_client import Counter
+logger = logging.getLogger(__name__)
-metrics = synapse.metrics.get_metrics_for(__name__)
-persist_event_counter = metrics.register_counter("persisted_events")
-event_counter = metrics.register_counter(
- "persisted_events_sep", labels=["type", "origin_type", "origin_entity"]
-)
+persist_event_counter = Counter("synapse_storage_events_persisted_events", "")
+event_counter = Counter("synapse_storage_events_persisted_events_sep", "", ["type", "origin_type", "origin_entity"])
# The number of times we are recalculating the current state
-state_delta_counter = metrics.register_counter(
- "state_delta",
-)
+state_delta_counter = Counter("synapse_storage_events_state_delta", "")
+
# The number of times we are recalculating state when there is only a
# single forward extremity
-state_delta_single_event_counter = metrics.register_counter(
- "state_delta_single_event",
-)
+state_delta_single_event_counter = Counter("synapse_storage_events_state_delta_single_event", "")
+
# The number of times we are reculating state when we could have resonably
# calculated the delta when we calculated the state for an event we were
# persisting.
-state_delta_reuse_delta_counter = metrics.register_counter(
- "state_delta_reuse_delta",
-)
+state_delta_reuse_delta_counter = Counter("synapse_storage_events_state_delta_reuse_delta", "")
def encode_json(json_object):
@@ -445,7 +439,7 @@ class EventsStore(EventsWorkerStore):
state_delta_for_room=state_delta_for_room,
new_forward_extremeties=new_forward_extremeties,
)
- persist_event_counter.inc_by(len(chunk))
+ persist_event_counter.inc(len(chunk))
synapse.metrics.event_persisted_position.set(
chunk[-1][0].internal_metadata.stream_ordering,
)
@@ -460,7 +454,7 @@ class EventsStore(EventsWorkerStore):
origin_type = "remote"
origin_entity = get_domain_from_id(event.sender)
- event_counter.inc(event.type, origin_type, origin_entity)
+ event_counter.labels(event.type, origin_type, origin_entity).inc()
for room_id, new_state in current_state_for_room.iteritems():
self.get_current_state_ids.prefill(
diff --git a/synapse/util/caches/__init__.py b/synapse/util/caches/__init__.py
index 4adae96681..438dcddf55 100644
--- a/synapse/util/caches/__init__.py
+++ b/synapse/util/caches/__init__.py
@@ -13,29 +13,52 @@
# See the License for the specific language governing permissions and
# limitations under the License.
-import synapse.metrics
+from prometheus_client.core import GaugeMetricFamily, REGISTRY
+
import os
CACHE_SIZE_FACTOR = float(os.environ.get("SYNAPSE_CACHE_FACTOR", 0.5))
-metrics = synapse.metrics.get_metrics_for("synapse.util.caches")
-
caches_by_name = {}
-# cache_counter = metrics.register_cache(
-# "cache",
-# lambda: {(name,): len(caches_by_name[name]) for name in caches_by_name.keys()},
-# labels=["name"],
-# )
-
-
-def register_cache(name, cache):
- caches_by_name[name] = cache
- return metrics.register_cache(
- "cache",
- lambda: len(cache),
- name,
- )
+collectors_by_name = {}
+
+def register_cache(name, cache_name, cache):
+
+ # Check if the metric is already registered. Unregister it, if so.
+ metric_name = "synapse_util_caches_%s:%s" % (name, cache_name,)
+ if metric_name in collectors_by_name.keys():
+ REGISTRY.unregister(collectors_by_name[metric_name])
+
+ class CacheMetric(object):
+
+ hits = 0
+ misses = 0
+ evicted_size = 0
+
+ def inc_hits(self):
+ self.hits += 1
+
+ def inc_misses(self):
+ self.misses += 1
+
+ def inc_evictions(self, size=1):
+ self.evicted_size += size
+
+ def collect(self):
+ cache_size = len(cache)
+
+ gm = GaugeMetricFamily(metric_name, "", labels=["size", "hits", "misses", "total"])
+ gm.add_metric(["size"], cache_size)
+ gm.add_metric(["hits"], self.hits)
+ gm.add_metric(["misses"], self.misses)
+ gm.add_metric(["total"], self.hits + self.misses)
+ yield gm
+ metric = CacheMetric()
+ REGISTRY.register(metric)
+ caches_by_name[cache_name] = cache
+ collectors_by_name[metric_name] = metric
+ return metric
KNOWN_KEYS = {
key: key for key in
diff --git a/synapse/util/caches/descriptors.py b/synapse/util/caches/descriptors.py
index 68285a7594..a4188eb099 100644
--- a/synapse/util/caches/descriptors.py
+++ b/synapse/util/caches/descriptors.py
@@ -80,7 +80,7 @@ class Cache(object):
self.name = name
self.keylen = keylen
self.thread = None
- self.metrics = register_cache(name, self.cache)
+ self.metrics = register_cache("descriptor", name, self.cache)
def _on_evicted(self, evicted_count):
self.metrics.inc_evictions(evicted_count)
diff --git a/synapse/util/caches/dictionary_cache.py b/synapse/util/caches/dictionary_cache.py
index 1709e8b429..bdc21e348f 100644
--- a/synapse/util/caches/dictionary_cache.py
+++ b/synapse/util/caches/dictionary_cache.py
@@ -55,7 +55,7 @@ class DictionaryCache(object):
__slots__ = []
self.sentinel = Sentinel()
- self.metrics = register_cache(name, self.cache)
+ self.metrics = register_cache("dictionary", name, self.cache)
def check_thread(self):
expected_thread = self.thread
diff --git a/synapse/util/caches/expiringcache.py b/synapse/util/caches/expiringcache.py
index 0aa103eecb..ff04c91955 100644
--- a/synapse/util/caches/expiringcache.py
+++ b/synapse/util/caches/expiringcache.py
@@ -52,12 +52,12 @@ class ExpiringCache(object):
self._cache = OrderedDict()
- self.metrics = register_cache(cache_name, self)
-
self.iterable = iterable
self._size_estimate = 0
+ self.metrics = register_cache("expiring", cache_name, self)
+
def start(self):
if not self._expiry_ms:
# Don't bother starting the loop if things never expire
diff --git a/synapse/util/caches/response_cache.py b/synapse/util/caches/response_cache.py
index 7f79333e96..a8491b42d5 100644
--- a/synapse/util/caches/response_cache.py
+++ b/synapse/util/caches/response_cache.py
@@ -17,7 +17,7 @@ import logging
from twisted.internet import defer
from synapse.util.async import ObservableDeferred
-from synapse.util.caches import metrics as cache_metrics
+from synapse.util.caches import register_cache
from synapse.util.logcontext import make_deferred_yieldable, run_in_background
logger = logging.getLogger(__name__)
@@ -38,15 +38,16 @@ class ResponseCache(object):
self.timeout_sec = timeout_ms / 1000.
self._name = name
- self._metrics = cache_metrics.register_cache(
- "response_cache",
- size_callback=lambda: self.size(),
- cache_name=name,
+ self._metrics = register_cache(
+ "response_cache", name, self
)
def size(self):
return len(self.pending_result_cache)
+ def __len__(self):
+ return self.size()
+
def get(self, key):
"""Look up the given key.
diff --git a/synapse/util/caches/stream_change_cache.py b/synapse/util/caches/stream_change_cache.py
index 941d873ab8..a7fe0397fa 100644
--- a/synapse/util/caches/stream_change_cache.py
+++ b/synapse/util/caches/stream_change_cache.py
@@ -38,7 +38,7 @@ class StreamChangeCache(object):
self._cache = sorteddict()
self._earliest_known_stream_pos = current_stream_pos
self.name = name
- self.metrics = register_cache(self.name, self._cache)
+ self.metrics = register_cache("cache", self.name, self._cache)
for entity, stream_pos in prefilled_cache.items():
self.entity_has_changed(entity, stream_pos)
diff --git a/synapse/util/metrics.py b/synapse/util/metrics.py
index e4b5687a4b..a964286d85 100644
--- a/synapse/util/metrics.py
+++ b/synapse/util/metrics.py
@@ -15,8 +15,8 @@
from twisted.internet import defer
+from prometheus_client import Counter
from synapse.util.logcontext import LoggingContext
-import synapse.metrics
from functools import wraps
import logging
@@ -24,66 +24,21 @@ import logging
logger = logging.getLogger(__name__)
+block_counter = Counter("synapse_util_metrics_block_count", "", ["block_name"])
-metrics = synapse.metrics.get_metrics_for(__name__)
-
-# total number of times we have hit this block
-block_counter = metrics.register_counter(
- "block_count",
- labels=["block_name"],
- alternative_names=(
- # the following are all deprecated aliases for the same metric
- metrics.name_prefix + x for x in (
- "_block_timer:count",
- "_block_ru_utime:count",
- "_block_ru_stime:count",
- "_block_db_txn_count:count",
- "_block_db_txn_duration:count",
- )
- )
-)
-
-block_timer = metrics.register_counter(
- "block_time_seconds",
- labels=["block_name"],
- alternative_names=(
- metrics.name_prefix + "_block_timer:total",
- ),
-)
-
-block_ru_utime = metrics.register_counter(
- "block_ru_utime_seconds", labels=["block_name"],
- alternative_names=(
- metrics.name_prefix + "_block_ru_utime:total",
- ),
-)
-
-block_ru_stime = metrics.register_counter(
- "block_ru_stime_seconds", labels=["block_name"],
- alternative_names=(
- metrics.name_prefix + "_block_ru_stime:total",
- ),
-)
-
-block_db_txn_count = metrics.register_counter(
- "block_db_txn_count", labels=["block_name"],
- alternative_names=(
- metrics.name_prefix + "_block_db_txn_count:total",
- ),
-)
+block_timer = Counter("synapse_util_metrics_block_time_seconds", "", ["block_name"])
+
+block_ru_utime = Counter("synapse_util_metrics_block_ru_utime_seconds", "", ["block_name"])
+
+block_ru_stime = Counter("synapse_util_metrics_block_ru_stime_seconds", "", ["block_name"])
+
+block_db_txn_count = Counter("synapse_util_metrics_block_db_txn_count", "", ["block_name"])
# seconds spent waiting for db txns, excluding scheduling time, in this block
-block_db_txn_duration = metrics.register_counter(
- "block_db_txn_duration_seconds", labels=["block_name"],
- alternative_names=(
- metrics.name_prefix + "_block_db_txn_duration:total",
- ),
-)
+block_db_txn_duration = Counter("synapse_util_metrics_block_db_txn_duration_seconds", "", ["block_name"])
# seconds spent waiting for a db connection, in this block
-block_db_sched_duration = metrics.register_counter(
- "block_db_sched_duration_seconds", labels=["block_name"],
-)
+block_db_sched_duration = Counter("synapse_util_metrics_block_db_sched_duration_seconds", "", ["block_name"])
def measure_func(name):
@@ -132,8 +87,8 @@ class Measure(object):
duration = self.clock.time_msec() - self.start
- block_counter.inc(self.name)
- block_timer.inc_by(duration, self.name)
+ block_counter.labels(self.name).inc()
+ block_timer.labels(self.name).inc(duration)
context = LoggingContext.current_context()
@@ -150,19 +105,13 @@ class Measure(object):
ru_utime, ru_stime = context.get_resource_usage()
- block_ru_utime.inc_by(ru_utime - self.ru_utime, self.name)
- block_ru_stime.inc_by(ru_stime - self.ru_stime, self.name)
- block_db_txn_count.inc_by(
- context.db_txn_count - self.db_txn_count, self.name
- )
- block_db_txn_duration.inc_by(
- (context.db_txn_duration_ms - self.db_txn_duration_ms) / 1000.,
- self.name
- )
- block_db_sched_duration.inc_by(
- (context.db_sched_duration_ms - self.db_sched_duration_ms) / 1000.,
- self.name
- )
+ block_ru_utime.labels(self.name).inc(ru_utime - self.ru_utime)
+ block_ru_stime.labels(self.name).inc(ru_stime - self.ru_stime)
+ block_db_txn_count.labels(self.name).inc(context.db_txn_count - self.db_txn_count)
+ block_db_txn_duration.labels(self.name).inc(
+ (context.db_txn_duration_ms - self.db_txn_duration_ms) / 1000.)
+ block_db_sched_duration.labels(self.name).inc(
+ (context.db_sched_duration_ms - self.db_sched_duration_ms) / 1000.)
if self.created_context:
self.start_context.__exit__(exc_type, exc_val, exc_tb)
diff --git a/tests/__init__.py b/tests/__init__.py
index bfebb0f644..aab20e8e02 100644
--- a/tests/__init__.py
+++ b/tests/__init__.py
@@ -12,3 +12,6 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
+
+from twisted.trial import util
+util.DEFAULT_TIMEOUT_DURATION = 10
diff --git a/tests/metrics/__init__.py b/tests/metrics/__init__.py
deleted file mode 100644
index e69de29bb2..0000000000
--- a/tests/metrics/__init__.py
+++ /dev/null
diff --git a/tests/metrics/test_metric.py b/tests/metrics/test_metric.py
deleted file mode 100644
index 069c0be762..0000000000
--- a/tests/metrics/test_metric.py
+++ /dev/null
@@ -1,192 +0,0 @@
-# -*- coding: utf-8 -*-
-# Copyright 2015, 2016 OpenMarket Ltd
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from tests import unittest
-
-from synapse.metrics.metric import (
- CounterMetric, CallbackMetric, DistributionMetric, CacheMetric,
- _escape_label_value,
-)
-
-
-class CounterMetricTestCase(unittest.TestCase):
-
- def test_scalar(self):
- counter = CounterMetric("scalar")
-
- self.assertEquals(counter.render(), [
- 'scalar 0',
- ])
-
- counter.inc()
-
- self.assertEquals(counter.render(), [
- 'scalar 1',
- ])
-
- counter.inc_by(2)
-
- self.assertEquals(counter.render(), [
- 'scalar 3'
- ])
-
- def test_vector(self):
- counter = CounterMetric("vector", labels=["method"])
-
- # Empty counter doesn't yet know what values it has
- self.assertEquals(counter.render(), [])
-
- counter.inc("GET")
-
- self.assertEquals(counter.render(), [
- 'vector{method="GET"} 1',
- ])
-
- counter.inc("GET")
- counter.inc("PUT")
-
- self.assertEquals(counter.render(), [
- 'vector{method="GET"} 2',
- 'vector{method="PUT"} 1',
- ])
-
-
-class CallbackMetricTestCase(unittest.TestCase):
-
- def test_scalar(self):
- d = dict()
-
- metric = CallbackMetric("size", lambda: len(d))
-
- self.assertEquals(metric.render(), [
- 'size 0',
- ])
-
- d["key"] = "value"
-
- self.assertEquals(metric.render(), [
- 'size 1',
- ])
-
- def test_vector(self):
- vals = dict()
-
- metric = CallbackMetric("values", lambda: vals, labels=["type"])
-
- self.assertEquals(metric.render(), [])
-
- # Keys have to be tuples, even if they're 1-element
- vals[("foo",)] = 1
- vals[("bar",)] = 2
-
- self.assertEquals(metric.render(), [
- 'values{type="bar"} 2',
- 'values{type="foo"} 1',
- ])
-
-
-class DistributionMetricTestCase(unittest.TestCase):
-
- def test_scalar(self):
- metric = DistributionMetric("thing")
-
- self.assertEquals(metric.render(), [
- 'thing:count 0',
- 'thing:total 0',
- ])
-
- metric.inc_by(500)
-
- self.assertEquals(metric.render(), [
- 'thing:count 1',
- 'thing:total 500',
- ])
-
- def test_vector(self):
- metric = DistributionMetric("queries", labels=["verb"])
-
- self.assertEquals(metric.render(), [])
-
- metric.inc_by(300, "SELECT")
- metric.inc_by(200, "SELECT")
- metric.inc_by(800, "INSERT")
-
- self.assertEquals(metric.render(), [
- 'queries:count{verb="INSERT"} 1',
- 'queries:count{verb="SELECT"} 2',
- 'queries:total{verb="INSERT"} 800',
- 'queries:total{verb="SELECT"} 500',
- ])
-
-
-class CacheMetricTestCase(unittest.TestCase):
-
- def test_cache(self):
- d = dict()
-
- metric = CacheMetric("cache", lambda: len(d), "cache_name")
-
- self.assertEquals(metric.render(), [
- 'cache:hits{name="cache_name"} 0',
- 'cache:total{name="cache_name"} 0',
- 'cache:size{name="cache_name"} 0',
- 'cache:evicted_size{name="cache_name"} 0',
- ])
-
- metric.inc_misses()
- d["key"] = "value"
-
- self.assertEquals(metric.render(), [
- 'cache:hits{name="cache_name"} 0',
- 'cache:total{name="cache_name"} 1',
- 'cache:size{name="cache_name"} 1',
- 'cache:evicted_size{name="cache_name"} 0',
- ])
-
- metric.inc_hits()
-
- self.assertEquals(metric.render(), [
- 'cache:hits{name="cache_name"} 1',
- 'cache:total{name="cache_name"} 2',
- 'cache:size{name="cache_name"} 1',
- 'cache:evicted_size{name="cache_name"} 0',
- ])
-
- metric.inc_evictions(2)
-
- self.assertEquals(metric.render(), [
- 'cache:hits{name="cache_name"} 1',
- 'cache:total{name="cache_name"} 2',
- 'cache:size{name="cache_name"} 1',
- 'cache:evicted_size{name="cache_name"} 2',
- ])
-
-
-class LabelValueEscapeTestCase(unittest.TestCase):
- def test_simple(self):
- string = "safjhsdlifhyskljfksdfh"
- self.assertEqual(string, _escape_label_value(string))
-
- def test_escape(self):
- self.assertEqual(
- "abc\\\"def\\nghi\\\\",
- _escape_label_value("abc\"def\nghi\\"),
- )
-
- def test_sequence_of_escapes(self):
- self.assertEqual(
- "abc\\\"def\\nghi\\\\\\n",
- _escape_label_value("abc\"def\nghi\\\n"),
- )
|