diff options
author | Richard van der Hoff <github@rvanderhoff.org.uk> | 2018-01-16 14:10:21 +0000 |
---|---|---|
committer | GitHub <noreply@github.com> | 2018-01-16 14:10:21 +0000 |
commit | febdca4b37f50c062b20a8042fdc8abb34e57fbc (patch) | |
tree | 6a221582129d1399a47bdc3b00479b7ffa0624e8 | |
parent | Merge pull request #2790 from matrix-org/rav/preserve_event_logcontext_leak (diff) | |
parent | document metrics changes (diff) | |
download | synapse-febdca4b37f50c062b20a8042fdc8abb34e57fbc.tar.xz |
Merge pull request #2785 from matrix-org/rav/reorganise_metrics_again
Reorganise request and block metrics
-rw-r--r-- | CHANGES.rst | 5 | ||||
-rw-r--r-- | docs/metrics-howto.rst | 61 | ||||
-rw-r--r-- | synapse/http/server.py | 56 | ||||
-rw-r--r-- | synapse/metrics/metric.py | 81 | ||||
-rw-r--r-- | synapse/util/metrics.py | 53 |
5 files changed, 209 insertions, 47 deletions
diff --git a/CHANGES.rst b/CHANGES.rst index 24e4e7a384..a7ed49e105 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -2,7 +2,10 @@ Unreleased ========== synctl no longer starts the main synapse when using ``-a`` option with workers. -A new worker file should be added with ``worker_app: synapse.app.homeserver`` +A new worker file should be added with ``worker_app: synapse.app.homeserver``. + +This release also begins the process of renaming a number of the metrics +reported to prometheus. See `docs/metrics-howto.rst <docs/metrics-howto.rst#block-and-response-metrics-renamed-for-0-27-0>`_. Changes in synapse v0.26.0 (2018-01-05) diff --git a/docs/metrics-howto.rst b/docs/metrics-howto.rst index 143cd0f42f..8acc479bc3 100644 --- a/docs/metrics-howto.rst +++ b/docs/metrics-howto.rst @@ -16,7 +16,7 @@ How to monitor Synapse metrics using Prometheus metrics_port: 9092 Also ensure that ``enable_metrics`` is set to ``True``. - + Restart synapse. 3. Add a prometheus target for synapse. @@ -28,11 +28,58 @@ How to monitor Synapse metrics using Prometheus static_configs: - targets: ["my.server.here:9092"] - If your prometheus is older than 1.5.2, you will need to replace + If your prometheus is older than 1.5.2, you will need to replace ``static_configs`` in the above with ``target_groups``. - + Restart prometheus. + +Block and response metrics renamed for 0.27.0 +--------------------------------------------- + +Synapse 0.27.0 begins the process of rationalising the duplicate ``*:count`` +metrics reported for the resource tracking for code blocks and HTTP requests. + +At the same time, the corresponding ``*:total`` metrics are being renamed, as +the ``:total`` suffix no longer makes sense in the absence of a corresponding +``:count`` metric. + +To enable a graceful migration path, this release just adds new names for the +metrics being renamed. A future release will remove the old ones. + +The following table shows the new metrics, and the old metrics which they are +replacing. + +==================================================== =================================================== +New name Old name +==================================================== =================================================== +synapse_util_metrics_block_count synapse_util_metrics_block_timer:count +synapse_util_metrics_block_count synapse_util_metrics_block_ru_utime:count +synapse_util_metrics_block_count synapse_util_metrics_block_ru_stime:count +synapse_util_metrics_block_count synapse_util_metrics_block_db_txn_count:count +synapse_util_metrics_block_count synapse_util_metrics_block_db_txn_duration:count + +synapse_util_metrics_block_time_seconds synapse_util_metrics_block_timer:total +synapse_util_metrics_block_ru_utime_seconds synapse_util_metrics_block_ru_utime:total +synapse_util_metrics_block_ru_stime_seconds synapse_util_metrics_block_ru_stime:total +synapse_util_metrics_block_db_txn_count synapse_util_metrics_block_db_txn_count:total +synapse_util_metrics_block_db_txn_duration_seconds synapse_util_metrics_block_db_txn_duration:total + +synapse_http_server_response_count synapse_http_server_requests +synapse_http_server_response_count synapse_http_server_response_time:count +synapse_http_server_response_count synapse_http_server_response_ru_utime:count +synapse_http_server_response_count synapse_http_server_response_ru_stime:count +synapse_http_server_response_count synapse_http_server_response_db_txn_count:count +synapse_http_server_response_count synapse_http_server_response_db_txn_duration:count + +synapse_http_server_response_time_seconds synapse_http_server_response_time:total +synapse_http_server_response_ru_utime_seconds synapse_http_server_response_ru_utime:total +synapse_http_server_response_ru_stime_seconds synapse_http_server_response_ru_stime:total +synapse_http_server_response_db_txn_count synapse_http_server_response_db_txn_count:total +synapse_http_server_response_db_txn_duration_seconds synapse_http_server_response_db_txn_duration:total +==================================================== =================================================== + + Standard Metric Names --------------------- @@ -42,7 +89,7 @@ have been changed to seconds, from miliseconds. ================================== ============================= New name Old name ----------------------------------- ----------------------------- +================================== ============================= process_cpu_user_seconds_total process_resource_utime / 1000 process_cpu_system_seconds_total process_resource_stime / 1000 process_open_fds (no 'type' label) process_fds @@ -52,8 +99,8 @@ The python-specific counts of garbage collector performance have been renamed. =========================== ====================== New name Old name ---------------------------- ---------------------- -python_gc_time reactor_gc_time +=========================== ====================== +python_gc_time reactor_gc_time python_gc_unreachable_total reactor_gc_unreachable python_gc_counts reactor_gc_counts =========================== ====================== @@ -62,7 +109,7 @@ The twisted-specific reactor metrics have been renamed. ==================================== ===================== New name Old name ------------------------------------- --------------------- +==================================== ===================== python_twisted_reactor_pending_calls reactor_pending_calls python_twisted_reactor_tick_time reactor_tick_time ==================================== ===================== diff --git a/synapse/http/server.py b/synapse/http/server.py index 6e8f4c9c5f..269b65ca41 100644 --- a/synapse/http/server.py +++ b/synapse/http/server.py @@ -42,34 +42,62 @@ logger = logging.getLogger(__name__) metrics = synapse.metrics.get_metrics_for(__name__) -incoming_requests_counter = metrics.register_counter( - "requests", +# total number of responses served, split by method/servlet/tag +response_count = metrics.register_counter( + "response_count", labels=["method", "servlet", "tag"], + alternative_names=( + # the following are all deprecated aliases for the same metric + metrics.name_prefix + x for x in ( + "_requests", + "_response_time:count", + "_response_ru_utime:count", + "_response_ru_stime:count", + "_response_db_txn_count:count", + "_response_db_txn_duration:count", + ) + ) ) + outgoing_responses_counter = metrics.register_counter( "responses", labels=["method", "code"], ) -response_timer = metrics.register_distribution( - "response_time", - labels=["method", "servlet", "tag"] +response_timer = metrics.register_counter( + "response_time_seconds", + labels=["method", "servlet", "tag"], + alternative_names=( + metrics.name_prefix + "_response_time:total", + ), ) -response_ru_utime = metrics.register_distribution( - "response_ru_utime", labels=["method", "servlet", "tag"] +response_ru_utime = metrics.register_counter( + "response_ru_utime_seconds", labels=["method", "servlet", "tag"], + alternative_names=( + metrics.name_prefix + "_response_ru_utime:total", + ), ) -response_ru_stime = metrics.register_distribution( - "response_ru_stime", labels=["method", "servlet", "tag"] +response_ru_stime = metrics.register_counter( + "response_ru_stime_seconds", labels=["method", "servlet", "tag"], + alternative_names=( + metrics.name_prefix + "_response_ru_stime:total", + ), ) -response_db_txn_count = metrics.register_distribution( - "response_db_txn_count", labels=["method", "servlet", "tag"] +response_db_txn_count = metrics.register_counter( + "response_db_txn_count", labels=["method", "servlet", "tag"], + alternative_names=( + metrics.name_prefix + "_response_db_txn_count:total", + ), ) -response_db_txn_duration = metrics.register_distribution( - "response_db_txn_duration", labels=["method", "servlet", "tag"] +response_db_txn_duration = metrics.register_counter( + "response_db_txn_duration_seconds", labels=["method", "servlet", "tag"], + alternative_names=( + metrics.name_prefix + "_response_db_txn_duration:total", + ), ) @@ -330,7 +358,7 @@ class RequestMetrics(object): ) return - incoming_requests_counter.inc(request.method, self.name, tag) + response_count.inc(request.method, self.name, tag) response_timer.inc_by( clock.time_msec() - self.start, request.method, diff --git a/synapse/metrics/metric.py b/synapse/metrics/metric.py index 1d054dd557..f480aae614 100644 --- a/synapse/metrics/metric.py +++ b/synapse/metrics/metric.py @@ -17,16 +17,33 @@ from itertools import chain -# TODO(paul): I can't believe Python doesn't have one of these -def map_concat(func, items): - # flatten a list-of-lists - return list(chain.from_iterable(map(func, items))) +def flatten(items): + """Flatten a list of lists + + Args: + items: iterable[iterable[X]] + + Returns: + list[X]: flattened list + """ + return list(chain.from_iterable(items)) class BaseMetric(object): + """Base class for metrics which report a single value per label set + """ - def __init__(self, name, labels=[]): - self.name = name + def __init__(self, name, labels=[], alternative_names=[]): + """ + Args: + name (str): principal name for this metric + labels (list(str)): names of the labels which will be reported + for this metric + alternative_names (iterable(str)): list of alternative names for + this metric. This can be useful to provide a migration path + when renaming metrics. + """ + self._names = [name] + list(alternative_names) self.labels = labels # OK not to clone as we never write it def dimension(self): @@ -36,7 +53,7 @@ class BaseMetric(object): return not len(self.labels) def _render_labelvalue(self, value): - # TODO: some kind of value escape + # TODO: escape backslashes, quotes and newlines return '"%s"' % (value) def _render_key(self, values): @@ -47,6 +64,36 @@ class BaseMetric(object): for k, v in zip(self.labels, values)]) ) + def _render_for_labels(self, label_values, value): + """Render this metric for a single set of labels + + Args: + label_values (list[str]): values for each of the labels + value: value of the metric at with these labels + + Returns: + iterable[str]: rendered metric + """ + rendered_labels = self._render_key(label_values) + return ( + "%s%s %.12g" % (name, rendered_labels, value) + for name in self._names + ) + + def render(self): + """Render this metric + + Each metric is rendered as: + + name{label1="val1",label2="val2"} value + + https://prometheus.io/docs/instrumenting/exposition_formats/#text-format-details + + Returns: + iterable[str]: rendered metrics + """ + raise NotImplementedError() + class CounterMetric(BaseMetric): """The simplest kind of metric; one that stores a monotonically-increasing @@ -62,6 +109,10 @@ class CounterMetric(BaseMetric): def __init__(self, *args, **kwargs): super(CounterMetric, self).__init__(*args, **kwargs) + # dict[list[str]]: value for each set of label values. the keys are the + # label values, in the same order as the labels in self.labels. + # + # (if the metric is a scalar, the (single) key is the empty list). self.counts = {} # Scalar metrics are never empty @@ -84,11 +135,11 @@ class CounterMetric(BaseMetric): def inc(self, *values): self.inc_by(1, *values) - def render_item(self, k): - return ["%s%s %.12g" % (self.name, self._render_key(k), self.counts[k])] - def render(self): - return map_concat(self.render_item, sorted(self.counts.keys())) + return flatten( + self._render_for_labels(k, self.counts[k]) + for k in sorted(self.counts.keys()) + ) class CallbackMetric(BaseMetric): @@ -105,10 +156,12 @@ class CallbackMetric(BaseMetric): value = self.callback() if self.is_scalar(): - return ["%s %.12g" % (self.name, value)] + return list(self._render_for_labels([], value)) - return ["%s%s %.12g" % (self.name, self._render_key(k), value[k]) - for k in sorted(value.keys())] + return flatten( + self._render_for_labels(k, value[k]) + for k in sorted(value.keys()) + ) class DistributionMetric(object): diff --git a/synapse/util/metrics.py b/synapse/util/metrics.py index 4ea930d3e8..8d22ff3068 100644 --- a/synapse/util/metrics.py +++ b/synapse/util/metrics.py @@ -27,25 +27,56 @@ logger = logging.getLogger(__name__) metrics = synapse.metrics.get_metrics_for(__name__) -block_timer = metrics.register_distribution( - "block_timer", - labels=["block_name"] +# total number of times we have hit this block +response_count = metrics.register_counter( + "block_count", + labels=["block_name"], + alternative_names=( + # the following are all deprecated aliases for the same metric + metrics.name_prefix + x for x in ( + "_block_timer:count", + "_block_ru_utime:count", + "_block_ru_stime:count", + "_block_db_txn_count:count", + "_block_db_txn_duration:count", + ) + ) +) + +block_timer = metrics.register_counter( + "block_time_seconds", + labels=["block_name"], + alternative_names=( + metrics.name_prefix + "_block_timer:total", + ), ) -block_ru_utime = metrics.register_distribution( - "block_ru_utime", labels=["block_name"] +block_ru_utime = metrics.register_counter( + "block_ru_utime_seconds", labels=["block_name"], + alternative_names=( + metrics.name_prefix + "_block_ru_utime:total", + ), ) -block_ru_stime = metrics.register_distribution( - "block_ru_stime", labels=["block_name"] +block_ru_stime = metrics.register_counter( + "block_ru_stime_seconds", labels=["block_name"], + alternative_names=( + metrics.name_prefix + "_block_ru_stime:total", + ), ) -block_db_txn_count = metrics.register_distribution( - "block_db_txn_count", labels=["block_name"] +block_db_txn_count = metrics.register_counter( + "block_db_txn_count", labels=["block_name"], + alternative_names=( + metrics.name_prefix + "_block_db_txn_count:total", + ), ) -block_db_txn_duration = metrics.register_distribution( - "block_db_txn_duration", labels=["block_name"] +block_db_txn_duration = metrics.register_counter( + "block_db_txn_duration_seconds", labels=["block_name"], + alternative_names=( + metrics.name_prefix + "_block_db_txn_count:total", + ), ) |