summary refs log tree commit diff
diff options
context:
space:
mode:
authorPaul Evans <leonerd@leonerd.org.uk>2016-10-21 13:06:19 +0100
committerGitHub <noreply@github.com>2016-10-21 13:06:19 +0100
commita842fed4183f3227e52fa0682e254a7cd2df6bff (patch)
tree17326c9ff939645cb5be1f28081aaa37d8d7d235
parentMerge pull request #1175 from matrix-org/luke/feature-configurable-as-rate-li... (diff)
parentSplit callback metric lambda functions down onto their own lines to keep line... (diff)
downloadsynapse-a842fed4183f3227e52fa0682e254a7cd2df6bff.tar.xz
Merge pull request #1177 from matrix-org/paul/standard-metric-names
Standardise prometheus metrics
-rwxr-xr-xsynapse/app/homeserver.py2
-rw-r--r--synapse/metrics/__init__.py70
-rw-r--r--synapse/metrics/metric.py4
-rw-r--r--synapse/metrics/process_collector.py181
4 files changed, 191 insertions, 66 deletions
diff --git a/synapse/app/homeserver.py b/synapse/app/homeserver.py
index 54f35900f8..f27150d411 100755
--- a/synapse/app/homeserver.py
+++ b/synapse/app/homeserver.py
@@ -52,6 +52,7 @@ from synapse.config.homeserver import HomeServerConfig
 from synapse.crypto import context_factory
 from synapse.util.logcontext import LoggingContext
 from synapse.metrics import register_memory_metrics, get_metrics_for
+from synapse.metrics.process_collector import register_process_collector
 from synapse.metrics.resource import MetricsResource, METRICS_PREFIX
 from synapse.replication.resource import ReplicationResource, REPLICATION_PREFIX
 from synapse.federation.transport.server import TransportLayerServer
@@ -337,6 +338,7 @@ def setup(config_options):
         hs.get_replication_layer().start_get_pdu_cache()
 
         register_memory_metrics(hs)
+        register_process_collector()
 
     reactor.callWhenRunning(start)
 
diff --git a/synapse/metrics/__init__.py b/synapse/metrics/__init__.py
index 76d5998d75..a6b868775d 100644
--- a/synapse/metrics/__init__.py
+++ b/synapse/metrics/__init__.py
@@ -13,14 +13,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-# Because otherwise 'resource' collides with synapse.metrics.resource
-from __future__ import absolute_import
-
 import logging
-from resource import getrusage, RUSAGE_SELF
 import functools
-import os
-import stat
 import time
 import gc
 
@@ -36,6 +30,7 @@ logger = logging.getLogger(__name__)
 
 
 all_metrics = []
+all_collectors = []
 
 
 class Metrics(object):
@@ -46,6 +41,9 @@ class Metrics(object):
     def __init__(self, name):
         self.name_prefix = name
 
+    def register_collector(self, func):
+        all_collectors.append(func)
+
     def _register(self, metric_class, name, *args, **kwargs):
         full_name = "%s_%s" % (self.name_prefix, name)
 
@@ -94,8 +92,8 @@ def get_metrics_for(pkg_name):
 def render_all():
     strs = []
 
-    # TODO(paul): Internal hack
-    update_resource_metrics()
+    for collector in all_collectors:
+        collector()
 
     for metric in all_metrics:
         try:
@@ -109,62 +107,6 @@ def render_all():
     return "\n".join(strs)
 
 
-# Now register some standard process-wide state metrics, to give indications of
-# process resource usage
-
-rusage = None
-
-
-def update_resource_metrics():
-    global rusage
-    rusage = getrusage(RUSAGE_SELF)
-
-resource_metrics = get_metrics_for("process.resource")
-
-# msecs
-resource_metrics.register_callback("utime", lambda: rusage.ru_utime * 1000)
-resource_metrics.register_callback("stime", lambda: rusage.ru_stime * 1000)
-
-# kilobytes
-resource_metrics.register_callback("maxrss", lambda: rusage.ru_maxrss * 1024)
-
-TYPES = {
-    stat.S_IFSOCK: "SOCK",
-    stat.S_IFLNK: "LNK",
-    stat.S_IFREG: "REG",
-    stat.S_IFBLK: "BLK",
-    stat.S_IFDIR: "DIR",
-    stat.S_IFCHR: "CHR",
-    stat.S_IFIFO: "FIFO",
-}
-
-
-def _process_fds():
-    counts = {(k,): 0 for k in TYPES.values()}
-    counts[("other",)] = 0
-
-    # Not every OS will have a /proc/self/fd directory
-    if not os.path.exists("/proc/self/fd"):
-        return counts
-
-    for fd in os.listdir("/proc/self/fd"):
-        try:
-            s = os.stat("/proc/self/fd/%s" % (fd))
-            fmt = stat.S_IFMT(s.st_mode)
-            if fmt in TYPES:
-                t = TYPES[fmt]
-            else:
-                t = "other"
-
-            counts[(t,)] += 1
-        except OSError:
-            # the dirh itself used by listdir() is usually missing by now
-            pass
-
-    return counts
-
-get_metrics_for("process").register_callback("fds", _process_fds, labels=["type"])
-
 reactor_metrics = get_metrics_for("reactor")
 tick_time = reactor_metrics.register_distribution("tick_time")
 pending_calls_metric = reactor_metrics.register_distribution("pending_calls")
diff --git a/synapse/metrics/metric.py b/synapse/metrics/metric.py
index e81af29895..e87b2b80a7 100644
--- a/synapse/metrics/metric.py
+++ b/synapse/metrics/metric.py
@@ -98,9 +98,9 @@ class CallbackMetric(BaseMetric):
         value = self.callback()
 
         if self.is_scalar():
-            return ["%s %d" % (self.name, value)]
+            return ["%s %.12g" % (self.name, value)]
 
-        return ["%s%s %d" % (self.name, self._render_key(k), value[k])
+        return ["%s%s %.12g" % (self.name, self._render_key(k), value[k])
                 for k in sorted(value.keys())]
 
 
diff --git a/synapse/metrics/process_collector.py b/synapse/metrics/process_collector.py
new file mode 100644
index 0000000000..1c851d9234
--- /dev/null
+++ b/synapse/metrics/process_collector.py
@@ -0,0 +1,181 @@
+# -*- coding: utf-8 -*-
+# Copyright 2015, 2016 OpenMarket Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Because otherwise 'resource' collides with synapse.metrics.resource
+from __future__ import absolute_import
+
+import os
+import stat
+from resource import getrusage, RUSAGE_SELF
+
+from synapse.metrics import get_metrics_for
+
+
+# /proc/self/stat reports CPU times in clock ticks and resident memory in
+# pages. Both units are platform-dependent, so ask the OS for the real values
+# and only fall back to the common Linux/x86 figures when it cannot tell us.
+try:
+    _ticks = os.sysconf("SC_CLK_TCK")
+    _page = os.sysconf("SC_PAGE_SIZE")
+except (AttributeError, ValueError, OSError):
+    # os.sysconf is unavailable, or does not recognise these names
+    _ticks = _page = -1
+
+# sysconf() may also return -1 for a value it cannot determine
+TICKS_PER_SEC = _ticks if _ticks > 0 else 100
+BYTES_PER_PAGE = _page if _page > 0 else 4096
+
+# Probe once, at import time, which of the /proc files we rely on exist
+HAVE_PROC_STAT = os.path.exists("/proc/stat")
+HAVE_PROC_SELF_STAT = os.path.exists("/proc/self/stat")
+HAVE_PROC_SELF_LIMITS = os.path.exists("/proc/self/limits")
+HAVE_PROC_SELF_FD = os.path.exists("/proc/self/fd")
+
+# Maps the file-type bits of st_mode to the label reported for the "type"
+# dimension of the per-type file descriptor counts
+TYPES = {
+    stat.S_IFSOCK: "SOCK",
+    stat.S_IFLNK: "LNK",
+    stat.S_IFREG: "REG",
+    stat.S_IFBLK: "BLK",
+    stat.S_IFDIR: "DIR",
+    stat.S_IFCHR: "CHR",
+    stat.S_IFIFO: "FIFO",
+}
+
+# Field indexes from /proc/self/stat, taken from the proc(5) manpage
+# (1-based, exactly as numbered there)
+STAT_FIELDS = {
+    "utime": 14,
+    "stime": 15,
+    "starttime": 22,
+    "vsize": 23,
+    "rss": 24,
+}
+
+
+# Latest samples; these start out empty and are overwritten on every call to
+# update_resource_metrics()
+rusage = None
+stats = {}
+fd_counts = None
+
+# The start time recorded in /proc/self/stat is relative to when the machine
+# booted, so reporting process_start_time_seconds requires the boot time
+# too. It is taken from the "btime" line of /proc/stat and stays None when
+# that file is not available.
+boot_time = None
+if HAVE_PROC_STAT:
+    with open("/proc/stat") as _procstat:
+        for line in _procstat:
+            if not line.startswith("btime "):
+                continue
+            boot_time = int(line.split()[1])
+
+
+def update_resource_metrics():
+    """Refresh the module-level samples that the metric callbacks read.
+
+    Stores a fresh getrusage(RUSAGE_SELF) result in ``rusage``, re-reads the
+    fields listed in STAT_FIELDS from /proc/self/stat into ``stats`` (only
+    when that file exists) and recounts the open file descriptors into
+    ``fd_counts``.
+    """
+    global rusage, fd_counts
+
+    rusage = getrusage(RUSAGE_SELF)
+
+    if HAVE_PROC_SELF_STAT:
+        with open("/proc/self/stat") as s:
+            line = s.read()
+
+        # line is PID (command) more stats go here ...
+        # The command name is not escaped and may itself contain ") ", so
+        # split on the LAST occurrence to find where the remaining fields
+        # begin.
+        raw_stats = line.rsplit(") ", 1)[1].split(" ")
+
+        # ``stats`` is updated in place, so no ``global`` is needed for it
+        for name, index in STAT_FIELDS.items():
+            # subtract 3 from the index, because proc(5) is 1-based, and
+            # we've lost the first two fields in PID and COMMAND above
+            stats[name] = int(raw_stats[index - 3])
+
+    fd_counts = _process_fds()
+
+
+def _process_fds():
+    """Count this process's open file descriptors, grouped by file type.
+
+    Returns a dict keyed by 1-tuples holding a type label: every value of
+    TYPES plus "other" is always present, with a count of zero when nothing
+    of that type was seen (or when /proc/self/fd does not exist).
+    """
+    counts = {("other",): 0}
+    for label in TYPES.values():
+        counts[(label,)] = 0
+
+    # Not every OS will have a /proc/self/fd directory
+    if HAVE_PROC_SELF_FD:
+        for entry in os.listdir("/proc/self/fd"):
+            try:
+                mode = os.stat("/proc/self/fd/%s" % (entry)).st_mode
+            except OSError:
+                # the descriptor listdir() itself used to read the directory
+                # is usually gone by now
+                continue
+
+            label = TYPES.get(stat.S_IFMT(mode), "other")
+            counts[(label,)] += 1
+
+    return counts
+
+
+def register_process_collector():
+    """Register the process-level metrics and the collector that feeds them.
+
+    update_resource_metrics is registered as a collector, and callbacks are
+    registered for both the legacy synapse metric names and the
+    prometheus-standard ones. The standard metrics are only registered when
+    the /proc files they are computed from exist.
+    """
+    # Legacy synapse-invented metric names
+
+    resource_metrics = get_metrics_for("process.resource")
+
+    resource_metrics.register_collector(update_resource_metrics)
+
+    # msecs
+    resource_metrics.register_callback("utime", lambda: rusage.ru_utime * 1000)
+    resource_metrics.register_callback("stime", lambda: rusage.ru_stime * 1000)
+
+    # ru_maxrss scaled by 1024, i.e. bytes where ru_maxrss is in kilobytes
+    # NOTE(review): the unit of ru_maxrss is platform-dependent - confirm
+    resource_metrics.register_callback("maxrss", lambda: rusage.ru_maxrss * 1024)
+
+    get_metrics_for("process").register_callback("fds", _process_fds, labels=["type"])
+
+    # New prometheus-standard metric names
+
+    process_metrics = get_metrics_for("process")
+
+    if HAVE_PROC_SELF_STAT:
+        # utime/stime are in clock ticks; convert to (fractional) seconds
+        process_metrics.register_callback(
+            "cpu_user_seconds_total",
+            lambda: float(stats["utime"]) / TICKS_PER_SEC
+        )
+        process_metrics.register_callback(
+            "cpu_system_seconds_total",
+            lambda: float(stats["stime"]) / TICKS_PER_SEC
+        )
+        process_metrics.register_callback(
+            "cpu_seconds_total",
+            lambda: (float(stats["utime"] + stats["stime"])) / TICKS_PER_SEC
+        )
+
+        process_metrics.register_callback(
+            "virtual_memory_bytes",
+            lambda: int(stats["vsize"])
+        )
+        # rss is a page count
+        process_metrics.register_callback(
+            "resident_memory_bytes",
+            lambda: int(stats["rss"]) * BYTES_PER_PAGE
+        )
+
+        # starttime is measured in ticks since boot, so an absolute start
+        # time can only be reported when the boot time is known; otherwise
+        # the callback would fail on ``None + ...`` every time it is rendered.
+        if boot_time is not None:
+            process_metrics.register_callback(
+                "start_time_seconds",
+                # float() avoids Python 2 integer division, which would
+                # silently truncate the offset to whole seconds
+                lambda: boot_time + float(stats["starttime"]) / TICKS_PER_SEC
+            )
+
+    if HAVE_PROC_SELF_FD:
+        process_metrics.register_callback(
+            "open_fds",
+            lambda: sum(fd_counts.values())
+        )
+
+    if HAVE_PROC_SELF_LIMITS:
+        def _get_max_fds():
+            """Return the soft "Max open files" limit from /proc/self/limits.
+
+            Returns None when no such line is present.
+            """
+            with open("/proc/self/limits") as limits:
+                for line in limits:
+                    if not line.startswith("Max open files "):
+                        continue
+                    # Line is  Max open files  $SOFT  $HARD
+                    return int(line.split()[3])
+            return None
+
+        process_metrics.register_callback(
+            "max_fds",
+            _get_max_fds
+        )