diff options
-rw-r--r-- | CHANGES.rst | 9 | ||||
-rw-r--r-- | UPGRADE.rst | 9 | ||||
-rw-r--r-- | contrib/prometheus/consoles/synapse.html | 20 | ||||
-rw-r--r-- | contrib/prometheus/synapse-v1.rules | 6 | ||||
-rw-r--r-- | contrib/prometheus/synapse-v2.rules | 12 | ||||
-rw-r--r-- | docs/metrics-howto.rst | 11 | ||||
-rw-r--r-- | synapse/handlers/room_member.py | 8 | ||||
-rw-r--r-- | synapse/http/endpoint.py | 103 | ||||
-rw-r--r-- | synapse/http/server.py | 26 | ||||
-rw-r--r-- | synapse/util/metrics.py | 25 | ||||
-rw-r--r-- | tests/test_dns.py | 29 |
11 files changed, 89 insertions, 169 deletions
diff --git a/CHANGES.rst b/CHANGES.rst index 5fbad54427..38372381ac 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -1,12 +1,3 @@ -Changes in synapse v0.28.0 (2018-xx-xx) -======================================= - -As previously advised, this release removes a number of redundant Prometheus -metrics. Administrators may need to update their dashboards and alerting rules -to use the updated metric names, if they have not already done so. See -`docs/metrics-howto.rst <docs/metrics-howto.rst#deprecated-metrics-removed-in-0-28-0>`_ -for more details. - Changes in synapse v0.27.2 (2018-03-26) ======================================= diff --git a/UPGRADE.rst b/UPGRADE.rst index 39a16b1c0c..f6bb1070b1 100644 --- a/UPGRADE.rst +++ b/UPGRADE.rst @@ -52,7 +52,7 @@ Upgrading to $NEXT_VERSION ==================== This release expands the anonymous usage stats sent if the opt-in -``report_stats`` configuration is set to ``true``. We now capture RSS memory +``report_stats`` configuration is set to ``true``. We now capture RSS memory and cpu use at a very coarse level. This requires administrators to install the optional ``psutil`` python module. @@ -60,13 +60,6 @@ We would appreciate it if you could assist by ensuring this module is available and ``report_stats`` is enabled. This will let us see if performance changes to synapse are having an impact to the general community. -This release also removes a number of redundant Prometheus metrics. -Administrators may need to update their dashboards and alerting rules to use -the updated metric names, if they have not already done so. See -`docs/metrics-howto.rst <docs/metrics-howto.rst#deprecated-metrics-removed-in-0-28-0>`_ -for more details. - - Upgrading to v0.15.0 ==================== diff --git a/contrib/prometheus/consoles/synapse.html b/contrib/prometheus/consoles/synapse.html index e23d8a1fce..69aa87f85e 100644 --- a/contrib/prometheus/consoles/synapse.html +++ b/contrib/prometheus/consoles/synapse.html @@ -202,11 +202,11 @@ new PromConsole.Graph({ <h1>Requests</h1> <h3>Requests by Servlet</h3> -<div id="synapse_http_server_requests_servlet"></div> +<div id="synapse_http_server_request_count_servlet"></div> <script> new PromConsole.Graph({ - node: document.querySelector("#synapse_http_server_requests_servlet"), - expr: "rate(synapse_http_server_requests:servlet[2m])", + node: document.querySelector("#synapse_http_server_request_count_servlet"), + expr: "rate(synapse_http_server_request_count:servlet[2m])", name: "[[servlet]]", yAxisFormatter: PromConsole.NumberFormatter.humanize, yHoverFormatter: PromConsole.NumberFormatter.humanize, @@ -215,11 +215,11 @@ new PromConsole.Graph({ }) </script> <h4> (without <tt>EventStreamRestServlet</tt> or <tt>SyncRestServlet</tt>)</h4> -<div id="synapse_http_server_requests_servlet_minus_events"></div> +<div id="synapse_http_server_request_count_servlet_minus_events"></div> <script> new PromConsole.Graph({ - node: document.querySelector("#synapse_http_server_requests_servlet_minus_events"), - expr: "rate(synapse_http_server_requests:servlet{servlet!=\"EventStreamRestServlet\", servlet!=\"SyncRestServlet\"}[2m])", + node: document.querySelector("#synapse_http_server_request_count_servlet_minus_events"), + expr: "rate(synapse_http_server_request_count:servlet{servlet!=\"EventStreamRestServlet\", servlet!=\"SyncRestServlet\"}[2m])", name: "[[servlet]]", yAxisFormatter: PromConsole.NumberFormatter.humanize, yHoverFormatter: PromConsole.NumberFormatter.humanize, @@ -233,7 +233,7 @@ new PromConsole.Graph({ <script> new PromConsole.Graph({ node: document.querySelector("#synapse_http_server_response_time_avg"), - expr: "rate(synapse_http_server_response_time:total[2m]) / rate(synapse_http_server_response_time:count[2m]) / 1000", + expr: "rate(synapse_http_server_response_time_seconds[2m]) / rate(synapse_http_server_response_count[2m]) / 1000", name: "[[servlet]]", yAxisFormatter: PromConsole.NumberFormatter.humanize, yHoverFormatter: PromConsole.NumberFormatter.humanize, @@ -276,7 +276,7 @@ new PromConsole.Graph({ <script> new PromConsole.Graph({ node: document.querySelector("#synapse_http_server_response_ru_utime"), - expr: "rate(synapse_http_server_response_ru_utime:total[2m])", + expr: "rate(synapse_http_server_response_ru_utime_seconds[2m])", name: "[[servlet]]", yAxisFormatter: PromConsole.NumberFormatter.humanize, yHoverFormatter: PromConsole.NumberFormatter.humanize, @@ -291,7 +291,7 @@ new PromConsole.Graph({ <script> new PromConsole.Graph({ node: document.querySelector("#synapse_http_server_response_db_txn_duration"), - expr: "rate(synapse_http_server_response_db_txn_duration:total[2m])", + expr: "rate(synapse_http_server_response_db_txn_duration_seconds[2m])", name: "[[servlet]]", yAxisFormatter: PromConsole.NumberFormatter.humanize, yHoverFormatter: PromConsole.NumberFormatter.humanize, @@ -306,7 +306,7 @@ new PromConsole.Graph({ <script> new PromConsole.Graph({ node: document.querySelector("#synapse_http_server_send_time_avg"), - expr: "rate(synapse_http_server_response_time:total{servlet='RoomSendEventRestServlet'}[2m]) / rate(synapse_http_server_response_time:count{servlet='RoomSendEventRestServlet'}[2m]) / 1000", + expr: "rate(synapse_http_server_response_time_second{servlet='RoomSendEventRestServlet'}[2m]) / rate(synapse_http_server_response_count{servlet='RoomSendEventRestServlet'}[2m]) / 1000", name: "[[servlet]]", yAxisFormatter: PromConsole.NumberFormatter.humanize, yHoverFormatter: PromConsole.NumberFormatter.humanize, diff --git a/contrib/prometheus/synapse-v1.rules b/contrib/prometheus/synapse-v1.rules index b6f84174b0..4c900ba537 100644 --- a/contrib/prometheus/synapse-v1.rules +++ b/contrib/prometheus/synapse-v1.rules @@ -1,10 +1,10 @@ synapse_federation_transaction_queue_pendingEdus:total = sum(synapse_federation_transaction_queue_pendingEdus or absent(synapse_federation_transaction_queue_pendingEdus)*0) synapse_federation_transaction_queue_pendingPdus:total = sum(synapse_federation_transaction_queue_pendingPdus or absent(synapse_federation_transaction_queue_pendingPdus)*0) -synapse_http_server_requests:method{servlet=""} = sum(synapse_http_server_requests) by (method) -synapse_http_server_requests:servlet{method=""} = sum(synapse_http_server_requests) by (servlet) +synapse_http_server_request_count:method{servlet=""} = sum(synapse_http_server_request_count) by (method) +synapse_http_server_request_count:servlet{method=""} = sum(synapse_http_server_request_count) by (servlet) -synapse_http_server_requests:total{servlet=""} = sum(synapse_http_server_requests:by_method) by (servlet) +synapse_http_server_request_count:total{servlet=""} = sum(synapse_http_server_request_count:by_method) by (servlet) synapse_cache:hit_ratio_5m = rate(synapse_util_caches_cache:hits[5m]) / rate(synapse_util_caches_cache:total[5m]) synapse_cache:hit_ratio_30s = rate(synapse_util_caches_cache:hits[30s]) / rate(synapse_util_caches_cache:total[30s]) diff --git a/contrib/prometheus/synapse-v2.rules b/contrib/prometheus/synapse-v2.rules index 07e37a885e..6ccca2daaf 100644 --- a/contrib/prometheus/synapse-v2.rules +++ b/contrib/prometheus/synapse-v2.rules @@ -5,19 +5,19 @@ groups: expr: "sum(synapse_federation_transaction_queue_pendingEdus or absent(synapse_federation_transaction_queue_pendingEdus)*0)" - record: "synapse_federation_transaction_queue_pendingPdus:total" expr: "sum(synapse_federation_transaction_queue_pendingPdus or absent(synapse_federation_transaction_queue_pendingPdus)*0)" - - record: 'synapse_http_server_requests:method' + - record: 'synapse_http_server_request_count:method' labels: servlet: "" - expr: "sum(synapse_http_server_requests) by (method)" - - record: 'synapse_http_server_requests:servlet' + expr: "sum(synapse_http_server_request_count) by (method)" + - record: 'synapse_http_server_request_count:servlet' labels: method: "" - expr: 'sum(synapse_http_server_requests) by (servlet)' + expr: 'sum(synapse_http_server_request_count) by (servlet)' - - record: 'synapse_http_server_requests:total' + - record: 'synapse_http_server_request_count:total' labels: servlet: "" - expr: 'sum(synapse_http_server_requests:by_method) by (servlet)' + expr: 'sum(synapse_http_server_request_count:by_method) by (servlet)' - record: 'synapse_cache:hit_ratio_5m' expr: 'rate(synapse_util_caches_cache:hits[5m]) / rate(synapse_util_caches_cache:total[5m])' diff --git a/docs/metrics-howto.rst b/docs/metrics-howto.rst index 5e2d7c52ec..8acc479bc3 100644 --- a/docs/metrics-howto.rst +++ b/docs/metrics-howto.rst @@ -34,17 +34,6 @@ How to monitor Synapse metrics using Prometheus Restart prometheus. -Deprecated metrics removed in 0.28.0 ------------------------------------- - -Synapse 0.28.0 removes all of the metrics deprecated by 0.27.0, which are those -listed under "Old name" below. This has been done to reduce the bandwidth used -by gathering metrics and the storage requirements for the Prometheus server, as -well as reducing CPU overhead for both Synapse and Prometheus. - -Administrators should update any alerts or monitoring dashboards to use the -"New name" listed below. - Block and response metrics renamed for 0.27.0 --------------------------------------------- diff --git a/synapse/handlers/room_member.py b/synapse/handlers/room_member.py index 9977be8831..c45142d38d 100644 --- a/synapse/handlers/room_member.py +++ b/synapse/handlers/room_member.py @@ -852,6 +852,14 @@ class RoomMemberMasterHandler(RoomMemberHandler): def _remote_join(self, requester, remote_room_hosts, room_id, user, content): """Implements RoomMemberHandler._remote_join """ + # filter ourselves out of remote_room_hosts: do_invite_join ignores it + # and if it is the only entry we'd like to return a 404 rather than a + # 500. + + remote_room_hosts = [ + host for host in remote_room_hosts if host != self.hs.hostname + ] + if len(remote_room_hosts) == 0: raise SynapseError(404, "No known servers") diff --git a/synapse/http/endpoint.py b/synapse/http/endpoint.py index 87639b9151..00572c2897 100644 --- a/synapse/http/endpoint.py +++ b/synapse/http/endpoint.py @@ -12,8 +12,6 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -import socket - from twisted.internet.endpoints import HostnameEndpoint, wrapClientTLS from twisted.internet import defer, reactor from twisted.internet.error import ConnectError @@ -33,7 +31,7 @@ SERVER_CACHE = {} # our record of an individual server which can be tried to reach a destination. # -# "host" is actually a dotted-quad or ipv6 address string. Except when there's +# "host" is the hostname acquired from the SRV record. Except when there's # no SRV record, in which case it is the original hostname. _Server = collections.namedtuple( "_Server", "priority weight host port expires" @@ -297,20 +295,13 @@ def resolve_service(service_name, dns_client=client, cache=SERVER_CACHE, clock=t payload = answer.payload - hosts = yield _get_hosts_for_srv_record( - dns_client, str(payload.target) - ) - - for (ip, ttl) in hosts: - host_ttl = min(answer.ttl, ttl) - - servers.append(_Server( - host=ip, - port=int(payload.port), - priority=int(payload.priority), - weight=int(payload.weight), - expires=int(clock.time()) + host_ttl, - )) + servers.append(_Server( + host=str(payload.target), + port=int(payload.port), + priority=int(payload.priority), + weight=int(payload.weight), + expires=int(clock.time()) + answer.ttl, + )) servers.sort() cache[service_name] = list(servers) @@ -328,81 +319,3 @@ def resolve_service(service_name, dns_client=client, cache=SERVER_CACHE, clock=t raise e defer.returnValue(servers) - - -@defer.inlineCallbacks -def _get_hosts_for_srv_record(dns_client, host): - """Look up each of the hosts in a SRV record - - Args: - dns_client (twisted.names.dns.IResolver): - host (basestring): host to look up - - Returns: - Deferred[list[(str, int)]]: a list of (host, ttl) pairs - - """ - ip4_servers = [] - ip6_servers = [] - - def cb(res): - # lookupAddress and lookupIP6Address return a three-tuple - # giving the answer, authority, and additional sections of the - # response. - # - # we only care about the answers. - - return res[0] - - def eb(res, record_type): - if res.check(DNSNameError): - return [] - logger.warn("Error looking up %s for %s: %s", record_type, host, res) - return res - - # no logcontexts here, so we can safely fire these off and gatherResults - d1 = dns_client.lookupAddress(host).addCallbacks( - cb, eb, errbackArgs=("A", )) - d2 = dns_client.lookupIPV6Address(host).addCallbacks( - cb, eb, errbackArgs=("AAAA", )) - results = yield defer.DeferredList( - [d1, d2], consumeErrors=True) - - # if all of the lookups failed, raise an exception rather than blowing out - # the cache with an empty result. - if results and all(s == defer.FAILURE for (s, _) in results): - defer.returnValue(results[0][1]) - - for (success, result) in results: - if success == defer.FAILURE: - continue - - for answer in result: - if not answer.payload: - continue - - try: - if answer.type == dns.A: - ip = answer.payload.dottedQuad() - ip4_servers.append((ip, answer.ttl)) - elif answer.type == dns.AAAA: - ip = socket.inet_ntop( - socket.AF_INET6, answer.payload.address, - ) - ip6_servers.append((ip, answer.ttl)) - else: - # the most likely candidate here is a CNAME record. - # rfc2782 says srvs may not point to aliases. - logger.warn( - "Ignoring unexpected DNS record type %s for %s", - answer.type, host, - ) - continue - except Exception as e: - logger.warn("Ignoring invalid DNS response for %s: %s", - host, e) - continue - - # keep the ipv4 results before the ipv6 results, mostly to match historical - # behaviour. - defer.returnValue(ip4_servers + ip6_servers) diff --git a/synapse/http/server.py b/synapse/http/server.py index ac75206ef5..64e083ebfc 100644 --- a/synapse/http/server.py +++ b/synapse/http/server.py @@ -47,6 +47,17 @@ metrics = synapse.metrics.get_metrics_for(__name__) response_count = metrics.register_counter( "response_count", labels=["method", "servlet", "tag"], + alternative_names=( + # the following are all deprecated aliases for the same metric + metrics.name_prefix + x for x in ( + "_requests", + "_response_time:count", + "_response_ru_utime:count", + "_response_ru_stime:count", + "_response_db_txn_count:count", + "_response_db_txn_duration:count", + ) + ) ) requests_counter = metrics.register_counter( @@ -62,24 +73,39 @@ outgoing_responses_counter = metrics.register_counter( response_timer = metrics.register_counter( "response_time_seconds", labels=["method", "servlet", "tag"], + alternative_names=( + metrics.name_prefix + "_response_time:total", + ), ) response_ru_utime = metrics.register_counter( "response_ru_utime_seconds", labels=["method", "servlet", "tag"], + alternative_names=( + metrics.name_prefix + "_response_ru_utime:total", + ), ) response_ru_stime = metrics.register_counter( "response_ru_stime_seconds", labels=["method", "servlet", "tag"], + alternative_names=( + metrics.name_prefix + "_response_ru_stime:total", + ), ) response_db_txn_count = metrics.register_counter( "response_db_txn_count", labels=["method", "servlet", "tag"], + alternative_names=( + metrics.name_prefix + "_response_db_txn_count:total", + ), ) # seconds spent waiting for db txns, excluding scheduling time, when processing # this request response_db_txn_duration = metrics.register_counter( "response_db_txn_duration_seconds", labels=["method", "servlet", "tag"], + alternative_names=( + metrics.name_prefix + "_response_db_txn_duration:total", + ), ) # seconds spent waiting for a db connection, when processing this request diff --git a/synapse/util/metrics.py b/synapse/util/metrics.py index c3d8237e8f..e4b5687a4b 100644 --- a/synapse/util/metrics.py +++ b/synapse/util/metrics.py @@ -31,28 +31,53 @@ metrics = synapse.metrics.get_metrics_for(__name__) block_counter = metrics.register_counter( "block_count", labels=["block_name"], + alternative_names=( + # the following are all deprecated aliases for the same metric + metrics.name_prefix + x for x in ( + "_block_timer:count", + "_block_ru_utime:count", + "_block_ru_stime:count", + "_block_db_txn_count:count", + "_block_db_txn_duration:count", + ) + ) ) block_timer = metrics.register_counter( "block_time_seconds", labels=["block_name"], + alternative_names=( + metrics.name_prefix + "_block_timer:total", + ), ) block_ru_utime = metrics.register_counter( "block_ru_utime_seconds", labels=["block_name"], + alternative_names=( + metrics.name_prefix + "_block_ru_utime:total", + ), ) block_ru_stime = metrics.register_counter( "block_ru_stime_seconds", labels=["block_name"], + alternative_names=( + metrics.name_prefix + "_block_ru_stime:total", + ), ) block_db_txn_count = metrics.register_counter( "block_db_txn_count", labels=["block_name"], + alternative_names=( + metrics.name_prefix + "_block_db_txn_count:total", + ), ) # seconds spent waiting for db txns, excluding scheduling time, in this block block_db_txn_duration = metrics.register_counter( "block_db_txn_duration_seconds", labels=["block_name"], + alternative_names=( + metrics.name_prefix + "_block_db_txn_duration:total", + ), ) # seconds spent waiting for a db connection, in this block diff --git a/tests/test_dns.py b/tests/test_dns.py index d08b0f4333..af607d626f 100644 --- a/tests/test_dns.py +++ b/tests/test_dns.py @@ -33,8 +33,6 @@ class DnsTestCase(unittest.TestCase): service_name = "test_service.example.com" host_name = "example.com" - ip_address = "127.0.0.1" - ip6_address = "::1" answer_srv = dns.RRHeader( type=dns.SRV, @@ -43,29 +41,9 @@ class DnsTestCase(unittest.TestCase): ) ) - answer_a = dns.RRHeader( - type=dns.A, - payload=dns.Record_A( - address=ip_address, - ) - ) - - answer_aaaa = dns.RRHeader( - type=dns.AAAA, - payload=dns.Record_AAAA( - address=ip6_address, - ) - ) - dns_client_mock.lookupService.return_value = defer.succeed( ([answer_srv], None, None), ) - dns_client_mock.lookupAddress.return_value = defer.succeed( - ([answer_a], None, None), - ) - dns_client_mock.lookupIPV6Address.return_value = defer.succeed( - ([answer_aaaa], None, None), - ) cache = {} @@ -74,13 +52,10 @@ class DnsTestCase(unittest.TestCase): ) dns_client_mock.lookupService.assert_called_once_with(service_name) - dns_client_mock.lookupAddress.assert_called_once_with(host_name) - dns_client_mock.lookupIPV6Address.assert_called_once_with(host_name) - self.assertEquals(len(servers), 2) + self.assertEquals(len(servers), 1) self.assertEquals(servers, cache[service_name]) - self.assertEquals(servers[0].host, ip_address) - self.assertEquals(servers[1].host, ip6_address) + self.assertEquals(servers[0].host, host_name) @defer.inlineCallbacks def test_from_cache_expired_and_dns_fail(self): |