diff --git a/changelog.d/3722.bugfix b/changelog.d/3722.bugfix
new file mode 100644
index 0000000000..16cbaf76cb
--- /dev/null
+++ b/changelog.d/3722.bugfix
@@ -0,0 +1 @@
+Fix error collecting prometheus metrics when run on dedicated thread due to threading concurrency issues
diff --git a/changelog.d/3727.misc b/changelog.d/3727.misc
new file mode 100644
index 0000000000..0b83220d90
--- /dev/null
+++ b/changelog.d/3727.misc
@@ -0,0 +1 @@
+Log failure to authenticate remote servers as warnings (without stack traces)
diff --git a/synapse/crypto/keyclient.py b/synapse/crypto/keyclient.py
index c20a32096a..e94400b8e2 100644
--- a/synapse/crypto/keyclient.py
+++ b/synapse/crypto/keyclient.py
@@ -18,7 +18,9 @@ import logging
from canonicaljson import json
from twisted.internet import defer, reactor
+from twisted.internet.error import ConnectError
from twisted.internet.protocol import Factory
+from twisted.names.error import DomainError
from twisted.web.http import HTTPClient
from synapse.http.endpoint import matrix_federation_endpoint
@@ -47,12 +49,14 @@ def fetch_server_key(server_name, tls_client_options_factory, path=KEY_API_V1):
server_response, server_certificate = yield protocol.remote_key
defer.returnValue((server_response, server_certificate))
except SynapseKeyClientError as e:
- logger.exception("Error getting key for %r" % (server_name,))
+ logger.warn("Error getting key for %r: %s", server_name, e)
if e.status.startswith("4"):
# Don't retry for 4xx responses.
raise IOError("Cannot get key for %r" % server_name)
+ except (ConnectError, DomainError) as e:
+ logger.warn("Error getting key for %r: %s", server_name, e)
except Exception as e:
- logger.exception(e)
+ logger.exception("Error getting key for %r", server_name)
raise IOError("Cannot get key for %r" % server_name)
diff --git a/synapse/federation/transport/server.py b/synapse/federation/transport/server.py
index 77969a4f38..7a993fd1cf 100644
--- a/synapse/federation/transport/server.py
+++ b/synapse/federation/transport/server.py
@@ -261,10 +261,10 @@ class BaseFederationServlet(object):
except NoAuthenticationError:
origin = None
if self.REQUIRE_AUTH:
- logger.exception("authenticate_request failed")
+ logger.warn("authenticate_request failed: missing authentication")
raise
- except Exception:
- logger.exception("authenticate_request failed")
+ except Exception as e:
+ logger.warn("authenticate_request failed: %s", e)
raise
if origin:
diff --git a/synapse/http/request_metrics.py b/synapse/http/request_metrics.py
index 588e280571..72c2654678 100644
--- a/synapse/http/request_metrics.py
+++ b/synapse/http/request_metrics.py
@@ -15,6 +15,7 @@
# limitations under the License.
import logging
+import threading
from prometheus_client.core import Counter, Histogram
@@ -111,6 +112,9 @@ in_flight_requests_db_sched_duration = Counter(
# The set of all in flight requests, set[RequestMetrics]
_in_flight_requests = set()
+# Protects the _in_flight_requests set from concurrent accesss
+_in_flight_requests_lock = threading.Lock()
+
def _get_in_flight_counts():
"""Returns a count of all in flight requests by (method, server_name)
@@ -120,7 +124,8 @@ def _get_in_flight_counts():
"""
# Cast to a list to prevent it changing while the Prometheus
# thread is collecting metrics
- reqs = list(_in_flight_requests)
+ with _in_flight_requests_lock:
+ reqs = list(_in_flight_requests)
for rm in reqs:
rm.update_metrics()
@@ -154,10 +159,12 @@ class RequestMetrics(object):
# to the "in flight" metrics.
self._request_stats = self.start_context.get_resource_usage()
- _in_flight_requests.add(self)
+ with _in_flight_requests_lock:
+ _in_flight_requests.add(self)
def stop(self, time_sec, request):
- _in_flight_requests.discard(self)
+ with _in_flight_requests_lock:
+ _in_flight_requests.discard(self)
context = LoggingContext.current_context()
diff --git a/synapse/metrics/background_process_metrics.py b/synapse/metrics/background_process_metrics.py
index ce678d5f75..167167be0a 100644
--- a/synapse/metrics/background_process_metrics.py
+++ b/synapse/metrics/background_process_metrics.py
@@ -13,6 +13,8 @@
# See the License for the specific language governing permissions and
# limitations under the License.
+import threading
+
import six
from prometheus_client.core import REGISTRY, Counter, GaugeMetricFamily
@@ -78,6 +80,9 @@ _background_process_counts = dict() # type: dict[str, int]
# of process descriptions that no longer have any active processes.
_background_processes = dict() # type: dict[str, set[_BackgroundProcess]]
+# A lock that covers the above dicts
+_bg_metrics_lock = threading.Lock()
+
class _Collector(object):
"""A custom metrics collector for the background process metrics.
@@ -92,7 +97,11 @@ class _Collector(object):
labels=["name"],
)
- for desc, processes in six.iteritems(_background_processes):
+ # We copy the dict so that it doesn't change from underneath us
+ with _bg_metrics_lock:
+ _background_processes_copy = dict(_background_processes)
+
+ for desc, processes in six.iteritems(_background_processes_copy):
background_process_in_flight_count.add_metric(
(desc,), len(processes),
)
@@ -167,19 +176,26 @@ def run_as_background_process(desc, func, *args, **kwargs):
"""
@defer.inlineCallbacks
def run():
- count = _background_process_counts.get(desc, 0)
- _background_process_counts[desc] = count + 1
+ with _bg_metrics_lock:
+ count = _background_process_counts.get(desc, 0)
+ _background_process_counts[desc] = count + 1
+
_background_process_start_count.labels(desc).inc()
with LoggingContext(desc) as context:
context.request = "%s-%i" % (desc, count)
proc = _BackgroundProcess(desc, context)
- _background_processes.setdefault(desc, set()).add(proc)
+
+ with _bg_metrics_lock:
+ _background_processes.setdefault(desc, set()).add(proc)
+
try:
yield func(*args, **kwargs)
finally:
proc.update_metrics()
- _background_processes[desc].remove(proc)
+
+ with _bg_metrics_lock:
+ _background_processes[desc].remove(proc)
with PreserveLoggingContext():
return run()
|