From 77b692e65dfe5da79e9f69f4a24bde1c768a998c Mon Sep 17 00:00:00 2001 From: Erik Johnston Date: Fri, 13 Jul 2018 15:26:10 +0100 Subject: Don't return unknown entities in get_entities_changed The stream cache keeps track of all entities that have changed since a particular stream position, so get_entities_changed does not need to return unknown entites when given a larger stream position. This makes it consistent with the behaviour of has_entity_changed. --- synapse/util/caches/stream_change_cache.py | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) (limited to 'synapse/util') diff --git a/synapse/util/caches/stream_change_cache.py b/synapse/util/caches/stream_change_cache.py index a1f8ff8f10..258655349b 100644 --- a/synapse/util/caches/stream_change_cache.py +++ b/synapse/util/caches/stream_change_cache.py @@ -74,19 +74,12 @@ class StreamChangeCache(object): assert type(stream_pos) is int if stream_pos >= self._earliest_known_stream_pos: - changed_entities = { + result = { self._cache[k] for k in self._cache.islice( start=self._cache.bisect_right(stream_pos), ) } - # we need to include entities which we don't know about, as well as - # those which are known to have changed since the stream pos. - result = { - e for e in entities - if e in changed_entities or e not in self._entity_to_key - } - self.metrics.inc_hits() else: result = set(entities) -- cgit 1.5.1 From 547b1355d3747b267db3e21aefd143382f49f4ec Mon Sep 17 00:00:00 2001 From: Erik Johnston Date: Tue, 17 Jul 2018 10:27:51 +0100 Subject: Fix perf regression in PR #3530 The get_entities_changed function was changed to return all changed entities since the given stream position, rather than only those changed from a given list of entities. This resulted in the function incorrectly returning large numbers of entities that, for example, caused large increases in database usage. --- synapse/util/caches/stream_change_cache.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'synapse/util') diff --git a/synapse/util/caches/stream_change_cache.py b/synapse/util/caches/stream_change_cache.py index 258655349b..c1e76b1a0e 100644 --- a/synapse/util/caches/stream_change_cache.py +++ b/synapse/util/caches/stream_change_cache.py @@ -74,12 +74,17 @@ class StreamChangeCache(object): assert type(stream_pos) is int if stream_pos >= self._earliest_known_stream_pos: - result = { + changed_entities = { self._cache[k] for k in self._cache.islice( start=self._cache.bisect_right(stream_pos), ) } + result = { + e for e in entities + if e in changed_entities + } + self.metrics.inc_hits() else: result = set(entities) -- cgit 1.5.1 From b2aa05a8d6b1b2fb9e1efcc6fb03f1b49bc1be1d Mon Sep 17 00:00:00 2001 From: Erik Johnston Date: Tue, 17 Jul 2018 11:07:04 +0100 Subject: Use efficient .intersection --- synapse/util/caches/stream_change_cache.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'synapse/util') diff --git a/synapse/util/caches/stream_change_cache.py b/synapse/util/caches/stream_change_cache.py index c1e76b1a0e..f2bde74dc5 100644 --- a/synapse/util/caches/stream_change_cache.py +++ b/synapse/util/caches/stream_change_cache.py @@ -80,10 +80,7 @@ class StreamChangeCache(object): ) } - result = { - e for e in entities - if e in changed_entities - } + result = changed_entities.intersection(entities) self.metrics.inc_hits() else: -- cgit 1.5.1 From 667fba68f3ca808f48143a2a739a54665b0162c6 Mon Sep 17 00:00:00 2001 From: Richard van der Hoff Date: Wed, 18 Jul 2018 14:35:24 +0100 Subject: Run things as background processes This fixes #3518, and ensures that we get useful logs and metrics for lots of things that happen in the background. (There are certainly more things that happen in the background; these are just the common ones I've found running a single-process synapse locally). --- synapse/federation/transaction_queue.py | 15 ++++++--------- synapse/storage/background_updates.py | 10 ++++++++-- synapse/storage/client_ips.py | 15 +++++++++++---- synapse/storage/events.py | 10 ++++------ synapse/storage/events_worker.py | 10 ++++++---- synapse/util/caches/expiringcache.py | 6 +++++- synapse/util/distributor.py | 4 ++++ 7 files changed, 44 insertions(+), 26 deletions(-) (limited to 'synapse/util') diff --git a/synapse/federation/transaction_queue.py b/synapse/federation/transaction_queue.py index 5c5a73b73c..6996d6b695 100644 --- a/synapse/federation/transaction_queue.py +++ b/synapse/federation/transaction_queue.py @@ -168,7 +168,7 @@ class TransactionQueue(object): # fire off a processing loop in the background run_as_background_process( - "process_transaction_queue", + "process_event_queue_for_federation", self._process_event_queue_loop, ) @@ -434,14 +434,11 @@ class TransactionQueue(object): logger.debug("TX [%s] Starting transaction loop", destination) - # Drop the logcontext before starting the transaction. It doesn't - # really make sense to log all the outbound transactions against - # whatever path led us to this point: that's pretty arbitrary really. - # - # (this also means we can fire off _perform_transaction without - # yielding) - with logcontext.PreserveLoggingContext(): - self._transaction_transmission_loop(destination) + run_as_background_process( + "federation_transaction_transmission_loop", + self._transaction_transmission_loop, + destination, + ) @defer.inlineCallbacks def _transaction_transmission_loop(self, destination): diff --git a/synapse/storage/background_updates.py b/synapse/storage/background_updates.py index dc9eca7d15..5fe1ca2de7 100644 --- a/synapse/storage/background_updates.py +++ b/synapse/storage/background_updates.py @@ -19,6 +19,8 @@ from canonicaljson import json from twisted.internet import defer +from synapse.metrics.background_process_metrics import run_as_background_process + from . import engines from ._base import SQLBaseStore @@ -87,10 +89,14 @@ class BackgroundUpdateStore(SQLBaseStore): self._background_update_handlers = {} self._all_done = False - @defer.inlineCallbacks def start_doing_background_updates(self): - logger.info("Starting background schema updates") + run_as_background_process( + "background_updates", self._run_background_updates, + ) + @defer.inlineCallbacks + def _run_background_updates(self): + logger.info("Starting background schema updates") while True: yield self.hs.get_clock().sleep( self.BACKGROUND_UPDATE_INTERVAL_MS / 1000.) diff --git a/synapse/storage/client_ips.py b/synapse/storage/client_ips.py index b78eda3413..77ae10da3d 100644 --- a/synapse/storage/client_ips.py +++ b/synapse/storage/client_ips.py @@ -19,6 +19,7 @@ from six import iteritems from twisted.internet import defer +from synapse.metrics.background_process_metrics import run_as_background_process from synapse.util.caches import CACHE_SIZE_FACTOR from . import background_updates @@ -93,10 +94,16 @@ class ClientIpStore(background_updates.BackgroundUpdateStore): self._batch_row_update[key] = (user_agent, device_id, now) def _update_client_ips_batch(self): - to_update = self._batch_row_update - self._batch_row_update = {} - return self.runInteraction( - "_update_client_ips_batch", self._update_client_ips_batch_txn, to_update + def update(): + to_update = self._batch_row_update + self._batch_row_update = {} + return self.runInteraction( + "_update_client_ips_batch", self._update_client_ips_batch_txn, + to_update, + ) + + run_as_background_process( + "update_client_ips", update, ) def _update_client_ips_batch_txn(self, txn, to_update): diff --git a/synapse/storage/events.py b/synapse/storage/events.py index 2aaab0d02c..4ff0fdc4ab 100644 --- a/synapse/storage/events.py +++ b/synapse/storage/events.py @@ -33,12 +33,13 @@ from synapse.api.errors import SynapseError # these are only included to make the type annotations work from synapse.events import EventBase # noqa: F401 from synapse.events.snapshot import EventContext # noqa: F401 +from synapse.metrics.background_process_metrics import run_as_background_process from synapse.storage.events_worker import EventsWorkerStore from synapse.types import RoomStreamToken, get_domain_from_id from synapse.util.async import ObservableDeferred from synapse.util.caches.descriptors import cached, cachedInlineCallbacks from synapse.util.frozenutils import frozendict_json_encoder -from synapse.util.logcontext import PreserveLoggingContext, make_deferred_yieldable +from synapse.util.logcontext import make_deferred_yieldable from synapse.util.logutils import log_function from synapse.util.metrics import Measure @@ -155,11 +156,8 @@ class _EventPeristenceQueue(object): self._event_persist_queues[room_id] = queue self._currently_persisting_rooms.discard(room_id) - # set handle_queue_loop off on the background. We don't want to - # attribute work done in it to the current request, so we drop the - # logcontext altogether. - with PreserveLoggingContext(): - handle_queue_loop() + # set handle_queue_loop off in the background + run_as_background_process("persist_events", handle_queue_loop) def _get_drainining_queue(self, room_id): queue = self._event_persist_queues.setdefault(room_id, deque()) diff --git a/synapse/storage/events_worker.py b/synapse/storage/events_worker.py index 67433606c6..f28239a808 100644 --- a/synapse/storage/events_worker.py +++ b/synapse/storage/events_worker.py @@ -25,6 +25,7 @@ from synapse.events import EventBase # noqa: F401 from synapse.events import FrozenEvent from synapse.events.snapshot import EventContext # noqa: F401 from synapse.events.utils import prune_event +from synapse.metrics.background_process_metrics import run_as_background_process from synapse.util.logcontext import ( LoggingContext, PreserveLoggingContext, @@ -322,10 +323,11 @@ class EventsWorkerStore(SQLBaseStore): should_start = False if should_start: - with PreserveLoggingContext(): - self.runWithConnection( - self._do_fetch - ) + run_as_background_process( + "fetch_events", + self.runWithConnection, + self._do_fetch, + ) logger.debug("Loading %d events", len(events)) with PreserveLoggingContext(): diff --git a/synapse/util/caches/expiringcache.py b/synapse/util/caches/expiringcache.py index 4abca91f6d..465adc54a8 100644 --- a/synapse/util/caches/expiringcache.py +++ b/synapse/util/caches/expiringcache.py @@ -16,6 +16,7 @@ import logging from collections import OrderedDict +from synapse.metrics.background_process_metrics import run_as_background_process from synapse.util.caches import register_cache logger = logging.getLogger(__name__) @@ -63,7 +64,10 @@ class ExpiringCache(object): return def f(): - self._prune_cache() + run_as_background_process( + "prune_cache_%s" % self._cache_name, + self._prune_cache, + ) self._clock.looping_call(f, self._expiry_ms / 2) diff --git a/synapse/util/distributor.py b/synapse/util/distributor.py index 734331caaa..d91ae400eb 100644 --- a/synapse/util/distributor.py +++ b/synapse/util/distributor.py @@ -75,6 +75,10 @@ class Distributor(object): self.pre_registration[name].append(observer) def fire(self, name, *args, **kwargs): + """Dispatches the given signal to the registered observers. + + Runs the observers as a background process. Does not return a deferred. + """ if name not in self.signals: raise KeyError("%r does not have a signal named %s" % (self, name)) -- cgit 1.5.1 From 8c69b735e3186875a8bf676241f990470fb7c592 Mon Sep 17 00:00:00 2001 From: Richard van der Hoff Date: Wed, 18 Jul 2018 15:33:13 +0100 Subject: Make Distributor run its processes as a background process This is more involved than it might otherwise be, because the current implementation just drops its logcontexts and runs everything in the sentinel context. It turns out that we aren't actually using a bunch of the functionality here (notably suppress_failures and the fact that Distributor.fire returns a deferred), so the easiest way to fix this is actually by simplifying a bunch of code. --- synapse/util/distributor.py | 44 +++++++++++++++-------------------- tests/test_distributor.py | 56 ++++----------------------------------------- 2 files changed, 22 insertions(+), 78 deletions(-) (limited to 'synapse/util') diff --git a/synapse/util/distributor.py b/synapse/util/distributor.py index d91ae400eb..194da87639 100644 --- a/synapse/util/distributor.py +++ b/synapse/util/distributor.py @@ -17,20 +17,18 @@ import logging from twisted.internet import defer -from synapse.util import unwrapFirstError -from synapse.util.logcontext import PreserveLoggingContext +from synapse.metrics.background_process_metrics import run_as_background_process +from synapse.util.logcontext import make_deferred_yieldable, run_in_background logger = logging.getLogger(__name__) def user_left_room(distributor, user, room_id): - with PreserveLoggingContext(): - distributor.fire("user_left_room", user=user, room_id=room_id) + distributor.fire("user_left_room", user=user, room_id=room_id) def user_joined_room(distributor, user, room_id): - with PreserveLoggingContext(): - distributor.fire("user_joined_room", user=user, room_id=room_id) + distributor.fire("user_joined_room", user=user, room_id=room_id) class Distributor(object): @@ -44,9 +42,7 @@ class Distributor(object): model will do for today. """ - def __init__(self, suppress_failures=True): - self.suppress_failures = suppress_failures - + def __init__(self): self.signals = {} self.pre_registration = {} @@ -56,7 +52,6 @@ class Distributor(object): self.signals[name] = Signal( name, - suppress_failures=self.suppress_failures, ) if name in self.pre_registration: @@ -82,7 +77,11 @@ class Distributor(object): if name not in self.signals: raise KeyError("%r does not have a signal named %s" % (self, name)) - return self.signals[name].fire(*args, **kwargs) + run_as_background_process( + name, + self.signals[name].fire, + *args, **kwargs + ) class Signal(object): @@ -95,9 +94,8 @@ class Signal(object): method into all of the observers. """ - def __init__(self, name, suppress_failures): + def __init__(self, name): self.name = name - self.suppress_failures = suppress_failures self.observers = [] def observe(self, observer): @@ -107,7 +105,6 @@ class Signal(object): Each observer callable may return a Deferred.""" self.observers.append(observer) - @defer.inlineCallbacks def fire(self, *args, **kwargs): """Invokes every callable in the observer list, passing in the args and kwargs. Exceptions thrown by observers are logged but ignored. It is @@ -125,22 +122,17 @@ class Signal(object): failure.type, failure.value, failure.getTracebackObject())) - if not self.suppress_failures: - return failure return defer.maybeDeferred(observer, *args, **kwargs).addErrback(eb) - with PreserveLoggingContext(): - deferreds = [ - do(observer) - for observer in self.observers - ] - - res = yield defer.gatherResults( - deferreds, consumeErrors=True - ).addErrback(unwrapFirstError) + deferreds = [ + run_in_background(do, o) + for o in self.observers + ] - defer.returnValue(res) + return make_deferred_yieldable(defer.gatherResults( + deferreds, consumeErrors=True, + )) def __repr__(self): return "" % (self.name,) diff --git a/tests/test_distributor.py b/tests/test_distributor.py index 04a88056f1..71d11cda77 100644 --- a/tests/test_distributor.py +++ b/tests/test_distributor.py @@ -1,5 +1,6 @@ # -*- coding: utf-8 -*- # Copyright 2014-2016 OpenMarket Ltd +# Copyright 2018 New Vector Ltd # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -15,8 +16,6 @@ from mock import Mock, patch -from twisted.internet import defer - from synapse.util.distributor import Distributor from . import unittest @@ -27,38 +26,15 @@ class DistributorTestCase(unittest.TestCase): def setUp(self): self.dist = Distributor() - @defer.inlineCallbacks def test_signal_dispatch(self): self.dist.declare("alert") observer = Mock() self.dist.observe("alert", observer) - d = self.dist.fire("alert", 1, 2, 3) - yield d - self.assertTrue(d.called) + self.dist.fire("alert", 1, 2, 3) observer.assert_called_with(1, 2, 3) - @defer.inlineCallbacks - def test_signal_dispatch_deferred(self): - self.dist.declare("whine") - - d_inner = defer.Deferred() - - def observer(): - return d_inner - - self.dist.observe("whine", observer) - - d_outer = self.dist.fire("whine") - - self.assertFalse(d_outer.called) - - d_inner.callback(None) - yield d_outer - self.assertTrue(d_outer.called) - - @defer.inlineCallbacks def test_signal_catch(self): self.dist.declare("alarm") @@ -71,9 +47,7 @@ class DistributorTestCase(unittest.TestCase): with patch( "synapse.util.distributor.logger", spec=["warning"] ) as mock_logger: - d = self.dist.fire("alarm", "Go") - yield d - self.assertTrue(d.called) + self.dist.fire("alarm", "Go") observers[0].assert_called_once_with("Go") observers[1].assert_called_once_with("Go") @@ -83,34 +57,12 @@ class DistributorTestCase(unittest.TestCase): mock_logger.warning.call_args[0][0], str ) - @defer.inlineCallbacks - def test_signal_catch_no_suppress(self): - # Gut-wrenching - self.dist.suppress_failures = False - - self.dist.declare("whail") - - class MyException(Exception): - pass - - @defer.inlineCallbacks - def observer(): - raise MyException("Oopsie") - - self.dist.observe("whail", observer) - - d = self.dist.fire("whail") - - yield self.assertFailure(d, MyException) - self.dist.suppress_failures = True - - @defer.inlineCallbacks def test_signal_prereg(self): observer = Mock() self.dist.observe("flare", observer) self.dist.declare("flare") - yield self.dist.fire("flare", 4, 5) + self.dist.fire("flare", 4, 5) observer.assert_called_with(4, 5) -- cgit 1.5.1 From 95ccb6e2ec57f2150a697ea9cde030e8f78d6db9 Mon Sep 17 00:00:00 2001 From: Amber Brown Date: Thu, 19 Jul 2018 20:58:18 +1000 Subject: Don't spew errors because we can't save metrics (#3563) --- changelog.d/3548.bugfix | 1 + synapse/util/logcontext.py | 11 +++++++++++ synapse/util/metrics.py | 19 +++++++++++++------ 3 files changed, 25 insertions(+), 6 deletions(-) create mode 100644 changelog.d/3548.bugfix (limited to 'synapse/util') diff --git a/changelog.d/3548.bugfix b/changelog.d/3548.bugfix new file mode 100644 index 0000000000..38dc3b1232 --- /dev/null +++ b/changelog.d/3548.bugfix @@ -0,0 +1 @@ +Catch failures saving metrics captured by Measure, and instead log the faulty metrics information for further analysis. diff --git a/synapse/util/logcontext.py b/synapse/util/logcontext.py index f6c7175f74..8dcae50b39 100644 --- a/synapse/util/logcontext.py +++ b/synapse/util/logcontext.py @@ -99,6 +99,17 @@ class ContextResourceUsage(object): self.db_sched_duration_sec = 0 self.evt_db_fetch_count = 0 + def __repr__(self): + return ("") % ( + self.ru_stime, + self.ru_utime, + self.db_txn_count, + self.db_txn_duration_sec, + self.db_sched_duration_sec, + self.evt_db_fetch_count,) + def __iadd__(self, other): """Add another ContextResourceUsage's stats to this one's. diff --git a/synapse/util/metrics.py b/synapse/util/metrics.py index 6ba7107896..97f1267380 100644 --- a/synapse/util/metrics.py +++ b/synapse/util/metrics.py @@ -104,12 +104,19 @@ class Measure(object): logger.warn("Expected context. (%r)", self.name) return - usage = context.get_resource_usage() - self.start_usage - block_ru_utime.labels(self.name).inc(usage.ru_utime) - block_ru_stime.labels(self.name).inc(usage.ru_stime) - block_db_txn_count.labels(self.name).inc(usage.db_txn_count) - block_db_txn_duration.labels(self.name).inc(usage.db_txn_duration_sec) - block_db_sched_duration.labels(self.name).inc(usage.db_sched_duration_sec) + current = context.get_resource_usage() + usage = current - self.start_usage + try: + block_ru_utime.labels(self.name).inc(usage.ru_utime) + block_ru_stime.labels(self.name).inc(usage.ru_stime) + block_db_txn_count.labels(self.name).inc(usage.db_txn_count) + block_db_txn_duration.labels(self.name).inc(usage.db_txn_duration_sec) + block_db_sched_duration.labels(self.name).inc(usage.db_sched_duration_sec) + except ValueError: + logger.warn( + "Failed to save metrics! OLD: %r, NEW: %r", + self.start_usage, current + ) if self.created_context: self.start_context.__exit__(exc_type, exc_val, exc_tb) -- cgit 1.5.1 From d7275eecf3abcd5f7c645e5da47c7e7310350333 Mon Sep 17 00:00:00 2001 From: Richard van der Hoff Date: Fri, 20 Jul 2018 12:37:12 +0100 Subject: Add a sleep to the Limiter to fix stack overflows. Fixes #3570 --- synapse/util/async.py | 23 ++++++++++++++++++++--- tests/util/test_limiter.py | 8 ++++---- 2 files changed, 24 insertions(+), 7 deletions(-) (limited to 'synapse/util') diff --git a/synapse/util/async.py b/synapse/util/async.py index 5d0fb39130..7d5acecb1c 100644 --- a/synapse/util/async.py +++ b/synapse/util/async.py @@ -248,11 +248,15 @@ class Limiter(object): # do some work. """ - def __init__(self, max_count): + def __init__(self, max_count, clock=None): """ Args: max_count(int): The maximum number of concurrent access """ + if not clock: + from twisted.internet import reactor + clock = Clock(reactor) + self._clock = clock self.max_count = max_count # key_to_defer is a map from the key to a 2 element list where @@ -277,10 +281,23 @@ class Limiter(object): with PreserveLoggingContext(): yield new_defer logger.info("Acquired limiter lock for key %r", key) + entry[0] += 1 + + # if the code holding the lock completes synchronously, then it + # will recursively run the next claimant on the list. That can + # relatively rapidly lead to stack exhaustion. This is essentially + # the same problem as http://twistedmatrix.com/trac/ticket/9304. + # + # In order to break the cycle, we add a cheeky sleep(0) here to + # ensure that we fall back to the reactor between each iteration. + # + # (This needs to happen while we hold the lock, and the context manager's exit + # code must be synchronous, so this is the only sensible place.) + yield self._clock.sleep(0) + else: logger.info("Acquired uncontended limiter lock for key %r", key) - - entry[0] += 1 + entry[0] += 1 @contextmanager def _ctx_manager(): diff --git a/tests/util/test_limiter.py b/tests/util/test_limiter.py index a5a767b1ff..f7b942f5c1 100644 --- a/tests/util/test_limiter.py +++ b/tests/util/test_limiter.py @@ -48,21 +48,21 @@ class LimiterTestCase(unittest.TestCase): self.assertFalse(d4.called) self.assertFalse(d5.called) - self.assertTrue(d4.called) + cm4 = yield d4 self.assertFalse(d5.called) with cm3: self.assertFalse(d5.called) - self.assertTrue(d5.called) + cm5 = yield d5 with cm2: pass - with (yield d4): + with cm4: pass - with (yield d5): + with cm5: pass d6 = limiter.queue(key) -- cgit 1.5.1 From 8462c26485fb4f19fc52accc05870c0ea4c8eb6a Mon Sep 17 00:00:00 2001 From: Richard van der Hoff Date: Fri, 20 Jul 2018 12:43:23 +0100 Subject: Improvements to the Limiter * give them names, to improve logging * use a deque rather than a list for efficiency --- synapse/handlers/message.py | 2 +- synapse/util/async.py | 33 ++++++++++++++++++++------------- 2 files changed, 21 insertions(+), 14 deletions(-) (limited to 'synapse/util') diff --git a/synapse/handlers/message.py b/synapse/handlers/message.py index a39b852ceb..8c12c6990f 100644 --- a/synapse/handlers/message.py +++ b/synapse/handlers/message.py @@ -427,7 +427,7 @@ class EventCreationHandler(object): # We arbitrarily limit concurrent event creation for a room to 5. # This is to stop us from diverging history *too* much. - self.limiter = Limiter(max_count=5) + self.limiter = Limiter(max_count=5, name="room_event_creation_limit") self.action_generator = hs.get_action_generator() diff --git a/synapse/util/async.py b/synapse/util/async.py index 7d5acecb1c..22071ddef7 100644 --- a/synapse/util/async.py +++ b/synapse/util/async.py @@ -1,5 +1,6 @@ # -*- coding: utf-8 -*- # Copyright 2014-2016 OpenMarket Ltd +# Copyright 2018 New Vector Ltd. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -12,7 +13,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - +import collections import logging from contextlib import contextmanager @@ -248,11 +249,16 @@ class Limiter(object): # do some work. """ - def __init__(self, max_count, clock=None): + def __init__(self, max_count=1, name=None, clock=None): """ Args: - max_count(int): The maximum number of concurrent access + max_count(int): The maximum number of concurrent accesses """ + if name is None: + self.name = id(self) + else: + self.name = name + if not clock: from twisted.internet import reactor clock = Clock(reactor) @@ -260,14 +266,14 @@ class Limiter(object): self.max_count = max_count # key_to_defer is a map from the key to a 2 element list where - # the first element is the number of things executing - # the second element is a list of deferreds for the things blocked from + # the first element is the number of things executing, and + # the second element is a deque of deferreds for the things blocked from # executing. self.key_to_defer = {} @defer.inlineCallbacks def queue(self, key): - entry = self.key_to_defer.setdefault(key, [0, []]) + entry = self.key_to_defer.setdefault(key, [0, collections.deque()]) # If the number of things executing is greater than the maximum # then add a deferred to the list of blocked items @@ -277,10 +283,10 @@ class Limiter(object): new_defer = defer.Deferred() entry[1].append(new_defer) - logger.info("Waiting to acquire limiter lock for key %r", key) - with PreserveLoggingContext(): - yield new_defer - logger.info("Acquired limiter lock for key %r", key) + logger.info("Waiting to acquire limiter lock %r for key %r", self.name, key) + yield make_deferred_yieldable(new_defer) + + logger.info("Acquired limiter lock %r for key %r", self.name, key) entry[0] += 1 # if the code holding the lock completes synchronously, then it @@ -296,7 +302,7 @@ class Limiter(object): yield self._clock.sleep(0) else: - logger.info("Acquired uncontended limiter lock for key %r", key) + logger.info("Acquired uncontended limiter lock %r for key %r", self.name, key) entry[0] += 1 @contextmanager @@ -304,15 +310,16 @@ class Limiter(object): try: yield finally: - logger.info("Releasing limiter lock for key %r", key) + logger.info("Releasing limiter lock %r for key %r", self.name, key) # We've finished executing so check if there are any things # blocked waiting to execute and start one of them entry[0] -= 1 if entry[1]: - next_def = entry[1].pop(0) + next_def = entry[1].popleft() + # we need to run the next thing in the sentinel context. with PreserveLoggingContext(): next_def.callback(None) elif entry[0] == 0: -- cgit 1.5.1 From 7c712f95bbc7f405355d5714c92d65551f64fec2 Mon Sep 17 00:00:00 2001 From: Richard van der Hoff Date: Fri, 20 Jul 2018 13:11:43 +0100 Subject: Combine Limiter and Linearizer Linearizer was effectively a Limiter with max_count=1, so rather than maintaining two sets of code, let's combine them. --- synapse/handlers/message.py | 4 +- synapse/util/async.py | 99 +++++-------------------------------------- tests/util/test_limiter.py | 70 ------------------------------ tests/util/test_linearizer.py | 47 ++++++++++++++++++++ 4 files changed, 59 insertions(+), 161 deletions(-) delete mode 100644 tests/util/test_limiter.py (limited to 'synapse/util') diff --git a/synapse/handlers/message.py b/synapse/handlers/message.py index 8c12c6990f..abc07ea87c 100644 --- a/synapse/handlers/message.py +++ b/synapse/handlers/message.py @@ -33,7 +33,7 @@ from synapse.events.utils import serialize_event from synapse.events.validator import EventValidator from synapse.replication.http.send_event import send_event_to_master from synapse.types import RoomAlias, RoomStreamToken, UserID -from synapse.util.async import Limiter, ReadWriteLock +from synapse.util.async import Linearizer, ReadWriteLock from synapse.util.frozenutils import frozendict_json_encoder from synapse.util.logcontext import run_in_background from synapse.util.metrics import measure_func @@ -427,7 +427,7 @@ class EventCreationHandler(object): # We arbitrarily limit concurrent event creation for a room to 5. # This is to stop us from diverging history *too* much. - self.limiter = Limiter(max_count=5, name="room_event_creation_limit") + self.limiter = Linearizer(max_count=5, name="room_event_creation_limit") self.action_generator = hs.get_action_generator() diff --git a/synapse/util/async.py b/synapse/util/async.py index 22071ddef7..5a50d9700f 100644 --- a/synapse/util/async.py +++ b/synapse/util/async.py @@ -157,91 +157,8 @@ def concurrently_execute(func, args, limit): class Linearizer(object): - """Linearizes access to resources based on a key. Useful to ensure only one - thing is happening at a time on a given resource. - - Example: - - with (yield linearizer.queue("test_key")): - # do some work. - - """ - def __init__(self, name=None, clock=None): - if name is None: - self.name = id(self) - else: - self.name = name - self.key_to_defer = {} - - if not clock: - from twisted.internet import reactor - clock = Clock(reactor) - self._clock = clock - - @defer.inlineCallbacks - def queue(self, key): - # If there is already a deferred in the queue, we pull it out so that - # we can wait on it later. - # Then we replace it with a deferred that we resolve *after* the - # context manager has exited. - # We only return the context manager after the previous deferred has - # resolved. - # This all has the net effect of creating a chain of deferreds that - # wait for the previous deferred before starting their work. - current_defer = self.key_to_defer.get(key) - - new_defer = defer.Deferred() - self.key_to_defer[key] = new_defer - - if current_defer: - logger.info( - "Waiting to acquire linearizer lock %r for key %r", self.name, key - ) - try: - with PreserveLoggingContext(): - yield current_defer - except Exception: - logger.exception("Unexpected exception in Linearizer") - - logger.info("Acquired linearizer lock %r for key %r", self.name, - key) - - # if the code holding the lock completes synchronously, then it - # will recursively run the next claimant on the list. That can - # relatively rapidly lead to stack exhaustion. This is essentially - # the same problem as http://twistedmatrix.com/trac/ticket/9304. - # - # In order to break the cycle, we add a cheeky sleep(0) here to - # ensure that we fall back to the reactor between each iteration. - # - # (There's no particular need for it to happen before we return - # the context manager, but it needs to happen while we hold the - # lock, and the context manager's exit code must be synchronous, - # so actually this is the only sensible place. - yield self._clock.sleep(0) - - else: - logger.info("Acquired uncontended linearizer lock %r for key %r", - self.name, key) - - @contextmanager - def _ctx_manager(): - try: - yield - finally: - logger.info("Releasing linearizer lock %r for key %r", self.name, key) - with PreserveLoggingContext(): - new_defer.callback(None) - current_d = self.key_to_defer.get(key) - if current_d is new_defer: - self.key_to_defer.pop(key, None) - - defer.returnValue(_ctx_manager()) - - -class Limiter(object): """Limits concurrent access to resources based on a key. Useful to ensure - only a few thing happen at a time on a given resource. + only a few things happen at a time on a given resource. Example: @@ -249,7 +166,7 @@ class Limiter(object): # do some work. """ - def __init__(self, max_count=1, name=None, clock=None): + def __init__(self, name=None, max_count=1, clock=None): """ Args: max_count(int): The maximum number of concurrent accesses @@ -283,10 +200,12 @@ class Limiter(object): new_defer = defer.Deferred() entry[1].append(new_defer) - logger.info("Waiting to acquire limiter lock %r for key %r", self.name, key) + logger.info( + "Waiting to acquire linearizer lock %r for key %r", self.name, key, + ) yield make_deferred_yieldable(new_defer) - logger.info("Acquired limiter lock %r for key %r", self.name, key) + logger.info("Acquired linearizer lock %r for key %r", self.name, key) entry[0] += 1 # if the code holding the lock completes synchronously, then it @@ -302,7 +221,9 @@ class Limiter(object): yield self._clock.sleep(0) else: - logger.info("Acquired uncontended limiter lock %r for key %r", self.name, key) + logger.info( + "Acquired uncontended linearizer lock %r for key %r", self.name, key, + ) entry[0] += 1 @contextmanager @@ -310,7 +231,7 @@ class Limiter(object): try: yield finally: - logger.info("Releasing limiter lock %r for key %r", self.name, key) + logger.info("Releasing linearizer lock %r for key %r", self.name, key) # We've finished executing so check if there are any things # blocked waiting to execute and start one of them diff --git a/tests/util/test_limiter.py b/tests/util/test_limiter.py deleted file mode 100644 index f7b942f5c1..0000000000 --- a/tests/util/test_limiter.py +++ /dev/null @@ -1,70 +0,0 @@ -# -*- coding: utf-8 -*- -# Copyright 2016 OpenMarket Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -from twisted.internet import defer - -from synapse.util.async import Limiter - -from tests import unittest - - -class LimiterTestCase(unittest.TestCase): - - @defer.inlineCallbacks - def test_limiter(self): - limiter = Limiter(3) - - key = object() - - d1 = limiter.queue(key) - cm1 = yield d1 - - d2 = limiter.queue(key) - cm2 = yield d2 - - d3 = limiter.queue(key) - cm3 = yield d3 - - d4 = limiter.queue(key) - self.assertFalse(d4.called) - - d5 = limiter.queue(key) - self.assertFalse(d5.called) - - with cm1: - self.assertFalse(d4.called) - self.assertFalse(d5.called) - - cm4 = yield d4 - self.assertFalse(d5.called) - - with cm3: - self.assertFalse(d5.called) - - cm5 = yield d5 - - with cm2: - pass - - with cm4: - pass - - with cm5: - pass - - d6 = limiter.queue(key) - with (yield d6): - pass diff --git a/tests/util/test_linearizer.py b/tests/util/test_linearizer.py index c95907b32c..c9563124f9 100644 --- a/tests/util/test_linearizer.py +++ b/tests/util/test_linearizer.py @@ -1,5 +1,6 @@ # -*- coding: utf-8 -*- # Copyright 2016 OpenMarket Ltd +# Copyright 2018 New Vector Ltd. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -65,3 +66,49 @@ class LinearizerTestCase(unittest.TestCase): func(i) return func(1000) + + @defer.inlineCallbacks + def test_multiple_entries(self): + limiter = Linearizer(max_count=3) + + key = object() + + d1 = limiter.queue(key) + cm1 = yield d1 + + d2 = limiter.queue(key) + cm2 = yield d2 + + d3 = limiter.queue(key) + cm3 = yield d3 + + d4 = limiter.queue(key) + self.assertFalse(d4.called) + + d5 = limiter.queue(key) + self.assertFalse(d5.called) + + with cm1: + self.assertFalse(d4.called) + self.assertFalse(d5.called) + + cm4 = yield d4 + self.assertFalse(d5.called) + + with cm3: + self.assertFalse(d5.called) + + cm5 = yield d5 + + with cm2: + pass + + with cm4: + pass + + with cm5: + pass + + d6 = limiter.queue(key) + with (yield d6): + pass -- cgit 1.5.1 From 3d6df846580ce6ef8769945e6990af2f44251e40 Mon Sep 17 00:00:00 2001 From: Richard van der Hoff Date: Fri, 20 Jul 2018 13:59:55 +0100 Subject: Test and fix support for cancellation in Linearizer --- synapse/util/async.py | 28 ++++++++++++++++++++++------ tests/util/test_linearizer.py | 31 +++++++++++++++++++++++++++++++ 2 files changed, 53 insertions(+), 6 deletions(-) (limited to 'synapse/util') diff --git a/synapse/util/async.py b/synapse/util/async.py index 5a50d9700f..a7094e2fb4 100644 --- a/synapse/util/async.py +++ b/synapse/util/async.py @@ -184,13 +184,13 @@ class Linearizer(object): # key_to_defer is a map from the key to a 2 element list where # the first element is the number of things executing, and - # the second element is a deque of deferreds for the things blocked from - # executing. + # the second element is an OrderedDict, where the keys are deferreds for the + # things blocked from executing. self.key_to_defer = {} @defer.inlineCallbacks def queue(self, key): - entry = self.key_to_defer.setdefault(key, [0, collections.deque()]) + entry = self.key_to_defer.setdefault(key, [0, collections.OrderedDict()]) # If the number of things executing is greater than the maximum # then add a deferred to the list of blocked items @@ -198,12 +198,28 @@ class Linearizer(object): # this item so that it can continue executing. if entry[0] >= self.max_count: new_defer = defer.Deferred() - entry[1].append(new_defer) + entry[1][new_defer] = 1 logger.info( "Waiting to acquire linearizer lock %r for key %r", self.name, key, ) - yield make_deferred_yieldable(new_defer) + try: + yield make_deferred_yieldable(new_defer) + except Exception as e: + if isinstance(e, CancelledError): + logger.info( + "Cancelling wait for linearizer lock %r for key %r", + self.name, key, + ) + else: + logger.warn( + "Unexpected exception waiting for linearizer lock %r for key %r", + self.name, key, + ) + + # we just have to take ourselves back out of the queue. + del entry[1][new_defer] + raise logger.info("Acquired linearizer lock %r for key %r", self.name, key) entry[0] += 1 @@ -238,7 +254,7 @@ class Linearizer(object): entry[0] -= 1 if entry[1]: - next_def = entry[1].popleft() + (next_def, _) = entry[1].popitem(last=False) # we need to run the next thing in the sentinel context. with PreserveLoggingContext(): diff --git a/tests/util/test_linearizer.py b/tests/util/test_linearizer.py index c9563124f9..4729bd5a0a 100644 --- a/tests/util/test_linearizer.py +++ b/tests/util/test_linearizer.py @@ -17,6 +17,7 @@ from six.moves import range from twisted.internet import defer, reactor +from twisted.internet.defer import CancelledError from synapse.util import Clock, logcontext from synapse.util.async import Linearizer @@ -112,3 +113,33 @@ class LinearizerTestCase(unittest.TestCase): d6 = limiter.queue(key) with (yield d6): pass + + @defer.inlineCallbacks + def test_cancellation(self): + linearizer = Linearizer() + + key = object() + + d1 = linearizer.queue(key) + cm1 = yield d1 + + d2 = linearizer.queue(key) + self.assertFalse(d2.called) + + d3 = linearizer.queue(key) + self.assertFalse(d3.called) + + d2.cancel() + + with cm1: + pass + + self.assertTrue(d2.called) + try: + yield d2 + self.fail("Expected d2 to raise CancelledError") + except CancelledError: + pass + + with (yield d3): + pass -- cgit 1.5.1 From 03751a64203b169cbf33b636b6d940ca6d414c31 Mon Sep 17 00:00:00 2001 From: Richard van der Hoff Date: Thu, 26 Jul 2018 11:44:26 +0100 Subject: Fix some looping_call calls which were broken in #3604 It turns out that looping_call does check the deferred returned by its callback, and (at least in the case of client_ips), we were relying on this, and I broke it in #3604. Update run_as_background_process to return the deferred, and make sure we return it to clock.looping_call. --- changelog.d/3610.feature | 1 + synapse/app/homeserver.py | 4 ++-- synapse/groups/attestations.py | 2 +- synapse/handlers/profile.py | 2 +- synapse/metrics/background_process_metrics.py | 10 ++++++++-- synapse/rest/media/v1/media_repository.py | 2 +- synapse/rest/media/v1/preview_url_resource.py | 2 +- synapse/storage/client_ips.py | 2 +- synapse/storage/devices.py | 2 +- synapse/storage/event_federation.py | 2 +- synapse/storage/event_push_actions.py | 4 ++-- synapse/storage/transactions.py | 4 +++- synapse/util/caches/expiringcache.py | 2 +- 13 files changed, 24 insertions(+), 15 deletions(-) create mode 100644 changelog.d/3610.feature (limited to 'synapse/util') diff --git a/changelog.d/3610.feature b/changelog.d/3610.feature new file mode 100644 index 0000000000..77a294cb9f --- /dev/null +++ b/changelog.d/3610.feature @@ -0,0 +1 @@ +Add metrics to track resource usage by background processes diff --git a/synapse/app/homeserver.py b/synapse/app/homeserver.py index b7e7718290..57b815d777 100755 --- a/synapse/app/homeserver.py +++ b/synapse/app/homeserver.py @@ -429,7 +429,7 @@ def run(hs): stats_process = [] def start_phone_stats_home(): - run_as_background_process("phone_stats_home", phone_stats_home) + return run_as_background_process("phone_stats_home", phone_stats_home) @defer.inlineCallbacks def phone_stats_home(): @@ -502,7 +502,7 @@ def run(hs): ) def generate_user_daily_visit_stats(): - run_as_background_process( + return run_as_background_process( "generate_user_daily_visits", hs.get_datastore().generate_user_daily_visits, ) diff --git a/synapse/groups/attestations.py b/synapse/groups/attestations.py index 4216af0a27..b04f4234ca 100644 --- a/synapse/groups/attestations.py +++ b/synapse/groups/attestations.py @@ -153,7 +153,7 @@ class GroupAttestionRenewer(object): defer.returnValue({}) def _start_renew_attestations(self): - run_as_background_process("renew_attestations", self._renew_attestations) + return run_as_background_process("renew_attestations", self._renew_attestations) @defer.inlineCallbacks def _renew_attestations(self): diff --git a/synapse/handlers/profile.py b/synapse/handlers/profile.py index 43692b83a8..cb5c6d587e 100644 --- a/synapse/handlers/profile.py +++ b/synapse/handlers/profile.py @@ -256,7 +256,7 @@ class ProfileHandler(BaseHandler): ) def _start_update_remote_profile_cache(self): - run_as_background_process( + return run_as_background_process( "Update remote profile", self._update_remote_profile_cache, ) diff --git a/synapse/metrics/background_process_metrics.py b/synapse/metrics/background_process_metrics.py index 9d820e44a6..ce678d5f75 100644 --- a/synapse/metrics/background_process_metrics.py +++ b/synapse/metrics/background_process_metrics.py @@ -151,13 +151,19 @@ def run_as_background_process(desc, func, *args, **kwargs): This should be used to wrap processes which are fired off to run in the background, instead of being associated with a particular request. + It returns a Deferred which completes when the function completes, but it doesn't + follow the synapse logcontext rules, which makes it appropriate for passing to + clock.looping_call and friends (or for firing-and-forgetting in the middle of a + normal synapse inlineCallbacks function). + Args: desc (str): a description for this background process type func: a function, which may return a Deferred args: positional args for func kwargs: keyword args for func - Returns: None + Returns: Deferred which returns the result of func, but note that it does not + follow the synapse logcontext rules. """ @defer.inlineCallbacks def run(): @@ -176,4 +182,4 @@ def run_as_background_process(desc, func, *args, **kwargs): _background_processes[desc].remove(proc) with PreserveLoggingContext(): - run() + return run() diff --git a/synapse/rest/media/v1/media_repository.py b/synapse/rest/media/v1/media_repository.py index 5b13378caa..174ad20123 100644 --- a/synapse/rest/media/v1/media_repository.py +++ b/synapse/rest/media/v1/media_repository.py @@ -106,7 +106,7 @@ class MediaRepository(object): ) def _start_update_recently_accessed(self): - run_as_background_process( + return run_as_background_process( "update_recently_accessed_media", self._update_recently_accessed, ) diff --git a/synapse/rest/media/v1/preview_url_resource.py b/synapse/rest/media/v1/preview_url_resource.py index 4efd5339a4..27aa0def2f 100644 --- a/synapse/rest/media/v1/preview_url_resource.py +++ b/synapse/rest/media/v1/preview_url_resource.py @@ -373,7 +373,7 @@ class PreviewUrlResource(Resource): }) def _start_expire_url_cache_data(self): - run_as_background_process( + return run_as_background_process( "expire_url_cache_data", self._expire_url_cache_data, ) diff --git a/synapse/storage/client_ips.py b/synapse/storage/client_ips.py index 77ae10da3d..b8cefd43d6 100644 --- a/synapse/storage/client_ips.py +++ b/synapse/storage/client_ips.py @@ -102,7 +102,7 @@ class ClientIpStore(background_updates.BackgroundUpdateStore): to_update, ) - run_as_background_process( + return run_as_background_process( "update_client_ips", update, ) diff --git a/synapse/storage/devices.py b/synapse/storage/devices.py index 52dccb1507..c0943ecf91 100644 --- a/synapse/storage/devices.py +++ b/synapse/storage/devices.py @@ -712,7 +712,7 @@ class DeviceStore(SQLBaseStore): logger.info("Pruned %d device list outbound pokes", txn.rowcount) - run_as_background_process( + return run_as_background_process( "prune_old_outbound_device_pokes", self.runInteraction, "_prune_old_outbound_device_pokes", diff --git a/synapse/storage/event_federation.py b/synapse/storage/event_federation.py index 65f2d19e20..f269ec6fb3 100644 --- a/synapse/storage/event_federation.py +++ b/synapse/storage/event_federation.py @@ -549,7 +549,7 @@ class EventFederationStore(EventFederationWorkerStore): sql, (self.stream_ordering_month_ago, self.stream_ordering_month_ago,) ) - run_as_background_process( + return run_as_background_process( "delete_old_forward_extrem_cache", self.runInteraction, "_delete_old_forward_extrem_cache", diff --git a/synapse/storage/event_push_actions.py b/synapse/storage/event_push_actions.py index 4f44b0ad47..6840320641 100644 --- a/synapse/storage/event_push_actions.py +++ b/synapse/storage/event_push_actions.py @@ -460,7 +460,7 @@ class EventPushActionsWorkerStore(SQLBaseStore): ) def _find_stream_orderings_for_times(self): - run_as_background_process( + return run_as_background_process( "event_push_action_stream_orderings", self.runInteraction, "_find_stream_orderings_for_times", @@ -790,7 +790,7 @@ class EventPushActionsStore(EventPushActionsWorkerStore): """, (room_id, user_id, stream_ordering)) def _start_rotate_notifs(self): - run_as_background_process("rotate_notifs", self._rotate_notifs) + return run_as_background_process("rotate_notifs", self._rotate_notifs) @defer.inlineCallbacks def _rotate_notifs(self): diff --git a/synapse/storage/transactions.py b/synapse/storage/transactions.py index b4b479d94c..428e7fa36e 100644 --- a/synapse/storage/transactions.py +++ b/synapse/storage/transactions.py @@ -273,7 +273,9 @@ class TransactionStore(SQLBaseStore): return self.cursor_to_dict(txn) def _start_cleanup_transactions(self): - run_as_background_process("cleanup_transactions", self._cleanup_transactions) + return run_as_background_process( + "cleanup_transactions", self._cleanup_transactions, + ) def _cleanup_transactions(self): now = self._clock.time_msec() diff --git a/synapse/util/caches/expiringcache.py b/synapse/util/caches/expiringcache.py index 465adc54a8..ce85b2ae11 100644 --- a/synapse/util/caches/expiringcache.py +++ b/synapse/util/caches/expiringcache.py @@ -64,7 +64,7 @@ class ExpiringCache(object): return def f(): - run_as_background_process( + return run_as_background_process( "prune_cache_%s" % self._cache_name, self._prune_cache, ) -- cgit 1.5.1