summary refs log tree commit diff
diff options
context:
space:
mode:
authorErik Johnston <erik@matrix.org>2021-04-30 10:14:23 +0100
committerErik Johnston <erik@matrix.org>2021-04-30 10:14:23 +0100
commitf5a25c7b531febb19bb62ab5e142f0ac630d7ede (patch)
treebb9872477cc7d3f18081e7ed76f265ed83218bdc
parentMerge branch 'erikj/fix_presence_joined' into erikj/test_send (diff)
parentNewsfile (diff)
downloadsynapse-f5a25c7b531febb19bb62ab5e142f0ac630d7ede.tar.xz
Merge branch 'erikj/limit_how_often_gc' into erikj/test_send
-rw-r--r--changelog.d/9902.feature1
-rw-r--r--docs/sample_config.yaml8
-rw-r--r--synapse/app/generic_worker.py3
-rw-r--r--synapse/app/homeserver.py3
-rw-r--r--synapse/config/server.py9
-rw-r--r--synapse/metrics/__init__.py18
6 files changed, 40 insertions, 2 deletions
diff --git a/changelog.d/9902.feature b/changelog.d/9902.feature
new file mode 100644
index 0000000000..4d9f324d4e
--- /dev/null
+++ b/changelog.d/9902.feature
@@ -0,0 +1 @@
+Add limits to how often Synapse will GC, ensuring that large servers do not end up GC thrashing if `gc_thresholds` has not been correctly set.
diff --git a/docs/sample_config.yaml b/docs/sample_config.yaml
index e0350279ad..ebf364cf40 100644
--- a/docs/sample_config.yaml
+++ b/docs/sample_config.yaml
@@ -152,6 +152,14 @@ presence:
 #
 #gc_thresholds: [700, 10, 10]
 
+# The minimum time in seconds between each GC for a generation, regardless of
+# the GC thresholds. This ensures that we don't do GC too frequently.
+#
+# A value of `[1, 10, 30]` indicates that a second must pass between consecutive
+# generation 0 GCs, etc.
+#
+# gc_min_seconds_between: [1, 10, 30]
+
 # Set the limit on the returned events in the timeline in the get
 # and sync operations. The default value is 100. -1 means no upper limit.
 #
diff --git a/synapse/app/generic_worker.py b/synapse/app/generic_worker.py
index 1a15ceee81..a3fe9a3f38 100644
--- a/synapse/app/generic_worker.py
+++ b/synapse/app/generic_worker.py
@@ -455,6 +455,9 @@ def start(config_options):
 
     synapse.events.USE_FROZEN_DICTS = config.use_frozen_dicts
 
+    if config.server.gc_seconds:
+        synapse.metrics.MIN_TIME_BETWEEN_GCS = config.server.gc_seconds
+
     hs = GenericWorkerServer(
         config.server_name,
         config=config,
diff --git a/synapse/app/homeserver.py b/synapse/app/homeserver.py
index 8e78134bbe..6a823da10d 100644
--- a/synapse/app/homeserver.py
+++ b/synapse/app/homeserver.py
@@ -342,6 +342,9 @@ def setup(config_options):
 
     events.USE_FROZEN_DICTS = config.use_frozen_dicts
 
+    if config.server.gc_seconds:
+        synapse.metrics.MIN_TIME_BETWEEN_GCS = config.server.gc_seconds
+
     hs = SynapseHomeServer(
         config.server_name,
         config=config,
diff --git a/synapse/config/server.py b/synapse/config/server.py
index 21ca7b33e3..ca1c9711f8 100644
--- a/synapse/config/server.py
+++ b/synapse/config/server.py
@@ -572,6 +572,7 @@ class ServerConfig(Config):
             _warn_if_webclient_configured(self.listeners)
 
         self.gc_thresholds = read_gc_thresholds(config.get("gc_thresholds", None))
+        self.gc_seconds = read_gc_thresholds(config.get("gc_min_seconds_between", None))
 
         @attr.s
         class LimitRemoteRoomsConfig:
@@ -917,6 +918,14 @@ class ServerConfig(Config):
         #
         #gc_thresholds: [700, 10, 10]
 
+        # The minimum time in seconds between each GC for a generation, regardless of
+        # the GC thresholds. This ensures that we don't do GC too frequently.
+        #
+        # A value of `[1, 10, 30]` indicates that a second must pass between consecutive
+        # generation 0 GCs, etc.
+        #
+        # gc_min_seconds_between: [1, 10, 30]
+
         # Set the limit on the returned events in the timeline in the get
         # and sync operations. The default value is 100. -1 means no upper limit.
         #
diff --git a/synapse/metrics/__init__.py b/synapse/metrics/__init__.py
index 13be0b9b96..c841363b1e 100644
--- a/synapse/metrics/__init__.py
+++ b/synapse/metrics/__init__.py
@@ -538,6 +538,13 @@ class ReactorLastSeenMetric:
 
 REGISTRY.register(ReactorLastSeenMetric())
 
+# The minimum time in seconds between GCs for each generation, regardless of the current GC
+# thresholds and counts.
+MIN_TIME_BETWEEN_GCS = [1, 10, 30]
+
+# The time in seconds of the last time we did a GC for each generation.
+_last_gc = [0, 0, 0]
+
 
 def runUntilCurrentTimer(reactor, func):
     @functools.wraps(func)
@@ -578,11 +585,16 @@ def runUntilCurrentTimer(reactor, func):
             return ret
 
         # Check if we need to do a manual GC (since its been disabled), and do
-        # one if necessary.
+        # one if necessary. Note we go in reverse order as e.g. a gen 1 GC may
+        # promote an object into gen 2, and we don't want to handle the same
+        # object multiple times.
         threshold = gc.get_threshold()
         counts = gc.get_count()
         for i in (2, 1, 0):
-            if threshold[i] < counts[i]:
+            # We check if we need to do one based on a straightforward
+            # comparison between the threshold and count. We also do an extra
+            # check to make sure that we don't a GC too often.
+            if threshold[i] < counts[i] and MIN_TIME_BETWEEN_GCS[i] < end - _last_gc[i]:
                 if i == 0:
                     logger.debug("Collecting gc %d", i)
                 else:
@@ -592,6 +604,8 @@ def runUntilCurrentTimer(reactor, func):
                 unreachable = gc.collect(i)
                 end = time.time()
 
+                _last_gc[i] = int(end)
+
                 gc_time.labels(i).observe(end - start)
                 gc_unreachable.labels(i).set(unreachable)