diff --git a/changelog.d/9902.feature b/changelog.d/9902.feature
new file mode 100644
index 0000000000..4d9f324d4e
--- /dev/null
+++ b/changelog.d/9902.feature
@@ -0,0 +1 @@
+Add limits to how often Synapse will GC, ensuring that large servers do not end up GC thrashing if `gc_thresholds` has not been correctly set.
diff --git a/docs/sample_config.yaml b/docs/sample_config.yaml
index d013725cdc..f469d6e54f 100644
--- a/docs/sample_config.yaml
+++ b/docs/sample_config.yaml
@@ -152,6 +152,16 @@ presence:
#
#gc_thresholds: [700, 10, 10]
+# The minimum time in seconds between each GC for a generation, regardless of
+# the GC thresholds. This ensures that we don't do GC too frequently.
+#
+# A value of `[1s, 10s, 30s]` indicates that a second must pass between consecutive
+# generation 0 GCs, etc.
+#
+# Defaults to `[1s, 10s, 30s]`.
+#
+#gc_min_interval: [0.5s, 30s, 1m]
+
# Set the limit on the returned events in the timeline in the get
# and sync operations. The default value is 100. -1 means no upper limit.
#
diff --git a/synapse/app/generic_worker.py b/synapse/app/generic_worker.py
index 1a15ceee81..a3fe9a3f38 100644
--- a/synapse/app/generic_worker.py
+++ b/synapse/app/generic_worker.py
@@ -455,6 +455,9 @@ def start(config_options):
synapse.events.USE_FROZEN_DICTS = config.use_frozen_dicts
+ if config.server.gc_seconds:
+ synapse.metrics.MIN_TIME_BETWEEN_GCS = config.server.gc_seconds
+
hs = GenericWorkerServer(
config.server_name,
config=config,
diff --git a/synapse/app/homeserver.py b/synapse/app/homeserver.py
index 8e78134bbe..6a823da10d 100644
--- a/synapse/app/homeserver.py
+++ b/synapse/app/homeserver.py
@@ -342,6 +342,9 @@ def setup(config_options):
events.USE_FROZEN_DICTS = config.use_frozen_dicts
+ if config.server.gc_seconds:
+ synapse.metrics.MIN_TIME_BETWEEN_GCS = config.server.gc_seconds
+
hs = SynapseHomeServer(
config.server_name,
config=config,
diff --git a/synapse/config/server.py b/synapse/config/server.py
index 21ca7b33e3..c290a35a92 100644
--- a/synapse/config/server.py
+++ b/synapse/config/server.py
@@ -19,7 +19,7 @@ import logging
import os.path
import re
from textwrap import indent
-from typing import Any, Dict, Iterable, List, Optional, Set
+from typing import Any, Dict, Iterable, List, Optional, Set, Tuple
import attr
import yaml
@@ -572,6 +572,7 @@ class ServerConfig(Config):
_warn_if_webclient_configured(self.listeners)
self.gc_thresholds = read_gc_thresholds(config.get("gc_thresholds", None))
+ self.gc_seconds = self.read_gc_intervals(config.get("gc_min_interval", None))
@attr.s
class LimitRemoteRoomsConfig:
@@ -917,6 +918,16 @@ class ServerConfig(Config):
#
#gc_thresholds: [700, 10, 10]
+ # The minimum time in seconds between each GC for a generation, regardless of
+ # the GC thresholds. This ensures that we don't do GC too frequently.
+ #
+ # A value of `[1s, 10s, 30s]` indicates that a second must pass between consecutive
+ # generation 0 GCs, etc.
+ #
+ # Defaults to `[1s, 10s, 30s]`.
+ #
+ #gc_min_interval: [0.5s, 30s, 1m]
+
# Set the limit on the returned events in the timeline in the get
# and sync operations. The default value is 100. -1 means no upper limit.
#
@@ -1305,6 +1316,24 @@ class ServerConfig(Config):
help="Turn on the twisted telnet manhole service on the given port.",
)
+ def read_gc_intervals(self, durations) -> Optional[Tuple[float, float, float]]:
+ """Reads the three durations for the GC min interval option, returning seconds."""
+ if durations is None:
+ return None
+
+ try:
+ if len(durations) != 3:
+ raise ValueError()
+ return (
+ self.parse_duration(durations[0]) / 1000,
+ self.parse_duration(durations[1]) / 1000,
+ self.parse_duration(durations[2]) / 1000,
+ )
+ except Exception:
+ raise ConfigError(
+ "Value of `gc_min_interval` must be a list of three durations if set"
+ )
+
def is_threepid_reserved(reserved_threepids, threepid):
"""Check the threepid against the reserved threepid config
diff --git a/synapse/metrics/__init__.py b/synapse/metrics/__init__.py
index 31b7b3c256..e671da26d5 100644
--- a/synapse/metrics/__init__.py
+++ b/synapse/metrics/__init__.py
@@ -535,6 +535,13 @@ class ReactorLastSeenMetric:
REGISTRY.register(ReactorLastSeenMetric())
+# The minimum time in seconds between GCs for each generation, regardless of the current GC
+# thresholds and counts.
+MIN_TIME_BETWEEN_GCS = (1.0, 10.0, 30.0)
+
+# The time (in seconds since the epoch) of the last time we did a GC for each generation.
+_last_gc = [0.0, 0.0, 0.0]
+
def runUntilCurrentTimer(reactor, func):
@functools.wraps(func)
@@ -575,11 +582,16 @@ def runUntilCurrentTimer(reactor, func):
return ret
# Check if we need to do a manual GC (since its been disabled), and do
- # one if necessary.
+ # one if necessary. Note we go in reverse order as e.g. a gen 1 GC may
+ # promote an object into gen 2, and we don't want to handle the same
+ # object multiple times.
threshold = gc.get_threshold()
counts = gc.get_count()
for i in (2, 1, 0):
- if threshold[i] < counts[i]:
+ # We check if we need to do one based on a straightforward
+ # comparison between the threshold and count. We also do an extra
+ # check to make sure that we don't a GC too often.
+ if threshold[i] < counts[i] and MIN_TIME_BETWEEN_GCS[i] < end - _last_gc[i]:
if i == 0:
logger.debug("Collecting gc %d", i)
else:
@@ -589,6 +601,8 @@ def runUntilCurrentTimer(reactor, func):
unreachable = gc.collect(i)
end = time.time()
+ _last_gc[i] = end
+
gc_time.labels(i).observe(end - start)
gc_unreachable.labels(i).set(unreachable)
|