From b85821aca2c3cfc1c732dfcc0c1d6758a263487a Mon Sep 17 00:00:00 2001
From: Andrew Morgan <1342360+anoadragon453@users.noreply.github.com>
Date: Tue, 4 May 2021 13:28:59 +0100
Subject: Add port parameter to the sample config for psycopg2 args (#9911)

Adds the `port` option with the default value to the sample config file.
---
 synapse/config/database.py | 1 +
 1 file changed, 1 insertion(+)

(limited to 'synapse/config')

diff --git a/synapse/config/database.py b/synapse/config/database.py
index 79a02706b4..c76ef1e1de 100644
--- a/synapse/config/database.py
+++ b/synapse/config/database.py
@@ -58,6 +58,7 @@ DEFAULT_CONFIG = """\
 #    password: secretpassword
 #    database: synapse
 #    host: localhost
+#    port: 5432
 #    cp_min: 5
 #    cp_max: 10
 #
-- 
cgit 1.5.1


From 1fb9a2d0bf2506ca6e5343cb340a441585ca1c07 Mon Sep 17 00:00:00 2001
From: Erik Johnston <erik@matrix.org>
Date: Wed, 5 May 2021 16:53:45 +0100
Subject: Limit how often GC happens by time. (#9902)

Synapse can be quite memory intensive, and unless care is taken to tune
the GC thresholds it can end up thrashing, causing noticable performance
problems for large servers. We fix this by limiting how often we GC a
given generation, regardless of current counts/thresholds.

This does not help with the reverse problem where the thresholds are set
too high, but that should only happen in situations where they've been
manually configured.

Adds a `gc_min_seconds_between` config option to override the defaults.

Fixes #9890.
---
 changelog.d/9902.feature      |  1 +
 docs/sample_config.yaml       | 10 ++++++++++
 synapse/app/generic_worker.py |  3 +++
 synapse/app/homeserver.py     |  3 +++
 synapse/config/server.py      | 31 ++++++++++++++++++++++++++++++-
 synapse/metrics/__init__.py   | 18 ++++++++++++++++--
 6 files changed, 63 insertions(+), 3 deletions(-)
 create mode 100644 changelog.d/9902.feature

(limited to 'synapse/config')

diff --git a/changelog.d/9902.feature b/changelog.d/9902.feature
new file mode 100644
index 0000000000..4d9f324d4e
--- /dev/null
+++ b/changelog.d/9902.feature
@@ -0,0 +1 @@
+Add limits to how often Synapse will GC, ensuring that large servers do not end up GC thrashing if `gc_thresholds` has not been correctly set.
diff --git a/docs/sample_config.yaml b/docs/sample_config.yaml
index d013725cdc..f469d6e54f 100644
--- a/docs/sample_config.yaml
+++ b/docs/sample_config.yaml
@@ -152,6 +152,16 @@ presence:
 #
 #gc_thresholds: [700, 10, 10]
 
+# The minimum time in seconds between each GC for a generation, regardless of
+# the GC thresholds. This ensures that we don't do GC too frequently.
+#
+# A value of `[1s, 10s, 30s]` indicates that a second must pass between consecutive
+# generation 0 GCs, etc.
+#
+# Defaults to `[1s, 10s, 30s]`.
+#
+#gc_min_interval: [0.5s, 30s, 1m]
+
 # Set the limit on the returned events in the timeline in the get
 # and sync operations. The default value is 100. -1 means no upper limit.
 #
diff --git a/synapse/app/generic_worker.py b/synapse/app/generic_worker.py
index 1a15ceee81..a3fe9a3f38 100644
--- a/synapse/app/generic_worker.py
+++ b/synapse/app/generic_worker.py
@@ -455,6 +455,9 @@ def start(config_options):
 
     synapse.events.USE_FROZEN_DICTS = config.use_frozen_dicts
 
+    if config.server.gc_seconds:
+        synapse.metrics.MIN_TIME_BETWEEN_GCS = config.server.gc_seconds
+
     hs = GenericWorkerServer(
         config.server_name,
         config=config,
diff --git a/synapse/app/homeserver.py b/synapse/app/homeserver.py
index 8e78134bbe..6a823da10d 100644
--- a/synapse/app/homeserver.py
+++ b/synapse/app/homeserver.py
@@ -342,6 +342,9 @@ def setup(config_options):
 
     events.USE_FROZEN_DICTS = config.use_frozen_dicts
 
+    if config.server.gc_seconds:
+        synapse.metrics.MIN_TIME_BETWEEN_GCS = config.server.gc_seconds
+
     hs = SynapseHomeServer(
         config.server_name,
         config=config,
diff --git a/synapse/config/server.py b/synapse/config/server.py
index 21ca7b33e3..c290a35a92 100644
--- a/synapse/config/server.py
+++ b/synapse/config/server.py
@@ -19,7 +19,7 @@ import logging
 import os.path
 import re
 from textwrap import indent
-from typing import Any, Dict, Iterable, List, Optional, Set
+from typing import Any, Dict, Iterable, List, Optional, Set, Tuple
 
 import attr
 import yaml
@@ -572,6 +572,7 @@ class ServerConfig(Config):
             _warn_if_webclient_configured(self.listeners)
 
         self.gc_thresholds = read_gc_thresholds(config.get("gc_thresholds", None))
+        self.gc_seconds = self.read_gc_intervals(config.get("gc_min_interval", None))
 
         @attr.s
         class LimitRemoteRoomsConfig:
@@ -917,6 +918,16 @@ class ServerConfig(Config):
         #
         #gc_thresholds: [700, 10, 10]
 
+        # The minimum time in seconds between each GC for a generation, regardless of
+        # the GC thresholds. This ensures that we don't do GC too frequently.
+        #
+        # A value of `[1s, 10s, 30s]` indicates that a second must pass between consecutive
+        # generation 0 GCs, etc.
+        #
+        # Defaults to `[1s, 10s, 30s]`.
+        #
+        #gc_min_interval: [0.5s, 30s, 1m]
+
         # Set the limit on the returned events in the timeline in the get
         # and sync operations. The default value is 100. -1 means no upper limit.
         #
@@ -1305,6 +1316,24 @@ class ServerConfig(Config):
             help="Turn on the twisted telnet manhole service on the given port.",
         )
 
+    def read_gc_intervals(self, durations) -> Optional[Tuple[float, float, float]]:
+        """Reads the three durations for the GC min interval option, returning seconds."""
+        if durations is None:
+            return None
+
+        try:
+            if len(durations) != 3:
+                raise ValueError()
+            return (
+                self.parse_duration(durations[0]) / 1000,
+                self.parse_duration(durations[1]) / 1000,
+                self.parse_duration(durations[2]) / 1000,
+            )
+        except Exception:
+            raise ConfigError(
+                "Value of `gc_min_interval` must be a list of three durations if set"
+            )
+
 
 def is_threepid_reserved(reserved_threepids, threepid):
     """Check the threepid against the reserved threepid config
diff --git a/synapse/metrics/__init__.py b/synapse/metrics/__init__.py
index 31b7b3c256..e671da26d5 100644
--- a/synapse/metrics/__init__.py
+++ b/synapse/metrics/__init__.py
@@ -535,6 +535,13 @@ class ReactorLastSeenMetric:
 
 REGISTRY.register(ReactorLastSeenMetric())
 
+# The minimum time in seconds between GCs for each generation, regardless of the current GC
+# thresholds and counts.
+MIN_TIME_BETWEEN_GCS = (1.0, 10.0, 30.0)
+
+# The time (in seconds since the epoch) of the last time we did a GC for each generation.
+_last_gc = [0.0, 0.0, 0.0]
+
 
 def runUntilCurrentTimer(reactor, func):
     @functools.wraps(func)
@@ -575,11 +582,16 @@ def runUntilCurrentTimer(reactor, func):
             return ret
 
         # Check if we need to do a manual GC (since its been disabled), and do
-        # one if necessary.
+        # one if necessary. Note we go in reverse order as e.g. a gen 1 GC may
+        # promote an object into gen 2, and we don't want to handle the same
+        # object multiple times.
         threshold = gc.get_threshold()
         counts = gc.get_count()
         for i in (2, 1, 0):
-            if threshold[i] < counts[i]:
+            # We check if we need to do one based on a straightforward
+            # comparison between the threshold and count. We also do an extra
+            # check to make sure that we don't a GC too often.
+            if threshold[i] < counts[i] and MIN_TIME_BETWEEN_GCS[i] < end - _last_gc[i]:
                 if i == 0:
                     logger.debug("Collecting gc %d", i)
                 else:
@@ -589,6 +601,8 @@ def runUntilCurrentTimer(reactor, func):
                 unreachable = gc.collect(i)
                 end = time.time()
 
+                _last_gc[i] = end
+
                 gc_time.labels(i).observe(end - start)
                 gc_unreachable.labels(i).set(unreachable)
 
-- 
cgit 1.5.1


From ef889c98a6cde0cfa95f7fdaf7f99ec3c1e9bb7f Mon Sep 17 00:00:00 2001
From: Erik Johnston <erik@matrix.org>
Date: Wed, 5 May 2021 16:54:36 +0100
Subject: Optionally track memory usage of each LruCache (#9881)

This will double count slightly in the presence of interned strings. It's off by default as it can consume a lot of resources.
---
 changelog.d/9881.feature        |  1 +
 mypy.ini                        |  3 +++
 synapse/app/generic_worker.py   |  1 +
 synapse/app/homeserver.py       |  1 +
 synapse/config/cache.py         | 11 ++++++++++
 synapse/python_dependencies.py  |  2 ++
 synapse/util/caches/__init__.py | 31 ++++++++++++++++++++++++++
 synapse/util/caches/lrucache.py | 48 ++++++++++++++++++++++++++++++++++++++++-
 8 files changed, 97 insertions(+), 1 deletion(-)
 create mode 100644 changelog.d/9881.feature

(limited to 'synapse/config')

diff --git a/changelog.d/9881.feature b/changelog.d/9881.feature
new file mode 100644
index 0000000000..088a517e02
--- /dev/null
+++ b/changelog.d/9881.feature
@@ -0,0 +1 @@
+Add experimental option to track memory usage of the caches.
diff --git a/mypy.ini b/mypy.ini
index a40f705b76..ea655a0d4d 100644
--- a/mypy.ini
+++ b/mypy.ini
@@ -171,3 +171,6 @@ ignore_missing_imports = True
 
 [mypy-txacme.*]
 ignore_missing_imports = True
+
+[mypy-pympler.*]
+ignore_missing_imports = True
diff --git a/synapse/app/generic_worker.py b/synapse/app/generic_worker.py
index a3fe9a3f38..f730cdbd78 100644
--- a/synapse/app/generic_worker.py
+++ b/synapse/app/generic_worker.py
@@ -454,6 +454,7 @@ def start(config_options):
         config.server.update_user_directory = False
 
     synapse.events.USE_FROZEN_DICTS = config.use_frozen_dicts
+    synapse.util.caches.TRACK_MEMORY_USAGE = config.caches.track_memory_usage
 
     if config.server.gc_seconds:
         synapse.metrics.MIN_TIME_BETWEEN_GCS = config.server.gc_seconds
diff --git a/synapse/app/homeserver.py b/synapse/app/homeserver.py
index 6a823da10d..b2501ee4d7 100644
--- a/synapse/app/homeserver.py
+++ b/synapse/app/homeserver.py
@@ -341,6 +341,7 @@ def setup(config_options):
         sys.exit(0)
 
     events.USE_FROZEN_DICTS = config.use_frozen_dicts
+    synapse.util.caches.TRACK_MEMORY_USAGE = config.caches.track_memory_usage
 
     if config.server.gc_seconds:
         synapse.metrics.MIN_TIME_BETWEEN_GCS = config.server.gc_seconds
diff --git a/synapse/config/cache.py b/synapse/config/cache.py
index 41b9b3f51f..91165ee1ce 100644
--- a/synapse/config/cache.py
+++ b/synapse/config/cache.py
@@ -17,6 +17,8 @@ import re
 import threading
 from typing import Callable, Dict
 
+from synapse.python_dependencies import DependencyException, check_requirements
+
 from ._base import Config, ConfigError
 
 # The prefix for all cache factor-related environment variables
@@ -189,6 +191,15 @@ class CacheConfig(Config):
                 )
             self.cache_factors[cache] = factor
 
+        self.track_memory_usage = cache_config.get("track_memory_usage", False)
+        if self.track_memory_usage:
+            try:
+                check_requirements("cache_memory")
+            except DependencyException as e:
+                raise ConfigError(
+                    e.message  # noqa: B306, DependencyException.message is a property
+                )
+
         # Resize all caches (if necessary) with the new factors we've loaded
         self.resize_all_caches()
 
diff --git a/synapse/python_dependencies.py b/synapse/python_dependencies.py
index 2de946f464..d58eeeaa74 100644
--- a/synapse/python_dependencies.py
+++ b/synapse/python_dependencies.py
@@ -116,6 +116,8 @@ CONDITIONAL_REQUIREMENTS = {
     # hiredis is not a *strict* dependency, but it makes things much faster.
     # (if it is not installed, we fall back to slow code.)
     "redis": ["txredisapi>=1.4.7", "hiredis"],
+    # Required to use experimental `caches.track_memory_usage` config option.
+    "cache_memory": ["pympler"],
 }
 
 ALL_OPTIONAL_REQUIREMENTS = set()  # type: Set[str]
diff --git a/synapse/util/caches/__init__.py b/synapse/util/caches/__init__.py
index 46af7fa473..ca36f07c20 100644
--- a/synapse/util/caches/__init__.py
+++ b/synapse/util/caches/__init__.py
@@ -24,6 +24,11 @@ from synapse.config.cache import add_resizable_cache
 
 logger = logging.getLogger(__name__)
 
+
+# Whether to track estimated memory usage of the LruCaches.
+TRACK_MEMORY_USAGE = False
+
+
 caches_by_name = {}  # type: Dict[str, Sized]
 collectors_by_name = {}  # type: Dict[str, CacheMetric]
 
@@ -32,6 +37,11 @@ cache_hits = Gauge("synapse_util_caches_cache:hits", "", ["name"])
 cache_evicted = Gauge("synapse_util_caches_cache:evicted_size", "", ["name"])
 cache_total = Gauge("synapse_util_caches_cache:total", "", ["name"])
 cache_max_size = Gauge("synapse_util_caches_cache_max_size", "", ["name"])
+cache_memory_usage = Gauge(
+    "synapse_util_caches_cache_size_bytes",
+    "Estimated memory usage of the caches",
+    ["name"],
+)
 
 response_cache_size = Gauge("synapse_util_caches_response_cache:size", "", ["name"])
 response_cache_hits = Gauge("synapse_util_caches_response_cache:hits", "", ["name"])
@@ -52,6 +62,7 @@ class CacheMetric:
     hits = attr.ib(default=0)
     misses = attr.ib(default=0)
     evicted_size = attr.ib(default=0)
+    memory_usage = attr.ib(default=None)
 
     def inc_hits(self):
         self.hits += 1
@@ -62,6 +73,19 @@ class CacheMetric:
     def inc_evictions(self, size=1):
         self.evicted_size += size
 
+    def inc_memory_usage(self, memory: int):
+        if self.memory_usage is None:
+            self.memory_usage = 0
+
+        self.memory_usage += memory
+
+    def dec_memory_usage(self, memory: int):
+        self.memory_usage -= memory
+
+    def clear_memory_usage(self):
+        if self.memory_usage is not None:
+            self.memory_usage = 0
+
     def describe(self):
         return []
 
@@ -81,6 +105,13 @@ class CacheMetric:
                 cache_total.labels(self._cache_name).set(self.hits + self.misses)
                 if getattr(self._cache, "max_size", None):
                     cache_max_size.labels(self._cache_name).set(self._cache.max_size)
+
+                if TRACK_MEMORY_USAGE:
+                    # self.memory_usage can be None if nothing has been inserted
+                    # into the cache yet.
+                    cache_memory_usage.labels(self._cache_name).set(
+                        self.memory_usage or 0
+                    )
             if self._collect_callback:
                 self._collect_callback()
         except Exception as e:
diff --git a/synapse/util/caches/lrucache.py b/synapse/util/caches/lrucache.py
index 10b0ec6b75..1be675e014 100644
--- a/synapse/util/caches/lrucache.py
+++ b/synapse/util/caches/lrucache.py
@@ -32,9 +32,36 @@ from typing import (
 from typing_extensions import Literal
 
 from synapse.config import cache as cache_config
+from synapse.util import caches
 from synapse.util.caches import CacheMetric, register_cache
 from synapse.util.caches.treecache import TreeCache
 
+try:
+    from pympler.asizeof import Asizer
+
+    def _get_size_of(val: Any, *, recurse=True) -> int:
+        """Get an estimate of the size in bytes of the object.
+
+        Args:
+            val: The object to size.
+            recurse: If true will include referenced values in the size,
+                otherwise only sizes the given object.
+        """
+        # Ignore singleton values when calculating memory usage.
+        if val in ((), None, ""):
+            return 0
+
+        sizer = Asizer()
+        sizer.exclude_refs((), None, "")
+        return sizer.asizeof(val, limit=100 if recurse else 0)
+
+
+except ImportError:
+
+    def _get_size_of(val: Any, *, recurse=True) -> int:
+        return 0
+
+
 # Function type: the type used for invalidation callbacks
 FT = TypeVar("FT", bound=Callable[..., Any])
 
@@ -56,7 +83,7 @@ def enumerate_leaves(node, depth):
 
 
 class _Node:
-    __slots__ = ["prev_node", "next_node", "key", "value", "callbacks"]
+    __slots__ = ["prev_node", "next_node", "key", "value", "callbacks", "memory"]
 
     def __init__(
         self,
@@ -84,6 +111,16 @@ class _Node:
 
         self.add_callbacks(callbacks)
 
+        self.memory = 0
+        if caches.TRACK_MEMORY_USAGE:
+            self.memory = (
+                _get_size_of(key)
+                + _get_size_of(value)
+                + _get_size_of(self.callbacks, recurse=False)
+                + _get_size_of(self, recurse=False)
+            )
+            self.memory += _get_size_of(self.memory, recurse=False)
+
     def add_callbacks(self, callbacks: Collection[Callable[[], None]]) -> None:
         """Add to stored list of callbacks, removing duplicates."""
 
@@ -233,6 +270,9 @@ class LruCache(Generic[KT, VT]):
             if size_callback:
                 cached_cache_len[0] += size_callback(node.value)
 
+            if caches.TRACK_MEMORY_USAGE and metrics:
+                metrics.inc_memory_usage(node.memory)
+
         def move_node_to_front(node):
             prev_node = node.prev_node
             next_node = node.next_node
@@ -258,6 +298,9 @@ class LruCache(Generic[KT, VT]):
 
             node.run_and_clear_callbacks()
 
+            if caches.TRACK_MEMORY_USAGE and metrics:
+                metrics.dec_memory_usage(node.memory)
+
             return deleted_len
 
         @overload
@@ -373,6 +416,9 @@ class LruCache(Generic[KT, VT]):
             if size_callback:
                 cached_cache_len[0] = 0
 
+            if caches.TRACK_MEMORY_USAGE and metrics:
+                metrics.clear_memory_usage()
+
         @synchronized
         def cache_contains(key: KT) -> bool:
             return key in cache
-- 
cgit 1.5.1