Fix stack overflow in `_PerHostRatelimiter` due to synchronous requests (#14812)

When there are many synchronous requests waiting on a `_PerHostRatelimiter`, each request will be started recursively just after the previous request has completed. Under the right conditions, this leads to stack exhaustion. A common way for requests to become synchronous is when the remote client disconnects early, because the homeserver is overloaded and slow to respond. Avoid stack exhaustion under these conditions by deferring subsequent requests until the next reactor tick. Fixes #14480. Signed-off-by: Sean Quah <seanq@matrix.org>
author: Sean Quah <8349537+squahtx@users.noreply.github.com> 2023-01-13 00:16:21 +0000
committer: GitHub <noreply@github.com> 2023-01-13 00:16:21 +0000
commit: 772e8c23856e27960caba4dd87af42401b6c0cac (patch)
tree: 11d8c1cb6bb8780277d4caa5d67bb2334b0222f4 /tests
parent: Merge branch 'release-v1.75' into develop (diff)
download: synapse-772e8c23856e27960caba4dd87af42401b6c0cac.tar.xz
1 file changed, 42 insertions, 3 deletions
diff --git a/tests/util/test_ratelimitutils.py b/tests/util/test_ratelimitutils.py
index 5b327b390e..2f3ea15b96 100644
--- a/tests/util/test_ratelimitutils.py
+++ b/tests/util/test_ratelimitutils.py
@@ -13,6 +13,7 @@
 # limitations under the License.
 from typing import Optional
 
+from twisted.internet import defer
 from twisted.internet.defer import Deferred
 
 from synapse.config.homeserver import HomeServerConfig
@@ -29,7 +30,7 @@ class FederationRateLimiterTestCase(TestCase):
         """A simple test with the default values"""
         reactor, clock = get_clock()
         rc_config = build_rc_config()
-        ratelimiter = FederationRateLimiter(clock, rc_config)
+        ratelimiter = FederationRateLimiter(reactor, clock, rc_config)
 
         with ratelimiter.ratelimit("testhost") as d1:
             # shouldn't block
@@ -39,7 +40,7 @@ class FederationRateLimiterTestCase(TestCase):
         """Test what happens when we hit the concurrent limit"""
         reactor, clock = get_clock()
         rc_config = build_rc_config({"rc_federation": {"concurrent": 2}})
-        ratelimiter = FederationRateLimiter(clock, rc_config)
+        ratelimiter = FederationRateLimiter(reactor, clock, rc_config)
 
         with ratelimiter.ratelimit("testhost") as d1:
             # shouldn't block
@@ -57,6 +58,7 @@ class FederationRateLimiterTestCase(TestCase):
 
             # ... until we complete an earlier request
             cm2.__exit__(None, None, None)
+            reactor.advance(0.0)
             self.successResultOf(d3)
 
     def test_sleep_limit(self) -> None:
@@ -65,7 +67,7 @@ class FederationRateLimiterTestCase(TestCase):
         rc_config = build_rc_config(
             {"rc_federation": {"sleep_limit": 2, "sleep_delay": 500}}
         )
-        ratelimiter = FederationRateLimiter(clock, rc_config)
+        ratelimiter = FederationRateLimiter(reactor, clock, rc_config)
 
         with ratelimiter.ratelimit("testhost") as d1:
             # shouldn't block
@@ -81,6 +83,43 @@ class FederationRateLimiterTestCase(TestCase):
             sleep_time = _await_resolution(reactor, d3)
             self.assertAlmostEqual(sleep_time, 500, places=3)
 
+    def test_lots_of_queued_things(self) -> None:
+        """Tests lots of synchronous things queued up behind a slow thing.
+
+        The stack should *not* explode when the slow thing completes.
+        """
+        reactor, clock = get_clock()
+        rc_config = build_rc_config(
+            {
+                "rc_federation": {
+                    "sleep_limit": 1000000000,  # never sleep
+                    "reject_limit": 1000000000,  # never reject requests
+                    "concurrent": 1,
+                }
+            }
+        )
+        ratelimiter = FederationRateLimiter(reactor, clock, rc_config)
+
+        with ratelimiter.ratelimit("testhost") as d:
+            # shouldn't block
+            self.successResultOf(d)
+
+            async def task() -> None:
+                with ratelimiter.ratelimit("testhost") as d:
+                    await d
+
+            for _ in range(1, 100):
+                defer.ensureDeferred(task())
+
+            last_task = defer.ensureDeferred(task())
+
+            # Upon exiting the context manager, all the synchronous things will resume.
+            # If a stack overflow occurs, the final task will not complete.
+
+        # Wait for all the things to complete.
+        reactor.advance(0.0)
+        self.successResultOf(last_task)
+
 
 def _await_resolution(reactor: ThreadedMemoryReactorClock, d: Deferred) -> float:
     """advance the clock until the deferred completes.
author	Sean Quah <8349537+squahtx@users.noreply.github.com>	2023-01-13 00:16:21 +0000
committer	GitHub <noreply@github.com>	2023-01-13 00:16:21 +0000
commit	772e8c23856e27960caba4dd87af42401b6c0cac (patch)
tree	11d8c1cb6bb8780277d4caa5d67bb2334b0222f4 /tests
parent	Merge branch 'release-v1.75' into develop (diff)
download	synapse-772e8c23856e27960caba4dd87af42401b6c0cac.tar.xz