Reduce the memory usage of previewing media files. (#9421)

This reduces the memory usage of previewing media files which end up larger than the `max_spider_size` by avoiding buffering content internally in treq. It also checks the `Content-Length` header in additional places instead of streaming the content to check the body length.
author: Patrick Cloke <clokep@users.noreply.github.com> 2021-02-18 09:01:29 -0500
committer: GitHub <noreply@github.com> 2021-02-18 09:01:29 -0500
commit: 8ec221710329c157a0c83d401e592a325ae02c20 (patch)
tree: 18d2d3e555405132ab0a9bee5d2d1cd05e5680d2 /synapse
parent: Add http2 to the nginx example config (#9390) (diff)
download: synapse-8ec221710329c157a0c83d401e592a325ae02c20.tar.xz
1 files changed, 12 insertions, 14 deletions
diff --git a/synapse/http/client.py b/synapse/http/client.py
index 73b414ccff..e54d9bd213 100644
--- a/synapse/http/client.py
+++ b/synapse/http/client.py
@@ -56,7 +56,7 @@ from twisted.web.client import (
 )
 from twisted.web.http import PotentialDataLoss
 from twisted.web.http_headers import Headers
-from twisted.web.iweb import IAgent, IBodyProducer, IResponse
+from twisted.web.iweb import UNKNOWN_LENGTH, IAgent, IBodyProducer, IResponse
 
 from synapse.api.errors import Codes, HttpResponseException, SynapseError
 from synapse.http import QuieterFileBodyProducer, RequestTimedOutError, redact_uri
@@ -408,6 +408,9 @@ class SimpleHttpClient:
                     agent=self.agent,
                     data=body_producer,
                     headers=headers,
+                    # Avoid buffering the body in treq since we do not reuse
+                    # response bodies.
+                    unbuffered=True,
                     **self._extra_treq_args,
                 )  # type: defer.Deferred
 
@@ -702,18 +705,6 @@ class SimpleHttpClient:
 
         resp_headers = dict(response.headers.getAllRawHeaders())
 
-        if (
-            b"Content-Length" in resp_headers
-            and max_size
-            and int(resp_headers[b"Content-Length"][0]) > max_size
-        ):
-            logger.warning("Requested URL is too large > %r bytes" % (max_size,))
-            raise SynapseError(
-                502,
-                "Requested file is too large > %r bytes" % (max_size,),
-                Codes.TOO_LARGE,
-            )
-
         if response.code > 299:
             logger.warning("Got %d when downloading %s" % (response.code, url))
             raise SynapseError(502, "Got error %d" % (response.code,), Codes.UNKNOWN)
@@ -780,7 +771,9 @@ class _ReadBodyWithMaxSizeProtocol(protocol.Protocol):
         # in the meantime.
         if self.max_size is not None and self.length >= self.max_size:
             self.deferred.errback(BodyExceededMaxSize())
-            self.transport.loseConnection()
+            # Close the connection (forcefully) since all the data will get
+            # discarded anyway.
+            self.transport.abortConnection()
 
     def connectionLost(self, reason: Failure) -> None:
         # If the maximum size was already exceeded, there's nothing to do.
@@ -814,6 +807,11 @@ def read_body_with_max_size(
     Returns:
         A Deferred which resolves to the length of the read body.
     """
+    # If the Content-Length header gives a size larger than the maximum allowed
+    # size, do not bother downloading the body.
+    if max_size is not None and response.length != UNKNOWN_LENGTH:
+        if response.length > max_size:
+            return defer.fail(BodyExceededMaxSize())
 
     d = defer.Deferred()
     response.deliverBody(_ReadBodyWithMaxSizeProtocol(stream, d, max_size))
author	Patrick Cloke <clokep@users.noreply.github.com>	2021-02-18 09:01:29 -0500
committer	GitHub <noreply@github.com>	2021-02-18 09:01:29 -0500
commit	8ec221710329c157a0c83d401e592a325ae02c20 (patch)
tree	18d2d3e555405132ab0a9bee5d2d1cd05e5680d2 /synapse
parent	Add http2 to the nginx example config (#9390) (diff)
download	synapse-8ec221710329c157a0c83d401e592a325ae02c20.tar.xz