diff --git a/changelog.d/11936.bugfix b/changelog.d/11936.bugfix
new file mode 100644
index 0000000000..bc149f2801
--- /dev/null
+++ b/changelog.d/11936.bugfix
@@ -0,0 +1 @@
+Implement an allow list of content types for which we will attempt to preview a URL. This prevents Synapse from making needless long-lived connections to streaming media servers.
diff --git a/synapse/http/client.py b/synapse/http/client.py
index 743a7ffcb1..d617055617 100644
--- a/synapse/http/client.py
+++ b/synapse/http/client.py
@@ -20,6 +20,7 @@ from typing import (
TYPE_CHECKING,
Any,
BinaryIO,
+ Callable,
Dict,
Iterable,
List,
@@ -693,12 +694,18 @@ class SimpleHttpClient:
output_stream: BinaryIO,
max_size: Optional[int] = None,
headers: Optional[RawHeaders] = None,
+ is_allowed_content_type: Optional[Callable[[str], bool]] = None,
) -> Tuple[int, Dict[bytes, List[bytes]], str, int]:
"""GETs a file from a given URL
Args:
url: The URL to GET
output_stream: File to write the response body to.
headers: A map from header name to a list of values for that header
+ is_allowed_content_type: A predicate to determine whether the
+ content type of the file we're downloading is allowed. If set and
+ it evaluates to False when called with the content type, the
+ request will be terminated before completing the download by
+ raising SynapseError.
Returns:
A tuple of the file length, dict of the response
headers, absolute URI of the response and HTTP response code.
@@ -726,6 +733,17 @@ class SimpleHttpClient:
HTTPStatus.BAD_GATEWAY, "Got error %d" % (response.code,), Codes.UNKNOWN
)
+ if is_allowed_content_type and b"Content-Type" in resp_headers:
+ content_type = resp_headers[b"Content-Type"][0].decode("ascii")
+ if not is_allowed_content_type(content_type):
+ raise SynapseError(
+ HTTPStatus.BAD_GATEWAY,
+ (
+ "Requested file's content type not allowed for this operation: %s"
+ % content_type
+ ),
+ )
+
# TODO: if our Content-Type is HTML or something, just read the first
# N bytes into RAM rather than saving it all to disk only to read it
# straight back in again
diff --git a/synapse/rest/media/v1/preview_url_resource.py b/synapse/rest/media/v1/preview_url_resource.py
index efd84ced8f..8d3d1e54dc 100644
--- a/synapse/rest/media/v1/preview_url_resource.py
+++ b/synapse/rest/media/v1/preview_url_resource.py
@@ -403,6 +403,7 @@ class PreviewUrlResource(DirectServeJsonResource):
output_stream=output_stream,
max_size=self.max_spider_size,
headers={"Accept-Language": self.url_preview_accept_language},
+ is_allowed_content_type=_is_previewable,
)
except SynapseError:
# Pass SynapseErrors through directly, so that the servlet
@@ -761,3 +762,10 @@ def _is_html(content_type: str) -> bool:
def _is_json(content_type: str) -> bool:
return content_type.lower().startswith("application/json")
+
+
+def _is_previewable(content_type: str) -> bool:
+    """Report whether a Content-Type is one we attempt to build a URL preview for,
+    i.e. whether it is accepted by the HTML, media or JSON check."""
+
+    return any(check(content_type) for check in (_is_html, _is_media, _is_json))
diff --git a/tests/rest/media/v1/test_url_preview.py b/tests/rest/media/v1/test_url_preview.py
index 53f6186213..da2c533260 100644
--- a/tests/rest/media/v1/test_url_preview.py
+++ b/tests/rest/media/v1/test_url_preview.py
@@ -243,6 +243,78 @@ class URLPreviewTests(unittest.HomeserverTestCase):
self.assertEqual(channel.code, 200)
self.assertEqual(channel.json_body["og:title"], "\u0434\u043a\u0430")
+    def _assert_content_type_rejected(self, content_type: bytes) -> None:
+        """Request a preview of a URL whose response carries the given
+        Content-Type, and check that the preview is refused with a 502.
+
+        Args:
+            content_type: The raw Content-Type header value that the fake
+                remote server replies with.
+        """
+        self.lookups["matrix.org"] = [(IPv4Address, "10.1.2.3")]
+
+        end_content = b"anything"
+
+        channel = self.make_request(
+            "GET",
+            "preview_url?url=http://matrix.org",
+            shorthand=False,
+            await_result=False,
+        )
+        self.pump()
+
+        # Play the part of the remote server: wire fake transports to the
+        # outbound connection and feed it a complete HTTP response.
+        client = self.reactor.tcpClients[0][2].buildProtocol(None)
+        server = AccumulatingProtocol()
+        server.makeConnection(FakeTransport(client, self.reactor))
+        client.makeConnection(FakeTransport(server, self.reactor))
+        client.dataReceived(
+            (
+                b"HTTP/1.0 200 OK\r\nContent-Length: %d\r\n"
+                b"Content-Type: %s\r\n\r\n"
+            )
+            % (len(end_content), content_type)
+            + end_content
+        )
+
+        self.pump()
+        self.assertEqual(channel.code, 502)
+        self.assertEqual(
+            channel.json_body,
+            {
+                "errcode": "M_UNKNOWN",
+                # The rejected content type is echoed back in the error message.
+                "error": (
+                    "Requested file's content type not allowed for this operation: "
+                    + content_type.decode("ascii")
+                ),
+            },
+        )
+
+    def test_video_rejected(self):
+        """A response with a video Content-Type is refused with a 502."""
+        self._assert_content_type_rejected(b"video/mp4")
+
+    def test_audio_rejected(self):
+        """A response with an audio Content-Type is refused with a 502."""
+        self._assert_content_type_rejected(b"audio/aac")
+
def test_non_ascii_preview_content_type(self):
self.lookups["matrix.org"] = [(IPv4Address, "10.1.2.3")]
|