diff --git a/changelog.d/6331.feature b/changelog.d/6331.feature
new file mode 100644
index 0000000000..eaf69ef3f6
--- /dev/null
+++ b/changelog.d/6331.feature
@@ -0,0 +1 @@
+Limit the length of data returned by url previews, to prevent DoS attacks.
diff --git a/synapse/rest/media/v1/preview_url_resource.py b/synapse/rest/media/v1/preview_url_resource.py
index 531d923f76..69544b3711 100644
--- a/synapse/rest/media/v1/preview_url_resource.py
+++ b/synapse/rest/media/v1/preview_url_resource.py
@@ -56,6 +56,9 @@ logger = logging.getLogger(__name__)
_charset_match = re.compile(br"<\s*meta[^>]*charset\s*=\s*([a-z0-9-]+)", flags=re.I)
_content_type_match = re.compile(r'.*; *charset="?(.*?)"?(;|$)', flags=re.I)
+OG_TAG_NAME_MAXLEN = 50
+OG_TAG_VALUE_MAXLEN = 1000
+
class PreviewUrlResource(DirectServeResource):
isLeaf = True
@@ -171,7 +174,7 @@ class PreviewUrlResource(DirectServeResource):
ts (int):
Returns:
- Deferred[str]: json-encoded og data
+ Deferred[bytes]: json-encoded og data
"""
# check the URL cache in the DB (which will also provide us with
# historical previews, if we have any)
@@ -272,6 +275,17 @@ class PreviewUrlResource(DirectServeResource):
logger.warning("Failed to find any OG data in %s", url)
og = {}
+ # filter out any stupidly long values
+ keys_to_remove = []
+ for k, v in og.items():
+ if len(k) > OG_TAG_NAME_MAXLEN or len(v) > OG_TAG_VALUE_MAXLEN:
+ logger.warning(
+ "Pruning overlong tag %s from OG data", k[:OG_TAG_NAME_MAXLEN]
+ )
+ keys_to_remove.append(k)
+ for k in keys_to_remove:
+ del og[k]
+
logger.debug("Calculated OG for %s as %s", url, og)
jsonog = json.dumps(og)
@@ -506,6 +520,10 @@ def _calc_og(tree, media_uri):
og = {}
for tag in tree.xpath("//*/meta[starts-with(@property, 'og:')]"):
if "content" in tag.attrib:
+ # if we've got more than 50 tags, someone is taking the piss
+ if len(og) >= 50:
+ logger.warning("Skipping OG for page with too many 'og:' tags")
+ return {}
og[tag.attrib["property"]] = tag.attrib["content"]
# TODO: grab article: meta tags too, e.g.:
diff --git a/tests/rest/media/v1/test_url_preview.py b/tests/rest/media/v1/test_url_preview.py
index 976652aee8..852b8ab11c 100644
--- a/tests/rest/media/v1/test_url_preview.py
+++ b/tests/rest/media/v1/test_url_preview.py
@@ -247,6 +247,41 @@ class URLPreviewTests(unittest.HomeserverTestCase):
self.assertEqual(channel.code, 200)
self.assertEqual(channel.json_body["og:title"], "\u0434\u043a\u0430")
+ def test_overlong_title(self):
+ self.lookups["matrix.org"] = [(IPv4Address, "8.8.8.8")]
+
+ end_content = (
+ b"<html><head>"
+ b"<title>" + b"x" * 2000 + b"</title>"
+ b'<meta property="og:description" content="hi" />'
+ b"</head></html>"
+ )
+
+ request, channel = self.make_request(
+ "GET", "url_preview?url=http://matrix.org", shorthand=False
+ )
+ request.render(self.preview_url)
+ self.pump()
+
+ client = self.reactor.tcpClients[0][2].buildProtocol(None)
+ server = AccumulatingProtocol()
+ server.makeConnection(FakeTransport(client, self.reactor))
+ client.makeConnection(FakeTransport(server, self.reactor))
+ client.dataReceived(
+ (
+ b"HTTP/1.0 200 OK\r\nContent-Length: %d\r\n"
+ b'Content-Type: text/html; charset="windows-1251"\r\n\r\n'
+ )
+ % (len(end_content),)
+ + end_content
+ )
+
+ self.pump()
+ self.assertEqual(channel.code, 200)
+ res = channel.json_body
+ # We should only see the `og:description` field, as `title` is too long and should be stripped out
+ self.assertCountEqual(["og:description"], res.keys())
+
def test_ipaddr(self):
"""
IP addresses can be previewed directly.
|