diff options
-rw-r--r-- | changelog.d/6331.feature | 1 | ||||
-rw-r--r-- | changelog.d/6334.feature | 1 | ||||
-rw-r--r-- | synapse/rest/media/v1/preview_url_resource.py | 21 | ||||
-rw-r--r-- | tests/rest/media/v1/test_url_preview.py | 35 |
4 files changed, 57 insertions, 1 deletions
diff --git a/changelog.d/6331.feature b/changelog.d/6331.feature new file mode 100644 index 0000000000..eaf69ef3f6 --- /dev/null +++ b/changelog.d/6331.feature @@ -0,0 +1 @@ +Limit the length of data returned by url previews, to prevent DoS attacks. diff --git a/changelog.d/6334.feature b/changelog.d/6334.feature new file mode 100644 index 0000000000..eaf69ef3f6 --- /dev/null +++ b/changelog.d/6334.feature @@ -0,0 +1 @@ +Limit the length of data returned by url previews, to prevent DoS attacks. diff --git a/synapse/rest/media/v1/preview_url_resource.py b/synapse/rest/media/v1/preview_url_resource.py index 531d923f76..15c15a12f5 100644 --- a/synapse/rest/media/v1/preview_url_resource.py +++ b/synapse/rest/media/v1/preview_url_resource.py @@ -56,6 +56,9 @@ logger = logging.getLogger(__name__) _charset_match = re.compile(br"<\s*meta[^>]*charset\s*=\s*([a-z0-9-]+)", flags=re.I) _content_type_match = re.compile(r'.*; *charset="?(.*?)"?(;|$)', flags=re.I) +OG_TAG_NAME_MAXLEN = 50 +OG_TAG_VALUE_MAXLEN = 1000 + class PreviewUrlResource(DirectServeResource): isLeaf = True @@ -171,7 +174,7 @@ class PreviewUrlResource(DirectServeResource): ts (int): Returns: - Deferred[str]: json-encoded og data + Deferred[bytes]: json-encoded og data """ # check the URL cache in the DB (which will also provide us with # historical previews, if we have any) @@ -272,6 +275,18 @@ class PreviewUrlResource(DirectServeResource): logger.warning("Failed to find any OG data in %s", url) og = {} + # filter out any stupidly long values + keys_to_remove = [] + for k, v in og.items(): + # values can be numeric as well as strings, hence the cast to str + if len(k) > OG_TAG_NAME_MAXLEN or len(str(v)) > OG_TAG_VALUE_MAXLEN: + logger.warning( + "Pruning overlong tag %s from OG data", k[:OG_TAG_NAME_MAXLEN] + ) + keys_to_remove.append(k) + for k in keys_to_remove: + del og[k] + logger.debug("Calculated OG for %s as %s", url, og) jsonog = json.dumps(og) @@ -506,6 +521,10 @@ def _calc_og(tree, media_uri): og = {} for tag in tree.xpath("//*/meta[starts-with(@property, 'og:')]"): if "content" in tag.attrib: + # if we've got more than 50 tags, someone is taking the piss + if len(og) >= 50: + logger.warning("Skipping OG for page with too many 'og:' tags") + return {} og[tag.attrib["property"]] = tag.attrib["content"] # TODO: grab article: meta tags too, e.g.: diff --git a/tests/rest/media/v1/test_url_preview.py b/tests/rest/media/v1/test_url_preview.py index 976652aee8..852b8ab11c 100644 --- a/tests/rest/media/v1/test_url_preview.py +++ b/tests/rest/media/v1/test_url_preview.py @@ -247,6 +247,41 @@ class URLPreviewTests(unittest.HomeserverTestCase): self.assertEqual(channel.code, 200) self.assertEqual(channel.json_body["og:title"], "\u0434\u043a\u0430") + def test_overlong_title(self): + self.lookups["matrix.org"] = [(IPv4Address, "8.8.8.8")] + + end_content = ( + b"<html><head>" + b"<title>" + b"x" * 2000 + b"</title>" + b'<meta property="og:description" content="hi" />' + b"</head></html>" + ) + + request, channel = self.make_request( + "GET", "url_preview?url=http://matrix.org", shorthand=False + ) + request.render(self.preview_url) + self.pump() + + client = self.reactor.tcpClients[0][2].buildProtocol(None) + server = AccumulatingProtocol() + server.makeConnection(FakeTransport(client, self.reactor)) + client.makeConnection(FakeTransport(server, self.reactor)) + client.dataReceived( + ( + b"HTTP/1.0 200 OK\r\nContent-Length: %d\r\n" + b'Content-Type: text/html; charset="windows-1251"\r\n\r\n' + ) + % (len(end_content),) + + end_content + ) + + self.pump() + self.assertEqual(channel.code, 200) + res = channel.json_body + # We should only see the `og:description` field, as `title` is too long and should be stripped out + self.assertCountEqual(["og:description"], res.keys()) + def test_ipaddr(self): """ IP addresses can be previewed directly. |