From 807efd26aec9b65c6a2f02d10fd139095a5b3387 Mon Sep 17 00:00:00 2001 From: Patrick Cloke Date: Mon, 24 Jan 2022 08:58:18 -0500 Subject: Support rendering previews with data: URLs in them (#11767) Images which are data URLs will no longer break URL previews and will properly be "downloaded" and thumbnailed. --- tests/rest/media/v1/test_html_preview.py | 481 +++++++++++++++++++++++++++++++ tests/rest/media/v1/test_url_preview.py | 81 +++++- 2 files changed, 554 insertions(+), 8 deletions(-) create mode 100644 tests/rest/media/v1/test_html_preview.py (limited to 'tests/rest') diff --git a/tests/rest/media/v1/test_html_preview.py b/tests/rest/media/v1/test_html_preview.py new file mode 100644 index 0000000000..a4b57e3d1f --- /dev/null +++ b/tests/rest/media/v1/test_html_preview.py @@ -0,0 +1,481 @@ +# Copyright 2014-2016 OpenMarket Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from synapse.rest.media.v1.preview_html import ( + _get_html_media_encodings, + decode_body, + parse_html_to_open_graph, + rebase_url, + summarize_paragraphs, +) + +from tests import unittest + +try: + import lxml +except ImportError: + lxml = None + + +class SummarizeTestCase(unittest.TestCase): + if not lxml: + skip = "url preview feature requires lxml" + + def test_long_summarize(self): + example_paras = [ + """Tromsø (Norwegian pronunciation: [ˈtrʊmsœ] ( listen); Northern Sami: + Romsa; Finnish: Tromssa[2] Kven: Tromssa) is a city and municipality in + Troms county, Norway. The administrative centre of the municipality is + the city of Tromsø. Outside of Norway, Tromso and Tromsö are + alternative spellings of the city.Tromsø is considered the northernmost + city in the world with a population above 50,000. The most populous town + north of it is Alta, Norway, with a population of 14,272 (2013).""", + """Tromsø lies in Northern Norway. The municipality has a population of + (2015) 72,066, but with an annual influx of students it has over 75,000 + most of the year. It is the largest urban area in Northern Norway and the + third largest north of the Arctic Circle (following Murmansk and Norilsk). + Most of Tromsø, including the city centre, is located on the island of + Tromsøya, 350 kilometres (217 mi) north of the Arctic Circle. In 2012, + Tromsøya had a population of 36,088. Substantial parts of the urban area + are also situated on the mainland to the east, and on parts of Kvaløya—a + large island to the west. Tromsøya is connected to the mainland by the Tromsø + Bridge and the Tromsøysund Tunnel, and to the island of Kvaløya by the + Sandnessund Bridge. Tromsø Airport connects the city to many destinations + in Europe. The city is warmer than most other places located on the same + latitude, due to the warming effect of the Gulf Stream.""", + """The city centre of Tromsø contains the highest number of old wooden + houses in Northern Norway, the oldest house dating from 1789. The Arctic + Cathedral, a modern church from 1965, is probably the most famous landmark + in Tromsø. The city is a cultural centre for its region, with several + festivals taking place in the summer. Some of Norway's best-known + musicians, Torbjørn Brundtland and Svein Berge of the electronica duo + Röyksopp and Lene Marlin grew up and started their careers in Tromsø. + Noted electronic musician Geir Jenssen also hails from Tromsø.""", + ] + + desc = summarize_paragraphs(example_paras, min_size=200, max_size=500) + + self.assertEqual( + desc, + "Tromsø (Norwegian pronunciation: [ˈtrʊmsœ] ( listen); Northern Sami:" + " Romsa; Finnish: Tromssa[2] Kven: Tromssa) is a city and municipality in" + " Troms county, Norway. The administrative centre of the municipality is" + " the city of Tromsø. Outside of Norway, Tromso and Tromsö are" + " alternative spellings of the city.Tromsø is considered the northernmost" + " city in the world with a population above 50,000. The most populous town" + " north of it is Alta, Norway, with a population of 14,272 (2013).", + ) + + desc = summarize_paragraphs(example_paras[1:], min_size=200, max_size=500) + + self.assertEqual( + desc, + "Tromsø lies in Northern Norway. The municipality has a population of" + " (2015) 72,066, but with an annual influx of students it has over 75,000" + " most of the year. It is the largest urban area in Northern Norway and the" + " third largest north of the Arctic Circle (following Murmansk and Norilsk)." + " Most of Tromsø, including the city centre, is located on the island of" + " Tromsøya, 350 kilometres (217 mi) north of the Arctic Circle. In 2012," + " Tromsøya had a population of 36,088. Substantial parts of the urban…", + ) + + def test_short_summarize(self): + example_paras = [ + "Tromsø (Norwegian pronunciation: [ˈtrʊmsœ] ( listen); Northern Sami:" + " Romsa; Finnish: Tromssa[2] Kven: Tromssa) is a city and municipality in" + " Troms county, Norway.", + "Tromsø lies in Northern Norway. The municipality has a population of" + " (2015) 72,066, but with an annual influx of students it has over 75,000" + " most of the year.", + "The city centre of Tromsø contains the highest number of old wooden" + " houses in Northern Norway, the oldest house dating from 1789. The Arctic" + " Cathedral, a modern church from 1965, is probably the most famous landmark" + " in Tromsø.", + ] + + desc = summarize_paragraphs(example_paras, min_size=200, max_size=500) + + self.assertEqual( + desc, + "Tromsø (Norwegian pronunciation: [ˈtrʊmsœ] ( listen); Northern Sami:" + " Romsa; Finnish: Tromssa[2] Kven: Tromssa) is a city and municipality in" + " Troms county, Norway.\n" + "\n" + "Tromsø lies in Northern Norway. The municipality has a population of" + " (2015) 72,066, but with an annual influx of students it has over 75,000" + " most of the year.", + ) + + def test_small_then_large_summarize(self): + example_paras = [ + "Tromsø (Norwegian pronunciation: [ˈtrʊmsœ] ( listen); Northern Sami:" + " Romsa; Finnish: Tromssa[2] Kven: Tromssa) is a city and municipality in" + " Troms county, Norway.", + "Tromsø lies in Northern Norway. The municipality has a population of" + " (2015) 72,066, but with an annual influx of students it has over 75,000" + " most of the year." + " The city centre of Tromsø contains the highest number of old wooden" + " houses in Northern Norway, the oldest house dating from 1789. The Arctic" + " Cathedral, a modern church from 1965, is probably the most famous landmark" + " in Tromsø.", + ] + + desc = summarize_paragraphs(example_paras, min_size=200, max_size=500) + self.assertEqual( + desc, + "Tromsø (Norwegian pronunciation: [ˈtrʊmsœ] ( listen); Northern Sami:" + " Romsa; Finnish: Tromssa[2] Kven: Tromssa) is a city and municipality in" + " Troms county, Norway.\n" + "\n" + "Tromsø lies in Northern Norway. The municipality has a population of" + " (2015) 72,066, but with an annual influx of students it has over 75,000" + " most of the year. The city centre of Tromsø contains the highest number" + " of old wooden houses in Northern Norway, the oldest house dating from" + " 1789. The Arctic Cathedral, a modern church from…", + ) + + +class CalcOgTestCase(unittest.TestCase): + if not lxml: + skip = "url preview feature requires lxml" + + def test_simple(self): + html = b""" + + Foo + + Some text. + + + """ + + tree = decode_body(html, "http://example.com/test.html") + og = parse_html_to_open_graph(tree, "http://example.com/test.html") + + self.assertEqual(og, {"og:title": "Foo", "og:description": "Some text."}) + + def test_comment(self): + html = b""" + + Foo + + + Some text. + + + """ + + tree = decode_body(html, "http://example.com/test.html") + og = parse_html_to_open_graph(tree, "http://example.com/test.html") + + self.assertEqual(og, {"og:title": "Foo", "og:description": "Some text."}) + + def test_comment2(self): + html = b""" + + Foo + + Some text. + + Some more text. +

Text

+ More text + + + """ + + tree = decode_body(html, "http://example.com/test.html") + og = parse_html_to_open_graph(tree, "http://example.com/test.html") + + self.assertEqual( + og, + { + "og:title": "Foo", + "og:description": "Some text.\n\nSome more text.\n\nText\n\nMore text", + }, + ) + + def test_script(self): + html = b""" + + Foo + + + Some text. + + + """ + + tree = decode_body(html, "http://example.com/test.html") + og = parse_html_to_open_graph(tree, "http://example.com/test.html") + + self.assertEqual(og, {"og:title": "Foo", "og:description": "Some text."}) + + def test_missing_title(self): + html = b""" + + + Some text. + + + """ + + tree = decode_body(html, "http://example.com/test.html") + og = parse_html_to_open_graph(tree, "http://example.com/test.html") + + self.assertEqual(og, {"og:title": None, "og:description": "Some text."}) + + def test_h1_as_title(self): + html = b""" + + + +

Title

+ + + """ + + tree = decode_body(html, "http://example.com/test.html") + og = parse_html_to_open_graph(tree, "http://example.com/test.html") + + self.assertEqual(og, {"og:title": "Title", "og:description": "Some text."}) + + def test_missing_title_and_broken_h1(self): + html = b""" + + +

+ Some text. + + + """ + + tree = decode_body(html, "http://example.com/test.html") + og = parse_html_to_open_graph(tree, "http://example.com/test.html") + + self.assertEqual(og, {"og:title": None, "og:description": "Some text."}) + + def test_empty(self): + """Test a body with no data in it.""" + html = b"" + tree = decode_body(html, "http://example.com/test.html") + self.assertIsNone(tree) + + def test_no_tree(self): + """A valid body with no tree in it.""" + html = b"\x00" + tree = decode_body(html, "http://example.com/test.html") + self.assertIsNone(tree) + + def test_xml(self): + """Test decoding XML and ensure it works properly.""" + # Note that the strip() call is important to ensure the xml tag starts + # at the initial byte. + html = b""" + + + + + FooSome text. + """.strip() + tree = decode_body(html, "http://example.com/test.html") + og = parse_html_to_open_graph(tree, "http://example.com/test.html") + self.assertEqual(og, {"og:title": "Foo", "og:description": "Some text."}) + + def test_invalid_encoding(self): + """An invalid character encoding should be ignored and treated as UTF-8, if possible.""" + html = b""" + + Foo + + Some text. + + + """ + tree = decode_body(html, "http://example.com/test.html", "invalid-encoding") + og = parse_html_to_open_graph(tree, "http://example.com/test.html") + self.assertEqual(og, {"og:title": "Foo", "og:description": "Some text."}) + + def test_invalid_encoding2(self): + """A body which doesn't match the sent character encoding.""" + # Note that this contains an invalid UTF-8 sequence in the title. + html = b""" + + \xff\xff Foo + + Some text. + + + """ + tree = decode_body(html, "http://example.com/test.html") + og = parse_html_to_open_graph(tree, "http://example.com/test.html") + self.assertEqual(og, {"og:title": "ÿÿ Foo", "og:description": "Some text."}) + + def test_windows_1252(self): + """A body which uses cp1252, but doesn't declare that.""" + html = b""" + + \xf3 + + Some text. + + + """ + tree = decode_body(html, "http://example.com/test.html") + og = parse_html_to_open_graph(tree, "http://example.com/test.html") + self.assertEqual(og, {"og:title": "ó", "og:description": "Some text."}) + + +class MediaEncodingTestCase(unittest.TestCase): + def test_meta_charset(self): + """A character encoding is found via the meta tag.""" + encodings = _get_html_media_encodings( + b""" + + + + + """, + "text/html", + ) + self.assertEqual(list(encodings), ["ascii", "utf-8", "cp1252"]) + + # A less well-formed version. + encodings = _get_html_media_encodings( + b""" + + < meta charset = ascii> + + + """, + "text/html", + ) + self.assertEqual(list(encodings), ["ascii", "utf-8", "cp1252"]) + + def test_meta_charset_underscores(self): + """A character encoding contains underscore.""" + encodings = _get_html_media_encodings( + b""" + + + + + """, + "text/html", + ) + self.assertEqual(list(encodings), ["shift_jis", "utf-8", "cp1252"]) + + def test_xml_encoding(self): + """A character encoding is found via the meta tag.""" + encodings = _get_html_media_encodings( + b""" + + + + """, + "text/html", + ) + self.assertEqual(list(encodings), ["ascii", "utf-8", "cp1252"]) + + def test_meta_xml_encoding(self): + """Meta tags take precedence over XML encoding.""" + encodings = _get_html_media_encodings( + b""" + + + + + + """, + "text/html", + ) + self.assertEqual(list(encodings), ["utf-16", "ascii", "utf-8", "cp1252"]) + + def test_content_type(self): + """A character encoding is found via the Content-Type header.""" + # Test a few variations of the header. + headers = ( + 'text/html; charset="ascii";', + "text/html;charset=ascii;", + 'text/html; charset="ascii"', + "text/html; charset=ascii", + 'text/html; charset="ascii;', + 'text/html; charset=ascii";', + ) + for header in headers: + encodings = _get_html_media_encodings(b"", header) + self.assertEqual(list(encodings), ["ascii", "utf-8", "cp1252"]) + + def test_fallback(self): + """A character encoding cannot be found in the body or header.""" + encodings = _get_html_media_encodings(b"", "text/html") + self.assertEqual(list(encodings), ["utf-8", "cp1252"]) + + def test_duplicates(self): + """Ensure each encoding is only attempted once.""" + encodings = _get_html_media_encodings( + b""" + + + + + + """, + 'text/html; charset="UTF_8"', + ) + self.assertEqual(list(encodings), ["utf-8", "cp1252"]) + + def test_unknown_invalid(self): + """A character encoding should be ignored if it is unknown or invalid.""" + encodings = _get_html_media_encodings( + b""" + + + + + """, + 'text/html; charset="invalid"', + ) + self.assertEqual(list(encodings), ["utf-8", "cp1252"]) + + +class RebaseUrlTestCase(unittest.TestCase): + def test_relative(self): + """Relative URLs should be resolved based on the context of the base URL.""" + self.assertEqual( + rebase_url("subpage", "https://example.com/foo/"), + "https://example.com/foo/subpage", + ) + self.assertEqual( + rebase_url("sibling", "https://example.com/foo"), + "https://example.com/sibling", + ) + self.assertEqual( + rebase_url("/bar", "https://example.com/foo/"), + "https://example.com/bar", + ) + + def test_absolute(self): + """Absolute URLs should not be modified.""" + self.assertEqual( + rebase_url("https://alice.com/a/", "https://example.com/foo/"), + "https://alice.com/a/", + ) + + def test_data(self): + """Data URLs should not be modified.""" + self.assertEqual( + rebase_url("data:,Hello%2C%20World%21", "https://example.com/foo/"), + "data:,Hello%2C%20World%21", + ) diff --git a/tests/rest/media/v1/test_url_preview.py b/tests/rest/media/v1/test_url_preview.py index 16e904f15b..53f6186213 100644 --- a/tests/rest/media/v1/test_url_preview.py +++ b/tests/rest/media/v1/test_url_preview.py @@ -12,9 +12,11 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +import base64 import json import os import re +from urllib.parse import urlencode from twisted.internet._resolver import HostResolution from twisted.internet.address import IPv4Address, IPv6Address @@ -23,6 +25,7 @@ from twisted.test.proto_helpers import AccumulatingProtocol from synapse.config.oembed import OEmbedEndpointConfig from synapse.rest.media.v1.preview_url_resource import IMAGE_CACHE_EXPIRY_MS +from synapse.types import JsonDict from synapse.util.stringutils import parse_and_validate_mxc_uri from tests import unittest @@ -142,6 +145,14 @@ class URLPreviewTests(unittest.HomeserverTestCase): def create_test_resource(self): return self.hs.get_media_repository_resource() + def _assert_small_png(self, json_body: JsonDict) -> None: + """Assert properties from the SMALL_PNG test image.""" + self.assertTrue(json_body["og:image"].startswith("mxc://")) + self.assertEqual(json_body["og:image:height"], 1) + self.assertEqual(json_body["og:image:width"], 1) + self.assertEqual(json_body["og:image:type"], "image/png") + self.assertEqual(json_body["matrix:image:size"], 67) + def test_cache_returns_correct_type(self): self.lookups["matrix.org"] = [(IPv4Address, "10.1.2.3")] @@ -569,6 +580,66 @@ class URLPreviewTests(unittest.HomeserverTestCase): server.data, ) + def test_data_url(self): + """ + Requesting to preview a data URL is not supported. + """ + self.lookups["matrix.org"] = [(IPv4Address, "10.1.2.3")] + + data = base64.b64encode(SMALL_PNG).decode() + + query_params = urlencode( + { + "url": f'' + } + ) + + channel = self.make_request( + "GET", + f"preview_url?{query_params}", + shorthand=False, + ) + self.pump() + + self.assertEqual(channel.code, 500) + + def test_inline_data_url(self): + """ + An inline image (as a data URL) should be parsed properly. + """ + self.lookups["matrix.org"] = [(IPv4Address, "10.1.2.3")] + + data = base64.b64encode(SMALL_PNG) + + end_content = ( + b"" b'' b"" + ) % (data,) + + channel = self.make_request( + "GET", + "preview_url?url=http://matrix.org", + shorthand=False, + await_result=False, + ) + self.pump() + + client = self.reactor.tcpClients[0][2].buildProtocol(None) + server = AccumulatingProtocol() + server.makeConnection(FakeTransport(client, self.reactor)) + client.makeConnection(FakeTransport(server, self.reactor)) + client.dataReceived( + ( + b"HTTP/1.0 200 OK\r\nContent-Length: %d\r\n" + b'Content-Type: text/html; charset="utf8"\r\n\r\n' + ) + % (len(end_content),) + + end_content + ) + + self.pump() + self.assertEqual(channel.code, 200) + self._assert_small_png(channel.json_body) + def test_oembed_photo(self): """Test an oEmbed endpoint which returns a 'photo' type which redirects the preview to a new URL.""" self.lookups["publish.twitter.com"] = [(IPv4Address, "10.1.2.3")] @@ -626,10 +697,7 @@ class URLPreviewTests(unittest.HomeserverTestCase): self.assertEqual(channel.code, 200) body = channel.json_body self.assertEqual(body["og:url"], "http://twitter.com/matrixdotorg/status/12345") - self.assertTrue(body["og:image"].startswith("mxc://")) - self.assertEqual(body["og:image:height"], 1) - self.assertEqual(body["og:image:width"], 1) - self.assertEqual(body["og:image:type"], "image/png") + self._assert_small_png(body) def test_oembed_rich(self): """Test an oEmbed endpoint which returns HTML content via the 'rich' type.""" @@ -820,10 +888,7 @@ class URLPreviewTests(unittest.HomeserverTestCase): self.assertEqual( body["og:url"], "http://www.twitter.com/matrixdotorg/status/12345" ) - self.assertTrue(body["og:image"].startswith("mxc://")) - self.assertEqual(body["og:image:height"], 1) - self.assertEqual(body["og:image:width"], 1) - self.assertEqual(body["og:image:type"], "image/png") + self._assert_small_png(body) def _download_image(self): """Downloads an image into the URL cache. -- cgit 1.4.1