From 1b112840d2c6dafa131eba4f0285409bb7345661 Mon Sep 17 00:00:00 2001 From: Patrick Cloke Date: Fri, 8 Oct 2021 14:14:42 -0400 Subject: Autodiscover oEmbed endpoint from returned HTML (#10822) Searches the returned HTML for an oEmbed endpoint using the autodiscovery mechanism (``), and will request it to generate the preview. --- tests/rest/media/v1/test_url_preview.py | 100 +++++++++++++++++++++++++++++++- tests/test_preview.py | 40 ++++++++----- 2 files changed, 123 insertions(+), 17 deletions(-) (limited to 'tests') diff --git a/tests/rest/media/v1/test_url_preview.py b/tests/rest/media/v1/test_url_preview.py index ce43de780b..8698135a76 100644 --- a/tests/rest/media/v1/test_url_preview.py +++ b/tests/rest/media/v1/test_url_preview.py @@ -725,9 +725,107 @@ class URLPreviewTests(unittest.HomeserverTestCase): }, ) + def test_oembed_autodiscovery(self): + """ + Autodiscovery works by finding the link in the HTML response and then requesting an oEmbed URL. + 1. Request a preview of a URL which is not known to the oEmbed code. + 2. It returns HTML including a link to an oEmbed preview. + 3. The oEmbed preview is requested and returns a URL for an image. + 4. The image is requested for thumbnailing. + """ + # This is a little cheesy in that we use the www subdomain (which isn't the + # list of oEmbed patterns) to get "raw" HTML response. + self.lookups["www.twitter.com"] = [(IPv4Address, "10.1.2.3")] + self.lookups["publish.twitter.com"] = [(IPv4Address, "10.1.2.3")] + self.lookups["cdn.twitter.com"] = [(IPv4Address, "10.1.2.3")] + + result = b""" + + """ + + channel = self.make_request( + "GET", + "preview_url?url=http://www.twitter.com/matrixdotorg/status/12345", + shorthand=False, + await_result=False, + ) + self.pump() + + client = self.reactor.tcpClients[0][2].buildProtocol(None) + server = AccumulatingProtocol() + server.makeConnection(FakeTransport(client, self.reactor)) + client.makeConnection(FakeTransport(server, self.reactor)) + client.dataReceived( + ( + b"HTTP/1.0 200 OK\r\nContent-Length: %d\r\n" + b'Content-Type: text/html; charset="utf8"\r\n\r\n' + ) + % (len(result),) + + result + ) + + self.pump() + + # The oEmbed response. + result2 = { + "version": "1.0", + "type": "photo", + "url": "http://cdn.twitter.com/matrixdotorg", + } + oembed_content = json.dumps(result2).encode("utf-8") + + # Ensure a second request is made to the oEmbed URL. + client = self.reactor.tcpClients[1][2].buildProtocol(None) + server = AccumulatingProtocol() + server.makeConnection(FakeTransport(client, self.reactor)) + client.makeConnection(FakeTransport(server, self.reactor)) + client.dataReceived( + ( + b"HTTP/1.0 200 OK\r\nContent-Length: %d\r\n" + b'Content-Type: application/json; charset="utf8"\r\n\r\n' + ) + % (len(oembed_content),) + + oembed_content + ) + + self.pump() + + # Ensure the URL is what was requested. + self.assertIn(b"/oembed?", server.data) + + # Ensure a third request is made to the photo URL. + client = self.reactor.tcpClients[2][2].buildProtocol(None) + server = AccumulatingProtocol() + server.makeConnection(FakeTransport(client, self.reactor)) + client.makeConnection(FakeTransport(server, self.reactor)) + client.dataReceived( + ( + b"HTTP/1.0 200 OK\r\nContent-Length: %d\r\n" + b"Content-Type: image/png\r\n\r\n" + ) + % (len(SMALL_PNG),) + + SMALL_PNG + ) + + self.pump() + + # Ensure the URL is what was requested. + self.assertIn(b"/matrixdotorg", server.data) + + self.assertEqual(channel.code, 200) + body = channel.json_body + self.assertEqual( + body["og:url"], "http://www.twitter.com/matrixdotorg/status/12345" + ) + self.assertTrue(body["og:image"].startswith("mxc://")) + self.assertEqual(body["og:image:height"], 1) + self.assertEqual(body["og:image:width"], 1) + self.assertEqual(body["og:image:type"], "image/png") + def _download_image(self): """Downloads an image into the URL cache. - Returns: A (host, media_id) tuple representing the MXC URI of the image. """ diff --git a/tests/test_preview.py b/tests/test_preview.py index 48e792b55b..09e017b4d9 100644 --- a/tests/test_preview.py +++ b/tests/test_preview.py @@ -13,7 +13,8 @@ # limitations under the License. from synapse.rest.media.v1.preview_url_resource import ( - decode_and_calc_og, + _calc_og, + decode_body, get_html_media_encoding, summarize_paragraphs, ) @@ -158,7 +159,8 @@ class CalcOgTestCase(unittest.TestCase): """ - og = decode_and_calc_og(html, "http://example.com/test.html") + tree = decode_body(html) + og = _calc_og(tree, "http://example.com/test.html") self.assertEqual(og, {"og:title": "Foo", "og:description": "Some text."}) @@ -173,7 +175,8 @@ class CalcOgTestCase(unittest.TestCase): """ - og = decode_and_calc_og(html, "http://example.com/test.html") + tree = decode_body(html) + og = _calc_og(tree, "http://example.com/test.html") self.assertEqual(og, {"og:title": "Foo", "og:description": "Some text."}) @@ -191,7 +194,8 @@ class CalcOgTestCase(unittest.TestCase): """ - og = decode_and_calc_og(html, "http://example.com/test.html") + tree = decode_body(html) + og = _calc_og(tree, "http://example.com/test.html") self.assertEqual( og, @@ -212,7 +216,8 @@ class CalcOgTestCase(unittest.TestCase): """ - og = decode_and_calc_og(html, "http://example.com/test.html") + tree = decode_body(html) + og = _calc_og(tree, "http://example.com/test.html") self.assertEqual(og, {"og:title": "Foo", "og:description": "Some text."}) @@ -225,7 +230,8 @@ class CalcOgTestCase(unittest.TestCase): """ - og = decode_and_calc_og(html, "http://example.com/test.html") + tree = decode_body(html) + og = _calc_og(tree, "http://example.com/test.html") self.assertEqual(og, {"og:title": None, "og:description": "Some text."}) @@ -239,7 +245,8 @@ class CalcOgTestCase(unittest.TestCase): """ - og = decode_and_calc_og(html, "http://example.com/test.html") + tree = decode_body(html) + og = _calc_og(tree, "http://example.com/test.html") self.assertEqual(og, {"og:title": "Title", "og:description": "Some text."}) @@ -253,21 +260,22 @@ class CalcOgTestCase(unittest.TestCase): """ - og = decode_and_calc_og(html, "http://example.com/test.html") + tree = decode_body(html) + og = _calc_og(tree, "http://example.com/test.html") self.assertEqual(og, {"og:title": None, "og:description": "Some text."}) def test_empty(self): """Test a body with no data in it.""" html = b"" - og = decode_and_calc_og(html, "http://example.com/test.html") - self.assertEqual(og, {}) + tree = decode_body(html) + self.assertIsNone(tree) def test_no_tree(self): """A valid body with no tree in it.""" html = b"\x00" - og = decode_and_calc_og(html, "http://example.com/test.html") - self.assertEqual(og, {}) + tree = decode_body(html) + self.assertIsNone(tree) def test_invalid_encoding(self): """An invalid character encoding should be ignored and treated as UTF-8, if possible.""" @@ -279,9 +287,8 @@ class CalcOgTestCase(unittest.TestCase): """ - og = decode_and_calc_og( - html, "http://example.com/test.html", "invalid-encoding" - ) + tree = decode_body(html, "invalid-encoding") + og = _calc_og(tree, "http://example.com/test.html") self.assertEqual(og, {"og:title": "Foo", "og:description": "Some text."}) def test_invalid_encoding2(self): @@ -295,7 +302,8 @@ class CalcOgTestCase(unittest.TestCase): """ - og = decode_and_calc_og(html, "http://example.com/test.html") + tree = decode_body(html) + og = _calc_og(tree, "http://example.com/test.html") self.assertEqual(og, {"og:title": "ÿÿ Foo", "og:description": "Some text."}) -- cgit 1.4.1