From 7e91107be1a4287873266e588a3c5b415279f4c8 Mon Sep 17 00:00:00 2001
From: Dirk Klimpel <5740567+dklimpel@users.noreply.github.com>
Date: Thu, 3 Mar 2022 17:05:44 +0100
Subject: Add type hints to `tests/rest` (#12146)
* Add type hints to `tests/rest`
* newsfile
* change import from `SigningKey`
---
tests/rest/media/v1/test_html_preview.py | 54 ++++++++++++++++----------------
1 file changed, 27 insertions(+), 27 deletions(-)
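
The pattern applied throughout this patch is mechanical: unittest test methods return nothing, so each one gains a literal `-> None` return annotation, which lets mypy type-check the method bodies (by default it skips the bodies of untyped functions). A minimal, self-contained sketch of the shape of the change, using generic names rather than code from the patch:

import unittest


class ExampleTestCase(unittest.TestCase):
    # Before: def test_addition(self):
    # After, per this patch's convention, the return type is spelled out.
    def test_addition(self) -> None:
        self.assertEqual(1 + 1, 2)
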
diff --git a/tests/rest/media/v1/test_html_preview.py b/tests/rest/media/v1/test_html_preview.py
index a4b57e3d1f..3fb37a2a59 100644
--- a/tests/rest/media/v1/test_html_preview.py
+++ b/tests/rest/media/v1/test_html_preview.py
@@ -32,7 +32,7 @@ class SummarizeTestCase(unittest.TestCase):
if not lxml:
skip = "url preview feature requires lxml"
- def test_long_summarize(self):
+ def test_long_summarize(self) -> None:
example_paras = [
"""Tromsø (Norwegian pronunciation: [ˈtrʊmsœ] ( listen); Northern Sami:
Romsa; Finnish: Tromssa[2] Kven: Tromssa) is a city and municipality in
@@ -90,7 +90,7 @@ class SummarizeTestCase(unittest.TestCase):
" Tromsøya had a population of 36,088. Substantial parts of the urban…",
)
- def test_short_summarize(self):
+ def test_short_summarize(self) -> None:
example_paras = [
"Tromsø (Norwegian pronunciation: [ˈtrʊmsœ] ( listen); Northern Sami:"
" Romsa; Finnish: Tromssa[2] Kven: Tromssa) is a city and municipality in"
@@ -117,7 +117,7 @@ class SummarizeTestCase(unittest.TestCase):
" most of the year.",
)
- def test_small_then_large_summarize(self):
+ def test_small_then_large_summarize(self) -> None:
example_paras = [
"Tromsø (Norwegian pronunciation: [ˈtrʊmsœ] ( listen); Northern Sami:"
" Romsa; Finnish: Tromssa[2] Kven: Tromssa) is a city and municipality in"
@@ -150,7 +150,7 @@ class CalcOgTestCase(unittest.TestCase):
if not lxml:
skip = "url preview feature requires lxml"
- def test_simple(self):
+ def test_simple(self) -> None:
html = b"""
Foo
@@ -165,7 +165,7 @@ class CalcOgTestCase(unittest.TestCase):
self.assertEqual(og, {"og:title": "Foo", "og:description": "Some text."})
- def test_comment(self):
+ def test_comment(self) -> None:
html = b"""
Foo
@@ -181,7 +181,7 @@ class CalcOgTestCase(unittest.TestCase):
self.assertEqual(og, {"og:title": "Foo", "og:description": "Some text."})
- def test_comment2(self):
+ def test_comment2(self) -> None:
html = b"""
Foo
@@ -206,7 +206,7 @@ class CalcOgTestCase(unittest.TestCase):
},
)
- def test_script(self):
+ def test_script(self) -> None:
html = b"""
Foo
@@ -222,7 +222,7 @@ class CalcOgTestCase(unittest.TestCase):
self.assertEqual(og, {"og:title": "Foo", "og:description": "Some text."})
- def test_missing_title(self):
+ def test_missing_title(self) -> None:
html = b"""
@@ -236,7 +236,7 @@ class CalcOgTestCase(unittest.TestCase):
self.assertEqual(og, {"og:title": None, "og:description": "Some text."})
- def test_h1_as_title(self):
+ def test_h1_as_title(self) -> None:
html = b"""
@@ -251,7 +251,7 @@ class CalcOgTestCase(unittest.TestCase):
self.assertEqual(og, {"og:title": "Title", "og:description": "Some text."})
- def test_missing_title_and_broken_h1(self):
+ def test_missing_title_and_broken_h1(self) -> None:
html = b"""
@@ -266,19 +266,19 @@ class CalcOgTestCase(unittest.TestCase):
self.assertEqual(og, {"og:title": None, "og:description": "Some text."})
- def test_empty(self):
+ def test_empty(self) -> None:
"""Test a body with no data in it."""
html = b""
tree = decode_body(html, "http://example.com/test.html")
self.assertIsNone(tree)
- def test_no_tree(self):
+ def test_no_tree(self) -> None:
"""A valid body with no tree in it."""
html = b"\x00"
tree = decode_body(html, "http://example.com/test.html")
self.assertIsNone(tree)
- def test_xml(self):
+ def test_xml(self) -> None:
"""Test decoding XML and ensure it works properly."""
# Note that the strip() call is important to ensure the xml tag starts
# at the initial byte.
@@ -293,7 +293,7 @@ class CalcOgTestCase(unittest.TestCase):
og = parse_html_to_open_graph(tree, "http://example.com/test.html")
self.assertEqual(og, {"og:title": "Foo", "og:description": "Some text."})
- def test_invalid_encoding(self):
+ def test_invalid_encoding(self) -> None:
"""An invalid character encoding should be ignored and treated as UTF-8, if possible."""
html = b"""
@@ -307,7 +307,7 @@ class CalcOgTestCase(unittest.TestCase):
og = parse_html_to_open_graph(tree, "http://example.com/test.html")
self.assertEqual(og, {"og:title": "Foo", "og:description": "Some text."})
- def test_invalid_encoding2(self):
+ def test_invalid_encoding2(self) -> None:
"""A body which doesn't match the sent character encoding."""
# Note that this contains an invalid UTF-8 sequence in the title.
html = b"""
@@ -322,7 +322,7 @@ class CalcOgTestCase(unittest.TestCase):
og = parse_html_to_open_graph(tree, "http://example.com/test.html")
self.assertEqual(og, {"og:title": "ÿÿ Foo", "og:description": "Some text."})
- def test_windows_1252(self):
+ def test_windows_1252(self) -> None:
"""A body which uses cp1252, but doesn't declare that."""
html = b"""
@@ -338,7 +338,7 @@ class CalcOgTestCase(unittest.TestCase):
class MediaEncodingTestCase(unittest.TestCase):
- def test_meta_charset(self):
+ def test_meta_charset(self) -> None:
"""A character encoding is found via the meta tag."""
encodings = _get_html_media_encodings(
b"""
@@ -363,7 +363,7 @@ class MediaEncodingTestCase(unittest.TestCase):
)
self.assertEqual(list(encodings), ["ascii", "utf-8", "cp1252"])
- def test_meta_charset_underscores(self):
+ def test_meta_charset_underscores(self) -> None:
"""A character encoding contains underscore."""
encodings = _get_html_media_encodings(
b"""
@@ -376,7 +376,7 @@ class MediaEncodingTestCase(unittest.TestCase):
)
self.assertEqual(list(encodings), ["shift_jis", "utf-8", "cp1252"])
- def test_xml_encoding(self):
+ def test_xml_encoding(self) -> None:
"""A character encoding is found via the meta tag."""
encodings = _get_html_media_encodings(
b"""
@@ -388,7 +388,7 @@ class MediaEncodingTestCase(unittest.TestCase):
)
self.assertEqual(list(encodings), ["ascii", "utf-8", "cp1252"])
- def test_meta_xml_encoding(self):
+ def test_meta_xml_encoding(self) -> None:
"""Meta tags take precedence over XML encoding."""
encodings = _get_html_media_encodings(
b"""
@@ -402,7 +402,7 @@ class MediaEncodingTestCase(unittest.TestCase):
)
self.assertEqual(list(encodings), ["utf-16", "ascii", "utf-8", "cp1252"])
- def test_content_type(self):
+ def test_content_type(self) -> None:
"""A character encoding is found via the Content-Type header."""
# Test a few variations of the header.
headers = (
@@ -417,12 +417,12 @@ class MediaEncodingTestCase(unittest.TestCase):
encodings = _get_html_media_encodings(b"", header)
self.assertEqual(list(encodings), ["ascii", "utf-8", "cp1252"])
- def test_fallback(self):
+ def test_fallback(self) -> None:
"""A character encoding cannot be found in the body or header."""
encodings = _get_html_media_encodings(b"", "text/html")
self.assertEqual(list(encodings), ["utf-8", "cp1252"])
- def test_duplicates(self):
+ def test_duplicates(self) -> None:
"""Ensure each encoding is only attempted once."""
encodings = _get_html_media_encodings(
b"""
@@ -436,7 +436,7 @@ class MediaEncodingTestCase(unittest.TestCase):
)
self.assertEqual(list(encodings), ["utf-8", "cp1252"])
- def test_unknown_invalid(self):
+ def test_unknown_invalid(self) -> None:
"""A character encoding should be ignored if it is unknown or invalid."""
encodings = _get_html_media_encodings(
b"""
@@ -451,7 +451,7 @@ class MediaEncodingTestCase(unittest.TestCase):
class RebaseUrlTestCase(unittest.TestCase):
- def test_relative(self):
+ def test_relative(self) -> None:
"""Relative URLs should be resolved based on the context of the base URL."""
self.assertEqual(
rebase_url("subpage", "https://example.com/foo/"),
@@ -466,14 +466,14 @@ class RebaseUrlTestCase(unittest.TestCase):
"https://example.com/bar",
)
- def test_absolute(self):
+ def test_absolute(self) -> None:
"""Absolute URLs should not be modified."""
self.assertEqual(
rebase_url("https://alice.com/a/", "https://example.com/foo/"),
"https://alice.com/a/",
)
- def test_data(self):
+ def test_data(self) -> None:
"""Data URLs should not be modified."""
self.assertEqual(
rebase_url("data:,Hello%2C%20World%21", "https://example.com/foo/"),
From 4587b35929d22731644a11120a9e7d6a9c3bc304 Mon Sep 17 00:00:00 2001
From: Patrick Cloke
Date: Wed, 16 Mar 2022 07:21:36 -0400
Subject: Clean-up logic for rebasing URLs during URL preview. (#12219)
By using urljoin from the standard library and reducing the number
of places URLs are rebased.
---
changelog.d/12219.misc | 1 +
synapse/rest/media/v1/preview_html.py | 39 +------------------
synapse/rest/media/v1/preview_url_resource.py | 23 ++++++------
tests/rest/media/v1/test_html_preview.py | 54 ++++++---------------------
4 files changed, 26 insertions(+), 91 deletions(-)
create mode 100644 changelog.d/12219.misc
diff --git a/changelog.d/12219.misc b/changelog.d/12219.misc
new file mode 100644
index 0000000000..6079414092
--- /dev/null
+++ b/changelog.d/12219.misc
@@ -0,0 +1 @@
+Clean-up logic around rebasing URLs for URL image previews.
diff --git a/synapse/rest/media/v1/preview_html.py b/synapse/rest/media/v1/preview_html.py
index 872a9e72e8..4cc9c66fbe 100644
--- a/synapse/rest/media/v1/preview_html.py
+++ b/synapse/rest/media/v1/preview_html.py
@@ -16,7 +16,6 @@ import itertools
import logging
import re
from typing import TYPE_CHECKING, Dict, Generator, Iterable, Optional, Set, Union
-from urllib import parse as urlparse
if TYPE_CHECKING:
from lxml import etree
@@ -144,9 +143,7 @@ def decode_body(
return etree.fromstring(body, parser)
-def parse_html_to_open_graph(
- tree: "etree.Element", media_uri: str
-) -> Dict[str, Optional[str]]:
+def parse_html_to_open_graph(tree: "etree.Element") -> Dict[str, Optional[str]]:
"""
Parse the HTML document into an Open Graph response.
@@ -155,7 +152,6 @@ def parse_html_to_open_graph(
Args:
tree: The parsed HTML document.
- media_url: The URI used to download the body.
Returns:
The Open Graph response as a dictionary.
@@ -209,7 +205,7 @@ def parse_html_to_open_graph(
"//*/meta[translate(@itemprop, 'IMAGE', 'image')='image']/@content"
)
if meta_image:
- og["og:image"] = rebase_url(meta_image[0], media_uri)
+ og["og:image"] = meta_image[0]
else:
# TODO: consider inlined CSS styles as well as width & height attribs
images = tree.xpath("//img[@src][number(@width)>10][number(@height)>10]")
@@ -320,37 +316,6 @@ def _iterate_over_text(
)
-def rebase_url(url: str, base: str) -> str:
- """
- Resolves a potentially relative `url` against an absolute `base` URL.
-
- For example:
-
- >>> rebase_url("subpage", "https://example.com/foo/")
- 'https://example.com/foo/subpage'
- >>> rebase_url("sibling", "https://example.com/foo")
- 'https://example.com/sibling'
- >>> rebase_url("/bar", "https://example.com/foo/")
- 'https://example.com/bar'
- >>> rebase_url("https://alice.com/a/", "https://example.com/foo/")
- 'https://alice.com/a'
- """
- base_parts = urlparse.urlparse(base)
- # Convert the parsed URL to a list for (potential) modification.
- url_parts = list(urlparse.urlparse(url))
- # Add a scheme, if one does not exist.
- if not url_parts[0]:
- url_parts[0] = base_parts.scheme or "http"
- # Fix up the hostname, if this is not a data URL.
- if url_parts[0] != "data" and not url_parts[1]:
- url_parts[1] = base_parts.netloc
- # If the path does not start with a /, nest it under the base path's last
- # directory.
- if not url_parts[2].startswith("/"):
- url_parts[2] = re.sub(r"/[^/]+$", "/", base_parts.path) + url_parts[2]
- return urlparse.urlunparse(url_parts)
-
-
def summarize_paragraphs(
text_nodes: Iterable[str], min_size: int = 200, max_size: int = 500
) -> Optional[str]:
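
The helper removed above duplicated what `urllib.parse.urljoin` already provides. A quick sanity check of the cases listed in the removed docstring, using the standard-library replacement (not part of the patch; note the reversed argument order, urljoin(base, url) versus rebase_url(url, base)):

from urllib.parse import urljoin

assert urljoin("https://example.com/foo/", "subpage") == "https://example.com/foo/subpage"
assert urljoin("https://example.com/foo", "sibling") == "https://example.com/sibling"
assert urljoin("https://example.com/foo/", "/bar") == "https://example.com/bar"
# Absolute URLs come back unchanged; the old docstring dropped the trailing
# slash here, but rebase_url's own tests (removed later in this patch) kept it.
assert urljoin("https://example.com/foo/", "https://alice.com/a/") == "https://alice.com/a/"
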
diff --git a/synapse/rest/media/v1/preview_url_resource.py b/synapse/rest/media/v1/preview_url_resource.py
index 14ea88b240..d47af8ead6 100644
--- a/synapse/rest/media/v1/preview_url_resource.py
+++ b/synapse/rest/media/v1/preview_url_resource.py
@@ -22,7 +22,7 @@ import shutil
import sys
import traceback
from typing import TYPE_CHECKING, BinaryIO, Iterable, Optional, Tuple
-from urllib import parse as urlparse
+from urllib.parse import urljoin, urlparse, urlsplit
from urllib.request import urlopen
import attr
@@ -44,11 +44,7 @@ from synapse.metrics.background_process_metrics import run_as_background_process
from synapse.rest.media.v1._base import get_filename_from_headers
from synapse.rest.media.v1.media_storage import MediaStorage
from synapse.rest.media.v1.oembed import OEmbedProvider
-from synapse.rest.media.v1.preview_html import (
- decode_body,
- parse_html_to_open_graph,
- rebase_url,
-)
+from synapse.rest.media.v1.preview_html import decode_body, parse_html_to_open_graph
from synapse.types import JsonDict, UserID
from synapse.util import json_encoder
from synapse.util.async_helpers import ObservableDeferred
@@ -187,7 +183,7 @@ class PreviewUrlResource(DirectServeJsonResource):
ts = self.clock.time_msec()
# XXX: we could move this into _do_preview if we wanted.
- url_tuple = urlparse.urlsplit(url)
+ url_tuple = urlsplit(url)
for entry in self.url_preview_url_blacklist:
match = True
for attrib in entry:
@@ -322,7 +318,7 @@ class PreviewUrlResource(DirectServeJsonResource):
# Parse Open Graph information from the HTML in case the oEmbed
# response failed or is incomplete.
- og_from_html = parse_html_to_open_graph(tree, media_info.uri)
+ og_from_html = parse_html_to_open_graph(tree)
# Compile the Open Graph response by using the scraped
# information from the HTML and overlaying any information
@@ -588,12 +584,17 @@ class PreviewUrlResource(DirectServeJsonResource):
if "og:image" not in og or not og["og:image"]:
return
+ # The image URL from the HTML might be relative to the previewed page;
+ # convert it to a URL which can be requested directly.
+ image_url = og["og:image"]
+ url_parts = urlparse(image_url)
+ if url_parts.scheme != "data":
+ image_url = urljoin(media_info.uri, image_url)
+
# FIXME: it might be cleaner to use the same flow as the main /preview_url
# request itself and benefit from the same caching etc. But for now we
# just rely on the caching on the master request to speed things up.
- image_info = await self._handle_url(
- rebase_url(og["og:image"], media_info.uri), user, allow_data_urls=True
- )
+ image_info = await self._handle_url(image_url, user, allow_data_urls=True)
if _is_media(image_info.media_type):
# TODO: make sure we don't choke on white-on-transparent images
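
Taken together, the hunk above resolves a possibly-relative og:image URL against the previewed page before fetching it, while leaving data: URLs untouched. A stripped-down sketch of that flow (hypothetical helper name and example URLs, not code from the patch):

from urllib.parse import urljoin, urlparse


def resolve_og_image(page_url: str, og_image: str) -> str:
    # data: URLs are self-contained, so they are not rebased.
    if urlparse(og_image).scheme == "data":
        return og_image
    # urljoin takes the base first: urljoin(base, url).
    return urljoin(page_url, og_image)


assert resolve_og_image("https://example.com/article/", "img/cover.png") == "https://example.com/article/img/cover.png"
assert resolve_og_image("https://example.com/article/", "data:,Hello%2C%20World%21") == "data:,Hello%2C%20World%21"
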
diff --git a/tests/rest/media/v1/test_html_preview.py b/tests/rest/media/v1/test_html_preview.py
index 3fb37a2a59..62e308814d 100644
--- a/tests/rest/media/v1/test_html_preview.py
+++ b/tests/rest/media/v1/test_html_preview.py
@@ -16,7 +16,6 @@ from synapse.rest.media.v1.preview_html import (
_get_html_media_encodings,
decode_body,
parse_html_to_open_graph,
- rebase_url,
summarize_paragraphs,
)
@@ -161,7 +160,7 @@ class CalcOgTestCase(unittest.TestCase):
"""
tree = decode_body(html, "http://example.com/test.html")
- og = parse_html_to_open_graph(tree, "http://example.com/test.html")
+ og = parse_html_to_open_graph(tree)
self.assertEqual(og, {"og:title": "Foo", "og:description": "Some text."})
@@ -177,7 +176,7 @@ class CalcOgTestCase(unittest.TestCase):
"""
tree = decode_body(html, "http://example.com/test.html")
- og = parse_html_to_open_graph(tree, "http://example.com/test.html")
+ og = parse_html_to_open_graph(tree)
self.assertEqual(og, {"og:title": "Foo", "og:description": "Some text."})
@@ -196,7 +195,7 @@ class CalcOgTestCase(unittest.TestCase):
"""
tree = decode_body(html, "http://example.com/test.html")
- og = parse_html_to_open_graph(tree, "http://example.com/test.html")
+ og = parse_html_to_open_graph(tree)
self.assertEqual(
og,
@@ -218,7 +217,7 @@ class CalcOgTestCase(unittest.TestCase):
"""
tree = decode_body(html, "http://example.com/test.html")
- og = parse_html_to_open_graph(tree, "http://example.com/test.html")
+ og = parse_html_to_open_graph(tree)
self.assertEqual(og, {"og:title": "Foo", "og:description": "Some text."})
@@ -232,7 +231,7 @@ class CalcOgTestCase(unittest.TestCase):
"""
tree = decode_body(html, "http://example.com/test.html")
- og = parse_html_to_open_graph(tree, "http://example.com/test.html")
+ og = parse_html_to_open_graph(tree)
self.assertEqual(og, {"og:title": None, "og:description": "Some text."})
@@ -247,7 +246,7 @@ class CalcOgTestCase(unittest.TestCase):
"""
tree = decode_body(html, "http://example.com/test.html")
- og = parse_html_to_open_graph(tree, "http://example.com/test.html")
+ og = parse_html_to_open_graph(tree)
self.assertEqual(og, {"og:title": "Title", "og:description": "Some text."})
@@ -262,7 +261,7 @@ class CalcOgTestCase(unittest.TestCase):
"""
tree = decode_body(html, "http://example.com/test.html")
- og = parse_html_to_open_graph(tree, "http://example.com/test.html")
+ og = parse_html_to_open_graph(tree)
self.assertEqual(og, {"og:title": None, "og:description": "Some text."})
@@ -290,7 +289,7 @@ class CalcOgTestCase(unittest.TestCase):
FooSome text.
""".strip()
tree = decode_body(html, "http://example.com/test.html")
- og = parse_html_to_open_graph(tree, "http://example.com/test.html")
+ og = parse_html_to_open_graph(tree)
self.assertEqual(og, {"og:title": "Foo", "og:description": "Some text."})
def test_invalid_encoding(self) -> None:
@@ -304,7 +303,7 @@ class CalcOgTestCase(unittest.TestCase):
"""
tree = decode_body(html, "http://example.com/test.html", "invalid-encoding")
- og = parse_html_to_open_graph(tree, "http://example.com/test.html")
+ og = parse_html_to_open_graph(tree)
self.assertEqual(og, {"og:title": "Foo", "og:description": "Some text."})
def test_invalid_encoding2(self) -> None:
@@ -319,7 +318,7 @@ class CalcOgTestCase(unittest.TestCase):
"""
tree = decode_body(html, "http://example.com/test.html")
- og = parse_html_to_open_graph(tree, "http://example.com/test.html")
+ og = parse_html_to_open_graph(tree)
self.assertEqual(og, {"og:title": "ÿÿ Foo", "og:description": "Some text."})
def test_windows_1252(self) -> None:
@@ -333,7 +332,7 @@ class CalcOgTestCase(unittest.TestCase):
"""
tree = decode_body(html, "http://example.com/test.html")
- og = parse_html_to_open_graph(tree, "http://example.com/test.html")
+ og = parse_html_to_open_graph(tree)
self.assertEqual(og, {"og:title": "ó", "og:description": "Some text."})
@@ -448,34 +447,3 @@ class MediaEncodingTestCase(unittest.TestCase):
'text/html; charset="invalid"',
)
self.assertEqual(list(encodings), ["utf-8", "cp1252"])
-
-
-class RebaseUrlTestCase(unittest.TestCase):
- def test_relative(self) -> None:
- """Relative URLs should be resolved based on the context of the base URL."""
- self.assertEqual(
- rebase_url("subpage", "https://example.com/foo/"),
- "https://example.com/foo/subpage",
- )
- self.assertEqual(
- rebase_url("sibling", "https://example.com/foo"),
- "https://example.com/sibling",
- )
- self.assertEqual(
- rebase_url("/bar", "https://example.com/foo/"),
- "https://example.com/bar",
- )
-
- def test_absolute(self) -> None:
- """Absolute URLs should not be modified."""
- self.assertEqual(
- rebase_url("https://alice.com/a/", "https://example.com/foo/"),
- "https://alice.com/a/",
- )
-
- def test_data(self) -> None:
- """Data URLs should not be modified."""
- self.assertEqual(
- rebase_url("data:,Hello%2C%20World%21", "https://example.com/foo/"),
- "data:,Hello%2C%20World%21",
- )