diff options
-rw-r--r-- | synapse/rest/media/v1/oembed.py | 20 | ||||
-rw-r--r-- | synapse/rest/media/v1/preview_html.py | 32 | ||||
-rw-r--r-- | synapse/rest/media/v1/preview_url_resource.py | 10 | ||||
-rw-r--r-- | tests/rest/media/v1/test_html_preview.py | 52 |
4 files changed, 57 insertions, 57 deletions
diff --git a/synapse/rest/media/v1/oembed.py b/synapse/rest/media/v1/oembed.py index c402d5433e..eb28a91a78 100644 --- a/synapse/rest/media/v1/oembed.py +++ b/synapse/rest/media/v1/oembed.py @@ -97,19 +97,19 @@ class OEmbedProvider: # No match. return None - def autodiscover_from_html(self, tree: "BeautifulSoup") -> Optional[str]: + def autodiscover_from_html(self, soup: "BeautifulSoup") -> Optional[str]: """ Search an HTML document for oEmbed autodiscovery information. Args: - tree: The parsed HTML body. + soup: The parsed HTML body. Returns: The URL to use for oEmbed information, or None if no URL was found. """ # Search for link elements with the proper rel and type attributes. # Some providers (e.g. Flickr) use alternative instead of alternate. - for tag in tree.find_all( + for tag in soup.find_all( "link", rel=("alternate", "alternative"), type="application/json+oembed", @@ -198,8 +198,8 @@ class OEmbedProvider: return OEmbedResult(open_graph_response, author_name, cache_age) -def _fetch_urls(tree: "BeautifulSoup", tag_name: str) -> List[str]: - return [tag["src"] for tag in tree.find_all(tag_name, src=True)] +def _fetch_urls(soup: "BeautifulSoup", tag_name: str) -> List[str]: + return [tag["src"] for tag in soup.find_all(tag_name, src=True)] def calc_description_and_urls( @@ -216,22 +216,22 @@ def calc_description_and_urls( html_body: The HTML document, as bytes. url: The URL which is being previewed (not the one which was requested). """ - tree = decode_body(html_body, url) + soup = decode_body(html_body, url) # If there's no body, nothing useful is going to be found. - if not tree: + if not soup: return # Attempt to find interesting URLs (images, videos, embeds). if "og:image" not in open_graph_response: - image_urls = _fetch_urls(tree, "img") + image_urls = _fetch_urls(soup, "img") if image_urls: open_graph_response["og:image"] = image_urls[0] - video_urls = _fetch_urls(tree, "video") + _fetch_urls(tree, "embed") + video_urls = _fetch_urls(soup, "video") + _fetch_urls(soup, "embed") if video_urls: open_graph_response["og:video"] = video_urls[0] - description = parse_html_description(tree) + description = parse_html_description(soup) if description: open_graph_response["og:description"] = description diff --git a/synapse/rest/media/v1/preview_html.py b/synapse/rest/media/v1/preview_html.py index 4dc9be5124..9ddc30c63f 100644 --- a/synapse/rest/media/v1/preview_html.py +++ b/synapse/rest/media/v1/preview_html.py @@ -55,14 +55,14 @@ def decode_body(body: Union[bytes, str], uri: str) -> Optional["BeautifulSoup"]: return None -def parse_html_to_open_graph(tree: "BeautifulSoup") -> Dict[str, Optional[str]]: +def parse_html_to_open_graph(soup: "BeautifulSoup") -> Dict[str, Optional[str]]: """ Calculate metadata for an HTML document. This uses BeautifulSoup to search the HTML document for Open Graph data. Args: - tree: The parsed HTML document. + soup: The parsed HTML document. Returns: The Open Graph response as a dictionary. @@ -85,7 +85,7 @@ def parse_html_to_open_graph(tree: "BeautifulSoup") -> Dict[str, Optional[str]]: # "og:video:secure_url": "https://www.youtube.com/v/LXDBoHyjmtw?version=3", og: Dict[str, Optional[str]] = {} - for tag in tree.find_all("meta", property=re.compile(r"^og:"), content=True): + for tag in soup.find_all("meta", property=re.compile(r"^og:"), content=True): # if we've got more than 50 tags, someone is taking the piss if len(og) >= 50: logger.warning("Skipping OG for page with too many 'og:' tags") @@ -103,7 +103,7 @@ def parse_html_to_open_graph(tree: "BeautifulSoup") -> Dict[str, Optional[str]]: if "og:title" not in og: # do some basic spidering of the HTML - title = tree.find(("title", "h1", "h2", "h3")) + title = soup.find(("title", "h1", "h2", "h3")) if title and title.string: og["og:title"] = title.string.strip() else: @@ -111,7 +111,7 @@ def parse_html_to_open_graph(tree: "BeautifulSoup") -> Dict[str, Optional[str]]: if "og:image" not in og: # TODO: extract a favicon failing all else - meta_image = tree.find("meta", image="image") + meta_image = soup.find("meta", image="image") if meta_image: og["og:image"] = meta_image["content"] else: @@ -124,22 +124,22 @@ def parse_html_to_open_graph(tree: "BeautifulSoup") -> Dict[str, Optional[str]]: except ValueError: return False - images = tree.find_all("img", src=True, width=greater_than) + images = soup.find_all("img", src=True, width=greater_than) images = sorted( images, key=lambda i: (-1 * float(i["width"]) * float(i["height"])), ) if not images: - images = tree.find_all("img", src=True) + images = soup.find_all("img", src=True) if images: og["og:image"] = images[0]["src"] if "og:description" not in og: - meta_description = tree.find("meta", description="description") + meta_description = soup.find("meta", description="description") if meta_description: og["og:description"] = meta_description["content"] else: - og["og:description"] = parse_html_description(tree) + og["og:description"] = parse_html_description(soup) elif og["og:description"]: # This must be a non-empty string at this point. assert isinstance(og["og:description"], str) @@ -150,7 +150,7 @@ def parse_html_to_open_graph(tree: "BeautifulSoup") -> Dict[str, Optional[str]]: return og -def parse_html_description(tree: "BeautifulSoup") -> Optional[str]: +def parse_html_description(soup: "BeautifulSoup") -> Optional[str]: """ Calculate a text description based on an HTML document. @@ -161,7 +161,7 @@ def parse_html_description(tree: "BeautifulSoup") -> Optional[str]: This is a very very very coarse approximation to a plain text render of the page. Args: - tree: The parsed HTML document. + soup: The parsed HTML document. Returns: The plain text description, or None if one cannot be generated. @@ -181,18 +181,18 @@ def parse_html_description(tree: "BeautifulSoup") -> Optional[str]: # lines) text_nodes = ( re.sub(r"\s+", "\n", el).strip() - for el in _iterate_over_text(tree.find("body"), *TAGS_TO_REMOVE) + for el in _iterate_over_text(soup.find("body"), *TAGS_TO_REMOVE) ) return summarize_paragraphs(text_nodes) def _iterate_over_text( - tree: Optional["Tag"], *tags_to_ignore: Iterable[str] + soup: Optional["Tag"], *tags_to_ignore: Iterable[str] ) -> Generator[str, None, None]: - """Iterate over the tree returning text nodes in a depth first fashion, + """Iterate over the document returning text nodes in a depth first fashion, skipping text nodes inside certain tags. """ - if not tree: + if not soup: return from bs4.element import NavigableString, Tag @@ -200,7 +200,7 @@ def _iterate_over_text( # This is basically a stack that we extend using itertools.chain. # This will either consist of an element to iterate over *or* a string # to be returned. - elements: Iterator["PageElement"] = iter([tree]) + elements: Iterator["PageElement"] = iter([soup]) while True: el = next(elements, None) if el is None: diff --git a/synapse/rest/media/v1/preview_url_resource.py b/synapse/rest/media/v1/preview_url_resource.py index e406f6c704..76ba36c2ea 100644 --- a/synapse/rest/media/v1/preview_url_resource.py +++ b/synapse/rest/media/v1/preview_url_resource.py @@ -298,16 +298,16 @@ class PreviewUrlResource(DirectServeJsonResource): # define our OG response for this media elif _is_html(media_info.media_type): - # TODO: somehow stop a big HTML tree from exploding synapse's RAM + # TODO: somehow stop a big HTML document from exploding synapse's RAM with open(media_info.filename, "rb") as file: body = file.read() - tree = decode_body(body, media_info.uri) - if tree is not None: + soup = decode_body(body, media_info.uri) + if soup is not None: # Check if this HTML document points to oEmbed information and # defer to that. - oembed_url = self._oembed.autodiscover_from_html(tree) + oembed_url = self._oembed.autodiscover_from_html(soup) og_from_oembed: JsonDict = {} if oembed_url: oembed_info = await self._handle_url( @@ -323,7 +323,7 @@ class PreviewUrlResource(DirectServeJsonResource): # Parse Open Graph information from the HTML in case the oEmbed # response failed or is incomplete. - og_from_html = parse_html_to_open_graph(tree) + og_from_html = parse_html_to_open_graph(soup) # Compile the Open Graph response by using the scraped # information from the HTML and overlaying any information diff --git a/tests/rest/media/v1/test_html_preview.py b/tests/rest/media/v1/test_html_preview.py index 5e600450dc..82b7d317c9 100644 --- a/tests/rest/media/v1/test_html_preview.py +++ b/tests/rest/media/v1/test_html_preview.py @@ -158,8 +158,8 @@ class CalcOgTestCase(unittest.TestCase): </html> """ - tree = decode_body(html, "http://example.com/test.html") - og = parse_html_to_open_graph(tree) + soup = decode_body(html, "http://example.com/test.html") + og = parse_html_to_open_graph(soup) self.assertEqual(og, {"og:title": "Foo", "og:description": "Some text."}) @@ -174,8 +174,8 @@ class CalcOgTestCase(unittest.TestCase): </html> """ - tree = decode_body(html, "http://example.com/test.html") - og = parse_html_to_open_graph(tree) + soup = decode_body(html, "http://example.com/test.html") + og = parse_html_to_open_graph(soup) self.assertEqual(og, {"og:title": "Foo", "og:description": "Some text."}) @@ -193,8 +193,8 @@ class CalcOgTestCase(unittest.TestCase): </html> """ - tree = decode_body(html, "http://example.com/test.html") - og = parse_html_to_open_graph(tree) + soup = decode_body(html, "http://example.com/test.html") + og = parse_html_to_open_graph(soup) self.assertEqual( og, @@ -215,8 +215,8 @@ class CalcOgTestCase(unittest.TestCase): </html> """ - tree = decode_body(html, "http://example.com/test.html") - og = parse_html_to_open_graph(tree) + soup = decode_body(html, "http://example.com/test.html") + og = parse_html_to_open_graph(soup) self.assertEqual(og, {"og:title": "Foo", "og:description": "Some text."}) @@ -229,8 +229,8 @@ class CalcOgTestCase(unittest.TestCase): </html> """ - tree = decode_body(html, "http://example.com/test.html") - og = parse_html_to_open_graph(tree) + soup = decode_body(html, "http://example.com/test.html") + og = parse_html_to_open_graph(soup) self.assertEqual(og, {"og:title": None, "og:description": "Some text."}) @@ -244,8 +244,8 @@ class CalcOgTestCase(unittest.TestCase): </html> """ - tree = decode_body(html, "http://example.com/test.html") - og = parse_html_to_open_graph(tree) + soup = decode_body(html, "http://example.com/test.html") + og = parse_html_to_open_graph(soup) self.assertEqual(og, {"og:title": "Title", "og:description": "Some text."}) @@ -259,22 +259,22 @@ class CalcOgTestCase(unittest.TestCase): </html> """ - tree = decode_body(html, "http://example.com/test.html") - og = parse_html_to_open_graph(tree) + soup = decode_body(html, "http://example.com/test.html") + og = parse_html_to_open_graph(soup) self.assertEqual(og, {"og:title": None, "og:description": "Some text."}) def test_empty(self) -> None: """Test a body with no data in it.""" html = b"" - tree = decode_body(html, "http://example.com/test.html") - self.assertIsNone(tree) + soup = decode_body(html, "http://example.com/test.html") + self.assertIsNone(soup) - def test_no_tree(self) -> None: - """A valid body with no tree in it.""" + def test_no_soup(self): + """A valid body with no soup in it.""" html = b"\x00" - tree = decode_body(html, "http://example.com/test.html") - self.assertIsNone(tree) + soup = decode_body(html, "http://example.com/test.html") + self.assertIsNone(soup) def test_xml(self) -> None: """Test decoding XML and ensure it works properly.""" @@ -287,8 +287,8 @@ class CalcOgTestCase(unittest.TestCase): <html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en"> <head><title>Foo</title></head><body>Some text.</body></html> """.strip() - tree = decode_body(html, "http://example.com/test.html") - og = parse_html_to_open_graph(tree) + soup = decode_body(html, "http://example.com/test.html") + og = parse_html_to_open_graph(soup) self.assertEqual(og, {"og:title": "Foo", "og:description": "Some text."}) def test_invalid_encoding2(self) -> None: @@ -302,8 +302,8 @@ class CalcOgTestCase(unittest.TestCase): </body> </html> """ - tree = decode_body(html, "http://example.com/test.html") - og = parse_html_to_open_graph(tree) + soup = decode_body(html, "http://example.com/test.html") + og = parse_html_to_open_graph(soup) self.assertEqual(og, {"og:title": "ÿÿ Foo", "og:description": "Some text."}) def test_windows_1252(self) -> None: @@ -316,6 +316,6 @@ class CalcOgTestCase(unittest.TestCase): </body> </html> """ - tree = decode_body(html, "http://example.com/test.html") - og = parse_html_to_open_graph(tree) + soup = decode_body(html, "http://example.com/test.html") + og = parse_html_to_open_graph(soup) self.assertEqual(og, {"og:title": "ó", "og:description": "Some text."}) |