diff --git a/synapse/rest/media/v1/oembed.py b/synapse/rest/media/v1/oembed.py
index c402d5433e..eb28a91a78 100644
--- a/synapse/rest/media/v1/oembed.py
+++ b/synapse/rest/media/v1/oembed.py
@@ -97,19 +97,19 @@ class OEmbedProvider:
# No match.
return None
- def autodiscover_from_html(self, tree: "BeautifulSoup") -> Optional[str]:
+ def autodiscover_from_html(self, soup: "BeautifulSoup") -> Optional[str]:
"""
Search an HTML document for oEmbed autodiscovery information.
Args:
- tree: The parsed HTML body.
+ soup: The parsed HTML body.
Returns:
The URL to use for oEmbed information, or None if no URL was found.
"""
# Search for link elements with the proper rel and type attributes.
# Some providers (e.g. Flickr) use alternative instead of alternate.
- for tag in tree.find_all(
+ for tag in soup.find_all(
"link",
rel=("alternate", "alternative"),
type="application/json+oembed",
@@ -198,8 +198,8 @@ class OEmbedProvider:
return OEmbedResult(open_graph_response, author_name, cache_age)
-def _fetch_urls(tree: "BeautifulSoup", tag_name: str) -> List[str]:
- return [tag["src"] for tag in tree.find_all(tag_name, src=True)]
+def _fetch_urls(soup: "BeautifulSoup", tag_name: str) -> List[str]:
+ return [tag["src"] for tag in soup.find_all(tag_name, src=True)]
def calc_description_and_urls(
@@ -216,22 +216,22 @@ def calc_description_and_urls(
html_body: The HTML document, as bytes.
url: The URL which is being previewed (not the one which was requested).
"""
- tree = decode_body(html_body, url)
+ soup = decode_body(html_body, url)
# If there's no body, nothing useful is going to be found.
- if not tree:
+ if not soup:
return
# Attempt to find interesting URLs (images, videos, embeds).
if "og:image" not in open_graph_response:
- image_urls = _fetch_urls(tree, "img")
+ image_urls = _fetch_urls(soup, "img")
if image_urls:
open_graph_response["og:image"] = image_urls[0]
- video_urls = _fetch_urls(tree, "video") + _fetch_urls(tree, "embed")
+ video_urls = _fetch_urls(soup, "video") + _fetch_urls(soup, "embed")
if video_urls:
open_graph_response["og:video"] = video_urls[0]
- description = parse_html_description(tree)
+ description = parse_html_description(soup)
if description:
open_graph_response["og:description"] = description
diff --git a/synapse/rest/media/v1/preview_html.py b/synapse/rest/media/v1/preview_html.py
index 4dc9be5124..9ddc30c63f 100644
--- a/synapse/rest/media/v1/preview_html.py
+++ b/synapse/rest/media/v1/preview_html.py
@@ -55,14 +55,14 @@ def decode_body(body: Union[bytes, str], uri: str) -> Optional["BeautifulSoup"]:
return None
-def parse_html_to_open_graph(tree: "BeautifulSoup") -> Dict[str, Optional[str]]:
+def parse_html_to_open_graph(soup: "BeautifulSoup") -> Dict[str, Optional[str]]:
"""
Calculate metadata for an HTML document.
This uses BeautifulSoup to search the HTML document for Open Graph data.
Args:
- tree: The parsed HTML document.
+ soup: The parsed HTML document.
Returns:
The Open Graph response as a dictionary.
@@ -85,7 +85,7 @@ def parse_html_to_open_graph(tree: "BeautifulSoup") -> Dict[str, Optional[str]]:
# "og:video:secure_url": "https://www.youtube.com/v/LXDBoHyjmtw?version=3",
og: Dict[str, Optional[str]] = {}
- for tag in tree.find_all("meta", property=re.compile(r"^og:"), content=True):
+ for tag in soup.find_all("meta", property=re.compile(r"^og:"), content=True):
# if we've got more than 50 tags, someone is taking the piss
if len(og) >= 50:
logger.warning("Skipping OG for page with too many 'og:' tags")
@@ -103,7 +103,7 @@ def parse_html_to_open_graph(tree: "BeautifulSoup") -> Dict[str, Optional[str]]:
if "og:title" not in og:
# do some basic spidering of the HTML
- title = tree.find(("title", "h1", "h2", "h3"))
+ title = soup.find(("title", "h1", "h2", "h3"))
if title and title.string:
og["og:title"] = title.string.strip()
else:
@@ -111,7 +111,7 @@ def parse_html_to_open_graph(tree: "BeautifulSoup") -> Dict[str, Optional[str]]:
if "og:image" not in og:
# TODO: extract a favicon failing all else
- meta_image = tree.find("meta", image="image")
+ meta_image = soup.find("meta", image="image")
if meta_image:
og["og:image"] = meta_image["content"]
else:
@@ -124,22 +124,22 @@ def parse_html_to_open_graph(tree: "BeautifulSoup") -> Dict[str, Optional[str]]:
except ValueError:
return False
- images = tree.find_all("img", src=True, width=greater_than)
+ images = soup.find_all("img", src=True, width=greater_than)
images = sorted(
images,
key=lambda i: (-1 * float(i["width"]) * float(i["height"])),
)
if not images:
- images = tree.find_all("img", src=True)
+ images = soup.find_all("img", src=True)
if images:
og["og:image"] = images[0]["src"]
if "og:description" not in og:
- meta_description = tree.find("meta", description="description")
+ meta_description = soup.find("meta", description="description")
if meta_description:
og["og:description"] = meta_description["content"]
else:
- og["og:description"] = parse_html_description(tree)
+ og["og:description"] = parse_html_description(soup)
elif og["og:description"]:
# This must be a non-empty string at this point.
assert isinstance(og["og:description"], str)
@@ -150,7 +150,7 @@ def parse_html_to_open_graph(tree: "BeautifulSoup") -> Dict[str, Optional[str]]:
return og
-def parse_html_description(tree: "BeautifulSoup") -> Optional[str]:
+def parse_html_description(soup: "BeautifulSoup") -> Optional[str]:
"""
Calculate a text description based on an HTML document.
@@ -161,7 +161,7 @@ def parse_html_description(tree: "BeautifulSoup") -> Optional[str]:
This is a very very very coarse approximation to a plain text render of the page.
Args:
- tree: The parsed HTML document.
+ soup: The parsed HTML document.
Returns:
The plain text description, or None if one cannot be generated.
@@ -181,18 +181,18 @@ def parse_html_description(tree: "BeautifulSoup") -> Optional[str]:
# lines)
text_nodes = (
re.sub(r"\s+", "\n", el).strip()
- for el in _iterate_over_text(tree.find("body"), *TAGS_TO_REMOVE)
+ for el in _iterate_over_text(soup.find("body"), *TAGS_TO_REMOVE)
)
return summarize_paragraphs(text_nodes)
def _iterate_over_text(
- tree: Optional["Tag"], *tags_to_ignore: Iterable[str]
+ soup: Optional["Tag"], *tags_to_ignore: Iterable[str]
) -> Generator[str, None, None]:
- """Iterate over the tree returning text nodes in a depth first fashion,
+ """Iterate over the document returning text nodes in a depth first fashion,
skipping text nodes inside certain tags.
"""
- if not tree:
+ if not soup:
return
from bs4.element import NavigableString, Tag
@@ -200,7 +200,7 @@ def _iterate_over_text(
# This is basically a stack that we extend using itertools.chain.
# This will either consist of an element to iterate over *or* a string
# to be returned.
- elements: Iterator["PageElement"] = iter([tree])
+ elements: Iterator["PageElement"] = iter([soup])
while True:
el = next(elements, None)
if el is None:
diff --git a/synapse/rest/media/v1/preview_url_resource.py b/synapse/rest/media/v1/preview_url_resource.py
index e406f6c704..76ba36c2ea 100644
--- a/synapse/rest/media/v1/preview_url_resource.py
+++ b/synapse/rest/media/v1/preview_url_resource.py
@@ -298,16 +298,16 @@ class PreviewUrlResource(DirectServeJsonResource):
# define our OG response for this media
elif _is_html(media_info.media_type):
- # TODO: somehow stop a big HTML tree from exploding synapse's RAM
+ # TODO: somehow stop a big HTML document from exploding synapse's RAM
with open(media_info.filename, "rb") as file:
body = file.read()
- tree = decode_body(body, media_info.uri)
- if tree is not None:
+ soup = decode_body(body, media_info.uri)
+ if soup is not None:
# Check if this HTML document points to oEmbed information and
# defer to that.
- oembed_url = self._oembed.autodiscover_from_html(tree)
+ oembed_url = self._oembed.autodiscover_from_html(soup)
og_from_oembed: JsonDict = {}
if oembed_url:
oembed_info = await self._handle_url(
@@ -323,7 +323,7 @@ class PreviewUrlResource(DirectServeJsonResource):
# Parse Open Graph information from the HTML in case the oEmbed
# response failed or is incomplete.
- og_from_html = parse_html_to_open_graph(tree)
+ og_from_html = parse_html_to_open_graph(soup)
# Compile the Open Graph response by using the scraped
# information from the HTML and overlaying any information
diff --git a/tests/rest/media/v1/test_html_preview.py b/tests/rest/media/v1/test_html_preview.py
index 5e600450dc..82b7d317c9 100644
--- a/tests/rest/media/v1/test_html_preview.py
+++ b/tests/rest/media/v1/test_html_preview.py
@@ -158,8 +158,8 @@ class CalcOgTestCase(unittest.TestCase):
</html>
"""
- tree = decode_body(html, "http://example.com/test.html")
- og = parse_html_to_open_graph(tree)
+ soup = decode_body(html, "http://example.com/test.html")
+ og = parse_html_to_open_graph(soup)
self.assertEqual(og, {"og:title": "Foo", "og:description": "Some text."})
@@ -174,8 +174,8 @@ class CalcOgTestCase(unittest.TestCase):
</html>
"""
- tree = decode_body(html, "http://example.com/test.html")
- og = parse_html_to_open_graph(tree)
+ soup = decode_body(html, "http://example.com/test.html")
+ og = parse_html_to_open_graph(soup)
self.assertEqual(og, {"og:title": "Foo", "og:description": "Some text."})
@@ -193,8 +193,8 @@ class CalcOgTestCase(unittest.TestCase):
</html>
"""
- tree = decode_body(html, "http://example.com/test.html")
- og = parse_html_to_open_graph(tree)
+ soup = decode_body(html, "http://example.com/test.html")
+ og = parse_html_to_open_graph(soup)
self.assertEqual(
og,
@@ -215,8 +215,8 @@ class CalcOgTestCase(unittest.TestCase):
</html>
"""
- tree = decode_body(html, "http://example.com/test.html")
- og = parse_html_to_open_graph(tree)
+ soup = decode_body(html, "http://example.com/test.html")
+ og = parse_html_to_open_graph(soup)
self.assertEqual(og, {"og:title": "Foo", "og:description": "Some text."})
@@ -229,8 +229,8 @@ class CalcOgTestCase(unittest.TestCase):
</html>
"""
- tree = decode_body(html, "http://example.com/test.html")
- og = parse_html_to_open_graph(tree)
+ soup = decode_body(html, "http://example.com/test.html")
+ og = parse_html_to_open_graph(soup)
self.assertEqual(og, {"og:title": None, "og:description": "Some text."})
@@ -244,8 +244,8 @@ class CalcOgTestCase(unittest.TestCase):
</html>
"""
- tree = decode_body(html, "http://example.com/test.html")
- og = parse_html_to_open_graph(tree)
+ soup = decode_body(html, "http://example.com/test.html")
+ og = parse_html_to_open_graph(soup)
self.assertEqual(og, {"og:title": "Title", "og:description": "Some text."})
@@ -259,22 +259,22 @@ class CalcOgTestCase(unittest.TestCase):
</html>
"""
- tree = decode_body(html, "http://example.com/test.html")
- og = parse_html_to_open_graph(tree)
+ soup = decode_body(html, "http://example.com/test.html")
+ og = parse_html_to_open_graph(soup)
self.assertEqual(og, {"og:title": None, "og:description": "Some text."})
def test_empty(self) -> None:
"""Test a body with no data in it."""
html = b""
- tree = decode_body(html, "http://example.com/test.html")
- self.assertIsNone(tree)
+ soup = decode_body(html, "http://example.com/test.html")
+ self.assertIsNone(soup)
- def test_no_tree(self) -> None:
- """A valid body with no tree in it."""
+ def test_no_soup(self) -> None:
+ """A valid body from which no document can be parsed."""
html = b"\x00"
- tree = decode_body(html, "http://example.com/test.html")
- self.assertIsNone(tree)
+ soup = decode_body(html, "http://example.com/test.html")
+ self.assertIsNone(soup)
def test_xml(self) -> None:
"""Test decoding XML and ensure it works properly."""
@@ -287,8 +287,8 @@ class CalcOgTestCase(unittest.TestCase):
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
<head><title>Foo</title></head><body>Some text.</body></html>
""".strip()
- tree = decode_body(html, "http://example.com/test.html")
- og = parse_html_to_open_graph(tree)
+ soup = decode_body(html, "http://example.com/test.html")
+ og = parse_html_to_open_graph(soup)
self.assertEqual(og, {"og:title": "Foo", "og:description": "Some text."})
def test_invalid_encoding2(self) -> None:
@@ -302,8 +302,8 @@ class CalcOgTestCase(unittest.TestCase):
</body>
</html>
"""
- tree = decode_body(html, "http://example.com/test.html")
- og = parse_html_to_open_graph(tree)
+ soup = decode_body(html, "http://example.com/test.html")
+ og = parse_html_to_open_graph(soup)
self.assertEqual(og, {"og:title": "ÿÿ Foo", "og:description": "Some text."})
def test_windows_1252(self) -> None:
@@ -316,6 +316,6 @@ class CalcOgTestCase(unittest.TestCase):
</body>
</html>
"""
- tree = decode_body(html, "http://example.com/test.html")
- og = parse_html_to_open_graph(tree)
+ soup = decode_body(html, "http://example.com/test.html")
+ og = parse_html_to_open_graph(soup)
self.assertEqual(og, {"og:title": "ó", "og:description": "Some text."})
|