Rename tree to soup.

author: Patrick Cloke <patrickc@matrix.org> 2021-12-09 15:44:38 -0500
committer: Patrick Cloke <patrickc@matrix.org> 2022-05-24 13:21:59 -0400
commit: ecc942ff267ca2e271675f0d0f253c95116f85d1 (patch)
tree: a5ec321241f3d1dd600d80f5eb3ecbaac7b8a62d
parent: Re-use decode_body. (diff)
download: synapse-ecc942ff267ca2e271675f0d0f253c95116f85d1.tar.xz
4 files changed, 57 insertions, 57 deletions
diff --git a/synapse/rest/media/v1/oembed.py b/synapse/rest/media/v1/oembed.py
index c402d5433e..eb28a91a78 100644
--- a/synapse/rest/media/v1/oembed.py
+++ b/synapse/rest/media/v1/oembed.py
@@ -97,19 +97,19 @@ class OEmbedProvider:
         # No match.
         return None
 
-    def autodiscover_from_html(self, tree: "BeautifulSoup") -> Optional[str]:
+    def autodiscover_from_html(self, soup: "BeautifulSoup") -> Optional[str]:
         """
         Search an HTML document for oEmbed autodiscovery information.
 
         Args:
-            tree: The parsed HTML body.
+            soup: The parsed HTML body.
 
         Returns:
             The URL to use for oEmbed information, or None if no URL was found.
         """
         # Search for link elements with the proper rel and type attributes.
         # Some providers (e.g. Flickr) use alternative instead of alternate.
-        for tag in tree.find_all(
+        for tag in soup.find_all(
             "link",
             rel=("alternate", "alternative"),
             type="application/json+oembed",
@@ -198,8 +198,8 @@ class OEmbedProvider:
         return OEmbedResult(open_graph_response, author_name, cache_age)
 
 
-def _fetch_urls(tree: "BeautifulSoup", tag_name: str) -> List[str]:
-    return [tag["src"] for tag in tree.find_all(tag_name, src=True)]
+def _fetch_urls(soup: "BeautifulSoup", tag_name: str) -> List[str]:
+    return [tag["src"] for tag in soup.find_all(tag_name, src=True)]
 
 
 def calc_description_and_urls(
@@ -216,22 +216,22 @@ def calc_description_and_urls(
         html_body: The HTML document, as bytes.
         url: The URL which is being previewed (not the one which was requested).
     """
-    tree = decode_body(html_body, url)
+    soup = decode_body(html_body, url)
 
     # If there's no body, nothing useful is going to be found.
-    if not tree:
+    if not soup:
         return
 
     # Attempt to find interesting URLs (images, videos, embeds).
     if "og:image" not in open_graph_response:
-        image_urls = _fetch_urls(tree, "img")
+        image_urls = _fetch_urls(soup, "img")
         if image_urls:
             open_graph_response["og:image"] = image_urls[0]
 
-    video_urls = _fetch_urls(tree, "video") + _fetch_urls(tree, "embed")
+    video_urls = _fetch_urls(soup, "video") + _fetch_urls(soup, "embed")
     if video_urls:
         open_graph_response["og:video"] = video_urls[0]
 
-    description = parse_html_description(tree)
+    description = parse_html_description(soup)
     if description:
         open_graph_response["og:description"] = description
diff --git a/synapse/rest/media/v1/preview_html.py b/synapse/rest/media/v1/preview_html.py
index 4dc9be5124..9ddc30c63f 100644
--- a/synapse/rest/media/v1/preview_html.py
+++ b/synapse/rest/media/v1/preview_html.py
@@ -55,14 +55,14 @@ def decode_body(body: Union[bytes, str], uri: str) -> Optional["BeautifulSoup"]:
         return None
 
 
-def parse_html_to_open_graph(tree: "BeautifulSoup") -> Dict[str, Optional[str]]:
+def parse_html_to_open_graph(soup: "BeautifulSoup") -> Dict[str, Optional[str]]:
     """
     Calculate metadata for an HTML document.
 
     This uses BeautifulSoup to search the HTML document for Open Graph data.
 
     Args:
-        tree: The parsed HTML document.
+        soup: The parsed HTML document.
 
     Returns:
         The Open Graph response as a dictionary.
@@ -85,7 +85,7 @@ def parse_html_to_open_graph(tree: "BeautifulSoup") -> Dict[str, Optional[str]]:
     # "og:video:secure_url": "https://www.youtube.com/v/LXDBoHyjmtw?version=3",
 
     og: Dict[str, Optional[str]] = {}
-    for tag in tree.find_all("meta", property=re.compile(r"^og:"), content=True):
+    for tag in soup.find_all("meta", property=re.compile(r"^og:"), content=True):
         # if we've got more than 50 tags, someone is taking the piss
         if len(og) >= 50:
             logger.warning("Skipping OG for page with too many 'og:' tags")
@@ -103,7 +103,7 @@ def parse_html_to_open_graph(tree: "BeautifulSoup") -> Dict[str, Optional[str]]:
 
     if "og:title" not in og:
         # do some basic spidering of the HTML
-        title = tree.find(("title", "h1", "h2", "h3"))
+        title = soup.find(("title", "h1", "h2", "h3"))
         if title and title.string:
             og["og:title"] = title.string.strip()
         else:
@@ -111,7 +111,7 @@ def parse_html_to_open_graph(tree: "BeautifulSoup") -> Dict[str, Optional[str]]:
 
     if "og:image" not in og:
         # TODO: extract a favicon failing all else
-        meta_image = tree.find("meta", image="image")
+        meta_image = soup.find("meta", image="image")
         if meta_image:
             og["og:image"] = meta_image["content"]
         else:
@@ -124,22 +124,22 @@ def parse_html_to_open_graph(tree: "BeautifulSoup") -> Dict[str, Optional[str]]:
                 except ValueError:
                     return False
 
-            images = tree.find_all("img", src=True, width=greater_than)
+            images = soup.find_all("img", src=True, width=greater_than)
             images = sorted(
                 images,
                 key=lambda i: (-1 * float(i["width"]) * float(i["height"])),
             )
             if not images:
-                images = tree.find_all("img", src=True)
+                images = soup.find_all("img", src=True)
             if images:
                 og["og:image"] = images[0]["src"]
 
     if "og:description" not in og:
-        meta_description = tree.find("meta", description="description")
+        meta_description = soup.find("meta", description="description")
         if meta_description:
             og["og:description"] = meta_description["content"]
         else:
-            og["og:description"] = parse_html_description(tree)
+            og["og:description"] = parse_html_description(soup)
     elif og["og:description"]:
         # This must be a non-empty string at this point.
         assert isinstance(og["og:description"], str)
@@ -150,7 +150,7 @@ def parse_html_to_open_graph(tree: "BeautifulSoup") -> Dict[str, Optional[str]]:
     return og
 
 
-def parse_html_description(tree: "BeautifulSoup") -> Optional[str]:
+def parse_html_description(soup: "BeautifulSoup") -> Optional[str]:
     """
     Calculate a text description based on an HTML document.
 
@@ -161,7 +161,7 @@ def parse_html_description(tree: "BeautifulSoup") -> Optional[str]:
     This is a very very very coarse approximation to a plain text render of the page.
 
     Args:
-        tree: The parsed HTML document.
+        soup: The parsed HTML document.
 
     Returns:
         The plain text description, or None if one cannot be generated.
@@ -181,18 +181,18 @@ def parse_html_description(tree: "BeautifulSoup") -> Optional[str]:
     # lines)
     text_nodes = (
         re.sub(r"\s+", "\n", el).strip()
-        for el in _iterate_over_text(tree.find("body"), *TAGS_TO_REMOVE)
+        for el in _iterate_over_text(soup.find("body"), *TAGS_TO_REMOVE)
     )
     return summarize_paragraphs(text_nodes)
 
 
 def _iterate_over_text(
-    tree: Optional["Tag"], *tags_to_ignore: Iterable[str]
+    soup: Optional["Tag"], *tags_to_ignore: Iterable[str]
 ) -> Generator[str, None, None]:
-    """Iterate over the tree returning text nodes in a depth first fashion,
+    """Iterate over the document returning text nodes in a depth first fashion,
     skipping text nodes inside certain tags.
     """
-    if not tree:
+    if not soup:
         return
 
     from bs4.element import NavigableString, Tag
@@ -200,7 +200,7 @@ def _iterate_over_text(
     # This is basically a stack that we extend using itertools.chain.
     # This will either consist of an element to iterate over *or* a string
     # to be returned.
-    elements: Iterator["PageElement"] = iter([tree])
+    elements: Iterator["PageElement"] = iter([soup])
     while True:
         el = next(elements, None)
         if el is None:
diff --git a/synapse/rest/media/v1/preview_url_resource.py b/synapse/rest/media/v1/preview_url_resource.py
index e406f6c704..76ba36c2ea 100644
--- a/synapse/rest/media/v1/preview_url_resource.py
+++ b/synapse/rest/media/v1/preview_url_resource.py
@@ -298,16 +298,16 @@ class PreviewUrlResource(DirectServeJsonResource):
 
             # define our OG response for this media
         elif _is_html(media_info.media_type):
-            # TODO: somehow stop a big HTML tree from exploding synapse's RAM
+            # TODO: somehow stop a big HTML document from exploding synapse's RAM
 
             with open(media_info.filename, "rb") as file:
                 body = file.read()
 
-            tree = decode_body(body, media_info.uri)
-            if tree is not None:
+            soup = decode_body(body, media_info.uri)
+            if soup is not None:
                 # Check if this HTML document points to oEmbed information and
                 # defer to that.
-                oembed_url = self._oembed.autodiscover_from_html(tree)
+                oembed_url = self._oembed.autodiscover_from_html(soup)
                 og_from_oembed: JsonDict = {}
                 if oembed_url:
                     oembed_info = await self._handle_url(
@@ -323,7 +323,7 @@ class PreviewUrlResource(DirectServeJsonResource):
 
                 # Parse Open Graph information from the HTML in case the oEmbed
                 # response failed or is incomplete.
-                og_from_html = parse_html_to_open_graph(tree)
+                og_from_html = parse_html_to_open_graph(soup)
 
                 # Compile the Open Graph response by using the scraped
                 # information from the HTML and overlaying any information
diff --git a/tests/rest/media/v1/test_html_preview.py b/tests/rest/media/v1/test_html_preview.py
index 5e600450dc..82b7d317c9 100644
--- a/tests/rest/media/v1/test_html_preview.py
+++ b/tests/rest/media/v1/test_html_preview.py
@@ -158,8 +158,8 @@ class CalcOgTestCase(unittest.TestCase):
         </html>
         """
 
-        tree = decode_body(html, "http://example.com/test.html")
-        og = parse_html_to_open_graph(tree)
+        soup = decode_body(html, "http://example.com/test.html")
+        og = parse_html_to_open_graph(soup)
 
         self.assertEqual(og, {"og:title": "Foo", "og:description": "Some text."})
 
@@ -174,8 +174,8 @@ class CalcOgTestCase(unittest.TestCase):
         </html>
         """
 
-        tree = decode_body(html, "http://example.com/test.html")
-        og = parse_html_to_open_graph(tree)
+        soup = decode_body(html, "http://example.com/test.html")
+        og = parse_html_to_open_graph(soup)
 
         self.assertEqual(og, {"og:title": "Foo", "og:description": "Some text."})
 
@@ -193,8 +193,8 @@ class CalcOgTestCase(unittest.TestCase):
         </html>
         """
 
-        tree = decode_body(html, "http://example.com/test.html")
-        og = parse_html_to_open_graph(tree)
+        soup = decode_body(html, "http://example.com/test.html")
+        og = parse_html_to_open_graph(soup)
 
         self.assertEqual(
             og,
@@ -215,8 +215,8 @@ class CalcOgTestCase(unittest.TestCase):
         </html>
         """
 
-        tree = decode_body(html, "http://example.com/test.html")
-        og = parse_html_to_open_graph(tree)
+        soup = decode_body(html, "http://example.com/test.html")
+        og = parse_html_to_open_graph(soup)
 
         self.assertEqual(og, {"og:title": "Foo", "og:description": "Some text."})
 
@@ -229,8 +229,8 @@ class CalcOgTestCase(unittest.TestCase):
         </html>
         """
 
-        tree = decode_body(html, "http://example.com/test.html")
-        og = parse_html_to_open_graph(tree)
+        soup = decode_body(html, "http://example.com/test.html")
+        og = parse_html_to_open_graph(soup)
 
         self.assertEqual(og, {"og:title": None, "og:description": "Some text."})
 
@@ -244,8 +244,8 @@ class CalcOgTestCase(unittest.TestCase):
         </html>
         """
 
-        tree = decode_body(html, "http://example.com/test.html")
-        og = parse_html_to_open_graph(tree)
+        soup = decode_body(html, "http://example.com/test.html")
+        og = parse_html_to_open_graph(soup)
 
         self.assertEqual(og, {"og:title": "Title", "og:description": "Some text."})
 
@@ -259,22 +259,22 @@ class CalcOgTestCase(unittest.TestCase):
         </html>
         """
 
-        tree = decode_body(html, "http://example.com/test.html")
-        og = parse_html_to_open_graph(tree)
+        soup = decode_body(html, "http://example.com/test.html")
+        og = parse_html_to_open_graph(soup)
 
         self.assertEqual(og, {"og:title": None, "og:description": "Some text."})
 
     def test_empty(self) -> None:
         """Test a body with no data in it."""
         html = b""
-        tree = decode_body(html, "http://example.com/test.html")
-        self.assertIsNone(tree)
+        soup = decode_body(html, "http://example.com/test.html")
+        self.assertIsNone(soup)
 
-    def test_no_tree(self) -> None:
-        """A valid body with no tree in it."""
+    def test_no_soup(self) -> None:
+        """A valid body with no soup in it."""
         html = b"\x00"
-        tree = decode_body(html, "http://example.com/test.html")
-        self.assertIsNone(tree)
+        soup = decode_body(html, "http://example.com/test.html")
+        self.assertIsNone(soup)
 
     def test_xml(self) -> None:
         """Test decoding XML and ensure it works properly."""
@@ -287,8 +287,8 @@ class CalcOgTestCase(unittest.TestCase):
         <html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
         <head><title>Foo</title></head><body>Some text.</body></html>
         """.strip()
-        tree = decode_body(html, "http://example.com/test.html")
-        og = parse_html_to_open_graph(tree)
+        soup = decode_body(html, "http://example.com/test.html")
+        og = parse_html_to_open_graph(soup)
         self.assertEqual(og, {"og:title": "Foo", "og:description": "Some text."})
 
     def test_invalid_encoding2(self) -> None:
@@ -302,8 +302,8 @@ class CalcOgTestCase(unittest.TestCase):
         </body>
         </html>
         """
-        tree = decode_body(html, "http://example.com/test.html")
-        og = parse_html_to_open_graph(tree)
+        soup = decode_body(html, "http://example.com/test.html")
+        og = parse_html_to_open_graph(soup)
         self.assertEqual(og, {"og:title": "ÿÿ Foo", "og:description": "Some text."})
 
     def test_windows_1252(self) -> None:
@@ -316,6 +316,6 @@ class CalcOgTestCase(unittest.TestCase):
         </body>
         </html>
         """
-        tree = decode_body(html, "http://example.com/test.html")
-        og = parse_html_to_open_graph(tree)
+        soup = decode_body(html, "http://example.com/test.html")
+        og = parse_html_to_open_graph(soup)
         self.assertEqual(og, {"og:title": "ó", "og:description": "Some text."})
author	Patrick Cloke <patrickc@matrix.org>	2021-12-09 15:44:38 -0500
committer	Patrick Cloke <patrickc@matrix.org>	2022-05-24 13:21:59 -0400
commit	ecc942ff267ca2e271675f0d0f253c95116f85d1 (patch)
tree	a5ec321241f3d1dd600d80f5eb3ecbaac7b8a62d
parent	Re-use decode_body. (diff)
download	synapse-ecc942ff267ca2e271675f0d0f253c95116f85d1.tar.xz