summary refs log tree commit diff
diff options
context:
space:
mode:
author	Patrick Cloke <patrickc@matrix.org>	2021-12-09 15:37:00 -0500
committer	Patrick Cloke <patrickc@matrix.org>	2022-05-24 13:19:17 -0400
commit	11a9925252bfe6c08718740499094d571e4c81a7 (patch)
tree	2e179837531cc989537bb70130a6409086bd6f9f
parent	Remove dead code. (diff)
download	synapse-11a9925252bfe6c08718740499094d571e4c81a7.tar.xz
Re-use decode_body.
-rw-r--r--	synapse/rest/media/v1/oembed.py	30
-rw-r--r--	synapse/rest/media/v1/preview_html.py	4
2 files changed, 12 insertions(+), 22 deletions(-)
diff --git a/synapse/rest/media/v1/oembed.py b/synapse/rest/media/v1/oembed.py
index 5ac2e38719..c402d5433e 100644
--- a/synapse/rest/media/v1/oembed.py
+++ b/synapse/rest/media/v1/oembed.py
@@ -17,7 +17,7 @@ from typing import TYPE_CHECKING, List, Optional
 
 import attr
 
-from synapse.rest.media.v1.preview_html import parse_html_description
+from synapse.rest.media.v1.preview_html import decode_body, parse_html_description
 from synapse.types import JsonDict
 from synapse.util import json_decoder
 
@@ -170,7 +170,7 @@ class OEmbedProvider:
             # Process each type separately.
             oembed_type = oembed["type"]
             if oembed_type == "rich":
-                calc_description_and_urls(open_graph_response, oembed["html"])
+                calc_description_and_urls(open_graph_response, oembed["html"], url)
 
             elif oembed_type == "photo":
                 # If this is a photo, use the full image, not the thumbnail.
@@ -178,7 +178,7 @@ class OEmbedProvider:
 
             elif oembed_type == "video":
                 open_graph_response["og:type"] = "video.other"
-                calc_description_and_urls(open_graph_response, oembed["html"])
+                calc_description_and_urls(open_graph_response, oembed["html"], url)
                 open_graph_response["og:video:width"] = oembed["width"]
                 open_graph_response["og:video:height"] = oembed["height"]
 
@@ -202,7 +202,9 @@ def _fetch_urls(tree: "BeautifulSoup", tag_name: str) -> List[str]:
     return [tag["src"] for tag in tree.find_all(tag_name, src=True)]
 
 
-def calc_description_and_urls(open_graph_response: JsonDict, html_body: str) -> None:
+def calc_description_and_urls(
+    open_graph_response: JsonDict, html_body: str, url: str
+) -> None:
     """
     Calculate description for an HTML document.
 
@@ -212,24 +214,12 @@ def calc_description_and_urls(open_graph_response: JsonDict, html_body: str) ->
     Args:
         open_graph_response: The current Open Graph summary. This is updated with additional fields.
         html_body: The HTML document, as bytes.
-
-    Returns:
-        The summary
+        url: The URL which is being previewed (not the one which was requested).
     """
-    # If there's no body, nothing useful is going to be found.
-    if not html_body:
-        return
+    tree = decode_body(html_body, url)
 
-    from bs4 import BeautifulSoup
-    from bs4.builder import ParserRejectedMarkup
-
-    try:
-        tree = BeautifulSoup(html_body, "lxml")
-        # If an empty document is returned, convert to None.
-        if not len(tree):
-            return
-    except ParserRejectedMarkup:
-        logger.warning("Unable to decode HTML body")
+    # If there's no body, nothing useful is going to be found.
+    if not tree:
         return
 
     # Attempt to find interesting URLs (images, videos, embeds).
diff --git a/synapse/rest/media/v1/preview_html.py b/synapse/rest/media/v1/preview_html.py
index c9711956f1..4dc9be5124 100644
--- a/synapse/rest/media/v1/preview_html.py
+++ b/synapse/rest/media/v1/preview_html.py
@@ -14,7 +14,7 @@
 import itertools
 import logging
 import re
-from typing import TYPE_CHECKING, Dict, Generator, Iterable, Iterator, Optional
+from typing import TYPE_CHECKING, Dict, Generator, Iterable, Iterator, Optional, Union
 
 if TYPE_CHECKING:
     from bs4 import BeautifulSoup
@@ -25,7 +25,7 @@ logger = logging.getLogger(__name__)
 _content_type_match = re.compile(r'.*; *charset="?(.*?)"?(;|$)', flags=re.I)
 
 
-def decode_body(body: bytes, uri: str) -> Optional["BeautifulSoup"]:
+def decode_body(body: Union[bytes, str], uri: str) -> Optional["BeautifulSoup"]:
     """
     This uses BeautifulSoup to parse the HTML document.