From 11a9925252bfe6c08718740499094d571e4c81a7 Mon Sep 17 00:00:00 2001 From: Patrick Cloke Date: Thu, 9 Dec 2021 15:37:00 -0500 Subject: Re-use decode_body. --- synapse/rest/media/v1/oembed.py | 30 ++++++++++-------------------- synapse/rest/media/v1/preview_html.py | 4 ++-- 2 files changed, 12 insertions(+), 22 deletions(-) diff --git a/synapse/rest/media/v1/oembed.py b/synapse/rest/media/v1/oembed.py index 5ac2e38719..c402d5433e 100644 --- a/synapse/rest/media/v1/oembed.py +++ b/synapse/rest/media/v1/oembed.py @@ -17,7 +17,7 @@ from typing import TYPE_CHECKING, List, Optional import attr -from synapse.rest.media.v1.preview_html import parse_html_description +from synapse.rest.media.v1.preview_html import decode_body, parse_html_description from synapse.types import JsonDict from synapse.util import json_decoder @@ -170,7 +170,7 @@ class OEmbedProvider: # Process each type separately. oembed_type = oembed["type"] if oembed_type == "rich": - calc_description_and_urls(open_graph_response, oembed["html"]) + calc_description_and_urls(open_graph_response, oembed["html"], url) elif oembed_type == "photo": # If this is a photo, use the full image, not the thumbnail. @@ -178,7 +178,7 @@ class OEmbedProvider: elif oembed_type == "video": open_graph_response["og:type"] = "video.other" - calc_description_and_urls(open_graph_response, oembed["html"]) + calc_description_and_urls(open_graph_response, oembed["html"], url) open_graph_response["og:video:width"] = oembed["width"] open_graph_response["og:video:height"] = oembed["height"] @@ -202,7 +202,9 @@ def _fetch_urls(tree: "BeautifulSoup", tag_name: str) -> List[str]: return [tag["src"] for tag in tree.find_all(tag_name, src=True)] -def calc_description_and_urls(open_graph_response: JsonDict, html_body: str) -> None: +def calc_description_and_urls( + open_graph_response: JsonDict, html_body: str, url: str +) -> None: """ Calculate description for an HTML document. @@ -212,24 +214,12 @@ def calc_description_and_urls(open_graph_response: JsonDict, html_body: str) -> Args: open_graph_response: The current Open Graph summary. This is updated with additional fields. html_body: The HTML document, as bytes. - - Returns: - The summary + url: The URL which is being previewed (not the one which was requested). """ - # If there's no body, nothing useful is going to be found. - if not html_body: - return + tree = decode_body(html_body, url) - from bs4 import BeautifulSoup - from bs4.builder import ParserRejectedMarkup - - try: - tree = BeautifulSoup(html_body, "lxml") - # If an empty document is returned, convert to None. - if not len(tree): - return - except ParserRejectedMarkup: - logger.warning("Unable to decode HTML body") + # If there's no body, nothing useful is going to be found. + if not tree: return # Attempt to find interesting URLs (images, videos, embeds). diff --git a/synapse/rest/media/v1/preview_html.py b/synapse/rest/media/v1/preview_html.py index c9711956f1..4dc9be5124 100644 --- a/synapse/rest/media/v1/preview_html.py +++ b/synapse/rest/media/v1/preview_html.py @@ -14,7 +14,7 @@ import itertools import logging import re -from typing import TYPE_CHECKING, Dict, Generator, Iterable, Iterator, Optional +from typing import TYPE_CHECKING, Dict, Generator, Iterable, Iterator, Optional, Union if TYPE_CHECKING: from bs4 import BeautifulSoup @@ -25,7 +25,7 @@ logger = logging.getLogger(__name__) _content_type_match = re.compile(r'.*; *charset="?(.*?)"?(;|$)', flags=re.I) -def decode_body(body: bytes, uri: str) -> Optional["BeautifulSoup"]: +def decode_body(body: Union[bytes, str], uri: str) -> Optional["BeautifulSoup"]: """ This uses BeautifulSoup to parse the HTML document. -- cgit 1.4.1