Diffstat (limited to 'synapse/rest/media/v1/oembed.py')
-rw-r--r-- | synapse/rest/media/v1/oembed.py | 48
1 file changed, 20 insertions, 28 deletions
diff --git a/synapse/rest/media/v1/oembed.py b/synapse/rest/media/v1/oembed.py
index 2177b46c9e..5ac2e38719 100644
--- a/synapse/rest/media/v1/oembed.py
+++ b/synapse/rest/media/v1/oembed.py
@@ -22,7 +22,7 @@ from synapse.types import JsonDict
 from synapse.util import json_decoder

 if TYPE_CHECKING:
-    from lxml import etree
+    from bs4 import BeautifulSoup

     from synapse.server import HomeServer

@@ -97,7 +97,7 @@ class OEmbedProvider:
         # No match.
         return None

-    def autodiscover_from_html(self, tree: "etree.Element") -> Optional[str]:
+    def autodiscover_from_html(self, tree: "BeautifulSoup") -> Optional[str]:
         """
         Search an HTML document for oEmbed autodiscovery information.

@@ -108,18 +108,14 @@ class OEmbedProvider:
             The URL to use for oEmbed information, or None if no URL was found.
         """
         # Search for link elements with the proper rel and type attributes.
-        for tag in tree.xpath(
-            "//link[@rel='alternate'][@type='application/json+oembed']"
-        ):
-            if "href" in tag.attrib:
-                return tag.attrib["href"]
-
-        # Some providers (e.g. Flickr) use alternative instead of alternate.
-        for tag in tree.xpath(
-            "//link[@rel='alternative'][@type='application/json+oembed']"
+        # Some providers (e.g. Flickr) use alternative instead of alternate.
+        for tag in tree.find_all(
+            "link",
+            rel=("alternate", "alternative"),
+            type="application/json+oembed",
+            href=True,
         ):
-            if "href" in tag.attrib:
-                return tag.attrib["href"]
+            return tag["href"]

         return None

@@ -202,19 +198,15 @@ class OEmbedProvider:
         return OEmbedResult(open_graph_response, author_name, cache_age)


-def _fetch_urls(tree: "etree.Element", tag_name: str) -> List[str]:
-    results = []
-    for tag in tree.xpath("//*/" + tag_name):
-        if "src" in tag.attrib:
-            results.append(tag.attrib["src"])
-    return results
+def _fetch_urls(tree: "BeautifulSoup", tag_name: str) -> List[str]:
+    return [tag["src"] for tag in tree.find_all(tag_name, src=True)]


 def calc_description_and_urls(open_graph_response: JsonDict, html_body: str) -> None:
     """
     Calculate description for an HTML document.

-    This uses lxml to convert the HTML document into plaintext. If errors
+    This uses BeautifulSoup to convert the HTML document into plaintext. If errors
     occur during processing of the document, an empty response is returned.

     Args:
@@ -228,16 +220,16 @@ def calc_description_and_urls(open_graph_response: JsonDict, html_body: str) ->
     if not html_body:
         return

-    from lxml import etree
-
-    # Create an HTML parser. If this fails, log and return no metadata.
-    parser = etree.HTMLParser(recover=True, encoding="utf-8")
-
-    # Attempt to parse the body. If this fails, log and return no metadata.
-    tree = etree.fromstring(html_body, parser)
+    from bs4 import BeautifulSoup
+    from bs4.builder import ParserRejectedMarkup

-    # The data was successfully parsed, but no tree was found.
-    if tree is None:
+    try:
+        tree = BeautifulSoup(html_body, "lxml")
+        # If an empty document is returned, convert to None.
+        if not len(tree):
+            return
+    except ParserRejectedMarkup:
+        logger.warning("Unable to decode HTML body")
         return

     # Attempt to find interesting URLs (images, videos, embeds).
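The autodiscovery hunk collapses the two XPath queries into a single BeautifulSoup find_all call. Below is a minimal sketch of how that call behaves on a sample document; the sample HTML and the standalone script are illustrative only and not part of the patch, and the lxml tree builder is assumed to be installed.

from bs4 import BeautifulSoup

sample_html = """
<html><head>
  <title>A page</title>
  <link rel="alternate" type="application/json+oembed"
        href="https://provider.example.com/oembed?format=json">
</head><body></body></html>
"""

tree = BeautifulSoup(sample_html, "lxml")

# rel accepts an iterable of acceptable values, so one query covers both the
# spec-compliant rel="alternate" and the rel="alternative" variant that some
# providers emit; href=True skips link tags that have no href attribute.
for tag in tree.find_all(
    "link",
    rel=("alternate", "alternative"),
    type="application/json+oembed",
    href=True,
):
    print(tag["href"])  # https://provider.example.com/oembed?format=json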
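The calc_description_and_urls hunk swaps lxml's recovering HTMLParser for BeautifulSoup backed by the lxml builder, and it handles the two failure modes differently: an empty parse result causes an early return, while markup the builder rejects outright raises ParserRejectedMarkup and is logged. A hedged, standalone sketch of that control flow follows; the parse_html helper is hypothetical and only mirrors the pattern in the diff.

import logging
from typing import Optional

from bs4 import BeautifulSoup
from bs4.builder import ParserRejectedMarkup

logger = logging.getLogger(__name__)


def parse_html(html_body: str) -> Optional[BeautifulSoup]:
    """Parse an HTML body, returning None when nothing usable comes back."""
    if not html_body:
        return None

    try:
        tree = BeautifulSoup(html_body, "lxml")
        # len(tree) is the number of top-level nodes; zero means the builder
        # produced an empty document, so there is nothing to extract.
        if not len(tree):
            return None
    except ParserRejectedMarkup:
        logger.warning("Unable to decode HTML body")
        return None

    return tree

Note that the patch keeps the BeautifulSoup import inside calc_description_and_urls, preserving the original module's pattern of importing the HTML parser lazily rather than at module load.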