From 2b46d28c813ac900431910463c4913bb4345c783 Mon Sep 17 00:00:00 2001 From: Patrick Cloke Date: Thu, 9 Dec 2021 15:27:06 -0500 Subject: Use BeautifulSoup instead of LXML directly. --- mypy.ini | 6 ++ pyproject.toml | 3 +- synapse/rest/media/v1/oembed.py | 48 +++++------- synapse/rest/media/v1/preview_html.py | 138 ++++++++++++++-------------------- 4 files changed, 86 insertions(+), 109 deletions(-) diff --git a/mypy.ini b/mypy.ini index fe3e3f9b8e..071fab2133 100644 --- a/mypy.ini +++ b/mypy.ini @@ -139,6 +139,12 @@ disallow_untyped_defs = True [mypy-authlib.*] ignore_missing_imports = True +[mypy-bcrypt] +ignore_missing_imports = True + +[mypy-bs4.*] +ignore_missing_imports = True + [mypy-canonicaljson] ignore_missing_imports = True diff --git a/pyproject.toml b/pyproject.toml index 9359d211f7..de7145e5fa 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -173,6 +173,7 @@ authlib = { version = ">=0.14.0", optional = true } # Note: systemd-python 231 appears to have been yanked from pypi systemd-python = { version = ">=231", optional = true } lxml = { version = ">=4.2.0", optional = true } +beautifulsoup4 = { version = ">=4.10.0", optional = true } sentry-sdk = { version = ">=0.7.2", optional = true } opentracing = { version = ">=2.2.0", optional = true } jaeger-client = { version = ">=4.0.0", optional = true } @@ -194,7 +195,7 @@ oidc = ["authlib"] # `systemd.journal.JournalHandler`, as is documented in # `contrib/systemd/log_config.yaml`. systemd = ["systemd-python"] -url_preview = ["lxml"] +url_preview = ["lxml", "beautifulsoup4"] sentry = ["sentry-sdk"] opentracing = ["jaeger-client", "opentracing"] jwt = ["pyjwt"] diff --git a/synapse/rest/media/v1/oembed.py b/synapse/rest/media/v1/oembed.py index 2177b46c9e..5ac2e38719 100644 --- a/synapse/rest/media/v1/oembed.py +++ b/synapse/rest/media/v1/oembed.py @@ -22,7 +22,7 @@ from synapse.types import JsonDict from synapse.util import json_decoder if TYPE_CHECKING: - from lxml import etree + from bs4 import BeautifulSoup from synapse.server import HomeServer @@ -97,7 +97,7 @@ class OEmbedProvider: # No match. return None - def autodiscover_from_html(self, tree: "etree.Element") -> Optional[str]: + def autodiscover_from_html(self, tree: "BeautifulSoup") -> Optional[str]: """ Search an HTML document for oEmbed autodiscovery information. @@ -108,18 +108,14 @@ class OEmbedProvider: The URL to use for oEmbed information, or None if no URL was found. """ # Search for link elements with the proper rel and type attributes. - for tag in tree.xpath( - "//link[@rel='alternate'][@type='application/json+oembed']" - ): - if "href" in tag.attrib: - return tag.attrib["href"] - # Some providers (e.g. Flickr) use alternative instead of alternate. 
- for tag in tree.xpath( - "//link[@rel='alternative'][@type='application/json+oembed']" + for tag in tree.find_all( + "link", + rel=("alternate", "alternative"), + type="application/json+oembed", + href=True, ): - if "href" in tag.attrib: - return tag.attrib["href"] + return tag["href"] return None @@ -202,19 +198,15 @@ class OEmbedProvider: return OEmbedResult(open_graph_response, author_name, cache_age) -def _fetch_urls(tree: "etree.Element", tag_name: str) -> List[str]: - results = [] - for tag in tree.xpath("//*/" + tag_name): - if "src" in tag.attrib: - results.append(tag.attrib["src"]) - return results +def _fetch_urls(tree: "BeautifulSoup", tag_name: str) -> List[str]: + return [tag["src"] for tag in tree.find_all(tag_name, src=True)] def calc_description_and_urls(open_graph_response: JsonDict, html_body: str) -> None: """ Calculate description for an HTML document. - This uses lxml to convert the HTML document into plaintext. If errors + This uses BeautifulSoup to convert the HTML document into plaintext. If errors occur during processing of the document, an empty response is returned. Args: @@ -228,16 +220,16 @@ def calc_description_and_urls(open_graph_response: JsonDict, html_body: str) -> if not html_body: return - from lxml import etree - - # Create an HTML parser. If this fails, log and return no metadata. - parser = etree.HTMLParser(recover=True, encoding="utf-8") - - # Attempt to parse the body. If this fails, log and return no metadata. - tree = etree.fromstring(html_body, parser) + from bs4 import BeautifulSoup + from bs4.builder import ParserRejectedMarkup - # The data was successfully parsed, but no tree was found. - if tree is None: + try: + tree = BeautifulSoup(html_body, "lxml") + # If an empty document is returned, convert to None. + if not len(tree): + return + except ParserRejectedMarkup: + logger.warning("Unable to decode HTML body") return # Attempt to find interesting URLs (images, videos, embeds). diff --git a/synapse/rest/media/v1/preview_html.py b/synapse/rest/media/v1/preview_html.py index ca73965fc2..1866df60bb 100644 --- a/synapse/rest/media/v1/preview_html.py +++ b/synapse/rest/media/v1/preview_html.py @@ -15,10 +15,11 @@ import codecs import itertools import logging import re -from typing import TYPE_CHECKING, Dict, Generator, Iterable, Optional, Set, Union +from typing import TYPE_CHECKING, Dict, Generator, Iterable, Iterator, Optional, Set if TYPE_CHECKING: - from lxml import etree + from bs4 import BeautifulSoup + from bs4.element import PageElement, Tag logger = logging.getLogger(__name__) @@ -103,9 +104,9 @@ def _get_html_media_encodings( def decode_body( body: bytes, uri: str, content_type: Optional[str] = None -) -> Optional["etree.Element"]: +) -> Optional["BeautifulSoup"]: """ - This uses lxml to parse the HTML document. + This uses BeautifulSoup to parse the HTML document. Args: body: The HTML document, as bytes. @@ -119,36 +120,25 @@ def decode_body( if not body: return None - # The idea here is that multiple encodings are tried until one works. - # Unfortunately the result is never used and then LXML will decode the string - # again with the found encoding. - for encoding in _get_html_media_encodings(body, content_type): - try: - body.decode(encoding) - except Exception: - pass - else: - break - else: + from bs4 import BeautifulSoup + from bs4.builder import ParserRejectedMarkup + + try: + soup = BeautifulSoup(body, "lxml") + # If an empty document is returned, convert to None. 
+        if not len(soup):
+            return None
+        return soup
+    except ParserRejectedMarkup:
         logger.warning("Unable to decode HTML body for %s", uri)
         return None
 
-    from lxml import etree
-
-    # Create an HTML parser.
-    parser = etree.HTMLParser(recover=True, encoding=encoding)
-
-    # Attempt to parse the body. Returns None if the body was successfully
-    # parsed, but no tree was found.
-    return etree.fromstring(body, parser)
-
 
-def parse_html_to_open_graph(tree: "etree.Element") -> Dict[str, Optional[str]]:
+def parse_html_to_open_graph(tree: "BeautifulSoup") -> Dict[str, Optional[str]]:
     """
-    Parse the HTML document into an Open Graph response.
+    Calculate metadata for an HTML document.
 
-    This uses lxml to search the HTML document for Open Graph data (or
-    synthesizes it from the document).
+    This uses BeautifulSoup to search the HTML document for Open Graph data.
 
     Args:
         tree: The parsed HTML document.
@@ -174,13 +164,12 @@ def parse_html_to_open_graph(tree: "etree.Element") -> Dict[str, Optional[str]]:
     # "og:video:secure_url": "https://www.youtube.com/v/LXDBoHyjmtw?version=3",
 
     og: Dict[str, Optional[str]] = {}
-    for tag in tree.xpath("//*/meta[starts-with(@property, 'og:')]"):
-        if "content" in tag.attrib:
-            # if we've got more than 50 tags, someone is taking the piss
-            if len(og) >= 50:
-                logger.warning("Skipping OG for page with too many 'og:' tags")
-                return {}
-            og[tag.attrib["property"]] = tag.attrib["content"]
+    for tag in tree.find_all("meta", property=re.compile(r"^og:"), content=True):
+        # if we've got more than 50 tags, someone is taking the piss
+        if len(og) >= 50:
+            logger.warning("Skipping OG for page with too many 'og:' tags")
+            return {}
+        og[tag["property"]] = tag["content"]
 
 
     # TODO: grab article: meta tags too, e.g.:
@@ -193,41 +182,41 @@ def parse_html_to_open_graph(tree: "etree.Element") -> Dict[str, Optional[str]]:
 
     if "og:title" not in og:
         # do some basic spidering of the HTML
-        title = tree.xpath("(//title)[1] | (//h1)[1] | (//h2)[1] | (//h3)[1]")
-        if title and title[0].text is not None:
-            og["og:title"] = title[0].text.strip()
+        title = tree.find(("title", "h1", "h2", "h3"))
+        if title and title.string:
+            og["og:title"] = title.string.strip()
         else:
             og["og:title"] = None
 
     if "og:image" not in og:
         # TODO: extract a favicon failing all else
-        meta_image = tree.xpath(
-            "//*/meta[translate(@itemprop, 'IMAGE', 'image')='image']/@content"
-        )
+        meta_image = tree.find("meta", itemprop=re.compile("^image$", re.I), content=True)
         if meta_image:
-            og["og:image"] = meta_image[0]
+            og["og:image"] = meta_image["content"]
         else:
             # TODO: consider inlined CSS styles as well as width & height attribs
-            images = tree.xpath("//img[@src][number(@width)>10][number(@height)>10]")
+            def greater_than(tag: "Tag") -> bool:
+                if "width" not in tag.attrs or "height" not in tag.attrs:
+                    return False
+                try:
+                    return int(tag["width"]) > 10 and int(tag["height"]) > 10
+                except ValueError:
+                    return False
+
+            images = [i for i in tree.find_all("img", src=True) if greater_than(i)]
             images = sorted(
                 images,
-                key=lambda i: (
-                    -1 * float(i.attrib["width"]) * float(i.attrib["height"])
-                ),
+                key=lambda i: (-1 * float(i["width"]) * float(i["height"])),
            )
             if not images:
-                images = tree.xpath("//img[@src]")
+                images = tree.find_all("img", src=True)
             if images:
-                og["og:image"] = images[0].attrib["src"]
+                og["og:image"] = images[0]["src"]
 
     if "og:description" not in og:
-        meta_description = tree.xpath(
-            "//*/meta"
-            "[translate(@name, 'DESCRIPTION', 'description')='description']"
-            "/@content"
-        )
+        meta_description = tree.find("meta", attrs={"name": re.compile("^description$", re.I)}, content=True)
        if meta_description:
-            og["og:description"] = meta_description[0]
og["og:description"] = meta_description[0] + og["og:description"] = meta_description["content"] else: og["og:description"] = parse_html_description(tree) elif og["og:description"]: @@ -240,7 +229,7 @@ def parse_html_to_open_graph(tree: "etree.Element") -> Dict[str, Optional[str]]: return og -def parse_html_description(tree: "etree.Element") -> Optional[str]: +def parse_html_description(tree: "BeautifulSoup") -> Optional[str]: """ Calculate a text description based on an HTML document. @@ -256,9 +245,6 @@ def parse_html_description(tree: "etree.Element") -> Optional[str]: Returns: The plain text description, or None if one cannot be generated. """ - # We don't just use XPATH here as that is slow on some machines. - - from lxml import etree TAGS_TO_REMOVE = ( "header", @@ -268,7 +254,6 @@ def parse_html_description(tree: "etree.Element") -> Optional[str]: "script", "noscript", "style", - etree.Comment, ) # Split all the text nodes into paragraphs (by splitting on new @@ -281,39 +266,32 @@ def parse_html_description(tree: "etree.Element") -> Optional[str]: def _iterate_over_text( - tree: "etree.Element", *tags_to_ignore: Iterable[Union[str, "etree.Comment"]] + tree: Optional["Tag"], *tags_to_ignore: Iterable[str] ) -> Generator[str, None, None]: """Iterate over the tree returning text nodes in a depth first fashion, skipping text nodes inside certain tags. """ + if not tree: + return + + from bs4.element import NavigableString, Tag + # This is basically a stack that we extend using itertools.chain. # This will either consist of an element to iterate over *or* a string # to be returned. - elements = iter([tree]) + elements: Iterator["PageElement"] = iter([tree]) while True: el = next(elements, None) if el is None: return - if isinstance(el, str): - yield el - elif el.tag not in tags_to_ignore: - # el.text is the text before the first child, so we can immediately - # return it if the text exists. - if el.text: - yield el.text - - # We add to the stack all the elements children, interspersed with - # each child's tail text (if it exists). The tail text of a node - # is text that comes *after* the node, so we always include it even - # if we ignore the child node. - elements = itertools.chain( - itertools.chain.from_iterable( # Basically a flatmap - [child, child.tail] if child.tail else [child] - for child in el.iterchildren() - ), - elements, - ) + # Do not consider sub-classes of NavigableString since those represent + # comments, etc. + if type(el) == NavigableString: + yield str(el) + elif isinstance(el, Tag) and el.name not in tags_to_ignore: + # We add to the stack all the elements children. + elements = itertools.chain(el.contents, elements) def summarize_paragraphs( -- cgit 1.4.1