diff options
Diffstat (limited to 'synapse')
-rw-r--r-- | synapse/media/oembed.py | 32 | ||||
-rw-r--r-- | synapse/media/preview_html.py | 79 |
2 files changed, 75 insertions, 36 deletions
diff --git a/synapse/media/oembed.py b/synapse/media/oembed.py index c0eaf04be5..5ad9eec80b 100644 --- a/synapse/media/oembed.py +++ b/synapse/media/oembed.py @@ -14,7 +14,7 @@ import html import logging import urllib.parse -from typing import TYPE_CHECKING, List, Optional +from typing import TYPE_CHECKING, List, Optional, cast import attr @@ -98,7 +98,7 @@ class OEmbedProvider: # No match. return None - def autodiscover_from_html(self, tree: "etree.Element") -> Optional[str]: + def autodiscover_from_html(self, tree: "etree._Element") -> Optional[str]: """ Search an HTML document for oEmbed autodiscovery information. @@ -109,18 +109,22 @@ class OEmbedProvider: The URL to use for oEmbed information, or None if no URL was found. """ # Search for link elements with the proper rel and type attributes. - for tag in tree.xpath( - "//link[@rel='alternate'][@type='application/json+oembed']" + # Cast: the type returned by xpath depends on the xpath expression: mypy can't deduce this. + for tag in cast( + List["etree._Element"], + tree.xpath("//link[@rel='alternate'][@type='application/json+oembed']"), ): if "href" in tag.attrib: - return tag.attrib["href"] + return cast(str, tag.attrib["href"]) # Some providers (e.g. Flickr) use alternative instead of alternate. - for tag in tree.xpath( - "//link[@rel='alternative'][@type='application/json+oembed']" + # Cast: the type returned by xpath depends on the xpath expression: mypy can't deduce this. + for tag in cast( + List["etree._Element"], + tree.xpath("//link[@rel='alternative'][@type='application/json+oembed']"), ): if "href" in tag.attrib: - return tag.attrib["href"] + return cast(str, tag.attrib["href"]) return None @@ -212,11 +216,12 @@ class OEmbedProvider: return OEmbedResult(open_graph_response, author_name, cache_age) -def _fetch_urls(tree: "etree.Element", tag_name: str) -> List[str]: +def _fetch_urls(tree: "etree._Element", tag_name: str) -> List[str]: results = [] - for tag in tree.xpath("//*/" + tag_name): + # Cast: the type returned by xpath depends on the xpath expression: mypy can't deduce this. + for tag in cast(List["etree._Element"], tree.xpath("//*/" + tag_name)): if "src" in tag.attrib: - results.append(tag.attrib["src"]) + results.append(cast(str, tag.attrib["src"])) return results @@ -244,11 +249,12 @@ def calc_description_and_urls(open_graph_response: JsonDict, html_body: str) -> parser = etree.HTMLParser(recover=True, encoding="utf-8") # Attempt to parse the body. If this fails, log and return no metadata. - tree = etree.fromstring(html_body, parser) + # TODO Develop of lxml-stubs has this correct. + tree = etree.fromstring(html_body, parser) # type: ignore[arg-type] # The data was successfully parsed, but no tree was found. if tree is None: - return + return # type: ignore[unreachable] # Attempt to find interesting URLs (images, videos, embeds). if "og:image" not in open_graph_response: diff --git a/synapse/media/preview_html.py b/synapse/media/preview_html.py index 516d0434f0..1bc7ccb7f3 100644 --- a/synapse/media/preview_html.py +++ b/synapse/media/preview_html.py @@ -24,6 +24,7 @@ from typing import ( Optional, Set, Union, + cast, ) if TYPE_CHECKING: @@ -115,7 +116,7 @@ def _get_html_media_encodings( def decode_body( body: bytes, uri: str, content_type: Optional[str] = None -) -> Optional["etree.Element"]: +) -> Optional["etree._Element"]: """ This uses lxml to parse the HTML document. @@ -152,11 +153,12 @@ def decode_body( # Attempt to parse the body. Returns None if the body was successfully # parsed, but no tree was found. - return etree.fromstring(body, parser) + # TODO Develop of lxml-stubs has this correct. + return etree.fromstring(body, parser) # type: ignore[arg-type] def _get_meta_tags( - tree: "etree.Element", + tree: "etree._Element", property: str, prefix: str, property_mapper: Optional[Callable[[str], Optional[str]]] = None, @@ -175,9 +177,15 @@ def _get_meta_tags( Returns: A map of tag name to value. """ + # This actually returns Dict[str, str], but the caller sets this as a variable + # which is Dict[str, Optional[str]]. results: Dict[str, Optional[str]] = {} - for tag in tree.xpath( - f"//*/meta[starts-with(@{property}, '{prefix}:')][@content][not(@content='')]" + # Cast: the type returned by xpath depends on the xpath expression: mypy can't deduce this. + for tag in cast( + List["etree._Element"], + tree.xpath( + f"//*/meta[starts-with(@{property}, '{prefix}:')][@content][not(@content='')]" + ), ): # if we've got more than 50 tags, someone is taking the piss if len(results) >= 50: @@ -187,14 +195,15 @@ def _get_meta_tags( ) return {} - key = tag.attrib[property] + key = cast(str, tag.attrib[property]) if property_mapper: - key = property_mapper(key) + new_key = property_mapper(key) # None is a special value used to ignore a value. - if key is None: + if new_key is None: continue + key = new_key - results[key] = tag.attrib["content"] + results[key] = cast(str, tag.attrib["content"]) return results @@ -219,7 +228,7 @@ def _map_twitter_to_open_graph(key: str) -> Optional[str]: return "og" + key[7:] -def parse_html_to_open_graph(tree: "etree.Element") -> Dict[str, Optional[str]]: +def parse_html_to_open_graph(tree: "etree._Element") -> Dict[str, Optional[str]]: """ Parse the HTML document into an Open Graph response. @@ -276,24 +285,36 @@ def parse_html_to_open_graph(tree: "etree.Element") -> Dict[str, Optional[str]]: if "og:title" not in og: # Attempt to find a title from the title tag, or the biggest header on the page. - title = tree.xpath("((//title)[1] | (//h1)[1] | (//h2)[1] | (//h3)[1])/text()") + # Cast: the type returned by xpath depends on the xpath expression: mypy can't deduce this. + title = cast( + List["etree._ElementUnicodeResult"], + tree.xpath("((//title)[1] | (//h1)[1] | (//h2)[1] | (//h3)[1])/text()"), + ) if title: og["og:title"] = title[0].strip() else: og["og:title"] = None if "og:image" not in og: - meta_image = tree.xpath( - "//*/meta[translate(@itemprop, 'IMAGE', 'image')='image'][not(@content='')]/@content[1]" + # Cast: the type returned by xpath depends on the xpath expression: mypy can't deduce this. + meta_image = cast( + List["etree._ElementUnicodeResult"], + tree.xpath( + "//*/meta[translate(@itemprop, 'IMAGE', 'image')='image'][not(@content='')]/@content[1]" + ), ) # If a meta image is found, use it. if meta_image: og["og:image"] = meta_image[0] else: # Try to find images which are larger than 10px by 10px. + # Cast: the type returned by xpath depends on the xpath expression: mypy can't deduce this. # # TODO: consider inlined CSS styles as well as width & height attribs - images = tree.xpath("//img[@src][number(@width)>10][number(@height)>10]") + images = cast( + List["etree._Element"], + tree.xpath("//img[@src][number(@width)>10][number(@height)>10]"), + ) images = sorted( images, key=lambda i: ( @@ -302,20 +323,29 @@ def parse_html_to_open_graph(tree: "etree.Element") -> Dict[str, Optional[str]]: ) # If no images were found, try to find *any* images. if not images: - images = tree.xpath("//img[@src][1]") + # Cast: the type returned by xpath depends on the xpath expression: mypy can't deduce this. + images = cast(List["etree._Element"], tree.xpath("//img[@src][1]")) if images: - og["og:image"] = images[0].attrib["src"] + og["og:image"] = cast(str, images[0].attrib["src"]) # Finally, fallback to the favicon if nothing else. else: - favicons = tree.xpath("//link[@href][contains(@rel, 'icon')]/@href[1]") + # Cast: the type returned by xpath depends on the xpath expression: mypy can't deduce this. + favicons = cast( + List["etree._ElementUnicodeResult"], + tree.xpath("//link[@href][contains(@rel, 'icon')]/@href[1]"), + ) if favicons: og["og:image"] = favicons[0] if "og:description" not in og: # Check the first meta description tag for content. - meta_description = tree.xpath( - "//*/meta[translate(@name, 'DESCRIPTION', 'description')='description'][not(@content='')]/@content[1]" + # Cast: the type returned by xpath depends on the xpath expression: mypy can't deduce this. + meta_description = cast( + List["etree._ElementUnicodeResult"], + tree.xpath( + "//*/meta[translate(@name, 'DESCRIPTION', 'description')='description'][not(@content='')]/@content[1]" + ), ) # If a meta description is found with content, use it. if meta_description: @@ -332,7 +362,7 @@ def parse_html_to_open_graph(tree: "etree.Element") -> Dict[str, Optional[str]]: return og -def parse_html_description(tree: "etree.Element") -> Optional[str]: +def parse_html_description(tree: "etree._Element") -> Optional[str]: """ Calculate a text description based on an HTML document. @@ -368,6 +398,9 @@ def parse_html_description(tree: "etree.Element") -> Optional[str]: "canvas", "img", "picture", + # etree.Comment is a function which creates an etree._Comment element. + # The "tag" attribute of an etree._Comment instance is confusingly the + # etree.Comment function instead of a string. etree.Comment, } @@ -381,8 +414,8 @@ def parse_html_description(tree: "etree.Element") -> Optional[str]: def _iterate_over_text( - tree: Optional["etree.Element"], - tags_to_ignore: Set[Union[str, "etree.Comment"]], + tree: Optional["etree._Element"], + tags_to_ignore: Set[object], stack_limit: int = 1024, ) -> Generator[str, None, None]: """Iterate over the tree returning text nodes in a depth first fashion, @@ -402,7 +435,7 @@ def _iterate_over_text( # This is a stack whose items are elements to iterate over *or* strings # to be returned. - elements: List[Union[str, "etree.Element"]] = [tree] + elements: List[Union[str, "etree._Element"]] = [tree] while elements: el = elements.pop() |