diff --git a/synapse/rest/media/v1/preview_html.py b/synapse/rest/media/v1/preview_html.py
index ed8f21a483..c826a13093 100644
--- a/synapse/rest/media/v1/preview_html.py
+++ b/synapse/rest/media/v1/preview_html.py
@@ -15,7 +15,16 @@ import codecs
import itertools
import logging
import re
-from typing import TYPE_CHECKING, Dict, Generator, Iterable, Optional, Set, Union
+from typing import (
+ Callable,
+ Dict,
+ Generator,
+ Iterable,
+ Optional,
+ Set,
+ Union,
from lxml import etree
@@ -146,6 +155,70 @@ def decode_body(
return etree.fromstring(body, parser)
+def _get_meta_tags(
+    tree: "etree.Element",
+    property: str,
+    prefix: str,
+    property_mapper: Optional[Callable[[str], Optional[str]]] = None,
+) -> Dict[str, Optional[str]]:
+    """
+    Collect the non-empty ``content`` of every meta tag in a given namespace.
+
+    A tag is selected when the attribute named by ``property`` starts with
+    ``"<prefix>:"`` and the tag has a ``content`` attribute that is not the
+    empty string.
+
+    Args:
+        tree: The parsed HTML document.
+        property: The attribute holding the tag name, e.g. "property" for
+            Open Graph.
+        prefix: The namespace to look for (without the trailing colon), e.g.
+            "og" for Open Graph.
+        property_mapper: Optional callable used to rewrite each tag name into
+            its Open Graph form. A tag is dropped when it returns None.
+
+    Returns:
+        A map of (possibly rewritten) tag name to content. If a matching tag
+        is encountered once 50 entries have already been collected, a warning
+        is logged and an empty map is returned instead.
+    """
+    # Upper bound on collected entries; a page exceeding it is not trusted.
+    max_tags = 50
+    query = (
+        f"//*/meta[starts-with(@{property}, '{prefix}:')][@content][not(@content='')]"
+    )
+    collected: Dict[str, Optional[str]] = {}
+    for element in tree.xpath(query):
+        if len(collected) >= max_tags:
+            logger.warning(
+                "Skipping parsing of Open Graph for page with too many '%s:' tags",
+                prefix,
+            )
+            return {}
+        if property_mapper:
+            mapped_name = property_mapper(element.attrib[property])
+            # The mapper signals "ignore this tag" by returning None.
+            if mapped_name is None:
+                continue
+            name = mapped_name
+        else:
+            name = element.attrib[property]
+        collected[name] = element.attrib["content"]
+    return collected
+def _map_twitter_to_open_graph(key: str) -> Optional[str]:
+    """
+    Translate a Twitter card property name into its Open Graph equivalent.
+
+    Args:
+        key: The Twitter card property (starts with "twitter:").
+
+    Returns:
+        The matching Open Graph property (starts with "og:"), or None when the
+        property has no Open Graph counterpart and should be ignored.
+    """
+    # Properties that need special handling: None means "no Open Graph
+    # analogue, ignore"; a string is an explicit rename.
+    special_cases: Dict[str, Optional[str]] = {
+        "twitter:card": None,
+        "twitter:creator": None,
+        "twitter:site": "og:site_name",
+    }
+    if key in special_cases:
+        return special_cases[key]
+    # Everything else keeps its suffix; only the namespace is swapped.
+    return "og" + key[len("twitter") :]
def parse_html_to_open_graph(tree: "etree.Element") -> Dict[str, Optional[str]]:
Parse the HTML document into an Open Graph response.
@@ -160,10 +233,8 @@ def parse_html_to_open_graph(tree: "etree.Element") -> Dict[str, Optional[str]]:
The Open Graph response as a dictionary.
- # if we see any image URLs in the OG response, then spider them
- # (although the client could choose to do this by asking for previews of those
- # URLs to avoid DoSing the server)
+ # Search for Open Graph (og:) meta tags, e.g.:
+ #
# "og:type" : "video",
# "og:url" : "https://www.youtube.com/watch?v=LXDBoHyjmtw",
# "og:site_name" : "YouTube",
@@ -176,19 +247,11 @@ def parse_html_to_open_graph(tree: "etree.Element") -> Dict[str, Optional[str]]:
# "og:video:height" : "720",
# "og:video:secure_url": "https://www.youtube.com/v/LXDBoHyjmtw?version=3",
- og: Dict[str, Optional[str]] = {}
- for tag in tree.xpath(
- "//*/meta[starts-with(@property, 'og:')][@content][not(@content='')]"
- ):
- # if we've got more than 50 tags, someone is taking the piss
- if len(og) >= 50:
- logger.warning("Skipping OG for page with too many 'og:' tags")
- return {}
- og[tag.attrib["property"]] = tag.attrib["content"]
- # TODO: grab article: meta tags too, e.g.:
+ og = _get_meta_tags(tree, "property", "og")
+ # TODO: Search for properties specific to the different Open Graph types,
+ # such as article: meta tags, e.g.:
+ #
# "article:publisher" : "https://www.facebook.com/thethudonline" />
# "article:author" content="https://www.facebook.com/thethudonline" />
# "article:tag" content="baby" />
@@ -196,6 +259,21 @@ def parse_html_to_open_graph(tree: "etree.Element") -> Dict[str, Optional[str]]:
# "article:published_time" content="2016-03-31T19:58:24+00:00" />
# "article:modified_time" content="2016-04-01T18:31:53+00:00" />
+ # Search for Twitter Card (twitter:) meta tags, e.g.:
+ #
+ # "twitter:site" : "@matrixdotorg"
+ # "twitter:creator" : "@matrixdotorg"
+ #
+ # Twitter card tags also duplicate Open Graph tags.
+ #
+ # See https://developer.twitter.com/en/docs/twitter-for-websites/cards/guides/getting-started
+ twitter = _get_meta_tags(tree, "name", "twitter", _map_twitter_to_open_graph)
+ # Merge the Twitter values with the Open Graph values, but do not overwrite
+ # information from Open Graph tags.
+ for key, value in twitter.items():
+ if key not in og:
+ og[key] = value
if "og:title" not in og:
# Attempt to find a title from the title tag, or the biggest header on the page.
title = tree.xpath("((//title)[1] | (//h1)[1] | (//h2)[1] | (//h3)[1])/text()")