diff --git a/synapse/rest/media/v1/preview_url_resource.py b/synapse/rest/media/v1/preview_url_resource.py
index bf3be653aa..ae53b1d23f 100644
--- a/synapse/rest/media/v1/preview_url_resource.py
+++ b/synapse/rest/media/v1/preview_url_resource.py
@@ -58,7 +58,10 @@ if TYPE_CHECKING:
logger = logging.getLogger(__name__)
-_charset_match = re.compile(br"<\s*meta[^>]*charset\s*=\s*([a-z0-9-]+)", flags=re.I)
+_charset_match = re.compile(br'<\s*meta[^>]*charset\s*=\s*"?([a-z0-9-]+)"?', flags=re.I)
+_xml_encoding_match = re.compile(
+ br'\s*<\s*\?\s*xml[^>]*encoding="([a-z0-9-]+)"', flags=re.I
+)
_content_type_match = re.compile(r'.*; *charset="?(.*?)"?(;|$)', flags=re.I)
OG_TAG_NAME_MAXLEN = 50
@@ -300,24 +303,7 @@ class PreviewUrlResource(DirectServeJsonResource):
with open(media_info["filename"], "rb") as file:
body = file.read()
- encoding = None
-
- # Let's try and figure out if it has an encoding set in a meta tag.
- # Limit it to the first 1kb, since it ought to be in the meta tags
- # at the top.
- match = _charset_match.search(body[:1000])
-
- # If we find a match, it should take precedence over the
- # Content-Type header, so set it here.
- if match:
- encoding = match.group(1).decode("ascii")
-
- # If we don't find a match, we'll look at the HTTP Content-Type, and
- # if that doesn't exist, we'll fall back to UTF-8.
- if not encoding:
- content_match = _content_type_match.match(media_info["media_type"])
- encoding = content_match.group(1) if content_match else "utf-8"
-
+ encoding = get_html_media_encoding(body, media_info["media_type"])
og = decode_and_calc_og(body, media_info["uri"], encoding)
# pre-cache the image for posterity
@@ -689,6 +675,48 @@ class PreviewUrlResource(DirectServeJsonResource):
logger.debug("No media removed from url cache")
+def get_html_media_encoding(body: bytes, content_type: str) -> str:
+ """
+ Get the character encoding of the body, based on the (presumably) HTML content or the Content-Type header.
+
+ The precedence used for finding a character encoding is:
+
+ 1. A meta tag with a declared charset.
+ 2. The XML document's character encoding attribute.
+ 3. The Content-Type header.
+ 4. Fall back to UTF-8.
+
+ Args:
+ body: The HTML document, as bytes.
+ content_type: The Content-Type header.
+
+ Returns:
+ The character encoding of the body, as a string.
+ """
+ # Limit searches to the first 1kb, since any encoding declaration ought to be near the top.
+ body_start = body[:1024]
+
+ # Let's try and figure out if it has an encoding set in a meta tag.
+ match = _charset_match.search(body_start)
+ if match:
+ return match.group(1).decode("ascii")
+
+ # TODO Support <meta http-equiv="Content-Type" content="text/html; charset=utf-8"/>
+
+ # If we didn't find a match, see if it is an XML document with an encoding.
+ match = _xml_encoding_match.match(body_start)
+ if match:
+ return match.group(1).decode("ascii")
+
+ # If we don't find a match, we'll look at the HTTP Content-Type, and
+ # if that doesn't exist, we'll fall back to UTF-8.
+ content_match = _content_type_match.match(content_type)
+ if content_match:
+ return content_match.group(1)
+
+ return "utf-8"
+
+
def decode_and_calc_og(
body: bytes, media_uri: str, request_encoding: Optional[str] = None
) -> Dict[str, Optional[str]]:
@@ -725,6 +753,11 @@ def decode_and_calc_og(
def _attempt_calc_og(body_attempt: Union[bytes, str]) -> Dict[str, Optional[str]]:
# Attempt to parse the body. If this fails, log and return no metadata.
tree = etree.fromstring(body_attempt, parser)
+
+ # The data was successfully parsed, but no tree was found.
+ if tree is None:
+ return {}
+
return _calc_og(tree, media_uri)
# Attempt to parse the body. If this fails, log and return no metadata.
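As a quick sanity check of the new helper's precedence order, a minimal sketch (assuming a checkout with this patch applied; the sample bodies and Content-Type values are invented for illustration):

    from synapse.rest.media.v1.preview_url_resource import get_html_media_encoding

    # 1. A meta charset declaration wins, quoted or not.
    assert get_html_media_encoding(b'<meta charset="windows-1252">', "text/html") == "windows-1252"

    # 2. Failing that, an XML declaration's encoding attribute.
    assert get_html_media_encoding(b'<?xml version="1.0" encoding="iso-8859-1"?>', "text/html") == "iso-8859-1"

    # 3. Failing that, the charset parameter of the Content-Type header.
    assert get_html_media_encoding(b"<html></html>", 'text/html; charset="ascii"') == "ascii"

    # 4. Otherwise, fall back to UTF-8.
    assert get_html_media_encoding(b"<html></html>", "text/html") == "utf-8"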