diff --git a/changelog.d/11196.bugfix b/changelog.d/11196.bugfix
new file mode 100644
index 0000000000..3861eeb908
--- /dev/null
+++ b/changelog.d/11196.bugfix
@@ -0,0 +1 @@
+Fix a bug introduced in v1.46.0rc1 where URL previews of some XML documents would fail.
diff --git a/synapse/rest/media/v1/preview_url_resource.py b/synapse/rest/media/v1/preview_url_resource.py
index 278fd901e2..8ca97b5b18 100644
--- a/synapse/rest/media/v1/preview_url_resource.py
+++ b/synapse/rest/media/v1/preview_url_resource.py
@@ -718,9 +718,12 @@ def decode_body(
if not body:
return None
+ # Try each candidate encoding in turn until one decodes the body without error.
+ # The decoded string itself is discarded: only the working encoding is kept, and
+ # lxml then decodes the raw bytes a second time using that encoding.
for encoding in get_html_media_encodings(body, content_type):
try:
- body_str = body.decode(encoding)
+ body.decode(encoding)
except Exception:
pass
else:
@@ -732,11 +735,11 @@ def decode_body(
from lxml import etree
# Create an HTML parser.
- parser = etree.HTMLParser(recover=True, encoding="utf-8")
+ parser = etree.HTMLParser(recover=True, encoding=encoding)
# Attempt to parse the body. Returns None if the body was successfully
# parsed, but no tree was found.
- return etree.fromstring(body_str, parser)
+ return etree.fromstring(body, parser)
def _calc_og(tree: "etree.Element", media_uri: str) -> Dict[str, Optional[str]]:
diff --git a/tests/test_preview.py b/tests/test_preview.py
index 9a576f9a4e..40b89fb2ef 100644
--- a/tests/test_preview.py
+++ b/tests/test_preview.py
@@ -277,6 +277,21 @@ class CalcOgTestCase(unittest.TestCase):
tree = decode_body(html, "http://example.com/test.html")
self.assertIsNone(tree)
+ def test_xml(self):
+ """Test decoding XML and ensure it works properly."""
+ # Note that the strip() call is important: it ensures the XML declaration
+ # starts at the very first byte of the body.
+ html = b"""
+ <?xml version="1.0" encoding="UTF-8"?>
+
+ <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
+ <html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
+ <head><title>Foo</title></head><body>Some text.</body></html>
+ """.strip()
+ tree = decode_body(html, "http://example.com/test.html")
+ og = _calc_og(tree, "http://example.com/test.html")
+ self.assertEqual(og, {"og:title": "Foo", "og:description": "Some text."})
+
def test_invalid_encoding(self):
"""An invalid character encoding should be ignored and treated as UTF-8, if possible."""
html = b"""
|