diff options
-rw-r--r-- | changelog.d/11196.bugfix | 1 | ||||
-rw-r--r-- | synapse/rest/media/v1/preview_url_resource.py | 9 | ||||
-rw-r--r-- | tests/test_preview.py | 15 |
3 files changed, 22 insertions, 3 deletions
diff --git a/changelog.d/11196.bugfix b/changelog.d/11196.bugfix new file mode 100644 index 0000000000..3861eeb908 --- /dev/null +++ b/changelog.d/11196.bugfix @@ -0,0 +1 @@ +Fix a bug introduced in v1.46.0rc1 where URL previews of some XML documents would fail. diff --git a/synapse/rest/media/v1/preview_url_resource.py b/synapse/rest/media/v1/preview_url_resource.py index 278fd901e2..8ca97b5b18 100644 --- a/synapse/rest/media/v1/preview_url_resource.py +++ b/synapse/rest/media/v1/preview_url_resource.py @@ -718,9 +718,12 @@ def decode_body( if not body: return None + # The idea here is that multiple encodings are tried until one works. + # Unfortunately the result is never used and then LXML will decode the string + # again with the found encoding. for encoding in get_html_media_encodings(body, content_type): try: - body_str = body.decode(encoding) + body.decode(encoding) except Exception: pass else: @@ -732,11 +735,11 @@ def decode_body( from lxml import etree # Create an HTML parser. - parser = etree.HTMLParser(recover=True, encoding="utf-8") + parser = etree.HTMLParser(recover=True, encoding=encoding) # Attempt to parse the body. Returns None if the body was successfully # parsed, but no tree was found. - return etree.fromstring(body_str, parser) + return etree.fromstring(body, parser) def _calc_og(tree: "etree.Element", media_uri: str) -> Dict[str, Optional[str]]: diff --git a/tests/test_preview.py b/tests/test_preview.py index 9a576f9a4e..40b89fb2ef 100644 --- a/tests/test_preview.py +++ b/tests/test_preview.py @@ -277,6 +277,21 @@ class CalcOgTestCase(unittest.TestCase): tree = decode_body(html, "http://example.com/test.html") self.assertIsNone(tree) + def test_xml(self): + """Test decoding XML and ensure it works properly.""" + # Note that the strip() call is important to ensure the xml tag starts + # at the initial byte. + html = b""" + <?xml version="1.0" encoding="UTF-8"?> + + <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd"> + <html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en"> + <head><title>Foo</title></head><body>Some text.</body></html> + """.strip() + tree = decode_body(html, "http://example.com/test.html") + og = _calc_og(tree, "http://example.com/test.html") + self.assertEqual(og, {"og:title": "Foo", "og:description": "Some text."}) + def test_invalid_encoding(self): """An invalid character encoding should be ignored and treated as UTF-8, if possible.""" html = b""" |