diff options
author | Patrick Cloke <clokep@users.noreply.github.com> | 2021-01-26 07:32:17 -0500 |
---|---|---|
committer | GitHub <noreply@github.com> | 2021-01-26 07:32:17 -0500 |
commit | 4937fe3d6be94222b02760866496781f8cc88751 (patch) | |
tree | 40dd8def98be0b515054d67776a1452d2cc41a21 /tests/test_preview.py | |
parent | Update isort to v5.7.0 (#9222) (diff) | |
download | synapse-4937fe3d6be94222b02760866496781f8cc88751.tar.xz |
Try to recover from unknown encodings when previewing media. (#9164)
Treat unknown encodings (according to lxml) as UTF-8 when generating a preview for HTML documents. This isn't fully accurate, but will hopefully give a reasonable title and summary.
Diffstat (limited to '')
-rw-r--r-- | tests/test_preview.py | 29 |
1 file changed, 29 insertions, 0 deletions
```diff
diff --git a/tests/test_preview.py b/tests/test_preview.py
index c19facc1cb..0c6cbbd921 100644
--- a/tests/test_preview.py
+++ b/tests/test_preview.py
@@ -261,3 +261,32 @@ class PreviewUrlTestCase(unittest.TestCase):
         html = ""
         og = decode_and_calc_og(html, "http://example.com/test.html")
         self.assertEqual(og, {})
+
+    def test_invalid_encoding(self):
+        """An invalid character encoding should be ignored and treated as UTF-8, if possible."""
+        html = """
+        <html>
+        <head><title>Foo</title></head>
+        <body>
+        Some text.
+        </body>
+        </html>
+        """
+        og = decode_and_calc_og(
+            html, "http://example.com/test.html", "invalid-encoding"
+        )
+        self.assertEqual(og, {"og:title": "Foo", "og:description": "Some text."})
+
+    def test_invalid_encoding2(self):
+        """A body which doesn't match the sent character encoding."""
+        # Note that this contains an invalid UTF-8 sequence in the title.
+        html = b"""
+        <html>
+        <head><title>\xff\xff Foo</title></head>
+        <body>
+        Some text.
+        </body>
+        </html>
+        """
+        og = decode_and_calc_og(html, "http://example.com/test.html")
+        self.assertEqual(og, {"og:title": "ÿÿ Foo", "og:description": "Some text."})
```