diff options
author | Patrick Cloke <clokep@users.noreply.github.com> | 2021-02-08 12:33:30 -0500 |
---|---|---|
committer | GitHub <noreply@github.com> | 2021-02-08 12:33:30 -0500 |
commit | 0963d39ea66be95d44f8f4ae29fa6f33fc6740b4 (patch) | |
tree | f0a24fd1cf8d2889a25704a9b1bcd374a37ed99e /tests | |
parent | Merge pull request #9150 from Yoric/develop-context (diff) | |
download | synapse-0963d39ea66be95d44f8f4ae29fa6f33fc6740b4.tar.xz |
Handle additional errors when previewing URLs. (#9333)
* Handle the case of lxml not finding a document tree. * Parse the document encoding from the XML tag.
Diffstat (limited to 'tests')
-rw-r--r-- | tests/test_preview.py | 103 |
1 files changed, 92 insertions, 11 deletions
diff --git a/tests/test_preview.py b/tests/test_preview.py index 0c6cbbd921..ea83299918 100644 --- a/tests/test_preview.py +++ b/tests/test_preview.py @@ -15,6 +15,7 @@ from synapse.rest.media.v1.preview_url_resource import ( decode_and_calc_og, + get_html_media_encoding, summarize_paragraphs, ) @@ -26,7 +27,7 @@ except ImportError: lxml = None -class PreviewTestCase(unittest.TestCase): +class SummarizeTestCase(unittest.TestCase): if not lxml: skip = "url preview feature requires lxml" @@ -144,12 +145,12 @@ class PreviewTestCase(unittest.TestCase): ) -class PreviewUrlTestCase(unittest.TestCase): +class CalcOgTestCase(unittest.TestCase): if not lxml: skip = "url preview feature requires lxml" def test_simple(self): - html = """ + html = b""" <html> <head><title>Foo</title></head> <body> @@ -163,7 +164,7 @@ class PreviewUrlTestCase(unittest.TestCase): self.assertEqual(og, {"og:title": "Foo", "og:description": "Some text."}) def test_comment(self): - html = """ + html = b""" <html> <head><title>Foo</title></head> <body> @@ -178,7 +179,7 @@ class PreviewUrlTestCase(unittest.TestCase): self.assertEqual(og, {"og:title": "Foo", "og:description": "Some text."}) def test_comment2(self): - html = """ + html = b""" <html> <head><title>Foo</title></head> <body> @@ -202,7 +203,7 @@ class PreviewUrlTestCase(unittest.TestCase): ) def test_script(self): - html = """ + html = b""" <html> <head><title>Foo</title></head> <body> @@ -217,7 +218,7 @@ class PreviewUrlTestCase(unittest.TestCase): self.assertEqual(og, {"og:title": "Foo", "og:description": "Some text."}) def test_missing_title(self): - html = """ + html = b""" <html> <body> Some text. @@ -230,7 +231,7 @@ class PreviewUrlTestCase(unittest.TestCase): self.assertEqual(og, {"og:title": None, "og:description": "Some text."}) def test_h1_as_title(self): - html = """ + html = b""" <html> <meta property="og:description" content="Some text."/> <body> @@ -244,7 +245,7 @@ class PreviewUrlTestCase(unittest.TestCase): self.assertEqual(og, {"og:title": "Title", "og:description": "Some text."}) def test_missing_title_and_broken_h1(self): - html = """ + html = b""" <html> <body> <h1><a href="foo"/></h1> @@ -258,13 +259,20 @@ class PreviewUrlTestCase(unittest.TestCase): self.assertEqual(og, {"og:title": None, "og:description": "Some text."}) def test_empty(self): - html = "" + """Test a body with no data in it.""" + html = b"" + og = decode_and_calc_og(html, "http://example.com/test.html") + self.assertEqual(og, {}) + + def test_no_tree(self): + """A valid body with no tree in it.""" + html = b"\x00" og = decode_and_calc_og(html, "http://example.com/test.html") self.assertEqual(og, {}) def test_invalid_encoding(self): """An invalid character encoding should be ignored and treated as UTF-8, if possible.""" - html = """ + html = b""" <html> <head><title>Foo</title></head> <body> @@ -290,3 +298,76 @@ class PreviewUrlTestCase(unittest.TestCase): """ og = decode_and_calc_og(html, "http://example.com/test.html") self.assertEqual(og, {"og:title": "ÿÿ Foo", "og:description": "Some text."}) + + +class MediaEncodingTestCase(unittest.TestCase): + def test_meta_charset(self): + """A character encoding is found via the meta tag.""" + encoding = get_html_media_encoding( + b""" + <html> + <head><meta charset="ascii"> + </head> + </html> + """, + "text/html", + ) + self.assertEqual(encoding, "ascii") + + # A less well-formed version. + encoding = get_html_media_encoding( + b""" + <html> + <head>< meta charset = ascii> + </head> + </html> + """, + "text/html", + ) + self.assertEqual(encoding, "ascii") + + def test_xml_encoding(self): + """A character encoding is found via the meta tag.""" + encoding = get_html_media_encoding( + b""" + <?xml version="1.0" encoding="ascii"?> + <html> + </html> + """, + "text/html", + ) + self.assertEqual(encoding, "ascii") + + def test_meta_xml_encoding(self): + """Meta tags take precedence over XML encoding.""" + encoding = get_html_media_encoding( + b""" + <?xml version="1.0" encoding="ascii"?> + <html> + <head><meta charset="UTF-16"> + </head> + </html> + """, + "text/html", + ) + self.assertEqual(encoding, "UTF-16") + + def test_content_type(self): + """A character encoding is found via the Content-Type header.""" + # Test a few variations of the header. + headers = ( + 'text/html; charset="ascii";', + "text/html;charset=ascii;", + 'text/html; charset="ascii"', + "text/html; charset=ascii", + 'text/html; charset="ascii;', + 'text/html; charset=ascii";', + ) + for header in headers: + encoding = get_html_media_encoding(b"", header) + self.assertEqual(encoding, "ascii") + + def test_fallback(self): + """A character encoding cannot be found in the body or header.""" + encoding = get_html_media_encoding(b"", "text/html") + self.assertEqual(encoding, "utf-8") |