Handle additional errors when previewing URLs. (#9333)

* Handle the case of lxml not finding a document tree. * Parse the document encoding from the XML tag.
author: Patrick Cloke <clokep@users.noreply.github.com> 2021-02-08 12:33:30 -0500
committer: GitHub <noreply@github.com> 2021-02-08 12:33:30 -0500
commit: 0963d39ea66be95d44f8f4ae29fa6f33fc6740b4 (patch)
tree: f0a24fd1cf8d2889a25704a9b1bcd374a37ed99e /tests
parent: Merge pull request #9150 from Yoric/develop-context (diff)
download: synapse-0963d39ea66be95d44f8f4ae29fa6f33fc6740b4.tar.xz
1 files changed, 92 insertions, 11 deletions
diff --git a/tests/test_preview.py b/tests/test_preview.py
index 0c6cbbd921..ea83299918 100644
--- a/tests/test_preview.py
+++ b/tests/test_preview.py
@@ -15,6 +15,7 @@
 
 from synapse.rest.media.v1.preview_url_resource import (
     decode_and_calc_og,
+    get_html_media_encoding,
     summarize_paragraphs,
 )
 
@@ -26,7 +27,7 @@ except ImportError:
     lxml = None
 
 
-class PreviewTestCase(unittest.TestCase):
+class SummarizeTestCase(unittest.TestCase):
     if not lxml:
         skip = "url preview feature requires lxml"
 
@@ -144,12 +145,12 @@ class PreviewTestCase(unittest.TestCase):
         )
 
 
-class PreviewUrlTestCase(unittest.TestCase):
+class CalcOgTestCase(unittest.TestCase):
     if not lxml:
         skip = "url preview feature requires lxml"
 
     def test_simple(self):
-        html = """
+        html = b"""
         <html>
         <head><title>Foo</title></head>
         <body>
@@ -163,7 +164,7 @@ class PreviewUrlTestCase(unittest.TestCase):
         self.assertEqual(og, {"og:title": "Foo", "og:description": "Some text."})
 
     def test_comment(self):
-        html = """
+        html = b"""
         <html>
         <head><title>Foo</title></head>
         <body>
@@ -178,7 +179,7 @@ class PreviewUrlTestCase(unittest.TestCase):
         self.assertEqual(og, {"og:title": "Foo", "og:description": "Some text."})
 
     def test_comment2(self):
-        html = """
+        html = b"""
         <html>
         <head><title>Foo</title></head>
         <body>
@@ -202,7 +203,7 @@ class PreviewUrlTestCase(unittest.TestCase):
         )
 
     def test_script(self):
-        html = """
+        html = b"""
         <html>
         <head><title>Foo</title></head>
         <body>
@@ -217,7 +218,7 @@ class PreviewUrlTestCase(unittest.TestCase):
         self.assertEqual(og, {"og:title": "Foo", "og:description": "Some text."})
 
     def test_missing_title(self):
-        html = """
+        html = b"""
         <html>
         <body>
         Some text.
@@ -230,7 +231,7 @@ class PreviewUrlTestCase(unittest.TestCase):
         self.assertEqual(og, {"og:title": None, "og:description": "Some text."})
 
     def test_h1_as_title(self):
-        html = """
+        html = b"""
         <html>
         <meta property="og:description" content="Some text."/>
         <body>
@@ -244,7 +245,7 @@ class PreviewUrlTestCase(unittest.TestCase):
         self.assertEqual(og, {"og:title": "Title", "og:description": "Some text."})
 
     def test_missing_title_and_broken_h1(self):
-        html = """
+        html = b"""
         <html>
         <body>
         <h1><a href="foo"/></h1>
@@ -258,13 +259,20 @@ class PreviewUrlTestCase(unittest.TestCase):
         self.assertEqual(og, {"og:title": None, "og:description": "Some text."})
 
     def test_empty(self):
-        html = ""
+        """Test a body with no data in it."""
+        html = b""
+        og = decode_and_calc_og(html, "http://example.com/test.html")
+        self.assertEqual(og, {})
+
+    def test_no_tree(self):
+        """A valid body with no tree in it."""
+        html = b"\x00"
         og = decode_and_calc_og(html, "http://example.com/test.html")
         self.assertEqual(og, {})
 
     def test_invalid_encoding(self):
         """An invalid character encoding should be ignored and treated as UTF-8, if possible."""
-        html = """
+        html = b"""
         <html>
         <head><title>Foo</title></head>
         <body>
@@ -290,3 +298,76 @@ class PreviewUrlTestCase(unittest.TestCase):
         """
         og = decode_and_calc_og(html, "http://example.com/test.html")
         self.assertEqual(og, {"og:title": "ÿÿ Foo", "og:description": "Some text."})
+
+
+class MediaEncodingTestCase(unittest.TestCase):
+    def test_meta_charset(self):
+        """A character encoding is found via the meta tag."""
+        encoding = get_html_media_encoding(
+            b"""
+        <html>
+        <head><meta charset="ascii">
+        </head>
+        </html>
+        """,
+            "text/html",
+        )
+        self.assertEqual(encoding, "ascii")
+
+        # A less well-formed version.
+        encoding = get_html_media_encoding(
+            b"""
+        <html>
+        <head>< meta charset = ascii>
+        </head>
+        </html>
+        """,
+            "text/html",
+        )
+        self.assertEqual(encoding, "ascii")
+
+    def test_xml_encoding(self):
+        """A character encoding is found via the meta tag."""
+        encoding = get_html_media_encoding(
+            b"""
+        <?xml version="1.0" encoding="ascii"?>
+        <html>
+        </html>
+        """,
+            "text/html",
+        )
+        self.assertEqual(encoding, "ascii")
+
+    def test_meta_xml_encoding(self):
+        """Meta tags take precedence over XML encoding."""
+        encoding = get_html_media_encoding(
+            b"""
+        <?xml version="1.0" encoding="ascii"?>
+        <html>
+        <head><meta charset="UTF-16">
+        </head>
+        </html>
+        """,
+            "text/html",
+        )
+        self.assertEqual(encoding, "UTF-16")
+
+    def test_content_type(self):
+        """A character encoding is found via the Content-Type header."""
+        # Test a few variations of the header.
+        headers = (
+            'text/html; charset="ascii";',
+            "text/html;charset=ascii;",
+            'text/html;  charset="ascii"',
+            "text/html; charset=ascii",
+            'text/html; charset="ascii;',
+            'text/html; charset=ascii";',
+        )
+        for header in headers:
+            encoding = get_html_media_encoding(b"", header)
+            self.assertEqual(encoding, "ascii")
+
+    def test_fallback(self):
+        """A character encoding cannot be found in the body or header."""
+        encoding = get_html_media_encoding(b"", "text/html")
+        self.assertEqual(encoding, "utf-8")
author	Patrick Cloke <clokep@users.noreply.github.com>	2021-02-08 12:33:30 -0500
committer	GitHub <noreply@github.com>	2021-02-08 12:33:30 -0500
commit	0963d39ea66be95d44f8f4ae29fa6f33fc6740b4 (patch)
tree	f0a24fd1cf8d2889a25704a9b1bcd374a37ed99e /tests
parent	Merge pull request #9150 from Yoric/develop-context (diff)
download	synapse-0963d39ea66be95d44f8f4ae29fa6f33fc6740b4.tar.xz