diff --git a/tests/test_preview.py b/tests/test_preview.py
index 0c6cbbd921..ea83299918 100644
--- a/tests/test_preview.py
+++ b/tests/test_preview.py
@@ -15,6 +15,7 @@
from synapse.rest.media.v1.preview_url_resource import (
decode_and_calc_og,
+ get_html_media_encoding,
summarize_paragraphs,
)
@@ -26,7 +27,7 @@ except ImportError:
lxml = None
-class PreviewTestCase(unittest.TestCase):
+class SummarizeTestCase(unittest.TestCase):
if not lxml:
skip = "url preview feature requires lxml"
@@ -144,12 +145,12 @@ class PreviewTestCase(unittest.TestCase):
)
-class PreviewUrlTestCase(unittest.TestCase):
+class CalcOgTestCase(unittest.TestCase):
if not lxml:
skip = "url preview feature requires lxml"
def test_simple(self):
- html = """
+ html = b"""
<html>
<head><title>Foo</title></head>
<body>
@@ -163,7 +164,7 @@ class PreviewUrlTestCase(unittest.TestCase):
self.assertEqual(og, {"og:title": "Foo", "og:description": "Some text."})
def test_comment(self):
- html = """
+ html = b"""
<html>
<head><title>Foo</title></head>
<body>
@@ -178,7 +179,7 @@ class PreviewUrlTestCase(unittest.TestCase):
self.assertEqual(og, {"og:title": "Foo", "og:description": "Some text."})
def test_comment2(self):
- html = """
+ html = b"""
<html>
<head><title>Foo</title></head>
<body>
@@ -202,7 +203,7 @@ class PreviewUrlTestCase(unittest.TestCase):
)
def test_script(self):
- html = """
+ html = b"""
<html>
<head><title>Foo</title></head>
<body>
@@ -217,7 +218,7 @@ class PreviewUrlTestCase(unittest.TestCase):
self.assertEqual(og, {"og:title": "Foo", "og:description": "Some text."})
def test_missing_title(self):
- html = """
+ html = b"""
<html>
<body>
Some text.
@@ -230,7 +231,7 @@ class PreviewUrlTestCase(unittest.TestCase):
self.assertEqual(og, {"og:title": None, "og:description": "Some text."})
def test_h1_as_title(self):
- html = """
+ html = b"""
<html>
<meta property="og:description" content="Some text."/>
<body>
@@ -244,7 +245,7 @@ class PreviewUrlTestCase(unittest.TestCase):
self.assertEqual(og, {"og:title": "Title", "og:description": "Some text."})
def test_missing_title_and_broken_h1(self):
- html = """
+ html = b"""
<html>
<body>
<h1><a href="foo"/></h1>
@@ -258,13 +259,20 @@ class PreviewUrlTestCase(unittest.TestCase):
self.assertEqual(og, {"og:title": None, "og:description": "Some text."})
def test_empty(self):
- html = ""
+ """Test a body with no data in it."""
+ html = b""
+ og = decode_and_calc_og(html, "http://example.com/test.html")
+ self.assertEqual(og, {})
+
+ def test_no_tree(self):
+ """A valid body with no tree in it."""
+ html = b"\x00"
og = decode_and_calc_og(html, "http://example.com/test.html")
self.assertEqual(og, {})
def test_invalid_encoding(self):
"""An invalid character encoding should be ignored and treated as UTF-8, if possible."""
- html = """
+ html = b"""
<html>
<head><title>Foo</title></head>
<body>
@@ -290,3 +298,76 @@ class PreviewUrlTestCase(unittest.TestCase):
"""
og = decode_and_calc_og(html, "http://example.com/test.html")
self.assertEqual(og, {"og:title": "ÿÿ Foo", "og:description": "Some text."})
+
+
+class MediaEncodingTestCase(unittest.TestCase):
+ def test_meta_charset(self):
+ """A character encoding is found via the meta tag."""
+ encoding = get_html_media_encoding(
+ b"""
+ <html>
+ <head><meta charset="ascii">
+ </head>
+ </html>
+ """,
+ "text/html",
+ )
+ self.assertEqual(encoding, "ascii")
+
+ # A less well-formed version.
+ encoding = get_html_media_encoding(
+ b"""
+ <html>
+ <head>< meta charset = ascii>
+ </head>
+ </html>
+ """,
+ "text/html",
+ )
+ self.assertEqual(encoding, "ascii")
+
+ def test_xml_encoding(self):
+ """A character encoding is found via the meta tag."""
+ encoding = get_html_media_encoding(
+ b"""
+ <?xml version="1.0" encoding="ascii"?>
+ <html>
+ </html>
+ """,
+ "text/html",
+ )
+ self.assertEqual(encoding, "ascii")
+
+ def test_meta_xml_encoding(self):
+ """Meta tags take precedence over XML encoding."""
+ encoding = get_html_media_encoding(
+ b"""
+ <?xml version="1.0" encoding="ascii"?>
+ <html>
+ <head><meta charset="UTF-16">
+ </head>
+ </html>
+ """,
+ "text/html",
+ )
+ self.assertEqual(encoding, "UTF-16")
+
+ def test_content_type(self):
+ """A character encoding is found via the Content-Type header."""
+ # Test a few variations of the header.
+ headers = (
+ 'text/html; charset="ascii";',
+ "text/html;charset=ascii;",
+ 'text/html; charset="ascii"',
+ "text/html; charset=ascii",
+ 'text/html; charset="ascii;',
+ 'text/html; charset=ascii";',
+ )
+ for header in headers:
+ encoding = get_html_media_encoding(b"", header)
+ self.assertEqual(encoding, "ascii")
+
+ def test_fallback(self):
+ """A character encoding cannot be found in the body or header."""
+ encoding = get_html_media_encoding(b"", "text/html")
+ self.assertEqual(encoding, "utf-8")
|