3 files changed, 18 insertions, 2 deletions
diff --git a/changelog.d/10410.bugfix b/changelog.d/10410.bugfix
new file mode 100644
index 0000000000..65b418fd35
--- /dev/null
+++ b/changelog.d/10410.bugfix
@@ -0,0 +1 @@
+Improve character set detection in URL previews by supporting underscores (in addition to hyphens). Contributed by @srividyut.
diff --git a/synapse/rest/media/v1/preview_url_resource.py b/synapse/rest/media/v1/preview_url_resource.py
index 172212ee3a..0f051d4041 100644
--- a/synapse/rest/media/v1/preview_url_resource.py
+++ b/synapse/rest/media/v1/preview_url_resource.py
@@ -58,9 +58,11 @@ if TYPE_CHECKING:
logger = logging.getLogger(__name__)
-_charset_match = re.compile(br'<\s*meta[^>]*charset\s*=\s*"?([a-z0-9-]+)"?', flags=re.I)
+_charset_match = re.compile(
+ br'<\s*meta[^>]*charset\s*=\s*"?([a-z0-9_-]+)"?', flags=re.I
+)
_xml_encoding_match = re.compile(
- br'\s*<\s*\?\s*xml[^>]*encoding="([a-z0-9-]+)"', flags=re.I
+ br'\s*<\s*\?\s*xml[^>]*encoding="([a-z0-9_-]+)"', flags=re.I
)
_content_type_match = re.compile(r'.*; *charset="?(.*?)"?(;|$)', flags=re.I)
diff --git a/tests/test_preview.py b/tests/test_preview.py
index cac3d81ac1..48e792b55b 100644
--- a/tests/test_preview.py
+++ b/tests/test_preview.py
@@ -325,6 +325,19 @@ class MediaEncodingTestCase(unittest.TestCase):
)
self.assertEqual(encoding, "ascii")
+ def test_meta_charset_underscores(self):
+ """A character encoding contains underscore."""
+ encoding = get_html_media_encoding(
+ b"""
+ <html>
+ <head><meta charset="Shift_JIS">
+ </head>
+ </html>
+ """,
+ "text/html",
+ )
+ self.assertEqual(encoding, "Shift_JIS")
+
def test_xml_encoding(self):
"""A character encoding is found via the meta tag."""
encoding = get_html_media_encoding(
|