summary refs log tree commit diff
diff options
context:
space:
mode:
-rw-r--r-- changelog.d/10410.bugfix 1
-rw-r--r-- synapse/rest/media/v1/preview_url_resource.py 6
-rw-r--r-- tests/test_preview.py 13
3 files changed, 18 insertions, 2 deletions
diff --git a/changelog.d/10410.bugfix b/changelog.d/10410.bugfix
new file mode 100644
index 0000000000..65b418fd35
--- /dev/null
+++ b/changelog.d/10410.bugfix
@@ -0,0 +1 @@
+Improve character set detection in URL previews by supporting underscores (in addition to hyphens). Contributed by @srividyut.
diff --git a/synapse/rest/media/v1/preview_url_resource.py b/synapse/rest/media/v1/preview_url_resource.py
index 172212ee3a..0f051d4041 100644
--- a/synapse/rest/media/v1/preview_url_resource.py
+++ b/synapse/rest/media/v1/preview_url_resource.py
@@ -58,9 +58,11 @@ if TYPE_CHECKING:
 
 logger = logging.getLogger(__name__)
 
-_charset_match = re.compile(br'<\s*meta[^>]*charset\s*=\s*"?([a-z0-9-]+)"?', flags=re.I)
+_charset_match = re.compile(
+    br'<\s*meta[^>]*charset\s*=\s*"?([a-z0-9_-]+)"?', flags=re.I
+)
 _xml_encoding_match = re.compile(
-    br'\s*<\s*\?\s*xml[^>]*encoding="([a-z0-9-]+)"', flags=re.I
+    br'\s*<\s*\?\s*xml[^>]*encoding="([a-z0-9_-]+)"', flags=re.I
 )
 _content_type_match = re.compile(r'.*; *charset="?(.*?)"?(;|$)', flags=re.I)
 
diff --git a/tests/test_preview.py b/tests/test_preview.py
index cac3d81ac1..48e792b55b 100644
--- a/tests/test_preview.py
+++ b/tests/test_preview.py
@@ -325,6 +325,19 @@ class MediaEncodingTestCase(unittest.TestCase):
         )
         self.assertEqual(encoding, "ascii")
 
+    def test_meta_charset_underscores(self):
+        """A character encoding contains underscore."""
+        encoding = get_html_media_encoding(
+            b"""
+        <html>
+        <head><meta charset="Shift_JIS">
+        </head>
+        </html>
+        """,
+            "text/html",
+        )
+        self.assertEqual(encoding, "Shift_JIS")
+
     def test_xml_encoding(self):
         """A character encoding is found via the meta tag."""
         encoding = get_html_media_encoding(