diff options
author | sri-vidyut <srividyut@hotmail.com> | 2021-07-28 02:29:42 +0900 |
---|---|---|
committer | GitHub <noreply@github.com> | 2021-07-27 17:29:42 +0000 |
commit | 8e1febc6a1e909eeb4334d5572956f669ee2d290 (patch) | |
tree | ebdf8521585d22567cf12dc1784bcf9288a39d6b | |
parent | Fix `oldest_pdu_in_federation_staging` (#10455) (diff) | |
download | synapse-8e1febc6a1e909eeb4334d5572956f669ee2d290.tar.xz |
Support underscores (in addition to hyphens) for charset detection. (#10410)
Diffstat (limited to '')
-rw-r--r-- | changelog.d/10410.bugfix | 1 | ||||
-rw-r--r-- | synapse/rest/media/v1/preview_url_resource.py | 6 | ||||
-rw-r--r-- | tests/test_preview.py | 13 |
3 files changed, 18 insertions, 2 deletions
diff --git a/changelog.d/10410.bugfix b/changelog.d/10410.bugfix
new file mode 100644
index 0000000000..65b418fd35
--- /dev/null
+++ b/changelog.d/10410.bugfix
@@ -0,0 +1 @@
+Improve character set detection in URL previews by supporting underscores (in addition to hyphens). Contributed by @srividyut.
diff --git a/synapse/rest/media/v1/preview_url_resource.py b/synapse/rest/media/v1/preview_url_resource.py
index 172212ee3a..0f051d4041 100644
--- a/synapse/rest/media/v1/preview_url_resource.py
+++ b/synapse/rest/media/v1/preview_url_resource.py
@@ -58,9 +58,11 @@ if TYPE_CHECKING:
 
 logger = logging.getLogger(__name__)
 
-_charset_match = re.compile(br'<\s*meta[^>]*charset\s*=\s*"?([a-z0-9-]+)"?', flags=re.I)
+_charset_match = re.compile(
+    br'<\s*meta[^>]*charset\s*=\s*"?([a-z0-9_-]+)"?', flags=re.I
+)
 _xml_encoding_match = re.compile(
-    br'\s*<\s*\?\s*xml[^>]*encoding="([a-z0-9-]+)"', flags=re.I
+    br'\s*<\s*\?\s*xml[^>]*encoding="([a-z0-9_-]+)"', flags=re.I
 )
 _content_type_match = re.compile(r'.*; *charset="?(.*?)"?(;|$)', flags=re.I)
 
diff --git a/tests/test_preview.py b/tests/test_preview.py
index cac3d81ac1..48e792b55b 100644
--- a/tests/test_preview.py
+++ b/tests/test_preview.py
@@ -325,6 +325,19 @@ class MediaEncodingTestCase(unittest.TestCase):
         )
         self.assertEqual(encoding, "ascii")
 
+    def test_meta_charset_underscores(self):
+        """A character encoding contains underscore."""
+        encoding = get_html_media_encoding(
+            b"""
+        <html>
+        <head><meta charset="Shift_JIS">
+        </head>
+        </html>
+        """,
+            "text/html",
+        )
+        self.assertEqual(encoding, "Shift_JIS")
+
     def test_xml_encoding(self):
         """A character encoding is found via the meta tag."""
         encoding = get_html_media_encoding(