diff options
author | Amber Brown <hawkowl@atleastfornow.net> | 2018-11-15 11:05:08 -0600 |
---|---|---|
committer | GitHub <noreply@github.com> | 2018-11-15 11:05:08 -0600 |
commit | df758e155dac18602c34f63df56907de081a7220 (patch) | |
tree | 838b4e9b7e885c95a05d3fda18186850a629292c | |
parent | Add a coveragerc (#4180) (diff) | |
download | synapse-df758e155dac18602c34f63df56907de081a7220.tar.xz |
Use <meta> tags to discover the per-page encoding of html previews (#4183)
-rw-r--r-- | changelog.d/4183.bugfix | 1 | ||||
-rw-r--r-- | synapse/rest/media/v1/preview_url_resource.py | 31 | ||||
-rw-r--r-- | tests/rest/media/v1/test_url_preview.py | 77 |
3 files changed, 100 insertions, 9 deletions
diff --git a/changelog.d/4183.bugfix b/changelog.d/4183.bugfix new file mode 100644 index 0000000000..3e9ba3826f --- /dev/null +++ b/changelog.d/4183.bugfix @@ -0,0 +1 @@ +URL previews now correctly decode non-UTF-8 text if the header contains a `<meta http-equiv="Content-Type"` header. diff --git a/synapse/rest/media/v1/preview_url_resource.py b/synapse/rest/media/v1/preview_url_resource.py index 91d1dafe64..9b15699e4d 100644 --- a/synapse/rest/media/v1/preview_url_resource.py +++ b/synapse/rest/media/v1/preview_url_resource.py @@ -53,6 +53,9 @@ from ._base import FileInfo logger = logging.getLogger(__name__) +_charset_match = re.compile(br"<\s*meta[^>]*charset\s*=\s*([a-z0-9-]+)", flags=re.I) +_content_type_match = re.compile(r'.*; *charset="?(.*?)"?(;|$)', flags=re.I) + class PreviewUrlResource(Resource): isLeaf = True @@ -223,15 +226,25 @@ class PreviewUrlResource(Resource): with open(media_info['filename'], 'rb') as file: body = file.read() - # clobber the encoding from the content-type, or default to utf-8 - # XXX: this overrides any <meta/> or XML charset headers in the body - # which may pose problems, but so far seems to work okay. - match = re.match( - r'.*; *charset="?(.*?)"?(;|$)', - media_info['media_type'], - re.I - ) - encoding = match.group(1) if match else "utf-8" + encoding = None + + # Let's try and figure out if it has an encoding set in a meta tag. + # Limit it to the first 1kb, since it ought to be in the meta tags + # at the top. + match = _charset_match.search(body[:1000]) + + # If we find a match, it should take precedence over the + # Content-Type header, so set it here. + if match: + encoding = match.group(1).decode('ascii') + + # If we don't find a match, we'll look at the HTTP Content-Type, and + # if that doesn't exist, we'll fall back to UTF-8. + if not encoding: + match = _content_type_match.match( + media_info['media_type'] + ) + encoding = match.group(1) if match else "utf-8" og = decode_and_calc_og(body, media_info['uri'], encoding) diff --git a/tests/rest/media/v1/test_url_preview.py b/tests/rest/media/v1/test_url_preview.py index 29579cf091..86c813200a 100644 --- a/tests/rest/media/v1/test_url_preview.py +++ b/tests/rest/media/v1/test_url_preview.py @@ -162,3 +162,80 @@ class URLPreviewTests(unittest.HomeserverTestCase): self.assertEqual( channel.json_body, {"og:title": "~matrix~", "og:description": "hi"} ) + + def test_non_ascii_preview_httpequiv(self): + + request, channel = self.make_request( + "GET", "url_preview?url=matrix.org", shorthand=False + ) + request.render(self.preview_url) + self.pump() + + # We've made one fetch + self.assertEqual(len(self.fetches), 1) + + end_content = ( + b'<html><head>' + b'<meta http-equiv="Content-Type" content="text/html; charset=windows-1251"/>' + b'<meta property="og:title" content="\xe4\xea\xe0" />' + b'<meta property="og:description" content="hi" />' + b'</head></html>' + ) + + self.fetches[0][0].callback( + ( + end_content, + ( + len(end_content), + { + b"Content-Length": [b"%d" % (len(end_content))], + # This charset=utf-8 should be ignored, because the + # document has a meta tag overriding it. + b"Content-Type": [b'text/html; charset="utf8"'], + }, + "https://example.com", + 200, + ), + ) + ) + + self.pump() + self.assertEqual(channel.code, 200) + self.assertEqual(channel.json_body["og:title"], u"\u0434\u043a\u0430") + + def test_non_ascii_preview_content_type(self): + + request, channel = self.make_request( + "GET", "url_preview?url=matrix.org", shorthand=False + ) + request.render(self.preview_url) + self.pump() + + # We've made one fetch + self.assertEqual(len(self.fetches), 1) + + end_content = ( + b'<html><head>' + b'<meta property="og:title" content="\xe4\xea\xe0" />' + b'<meta property="og:description" content="hi" />' + b'</head></html>' + ) + + self.fetches[0][0].callback( + ( + end_content, + ( + len(end_content), + { + b"Content-Length": [b"%d" % (len(end_content))], + b"Content-Type": [b'text/html; charset="windows-1251"'], + }, + "https://example.com", + 200, + ), + ) + ) + + self.pump() + self.assertEqual(channel.code, 200) + self.assertEqual(channel.json_body["og:title"], u"\u0434\u043a\u0430") |