diff --git a/changelog.d/4183.bugfix b/changelog.d/4183.bugfix
new file mode 100644
index 0000000000..3e9ba3826f
--- /dev/null
+++ b/changelog.d/4183.bugfix
@@ -0,0 +1 @@
+URL previews now correctly decode non-UTF-8 text if the page's `<head>` contains a `<meta http-equiv="Content-Type" ...>` tag declaring the charset.
diff --git a/synapse/rest/media/v1/preview_url_resource.py b/synapse/rest/media/v1/preview_url_resource.py
index 91d1dafe64..9b15699e4d 100644
--- a/synapse/rest/media/v1/preview_url_resource.py
+++ b/synapse/rest/media/v1/preview_url_resource.py
@@ -53,6 +53,9 @@ from ._base import FileInfo
logger = logging.getLogger(__name__)
+_charset_match = re.compile(br'<\s*meta[^>]*charset\s*=\s*["\']?([\w-]+)', flags=re.I)
+_content_type_match = re.compile(r'.*; *charset="?(.*?)"?(;|$)', flags=re.I)
+
class PreviewUrlResource(Resource):
isLeaf = True
@@ -223,15 +226,25 @@ class PreviewUrlResource(Resource):
with open(media_info['filename'], 'rb') as file:
body = file.read()
- # clobber the encoding from the content-type, or default to utf-8
- # XXX: this overrides any <meta/> or XML charset headers in the body
- # which may pose problems, but so far seems to work okay.
- match = re.match(
- r'.*; *charset="?(.*?)"?(;|$)',
- media_info['media_type'],
- re.I
- )
- encoding = match.group(1) if match else "utf-8"
+ encoding = None
+
+ # Let's try and figure out if it has an encoding set in a meta tag.
+ # Limit it to the first 1kb, since it ought to be in the meta tags
+ # at the top.
+ match = _charset_match.search(body[:1000])
+
+ # If we find a match, it should take precedence over the
+ # Content-Type header, so set it here.
+ if match:
+ encoding = match.group(1).decode('ascii')
+
+ # If we don't find a match, we'll look at the HTTP Content-Type, and
+ # if that doesn't exist, we'll fall back to UTF-8.
+ if not encoding:
+ match = _content_type_match.match(
+ media_info['media_type']
+ )
+ encoding = match.group(1) if match else "utf-8"
og = decode_and_calc_og(body, media_info['uri'], encoding)
diff --git a/tests/rest/media/v1/test_url_preview.py b/tests/rest/media/v1/test_url_preview.py
index 29579cf091..86c813200a 100644
--- a/tests/rest/media/v1/test_url_preview.py
+++ b/tests/rest/media/v1/test_url_preview.py
@@ -162,3 +162,80 @@ class URLPreviewTests(unittest.HomeserverTestCase):
self.assertEqual(
channel.json_body, {"og:title": "~matrix~", "og:description": "hi"}
)
+
+ def test_non_ascii_preview_httpequiv(self):
+
+ request, channel = self.make_request(
+ "GET", "url_preview?url=matrix.org", shorthand=False
+ )
+ request.render(self.preview_url)
+ self.pump()
+
+ # We've made one fetch
+ self.assertEqual(len(self.fetches), 1)
+
+ end_content = (
+ b'<html><head>'
+ b'<meta http-equiv="Content-Type" content="text/html; charset=windows-1251"/>'
+ b'<meta property="og:title" content="\xe4\xea\xe0" />'
+ b'<meta property="og:description" content="hi" />'
+ b'</head></html>'
+ )
+
+ self.fetches[0][0].callback(
+ (
+ end_content,
+ (
+ len(end_content),
+ {
+ b"Content-Length": [b"%d" % (len(end_content))],
+ # This charset="utf8" header should be ignored, because the
+ # document has a <meta> tag overriding it.
+ b"Content-Type": [b'text/html; charset="utf8"'],
+ },
+ "https://example.com",
+ 200,
+ ),
+ )
+ )
+
+ self.pump()
+ self.assertEqual(channel.code, 200)
+ self.assertEqual(channel.json_body["og:title"], u"\u0434\u043a\u0430")
+
+ def test_non_ascii_preview_content_type(self):
+
+ request, channel = self.make_request(
+ "GET", "url_preview?url=matrix.org", shorthand=False
+ )
+ request.render(self.preview_url)
+ self.pump()
+
+ # We've made one fetch
+ self.assertEqual(len(self.fetches), 1)
+
+ end_content = (
+ b'<html><head>'
+ b'<meta property="og:title" content="\xe4\xea\xe0" />'
+ b'<meta property="og:description" content="hi" />'
+ b'</head></html>'
+ )
+
+ self.fetches[0][0].callback(
+ (
+ end_content,
+ (
+ len(end_content),
+ {
+ b"Content-Length": [b"%d" % (len(end_content))],
+ b"Content-Type": [b'text/html; charset="windows-1251"'],
+ },
+ "https://example.com",
+ 200,
+ ),
+ )
+ )
+
+ self.pump()
+ self.assertEqual(channel.code, 200)
+ self.assertEqual(channel.json_body["og:title"], u"\u0434\u043a\u0430")
|