Explicitly pass the charset from the Content-Type header to lxml, to better fix Cyrillic encoding woes

author: Matthew Hodgson <matthew@matrix.org> 2016-04-15 14:32:25 +0100
committer: Matthew Hodgson <matthew@matrix.org> 2016-04-15 14:32:25 +0100
commit: aaabbd3e9e514b3779b2004ff8e9f74dd9dc4b6a (patch)
tree: 5f07ca3f761e77fefb1dd9e464c0f2cd1fe7f68a /synapse/rest/media/v1
parent: fix cyrillic URL previews by hardcoding all page decoding to UTF-8 for now, r... (diff)
download: synapse-aaabbd3e9e514b3779b2004ff8e9f74dd9dc4b6a.tar.xz
1 files changed, 18 insertions, 6 deletions
diff --git a/synapse/rest/media/v1/preview_url_resource.py b/synapse/rest/media/v1/preview_url_resource.py
index 7e937b0446..9bb7c72cfc 100644
--- a/synapse/rest/media/v1/preview_url_resource.py
+++ b/synapse/rest/media/v1/preview_url_resource.py
@@ -179,16 +179,28 @@ class PreviewUrlResource(BaseMediaResource):
         elif self._is_html(media_info['media_type']):
             # TODO: somehow stop a big HTML tree from exploding synapse's RAM
 
-            from lxml import html
+            from lxml import etree
 
-            # XXX: always manually try to decode body as utf-8 first, which
-            # seems to help with most character encoding woes.
-            # XXX: handle non-utf-8 encodings?
             file = open(media_info['filename'])
             body = file.read()
             file.close()
-            tree = html.fromstring(body.decode('utf-8', 'ignore'))
-            og = yield self._calc_og(tree, media_info, requester)
+
+            # clobber the encoding from the content-type, or default to utf-8
+            # XXX: this overrides any <meta/> or XML charset headers in the body
+            # which may pose problems, but so far seems to work okay.
+            match = re.match(r'.*; *charset=(.*?)(;|$)', media_info['media_type'], re.I)
+            encoding = match.group(1) if match else "utf-8"
+
+            try:
+                parser = etree.HTMLParser(recover=True, encoding=encoding)
+                tree = etree.fromstring(body, parser)
+                og = yield self._calc_og(tree, media_info, requester)
+            except UnicodeDecodeError:
+                # blindly try decoding the body as utf-8, which seems to fix
+                # the charset mismatches on https://google.com
+                parser = etree.HTMLParser(recover=True, encoding=encoding)
+                tree = etree.fromstring(body.decode('utf-8', 'ignore'), parser)
+                og = yield self._calc_og(tree, media_info, requester)
 
         else:
             logger.warn("Failed to find any OG data in %s", url)
author	Matthew Hodgson <matthew@matrix.org>	2016-04-15 14:32:25 +0100
committer	Matthew Hodgson <matthew@matrix.org>	2016-04-15 14:32:25 +0100
commit	aaabbd3e9e514b3779b2004ff8e9f74dd9dc4b6a (patch)
tree	5f07ca3f761e77fefb1dd9e464c0f2cd1fe7f68a /synapse/rest/media/v1
parent	fix cyrillic URL previews by hardcoding all page decoding to UTF-8 for now, r... (diff)
download	synapse-aaabbd3e9e514b3779b2004ff8e9f74dd9dc4b6a.tar.xz