summary refs log tree commit diff
diff options
context:
space:
mode:
-rw-r--r--synapse/rest/media/v1/preview_url_resource.py30
1 files changed, 17 insertions, 13 deletions
diff --git a/synapse/rest/media/v1/preview_url_resource.py b/synapse/rest/media/v1/preview_url_resource.py
index c27ba72735..9bb7c72cfc 100644
--- a/synapse/rest/media/v1/preview_url_resource.py
+++ b/synapse/rest/media/v1/preview_url_resource.py
@@ -179,23 +179,27 @@ class PreviewUrlResource(BaseMediaResource):
         elif self._is_html(media_info['media_type']):
             # TODO: somehow stop a big HTML tree from exploding synapse's RAM
 
-            from lxml import html
+            from lxml import etree
+
+            file = open(media_info['filename'])
+            body = file.read()
+            file.close()
+
+            # clobber the encoding from the content-type, or default to utf-8
+            # XXX: this overrides any <meta/> or XML charset headers in the body
+            # which may pose problems, but so far seems to work okay.
+            match = re.match(r'.*; *charset=(.*?)(;|$)', media_info['media_type'], re.I)
+            encoding = match.group(1) if match else "utf-8"
 
             try:
-                tree = html.parse(media_info['filename'])
+                parser = etree.HTMLParser(recover=True, encoding=encoding)
+                tree = etree.fromstring(body, parser)
                 og = yield self._calc_og(tree, media_info, requester)
             except UnicodeDecodeError:
-                # XXX: evil evil bodge
-                # Empirically, sites like google.com mix Latin-1 and utf-8
-                # encodings in the same page.  The rogue Latin-1 characters
-                # cause lxml to choke with a UnicodeDecodeError, so if we
-                # see this we go and do a manual decode of the HTML before
-                # handing it to lxml as utf-8 encoding, counter-intuitively,
-                # which seems to make it happier...
-                file = open(media_info['filename'])
-                body = file.read()
-                file.close()
-                tree = html.fromstring(body.decode('utf-8', 'ignore'))
+                # blindly try decoding the body as utf-8, which seems to fix
+                # the charset mismatches on https://google.com
+                parser = etree.HTMLParser(recover=True, encoding=encoding)
+                tree = etree.fromstring(body.decode('utf-8', 'ignore'), parser)
                 og = yield self._calc_og(tree, media_info, requester)
 
         else: