From aaabbd3e9e514b3779b2004ff8e9f74dd9dc4b6a Mon Sep 17 00:00:00 2001 From: Matthew Hodgson Date: Fri, 15 Apr 2016 14:32:25 +0100 Subject: explicitly pass in the charset from Content-Type to lxml to fix cyrillic woes better --- synapse/rest/media/v1/preview_url_resource.py | 24 ++++++++++++++++++------ 1 file changed, 18 insertions(+), 6 deletions(-) (limited to 'synapse') diff --git a/synapse/rest/media/v1/preview_url_resource.py b/synapse/rest/media/v1/preview_url_resource.py index 7e937b0446..9bb7c72cfc 100644 --- a/synapse/rest/media/v1/preview_url_resource.py +++ b/synapse/rest/media/v1/preview_url_resource.py @@ -179,16 +179,28 @@ class PreviewUrlResource(BaseMediaResource): elif self._is_html(media_info['media_type']): # TODO: somehow stop a big HTML tree from exploding synapse's RAM - from lxml import html + from lxml import etree - # XXX: always manually try to decode body as utf-8 first, which - # seems to help with most character encoding woes. - # XXX: handle non-utf-8 encodings? file = open(media_info['filename']) body = file.read() file.close() - tree = html.fromstring(body.decode('utf-8', 'ignore')) - og = yield self._calc_og(tree, media_info, requester) + + # clobber the encoding from the content-type, or default to utf-8 + # XXX: this overrides any <meta/> or XML charset headers in the body + # which may pose problems, but so far seems to work okay. 
+ match = re.match(r'.*; *charset=(.*?)(;|$)', media_info['media_type'], re.I) + encoding = match.group(1) if match else "utf-8" + + try: + parser = etree.HTMLParser(recover=True, encoding=encoding) + tree = etree.fromstring(body, parser) + og = yield self._calc_og(tree, media_info, requester) + except UnicodeDecodeError: + # blindly try decoding the body as utf-8, which seems to fix + # the charset mismatches on https://google.com + parser = etree.HTMLParser(recover=True, encoding=encoding) + tree = etree.fromstring(body.decode('utf-8', 'ignore'), parser) + og = yield self._calc_og(tree, media_info, requester) else: logger.warn("Failed to find any OG data in %s", url) -- cgit 1.4.1