diff options
author | Matthew Hodgson <matthew@matrix.org> | 2016-04-15 14:32:25 +0100 |
---|---|---|
committer | Matthew Hodgson <matthew@matrix.org> | 2016-04-15 14:32:25 +0100 |
commit | aaabbd3e9e514b3779b2004ff8e9f74dd9dc4b6a (patch) | |
tree | 5f07ca3f761e77fefb1dd9e464c0f2cd1fe7f68a /synapse | |
parent | fix cyrillic URL previews by hardcoding all page decoding to UTF-8 for now, r... (diff) | |
download | synapse-aaabbd3e9e514b3779b2004ff8e9f74dd9dc4b6a.tar.xz |
explicitly pass in the charset from Content-Type to lxml to fix cyrillic woes better
Diffstat (limited to 'synapse')
-rw-r--r-- | synapse/rest/media/v1/preview_url_resource.py | 24 |
1 file changed, 18 insertions, 6 deletions
diff --git a/synapse/rest/media/v1/preview_url_resource.py b/synapse/rest/media/v1/preview_url_resource.py
index 7e937b0446..9bb7c72cfc 100644
--- a/synapse/rest/media/v1/preview_url_resource.py
+++ b/synapse/rest/media/v1/preview_url_resource.py
@@ -179,16 +179,28 @@ class PreviewUrlResource(BaseMediaResource):
 elif self._is_html(media_info['media_type']):
     # TODO: somehow stop a big HTML tree from exploding synapse's RAM
 
-    from lxml import html
+    from lxml import etree
 
-    # XXX: always manually try to decode body as utf-8 first, which
-    # seems to help with most character encoding woes.
-    # XXX: handle non-utf-8 encodings?
     file = open(media_info['filename'])
     body = file.read()
     file.close()
-    tree = html.fromstring(body.decode('utf-8', 'ignore'))
-    og = yield self._calc_og(tree, media_info, requester)
+
+    # clobber the encoding from the content-type, or default to utf-8
+    # XXX: this overrides any <meta/> or XML charset headers in the body
+    # which may pose problems, but so far seems to work okay.
+    match = re.match(r'.*; *charset=(.*?)(;|$)', media_info['media_type'], re.I)
+    encoding = match.group(1) if match else "utf-8"
+
+    try:
+        parser = etree.HTMLParser(recover=True, encoding=encoding)
+        tree = etree.fromstring(body, parser)
+        og = yield self._calc_og(tree, media_info, requester)
+    except UnicodeDecodeError:
+        # blindly try decoding the body as utf-8, which seems to fix
+        # the charset mismatches on https://google.com
+        parser = etree.HTMLParser(recover=True, encoding=encoding)
+        tree = etree.fromstring(body.decode('utf-8', 'ignore'), parser)
+        og = yield self._calc_og(tree, media_info, requester)
 
 else:
     logger.warn("Failed to find any OG data in %s", url)