diff options
author | Matthew Hodgson <matthew@matrix.org> | 2016-04-15 13:19:57 +0100 |
---|---|---|
committer | Matthew Hodgson <matthew@matrix.org> | 2016-04-15 13:20:08 +0100 |
commit | 84f9cac4d0a7f19b432e683981f66c20339a60f5 (patch) | |
tree | 09843c3d696552e5db69f0cab65482407ebbaf34 /synapse/rest/media/v1 | |
parent | Merge pull request #731 from matrix-org/erikj/timed_otu (diff) | |
download | synapse-84f9cac4d0a7f19b432e683981f66c20339a60f5.tar.xz |
Fix Cyrillic URL previews by hardcoding all page decoding to UTF-8 for now, rather than relying on lxml's heuristics, which seem to get it wrong.
Diffstat (limited to 'synapse/rest/media/v1')
-rw-r--r-- | synapse/rest/media/v1/preview_url_resource.py | 24 |
1 files changed, 8 insertions, 16 deletions
```diff
diff --git a/synapse/rest/media/v1/preview_url_resource.py b/synapse/rest/media/v1/preview_url_resource.py
index c27ba72735..7e937b0446 100644
--- a/synapse/rest/media/v1/preview_url_resource.py
+++ b/synapse/rest/media/v1/preview_url_resource.py
@@ -181,22 +181,14 @@ class PreviewUrlResource(BaseMediaResource):
 
                 from lxml import html
 
-                try:
-                    tree = html.parse(media_info['filename'])
-                    og = yield self._calc_og(tree, media_info, requester)
-                except UnicodeDecodeError:
-                    # XXX: evil evil bodge
-                    # Empirically, sites like google.com mix Latin-1 and utf-8
-                    # encodings in the same page. The rogue Latin-1 characters
-                    # cause lxml to choke with a UnicodeDecodeError, so if we
-                    # see this we go and do a manual decode of the HTML before
-                    # handing it to lxml as utf-8 encoding, counter-intuitively,
-                    # which seems to make it happier...
-                    file = open(media_info['filename'])
-                    body = file.read()
-                    file.close()
-                    tree = html.fromstring(body.decode('utf-8', 'ignore'))
-                    og = yield self._calc_og(tree, media_info, requester)
+                # XXX: always manually try to decode body as utf-8 first, which
+                # seems to help with most character encoding woes.
+                # XXX: handle non-utf-8 encodings?
+                file = open(media_info['filename'])
+                body = file.read()
+                file.close()
+                tree = html.fromstring(body.decode('utf-8', 'ignore'))
+                og = yield self._calc_og(tree, media_info, requester)
             else:
                 logger.warn("Failed to find any OG data in %s", url)
 
```