summary refs log tree commit diff
path: root/synapse
diff options
context:
space:
mode:
author: Matthew Hodgson <matthew@matrix.org> 2016-04-15 13:19:57 +0100
committer: Matthew Hodgson <matthew@matrix.org> 2016-04-15 13:20:08 +0100
commit: 84f9cac4d0a7f19b432e683981f66c20339a60f5 (patch)
tree: 09843c3d696552e5db69f0cab65482407ebbaf34 /synapse
parent: Merge pull request #731 from matrix-org/erikj/timed_otu (diff)
download: synapse-84f9cac4d0a7f19b432e683981f66c20339a60f5.tar.xz
Fix Cyrillic URL previews by hardcoding all page decoding to UTF-8 for now, rather than relying on lxml's heuristics, which seem to get it wrong
Diffstat (limited to 'synapse')
-rw-r--r-- synapse/rest/media/v1/preview_url_resource.py 24
1 files changed, 8 insertions, 16 deletions
diff --git a/synapse/rest/media/v1/preview_url_resource.py b/synapse/rest/media/v1/preview_url_resource.py

index c27ba72735..7e937b0446 100644
--- a/synapse/rest/media/v1/preview_url_resource.py
+++ b/synapse/rest/media/v1/preview_url_resource.py
@@ -181,22 +181,14 @@ class PreviewUrlResource(BaseMediaResource):
             from lxml import html
-            try:
-                tree = html.parse(media_info['filename'])
-                og = yield self._calc_og(tree, media_info, requester)
-            except UnicodeDecodeError:
-                # XXX: evil evil bodge
-                # Empirically, sites like google.com mix Latin-1 and utf-8
-                # encodings in the same page. The rogue Latin-1 characters
-                # cause lxml to choke with a UnicodeDecodeError, so if we
-                # see this we go and do a manual decode of the HTML before
-                # handing it to lxml as utf-8 encoding, counter-intuitively,
-                # which seems to make it happier...
-                file = open(media_info['filename'])
-                body = file.read()
-                file.close()
-                tree = html.fromstring(body.decode('utf-8', 'ignore'))
-                og = yield self._calc_og(tree, media_info, requester)
+            # XXX: always manually try to decode body as utf-8 first, which
+            # seems to help with most character encoding woes.
+            # XXX: handle non-utf-8 encodings?
+            file = open(media_info['filename'])
+            body = file.read()
+            file.close()
+            tree = html.fromstring(body.decode('utf-8', 'ignore'))
+            og = yield self._calc_og(tree, media_info, requester)
         else:
             logger.warn("Failed to find any OG data in %s", url)