Fix Cyrillic URL previews by hardcoding all page decoding to UTF-8 for now, rather than relying on lxml's charset heuristics, which seem to get it wrong
1 file changed, 8 insertions(+), 16 deletions(-)
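
The change boils down to: read the raw page bytes, force a UTF-8 decode with undecodable bytes dropped, and hand the resulting text to lxml, instead of letting lxml guess the charset from the file. A minimal standalone sketch of that approach (the helper name parse_html_utf8 is illustrative, not part of this patch):

    from lxml import html

    def parse_html_utf8(filename):
        # Read raw bytes and decode as UTF-8, dropping any bytes that are
        # not valid UTF-8, so lxml never has to guess the page's charset.
        with open(filename, 'rb') as f:
            body = f.read()
        return html.fromstring(body.decode('utf-8', 'ignore'))
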
diff --git a/synapse/rest/media/v1/preview_url_resource.py b/synapse/rest/media/v1/preview_url_resource.py
index c27ba72735..7e937b0446 100644
--- a/synapse/rest/media/v1/preview_url_resource.py
+++ b/synapse/rest/media/v1/preview_url_resource.py
@@ -181,22 +181,14 @@ class PreviewUrlResource(BaseMediaResource):
from lxml import html
- try:
- tree = html.parse(media_info['filename'])
- og = yield self._calc_og(tree, media_info, requester)
- except UnicodeDecodeError:
- # XXX: evil evil bodge
- # Empirically, sites like google.com mix Latin-1 and utf-8
- # encodings in the same page. The rogue Latin-1 characters
- # cause lxml to choke with a UnicodeDecodeError, so if we
- # see this we go and do a manual decode of the HTML before
- # handing it to lxml as utf-8 encoding, counter-intuitively,
- # which seems to make it happier...
- file = open(media_info['filename'])
- body = file.read()
- file.close()
- tree = html.fromstring(body.decode('utf-8', 'ignore'))
- og = yield self._calc_og(tree, media_info, requester)
+ # XXX: always manually try to decode body as utf-8 first, which
+ # seems to help with most character encoding woes.
+ # XXX: handle non-utf-8 encodings?
+ file = open(media_info['filename'])
+ body = file.read()
+ file.close()
+ tree = html.fromstring(body.decode('utf-8', 'ignore'))
+ og = yield self._calc_og(tree, media_info, requester)
else:
logger.warn("Failed to find any OG data in %s", url)