diff --git a/synapse/handlers/auth.py b/synapse/handlers/auth.py
index 916632c7d7..61fe56032a 100644
--- a/synapse/handlers/auth.py
+++ b/synapse/handlers/auth.py
@@ -432,11 +432,15 @@ class AuthHandler(BaseHandler):
Returns:
True if the user_id successfully authenticated
"""
- defer.returnValue((
- (yield self._check_ldap_password(user_id, password))
- or
- (yield self._check_local_password(user_id, password))
- ))
+ valid_ldap = yield self._check_ldap_password(user_id, password)
+ if valid_ldap:
+ defer.returnValue(True)
+
+ valid_local_password = yield self._check_local_password(user_id, password)
+ if valid_local_password:
+ defer.returnValue(True)
+
+ defer.returnValue(False)
@defer.inlineCallbacks
def _check_local_password(self, user_id, password):
diff --git a/synapse/rest/media/v1/preview_url_resource.py b/synapse/rest/media/v1/preview_url_resource.py
index c27ba72735..9bb7c72cfc 100644
--- a/synapse/rest/media/v1/preview_url_resource.py
+++ b/synapse/rest/media/v1/preview_url_resource.py
@@ -179,23 +179,27 @@ class PreviewUrlResource(BaseMediaResource):
elif self._is_html(media_info['media_type']):
# TODO: somehow stop a big HTML tree from exploding synapse's RAM
- from lxml import html
+ from lxml import etree
+
+ file = open(media_info['filename'])
+ body = file.read()
+ file.close()
+
+ # force the parser to use the content-type's charset, or default to utf-8
+ # XXX: this overrides any <meta/> or XML charset headers in the body
+ # which may pose problems, but so far seems to work okay.
+ match = re.match(r'.*; *charset=(.*?)(;|$)', media_info['media_type'], re.I)
+ encoding = match.group(1) if match else "utf-8"
try:
- tree = html.parse(media_info['filename'])
+ parser = etree.HTMLParser(recover=True, encoding=encoding)
+ tree = etree.fromstring(body, parser)
og = yield self._calc_og(tree, media_info, requester)
except UnicodeDecodeError:
- # XXX: evil evil bodge
- # Empirically, sites like google.com mix Latin-1 and utf-8
- # encodings in the same page. The rogue Latin-1 characters
- # cause lxml to choke with a UnicodeDecodeError, so if we
- # see this we go and do a manual decode of the HTML before
- # handing it to lxml as utf-8 encoding, counter-intuitively,
- # which seems to make it happier...
- file = open(media_info['filename'])
- body = file.read()
- file.close()
- tree = html.fromstring(body.decode('utf-8', 'ignore'))
+ # blindly decode the body as utf-8, dropping invalid bytes, which seems to fix
+ # the charset mismatches on https://google.com
+ parser = etree.HTMLParser(recover=True, encoding=encoding)
+ tree = etree.fromstring(body.decode('utf-8', 'ignore'), parser)
og = yield self._calc_og(tree, media_info, requester)
else:
|