2 files changed, 26 insertions, 18 deletions
diff --git a/synapse/handlers/auth.py b/synapse/handlers/auth.py
index 916632c7d7..61fe56032a 100644
--- a/synapse/handlers/auth.py
+++ b/synapse/handlers/auth.py
@@ -432,11 +432,15 @@ class AuthHandler(BaseHandler):
         Returns:
             True if the user_id successfully authenticated
         """
-        defer.returnValue((
-            (yield self._check_ldap_password(user_id, password))
-            or
-            (yield self._check_local_password(user_id, password))
-        ))
+        valid_ldap = yield self._check_ldap_password(user_id, password)
+        if valid_ldap:
+            defer.returnValue(True)
+
+        valid_local_password = yield self._check_local_password(user_id, password)
+        if valid_local_password:
+            defer.returnValue(True)
+
+        defer.returnValue(False)
 
     @defer.inlineCallbacks
     def _check_local_password(self, user_id, password):
diff --git a/synapse/rest/media/v1/preview_url_resource.py b/synapse/rest/media/v1/preview_url_resource.py
index c27ba72735..9bb7c72cfc 100644
--- a/synapse/rest/media/v1/preview_url_resource.py
+++ b/synapse/rest/media/v1/preview_url_resource.py
@@ -179,23 +179,27 @@ class PreviewUrlResource(BaseMediaResource):
         elif self._is_html(media_info['media_type']):
             # TODO: somehow stop a big HTML tree from exploding synapse's RAM
 
-            from lxml import html
+            from lxml import etree
+
+            file = open(media_info['filename'])
+            body = file.read()
+            file.close()
+
+            # use the charset from the content-type, or default to utf-8.
+            # XXX: this overrides any <meta/> or XML charset declarations in the
+            # body, which may pose problems, but so far seems to work okay.
+            match = re.match(r'.*; *charset=(.*?)(;|$)', media_info['media_type'], re.I)
+            encoding = match.group(1) if match else "utf-8"
 
             try:
-                tree = html.parse(media_info['filename'])
+                parser = etree.HTMLParser(recover=True, encoding=encoding)
+                tree = etree.fromstring(body, parser)
                 og = yield self._calc_og(tree, media_info, requester)
             except UnicodeDecodeError:
-                # XXX: evil evil bodge
-                # Empirically, sites like google.com mix Latin-1 and utf-8
-                # encodings in the same page.  The rogue Latin-1 characters
-                # cause lxml to choke with a UnicodeDecodeError, so if we
-                # see this we go and do a manual decode of the HTML before
-                # handing it to lxml as utf-8 encoding, counter-intuitively,
-                # which seems to make it happier...
-                file = open(media_info['filename'])
-                body = file.read()
-                file.close()
-                tree = html.fromstring(body.decode('utf-8', 'ignore'))
+                # blindly decode the body as utf-8, dropping any invalid bytes,
+                # which seems to fix the charset mismatches on https://google.com
+                parser = etree.HTMLParser(recover=True, encoding=encoding)
+                tree = etree.fromstring(body.decode('utf-8', 'ignore'), parser)
                 og = yield self._calc_og(tree, media_info, requester)
 
         else: