diff options
-rw-r--r-- | synapse/handlers/auth.py | 14 | ||||
-rw-r--r-- | synapse/rest/media/v1/preview_url_resource.py | 30 |
2 files changed, 26 insertions, 18 deletions
diff --git a/synapse/handlers/auth.py b/synapse/handlers/auth.py index 916632c7d7..61fe56032a 100644 --- a/synapse/handlers/auth.py +++ b/synapse/handlers/auth.py @@ -432,11 +432,15 @@ class AuthHandler(BaseHandler): Returns: True if the user_id successfully authenticated """ - defer.returnValue(( - (yield self._check_ldap_password(user_id, password)) - or - (yield self._check_local_password(user_id, password)) - )) + valid_ldap = yield self._check_ldap_password(user_id, password) + if valid_ldap: + defer.returnValue(True) + + valid_local_password = yield self._check_local_password(user_id, password) + if valid_local_password: + defer.returnValue(True) + + defer.returnValue(False) @defer.inlineCallbacks def _check_local_password(self, user_id, password): diff --git a/synapse/rest/media/v1/preview_url_resource.py b/synapse/rest/media/v1/preview_url_resource.py index c27ba72735..9bb7c72cfc 100644 --- a/synapse/rest/media/v1/preview_url_resource.py +++ b/synapse/rest/media/v1/preview_url_resource.py @@ -179,23 +179,27 @@ class PreviewUrlResource(BaseMediaResource): elif self._is_html(media_info['media_type']): # TODO: somehow stop a big HTML tree from exploding synapse's RAM - from lxml import html + from lxml import etree + + file = open(media_info['filename']) + body = file.read() + file.close() + + # clobber the encoding from the content-type, or default to utf-8 + # XXX: this overrides any <meta/> or XML charset headers in the body + # which may pose problems, but so far seems to work okay. + match = re.match(r'.*; *charset=(.*?)(;|$)', media_info['media_type'], re.I) + encoding = match.group(1) if match else "utf-8" try: - tree = html.parse(media_info['filename']) + parser = etree.HTMLParser(recover=True, encoding=encoding) + tree = etree.fromstring(body, parser) og = yield self._calc_og(tree, media_info, requester) except UnicodeDecodeError: - # XXX: evil evil bodge - # Empirically, sites like google.com mix Latin-1 and utf-8 - # encodings in the same page. The rogue Latin-1 characters - # cause lxml to choke with a UnicodeDecodeError, so if we - # see this we go and do a manual decode of the HTML before - # handing it to lxml as utf-8 encoding, counter-intuitively, - # which seems to make it happier... - file = open(media_info['filename']) - body = file.read() - file.close() - tree = html.fromstring(body.decode('utf-8', 'ignore')) + # blindly try decoding the body as utf-8, which seems to fix + # the charset mismatches on https://google.com + parser = etree.HTMLParser(recover=True, encoding=encoding) + tree = etree.fromstring(body.decode('utf-8', 'ignore'), parser) og = yield self._calc_og(tree, media_info, requester) else: |