diff options
author | Amber Brown <hawkowl@atleastfornow.net> | 2018-11-15 11:05:08 -0600 |
---|---|---|
committer | Andrew Morgan <andrew@amorgan.xyz> | 2019-02-13 14:24:42 +0000 |
commit | dc768f208ec5a336e8d36e1c153be974e107abba (patch) | |
tree | 7e52c3767eff84e16165df9b4b62a37e908c42cb /synapse/rest/media | |
parent | Add a coveragerc (#4180) (diff) | |
download | synapse-dc768f208ec5a336e8d36e1c153be974e107abba.tar.xz |
Use <meta> tags to discover the per-page encoding of html previews (#4183)
Diffstat (limited to 'synapse/rest/media')
-rw-r--r-- | synapse/rest/media/v1/preview_url_resource.py | 31 |
1 files changed, 22 insertions, 9 deletions
```diff
diff --git a/synapse/rest/media/v1/preview_url_resource.py b/synapse/rest/media/v1/preview_url_resource.py
index 91d1dafe64..9b15699e4d 100644
--- a/synapse/rest/media/v1/preview_url_resource.py
+++ b/synapse/rest/media/v1/preview_url_resource.py
@@ -53,6 +53,9 @@ from ._base import FileInfo
 
 logger = logging.getLogger(__name__)
 
+_charset_match = re.compile(br"<\s*meta[^>]*charset\s*=\s*([a-z0-9-]+)", flags=re.I)
+_content_type_match = re.compile(r'.*; *charset="?(.*?)"?(;|$)', flags=re.I)
+
 
 class PreviewUrlResource(Resource):
     isLeaf = True
@@ -223,15 +226,25 @@ class PreviewUrlResource(Resource):
             with open(media_info['filename'], 'rb') as file:
                 body = file.read()
 
-            # clobber the encoding from the content-type, or default to utf-8
-            # XXX: this overrides any <meta/> or XML charset headers in the body
-            # which may pose problems, but so far seems to work okay.
-            match = re.match(
-                r'.*; *charset="?(.*?)"?(;|$)',
-                media_info['media_type'],
-                re.I
-            )
-            encoding = match.group(1) if match else "utf-8"
+            encoding = None
+
+            # Let's try and figure out if it has an encoding set in a meta tag.
+            # Limit it to the first 1kb, since it ought to be in the meta tags
+            # at the top.
+            match = _charset_match.search(body[:1000])
+
+            # If we find a match, it should take precedence over the
+            # Content-Type header, so set it here.
+            if match:
+                encoding = match.group(1).decode('ascii')
+
+            # If we don't find a match, we'll look at the HTTP Content-Type, and
+            # if that doesn't exist, we'll fall back to UTF-8.
+            if not encoding:
+                match = _content_type_match.match(
+                    media_info['media_type']
+                )
+                encoding = match.group(1) if match else "utf-8"
 
             og = decode_and_calc_og(body, media_info['uri'], encoding)
 
```