diff options
Diffstat (limited to 'synapse/rest/media/v1')
-rw-r--r-- | synapse/rest/media/v1/_base.py | 94 | ||||
-rw-r--r-- | synapse/rest/media/v1/download_resource.py | 3 | ||||
-rw-r--r-- | synapse/rest/media/v1/media_repository.py | 125 | ||||
-rw-r--r-- | synapse/rest/media/v1/media_storage.py | 1 | ||||
-rw-r--r-- | synapse/rest/media/v1/preview_url_resource.py | 114 | ||||
-rw-r--r-- | synapse/rest/media/v1/storage_provider.py | 6 | ||||
-rw-r--r-- | synapse/rest/media/v1/thumbnail_resource.py | 72 | ||||
-rw-r--r-- | synapse/rest/media/v1/thumbnailer.py | 19 | ||||
-rw-r--r-- | synapse/rest/media/v1/upload_resource.py | 8 |
9 files changed, 247 insertions, 195 deletions
diff --git a/synapse/rest/media/v1/_base.py b/synapse/rest/media/v1/_base.py index 5fefee4dde..3689777266 100644 --- a/synapse/rest/media/v1/_base.py +++ b/synapse/rest/media/v1/_base.py @@ -17,7 +17,6 @@ import logging import os -from six import PY3 from six.moves import urllib from twisted.internet import defer @@ -30,6 +29,22 @@ from synapse.util.stringutils import is_ascii logger = logging.getLogger(__name__) +# list all text content types that will have the charset default to UTF-8 when +# none is given +TEXT_CONTENT_TYPES = [ + "text/css", + "text/csv", + "text/html", + "text/calendar", + "text/plain", + "text/javascript", + "application/json", + "application/ld+json", + "application/rtf", + "image/svg+xml", + "text/xml", +] + def parse_media_id(request): try: @@ -96,7 +111,14 @@ def add_file_headers(request, media_type, file_size, upload_name): def _quote(x): return urllib.parse.quote(x.encode("utf-8")) - request.setHeader(b"Content-Type", media_type.encode("UTF-8")) + # Default to a UTF-8 charset for text content types. + # ex, uses UTF-8 for 'text/css' but not 'text/css; charset=UTF-16' + if media_type.lower() in TEXT_CONTENT_TYPES: + content_type = media_type + "; charset=UTF-8" + else: + content_type = media_type + + request.setHeader(b"Content-Type", content_type.encode("UTF-8")) if upload_name: # RFC6266 section 4.1 [1] defines both `filename` and `filename*`. # @@ -135,27 +157,25 @@ def add_file_headers(request, media_type, file_size, upload_name): # separators as defined in RFC2616. SP and HT are handled separately. # see _can_encode_filename_as_token. -_FILENAME_SEPARATOR_CHARS = set( - ( - "(", - ")", - "<", - ">", - "@", - ",", - ";", - ":", - "\\", - '"', - "/", - "[", - "]", - "?", - "=", - "{", - "}", - ) -) +_FILENAME_SEPARATOR_CHARS = { + "(", + ")", + "<", + ">", + "@", + ",", + ";", + ":", + "\\", + '"', + "/", + "[", + "]", + "?", + "=", + "{", + "}", +} def _can_encode_filename_as_token(x): @@ -195,7 +215,7 @@ def respond_with_responder(request, responder, media_type, file_size, upload_nam respond_404(request) return - logger.debug("Responding to media request with responder %s") + logger.debug("Responding to media request with responder %s", responder) add_file_headers(request, media_type, file_size, upload_name) try: with responder: @@ -303,23 +323,15 @@ def get_filename_from_headers(headers): upload_name_utf8 = upload_name_utf8[7:] # We have a filename*= section. This MUST be ASCII, and any UTF-8 # bytes are %-quoted. - if PY3: - try: - # Once it is decoded, we can then unquote the %-encoded - # parts strictly into a unicode string. - upload_name = urllib.parse.unquote( - upload_name_utf8.decode("ascii"), errors="strict" - ) - except UnicodeDecodeError: - # Incorrect UTF-8. - pass - else: - # On Python 2, we first unquote the %-encoded parts and then - # decode it strictly using UTF-8. - try: - upload_name = urllib.parse.unquote(upload_name_utf8).decode("utf8") - except UnicodeDecodeError: - pass + try: + # Once it is decoded, we can then unquote the %-encoded + # parts strictly into a unicode string. + upload_name = urllib.parse.unquote( + upload_name_utf8.decode("ascii"), errors="strict" + ) + except UnicodeDecodeError: + # Incorrect UTF-8. + pass # If there isn't check for an ascii name. if not upload_name: diff --git a/synapse/rest/media/v1/download_resource.py b/synapse/rest/media/v1/download_resource.py index 66a01559e1..24d3ae5bbc 100644 --- a/synapse/rest/media/v1/download_resource.py +++ b/synapse/rest/media/v1/download_resource.py @@ -50,6 +50,9 @@ class DownloadResource(DirectServeResource): b" media-src 'self';" b" object-src 'self';", ) + request.setHeader( + b"Referrer-Policy", b"no-referrer", + ) server_name, media_id, name = parse_media_id(request) if server_name == self.server_name: await self.media_repo.get_local_media(request, media_id, name) diff --git a/synapse/rest/media/v1/media_repository.py b/synapse/rest/media/v1/media_repository.py index b972e152a9..fd10d42f2f 100644 --- a/synapse/rest/media/v1/media_repository.py +++ b/synapse/rest/media/v1/media_repository.py @@ -18,12 +18,12 @@ import errno import logging import os import shutil +from typing import Dict, Tuple from six import iteritems import twisted.internet.error import twisted.web.http -from twisted.internet import defer from twisted.web.resource import Resource from synapse.api.errors import ( @@ -113,15 +113,14 @@ class MediaRepository(object): "update_recently_accessed_media", self._update_recently_accessed ) - @defer.inlineCallbacks - def _update_recently_accessed(self): + async def _update_recently_accessed(self): remote_media = self.recently_accessed_remotes self.recently_accessed_remotes = set() local_media = self.recently_accessed_locals self.recently_accessed_locals = set() - yield self.store.update_cached_last_access_time( + await self.store.update_cached_last_access_time( local_media, remote_media, self.clock.time_msec() ) @@ -137,8 +136,7 @@ class MediaRepository(object): else: self.recently_accessed_locals.add(media_id) - @defer.inlineCallbacks - def create_content( + async def create_content( self, media_type, upload_name, content, content_length, auth_user ): """Store uploaded content for a local user and return the mxc URL @@ -157,11 +155,11 @@ class MediaRepository(object): file_info = FileInfo(server_name=None, file_id=media_id) - fname = yield self.media_storage.store_file(content, file_info) + fname = await self.media_storage.store_file(content, file_info) logger.info("Stored local media in file %r", fname) - yield self.store.store_local_media( + await self.store.store_local_media( media_id=media_id, media_type=media_type, time_now_ms=self.clock.time_msec(), @@ -170,12 +168,11 @@ class MediaRepository(object): user_id=auth_user, ) - yield self._generate_thumbnails(None, media_id, media_id, media_type) + await self._generate_thumbnails(None, media_id, media_id, media_type) return "mxc://%s/%s" % (self.server_name, media_id) - @defer.inlineCallbacks - def get_local_media(self, request, media_id, name): + async def get_local_media(self, request, media_id, name): """Responds to reqests for local media, if exists, or returns 404. Args: @@ -189,7 +186,7 @@ class MediaRepository(object): Deferred: Resolves once a response has successfully been written to request """ - media_info = yield self.store.get_local_media(media_id) + media_info = await self.store.get_local_media(media_id) if not media_info or media_info["quarantined_by"]: respond_404(request) return @@ -203,13 +200,12 @@ class MediaRepository(object): file_info = FileInfo(None, media_id, url_cache=url_cache) - responder = yield self.media_storage.fetch_media(file_info) - yield respond_with_responder( + responder = await self.media_storage.fetch_media(file_info) + await respond_with_responder( request, responder, media_type, media_length, upload_name ) - @defer.inlineCallbacks - def get_remote_media(self, request, server_name, media_id, name): + async def get_remote_media(self, request, server_name, media_id, name): """Respond to requests for remote media. Args: @@ -235,8 +231,8 @@ class MediaRepository(object): # We linearize here to ensure that we don't try and download remote # media multiple times concurrently key = (server_name, media_id) - with (yield self.remote_media_linearizer.queue(key)): - responder, media_info = yield self._get_remote_media_impl( + with (await self.remote_media_linearizer.queue(key)): + responder, media_info = await self._get_remote_media_impl( server_name, media_id ) @@ -245,14 +241,13 @@ class MediaRepository(object): media_type = media_info["media_type"] media_length = media_info["media_length"] upload_name = name if name else media_info["upload_name"] - yield respond_with_responder( + await respond_with_responder( request, responder, media_type, media_length, upload_name ) else: respond_404(request) - @defer.inlineCallbacks - def get_remote_media_info(self, server_name, media_id): + async def get_remote_media_info(self, server_name, media_id): """Gets the media info associated with the remote file, downloading if necessary. @@ -273,8 +268,8 @@ class MediaRepository(object): # We linearize here to ensure that we don't try and download remote # media multiple times concurrently key = (server_name, media_id) - with (yield self.remote_media_linearizer.queue(key)): - responder, media_info = yield self._get_remote_media_impl( + with (await self.remote_media_linearizer.queue(key)): + responder, media_info = await self._get_remote_media_impl( server_name, media_id ) @@ -285,8 +280,7 @@ class MediaRepository(object): return media_info - @defer.inlineCallbacks - def _get_remote_media_impl(self, server_name, media_id): + async def _get_remote_media_impl(self, server_name, media_id): """Looks for media in local cache, if not there then attempt to download from remote server. @@ -298,7 +292,7 @@ class MediaRepository(object): Returns: Deferred[(Responder, media_info)] """ - media_info = yield self.store.get_cached_remote_media(server_name, media_id) + media_info = await self.store.get_cached_remote_media(server_name, media_id) # file_id is the ID we use to track the file locally. If we've already # seen the file then reuse the existing ID, otherwise genereate a new @@ -316,19 +310,18 @@ class MediaRepository(object): logger.info("Media is quarantined") raise NotFoundError() - responder = yield self.media_storage.fetch_media(file_info) + responder = await self.media_storage.fetch_media(file_info) if responder: return responder, media_info # Failed to find the file anywhere, lets download it. - media_info = yield self._download_remote_file(server_name, media_id, file_id) + media_info = await self._download_remote_file(server_name, media_id, file_id) - responder = yield self.media_storage.fetch_media(file_info) + responder = await self.media_storage.fetch_media(file_info) return responder, media_info - @defer.inlineCallbacks - def _download_remote_file(self, server_name, media_id, file_id): + async def _download_remote_file(self, server_name, media_id, file_id): """Attempt to download the remote file from the given server name, using the given file_id as the local id. @@ -350,7 +343,7 @@ class MediaRepository(object): ("/_matrix/media/v1/download", server_name, media_id) ) try: - length, headers = yield self.client.get_file( + length, headers = await self.client.get_file( server_name, request_path, output_stream=f, @@ -363,7 +356,7 @@ class MediaRepository(object): }, ) except RequestSendFailed as e: - logger.warn( + logger.warning( "Request failed fetching remote media %s/%s: %r", server_name, media_id, @@ -372,7 +365,7 @@ class MediaRepository(object): raise SynapseError(502, "Failed to fetch remote media") except HttpResponseException as e: - logger.warn( + logger.warning( "HTTP error fetching remote media %s/%s: %s", server_name, media_id, @@ -383,10 +376,12 @@ class MediaRepository(object): raise SynapseError(502, "Failed to fetch remote media") except SynapseError: - logger.warn("Failed to fetch remote media %s/%s", server_name, media_id) + logger.warning( + "Failed to fetch remote media %s/%s", server_name, media_id + ) raise except NotRetryingDestination: - logger.warn("Not retrying destination %r", server_name) + logger.warning("Not retrying destination %r", server_name) raise SynapseError(502, "Failed to fetch remote media") except Exception: logger.exception( @@ -394,7 +389,7 @@ class MediaRepository(object): ) raise SynapseError(502, "Failed to fetch remote media") - yield finish() + await finish() media_type = headers[b"Content-Type"][0].decode("ascii") upload_name = get_filename_from_headers(headers) @@ -402,7 +397,7 @@ class MediaRepository(object): logger.info("Stored remote media in file %r", fname) - yield self.store.store_cached_remote_media( + await self.store.store_cached_remote_media( origin=server_name, media_id=media_id, media_type=media_type, @@ -420,7 +415,7 @@ class MediaRepository(object): "filesystem_id": file_id, } - yield self._generate_thumbnails(server_name, media_id, file_id, media_type) + await self._generate_thumbnails(server_name, media_id, file_id, media_type) return media_info @@ -455,16 +450,15 @@ class MediaRepository(object): return t_byte_source - @defer.inlineCallbacks - def generate_local_exact_thumbnail( + async def generate_local_exact_thumbnail( self, media_id, t_width, t_height, t_method, t_type, url_cache ): - input_path = yield self.media_storage.ensure_media_is_in_local_cache( + input_path = await self.media_storage.ensure_media_is_in_local_cache( FileInfo(None, media_id, url_cache=url_cache) ) thumbnailer = Thumbnailer(input_path) - t_byte_source = yield defer_to_thread( + t_byte_source = await defer_to_thread( self.hs.get_reactor(), self._generate_thumbnail, thumbnailer, @@ -487,7 +481,7 @@ class MediaRepository(object): thumbnail_type=t_type, ) - output_path = yield self.media_storage.store_file( + output_path = await self.media_storage.store_file( t_byte_source, file_info ) finally: @@ -497,22 +491,21 @@ class MediaRepository(object): t_len = os.path.getsize(output_path) - yield self.store.store_local_thumbnail( + await self.store.store_local_thumbnail( media_id, t_width, t_height, t_type, t_method, t_len ) return output_path - @defer.inlineCallbacks - def generate_remote_exact_thumbnail( + async def generate_remote_exact_thumbnail( self, server_name, file_id, media_id, t_width, t_height, t_method, t_type ): - input_path = yield self.media_storage.ensure_media_is_in_local_cache( + input_path = await self.media_storage.ensure_media_is_in_local_cache( FileInfo(server_name, file_id, url_cache=False) ) thumbnailer = Thumbnailer(input_path) - t_byte_source = yield defer_to_thread( + t_byte_source = await defer_to_thread( self.hs.get_reactor(), self._generate_thumbnail, thumbnailer, @@ -534,7 +527,7 @@ class MediaRepository(object): thumbnail_type=t_type, ) - output_path = yield self.media_storage.store_file( + output_path = await self.media_storage.store_file( t_byte_source, file_info ) finally: @@ -544,7 +537,7 @@ class MediaRepository(object): t_len = os.path.getsize(output_path) - yield self.store.store_remote_media_thumbnail( + await self.store.store_remote_media_thumbnail( server_name, media_id, file_id, @@ -557,8 +550,7 @@ class MediaRepository(object): return output_path - @defer.inlineCallbacks - def _generate_thumbnails( + async def _generate_thumbnails( self, server_name, media_id, file_id, media_type, url_cache=False ): """Generate and store thumbnails for an image. @@ -579,7 +571,7 @@ class MediaRepository(object): if not requirements: return - input_path = yield self.media_storage.ensure_media_is_in_local_cache( + input_path = await self.media_storage.ensure_media_is_in_local_cache( FileInfo(server_name, file_id, url_cache=url_cache) ) @@ -597,13 +589,13 @@ class MediaRepository(object): return if thumbnailer.transpose_method is not None: - m_width, m_height = yield defer_to_thread( + m_width, m_height = await defer_to_thread( self.hs.get_reactor(), thumbnailer.transpose ) # We deduplicate the thumbnail sizes by ignoring the cropped versions if # they have the same dimensions of a scaled one. - thumbnails = {} + thumbnails = {} # type: Dict[Tuple[int, int, str], str] for r_width, r_height, r_method, r_type in requirements: if r_method == "crop": thumbnails.setdefault((r_width, r_height, r_type), r_method) @@ -617,11 +609,11 @@ class MediaRepository(object): for (t_width, t_height, t_type), t_method in iteritems(thumbnails): # Generate the thumbnail if t_method == "crop": - t_byte_source = yield defer_to_thread( + t_byte_source = await defer_to_thread( self.hs.get_reactor(), thumbnailer.crop, t_width, t_height, t_type ) elif t_method == "scale": - t_byte_source = yield defer_to_thread( + t_byte_source = await defer_to_thread( self.hs.get_reactor(), thumbnailer.scale, t_width, t_height, t_type ) else: @@ -643,7 +635,7 @@ class MediaRepository(object): url_cache=url_cache, ) - output_path = yield self.media_storage.store_file( + output_path = await self.media_storage.store_file( t_byte_source, file_info ) finally: @@ -653,7 +645,7 @@ class MediaRepository(object): # Write to database if server_name: - yield self.store.store_remote_media_thumbnail( + await self.store.store_remote_media_thumbnail( server_name, media_id, file_id, @@ -664,15 +656,14 @@ class MediaRepository(object): t_len, ) else: - yield self.store.store_local_thumbnail( + await self.store.store_local_thumbnail( media_id, t_width, t_height, t_type, t_method, t_len ) return {"width": m_width, "height": m_height} - @defer.inlineCallbacks - def delete_old_remote_media(self, before_ts): - old_media = yield self.store.get_remote_media_before(before_ts) + async def delete_old_remote_media(self, before_ts): + old_media = await self.store.get_remote_media_before(before_ts) deleted = 0 @@ -686,12 +677,12 @@ class MediaRepository(object): # TODO: Should we delete from the backup store - with (yield self.remote_media_linearizer.queue(key)): + with (await self.remote_media_linearizer.queue(key)): full_path = self.filepaths.remote_media_filepath(origin, file_id) try: os.remove(full_path) except OSError as e: - logger.warn("Failed to remove file: %r", full_path) + logger.warning("Failed to remove file: %r", full_path) if e.errno == errno.ENOENT: pass else: @@ -702,7 +693,7 @@ class MediaRepository(object): ) shutil.rmtree(thumbnail_dir, ignore_errors=True) - yield self.store.delete_remote_media(origin, media_id) + await self.store.delete_remote_media(origin, media_id) deleted += 1 return {"deleted": deleted} diff --git a/synapse/rest/media/v1/media_storage.py b/synapse/rest/media/v1/media_storage.py index 3b87717a5a..683a79c966 100644 --- a/synapse/rest/media/v1/media_storage.py +++ b/synapse/rest/media/v1/media_storage.py @@ -148,6 +148,7 @@ class MediaStorage(object): for provider in self.storage_providers: res = yield provider.fetch(path, file_info) if res: + logger.debug("Streaming %s from %s", path, provider) return res return None diff --git a/synapse/rest/media/v1/preview_url_resource.py b/synapse/rest/media/v1/preview_url_resource.py index 7a56cd4b6c..f206605727 100644 --- a/synapse/rest/media/v1/preview_url_resource.py +++ b/synapse/rest/media/v1/preview_url_resource.py @@ -23,6 +23,7 @@ import re import shutil import sys import traceback +from typing import Dict, Optional import six from six import string_types @@ -56,6 +57,9 @@ logger = logging.getLogger(__name__) _charset_match = re.compile(br"<\s*meta[^>]*charset\s*=\s*([a-z0-9-]+)", flags=re.I) _content_type_match = re.compile(r'.*; *charset="?(.*?)"?(;|$)', flags=re.I) +OG_TAG_NAME_MAXLEN = 50 +OG_TAG_VALUE_MAXLEN = 1000 + class PreviewUrlResource(DirectServeResource): isLeaf = True @@ -74,12 +78,15 @@ class PreviewUrlResource(DirectServeResource): treq_args={"browser_like_redirects": True}, ip_whitelist=hs.config.url_preview_ip_range_whitelist, ip_blacklist=hs.config.url_preview_ip_range_blacklist, + http_proxy=os.getenvb(b"http_proxy"), + https_proxy=os.getenvb(b"HTTPS_PROXY"), ) self.media_repo = media_repo self.primary_base_path = media_repo.primary_base_path self.media_storage = media_storage self.url_preview_url_blacklist = hs.config.url_preview_url_blacklist + self.url_preview_accept_language = hs.config.url_preview_accept_language # memory cache mapping urls to an ObservableDeferred returning # JSON-encoded OG metadata @@ -117,8 +124,10 @@ class PreviewUrlResource(DirectServeResource): pattern = entry[attrib] value = getattr(url_tuple, attrib) logger.debug( - ("Matching attrib '%s' with value '%s' against" " pattern '%s'") - % (attrib, value, pattern) + "Matching attrib '%s' with value '%s' against pattern '%s'", + attrib, + value, + pattern, ) if value is None: @@ -134,7 +143,7 @@ class PreviewUrlResource(DirectServeResource): match = False continue if match: - logger.warn("URL %s blocked by url_blacklist entry %s", url, entry) + logger.warning("URL %s blocked by url_blacklist entry %s", url, entry) raise SynapseError( 403, "URL blocked by url pattern blacklist entry", Codes.UNKNOWN ) @@ -157,8 +166,7 @@ class PreviewUrlResource(DirectServeResource): og = await make_deferred_yieldable(defer.maybeDeferred(observable.observe)) respond_with_json_bytes(request, 200, og, send_cors=True) - @defer.inlineCallbacks - def _do_preview(self, url, user, ts): + async def _do_preview(self, url, user, ts): """Check the db, and download the URL and build a preview Args: @@ -167,11 +175,11 @@ class PreviewUrlResource(DirectServeResource): ts (int): Returns: - Deferred[str]: json-encoded og data + Deferred[bytes]: json-encoded og data """ # check the URL cache in the DB (which will also provide us with # historical previews, if we have any) - cache_result = yield self.store.get_url_cache(url, ts) + cache_result = await self.store.get_url_cache(url, ts) if ( cache_result and cache_result["expires_ts"] > ts @@ -184,13 +192,13 @@ class PreviewUrlResource(DirectServeResource): og = og.encode("utf8") return og - media_info = yield self._download_url(url, user) + media_info = await self._download_url(url, user) - logger.debug("got media_info of '%s'" % media_info) + logger.debug("got media_info of '%s'", media_info) if _is_media(media_info["media_type"]): file_id = media_info["filesystem_id"] - dims = yield self.media_repo._generate_thumbnails( + dims = await self.media_repo._generate_thumbnails( None, file_id, file_id, media_info["media_type"], url_cache=True ) @@ -206,7 +214,7 @@ class PreviewUrlResource(DirectServeResource): og["og:image:width"] = dims["width"] og["og:image:height"] = dims["height"] else: - logger.warn("Couldn't get dims for %s" % url) + logger.warning("Couldn't get dims for %s" % url) # define our OG response for this media elif _is_html(media_info["media_type"]): @@ -230,8 +238,8 @@ class PreviewUrlResource(DirectServeResource): # If we don't find a match, we'll look at the HTTP Content-Type, and # if that doesn't exist, we'll fall back to UTF-8. if not encoding: - match = _content_type_match.match(media_info["media_type"]) - encoding = match.group(1) if match else "utf-8" + content_match = _content_type_match.match(media_info["media_type"]) + encoding = content_match.group(1) if content_match else "utf-8" og = decode_and_calc_og(body, media_info["uri"], encoding) @@ -240,21 +248,21 @@ class PreviewUrlResource(DirectServeResource): # request itself and benefit from the same caching etc. But for now we # just rely on the caching on the master request to speed things up. if "og:image" in og and og["og:image"]: - image_info = yield self._download_url( + image_info = await self._download_url( _rebase_url(og["og:image"], media_info["uri"]), user ) if _is_media(image_info["media_type"]): # TODO: make sure we don't choke on white-on-transparent images file_id = image_info["filesystem_id"] - dims = yield self.media_repo._generate_thumbnails( + dims = await self.media_repo._generate_thumbnails( None, file_id, file_id, image_info["media_type"], url_cache=True ) if dims: og["og:image:width"] = dims["width"] og["og:image:height"] = dims["height"] else: - logger.warn("Couldn't get dims for %s" % og["og:image"]) + logger.warning("Couldn't get dims for %s", og["og:image"]) og["og:image"] = "mxc://%s/%s" % ( self.server_name, @@ -265,15 +273,27 @@ class PreviewUrlResource(DirectServeResource): else: del og["og:image"] else: - logger.warn("Failed to find any OG data in %s", url) + logger.warning("Failed to find any OG data in %s", url) og = {} - logger.debug("Calculated OG for %s as %s" % (url, og)) + # filter out any stupidly long values + keys_to_remove = [] + for k, v in og.items(): + # values can be numeric as well as strings, hence the cast to str + if len(k) > OG_TAG_NAME_MAXLEN or len(str(v)) > OG_TAG_VALUE_MAXLEN: + logger.warning( + "Pruning overlong tag %s from OG data", k[:OG_TAG_NAME_MAXLEN] + ) + keys_to_remove.append(k) + for k in keys_to_remove: + del og[k] + + logger.debug("Calculated OG for %s as %s", url, og) - jsonog = json.dumps(og).encode("utf8") + jsonog = json.dumps(og) # store OG in history-aware DB cache - yield self.store.store_url_cache( + await self.store.store_url_cache( url, media_info["response_code"], media_info["etag"], @@ -283,10 +303,9 @@ class PreviewUrlResource(DirectServeResource): media_info["created_ts"], ) - return jsonog + return jsonog.encode("utf8") - @defer.inlineCallbacks - def _download_url(self, url, user): + async def _download_url(self, url, user): # TODO: we should probably honour robots.txt... except in practice # we're most likely being explicitly triggered by a human rather than a # bot, so are we really a robot? @@ -297,9 +316,12 @@ class PreviewUrlResource(DirectServeResource): with self.media_storage.store_into_file(file_info) as (f, fname, finish): try: - logger.debug("Trying to get url '%s'" % url) - length, headers, uri, code = yield self.client.get_file( - url, output_stream=f, max_size=self.max_spider_size + logger.debug("Trying to get preview for url '%s'", url) + length, headers, uri, code = await self.client.get_file( + url, + output_stream=f, + max_size=self.max_spider_size, + headers={"Accept-Language": self.url_preview_accept_language}, ) except SynapseError: # Pass SynapseErrors through directly, so that the servlet @@ -317,7 +339,7 @@ class PreviewUrlResource(DirectServeResource): ) except Exception as e: # FIXME: pass through 404s and other error messages nicely - logger.warn("Error downloading %s: %r", url, e) + logger.warning("Error downloading %s: %r", url, e) raise SynapseError( 500, @@ -325,7 +347,7 @@ class PreviewUrlResource(DirectServeResource): % (traceback.format_exception_only(sys.exc_info()[0], e),), Codes.UNKNOWN, ) - yield finish() + await finish() try: if b"Content-Type" in headers: @@ -336,7 +358,7 @@ class PreviewUrlResource(DirectServeResource): download_name = get_filename_from_headers(headers) - yield self.store.store_local_media( + await self.store.store_local_media( media_id=file_id, media_type=media_type, time_now_ms=self.clock.time_msec(), @@ -373,22 +395,21 @@ class PreviewUrlResource(DirectServeResource): "expire_url_cache_data", self._expire_url_cache_data ) - @defer.inlineCallbacks - def _expire_url_cache_data(self): + async def _expire_url_cache_data(self): """Clean up expired url cache content, media and thumbnails. """ # TODO: Delete from backup media store now = self.clock.time_msec() - logger.info("Running url preview cache expiry") + logger.debug("Running url preview cache expiry") - if not (yield self.store.has_completed_background_updates()): + if not (await self.store.db.updates.has_completed_background_updates()): logger.info("Still running DB updates; skipping expiry") return # First we delete expired url cache entries - media_ids = yield self.store.get_expired_url_cache(now) + media_ids = await self.store.get_expired_url_cache(now) removed_media = [] for media_id in media_ids: @@ -398,7 +419,7 @@ class PreviewUrlResource(DirectServeResource): except OSError as e: # If the path doesn't exist, meh if e.errno != errno.ENOENT: - logger.warn("Failed to remove media: %r: %s", media_id, e) + logger.warning("Failed to remove media: %r: %s", media_id, e) continue removed_media.append(media_id) @@ -410,17 +431,19 @@ class PreviewUrlResource(DirectServeResource): except Exception: pass - yield self.store.delete_url_cache(removed_media) + await self.store.delete_url_cache(removed_media) if removed_media: logger.info("Deleted %d entries from url cache", len(removed_media)) + else: + logger.debug("No entries removed from url cache") # Now we delete old images associated with the url cache. # These may be cached for a bit on the client (i.e., they # may have a room open with a preview url thing open). # So we wait a couple of days before deleting, just in case. expire_before = now - 2 * 24 * 60 * 60 * 1000 - media_ids = yield self.store.get_url_cache_media_before(expire_before) + media_ids = await self.store.get_url_cache_media_before(expire_before) removed_media = [] for media_id in media_ids: @@ -430,7 +453,7 @@ class PreviewUrlResource(DirectServeResource): except OSError as e: # If the path doesn't exist, meh if e.errno != errno.ENOENT: - logger.warn("Failed to remove media: %r: %s", media_id, e) + logger.warning("Failed to remove media: %r: %s", media_id, e) continue try: @@ -446,7 +469,7 @@ class PreviewUrlResource(DirectServeResource): except OSError as e: # If the path doesn't exist, meh if e.errno != errno.ENOENT: - logger.warn("Failed to remove media: %r: %s", media_id, e) + logger.warning("Failed to remove media: %r: %s", media_id, e) continue removed_media.append(media_id) @@ -458,9 +481,12 @@ class PreviewUrlResource(DirectServeResource): except Exception: pass - yield self.store.delete_url_cache_media(removed_media) + await self.store.delete_url_cache_media(removed_media) - logger.info("Deleted %d media from url cache", len(removed_media)) + if removed_media: + logger.info("Deleted %d media from url cache", len(removed_media)) + else: + logger.debug("No media removed from url cache") def decode_and_calc_og(body, media_uri, request_encoding=None): @@ -499,9 +525,13 @@ def _calc_og(tree, media_uri): # "og:video:height" : "720", # "og:video:secure_url": "https://www.youtube.com/v/LXDBoHyjmtw?version=3", - og = {} + og = {} # type: Dict[str, Optional[str]] for tag in tree.xpath("//*/meta[starts-with(@property, 'og:')]"): if "content" in tag.attrib: + # if we've got more than 50 tags, someone is taking the piss + if len(og) >= 50: + logger.warning("Skipping OG for page with too many 'og:' tags") + return {} og[tag.attrib["property"]] = tag.attrib["content"] # TODO: grab article: meta tags too, e.g.: diff --git a/synapse/rest/media/v1/storage_provider.py b/synapse/rest/media/v1/storage_provider.py index 37687ea7f4..858680be26 100644 --- a/synapse/rest/media/v1/storage_provider.py +++ b/synapse/rest/media/v1/storage_provider.py @@ -77,6 +77,9 @@ class StorageProviderWrapper(StorageProvider): self.store_synchronous = store_synchronous self.store_remote = store_remote + def __str__(self): + return "StorageProviderWrapper[%s]" % (self.backend,) + def store_file(self, path, file_info): if not file_info.server_name and not self.store_local: return defer.succeed(None) @@ -114,6 +117,9 @@ class FileStorageProviderBackend(StorageProvider): self.cache_directory = hs.config.media_store_path self.base_directory = config + def __str__(self): + return "FileStorageProviderBackend[%s]" % (self.base_directory,) + def store_file(self, path, file_info): """See StorageProvider.store_file""" diff --git a/synapse/rest/media/v1/thumbnail_resource.py b/synapse/rest/media/v1/thumbnail_resource.py index 08329884ac..0b87220234 100644 --- a/synapse/rest/media/v1/thumbnail_resource.py +++ b/synapse/rest/media/v1/thumbnail_resource.py @@ -16,8 +16,6 @@ import logging -from twisted.internet import defer - from synapse.http.server import ( DirectServeResource, set_cors_headers, @@ -79,11 +77,10 @@ class ThumbnailResource(DirectServeResource): ) self.media_repo.mark_recently_accessed(server_name, media_id) - @defer.inlineCallbacks - def _respond_local_thumbnail( + async def _respond_local_thumbnail( self, request, media_id, width, height, method, m_type ): - media_info = yield self.store.get_local_media(media_id) + media_info = await self.store.get_local_media(media_id) if not media_info: respond_404(request) @@ -93,7 +90,7 @@ class ThumbnailResource(DirectServeResource): respond_404(request) return - thumbnail_infos = yield self.store.get_local_media_thumbnails(media_id) + thumbnail_infos = await self.store.get_local_media_thumbnails(media_id) if thumbnail_infos: thumbnail_info = self._select_thumbnail( @@ -114,14 +111,13 @@ class ThumbnailResource(DirectServeResource): t_type = file_info.thumbnail_type t_length = thumbnail_info["thumbnail_length"] - responder = yield self.media_storage.fetch_media(file_info) - yield respond_with_responder(request, responder, t_type, t_length) + responder = await self.media_storage.fetch_media(file_info) + await respond_with_responder(request, responder, t_type, t_length) else: logger.info("Couldn't find any generated thumbnails") respond_404(request) - @defer.inlineCallbacks - def _select_or_generate_local_thumbnail( + async def _select_or_generate_local_thumbnail( self, request, media_id, @@ -130,7 +126,7 @@ class ThumbnailResource(DirectServeResource): desired_method, desired_type, ): - media_info = yield self.store.get_local_media(media_id) + media_info = await self.store.get_local_media(media_id) if not media_info: respond_404(request) @@ -140,7 +136,7 @@ class ThumbnailResource(DirectServeResource): respond_404(request) return - thumbnail_infos = yield self.store.get_local_media_thumbnails(media_id) + thumbnail_infos = await self.store.get_local_media_thumbnails(media_id) for info in thumbnail_infos: t_w = info["thumbnail_width"] == desired_width t_h = info["thumbnail_height"] == desired_height @@ -162,15 +158,15 @@ class ThumbnailResource(DirectServeResource): t_type = file_info.thumbnail_type t_length = info["thumbnail_length"] - responder = yield self.media_storage.fetch_media(file_info) + responder = await self.media_storage.fetch_media(file_info) if responder: - yield respond_with_responder(request, responder, t_type, t_length) + await respond_with_responder(request, responder, t_type, t_length) return logger.debug("We don't have a thumbnail of that size. Generating") # Okay, so we generate one. - file_path = yield self.media_repo.generate_local_exact_thumbnail( + file_path = await self.media_repo.generate_local_exact_thumbnail( media_id, desired_width, desired_height, @@ -180,13 +176,12 @@ class ThumbnailResource(DirectServeResource): ) if file_path: - yield respond_with_file(request, desired_type, file_path) + await respond_with_file(request, desired_type, file_path) else: - logger.warn("Failed to generate thumbnail") + logger.warning("Failed to generate thumbnail") respond_404(request) - @defer.inlineCallbacks - def _select_or_generate_remote_thumbnail( + async def _select_or_generate_remote_thumbnail( self, request, server_name, @@ -196,9 +191,9 @@ class ThumbnailResource(DirectServeResource): desired_method, desired_type, ): - media_info = yield self.media_repo.get_remote_media_info(server_name, media_id) + media_info = await self.media_repo.get_remote_media_info(server_name, media_id) - thumbnail_infos = yield self.store.get_remote_media_thumbnails( + thumbnail_infos = await self.store.get_remote_media_thumbnails( server_name, media_id ) @@ -224,15 +219,15 @@ class ThumbnailResource(DirectServeResource): t_type = file_info.thumbnail_type t_length = info["thumbnail_length"] - responder = yield self.media_storage.fetch_media(file_info) + responder = await self.media_storage.fetch_media(file_info) if responder: - yield respond_with_responder(request, responder, t_type, t_length) + await respond_with_responder(request, responder, t_type, t_length) return logger.debug("We don't have a thumbnail of that size. Generating") # Okay, so we generate one. - file_path = yield self.media_repo.generate_remote_exact_thumbnail( + file_path = await self.media_repo.generate_remote_exact_thumbnail( server_name, file_id, media_id, @@ -243,21 +238,20 @@ class ThumbnailResource(DirectServeResource): ) if file_path: - yield respond_with_file(request, desired_type, file_path) + await respond_with_file(request, desired_type, file_path) else: - logger.warn("Failed to generate thumbnail") + logger.warning("Failed to generate thumbnail") respond_404(request) - @defer.inlineCallbacks - def _respond_remote_thumbnail( + async def _respond_remote_thumbnail( self, request, server_name, media_id, width, height, method, m_type ): # TODO: Don't download the whole remote file # We should proxy the thumbnail from the remote server instead of # downloading the remote file and generating our own thumbnails. - media_info = yield self.media_repo.get_remote_media_info(server_name, media_id) + media_info = await self.media_repo.get_remote_media_info(server_name, media_id) - thumbnail_infos = yield self.store.get_remote_media_thumbnails( + thumbnail_infos = await self.store.get_remote_media_thumbnails( server_name, media_id ) @@ -278,8 +272,8 @@ class ThumbnailResource(DirectServeResource): t_type = file_info.thumbnail_type t_length = thumbnail_info["thumbnail_length"] - responder = yield self.media_storage.fetch_media(file_info) - yield respond_with_responder(request, responder, t_type, t_length) + responder = await self.media_storage.fetch_media(file_info) + await respond_with_responder(request, responder, t_type, t_length) else: logger.info("Failed to find any generated thumbnails") respond_404(request) @@ -296,8 +290,8 @@ class ThumbnailResource(DirectServeResource): d_h = desired_height if desired_method.lower() == "crop": - info_list = [] - info_list2 = [] + crop_info_list = [] + crop_info_list2 = [] for info in thumbnail_infos: t_w = info["thumbnail_width"] t_h = info["thumbnail_height"] @@ -309,7 +303,7 @@ class ThumbnailResource(DirectServeResource): type_quality = desired_type != info["thumbnail_type"] length_quality = info["thumbnail_length"] if t_w >= d_w or t_h >= d_h: - info_list.append( + crop_info_list.append( ( aspect_quality, min_quality, @@ -320,7 +314,7 @@ class ThumbnailResource(DirectServeResource): ) ) else: - info_list2.append( + crop_info_list2.append( ( aspect_quality, min_quality, @@ -330,10 +324,10 @@ class ThumbnailResource(DirectServeResource): info, ) ) - if info_list: - return min(info_list)[-1] + if crop_info_list: + return min(crop_info_list)[-1] else: - return min(info_list2)[-1] + return min(crop_info_list2)[-1] else: info_list = [] info_list2 = [] diff --git a/synapse/rest/media/v1/thumbnailer.py b/synapse/rest/media/v1/thumbnailer.py index c995d7e043..c234ea7421 100644 --- a/synapse/rest/media/v1/thumbnailer.py +++ b/synapse/rest/media/v1/thumbnailer.py @@ -82,13 +82,21 @@ class Thumbnailer(object): else: return (max_height * self.width) // self.height, max_height + def _resize(self, width, height): + # 1-bit or 8-bit color palette images need converting to RGB + # otherwise they will be scaled using nearest neighbour which + # looks awful + if self.image.mode in ["1", "P"]: + self.image = self.image.convert("RGB") + return self.image.resize((width, height), Image.ANTIALIAS) + def scale(self, width, height, output_type): """Rescales the image to the given dimensions. Returns: BytesIO: the bytes of the encoded image ready to be written to disk """ - scaled = self.image.resize((width, height), Image.ANTIALIAS) + scaled = self._resize(width, height) return self._encode_image(scaled, output_type) def crop(self, width, height, output_type): @@ -107,13 +115,13 @@ class Thumbnailer(object): """ if width * self.height > height * self.width: scaled_height = (width * self.height) // self.width - scaled_image = self.image.resize((width, scaled_height), Image.ANTIALIAS) + scaled_image = self._resize(width, scaled_height) crop_top = (scaled_height - height) // 2 crop_bottom = height + crop_top cropped = scaled_image.crop((0, crop_top, width, crop_bottom)) else: scaled_width = (height * self.width) // self.height - scaled_image = self.image.resize((scaled_width, height), Image.ANTIALIAS) + scaled_image = self._resize(scaled_width, height) crop_left = (scaled_width - width) // 2 crop_right = width + crop_left cropped = scaled_image.crop((crop_left, 0, crop_right, height)) @@ -121,5 +129,8 @@ class Thumbnailer(object): def _encode_image(self, output_image, output_type): output_bytes_io = BytesIO() - output_image.save(output_bytes_io, self.FORMATS[output_type], quality=80) + fmt = self.FORMATS[output_type] + if fmt == "JPEG": + output_image = output_image.convert("RGB") + output_image.save(output_bytes_io, fmt, quality=80) return output_bytes_io diff --git a/synapse/rest/media/v1/upload_resource.py b/synapse/rest/media/v1/upload_resource.py index 5d76bbdf68..83d005812d 100644 --- a/synapse/rest/media/v1/upload_resource.py +++ b/synapse/rest/media/v1/upload_resource.py @@ -17,7 +17,7 @@ import logging from twisted.web.server import NOT_DONE_YET -from synapse.api.errors import SynapseError +from synapse.api.errors import Codes, SynapseError from synapse.http.server import ( DirectServeResource, respond_with_json, @@ -56,7 +56,11 @@ class UploadResource(DirectServeResource): if content_length is None: raise SynapseError(msg="Request must specify a Content-Length", code=400) if int(content_length) > self.max_upload_size: - raise SynapseError(msg="Upload request body is too large", code=413) + raise SynapseError( + msg="Upload request body is too large", + code=413, + errcode=Codes.TOO_LARGE, + ) upload_name = parse_string(request, b"filename", encoding=None) if upload_name: |