From 4fc8875876374ec8f97a3b3cc344a4e3abcf769f Mon Sep 17 00:00:00 2001
From: Patrick Cloke
Date: Mon, 27 Feb 2023 08:26:05 -0500
Subject: Refactor media modules. (#15146)

* Removes the `v1` directory from `tests.rest.media.v1`.
* Moves the non-REST code from `synapse.rest.media.v1` to `synapse.media`.
* Flattens the `v1` directory from `synapse.rest.media`, but leaves
  compatibility with 3rd party media repositories and spam checkers.
---
 synapse/rest/media/config_resource.py           |   41 +
 synapse/rest/media/download_resource.py         |   75 ++
 synapse/rest/media/media_repository_resource.py |   93 ++
 synapse/rest/media/preview_url_resource.py      |  869 ++++++++++++++++++
 synapse/rest/media/thumbnail_resource.py        |  554 +++++++++++
 synapse/rest/media/upload_resource.py           |  108 +++
 synapse/rest/media/v1/_base.py                  |  470 +---------
 synapse/rest/media/v1/config_resource.py        |   41 -
 synapse/rest/media/v1/download_resource.py      |   76 --
 synapse/rest/media/v1/filepath.py               |  410 ---------
 synapse/rest/media/v1/media_repository.py       | 1112 -----------------------
 synapse/rest/media/v1/media_storage.py          |  365 +-------
 synapse/rest/media/v1/oembed.py                 |  265 ------
 synapse/rest/media/v1/preview_html.py           |  501 ----------
 synapse/rest/media/v1/preview_url_resource.py   |  871 ------------------
 synapse/rest/media/v1/storage_provider.py       |  172 +---
 synapse/rest/media/v1/thumbnail_resource.py     |  555 -----------
 synapse/rest/media/v1/thumbnailer.py            |  221 -----
 synapse/rest/media/v1/upload_resource.py        |  108 ---
 19 files changed, 1752 insertions(+), 5155 deletions(-)
 create mode 100644 synapse/rest/media/config_resource.py
 create mode 100644 synapse/rest/media/download_resource.py
 create mode 100644 synapse/rest/media/media_repository_resource.py
 create mode 100644 synapse/rest/media/preview_url_resource.py
 create mode 100644 synapse/rest/media/thumbnail_resource.py
 create mode 100644 synapse/rest/media/upload_resource.py
 delete mode 100644 synapse/rest/media/v1/config_resource.py
 delete mode 100644 synapse/rest/media/v1/download_resource.py
 delete mode 100644 synapse/rest/media/v1/filepath.py
 delete mode 100644 synapse/rest/media/v1/media_repository.py
 delete mode 100644 synapse/rest/media/v1/oembed.py
 delete mode 100644 synapse/rest/media/v1/preview_html.py
 delete mode 100644 synapse/rest/media/v1/preview_url_resource.py
 delete mode 100644 synapse/rest/media/v1/thumbnail_resource.py
 delete mode 100644 synapse/rest/media/v1/thumbnailer.py
 delete mode 100644 synapse/rest/media/v1/upload_resource.py
(limited to 'synapse/rest')

diff --git a/synapse/rest/media/config_resource.py b/synapse/rest/media/config_resource.py
new file mode 100644
index 0000000000..a95804d327
--- /dev/null
+++ b/synapse/rest/media/config_resource.py
@@ -0,0 +1,41 @@
+# Copyright 2018 Will Hunt
+# Copyright 2020-2021 The Matrix.org Foundation C.I.C.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# + +from typing import TYPE_CHECKING + +from synapse.http.server import DirectServeJsonResource, respond_with_json +from synapse.http.site import SynapseRequest + +if TYPE_CHECKING: + from synapse.server import HomeServer + + +class MediaConfigResource(DirectServeJsonResource): + isLeaf = True + + def __init__(self, hs: "HomeServer"): + super().__init__() + config = hs.config + self.clock = hs.get_clock() + self.auth = hs.get_auth() + self.limits_dict = {"m.upload.size": config.media.max_upload_size} + + async def _async_render_GET(self, request: SynapseRequest) -> None: + await self.auth.get_user_by_req(request) + respond_with_json(request, 200, self.limits_dict, send_cors=True) + + async def _async_render_OPTIONS(self, request: SynapseRequest) -> None: + respond_with_json(request, 200, {}, send_cors=True) diff --git a/synapse/rest/media/download_resource.py b/synapse/rest/media/download_resource.py new file mode 100644 index 0000000000..8f270cf4cc --- /dev/null +++ b/synapse/rest/media/download_resource.py @@ -0,0 +1,75 @@ +# Copyright 2014-2016 OpenMarket Ltd +# Copyright 2020-2021 The Matrix.org Foundation C.I.C. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import logging +from typing import TYPE_CHECKING + +from synapse.http.server import ( + DirectServeJsonResource, + set_corp_headers, + set_cors_headers, +) +from synapse.http.servlet import parse_boolean +from synapse.http.site import SynapseRequest +from synapse.media._base import parse_media_id, respond_404 + +if TYPE_CHECKING: + from synapse.media.media_repository import MediaRepository + from synapse.server import HomeServer + +logger = logging.getLogger(__name__) + + +class DownloadResource(DirectServeJsonResource): + isLeaf = True + + def __init__(self, hs: "HomeServer", media_repo: "MediaRepository"): + super().__init__() + self.media_repo = media_repo + self.server_name = hs.hostname + + async def _async_render_GET(self, request: SynapseRequest) -> None: + set_cors_headers(request) + set_corp_headers(request) + request.setHeader( + b"Content-Security-Policy", + b"sandbox;" + b" default-src 'none';" + b" script-src 'none';" + b" plugin-types application/pdf;" + b" style-src 'unsafe-inline';" + b" media-src 'self';" + b" object-src 'self';", + ) + # Limited non-standard form of CSP for IE11 + request.setHeader(b"X-Content-Security-Policy", b"sandbox;") + request.setHeader( + b"Referrer-Policy", + b"no-referrer", + ) + server_name, media_id, name = parse_media_id(request) + if server_name == self.server_name: + await self.media_repo.get_local_media(request, media_id, name) + else: + allow_remote = parse_boolean(request, "allow_remote", default=True) + if not allow_remote: + logger.info( + "Rejecting request for remote media %s/%s due to allow_remote", + server_name, + media_id, + ) + respond_404(request) + return + + await self.media_repo.get_remote_media(request, server_name, media_id, name) diff --git a/synapse/rest/media/media_repository_resource.py b/synapse/rest/media/media_repository_resource.py new file mode 100644 
index 0000000000..5ebaa3b032 --- /dev/null +++ b/synapse/rest/media/media_repository_resource.py @@ -0,0 +1,93 @@ +# Copyright 2014-2016 OpenMarket Ltd +# Copyright 2018-2021 The Matrix.org Foundation C.I.C. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from typing import TYPE_CHECKING + +from synapse.config._base import ConfigError +from synapse.http.server import UnrecognizedRequestResource + +from .config_resource import MediaConfigResource +from .download_resource import DownloadResource +from .preview_url_resource import PreviewUrlResource +from .thumbnail_resource import ThumbnailResource +from .upload_resource import UploadResource + +if TYPE_CHECKING: + from synapse.server import HomeServer + + +class MediaRepositoryResource(UnrecognizedRequestResource): + """File uploading and downloading. + + Uploads are POSTed to a resource which returns a token which is used to GET + the download:: + + => POST /_matrix/media/r0/upload HTTP/1.1 + Content-Type: + Content-Length: + + + + <= HTTP/1.1 200 OK + Content-Type: application/json + + { "content_uri": "mxc:///" } + + => GET /_matrix/media/r0/download// HTTP/1.1 + + <= HTTP/1.1 200 OK + Content-Type: + Content-Disposition: attachment;filename= + + + + Clients can get thumbnails by supplying a desired width and height and + thumbnailing method:: + + => GET /_matrix/media/r0/thumbnail/ + /?width=&height=&method= HTTP/1.1 + + <= HTTP/1.1 200 OK + Content-Type: image/jpeg or image/png + + + + The thumbnail methods are "crop" and "scale". "scale" tries to return an + image where either the width or the height is smaller than the requested + size. The client should then scale and letterbox the image if it needs to + fit within a given rectangle. "crop" tries to return an image where the + width and height are close to the requested size and the aspect matches + the requested size. The client should scale the image if it needs to fit + within a given rectangle. + """ + + def __init__(self, hs: "HomeServer"): + # If we're not configured to use it, raise if we somehow got here. + if not hs.config.media.can_load_media_repo: + raise ConfigError("Synapse is not configured to use a media repo.") + + super().__init__() + media_repo = hs.get_media_repository() + + self.putChild(b"upload", UploadResource(hs, media_repo)) + self.putChild(b"download", DownloadResource(hs, media_repo)) + self.putChild( + b"thumbnail", ThumbnailResource(hs, media_repo, media_repo.media_storage) + ) + if hs.config.media.url_preview_enabled: + self.putChild( + b"preview_url", + PreviewUrlResource(hs, media_repo, media_repo.media_storage), + ) + self.putChild(b"config", MediaConfigResource(hs)) diff --git a/synapse/rest/media/preview_url_resource.py b/synapse/rest/media/preview_url_resource.py new file mode 100644 index 0000000000..7ada728757 --- /dev/null +++ b/synapse/rest/media/preview_url_resource.py @@ -0,0 +1,869 @@ +# Copyright 2016 OpenMarket Ltd +# Copyright 2020-2021 The Matrix.org Foundation C.I.C. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import datetime +import errno +import fnmatch +import logging +import os +import re +import shutil +import sys +import traceback +from typing import TYPE_CHECKING, BinaryIO, Iterable, Optional, Tuple +from urllib.parse import urljoin, urlparse, urlsplit +from urllib.request import urlopen + +import attr + +from twisted.internet.defer import Deferred +from twisted.internet.error import DNSLookupError + +from synapse.api.errors import Codes, SynapseError +from synapse.http.client import SimpleHttpClient +from synapse.http.server import ( + DirectServeJsonResource, + respond_with_json, + respond_with_json_bytes, +) +from synapse.http.servlet import parse_integer, parse_string +from synapse.http.site import SynapseRequest +from synapse.logging.context import make_deferred_yieldable, run_in_background +from synapse.media._base import FileInfo, get_filename_from_headers +from synapse.media.media_storage import MediaStorage +from synapse.media.oembed import OEmbedProvider +from synapse.media.preview_html import decode_body, parse_html_to_open_graph +from synapse.metrics.background_process_metrics import run_as_background_process +from synapse.types import JsonDict, UserID +from synapse.util import json_encoder +from synapse.util.async_helpers import ObservableDeferred +from synapse.util.caches.expiringcache import ExpiringCache +from synapse.util.stringutils import random_string + +if TYPE_CHECKING: + from synapse.media.media_repository import MediaRepository + from synapse.server import HomeServer + +logger = logging.getLogger(__name__) + +OG_TAG_NAME_MAXLEN = 50 +OG_TAG_VALUE_MAXLEN = 1000 + +ONE_HOUR = 60 * 60 * 1000 +ONE_DAY = 24 * ONE_HOUR +IMAGE_CACHE_EXPIRY_MS = 2 * ONE_DAY + + +@attr.s(slots=True, frozen=True, auto_attribs=True) +class DownloadResult: + length: int + uri: str + response_code: int + media_type: str + download_name: Optional[str] + expires: int + etag: Optional[str] + + +@attr.s(slots=True, frozen=True, auto_attribs=True) +class MediaInfo: + """ + Information parsed from downloading media being previewed. + """ + + # The Content-Type header of the response. + media_type: str + # The length (in bytes) of the downloaded media. + media_length: int + # The media filename, according to the server. This is parsed from the + # returned headers, if possible. + download_name: Optional[str] + # The time of the preview. + created_ts_ms: int + # Information from the media storage provider about where the file is stored + # on disk. + filesystem_id: str + filename: str + # The URI being previewed. + uri: str + # The HTTP response code. + response_code: int + # The timestamp (in milliseconds) of when this preview expires. + expires: int + # The ETag header of the response. + etag: Optional[str] + + +class PreviewUrlResource(DirectServeJsonResource): + """ + The `GET /_matrix/media/r0/preview_url` endpoint provides a generic preview API + for URLs which outputs Open Graph (https://ogp.me/) responses (with some Matrix + specific additions). 
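    For example, a request and an (illustrative, abridged) response might look
    like the following; every value below is a placeholder, not output from a
    real server::

        => GET /_matrix/media/r0/preview_url?url=https%3A//example.com/ HTTP/1.1

        <= HTTP/1.1 200 OK
           Content-Type: application/json

           {
               "og:title": "Example Domain",
               "og:description": "An example page description",
               "og:image": "mxc://example.org/abcDEFghiJKL",
               "og:image:type": "image/png",
               "og:image:width": 640,
               "og:image:height": 480,
               "matrix:image:size": 12345
           }

    The exact keys returned depend on what can be extracted from the target page.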
+ + This does have trade-offs compared to other designs: + + * Pros: + * Simple and flexible; can be used by any clients at any point + * Cons: + * If each homeserver provides one of these independently, all the homeservers in a + room may needlessly DoS the target URI + * The URL metadata must be stored somewhere, rather than just using Matrix + itself to store the media. + * Matrix cannot be used to distribute the metadata between homeservers. + + When Synapse is asked to preview a URL it does the following: + + 1. Checks against a URL blacklist (defined as `url_preview_url_blacklist` in the + config). + 2. Checks the URL against an in-memory cache and returns the result if it exists. (This + is also used to de-duplicate processing of multiple in-flight requests at once.) + 3. Kicks off a background process to generate a preview: + 1. Checks URL and timestamp against the database cache and returns the result if it + has not expired and was successful (a 2xx return code). + 2. Checks if the URL matches an oEmbed (https://oembed.com/) pattern. If it + does, update the URL to download. + 3. Downloads the URL and stores it into a file via the media storage provider + and saves the local media metadata. + 4. If the media is an image: + 1. Generates thumbnails. + 2. Generates an Open Graph response based on image properties. + 5. If the media is HTML: + 1. Decodes the HTML via the stored file. + 2. Generates an Open Graph response from the HTML. + 3. If a JSON oEmbed URL was found in the HTML via autodiscovery: + 1. Downloads the URL and stores it into a file via the media storage provider + and saves the local media metadata. + 2. Convert the oEmbed response to an Open Graph response. + 3. Override any Open Graph data from the HTML with data from oEmbed. + 4. If an image exists in the Open Graph response: + 1. Downloads the URL and stores it into a file via the media storage + provider and saves the local media metadata. + 2. Generates thumbnails. + 3. Updates the Open Graph response based on image properties. + 6. If the media is JSON and an oEmbed URL was found: + 1. Convert the oEmbed response to an Open Graph response. + 2. If a thumbnail or image is in the oEmbed response: + 1. Downloads the URL and stores it into a file via the media storage + provider and saves the local media metadata. + 2. Generates thumbnails. + 3. Updates the Open Graph response based on image properties. + 7. Stores the result in the database cache. + 4. Returns the result. + + If any additional requests (e.g. from oEmbed autodiscovery, step 5.3 or + image thumbnailing, step 5.4 or 6.4) fails then the URL preview as a whole + does not fail. As much information as possible is returned. + + The in-memory cache expires after 1 hour. + + Expired entries in the database cache (and their associated media files) are + deleted every 10 seconds. The default expiration time is 1 hour from download. 
+ """ + + isLeaf = True + + def __init__( + self, + hs: "HomeServer", + media_repo: "MediaRepository", + media_storage: MediaStorage, + ): + super().__init__() + + self.auth = hs.get_auth() + self.clock = hs.get_clock() + self.filepaths = media_repo.filepaths + self.max_spider_size = hs.config.media.max_spider_size + self.server_name = hs.hostname + self.store = hs.get_datastores().main + self.client = SimpleHttpClient( + hs, + treq_args={"browser_like_redirects": True}, + ip_whitelist=hs.config.media.url_preview_ip_range_whitelist, + ip_blacklist=hs.config.media.url_preview_ip_range_blacklist, + use_proxy=True, + ) + self.media_repo = media_repo + self.primary_base_path = media_repo.primary_base_path + self.media_storage = media_storage + + self._oembed = OEmbedProvider(hs) + + # We run the background jobs if we're the instance specified (or no + # instance is specified, where we assume there is only one instance + # serving media). + instance_running_jobs = hs.config.media.media_instance_running_background_jobs + self._worker_run_media_background_jobs = ( + instance_running_jobs is None + or instance_running_jobs == hs.get_instance_name() + ) + + self.url_preview_url_blacklist = hs.config.media.url_preview_url_blacklist + self.url_preview_accept_language = hs.config.media.url_preview_accept_language + + # memory cache mapping urls to an ObservableDeferred returning + # JSON-encoded OG metadata + self._cache: ExpiringCache[str, ObservableDeferred] = ExpiringCache( + cache_name="url_previews", + clock=self.clock, + # don't spider URLs more often than once an hour + expiry_ms=ONE_HOUR, + ) + + if self._worker_run_media_background_jobs: + self._cleaner_loop = self.clock.looping_call( + self._start_expire_url_cache_data, 10 * 1000 + ) + + async def _async_render_OPTIONS(self, request: SynapseRequest) -> None: + request.setHeader(b"Allow", b"OPTIONS, GET") + respond_with_json(request, 200, {}, send_cors=True) + + async def _async_render_GET(self, request: SynapseRequest) -> None: + # XXX: if get_user_by_req fails, what should we do in an async render? + requester = await self.auth.get_user_by_req(request) + url = parse_string(request, "url", required=True) + ts = parse_integer(request, "ts") + if ts is None: + ts = self.clock.time_msec() + + # XXX: we could move this into _do_preview if we wanted. + url_tuple = urlsplit(url) + for entry in self.url_preview_url_blacklist: + match = True + for attrib in entry: + pattern = entry[attrib] + value = getattr(url_tuple, attrib) + logger.debug( + "Matching attrib '%s' with value '%s' against pattern '%s'", + attrib, + value, + pattern, + ) + + if value is None: + match = False + continue + + # Some attributes might not be parsed as strings by urlsplit (such as the + # port, which is parsed as an int). Because we use match functions that + # expect strings, we want to make sure that's what we give them. 
+ value_str = str(value) + + if pattern.startswith("^"): + if not re.match(pattern, value_str): + match = False + continue + else: + if not fnmatch.fnmatch(value_str, pattern): + match = False + continue + if match: + logger.warning("URL %s blocked by url_blacklist entry %s", url, entry) + raise SynapseError( + 403, "URL blocked by url pattern blacklist entry", Codes.UNKNOWN + ) + + # the in-memory cache: + # * ensures that only one request is active at a time + # * takes load off the DB for the thundering herds + # * also caches any failures (unlike the DB) so we don't keep + # requesting the same endpoint + + observable = self._cache.get(url) + + if not observable: + download = run_in_background(self._do_preview, url, requester.user, ts) + observable = ObservableDeferred(download, consumeErrors=True) + self._cache[url] = observable + else: + logger.info("Returning cached response") + + og = await make_deferred_yieldable(observable.observe()) + respond_with_json_bytes(request, 200, og, send_cors=True) + + async def _do_preview(self, url: str, user: UserID, ts: int) -> bytes: + """Check the db, and download the URL and build a preview + + Args: + url: The URL to preview. + user: The user requesting the preview. + ts: The timestamp requested for the preview. + + Returns: + json-encoded og data + """ + # check the URL cache in the DB (which will also provide us with + # historical previews, if we have any) + cache_result = await self.store.get_url_cache(url, ts) + if ( + cache_result + and cache_result["expires_ts"] > ts + and cache_result["response_code"] / 100 == 2 + ): + # It may be stored as text in the database, not as bytes (such as + # PostgreSQL). If so, encode it back before handing it on. + og = cache_result["og"] + if isinstance(og, str): + og = og.encode("utf8") + return og + + # If this URL can be accessed via oEmbed, use that instead. + url_to_download = url + oembed_url = self._oembed.get_oembed_url(url) + if oembed_url: + url_to_download = oembed_url + + media_info = await self._handle_url(url_to_download, user) + + logger.debug("got media_info of '%s'", media_info) + + # The number of milliseconds that the response should be considered valid. + expiration_ms = media_info.expires + author_name: Optional[str] = None + + if _is_media(media_info.media_type): + file_id = media_info.filesystem_id + dims = await self.media_repo._generate_thumbnails( + None, file_id, file_id, media_info.media_type, url_cache=True + ) + + og = { + "og:description": media_info.download_name, + "og:image": f"mxc://{self.server_name}/{media_info.filesystem_id}", + "og:image:type": media_info.media_type, + "matrix:image:size": media_info.media_length, + } + + if dims: + og["og:image:width"] = dims["width"] + og["og:image:height"] = dims["height"] + else: + logger.warning("Couldn't get dims for %s" % url) + + # define our OG response for this media + elif _is_html(media_info.media_type): + # TODO: somehow stop a big HTML tree from exploding synapse's RAM + + with open(media_info.filename, "rb") as file: + body = file.read() + + tree = decode_body(body, media_info.uri, media_info.media_type) + if tree is not None: + # Check if this HTML document points to oEmbed information and + # defer to that. + oembed_url = self._oembed.autodiscover_from_html(tree) + og_from_oembed: JsonDict = {} + if oembed_url: + try: + oembed_info = await self._handle_url( + oembed_url, user, allow_data_urls=True + ) + except Exception as e: + # Fetching the oEmbed info failed, don't block the entire URL preview. 
+ logger.warning( + "oEmbed fetch failed during URL preview: %s errored with %s", + oembed_url, + e, + ) + else: + ( + og_from_oembed, + author_name, + expiration_ms, + ) = await self._handle_oembed_response( + url, oembed_info, expiration_ms + ) + + # Parse Open Graph information from the HTML in case the oEmbed + # response failed or is incomplete. + og_from_html = parse_html_to_open_graph(tree) + + # Compile the Open Graph response by using the scraped + # information from the HTML and overlaying any information + # from the oEmbed response. + og = {**og_from_html, **og_from_oembed} + + await self._precache_image_url(user, media_info, og) + else: + og = {} + + elif oembed_url: + # Handle the oEmbed information. + og, author_name, expiration_ms = await self._handle_oembed_response( + url, media_info, expiration_ms + ) + await self._precache_image_url(user, media_info, og) + + else: + logger.warning("Failed to find any OG data in %s", url) + og = {} + + # If we don't have a title but we have author_name, copy it as + # title + if not og.get("og:title") and author_name: + og["og:title"] = author_name + + # filter out any stupidly long values + keys_to_remove = [] + for k, v in og.items(): + # values can be numeric as well as strings, hence the cast to str + if len(k) > OG_TAG_NAME_MAXLEN or len(str(v)) > OG_TAG_VALUE_MAXLEN: + logger.warning( + "Pruning overlong tag %s from OG data", k[:OG_TAG_NAME_MAXLEN] + ) + keys_to_remove.append(k) + for k in keys_to_remove: + del og[k] + + logger.debug("Calculated OG for %s as %s", url, og) + + jsonog = json_encoder.encode(og) + + # Cap the amount of time to consider a response valid. + expiration_ms = min(expiration_ms, ONE_DAY) + + # store OG in history-aware DB cache + await self.store.store_url_cache( + url, + media_info.response_code, + media_info.etag, + media_info.created_ts_ms + expiration_ms, + jsonog, + media_info.filesystem_id, + media_info.created_ts_ms, + ) + + return jsonog.encode("utf8") + + async def _download_url(self, url: str, output_stream: BinaryIO) -> DownloadResult: + """ + Fetches a remote URL and parses the headers. + + Args: + url: The URL to fetch. + output_stream: The stream to write the content to. + + Returns: + A tuple of: + Media length, URL downloaded, the HTTP response code, + the media type, the downloaded file name, the number of + milliseconds the result is valid for, the etag header. + """ + + try: + logger.debug("Trying to get preview for url '%s'", url) + length, headers, uri, code = await self.client.get_file( + url, + output_stream=output_stream, + max_size=self.max_spider_size, + headers={ + b"Accept-Language": self.url_preview_accept_language, + # Use a custom user agent for the preview because some sites will only return + # Open Graph metadata to crawler user agents. Omit the Synapse version + # string to avoid leaking information. + b"User-Agent": [ + "Synapse (bot; +https://github.com/matrix-org/synapse)" + ], + }, + is_allowed_content_type=_is_previewable, + ) + except SynapseError: + # Pass SynapseErrors through directly, so that the servlet + # handler will return a SynapseError to the client instead of + # blank data or a 500. 
+ raise + except DNSLookupError: + # DNS lookup returned no results + # Note: This will also be the case if one of the resolved IP + # addresses is blacklisted + raise SynapseError( + 502, + "DNS resolution failure during URL preview generation", + Codes.UNKNOWN, + ) + except Exception as e: + # FIXME: pass through 404s and other error messages nicely + logger.warning("Error downloading %s: %r", url, e) + + raise SynapseError( + 500, + "Failed to download content: %s" + % (traceback.format_exception_only(sys.exc_info()[0], e),), + Codes.UNKNOWN, + ) + + if b"Content-Type" in headers: + media_type = headers[b"Content-Type"][0].decode("ascii") + else: + media_type = "application/octet-stream" + + download_name = get_filename_from_headers(headers) + + # FIXME: we should calculate a proper expiration based on the + # Cache-Control and Expire headers. But for now, assume 1 hour. + expires = ONE_HOUR + etag = headers[b"ETag"][0].decode("ascii") if b"ETag" in headers else None + + return DownloadResult( + length, uri, code, media_type, download_name, expires, etag + ) + + async def _parse_data_url( + self, url: str, output_stream: BinaryIO + ) -> DownloadResult: + """ + Parses a data: URL. + + Args: + url: The URL to parse. + output_stream: The stream to write the content to. + + Returns: + A tuple of: + Media length, URL downloaded, the HTTP response code, + the media type, the downloaded file name, the number of + milliseconds the result is valid for, the etag header. + """ + + try: + logger.debug("Trying to parse data url '%s'", url) + with urlopen(url) as url_info: + # TODO Can this be more efficient. + output_stream.write(url_info.read()) + except Exception as e: + logger.warning("Error parsing data: URL %s: %r", url, e) + + raise SynapseError( + 500, + "Failed to parse data URL: %s" + % (traceback.format_exception_only(sys.exc_info()[0], e),), + Codes.UNKNOWN, + ) + + return DownloadResult( + # Read back the length that has been written. + length=output_stream.tell(), + uri=url, + # If it was parsed, consider this a 200 OK. + response_code=200, + # urlopen shoves the media-type from the data URL into the content type + # header object. + media_type=url_info.headers.get_content_type(), + # Some features are not supported by data: URLs. + download_name=None, + expires=ONE_HOUR, + etag=None, + ) + + async def _handle_url( + self, url: str, user: UserID, allow_data_urls: bool = False + ) -> MediaInfo: + """ + Fetches content from a URL and parses the result to generate a MediaInfo. + + It uses the media storage provider to persist the fetched content and + stores the mapping into the database. + + Args: + url: The URL to fetch. + user: The user who ahs requested this URL. + allow_data_urls: True if data URLs should be allowed. + + Returns: + A MediaInfo object describing the fetched content. + """ + + # TODO: we should probably honour robots.txt... except in practice + # we're most likely being explicitly triggered by a human rather than a + # bot, so are we really a robot? 
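        # The cache file_id generated below combines today's date with a random
        # suffix, e.g. "2023-02-27_AbCdEfGhIjKlMnOp" (an illustrative value, not
        # a real identifier).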
+ + file_id = datetime.date.today().isoformat() + "_" + random_string(16) + + file_info = FileInfo(server_name=None, file_id=file_id, url_cache=True) + + with self.media_storage.store_into_file(file_info) as (f, fname, finish): + if url.startswith("data:"): + if not allow_data_urls: + raise SynapseError( + 500, "Previewing of data: URLs is forbidden", Codes.UNKNOWN + ) + + download_result = await self._parse_data_url(url, f) + else: + download_result = await self._download_url(url, f) + + await finish() + + try: + time_now_ms = self.clock.time_msec() + + await self.store.store_local_media( + media_id=file_id, + media_type=download_result.media_type, + time_now_ms=time_now_ms, + upload_name=download_result.download_name, + media_length=download_result.length, + user_id=user, + url_cache=url, + ) + + except Exception as e: + logger.error("Error handling downloaded %s: %r", url, e) + # TODO: we really ought to delete the downloaded file in this + # case, since we won't have recorded it in the db, and will + # therefore not expire it. + raise + + return MediaInfo( + media_type=download_result.media_type, + media_length=download_result.length, + download_name=download_result.download_name, + created_ts_ms=time_now_ms, + filesystem_id=file_id, + filename=fname, + uri=download_result.uri, + response_code=download_result.response_code, + expires=download_result.expires, + etag=download_result.etag, + ) + + async def _precache_image_url( + self, user: UserID, media_info: MediaInfo, og: JsonDict + ) -> None: + """ + Pre-cache the image (if one exists) for posterity + + Args: + user: The user requesting the preview. + media_info: The media being previewed. + og: The Open Graph dictionary. This is modified with image information. + """ + # If there's no image or it is blank, there's nothing to do. + if "og:image" not in og: + return + + # Remove the raw image URL, this will be replaced with an MXC URL, if successful. + image_url = og.pop("og:image") + if not image_url: + return + + # The image URL from the HTML might be relative to the previewed page, + # convert it to an URL which can be requested directly. + url_parts = urlparse(image_url) + if url_parts.scheme != "data": + image_url = urljoin(media_info.uri, image_url) + + # FIXME: it might be cleaner to use the same flow as the main /preview_url + # request itself and benefit from the same caching etc. But for now we + # just rely on the caching on the master request to speed things up. + try: + image_info = await self._handle_url(image_url, user, allow_data_urls=True) + except Exception as e: + # Pre-caching the image failed, don't block the entire URL preview. + logger.warning( + "Pre-caching image failed during URL preview: %s errored with %s", + image_url, + e, + ) + return + + if _is_media(image_info.media_type): + # TODO: make sure we don't choke on white-on-transparent images + file_id = image_info.filesystem_id + dims = await self.media_repo._generate_thumbnails( + None, file_id, file_id, image_info.media_type, url_cache=True + ) + if dims: + og["og:image:width"] = dims["width"] + og["og:image:height"] = dims["height"] + else: + logger.warning("Couldn't get dims for %s", image_url) + + og["og:image"] = f"mxc://{self.server_name}/{image_info.filesystem_id}" + og["og:image:type"] = image_info.media_type + og["matrix:image:size"] = image_info.media_length + + async def _handle_oembed_response( + self, url: str, media_info: MediaInfo, expiration_ms: int + ) -> Tuple[JsonDict, Optional[str], int]: + """ + Parse the downloaded oEmbed info. 
+ + Args: + url: The URL which is being previewed (not the one which was + requested). + media_info: The media being previewed. + expiration_ms: The length of time, in milliseconds, the media is valid for. + + Returns: + A tuple of: + The Open Graph dictionary, if the oEmbed info can be parsed. + The author name if it could be retrieved from oEmbed. + The (possibly updated) length of time, in milliseconds, the media is valid for. + """ + # If JSON was not returned, there's nothing to do. + if not _is_json(media_info.media_type): + return {}, None, expiration_ms + + with open(media_info.filename, "rb") as file: + body = file.read() + + oembed_response = self._oembed.parse_oembed_response(url, body) + open_graph_result = oembed_response.open_graph_result + + # Use the cache age from the oEmbed result, if one was given. + if open_graph_result and oembed_response.cache_age is not None: + expiration_ms = oembed_response.cache_age + + return open_graph_result, oembed_response.author_name, expiration_ms + + def _start_expire_url_cache_data(self) -> Deferred: + return run_as_background_process( + "expire_url_cache_data", self._expire_url_cache_data + ) + + async def _expire_url_cache_data(self) -> None: + """Clean up expired url cache content, media and thumbnails.""" + + assert self._worker_run_media_background_jobs + + now = self.clock.time_msec() + + logger.debug("Running url preview cache expiry") + + def try_remove_parent_dirs(dirs: Iterable[str]) -> None: + """Attempt to remove the given chain of parent directories + + Args: + dirs: The list of directory paths to delete, with children appearing + before their parents. + """ + for dir in dirs: + try: + os.rmdir(dir) + except FileNotFoundError: + # Already deleted, continue with deleting the rest + pass + except OSError as e: + # Failed, skip deleting the rest of the parent dirs + if e.errno != errno.ENOTEMPTY: + logger.warning( + "Failed to remove media directory while clearing url preview cache: %r: %s", + dir, + e, + ) + break + + # First we delete expired url cache entries + media_ids = await self.store.get_expired_url_cache(now) + + removed_media = [] + for media_id in media_ids: + fname = self.filepaths.url_cache_filepath(media_id) + try: + os.remove(fname) + except FileNotFoundError: + pass # If the path doesn't exist, meh + except OSError as e: + logger.warning( + "Failed to remove media while clearing url preview cache: %r: %s", + media_id, + e, + ) + continue + + removed_media.append(media_id) + + dirs = self.filepaths.url_cache_filepath_dirs_to_delete(media_id) + try_remove_parent_dirs(dirs) + + await self.store.delete_url_cache(removed_media) + + if removed_media: + logger.debug( + "Deleted %d entries from url preview cache", len(removed_media) + ) + else: + logger.debug("No entries removed from url preview cache") + + # Now we delete old images associated with the url cache. + # These may be cached for a bit on the client (i.e., they + # may have a room open with a preview url thing open). + # So we wait a couple of days before deleting, just in case. 
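        # IMAGE_CACHE_EXPIRY_MS is defined near the top of this file as
        # 2 * ONE_DAY, so the cut-off computed below is two days in the past.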
+ expire_before = now - IMAGE_CACHE_EXPIRY_MS + media_ids = await self.store.get_url_cache_media_before(expire_before) + + removed_media = [] + for media_id in media_ids: + fname = self.filepaths.url_cache_filepath(media_id) + try: + os.remove(fname) + except FileNotFoundError: + pass # If the path doesn't exist, meh + except OSError as e: + logger.warning( + "Failed to remove media from url preview cache: %r: %s", media_id, e + ) + continue + + dirs = self.filepaths.url_cache_filepath_dirs_to_delete(media_id) + try_remove_parent_dirs(dirs) + + thumbnail_dir = self.filepaths.url_cache_thumbnail_directory(media_id) + try: + shutil.rmtree(thumbnail_dir) + except FileNotFoundError: + pass # If the path doesn't exist, meh + except OSError as e: + logger.warning( + "Failed to remove media from url preview cache: %r: %s", media_id, e + ) + continue + + removed_media.append(media_id) + + dirs = self.filepaths.url_cache_thumbnail_dirs_to_delete(media_id) + # Note that one of the directories to be deleted has already been + # removed by the `rmtree` above. + try_remove_parent_dirs(dirs) + + await self.store.delete_url_cache_media(removed_media) + + if removed_media: + logger.debug("Deleted %d media from url preview cache", len(removed_media)) + else: + logger.debug("No media removed from url preview cache") + + +def _is_media(content_type: str) -> bool: + return content_type.lower().startswith("image/") + + +def _is_html(content_type: str) -> bool: + content_type = content_type.lower() + return content_type.startswith("text/html") or content_type.startswith( + "application/xhtml" + ) + + +def _is_json(content_type: str) -> bool: + return content_type.lower().startswith("application/json") + + +def _is_previewable(content_type: str) -> bool: + """Returns True for content types for which we will perform URL preview and False + otherwise.""" + + return _is_html(content_type) or _is_media(content_type) or _is_json(content_type) diff --git a/synapse/rest/media/thumbnail_resource.py b/synapse/rest/media/thumbnail_resource.py new file mode 100644 index 0000000000..4ee2a0dbda --- /dev/null +++ b/synapse/rest/media/thumbnail_resource.py @@ -0,0 +1,554 @@ +# Copyright 2014-2016 OpenMarket Ltd +# Copyright 2020-2021 The Matrix.org Foundation C.I.C. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ + +import logging +from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple + +from synapse.api.errors import Codes, SynapseError, cs_error +from synapse.config.repository import THUMBNAIL_SUPPORTED_MEDIA_FORMAT_MAP +from synapse.http.server import ( + DirectServeJsonResource, + respond_with_json, + set_corp_headers, + set_cors_headers, +) +from synapse.http.servlet import parse_integer, parse_string +from synapse.http.site import SynapseRequest +from synapse.media._base import ( + FileInfo, + ThumbnailInfo, + parse_media_id, + respond_404, + respond_with_file, + respond_with_responder, +) +from synapse.media.media_storage import MediaStorage + +if TYPE_CHECKING: + from synapse.media.media_repository import MediaRepository + from synapse.server import HomeServer + +logger = logging.getLogger(__name__) + + +class ThumbnailResource(DirectServeJsonResource): + isLeaf = True + + def __init__( + self, + hs: "HomeServer", + media_repo: "MediaRepository", + media_storage: MediaStorage, + ): + super().__init__() + + self.store = hs.get_datastores().main + self.media_repo = media_repo + self.media_storage = media_storage + self.dynamic_thumbnails = hs.config.media.dynamic_thumbnails + self.server_name = hs.hostname + + async def _async_render_GET(self, request: SynapseRequest) -> None: + set_cors_headers(request) + set_corp_headers(request) + server_name, media_id, _ = parse_media_id(request) + width = parse_integer(request, "width", required=True) + height = parse_integer(request, "height", required=True) + method = parse_string(request, "method", "scale") + # TODO Parse the Accept header to get an prioritised list of thumbnail types. + m_type = "image/png" + + if server_name == self.server_name: + if self.dynamic_thumbnails: + await self._select_or_generate_local_thumbnail( + request, media_id, width, height, method, m_type + ) + else: + await self._respond_local_thumbnail( + request, media_id, width, height, method, m_type + ) + self.media_repo.mark_recently_accessed(None, media_id) + else: + if self.dynamic_thumbnails: + await self._select_or_generate_remote_thumbnail( + request, server_name, media_id, width, height, method, m_type + ) + else: + await self._respond_remote_thumbnail( + request, server_name, media_id, width, height, method, m_type + ) + self.media_repo.mark_recently_accessed(server_name, media_id) + + async def _respond_local_thumbnail( + self, + request: SynapseRequest, + media_id: str, + width: int, + height: int, + method: str, + m_type: str, + ) -> None: + media_info = await self.store.get_local_media(media_id) + + if not media_info: + respond_404(request) + return + if media_info["quarantined_by"]: + logger.info("Media is quarantined") + respond_404(request) + return + + thumbnail_infos = await self.store.get_local_media_thumbnails(media_id) + await self._select_and_respond_with_thumbnail( + request, + width, + height, + method, + m_type, + thumbnail_infos, + media_id, + media_id, + url_cache=bool(media_info["url_cache"]), + server_name=None, + ) + + async def _select_or_generate_local_thumbnail( + self, + request: SynapseRequest, + media_id: str, + desired_width: int, + desired_height: int, + desired_method: str, + desired_type: str, + ) -> None: + media_info = await self.store.get_local_media(media_id) + + if not media_info: + respond_404(request) + return + if media_info["quarantined_by"]: + logger.info("Media is quarantined") + respond_404(request) + return + + thumbnail_infos = await self.store.get_local_media_thumbnails(media_id) + for info in 
thumbnail_infos: + t_w = info["thumbnail_width"] == desired_width + t_h = info["thumbnail_height"] == desired_height + t_method = info["thumbnail_method"] == desired_method + t_type = info["thumbnail_type"] == desired_type + + if t_w and t_h and t_method and t_type: + file_info = FileInfo( + server_name=None, + file_id=media_id, + url_cache=media_info["url_cache"], + thumbnail=ThumbnailInfo( + width=info["thumbnail_width"], + height=info["thumbnail_height"], + type=info["thumbnail_type"], + method=info["thumbnail_method"], + ), + ) + + t_type = file_info.thumbnail_type + t_length = info["thumbnail_length"] + + responder = await self.media_storage.fetch_media(file_info) + if responder: + await respond_with_responder(request, responder, t_type, t_length) + return + + logger.debug("We don't have a thumbnail of that size. Generating") + + # Okay, so we generate one. + file_path = await self.media_repo.generate_local_exact_thumbnail( + media_id, + desired_width, + desired_height, + desired_method, + desired_type, + url_cache=bool(media_info["url_cache"]), + ) + + if file_path: + await respond_with_file(request, desired_type, file_path) + else: + logger.warning("Failed to generate thumbnail") + raise SynapseError(400, "Failed to generate thumbnail.") + + async def _select_or_generate_remote_thumbnail( + self, + request: SynapseRequest, + server_name: str, + media_id: str, + desired_width: int, + desired_height: int, + desired_method: str, + desired_type: str, + ) -> None: + media_info = await self.media_repo.get_remote_media_info(server_name, media_id) + + thumbnail_infos = await self.store.get_remote_media_thumbnails( + server_name, media_id + ) + + file_id = media_info["filesystem_id"] + + for info in thumbnail_infos: + t_w = info["thumbnail_width"] == desired_width + t_h = info["thumbnail_height"] == desired_height + t_method = info["thumbnail_method"] == desired_method + t_type = info["thumbnail_type"] == desired_type + + if t_w and t_h and t_method and t_type: + file_info = FileInfo( + server_name=server_name, + file_id=media_info["filesystem_id"], + thumbnail=ThumbnailInfo( + width=info["thumbnail_width"], + height=info["thumbnail_height"], + type=info["thumbnail_type"], + method=info["thumbnail_method"], + ), + ) + + t_type = file_info.thumbnail_type + t_length = info["thumbnail_length"] + + responder = await self.media_storage.fetch_media(file_info) + if responder: + await respond_with_responder(request, responder, t_type, t_length) + return + + logger.debug("We don't have a thumbnail of that size. Generating") + + # Okay, so we generate one. + file_path = await self.media_repo.generate_remote_exact_thumbnail( + server_name, + file_id, + media_id, + desired_width, + desired_height, + desired_method, + desired_type, + ) + + if file_path: + await respond_with_file(request, desired_type, file_path) + else: + logger.warning("Failed to generate thumbnail") + raise SynapseError(400, "Failed to generate thumbnail.") + + async def _respond_remote_thumbnail( + self, + request: SynapseRequest, + server_name: str, + media_id: str, + width: int, + height: int, + method: str, + m_type: str, + ) -> None: + # TODO: Don't download the whole remote file + # We should proxy the thumbnail from the remote server instead of + # downloading the remote file and generating our own thumbnails. 
+ media_info = await self.media_repo.get_remote_media_info(server_name, media_id) + + thumbnail_infos = await self.store.get_remote_media_thumbnails( + server_name, media_id + ) + await self._select_and_respond_with_thumbnail( + request, + width, + height, + method, + m_type, + thumbnail_infos, + media_id, + media_info["filesystem_id"], + url_cache=False, + server_name=server_name, + ) + + async def _select_and_respond_with_thumbnail( + self, + request: SynapseRequest, + desired_width: int, + desired_height: int, + desired_method: str, + desired_type: str, + thumbnail_infos: List[Dict[str, Any]], + media_id: str, + file_id: str, + url_cache: bool, + server_name: Optional[str] = None, + ) -> None: + """ + Respond to a request with an appropriate thumbnail from the previously generated thumbnails. + + Args: + request: The incoming request. + desired_width: The desired width, the returned thumbnail may be larger than this. + desired_height: The desired height, the returned thumbnail may be larger than this. + desired_method: The desired method used to generate the thumbnail. + desired_type: The desired content-type of the thumbnail. + thumbnail_infos: A list of dictionaries of candidate thumbnails. + file_id: The ID of the media that a thumbnail is being requested for. + url_cache: True if this is from a URL cache. + server_name: The server name, if this is a remote thumbnail. + """ + logger.debug( + "_select_and_respond_with_thumbnail: media_id=%s desired=%sx%s (%s) thumbnail_infos=%s", + media_id, + desired_width, + desired_height, + desired_method, + thumbnail_infos, + ) + + # If `dynamic_thumbnails` is enabled, we expect Synapse to go down a + # different code path to handle it. + assert not self.dynamic_thumbnails + + if thumbnail_infos: + file_info = self._select_thumbnail( + desired_width, + desired_height, + desired_method, + desired_type, + thumbnail_infos, + file_id, + url_cache, + server_name, + ) + if not file_info: + logger.info("Couldn't find a thumbnail matching the desired inputs") + respond_404(request) + return + + # The thumbnail property must exist. + assert file_info.thumbnail is not None + + responder = await self.media_storage.fetch_media(file_info) + if responder: + await respond_with_responder( + request, + responder, + file_info.thumbnail.type, + file_info.thumbnail.length, + ) + return + + # If we can't find the thumbnail we regenerate it. This can happen + # if e.g. we've deleted the thumbnails but still have the original + # image somewhere. + # + # Since we have an entry for the thumbnail in the DB we a) know we + # have have successfully generated the thumbnail in the past (so we + # don't need to worry about repeatedly failing to generate + # thumbnails), and b) have already calculated that appropriate + # width/height/method so we can just call the "generate exact" + # methods. + + # First let's check that we do actually have the original image + # still. This will throw a 404 if we don't. + # TODO: We should refetch the thumbnails for remote media. 
+ await self.media_storage.ensure_media_is_in_local_cache( + FileInfo(server_name, file_id, url_cache=url_cache) + ) + + if server_name: + await self.media_repo.generate_remote_exact_thumbnail( + server_name, + file_id=file_id, + media_id=media_id, + t_width=file_info.thumbnail.width, + t_height=file_info.thumbnail.height, + t_method=file_info.thumbnail.method, + t_type=file_info.thumbnail.type, + ) + else: + await self.media_repo.generate_local_exact_thumbnail( + media_id=media_id, + t_width=file_info.thumbnail.width, + t_height=file_info.thumbnail.height, + t_method=file_info.thumbnail.method, + t_type=file_info.thumbnail.type, + url_cache=url_cache, + ) + + responder = await self.media_storage.fetch_media(file_info) + await respond_with_responder( + request, + responder, + file_info.thumbnail.type, + file_info.thumbnail.length, + ) + else: + # This might be because: + # 1. We can't create thumbnails for the given media (corrupted or + # unsupported file type), or + # 2. The thumbnailing process never ran or errored out initially + # when the media was first uploaded (these bugs should be + # reported and fixed). + # Note that we don't attempt to generate a thumbnail now because + # `dynamic_thumbnails` is disabled. + logger.info("Failed to find any generated thumbnails") + + respond_with_json( + request, + 400, + cs_error( + "Cannot find any thumbnails for the requested media (%r). This might mean the media is not a supported_media_format=(%s) or that thumbnailing failed for some other reason. (Dynamic thumbnails are disabled on this server.)" + % ( + request.postpath, + ", ".join(THUMBNAIL_SUPPORTED_MEDIA_FORMAT_MAP.keys()), + ), + code=Codes.UNKNOWN, + ), + send_cors=True, + ) + + def _select_thumbnail( + self, + desired_width: int, + desired_height: int, + desired_method: str, + desired_type: str, + thumbnail_infos: List[Dict[str, Any]], + file_id: str, + url_cache: bool, + server_name: Optional[str], + ) -> Optional[FileInfo]: + """ + Choose an appropriate thumbnail from the previously generated thumbnails. + + Args: + desired_width: The desired width, the returned thumbnail may be larger than this. + desired_height: The desired height, the returned thumbnail may be larger than this. + desired_method: The desired method used to generate the thumbnail. + desired_type: The desired content-type of the thumbnail. + thumbnail_infos: A list of dictionaries of candidate thumbnails. + file_id: The ID of the media that a thumbnail is being requested for. + url_cache: True if this is from a URL cache. + server_name: The server name, if this is a remote thumbnail. + + Returns: + The thumbnail which best matches the desired parameters. + """ + desired_method = desired_method.lower() + + # The chosen thumbnail. + thumbnail_info = None + + d_w = desired_width + d_h = desired_height + + if desired_method == "crop": + # Thumbnails that match equal or larger sizes of desired width/height. + crop_info_list: List[Tuple[int, int, int, bool, int, Dict[str, Any]]] = [] + # Other thumbnails. + crop_info_list2: List[Tuple[int, int, int, bool, int, Dict[str, Any]]] = [] + for info in thumbnail_infos: + # Skip thumbnails generated with different methods. 
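                # As a sketch of the scoring below (hypothetical sizes): for a
                # desired 96x96 crop, a 320x240 candidate satisfies
                # `t_w >= d_w or t_h >= d_h` and goes into crop_info_list, while
                # a 32x32 candidate goes into crop_info_list2, so candidates at
                # least as large as the request in some dimension are preferred;
                # within each list, min() over the score tuples (ignoring the
                # trailing info dict) picks the winner.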
+ if info["thumbnail_method"] != "crop": + continue + + t_w = info["thumbnail_width"] + t_h = info["thumbnail_height"] + aspect_quality = abs(d_w * t_h - d_h * t_w) + min_quality = 0 if d_w <= t_w and d_h <= t_h else 1 + size_quality = abs((d_w - t_w) * (d_h - t_h)) + type_quality = desired_type != info["thumbnail_type"] + length_quality = info["thumbnail_length"] + if t_w >= d_w or t_h >= d_h: + crop_info_list.append( + ( + aspect_quality, + min_quality, + size_quality, + type_quality, + length_quality, + info, + ) + ) + else: + crop_info_list2.append( + ( + aspect_quality, + min_quality, + size_quality, + type_quality, + length_quality, + info, + ) + ) + # Pick the most appropriate thumbnail. Some values of `desired_width` and + # `desired_height` may result in a tie, in which case we avoid comparing on + # the thumbnail info dictionary and pick the thumbnail that appears earlier + # in the list of candidates. + if crop_info_list: + thumbnail_info = min(crop_info_list, key=lambda t: t[:-1])[-1] + elif crop_info_list2: + thumbnail_info = min(crop_info_list2, key=lambda t: t[:-1])[-1] + elif desired_method == "scale": + # Thumbnails that match equal or larger sizes of desired width/height. + info_list: List[Tuple[int, bool, int, Dict[str, Any]]] = [] + # Other thumbnails. + info_list2: List[Tuple[int, bool, int, Dict[str, Any]]] = [] + + for info in thumbnail_infos: + # Skip thumbnails generated with different methods. + if info["thumbnail_method"] != "scale": + continue + + t_w = info["thumbnail_width"] + t_h = info["thumbnail_height"] + size_quality = abs((d_w - t_w) * (d_h - t_h)) + type_quality = desired_type != info["thumbnail_type"] + length_quality = info["thumbnail_length"] + if t_w >= d_w or t_h >= d_h: + info_list.append((size_quality, type_quality, length_quality, info)) + else: + info_list2.append( + (size_quality, type_quality, length_quality, info) + ) + # Pick the most appropriate thumbnail. Some values of `desired_width` and + # `desired_height` may result in a tie, in which case we avoid comparing on + # the thumbnail info dictionary and pick the thumbnail that appears earlier + # in the list of candidates. + if info_list: + thumbnail_info = min(info_list, key=lambda t: t[:-1])[-1] + elif info_list2: + thumbnail_info = min(info_list2, key=lambda t: t[:-1])[-1] + + if thumbnail_info: + return FileInfo( + file_id=file_id, + url_cache=url_cache, + server_name=server_name, + thumbnail=ThumbnailInfo( + width=thumbnail_info["thumbnail_width"], + height=thumbnail_info["thumbnail_height"], + type=thumbnail_info["thumbnail_type"], + method=thumbnail_info["thumbnail_method"], + length=thumbnail_info["thumbnail_length"], + ), + ) + + # No matching thumbnail was found. + return None diff --git a/synapse/rest/media/upload_resource.py b/synapse/rest/media/upload_resource.py new file mode 100644 index 0000000000..697348613b --- /dev/null +++ b/synapse/rest/media/upload_resource.py @@ -0,0 +1,108 @@ +# Copyright 2014-2016 OpenMarket Ltd +# Copyright 2020-2021 The Matrix.org Foundation C.I.C. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +import logging +from typing import IO, TYPE_CHECKING, Dict, List, Optional + +from synapse.api.errors import Codes, SynapseError +from synapse.http.server import DirectServeJsonResource, respond_with_json +from synapse.http.servlet import parse_bytes_from_args +from synapse.http.site import SynapseRequest +from synapse.media.media_storage import SpamMediaException + +if TYPE_CHECKING: + from synapse.media.media_repository import MediaRepository + from synapse.server import HomeServer + +logger = logging.getLogger(__name__) + + +class UploadResource(DirectServeJsonResource): + isLeaf = True + + def __init__(self, hs: "HomeServer", media_repo: "MediaRepository"): + super().__init__() + + self.media_repo = media_repo + self.filepaths = media_repo.filepaths + self.store = hs.get_datastores().main + self.clock = hs.get_clock() + self.server_name = hs.hostname + self.auth = hs.get_auth() + self.max_upload_size = hs.config.media.max_upload_size + self.clock = hs.get_clock() + + async def _async_render_OPTIONS(self, request: SynapseRequest) -> None: + respond_with_json(request, 200, {}, send_cors=True) + + async def _async_render_POST(self, request: SynapseRequest) -> None: + requester = await self.auth.get_user_by_req(request) + raw_content_length = request.getHeader("Content-Length") + if raw_content_length is None: + raise SynapseError(msg="Request must specify a Content-Length", code=400) + try: + content_length = int(raw_content_length) + except ValueError: + raise SynapseError(msg="Content-Length value is invalid", code=400) + if content_length > self.max_upload_size: + raise SynapseError( + msg="Upload request body is too large", + code=413, + errcode=Codes.TOO_LARGE, + ) + + args: Dict[bytes, List[bytes]] = request.args # type: ignore + upload_name_bytes = parse_bytes_from_args(args, "filename") + if upload_name_bytes: + try: + upload_name: Optional[str] = upload_name_bytes.decode("utf8") + except UnicodeDecodeError: + raise SynapseError( + msg="Invalid UTF-8 filename parameter: %r" % (upload_name_bytes,), + code=400, + ) + + # If the name is falsey (e.g. an empty byte string) ensure it is None. + else: + upload_name = None + + headers = request.requestHeaders + + if headers.hasHeader(b"Content-Type"): + content_type_headers = headers.getRawHeaders(b"Content-Type") + assert content_type_headers # for mypy + media_type = content_type_headers[0].decode("ascii") + else: + media_type = "application/octet-stream" + + # if headers.hasHeader(b"Content-Disposition"): + # disposition = headers.getRawHeaders(b"Content-Disposition")[0] + # TODO(markjh): parse content-dispostion + + try: + content: IO = request.content # type: ignore + content_uri = await self.media_repo.create_content( + media_type, upload_name, content, content_length, requester.user + ) + except SpamMediaException: + # For uploading of media we want to respond with a 400, instead of + # the default 404, as that would just be confusing. + raise SynapseError(400, "Bad content") + + logger.info("Uploaded content with URI '%s'", content_uri) + + respond_with_json( + request, 200, {"content_uri": str(content_uri)}, send_cors=True + ) diff --git a/synapse/rest/media/v1/_base.py b/synapse/rest/media/v1/_base.py index ef8334ae25..88427a5737 100644 --- a/synapse/rest/media/v1/_base.py +++ b/synapse/rest/media/v1/_base.py @@ -1,5 +1,4 @@ -# Copyright 2014-2016 OpenMarket Ltd -# Copyright 2019-2021 The Matrix.org Foundation C.I.C. 
+# Copyright 2023 The Matrix.org Foundation C.I.C. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -12,468 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +# -import logging -import os -import urllib -from abc import ABC, abstractmethod -from types import TracebackType -from typing import Awaitable, Dict, Generator, List, Optional, Tuple, Type - -import attr - -from twisted.internet.interfaces import IConsumer -from twisted.protocols.basic import FileSender -from twisted.web.server import Request - -from synapse.api.errors import Codes, SynapseError, cs_error -from synapse.http.server import finish_request, respond_with_json -from synapse.http.site import SynapseRequest -from synapse.logging.context import make_deferred_yieldable -from synapse.util.stringutils import is_ascii, parse_and_validate_server_name - -logger = logging.getLogger(__name__) - -# list all text content types that will have the charset default to UTF-8 when -# none is given -TEXT_CONTENT_TYPES = [ - "text/css", - "text/csv", - "text/html", - "text/calendar", - "text/plain", - "text/javascript", - "application/json", - "application/ld+json", - "application/rtf", - "image/svg+xml", - "text/xml", -] - - -def parse_media_id(request: Request) -> Tuple[str, str, Optional[str]]: - """Parses the server name, media ID and optional file name from the request URI - - Also performs some rough validation on the server name. - - Args: - request: The `Request`. - - Returns: - A tuple containing the parsed server name, media ID and optional file name. - - Raises: - SynapseError(404): if parsing or validation fail for any reason - """ - try: - # The type on postpath seems incorrect in Twisted 21.2.0. - postpath: List[bytes] = request.postpath # type: ignore - assert postpath - - # This allows users to append e.g. /test.png to the URL. Useful for - # clients that parse the URL to see content type. 
- server_name_bytes, media_id_bytes = postpath[:2] - server_name = server_name_bytes.decode("utf-8") - media_id = media_id_bytes.decode("utf8") - - # Validate the server name, raising if invalid - parse_and_validate_server_name(server_name) - - file_name = None - if len(postpath) > 2: - try: - file_name = urllib.parse.unquote(postpath[-1].decode("utf-8")) - except UnicodeDecodeError: - pass - return server_name, media_id, file_name - except Exception: - raise SynapseError( - 404, "Invalid media id token %r" % (request.postpath,), Codes.UNKNOWN - ) - - -def respond_404(request: SynapseRequest) -> None: - respond_with_json( - request, - 404, - cs_error("Not found %r" % (request.postpath,), code=Codes.NOT_FOUND), - send_cors=True, - ) - - -async def respond_with_file( - request: SynapseRequest, - media_type: str, - file_path: str, - file_size: Optional[int] = None, - upload_name: Optional[str] = None, -) -> None: - logger.debug("Responding with %r", file_path) - - if os.path.isfile(file_path): - if file_size is None: - stat = os.stat(file_path) - file_size = stat.st_size - - add_file_headers(request, media_type, file_size, upload_name) - - with open(file_path, "rb") as f: - await make_deferred_yieldable(FileSender().beginFileTransfer(f, request)) - - finish_request(request) - else: - respond_404(request) - - -def add_file_headers( - request: Request, - media_type: str, - file_size: Optional[int], - upload_name: Optional[str], -) -> None: - """Adds the correct response headers in preparation for responding with the - media. - - Args: - request - media_type: The media/content type. - file_size: Size in bytes of the media, if known. - upload_name: The name of the requested file, if any. - """ - - def _quote(x: str) -> str: - return urllib.parse.quote(x.encode("utf-8")) - - # Default to a UTF-8 charset for text content types. - # ex, uses UTF-8 for 'text/css' but not 'text/css; charset=UTF-16' - if media_type.lower() in TEXT_CONTENT_TYPES: - content_type = media_type + "; charset=UTF-8" - else: - content_type = media_type - - request.setHeader(b"Content-Type", content_type.encode("UTF-8")) - if upload_name: - # RFC6266 section 4.1 [1] defines both `filename` and `filename*`. - # - # `filename` is defined to be a `value`, which is defined by RFC2616 - # section 3.6 [2] to be a `token` or a `quoted-string`, where a `token` - # is (essentially) a single US-ASCII word, and a `quoted-string` is a - # US-ASCII string surrounded by double-quotes, using backslash as an - # escape character. Note that %-encoding is *not* permitted. - # - # `filename*` is defined to be an `ext-value`, which is defined in - # RFC5987 section 3.2.1 [3] to be `charset "'" [ language ] "'" value-chars`, - # where `value-chars` is essentially a %-encoded string in the given charset. - # - # [1]: https://tools.ietf.org/html/rfc6266#section-4.1 - # [2]: https://tools.ietf.org/html/rfc2616#section-3.6 - # [3]: https://tools.ietf.org/html/rfc5987#section-3.2.1 - - # We avoid the quoted-string version of `filename`, because (a) synapse didn't - # correctly interpret those as of 0.99.2 and (b) they are a bit of a pain and we - # may as well just do the filename* version. - if _can_encode_filename_as_token(upload_name): - disposition = "inline; filename=%s" % (upload_name,) - else: - disposition = "inline; filename*=utf-8''%s" % (_quote(upload_name),) - - request.setHeader(b"Content-Disposition", disposition.encode("ascii")) - - # cache for at least a day. 
- # XXX: we might want to turn this off for data we don't want to - # recommend caching as it's sensitive or private - or at least - # select private. don't bother setting Expires as all our - # clients are smart enough to be happy with Cache-Control - request.setHeader(b"Cache-Control", b"public,max-age=86400,s-maxage=86400") - if file_size is not None: - request.setHeader(b"Content-Length", b"%d" % (file_size,)) - - # Tell web crawlers to not index, archive, or follow links in media. This - # should help to prevent things in the media repo from showing up in web - # search results. - request.setHeader(b"X-Robots-Tag", "noindex, nofollow, noarchive, noimageindex") - - -# separators as defined in RFC2616. SP and HT are handled separately. -# see _can_encode_filename_as_token. -_FILENAME_SEPARATOR_CHARS = { - "(", - ")", - "<", - ">", - "@", - ",", - ";", - ":", - "\\", - '"', - "/", - "[", - "]", - "?", - "=", - "{", - "}", -} - - -def _can_encode_filename_as_token(x: str) -> bool: - for c in x: - # from RFC2616: - # - # token = 1* - # - # separators = "(" | ")" | "<" | ">" | "@" - # | "," | ";" | ":" | "\" | <"> - # | "/" | "[" | "]" | "?" | "=" - # | "{" | "}" | SP | HT - # - # CHAR = - # - # CTL = - # - if ord(c) >= 127 or ord(c) <= 32 or c in _FILENAME_SEPARATOR_CHARS: - return False - return True - - -async def respond_with_responder( - request: SynapseRequest, - responder: "Optional[Responder]", - media_type: str, - file_size: Optional[int], - upload_name: Optional[str] = None, -) -> None: - """Responds to the request with given responder. If responder is None then - returns 404. - - Args: - request - responder - media_type: The media/content type. - file_size: Size in bytes of the media. If not known it should be None - upload_name: The name of the requested file, if any. - """ - if not responder: - respond_404(request) - return - - # If we have a responder we *must* use it as a context manager. - with responder: - if request._disconnected: - logger.warning( - "Not sending response to request %s, already disconnected.", request - ) - return - - logger.debug("Responding to media request with responder %s", responder) - add_file_headers(request, media_type, file_size, upload_name) - try: - await responder.write_to_consumer(request) - except Exception as e: - # The majority of the time this will be due to the client having gone - # away. Unfortunately, Twisted simply throws a generic exception at us - # in that case. - logger.warning("Failed to write to consumer: %s %s", type(e), e) - - # Unregister the producer, if it has one, so Twisted doesn't complain - if request.producer: - request.unregisterProducer() - - finish_request(request) - - -class Responder(ABC): - """Represents a response that can be streamed to the requester. - - Responder is a context manager which *must* be used, so that any resources - held can be cleaned up. - """ - - @abstractmethod - def write_to_consumer(self, consumer: IConsumer) -> Awaitable: - """Stream response into consumer - - Args: - consumer: The consumer to stream into. 
- - Returns: - Resolves once the response has finished being written - """ - raise NotImplementedError() - - def __enter__(self) -> None: # noqa: B027 - pass - - def __exit__( # noqa: B027 - self, - exc_type: Optional[Type[BaseException]], - exc_val: Optional[BaseException], - exc_tb: Optional[TracebackType], - ) -> None: - pass - - -@attr.s(slots=True, frozen=True, auto_attribs=True) -class ThumbnailInfo: - """Details about a generated thumbnail.""" - - width: int - height: int - method: str - # Content type of thumbnail, e.g. image/png - type: str - # The size of the media file, in bytes. - length: Optional[int] = None - - -@attr.s(slots=True, frozen=True, auto_attribs=True) -class FileInfo: - """Details about a requested/uploaded file.""" - - # The server name where the media originated from, or None if local. - server_name: Optional[str] - # The local ID of the file. For local files this is the same as the media_id - file_id: str - # If the file is for the url preview cache - url_cache: bool = False - # Whether the file is a thumbnail or not. - thumbnail: Optional[ThumbnailInfo] = None - - # The below properties exist to maintain compatibility with third-party modules. - @property - def thumbnail_width(self) -> Optional[int]: - if not self.thumbnail: - return None - return self.thumbnail.width - - @property - def thumbnail_height(self) -> Optional[int]: - if not self.thumbnail: - return None - return self.thumbnail.height - - @property - def thumbnail_method(self) -> Optional[str]: - if not self.thumbnail: - return None - return self.thumbnail.method - - @property - def thumbnail_type(self) -> Optional[str]: - if not self.thumbnail: - return None - return self.thumbnail.type - - @property - def thumbnail_length(self) -> Optional[int]: - if not self.thumbnail: - return None - return self.thumbnail.length - - -def get_filename_from_headers(headers: Dict[bytes, List[bytes]]) -> Optional[str]: - """ - Get the filename of the downloaded file by inspecting the - Content-Disposition HTTP header. - - Args: - headers: The HTTP request headers. - - Returns: - The filename, or None. - """ - content_disposition = headers.get(b"Content-Disposition", [b""]) - - # No header, bail out. - if not content_disposition[0]: - return None - - _, params = _parse_header(content_disposition[0]) - - upload_name = None - - # First check if there is a valid UTF-8 filename - upload_name_utf8 = params.get(b"filename*", None) - if upload_name_utf8: - if upload_name_utf8.lower().startswith(b"utf-8''"): - upload_name_utf8 = upload_name_utf8[7:] - # We have a filename*= section. This MUST be ASCII, and any UTF-8 - # bytes are %-quoted. - try: - # Once it is decoded, we can then unquote the %-encoded - # parts strictly into a unicode string. - upload_name = urllib.parse.unquote( - upload_name_utf8.decode("ascii"), errors="strict" - ) - except UnicodeDecodeError: - # Incorrect UTF-8. - pass - - # If there isn't check for an ascii name. - if not upload_name: - upload_name_ascii = params.get(b"filename", None) - if upload_name_ascii and is_ascii(upload_name_ascii): - upload_name = upload_name_ascii.decode("ascii") - - # This may be None here, indicating we did not find a matching name. - return upload_name - - -def _parse_header(line: bytes) -> Tuple[bytes, Dict[bytes, bytes]]: - """Parse a Content-type like header. - - Cargo-culted from `cgi`, but works on bytes rather than strings. 
- - Args: - line: header to be parsed - - Returns: - The main content-type, followed by the parameter dictionary - """ - parts = _parseparam(b";" + line) - key = next(parts) - pdict = {} - for p in parts: - i = p.find(b"=") - if i >= 0: - name = p[:i].strip().lower() - value = p[i + 1 :].strip() - - # strip double-quotes - if len(value) >= 2 and value[0:1] == value[-1:] == b'"': - value = value[1:-1] - value = value.replace(b"\\\\", b"\\").replace(b'\\"', b'"') - pdict[name] = value - - return key, pdict - - -def _parseparam(s: bytes) -> Generator[bytes, None, None]: - """Generator which splits the input on ;, respecting double-quoted sequences - - Cargo-culted from `cgi`, but works on bytes rather than strings. - - Args: - s: header to be parsed - - Returns: - The split input - """ - while s[:1] == b";": - s = s[1:] - - # look for the next ; - end = s.find(b";") - - # if there is an odd number of " marks between here and the next ;, skip to the - # next ; instead - while end > 0 and (s.count(b'"', 0, end) - s.count(b'\\"', 0, end)) % 2: - end = s.find(b";", end + 1) - - if end < 0: - end = len(s) - f = s[:end] - yield f.strip() - s = s[end:] +# This exists purely for backwards compatibility with media providers and spam checkers. +from synapse.media._base import FileInfo, Responder # noqa: F401 diff --git a/synapse/rest/media/v1/config_resource.py b/synapse/rest/media/v1/config_resource.py deleted file mode 100644 index a95804d327..0000000000 --- a/synapse/rest/media/v1/config_resource.py +++ /dev/null @@ -1,41 +0,0 @@ -# Copyright 2018 Will Hunt -# Copyright 2020-2021 The Matrix.org Foundation C.I.C. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -from typing import TYPE_CHECKING - -from synapse.http.server import DirectServeJsonResource, respond_with_json -from synapse.http.site import SynapseRequest - -if TYPE_CHECKING: - from synapse.server import HomeServer - - -class MediaConfigResource(DirectServeJsonResource): - isLeaf = True - - def __init__(self, hs: "HomeServer"): - super().__init__() - config = hs.config - self.clock = hs.get_clock() - self.auth = hs.get_auth() - self.limits_dict = {"m.upload.size": config.media.max_upload_size} - - async def _async_render_GET(self, request: SynapseRequest) -> None: - await self.auth.get_user_by_req(request) - respond_with_json(request, 200, self.limits_dict, send_cors=True) - - async def _async_render_OPTIONS(self, request: SynapseRequest) -> None: - respond_with_json(request, 200, {}, send_cors=True) diff --git a/synapse/rest/media/v1/download_resource.py b/synapse/rest/media/v1/download_resource.py deleted file mode 100644 index 048a042692..0000000000 --- a/synapse/rest/media/v1/download_resource.py +++ /dev/null @@ -1,76 +0,0 @@ -# Copyright 2014-2016 OpenMarket Ltd -# Copyright 2020-2021 The Matrix.org Foundation C.I.C. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
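As the relocated `add_file_headers` explains above, a filename that survives as an RFC 2616 token is emitted as a bare `filename=`, and everything else is percent-encoded into the RFC 5987 `filename*=utf-8''...` form. A condensed sketch of that decision, following the same rules:

    import urllib.parse

    SEPARATORS = set('()<>@,;:\\"/[]?={}')

    def can_encode_as_token(name: str) -> bool:
        # A token is printable US-ASCII with no separators, SP or HT.
        return all(32 < ord(c) < 127 and c not in SEPARATORS for c in name)

    def content_disposition(upload_name: str) -> str:
        if can_encode_as_token(upload_name):
            return "inline; filename=%s" % (upload_name,)
        return "inline; filename*=utf-8''%s" % (
            urllib.parse.quote(upload_name.encode("utf-8")),
        )

    print(content_disposition("cat.png"))      # inline; filename=cat.png
    print(content_disposition("猫の写真.png"))  # inline; filename*=utf-8''%E7%8C%AB...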
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -import logging -from typing import TYPE_CHECKING - -from synapse.http.server import ( - DirectServeJsonResource, - set_corp_headers, - set_cors_headers, -) -from synapse.http.servlet import parse_boolean -from synapse.http.site import SynapseRequest - -from ._base import parse_media_id, respond_404 - -if TYPE_CHECKING: - from synapse.rest.media.v1.media_repository import MediaRepository - from synapse.server import HomeServer - -logger = logging.getLogger(__name__) - - -class DownloadResource(DirectServeJsonResource): - isLeaf = True - - def __init__(self, hs: "HomeServer", media_repo: "MediaRepository"): - super().__init__() - self.media_repo = media_repo - self.server_name = hs.hostname - - async def _async_render_GET(self, request: SynapseRequest) -> None: - set_cors_headers(request) - set_corp_headers(request) - request.setHeader( - b"Content-Security-Policy", - b"sandbox;" - b" default-src 'none';" - b" script-src 'none';" - b" plugin-types application/pdf;" - b" style-src 'unsafe-inline';" - b" media-src 'self';" - b" object-src 'self';", - ) - # Limited non-standard form of CSP for IE11 - request.setHeader(b"X-Content-Security-Policy", b"sandbox;") - request.setHeader( - b"Referrer-Policy", - b"no-referrer", - ) - server_name, media_id, name = parse_media_id(request) - if server_name == self.server_name: - await self.media_repo.get_local_media(request, media_id, name) - else: - allow_remote = parse_boolean(request, "allow_remote", default=True) - if not allow_remote: - logger.info( - "Rejecting request for remote media %s/%s due to allow_remote", - server_name, - media_id, - ) - respond_404(request) - return - - await self.media_repo.get_remote_media(request, server_name, media_id, name) diff --git a/synapse/rest/media/v1/filepath.py b/synapse/rest/media/v1/filepath.py deleted file mode 100644 index 1f6441c412..0000000000 --- a/synapse/rest/media/v1/filepath.py +++ /dev/null @@ -1,410 +0,0 @@ -# Copyright 2014-2016 OpenMarket Ltd -# Copyright 2020-2021 The Matrix.org Foundation C.I.C. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import functools -import os -import re -import string -from typing import Any, Callable, List, TypeVar, Union, cast - -NEW_FORMAT_ID_RE = re.compile(r"^\d\d\d\d-\d\d-\d\d") - - -F = TypeVar("F", bound=Callable[..., str]) - - -def _wrap_in_base_path(func: F) -> F: - """Takes a function that returns a relative path and turns it into an - absolute path based on the location of the primary media store - """ - - @functools.wraps(func) - def _wrapped(self: "MediaFilePaths", *args: Any, **kwargs: Any) -> str: - path = func(self, *args, **kwargs) - return os.path.join(self.base_path, path) - - return cast(F, _wrapped) - - -GetPathMethod = TypeVar( - "GetPathMethod", bound=Union[Callable[..., str], Callable[..., List[str]]] -) - - -def _wrap_with_jail_check(relative: bool) -> Callable[[GetPathMethod], GetPathMethod]: - """Wraps a path-returning method to check that the returned path(s) do not escape - the media store directory. - - The path-returning method may return either a single path, or a list of paths. - - The check is not expected to ever fail, unless `func` is missing a call to - `_validate_path_component`, or `_validate_path_component` is buggy. - - Args: - relative: A boolean indicating whether the wrapped method returns paths relative - to the media store directory. - - Returns: - A method which will wrap a path-returning method, adding a check to ensure that - the returned path(s) lie within the media store directory. The check will raise - a `ValueError` if it fails. - """ - - def _wrap_with_jail_check_inner(func: GetPathMethod) -> GetPathMethod: - @functools.wraps(func) - def _wrapped( - self: "MediaFilePaths", *args: Any, **kwargs: Any - ) -> Union[str, List[str]]: - path_or_paths = func(self, *args, **kwargs) - - if isinstance(path_or_paths, list): - paths_to_check = path_or_paths - else: - paths_to_check = [path_or_paths] - - for path in paths_to_check: - # Construct the path that will ultimately be used. - # We cannot guess whether `path` is relative to the media store - # directory, since the media store directory may itself be a relative - # path. - if relative: - path = os.path.join(self.base_path, path) - normalized_path = os.path.normpath(path) - - # Now that `normpath` has eliminated `../`s and `./`s from the path, - # `os.path.commonpath` can be used to check whether it lies within the - # media store directory. - if ( - os.path.commonpath([normalized_path, self.normalized_base_path]) - != self.normalized_base_path - ): - # The path resolves to outside the media store directory, - # or `self.base_path` is `.`, which is an unlikely configuration. - raise ValueError(f"Invalid media store path: {path!r}") - - # Note that `os.path.normpath`/`abspath` has a subtle caveat: - # `a/b/c/../c` will normalize to `a/b/c`, but the former refers to a - # different path if `a/b/c` is a symlink. That is, the check above is - # not perfect and may allow a certain restricted subset of untrustworthy - # paths through. Since the check above is secondary to the main - # `_validate_path_component` checks, it's less important for it to be - # perfect. - # - # As an alternative, `os.path.realpath` will resolve symlinks, but - # proves problematic if there are symlinks inside the media store. - # eg. if `url_store/` is symlinked to elsewhere, its canonical path - # won't match that of the main media store directory. 
- - return path_or_paths - - return cast(GetPathMethod, _wrapped) - - return _wrap_with_jail_check_inner - - -ALLOWED_CHARACTERS = set( - string.ascii_letters - + string.digits - + "_-" - + ".[]:" # Domain names, IPv6 addresses and ports in server names -) -FORBIDDEN_NAMES = { - "", - os.path.curdir, # "." for the current platform - os.path.pardir, # ".." for the current platform -} - - -def _validate_path_component(name: str) -> str: - """Checks that the given string can be safely used as a path component - - Args: - name: The path component to check. - - Returns: - The path component if valid. - - Raises: - ValueError: If `name` cannot be safely used as a path component. - """ - if not ALLOWED_CHARACTERS.issuperset(name) or name in FORBIDDEN_NAMES: - raise ValueError(f"Invalid path component: {name!r}") - - return name - - -class MediaFilePaths: - """Describes where files are stored on disk. - - Most of the functions have a `*_rel` variant which returns a file path that - is relative to the base media store path. This is mainly used when we want - to write to the backup media store (when one is configured) - """ - - def __init__(self, primary_base_path: str): - self.base_path = primary_base_path - self.normalized_base_path = os.path.normpath(self.base_path) - - # Refuse to initialize if paths cannot be validated correctly for the current - # platform. - assert os.path.sep not in ALLOWED_CHARACTERS - assert os.path.altsep not in ALLOWED_CHARACTERS - # On Windows, paths have all sorts of weirdness which `_validate_path_component` - # does not consider. In any case, the remote media store can't work correctly - # for certain homeservers there, since ":"s aren't allowed in paths. - assert os.name == "posix" - - @_wrap_with_jail_check(relative=True) - def local_media_filepath_rel(self, media_id: str) -> str: - return os.path.join( - "local_content", - _validate_path_component(media_id[0:2]), - _validate_path_component(media_id[2:4]), - _validate_path_component(media_id[4:]), - ) - - local_media_filepath = _wrap_in_base_path(local_media_filepath_rel) - - @_wrap_with_jail_check(relative=True) - def local_media_thumbnail_rel( - self, media_id: str, width: int, height: int, content_type: str, method: str - ) -> str: - top_level_type, sub_type = content_type.split("/") - file_name = "%i-%i-%s-%s-%s" % (width, height, top_level_type, sub_type, method) - return os.path.join( - "local_thumbnails", - _validate_path_component(media_id[0:2]), - _validate_path_component(media_id[2:4]), - _validate_path_component(media_id[4:]), - _validate_path_component(file_name), - ) - - local_media_thumbnail = _wrap_in_base_path(local_media_thumbnail_rel) - - @_wrap_with_jail_check(relative=False) - def local_media_thumbnail_dir(self, media_id: str) -> str: - """ - Retrieve the local store path of thumbnails of a given media_id - - Args: - media_id: The media ID to query. 
- Returns: - Path of local_thumbnails from media_id - """ - return os.path.join( - self.base_path, - "local_thumbnails", - _validate_path_component(media_id[0:2]), - _validate_path_component(media_id[2:4]), - _validate_path_component(media_id[4:]), - ) - - @_wrap_with_jail_check(relative=True) - def remote_media_filepath_rel(self, server_name: str, file_id: str) -> str: - return os.path.join( - "remote_content", - _validate_path_component(server_name), - _validate_path_component(file_id[0:2]), - _validate_path_component(file_id[2:4]), - _validate_path_component(file_id[4:]), - ) - - remote_media_filepath = _wrap_in_base_path(remote_media_filepath_rel) - - @_wrap_with_jail_check(relative=True) - def remote_media_thumbnail_rel( - self, - server_name: str, - file_id: str, - width: int, - height: int, - content_type: str, - method: str, - ) -> str: - top_level_type, sub_type = content_type.split("/") - file_name = "%i-%i-%s-%s-%s" % (width, height, top_level_type, sub_type, method) - return os.path.join( - "remote_thumbnail", - _validate_path_component(server_name), - _validate_path_component(file_id[0:2]), - _validate_path_component(file_id[2:4]), - _validate_path_component(file_id[4:]), - _validate_path_component(file_name), - ) - - remote_media_thumbnail = _wrap_in_base_path(remote_media_thumbnail_rel) - - # Legacy path that was used to store thumbnails previously. - # Should be removed after some time, when most of the thumbnails are stored - # using the new path. - @_wrap_with_jail_check(relative=True) - def remote_media_thumbnail_rel_legacy( - self, server_name: str, file_id: str, width: int, height: int, content_type: str - ) -> str: - top_level_type, sub_type = content_type.split("/") - file_name = "%i-%i-%s-%s" % (width, height, top_level_type, sub_type) - return os.path.join( - "remote_thumbnail", - _validate_path_component(server_name), - _validate_path_component(file_id[0:2]), - _validate_path_component(file_id[2:4]), - _validate_path_component(file_id[4:]), - _validate_path_component(file_name), - ) - - @_wrap_with_jail_check(relative=False) - def remote_media_thumbnail_dir(self, server_name: str, file_id: str) -> str: - return os.path.join( - self.base_path, - "remote_thumbnail", - _validate_path_component(server_name), - _validate_path_component(file_id[0:2]), - _validate_path_component(file_id[2:4]), - _validate_path_component(file_id[4:]), - ) - - @_wrap_with_jail_check(relative=True) - def url_cache_filepath_rel(self, media_id: str) -> str: - if NEW_FORMAT_ID_RE.match(media_id): - # Media id is of the form - # E.g.: 2017-09-28-fsdRDt24DS234dsf - return os.path.join( - "url_cache", - _validate_path_component(media_id[:10]), - _validate_path_component(media_id[11:]), - ) - else: - return os.path.join( - "url_cache", - _validate_path_component(media_id[0:2]), - _validate_path_component(media_id[2:4]), - _validate_path_component(media_id[4:]), - ) - - url_cache_filepath = _wrap_in_base_path(url_cache_filepath_rel) - - @_wrap_with_jail_check(relative=False) - def url_cache_filepath_dirs_to_delete(self, media_id: str) -> List[str]: - "The dirs to try and remove if we delete the media_id file" - if NEW_FORMAT_ID_RE.match(media_id): - return [ - os.path.join( - self.base_path, "url_cache", _validate_path_component(media_id[:10]) - ) - ] - else: - return [ - os.path.join( - self.base_path, - "url_cache", - _validate_path_component(media_id[0:2]), - _validate_path_component(media_id[2:4]), - ), - os.path.join( - self.base_path, "url_cache", _validate_path_component(media_id[0:2]) - ), 
- ] - - @_wrap_with_jail_check(relative=True) - def url_cache_thumbnail_rel( - self, media_id: str, width: int, height: int, content_type: str, method: str - ) -> str: - # Media id is of the form - # E.g.: 2017-09-28-fsdRDt24DS234dsf - - top_level_type, sub_type = content_type.split("/") - file_name = "%i-%i-%s-%s-%s" % (width, height, top_level_type, sub_type, method) - - if NEW_FORMAT_ID_RE.match(media_id): - return os.path.join( - "url_cache_thumbnails", - _validate_path_component(media_id[:10]), - _validate_path_component(media_id[11:]), - _validate_path_component(file_name), - ) - else: - return os.path.join( - "url_cache_thumbnails", - _validate_path_component(media_id[0:2]), - _validate_path_component(media_id[2:4]), - _validate_path_component(media_id[4:]), - _validate_path_component(file_name), - ) - - url_cache_thumbnail = _wrap_in_base_path(url_cache_thumbnail_rel) - - @_wrap_with_jail_check(relative=True) - def url_cache_thumbnail_directory_rel(self, media_id: str) -> str: - # Media id is of the form - # E.g.: 2017-09-28-fsdRDt24DS234dsf - - if NEW_FORMAT_ID_RE.match(media_id): - return os.path.join( - "url_cache_thumbnails", - _validate_path_component(media_id[:10]), - _validate_path_component(media_id[11:]), - ) - else: - return os.path.join( - "url_cache_thumbnails", - _validate_path_component(media_id[0:2]), - _validate_path_component(media_id[2:4]), - _validate_path_component(media_id[4:]), - ) - - url_cache_thumbnail_directory = _wrap_in_base_path( - url_cache_thumbnail_directory_rel - ) - - @_wrap_with_jail_check(relative=False) - def url_cache_thumbnail_dirs_to_delete(self, media_id: str) -> List[str]: - "The dirs to try and remove if we delete the media_id thumbnails" - # Media id is of the form - # E.g.: 2017-09-28-fsdRDt24DS234dsf - if NEW_FORMAT_ID_RE.match(media_id): - return [ - os.path.join( - self.base_path, - "url_cache_thumbnails", - _validate_path_component(media_id[:10]), - _validate_path_component(media_id[11:]), - ), - os.path.join( - self.base_path, - "url_cache_thumbnails", - _validate_path_component(media_id[:10]), - ), - ] - else: - return [ - os.path.join( - self.base_path, - "url_cache_thumbnails", - _validate_path_component(media_id[0:2]), - _validate_path_component(media_id[2:4]), - _validate_path_component(media_id[4:]), - ), - os.path.join( - self.base_path, - "url_cache_thumbnails", - _validate_path_component(media_id[0:2]), - _validate_path_component(media_id[2:4]), - ), - os.path.join( - self.base_path, - "url_cache_thumbnails", - _validate_path_component(media_id[0:2]), - ), - ] diff --git a/synapse/rest/media/v1/media_repository.py b/synapse/rest/media/v1/media_repository.py deleted file mode 100644 index c70e1837af..0000000000 --- a/synapse/rest/media/v1/media_repository.py +++ /dev/null @@ -1,1112 +0,0 @@ -# Copyright 2014-2016 OpenMarket Ltd -# Copyright 2018-2021 The Matrix.org Foundation C.I.C. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
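The path handling removed above (now living in `synapse.media.filepath`) layers two defences: every path component must pass an allow-list check that also rejects `.` and `..`, and the joined path is normalised and compared against the media store root before use. A condensed sketch of those two checks, with a made-up base path:

    import os
    import string

    ALLOWED = set(string.ascii_letters + string.digits + "_-" + ".[]:")
    FORBIDDEN = {"", os.path.curdir, os.path.pardir}

    def validate_component(name: str) -> str:
        if not ALLOWED.issuperset(name) or name in FORBIDDEN:
            raise ValueError(f"Invalid path component: {name!r}")
        return name

    def jailed_join(base_path: str, *components: str) -> str:
        path = os.path.join(base_path, *(validate_component(c) for c in components))
        normalized = os.path.normpath(path)
        base = os.path.normpath(base_path)
        # Reject anything that normalises to outside the media store directory.
        if os.path.commonpath([normalized, base]) != base:
            raise ValueError(f"Invalid media store path: {path!r}")
        return normalized

    print(jailed_join("/media_store", "local_content", "ab", "cd", "efgh"))
    # jailed_join("/media_store", "..", "etc", "passwd") raises ValueError

As the original comments note, `normpath` does not resolve symlinks, so the jail check is a backstop for the per-component validation rather than a complete defence on its own.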
-import errno -import logging -import os -import shutil -from io import BytesIO -from typing import IO, TYPE_CHECKING, Dict, List, Optional, Set, Tuple - -from matrix_common.types.mxc_uri import MXCUri - -import twisted.internet.error -import twisted.web.http -from twisted.internet.defer import Deferred - -from synapse.api.errors import ( - FederationDeniedError, - HttpResponseException, - NotFoundError, - RequestSendFailed, - SynapseError, -) -from synapse.config._base import ConfigError -from synapse.config.repository import ThumbnailRequirement -from synapse.http.server import UnrecognizedRequestResource -from synapse.http.site import SynapseRequest -from synapse.logging.context import defer_to_thread -from synapse.metrics.background_process_metrics import run_as_background_process -from synapse.types import UserID -from synapse.util.async_helpers import Linearizer -from synapse.util.retryutils import NotRetryingDestination -from synapse.util.stringutils import random_string - -from ._base import ( - FileInfo, - Responder, - ThumbnailInfo, - get_filename_from_headers, - respond_404, - respond_with_responder, -) -from .config_resource import MediaConfigResource -from .download_resource import DownloadResource -from .filepath import MediaFilePaths -from .media_storage import MediaStorage -from .preview_url_resource import PreviewUrlResource -from .storage_provider import StorageProviderWrapper -from .thumbnail_resource import ThumbnailResource -from .thumbnailer import Thumbnailer, ThumbnailError -from .upload_resource import UploadResource - -if TYPE_CHECKING: - from synapse.server import HomeServer - -logger = logging.getLogger(__name__) - -# How often to run the background job to update the "recently accessed" -# attribute of local and remote media. -UPDATE_RECENTLY_ACCESSED_TS = 60 * 1000 # 1 minute -# How often to run the background job to check for local and remote media -# that should be purged according to the configured media retention settings. -MEDIA_RETENTION_CHECK_PERIOD_MS = 60 * 60 * 1000 # 1 hour - - -class MediaRepository: - def __init__(self, hs: "HomeServer"): - self.hs = hs - self.auth = hs.get_auth() - self.client = hs.get_federation_http_client() - self.clock = hs.get_clock() - self.server_name = hs.hostname - self.store = hs.get_datastores().main - self.max_upload_size = hs.config.media.max_upload_size - self.max_image_pixels = hs.config.media.max_image_pixels - - Thumbnailer.set_limits(self.max_image_pixels) - - self.primary_base_path: str = hs.config.media.media_store_path - self.filepaths: MediaFilePaths = MediaFilePaths(self.primary_base_path) - - self.dynamic_thumbnails = hs.config.media.dynamic_thumbnails - self.thumbnail_requirements = hs.config.media.thumbnail_requirements - - self.remote_media_linearizer = Linearizer(name="media_remote") - - self.recently_accessed_remotes: Set[Tuple[str, str]] = set() - self.recently_accessed_locals: Set[str] = set() - - self.federation_domain_whitelist = ( - hs.config.federation.federation_domain_whitelist - ) - - # List of StorageProviders where we should search for media and - # potentially upload to. 
- storage_providers = [] - - for ( - clz, - provider_config, - wrapper_config, - ) in hs.config.media.media_storage_providers: - backend = clz(hs, provider_config) - provider = StorageProviderWrapper( - backend, - store_local=wrapper_config.store_local, - store_remote=wrapper_config.store_remote, - store_synchronous=wrapper_config.store_synchronous, - ) - storage_providers.append(provider) - - self.media_storage = MediaStorage( - self.hs, self.primary_base_path, self.filepaths, storage_providers - ) - - self.clock.looping_call( - self._start_update_recently_accessed, UPDATE_RECENTLY_ACCESSED_TS - ) - - # Media retention configuration options - self._media_retention_local_media_lifetime_ms = ( - hs.config.media.media_retention_local_media_lifetime_ms - ) - self._media_retention_remote_media_lifetime_ms = ( - hs.config.media.media_retention_remote_media_lifetime_ms - ) - - # Check whether local or remote media retention is configured - if ( - hs.config.media.media_retention_local_media_lifetime_ms is not None - or hs.config.media.media_retention_remote_media_lifetime_ms is not None - ): - # Run the background job to apply media retention rules routinely, - # with the duration between runs dictated by the homeserver config. - self.clock.looping_call( - self._start_apply_media_retention_rules, - MEDIA_RETENTION_CHECK_PERIOD_MS, - ) - - def _start_update_recently_accessed(self) -> Deferred: - return run_as_background_process( - "update_recently_accessed_media", self._update_recently_accessed - ) - - def _start_apply_media_retention_rules(self) -> Deferred: - return run_as_background_process( - "apply_media_retention_rules", self._apply_media_retention_rules - ) - - async def _update_recently_accessed(self) -> None: - remote_media = self.recently_accessed_remotes - self.recently_accessed_remotes = set() - - local_media = self.recently_accessed_locals - self.recently_accessed_locals = set() - - await self.store.update_cached_last_access_time( - local_media, remote_media, self.clock.time_msec() - ) - - def mark_recently_accessed(self, server_name: Optional[str], media_id: str) -> None: - """Mark the given media as recently accessed. - - Args: - server_name: Origin server of media, or None if local - media_id: The media ID of the content - """ - if server_name: - self.recently_accessed_remotes.add((server_name, media_id)) - else: - self.recently_accessed_locals.add(media_id) - - async def create_content( - self, - media_type: str, - upload_name: Optional[str], - content: IO, - content_length: int, - auth_user: UserID, - ) -> MXCUri: - """Store uploaded content for a local user and return the mxc URL - - Args: - media_type: The content type of the file. - upload_name: The name of the file, if provided. 
- content: A file like object that is the content to store - content_length: The length of the content - auth_user: The user_id of the uploader - - Returns: - The mxc url of the stored content - """ - - media_id = random_string(24) - - file_info = FileInfo(server_name=None, file_id=media_id) - - fname = await self.media_storage.store_file(content, file_info) - - logger.info("Stored local media in file %r", fname) - - await self.store.store_local_media( - media_id=media_id, - media_type=media_type, - time_now_ms=self.clock.time_msec(), - upload_name=upload_name, - media_length=content_length, - user_id=auth_user, - ) - - await self._generate_thumbnails(None, media_id, media_id, media_type) - - return MXCUri(self.server_name, media_id) - - async def get_local_media( - self, request: SynapseRequest, media_id: str, name: Optional[str] - ) -> None: - """Responds to requests for local media, if exists, or returns 404. - - Args: - request: The incoming request. - media_id: The media ID of the content. (This is the same as - the file_id for local content.) - name: Optional name that, if specified, will be used as - the filename in the Content-Disposition header of the response. - - Returns: - Resolves once a response has successfully been written to request - """ - media_info = await self.store.get_local_media(media_id) - if not media_info or media_info["quarantined_by"]: - respond_404(request) - return - - self.mark_recently_accessed(None, media_id) - - media_type = media_info["media_type"] - if not media_type: - media_type = "application/octet-stream" - media_length = media_info["media_length"] - upload_name = name if name else media_info["upload_name"] - url_cache = media_info["url_cache"] - - file_info = FileInfo(None, media_id, url_cache=bool(url_cache)) - - responder = await self.media_storage.fetch_media(file_info) - await respond_with_responder( - request, responder, media_type, media_length, upload_name - ) - - async def get_remote_media( - self, - request: SynapseRequest, - server_name: str, - media_id: str, - name: Optional[str], - ) -> None: - """Respond to requests for remote media. - - Args: - request: The incoming request. - server_name: Remote server_name where the media originated. - media_id: The media ID of the content (as defined by the remote server). - name: Optional name that, if specified, will be used as - the filename in the Content-Disposition header of the response. - - Returns: - Resolves once a response has successfully been written to request - """ - if ( - self.federation_domain_whitelist is not None - and server_name not in self.federation_domain_whitelist - ): - raise FederationDeniedError(server_name) - - self.mark_recently_accessed(server_name, media_id) - - # We linearize here to ensure that we don't try and download remote - # media multiple times concurrently - key = (server_name, media_id) - async with self.remote_media_linearizer.queue(key): - responder, media_info = await self._get_remote_media_impl( - server_name, media_id - ) - - # We deliberately stream the file outside the lock - if responder: - media_type = media_info["media_type"] - media_length = media_info["media_length"] - upload_name = name if name else media_info["upload_name"] - await respond_with_responder( - request, responder, media_type, media_length, upload_name - ) - else: - respond_404(request) - - async def get_remote_media_info(self, server_name: str, media_id: str) -> dict: - """Gets the media info associated with the remote file, downloading - if necessary. 
- - Args: - server_name: Remote server_name where the media originated. - media_id: The media ID of the content (as defined by the remote server). - - Returns: - The media info of the file - """ - if ( - self.federation_domain_whitelist is not None - and server_name not in self.federation_domain_whitelist - ): - raise FederationDeniedError(server_name) - - # We linearize here to ensure that we don't try and download remote - # media multiple times concurrently - key = (server_name, media_id) - async with self.remote_media_linearizer.queue(key): - responder, media_info = await self._get_remote_media_impl( - server_name, media_id - ) - - # Ensure we actually use the responder so that it releases resources - if responder: - with responder: - pass - - return media_info - - async def _get_remote_media_impl( - self, server_name: str, media_id: str - ) -> Tuple[Optional[Responder], dict]: - """Looks for media in local cache, if not there then attempt to - download from remote server. - - Args: - server_name: Remote server_name where the media originated. - media_id: The media ID of the content (as defined by the - remote server). - - Returns: - A tuple of responder and the media info of the file. - """ - media_info = await self.store.get_cached_remote_media(server_name, media_id) - - # file_id is the ID we use to track the file locally. If we've already - # seen the file then reuse the existing ID, otherwise generate a new - # one. - - # If we have an entry in the DB, try and look for it - if media_info: - file_id = media_info["filesystem_id"] - file_info = FileInfo(server_name, file_id) - - if media_info["quarantined_by"]: - logger.info("Media is quarantined") - raise NotFoundError() - - if not media_info["media_type"]: - media_info["media_type"] = "application/octet-stream" - - responder = await self.media_storage.fetch_media(file_info) - if responder: - return responder, media_info - - # Failed to find the file anywhere, lets download it. - - try: - media_info = await self._download_remote_file( - server_name, - media_id, - ) - except SynapseError: - raise - except Exception as e: - # An exception may be because we downloaded media in another - # process, so let's check if we magically have the media. - media_info = await self.store.get_cached_remote_media(server_name, media_id) - if not media_info: - raise e - - file_id = media_info["filesystem_id"] - if not media_info["media_type"]: - media_info["media_type"] = "application/octet-stream" - file_info = FileInfo(server_name, file_id) - - # We generate thumbnails even if another process downloaded the media - # as a) it's conceivable that the other download request dies before it - # generates thumbnails, but mainly b) we want to be sure the thumbnails - # have finished being generated before responding to the client, - # otherwise they'll request thumbnails and get a 404 if they're not - # ready yet. - await self._generate_thumbnails( - server_name, media_id, file_id, media_info["media_type"] - ) - - responder = await self.media_storage.fetch_media(file_info) - return responder, media_info - - async def _download_remote_file( - self, - server_name: str, - media_id: str, - ) -> dict: - """Attempt to download the remote file from the given server name, - using the given file_id as the local id. - - Args: - server_name: Originating server - media_id: The media ID of the content (as defined by the - remote server). This is different than the file_id, which is - locally generated. - file_id: Local file ID - - Returns: - The media info of the file. 
- """ - - file_id = random_string(24) - - file_info = FileInfo(server_name=server_name, file_id=file_id) - - with self.media_storage.store_into_file(file_info) as (f, fname, finish): - request_path = "/".join( - ("/_matrix/media/r0/download", server_name, media_id) - ) - try: - length, headers = await self.client.get_file( - server_name, - request_path, - output_stream=f, - max_size=self.max_upload_size, - args={ - # tell the remote server to 404 if it doesn't - # recognise the server_name, to make sure we don't - # end up with a routing loop. - "allow_remote": "false" - }, - ) - except RequestSendFailed as e: - logger.warning( - "Request failed fetching remote media %s/%s: %r", - server_name, - media_id, - e, - ) - raise SynapseError(502, "Failed to fetch remote media") - - except HttpResponseException as e: - logger.warning( - "HTTP error fetching remote media %s/%s: %s", - server_name, - media_id, - e.response, - ) - if e.code == twisted.web.http.NOT_FOUND: - raise e.to_synapse_error() - raise SynapseError(502, "Failed to fetch remote media") - - except SynapseError: - logger.warning( - "Failed to fetch remote media %s/%s", server_name, media_id - ) - raise - except NotRetryingDestination: - logger.warning("Not retrying destination %r", server_name) - raise SynapseError(502, "Failed to fetch remote media") - except Exception: - logger.exception( - "Failed to fetch remote media %s/%s", server_name, media_id - ) - raise SynapseError(502, "Failed to fetch remote media") - - await finish() - - if b"Content-Type" in headers: - media_type = headers[b"Content-Type"][0].decode("ascii") - else: - media_type = "application/octet-stream" - upload_name = get_filename_from_headers(headers) - time_now_ms = self.clock.time_msec() - - # Multiple remote media download requests can race (when using - # multiple media repos), so this may throw a violation constraint - # exception. If it does we'll delete the newly downloaded file from - # disk (as we're in the ctx manager). - # - # However: we've already called `finish()` so we may have also - # written to the storage providers. This is preferable to the - # alternative where we call `finish()` *after* this, where we could - # end up having an entry in the DB but fail to write the files to - # the storage providers. 
- await self.store.store_cached_remote_media( - origin=server_name, - media_id=media_id, - media_type=media_type, - time_now_ms=self.clock.time_msec(), - upload_name=upload_name, - media_length=length, - filesystem_id=file_id, - ) - - logger.info("Stored remote media in file %r", fname) - - media_info = { - "media_type": media_type, - "media_length": length, - "upload_name": upload_name, - "created_ts": time_now_ms, - "filesystem_id": file_id, - } - - return media_info - - def _get_thumbnail_requirements( - self, media_type: str - ) -> Tuple[ThumbnailRequirement, ...]: - scpos = media_type.find(";") - if scpos > 0: - media_type = media_type[:scpos] - return self.thumbnail_requirements.get(media_type, ()) - - def _generate_thumbnail( - self, - thumbnailer: Thumbnailer, - t_width: int, - t_height: int, - t_method: str, - t_type: str, - ) -> Optional[BytesIO]: - m_width = thumbnailer.width - m_height = thumbnailer.height - - if m_width * m_height >= self.max_image_pixels: - logger.info( - "Image too large to thumbnail %r x %r > %r", - m_width, - m_height, - self.max_image_pixels, - ) - return None - - if thumbnailer.transpose_method is not None: - m_width, m_height = thumbnailer.transpose() - - if t_method == "crop": - return thumbnailer.crop(t_width, t_height, t_type) - elif t_method == "scale": - t_width, t_height = thumbnailer.aspect(t_width, t_height) - t_width = min(m_width, t_width) - t_height = min(m_height, t_height) - return thumbnailer.scale(t_width, t_height, t_type) - - return None - - async def generate_local_exact_thumbnail( - self, - media_id: str, - t_width: int, - t_height: int, - t_method: str, - t_type: str, - url_cache: bool, - ) -> Optional[str]: - input_path = await self.media_storage.ensure_media_is_in_local_cache( - FileInfo(None, media_id, url_cache=url_cache) - ) - - try: - thumbnailer = Thumbnailer(input_path) - except ThumbnailError as e: - logger.warning( - "Unable to generate a thumbnail for local media %s using a method of %s and type of %s: %s", - media_id, - t_method, - t_type, - e, - ) - return None - - with thumbnailer: - t_byte_source = await defer_to_thread( - self.hs.get_reactor(), - self._generate_thumbnail, - thumbnailer, - t_width, - t_height, - t_method, - t_type, - ) - - if t_byte_source: - try: - file_info = FileInfo( - server_name=None, - file_id=media_id, - url_cache=url_cache, - thumbnail=ThumbnailInfo( - width=t_width, - height=t_height, - method=t_method, - type=t_type, - ), - ) - - output_path = await self.media_storage.store_file( - t_byte_source, file_info - ) - finally: - t_byte_source.close() - - logger.info("Stored thumbnail in file %r", output_path) - - t_len = os.path.getsize(output_path) - - await self.store.store_local_thumbnail( - media_id, t_width, t_height, t_type, t_method, t_len - ) - - return output_path - - # Could not generate thumbnail. 
- return None - - async def generate_remote_exact_thumbnail( - self, - server_name: str, - file_id: str, - media_id: str, - t_width: int, - t_height: int, - t_method: str, - t_type: str, - ) -> Optional[str]: - input_path = await self.media_storage.ensure_media_is_in_local_cache( - FileInfo(server_name, file_id) - ) - - try: - thumbnailer = Thumbnailer(input_path) - except ThumbnailError as e: - logger.warning( - "Unable to generate a thumbnail for remote media %s from %s using a method of %s and type of %s: %s", - media_id, - server_name, - t_method, - t_type, - e, - ) - return None - - with thumbnailer: - t_byte_source = await defer_to_thread( - self.hs.get_reactor(), - self._generate_thumbnail, - thumbnailer, - t_width, - t_height, - t_method, - t_type, - ) - - if t_byte_source: - try: - file_info = FileInfo( - server_name=server_name, - file_id=file_id, - thumbnail=ThumbnailInfo( - width=t_width, - height=t_height, - method=t_method, - type=t_type, - ), - ) - - output_path = await self.media_storage.store_file( - t_byte_source, file_info - ) - finally: - t_byte_source.close() - - logger.info("Stored thumbnail in file %r", output_path) - - t_len = os.path.getsize(output_path) - - await self.store.store_remote_media_thumbnail( - server_name, - media_id, - file_id, - t_width, - t_height, - t_type, - t_method, - t_len, - ) - - return output_path - - # Could not generate thumbnail. - return None - - async def _generate_thumbnails( - self, - server_name: Optional[str], - media_id: str, - file_id: str, - media_type: str, - url_cache: bool = False, - ) -> Optional[dict]: - """Generate and store thumbnails for an image. - - Args: - server_name: The server name if remote media, else None if local - media_id: The media ID of the content. (This is the same as - the file_id for local content) - file_id: Local file ID - media_type: The content type of the file - url_cache: If we are thumbnailing images downloaded for the URL cache, - used exclusively by the url previewer - - Returns: - Dict with "width" and "height" keys of original image or None if the - media cannot be thumbnailed. - """ - requirements = self._get_thumbnail_requirements(media_type) - if not requirements: - return None - - input_path = await self.media_storage.ensure_media_is_in_local_cache( - FileInfo(server_name, file_id, url_cache=url_cache) - ) - - try: - thumbnailer = Thumbnailer(input_path) - except ThumbnailError as e: - logger.warning( - "Unable to generate thumbnails for remote media %s from %s of type %s: %s", - media_id, - server_name, - media_type, - e, - ) - return None - - with thumbnailer: - m_width = thumbnailer.width - m_height = thumbnailer.height - - if m_width * m_height >= self.max_image_pixels: - logger.info( - "Image too large to thumbnail %r x %r > %r", - m_width, - m_height, - self.max_image_pixels, - ) - return None - - if thumbnailer.transpose_method is not None: - m_width, m_height = await defer_to_thread( - self.hs.get_reactor(), thumbnailer.transpose - ) - - # We deduplicate the thumbnail sizes by ignoring the cropped versions if - # they have the same dimensions of a scaled one. 
- thumbnails: Dict[Tuple[int, int, str], str] = {} - for requirement in requirements: - if requirement.method == "crop": - thumbnails.setdefault( - (requirement.width, requirement.height, requirement.media_type), - requirement.method, - ) - elif requirement.method == "scale": - t_width, t_height = thumbnailer.aspect( - requirement.width, requirement.height - ) - t_width = min(m_width, t_width) - t_height = min(m_height, t_height) - thumbnails[ - (t_width, t_height, requirement.media_type) - ] = requirement.method - - # Now we generate the thumbnails for each dimension, store it - for (t_width, t_height, t_type), t_method in thumbnails.items(): - # Generate the thumbnail - if t_method == "crop": - t_byte_source = await defer_to_thread( - self.hs.get_reactor(), - thumbnailer.crop, - t_width, - t_height, - t_type, - ) - elif t_method == "scale": - t_byte_source = await defer_to_thread( - self.hs.get_reactor(), - thumbnailer.scale, - t_width, - t_height, - t_type, - ) - else: - logger.error("Unrecognized method: %r", t_method) - continue - - if not t_byte_source: - continue - - file_info = FileInfo( - server_name=server_name, - file_id=file_id, - url_cache=url_cache, - thumbnail=ThumbnailInfo( - width=t_width, - height=t_height, - method=t_method, - type=t_type, - ), - ) - - with self.media_storage.store_into_file(file_info) as ( - f, - fname, - finish, - ): - try: - await self.media_storage.write_to_file(t_byte_source, f) - await finish() - finally: - t_byte_source.close() - - t_len = os.path.getsize(fname) - - # Write to database - if server_name: - # Multiple remote media download requests can race (when - # using multiple media repos), so this may throw a violation - # constraint exception. If it does we'll delete the newly - # generated thumbnail from disk (as we're in the ctx - # manager). - # - # However: we've already called `finish()` so we may have - # also written to the storage providers. This is preferable - # to the alternative where we call `finish()` *after* this, - # where we could end up having an entry in the DB but fail - # to write the files to the storage providers. - try: - await self.store.store_remote_media_thumbnail( - server_name, - media_id, - file_id, - t_width, - t_height, - t_type, - t_method, - t_len, - ) - except Exception as e: - thumbnail_exists = ( - await self.store.get_remote_media_thumbnail( - server_name, - media_id, - t_width, - t_height, - t_type, - ) - ) - if not thumbnail_exists: - raise e - else: - await self.store.store_local_thumbnail( - media_id, t_width, t_height, t_type, t_method, t_len - ) - - return {"width": m_width, "height": m_height} - - async def _apply_media_retention_rules(self) -> None: - """ - Purge old local and remote media according to the media retention rules - defined in the homeserver config. - """ - # Purge remote media - if self._media_retention_remote_media_lifetime_ms is not None: - # Calculate a threshold timestamp derived from the configured lifetime. Any - # media that has not been accessed since this timestamp will be removed. 
- remote_media_threshold_timestamp_ms = ( - self.clock.time_msec() - self._media_retention_remote_media_lifetime_ms - ) - - logger.info( - "Purging remote media last accessed before" - f" {remote_media_threshold_timestamp_ms}" - ) - - await self.delete_old_remote_media( - before_ts=remote_media_threshold_timestamp_ms - ) - - # And now do the same for local media - if self._media_retention_local_media_lifetime_ms is not None: - # This works the same as the remote media threshold - local_media_threshold_timestamp_ms = ( - self.clock.time_msec() - self._media_retention_local_media_lifetime_ms - ) - - logger.info( - "Purging local media last accessed before" - f" {local_media_threshold_timestamp_ms}" - ) - - await self.delete_old_local_media( - before_ts=local_media_threshold_timestamp_ms, - keep_profiles=True, - delete_quarantined_media=False, - delete_protected_media=False, - ) - - async def delete_old_remote_media(self, before_ts: int) -> Dict[str, int]: - old_media = await self.store.get_remote_media_ids( - before_ts, include_quarantined_media=False - ) - - deleted = 0 - - for media in old_media: - origin = media["media_origin"] - media_id = media["media_id"] - file_id = media["filesystem_id"] - key = (origin, media_id) - - logger.info("Deleting: %r", key) - - # TODO: Should we delete from the backup store - - async with self.remote_media_linearizer.queue(key): - full_path = self.filepaths.remote_media_filepath(origin, file_id) - try: - os.remove(full_path) - except OSError as e: - logger.warning("Failed to remove file: %r", full_path) - if e.errno == errno.ENOENT: - pass - else: - continue - - thumbnail_dir = self.filepaths.remote_media_thumbnail_dir( - origin, file_id - ) - shutil.rmtree(thumbnail_dir, ignore_errors=True) - - await self.store.delete_remote_media(origin, media_id) - deleted += 1 - - return {"deleted": deleted} - - async def delete_local_media_ids( - self, media_ids: List[str] - ) -> Tuple[List[str], int]: - """ - Delete the given local or remote media ID from this server - - Args: - media_id: The media ID to delete. - Returns: - A tuple of (list of deleted media IDs, total deleted media IDs). - """ - return await self._remove_local_media_from_disk(media_ids) - - async def delete_old_local_media( - self, - before_ts: int, - size_gt: int = 0, - keep_profiles: bool = True, - delete_quarantined_media: bool = False, - delete_protected_media: bool = False, - ) -> Tuple[List[str], int]: - """ - Delete local or remote media from this server by size and timestamp. Removes - media files, any thumbnails and cached URLs. - - Args: - before_ts: Unix timestamp in ms. - Files that were last used before this timestamp will be deleted. - size_gt: Size of the media in bytes. Files that are larger will be deleted. - keep_profiles: Switch to delete also files that are still used in image data - (e.g user profile, room avatar). If false these files will be deleted. - delete_quarantined_media: If True, media marked as quarantined will be deleted. - delete_protected_media: If True, media marked as protected will be deleted. - - Returns: - A tuple of (list of deleted media IDs, total deleted media IDs). 
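The retention pass above reduces to one calculation per media type: anything last accessed before `now - configured_lifetime` becomes eligible for deletion on the next run of the looping call. A small sketch of that threshold arithmetic, with invented lifetimes expressed in milliseconds to match the config attributes used above:

    import time

    now_ms = int(time.time() * 1000)

    # Hypothetical lifetimes: 90 days for local media, 14 days for remote media.
    local_lifetime_ms = 90 * 24 * 60 * 60 * 1000
    remote_lifetime_ms = 14 * 24 * 60 * 60 * 1000

    local_threshold_ms = now_ms - local_lifetime_ms
    remote_threshold_ms = now_ms - remote_lifetime_ms

    # Media last accessed before the threshold would then be purged, e.g. via
    # delete_old_local_media(before_ts=local_threshold_ms, keep_profiles=True).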
- """ - old_media = await self.store.get_local_media_ids( - before_ts, - size_gt, - keep_profiles, - include_quarantined_media=delete_quarantined_media, - include_protected_media=delete_protected_media, - ) - return await self._remove_local_media_from_disk(old_media) - - async def _remove_local_media_from_disk( - self, media_ids: List[str] - ) -> Tuple[List[str], int]: - """ - Delete local or remote media from this server. Removes media files, - any thumbnails and cached URLs. - - Args: - media_ids: List of media_id to delete - Returns: - A tuple of (list of deleted media IDs, total deleted media IDs). - """ - removed_media = [] - for media_id in media_ids: - logger.info("Deleting media with ID '%s'", media_id) - full_path = self.filepaths.local_media_filepath(media_id) - try: - os.remove(full_path) - except OSError as e: - logger.warning("Failed to remove file: %r: %s", full_path, e) - if e.errno == errno.ENOENT: - pass - else: - continue - - thumbnail_dir = self.filepaths.local_media_thumbnail_dir(media_id) - shutil.rmtree(thumbnail_dir, ignore_errors=True) - - await self.store.delete_remote_media(self.server_name, media_id) - - await self.store.delete_url_cache((media_id,)) - await self.store.delete_url_cache_media((media_id,)) - - removed_media.append(media_id) - - return removed_media, len(removed_media) - - -class MediaRepositoryResource(UnrecognizedRequestResource): - """File uploading and downloading. - - Uploads are POSTed to a resource which returns a token which is used to GET - the download:: - - => POST /_matrix/media/r0/upload HTTP/1.1 - Content-Type: - Content-Length: - - - - <= HTTP/1.1 200 OK - Content-Type: application/json - - { "content_uri": "mxc:///" } - - => GET /_matrix/media/r0/download// HTTP/1.1 - - <= HTTP/1.1 200 OK - Content-Type: - Content-Disposition: attachment;filename= - - - - Clients can get thumbnails by supplying a desired width and height and - thumbnailing method:: - - => GET /_matrix/media/r0/thumbnail/ - /?width=&height=&method= HTTP/1.1 - - <= HTTP/1.1 200 OK - Content-Type: image/jpeg or image/png - - - - The thumbnail methods are "crop" and "scale". "scale" tries to return an - image where either the width or the height is smaller than the requested - size. The client should then scale and letterbox the image if it needs to - fit within a given rectangle. "crop" tries to return an image where the - width and height are close to the requested size and the aspect matches - the requested size. The client should scale the image if it needs to fit - within a given rectangle. - """ - - def __init__(self, hs: "HomeServer"): - # If we're not configured to use it, raise if we somehow got here. 
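As a sketch of the thumbnail request shape described in the class docstring above, with an invented server name, media ID and dimensions::

    from urllib.parse import urlencode

    server_name = "example.org"       # hypothetical
    media_id = "abcdef123456"         # hypothetical
    query = urlencode({"width": 96, "height": 96, "method": "crop"})
    url = f"/_matrix/media/r0/thumbnail/{server_name}/{media_id}?{query}"
    # /_matrix/media/r0/thumbnail/example.org/abcdef123456?width=96&height=96&method=crop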
- if not hs.config.media.can_load_media_repo: - raise ConfigError("Synapse is not configured to use a media repo.") - - super().__init__() - media_repo = hs.get_media_repository() - - self.putChild(b"upload", UploadResource(hs, media_repo)) - self.putChild(b"download", DownloadResource(hs, media_repo)) - self.putChild( - b"thumbnail", ThumbnailResource(hs, media_repo, media_repo.media_storage) - ) - if hs.config.media.url_preview_enabled: - self.putChild( - b"preview_url", - PreviewUrlResource(hs, media_repo, media_repo.media_storage), - ) - self.putChild(b"config", MediaConfigResource(hs)) diff --git a/synapse/rest/media/v1/media_storage.py b/synapse/rest/media/v1/media_storage.py index db25848744..11b0e8e231 100644 --- a/synapse/rest/media/v1/media_storage.py +++ b/synapse/rest/media/v1/media_storage.py @@ -1,4 +1,4 @@ -# Copyright 2018-2021 The Matrix.org Foundation C.I.C. +# Copyright 2023 The Matrix.org Foundation C.I.C. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -11,364 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -import contextlib -import logging -import os -import shutil -from types import TracebackType -from typing import ( - IO, - TYPE_CHECKING, - Any, - Awaitable, - BinaryIO, - Callable, - Generator, - Optional, - Sequence, - Tuple, - Type, -) - -import attr - -from twisted.internet.defer import Deferred -from twisted.internet.interfaces import IConsumer -from twisted.protocols.basic import FileSender - -import synapse -from synapse.api.errors import NotFoundError -from synapse.logging.context import defer_to_thread, make_deferred_yieldable -from synapse.util import Clock -from synapse.util.file_consumer import BackgroundFileConsumer - -from ._base import FileInfo, Responder -from .filepath import MediaFilePaths - -if TYPE_CHECKING: - from synapse.rest.media.v1.storage_provider import StorageProvider - from synapse.server import HomeServer - -logger = logging.getLogger(__name__) - - -class MediaStorage: - """Responsible for storing/fetching files from local sources. - - Args: - hs - local_media_directory: Base path where we store media on disk - filepaths - storage_providers: List of StorageProvider that are used to fetch and store files. 
- """ - - def __init__( - self, - hs: "HomeServer", - local_media_directory: str, - filepaths: MediaFilePaths, - storage_providers: Sequence["StorageProvider"], - ): - self.hs = hs - self.reactor = hs.get_reactor() - self.local_media_directory = local_media_directory - self.filepaths = filepaths - self.storage_providers = storage_providers - self.spam_checker = hs.get_spam_checker() - self.clock = hs.get_clock() - - async def store_file(self, source: IO, file_info: FileInfo) -> str: - """Write `source` to the on disk media store, and also any other - configured storage providers - - Args: - source: A file like object that should be written - file_info: Info about the file to store - - Returns: - the file path written to in the primary media store - """ - - with self.store_into_file(file_info) as (f, fname, finish_cb): - # Write to the main repository - await self.write_to_file(source, f) - await finish_cb() - - return fname - - async def write_to_file(self, source: IO, output: IO) -> None: - """Asynchronously write the `source` to `output`.""" - await defer_to_thread(self.reactor, _write_file_synchronously, source, output) - - @contextlib.contextmanager - def store_into_file( - self, file_info: FileInfo - ) -> Generator[Tuple[BinaryIO, str, Callable[[], Awaitable[None]]], None, None]: - """Context manager used to get a file like object to write into, as - described by file_info. - - Actually yields a 3-tuple (file, fname, finish_cb), where file is a file - like object that can be written to, fname is the absolute path of file - on disk, and finish_cb is a function that returns an awaitable. - - fname can be used to read the contents from after upload, e.g. to - generate thumbnails. - - finish_cb must be called and waited on after the file has been - successfully been written to. Should not be called if there was an - error. - - Args: - file_info: Info about the file to store - - Example: - - with media_storage.store_into_file(info) as (f, fname, finish_cb): - # .. write into f ... - await finish_cb() - """ - - path = self._file_info_to_path(file_info) - fname = os.path.join(self.local_media_directory, path) - - dirname = os.path.dirname(fname) - os.makedirs(dirname, exist_ok=True) - - finished_called = [False] - - try: - with open(fname, "wb") as f: - - async def finish() -> None: - # Ensure that all writes have been flushed and close the - # file. - f.flush() - f.close() - - spam_check = await self.spam_checker.check_media_file_for_spam( - ReadableFileWrapper(self.clock, fname), file_info - ) - if spam_check != synapse.module_api.NOT_SPAM: - logger.info("Blocking media due to spam checker") - # Note that we'll delete the stored media, due to the - # try/except below. The media also won't be stored in - # the DB. - # We currently ignore any additional field returned by - # the spam-check API. - raise SpamMediaException(errcode=spam_check[0]) - - for provider in self.storage_providers: - await provider.store_file(path, file_info) - - finished_called[0] = True - - yield f, fname, finish - except Exception as e: - try: - os.remove(fname) - except Exception: - pass - - raise e from None - - if not finished_called: - raise Exception("Finished callback not called") - - async def fetch_media(self, file_info: FileInfo) -> Optional[Responder]: - """Attempts to fetch media described by file_info from the local cache - and configured storage providers. - - Args: - file_info - - Returns: - Returns a Responder if the file was found, otherwise None. 
- """ - paths = [self._file_info_to_path(file_info)] - - # fallback for remote thumbnails with no method in the filename - if file_info.thumbnail and file_info.server_name: - paths.append( - self.filepaths.remote_media_thumbnail_rel_legacy( - server_name=file_info.server_name, - file_id=file_info.file_id, - width=file_info.thumbnail.width, - height=file_info.thumbnail.height, - content_type=file_info.thumbnail.type, - ) - ) - - for path in paths: - local_path = os.path.join(self.local_media_directory, path) - if os.path.exists(local_path): - logger.debug("responding with local file %s", local_path) - return FileResponder(open(local_path, "rb")) - logger.debug("local file %s did not exist", local_path) - - for provider in self.storage_providers: - for path in paths: - res: Any = await provider.fetch(path, file_info) - if res: - logger.debug("Streaming %s from %s", path, provider) - return res - logger.debug("%s not found on %s", path, provider) - - return None - - async def ensure_media_is_in_local_cache(self, file_info: FileInfo) -> str: - """Ensures that the given file is in the local cache. Attempts to - download it from storage providers if it isn't. - - Args: - file_info - - Returns: - Full path to local file - """ - path = self._file_info_to_path(file_info) - local_path = os.path.join(self.local_media_directory, path) - if os.path.exists(local_path): - return local_path - - # Fallback for paths without method names - # Should be removed in the future - if file_info.thumbnail and file_info.server_name: - legacy_path = self.filepaths.remote_media_thumbnail_rel_legacy( - server_name=file_info.server_name, - file_id=file_info.file_id, - width=file_info.thumbnail.width, - height=file_info.thumbnail.height, - content_type=file_info.thumbnail.type, - ) - legacy_local_path = os.path.join(self.local_media_directory, legacy_path) - if os.path.exists(legacy_local_path): - return legacy_local_path - - dirname = os.path.dirname(local_path) - os.makedirs(dirname, exist_ok=True) - - for provider in self.storage_providers: - res: Any = await provider.fetch(path, file_info) - if res: - with res: - consumer = BackgroundFileConsumer( - open(local_path, "wb"), self.reactor - ) - await res.write_to_consumer(consumer) - await consumer.wait() - return local_path - - raise NotFoundError() - - def _file_info_to_path(self, file_info: FileInfo) -> str: - """Converts file_info into a relative path. - - The path is suitable for storing files under a directory, e.g. used to - store files on local FS under the base media repository directory. 
- """ - if file_info.url_cache: - if file_info.thumbnail: - return self.filepaths.url_cache_thumbnail_rel( - media_id=file_info.file_id, - width=file_info.thumbnail.width, - height=file_info.thumbnail.height, - content_type=file_info.thumbnail.type, - method=file_info.thumbnail.method, - ) - return self.filepaths.url_cache_filepath_rel(file_info.file_id) - - if file_info.server_name: - if file_info.thumbnail: - return self.filepaths.remote_media_thumbnail_rel( - server_name=file_info.server_name, - file_id=file_info.file_id, - width=file_info.thumbnail.width, - height=file_info.thumbnail.height, - content_type=file_info.thumbnail.type, - method=file_info.thumbnail.method, - ) - return self.filepaths.remote_media_filepath_rel( - file_info.server_name, file_info.file_id - ) - - if file_info.thumbnail: - return self.filepaths.local_media_thumbnail_rel( - media_id=file_info.file_id, - width=file_info.thumbnail.width, - height=file_info.thumbnail.height, - content_type=file_info.thumbnail.type, - method=file_info.thumbnail.method, - ) - return self.filepaths.local_media_filepath_rel(file_info.file_id) - - -def _write_file_synchronously(source: IO, dest: IO) -> None: - """Write `source` to the file like `dest` synchronously. Should be called - from a thread. - - Args: - source: A file like object that's to be written - dest: A file like object to be written to - """ - source.seek(0) # Ensure we read from the start of the file - shutil.copyfileobj(source, dest) - - -class FileResponder(Responder): - """Wraps an open file that can be sent to a request. - - Args: - open_file: A file like object to be streamed ot the client, - is closed when finished streaming. - """ - - def __init__(self, open_file: IO): - self.open_file = open_file - - def write_to_consumer(self, consumer: IConsumer) -> Deferred: - return make_deferred_yieldable( - FileSender().beginFileTransfer(self.open_file, consumer) - ) - - def __exit__( - self, - exc_type: Optional[Type[BaseException]], - exc_val: Optional[BaseException], - exc_tb: Optional[TracebackType], - ) -> None: - self.open_file.close() - - -class SpamMediaException(NotFoundError): - """The media was blocked by a spam checker, so we simply 404 the request (in - the same way as if it was quarantined). - """ - - -@attr.s(slots=True, auto_attribs=True) -class ReadableFileWrapper: - """Wrapper that allows reading a file in chunks, yielding to the reactor, - and writing to a callback. - - This is simplified `FileSender` that takes an IO object rather than an - `IConsumer`. - """ - - CHUNK_SIZE = 2**14 - - clock: Clock - path: str - - async def write_chunks_to(self, callback: Callable[[bytes], object]) -> None: - """Reads the file in chunks and calls the callback with each chunk.""" - - with open(self.path, "rb") as file: - while True: - chunk = file.read(self.CHUNK_SIZE) - if not chunk: - break - - callback(chunk) +# - # We yield to the reactor by sleeping for 0 seconds. - await self.clock.sleep(0) +# This exists purely for backwards compatibility with spam checkers. +from synapse.media.media_storage import ReadableFileWrapper # noqa: F401 diff --git a/synapse/rest/media/v1/oembed.py b/synapse/rest/media/v1/oembed.py deleted file mode 100644 index 7592aa5d47..0000000000 --- a/synapse/rest/media/v1/oembed.py +++ /dev/null @@ -1,265 +0,0 @@ -# Copyright 2021 The Matrix.org Foundation C.I.C. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -import html -import logging -import urllib.parse -from typing import TYPE_CHECKING, List, Optional - -import attr - -from synapse.rest.media.v1.preview_html import parse_html_description -from synapse.types import JsonDict -from synapse.util import json_decoder - -if TYPE_CHECKING: - from lxml import etree - - from synapse.server import HomeServer - -logger = logging.getLogger(__name__) - - -@attr.s(slots=True, frozen=True, auto_attribs=True) -class OEmbedResult: - # The Open Graph result (converted from the oEmbed result). - open_graph_result: JsonDict - # The author_name of the oEmbed result - author_name: Optional[str] - # Number of milliseconds to cache the content, according to the oEmbed response. - # - # This will be None if no cache-age is provided in the oEmbed response (or - # if the oEmbed response cannot be turned into an Open Graph response). - cache_age: Optional[int] - - -class OEmbedProvider: - """ - A helper for accessing oEmbed content. - - It can be used to check if a URL should be accessed via oEmbed and for - requesting/parsing oEmbed content. - """ - - def __init__(self, hs: "HomeServer"): - self._oembed_patterns = {} - for oembed_endpoint in hs.config.oembed.oembed_patterns: - api_endpoint = oembed_endpoint.api_endpoint - - # Only JSON is supported at the moment. This could be declared in - # the formats field. Otherwise, if the endpoint ends in .xml assume - # it doesn't support JSON. - if ( - oembed_endpoint.formats is not None - and "json" not in oembed_endpoint.formats - ) or api_endpoint.endswith(".xml"): - logger.info( - "Ignoring oEmbed endpoint due to not supporting JSON: %s", - api_endpoint, - ) - continue - - # Iterate through each URL pattern and point it to the endpoint. - for pattern in oembed_endpoint.url_patterns: - self._oembed_patterns[pattern] = api_endpoint - - def get_oembed_url(self, url: str) -> Optional[str]: - """ - Check whether the URL should be downloaded as oEmbed content instead. - - Args: - url: The URL to check. - - Returns: - A URL to use instead or None if the original URL should be used. - """ - for url_pattern, endpoint in self._oembed_patterns.items(): - if url_pattern.fullmatch(url): - # TODO Specify max height / width. - - # Note that only the JSON format is supported, some endpoints want - # this in the URL, others want it as an argument. - endpoint = endpoint.replace("{format}", "json") - - args = {"url": url, "format": "json"} - query_str = urllib.parse.urlencode(args, True) - return f"{endpoint}?{query_str}" - - # No match. - return None - - def autodiscover_from_html(self, tree: "etree.Element") -> Optional[str]: - """ - Search an HTML document for oEmbed autodiscovery information. - - Args: - tree: The parsed HTML body. - - Returns: - The URL to use for oEmbed information, or None if no URL was found. - """ - # Search for link elements with the proper rel and type attributes. - for tag in tree.xpath( - "//link[@rel='alternate'][@type='application/json+oembed']" - ): - if "href" in tag.attrib: - return tag.attrib["href"] - - # Some providers (e.g. Flickr) use alternative instead of alternate. 
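For illustration, the kind of URL that get_oembed_url above builds for a matching pattern, assuming a hypothetical registered endpoint::

    import urllib.parse

    endpoint = "https://www.youtube.com/oembed"   # assumed endpoint, for illustration only
    args = {"url": "https://www.youtube.com/watch?v=LXDBoHyjmtw", "format": "json"}
    request_url = f"{endpoint}?{urllib.parse.urlencode(args, True)}"
    # https://www.youtube.com/oembed?url=https%3A%2F%2Fwww.youtube.com%2Fwatch%3Fv%3DLXDBoHyjmtw&format=json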
- for tag in tree.xpath( - "//link[@rel='alternative'][@type='application/json+oembed']" - ): - if "href" in tag.attrib: - return tag.attrib["href"] - - return None - - def parse_oembed_response(self, url: str, raw_body: bytes) -> OEmbedResult: - """ - Parse the oEmbed response into an Open Graph response. - - Args: - url: The URL which is being previewed (not the one which was - requested). - raw_body: The oEmbed response as JSON encoded as bytes. - - Returns: - json-encoded Open Graph data - """ - - try: - # oEmbed responses *must* be UTF-8 according to the spec. - oembed = json_decoder.decode(raw_body.decode("utf-8")) - except ValueError: - return OEmbedResult({}, None, None) - - # The version is a required string field, but not always provided, - # or sometimes provided as a float. Be lenient. - oembed_version = oembed.get("version", "1.0") - if oembed_version != "1.0" and oembed_version != 1: - return OEmbedResult({}, None, None) - - # Attempt to parse the cache age, if possible. - try: - cache_age = int(oembed.get("cache_age")) * 1000 - except (TypeError, ValueError): - # If the cache age cannot be parsed (e.g. wrong type or invalid - # string), ignore it. - cache_age = None - - # The oEmbed response converted to Open Graph. - open_graph_response: JsonDict = {"og:url": url} - - title = oembed.get("title") - if title and isinstance(title, str): - # A common WordPress plug-in seems to incorrectly escape entities - # in the oEmbed response. - open_graph_response["og:title"] = html.unescape(title) - - author_name = oembed.get("author_name") - if not isinstance(author_name, str): - author_name = None - - # Use the provider name and as the site. - provider_name = oembed.get("provider_name") - if provider_name and isinstance(provider_name, str): - open_graph_response["og:site_name"] = provider_name - - # If a thumbnail exists, use it. Note that dimensions will be calculated later. - thumbnail_url = oembed.get("thumbnail_url") - if thumbnail_url and isinstance(thumbnail_url, str): - open_graph_response["og:image"] = thumbnail_url - - # Process each type separately. - oembed_type = oembed.get("type") - if oembed_type == "rich": - html_str = oembed.get("html") - if isinstance(html_str, str): - calc_description_and_urls(open_graph_response, html_str) - - elif oembed_type == "photo": - # If this is a photo, use the full image, not the thumbnail. - url = oembed.get("url") - if url and isinstance(url, str): - open_graph_response["og:image"] = url - - elif oembed_type == "video": - open_graph_response["og:type"] = "video.other" - html_str = oembed.get("html") - if html_str and isinstance(html_str, str): - calc_description_and_urls(open_graph_response, oembed["html"]) - for size in ("width", "height"): - val = oembed.get(size) - if type(val) is int: - open_graph_response[f"og:video:{size}"] = val - - elif oembed_type == "link": - open_graph_response["og:type"] = "website" - - else: - logger.warning("Unknown oEmbed type: %s", oembed_type) - - return OEmbedResult(open_graph_response, author_name, cache_age) - - -def _fetch_urls(tree: "etree.Element", tag_name: str) -> List[str]: - results = [] - for tag in tree.xpath("//*/" + tag_name): - if "src" in tag.attrib: - results.append(tag.attrib["src"]) - return results - - -def calc_description_and_urls(open_graph_response: JsonDict, html_body: str) -> None: - """ - Calculate description for an HTML document. - - This uses lxml to convert the HTML document into plaintext. If errors - occur during processing of the document, an empty response is returned. 
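To make parse_oembed_response above concrete, a sketch of the mapping it performs for an invented photo-type response::

    raw_body = (
        b'{"version": "1.0", "type": "photo", "title": "A sunset",'
        b' "url": "https://example.com/sunset.jpg", "cache_age": 3600}'
    )
    # parse_oembed_response(requested_url, raw_body) on an OEmbedProvider
    # would yield roughly:
    #   open_graph_result = {"og:url": requested_url,
    #                        "og:title": "A sunset",
    #                        "og:image": "https://example.com/sunset.jpg"}
    #   author_name = None
    #   cache_age = 3_600_000   # 3600 seconds converted to milliseconds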
- - Args: - open_graph_response: The current Open Graph summary. This is updated with additional fields. - html_body: The HTML document, as bytes. - - Returns: - The summary - """ - # If there's no body, nothing useful is going to be found. - if not html_body: - return - - from lxml import etree - - # Create an HTML parser. If this fails, log and return no metadata. - parser = etree.HTMLParser(recover=True, encoding="utf-8") - - # Attempt to parse the body. If this fails, log and return no metadata. - tree = etree.fromstring(html_body, parser) - - # The data was successfully parsed, but no tree was found. - if tree is None: - return - - # Attempt to find interesting URLs (images, videos, embeds). - if "og:image" not in open_graph_response: - image_urls = _fetch_urls(tree, "img") - if image_urls: - open_graph_response["og:image"] = image_urls[0] - - video_urls = _fetch_urls(tree, "video") + _fetch_urls(tree, "embed") - if video_urls: - open_graph_response["og:video"] = video_urls[0] - - description = parse_html_description(tree) - if description: - open_graph_response["og:description"] = description diff --git a/synapse/rest/media/v1/preview_html.py b/synapse/rest/media/v1/preview_html.py deleted file mode 100644 index 516d0434f0..0000000000 --- a/synapse/rest/media/v1/preview_html.py +++ /dev/null @@ -1,501 +0,0 @@ -# Copyright 2021 The Matrix.org Foundation C.I.C. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -import codecs -import logging -import re -from typing import ( - TYPE_CHECKING, - Callable, - Dict, - Generator, - Iterable, - List, - Optional, - Set, - Union, -) - -if TYPE_CHECKING: - from lxml import etree - -logger = logging.getLogger(__name__) - -_charset_match = re.compile( - rb'<\s*meta[^>]*charset\s*=\s*"?([a-z0-9_-]+)"?', flags=re.I -) -_xml_encoding_match = re.compile( - rb'\s*<\s*\?\s*xml[^>]*encoding="([a-z0-9_-]+)"', flags=re.I -) -_content_type_match = re.compile(r'.*; *charset="?(.*?)"?(;|$)', flags=re.I) - -# Certain elements aren't meant for display. -ARIA_ROLES_TO_IGNORE = {"directory", "menu", "menubar", "toolbar"} - - -def _normalise_encoding(encoding: str) -> Optional[str]: - """Use the Python codec's name as the normalised entry.""" - try: - return codecs.lookup(encoding).name - except LookupError: - return None - - -def _get_html_media_encodings( - body: bytes, content_type: Optional[str] -) -> Iterable[str]: - """ - Get potential encoding of the body based on the (presumably) HTML body or the content-type header. - - The precedence used for finding a character encoding is: - - 1. tag with a charset declared. - 2. The XML document's character encoding attribute. - 3. The Content-Type header. - 4. Fallback to utf-8. - 5. Fallback to windows-1252. - - This roughly follows the algorithm used by BeautifulSoup's bs4.dammit.EncodingDetector. - - Args: - body: The HTML document, as bytes. - content_type: The Content-Type header. - - Returns: - The character encoding of the body, as a string. - """ - # There's no point in returning an encoding more than once. 
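A worked example of the precedence listed above, using an invented document::

    body = b'<html><head><meta charset="ISO-8859-1"></head><body>hi</body></html>'
    content_type = 'text/html; charset="utf-8"'
    # _get_html_media_encodings(body, content_type) yields, in order:
    #   "iso8859-1"   (the meta charset, normalised via codecs.lookup)
    #   "utf-8"       (from the Content-Type header)
    #   "cp1252"      (final fallback; "utf-8" is not repeated)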
- attempted_encodings: Set[str] = set() - - # Limit searches to the first 1kb, since it ought to be at the top. - body_start = body[:1024] - - # Check if it has an encoding set in a meta tag. - match = _charset_match.search(body_start) - if match: - encoding = _normalise_encoding(match.group(1).decode("ascii")) - if encoding: - attempted_encodings.add(encoding) - yield encoding - - # TODO Support - - # Check if it has an XML document with an encoding. - match = _xml_encoding_match.match(body_start) - if match: - encoding = _normalise_encoding(match.group(1).decode("ascii")) - if encoding and encoding not in attempted_encodings: - attempted_encodings.add(encoding) - yield encoding - - # Check the HTTP Content-Type header for a character set. - if content_type: - content_match = _content_type_match.match(content_type) - if content_match: - encoding = _normalise_encoding(content_match.group(1)) - if encoding and encoding not in attempted_encodings: - attempted_encodings.add(encoding) - yield encoding - - # Finally, fallback to UTF-8, then windows-1252. - for fallback in ("utf-8", "cp1252"): - if fallback not in attempted_encodings: - yield fallback - - -def decode_body( - body: bytes, uri: str, content_type: Optional[str] = None -) -> Optional["etree.Element"]: - """ - This uses lxml to parse the HTML document. - - Args: - body: The HTML document, as bytes. - uri: The URI used to download the body. - content_type: The Content-Type header. - - Returns: - The parsed HTML body, or None if an error occurred during processed. - """ - # If there's no body, nothing useful is going to be found. - if not body: - return None - - # The idea here is that multiple encodings are tried until one works. - # Unfortunately the result is never used and then LXML will decode the string - # again with the found encoding. - for encoding in _get_html_media_encodings(body, content_type): - try: - body.decode(encoding) - except Exception: - pass - else: - break - else: - logger.warning("Unable to decode HTML body for %s", uri) - return None - - from lxml import etree - - # Create an HTML parser. - parser = etree.HTMLParser(recover=True, encoding=encoding) - - # Attempt to parse the body. Returns None if the body was successfully - # parsed, but no tree was found. - return etree.fromstring(body, parser) - - -def _get_meta_tags( - tree: "etree.Element", - property: str, - prefix: str, - property_mapper: Optional[Callable[[str], Optional[str]]] = None, -) -> Dict[str, Optional[str]]: - """ - Search for meta tags prefixed with a particular string. - - Args: - tree: The parsed HTML document. - property: The name of the property which contains the tag name, e.g. - "property" for Open Graph. - prefix: The prefix on the property to search for, e.g. "og" for Open Graph. - property_mapper: An optional callable to map the property to the Open Graph - form. Can return None for a key to ignore that key. - - Returns: - A map of tag name to value. - """ - results: Dict[str, Optional[str]] = {} - for tag in tree.xpath( - f"//*/meta[starts-with(@{property}, '{prefix}:')][@content][not(@content='')]" - ): - # if we've got more than 50 tags, someone is taking the piss - if len(results) >= 50: - logger.warning( - "Skipping parsing of Open Graph for page with too many '%s:' tags", - prefix, - ) - return {} - - key = tag.attrib[property] - if property_mapper: - key = property_mapper(key) - # None is a special value used to ignore a value. 
- if key is None: - continue - - results[key] = tag.attrib["content"] - - return results - - -def _map_twitter_to_open_graph(key: str) -> Optional[str]: - """ - Map a Twitter card property to the analogous Open Graph property. - - Args: - key: The Twitter card property (starts with "twitter:"). - - Returns: - The Open Graph property (starts with "og:") or None to have this property - be ignored. - """ - # Twitter card properties with no analogous Open Graph property. - if key == "twitter:card" or key == "twitter:creator": - return None - if key == "twitter:site": - return "og:site_name" - # Otherwise, swap twitter to og. - return "og" + key[7:] - - -def parse_html_to_open_graph(tree: "etree.Element") -> Dict[str, Optional[str]]: - """ - Parse the HTML document into an Open Graph response. - - This uses lxml to search the HTML document for Open Graph data (or - synthesizes it from the document). - - Args: - tree: The parsed HTML document. - - Returns: - The Open Graph response as a dictionary. - """ - - # Search for Open Graph (og:) meta tags, e.g.: - # - # "og:type" : "video", - # "og:url" : "https://www.youtube.com/watch?v=LXDBoHyjmtw", - # "og:site_name" : "YouTube", - # "og:video:type" : "application/x-shockwave-flash", - # "og:description" : "Fun stuff happening here", - # "og:title" : "RemoteJam - Matrix team hack for Disrupt Europe Hackathon", - # "og:image" : "https://i.ytimg.com/vi/LXDBoHyjmtw/maxresdefault.jpg", - # "og:video:url" : "http://www.youtube.com/v/LXDBoHyjmtw?version=3&autohide=1", - # "og:video:width" : "1280" - # "og:video:height" : "720", - # "og:video:secure_url": "https://www.youtube.com/v/LXDBoHyjmtw?version=3", - - og = _get_meta_tags(tree, "property", "og") - - # TODO: Search for properties specific to the different Open Graph types, - # such as article: meta tags, e.g.: - # - # "article:publisher" : "https://www.facebook.com/thethudonline" /> - # "article:author" content="https://www.facebook.com/thethudonline" /> - # "article:tag" content="baby" /> - # "article:section" content="Breaking News" /> - # "article:published_time" content="2016-03-31T19:58:24+00:00" /> - # "article:modified_time" content="2016-04-01T18:31:53+00:00" /> - - # Search for Twitter Card (twitter:) meta tags, e.g.: - # - # "twitter:site" : "@matrixdotorg" - # "twitter:creator" : "@matrixdotorg" - # - # Twitter cards tags also duplicate Open Graph tags. - # - # See https://developer.twitter.com/en/docs/twitter-for-websites/cards/guides/getting-started - twitter = _get_meta_tags(tree, "name", "twitter", _map_twitter_to_open_graph) - # Merge the Twitter values with the Open Graph values, but do not overwrite - # information from Open Graph tags. - for key, value in twitter.items(): - if key not in og: - og[key] = value - - if "og:title" not in og: - # Attempt to find a title from the title tag, or the biggest header on the page. - title = tree.xpath("((//title)[1] | (//h1)[1] | (//h2)[1] | (//h3)[1])/text()") - if title: - og["og:title"] = title[0].strip() - else: - og["og:title"] = None - - if "og:image" not in og: - meta_image = tree.xpath( - "//*/meta[translate(@itemprop, 'IMAGE', 'image')='image'][not(@content='')]/@content[1]" - ) - # If a meta image is found, use it. - if meta_image: - og["og:image"] = meta_image[0] - else: - # Try to find images which are larger than 10px by 10px. 
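As a sketch of the Twitter-card mapping above (tag values invented, apart from the @matrixdotorg example already quoted in the comments)::

    # _map_twitter_to_open_graph maps, for example:
    #   "twitter:site"        -> "og:site_name"
    #   "twitter:description" -> "og:description"
    #   "twitter:card"        -> None (ignored entirely)
    # so a page carrying a twitter:site tag of "@matrixdotorg" only contributes
    # og:site_name if the Open Graph tags did not already provide one.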
- # - # TODO: consider inlined CSS styles as well as width & height attribs - images = tree.xpath("//img[@src][number(@width)>10][number(@height)>10]") - images = sorted( - images, - key=lambda i: ( - -1 * float(i.attrib["width"]) * float(i.attrib["height"]) - ), - ) - # If no images were found, try to find *any* images. - if not images: - images = tree.xpath("//img[@src][1]") - if images: - og["og:image"] = images[0].attrib["src"] - - # Finally, fallback to the favicon if nothing else. - else: - favicons = tree.xpath("//link[@href][contains(@rel, 'icon')]/@href[1]") - if favicons: - og["og:image"] = favicons[0] - - if "og:description" not in og: - # Check the first meta description tag for content. - meta_description = tree.xpath( - "//*/meta[translate(@name, 'DESCRIPTION', 'description')='description'][not(@content='')]/@content[1]" - ) - # If a meta description is found with content, use it. - if meta_description: - og["og:description"] = meta_description[0] - else: - og["og:description"] = parse_html_description(tree) - elif og["og:description"]: - # This must be a non-empty string at this point. - assert isinstance(og["og:description"], str) - og["og:description"] = summarize_paragraphs([og["og:description"]]) - - # TODO: delete the url downloads to stop diskfilling, - # as we only ever cared about its OG - return og - - -def parse_html_description(tree: "etree.Element") -> Optional[str]: - """ - Calculate a text description based on an HTML document. - - Grabs any text nodes which are inside the tag, unless they are within - an HTML5 semantic markup tag (
,