author | Patrick Cloke <clokep@users.noreply.github.com> | 2023-02-27 08:26:05 -0500
committer | GitHub <noreply@github.com> | 2023-02-27 08:26:05 -0500
commit | 4fc8875876374ec8f97a3b3cc344a4e3abcf769f (patch)
tree | 41b920427c0f62cd1463324c89fe7f5ce3d15164 /synapse/rest/media/v1
parent | Small fixes to `MatrixFederationHttpClient` docstrings (#15148) (diff)
download | synapse-4fc8875876374ec8f97a3b3cc344a4e3abcf769f.tar.xz
Refactor media modules. (#15146)
* Removes the `v1` directory from `tests.rest.media.v1`.
* Moves the non-REST code from `synapse.rest.media.v1` to `synapse.media`.
* Flattens the `v1` directory from `synapse.rest.media`, but leaves compatibility with 3rd-party media repositories and spam checkers.
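The compatibility shim is visible at the bottom of the `_base.py` diff below: the old module re-exports `FileInfo` and `Responder` from their new home in `synapse.media._base`. As a rough sketch of what that preserves, a third-party module written against the old path keeps importing unchanged (the `describe` helper here is purely illustrative, not part of Synapse):

```python
# Old-style import used by existing third-party media storage providers and
# spam checkers. After this commit it still resolves, because
# synapse/rest/media/v1/_base.py re-exports these names from synapse.media._base.
from synapse.rest.media.v1._base import FileInfo, Responder


def describe(file_info: FileInfo) -> str:
    # FileInfo.server_name is None for locally-uploaded media.
    origin = file_info.server_name or "local"
    return f"{origin}/{file_info.file_id}"
```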
Diffstat (limited to 'synapse/rest/media/v1')
-rw-r--r-- | synapse/rest/media/v1/_base.py | 470
-rw-r--r-- | synapse/rest/media/v1/config_resource.py | 41
-rw-r--r-- | synapse/rest/media/v1/download_resource.py | 76
-rw-r--r-- | synapse/rest/media/v1/filepath.py | 410
-rw-r--r-- | synapse/rest/media/v1/media_repository.py | 1112
-rw-r--r-- | synapse/rest/media/v1/media_storage.py | 365
-rw-r--r-- | synapse/rest/media/v1/oembed.py | 265
-rw-r--r-- | synapse/rest/media/v1/preview_html.py | 501
-rw-r--r-- | synapse/rest/media/v1/preview_url_resource.py | 871
-rw-r--r-- | synapse/rest/media/v1/storage_provider.py | 172
-rw-r--r-- | synapse/rest/media/v1/thumbnail_resource.py | 555
-rw-r--r-- | synapse/rest/media/v1/thumbnailer.py | 221
-rw-r--r-- | synapse/rest/media/v1/upload_resource.py | 108
13 files changed, 12 insertions, 5155 deletions
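For first-party code, imports move rather than rely on the shim. Only `synapse.media._base` is confirmed by this diff; the other post-move module names below are assumptions that simply follow the pattern stated in the commit message:

```python
# Before: REST and non-REST media code mixed under synapse.rest.media.v1.
# from synapse.rest.media.v1._base import FileInfo, Responder

# After: non-REST code lives under synapse.media (confirmed for _base by the
# backwards-compatibility shim added in this commit).
from synapse.media._base import FileInfo, Responder

# Assumed equivalents for the other moved modules, following the same pattern:
# from synapse.media.filepath import MediaFilePaths
# from synapse.media.media_repository import MediaRepository
```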
diff --git a/synapse/rest/media/v1/_base.py b/synapse/rest/media/v1/_base.py index ef8334ae25..88427a5737 100644 --- a/synapse/rest/media/v1/_base.py +++ b/synapse/rest/media/v1/_base.py @@ -1,5 +1,4 @@ -# Copyright 2014-2016 OpenMarket Ltd -# Copyright 2019-2021 The Matrix.org Foundation C.I.C. +# Copyright 2023 The Matrix.org Foundation C.I.C. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -12,468 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +# -import logging -import os -import urllib -from abc import ABC, abstractmethod -from types import TracebackType -from typing import Awaitable, Dict, Generator, List, Optional, Tuple, Type - -import attr - -from twisted.internet.interfaces import IConsumer -from twisted.protocols.basic import FileSender -from twisted.web.server import Request - -from synapse.api.errors import Codes, SynapseError, cs_error -from synapse.http.server import finish_request, respond_with_json -from synapse.http.site import SynapseRequest -from synapse.logging.context import make_deferred_yieldable -from synapse.util.stringutils import is_ascii, parse_and_validate_server_name - -logger = logging.getLogger(__name__) - -# list all text content types that will have the charset default to UTF-8 when -# none is given -TEXT_CONTENT_TYPES = [ - "text/css", - "text/csv", - "text/html", - "text/calendar", - "text/plain", - "text/javascript", - "application/json", - "application/ld+json", - "application/rtf", - "image/svg+xml", - "text/xml", -] - - -def parse_media_id(request: Request) -> Tuple[str, str, Optional[str]]: - """Parses the server name, media ID and optional file name from the request URI - - Also performs some rough validation on the server name. - - Args: - request: The `Request`. - - Returns: - A tuple containing the parsed server name, media ID and optional file name. - - Raises: - SynapseError(404): if parsing or validation fail for any reason - """ - try: - # The type on postpath seems incorrect in Twisted 21.2.0. - postpath: List[bytes] = request.postpath # type: ignore - assert postpath - - # This allows users to append e.g. /test.png to the URL. Useful for - # clients that parse the URL to see content type. 
- server_name_bytes, media_id_bytes = postpath[:2] - server_name = server_name_bytes.decode("utf-8") - media_id = media_id_bytes.decode("utf8") - - # Validate the server name, raising if invalid - parse_and_validate_server_name(server_name) - - file_name = None - if len(postpath) > 2: - try: - file_name = urllib.parse.unquote(postpath[-1].decode("utf-8")) - except UnicodeDecodeError: - pass - return server_name, media_id, file_name - except Exception: - raise SynapseError( - 404, "Invalid media id token %r" % (request.postpath,), Codes.UNKNOWN - ) - - -def respond_404(request: SynapseRequest) -> None: - respond_with_json( - request, - 404, - cs_error("Not found %r" % (request.postpath,), code=Codes.NOT_FOUND), - send_cors=True, - ) - - -async def respond_with_file( - request: SynapseRequest, - media_type: str, - file_path: str, - file_size: Optional[int] = None, - upload_name: Optional[str] = None, -) -> None: - logger.debug("Responding with %r", file_path) - - if os.path.isfile(file_path): - if file_size is None: - stat = os.stat(file_path) - file_size = stat.st_size - - add_file_headers(request, media_type, file_size, upload_name) - - with open(file_path, "rb") as f: - await make_deferred_yieldable(FileSender().beginFileTransfer(f, request)) - - finish_request(request) - else: - respond_404(request) - - -def add_file_headers( - request: Request, - media_type: str, - file_size: Optional[int], - upload_name: Optional[str], -) -> None: - """Adds the correct response headers in preparation for responding with the - media. - - Args: - request - media_type: The media/content type. - file_size: Size in bytes of the media, if known. - upload_name: The name of the requested file, if any. - """ - - def _quote(x: str) -> str: - return urllib.parse.quote(x.encode("utf-8")) - - # Default to a UTF-8 charset for text content types. - # ex, uses UTF-8 for 'text/css' but not 'text/css; charset=UTF-16' - if media_type.lower() in TEXT_CONTENT_TYPES: - content_type = media_type + "; charset=UTF-8" - else: - content_type = media_type - - request.setHeader(b"Content-Type", content_type.encode("UTF-8")) - if upload_name: - # RFC6266 section 4.1 [1] defines both `filename` and `filename*`. - # - # `filename` is defined to be a `value`, which is defined by RFC2616 - # section 3.6 [2] to be a `token` or a `quoted-string`, where a `token` - # is (essentially) a single US-ASCII word, and a `quoted-string` is a - # US-ASCII string surrounded by double-quotes, using backslash as an - # escape character. Note that %-encoding is *not* permitted. - # - # `filename*` is defined to be an `ext-value`, which is defined in - # RFC5987 section 3.2.1 [3] to be `charset "'" [ language ] "'" value-chars`, - # where `value-chars` is essentially a %-encoded string in the given charset. - # - # [1]: https://tools.ietf.org/html/rfc6266#section-4.1 - # [2]: https://tools.ietf.org/html/rfc2616#section-3.6 - # [3]: https://tools.ietf.org/html/rfc5987#section-3.2.1 - - # We avoid the quoted-string version of `filename`, because (a) synapse didn't - # correctly interpret those as of 0.99.2 and (b) they are a bit of a pain and we - # may as well just do the filename* version. - if _can_encode_filename_as_token(upload_name): - disposition = "inline; filename=%s" % (upload_name,) - else: - disposition = "inline; filename*=utf-8''%s" % (_quote(upload_name),) - - request.setHeader(b"Content-Disposition", disposition.encode("ascii")) - - # cache for at least a day. 
- # XXX: we might want to turn this off for data we don't want to - # recommend caching as it's sensitive or private - or at least - # select private. don't bother setting Expires as all our - # clients are smart enough to be happy with Cache-Control - request.setHeader(b"Cache-Control", b"public,max-age=86400,s-maxage=86400") - if file_size is not None: - request.setHeader(b"Content-Length", b"%d" % (file_size,)) - - # Tell web crawlers to not index, archive, or follow links in media. This - # should help to prevent things in the media repo from showing up in web - # search results. - request.setHeader(b"X-Robots-Tag", "noindex, nofollow, noarchive, noimageindex") - - -# separators as defined in RFC2616. SP and HT are handled separately. -# see _can_encode_filename_as_token. -_FILENAME_SEPARATOR_CHARS = { - "(", - ")", - "<", - ">", - "@", - ",", - ";", - ":", - "\\", - '"', - "/", - "[", - "]", - "?", - "=", - "{", - "}", -} - - -def _can_encode_filename_as_token(x: str) -> bool: - for c in x: - # from RFC2616: - # - # token = 1*<any CHAR except CTLs or separators> - # - # separators = "(" | ")" | "<" | ">" | "@" - # | "," | ";" | ":" | "\" | <"> - # | "/" | "[" | "]" | "?" | "=" - # | "{" | "}" | SP | HT - # - # CHAR = <any US-ASCII character (octets 0 - 127)> - # - # CTL = <any US-ASCII control character - # (octets 0 - 31) and DEL (127)> - # - if ord(c) >= 127 or ord(c) <= 32 or c in _FILENAME_SEPARATOR_CHARS: - return False - return True - - -async def respond_with_responder( - request: SynapseRequest, - responder: "Optional[Responder]", - media_type: str, - file_size: Optional[int], - upload_name: Optional[str] = None, -) -> None: - """Responds to the request with given responder. If responder is None then - returns 404. - - Args: - request - responder - media_type: The media/content type. - file_size: Size in bytes of the media. If not known it should be None - upload_name: The name of the requested file, if any. - """ - if not responder: - respond_404(request) - return - - # If we have a responder we *must* use it as a context manager. - with responder: - if request._disconnected: - logger.warning( - "Not sending response to request %s, already disconnected.", request - ) - return - - logger.debug("Responding to media request with responder %s", responder) - add_file_headers(request, media_type, file_size, upload_name) - try: - await responder.write_to_consumer(request) - except Exception as e: - # The majority of the time this will be due to the client having gone - # away. Unfortunately, Twisted simply throws a generic exception at us - # in that case. - logger.warning("Failed to write to consumer: %s %s", type(e), e) - - # Unregister the producer, if it has one, so Twisted doesn't complain - if request.producer: - request.unregisterProducer() - - finish_request(request) - - -class Responder(ABC): - """Represents a response that can be streamed to the requester. - - Responder is a context manager which *must* be used, so that any resources - held can be cleaned up. - """ - - @abstractmethod - def write_to_consumer(self, consumer: IConsumer) -> Awaitable: - """Stream response into consumer - - Args: - consumer: The consumer to stream into. 
- - Returns: - Resolves once the response has finished being written - """ - raise NotImplementedError() - - def __enter__(self) -> None: # noqa: B027 - pass - - def __exit__( # noqa: B027 - self, - exc_type: Optional[Type[BaseException]], - exc_val: Optional[BaseException], - exc_tb: Optional[TracebackType], - ) -> None: - pass - - -@attr.s(slots=True, frozen=True, auto_attribs=True) -class ThumbnailInfo: - """Details about a generated thumbnail.""" - - width: int - height: int - method: str - # Content type of thumbnail, e.g. image/png - type: str - # The size of the media file, in bytes. - length: Optional[int] = None - - -@attr.s(slots=True, frozen=True, auto_attribs=True) -class FileInfo: - """Details about a requested/uploaded file.""" - - # The server name where the media originated from, or None if local. - server_name: Optional[str] - # The local ID of the file. For local files this is the same as the media_id - file_id: str - # If the file is for the url preview cache - url_cache: bool = False - # Whether the file is a thumbnail or not. - thumbnail: Optional[ThumbnailInfo] = None - - # The below properties exist to maintain compatibility with third-party modules. - @property - def thumbnail_width(self) -> Optional[int]: - if not self.thumbnail: - return None - return self.thumbnail.width - - @property - def thumbnail_height(self) -> Optional[int]: - if not self.thumbnail: - return None - return self.thumbnail.height - - @property - def thumbnail_method(self) -> Optional[str]: - if not self.thumbnail: - return None - return self.thumbnail.method - - @property - def thumbnail_type(self) -> Optional[str]: - if not self.thumbnail: - return None - return self.thumbnail.type - - @property - def thumbnail_length(self) -> Optional[int]: - if not self.thumbnail: - return None - return self.thumbnail.length - - -def get_filename_from_headers(headers: Dict[bytes, List[bytes]]) -> Optional[str]: - """ - Get the filename of the downloaded file by inspecting the - Content-Disposition HTTP header. - - Args: - headers: The HTTP request headers. - - Returns: - The filename, or None. - """ - content_disposition = headers.get(b"Content-Disposition", [b""]) - - # No header, bail out. - if not content_disposition[0]: - return None - - _, params = _parse_header(content_disposition[0]) - - upload_name = None - - # First check if there is a valid UTF-8 filename - upload_name_utf8 = params.get(b"filename*", None) - if upload_name_utf8: - if upload_name_utf8.lower().startswith(b"utf-8''"): - upload_name_utf8 = upload_name_utf8[7:] - # We have a filename*= section. This MUST be ASCII, and any UTF-8 - # bytes are %-quoted. - try: - # Once it is decoded, we can then unquote the %-encoded - # parts strictly into a unicode string. - upload_name = urllib.parse.unquote( - upload_name_utf8.decode("ascii"), errors="strict" - ) - except UnicodeDecodeError: - # Incorrect UTF-8. - pass - - # If there isn't check for an ascii name. - if not upload_name: - upload_name_ascii = params.get(b"filename", None) - if upload_name_ascii and is_ascii(upload_name_ascii): - upload_name = upload_name_ascii.decode("ascii") - - # This may be None here, indicating we did not find a matching name. - return upload_name - - -def _parse_header(line: bytes) -> Tuple[bytes, Dict[bytes, bytes]]: - """Parse a Content-type like header. - - Cargo-culted from `cgi`, but works on bytes rather than strings. 
- - Args: - line: header to be parsed - - Returns: - The main content-type, followed by the parameter dictionary - """ - parts = _parseparam(b";" + line) - key = next(parts) - pdict = {} - for p in parts: - i = p.find(b"=") - if i >= 0: - name = p[:i].strip().lower() - value = p[i + 1 :].strip() - - # strip double-quotes - if len(value) >= 2 and value[0:1] == value[-1:] == b'"': - value = value[1:-1] - value = value.replace(b"\\\\", b"\\").replace(b'\\"', b'"') - pdict[name] = value - - return key, pdict - - -def _parseparam(s: bytes) -> Generator[bytes, None, None]: - """Generator which splits the input on ;, respecting double-quoted sequences - - Cargo-culted from `cgi`, but works on bytes rather than strings. - - Args: - s: header to be parsed - - Returns: - The split input - """ - while s[:1] == b";": - s = s[1:] - - # look for the next ; - end = s.find(b";") - - # if there is an odd number of " marks between here and the next ;, skip to the - # next ; instead - while end > 0 and (s.count(b'"', 0, end) - s.count(b'\\"', 0, end)) % 2: - end = s.find(b";", end + 1) - - if end < 0: - end = len(s) - f = s[:end] - yield f.strip() - s = s[end:] +# This exists purely for backwards compatibility with media providers and spam checkers. +from synapse.media._base import FileInfo, Responder # noqa: F401 diff --git a/synapse/rest/media/v1/config_resource.py b/synapse/rest/media/v1/config_resource.py deleted file mode 100644 index a95804d327..0000000000 --- a/synapse/rest/media/v1/config_resource.py +++ /dev/null @@ -1,41 +0,0 @@ -# Copyright 2018 Will Hunt <will@half-shot.uk> -# Copyright 2020-2021 The Matrix.org Foundation C.I.C. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -from typing import TYPE_CHECKING - -from synapse.http.server import DirectServeJsonResource, respond_with_json -from synapse.http.site import SynapseRequest - -if TYPE_CHECKING: - from synapse.server import HomeServer - - -class MediaConfigResource(DirectServeJsonResource): - isLeaf = True - - def __init__(self, hs: "HomeServer"): - super().__init__() - config = hs.config - self.clock = hs.get_clock() - self.auth = hs.get_auth() - self.limits_dict = {"m.upload.size": config.media.max_upload_size} - - async def _async_render_GET(self, request: SynapseRequest) -> None: - await self.auth.get_user_by_req(request) - respond_with_json(request, 200, self.limits_dict, send_cors=True) - - async def _async_render_OPTIONS(self, request: SynapseRequest) -> None: - respond_with_json(request, 200, {}, send_cors=True) diff --git a/synapse/rest/media/v1/download_resource.py b/synapse/rest/media/v1/download_resource.py deleted file mode 100644 index 048a042692..0000000000 --- a/synapse/rest/media/v1/download_resource.py +++ /dev/null @@ -1,76 +0,0 @@ -# Copyright 2014-2016 OpenMarket Ltd -# Copyright 2020-2021 The Matrix.org Foundation C.I.C. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -import logging -from typing import TYPE_CHECKING - -from synapse.http.server import ( - DirectServeJsonResource, - set_corp_headers, - set_cors_headers, -) -from synapse.http.servlet import parse_boolean -from synapse.http.site import SynapseRequest - -from ._base import parse_media_id, respond_404 - -if TYPE_CHECKING: - from synapse.rest.media.v1.media_repository import MediaRepository - from synapse.server import HomeServer - -logger = logging.getLogger(__name__) - - -class DownloadResource(DirectServeJsonResource): - isLeaf = True - - def __init__(self, hs: "HomeServer", media_repo: "MediaRepository"): - super().__init__() - self.media_repo = media_repo - self.server_name = hs.hostname - - async def _async_render_GET(self, request: SynapseRequest) -> None: - set_cors_headers(request) - set_corp_headers(request) - request.setHeader( - b"Content-Security-Policy", - b"sandbox;" - b" default-src 'none';" - b" script-src 'none';" - b" plugin-types application/pdf;" - b" style-src 'unsafe-inline';" - b" media-src 'self';" - b" object-src 'self';", - ) - # Limited non-standard form of CSP for IE11 - request.setHeader(b"X-Content-Security-Policy", b"sandbox;") - request.setHeader( - b"Referrer-Policy", - b"no-referrer", - ) - server_name, media_id, name = parse_media_id(request) - if server_name == self.server_name: - await self.media_repo.get_local_media(request, media_id, name) - else: - allow_remote = parse_boolean(request, "allow_remote", default=True) - if not allow_remote: - logger.info( - "Rejecting request for remote media %s/%s due to allow_remote", - server_name, - media_id, - ) - respond_404(request) - return - - await self.media_repo.get_remote_media(request, server_name, media_id, name) diff --git a/synapse/rest/media/v1/filepath.py b/synapse/rest/media/v1/filepath.py deleted file mode 100644 index 1f6441c412..0000000000 --- a/synapse/rest/media/v1/filepath.py +++ /dev/null @@ -1,410 +0,0 @@ -# Copyright 2014-2016 OpenMarket Ltd -# Copyright 2020-2021 The Matrix.org Foundation C.I.C. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import functools -import os -import re -import string -from typing import Any, Callable, List, TypeVar, Union, cast - -NEW_FORMAT_ID_RE = re.compile(r"^\d\d\d\d-\d\d-\d\d") - - -F = TypeVar("F", bound=Callable[..., str]) - - -def _wrap_in_base_path(func: F) -> F: - """Takes a function that returns a relative path and turns it into an - absolute path based on the location of the primary media store - """ - - @functools.wraps(func) - def _wrapped(self: "MediaFilePaths", *args: Any, **kwargs: Any) -> str: - path = func(self, *args, **kwargs) - return os.path.join(self.base_path, path) - - return cast(F, _wrapped) - - -GetPathMethod = TypeVar( - "GetPathMethod", bound=Union[Callable[..., str], Callable[..., List[str]]] -) - - -def _wrap_with_jail_check(relative: bool) -> Callable[[GetPathMethod], GetPathMethod]: - """Wraps a path-returning method to check that the returned path(s) do not escape - the media store directory. - - The path-returning method may return either a single path, or a list of paths. - - The check is not expected to ever fail, unless `func` is missing a call to - `_validate_path_component`, or `_validate_path_component` is buggy. - - Args: - relative: A boolean indicating whether the wrapped method returns paths relative - to the media store directory. - - Returns: - A method which will wrap a path-returning method, adding a check to ensure that - the returned path(s) lie within the media store directory. The check will raise - a `ValueError` if it fails. - """ - - def _wrap_with_jail_check_inner(func: GetPathMethod) -> GetPathMethod: - @functools.wraps(func) - def _wrapped( - self: "MediaFilePaths", *args: Any, **kwargs: Any - ) -> Union[str, List[str]]: - path_or_paths = func(self, *args, **kwargs) - - if isinstance(path_or_paths, list): - paths_to_check = path_or_paths - else: - paths_to_check = [path_or_paths] - - for path in paths_to_check: - # Construct the path that will ultimately be used. - # We cannot guess whether `path` is relative to the media store - # directory, since the media store directory may itself be a relative - # path. - if relative: - path = os.path.join(self.base_path, path) - normalized_path = os.path.normpath(path) - - # Now that `normpath` has eliminated `../`s and `./`s from the path, - # `os.path.commonpath` can be used to check whether it lies within the - # media store directory. - if ( - os.path.commonpath([normalized_path, self.normalized_base_path]) - != self.normalized_base_path - ): - # The path resolves to outside the media store directory, - # or `self.base_path` is `.`, which is an unlikely configuration. - raise ValueError(f"Invalid media store path: {path!r}") - - # Note that `os.path.normpath`/`abspath` has a subtle caveat: - # `a/b/c/../c` will normalize to `a/b/c`, but the former refers to a - # different path if `a/b/c` is a symlink. That is, the check above is - # not perfect and may allow a certain restricted subset of untrustworthy - # paths through. Since the check above is secondary to the main - # `_validate_path_component` checks, it's less important for it to be - # perfect. - # - # As an alternative, `os.path.realpath` will resolve symlinks, but - # proves problematic if there are symlinks inside the media store. - # eg. if `url_store/` is symlinked to elsewhere, its canonical path - # won't match that of the main media store directory. 
- - return path_or_paths - - return cast(GetPathMethod, _wrapped) - - return _wrap_with_jail_check_inner - - -ALLOWED_CHARACTERS = set( - string.ascii_letters - + string.digits - + "_-" - + ".[]:" # Domain names, IPv6 addresses and ports in server names -) -FORBIDDEN_NAMES = { - "", - os.path.curdir, # "." for the current platform - os.path.pardir, # ".." for the current platform -} - - -def _validate_path_component(name: str) -> str: - """Checks that the given string can be safely used as a path component - - Args: - name: The path component to check. - - Returns: - The path component if valid. - - Raises: - ValueError: If `name` cannot be safely used as a path component. - """ - if not ALLOWED_CHARACTERS.issuperset(name) or name in FORBIDDEN_NAMES: - raise ValueError(f"Invalid path component: {name!r}") - - return name - - -class MediaFilePaths: - """Describes where files are stored on disk. - - Most of the functions have a `*_rel` variant which returns a file path that - is relative to the base media store path. This is mainly used when we want - to write to the backup media store (when one is configured) - """ - - def __init__(self, primary_base_path: str): - self.base_path = primary_base_path - self.normalized_base_path = os.path.normpath(self.base_path) - - # Refuse to initialize if paths cannot be validated correctly for the current - # platform. - assert os.path.sep not in ALLOWED_CHARACTERS - assert os.path.altsep not in ALLOWED_CHARACTERS - # On Windows, paths have all sorts of weirdness which `_validate_path_component` - # does not consider. In any case, the remote media store can't work correctly - # for certain homeservers there, since ":"s aren't allowed in paths. - assert os.name == "posix" - - @_wrap_with_jail_check(relative=True) - def local_media_filepath_rel(self, media_id: str) -> str: - return os.path.join( - "local_content", - _validate_path_component(media_id[0:2]), - _validate_path_component(media_id[2:4]), - _validate_path_component(media_id[4:]), - ) - - local_media_filepath = _wrap_in_base_path(local_media_filepath_rel) - - @_wrap_with_jail_check(relative=True) - def local_media_thumbnail_rel( - self, media_id: str, width: int, height: int, content_type: str, method: str - ) -> str: - top_level_type, sub_type = content_type.split("/") - file_name = "%i-%i-%s-%s-%s" % (width, height, top_level_type, sub_type, method) - return os.path.join( - "local_thumbnails", - _validate_path_component(media_id[0:2]), - _validate_path_component(media_id[2:4]), - _validate_path_component(media_id[4:]), - _validate_path_component(file_name), - ) - - local_media_thumbnail = _wrap_in_base_path(local_media_thumbnail_rel) - - @_wrap_with_jail_check(relative=False) - def local_media_thumbnail_dir(self, media_id: str) -> str: - """ - Retrieve the local store path of thumbnails of a given media_id - - Args: - media_id: The media ID to query. 
- Returns: - Path of local_thumbnails from media_id - """ - return os.path.join( - self.base_path, - "local_thumbnails", - _validate_path_component(media_id[0:2]), - _validate_path_component(media_id[2:4]), - _validate_path_component(media_id[4:]), - ) - - @_wrap_with_jail_check(relative=True) - def remote_media_filepath_rel(self, server_name: str, file_id: str) -> str: - return os.path.join( - "remote_content", - _validate_path_component(server_name), - _validate_path_component(file_id[0:2]), - _validate_path_component(file_id[2:4]), - _validate_path_component(file_id[4:]), - ) - - remote_media_filepath = _wrap_in_base_path(remote_media_filepath_rel) - - @_wrap_with_jail_check(relative=True) - def remote_media_thumbnail_rel( - self, - server_name: str, - file_id: str, - width: int, - height: int, - content_type: str, - method: str, - ) -> str: - top_level_type, sub_type = content_type.split("/") - file_name = "%i-%i-%s-%s-%s" % (width, height, top_level_type, sub_type, method) - return os.path.join( - "remote_thumbnail", - _validate_path_component(server_name), - _validate_path_component(file_id[0:2]), - _validate_path_component(file_id[2:4]), - _validate_path_component(file_id[4:]), - _validate_path_component(file_name), - ) - - remote_media_thumbnail = _wrap_in_base_path(remote_media_thumbnail_rel) - - # Legacy path that was used to store thumbnails previously. - # Should be removed after some time, when most of the thumbnails are stored - # using the new path. - @_wrap_with_jail_check(relative=True) - def remote_media_thumbnail_rel_legacy( - self, server_name: str, file_id: str, width: int, height: int, content_type: str - ) -> str: - top_level_type, sub_type = content_type.split("/") - file_name = "%i-%i-%s-%s" % (width, height, top_level_type, sub_type) - return os.path.join( - "remote_thumbnail", - _validate_path_component(server_name), - _validate_path_component(file_id[0:2]), - _validate_path_component(file_id[2:4]), - _validate_path_component(file_id[4:]), - _validate_path_component(file_name), - ) - - @_wrap_with_jail_check(relative=False) - def remote_media_thumbnail_dir(self, server_name: str, file_id: str) -> str: - return os.path.join( - self.base_path, - "remote_thumbnail", - _validate_path_component(server_name), - _validate_path_component(file_id[0:2]), - _validate_path_component(file_id[2:4]), - _validate_path_component(file_id[4:]), - ) - - @_wrap_with_jail_check(relative=True) - def url_cache_filepath_rel(self, media_id: str) -> str: - if NEW_FORMAT_ID_RE.match(media_id): - # Media id is of the form <DATE><RANDOM_STRING> - # E.g.: 2017-09-28-fsdRDt24DS234dsf - return os.path.join( - "url_cache", - _validate_path_component(media_id[:10]), - _validate_path_component(media_id[11:]), - ) - else: - return os.path.join( - "url_cache", - _validate_path_component(media_id[0:2]), - _validate_path_component(media_id[2:4]), - _validate_path_component(media_id[4:]), - ) - - url_cache_filepath = _wrap_in_base_path(url_cache_filepath_rel) - - @_wrap_with_jail_check(relative=False) - def url_cache_filepath_dirs_to_delete(self, media_id: str) -> List[str]: - "The dirs to try and remove if we delete the media_id file" - if NEW_FORMAT_ID_RE.match(media_id): - return [ - os.path.join( - self.base_path, "url_cache", _validate_path_component(media_id[:10]) - ) - ] - else: - return [ - os.path.join( - self.base_path, - "url_cache", - _validate_path_component(media_id[0:2]), - _validate_path_component(media_id[2:4]), - ), - os.path.join( - self.base_path, "url_cache", 
_validate_path_component(media_id[0:2]) - ), - ] - - @_wrap_with_jail_check(relative=True) - def url_cache_thumbnail_rel( - self, media_id: str, width: int, height: int, content_type: str, method: str - ) -> str: - # Media id is of the form <DATE><RANDOM_STRING> - # E.g.: 2017-09-28-fsdRDt24DS234dsf - - top_level_type, sub_type = content_type.split("/") - file_name = "%i-%i-%s-%s-%s" % (width, height, top_level_type, sub_type, method) - - if NEW_FORMAT_ID_RE.match(media_id): - return os.path.join( - "url_cache_thumbnails", - _validate_path_component(media_id[:10]), - _validate_path_component(media_id[11:]), - _validate_path_component(file_name), - ) - else: - return os.path.join( - "url_cache_thumbnails", - _validate_path_component(media_id[0:2]), - _validate_path_component(media_id[2:4]), - _validate_path_component(media_id[4:]), - _validate_path_component(file_name), - ) - - url_cache_thumbnail = _wrap_in_base_path(url_cache_thumbnail_rel) - - @_wrap_with_jail_check(relative=True) - def url_cache_thumbnail_directory_rel(self, media_id: str) -> str: - # Media id is of the form <DATE><RANDOM_STRING> - # E.g.: 2017-09-28-fsdRDt24DS234dsf - - if NEW_FORMAT_ID_RE.match(media_id): - return os.path.join( - "url_cache_thumbnails", - _validate_path_component(media_id[:10]), - _validate_path_component(media_id[11:]), - ) - else: - return os.path.join( - "url_cache_thumbnails", - _validate_path_component(media_id[0:2]), - _validate_path_component(media_id[2:4]), - _validate_path_component(media_id[4:]), - ) - - url_cache_thumbnail_directory = _wrap_in_base_path( - url_cache_thumbnail_directory_rel - ) - - @_wrap_with_jail_check(relative=False) - def url_cache_thumbnail_dirs_to_delete(self, media_id: str) -> List[str]: - "The dirs to try and remove if we delete the media_id thumbnails" - # Media id is of the form <DATE><RANDOM_STRING> - # E.g.: 2017-09-28-fsdRDt24DS234dsf - if NEW_FORMAT_ID_RE.match(media_id): - return [ - os.path.join( - self.base_path, - "url_cache_thumbnails", - _validate_path_component(media_id[:10]), - _validate_path_component(media_id[11:]), - ), - os.path.join( - self.base_path, - "url_cache_thumbnails", - _validate_path_component(media_id[:10]), - ), - ] - else: - return [ - os.path.join( - self.base_path, - "url_cache_thumbnails", - _validate_path_component(media_id[0:2]), - _validate_path_component(media_id[2:4]), - _validate_path_component(media_id[4:]), - ), - os.path.join( - self.base_path, - "url_cache_thumbnails", - _validate_path_component(media_id[0:2]), - _validate_path_component(media_id[2:4]), - ), - os.path.join( - self.base_path, - "url_cache_thumbnails", - _validate_path_component(media_id[0:2]), - ), - ] diff --git a/synapse/rest/media/v1/media_repository.py b/synapse/rest/media/v1/media_repository.py deleted file mode 100644 index c70e1837af..0000000000 --- a/synapse/rest/media/v1/media_repository.py +++ /dev/null @@ -1,1112 +0,0 @@ -# Copyright 2014-2016 OpenMarket Ltd -# Copyright 2018-2021 The Matrix.org Foundation C.I.C. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. -import errno -import logging -import os -import shutil -from io import BytesIO -from typing import IO, TYPE_CHECKING, Dict, List, Optional, Set, Tuple - -from matrix_common.types.mxc_uri import MXCUri - -import twisted.internet.error -import twisted.web.http -from twisted.internet.defer import Deferred - -from synapse.api.errors import ( - FederationDeniedError, - HttpResponseException, - NotFoundError, - RequestSendFailed, - SynapseError, -) -from synapse.config._base import ConfigError -from synapse.config.repository import ThumbnailRequirement -from synapse.http.server import UnrecognizedRequestResource -from synapse.http.site import SynapseRequest -from synapse.logging.context import defer_to_thread -from synapse.metrics.background_process_metrics import run_as_background_process -from synapse.types import UserID -from synapse.util.async_helpers import Linearizer -from synapse.util.retryutils import NotRetryingDestination -from synapse.util.stringutils import random_string - -from ._base import ( - FileInfo, - Responder, - ThumbnailInfo, - get_filename_from_headers, - respond_404, - respond_with_responder, -) -from .config_resource import MediaConfigResource -from .download_resource import DownloadResource -from .filepath import MediaFilePaths -from .media_storage import MediaStorage -from .preview_url_resource import PreviewUrlResource -from .storage_provider import StorageProviderWrapper -from .thumbnail_resource import ThumbnailResource -from .thumbnailer import Thumbnailer, ThumbnailError -from .upload_resource import UploadResource - -if TYPE_CHECKING: - from synapse.server import HomeServer - -logger = logging.getLogger(__name__) - -# How often to run the background job to update the "recently accessed" -# attribute of local and remote media. -UPDATE_RECENTLY_ACCESSED_TS = 60 * 1000 # 1 minute -# How often to run the background job to check for local and remote media -# that should be purged according to the configured media retention settings. -MEDIA_RETENTION_CHECK_PERIOD_MS = 60 * 60 * 1000 # 1 hour - - -class MediaRepository: - def __init__(self, hs: "HomeServer"): - self.hs = hs - self.auth = hs.get_auth() - self.client = hs.get_federation_http_client() - self.clock = hs.get_clock() - self.server_name = hs.hostname - self.store = hs.get_datastores().main - self.max_upload_size = hs.config.media.max_upload_size - self.max_image_pixels = hs.config.media.max_image_pixels - - Thumbnailer.set_limits(self.max_image_pixels) - - self.primary_base_path: str = hs.config.media.media_store_path - self.filepaths: MediaFilePaths = MediaFilePaths(self.primary_base_path) - - self.dynamic_thumbnails = hs.config.media.dynamic_thumbnails - self.thumbnail_requirements = hs.config.media.thumbnail_requirements - - self.remote_media_linearizer = Linearizer(name="media_remote") - - self.recently_accessed_remotes: Set[Tuple[str, str]] = set() - self.recently_accessed_locals: Set[str] = set() - - self.federation_domain_whitelist = ( - hs.config.federation.federation_domain_whitelist - ) - - # List of StorageProviders where we should search for media and - # potentially upload to. 
- storage_providers = [] - - for ( - clz, - provider_config, - wrapper_config, - ) in hs.config.media.media_storage_providers: - backend = clz(hs, provider_config) - provider = StorageProviderWrapper( - backend, - store_local=wrapper_config.store_local, - store_remote=wrapper_config.store_remote, - store_synchronous=wrapper_config.store_synchronous, - ) - storage_providers.append(provider) - - self.media_storage = MediaStorage( - self.hs, self.primary_base_path, self.filepaths, storage_providers - ) - - self.clock.looping_call( - self._start_update_recently_accessed, UPDATE_RECENTLY_ACCESSED_TS - ) - - # Media retention configuration options - self._media_retention_local_media_lifetime_ms = ( - hs.config.media.media_retention_local_media_lifetime_ms - ) - self._media_retention_remote_media_lifetime_ms = ( - hs.config.media.media_retention_remote_media_lifetime_ms - ) - - # Check whether local or remote media retention is configured - if ( - hs.config.media.media_retention_local_media_lifetime_ms is not None - or hs.config.media.media_retention_remote_media_lifetime_ms is not None - ): - # Run the background job to apply media retention rules routinely, - # with the duration between runs dictated by the homeserver config. - self.clock.looping_call( - self._start_apply_media_retention_rules, - MEDIA_RETENTION_CHECK_PERIOD_MS, - ) - - def _start_update_recently_accessed(self) -> Deferred: - return run_as_background_process( - "update_recently_accessed_media", self._update_recently_accessed - ) - - def _start_apply_media_retention_rules(self) -> Deferred: - return run_as_background_process( - "apply_media_retention_rules", self._apply_media_retention_rules - ) - - async def _update_recently_accessed(self) -> None: - remote_media = self.recently_accessed_remotes - self.recently_accessed_remotes = set() - - local_media = self.recently_accessed_locals - self.recently_accessed_locals = set() - - await self.store.update_cached_last_access_time( - local_media, remote_media, self.clock.time_msec() - ) - - def mark_recently_accessed(self, server_name: Optional[str], media_id: str) -> None: - """Mark the given media as recently accessed. - - Args: - server_name: Origin server of media, or None if local - media_id: The media ID of the content - """ - if server_name: - self.recently_accessed_remotes.add((server_name, media_id)) - else: - self.recently_accessed_locals.add(media_id) - - async def create_content( - self, - media_type: str, - upload_name: Optional[str], - content: IO, - content_length: int, - auth_user: UserID, - ) -> MXCUri: - """Store uploaded content for a local user and return the mxc URL - - Args: - media_type: The content type of the file. - upload_name: The name of the file, if provided. 
- content: A file like object that is the content to store - content_length: The length of the content - auth_user: The user_id of the uploader - - Returns: - The mxc url of the stored content - """ - - media_id = random_string(24) - - file_info = FileInfo(server_name=None, file_id=media_id) - - fname = await self.media_storage.store_file(content, file_info) - - logger.info("Stored local media in file %r", fname) - - await self.store.store_local_media( - media_id=media_id, - media_type=media_type, - time_now_ms=self.clock.time_msec(), - upload_name=upload_name, - media_length=content_length, - user_id=auth_user, - ) - - await self._generate_thumbnails(None, media_id, media_id, media_type) - - return MXCUri(self.server_name, media_id) - - async def get_local_media( - self, request: SynapseRequest, media_id: str, name: Optional[str] - ) -> None: - """Responds to requests for local media, if exists, or returns 404. - - Args: - request: The incoming request. - media_id: The media ID of the content. (This is the same as - the file_id for local content.) - name: Optional name that, if specified, will be used as - the filename in the Content-Disposition header of the response. - - Returns: - Resolves once a response has successfully been written to request - """ - media_info = await self.store.get_local_media(media_id) - if not media_info or media_info["quarantined_by"]: - respond_404(request) - return - - self.mark_recently_accessed(None, media_id) - - media_type = media_info["media_type"] - if not media_type: - media_type = "application/octet-stream" - media_length = media_info["media_length"] - upload_name = name if name else media_info["upload_name"] - url_cache = media_info["url_cache"] - - file_info = FileInfo(None, media_id, url_cache=bool(url_cache)) - - responder = await self.media_storage.fetch_media(file_info) - await respond_with_responder( - request, responder, media_type, media_length, upload_name - ) - - async def get_remote_media( - self, - request: SynapseRequest, - server_name: str, - media_id: str, - name: Optional[str], - ) -> None: - """Respond to requests for remote media. - - Args: - request: The incoming request. - server_name: Remote server_name where the media originated. - media_id: The media ID of the content (as defined by the remote server). - name: Optional name that, if specified, will be used as - the filename in the Content-Disposition header of the response. - - Returns: - Resolves once a response has successfully been written to request - """ - if ( - self.federation_domain_whitelist is not None - and server_name not in self.federation_domain_whitelist - ): - raise FederationDeniedError(server_name) - - self.mark_recently_accessed(server_name, media_id) - - # We linearize here to ensure that we don't try and download remote - # media multiple times concurrently - key = (server_name, media_id) - async with self.remote_media_linearizer.queue(key): - responder, media_info = await self._get_remote_media_impl( - server_name, media_id - ) - - # We deliberately stream the file outside the lock - if responder: - media_type = media_info["media_type"] - media_length = media_info["media_length"] - upload_name = name if name else media_info["upload_name"] - await respond_with_responder( - request, responder, media_type, media_length, upload_name - ) - else: - respond_404(request) - - async def get_remote_media_info(self, server_name: str, media_id: str) -> dict: - """Gets the media info associated with the remote file, downloading - if necessary. 
- - Args: - server_name: Remote server_name where the media originated. - media_id: The media ID of the content (as defined by the remote server). - - Returns: - The media info of the file - """ - if ( - self.federation_domain_whitelist is not None - and server_name not in self.federation_domain_whitelist - ): - raise FederationDeniedError(server_name) - - # We linearize here to ensure that we don't try and download remote - # media multiple times concurrently - key = (server_name, media_id) - async with self.remote_media_linearizer.queue(key): - responder, media_info = await self._get_remote_media_impl( - server_name, media_id - ) - - # Ensure we actually use the responder so that it releases resources - if responder: - with responder: - pass - - return media_info - - async def _get_remote_media_impl( - self, server_name: str, media_id: str - ) -> Tuple[Optional[Responder], dict]: - """Looks for media in local cache, if not there then attempt to - download from remote server. - - Args: - server_name: Remote server_name where the media originated. - media_id: The media ID of the content (as defined by the - remote server). - - Returns: - A tuple of responder and the media info of the file. - """ - media_info = await self.store.get_cached_remote_media(server_name, media_id) - - # file_id is the ID we use to track the file locally. If we've already - # seen the file then reuse the existing ID, otherwise generate a new - # one. - - # If we have an entry in the DB, try and look for it - if media_info: - file_id = media_info["filesystem_id"] - file_info = FileInfo(server_name, file_id) - - if media_info["quarantined_by"]: - logger.info("Media is quarantined") - raise NotFoundError() - - if not media_info["media_type"]: - media_info["media_type"] = "application/octet-stream" - - responder = await self.media_storage.fetch_media(file_info) - if responder: - return responder, media_info - - # Failed to find the file anywhere, lets download it. - - try: - media_info = await self._download_remote_file( - server_name, - media_id, - ) - except SynapseError: - raise - except Exception as e: - # An exception may be because we downloaded media in another - # process, so let's check if we magically have the media. - media_info = await self.store.get_cached_remote_media(server_name, media_id) - if not media_info: - raise e - - file_id = media_info["filesystem_id"] - if not media_info["media_type"]: - media_info["media_type"] = "application/octet-stream" - file_info = FileInfo(server_name, file_id) - - # We generate thumbnails even if another process downloaded the media - # as a) it's conceivable that the other download request dies before it - # generates thumbnails, but mainly b) we want to be sure the thumbnails - # have finished being generated before responding to the client, - # otherwise they'll request thumbnails and get a 404 if they're not - # ready yet. - await self._generate_thumbnails( - server_name, media_id, file_id, media_info["media_type"] - ) - - responder = await self.media_storage.fetch_media(file_info) - return responder, media_info - - async def _download_remote_file( - self, - server_name: str, - media_id: str, - ) -> dict: - """Attempt to download the remote file from the given server name, - using the given file_id as the local id. - - Args: - server_name: Originating server - media_id: The media ID of the content (as defined by the - remote server). This is different than the file_id, which is - locally generated. - file_id: Local file ID - - Returns: - The media info of the file. 
- """ - - file_id = random_string(24) - - file_info = FileInfo(server_name=server_name, file_id=file_id) - - with self.media_storage.store_into_file(file_info) as (f, fname, finish): - request_path = "/".join( - ("/_matrix/media/r0/download", server_name, media_id) - ) - try: - length, headers = await self.client.get_file( - server_name, - request_path, - output_stream=f, - max_size=self.max_upload_size, - args={ - # tell the remote server to 404 if it doesn't - # recognise the server_name, to make sure we don't - # end up with a routing loop. - "allow_remote": "false" - }, - ) - except RequestSendFailed as e: - logger.warning( - "Request failed fetching remote media %s/%s: %r", - server_name, - media_id, - e, - ) - raise SynapseError(502, "Failed to fetch remote media") - - except HttpResponseException as e: - logger.warning( - "HTTP error fetching remote media %s/%s: %s", - server_name, - media_id, - e.response, - ) - if e.code == twisted.web.http.NOT_FOUND: - raise e.to_synapse_error() - raise SynapseError(502, "Failed to fetch remote media") - - except SynapseError: - logger.warning( - "Failed to fetch remote media %s/%s", server_name, media_id - ) - raise - except NotRetryingDestination: - logger.warning("Not retrying destination %r", server_name) - raise SynapseError(502, "Failed to fetch remote media") - except Exception: - logger.exception( - "Failed to fetch remote media %s/%s", server_name, media_id - ) - raise SynapseError(502, "Failed to fetch remote media") - - await finish() - - if b"Content-Type" in headers: - media_type = headers[b"Content-Type"][0].decode("ascii") - else: - media_type = "application/octet-stream" - upload_name = get_filename_from_headers(headers) - time_now_ms = self.clock.time_msec() - - # Multiple remote media download requests can race (when using - # multiple media repos), so this may throw a violation constraint - # exception. If it does we'll delete the newly downloaded file from - # disk (as we're in the ctx manager). - # - # However: we've already called `finish()` so we may have also - # written to the storage providers. This is preferable to the - # alternative where we call `finish()` *after* this, where we could - # end up having an entry in the DB but fail to write the files to - # the storage providers. 
- await self.store.store_cached_remote_media( - origin=server_name, - media_id=media_id, - media_type=media_type, - time_now_ms=self.clock.time_msec(), - upload_name=upload_name, - media_length=length, - filesystem_id=file_id, - ) - - logger.info("Stored remote media in file %r", fname) - - media_info = { - "media_type": media_type, - "media_length": length, - "upload_name": upload_name, - "created_ts": time_now_ms, - "filesystem_id": file_id, - } - - return media_info - - def _get_thumbnail_requirements( - self, media_type: str - ) -> Tuple[ThumbnailRequirement, ...]: - scpos = media_type.find(";") - if scpos > 0: - media_type = media_type[:scpos] - return self.thumbnail_requirements.get(media_type, ()) - - def _generate_thumbnail( - self, - thumbnailer: Thumbnailer, - t_width: int, - t_height: int, - t_method: str, - t_type: str, - ) -> Optional[BytesIO]: - m_width = thumbnailer.width - m_height = thumbnailer.height - - if m_width * m_height >= self.max_image_pixels: - logger.info( - "Image too large to thumbnail %r x %r > %r", - m_width, - m_height, - self.max_image_pixels, - ) - return None - - if thumbnailer.transpose_method is not None: - m_width, m_height = thumbnailer.transpose() - - if t_method == "crop": - return thumbnailer.crop(t_width, t_height, t_type) - elif t_method == "scale": - t_width, t_height = thumbnailer.aspect(t_width, t_height) - t_width = min(m_width, t_width) - t_height = min(m_height, t_height) - return thumbnailer.scale(t_width, t_height, t_type) - - return None - - async def generate_local_exact_thumbnail( - self, - media_id: str, - t_width: int, - t_height: int, - t_method: str, - t_type: str, - url_cache: bool, - ) -> Optional[str]: - input_path = await self.media_storage.ensure_media_is_in_local_cache( - FileInfo(None, media_id, url_cache=url_cache) - ) - - try: - thumbnailer = Thumbnailer(input_path) - except ThumbnailError as e: - logger.warning( - "Unable to generate a thumbnail for local media %s using a method of %s and type of %s: %s", - media_id, - t_method, - t_type, - e, - ) - return None - - with thumbnailer: - t_byte_source = await defer_to_thread( - self.hs.get_reactor(), - self._generate_thumbnail, - thumbnailer, - t_width, - t_height, - t_method, - t_type, - ) - - if t_byte_source: - try: - file_info = FileInfo( - server_name=None, - file_id=media_id, - url_cache=url_cache, - thumbnail=ThumbnailInfo( - width=t_width, - height=t_height, - method=t_method, - type=t_type, - ), - ) - - output_path = await self.media_storage.store_file( - t_byte_source, file_info - ) - finally: - t_byte_source.close() - - logger.info("Stored thumbnail in file %r", output_path) - - t_len = os.path.getsize(output_path) - - await self.store.store_local_thumbnail( - media_id, t_width, t_height, t_type, t_method, t_len - ) - - return output_path - - # Could not generate thumbnail. 
- return None - - async def generate_remote_exact_thumbnail( - self, - server_name: str, - file_id: str, - media_id: str, - t_width: int, - t_height: int, - t_method: str, - t_type: str, - ) -> Optional[str]: - input_path = await self.media_storage.ensure_media_is_in_local_cache( - FileInfo(server_name, file_id) - ) - - try: - thumbnailer = Thumbnailer(input_path) - except ThumbnailError as e: - logger.warning( - "Unable to generate a thumbnail for remote media %s from %s using a method of %s and type of %s: %s", - media_id, - server_name, - t_method, - t_type, - e, - ) - return None - - with thumbnailer: - t_byte_source = await defer_to_thread( - self.hs.get_reactor(), - self._generate_thumbnail, - thumbnailer, - t_width, - t_height, - t_method, - t_type, - ) - - if t_byte_source: - try: - file_info = FileInfo( - server_name=server_name, - file_id=file_id, - thumbnail=ThumbnailInfo( - width=t_width, - height=t_height, - method=t_method, - type=t_type, - ), - ) - - output_path = await self.media_storage.store_file( - t_byte_source, file_info - ) - finally: - t_byte_source.close() - - logger.info("Stored thumbnail in file %r", output_path) - - t_len = os.path.getsize(output_path) - - await self.store.store_remote_media_thumbnail( - server_name, - media_id, - file_id, - t_width, - t_height, - t_type, - t_method, - t_len, - ) - - return output_path - - # Could not generate thumbnail. - return None - - async def _generate_thumbnails( - self, - server_name: Optional[str], - media_id: str, - file_id: str, - media_type: str, - url_cache: bool = False, - ) -> Optional[dict]: - """Generate and store thumbnails for an image. - - Args: - server_name: The server name if remote media, else None if local - media_id: The media ID of the content. (This is the same as - the file_id for local content) - file_id: Local file ID - media_type: The content type of the file - url_cache: If we are thumbnailing images downloaded for the URL cache, - used exclusively by the url previewer - - Returns: - Dict with "width" and "height" keys of original image or None if the - media cannot be thumbnailed. - """ - requirements = self._get_thumbnail_requirements(media_type) - if not requirements: - return None - - input_path = await self.media_storage.ensure_media_is_in_local_cache( - FileInfo(server_name, file_id, url_cache=url_cache) - ) - - try: - thumbnailer = Thumbnailer(input_path) - except ThumbnailError as e: - logger.warning( - "Unable to generate thumbnails for remote media %s from %s of type %s: %s", - media_id, - server_name, - media_type, - e, - ) - return None - - with thumbnailer: - m_width = thumbnailer.width - m_height = thumbnailer.height - - if m_width * m_height >= self.max_image_pixels: - logger.info( - "Image too large to thumbnail %r x %r > %r", - m_width, - m_height, - self.max_image_pixels, - ) - return None - - if thumbnailer.transpose_method is not None: - m_width, m_height = await defer_to_thread( - self.hs.get_reactor(), thumbnailer.transpose - ) - - # We deduplicate the thumbnail sizes by ignoring the cropped versions if - # they have the same dimensions of a scaled one. 
- thumbnails: Dict[Tuple[int, int, str], str] = {} - for requirement in requirements: - if requirement.method == "crop": - thumbnails.setdefault( - (requirement.width, requirement.height, requirement.media_type), - requirement.method, - ) - elif requirement.method == "scale": - t_width, t_height = thumbnailer.aspect( - requirement.width, requirement.height - ) - t_width = min(m_width, t_width) - t_height = min(m_height, t_height) - thumbnails[ - (t_width, t_height, requirement.media_type) - ] = requirement.method - - # Now we generate the thumbnails for each dimension, store it - for (t_width, t_height, t_type), t_method in thumbnails.items(): - # Generate the thumbnail - if t_method == "crop": - t_byte_source = await defer_to_thread( - self.hs.get_reactor(), - thumbnailer.crop, - t_width, - t_height, - t_type, - ) - elif t_method == "scale": - t_byte_source = await defer_to_thread( - self.hs.get_reactor(), - thumbnailer.scale, - t_width, - t_height, - t_type, - ) - else: - logger.error("Unrecognized method: %r", t_method) - continue - - if not t_byte_source: - continue - - file_info = FileInfo( - server_name=server_name, - file_id=file_id, - url_cache=url_cache, - thumbnail=ThumbnailInfo( - width=t_width, - height=t_height, - method=t_method, - type=t_type, - ), - ) - - with self.media_storage.store_into_file(file_info) as ( - f, - fname, - finish, - ): - try: - await self.media_storage.write_to_file(t_byte_source, f) - await finish() - finally: - t_byte_source.close() - - t_len = os.path.getsize(fname) - - # Write to database - if server_name: - # Multiple remote media download requests can race (when - # using multiple media repos), so this may throw a violation - # constraint exception. If it does we'll delete the newly - # generated thumbnail from disk (as we're in the ctx - # manager). - # - # However: we've already called `finish()` so we may have - # also written to the storage providers. This is preferable - # to the alternative where we call `finish()` *after* this, - # where we could end up having an entry in the DB but fail - # to write the files to the storage providers. - try: - await self.store.store_remote_media_thumbnail( - server_name, - media_id, - file_id, - t_width, - t_height, - t_type, - t_method, - t_len, - ) - except Exception as e: - thumbnail_exists = ( - await self.store.get_remote_media_thumbnail( - server_name, - media_id, - t_width, - t_height, - t_type, - ) - ) - if not thumbnail_exists: - raise e - else: - await self.store.store_local_thumbnail( - media_id, t_width, t_height, t_type, t_method, t_len - ) - - return {"width": m_width, "height": m_height} - - async def _apply_media_retention_rules(self) -> None: - """ - Purge old local and remote media according to the media retention rules - defined in the homeserver config. - """ - # Purge remote media - if self._media_retention_remote_media_lifetime_ms is not None: - # Calculate a threshold timestamp derived from the configured lifetime. Any - # media that has not been accessed since this timestamp will be removed. 
- remote_media_threshold_timestamp_ms = (
- self.clock.time_msec() - self._media_retention_remote_media_lifetime_ms
- )
-
- logger.info(
- "Purging remote media last accessed before"
- f" {remote_media_threshold_timestamp_ms}"
- )
-
- await self.delete_old_remote_media(
- before_ts=remote_media_threshold_timestamp_ms
- )
-
- # And now do the same for local media
- if self._media_retention_local_media_lifetime_ms is not None:
- # This works the same as the remote media threshold
- local_media_threshold_timestamp_ms = (
- self.clock.time_msec() - self._media_retention_local_media_lifetime_ms
- )
-
- logger.info(
- "Purging local media last accessed before"
- f" {local_media_threshold_timestamp_ms}"
- )
-
- await self.delete_old_local_media(
- before_ts=local_media_threshold_timestamp_ms,
- keep_profiles=True,
- delete_quarantined_media=False,
- delete_protected_media=False,
- )
-
- async def delete_old_remote_media(self, before_ts: int) -> Dict[str, int]:
- old_media = await self.store.get_remote_media_ids(
- before_ts, include_quarantined_media=False
- )
-
- deleted = 0
-
- for media in old_media:
- origin = media["media_origin"]
- media_id = media["media_id"]
- file_id = media["filesystem_id"]
- key = (origin, media_id)
-
- logger.info("Deleting: %r", key)
-
- # TODO: Should we delete from the backup store?
-
- async with self.remote_media_linearizer.queue(key):
- full_path = self.filepaths.remote_media_filepath(origin, file_id)
- try:
- os.remove(full_path)
- except OSError as e:
- logger.warning("Failed to remove file: %r", full_path)
- if e.errno == errno.ENOENT:
- pass
- else:
- continue
-
- thumbnail_dir = self.filepaths.remote_media_thumbnail_dir(
- origin, file_id
- )
- shutil.rmtree(thumbnail_dir, ignore_errors=True)
-
- await self.store.delete_remote_media(origin, media_id)
- deleted += 1
-
- return {"deleted": deleted}
-
- async def delete_local_media_ids(
- self, media_ids: List[str]
- ) -> Tuple[List[str], int]:
- """
- Delete the given local media IDs from this server.
-
- Args:
- media_ids: The media IDs to delete.
- Returns:
- A tuple of (list of deleted media IDs, total deleted media IDs).
- """
- return await self._remove_local_media_from_disk(media_ids)
-
- async def delete_old_local_media(
- self,
- before_ts: int,
- size_gt: int = 0,
- keep_profiles: bool = True,
- delete_quarantined_media: bool = False,
- delete_protected_media: bool = False,
- ) -> Tuple[List[str], int]:
- """
- Delete local media from this server by size and timestamp. Removes
- media files, any thumbnails and cached URLs.
-
- Args:
- before_ts: Unix timestamp in ms.
- Files that were last used before this timestamp will be deleted.
- size_gt: Size of the media in bytes. Files that are larger will be deleted.
- keep_profiles: If True, files that are still used in image data
- (e.g. user profiles, room avatars) will be kept; if False, they
- will be deleted too.
- delete_quarantined_media: If True, media marked as quarantined will be deleted.
- delete_protected_media: If True, media marked as protected will be deleted.
-
- Returns:
- A tuple of (list of deleted media IDs, total deleted media IDs).
- """ - old_media = await self.store.get_local_media_ids( - before_ts, - size_gt, - keep_profiles, - include_quarantined_media=delete_quarantined_media, - include_protected_media=delete_protected_media, - ) - return await self._remove_local_media_from_disk(old_media) - - async def _remove_local_media_from_disk( - self, media_ids: List[str] - ) -> Tuple[List[str], int]: - """ - Delete local or remote media from this server. Removes media files, - any thumbnails and cached URLs. - - Args: - media_ids: List of media_id to delete - Returns: - A tuple of (list of deleted media IDs, total deleted media IDs). - """ - removed_media = [] - for media_id in media_ids: - logger.info("Deleting media with ID '%s'", media_id) - full_path = self.filepaths.local_media_filepath(media_id) - try: - os.remove(full_path) - except OSError as e: - logger.warning("Failed to remove file: %r: %s", full_path, e) - if e.errno == errno.ENOENT: - pass - else: - continue - - thumbnail_dir = self.filepaths.local_media_thumbnail_dir(media_id) - shutil.rmtree(thumbnail_dir, ignore_errors=True) - - await self.store.delete_remote_media(self.server_name, media_id) - - await self.store.delete_url_cache((media_id,)) - await self.store.delete_url_cache_media((media_id,)) - - removed_media.append(media_id) - - return removed_media, len(removed_media) - - -class MediaRepositoryResource(UnrecognizedRequestResource): - """File uploading and downloading. - - Uploads are POSTed to a resource which returns a token which is used to GET - the download:: - - => POST /_matrix/media/r0/upload HTTP/1.1 - Content-Type: <media-type> - Content-Length: <content-length> - - <media> - - <= HTTP/1.1 200 OK - Content-Type: application/json - - { "content_uri": "mxc://<server-name>/<media-id>" } - - => GET /_matrix/media/r0/download/<server-name>/<media-id> HTTP/1.1 - - <= HTTP/1.1 200 OK - Content-Type: <media-type> - Content-Disposition: attachment;filename=<upload-filename> - - <media> - - Clients can get thumbnails by supplying a desired width and height and - thumbnailing method:: - - => GET /_matrix/media/r0/thumbnail/<server_name> - /<media-id>?width=<w>&height=<h>&method=<m> HTTP/1.1 - - <= HTTP/1.1 200 OK - Content-Type: image/jpeg or image/png - - <thumbnail> - - The thumbnail methods are "crop" and "scale". "scale" tries to return an - image where either the width or the height is smaller than the requested - size. The client should then scale and letterbox the image if it needs to - fit within a given rectangle. "crop" tries to return an image where the - width and height are close to the requested size and the aspect matches - the requested size. The client should scale the image if it needs to fit - within a given rectangle. - """ - - def __init__(self, hs: "HomeServer"): - # If we're not configured to use it, raise if we somehow got here. 
- if not hs.config.media.can_load_media_repo: - raise ConfigError("Synapse is not configured to use a media repo.") - - super().__init__() - media_repo = hs.get_media_repository() - - self.putChild(b"upload", UploadResource(hs, media_repo)) - self.putChild(b"download", DownloadResource(hs, media_repo)) - self.putChild( - b"thumbnail", ThumbnailResource(hs, media_repo, media_repo.media_storage) - ) - if hs.config.media.url_preview_enabled: - self.putChild( - b"preview_url", - PreviewUrlResource(hs, media_repo, media_repo.media_storage), - ) - self.putChild(b"config", MediaConfigResource(hs)) diff --git a/synapse/rest/media/v1/media_storage.py b/synapse/rest/media/v1/media_storage.py index db25848744..11b0e8e231 100644 --- a/synapse/rest/media/v1/media_storage.py +++ b/synapse/rest/media/v1/media_storage.py @@ -1,4 +1,4 @@ -# Copyright 2018-2021 The Matrix.org Foundation C.I.C. +# Copyright 2023 The Matrix.org Foundation C.I.C. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -11,364 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -import contextlib -import logging -import os -import shutil -from types import TracebackType -from typing import ( - IO, - TYPE_CHECKING, - Any, - Awaitable, - BinaryIO, - Callable, - Generator, - Optional, - Sequence, - Tuple, - Type, -) - -import attr - -from twisted.internet.defer import Deferred -from twisted.internet.interfaces import IConsumer -from twisted.protocols.basic import FileSender - -import synapse -from synapse.api.errors import NotFoundError -from synapse.logging.context import defer_to_thread, make_deferred_yieldable -from synapse.util import Clock -from synapse.util.file_consumer import BackgroundFileConsumer - -from ._base import FileInfo, Responder -from .filepath import MediaFilePaths - -if TYPE_CHECKING: - from synapse.rest.media.v1.storage_provider import StorageProvider - from synapse.server import HomeServer - -logger = logging.getLogger(__name__) - - -class MediaStorage: - """Responsible for storing/fetching files from local sources. - - Args: - hs - local_media_directory: Base path where we store media on disk - filepaths - storage_providers: List of StorageProvider that are used to fetch and store files. 
- """ - - def __init__( - self, - hs: "HomeServer", - local_media_directory: str, - filepaths: MediaFilePaths, - storage_providers: Sequence["StorageProvider"], - ): - self.hs = hs - self.reactor = hs.get_reactor() - self.local_media_directory = local_media_directory - self.filepaths = filepaths - self.storage_providers = storage_providers - self.spam_checker = hs.get_spam_checker() - self.clock = hs.get_clock() - - async def store_file(self, source: IO, file_info: FileInfo) -> str: - """Write `source` to the on disk media store, and also any other - configured storage providers - - Args: - source: A file like object that should be written - file_info: Info about the file to store - - Returns: - the file path written to in the primary media store - """ - - with self.store_into_file(file_info) as (f, fname, finish_cb): - # Write to the main repository - await self.write_to_file(source, f) - await finish_cb() - - return fname - - async def write_to_file(self, source: IO, output: IO) -> None: - """Asynchronously write the `source` to `output`.""" - await defer_to_thread(self.reactor, _write_file_synchronously, source, output) - - @contextlib.contextmanager - def store_into_file( - self, file_info: FileInfo - ) -> Generator[Tuple[BinaryIO, str, Callable[[], Awaitable[None]]], None, None]: - """Context manager used to get a file like object to write into, as - described by file_info. - - Actually yields a 3-tuple (file, fname, finish_cb), where file is a file - like object that can be written to, fname is the absolute path of file - on disk, and finish_cb is a function that returns an awaitable. - - fname can be used to read the contents from after upload, e.g. to - generate thumbnails. - - finish_cb must be called and waited on after the file has been - successfully been written to. Should not be called if there was an - error. - - Args: - file_info: Info about the file to store - - Example: - - with media_storage.store_into_file(info) as (f, fname, finish_cb): - # .. write into f ... - await finish_cb() - """ - - path = self._file_info_to_path(file_info) - fname = os.path.join(self.local_media_directory, path) - - dirname = os.path.dirname(fname) - os.makedirs(dirname, exist_ok=True) - - finished_called = [False] - - try: - with open(fname, "wb") as f: - - async def finish() -> None: - # Ensure that all writes have been flushed and close the - # file. - f.flush() - f.close() - - spam_check = await self.spam_checker.check_media_file_for_spam( - ReadableFileWrapper(self.clock, fname), file_info - ) - if spam_check != synapse.module_api.NOT_SPAM: - logger.info("Blocking media due to spam checker") - # Note that we'll delete the stored media, due to the - # try/except below. The media also won't be stored in - # the DB. - # We currently ignore any additional field returned by - # the spam-check API. - raise SpamMediaException(errcode=spam_check[0]) - - for provider in self.storage_providers: - await provider.store_file(path, file_info) - - finished_called[0] = True - - yield f, fname, finish - except Exception as e: - try: - os.remove(fname) - except Exception: - pass - - raise e from None - - if not finished_called: - raise Exception("Finished callback not called") - - async def fetch_media(self, file_info: FileInfo) -> Optional[Responder]: - """Attempts to fetch media described by file_info from the local cache - and configured storage providers. - - Args: - file_info - - Returns: - Returns a Responder if the file was found, otherwise None. 
- """ - paths = [self._file_info_to_path(file_info)] - - # fallback for remote thumbnails with no method in the filename - if file_info.thumbnail and file_info.server_name: - paths.append( - self.filepaths.remote_media_thumbnail_rel_legacy( - server_name=file_info.server_name, - file_id=file_info.file_id, - width=file_info.thumbnail.width, - height=file_info.thumbnail.height, - content_type=file_info.thumbnail.type, - ) - ) - - for path in paths: - local_path = os.path.join(self.local_media_directory, path) - if os.path.exists(local_path): - logger.debug("responding with local file %s", local_path) - return FileResponder(open(local_path, "rb")) - logger.debug("local file %s did not exist", local_path) - - for provider in self.storage_providers: - for path in paths: - res: Any = await provider.fetch(path, file_info) - if res: - logger.debug("Streaming %s from %s", path, provider) - return res - logger.debug("%s not found on %s", path, provider) - - return None - - async def ensure_media_is_in_local_cache(self, file_info: FileInfo) -> str: - """Ensures that the given file is in the local cache. Attempts to - download it from storage providers if it isn't. - - Args: - file_info - - Returns: - Full path to local file - """ - path = self._file_info_to_path(file_info) - local_path = os.path.join(self.local_media_directory, path) - if os.path.exists(local_path): - return local_path - - # Fallback for paths without method names - # Should be removed in the future - if file_info.thumbnail and file_info.server_name: - legacy_path = self.filepaths.remote_media_thumbnail_rel_legacy( - server_name=file_info.server_name, - file_id=file_info.file_id, - width=file_info.thumbnail.width, - height=file_info.thumbnail.height, - content_type=file_info.thumbnail.type, - ) - legacy_local_path = os.path.join(self.local_media_directory, legacy_path) - if os.path.exists(legacy_local_path): - return legacy_local_path - - dirname = os.path.dirname(local_path) - os.makedirs(dirname, exist_ok=True) - - for provider in self.storage_providers: - res: Any = await provider.fetch(path, file_info) - if res: - with res: - consumer = BackgroundFileConsumer( - open(local_path, "wb"), self.reactor - ) - await res.write_to_consumer(consumer) - await consumer.wait() - return local_path - - raise NotFoundError() - - def _file_info_to_path(self, file_info: FileInfo) -> str: - """Converts file_info into a relative path. - - The path is suitable for storing files under a directory, e.g. used to - store files on local FS under the base media repository directory. 
- """ - if file_info.url_cache: - if file_info.thumbnail: - return self.filepaths.url_cache_thumbnail_rel( - media_id=file_info.file_id, - width=file_info.thumbnail.width, - height=file_info.thumbnail.height, - content_type=file_info.thumbnail.type, - method=file_info.thumbnail.method, - ) - return self.filepaths.url_cache_filepath_rel(file_info.file_id) - - if file_info.server_name: - if file_info.thumbnail: - return self.filepaths.remote_media_thumbnail_rel( - server_name=file_info.server_name, - file_id=file_info.file_id, - width=file_info.thumbnail.width, - height=file_info.thumbnail.height, - content_type=file_info.thumbnail.type, - method=file_info.thumbnail.method, - ) - return self.filepaths.remote_media_filepath_rel( - file_info.server_name, file_info.file_id - ) - - if file_info.thumbnail: - return self.filepaths.local_media_thumbnail_rel( - media_id=file_info.file_id, - width=file_info.thumbnail.width, - height=file_info.thumbnail.height, - content_type=file_info.thumbnail.type, - method=file_info.thumbnail.method, - ) - return self.filepaths.local_media_filepath_rel(file_info.file_id) - - -def _write_file_synchronously(source: IO, dest: IO) -> None: - """Write `source` to the file like `dest` synchronously. Should be called - from a thread. - - Args: - source: A file like object that's to be written - dest: A file like object to be written to - """ - source.seek(0) # Ensure we read from the start of the file - shutil.copyfileobj(source, dest) - - -class FileResponder(Responder): - """Wraps an open file that can be sent to a request. - - Args: - open_file: A file like object to be streamed ot the client, - is closed when finished streaming. - """ - - def __init__(self, open_file: IO): - self.open_file = open_file - - def write_to_consumer(self, consumer: IConsumer) -> Deferred: - return make_deferred_yieldable( - FileSender().beginFileTransfer(self.open_file, consumer) - ) - - def __exit__( - self, - exc_type: Optional[Type[BaseException]], - exc_val: Optional[BaseException], - exc_tb: Optional[TracebackType], - ) -> None: - self.open_file.close() - - -class SpamMediaException(NotFoundError): - """The media was blocked by a spam checker, so we simply 404 the request (in - the same way as if it was quarantined). - """ - - -@attr.s(slots=True, auto_attribs=True) -class ReadableFileWrapper: - """Wrapper that allows reading a file in chunks, yielding to the reactor, - and writing to a callback. - - This is simplified `FileSender` that takes an IO object rather than an - `IConsumer`. - """ - - CHUNK_SIZE = 2**14 - - clock: Clock - path: str - - async def write_chunks_to(self, callback: Callable[[bytes], object]) -> None: - """Reads the file in chunks and calls the callback with each chunk.""" - - with open(self.path, "rb") as file: - while True: - chunk = file.read(self.CHUNK_SIZE) - if not chunk: - break - - callback(chunk) +# - # We yield to the reactor by sleeping for 0 seconds. - await self.clock.sleep(0) +# This exists purely for backwards compatibility with spam checkers. +from synapse.media.media_storage import ReadableFileWrapper # noqa: F401 diff --git a/synapse/rest/media/v1/oembed.py b/synapse/rest/media/v1/oembed.py deleted file mode 100644 index 7592aa5d47..0000000000 --- a/synapse/rest/media/v1/oembed.py +++ /dev/null @@ -1,265 +0,0 @@ -# Copyright 2021 The Matrix.org Foundation C.I.C. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -import html -import logging -import urllib.parse -from typing import TYPE_CHECKING, List, Optional - -import attr - -from synapse.rest.media.v1.preview_html import parse_html_description -from synapse.types import JsonDict -from synapse.util import json_decoder - -if TYPE_CHECKING: - from lxml import etree - - from synapse.server import HomeServer - -logger = logging.getLogger(__name__) - - -@attr.s(slots=True, frozen=True, auto_attribs=True) -class OEmbedResult: - # The Open Graph result (converted from the oEmbed result). - open_graph_result: JsonDict - # The author_name of the oEmbed result - author_name: Optional[str] - # Number of milliseconds to cache the content, according to the oEmbed response. - # - # This will be None if no cache-age is provided in the oEmbed response (or - # if the oEmbed response cannot be turned into an Open Graph response). - cache_age: Optional[int] - - -class OEmbedProvider: - """ - A helper for accessing oEmbed content. - - It can be used to check if a URL should be accessed via oEmbed and for - requesting/parsing oEmbed content. - """ - - def __init__(self, hs: "HomeServer"): - self._oembed_patterns = {} - for oembed_endpoint in hs.config.oembed.oembed_patterns: - api_endpoint = oembed_endpoint.api_endpoint - - # Only JSON is supported at the moment. This could be declared in - # the formats field. Otherwise, if the endpoint ends in .xml assume - # it doesn't support JSON. - if ( - oembed_endpoint.formats is not None - and "json" not in oembed_endpoint.formats - ) or api_endpoint.endswith(".xml"): - logger.info( - "Ignoring oEmbed endpoint due to not supporting JSON: %s", - api_endpoint, - ) - continue - - # Iterate through each URL pattern and point it to the endpoint. - for pattern in oembed_endpoint.url_patterns: - self._oembed_patterns[pattern] = api_endpoint - - def get_oembed_url(self, url: str) -> Optional[str]: - """ - Check whether the URL should be downloaded as oEmbed content instead. - - Args: - url: The URL to check. - - Returns: - A URL to use instead or None if the original URL should be used. - """ - for url_pattern, endpoint in self._oembed_patterns.items(): - if url_pattern.fullmatch(url): - # TODO Specify max height / width. - - # Note that only the JSON format is supported, some endpoints want - # this in the URL, others want it as an argument. - endpoint = endpoint.replace("{format}", "json") - - args = {"url": url, "format": "json"} - query_str = urllib.parse.urlencode(args, True) - return f"{endpoint}?{query_str}" - - # No match. - return None - - def autodiscover_from_html(self, tree: "etree.Element") -> Optional[str]: - """ - Search an HTML document for oEmbed autodiscovery information. - - Args: - tree: The parsed HTML body. - - Returns: - The URL to use for oEmbed information, or None if no URL was found. - """ - # Search for link elements with the proper rel and type attributes. - for tag in tree.xpath( - "//link[@rel='alternate'][@type='application/json+oembed']" - ): - if "href" in tag.attrib: - return tag.attrib["href"] - - # Some providers (e.g. Flickr) use alternative instead of alternate. 
- for tag in tree.xpath( - "//link[@rel='alternative'][@type='application/json+oembed']" - ): - if "href" in tag.attrib: - return tag.attrib["href"] - - return None - - def parse_oembed_response(self, url: str, raw_body: bytes) -> OEmbedResult: - """ - Parse the oEmbed response into an Open Graph response. - - Args: - url: The URL which is being previewed (not the one which was - requested). - raw_body: The oEmbed response as JSON encoded as bytes. - - Returns: - json-encoded Open Graph data - """ - - try: - # oEmbed responses *must* be UTF-8 according to the spec. - oembed = json_decoder.decode(raw_body.decode("utf-8")) - except ValueError: - return OEmbedResult({}, None, None) - - # The version is a required string field, but not always provided, - # or sometimes provided as a float. Be lenient. - oembed_version = oembed.get("version", "1.0") - if oembed_version != "1.0" and oembed_version != 1: - return OEmbedResult({}, None, None) - - # Attempt to parse the cache age, if possible. - try: - cache_age = int(oembed.get("cache_age")) * 1000 - except (TypeError, ValueError): - # If the cache age cannot be parsed (e.g. wrong type or invalid - # string), ignore it. - cache_age = None - - # The oEmbed response converted to Open Graph. - open_graph_response: JsonDict = {"og:url": url} - - title = oembed.get("title") - if title and isinstance(title, str): - # A common WordPress plug-in seems to incorrectly escape entities - # in the oEmbed response. - open_graph_response["og:title"] = html.unescape(title) - - author_name = oembed.get("author_name") - if not isinstance(author_name, str): - author_name = None - - # Use the provider name and as the site. - provider_name = oembed.get("provider_name") - if provider_name and isinstance(provider_name, str): - open_graph_response["og:site_name"] = provider_name - - # If a thumbnail exists, use it. Note that dimensions will be calculated later. - thumbnail_url = oembed.get("thumbnail_url") - if thumbnail_url and isinstance(thumbnail_url, str): - open_graph_response["og:image"] = thumbnail_url - - # Process each type separately. - oembed_type = oembed.get("type") - if oembed_type == "rich": - html_str = oembed.get("html") - if isinstance(html_str, str): - calc_description_and_urls(open_graph_response, html_str) - - elif oembed_type == "photo": - # If this is a photo, use the full image, not the thumbnail. - url = oembed.get("url") - if url and isinstance(url, str): - open_graph_response["og:image"] = url - - elif oembed_type == "video": - open_graph_response["og:type"] = "video.other" - html_str = oembed.get("html") - if html_str and isinstance(html_str, str): - calc_description_and_urls(open_graph_response, oembed["html"]) - for size in ("width", "height"): - val = oembed.get(size) - if type(val) is int: - open_graph_response[f"og:video:{size}"] = val - - elif oembed_type == "link": - open_graph_response["og:type"] = "website" - - else: - logger.warning("Unknown oEmbed type: %s", oembed_type) - - return OEmbedResult(open_graph_response, author_name, cache_age) - - -def _fetch_urls(tree: "etree.Element", tag_name: str) -> List[str]: - results = [] - for tag in tree.xpath("//*/" + tag_name): - if "src" in tag.attrib: - results.append(tag.attrib["src"]) - return results - - -def calc_description_and_urls(open_graph_response: JsonDict, html_body: str) -> None: - """ - Calculate description for an HTML document. - - This uses lxml to convert the HTML document into plaintext. If errors - occur during processing of the document, an empty response is returned. 
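
A worked example of the oEmbed-to-Open-Graph mapping implemented in `parse_oembed_response` above (field values are illustrative):

oembed = {
    "version": "1.0",
    "type": "video",
    "title": "Cats &amp; Dogs",        # entities get unescaped via html.unescape()
    "provider_name": "YouTube",
    "thumbnail_url": "https://i.example.com/thumb.jpg",
    "html": "<iframe src='https://example.com/embed/1'></iframe>",
    "width": 1280,
    "height": 720,
    "cache_age": "3600",               # seconds; parsed and converted to ms
}

# Expected Open Graph output (before calc_description_and_urls runs on `html`):
# {
#     "og:url": "<the previewed URL>",
#     "og:title": "Cats & Dogs",
#     "og:site_name": "YouTube",
#     "og:image": "https://i.example.com/thumb.jpg",
#     "og:type": "video.other",
#     "og:video:width": 1280,
#     "og:video:height": 720,
# }
# ...with cache_age returned as 3_600_000 ms and author_name as None.
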
- - Args: - open_graph_response: The current Open Graph summary. This is updated with additional fields. - html_body: The HTML document, as bytes. - - Returns: - The summary - """ - # If there's no body, nothing useful is going to be found. - if not html_body: - return - - from lxml import etree - - # Create an HTML parser. If this fails, log and return no metadata. - parser = etree.HTMLParser(recover=True, encoding="utf-8") - - # Attempt to parse the body. If this fails, log and return no metadata. - tree = etree.fromstring(html_body, parser) - - # The data was successfully parsed, but no tree was found. - if tree is None: - return - - # Attempt to find interesting URLs (images, videos, embeds). - if "og:image" not in open_graph_response: - image_urls = _fetch_urls(tree, "img") - if image_urls: - open_graph_response["og:image"] = image_urls[0] - - video_urls = _fetch_urls(tree, "video") + _fetch_urls(tree, "embed") - if video_urls: - open_graph_response["og:video"] = video_urls[0] - - description = parse_html_description(tree) - if description: - open_graph_response["og:description"] = description diff --git a/synapse/rest/media/v1/preview_html.py b/synapse/rest/media/v1/preview_html.py deleted file mode 100644 index 516d0434f0..0000000000 --- a/synapse/rest/media/v1/preview_html.py +++ /dev/null @@ -1,501 +0,0 @@ -# Copyright 2021 The Matrix.org Foundation C.I.C. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -import codecs -import logging -import re -from typing import ( - TYPE_CHECKING, - Callable, - Dict, - Generator, - Iterable, - List, - Optional, - Set, - Union, -) - -if TYPE_CHECKING: - from lxml import etree - -logger = logging.getLogger(__name__) - -_charset_match = re.compile( - rb'<\s*meta[^>]*charset\s*=\s*"?([a-z0-9_-]+)"?', flags=re.I -) -_xml_encoding_match = re.compile( - rb'\s*<\s*\?\s*xml[^>]*encoding="([a-z0-9_-]+)"', flags=re.I -) -_content_type_match = re.compile(r'.*; *charset="?(.*?)"?(;|$)', flags=re.I) - -# Certain elements aren't meant for display. -ARIA_ROLES_TO_IGNORE = {"directory", "menu", "menubar", "toolbar"} - - -def _normalise_encoding(encoding: str) -> Optional[str]: - """Use the Python codec's name as the normalised entry.""" - try: - return codecs.lookup(encoding).name - except LookupError: - return None - - -def _get_html_media_encodings( - body: bytes, content_type: Optional[str] -) -> Iterable[str]: - """ - Get potential encoding of the body based on the (presumably) HTML body or the content-type header. - - The precedence used for finding a character encoding is: - - 1. <meta> tag with a charset declared. - 2. The XML document's character encoding attribute. - 3. The Content-Type header. - 4. Fallback to utf-8. - 5. Fallback to windows-1252. - - This roughly follows the algorithm used by BeautifulSoup's bs4.dammit.EncodingDetector. - - Args: - body: The HTML document, as bytes. - content_type: The Content-Type header. - - Returns: - The character encoding of the body, as a string. - """ - # There's no point in returning an encoding more than once. 
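
The precedence just described, by example (the function above normalises codec names via `codecs.lookup`, so ISO-8859-1 comes back as iso8859-1):

body = b'<?xml version="1.0" encoding="ISO-8859-1"?><html><body>hi</body></html>'
encodings = list(_get_html_media_encodings(body, 'text/html; charset="utf-16"'))
# No <meta charset> tag, so the XML declaration wins, then the header,
# then the two fallbacks:
assert encodings == ["iso8859-1", "utf-16", "utf-8", "cp1252"]
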
- attempted_encodings: Set[str] = set()
-
- # Limit searches to the first 1kb, since it ought to be at the top.
- body_start = body[:1024]
-
- # Check if it has an encoding set in a meta tag.
- match = _charset_match.search(body_start)
- if match:
- encoding = _normalise_encoding(match.group(1).decode("ascii"))
- if encoding:
- attempted_encodings.add(encoding)
- yield encoding
-
- # TODO Support <meta http-equiv="Content-Type" content="text/html; charset=utf-8"/>
-
- # Check if it has an XML document with an encoding.
- match = _xml_encoding_match.match(body_start)
- if match:
- encoding = _normalise_encoding(match.group(1).decode("ascii"))
- if encoding and encoding not in attempted_encodings:
- attempted_encodings.add(encoding)
- yield encoding
-
- # Check the HTTP Content-Type header for a character set.
- if content_type:
- content_match = _content_type_match.match(content_type)
- if content_match:
- encoding = _normalise_encoding(content_match.group(1))
- if encoding and encoding not in attempted_encodings:
- attempted_encodings.add(encoding)
- yield encoding
-
- # Finally, fallback to UTF-8, then windows-1252.
- for fallback in ("utf-8", "cp1252"):
- if fallback not in attempted_encodings:
- yield fallback
-
-
-def decode_body(
- body: bytes, uri: str, content_type: Optional[str] = None
-) -> Optional["etree.Element"]:
- """
- This uses lxml to parse the HTML document.
-
- Args:
- body: The HTML document, as bytes.
- uri: The URI used to download the body.
- content_type: The Content-Type header.
-
- Returns:
- The parsed HTML body, or None if an error occurred during processing.
- """
- # If there's no body, nothing useful is going to be found.
- if not body:
- return None
-
- # The idea here is that multiple encodings are tried until one works.
- # Unfortunately the result is never used and then LXML will decode the string
- # again with the found encoding.
- for encoding in _get_html_media_encodings(body, content_type):
- try:
- body.decode(encoding)
- except Exception:
- pass
- else:
- break
- else:
- logger.warning("Unable to decode HTML body for %s", uri)
- return None
-
- from lxml import etree
-
- # Create an HTML parser.
- parser = etree.HTMLParser(recover=True, encoding=encoding)
-
- # Attempt to parse the body. Returns None if the body was successfully
- # parsed, but no tree was found.
- return etree.fromstring(body, parser)
-
-
-def _get_meta_tags(
- tree: "etree.Element",
- property: str,
- prefix: str,
- property_mapper: Optional[Callable[[str], Optional[str]]] = None,
-) -> Dict[str, Optional[str]]:
- """
- Search for meta tags prefixed with a particular string.
-
- Args:
- tree: The parsed HTML document.
- property: The name of the property which contains the tag name, e.g.
- "property" for Open Graph.
- prefix: The prefix on the property to search for, e.g. "og" for Open Graph.
- property_mapper: An optional callable to map the property to the Open Graph
- form. Can return None for a key to ignore that key.
-
- Returns:
- A map of tag name to value.
- """
- results: Dict[str, Optional[str]] = {}
- for tag in tree.xpath(
- f"//*/meta[starts-with(@{property}, '{prefix}:')][@content][not(@content='')]"
- ):
- # if we've got more than 50 tags, someone is taking the piss
- if len(results) >= 50:
- logger.warning(
- "Skipping parsing of Open Graph for page with too many '%s:' tags",
- prefix,
- )
- return {}
-
- key = tag.attrib[property]
- if property_mapper:
- key = property_mapper(key)
- # None is a special value used to ignore a value.
- if key is None: - continue - - results[key] = tag.attrib["content"] - - return results - - -def _map_twitter_to_open_graph(key: str) -> Optional[str]: - """ - Map a Twitter card property to the analogous Open Graph property. - - Args: - key: The Twitter card property (starts with "twitter:"). - - Returns: - The Open Graph property (starts with "og:") or None to have this property - be ignored. - """ - # Twitter card properties with no analogous Open Graph property. - if key == "twitter:card" or key == "twitter:creator": - return None - if key == "twitter:site": - return "og:site_name" - # Otherwise, swap twitter to og. - return "og" + key[7:] - - -def parse_html_to_open_graph(tree: "etree.Element") -> Dict[str, Optional[str]]: - """ - Parse the HTML document into an Open Graph response. - - This uses lxml to search the HTML document for Open Graph data (or - synthesizes it from the document). - - Args: - tree: The parsed HTML document. - - Returns: - The Open Graph response as a dictionary. - """ - - # Search for Open Graph (og:) meta tags, e.g.: - # - # "og:type" : "video", - # "og:url" : "https://www.youtube.com/watch?v=LXDBoHyjmtw", - # "og:site_name" : "YouTube", - # "og:video:type" : "application/x-shockwave-flash", - # "og:description" : "Fun stuff happening here", - # "og:title" : "RemoteJam - Matrix team hack for Disrupt Europe Hackathon", - # "og:image" : "https://i.ytimg.com/vi/LXDBoHyjmtw/maxresdefault.jpg", - # "og:video:url" : "http://www.youtube.com/v/LXDBoHyjmtw?version=3&autohide=1", - # "og:video:width" : "1280" - # "og:video:height" : "720", - # "og:video:secure_url": "https://www.youtube.com/v/LXDBoHyjmtw?version=3", - - og = _get_meta_tags(tree, "property", "og") - - # TODO: Search for properties specific to the different Open Graph types, - # such as article: meta tags, e.g.: - # - # "article:publisher" : "https://www.facebook.com/thethudonline" /> - # "article:author" content="https://www.facebook.com/thethudonline" /> - # "article:tag" content="baby" /> - # "article:section" content="Breaking News" /> - # "article:published_time" content="2016-03-31T19:58:24+00:00" /> - # "article:modified_time" content="2016-04-01T18:31:53+00:00" /> - - # Search for Twitter Card (twitter:) meta tags, e.g.: - # - # "twitter:site" : "@matrixdotorg" - # "twitter:creator" : "@matrixdotorg" - # - # Twitter cards tags also duplicate Open Graph tags. - # - # See https://developer.twitter.com/en/docs/twitter-for-websites/cards/guides/getting-started - twitter = _get_meta_tags(tree, "name", "twitter", _map_twitter_to_open_graph) - # Merge the Twitter values with the Open Graph values, but do not overwrite - # information from Open Graph tags. - for key, value in twitter.items(): - if key not in og: - og[key] = value - - if "og:title" not in og: - # Attempt to find a title from the title tag, or the biggest header on the page. - title = tree.xpath("((//title)[1] | (//h1)[1] | (//h2)[1] | (//h3)[1])/text()") - if title: - og["og:title"] = title[0].strip() - else: - og["og:title"] = None - - if "og:image" not in og: - meta_image = tree.xpath( - "//*/meta[translate(@itemprop, 'IMAGE', 'image')='image'][not(@content='')]/@content[1]" - ) - # If a meta image is found, use it. - if meta_image: - og["og:image"] = meta_image[0] - else: - # Try to find images which are larger than 10px by 10px. 
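
The Twitter-card mapping above (`_map_twitter_to_open_graph`), by example:

assert _map_twitter_to_open_graph("twitter:card") is None            # no OG analogue
assert _map_twitter_to_open_graph("twitter:creator") is None         # no OG analogue
assert _map_twitter_to_open_graph("twitter:site") == "og:site_name"
assert _map_twitter_to_open_graph("twitter:description") == "og:description"
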
- # - # TODO: consider inlined CSS styles as well as width & height attribs - images = tree.xpath("//img[@src][number(@width)>10][number(@height)>10]") - images = sorted( - images, - key=lambda i: ( - -1 * float(i.attrib["width"]) * float(i.attrib["height"]) - ), - ) - # If no images were found, try to find *any* images. - if not images: - images = tree.xpath("//img[@src][1]") - if images: - og["og:image"] = images[0].attrib["src"] - - # Finally, fallback to the favicon if nothing else. - else: - favicons = tree.xpath("//link[@href][contains(@rel, 'icon')]/@href[1]") - if favicons: - og["og:image"] = favicons[0] - - if "og:description" not in og: - # Check the first meta description tag for content. - meta_description = tree.xpath( - "//*/meta[translate(@name, 'DESCRIPTION', 'description')='description'][not(@content='')]/@content[1]" - ) - # If a meta description is found with content, use it. - if meta_description: - og["og:description"] = meta_description[0] - else: - og["og:description"] = parse_html_description(tree) - elif og["og:description"]: - # This must be a non-empty string at this point. - assert isinstance(og["og:description"], str) - og["og:description"] = summarize_paragraphs([og["og:description"]]) - - # TODO: delete the url downloads to stop diskfilling, - # as we only ever cared about its OG - return og - - -def parse_html_description(tree: "etree.Element") -> Optional[str]: - """ - Calculate a text description based on an HTML document. - - Grabs any text nodes which are inside the <body/> tag, unless they are within - an HTML5 semantic markup tag (<header/>, <nav/>, <aside/>, <footer/>), or - if they are within a <script/>, <svg/> or <style/> tag, or if they are within - a tag whose content is usually only shown to old browsers - (<iframe/>, <video/>, <canvas/>, <picture/>). - - This is a very very very coarse approximation to a plain text render of the page. - - Args: - tree: The parsed HTML document. - - Returns: - The plain text description, or None if one cannot be generated. - """ - # We don't just use XPATH here as that is slow on some machines. - - from lxml import etree - - TAGS_TO_REMOVE = { - "header", - "nav", - "aside", - "footer", - "script", - "noscript", - "style", - "svg", - "iframe", - "video", - "canvas", - "img", - "picture", - etree.Comment, - } - - # Split all the text nodes into paragraphs (by splitting on new - # lines) - text_nodes = ( - re.sub(r"\s+", "\n", el).strip() - for el in _iterate_over_text(tree.find("body"), TAGS_TO_REMOVE) - ) - return summarize_paragraphs(text_nodes) - - -def _iterate_over_text( - tree: Optional["etree.Element"], - tags_to_ignore: Set[Union[str, "etree.Comment"]], - stack_limit: int = 1024, -) -> Generator[str, None, None]: - """Iterate over the tree returning text nodes in a depth first fashion, - skipping text nodes inside certain tags. - - Args: - tree: The parent element to iterate. Can be None if there isn't one. - tags_to_ignore: Set of tags to ignore - stack_limit: Maximum stack size limit for depth-first traversal. - Nodes will be dropped if this limit is hit, which may truncate the - textual result. - Intended to limit the maximum working memory when generating a preview. - """ - - if tree is None: - return - - # This is a stack whose items are elements to iterate over *or* strings - # to be returned. 
- elements: List[Union[str, "etree.Element"]] = [tree] - while elements: - el = elements.pop() - - if isinstance(el, str): - yield el - elif el.tag not in tags_to_ignore: - # If the element isn't meant for display, ignore it. - if el.get("role") in ARIA_ROLES_TO_IGNORE: - continue - - # el.text is the text before the first child, so we can immediately - # return it if the text exists. - if el.text: - yield el.text - - # We add to the stack all the element's children, interspersed with - # each child's tail text (if it exists). - # - # We iterate in reverse order so that earlier pieces of text appear - # closer to the top of the stack. - for child in el.iterchildren(reversed=True): - if len(elements) > stack_limit: - # We've hit our limit for working memory - break - - if child.tail: - # The tail text of a node is text that comes *after* the node, - # so we always include it even if we ignore the child node. - elements.append(child.tail) - - elements.append(child) - - -def summarize_paragraphs( - text_nodes: Iterable[str], min_size: int = 200, max_size: int = 500 -) -> Optional[str]: - """ - Try to get a summary respecting first paragraph and then word boundaries. - - Args: - text_nodes: The paragraphs to summarize. - min_size: The minimum number of words to include. - max_size: The maximum number of words to include. - - Returns: - A summary of the text nodes, or None if that was not possible. - """ - - # TODO: Respect sentences? - - description = "" - - # Keep adding paragraphs until we get to the MIN_SIZE. - for text_node in text_nodes: - if len(description) < min_size: - text_node = re.sub(r"[\t \r\n]+", " ", text_node) - description += text_node + "\n\n" - else: - break - - description = description.strip() - description = re.sub(r"[\t ]+", " ", description) - description = re.sub(r"[\t \r\n]*[\r\n]+", "\n\n", description) - - # If the concatenation of paragraphs to get above MIN_SIZE - # took us over MAX_SIZE, then we need to truncate mid paragraph - if len(description) > max_size: - new_desc = "" - - # This splits the paragraph into words, but keeping the - # (preceding) whitespace intact so we can easily concat - # words back together. - for match in re.finditer(r"\s*\S+", description): - word = match.group() - - # Keep adding words while the total length is less than - # MAX_SIZE. - if len(word) + len(new_desc) < max_size: - new_desc += word - else: - # At this point the next word *will* take us over - # MAX_SIZE, but we also want to ensure that its not - # a huge word. If it is add it anyway and we'll - # truncate later. - if len(new_desc) < min_size: - new_desc += word - break - - # Double check that we're not over the limit - if len(new_desc) > max_size: - new_desc = new_desc[:max_size] - - # We always add an ellipsis because at the very least - # we chopped mid paragraph. - description = new_desc.strip() + "…" - return description if description else None diff --git a/synapse/rest/media/v1/preview_url_resource.py b/synapse/rest/media/v1/preview_url_resource.py deleted file mode 100644 index 4a594ab9d8..0000000000 --- a/synapse/rest/media/v1/preview_url_resource.py +++ /dev/null @@ -1,871 +0,0 @@ -# Copyright 2016 OpenMarket Ltd -# Copyright 2020-2021 The Matrix.org Foundation C.I.C. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
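
Stepping back to `summarize_paragraphs` above: paragraphs are concatenated until the 200-character minimum is reached, and anything past the 500-character maximum is truncated on a word boundary with an ellipsis. For example:

paragraphs = ["short opening paragraph. " * 4, "filler sentence here. " * 40]
summary = summarize_paragraphs(iter(paragraphs), min_size=200, max_size=500)
assert summary is not None
assert len(summary) <= 501          # at most max_size characters plus the ellipsis
assert summary.endswith("…")
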
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -import datetime -import errno -import fnmatch -import logging -import os -import re -import shutil -import sys -import traceback -from typing import TYPE_CHECKING, BinaryIO, Iterable, Optional, Tuple -from urllib.parse import urljoin, urlparse, urlsplit -from urllib.request import urlopen - -import attr - -from twisted.internet.defer import Deferred -from twisted.internet.error import DNSLookupError - -from synapse.api.errors import Codes, SynapseError -from synapse.http.client import SimpleHttpClient -from synapse.http.server import ( - DirectServeJsonResource, - respond_with_json, - respond_with_json_bytes, -) -from synapse.http.servlet import parse_integer, parse_string -from synapse.http.site import SynapseRequest -from synapse.logging.context import make_deferred_yieldable, run_in_background -from synapse.metrics.background_process_metrics import run_as_background_process -from synapse.rest.media.v1._base import get_filename_from_headers -from synapse.rest.media.v1.media_storage import MediaStorage -from synapse.rest.media.v1.oembed import OEmbedProvider -from synapse.rest.media.v1.preview_html import decode_body, parse_html_to_open_graph -from synapse.types import JsonDict, UserID -from synapse.util import json_encoder -from synapse.util.async_helpers import ObservableDeferred -from synapse.util.caches.expiringcache import ExpiringCache -from synapse.util.stringutils import random_string - -from ._base import FileInfo - -if TYPE_CHECKING: - from synapse.rest.media.v1.media_repository import MediaRepository - from synapse.server import HomeServer - -logger = logging.getLogger(__name__) - -OG_TAG_NAME_MAXLEN = 50 -OG_TAG_VALUE_MAXLEN = 1000 - -ONE_HOUR = 60 * 60 * 1000 -ONE_DAY = 24 * ONE_HOUR -IMAGE_CACHE_EXPIRY_MS = 2 * ONE_DAY - - -@attr.s(slots=True, frozen=True, auto_attribs=True) -class DownloadResult: - length: int - uri: str - response_code: int - media_type: str - download_name: Optional[str] - expires: int - etag: Optional[str] - - -@attr.s(slots=True, frozen=True, auto_attribs=True) -class MediaInfo: - """ - Information parsed from downloading media being previewed. - """ - - # The Content-Type header of the response. - media_type: str - # The length (in bytes) of the downloaded media. - media_length: int - # The media filename, according to the server. This is parsed from the - # returned headers, if possible. - download_name: Optional[str] - # The time of the preview. - created_ts_ms: int - # Information from the media storage provider about where the file is stored - # on disk. - filesystem_id: str - filename: str - # The URI being previewed. - uri: str - # The HTTP response code. - response_code: int - # The timestamp (in milliseconds) of when this preview expires. - expires: int - # The ETag header of the response. - etag: Optional[str] - - -class PreviewUrlResource(DirectServeJsonResource): - """ - The `GET /_matrix/media/r0/preview_url` endpoint provides a generic preview API - for URLs which outputs Open Graph (https://ogp.me/) responses (with some Matrix - specific additions). 
-
- This does have trade-offs compared to other designs:
-
- * Pros:
- * Simple and flexible; can be used by any clients at any point
- * Cons:
- * If each homeserver provides one of these independently, all the homeservers in a
- room may needlessly DoS the target URI
- * The URL metadata must be stored somewhere, rather than just using Matrix
- itself to store the media.
- * Matrix cannot be used to distribute the metadata between homeservers.
-
- When Synapse is asked to preview a URL it does the following:
-
- 1. Checks against a URL blacklist (defined as `url_preview_url_blacklist` in the
- config).
- 2. Checks the URL against an in-memory cache and returns the result if it exists. (This
- is also used to de-duplicate processing of multiple in-flight requests at once.)
- 3. Kicks off a background process to generate a preview:
- 1. Checks URL and timestamp against the database cache and returns the result if it
- has not expired and was successful (a 2xx return code).
- 2. Checks if the URL matches an oEmbed (https://oembed.com/) pattern. If it
- does, updates the URL to download.
- 3. Downloads the URL and stores it into a file via the media storage provider
- and saves the local media metadata.
- 4. If the media is an image:
- 1. Generates thumbnails.
- 2. Generates an Open Graph response based on image properties.
- 5. If the media is HTML:
- 1. Decodes the HTML via the stored file.
- 2. Generates an Open Graph response from the HTML.
- 3. If a JSON oEmbed URL was found in the HTML via autodiscovery:
- 1. Downloads the URL and stores it into a file via the media storage provider
- and saves the local media metadata.
- 2. Converts the oEmbed response to an Open Graph response.
- 3. Overrides any Open Graph data from the HTML with data from oEmbed.
- 4. If an image exists in the Open Graph response:
- 1. Downloads the URL and stores it into a file via the media storage
- provider and saves the local media metadata.
- 2. Generates thumbnails.
- 3. Updates the Open Graph response based on image properties.
- 6. If the media is JSON and an oEmbed URL was found:
- 1. Converts the oEmbed response to an Open Graph response.
- 2. If a thumbnail or image is in the oEmbed response:
- 1. Downloads the URL and stores it into a file via the media storage
- provider and saves the local media metadata.
- 2. Generates thumbnails.
- 3. Updates the Open Graph response based on image properties.
- 7. Stores the result in the database cache.
- 4. Returns the result.
-
- If any additional request (e.g. oEmbed autodiscovery in step 5.3, or
- image thumbnailing in steps 5.4 and 6.4) fails, the URL preview as a whole
- does not fail. As much information as possible is returned.
-
- The in-memory cache expires after 1 hour.
-
- Expired entries in the database cache (and their associated media files) are
- deleted every 10 seconds. The default expiration time is 1 hour from download.
- """ - - isLeaf = True - - def __init__( - self, - hs: "HomeServer", - media_repo: "MediaRepository", - media_storage: MediaStorage, - ): - super().__init__() - - self.auth = hs.get_auth() - self.clock = hs.get_clock() - self.filepaths = media_repo.filepaths - self.max_spider_size = hs.config.media.max_spider_size - self.server_name = hs.hostname - self.store = hs.get_datastores().main - self.client = SimpleHttpClient( - hs, - treq_args={"browser_like_redirects": True}, - ip_whitelist=hs.config.media.url_preview_ip_range_whitelist, - ip_blacklist=hs.config.media.url_preview_ip_range_blacklist, - use_proxy=True, - ) - self.media_repo = media_repo - self.primary_base_path = media_repo.primary_base_path - self.media_storage = media_storage - - self._oembed = OEmbedProvider(hs) - - # We run the background jobs if we're the instance specified (or no - # instance is specified, where we assume there is only one instance - # serving media). - instance_running_jobs = hs.config.media.media_instance_running_background_jobs - self._worker_run_media_background_jobs = ( - instance_running_jobs is None - or instance_running_jobs == hs.get_instance_name() - ) - - self.url_preview_url_blacklist = hs.config.media.url_preview_url_blacklist - self.url_preview_accept_language = hs.config.media.url_preview_accept_language - - # memory cache mapping urls to an ObservableDeferred returning - # JSON-encoded OG metadata - self._cache: ExpiringCache[str, ObservableDeferred] = ExpiringCache( - cache_name="url_previews", - clock=self.clock, - # don't spider URLs more often than once an hour - expiry_ms=ONE_HOUR, - ) - - if self._worker_run_media_background_jobs: - self._cleaner_loop = self.clock.looping_call( - self._start_expire_url_cache_data, 10 * 1000 - ) - - async def _async_render_OPTIONS(self, request: SynapseRequest) -> None: - request.setHeader(b"Allow", b"OPTIONS, GET") - respond_with_json(request, 200, {}, send_cors=True) - - async def _async_render_GET(self, request: SynapseRequest) -> None: - # XXX: if get_user_by_req fails, what should we do in an async render? - requester = await self.auth.get_user_by_req(request) - url = parse_string(request, "url", required=True) - ts = parse_integer(request, "ts") - if ts is None: - ts = self.clock.time_msec() - - # XXX: we could move this into _do_preview if we wanted. - url_tuple = urlsplit(url) - for entry in self.url_preview_url_blacklist: - match = True - for attrib in entry: - pattern = entry[attrib] - value = getattr(url_tuple, attrib) - logger.debug( - "Matching attrib '%s' with value '%s' against pattern '%s'", - attrib, - value, - pattern, - ) - - if value is None: - match = False - continue - - # Some attributes might not be parsed as strings by urlsplit (such as the - # port, which is parsed as an int). Because we use match functions that - # expect strings, we want to make sure that's what we give them. 
- value_str = str(value) - - if pattern.startswith("^"): - if not re.match(pattern, value_str): - match = False - continue - else: - if not fnmatch.fnmatch(value_str, pattern): - match = False - continue - if match: - logger.warning("URL %s blocked by url_blacklist entry %s", url, entry) - raise SynapseError( - 403, "URL blocked by url pattern blacklist entry", Codes.UNKNOWN - ) - - # the in-memory cache: - # * ensures that only one request is active at a time - # * takes load off the DB for the thundering herds - # * also caches any failures (unlike the DB) so we don't keep - # requesting the same endpoint - - observable = self._cache.get(url) - - if not observable: - download = run_in_background(self._do_preview, url, requester.user, ts) - observable = ObservableDeferred(download, consumeErrors=True) - self._cache[url] = observable - else: - logger.info("Returning cached response") - - og = await make_deferred_yieldable(observable.observe()) - respond_with_json_bytes(request, 200, og, send_cors=True) - - async def _do_preview(self, url: str, user: UserID, ts: int) -> bytes: - """Check the db, and download the URL and build a preview - - Args: - url: The URL to preview. - user: The user requesting the preview. - ts: The timestamp requested for the preview. - - Returns: - json-encoded og data - """ - # check the URL cache in the DB (which will also provide us with - # historical previews, if we have any) - cache_result = await self.store.get_url_cache(url, ts) - if ( - cache_result - and cache_result["expires_ts"] > ts - and cache_result["response_code"] / 100 == 2 - ): - # It may be stored as text in the database, not as bytes (such as - # PostgreSQL). If so, encode it back before handing it on. - og = cache_result["og"] - if isinstance(og, str): - og = og.encode("utf8") - return og - - # If this URL can be accessed via oEmbed, use that instead. - url_to_download = url - oembed_url = self._oembed.get_oembed_url(url) - if oembed_url: - url_to_download = oembed_url - - media_info = await self._handle_url(url_to_download, user) - - logger.debug("got media_info of '%s'", media_info) - - # The number of milliseconds that the response should be considered valid. - expiration_ms = media_info.expires - author_name: Optional[str] = None - - if _is_media(media_info.media_type): - file_id = media_info.filesystem_id - dims = await self.media_repo._generate_thumbnails( - None, file_id, file_id, media_info.media_type, url_cache=True - ) - - og = { - "og:description": media_info.download_name, - "og:image": f"mxc://{self.server_name}/{media_info.filesystem_id}", - "og:image:type": media_info.media_type, - "matrix:image:size": media_info.media_length, - } - - if dims: - og["og:image:width"] = dims["width"] - og["og:image:height"] = dims["height"] - else: - logger.warning("Couldn't get dims for %s" % url) - - # define our OG response for this media - elif _is_html(media_info.media_type): - # TODO: somehow stop a big HTML tree from exploding synapse's RAM - - with open(media_info.filename, "rb") as file: - body = file.read() - - tree = decode_body(body, media_info.uri, media_info.media_type) - if tree is not None: - # Check if this HTML document points to oEmbed information and - # defer to that. - oembed_url = self._oembed.autodiscover_from_html(tree) - og_from_oembed: JsonDict = {} - if oembed_url: - try: - oembed_info = await self._handle_url( - oembed_url, user, allow_data_urls=True - ) - except Exception as e: - # Fetching the oEmbed info failed, don't block the entire URL preview. 
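
For reference, the blacklist matching implemented above treats each `url_preview_url_blacklist` entry as a set of `urlsplit()` attributes that must all match; patterns starting with "^" are regular expressions and everything else is a shell-style glob. An illustrative entry:

# Blocks http(s) URLs on subdomains of example.com:
entry = {
    "scheme": "^https?$",          # regex (leading "^")
    "netloc": "*.example.com",     # fnmatch-style glob
}
# urlsplit("https://media.example.com/x") -> scheme="https", netloc="media.example.com"
# -> both attributes match, so the preview request is rejected with a 403.
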
- logger.warning( - "oEmbed fetch failed during URL preview: %s errored with %s", - oembed_url, - e, - ) - else: - ( - og_from_oembed, - author_name, - expiration_ms, - ) = await self._handle_oembed_response( - url, oembed_info, expiration_ms - ) - - # Parse Open Graph information from the HTML in case the oEmbed - # response failed or is incomplete. - og_from_html = parse_html_to_open_graph(tree) - - # Compile the Open Graph response by using the scraped - # information from the HTML and overlaying any information - # from the oEmbed response. - og = {**og_from_html, **og_from_oembed} - - await self._precache_image_url(user, media_info, og) - else: - og = {} - - elif oembed_url: - # Handle the oEmbed information. - og, author_name, expiration_ms = await self._handle_oembed_response( - url, media_info, expiration_ms - ) - await self._precache_image_url(user, media_info, og) - - else: - logger.warning("Failed to find any OG data in %s", url) - og = {} - - # If we don't have a title but we have author_name, copy it as - # title - if not og.get("og:title") and author_name: - og["og:title"] = author_name - - # filter out any stupidly long values - keys_to_remove = [] - for k, v in og.items(): - # values can be numeric as well as strings, hence the cast to str - if len(k) > OG_TAG_NAME_MAXLEN or len(str(v)) > OG_TAG_VALUE_MAXLEN: - logger.warning( - "Pruning overlong tag %s from OG data", k[:OG_TAG_NAME_MAXLEN] - ) - keys_to_remove.append(k) - for k in keys_to_remove: - del og[k] - - logger.debug("Calculated OG for %s as %s", url, og) - - jsonog = json_encoder.encode(og) - - # Cap the amount of time to consider a response valid. - expiration_ms = min(expiration_ms, ONE_DAY) - - # store OG in history-aware DB cache - await self.store.store_url_cache( - url, - media_info.response_code, - media_info.etag, - media_info.created_ts_ms + expiration_ms, - jsonog, - media_info.filesystem_id, - media_info.created_ts_ms, - ) - - return jsonog.encode("utf8") - - async def _download_url(self, url: str, output_stream: BinaryIO) -> DownloadResult: - """ - Fetches a remote URL and parses the headers. - - Args: - url: The URL to fetch. - output_stream: The stream to write the content to. - - Returns: - A tuple of: - Media length, URL downloaded, the HTTP response code, - the media type, the downloaded file name, the number of - milliseconds the result is valid for, the etag header. - """ - - try: - logger.debug("Trying to get preview for url '%s'", url) - length, headers, uri, code = await self.client.get_file( - url, - output_stream=output_stream, - max_size=self.max_spider_size, - headers={ - b"Accept-Language": self.url_preview_accept_language, - # Use a custom user agent for the preview because some sites will only return - # Open Graph metadata to crawler user agents. Omit the Synapse version - # string to avoid leaking information. - b"User-Agent": [ - "Synapse (bot; +https://github.com/matrix-org/synapse)" - ], - }, - is_allowed_content_type=_is_previewable, - ) - except SynapseError: - # Pass SynapseErrors through directly, so that the servlet - # handler will return a SynapseError to the client instead of - # blank data or a 500. 
- raise
- except DNSLookupError:
- # DNS lookup returned no results
- # Note: This will also be the case if one of the resolved IP
- # addresses is blacklisted
- raise SynapseError(
- 502,
- "DNS resolution failure during URL preview generation",
- Codes.UNKNOWN,
- )
- except Exception as e:
- # FIXME: pass through 404s and other error messages nicely
- logger.warning("Error downloading %s: %r", url, e)
-
- raise SynapseError(
- 500,
- "Failed to download content: %s"
- % (traceback.format_exception_only(sys.exc_info()[0], e),),
- Codes.UNKNOWN,
- )
-
- if b"Content-Type" in headers:
- media_type = headers[b"Content-Type"][0].decode("ascii")
- else:
- media_type = "application/octet-stream"
-
- download_name = get_filename_from_headers(headers)
-
- # FIXME: we should calculate a proper expiration based on the
- # Cache-Control and Expire headers. But for now, assume 1 hour.
- expires = ONE_HOUR
- etag = headers[b"ETag"][0].decode("ascii") if b"ETag" in headers else None
-
- return DownloadResult(
- length, uri, code, media_type, download_name, expires, etag
- )
-
- async def _parse_data_url(
- self, url: str, output_stream: BinaryIO
- ) -> DownloadResult:
- """
- Parses a data: URL.
-
- Args:
- url: The URL to parse.
- output_stream: The stream to write the content to.
-
- Returns:
- A tuple of:
- Media length, URL downloaded, the HTTP response code,
- the media type, the downloaded file name, the number of
- milliseconds the result is valid for, the etag header.
- """
-
- try:
- logger.debug("Trying to parse data url '%s'", url)
- with urlopen(url) as url_info:
- # TODO: Can this be made more efficient?
- output_stream.write(url_info.read())
- except Exception as e:
- logger.warning("Error parsing data: URL %s: %r", url, e)
-
- raise SynapseError(
- 500,
- "Failed to parse data URL: %s"
- % (traceback.format_exception_only(sys.exc_info()[0], e),),
- Codes.UNKNOWN,
- )
-
- return DownloadResult(
- # Read back the length that has been written.
- length=output_stream.tell(),
- uri=url,
- # If it was parsed, consider this a 200 OK.
- response_code=200,
- # urlopen shoves the media-type from the data URL into the content type
- # header object.
- media_type=url_info.headers.get_content_type(),
- # Some features are not supported by data: URLs.
- download_name=None,
- expires=ONE_HOUR,
- etag=None,
- )
-
- async def _handle_url(
- self, url: str, user: UserID, allow_data_urls: bool = False
- ) -> MediaInfo:
- """
- Fetches content from a URL and parses the result to generate a MediaInfo.
-
- It uses the media storage provider to persist the fetched content and
- stores the mapping into the database.
-
- Args:
- url: The URL to fetch.
- user: The user who has requested this URL.
- allow_data_urls: True if data URLs should be allowed.
-
- Returns:
- A MediaInfo object describing the fetched content.
- """
-
- # TODO: we should probably honour robots.txt... except in practice
- # we're most likely being explicitly triggered by a human rather than a
- # bot, so are we really a robot?
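`_parse_data_url` leans on the standard library: `urllib.request.urlopen` understands RFC 2397 `data:` URLs natively and synthesizes a headers object carrying the declared media type, which is where the `url_info.headers.get_content_type()` above comes from. A quick demonstration:

from urllib.request import urlopen

with urlopen("data:text/plain;base64,aGVsbG8=") as resp:
    assert resp.read() == b"hello"
    assert resp.headers.get_content_type() == "text/plain"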
- - file_id = datetime.date.today().isoformat() + "_" + random_string(16) - - file_info = FileInfo(server_name=None, file_id=file_id, url_cache=True) - - with self.media_storage.store_into_file(file_info) as (f, fname, finish): - if url.startswith("data:"): - if not allow_data_urls: - raise SynapseError( - 500, "Previewing of data: URLs is forbidden", Codes.UNKNOWN - ) - - download_result = await self._parse_data_url(url, f) - else: - download_result = await self._download_url(url, f) - - await finish() - - try: - time_now_ms = self.clock.time_msec() - - await self.store.store_local_media( - media_id=file_id, - media_type=download_result.media_type, - time_now_ms=time_now_ms, - upload_name=download_result.download_name, - media_length=download_result.length, - user_id=user, - url_cache=url, - ) - - except Exception as e: - logger.error("Error handling downloaded %s: %r", url, e) - # TODO: we really ought to delete the downloaded file in this - # case, since we won't have recorded it in the db, and will - # therefore not expire it. - raise - - return MediaInfo( - media_type=download_result.media_type, - media_length=download_result.length, - download_name=download_result.download_name, - created_ts_ms=time_now_ms, - filesystem_id=file_id, - filename=fname, - uri=download_result.uri, - response_code=download_result.response_code, - expires=download_result.expires, - etag=download_result.etag, - ) - - async def _precache_image_url( - self, user: UserID, media_info: MediaInfo, og: JsonDict - ) -> None: - """ - Pre-cache the image (if one exists) for posterity - - Args: - user: The user requesting the preview. - media_info: The media being previewed. - og: The Open Graph dictionary. This is modified with image information. - """ - # If there's no image or it is blank, there's nothing to do. - if "og:image" not in og: - return - - # Remove the raw image URL, this will be replaced with an MXC URL, if successful. - image_url = og.pop("og:image") - if not image_url: - return - - # The image URL from the HTML might be relative to the previewed page, - # convert it to an URL which can be requested directly. - url_parts = urlparse(image_url) - if url_parts.scheme != "data": - image_url = urljoin(media_info.uri, image_url) - - # FIXME: it might be cleaner to use the same flow as the main /preview_url - # request itself and benefit from the same caching etc. But for now we - # just rely on the caching on the master request to speed things up. - try: - image_info = await self._handle_url(image_url, user, allow_data_urls=True) - except Exception as e: - # Pre-caching the image failed, don't block the entire URL preview. - logger.warning( - "Pre-caching image failed during URL preview: %s errored with %s", - image_url, - e, - ) - return - - if _is_media(image_info.media_type): - # TODO: make sure we don't choke on white-on-transparent images - file_id = image_info.filesystem_id - dims = await self.media_repo._generate_thumbnails( - None, file_id, file_id, image_info.media_type, url_cache=True - ) - if dims: - og["og:image:width"] = dims["width"] - og["og:image:height"] = dims["height"] - else: - logger.warning("Couldn't get dims for %s", image_url) - - og["og:image"] = f"mxc://{self.server_name}/{image_info.filesystem_id}" - og["og:image:type"] = image_info.media_type - og["matrix:image:size"] = image_info.media_length - - async def _handle_oembed_response( - self, url: str, media_info: MediaInfo, expiration_ms: int - ) -> Tuple[JsonDict, Optional[str], int]: - """ - Parse the downloaded oEmbed info. 
- - Args: - url: The URL which is being previewed (not the one which was - requested). - media_info: The media being previewed. - expiration_ms: The length of time, in milliseconds, the media is valid for. - - Returns: - A tuple of: - The Open Graph dictionary, if the oEmbed info can be parsed. - The author name if it could be retrieved from oEmbed. - The (possibly updated) length of time, in milliseconds, the media is valid for. - """ - # If JSON was not returned, there's nothing to do. - if not _is_json(media_info.media_type): - return {}, None, expiration_ms - - with open(media_info.filename, "rb") as file: - body = file.read() - - oembed_response = self._oembed.parse_oembed_response(url, body) - open_graph_result = oembed_response.open_graph_result - - # Use the cache age from the oEmbed result, if one was given. - if open_graph_result and oembed_response.cache_age is not None: - expiration_ms = oembed_response.cache_age - - return open_graph_result, oembed_response.author_name, expiration_ms - - def _start_expire_url_cache_data(self) -> Deferred: - return run_as_background_process( - "expire_url_cache_data", self._expire_url_cache_data - ) - - async def _expire_url_cache_data(self) -> None: - """Clean up expired url cache content, media and thumbnails.""" - - assert self._worker_run_media_background_jobs - - now = self.clock.time_msec() - - logger.debug("Running url preview cache expiry") - - def try_remove_parent_dirs(dirs: Iterable[str]) -> None: - """Attempt to remove the given chain of parent directories - - Args: - dirs: The list of directory paths to delete, with children appearing - before their parents. - """ - for dir in dirs: - try: - os.rmdir(dir) - except FileNotFoundError: - # Already deleted, continue with deleting the rest - pass - except OSError as e: - # Failed, skip deleting the rest of the parent dirs - if e.errno != errno.ENOTEMPTY: - logger.warning( - "Failed to remove media directory while clearing url preview cache: %r: %s", - dir, - e, - ) - break - - # First we delete expired url cache entries - media_ids = await self.store.get_expired_url_cache(now) - - removed_media = [] - for media_id in media_ids: - fname = self.filepaths.url_cache_filepath(media_id) - try: - os.remove(fname) - except FileNotFoundError: - pass # If the path doesn't exist, meh - except OSError as e: - logger.warning( - "Failed to remove media while clearing url preview cache: %r: %s", - media_id, - e, - ) - continue - - removed_media.append(media_id) - - dirs = self.filepaths.url_cache_filepath_dirs_to_delete(media_id) - try_remove_parent_dirs(dirs) - - await self.store.delete_url_cache(removed_media) - - if removed_media: - logger.debug( - "Deleted %d entries from url preview cache", len(removed_media) - ) - else: - logger.debug("No entries removed from url preview cache") - - # Now we delete old images associated with the url cache. - # These may be cached for a bit on the client (i.e., they - # may have a room open with a preview url thing open). - # So we wait a couple of days before deleting, just in case. 
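The `try_remove_parent_dirs` helper defined above is essentially a logged, fault-tolerant variant of the standard library's `os.removedirs`, which likewise removes a leaf directory and then prunes each newly empty parent, stopping quietly at the first non-empty one:

import os

# Hypothetical cache layout, mirroring the dated directories used above.
os.makedirs("url_cache/2023/02/27", exist_ok=True)

# Removes "27", then "02", then "2023", then "url_cache", stopping
# silently as soon as a parent turns out to be non-empty. Synapse rolls
# its own so a missing leaf is tolerated and other failures are logged
# rather than raised.
os.removedirs("url_cache/2023/02/27")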
- expire_before = now - IMAGE_CACHE_EXPIRY_MS - media_ids = await self.store.get_url_cache_media_before(expire_before) - - removed_media = [] - for media_id in media_ids: - fname = self.filepaths.url_cache_filepath(media_id) - try: - os.remove(fname) - except FileNotFoundError: - pass # If the path doesn't exist, meh - except OSError as e: - logger.warning( - "Failed to remove media from url preview cache: %r: %s", media_id, e - ) - continue - - dirs = self.filepaths.url_cache_filepath_dirs_to_delete(media_id) - try_remove_parent_dirs(dirs) - - thumbnail_dir = self.filepaths.url_cache_thumbnail_directory(media_id) - try: - shutil.rmtree(thumbnail_dir) - except FileNotFoundError: - pass # If the path doesn't exist, meh - except OSError as e: - logger.warning( - "Failed to remove media from url preview cache: %r: %s", media_id, e - ) - continue - - removed_media.append(media_id) - - dirs = self.filepaths.url_cache_thumbnail_dirs_to_delete(media_id) - # Note that one of the directories to be deleted has already been - # removed by the `rmtree` above. - try_remove_parent_dirs(dirs) - - await self.store.delete_url_cache_media(removed_media) - - if removed_media: - logger.debug("Deleted %d media from url preview cache", len(removed_media)) - else: - logger.debug("No media removed from url preview cache") - - -def _is_media(content_type: str) -> bool: - return content_type.lower().startswith("image/") - - -def _is_html(content_type: str) -> bool: - content_type = content_type.lower() - return content_type.startswith("text/html") or content_type.startswith( - "application/xhtml" - ) - - -def _is_json(content_type: str) -> bool: - return content_type.lower().startswith("application/json") - - -def _is_previewable(content_type: str) -> bool: - """Returns True for content types for which we will perform URL preview and False - otherwise.""" - - return _is_html(content_type) or _is_media(content_type) or _is_json(content_type) diff --git a/synapse/rest/media/v1/storage_provider.py b/synapse/rest/media/v1/storage_provider.py index 1c9b71d69c..d7653f30ae 100644 --- a/synapse/rest/media/v1/storage_provider.py +++ b/synapse/rest/media/v1/storage_provider.py @@ -1,4 +1,4 @@ -# Copyright 2018-2021 The Matrix.org Foundation C.I.C. +# Copyright 2023 The Matrix.org Foundation C.I.C. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -11,171 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +# -import abc -import logging -import os -import shutil -from typing import TYPE_CHECKING, Callable, Optional - -from synapse.config._base import Config -from synapse.logging.context import defer_to_thread, run_in_background -from synapse.util.async_helpers import maybe_awaitable - -from ._base import FileInfo, Responder -from .media_storage import FileResponder - -logger = logging.getLogger(__name__) - -if TYPE_CHECKING: - from synapse.server import HomeServer - - -class StorageProvider(metaclass=abc.ABCMeta): - """A storage provider is a service that can store uploaded media and - retrieve them. - """ - - @abc.abstractmethod - async def store_file(self, path: str, file_info: FileInfo) -> None: - """Store the file described by file_info. The actual contents can be - retrieved by reading the file in file_info.upload_path. 
-
- Args:
- path: Relative path of file in local cache
- file_info: The metadata of the file.
- """
-
- @abc.abstractmethod
- async def fetch(self, path: str, file_info: FileInfo) -> Optional[Responder]:
- """Attempt to fetch the file described by file_info and stream it
- into writer.
-
- Args:
- path: Relative path of file in local cache
- file_info: The metadata of the file.
-
- Returns:
- Returns a Responder if the provider has the file, otherwise returns None.
- """
-
-
-class StorageProviderWrapper(StorageProvider):
- """Wraps a storage provider and provides various config options
-
- Args:
- backend: The storage provider to wrap.
- store_local: Whether to store new local files or not.
- store_synchronous: Whether to wait for file to be successfully
- uploaded, or to do the upload in the background.
- store_remote: Whether remote media should be uploaded
- """
-
- def __init__(
- self,
- backend: StorageProvider,
- store_local: bool,
- store_synchronous: bool,
- store_remote: bool,
- ):
- self.backend = backend
- self.store_local = store_local
- self.store_synchronous = store_synchronous
- self.store_remote = store_remote
-
- def __str__(self) -> str:
- return "StorageProviderWrapper[%s]" % (self.backend,)
-
- async def store_file(self, path: str, file_info: FileInfo) -> None:
- if not file_info.server_name and not self.store_local:
- return None
-
- if file_info.server_name and not self.store_remote:
- return None
-
- if file_info.url_cache:
- # The URL preview cache is short lived and not worth offloading or
- # backing up.
- return None
-
- if self.store_synchronous:
- # store_file is supposed to return an Awaitable, but guard
- # against improper implementations.
- await maybe_awaitable(self.backend.store_file(path, file_info)) # type: ignore
- else:
- # TODO: Handle errors.
- async def store() -> None:
- try:
- return await maybe_awaitable(
- self.backend.store_file(path, file_info)
- )
- except Exception:
- logger.exception("Error storing file")
-
- run_in_background(store)
-
- async def fetch(self, path: str, file_info: FileInfo) -> Optional[Responder]:
- if file_info.url_cache:
- # Files in the URL preview cache definitely aren't stored here,
- # so avoid any potentially slow I/O or network access.
- return None
-
- # store_file is supposed to return an Awaitable, but guard
- # against improper implementations.
- return await maybe_awaitable(self.backend.fetch(path, file_info))
-
-
-class FileStorageProviderBackend(StorageProvider):
- """A storage provider that stores files in a directory on a filesystem.
-
- Args:
- hs
- config: The config returned by `parse_config`.
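`store_file` and `fetch` are documented to return awaitables, but the wrapper above defensively routes both through `maybe_awaitable` in case a third-party backend returns a plain value. A minimal sketch of such a guard, written for asyncio-style awaitables (Synapse's real helper lives in `synapse.util.async_helpers` and targets Twisted):

import inspect
from typing import Any, Awaitable, Union

async def maybe_awaitable(value: Union[Awaitable[Any], Any]) -> Any:
    """Await the value if it is awaitable, otherwise return it as-is."""
    if inspect.isawaitable(value):
        return await value
    return value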
- """ - - def __init__(self, hs: "HomeServer", config: str): - self.hs = hs - self.cache_directory = hs.config.media.media_store_path - self.base_directory = config - - def __str__(self) -> str: - return "FileStorageProviderBackend[%s]" % (self.base_directory,) - - async def store_file(self, path: str, file_info: FileInfo) -> None: - """See StorageProvider.store_file""" - - primary_fname = os.path.join(self.cache_directory, path) - backup_fname = os.path.join(self.base_directory, path) - - dirname = os.path.dirname(backup_fname) - os.makedirs(dirname, exist_ok=True) - - # mypy needs help inferring the type of the second parameter, which is generic - shutil_copyfile: Callable[[str, str], str] = shutil.copyfile - await defer_to_thread( - self.hs.get_reactor(), - shutil_copyfile, - primary_fname, - backup_fname, - ) - - async def fetch(self, path: str, file_info: FileInfo) -> Optional[Responder]: - """See StorageProvider.fetch""" - - backup_fname = os.path.join(self.base_directory, path) - if os.path.isfile(backup_fname): - return FileResponder(open(backup_fname, "rb")) - - return None - - @staticmethod - def parse_config(config: dict) -> str: - """Called on startup to parse config supplied. This should parse - the config and raise if there is a problem. - - The returned value is passed into the constructor. - - In this case we only care about a single param, the directory, so let's - just pull that out. - """ - return Config.ensure_directory(config["directory"]) +# This exists purely for backwards compatibility with media providers. +from synapse.media.storage_provider import StorageProvider # noqa: F401 diff --git a/synapse/rest/media/v1/thumbnail_resource.py b/synapse/rest/media/v1/thumbnail_resource.py deleted file mode 100644 index 3e720018b3..0000000000 --- a/synapse/rest/media/v1/thumbnail_resource.py +++ /dev/null @@ -1,555 +0,0 @@ -# Copyright 2014-2016 OpenMarket Ltd -# Copyright 2020-2021 The Matrix.org Foundation C.I.C. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-
-
-import logging
-from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple
-
-from synapse.api.errors import Codes, SynapseError, cs_error
-from synapse.config.repository import THUMBNAIL_SUPPORTED_MEDIA_FORMAT_MAP
-from synapse.http.server import (
- DirectServeJsonResource,
- respond_with_json,
- set_corp_headers,
- set_cors_headers,
-)
-from synapse.http.servlet import parse_integer, parse_string
-from synapse.http.site import SynapseRequest
-from synapse.rest.media.v1.media_storage import MediaStorage
-
-from ._base import (
- FileInfo,
- ThumbnailInfo,
- parse_media_id,
- respond_404,
- respond_with_file,
- respond_with_responder,
-)
-
-if TYPE_CHECKING:
- from synapse.rest.media.v1.media_repository import MediaRepository
- from synapse.server import HomeServer
-
-logger = logging.getLogger(__name__)
-
-
-class ThumbnailResource(DirectServeJsonResource):
- isLeaf = True
-
- def __init__(
- self,
- hs: "HomeServer",
- media_repo: "MediaRepository",
- media_storage: MediaStorage,
- ):
- super().__init__()
-
- self.store = hs.get_datastores().main
- self.media_repo = media_repo
- self.media_storage = media_storage
- self.dynamic_thumbnails = hs.config.media.dynamic_thumbnails
- self.server_name = hs.hostname
-
- async def _async_render_GET(self, request: SynapseRequest) -> None:
- set_cors_headers(request)
- set_corp_headers(request)
- server_name, media_id, _ = parse_media_id(request)
- width = parse_integer(request, "width", required=True)
- height = parse_integer(request, "height", required=True)
- method = parse_string(request, "method", "scale")
- # TODO: Parse the Accept header to get a prioritised list of thumbnail types.
- m_type = "image/png"
-
- if server_name == self.server_name:
- if self.dynamic_thumbnails:
- await self._select_or_generate_local_thumbnail(
- request, media_id, width, height, method, m_type
- )
- else:
- await self._respond_local_thumbnail(
- request, media_id, width, height, method, m_type
- )
- self.media_repo.mark_recently_accessed(None, media_id)
- else:
- if self.dynamic_thumbnails:
- await self._select_or_generate_remote_thumbnail(
- request, server_name, media_id, width, height, method, m_type
- )
- else:
- await self._respond_remote_thumbnail(
- request, server_name, media_id, width, height, method, m_type
- )
- self.media_repo.mark_recently_accessed(server_name, media_id)
-
- async def _respond_local_thumbnail(
- self,
- request: SynapseRequest,
- media_id: str,
- width: int,
- height: int,
- method: str,
- m_type: str,
- ) -> None:
- media_info = await self.store.get_local_media(media_id)
-
- if not media_info:
- respond_404(request)
- return
- if media_info["quarantined_by"]:
- logger.info("Media is quarantined")
- respond_404(request)
- return
-
- thumbnail_infos = await self.store.get_local_media_thumbnails(media_id)
- await self._select_and_respond_with_thumbnail(
- request,
- width,
- height,
- method,
- m_type,
- thumbnail_infos,
- media_id,
- media_id,
- url_cache=bool(media_info["url_cache"]),
- server_name=None,
- )
-
- async def _select_or_generate_local_thumbnail(
- self,
- request: SynapseRequest,
- media_id: str,
- desired_width: int,
- desired_height: int,
- desired_method: str,
- desired_type: str,
- ) -> None:
- media_info = await self.store.get_local_media(media_id)
-
- if not media_info:
- respond_404(request)
- return
- if media_info["quarantined_by"]:
- logger.info("Media is quarantined")
- respond_404(request)
- return
-
- thumbnail_infos = await self.store.get_local_media_thumbnails(media_id)
- for info in
thumbnail_infos: - t_w = info["thumbnail_width"] == desired_width - t_h = info["thumbnail_height"] == desired_height - t_method = info["thumbnail_method"] == desired_method - t_type = info["thumbnail_type"] == desired_type - - if t_w and t_h and t_method and t_type: - file_info = FileInfo( - server_name=None, - file_id=media_id, - url_cache=media_info["url_cache"], - thumbnail=ThumbnailInfo( - width=info["thumbnail_width"], - height=info["thumbnail_height"], - type=info["thumbnail_type"], - method=info["thumbnail_method"], - ), - ) - - t_type = file_info.thumbnail_type - t_length = info["thumbnail_length"] - - responder = await self.media_storage.fetch_media(file_info) - if responder: - await respond_with_responder(request, responder, t_type, t_length) - return - - logger.debug("We don't have a thumbnail of that size. Generating") - - # Okay, so we generate one. - file_path = await self.media_repo.generate_local_exact_thumbnail( - media_id, - desired_width, - desired_height, - desired_method, - desired_type, - url_cache=bool(media_info["url_cache"]), - ) - - if file_path: - await respond_with_file(request, desired_type, file_path) - else: - logger.warning("Failed to generate thumbnail") - raise SynapseError(400, "Failed to generate thumbnail.") - - async def _select_or_generate_remote_thumbnail( - self, - request: SynapseRequest, - server_name: str, - media_id: str, - desired_width: int, - desired_height: int, - desired_method: str, - desired_type: str, - ) -> None: - media_info = await self.media_repo.get_remote_media_info(server_name, media_id) - - thumbnail_infos = await self.store.get_remote_media_thumbnails( - server_name, media_id - ) - - file_id = media_info["filesystem_id"] - - for info in thumbnail_infos: - t_w = info["thumbnail_width"] == desired_width - t_h = info["thumbnail_height"] == desired_height - t_method = info["thumbnail_method"] == desired_method - t_type = info["thumbnail_type"] == desired_type - - if t_w and t_h and t_method and t_type: - file_info = FileInfo( - server_name=server_name, - file_id=media_info["filesystem_id"], - thumbnail=ThumbnailInfo( - width=info["thumbnail_width"], - height=info["thumbnail_height"], - type=info["thumbnail_type"], - method=info["thumbnail_method"], - ), - ) - - t_type = file_info.thumbnail_type - t_length = info["thumbnail_length"] - - responder = await self.media_storage.fetch_media(file_info) - if responder: - await respond_with_responder(request, responder, t_type, t_length) - return - - logger.debug("We don't have a thumbnail of that size. Generating") - - # Okay, so we generate one. - file_path = await self.media_repo.generate_remote_exact_thumbnail( - server_name, - file_id, - media_id, - desired_width, - desired_height, - desired_method, - desired_type, - ) - - if file_path: - await respond_with_file(request, desired_type, file_path) - else: - logger.warning("Failed to generate thumbnail") - raise SynapseError(400, "Failed to generate thumbnail.") - - async def _respond_remote_thumbnail( - self, - request: SynapseRequest, - server_name: str, - media_id: str, - width: int, - height: int, - method: str, - m_type: str, - ) -> None: - # TODO: Don't download the whole remote file - # We should proxy the thumbnail from the remote server instead of - # downloading the remote file and generating our own thumbnails. 
- media_info = await self.media_repo.get_remote_media_info(server_name, media_id)
-
- thumbnail_infos = await self.store.get_remote_media_thumbnails(
- server_name, media_id
- )
- await self._select_and_respond_with_thumbnail(
- request,
- width,
- height,
- method,
- m_type,
- thumbnail_infos,
- media_id,
- media_info["filesystem_id"],
- url_cache=False,
- server_name=server_name,
- )
-
- async def _select_and_respond_with_thumbnail(
- self,
- request: SynapseRequest,
- desired_width: int,
- desired_height: int,
- desired_method: str,
- desired_type: str,
- thumbnail_infos: List[Dict[str, Any]],
- media_id: str,
- file_id: str,
- url_cache: bool,
- server_name: Optional[str] = None,
- ) -> None:
- """
- Respond to a request with an appropriate thumbnail from the previously generated thumbnails.
-
- Args:
- request: The incoming request.
- desired_width: The desired width, the returned thumbnail may be larger than this.
- desired_height: The desired height, the returned thumbnail may be larger than this.
- desired_method: The desired method used to generate the thumbnail.
- desired_type: The desired content-type of the thumbnail.
- thumbnail_infos: A list of dictionaries of candidate thumbnails.
- media_id: The media ID of the requested media.
- file_id: The ID of the media that a thumbnail is being requested for.
- url_cache: True if this is from a URL cache.
- server_name: The server name, if this is a remote thumbnail.
- """
- logger.debug(
- "_select_and_respond_with_thumbnail: media_id=%s desired=%sx%s (%s) thumbnail_infos=%s",
- media_id,
- desired_width,
- desired_height,
- desired_method,
- thumbnail_infos,
- )
-
- # If `dynamic_thumbnails` is enabled, we expect Synapse to go down a
- # different code path to handle it.
- assert not self.dynamic_thumbnails
-
- if thumbnail_infos:
- file_info = self._select_thumbnail(
- desired_width,
- desired_height,
- desired_method,
- desired_type,
- thumbnail_infos,
- file_id,
- url_cache,
- server_name,
- )
- if not file_info:
- logger.info("Couldn't find a thumbnail matching the desired inputs")
- respond_404(request)
- return
-
- # The thumbnail property must exist.
- assert file_info.thumbnail is not None
-
- responder = await self.media_storage.fetch_media(file_info)
- if responder:
- await respond_with_responder(
- request,
- responder,
- file_info.thumbnail.type,
- file_info.thumbnail.length,
- )
- return
-
- # If we can't find the thumbnail we regenerate it. This can happen
- # if e.g. we've deleted the thumbnails but still have the original
- # image somewhere.
- #
- # Since we have an entry for the thumbnail in the DB we a) know we
- # have successfully generated the thumbnail in the past (so we
- # don't need to worry about repeatedly failing to generate
- # thumbnails), and b) have already calculated the appropriate
- # width/height/method so we can just call the "generate exact"
- # methods.
-
- # First let's check that we do actually have the original image
- # still. This will throw a 404 if we don't.
- # TODO: We should refetch the thumbnails for remote media.
- await self.media_storage.ensure_media_is_in_local_cache( - FileInfo(server_name, file_id, url_cache=url_cache) - ) - - if server_name: - await self.media_repo.generate_remote_exact_thumbnail( - server_name, - file_id=file_id, - media_id=media_id, - t_width=file_info.thumbnail.width, - t_height=file_info.thumbnail.height, - t_method=file_info.thumbnail.method, - t_type=file_info.thumbnail.type, - ) - else: - await self.media_repo.generate_local_exact_thumbnail( - media_id=media_id, - t_width=file_info.thumbnail.width, - t_height=file_info.thumbnail.height, - t_method=file_info.thumbnail.method, - t_type=file_info.thumbnail.type, - url_cache=url_cache, - ) - - responder = await self.media_storage.fetch_media(file_info) - await respond_with_responder( - request, - responder, - file_info.thumbnail.type, - file_info.thumbnail.length, - ) - else: - # This might be because: - # 1. We can't create thumbnails for the given media (corrupted or - # unsupported file type), or - # 2. The thumbnailing process never ran or errored out initially - # when the media was first uploaded (these bugs should be - # reported and fixed). - # Note that we don't attempt to generate a thumbnail now because - # `dynamic_thumbnails` is disabled. - logger.info("Failed to find any generated thumbnails") - - respond_with_json( - request, - 400, - cs_error( - "Cannot find any thumbnails for the requested media (%r). This might mean the media is not a supported_media_format=(%s) or that thumbnailing failed for some other reason. (Dynamic thumbnails are disabled on this server.)" - % ( - request.postpath, - ", ".join(THUMBNAIL_SUPPORTED_MEDIA_FORMAT_MAP.keys()), - ), - code=Codes.UNKNOWN, - ), - send_cors=True, - ) - - def _select_thumbnail( - self, - desired_width: int, - desired_height: int, - desired_method: str, - desired_type: str, - thumbnail_infos: List[Dict[str, Any]], - file_id: str, - url_cache: bool, - server_name: Optional[str], - ) -> Optional[FileInfo]: - """ - Choose an appropriate thumbnail from the previously generated thumbnails. - - Args: - desired_width: The desired width, the returned thumbnail may be larger than this. - desired_height: The desired height, the returned thumbnail may be larger than this. - desired_method: The desired method used to generate the thumbnail. - desired_type: The desired content-type of the thumbnail. - thumbnail_infos: A list of dictionaries of candidate thumbnails. - file_id: The ID of the media that a thumbnail is being requested for. - url_cache: True if this is from a URL cache. - server_name: The server name, if this is a remote thumbnail. - - Returns: - The thumbnail which best matches the desired parameters. - """ - desired_method = desired_method.lower() - - # The chosen thumbnail. - thumbnail_info = None - - d_w = desired_width - d_h = desired_height - - if desired_method == "crop": - # Thumbnails that match equal or larger sizes of desired width/height. - crop_info_list: List[Tuple[int, int, int, bool, int, Dict[str, Any]]] = [] - # Other thumbnails. - crop_info_list2: List[Tuple[int, int, int, bool, int, Dict[str, Any]]] = [] - for info in thumbnail_infos: - # Skip thumbnails generated with different methods. 
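The candidate-ranking code below works by building one tuple per thumbnail and letting Python's lexicographic tuple comparison pick a winner; the trailing info dict is sliced off with `t[:-1]` so that ties resolve by list position instead of attempting to compare dicts. A toy run of the same trick (values are made up):

# (aspect, min, size, type, length, info) tuples, as constructed below.
candidates = [
    (0, 0, 4, False, 1200, {"thumbnail_length": 1200}),
    (0, 0, 4, False, 900, {"thumbnail_length": 900}),
]
# Compare on everything except the dict itself, then extract the dict.
best = min(candidates, key=lambda t: t[:-1])[-1]
assert best["thumbnail_length"] == 900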
- if info["thumbnail_method"] != "crop": - continue - - t_w = info["thumbnail_width"] - t_h = info["thumbnail_height"] - aspect_quality = abs(d_w * t_h - d_h * t_w) - min_quality = 0 if d_w <= t_w and d_h <= t_h else 1 - size_quality = abs((d_w - t_w) * (d_h - t_h)) - type_quality = desired_type != info["thumbnail_type"] - length_quality = info["thumbnail_length"] - if t_w >= d_w or t_h >= d_h: - crop_info_list.append( - ( - aspect_quality, - min_quality, - size_quality, - type_quality, - length_quality, - info, - ) - ) - else: - crop_info_list2.append( - ( - aspect_quality, - min_quality, - size_quality, - type_quality, - length_quality, - info, - ) - ) - # Pick the most appropriate thumbnail. Some values of `desired_width` and - # `desired_height` may result in a tie, in which case we avoid comparing on - # the thumbnail info dictionary and pick the thumbnail that appears earlier - # in the list of candidates. - if crop_info_list: - thumbnail_info = min(crop_info_list, key=lambda t: t[:-1])[-1] - elif crop_info_list2: - thumbnail_info = min(crop_info_list2, key=lambda t: t[:-1])[-1] - elif desired_method == "scale": - # Thumbnails that match equal or larger sizes of desired width/height. - info_list: List[Tuple[int, bool, int, Dict[str, Any]]] = [] - # Other thumbnails. - info_list2: List[Tuple[int, bool, int, Dict[str, Any]]] = [] - - for info in thumbnail_infos: - # Skip thumbnails generated with different methods. - if info["thumbnail_method"] != "scale": - continue - - t_w = info["thumbnail_width"] - t_h = info["thumbnail_height"] - size_quality = abs((d_w - t_w) * (d_h - t_h)) - type_quality = desired_type != info["thumbnail_type"] - length_quality = info["thumbnail_length"] - if t_w >= d_w or t_h >= d_h: - info_list.append((size_quality, type_quality, length_quality, info)) - else: - info_list2.append( - (size_quality, type_quality, length_quality, info) - ) - # Pick the most appropriate thumbnail. Some values of `desired_width` and - # `desired_height` may result in a tie, in which case we avoid comparing on - # the thumbnail info dictionary and pick the thumbnail that appears earlier - # in the list of candidates. - if info_list: - thumbnail_info = min(info_list, key=lambda t: t[:-1])[-1] - elif info_list2: - thumbnail_info = min(info_list2, key=lambda t: t[:-1])[-1] - - if thumbnail_info: - return FileInfo( - file_id=file_id, - url_cache=url_cache, - server_name=server_name, - thumbnail=ThumbnailInfo( - width=thumbnail_info["thumbnail_width"], - height=thumbnail_info["thumbnail_height"], - type=thumbnail_info["thumbnail_type"], - method=thumbnail_info["thumbnail_method"], - length=thumbnail_info["thumbnail_length"], - ), - ) - - # No matching thumbnail was found. - return None diff --git a/synapse/rest/media/v1/thumbnailer.py b/synapse/rest/media/v1/thumbnailer.py deleted file mode 100644 index f909a4fb9a..0000000000 --- a/synapse/rest/media/v1/thumbnailer.py +++ /dev/null @@ -1,221 +0,0 @@ -# Copyright 2014-2016 OpenMarket Ltd -# Copyright 2020-2021 The Matrix.org Foundation C.I.C. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. -import logging -from io import BytesIO -from types import TracebackType -from typing import Optional, Tuple, Type - -from PIL import Image - -logger = logging.getLogger(__name__) - -EXIF_ORIENTATION_TAG = 0x0112 -EXIF_TRANSPOSE_MAPPINGS = { - 2: Image.FLIP_LEFT_RIGHT, - 3: Image.ROTATE_180, - 4: Image.FLIP_TOP_BOTTOM, - 5: Image.TRANSPOSE, - 6: Image.ROTATE_270, - 7: Image.TRANSVERSE, - 8: Image.ROTATE_90, -} - - -class ThumbnailError(Exception): - """An error occurred generating a thumbnail.""" - - -class Thumbnailer: - FORMATS = {"image/jpeg": "JPEG", "image/png": "PNG"} - - @staticmethod - def set_limits(max_image_pixels: int) -> None: - Image.MAX_IMAGE_PIXELS = max_image_pixels - - def __init__(self, input_path: str): - # Have we closed the image? - self._closed = False - - try: - self.image = Image.open(input_path) - except OSError as e: - # If an error occurs opening the image, a thumbnail won't be able to - # be generated. - raise ThumbnailError from e - except Image.DecompressionBombError as e: - # If an image decompression bomb error occurs opening the image, - # then the image exceeds the pixel limit and a thumbnail won't - # be able to be generated. - raise ThumbnailError from e - - self.width, self.height = self.image.size - self.transpose_method = None - try: - # We don't use ImageOps.exif_transpose since it crashes with big EXIF - # - # Ignore safety: Pillow seems to acknowledge that this method is - # "private, experimental, but generally widely used". Pillow 6 - # includes a public getexif() method (no underscore) that we might - # consider using instead when we can bump that dependency. - # - # At the time of writing, Debian buster (currently oldstable) - # provides version 5.4.1. It's expected to EOL in mid-2022, see - # https://wiki.debian.org/DebianReleases#Production_Releases - image_exif = self.image._getexif() # type: ignore - if image_exif is not None: - image_orientation = image_exif.get(EXIF_ORIENTATION_TAG) - assert type(image_orientation) is int - self.transpose_method = EXIF_TRANSPOSE_MAPPINGS.get(image_orientation) - except Exception as e: - # A lot of parsing errors can happen when parsing EXIF - logger.info("Error parsing image EXIF information: %s", e) - - def transpose(self) -> Tuple[int, int]: - """Transpose the image using its EXIF Orientation tag - - Returns: - A tuple containing the new image size in pixels as (width, height). - """ - if self.transpose_method is not None: - # Safety: `transpose` takes an int rather than e.g. an IntEnum. - # self.transpose_method is set above to be a value in - # EXIF_TRANSPOSE_MAPPINGS, and that only contains correct values. - with self.image: - self.image = self.image.transpose(self.transpose_method) # type: ignore[arg-type] - self.width, self.height = self.image.size - self.transpose_method = None - # We don't need EXIF any more - self.image.info["exif"] = None - return self.image.size - - def aspect(self, max_width: int, max_height: int) -> Tuple[int, int]: - """Calculate the largest size that preserves aspect ratio which - fits within the given rectangle:: - - (w_in / h_in) = (w_out / h_out) - w_out = max(min(w_max, h_max * (w_in / h_in)), 1) - h_out = max(min(h_max, w_max * (h_in / w_in)), 1) - - Args: - max_width: The largest possible width. - max_height: The largest possible height. 
- """ - - if max_width * self.height < max_height * self.width: - return max_width, max((max_width * self.height) // self.width, 1) - else: - return max((max_height * self.width) // self.height, 1), max_height - - def _resize(self, width: int, height: int) -> Image.Image: - # 1-bit or 8-bit color palette images need converting to RGB - # otherwise they will be scaled using nearest neighbour which - # looks awful. - # - # If the image has transparency, use RGBA instead. - if self.image.mode in ["1", "L", "P"]: - if self.image.info.get("transparency", None) is not None: - with self.image: - self.image = self.image.convert("RGBA") - else: - with self.image: - self.image = self.image.convert("RGB") - return self.image.resize((width, height), Image.ANTIALIAS) - - def scale(self, width: int, height: int, output_type: str) -> BytesIO: - """Rescales the image to the given dimensions. - - Returns: - The bytes of the encoded image ready to be written to disk - """ - with self._resize(width, height) as scaled: - return self._encode_image(scaled, output_type) - - def crop(self, width: int, height: int, output_type: str) -> BytesIO: - """Rescales and crops the image to the given dimensions preserving - aspect:: - (w_in / h_in) = (w_scaled / h_scaled) - w_scaled = max(w_out, h_out * (w_in / h_in)) - h_scaled = max(h_out, w_out * (h_in / w_in)) - - Args: - max_width: The largest possible width. - max_height: The largest possible height. - - Returns: - The bytes of the encoded image ready to be written to disk - """ - if width * self.height > height * self.width: - scaled_width = width - scaled_height = (width * self.height) // self.width - crop_top = (scaled_height - height) // 2 - crop_bottom = height + crop_top - crop = (0, crop_top, width, crop_bottom) - else: - scaled_width = (height * self.width) // self.height - scaled_height = height - crop_left = (scaled_width - width) // 2 - crop_right = width + crop_left - crop = (crop_left, 0, crop_right, height) - - with self._resize(scaled_width, scaled_height) as scaled_image: - with scaled_image.crop(crop) as cropped: - return self._encode_image(cropped, output_type) - - def _encode_image(self, output_image: Image.Image, output_type: str) -> BytesIO: - output_bytes_io = BytesIO() - fmt = self.FORMATS[output_type] - if fmt == "JPEG": - output_image = output_image.convert("RGB") - output_image.save(output_bytes_io, fmt, quality=80) - return output_bytes_io - - def close(self) -> None: - """Closes the underlying image file. - - Once closed no other functions can be called. - - Can be called multiple times. - """ - - if self._closed: - return - - self._closed = True - - # Since we run this on the finalizer then we need to handle `__init__` - # raising an exception before it can define `self.image`. - image = getattr(self, "image", None) - if image is None: - return - - image.close() - - def __enter__(self) -> "Thumbnailer": - """Make `Thumbnailer` a context manager that calls `close` on - `__exit__`. - """ - return self - - def __exit__( - self, - type: Optional[Type[BaseException]], - value: Optional[BaseException], - traceback: Optional[TracebackType], - ) -> None: - self.close() - - def __del__(self) -> None: - # Make sure we actually do close the image, rather than leak data. 
- self.close() diff --git a/synapse/rest/media/v1/upload_resource.py b/synapse/rest/media/v1/upload_resource.py deleted file mode 100644 index 97548b54e5..0000000000 --- a/synapse/rest/media/v1/upload_resource.py +++ /dev/null @@ -1,108 +0,0 @@ -# Copyright 2014-2016 OpenMarket Ltd -# Copyright 2020-2021 The Matrix.org Foundation C.I.C. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import logging -from typing import IO, TYPE_CHECKING, Dict, List, Optional - -from synapse.api.errors import Codes, SynapseError -from synapse.http.server import DirectServeJsonResource, respond_with_json -from synapse.http.servlet import parse_bytes_from_args -from synapse.http.site import SynapseRequest -from synapse.rest.media.v1.media_storage import SpamMediaException - -if TYPE_CHECKING: - from synapse.rest.media.v1.media_repository import MediaRepository - from synapse.server import HomeServer - -logger = logging.getLogger(__name__) - - -class UploadResource(DirectServeJsonResource): - isLeaf = True - - def __init__(self, hs: "HomeServer", media_repo: "MediaRepository"): - super().__init__() - - self.media_repo = media_repo - self.filepaths = media_repo.filepaths - self.store = hs.get_datastores().main - self.clock = hs.get_clock() - self.server_name = hs.hostname - self.auth = hs.get_auth() - self.max_upload_size = hs.config.media.max_upload_size - self.clock = hs.get_clock() - - async def _async_render_OPTIONS(self, request: SynapseRequest) -> None: - respond_with_json(request, 200, {}, send_cors=True) - - async def _async_render_POST(self, request: SynapseRequest) -> None: - requester = await self.auth.get_user_by_req(request) - raw_content_length = request.getHeader("Content-Length") - if raw_content_length is None: - raise SynapseError(msg="Request must specify a Content-Length", code=400) - try: - content_length = int(raw_content_length) - except ValueError: - raise SynapseError(msg="Content-Length value is invalid", code=400) - if content_length > self.max_upload_size: - raise SynapseError( - msg="Upload request body is too large", - code=413, - errcode=Codes.TOO_LARGE, - ) - - args: Dict[bytes, List[bytes]] = request.args # type: ignore - upload_name_bytes = parse_bytes_from_args(args, "filename") - if upload_name_bytes: - try: - upload_name: Optional[str] = upload_name_bytes.decode("utf8") - except UnicodeDecodeError: - raise SynapseError( - msg="Invalid UTF-8 filename parameter: %r" % (upload_name_bytes,), - code=400, - ) - - # If the name is falsey (e.g. an empty byte string) ensure it is None. 
- else:
- upload_name = None
-
- headers = request.requestHeaders
-
- if headers.hasHeader(b"Content-Type"):
- content_type_headers = headers.getRawHeaders(b"Content-Type")
- assert content_type_headers # for mypy
- media_type = content_type_headers[0].decode("ascii")
- else:
- media_type = "application/octet-stream"
-
- # if headers.hasHeader(b"Content-Disposition"):
- # disposition = headers.getRawHeaders(b"Content-Disposition")[0]
- # TODO(markjh): parse content-disposition
-
- try:
- content: IO = request.content # type: ignore
- content_uri = await self.media_repo.create_content(
- media_type, upload_name, content, content_length, requester.user
- )
- except SpamMediaException:
- # For uploading of media we want to respond with a 400, instead of
- # the default 404, as that would just be confusing.
- raise SynapseError(400, "Bad content")
-
- logger.info("Uploaded content with URI '%s'", content_uri)
-
- respond_with_json(
- request, 200, {"content_uri": str(content_uri)}, send_cors=True
- )
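For reference, the resource above serves the Matrix media upload endpoint. A client exercises it roughly as follows; the homeserver URL, access token, and file name are placeholders, and the path prefix depends on the spec version in use (`/_matrix/media/v3/upload` on current spec versions, `/_matrix/media/r0/upload` on older ones):

import json
import urllib.request
from pathlib import Path

url = "https://matrix.example.com/_matrix/media/v3/upload?filename=cat.png"
req = urllib.request.Request(
    url,
    data=Path("cat.png").read_bytes(),
    method="POST",
    headers={
        "Authorization": "Bearer <access_token>",
        "Content-Type": "image/png",
        # Content-Length is derived from `data` automatically; the server
        # rejects requests without it (400) or over max_upload_size (413).
    },
)
with urllib.request.urlopen(req) as resp:
    # e.g. "mxc://matrix.example.com/<media_id>"
    print(json.load(resp)["content_uri"])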