From 4fc8875876374ec8f97a3b3cc344a4e3abcf769f Mon Sep 17 00:00:00 2001
From: Patrick Cloke
Date: Mon, 27 Feb 2023 08:26:05 -0500
Subject: Refactor media modules. (#15146)

* Removes the `v1` directory from `tests.rest.media.v1`.
* Moves the non-REST code from `synapse.rest.media.v1` to `synapse.media`.
* Flattens the `v1` directory from `synapse.rest.media`, but leaves
  compatibility with 3rd party media repositories and spam checkers.
---
 synapse/rest/media/config_resource.py           |   41 +
 synapse/rest/media/download_resource.py         |   75 ++
 synapse/rest/media/media_repository_resource.py |   93 ++
 synapse/rest/media/preview_url_resource.py      |  869 ++++++++++++++++++
 synapse/rest/media/thumbnail_resource.py        |  554 +++++++++++
 synapse/rest/media/upload_resource.py           |  108 +++
 synapse/rest/media/v1/_base.py                  |  470 +---------
 synapse/rest/media/v1/config_resource.py        |   41 -
 synapse/rest/media/v1/download_resource.py      |   76 --
 synapse/rest/media/v1/filepath.py               |  410 ---------
 synapse/rest/media/v1/media_repository.py       | 1112 -----------------------
 synapse/rest/media/v1/media_storage.py          |  365 +-------
 synapse/rest/media/v1/oembed.py                 |  265 ------
 synapse/rest/media/v1/preview_html.py           |  501 ----------
 synapse/rest/media/v1/preview_url_resource.py   |  871 ------------------
 synapse/rest/media/v1/storage_provider.py       |  172 +---
 synapse/rest/media/v1/thumbnail_resource.py     |  555 -----------
 synapse/rest/media/v1/thumbnailer.py            |  221 -----
 synapse/rest/media/v1/upload_resource.py        |  108 ---
 19 files changed, 1752 insertions(+), 5155 deletions(-)
 create mode 100644 synapse/rest/media/config_resource.py
 create mode 100644 synapse/rest/media/download_resource.py
 create mode 100644 synapse/rest/media/media_repository_resource.py
 create mode 100644 synapse/rest/media/preview_url_resource.py
 create mode 100644 synapse/rest/media/thumbnail_resource.py
 create mode 100644 synapse/rest/media/upload_resource.py
 delete mode 100644 synapse/rest/media/v1/config_resource.py
 delete mode 100644 synapse/rest/media/v1/download_resource.py
 delete mode 100644 synapse/rest/media/v1/filepath.py
 delete mode 100644 synapse/rest/media/v1/media_repository.py
 delete mode 100644 synapse/rest/media/v1/oembed.py
 delete mode 100644 synapse/rest/media/v1/preview_html.py
 delete mode 100644 synapse/rest/media/v1/preview_url_resource.py
 delete mode 100644 synapse/rest/media/v1/thumbnail_resource.py
 delete mode 100644 synapse/rest/media/v1/thumbnailer.py
 delete mode 100644 synapse/rest/media/v1/upload_resource.py
(limited to 'synapse/rest')

diff --git a/synapse/rest/media/config_resource.py b/synapse/rest/media/config_resource.py
new file mode 100644
index 0000000000..a95804d327
--- /dev/null
+++ b/synapse/rest/media/config_resource.py
@@ -0,0 +1,41 @@
+# Copyright 2018 Will Hunt
+# Copyright 2020-2021 The Matrix.org Foundation C.I.C.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# + +from typing import TYPE_CHECKING + +from synapse.http.server import DirectServeJsonResource, respond_with_json +from synapse.http.site import SynapseRequest + +if TYPE_CHECKING: + from synapse.server import HomeServer + + +class MediaConfigResource(DirectServeJsonResource): + isLeaf = True + + def __init__(self, hs: "HomeServer"): + super().__init__() + config = hs.config + self.clock = hs.get_clock() + self.auth = hs.get_auth() + self.limits_dict = {"m.upload.size": config.media.max_upload_size} + + async def _async_render_GET(self, request: SynapseRequest) -> None: + await self.auth.get_user_by_req(request) + respond_with_json(request, 200, self.limits_dict, send_cors=True) + + async def _async_render_OPTIONS(self, request: SynapseRequest) -> None: + respond_with_json(request, 200, {}, send_cors=True) diff --git a/synapse/rest/media/download_resource.py b/synapse/rest/media/download_resource.py new file mode 100644 index 0000000000..8f270cf4cc --- /dev/null +++ b/synapse/rest/media/download_resource.py @@ -0,0 +1,75 @@ +# Copyright 2014-2016 OpenMarket Ltd +# Copyright 2020-2021 The Matrix.org Foundation C.I.C. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import logging +from typing import TYPE_CHECKING + +from synapse.http.server import ( + DirectServeJsonResource, + set_corp_headers, + set_cors_headers, +) +from synapse.http.servlet import parse_boolean +from synapse.http.site import SynapseRequest +from synapse.media._base import parse_media_id, respond_404 + +if TYPE_CHECKING: + from synapse.media.media_repository import MediaRepository + from synapse.server import HomeServer + +logger = logging.getLogger(__name__) + + +class DownloadResource(DirectServeJsonResource): + isLeaf = True + + def __init__(self, hs: "HomeServer", media_repo: "MediaRepository"): + super().__init__() + self.media_repo = media_repo + self.server_name = hs.hostname + + async def _async_render_GET(self, request: SynapseRequest) -> None: + set_cors_headers(request) + set_corp_headers(request) + request.setHeader( + b"Content-Security-Policy", + b"sandbox;" + b" default-src 'none';" + b" script-src 'none';" + b" plugin-types application/pdf;" + b" style-src 'unsafe-inline';" + b" media-src 'self';" + b" object-src 'self';", + ) + # Limited non-standard form of CSP for IE11 + request.setHeader(b"X-Content-Security-Policy", b"sandbox;") + request.setHeader( + b"Referrer-Policy", + b"no-referrer", + ) + server_name, media_id, name = parse_media_id(request) + if server_name == self.server_name: + await self.media_repo.get_local_media(request, media_id, name) + else: + allow_remote = parse_boolean(request, "allow_remote", default=True) + if not allow_remote: + logger.info( + "Rejecting request for remote media %s/%s due to allow_remote", + server_name, + media_id, + ) + respond_404(request) + return + + await self.media_repo.get_remote_media(request, server_name, media_id, name) diff --git a/synapse/rest/media/media_repository_resource.py b/synapse/rest/media/media_repository_resource.py new file mode 100644 
index 0000000000..5ebaa3b032 --- /dev/null +++ b/synapse/rest/media/media_repository_resource.py @@ -0,0 +1,93 @@ +# Copyright 2014-2016 OpenMarket Ltd +# Copyright 2018-2021 The Matrix.org Foundation C.I.C. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from typing import TYPE_CHECKING + +from synapse.config._base import ConfigError +from synapse.http.server import UnrecognizedRequestResource + +from .config_resource import MediaConfigResource +from .download_resource import DownloadResource +from .preview_url_resource import PreviewUrlResource +from .thumbnail_resource import ThumbnailResource +from .upload_resource import UploadResource + +if TYPE_CHECKING: + from synapse.server import HomeServer + + +class MediaRepositoryResource(UnrecognizedRequestResource): + """File uploading and downloading. + + Uploads are POSTed to a resource which returns a token which is used to GET + the download:: + + => POST /_matrix/media/r0/upload HTTP/1.1 + Content-Type: + Content-Length: + + + + <= HTTP/1.1 200 OK + Content-Type: application/json + + { "content_uri": "mxc:///" } + + => GET /_matrix/media/r0/download// HTTP/1.1 + + <= HTTP/1.1 200 OK + Content-Type: + Content-Disposition: attachment;filename= + + + + Clients can get thumbnails by supplying a desired width and height and + thumbnailing method:: + + => GET /_matrix/media/r0/thumbnail/ + /?width=&height=&method= HTTP/1.1 + + <= HTTP/1.1 200 OK + Content-Type: image/jpeg or image/png + + + + The thumbnail methods are "crop" and "scale". "scale" tries to return an + image where either the width or the height is smaller than the requested + size. The client should then scale and letterbox the image if it needs to + fit within a given rectangle. "crop" tries to return an image where the + width and height are close to the requested size and the aspect matches + the requested size. The client should scale the image if it needs to fit + within a given rectangle. + """ + + def __init__(self, hs: "HomeServer"): + # If we're not configured to use it, raise if we somehow got here. + if not hs.config.media.can_load_media_repo: + raise ConfigError("Synapse is not configured to use a media repo.") + + super().__init__() + media_repo = hs.get_media_repository() + + self.putChild(b"upload", UploadResource(hs, media_repo)) + self.putChild(b"download", DownloadResource(hs, media_repo)) + self.putChild( + b"thumbnail", ThumbnailResource(hs, media_repo, media_repo.media_storage) + ) + if hs.config.media.url_preview_enabled: + self.putChild( + b"preview_url", + PreviewUrlResource(hs, media_repo, media_repo.media_storage), + ) + self.putChild(b"config", MediaConfigResource(hs)) diff --git a/synapse/rest/media/preview_url_resource.py b/synapse/rest/media/preview_url_resource.py new file mode 100644 index 0000000000..7ada728757 --- /dev/null +++ b/synapse/rest/media/preview_url_resource.py @@ -0,0 +1,869 @@ +# Copyright 2016 OpenMarket Ltd +# Copyright 2020-2021 The Matrix.org Foundation C.I.C. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import datetime +import errno +import fnmatch +import logging +import os +import re +import shutil +import sys +import traceback +from typing import TYPE_CHECKING, BinaryIO, Iterable, Optional, Tuple +from urllib.parse import urljoin, urlparse, urlsplit +from urllib.request import urlopen + +import attr + +from twisted.internet.defer import Deferred +from twisted.internet.error import DNSLookupError + +from synapse.api.errors import Codes, SynapseError +from synapse.http.client import SimpleHttpClient +from synapse.http.server import ( + DirectServeJsonResource, + respond_with_json, + respond_with_json_bytes, +) +from synapse.http.servlet import parse_integer, parse_string +from synapse.http.site import SynapseRequest +from synapse.logging.context import make_deferred_yieldable, run_in_background +from synapse.media._base import FileInfo, get_filename_from_headers +from synapse.media.media_storage import MediaStorage +from synapse.media.oembed import OEmbedProvider +from synapse.media.preview_html import decode_body, parse_html_to_open_graph +from synapse.metrics.background_process_metrics import run_as_background_process +from synapse.types import JsonDict, UserID +from synapse.util import json_encoder +from synapse.util.async_helpers import ObservableDeferred +from synapse.util.caches.expiringcache import ExpiringCache +from synapse.util.stringutils import random_string + +if TYPE_CHECKING: + from synapse.media.media_repository import MediaRepository + from synapse.server import HomeServer + +logger = logging.getLogger(__name__) + +OG_TAG_NAME_MAXLEN = 50 +OG_TAG_VALUE_MAXLEN = 1000 + +ONE_HOUR = 60 * 60 * 1000 +ONE_DAY = 24 * ONE_HOUR +IMAGE_CACHE_EXPIRY_MS = 2 * ONE_DAY + + +@attr.s(slots=True, frozen=True, auto_attribs=True) +class DownloadResult: + length: int + uri: str + response_code: int + media_type: str + download_name: Optional[str] + expires: int + etag: Optional[str] + + +@attr.s(slots=True, frozen=True, auto_attribs=True) +class MediaInfo: + """ + Information parsed from downloading media being previewed. + """ + + # The Content-Type header of the response. + media_type: str + # The length (in bytes) of the downloaded media. + media_length: int + # The media filename, according to the server. This is parsed from the + # returned headers, if possible. + download_name: Optional[str] + # The time of the preview. + created_ts_ms: int + # Information from the media storage provider about where the file is stored + # on disk. + filesystem_id: str + filename: str + # The URI being previewed. + uri: str + # The HTTP response code. + response_code: int + # The timestamp (in milliseconds) of when this preview expires. + expires: int + # The ETag header of the response. + etag: Optional[str] + + +class PreviewUrlResource(DirectServeJsonResource): + """ + The `GET /_matrix/media/r0/preview_url` endpoint provides a generic preview API + for URLs which outputs Open Graph (https://ogp.me/) responses (with some Matrix + specific additions). 
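    For example, a request and an (illustrative, abridged) response might look
    like the following; every value below is a placeholder, not output from a
    real server::

        => GET /_matrix/media/r0/preview_url?url=https%3A//example.com/ HTTP/1.1

        <= HTTP/1.1 200 OK
           Content-Type: application/json

           {
               "og:title": "Example Domain",
               "og:description": "An example page description",
               "og:image": "mxc://example.org/abcDEFghiJKL",
               "og:image:type": "image/png",
               "og:image:width": 640,
               "og:image:height": 480,
               "matrix:image:size": 12345
           }

    The exact keys returned depend on what can be extracted from the target page.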
+ + This does have trade-offs compared to other designs: + + * Pros: + * Simple and flexible; can be used by any clients at any point + * Cons: + * If each homeserver provides one of these independently, all the homeservers in a + room may needlessly DoS the target URI + * The URL metadata must be stored somewhere, rather than just using Matrix + itself to store the media. + * Matrix cannot be used to distribute the metadata between homeservers. + + When Synapse is asked to preview a URL it does the following: + + 1. Checks against a URL blacklist (defined as `url_preview_url_blacklist` in the + config). + 2. Checks the URL against an in-memory cache and returns the result if it exists. (This + is also used to de-duplicate processing of multiple in-flight requests at once.) + 3. Kicks off a background process to generate a preview: + 1. Checks URL and timestamp against the database cache and returns the result if it + has not expired and was successful (a 2xx return code). + 2. Checks if the URL matches an oEmbed (https://oembed.com/) pattern. If it + does, update the URL to download. + 3. Downloads the URL and stores it into a file via the media storage provider + and saves the local media metadata. + 4. If the media is an image: + 1. Generates thumbnails. + 2. Generates an Open Graph response based on image properties. + 5. If the media is HTML: + 1. Decodes the HTML via the stored file. + 2. Generates an Open Graph response from the HTML. + 3. If a JSON oEmbed URL was found in the HTML via autodiscovery: + 1. Downloads the URL and stores it into a file via the media storage provider + and saves the local media metadata. + 2. Convert the oEmbed response to an Open Graph response. + 3. Override any Open Graph data from the HTML with data from oEmbed. + 4. If an image exists in the Open Graph response: + 1. Downloads the URL and stores it into a file via the media storage + provider and saves the local media metadata. + 2. Generates thumbnails. + 3. Updates the Open Graph response based on image properties. + 6. If the media is JSON and an oEmbed URL was found: + 1. Convert the oEmbed response to an Open Graph response. + 2. If a thumbnail or image is in the oEmbed response: + 1. Downloads the URL and stores it into a file via the media storage + provider and saves the local media metadata. + 2. Generates thumbnails. + 3. Updates the Open Graph response based on image properties. + 7. Stores the result in the database cache. + 4. Returns the result. + + If any additional requests (e.g. from oEmbed autodiscovery, step 5.3 or + image thumbnailing, step 5.4 or 6.4) fails then the URL preview as a whole + does not fail. As much information as possible is returned. + + The in-memory cache expires after 1 hour. + + Expired entries in the database cache (and their associated media files) are + deleted every 10 seconds. The default expiration time is 1 hour from download. 
+ """ + + isLeaf = True + + def __init__( + self, + hs: "HomeServer", + media_repo: "MediaRepository", + media_storage: MediaStorage, + ): + super().__init__() + + self.auth = hs.get_auth() + self.clock = hs.get_clock() + self.filepaths = media_repo.filepaths + self.max_spider_size = hs.config.media.max_spider_size + self.server_name = hs.hostname + self.store = hs.get_datastores().main + self.client = SimpleHttpClient( + hs, + treq_args={"browser_like_redirects": True}, + ip_whitelist=hs.config.media.url_preview_ip_range_whitelist, + ip_blacklist=hs.config.media.url_preview_ip_range_blacklist, + use_proxy=True, + ) + self.media_repo = media_repo + self.primary_base_path = media_repo.primary_base_path + self.media_storage = media_storage + + self._oembed = OEmbedProvider(hs) + + # We run the background jobs if we're the instance specified (or no + # instance is specified, where we assume there is only one instance + # serving media). + instance_running_jobs = hs.config.media.media_instance_running_background_jobs + self._worker_run_media_background_jobs = ( + instance_running_jobs is None + or instance_running_jobs == hs.get_instance_name() + ) + + self.url_preview_url_blacklist = hs.config.media.url_preview_url_blacklist + self.url_preview_accept_language = hs.config.media.url_preview_accept_language + + # memory cache mapping urls to an ObservableDeferred returning + # JSON-encoded OG metadata + self._cache: ExpiringCache[str, ObservableDeferred] = ExpiringCache( + cache_name="url_previews", + clock=self.clock, + # don't spider URLs more often than once an hour + expiry_ms=ONE_HOUR, + ) + + if self._worker_run_media_background_jobs: + self._cleaner_loop = self.clock.looping_call( + self._start_expire_url_cache_data, 10 * 1000 + ) + + async def _async_render_OPTIONS(self, request: SynapseRequest) -> None: + request.setHeader(b"Allow", b"OPTIONS, GET") + respond_with_json(request, 200, {}, send_cors=True) + + async def _async_render_GET(self, request: SynapseRequest) -> None: + # XXX: if get_user_by_req fails, what should we do in an async render? + requester = await self.auth.get_user_by_req(request) + url = parse_string(request, "url", required=True) + ts = parse_integer(request, "ts") + if ts is None: + ts = self.clock.time_msec() + + # XXX: we could move this into _do_preview if we wanted. + url_tuple = urlsplit(url) + for entry in self.url_preview_url_blacklist: + match = True + for attrib in entry: + pattern = entry[attrib] + value = getattr(url_tuple, attrib) + logger.debug( + "Matching attrib '%s' with value '%s' against pattern '%s'", + attrib, + value, + pattern, + ) + + if value is None: + match = False + continue + + # Some attributes might not be parsed as strings by urlsplit (such as the + # port, which is parsed as an int). Because we use match functions that + # expect strings, we want to make sure that's what we give them. 
+ value_str = str(value) + + if pattern.startswith("^"): + if not re.match(pattern, value_str): + match = False + continue + else: + if not fnmatch.fnmatch(value_str, pattern): + match = False + continue + if match: + logger.warning("URL %s blocked by url_blacklist entry %s", url, entry) + raise SynapseError( + 403, "URL blocked by url pattern blacklist entry", Codes.UNKNOWN + ) + + # the in-memory cache: + # * ensures that only one request is active at a time + # * takes load off the DB for the thundering herds + # * also caches any failures (unlike the DB) so we don't keep + # requesting the same endpoint + + observable = self._cache.get(url) + + if not observable: + download = run_in_background(self._do_preview, url, requester.user, ts) + observable = ObservableDeferred(download, consumeErrors=True) + self._cache[url] = observable + else: + logger.info("Returning cached response") + + og = await make_deferred_yieldable(observable.observe()) + respond_with_json_bytes(request, 200, og, send_cors=True) + + async def _do_preview(self, url: str, user: UserID, ts: int) -> bytes: + """Check the db, and download the URL and build a preview + + Args: + url: The URL to preview. + user: The user requesting the preview. + ts: The timestamp requested for the preview. + + Returns: + json-encoded og data + """ + # check the URL cache in the DB (which will also provide us with + # historical previews, if we have any) + cache_result = await self.store.get_url_cache(url, ts) + if ( + cache_result + and cache_result["expires_ts"] > ts + and cache_result["response_code"] / 100 == 2 + ): + # It may be stored as text in the database, not as bytes (such as + # PostgreSQL). If so, encode it back before handing it on. + og = cache_result["og"] + if isinstance(og, str): + og = og.encode("utf8") + return og + + # If this URL can be accessed via oEmbed, use that instead. + url_to_download = url + oembed_url = self._oembed.get_oembed_url(url) + if oembed_url: + url_to_download = oembed_url + + media_info = await self._handle_url(url_to_download, user) + + logger.debug("got media_info of '%s'", media_info) + + # The number of milliseconds that the response should be considered valid. + expiration_ms = media_info.expires + author_name: Optional[str] = None + + if _is_media(media_info.media_type): + file_id = media_info.filesystem_id + dims = await self.media_repo._generate_thumbnails( + None, file_id, file_id, media_info.media_type, url_cache=True + ) + + og = { + "og:description": media_info.download_name, + "og:image": f"mxc://{self.server_name}/{media_info.filesystem_id}", + "og:image:type": media_info.media_type, + "matrix:image:size": media_info.media_length, + } + + if dims: + og["og:image:width"] = dims["width"] + og["og:image:height"] = dims["height"] + else: + logger.warning("Couldn't get dims for %s" % url) + + # define our OG response for this media + elif _is_html(media_info.media_type): + # TODO: somehow stop a big HTML tree from exploding synapse's RAM + + with open(media_info.filename, "rb") as file: + body = file.read() + + tree = decode_body(body, media_info.uri, media_info.media_type) + if tree is not None: + # Check if this HTML document points to oEmbed information and + # defer to that. + oembed_url = self._oembed.autodiscover_from_html(tree) + og_from_oembed: JsonDict = {} + if oembed_url: + try: + oembed_info = await self._handle_url( + oembed_url, user, allow_data_urls=True + ) + except Exception as e: + # Fetching the oEmbed info failed, don't block the entire URL preview. 
+ logger.warning( + "oEmbed fetch failed during URL preview: %s errored with %s", + oembed_url, + e, + ) + else: + ( + og_from_oembed, + author_name, + expiration_ms, + ) = await self._handle_oembed_response( + url, oembed_info, expiration_ms + ) + + # Parse Open Graph information from the HTML in case the oEmbed + # response failed or is incomplete. + og_from_html = parse_html_to_open_graph(tree) + + # Compile the Open Graph response by using the scraped + # information from the HTML and overlaying any information + # from the oEmbed response. + og = {**og_from_html, **og_from_oembed} + + await self._precache_image_url(user, media_info, og) + else: + og = {} + + elif oembed_url: + # Handle the oEmbed information. + og, author_name, expiration_ms = await self._handle_oembed_response( + url, media_info, expiration_ms + ) + await self._precache_image_url(user, media_info, og) + + else: + logger.warning("Failed to find any OG data in %s", url) + og = {} + + # If we don't have a title but we have author_name, copy it as + # title + if not og.get("og:title") and author_name: + og["og:title"] = author_name + + # filter out any stupidly long values + keys_to_remove = [] + for k, v in og.items(): + # values can be numeric as well as strings, hence the cast to str + if len(k) > OG_TAG_NAME_MAXLEN or len(str(v)) > OG_TAG_VALUE_MAXLEN: + logger.warning( + "Pruning overlong tag %s from OG data", k[:OG_TAG_NAME_MAXLEN] + ) + keys_to_remove.append(k) + for k in keys_to_remove: + del og[k] + + logger.debug("Calculated OG for %s as %s", url, og) + + jsonog = json_encoder.encode(og) + + # Cap the amount of time to consider a response valid. + expiration_ms = min(expiration_ms, ONE_DAY) + + # store OG in history-aware DB cache + await self.store.store_url_cache( + url, + media_info.response_code, + media_info.etag, + media_info.created_ts_ms + expiration_ms, + jsonog, + media_info.filesystem_id, + media_info.created_ts_ms, + ) + + return jsonog.encode("utf8") + + async def _download_url(self, url: str, output_stream: BinaryIO) -> DownloadResult: + """ + Fetches a remote URL and parses the headers. + + Args: + url: The URL to fetch. + output_stream: The stream to write the content to. + + Returns: + A tuple of: + Media length, URL downloaded, the HTTP response code, + the media type, the downloaded file name, the number of + milliseconds the result is valid for, the etag header. + """ + + try: + logger.debug("Trying to get preview for url '%s'", url) + length, headers, uri, code = await self.client.get_file( + url, + output_stream=output_stream, + max_size=self.max_spider_size, + headers={ + b"Accept-Language": self.url_preview_accept_language, + # Use a custom user agent for the preview because some sites will only return + # Open Graph metadata to crawler user agents. Omit the Synapse version + # string to avoid leaking information. + b"User-Agent": [ + "Synapse (bot; +https://github.com/matrix-org/synapse)" + ], + }, + is_allowed_content_type=_is_previewable, + ) + except SynapseError: + # Pass SynapseErrors through directly, so that the servlet + # handler will return a SynapseError to the client instead of + # blank data or a 500. 
+ raise + except DNSLookupError: + # DNS lookup returned no results + # Note: This will also be the case if one of the resolved IP + # addresses is blacklisted + raise SynapseError( + 502, + "DNS resolution failure during URL preview generation", + Codes.UNKNOWN, + ) + except Exception as e: + # FIXME: pass through 404s and other error messages nicely + logger.warning("Error downloading %s: %r", url, e) + + raise SynapseError( + 500, + "Failed to download content: %s" + % (traceback.format_exception_only(sys.exc_info()[0], e),), + Codes.UNKNOWN, + ) + + if b"Content-Type" in headers: + media_type = headers[b"Content-Type"][0].decode("ascii") + else: + media_type = "application/octet-stream" + + download_name = get_filename_from_headers(headers) + + # FIXME: we should calculate a proper expiration based on the + # Cache-Control and Expire headers. But for now, assume 1 hour. + expires = ONE_HOUR + etag = headers[b"ETag"][0].decode("ascii") if b"ETag" in headers else None + + return DownloadResult( + length, uri, code, media_type, download_name, expires, etag + ) + + async def _parse_data_url( + self, url: str, output_stream: BinaryIO + ) -> DownloadResult: + """ + Parses a data: URL. + + Args: + url: The URL to parse. + output_stream: The stream to write the content to. + + Returns: + A tuple of: + Media length, URL downloaded, the HTTP response code, + the media type, the downloaded file name, the number of + milliseconds the result is valid for, the etag header. + """ + + try: + logger.debug("Trying to parse data url '%s'", url) + with urlopen(url) as url_info: + # TODO Can this be more efficient. + output_stream.write(url_info.read()) + except Exception as e: + logger.warning("Error parsing data: URL %s: %r", url, e) + + raise SynapseError( + 500, + "Failed to parse data URL: %s" + % (traceback.format_exception_only(sys.exc_info()[0], e),), + Codes.UNKNOWN, + ) + + return DownloadResult( + # Read back the length that has been written. + length=output_stream.tell(), + uri=url, + # If it was parsed, consider this a 200 OK. + response_code=200, + # urlopen shoves the media-type from the data URL into the content type + # header object. + media_type=url_info.headers.get_content_type(), + # Some features are not supported by data: URLs. + download_name=None, + expires=ONE_HOUR, + etag=None, + ) + + async def _handle_url( + self, url: str, user: UserID, allow_data_urls: bool = False + ) -> MediaInfo: + """ + Fetches content from a URL and parses the result to generate a MediaInfo. + + It uses the media storage provider to persist the fetched content and + stores the mapping into the database. + + Args: + url: The URL to fetch. + user: The user who ahs requested this URL. + allow_data_urls: True if data URLs should be allowed. + + Returns: + A MediaInfo object describing the fetched content. + """ + + # TODO: we should probably honour robots.txt... except in practice + # we're most likely being explicitly triggered by a human rather than a + # bot, so are we really a robot? 
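        # The cache file_id generated below combines today's date with a random
        # suffix, e.g. "2023-02-27_AbCdEfGhIjKlMnOp" (an illustrative value, not
        # a real identifier).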
+ + file_id = datetime.date.today().isoformat() + "_" + random_string(16) + + file_info = FileInfo(server_name=None, file_id=file_id, url_cache=True) + + with self.media_storage.store_into_file(file_info) as (f, fname, finish): + if url.startswith("data:"): + if not allow_data_urls: + raise SynapseError( + 500, "Previewing of data: URLs is forbidden", Codes.UNKNOWN + ) + + download_result = await self._parse_data_url(url, f) + else: + download_result = await self._download_url(url, f) + + await finish() + + try: + time_now_ms = self.clock.time_msec() + + await self.store.store_local_media( + media_id=file_id, + media_type=download_result.media_type, + time_now_ms=time_now_ms, + upload_name=download_result.download_name, + media_length=download_result.length, + user_id=user, + url_cache=url, + ) + + except Exception as e: + logger.error("Error handling downloaded %s: %r", url, e) + # TODO: we really ought to delete the downloaded file in this + # case, since we won't have recorded it in the db, and will + # therefore not expire it. + raise + + return MediaInfo( + media_type=download_result.media_type, + media_length=download_result.length, + download_name=download_result.download_name, + created_ts_ms=time_now_ms, + filesystem_id=file_id, + filename=fname, + uri=download_result.uri, + response_code=download_result.response_code, + expires=download_result.expires, + etag=download_result.etag, + ) + + async def _precache_image_url( + self, user: UserID, media_info: MediaInfo, og: JsonDict + ) -> None: + """ + Pre-cache the image (if one exists) for posterity + + Args: + user: The user requesting the preview. + media_info: The media being previewed. + og: The Open Graph dictionary. This is modified with image information. + """ + # If there's no image or it is blank, there's nothing to do. + if "og:image" not in og: + return + + # Remove the raw image URL, this will be replaced with an MXC URL, if successful. + image_url = og.pop("og:image") + if not image_url: + return + + # The image URL from the HTML might be relative to the previewed page, + # convert it to an URL which can be requested directly. + url_parts = urlparse(image_url) + if url_parts.scheme != "data": + image_url = urljoin(media_info.uri, image_url) + + # FIXME: it might be cleaner to use the same flow as the main /preview_url + # request itself and benefit from the same caching etc. But for now we + # just rely on the caching on the master request to speed things up. + try: + image_info = await self._handle_url(image_url, user, allow_data_urls=True) + except Exception as e: + # Pre-caching the image failed, don't block the entire URL preview. + logger.warning( + "Pre-caching image failed during URL preview: %s errored with %s", + image_url, + e, + ) + return + + if _is_media(image_info.media_type): + # TODO: make sure we don't choke on white-on-transparent images + file_id = image_info.filesystem_id + dims = await self.media_repo._generate_thumbnails( + None, file_id, file_id, image_info.media_type, url_cache=True + ) + if dims: + og["og:image:width"] = dims["width"] + og["og:image:height"] = dims["height"] + else: + logger.warning("Couldn't get dims for %s", image_url) + + og["og:image"] = f"mxc://{self.server_name}/{image_info.filesystem_id}" + og["og:image:type"] = image_info.media_type + og["matrix:image:size"] = image_info.media_length + + async def _handle_oembed_response( + self, url: str, media_info: MediaInfo, expiration_ms: int + ) -> Tuple[JsonDict, Optional[str], int]: + """ + Parse the downloaded oEmbed info. 
+ + Args: + url: The URL which is being previewed (not the one which was + requested). + media_info: The media being previewed. + expiration_ms: The length of time, in milliseconds, the media is valid for. + + Returns: + A tuple of: + The Open Graph dictionary, if the oEmbed info can be parsed. + The author name if it could be retrieved from oEmbed. + The (possibly updated) length of time, in milliseconds, the media is valid for. + """ + # If JSON was not returned, there's nothing to do. + if not _is_json(media_info.media_type): + return {}, None, expiration_ms + + with open(media_info.filename, "rb") as file: + body = file.read() + + oembed_response = self._oembed.parse_oembed_response(url, body) + open_graph_result = oembed_response.open_graph_result + + # Use the cache age from the oEmbed result, if one was given. + if open_graph_result and oembed_response.cache_age is not None: + expiration_ms = oembed_response.cache_age + + return open_graph_result, oembed_response.author_name, expiration_ms + + def _start_expire_url_cache_data(self) -> Deferred: + return run_as_background_process( + "expire_url_cache_data", self._expire_url_cache_data + ) + + async def _expire_url_cache_data(self) -> None: + """Clean up expired url cache content, media and thumbnails.""" + + assert self._worker_run_media_background_jobs + + now = self.clock.time_msec() + + logger.debug("Running url preview cache expiry") + + def try_remove_parent_dirs(dirs: Iterable[str]) -> None: + """Attempt to remove the given chain of parent directories + + Args: + dirs: The list of directory paths to delete, with children appearing + before their parents. + """ + for dir in dirs: + try: + os.rmdir(dir) + except FileNotFoundError: + # Already deleted, continue with deleting the rest + pass + except OSError as e: + # Failed, skip deleting the rest of the parent dirs + if e.errno != errno.ENOTEMPTY: + logger.warning( + "Failed to remove media directory while clearing url preview cache: %r: %s", + dir, + e, + ) + break + + # First we delete expired url cache entries + media_ids = await self.store.get_expired_url_cache(now) + + removed_media = [] + for media_id in media_ids: + fname = self.filepaths.url_cache_filepath(media_id) + try: + os.remove(fname) + except FileNotFoundError: + pass # If the path doesn't exist, meh + except OSError as e: + logger.warning( + "Failed to remove media while clearing url preview cache: %r: %s", + media_id, + e, + ) + continue + + removed_media.append(media_id) + + dirs = self.filepaths.url_cache_filepath_dirs_to_delete(media_id) + try_remove_parent_dirs(dirs) + + await self.store.delete_url_cache(removed_media) + + if removed_media: + logger.debug( + "Deleted %d entries from url preview cache", len(removed_media) + ) + else: + logger.debug("No entries removed from url preview cache") + + # Now we delete old images associated with the url cache. + # These may be cached for a bit on the client (i.e., they + # may have a room open with a preview url thing open). + # So we wait a couple of days before deleting, just in case. 
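        # IMAGE_CACHE_EXPIRY_MS is defined near the top of this file as
        # 2 * ONE_DAY, so the cut-off computed below is two days in the past.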
+ expire_before = now - IMAGE_CACHE_EXPIRY_MS + media_ids = await self.store.get_url_cache_media_before(expire_before) + + removed_media = [] + for media_id in media_ids: + fname = self.filepaths.url_cache_filepath(media_id) + try: + os.remove(fname) + except FileNotFoundError: + pass # If the path doesn't exist, meh + except OSError as e: + logger.warning( + "Failed to remove media from url preview cache: %r: %s", media_id, e + ) + continue + + dirs = self.filepaths.url_cache_filepath_dirs_to_delete(media_id) + try_remove_parent_dirs(dirs) + + thumbnail_dir = self.filepaths.url_cache_thumbnail_directory(media_id) + try: + shutil.rmtree(thumbnail_dir) + except FileNotFoundError: + pass # If the path doesn't exist, meh + except OSError as e: + logger.warning( + "Failed to remove media from url preview cache: %r: %s", media_id, e + ) + continue + + removed_media.append(media_id) + + dirs = self.filepaths.url_cache_thumbnail_dirs_to_delete(media_id) + # Note that one of the directories to be deleted has already been + # removed by the `rmtree` above. + try_remove_parent_dirs(dirs) + + await self.store.delete_url_cache_media(removed_media) + + if removed_media: + logger.debug("Deleted %d media from url preview cache", len(removed_media)) + else: + logger.debug("No media removed from url preview cache") + + +def _is_media(content_type: str) -> bool: + return content_type.lower().startswith("image/") + + +def _is_html(content_type: str) -> bool: + content_type = content_type.lower() + return content_type.startswith("text/html") or content_type.startswith( + "application/xhtml" + ) + + +def _is_json(content_type: str) -> bool: + return content_type.lower().startswith("application/json") + + +def _is_previewable(content_type: str) -> bool: + """Returns True for content types for which we will perform URL preview and False + otherwise.""" + + return _is_html(content_type) or _is_media(content_type) or _is_json(content_type) diff --git a/synapse/rest/media/thumbnail_resource.py b/synapse/rest/media/thumbnail_resource.py new file mode 100644 index 0000000000..4ee2a0dbda --- /dev/null +++ b/synapse/rest/media/thumbnail_resource.py @@ -0,0 +1,554 @@ +# Copyright 2014-2016 OpenMarket Ltd +# Copyright 2020-2021 The Matrix.org Foundation C.I.C. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ + +import logging +from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple + +from synapse.api.errors import Codes, SynapseError, cs_error +from synapse.config.repository import THUMBNAIL_SUPPORTED_MEDIA_FORMAT_MAP +from synapse.http.server import ( + DirectServeJsonResource, + respond_with_json, + set_corp_headers, + set_cors_headers, +) +from synapse.http.servlet import parse_integer, parse_string +from synapse.http.site import SynapseRequest +from synapse.media._base import ( + FileInfo, + ThumbnailInfo, + parse_media_id, + respond_404, + respond_with_file, + respond_with_responder, +) +from synapse.media.media_storage import MediaStorage + +if TYPE_CHECKING: + from synapse.media.media_repository import MediaRepository + from synapse.server import HomeServer + +logger = logging.getLogger(__name__) + + +class ThumbnailResource(DirectServeJsonResource): + isLeaf = True + + def __init__( + self, + hs: "HomeServer", + media_repo: "MediaRepository", + media_storage: MediaStorage, + ): + super().__init__() + + self.store = hs.get_datastores().main + self.media_repo = media_repo + self.media_storage = media_storage + self.dynamic_thumbnails = hs.config.media.dynamic_thumbnails + self.server_name = hs.hostname + + async def _async_render_GET(self, request: SynapseRequest) -> None: + set_cors_headers(request) + set_corp_headers(request) + server_name, media_id, _ = parse_media_id(request) + width = parse_integer(request, "width", required=True) + height = parse_integer(request, "height", required=True) + method = parse_string(request, "method", "scale") + # TODO Parse the Accept header to get an prioritised list of thumbnail types. + m_type = "image/png" + + if server_name == self.server_name: + if self.dynamic_thumbnails: + await self._select_or_generate_local_thumbnail( + request, media_id, width, height, method, m_type + ) + else: + await self._respond_local_thumbnail( + request, media_id, width, height, method, m_type + ) + self.media_repo.mark_recently_accessed(None, media_id) + else: + if self.dynamic_thumbnails: + await self._select_or_generate_remote_thumbnail( + request, server_name, media_id, width, height, method, m_type + ) + else: + await self._respond_remote_thumbnail( + request, server_name, media_id, width, height, method, m_type + ) + self.media_repo.mark_recently_accessed(server_name, media_id) + + async def _respond_local_thumbnail( + self, + request: SynapseRequest, + media_id: str, + width: int, + height: int, + method: str, + m_type: str, + ) -> None: + media_info = await self.store.get_local_media(media_id) + + if not media_info: + respond_404(request) + return + if media_info["quarantined_by"]: + logger.info("Media is quarantined") + respond_404(request) + return + + thumbnail_infos = await self.store.get_local_media_thumbnails(media_id) + await self._select_and_respond_with_thumbnail( + request, + width, + height, + method, + m_type, + thumbnail_infos, + media_id, + media_id, + url_cache=bool(media_info["url_cache"]), + server_name=None, + ) + + async def _select_or_generate_local_thumbnail( + self, + request: SynapseRequest, + media_id: str, + desired_width: int, + desired_height: int, + desired_method: str, + desired_type: str, + ) -> None: + media_info = await self.store.get_local_media(media_id) + + if not media_info: + respond_404(request) + return + if media_info["quarantined_by"]: + logger.info("Media is quarantined") + respond_404(request) + return + + thumbnail_infos = await self.store.get_local_media_thumbnails(media_id) + for info in 
thumbnail_infos: + t_w = info["thumbnail_width"] == desired_width + t_h = info["thumbnail_height"] == desired_height + t_method = info["thumbnail_method"] == desired_method + t_type = info["thumbnail_type"] == desired_type + + if t_w and t_h and t_method and t_type: + file_info = FileInfo( + server_name=None, + file_id=media_id, + url_cache=media_info["url_cache"], + thumbnail=ThumbnailInfo( + width=info["thumbnail_width"], + height=info["thumbnail_height"], + type=info["thumbnail_type"], + method=info["thumbnail_method"], + ), + ) + + t_type = file_info.thumbnail_type + t_length = info["thumbnail_length"] + + responder = await self.media_storage.fetch_media(file_info) + if responder: + await respond_with_responder(request, responder, t_type, t_length) + return + + logger.debug("We don't have a thumbnail of that size. Generating") + + # Okay, so we generate one. + file_path = await self.media_repo.generate_local_exact_thumbnail( + media_id, + desired_width, + desired_height, + desired_method, + desired_type, + url_cache=bool(media_info["url_cache"]), + ) + + if file_path: + await respond_with_file(request, desired_type, file_path) + else: + logger.warning("Failed to generate thumbnail") + raise SynapseError(400, "Failed to generate thumbnail.") + + async def _select_or_generate_remote_thumbnail( + self, + request: SynapseRequest, + server_name: str, + media_id: str, + desired_width: int, + desired_height: int, + desired_method: str, + desired_type: str, + ) -> None: + media_info = await self.media_repo.get_remote_media_info(server_name, media_id) + + thumbnail_infos = await self.store.get_remote_media_thumbnails( + server_name, media_id + ) + + file_id = media_info["filesystem_id"] + + for info in thumbnail_infos: + t_w = info["thumbnail_width"] == desired_width + t_h = info["thumbnail_height"] == desired_height + t_method = info["thumbnail_method"] == desired_method + t_type = info["thumbnail_type"] == desired_type + + if t_w and t_h and t_method and t_type: + file_info = FileInfo( + server_name=server_name, + file_id=media_info["filesystem_id"], + thumbnail=ThumbnailInfo( + width=info["thumbnail_width"], + height=info["thumbnail_height"], + type=info["thumbnail_type"], + method=info["thumbnail_method"], + ), + ) + + t_type = file_info.thumbnail_type + t_length = info["thumbnail_length"] + + responder = await self.media_storage.fetch_media(file_info) + if responder: + await respond_with_responder(request, responder, t_type, t_length) + return + + logger.debug("We don't have a thumbnail of that size. Generating") + + # Okay, so we generate one. + file_path = await self.media_repo.generate_remote_exact_thumbnail( + server_name, + file_id, + media_id, + desired_width, + desired_height, + desired_method, + desired_type, + ) + + if file_path: + await respond_with_file(request, desired_type, file_path) + else: + logger.warning("Failed to generate thumbnail") + raise SynapseError(400, "Failed to generate thumbnail.") + + async def _respond_remote_thumbnail( + self, + request: SynapseRequest, + server_name: str, + media_id: str, + width: int, + height: int, + method: str, + m_type: str, + ) -> None: + # TODO: Don't download the whole remote file + # We should proxy the thumbnail from the remote server instead of + # downloading the remote file and generating our own thumbnails. 
+ media_info = await self.media_repo.get_remote_media_info(server_name, media_id) + + thumbnail_infos = await self.store.get_remote_media_thumbnails( + server_name, media_id + ) + await self._select_and_respond_with_thumbnail( + request, + width, + height, + method, + m_type, + thumbnail_infos, + media_id, + media_info["filesystem_id"], + url_cache=False, + server_name=server_name, + ) + + async def _select_and_respond_with_thumbnail( + self, + request: SynapseRequest, + desired_width: int, + desired_height: int, + desired_method: str, + desired_type: str, + thumbnail_infos: List[Dict[str, Any]], + media_id: str, + file_id: str, + url_cache: bool, + server_name: Optional[str] = None, + ) -> None: + """ + Respond to a request with an appropriate thumbnail from the previously generated thumbnails. + + Args: + request: The incoming request. + desired_width: The desired width, the returned thumbnail may be larger than this. + desired_height: The desired height, the returned thumbnail may be larger than this. + desired_method: The desired method used to generate the thumbnail. + desired_type: The desired content-type of the thumbnail. + thumbnail_infos: A list of dictionaries of candidate thumbnails. + file_id: The ID of the media that a thumbnail is being requested for. + url_cache: True if this is from a URL cache. + server_name: The server name, if this is a remote thumbnail. + """ + logger.debug( + "_select_and_respond_with_thumbnail: media_id=%s desired=%sx%s (%s) thumbnail_infos=%s", + media_id, + desired_width, + desired_height, + desired_method, + thumbnail_infos, + ) + + # If `dynamic_thumbnails` is enabled, we expect Synapse to go down a + # different code path to handle it. + assert not self.dynamic_thumbnails + + if thumbnail_infos: + file_info = self._select_thumbnail( + desired_width, + desired_height, + desired_method, + desired_type, + thumbnail_infos, + file_id, + url_cache, + server_name, + ) + if not file_info: + logger.info("Couldn't find a thumbnail matching the desired inputs") + respond_404(request) + return + + # The thumbnail property must exist. + assert file_info.thumbnail is not None + + responder = await self.media_storage.fetch_media(file_info) + if responder: + await respond_with_responder( + request, + responder, + file_info.thumbnail.type, + file_info.thumbnail.length, + ) + return + + # If we can't find the thumbnail we regenerate it. This can happen + # if e.g. we've deleted the thumbnails but still have the original + # image somewhere. + # + # Since we have an entry for the thumbnail in the DB we a) know we + # have have successfully generated the thumbnail in the past (so we + # don't need to worry about repeatedly failing to generate + # thumbnails), and b) have already calculated that appropriate + # width/height/method so we can just call the "generate exact" + # methods. + + # First let's check that we do actually have the original image + # still. This will throw a 404 if we don't. + # TODO: We should refetch the thumbnails for remote media. 
+ await self.media_storage.ensure_media_is_in_local_cache( + FileInfo(server_name, file_id, url_cache=url_cache) + ) + + if server_name: + await self.media_repo.generate_remote_exact_thumbnail( + server_name, + file_id=file_id, + media_id=media_id, + t_width=file_info.thumbnail.width, + t_height=file_info.thumbnail.height, + t_method=file_info.thumbnail.method, + t_type=file_info.thumbnail.type, + ) + else: + await self.media_repo.generate_local_exact_thumbnail( + media_id=media_id, + t_width=file_info.thumbnail.width, + t_height=file_info.thumbnail.height, + t_method=file_info.thumbnail.method, + t_type=file_info.thumbnail.type, + url_cache=url_cache, + ) + + responder = await self.media_storage.fetch_media(file_info) + await respond_with_responder( + request, + responder, + file_info.thumbnail.type, + file_info.thumbnail.length, + ) + else: + # This might be because: + # 1. We can't create thumbnails for the given media (corrupted or + # unsupported file type), or + # 2. The thumbnailing process never ran or errored out initially + # when the media was first uploaded (these bugs should be + # reported and fixed). + # Note that we don't attempt to generate a thumbnail now because + # `dynamic_thumbnails` is disabled. + logger.info("Failed to find any generated thumbnails") + + respond_with_json( + request, + 400, + cs_error( + "Cannot find any thumbnails for the requested media (%r). This might mean the media is not a supported_media_format=(%s) or that thumbnailing failed for some other reason. (Dynamic thumbnails are disabled on this server.)" + % ( + request.postpath, + ", ".join(THUMBNAIL_SUPPORTED_MEDIA_FORMAT_MAP.keys()), + ), + code=Codes.UNKNOWN, + ), + send_cors=True, + ) + + def _select_thumbnail( + self, + desired_width: int, + desired_height: int, + desired_method: str, + desired_type: str, + thumbnail_infos: List[Dict[str, Any]], + file_id: str, + url_cache: bool, + server_name: Optional[str], + ) -> Optional[FileInfo]: + """ + Choose an appropriate thumbnail from the previously generated thumbnails. + + Args: + desired_width: The desired width, the returned thumbnail may be larger than this. + desired_height: The desired height, the returned thumbnail may be larger than this. + desired_method: The desired method used to generate the thumbnail. + desired_type: The desired content-type of the thumbnail. + thumbnail_infos: A list of dictionaries of candidate thumbnails. + file_id: The ID of the media that a thumbnail is being requested for. + url_cache: True if this is from a URL cache. + server_name: The server name, if this is a remote thumbnail. + + Returns: + The thumbnail which best matches the desired parameters. + """ + desired_method = desired_method.lower() + + # The chosen thumbnail. + thumbnail_info = None + + d_w = desired_width + d_h = desired_height + + if desired_method == "crop": + # Thumbnails that match equal or larger sizes of desired width/height. + crop_info_list: List[Tuple[int, int, int, bool, int, Dict[str, Any]]] = [] + # Other thumbnails. + crop_info_list2: List[Tuple[int, int, int, bool, int, Dict[str, Any]]] = [] + for info in thumbnail_infos: + # Skip thumbnails generated with different methods. 
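                # As a sketch of the scoring below (hypothetical sizes): for a
                # desired 96x96 crop, a 320x240 candidate satisfies
                # `t_w >= d_w or t_h >= d_h` and goes into crop_info_list, while
                # a 32x32 candidate goes into crop_info_list2, so candidates at
                # least as large as the request in some dimension are preferred;
                # within each list, min() over the score tuples (ignoring the
                # trailing info dict) picks the winner.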
+ if info["thumbnail_method"] != "crop": + continue + + t_w = info["thumbnail_width"] + t_h = info["thumbnail_height"] + aspect_quality = abs(d_w * t_h - d_h * t_w) + min_quality = 0 if d_w <= t_w and d_h <= t_h else 1 + size_quality = abs((d_w - t_w) * (d_h - t_h)) + type_quality = desired_type != info["thumbnail_type"] + length_quality = info["thumbnail_length"] + if t_w >= d_w or t_h >= d_h: + crop_info_list.append( + ( + aspect_quality, + min_quality, + size_quality, + type_quality, + length_quality, + info, + ) + ) + else: + crop_info_list2.append( + ( + aspect_quality, + min_quality, + size_quality, + type_quality, + length_quality, + info, + ) + ) + # Pick the most appropriate thumbnail. Some values of `desired_width` and + # `desired_height` may result in a tie, in which case we avoid comparing on + # the thumbnail info dictionary and pick the thumbnail that appears earlier + # in the list of candidates. + if crop_info_list: + thumbnail_info = min(crop_info_list, key=lambda t: t[:-1])[-1] + elif crop_info_list2: + thumbnail_info = min(crop_info_list2, key=lambda t: t[:-1])[-1] + elif desired_method == "scale": + # Thumbnails that match equal or larger sizes of desired width/height. + info_list: List[Tuple[int, bool, int, Dict[str, Any]]] = [] + # Other thumbnails. + info_list2: List[Tuple[int, bool, int, Dict[str, Any]]] = [] + + for info in thumbnail_infos: + # Skip thumbnails generated with different methods. + if info["thumbnail_method"] != "scale": + continue + + t_w = info["thumbnail_width"] + t_h = info["thumbnail_height"] + size_quality = abs((d_w - t_w) * (d_h - t_h)) + type_quality = desired_type != info["thumbnail_type"] + length_quality = info["thumbnail_length"] + if t_w >= d_w or t_h >= d_h: + info_list.append((size_quality, type_quality, length_quality, info)) + else: + info_list2.append( + (size_quality, type_quality, length_quality, info) + ) + # Pick the most appropriate thumbnail. Some values of `desired_width` and + # `desired_height` may result in a tie, in which case we avoid comparing on + # the thumbnail info dictionary and pick the thumbnail that appears earlier + # in the list of candidates. + if info_list: + thumbnail_info = min(info_list, key=lambda t: t[:-1])[-1] + elif info_list2: + thumbnail_info = min(info_list2, key=lambda t: t[:-1])[-1] + + if thumbnail_info: + return FileInfo( + file_id=file_id, + url_cache=url_cache, + server_name=server_name, + thumbnail=ThumbnailInfo( + width=thumbnail_info["thumbnail_width"], + height=thumbnail_info["thumbnail_height"], + type=thumbnail_info["thumbnail_type"], + method=thumbnail_info["thumbnail_method"], + length=thumbnail_info["thumbnail_length"], + ), + ) + + # No matching thumbnail was found. + return None diff --git a/synapse/rest/media/upload_resource.py b/synapse/rest/media/upload_resource.py new file mode 100644 index 0000000000..697348613b --- /dev/null +++ b/synapse/rest/media/upload_resource.py @@ -0,0 +1,108 @@ +# Copyright 2014-2016 OpenMarket Ltd +# Copyright 2020-2021 The Matrix.org Foundation C.I.C. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +import logging +from typing import IO, TYPE_CHECKING, Dict, List, Optional + +from synapse.api.errors import Codes, SynapseError +from synapse.http.server import DirectServeJsonResource, respond_with_json +from synapse.http.servlet import parse_bytes_from_args +from synapse.http.site import SynapseRequest +from synapse.media.media_storage import SpamMediaException + +if TYPE_CHECKING: + from synapse.media.media_repository import MediaRepository + from synapse.server import HomeServer + +logger = logging.getLogger(__name__) + + +class UploadResource(DirectServeJsonResource): + isLeaf = True + + def __init__(self, hs: "HomeServer", media_repo: "MediaRepository"): + super().__init__() + + self.media_repo = media_repo + self.filepaths = media_repo.filepaths + self.store = hs.get_datastores().main + self.clock = hs.get_clock() + self.server_name = hs.hostname + self.auth = hs.get_auth() + self.max_upload_size = hs.config.media.max_upload_size + self.clock = hs.get_clock() + + async def _async_render_OPTIONS(self, request: SynapseRequest) -> None: + respond_with_json(request, 200, {}, send_cors=True) + + async def _async_render_POST(self, request: SynapseRequest) -> None: + requester = await self.auth.get_user_by_req(request) + raw_content_length = request.getHeader("Content-Length") + if raw_content_length is None: + raise SynapseError(msg="Request must specify a Content-Length", code=400) + try: + content_length = int(raw_content_length) + except ValueError: + raise SynapseError(msg="Content-Length value is invalid", code=400) + if content_length > self.max_upload_size: + raise SynapseError( + msg="Upload request body is too large", + code=413, + errcode=Codes.TOO_LARGE, + ) + + args: Dict[bytes, List[bytes]] = request.args # type: ignore + upload_name_bytes = parse_bytes_from_args(args, "filename") + if upload_name_bytes: + try: + upload_name: Optional[str] = upload_name_bytes.decode("utf8") + except UnicodeDecodeError: + raise SynapseError( + msg="Invalid UTF-8 filename parameter: %r" % (upload_name_bytes,), + code=400, + ) + + # If the name is falsey (e.g. an empty byte string) ensure it is None. + else: + upload_name = None + + headers = request.requestHeaders + + if headers.hasHeader(b"Content-Type"): + content_type_headers = headers.getRawHeaders(b"Content-Type") + assert content_type_headers # for mypy + media_type = content_type_headers[0].decode("ascii") + else: + media_type = "application/octet-stream" + + # if headers.hasHeader(b"Content-Disposition"): + # disposition = headers.getRawHeaders(b"Content-Disposition")[0] + # TODO(markjh): parse content-dispostion + + try: + content: IO = request.content # type: ignore + content_uri = await self.media_repo.create_content( + media_type, upload_name, content, content_length, requester.user + ) + except SpamMediaException: + # For uploading of media we want to respond with a 400, instead of + # the default 404, as that would just be confusing. + raise SynapseError(400, "Bad content") + + logger.info("Uploaded content with URI '%s'", content_uri) + + respond_with_json( + request, 200, {"content_uri": str(content_uri)}, send_cors=True + ) diff --git a/synapse/rest/media/v1/_base.py b/synapse/rest/media/v1/_base.py index ef8334ae25..88427a5737 100644 --- a/synapse/rest/media/v1/_base.py +++ b/synapse/rest/media/v1/_base.py @@ -1,5 +1,4 @@ -# Copyright 2014-2016 OpenMarket Ltd -# Copyright 2019-2021 The Matrix.org Foundation C.I.C. 
+# Copyright 2023 The Matrix.org Foundation C.I.C. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -12,468 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +# -import logging -import os -import urllib -from abc import ABC, abstractmethod -from types import TracebackType -from typing import Awaitable, Dict, Generator, List, Optional, Tuple, Type - -import attr - -from twisted.internet.interfaces import IConsumer -from twisted.protocols.basic import FileSender -from twisted.web.server import Request - -from synapse.api.errors import Codes, SynapseError, cs_error -from synapse.http.server import finish_request, respond_with_json -from synapse.http.site import SynapseRequest -from synapse.logging.context import make_deferred_yieldable -from synapse.util.stringutils import is_ascii, parse_and_validate_server_name - -logger = logging.getLogger(__name__) - -# list all text content types that will have the charset default to UTF-8 when -# none is given -TEXT_CONTENT_TYPES = [ - "text/css", - "text/csv", - "text/html", - "text/calendar", - "text/plain", - "text/javascript", - "application/json", - "application/ld+json", - "application/rtf", - "image/svg+xml", - "text/xml", -] - - -def parse_media_id(request: Request) -> Tuple[str, str, Optional[str]]: - """Parses the server name, media ID and optional file name from the request URI - - Also performs some rough validation on the server name. - - Args: - request: The `Request`. - - Returns: - A tuple containing the parsed server name, media ID and optional file name. - - Raises: - SynapseError(404): if parsing or validation fail for any reason - """ - try: - # The type on postpath seems incorrect in Twisted 21.2.0. - postpath: List[bytes] = request.postpath # type: ignore - assert postpath - - # This allows users to append e.g. /test.png to the URL. Useful for - # clients that parse the URL to see content type. 
- server_name_bytes, media_id_bytes = postpath[:2] - server_name = server_name_bytes.decode("utf-8") - media_id = media_id_bytes.decode("utf8") - - # Validate the server name, raising if invalid - parse_and_validate_server_name(server_name) - - file_name = None - if len(postpath) > 2: - try: - file_name = urllib.parse.unquote(postpath[-1].decode("utf-8")) - except UnicodeDecodeError: - pass - return server_name, media_id, file_name - except Exception: - raise SynapseError( - 404, "Invalid media id token %r" % (request.postpath,), Codes.UNKNOWN - ) - - -def respond_404(request: SynapseRequest) -> None: - respond_with_json( - request, - 404, - cs_error("Not found %r" % (request.postpath,), code=Codes.NOT_FOUND), - send_cors=True, - ) - - -async def respond_with_file( - request: SynapseRequest, - media_type: str, - file_path: str, - file_size: Optional[int] = None, - upload_name: Optional[str] = None, -) -> None: - logger.debug("Responding with %r", file_path) - - if os.path.isfile(file_path): - if file_size is None: - stat = os.stat(file_path) - file_size = stat.st_size - - add_file_headers(request, media_type, file_size, upload_name) - - with open(file_path, "rb") as f: - await make_deferred_yieldable(FileSender().beginFileTransfer(f, request)) - - finish_request(request) - else: - respond_404(request) - - -def add_file_headers( - request: Request, - media_type: str, - file_size: Optional[int], - upload_name: Optional[str], -) -> None: - """Adds the correct response headers in preparation for responding with the - media. - - Args: - request - media_type: The media/content type. - file_size: Size in bytes of the media, if known. - upload_name: The name of the requested file, if any. - """ - - def _quote(x: str) -> str: - return urllib.parse.quote(x.encode("utf-8")) - - # Default to a UTF-8 charset for text content types. - # ex, uses UTF-8 for 'text/css' but not 'text/css; charset=UTF-16' - if media_type.lower() in TEXT_CONTENT_TYPES: - content_type = media_type + "; charset=UTF-8" - else: - content_type = media_type - - request.setHeader(b"Content-Type", content_type.encode("UTF-8")) - if upload_name: - # RFC6266 section 4.1 [1] defines both `filename` and `filename*`. - # - # `filename` is defined to be a `value`, which is defined by RFC2616 - # section 3.6 [2] to be a `token` or a `quoted-string`, where a `token` - # is (essentially) a single US-ASCII word, and a `quoted-string` is a - # US-ASCII string surrounded by double-quotes, using backslash as an - # escape character. Note that %-encoding is *not* permitted. - # - # `filename*` is defined to be an `ext-value`, which is defined in - # RFC5987 section 3.2.1 [3] to be `charset "'" [ language ] "'" value-chars`, - # where `value-chars` is essentially a %-encoded string in the given charset. - # - # [1]: https://tools.ietf.org/html/rfc6266#section-4.1 - # [2]: https://tools.ietf.org/html/rfc2616#section-3.6 - # [3]: https://tools.ietf.org/html/rfc5987#section-3.2.1 - - # We avoid the quoted-string version of `filename`, because (a) synapse didn't - # correctly interpret those as of 0.99.2 and (b) they are a bit of a pain and we - # may as well just do the filename* version. - if _can_encode_filename_as_token(upload_name): - disposition = "inline; filename=%s" % (upload_name,) - else: - disposition = "inline; filename*=utf-8''%s" % (_quote(upload_name),) - - request.setHeader(b"Content-Disposition", disposition.encode("ascii")) - - # cache for at least a day. 
- # XXX: we might want to turn this off for data we don't want to - # recommend caching as it's sensitive or private - or at least - # select private. don't bother setting Expires as all our - # clients are smart enough to be happy with Cache-Control - request.setHeader(b"Cache-Control", b"public,max-age=86400,s-maxage=86400") - if file_size is not None: - request.setHeader(b"Content-Length", b"%d" % (file_size,)) - - # Tell web crawlers to not index, archive, or follow links in media. This - # should help to prevent things in the media repo from showing up in web - # search results. - request.setHeader(b"X-Robots-Tag", "noindex, nofollow, noarchive, noimageindex") - - -# separators as defined in RFC2616. SP and HT are handled separately. -# see _can_encode_filename_as_token. -_FILENAME_SEPARATOR_CHARS = { - "(", - ")", - "<", - ">", - "@", - ",", - ";", - ":", - "\\", - '"', - "/", - "[", - "]", - "?", - "=", - "{", - "}", -} - - -def _can_encode_filename_as_token(x: str) -> bool: - for c in x: - # from RFC2616: - # - # token = 1* - # - # separators = "(" | ")" | "<" | ">" | "@" - # | "," | ";" | ":" | "\" | <"> - # | "/" | "[" | "]" | "?" | "=" - # | "{" | "}" | SP | HT - # - # CHAR = - # - # CTL = - # - if ord(c) >= 127 or ord(c) <= 32 or c in _FILENAME_SEPARATOR_CHARS: - return False - return True - - -async def respond_with_responder( - request: SynapseRequest, - responder: "Optional[Responder]", - media_type: str, - file_size: Optional[int], - upload_name: Optional[str] = None, -) -> None: - """Responds to the request with given responder. If responder is None then - returns 404. - - Args: - request - responder - media_type: The media/content type. - file_size: Size in bytes of the media. If not known it should be None - upload_name: The name of the requested file, if any. - """ - if not responder: - respond_404(request) - return - - # If we have a responder we *must* use it as a context manager. - with responder: - if request._disconnected: - logger.warning( - "Not sending response to request %s, already disconnected.", request - ) - return - - logger.debug("Responding to media request with responder %s", responder) - add_file_headers(request, media_type, file_size, upload_name) - try: - await responder.write_to_consumer(request) - except Exception as e: - # The majority of the time this will be due to the client having gone - # away. Unfortunately, Twisted simply throws a generic exception at us - # in that case. - logger.warning("Failed to write to consumer: %s %s", type(e), e) - - # Unregister the producer, if it has one, so Twisted doesn't complain - if request.producer: - request.unregisterProducer() - - finish_request(request) - - -class Responder(ABC): - """Represents a response that can be streamed to the requester. - - Responder is a context manager which *must* be used, so that any resources - held can be cleaned up. - """ - - @abstractmethod - def write_to_consumer(self, consumer: IConsumer) -> Awaitable: - """Stream response into consumer - - Args: - consumer: The consumer to stream into. 
- - Returns: - Resolves once the response has finished being written - """ - raise NotImplementedError() - - def __enter__(self) -> None: # noqa: B027 - pass - - def __exit__( # noqa: B027 - self, - exc_type: Optional[Type[BaseException]], - exc_val: Optional[BaseException], - exc_tb: Optional[TracebackType], - ) -> None: - pass - - -@attr.s(slots=True, frozen=True, auto_attribs=True) -class ThumbnailInfo: - """Details about a generated thumbnail.""" - - width: int - height: int - method: str - # Content type of thumbnail, e.g. image/png - type: str - # The size of the media file, in bytes. - length: Optional[int] = None - - -@attr.s(slots=True, frozen=True, auto_attribs=True) -class FileInfo: - """Details about a requested/uploaded file.""" - - # The server name where the media originated from, or None if local. - server_name: Optional[str] - # The local ID of the file. For local files this is the same as the media_id - file_id: str - # If the file is for the url preview cache - url_cache: bool = False - # Whether the file is a thumbnail or not. - thumbnail: Optional[ThumbnailInfo] = None - - # The below properties exist to maintain compatibility with third-party modules. - @property - def thumbnail_width(self) -> Optional[int]: - if not self.thumbnail: - return None - return self.thumbnail.width - - @property - def thumbnail_height(self) -> Optional[int]: - if not self.thumbnail: - return None - return self.thumbnail.height - - @property - def thumbnail_method(self) -> Optional[str]: - if not self.thumbnail: - return None - return self.thumbnail.method - - @property - def thumbnail_type(self) -> Optional[str]: - if not self.thumbnail: - return None - return self.thumbnail.type - - @property - def thumbnail_length(self) -> Optional[int]: - if not self.thumbnail: - return None - return self.thumbnail.length - - -def get_filename_from_headers(headers: Dict[bytes, List[bytes]]) -> Optional[str]: - """ - Get the filename of the downloaded file by inspecting the - Content-Disposition HTTP header. - - Args: - headers: The HTTP request headers. - - Returns: - The filename, or None. - """ - content_disposition = headers.get(b"Content-Disposition", [b""]) - - # No header, bail out. - if not content_disposition[0]: - return None - - _, params = _parse_header(content_disposition[0]) - - upload_name = None - - # First check if there is a valid UTF-8 filename - upload_name_utf8 = params.get(b"filename*", None) - if upload_name_utf8: - if upload_name_utf8.lower().startswith(b"utf-8''"): - upload_name_utf8 = upload_name_utf8[7:] - # We have a filename*= section. This MUST be ASCII, and any UTF-8 - # bytes are %-quoted. - try: - # Once it is decoded, we can then unquote the %-encoded - # parts strictly into a unicode string. - upload_name = urllib.parse.unquote( - upload_name_utf8.decode("ascii"), errors="strict" - ) - except UnicodeDecodeError: - # Incorrect UTF-8. - pass - - # If there isn't check for an ascii name. - if not upload_name: - upload_name_ascii = params.get(b"filename", None) - if upload_name_ascii and is_ascii(upload_name_ascii): - upload_name = upload_name_ascii.decode("ascii") - - # This may be None here, indicating we did not find a matching name. - return upload_name - - -def _parse_header(line: bytes) -> Tuple[bytes, Dict[bytes, bytes]]: - """Parse a Content-type like header. - - Cargo-culted from `cgi`, but works on bytes rather than strings. 
- - Args: - line: header to be parsed - - Returns: - The main content-type, followed by the parameter dictionary - """ - parts = _parseparam(b";" + line) - key = next(parts) - pdict = {} - for p in parts: - i = p.find(b"=") - if i >= 0: - name = p[:i].strip().lower() - value = p[i + 1 :].strip() - - # strip double-quotes - if len(value) >= 2 and value[0:1] == value[-1:] == b'"': - value = value[1:-1] - value = value.replace(b"\\\\", b"\\").replace(b'\\"', b'"') - pdict[name] = value - - return key, pdict - - -def _parseparam(s: bytes) -> Generator[bytes, None, None]: - """Generator which splits the input on ;, respecting double-quoted sequences - - Cargo-culted from `cgi`, but works on bytes rather than strings. - - Args: - s: header to be parsed - - Returns: - The split input - """ - while s[:1] == b";": - s = s[1:] - - # look for the next ; - end = s.find(b";") - - # if there is an odd number of " marks between here and the next ;, skip to the - # next ; instead - while end > 0 and (s.count(b'"', 0, end) - s.count(b'\\"', 0, end)) % 2: - end = s.find(b";", end + 1) - - if end < 0: - end = len(s) - f = s[:end] - yield f.strip() - s = s[end:] +# This exists purely for backwards compatibility with media providers and spam checkers. +from synapse.media._base import FileInfo, Responder # noqa: F401 diff --git a/synapse/rest/media/v1/config_resource.py b/synapse/rest/media/v1/config_resource.py deleted file mode 100644 index a95804d327..0000000000 --- a/synapse/rest/media/v1/config_resource.py +++ /dev/null @@ -1,41 +0,0 @@ -# Copyright 2018 Will Hunt -# Copyright 2020-2021 The Matrix.org Foundation C.I.C. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -from typing import TYPE_CHECKING - -from synapse.http.server import DirectServeJsonResource, respond_with_json -from synapse.http.site import SynapseRequest - -if TYPE_CHECKING: - from synapse.server import HomeServer - - -class MediaConfigResource(DirectServeJsonResource): - isLeaf = True - - def __init__(self, hs: "HomeServer"): - super().__init__() - config = hs.config - self.clock = hs.get_clock() - self.auth = hs.get_auth() - self.limits_dict = {"m.upload.size": config.media.max_upload_size} - - async def _async_render_GET(self, request: SynapseRequest) -> None: - await self.auth.get_user_by_req(request) - respond_with_json(request, 200, self.limits_dict, send_cors=True) - - async def _async_render_OPTIONS(self, request: SynapseRequest) -> None: - respond_with_json(request, 200, {}, send_cors=True) diff --git a/synapse/rest/media/v1/download_resource.py b/synapse/rest/media/v1/download_resource.py deleted file mode 100644 index 048a042692..0000000000 --- a/synapse/rest/media/v1/download_resource.py +++ /dev/null @@ -1,76 +0,0 @@ -# Copyright 2014-2016 OpenMarket Ltd -# Copyright 2020-2021 The Matrix.org Foundation C.I.C. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
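As the relocated `add_file_headers` explains above, a filename that survives as an RFC 2616 token is emitted as a bare `filename=`, and everything else is percent-encoded into the RFC 5987 `filename*=utf-8''...` form. A condensed sketch of that decision, following the same rules:

    import urllib.parse

    SEPARATORS = set('()<>@,;:\\"/[]?={}')

    def can_encode_as_token(name: str) -> bool:
        # A token is printable US-ASCII with no separators, SP or HT.
        return all(32 < ord(c) < 127 and c not in SEPARATORS for c in name)

    def content_disposition(upload_name: str) -> str:
        if can_encode_as_token(upload_name):
            return "inline; filename=%s" % (upload_name,)
        return "inline; filename*=utf-8''%s" % (
            urllib.parse.quote(upload_name.encode("utf-8")),
        )

    print(content_disposition("cat.png"))      # inline; filename=cat.png
    print(content_disposition("猫の写真.png"))  # inline; filename*=utf-8''%E7%8C%AB...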
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -import logging -from typing import TYPE_CHECKING - -from synapse.http.server import ( - DirectServeJsonResource, - set_corp_headers, - set_cors_headers, -) -from synapse.http.servlet import parse_boolean -from synapse.http.site import SynapseRequest - -from ._base import parse_media_id, respond_404 - -if TYPE_CHECKING: - from synapse.rest.media.v1.media_repository import MediaRepository - from synapse.server import HomeServer - -logger = logging.getLogger(__name__) - - -class DownloadResource(DirectServeJsonResource): - isLeaf = True - - def __init__(self, hs: "HomeServer", media_repo: "MediaRepository"): - super().__init__() - self.media_repo = media_repo - self.server_name = hs.hostname - - async def _async_render_GET(self, request: SynapseRequest) -> None: - set_cors_headers(request) - set_corp_headers(request) - request.setHeader( - b"Content-Security-Policy", - b"sandbox;" - b" default-src 'none';" - b" script-src 'none';" - b" plugin-types application/pdf;" - b" style-src 'unsafe-inline';" - b" media-src 'self';" - b" object-src 'self';", - ) - # Limited non-standard form of CSP for IE11 - request.setHeader(b"X-Content-Security-Policy", b"sandbox;") - request.setHeader( - b"Referrer-Policy", - b"no-referrer", - ) - server_name, media_id, name = parse_media_id(request) - if server_name == self.server_name: - await self.media_repo.get_local_media(request, media_id, name) - else: - allow_remote = parse_boolean(request, "allow_remote", default=True) - if not allow_remote: - logger.info( - "Rejecting request for remote media %s/%s due to allow_remote", - server_name, - media_id, - ) - respond_404(request) - return - - await self.media_repo.get_remote_media(request, server_name, media_id, name) diff --git a/synapse/rest/media/v1/filepath.py b/synapse/rest/media/v1/filepath.py deleted file mode 100644 index 1f6441c412..0000000000 --- a/synapse/rest/media/v1/filepath.py +++ /dev/null @@ -1,410 +0,0 @@ -# Copyright 2014-2016 OpenMarket Ltd -# Copyright 2020-2021 The Matrix.org Foundation C.I.C. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import functools -import os -import re -import string -from typing import Any, Callable, List, TypeVar, Union, cast - -NEW_FORMAT_ID_RE = re.compile(r"^\d\d\d\d-\d\d-\d\d") - - -F = TypeVar("F", bound=Callable[..., str]) - - -def _wrap_in_base_path(func: F) -> F: - """Takes a function that returns a relative path and turns it into an - absolute path based on the location of the primary media store - """ - - @functools.wraps(func) - def _wrapped(self: "MediaFilePaths", *args: Any, **kwargs: Any) -> str: - path = func(self, *args, **kwargs) - return os.path.join(self.base_path, path) - - return cast(F, _wrapped) - - -GetPathMethod = TypeVar( - "GetPathMethod", bound=Union[Callable[..., str], Callable[..., List[str]]] -) - - -def _wrap_with_jail_check(relative: bool) -> Callable[[GetPathMethod], GetPathMethod]: - """Wraps a path-returning method to check that the returned path(s) do not escape - the media store directory. - - The path-returning method may return either a single path, or a list of paths. - - The check is not expected to ever fail, unless `func` is missing a call to - `_validate_path_component`, or `_validate_path_component` is buggy. - - Args: - relative: A boolean indicating whether the wrapped method returns paths relative - to the media store directory. - - Returns: - A method which will wrap a path-returning method, adding a check to ensure that - the returned path(s) lie within the media store directory. The check will raise - a `ValueError` if it fails. - """ - - def _wrap_with_jail_check_inner(func: GetPathMethod) -> GetPathMethod: - @functools.wraps(func) - def _wrapped( - self: "MediaFilePaths", *args: Any, **kwargs: Any - ) -> Union[str, List[str]]: - path_or_paths = func(self, *args, **kwargs) - - if isinstance(path_or_paths, list): - paths_to_check = path_or_paths - else: - paths_to_check = [path_or_paths] - - for path in paths_to_check: - # Construct the path that will ultimately be used. - # We cannot guess whether `path` is relative to the media store - # directory, since the media store directory may itself be a relative - # path. - if relative: - path = os.path.join(self.base_path, path) - normalized_path = os.path.normpath(path) - - # Now that `normpath` has eliminated `../`s and `./`s from the path, - # `os.path.commonpath` can be used to check whether it lies within the - # media store directory. - if ( - os.path.commonpath([normalized_path, self.normalized_base_path]) - != self.normalized_base_path - ): - # The path resolves to outside the media store directory, - # or `self.base_path` is `.`, which is an unlikely configuration. - raise ValueError(f"Invalid media store path: {path!r}") - - # Note that `os.path.normpath`/`abspath` has a subtle caveat: - # `a/b/c/../c` will normalize to `a/b/c`, but the former refers to a - # different path if `a/b/c` is a symlink. That is, the check above is - # not perfect and may allow a certain restricted subset of untrustworthy - # paths through. Since the check above is secondary to the main - # `_validate_path_component` checks, it's less important for it to be - # perfect. - # - # As an alternative, `os.path.realpath` will resolve symlinks, but - # proves problematic if there are symlinks inside the media store. - # eg. if `url_store/` is symlinked to elsewhere, its canonical path - # won't match that of the main media store directory. 
- - return path_or_paths - - return cast(GetPathMethod, _wrapped) - - return _wrap_with_jail_check_inner - - -ALLOWED_CHARACTERS = set( - string.ascii_letters - + string.digits - + "_-" - + ".[]:" # Domain names, IPv6 addresses and ports in server names -) -FORBIDDEN_NAMES = { - "", - os.path.curdir, # "." for the current platform - os.path.pardir, # ".." for the current platform -} - - -def _validate_path_component(name: str) -> str: - """Checks that the given string can be safely used as a path component - - Args: - name: The path component to check. - - Returns: - The path component if valid. - - Raises: - ValueError: If `name` cannot be safely used as a path component. - """ - if not ALLOWED_CHARACTERS.issuperset(name) or name in FORBIDDEN_NAMES: - raise ValueError(f"Invalid path component: {name!r}") - - return name - - -class MediaFilePaths: - """Describes where files are stored on disk. - - Most of the functions have a `*_rel` variant which returns a file path that - is relative to the base media store path. This is mainly used when we want - to write to the backup media store (when one is configured) - """ - - def __init__(self, primary_base_path: str): - self.base_path = primary_base_path - self.normalized_base_path = os.path.normpath(self.base_path) - - # Refuse to initialize if paths cannot be validated correctly for the current - # platform. - assert os.path.sep not in ALLOWED_CHARACTERS - assert os.path.altsep not in ALLOWED_CHARACTERS - # On Windows, paths have all sorts of weirdness which `_validate_path_component` - # does not consider. In any case, the remote media store can't work correctly - # for certain homeservers there, since ":"s aren't allowed in paths. - assert os.name == "posix" - - @_wrap_with_jail_check(relative=True) - def local_media_filepath_rel(self, media_id: str) -> str: - return os.path.join( - "local_content", - _validate_path_component(media_id[0:2]), - _validate_path_component(media_id[2:4]), - _validate_path_component(media_id[4:]), - ) - - local_media_filepath = _wrap_in_base_path(local_media_filepath_rel) - - @_wrap_with_jail_check(relative=True) - def local_media_thumbnail_rel( - self, media_id: str, width: int, height: int, content_type: str, method: str - ) -> str: - top_level_type, sub_type = content_type.split("/") - file_name = "%i-%i-%s-%s-%s" % (width, height, top_level_type, sub_type, method) - return os.path.join( - "local_thumbnails", - _validate_path_component(media_id[0:2]), - _validate_path_component(media_id[2:4]), - _validate_path_component(media_id[4:]), - _validate_path_component(file_name), - ) - - local_media_thumbnail = _wrap_in_base_path(local_media_thumbnail_rel) - - @_wrap_with_jail_check(relative=False) - def local_media_thumbnail_dir(self, media_id: str) -> str: - """ - Retrieve the local store path of thumbnails of a given media_id - - Args: - media_id: The media ID to query. 
- Returns: - Path of local_thumbnails from media_id - """ - return os.path.join( - self.base_path, - "local_thumbnails", - _validate_path_component(media_id[0:2]), - _validate_path_component(media_id[2:4]), - _validate_path_component(media_id[4:]), - ) - - @_wrap_with_jail_check(relative=True) - def remote_media_filepath_rel(self, server_name: str, file_id: str) -> str: - return os.path.join( - "remote_content", - _validate_path_component(server_name), - _validate_path_component(file_id[0:2]), - _validate_path_component(file_id[2:4]), - _validate_path_component(file_id[4:]), - ) - - remote_media_filepath = _wrap_in_base_path(remote_media_filepath_rel) - - @_wrap_with_jail_check(relative=True) - def remote_media_thumbnail_rel( - self, - server_name: str, - file_id: str, - width: int, - height: int, - content_type: str, - method: str, - ) -> str: - top_level_type, sub_type = content_type.split("/") - file_name = "%i-%i-%s-%s-%s" % (width, height, top_level_type, sub_type, method) - return os.path.join( - "remote_thumbnail", - _validate_path_component(server_name), - _validate_path_component(file_id[0:2]), - _validate_path_component(file_id[2:4]), - _validate_path_component(file_id[4:]), - _validate_path_component(file_name), - ) - - remote_media_thumbnail = _wrap_in_base_path(remote_media_thumbnail_rel) - - # Legacy path that was used to store thumbnails previously. - # Should be removed after some time, when most of the thumbnails are stored - # using the new path. - @_wrap_with_jail_check(relative=True) - def remote_media_thumbnail_rel_legacy( - self, server_name: str, file_id: str, width: int, height: int, content_type: str - ) -> str: - top_level_type, sub_type = content_type.split("/") - file_name = "%i-%i-%s-%s" % (width, height, top_level_type, sub_type) - return os.path.join( - "remote_thumbnail", - _validate_path_component(server_name), - _validate_path_component(file_id[0:2]), - _validate_path_component(file_id[2:4]), - _validate_path_component(file_id[4:]), - _validate_path_component(file_name), - ) - - @_wrap_with_jail_check(relative=False) - def remote_media_thumbnail_dir(self, server_name: str, file_id: str) -> str: - return os.path.join( - self.base_path, - "remote_thumbnail", - _validate_path_component(server_name), - _validate_path_component(file_id[0:2]), - _validate_path_component(file_id[2:4]), - _validate_path_component(file_id[4:]), - ) - - @_wrap_with_jail_check(relative=True) - def url_cache_filepath_rel(self, media_id: str) -> str: - if NEW_FORMAT_ID_RE.match(media_id): - # Media id is of the form - # E.g.: 2017-09-28-fsdRDt24DS234dsf - return os.path.join( - "url_cache", - _validate_path_component(media_id[:10]), - _validate_path_component(media_id[11:]), - ) - else: - return os.path.join( - "url_cache", - _validate_path_component(media_id[0:2]), - _validate_path_component(media_id[2:4]), - _validate_path_component(media_id[4:]), - ) - - url_cache_filepath = _wrap_in_base_path(url_cache_filepath_rel) - - @_wrap_with_jail_check(relative=False) - def url_cache_filepath_dirs_to_delete(self, media_id: str) -> List[str]: - "The dirs to try and remove if we delete the media_id file" - if NEW_FORMAT_ID_RE.match(media_id): - return [ - os.path.join( - self.base_path, "url_cache", _validate_path_component(media_id[:10]) - ) - ] - else: - return [ - os.path.join( - self.base_path, - "url_cache", - _validate_path_component(media_id[0:2]), - _validate_path_component(media_id[2:4]), - ), - os.path.join( - self.base_path, "url_cache", _validate_path_component(media_id[0:2]) - ), 
- ] - - @_wrap_with_jail_check(relative=True) - def url_cache_thumbnail_rel( - self, media_id: str, width: int, height: int, content_type: str, method: str - ) -> str: - # Media id is of the form - # E.g.: 2017-09-28-fsdRDt24DS234dsf - - top_level_type, sub_type = content_type.split("/") - file_name = "%i-%i-%s-%s-%s" % (width, height, top_level_type, sub_type, method) - - if NEW_FORMAT_ID_RE.match(media_id): - return os.path.join( - "url_cache_thumbnails", - _validate_path_component(media_id[:10]), - _validate_path_component(media_id[11:]), - _validate_path_component(file_name), - ) - else: - return os.path.join( - "url_cache_thumbnails", - _validate_path_component(media_id[0:2]), - _validate_path_component(media_id[2:4]), - _validate_path_component(media_id[4:]), - _validate_path_component(file_name), - ) - - url_cache_thumbnail = _wrap_in_base_path(url_cache_thumbnail_rel) - - @_wrap_with_jail_check(relative=True) - def url_cache_thumbnail_directory_rel(self, media_id: str) -> str: - # Media id is of the form - # E.g.: 2017-09-28-fsdRDt24DS234dsf - - if NEW_FORMAT_ID_RE.match(media_id): - return os.path.join( - "url_cache_thumbnails", - _validate_path_component(media_id[:10]), - _validate_path_component(media_id[11:]), - ) - else: - return os.path.join( - "url_cache_thumbnails", - _validate_path_component(media_id[0:2]), - _validate_path_component(media_id[2:4]), - _validate_path_component(media_id[4:]), - ) - - url_cache_thumbnail_directory = _wrap_in_base_path( - url_cache_thumbnail_directory_rel - ) - - @_wrap_with_jail_check(relative=False) - def url_cache_thumbnail_dirs_to_delete(self, media_id: str) -> List[str]: - "The dirs to try and remove if we delete the media_id thumbnails" - # Media id is of the form - # E.g.: 2017-09-28-fsdRDt24DS234dsf - if NEW_FORMAT_ID_RE.match(media_id): - return [ - os.path.join( - self.base_path, - "url_cache_thumbnails", - _validate_path_component(media_id[:10]), - _validate_path_component(media_id[11:]), - ), - os.path.join( - self.base_path, - "url_cache_thumbnails", - _validate_path_component(media_id[:10]), - ), - ] - else: - return [ - os.path.join( - self.base_path, - "url_cache_thumbnails", - _validate_path_component(media_id[0:2]), - _validate_path_component(media_id[2:4]), - _validate_path_component(media_id[4:]), - ), - os.path.join( - self.base_path, - "url_cache_thumbnails", - _validate_path_component(media_id[0:2]), - _validate_path_component(media_id[2:4]), - ), - os.path.join( - self.base_path, - "url_cache_thumbnails", - _validate_path_component(media_id[0:2]), - ), - ] diff --git a/synapse/rest/media/v1/media_repository.py b/synapse/rest/media/v1/media_repository.py deleted file mode 100644 index c70e1837af..0000000000 --- a/synapse/rest/media/v1/media_repository.py +++ /dev/null @@ -1,1112 +0,0 @@ -# Copyright 2014-2016 OpenMarket Ltd -# Copyright 2018-2021 The Matrix.org Foundation C.I.C. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
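The path handling removed above (now living in `synapse.media.filepath`) layers two defences: every path component must pass an allow-list check that also rejects `.` and `..`, and the joined path is normalised and compared against the media store root before use. A condensed sketch of those two checks, with a made-up base path:

    import os
    import string

    ALLOWED = set(string.ascii_letters + string.digits + "_-" + ".[]:")
    FORBIDDEN = {"", os.path.curdir, os.path.pardir}

    def validate_component(name: str) -> str:
        if not ALLOWED.issuperset(name) or name in FORBIDDEN:
            raise ValueError(f"Invalid path component: {name!r}")
        return name

    def jailed_join(base_path: str, *components: str) -> str:
        path = os.path.join(base_path, *(validate_component(c) for c in components))
        normalized = os.path.normpath(path)
        base = os.path.normpath(base_path)
        # Reject anything that normalises to outside the media store directory.
        if os.path.commonpath([normalized, base]) != base:
            raise ValueError(f"Invalid media store path: {path!r}")
        return normalized

    print(jailed_join("/media_store", "local_content", "ab", "cd", "efgh"))
    # jailed_join("/media_store", "..", "etc", "passwd") raises ValueError

As the original comments note, `normpath` does not resolve symlinks, so the jail check is a backstop for the per-component validation rather than a complete defence on its own.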
-import errno -import logging -import os -import shutil -from io import BytesIO -from typing import IO, TYPE_CHECKING, Dict, List, Optional, Set, Tuple - -from matrix_common.types.mxc_uri import MXCUri - -import twisted.internet.error -import twisted.web.http -from twisted.internet.defer import Deferred - -from synapse.api.errors import ( - FederationDeniedError, - HttpResponseException, - NotFoundError, - RequestSendFailed, - SynapseError, -) -from synapse.config._base import ConfigError -from synapse.config.repository import ThumbnailRequirement -from synapse.http.server import UnrecognizedRequestResource -from synapse.http.site import SynapseRequest -from synapse.logging.context import defer_to_thread -from synapse.metrics.background_process_metrics import run_as_background_process -from synapse.types import UserID -from synapse.util.async_helpers import Linearizer -from synapse.util.retryutils import NotRetryingDestination -from synapse.util.stringutils import random_string - -from ._base import ( - FileInfo, - Responder, - ThumbnailInfo, - get_filename_from_headers, - respond_404, - respond_with_responder, -) -from .config_resource import MediaConfigResource -from .download_resource import DownloadResource -from .filepath import MediaFilePaths -from .media_storage import MediaStorage -from .preview_url_resource import PreviewUrlResource -from .storage_provider import StorageProviderWrapper -from .thumbnail_resource import ThumbnailResource -from .thumbnailer import Thumbnailer, ThumbnailError -from .upload_resource import UploadResource - -if TYPE_CHECKING: - from synapse.server import HomeServer - -logger = logging.getLogger(__name__) - -# How often to run the background job to update the "recently accessed" -# attribute of local and remote media. -UPDATE_RECENTLY_ACCESSED_TS = 60 * 1000 # 1 minute -# How often to run the background job to check for local and remote media -# that should be purged according to the configured media retention settings. -MEDIA_RETENTION_CHECK_PERIOD_MS = 60 * 60 * 1000 # 1 hour - - -class MediaRepository: - def __init__(self, hs: "HomeServer"): - self.hs = hs - self.auth = hs.get_auth() - self.client = hs.get_federation_http_client() - self.clock = hs.get_clock() - self.server_name = hs.hostname - self.store = hs.get_datastores().main - self.max_upload_size = hs.config.media.max_upload_size - self.max_image_pixels = hs.config.media.max_image_pixels - - Thumbnailer.set_limits(self.max_image_pixels) - - self.primary_base_path: str = hs.config.media.media_store_path - self.filepaths: MediaFilePaths = MediaFilePaths(self.primary_base_path) - - self.dynamic_thumbnails = hs.config.media.dynamic_thumbnails - self.thumbnail_requirements = hs.config.media.thumbnail_requirements - - self.remote_media_linearizer = Linearizer(name="media_remote") - - self.recently_accessed_remotes: Set[Tuple[str, str]] = set() - self.recently_accessed_locals: Set[str] = set() - - self.federation_domain_whitelist = ( - hs.config.federation.federation_domain_whitelist - ) - - # List of StorageProviders where we should search for media and - # potentially upload to. 
- storage_providers = [] - - for ( - clz, - provider_config, - wrapper_config, - ) in hs.config.media.media_storage_providers: - backend = clz(hs, provider_config) - provider = StorageProviderWrapper( - backend, - store_local=wrapper_config.store_local, - store_remote=wrapper_config.store_remote, - store_synchronous=wrapper_config.store_synchronous, - ) - storage_providers.append(provider) - - self.media_storage = MediaStorage( - self.hs, self.primary_base_path, self.filepaths, storage_providers - ) - - self.clock.looping_call( - self._start_update_recently_accessed, UPDATE_RECENTLY_ACCESSED_TS - ) - - # Media retention configuration options - self._media_retention_local_media_lifetime_ms = ( - hs.config.media.media_retention_local_media_lifetime_ms - ) - self._media_retention_remote_media_lifetime_ms = ( - hs.config.media.media_retention_remote_media_lifetime_ms - ) - - # Check whether local or remote media retention is configured - if ( - hs.config.media.media_retention_local_media_lifetime_ms is not None - or hs.config.media.media_retention_remote_media_lifetime_ms is not None - ): - # Run the background job to apply media retention rules routinely, - # with the duration between runs dictated by the homeserver config. - self.clock.looping_call( - self._start_apply_media_retention_rules, - MEDIA_RETENTION_CHECK_PERIOD_MS, - ) - - def _start_update_recently_accessed(self) -> Deferred: - return run_as_background_process( - "update_recently_accessed_media", self._update_recently_accessed - ) - - def _start_apply_media_retention_rules(self) -> Deferred: - return run_as_background_process( - "apply_media_retention_rules", self._apply_media_retention_rules - ) - - async def _update_recently_accessed(self) -> None: - remote_media = self.recently_accessed_remotes - self.recently_accessed_remotes = set() - - local_media = self.recently_accessed_locals - self.recently_accessed_locals = set() - - await self.store.update_cached_last_access_time( - local_media, remote_media, self.clock.time_msec() - ) - - def mark_recently_accessed(self, server_name: Optional[str], media_id: str) -> None: - """Mark the given media as recently accessed. - - Args: - server_name: Origin server of media, or None if local - media_id: The media ID of the content - """ - if server_name: - self.recently_accessed_remotes.add((server_name, media_id)) - else: - self.recently_accessed_locals.add(media_id) - - async def create_content( - self, - media_type: str, - upload_name: Optional[str], - content: IO, - content_length: int, - auth_user: UserID, - ) -> MXCUri: - """Store uploaded content for a local user and return the mxc URL - - Args: - media_type: The content type of the file. - upload_name: The name of the file, if provided. 
- content: A file like object that is the content to store - content_length: The length of the content - auth_user: The user_id of the uploader - - Returns: - The mxc url of the stored content - """ - - media_id = random_string(24) - - file_info = FileInfo(server_name=None, file_id=media_id) - - fname = await self.media_storage.store_file(content, file_info) - - logger.info("Stored local media in file %r", fname) - - await self.store.store_local_media( - media_id=media_id, - media_type=media_type, - time_now_ms=self.clock.time_msec(), - upload_name=upload_name, - media_length=content_length, - user_id=auth_user, - ) - - await self._generate_thumbnails(None, media_id, media_id, media_type) - - return MXCUri(self.server_name, media_id) - - async def get_local_media( - self, request: SynapseRequest, media_id: str, name: Optional[str] - ) -> None: - """Responds to requests for local media, if exists, or returns 404. - - Args: - request: The incoming request. - media_id: The media ID of the content. (This is the same as - the file_id for local content.) - name: Optional name that, if specified, will be used as - the filename in the Content-Disposition header of the response. - - Returns: - Resolves once a response has successfully been written to request - """ - media_info = await self.store.get_local_media(media_id) - if not media_info or media_info["quarantined_by"]: - respond_404(request) - return - - self.mark_recently_accessed(None, media_id) - - media_type = media_info["media_type"] - if not media_type: - media_type = "application/octet-stream" - media_length = media_info["media_length"] - upload_name = name if name else media_info["upload_name"] - url_cache = media_info["url_cache"] - - file_info = FileInfo(None, media_id, url_cache=bool(url_cache)) - - responder = await self.media_storage.fetch_media(file_info) - await respond_with_responder( - request, responder, media_type, media_length, upload_name - ) - - async def get_remote_media( - self, - request: SynapseRequest, - server_name: str, - media_id: str, - name: Optional[str], - ) -> None: - """Respond to requests for remote media. - - Args: - request: The incoming request. - server_name: Remote server_name where the media originated. - media_id: The media ID of the content (as defined by the remote server). - name: Optional name that, if specified, will be used as - the filename in the Content-Disposition header of the response. - - Returns: - Resolves once a response has successfully been written to request - """ - if ( - self.federation_domain_whitelist is not None - and server_name not in self.federation_domain_whitelist - ): - raise FederationDeniedError(server_name) - - self.mark_recently_accessed(server_name, media_id) - - # We linearize here to ensure that we don't try and download remote - # media multiple times concurrently - key = (server_name, media_id) - async with self.remote_media_linearizer.queue(key): - responder, media_info = await self._get_remote_media_impl( - server_name, media_id - ) - - # We deliberately stream the file outside the lock - if responder: - media_type = media_info["media_type"] - media_length = media_info["media_length"] - upload_name = name if name else media_info["upload_name"] - await respond_with_responder( - request, responder, media_type, media_length, upload_name - ) - else: - respond_404(request) - - async def get_remote_media_info(self, server_name: str, media_id: str) -> dict: - """Gets the media info associated with the remote file, downloading - if necessary. 
- - Args: - server_name: Remote server_name where the media originated. - media_id: The media ID of the content (as defined by the remote server). - - Returns: - The media info of the file - """ - if ( - self.federation_domain_whitelist is not None - and server_name not in self.federation_domain_whitelist - ): - raise FederationDeniedError(server_name) - - # We linearize here to ensure that we don't try and download remote - # media multiple times concurrently - key = (server_name, media_id) - async with self.remote_media_linearizer.queue(key): - responder, media_info = await self._get_remote_media_impl( - server_name, media_id - ) - - # Ensure we actually use the responder so that it releases resources - if responder: - with responder: - pass - - return media_info - - async def _get_remote_media_impl( - self, server_name: str, media_id: str - ) -> Tuple[Optional[Responder], dict]: - """Looks for media in local cache, if not there then attempt to - download from remote server. - - Args: - server_name: Remote server_name where the media originated. - media_id: The media ID of the content (as defined by the - remote server). - - Returns: - A tuple of responder and the media info of the file. - """ - media_info = await self.store.get_cached_remote_media(server_name, media_id) - - # file_id is the ID we use to track the file locally. If we've already - # seen the file then reuse the existing ID, otherwise generate a new - # one. - - # If we have an entry in the DB, try and look for it - if media_info: - file_id = media_info["filesystem_id"] - file_info = FileInfo(server_name, file_id) - - if media_info["quarantined_by"]: - logger.info("Media is quarantined") - raise NotFoundError() - - if not media_info["media_type"]: - media_info["media_type"] = "application/octet-stream" - - responder = await self.media_storage.fetch_media(file_info) - if responder: - return responder, media_info - - # Failed to find the file anywhere, lets download it. - - try: - media_info = await self._download_remote_file( - server_name, - media_id, - ) - except SynapseError: - raise - except Exception as e: - # An exception may be because we downloaded media in another - # process, so let's check if we magically have the media. - media_info = await self.store.get_cached_remote_media(server_name, media_id) - if not media_info: - raise e - - file_id = media_info["filesystem_id"] - if not media_info["media_type"]: - media_info["media_type"] = "application/octet-stream" - file_info = FileInfo(server_name, file_id) - - # We generate thumbnails even if another process downloaded the media - # as a) it's conceivable that the other download request dies before it - # generates thumbnails, but mainly b) we want to be sure the thumbnails - # have finished being generated before responding to the client, - # otherwise they'll request thumbnails and get a 404 if they're not - # ready yet. - await self._generate_thumbnails( - server_name, media_id, file_id, media_info["media_type"] - ) - - responder = await self.media_storage.fetch_media(file_info) - return responder, media_info - - async def _download_remote_file( - self, - server_name: str, - media_id: str, - ) -> dict: - """Attempt to download the remote file from the given server name, - using the given file_id as the local id. - - Args: - server_name: Originating server - media_id: The media ID of the content (as defined by the - remote server). This is different than the file_id, which is - locally generated. - file_id: Local file ID - - Returns: - The media info of the file. 
- """ - - file_id = random_string(24) - - file_info = FileInfo(server_name=server_name, file_id=file_id) - - with self.media_storage.store_into_file(file_info) as (f, fname, finish): - request_path = "/".join( - ("/_matrix/media/r0/download", server_name, media_id) - ) - try: - length, headers = await self.client.get_file( - server_name, - request_path, - output_stream=f, - max_size=self.max_upload_size, - args={ - # tell the remote server to 404 if it doesn't - # recognise the server_name, to make sure we don't - # end up with a routing loop. - "allow_remote": "false" - }, - ) - except RequestSendFailed as e: - logger.warning( - "Request failed fetching remote media %s/%s: %r", - server_name, - media_id, - e, - ) - raise SynapseError(502, "Failed to fetch remote media") - - except HttpResponseException as e: - logger.warning( - "HTTP error fetching remote media %s/%s: %s", - server_name, - media_id, - e.response, - ) - if e.code == twisted.web.http.NOT_FOUND: - raise e.to_synapse_error() - raise SynapseError(502, "Failed to fetch remote media") - - except SynapseError: - logger.warning( - "Failed to fetch remote media %s/%s", server_name, media_id - ) - raise - except NotRetryingDestination: - logger.warning("Not retrying destination %r", server_name) - raise SynapseError(502, "Failed to fetch remote media") - except Exception: - logger.exception( - "Failed to fetch remote media %s/%s", server_name, media_id - ) - raise SynapseError(502, "Failed to fetch remote media") - - await finish() - - if b"Content-Type" in headers: - media_type = headers[b"Content-Type"][0].decode("ascii") - else: - media_type = "application/octet-stream" - upload_name = get_filename_from_headers(headers) - time_now_ms = self.clock.time_msec() - - # Multiple remote media download requests can race (when using - # multiple media repos), so this may throw a violation constraint - # exception. If it does we'll delete the newly downloaded file from - # disk (as we're in the ctx manager). - # - # However: we've already called `finish()` so we may have also - # written to the storage providers. This is preferable to the - # alternative where we call `finish()` *after* this, where we could - # end up having an entry in the DB but fail to write the files to - # the storage providers. 
- await self.store.store_cached_remote_media( - origin=server_name, - media_id=media_id, - media_type=media_type, - time_now_ms=self.clock.time_msec(), - upload_name=upload_name, - media_length=length, - filesystem_id=file_id, - ) - - logger.info("Stored remote media in file %r", fname) - - media_info = { - "media_type": media_type, - "media_length": length, - "upload_name": upload_name, - "created_ts": time_now_ms, - "filesystem_id": file_id, - } - - return media_info - - def _get_thumbnail_requirements( - self, media_type: str - ) -> Tuple[ThumbnailRequirement, ...]: - scpos = media_type.find(";") - if scpos > 0: - media_type = media_type[:scpos] - return self.thumbnail_requirements.get(media_type, ()) - - def _generate_thumbnail( - self, - thumbnailer: Thumbnailer, - t_width: int, - t_height: int, - t_method: str, - t_type: str, - ) -> Optional[BytesIO]: - m_width = thumbnailer.width - m_height = thumbnailer.height - - if m_width * m_height >= self.max_image_pixels: - logger.info( - "Image too large to thumbnail %r x %r > %r", - m_width, - m_height, - self.max_image_pixels, - ) - return None - - if thumbnailer.transpose_method is not None: - m_width, m_height = thumbnailer.transpose() - - if t_method == "crop": - return thumbnailer.crop(t_width, t_height, t_type) - elif t_method == "scale": - t_width, t_height = thumbnailer.aspect(t_width, t_height) - t_width = min(m_width, t_width) - t_height = min(m_height, t_height) - return thumbnailer.scale(t_width, t_height, t_type) - - return None - - async def generate_local_exact_thumbnail( - self, - media_id: str, - t_width: int, - t_height: int, - t_method: str, - t_type: str, - url_cache: bool, - ) -> Optional[str]: - input_path = await self.media_storage.ensure_media_is_in_local_cache( - FileInfo(None, media_id, url_cache=url_cache) - ) - - try: - thumbnailer = Thumbnailer(input_path) - except ThumbnailError as e: - logger.warning( - "Unable to generate a thumbnail for local media %s using a method of %s and type of %s: %s", - media_id, - t_method, - t_type, - e, - ) - return None - - with thumbnailer: - t_byte_source = await defer_to_thread( - self.hs.get_reactor(), - self._generate_thumbnail, - thumbnailer, - t_width, - t_height, - t_method, - t_type, - ) - - if t_byte_source: - try: - file_info = FileInfo( - server_name=None, - file_id=media_id, - url_cache=url_cache, - thumbnail=ThumbnailInfo( - width=t_width, - height=t_height, - method=t_method, - type=t_type, - ), - ) - - output_path = await self.media_storage.store_file( - t_byte_source, file_info - ) - finally: - t_byte_source.close() - - logger.info("Stored thumbnail in file %r", output_path) - - t_len = os.path.getsize(output_path) - - await self.store.store_local_thumbnail( - media_id, t_width, t_height, t_type, t_method, t_len - ) - - return output_path - - # Could not generate thumbnail. 
- return None - - async def generate_remote_exact_thumbnail( - self, - server_name: str, - file_id: str, - media_id: str, - t_width: int, - t_height: int, - t_method: str, - t_type: str, - ) -> Optional[str]: - input_path = await self.media_storage.ensure_media_is_in_local_cache( - FileInfo(server_name, file_id) - ) - - try: - thumbnailer = Thumbnailer(input_path) - except ThumbnailError as e: - logger.warning( - "Unable to generate a thumbnail for remote media %s from %s using a method of %s and type of %s: %s", - media_id, - server_name, - t_method, - t_type, - e, - ) - return None - - with thumbnailer: - t_byte_source = await defer_to_thread( - self.hs.get_reactor(), - self._generate_thumbnail, - thumbnailer, - t_width, - t_height, - t_method, - t_type, - ) - - if t_byte_source: - try: - file_info = FileInfo( - server_name=server_name, - file_id=file_id, - thumbnail=ThumbnailInfo( - width=t_width, - height=t_height, - method=t_method, - type=t_type, - ), - ) - - output_path = await self.media_storage.store_file( - t_byte_source, file_info - ) - finally: - t_byte_source.close() - - logger.info("Stored thumbnail in file %r", output_path) - - t_len = os.path.getsize(output_path) - - await self.store.store_remote_media_thumbnail( - server_name, - media_id, - file_id, - t_width, - t_height, - t_type, - t_method, - t_len, - ) - - return output_path - - # Could not generate thumbnail. - return None - - async def _generate_thumbnails( - self, - server_name: Optional[str], - media_id: str, - file_id: str, - media_type: str, - url_cache: bool = False, - ) -> Optional[dict]: - """Generate and store thumbnails for an image. - - Args: - server_name: The server name if remote media, else None if local - media_id: The media ID of the content. (This is the same as - the file_id for local content) - file_id: Local file ID - media_type: The content type of the file - url_cache: If we are thumbnailing images downloaded for the URL cache, - used exclusively by the url previewer - - Returns: - Dict with "width" and "height" keys of original image or None if the - media cannot be thumbnailed. - """ - requirements = self._get_thumbnail_requirements(media_type) - if not requirements: - return None - - input_path = await self.media_storage.ensure_media_is_in_local_cache( - FileInfo(server_name, file_id, url_cache=url_cache) - ) - - try: - thumbnailer = Thumbnailer(input_path) - except ThumbnailError as e: - logger.warning( - "Unable to generate thumbnails for remote media %s from %s of type %s: %s", - media_id, - server_name, - media_type, - e, - ) - return None - - with thumbnailer: - m_width = thumbnailer.width - m_height = thumbnailer.height - - if m_width * m_height >= self.max_image_pixels: - logger.info( - "Image too large to thumbnail %r x %r > %r", - m_width, - m_height, - self.max_image_pixels, - ) - return None - - if thumbnailer.transpose_method is not None: - m_width, m_height = await defer_to_thread( - self.hs.get_reactor(), thumbnailer.transpose - ) - - # We deduplicate the thumbnail sizes by ignoring the cropped versions if - # they have the same dimensions of a scaled one. 
- thumbnails: Dict[Tuple[int, int, str], str] = {} - for requirement in requirements: - if requirement.method == "crop": - thumbnails.setdefault( - (requirement.width, requirement.height, requirement.media_type), - requirement.method, - ) - elif requirement.method == "scale": - t_width, t_height = thumbnailer.aspect( - requirement.width, requirement.height - ) - t_width = min(m_width, t_width) - t_height = min(m_height, t_height) - thumbnails[ - (t_width, t_height, requirement.media_type) - ] = requirement.method - - # Now we generate the thumbnails for each dimension, store it - for (t_width, t_height, t_type), t_method in thumbnails.items(): - # Generate the thumbnail - if t_method == "crop": - t_byte_source = await defer_to_thread( - self.hs.get_reactor(), - thumbnailer.crop, - t_width, - t_height, - t_type, - ) - elif t_method == "scale": - t_byte_source = await defer_to_thread( - self.hs.get_reactor(), - thumbnailer.scale, - t_width, - t_height, - t_type, - ) - else: - logger.error("Unrecognized method: %r", t_method) - continue - - if not t_byte_source: - continue - - file_info = FileInfo( - server_name=server_name, - file_id=file_id, - url_cache=url_cache, - thumbnail=ThumbnailInfo( - width=t_width, - height=t_height, - method=t_method, - type=t_type, - ), - ) - - with self.media_storage.store_into_file(file_info) as ( - f, - fname, - finish, - ): - try: - await self.media_storage.write_to_file(t_byte_source, f) - await finish() - finally: - t_byte_source.close() - - t_len = os.path.getsize(fname) - - # Write to database - if server_name: - # Multiple remote media download requests can race (when - # using multiple media repos), so this may throw a violation - # constraint exception. If it does we'll delete the newly - # generated thumbnail from disk (as we're in the ctx - # manager). - # - # However: we've already called `finish()` so we may have - # also written to the storage providers. This is preferable - # to the alternative where we call `finish()` *after* this, - # where we could end up having an entry in the DB but fail - # to write the files to the storage providers. - try: - await self.store.store_remote_media_thumbnail( - server_name, - media_id, - file_id, - t_width, - t_height, - t_type, - t_method, - t_len, - ) - except Exception as e: - thumbnail_exists = ( - await self.store.get_remote_media_thumbnail( - server_name, - media_id, - t_width, - t_height, - t_type, - ) - ) - if not thumbnail_exists: - raise e - else: - await self.store.store_local_thumbnail( - media_id, t_width, t_height, t_type, t_method, t_len - ) - - return {"width": m_width, "height": m_height} - - async def _apply_media_retention_rules(self) -> None: - """ - Purge old local and remote media according to the media retention rules - defined in the homeserver config. - """ - # Purge remote media - if self._media_retention_remote_media_lifetime_ms is not None: - # Calculate a threshold timestamp derived from the configured lifetime. Any - # media that has not been accessed since this timestamp will be removed. 
- remote_media_threshold_timestamp_ms = ( - self.clock.time_msec() - self._media_retention_remote_media_lifetime_ms - ) - - logger.info( - "Purging remote media last accessed before" - f" {remote_media_threshold_timestamp_ms}" - ) - - await self.delete_old_remote_media( - before_ts=remote_media_threshold_timestamp_ms - ) - - # And now do the same for local media - if self._media_retention_local_media_lifetime_ms is not None: - # This works the same as the remote media threshold - local_media_threshold_timestamp_ms = ( - self.clock.time_msec() - self._media_retention_local_media_lifetime_ms - ) - - logger.info( - "Purging local media last accessed before" - f" {local_media_threshold_timestamp_ms}" - ) - - await self.delete_old_local_media( - before_ts=local_media_threshold_timestamp_ms, - keep_profiles=True, - delete_quarantined_media=False, - delete_protected_media=False, - ) - - async def delete_old_remote_media(self, before_ts: int) -> Dict[str, int]: - old_media = await self.store.get_remote_media_ids( - before_ts, include_quarantined_media=False - ) - - deleted = 0 - - for media in old_media: - origin = media["media_origin"] - media_id = media["media_id"] - file_id = media["filesystem_id"] - key = (origin, media_id) - - logger.info("Deleting: %r", key) - - # TODO: Should we delete from the backup store - - async with self.remote_media_linearizer.queue(key): - full_path = self.filepaths.remote_media_filepath(origin, file_id) - try: - os.remove(full_path) - except OSError as e: - logger.warning("Failed to remove file: %r", full_path) - if e.errno == errno.ENOENT: - pass - else: - continue - - thumbnail_dir = self.filepaths.remote_media_thumbnail_dir( - origin, file_id - ) - shutil.rmtree(thumbnail_dir, ignore_errors=True) - - await self.store.delete_remote_media(origin, media_id) - deleted += 1 - - return {"deleted": deleted} - - async def delete_local_media_ids( - self, media_ids: List[str] - ) -> Tuple[List[str], int]: - """ - Delete the given local or remote media ID from this server - - Args: - media_id: The media ID to delete. - Returns: - A tuple of (list of deleted media IDs, total deleted media IDs). - """ - return await self._remove_local_media_from_disk(media_ids) - - async def delete_old_local_media( - self, - before_ts: int, - size_gt: int = 0, - keep_profiles: bool = True, - delete_quarantined_media: bool = False, - delete_protected_media: bool = False, - ) -> Tuple[List[str], int]: - """ - Delete local or remote media from this server by size and timestamp. Removes - media files, any thumbnails and cached URLs. - - Args: - before_ts: Unix timestamp in ms. - Files that were last used before this timestamp will be deleted. - size_gt: Size of the media in bytes. Files that are larger will be deleted. - keep_profiles: Switch to delete also files that are still used in image data - (e.g user profile, room avatar). If false these files will be deleted. - delete_quarantined_media: If True, media marked as quarantined will be deleted. - delete_protected_media: If True, media marked as protected will be deleted. - - Returns: - A tuple of (list of deleted media IDs, total deleted media IDs). 
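The retention pass above reduces to one calculation per media type: anything last accessed before `now - configured_lifetime` becomes eligible for deletion on the next run of the looping call. A small sketch of that threshold arithmetic, with invented lifetimes expressed in milliseconds to match the config attributes used above:

    import time

    now_ms = int(time.time() * 1000)

    # Hypothetical lifetimes: 90 days for local media, 14 days for remote media.
    local_lifetime_ms = 90 * 24 * 60 * 60 * 1000
    remote_lifetime_ms = 14 * 24 * 60 * 60 * 1000

    local_threshold_ms = now_ms - local_lifetime_ms
    remote_threshold_ms = now_ms - remote_lifetime_ms

    # Media last accessed before the threshold would then be purged, e.g. via
    # delete_old_local_media(before_ts=local_threshold_ms, keep_profiles=True).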
- """ - old_media = await self.store.get_local_media_ids( - before_ts, - size_gt, - keep_profiles, - include_quarantined_media=delete_quarantined_media, - include_protected_media=delete_protected_media, - ) - return await self._remove_local_media_from_disk(old_media) - - async def _remove_local_media_from_disk( - self, media_ids: List[str] - ) -> Tuple[List[str], int]: - """ - Delete local or remote media from this server. Removes media files, - any thumbnails and cached URLs. - - Args: - media_ids: List of media_id to delete - Returns: - A tuple of (list of deleted media IDs, total deleted media IDs). - """ - removed_media = [] - for media_id in media_ids: - logger.info("Deleting media with ID '%s'", media_id) - full_path = self.filepaths.local_media_filepath(media_id) - try: - os.remove(full_path) - except OSError as e: - logger.warning("Failed to remove file: %r: %s", full_path, e) - if e.errno == errno.ENOENT: - pass - else: - continue - - thumbnail_dir = self.filepaths.local_media_thumbnail_dir(media_id) - shutil.rmtree(thumbnail_dir, ignore_errors=True) - - await self.store.delete_remote_media(self.server_name, media_id) - - await self.store.delete_url_cache((media_id,)) - await self.store.delete_url_cache_media((media_id,)) - - removed_media.append(media_id) - - return removed_media, len(removed_media) - - -class MediaRepositoryResource(UnrecognizedRequestResource): - """File uploading and downloading. - - Uploads are POSTed to a resource which returns a token which is used to GET - the download:: - - => POST /_matrix/media/r0/upload HTTP/1.1 - Content-Type: - Content-Length: - - - - <= HTTP/1.1 200 OK - Content-Type: application/json - - { "content_uri": "mxc:///" } - - => GET /_matrix/media/r0/download// HTTP/1.1 - - <= HTTP/1.1 200 OK - Content-Type: - Content-Disposition: attachment;filename= - - - - Clients can get thumbnails by supplying a desired width and height and - thumbnailing method:: - - => GET /_matrix/media/r0/thumbnail/ - /?width=&height=&method= HTTP/1.1 - - <= HTTP/1.1 200 OK - Content-Type: image/jpeg or image/png - - - - The thumbnail methods are "crop" and "scale". "scale" tries to return an - image where either the width or the height is smaller than the requested - size. The client should then scale and letterbox the image if it needs to - fit within a given rectangle. "crop" tries to return an image where the - width and height are close to the requested size and the aspect matches - the requested size. The client should scale the image if it needs to fit - within a given rectangle. - """ - - def __init__(self, hs: "HomeServer"): - # If we're not configured to use it, raise if we somehow got here. 
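As a sketch of the thumbnail request shape described in the class docstring above, with an invented server name, media ID and dimensions::

    from urllib.parse import urlencode

    server_name = "example.org"       # hypothetical
    media_id = "abcdef123456"         # hypothetical
    query = urlencode({"width": 96, "height": 96, "method": "crop"})
    url = f"/_matrix/media/r0/thumbnail/{server_name}/{media_id}?{query}"
    # /_matrix/media/r0/thumbnail/example.org/abcdef123456?width=96&height=96&method=crop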
- if not hs.config.media.can_load_media_repo: - raise ConfigError("Synapse is not configured to use a media repo.") - - super().__init__() - media_repo = hs.get_media_repository() - - self.putChild(b"upload", UploadResource(hs, media_repo)) - self.putChild(b"download", DownloadResource(hs, media_repo)) - self.putChild( - b"thumbnail", ThumbnailResource(hs, media_repo, media_repo.media_storage) - ) - if hs.config.media.url_preview_enabled: - self.putChild( - b"preview_url", - PreviewUrlResource(hs, media_repo, media_repo.media_storage), - ) - self.putChild(b"config", MediaConfigResource(hs)) diff --git a/synapse/rest/media/v1/media_storage.py b/synapse/rest/media/v1/media_storage.py index db25848744..11b0e8e231 100644 --- a/synapse/rest/media/v1/media_storage.py +++ b/synapse/rest/media/v1/media_storage.py @@ -1,4 +1,4 @@ -# Copyright 2018-2021 The Matrix.org Foundation C.I.C. +# Copyright 2023 The Matrix.org Foundation C.I.C. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -11,364 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -import contextlib -import logging -import os -import shutil -from types import TracebackType -from typing import ( - IO, - TYPE_CHECKING, - Any, - Awaitable, - BinaryIO, - Callable, - Generator, - Optional, - Sequence, - Tuple, - Type, -) - -import attr - -from twisted.internet.defer import Deferred -from twisted.internet.interfaces import IConsumer -from twisted.protocols.basic import FileSender - -import synapse -from synapse.api.errors import NotFoundError -from synapse.logging.context import defer_to_thread, make_deferred_yieldable -from synapse.util import Clock -from synapse.util.file_consumer import BackgroundFileConsumer - -from ._base import FileInfo, Responder -from .filepath import MediaFilePaths - -if TYPE_CHECKING: - from synapse.rest.media.v1.storage_provider import StorageProvider - from synapse.server import HomeServer - -logger = logging.getLogger(__name__) - - -class MediaStorage: - """Responsible for storing/fetching files from local sources. - - Args: - hs - local_media_directory: Base path where we store media on disk - filepaths - storage_providers: List of StorageProvider that are used to fetch and store files. 
- """ - - def __init__( - self, - hs: "HomeServer", - local_media_directory: str, - filepaths: MediaFilePaths, - storage_providers: Sequence["StorageProvider"], - ): - self.hs = hs - self.reactor = hs.get_reactor() - self.local_media_directory = local_media_directory - self.filepaths = filepaths - self.storage_providers = storage_providers - self.spam_checker = hs.get_spam_checker() - self.clock = hs.get_clock() - - async def store_file(self, source: IO, file_info: FileInfo) -> str: - """Write `source` to the on disk media store, and also any other - configured storage providers - - Args: - source: A file like object that should be written - file_info: Info about the file to store - - Returns: - the file path written to in the primary media store - """ - - with self.store_into_file(file_info) as (f, fname, finish_cb): - # Write to the main repository - await self.write_to_file(source, f) - await finish_cb() - - return fname - - async def write_to_file(self, source: IO, output: IO) -> None: - """Asynchronously write the `source` to `output`.""" - await defer_to_thread(self.reactor, _write_file_synchronously, source, output) - - @contextlib.contextmanager - def store_into_file( - self, file_info: FileInfo - ) -> Generator[Tuple[BinaryIO, str, Callable[[], Awaitable[None]]], None, None]: - """Context manager used to get a file like object to write into, as - described by file_info. - - Actually yields a 3-tuple (file, fname, finish_cb), where file is a file - like object that can be written to, fname is the absolute path of file - on disk, and finish_cb is a function that returns an awaitable. - - fname can be used to read the contents from after upload, e.g. to - generate thumbnails. - - finish_cb must be called and waited on after the file has been - successfully been written to. Should not be called if there was an - error. - - Args: - file_info: Info about the file to store - - Example: - - with media_storage.store_into_file(info) as (f, fname, finish_cb): - # .. write into f ... - await finish_cb() - """ - - path = self._file_info_to_path(file_info) - fname = os.path.join(self.local_media_directory, path) - - dirname = os.path.dirname(fname) - os.makedirs(dirname, exist_ok=True) - - finished_called = [False] - - try: - with open(fname, "wb") as f: - - async def finish() -> None: - # Ensure that all writes have been flushed and close the - # file. - f.flush() - f.close() - - spam_check = await self.spam_checker.check_media_file_for_spam( - ReadableFileWrapper(self.clock, fname), file_info - ) - if spam_check != synapse.module_api.NOT_SPAM: - logger.info("Blocking media due to spam checker") - # Note that we'll delete the stored media, due to the - # try/except below. The media also won't be stored in - # the DB. - # We currently ignore any additional field returned by - # the spam-check API. - raise SpamMediaException(errcode=spam_check[0]) - - for provider in self.storage_providers: - await provider.store_file(path, file_info) - - finished_called[0] = True - - yield f, fname, finish - except Exception as e: - try: - os.remove(fname) - except Exception: - pass - - raise e from None - - if not finished_called: - raise Exception("Finished callback not called") - - async def fetch_media(self, file_info: FileInfo) -> Optional[Responder]: - """Attempts to fetch media described by file_info from the local cache - and configured storage providers. - - Args: - file_info - - Returns: - Returns a Responder if the file was found, otherwise None. 
- """ - paths = [self._file_info_to_path(file_info)] - - # fallback for remote thumbnails with no method in the filename - if file_info.thumbnail and file_info.server_name: - paths.append( - self.filepaths.remote_media_thumbnail_rel_legacy( - server_name=file_info.server_name, - file_id=file_info.file_id, - width=file_info.thumbnail.width, - height=file_info.thumbnail.height, - content_type=file_info.thumbnail.type, - ) - ) - - for path in paths: - local_path = os.path.join(self.local_media_directory, path) - if os.path.exists(local_path): - logger.debug("responding with local file %s", local_path) - return FileResponder(open(local_path, "rb")) - logger.debug("local file %s did not exist", local_path) - - for provider in self.storage_providers: - for path in paths: - res: Any = await provider.fetch(path, file_info) - if res: - logger.debug("Streaming %s from %s", path, provider) - return res - logger.debug("%s not found on %s", path, provider) - - return None - - async def ensure_media_is_in_local_cache(self, file_info: FileInfo) -> str: - """Ensures that the given file is in the local cache. Attempts to - download it from storage providers if it isn't. - - Args: - file_info - - Returns: - Full path to local file - """ - path = self._file_info_to_path(file_info) - local_path = os.path.join(self.local_media_directory, path) - if os.path.exists(local_path): - return local_path - - # Fallback for paths without method names - # Should be removed in the future - if file_info.thumbnail and file_info.server_name: - legacy_path = self.filepaths.remote_media_thumbnail_rel_legacy( - server_name=file_info.server_name, - file_id=file_info.file_id, - width=file_info.thumbnail.width, - height=file_info.thumbnail.height, - content_type=file_info.thumbnail.type, - ) - legacy_local_path = os.path.join(self.local_media_directory, legacy_path) - if os.path.exists(legacy_local_path): - return legacy_local_path - - dirname = os.path.dirname(local_path) - os.makedirs(dirname, exist_ok=True) - - for provider in self.storage_providers: - res: Any = await provider.fetch(path, file_info) - if res: - with res: - consumer = BackgroundFileConsumer( - open(local_path, "wb"), self.reactor - ) - await res.write_to_consumer(consumer) - await consumer.wait() - return local_path - - raise NotFoundError() - - def _file_info_to_path(self, file_info: FileInfo) -> str: - """Converts file_info into a relative path. - - The path is suitable for storing files under a directory, e.g. used to - store files on local FS under the base media repository directory. 
- """ - if file_info.url_cache: - if file_info.thumbnail: - return self.filepaths.url_cache_thumbnail_rel( - media_id=file_info.file_id, - width=file_info.thumbnail.width, - height=file_info.thumbnail.height, - content_type=file_info.thumbnail.type, - method=file_info.thumbnail.method, - ) - return self.filepaths.url_cache_filepath_rel(file_info.file_id) - - if file_info.server_name: - if file_info.thumbnail: - return self.filepaths.remote_media_thumbnail_rel( - server_name=file_info.server_name, - file_id=file_info.file_id, - width=file_info.thumbnail.width, - height=file_info.thumbnail.height, - content_type=file_info.thumbnail.type, - method=file_info.thumbnail.method, - ) - return self.filepaths.remote_media_filepath_rel( - file_info.server_name, file_info.file_id - ) - - if file_info.thumbnail: - return self.filepaths.local_media_thumbnail_rel( - media_id=file_info.file_id, - width=file_info.thumbnail.width, - height=file_info.thumbnail.height, - content_type=file_info.thumbnail.type, - method=file_info.thumbnail.method, - ) - return self.filepaths.local_media_filepath_rel(file_info.file_id) - - -def _write_file_synchronously(source: IO, dest: IO) -> None: - """Write `source` to the file like `dest` synchronously. Should be called - from a thread. - - Args: - source: A file like object that's to be written - dest: A file like object to be written to - """ - source.seek(0) # Ensure we read from the start of the file - shutil.copyfileobj(source, dest) - - -class FileResponder(Responder): - """Wraps an open file that can be sent to a request. - - Args: - open_file: A file like object to be streamed ot the client, - is closed when finished streaming. - """ - - def __init__(self, open_file: IO): - self.open_file = open_file - - def write_to_consumer(self, consumer: IConsumer) -> Deferred: - return make_deferred_yieldable( - FileSender().beginFileTransfer(self.open_file, consumer) - ) - - def __exit__( - self, - exc_type: Optional[Type[BaseException]], - exc_val: Optional[BaseException], - exc_tb: Optional[TracebackType], - ) -> None: - self.open_file.close() - - -class SpamMediaException(NotFoundError): - """The media was blocked by a spam checker, so we simply 404 the request (in - the same way as if it was quarantined). - """ - - -@attr.s(slots=True, auto_attribs=True) -class ReadableFileWrapper: - """Wrapper that allows reading a file in chunks, yielding to the reactor, - and writing to a callback. - - This is simplified `FileSender` that takes an IO object rather than an - `IConsumer`. - """ - - CHUNK_SIZE = 2**14 - - clock: Clock - path: str - - async def write_chunks_to(self, callback: Callable[[bytes], object]) -> None: - """Reads the file in chunks and calls the callback with each chunk.""" - - with open(self.path, "rb") as file: - while True: - chunk = file.read(self.CHUNK_SIZE) - if not chunk: - break - - callback(chunk) +# - # We yield to the reactor by sleeping for 0 seconds. - await self.clock.sleep(0) +# This exists purely for backwards compatibility with spam checkers. +from synapse.media.media_storage import ReadableFileWrapper # noqa: F401 diff --git a/synapse/rest/media/v1/oembed.py b/synapse/rest/media/v1/oembed.py deleted file mode 100644 index 7592aa5d47..0000000000 --- a/synapse/rest/media/v1/oembed.py +++ /dev/null @@ -1,265 +0,0 @@ -# Copyright 2021 The Matrix.org Foundation C.I.C. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -import html -import logging -import urllib.parse -from typing import TYPE_CHECKING, List, Optional - -import attr - -from synapse.rest.media.v1.preview_html import parse_html_description -from synapse.types import JsonDict -from synapse.util import json_decoder - -if TYPE_CHECKING: - from lxml import etree - - from synapse.server import HomeServer - -logger = logging.getLogger(__name__) - - -@attr.s(slots=True, frozen=True, auto_attribs=True) -class OEmbedResult: - # The Open Graph result (converted from the oEmbed result). - open_graph_result: JsonDict - # The author_name of the oEmbed result - author_name: Optional[str] - # Number of milliseconds to cache the content, according to the oEmbed response. - # - # This will be None if no cache-age is provided in the oEmbed response (or - # if the oEmbed response cannot be turned into an Open Graph response). - cache_age: Optional[int] - - -class OEmbedProvider: - """ - A helper for accessing oEmbed content. - - It can be used to check if a URL should be accessed via oEmbed and for - requesting/parsing oEmbed content. - """ - - def __init__(self, hs: "HomeServer"): - self._oembed_patterns = {} - for oembed_endpoint in hs.config.oembed.oembed_patterns: - api_endpoint = oembed_endpoint.api_endpoint - - # Only JSON is supported at the moment. This could be declared in - # the formats field. Otherwise, if the endpoint ends in .xml assume - # it doesn't support JSON. - if ( - oembed_endpoint.formats is not None - and "json" not in oembed_endpoint.formats - ) or api_endpoint.endswith(".xml"): - logger.info( - "Ignoring oEmbed endpoint due to not supporting JSON: %s", - api_endpoint, - ) - continue - - # Iterate through each URL pattern and point it to the endpoint. - for pattern in oembed_endpoint.url_patterns: - self._oembed_patterns[pattern] = api_endpoint - - def get_oembed_url(self, url: str) -> Optional[str]: - """ - Check whether the URL should be downloaded as oEmbed content instead. - - Args: - url: The URL to check. - - Returns: - A URL to use instead or None if the original URL should be used. - """ - for url_pattern, endpoint in self._oembed_patterns.items(): - if url_pattern.fullmatch(url): - # TODO Specify max height / width. - - # Note that only the JSON format is supported, some endpoints want - # this in the URL, others want it as an argument. - endpoint = endpoint.replace("{format}", "json") - - args = {"url": url, "format": "json"} - query_str = urllib.parse.urlencode(args, True) - return f"{endpoint}?{query_str}" - - # No match. - return None - - def autodiscover_from_html(self, tree: "etree.Element") -> Optional[str]: - """ - Search an HTML document for oEmbed autodiscovery information. - - Args: - tree: The parsed HTML body. - - Returns: - The URL to use for oEmbed information, or None if no URL was found. - """ - # Search for link elements with the proper rel and type attributes. - for tag in tree.xpath( - "//link[@rel='alternate'][@type='application/json+oembed']" - ): - if "href" in tag.attrib: - return tag.attrib["href"] - - # Some providers (e.g. Flickr) use alternative instead of alternate. 
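For illustration, the kind of URL that get_oembed_url above builds for a matching pattern, assuming a hypothetical registered endpoint::

    import urllib.parse

    endpoint = "https://www.youtube.com/oembed"   # assumed endpoint, for illustration only
    args = {"url": "https://www.youtube.com/watch?v=LXDBoHyjmtw", "format": "json"}
    request_url = f"{endpoint}?{urllib.parse.urlencode(args, True)}"
    # https://www.youtube.com/oembed?url=https%3A%2F%2Fwww.youtube.com%2Fwatch%3Fv%3DLXDBoHyjmtw&format=json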
- for tag in tree.xpath( - "//link[@rel='alternative'][@type='application/json+oembed']" - ): - if "href" in tag.attrib: - return tag.attrib["href"] - - return None - - def parse_oembed_response(self, url: str, raw_body: bytes) -> OEmbedResult: - """ - Parse the oEmbed response into an Open Graph response. - - Args: - url: The URL which is being previewed (not the one which was - requested). - raw_body: The oEmbed response as JSON encoded as bytes. - - Returns: - json-encoded Open Graph data - """ - - try: - # oEmbed responses *must* be UTF-8 according to the spec. - oembed = json_decoder.decode(raw_body.decode("utf-8")) - except ValueError: - return OEmbedResult({}, None, None) - - # The version is a required string field, but not always provided, - # or sometimes provided as a float. Be lenient. - oembed_version = oembed.get("version", "1.0") - if oembed_version != "1.0" and oembed_version != 1: - return OEmbedResult({}, None, None) - - # Attempt to parse the cache age, if possible. - try: - cache_age = int(oembed.get("cache_age")) * 1000 - except (TypeError, ValueError): - # If the cache age cannot be parsed (e.g. wrong type or invalid - # string), ignore it. - cache_age = None - - # The oEmbed response converted to Open Graph. - open_graph_response: JsonDict = {"og:url": url} - - title = oembed.get("title") - if title and isinstance(title, str): - # A common WordPress plug-in seems to incorrectly escape entities - # in the oEmbed response. - open_graph_response["og:title"] = html.unescape(title) - - author_name = oembed.get("author_name") - if not isinstance(author_name, str): - author_name = None - - # Use the provider name and as the site. - provider_name = oembed.get("provider_name") - if provider_name and isinstance(provider_name, str): - open_graph_response["og:site_name"] = provider_name - - # If a thumbnail exists, use it. Note that dimensions will be calculated later. - thumbnail_url = oembed.get("thumbnail_url") - if thumbnail_url and isinstance(thumbnail_url, str): - open_graph_response["og:image"] = thumbnail_url - - # Process each type separately. - oembed_type = oembed.get("type") - if oembed_type == "rich": - html_str = oembed.get("html") - if isinstance(html_str, str): - calc_description_and_urls(open_graph_response, html_str) - - elif oembed_type == "photo": - # If this is a photo, use the full image, not the thumbnail. - url = oembed.get("url") - if url and isinstance(url, str): - open_graph_response["og:image"] = url - - elif oembed_type == "video": - open_graph_response["og:type"] = "video.other" - html_str = oembed.get("html") - if html_str and isinstance(html_str, str): - calc_description_and_urls(open_graph_response, oembed["html"]) - for size in ("width", "height"): - val = oembed.get(size) - if type(val) is int: - open_graph_response[f"og:video:{size}"] = val - - elif oembed_type == "link": - open_graph_response["og:type"] = "website" - - else: - logger.warning("Unknown oEmbed type: %s", oembed_type) - - return OEmbedResult(open_graph_response, author_name, cache_age) - - -def _fetch_urls(tree: "etree.Element", tag_name: str) -> List[str]: - results = [] - for tag in tree.xpath("//*/" + tag_name): - if "src" in tag.attrib: - results.append(tag.attrib["src"]) - return results - - -def calc_description_and_urls(open_graph_response: JsonDict, html_body: str) -> None: - """ - Calculate description for an HTML document. - - This uses lxml to convert the HTML document into plaintext. If errors - occur during processing of the document, an empty response is returned. 
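To make parse_oembed_response above concrete, a sketch of the mapping it performs for an invented photo-type response::

    raw_body = (
        b'{"version": "1.0", "type": "photo", "title": "A sunset",'
        b' "url": "https://example.com/sunset.jpg", "cache_age": 3600}'
    )
    # parse_oembed_response(requested_url, raw_body) on an OEmbedProvider
    # would yield roughly:
    #   open_graph_result = {"og:url": requested_url,
    #                        "og:title": "A sunset",
    #                        "og:image": "https://example.com/sunset.jpg"}
    #   author_name = None
    #   cache_age = 3_600_000   # 3600 seconds converted to milliseconds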
- - Args: - open_graph_response: The current Open Graph summary. This is updated with additional fields. - html_body: The HTML document, as bytes. - - Returns: - The summary - """ - # If there's no body, nothing useful is going to be found. - if not html_body: - return - - from lxml import etree - - # Create an HTML parser. If this fails, log and return no metadata. - parser = etree.HTMLParser(recover=True, encoding="utf-8") - - # Attempt to parse the body. If this fails, log and return no metadata. - tree = etree.fromstring(html_body, parser) - - # The data was successfully parsed, but no tree was found. - if tree is None: - return - - # Attempt to find interesting URLs (images, videos, embeds). - if "og:image" not in open_graph_response: - image_urls = _fetch_urls(tree, "img") - if image_urls: - open_graph_response["og:image"] = image_urls[0] - - video_urls = _fetch_urls(tree, "video") + _fetch_urls(tree, "embed") - if video_urls: - open_graph_response["og:video"] = video_urls[0] - - description = parse_html_description(tree) - if description: - open_graph_response["og:description"] = description diff --git a/synapse/rest/media/v1/preview_html.py b/synapse/rest/media/v1/preview_html.py deleted file mode 100644 index 516d0434f0..0000000000 --- a/synapse/rest/media/v1/preview_html.py +++ /dev/null @@ -1,501 +0,0 @@ -# Copyright 2021 The Matrix.org Foundation C.I.C. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -import codecs -import logging -import re -from typing import ( - TYPE_CHECKING, - Callable, - Dict, - Generator, - Iterable, - List, - Optional, - Set, - Union, -) - -if TYPE_CHECKING: - from lxml import etree - -logger = logging.getLogger(__name__) - -_charset_match = re.compile( - rb'<\s*meta[^>]*charset\s*=\s*"?([a-z0-9_-]+)"?', flags=re.I -) -_xml_encoding_match = re.compile( - rb'\s*<\s*\?\s*xml[^>]*encoding="([a-z0-9_-]+)"', flags=re.I -) -_content_type_match = re.compile(r'.*; *charset="?(.*?)"?(;|$)', flags=re.I) - -# Certain elements aren't meant for display. -ARIA_ROLES_TO_IGNORE = {"directory", "menu", "menubar", "toolbar"} - - -def _normalise_encoding(encoding: str) -> Optional[str]: - """Use the Python codec's name as the normalised entry.""" - try: - return codecs.lookup(encoding).name - except LookupError: - return None - - -def _get_html_media_encodings( - body: bytes, content_type: Optional[str] -) -> Iterable[str]: - """ - Get potential encoding of the body based on the (presumably) HTML body or the content-type header. - - The precedence used for finding a character encoding is: - - 1. tag with a charset declared. - 2. The XML document's character encoding attribute. - 3. The Content-Type header. - 4. Fallback to utf-8. - 5. Fallback to windows-1252. - - This roughly follows the algorithm used by BeautifulSoup's bs4.dammit.EncodingDetector. - - Args: - body: The HTML document, as bytes. - content_type: The Content-Type header. - - Returns: - The character encoding of the body, as a string. - """ - # There's no point in returning an encoding more than once. 
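A worked example of the precedence listed above, using an invented document::

    body = b'<html><head><meta charset="ISO-8859-1"></head><body>hi</body></html>'
    content_type = 'text/html; charset="utf-8"'
    # _get_html_media_encodings(body, content_type) yields, in order:
    #   "iso8859-1"   (the meta charset, normalised via codecs.lookup)
    #   "utf-8"       (from the Content-Type header)
    #   "cp1252"      (final fallback; "utf-8" is not repeated)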
- attempted_encodings: Set[str] = set() - - # Limit searches to the first 1kb, since it ought to be at the top. - body_start = body[:1024] - - # Check if it has an encoding set in a meta tag. - match = _charset_match.search(body_start) - if match: - encoding = _normalise_encoding(match.group(1).decode("ascii")) - if encoding: - attempted_encodings.add(encoding) - yield encoding - - # TODO Support - - # Check if it has an XML document with an encoding. - match = _xml_encoding_match.match(body_start) - if match: - encoding = _normalise_encoding(match.group(1).decode("ascii")) - if encoding and encoding not in attempted_encodings: - attempted_encodings.add(encoding) - yield encoding - - # Check the HTTP Content-Type header for a character set. - if content_type: - content_match = _content_type_match.match(content_type) - if content_match: - encoding = _normalise_encoding(content_match.group(1)) - if encoding and encoding not in attempted_encodings: - attempted_encodings.add(encoding) - yield encoding - - # Finally, fallback to UTF-8, then windows-1252. - for fallback in ("utf-8", "cp1252"): - if fallback not in attempted_encodings: - yield fallback - - -def decode_body( - body: bytes, uri: str, content_type: Optional[str] = None -) -> Optional["etree.Element"]: - """ - This uses lxml to parse the HTML document. - - Args: - body: The HTML document, as bytes. - uri: The URI used to download the body. - content_type: The Content-Type header. - - Returns: - The parsed HTML body, or None if an error occurred during processed. - """ - # If there's no body, nothing useful is going to be found. - if not body: - return None - - # The idea here is that multiple encodings are tried until one works. - # Unfortunately the result is never used and then LXML will decode the string - # again with the found encoding. - for encoding in _get_html_media_encodings(body, content_type): - try: - body.decode(encoding) - except Exception: - pass - else: - break - else: - logger.warning("Unable to decode HTML body for %s", uri) - return None - - from lxml import etree - - # Create an HTML parser. - parser = etree.HTMLParser(recover=True, encoding=encoding) - - # Attempt to parse the body. Returns None if the body was successfully - # parsed, but no tree was found. - return etree.fromstring(body, parser) - - -def _get_meta_tags( - tree: "etree.Element", - property: str, - prefix: str, - property_mapper: Optional[Callable[[str], Optional[str]]] = None, -) -> Dict[str, Optional[str]]: - """ - Search for meta tags prefixed with a particular string. - - Args: - tree: The parsed HTML document. - property: The name of the property which contains the tag name, e.g. - "property" for Open Graph. - prefix: The prefix on the property to search for, e.g. "og" for Open Graph. - property_mapper: An optional callable to map the property to the Open Graph - form. Can return None for a key to ignore that key. - - Returns: - A map of tag name to value. - """ - results: Dict[str, Optional[str]] = {} - for tag in tree.xpath( - f"//*/meta[starts-with(@{property}, '{prefix}:')][@content][not(@content='')]" - ): - # if we've got more than 50 tags, someone is taking the piss - if len(results) >= 50: - logger.warning( - "Skipping parsing of Open Graph for page with too many '%s:' tags", - prefix, - ) - return {} - - key = tag.attrib[property] - if property_mapper: - key = property_mapper(key) - # None is a special value used to ignore a value. 
- if key is None: - continue - - results[key] = tag.attrib["content"] - - return results - - -def _map_twitter_to_open_graph(key: str) -> Optional[str]: - """ - Map a Twitter card property to the analogous Open Graph property. - - Args: - key: The Twitter card property (starts with "twitter:"). - - Returns: - The Open Graph property (starts with "og:") or None to have this property - be ignored. - """ - # Twitter card properties with no analogous Open Graph property. - if key == "twitter:card" or key == "twitter:creator": - return None - if key == "twitter:site": - return "og:site_name" - # Otherwise, swap twitter to og. - return "og" + key[7:] - - -def parse_html_to_open_graph(tree: "etree.Element") -> Dict[str, Optional[str]]: - """ - Parse the HTML document into an Open Graph response. - - This uses lxml to search the HTML document for Open Graph data (or - synthesizes it from the document). - - Args: - tree: The parsed HTML document. - - Returns: - The Open Graph response as a dictionary. - """ - - # Search for Open Graph (og:) meta tags, e.g.: - # - # "og:type" : "video", - # "og:url" : "https://www.youtube.com/watch?v=LXDBoHyjmtw", - # "og:site_name" : "YouTube", - # "og:video:type" : "application/x-shockwave-flash", - # "og:description" : "Fun stuff happening here", - # "og:title" : "RemoteJam - Matrix team hack for Disrupt Europe Hackathon", - # "og:image" : "https://i.ytimg.com/vi/LXDBoHyjmtw/maxresdefault.jpg", - # "og:video:url" : "http://www.youtube.com/v/LXDBoHyjmtw?version=3&autohide=1", - # "og:video:width" : "1280" - # "og:video:height" : "720", - # "og:video:secure_url": "https://www.youtube.com/v/LXDBoHyjmtw?version=3", - - og = _get_meta_tags(tree, "property", "og") - - # TODO: Search for properties specific to the different Open Graph types, - # such as article: meta tags, e.g.: - # - # "article:publisher" : "https://www.facebook.com/thethudonline" /> - # "article:author" content="https://www.facebook.com/thethudonline" /> - # "article:tag" content="baby" /> - # "article:section" content="Breaking News" /> - # "article:published_time" content="2016-03-31T19:58:24+00:00" /> - # "article:modified_time" content="2016-04-01T18:31:53+00:00" /> - - # Search for Twitter Card (twitter:) meta tags, e.g.: - # - # "twitter:site" : "@matrixdotorg" - # "twitter:creator" : "@matrixdotorg" - # - # Twitter cards tags also duplicate Open Graph tags. - # - # See https://developer.twitter.com/en/docs/twitter-for-websites/cards/guides/getting-started - twitter = _get_meta_tags(tree, "name", "twitter", _map_twitter_to_open_graph) - # Merge the Twitter values with the Open Graph values, but do not overwrite - # information from Open Graph tags. - for key, value in twitter.items(): - if key not in og: - og[key] = value - - if "og:title" not in og: - # Attempt to find a title from the title tag, or the biggest header on the page. - title = tree.xpath("((//title)[1] | (//h1)[1] | (//h2)[1] | (//h3)[1])/text()") - if title: - og["og:title"] = title[0].strip() - else: - og["og:title"] = None - - if "og:image" not in og: - meta_image = tree.xpath( - "//*/meta[translate(@itemprop, 'IMAGE', 'image')='image'][not(@content='')]/@content[1]" - ) - # If a meta image is found, use it. - if meta_image: - og["og:image"] = meta_image[0] - else: - # Try to find images which are larger than 10px by 10px. 
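As a sketch of the Twitter-card mapping above (tag values invented, apart from the @matrixdotorg example already quoted in the comments)::

    # _map_twitter_to_open_graph maps, for example:
    #   "twitter:site"        -> "og:site_name"
    #   "twitter:description" -> "og:description"
    #   "twitter:card"        -> None (ignored entirely)
    # so a page carrying a twitter:site tag of "@matrixdotorg" only contributes
    # og:site_name if the Open Graph tags did not already provide one.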
- # - # TODO: consider inlined CSS styles as well as width & height attribs - images = tree.xpath("//img[@src][number(@width)>10][number(@height)>10]") - images = sorted( - images, - key=lambda i: ( - -1 * float(i.attrib["width"]) * float(i.attrib["height"]) - ), - ) - # If no images were found, try to find *any* images. - if not images: - images = tree.xpath("//img[@src][1]") - if images: - og["og:image"] = images[0].attrib["src"] - - # Finally, fallback to the favicon if nothing else. - else: - favicons = tree.xpath("//link[@href][contains(@rel, 'icon')]/@href[1]") - if favicons: - og["og:image"] = favicons[0] - - if "og:description" not in og: - # Check the first meta description tag for content. - meta_description = tree.xpath( - "//*/meta[translate(@name, 'DESCRIPTION', 'description')='description'][not(@content='')]/@content[1]" - ) - # If a meta description is found with content, use it. - if meta_description: - og["og:description"] = meta_description[0] - else: - og["og:description"] = parse_html_description(tree) - elif og["og:description"]: - # This must be a non-empty string at this point. - assert isinstance(og["og:description"], str) - og["og:description"] = summarize_paragraphs([og["og:description"]]) - - # TODO: delete the url downloads to stop diskfilling, - # as we only ever cared about its OG - return og - - -def parse_html_description(tree: "etree.Element") -> Optional[str]: - """ - Calculate a text description based on an HTML document. - - Grabs any text nodes which are inside the tag, unless they are within - an HTML5 semantic markup tag (
,