From 9bb2eac71962970d02842bca441f4bcdbbf93a11 Mon Sep 17 00:00:00 2001
From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com>
Date: Wed, 22 Feb 2023 15:29:09 -0500
Subject: Bump black from 22.12.0 to 23.1.0 (#15103)

---
 tests/rest/admin/test_device.py               |  3 ---
 tests/rest/admin/test_media.py                |  5 -----
 tests/rest/admin/test_room.py                 |  1 -
 tests/rest/admin/test_server_notice.py        |  1 -
 tests/rest/client/test_account.py             |  4 ----
 tests/rest/client/test_auth.py                |  2 --
 tests/rest/client/test_capabilities.py        |  1 -
 tests/rest/client/test_consent.py             |  1 -
 tests/rest/client/test_directory.py           |  1 -
 tests/rest/client/test_ephemeral_message.py   |  1 -
 tests/rest/client/test_events.py              |  3 ---
 tests/rest/client/test_filter.py              |  1 -
 tests/rest/client/test_login.py               |  2 --
 tests/rest/client/test_login_token_request.py |  1 -
 tests/rest/client/test_presence.py            |  1 -
 tests/rest/client/test_profile.py             |  3 ---
 tests/rest/client/test_register.py            |  4 ----
 tests/rest/client/test_rendezvous.py          |  1 -
 tests/rest/client/test_rooms.py               | 14 ++------------
 tests/rest/client/test_sync.py                |  3 ---
 tests/rest/client/test_third_party_rules.py   |  3 +++
 tests/rest/media/test_media_retention.py      |  1 -
 tests/rest/media/v1/test_media_storage.py     |  3 ---
 tests/rest/media/v1/test_url_preview.py       |  3 ---
 24 files changed, 5 insertions(+), 58 deletions(-)

(limited to 'tests/rest')

diff --git a/tests/rest/admin/test_device.py b/tests/rest/admin/test_device.py
index 03f2112b07..aaa488bced 100644
--- a/tests/rest/admin/test_device.py
+++ b/tests/rest/admin/test_device.py
@@ -28,7 +28,6 @@ from tests import unittest
 
 
 class DeviceRestTestCase(unittest.HomeserverTestCase):
-
     servlets = [
         synapse.rest.admin.register_servlets,
         login.register_servlets,
@@ -291,7 +290,6 @@ class DeviceRestTestCase(unittest.HomeserverTestCase):
 
 
 class DevicesRestTestCase(unittest.HomeserverTestCase):
-
     servlets = [
         synapse.rest.admin.register_servlets,
         login.register_servlets,
@@ -415,7 +413,6 @@ class DevicesRestTestCase(unittest.HomeserverTestCase):
 
 
 class DeleteDevicesRestTestCase(unittest.HomeserverTestCase):
-
     servlets = [
         synapse.rest.admin.register_servlets,
         login.register_servlets,
diff --git a/tests/rest/admin/test_media.py b/tests/rest/admin/test_media.py
index db77a45ae3..f41319a5b6 100644
--- a/tests/rest/admin/test_media.py
+++ b/tests/rest/admin/test_media.py
@@ -34,7 +34,6 @@ INVALID_TIMESTAMP_IN_S = 1893456000  # 2030-01-01 in seconds
 
 
 class DeleteMediaByIDTestCase(unittest.HomeserverTestCase):
-
     servlets = [
         synapse.rest.admin.register_servlets,
         synapse.rest.admin.register_servlets_for_media_repo,
@@ -196,7 +195,6 @@ class DeleteMediaByIDTestCase(unittest.HomeserverTestCase):
 
 
 class DeleteMediaByDateSizeTestCase(unittest.HomeserverTestCase):
-
     servlets = [
         synapse.rest.admin.register_servlets,
         synapse.rest.admin.register_servlets_for_media_repo,
@@ -594,7 +592,6 @@ class DeleteMediaByDateSizeTestCase(unittest.HomeserverTestCase):
 
 
 class QuarantineMediaByIDTestCase(unittest.HomeserverTestCase):
-
     servlets = [
         synapse.rest.admin.register_servlets,
         synapse.rest.admin.register_servlets_for_media_repo,
@@ -724,7 +721,6 @@ class QuarantineMediaByIDTestCase(unittest.HomeserverTestCase):
 
 
 class ProtectMediaByIDTestCase(unittest.HomeserverTestCase):
-
     servlets = [
         synapse.rest.admin.register_servlets,
         synapse.rest.admin.register_servlets_for_media_repo,
@@ -821,7 +817,6 @@ class ProtectMediaByIDTestCase(unittest.HomeserverTestCase):
 
 
 class PurgeMediaCacheTestCase(unittest.HomeserverTestCase):
-
     servlets = [
         synapse.rest.admin.register_servlets,
         synapse.rest.admin.register_servlets_for_media_repo,
diff --git a/tests/rest/admin/test_room.py b/tests/rest/admin/test_room.py
index 453a6e979c..9dbb778679 100644
--- a/tests/rest/admin/test_room.py
+++ b/tests/rest/admin/test_room.py
@@ -1990,7 +1990,6 @@ class RoomMessagesTestCase(unittest.HomeserverTestCase):
 
 
 class JoinAliasRoomTestCase(unittest.HomeserverTestCase):
-
     servlets = [
         synapse.rest.admin.register_servlets,
         room.register_servlets,
diff --git a/tests/rest/admin/test_server_notice.py b/tests/rest/admin/test_server_notice.py
index f71ff46d87..28b999573e 100644
--- a/tests/rest/admin/test_server_notice.py
+++ b/tests/rest/admin/test_server_notice.py
@@ -28,7 +28,6 @@ from tests.unittest import override_config
 
 
 class ServerNoticeTestCase(unittest.HomeserverTestCase):
-
     servlets = [
         synapse.rest.admin.register_servlets,
         login.register_servlets,
diff --git a/tests/rest/client/test_account.py b/tests/rest/client/test_account.py
index e2ee1a1766..2b05dffc7d 100644
--- a/tests/rest/client/test_account.py
+++ b/tests/rest/client/test_account.py
@@ -40,7 +40,6 @@ from tests.unittest import override_config
 
 
 class PasswordResetTestCase(unittest.HomeserverTestCase):
-
     servlets = [
         account.register_servlets,
         synapse.rest.admin.register_servlets_for_client_rest_resource,
@@ -408,7 +407,6 @@ class PasswordResetTestCase(unittest.HomeserverTestCase):
 
 
 class DeactivateTestCase(unittest.HomeserverTestCase):
-
     servlets = [
         synapse.rest.admin.register_servlets_for_client_rest_resource,
         login.register_servlets,
@@ -492,7 +490,6 @@ class DeactivateTestCase(unittest.HomeserverTestCase):
 
 
 class WhoamiTestCase(unittest.HomeserverTestCase):
-
     servlets = [
         synapse.rest.admin.register_servlets_for_client_rest_resource,
         login.register_servlets,
@@ -567,7 +564,6 @@ class WhoamiTestCase(unittest.HomeserverTestCase):
 
 
 class ThreepidEmailRestTestCase(unittest.HomeserverTestCase):
-
     servlets = [
         account.register_servlets,
         login.register_servlets,
diff --git a/tests/rest/client/test_auth.py b/tests/rest/client/test_auth.py
index a144610078..0d8fe77b88 100644
--- a/tests/rest/client/test_auth.py
+++ b/tests/rest/client/test_auth.py
@@ -52,7 +52,6 @@ class DummyRecaptchaChecker(UserInteractiveAuthChecker):
 
 
 class FallbackAuthTests(unittest.HomeserverTestCase):
-
     servlets = [
         auth.register_servlets,
         register.register_servlets,
@@ -60,7 +59,6 @@ class FallbackAuthTests(unittest.HomeserverTestCase):
     hijack_auth = False
 
     def make_homeserver(self, reactor: MemoryReactor, clock: Clock) -> HomeServer:
-
         config = self.default_config()
 
         config["enable_registration_captcha"] = True
diff --git a/tests/rest/client/test_capabilities.py b/tests/rest/client/test_capabilities.py
index d1751e1557..c16e8d43f4 100644
--- a/tests/rest/client/test_capabilities.py
+++ b/tests/rest/client/test_capabilities.py
@@ -26,7 +26,6 @@ from tests.unittest import override_config
 
 
 class CapabilitiesTestCase(unittest.HomeserverTestCase):
-
     servlets = [
         synapse.rest.admin.register_servlets_for_client_rest_resource,
         capabilities.register_servlets,
diff --git a/tests/rest/client/test_consent.py b/tests/rest/client/test_consent.py
index b1ca81a911..bb845179d3 100644
--- a/tests/rest/client/test_consent.py
+++ b/tests/rest/client/test_consent.py
@@ -38,7 +38,6 @@ class ConsentResourceTestCase(unittest.HomeserverTestCase):
     hijack_auth = False
 
     def make_homeserver(self, reactor: MemoryReactor, clock: Clock) -> HomeServer:
-
         config = self.default_config()
         config["form_secret"] = "123abc"
 
diff --git a/tests/rest/client/test_directory.py b/tests/rest/client/test_directory.py
index 7a88aa2cda..6490e883bf 100644
--- a/tests/rest/client/test_directory.py
+++ b/tests/rest/client/test_directory.py
@@ -28,7 +28,6 @@ from tests.unittest import override_config
 
 
 class DirectoryTestCase(unittest.HomeserverTestCase):
-
     servlets = [
         admin.register_servlets_for_client_rest_resource,
         directory.register_servlets,
diff --git a/tests/rest/client/test_ephemeral_message.py b/tests/rest/client/test_ephemeral_message.py
index 9fa1f82dfe..f31ebc8021 100644
--- a/tests/rest/client/test_ephemeral_message.py
+++ b/tests/rest/client/test_ephemeral_message.py
@@ -26,7 +26,6 @@ from tests import unittest
 
 
 class EphemeralMessageTestCase(unittest.HomeserverTestCase):
-
     user_id = "@user:test"
 
     servlets = [
diff --git a/tests/rest/client/test_events.py b/tests/rest/client/test_events.py
index a9b7db9db2..54df2a252c 100644
--- a/tests/rest/client/test_events.py
+++ b/tests/rest/client/test_events.py
@@ -38,7 +38,6 @@ class EventStreamPermissionsTestCase(unittest.HomeserverTestCase):
     ]
 
     def make_homeserver(self, reactor: MemoryReactor, clock: Clock) -> HomeServer:
-
         config = self.default_config()
         config["enable_registration_captcha"] = False
         config["enable_registration"] = True
@@ -51,7 +50,6 @@ class EventStreamPermissionsTestCase(unittest.HomeserverTestCase):
         return hs
 
     def prepare(self, reactor: MemoryReactor, clock: Clock, hs: HomeServer) -> None:
-
         # register an account
         self.user_id = self.register_user("sid1", "pass")
         self.token = self.login(self.user_id, "pass")
@@ -142,7 +140,6 @@ class GetEventsTestCase(unittest.HomeserverTestCase):
     ]
 
     def prepare(self, reactor: MemoryReactor, clock: Clock, hs: HomeServer) -> None:
-
         # register an account
         self.user_id = self.register_user("sid1", "pass")
         self.token = self.login(self.user_id, "pass")
diff --git a/tests/rest/client/test_filter.py b/tests/rest/client/test_filter.py
index 830762fd53..91678abf13 100644
--- a/tests/rest/client/test_filter.py
+++ b/tests/rest/client/test_filter.py
@@ -25,7 +25,6 @@ PATH_PREFIX = "/_matrix/client/v2_alpha"
 
 
 class FilterTestCase(unittest.HomeserverTestCase):
-
     user_id = "@apple:test"
     hijack_auth = True
     EXAMPLE_FILTER = {"room": {"timeline": {"types": ["m.room.message"]}}}
diff --git a/tests/rest/client/test_login.py b/tests/rest/client/test_login.py
index ff5baa9f0a..62acf4f44e 100644
--- a/tests/rest/client/test_login.py
+++ b/tests/rest/client/test_login.py
@@ -89,7 +89,6 @@ ADDITIONAL_LOGIN_FLOWS = [
 
 
 class LoginRestServletTestCase(unittest.HomeserverTestCase):
-
     servlets = [
         synapse.rest.admin.register_servlets_for_client_rest_resource,
         login.register_servlets,
@@ -737,7 +736,6 @@ class MultiSSOTestCase(unittest.HomeserverTestCase):
 
 
 class CASTestCase(unittest.HomeserverTestCase):
-
     servlets = [
         login.register_servlets,
     ]
diff --git a/tests/rest/client/test_login_token_request.py b/tests/rest/client/test_login_token_request.py
index 6aedc1a11c..b8187db982 100644
--- a/tests/rest/client/test_login_token_request.py
+++ b/tests/rest/client/test_login_token_request.py
@@ -26,7 +26,6 @@ endpoint = "/_matrix/client/unstable/org.matrix.msc3882/login/token"
 
 
 class LoginTokenRequestServletTestCase(unittest.HomeserverTestCase):
-
     servlets = [
         login.register_servlets,
         admin.register_servlets,
diff --git a/tests/rest/client/test_presence.py b/tests/rest/client/test_presence.py
index 67e16880e6..dcbb125a3b 100644
--- a/tests/rest/client/test_presence.py
+++ b/tests/rest/client/test_presence.py
@@ -35,7 +35,6 @@ class PresenceTestCase(unittest.HomeserverTestCase):
     servlets = [presence.register_servlets]
 
     def make_homeserver(self, reactor: MemoryReactor, clock: Clock) -> HomeServer:
-
         self.presence_handler = Mock(spec=PresenceHandler)
         self.presence_handler.set_state.return_value = make_awaitable(None)
 
diff --git a/tests/rest/client/test_profile.py b/tests/rest/client/test_profile.py
index 8de5a342ae..27c93ad761 100644
--- a/tests/rest/client/test_profile.py
+++ b/tests/rest/client/test_profile.py
@@ -30,7 +30,6 @@ from tests import unittest
 
 
 class ProfileTestCase(unittest.HomeserverTestCase):
-
     servlets = [
         admin.register_servlets_for_client_rest_resource,
         login.register_servlets,
@@ -324,7 +323,6 @@ class ProfileTestCase(unittest.HomeserverTestCase):
 
 
 class ProfilesRestrictedTestCase(unittest.HomeserverTestCase):
-
     servlets = [
         admin.register_servlets_for_client_rest_resource,
         login.register_servlets,
@@ -404,7 +402,6 @@ class ProfilesRestrictedTestCase(unittest.HomeserverTestCase):
 
 
 class OwnProfileUnrestrictedTestCase(unittest.HomeserverTestCase):
-
     servlets = [
         admin.register_servlets_for_client_rest_resource,
         login.register_servlets,
diff --git a/tests/rest/client/test_register.py b/tests/rest/client/test_register.py
index 4c561f9525..b228dba861 100644
--- a/tests/rest/client/test_register.py
+++ b/tests/rest/client/test_register.py
@@ -40,7 +40,6 @@ from tests.unittest import override_config
 
 
 class RegisterRestServletTestCase(unittest.HomeserverTestCase):
-
     servlets = [
         login.register_servlets,
         register.register_servlets,
@@ -797,7 +796,6 @@ class RegisterRestServletTestCase(unittest.HomeserverTestCase):
 
 
 class AccountValidityTestCase(unittest.HomeserverTestCase):
-
     servlets = [
         register.register_servlets,
         synapse.rest.admin.register_servlets_for_client_rest_resource,
@@ -913,7 +911,6 @@ class AccountValidityTestCase(unittest.HomeserverTestCase):
 
 
 class AccountValidityRenewalByEmailTestCase(unittest.HomeserverTestCase):
-
     servlets = [
         register.register_servlets,
         synapse.rest.admin.register_servlets_for_client_rest_resource,
@@ -1132,7 +1129,6 @@ class AccountValidityRenewalByEmailTestCase(unittest.HomeserverTestCase):
 
 
 class AccountValidityBackgroundJobTestCase(unittest.HomeserverTestCase):
-
     servlets = [synapse.rest.admin.register_servlets_for_client_rest_resource]
 
     def make_homeserver(self, reactor: MemoryReactor, clock: Clock) -> HomeServer:
diff --git a/tests/rest/client/test_rendezvous.py b/tests/rest/client/test_rendezvous.py
index c0eb5d01a6..8dbd64be55 100644
--- a/tests/rest/client/test_rendezvous.py
+++ b/tests/rest/client/test_rendezvous.py
@@ -25,7 +25,6 @@ endpoint = "/_matrix/client/unstable/org.matrix.msc3886/rendezvous"
 
 
 class RendezvousServletTestCase(unittest.HomeserverTestCase):
-
     servlets = [
         rendezvous.register_servlets,
     ]
diff --git a/tests/rest/client/test_rooms.py b/tests/rest/client/test_rooms.py
index cfad182b2f..4dd763096d 100644
--- a/tests/rest/client/test_rooms.py
+++ b/tests/rest/client/test_rooms.py
@@ -65,7 +65,6 @@ class RoomBase(unittest.HomeserverTestCase):
     servlets = [room.register_servlets, room.register_deprecated_servlets]
 
     def make_homeserver(self, reactor: MemoryReactor, clock: Clock) -> HomeServer:
-
         self.hs = self.setup_test_homeserver(
             "red",
             federation_http_client=None,
@@ -92,7 +91,6 @@ class RoomPermissionsTestCase(RoomBase):
     rmcreator_id = "@notme:red"
 
     def prepare(self, reactor: MemoryReactor, clock: Clock, hs: HomeServer) -> None:
-
         self.helper.auth_user_id = self.rmcreator_id
         # create some rooms under the name rmcreator_id
         self.uncreated_rmid = "!aa:test"
@@ -1127,7 +1125,6 @@ class RoomInviteRatelimitTestCase(RoomBase):
 
 
 class RoomJoinTestCase(RoomBase):
-
     servlets = [
         admin.register_servlets,
         login.register_servlets,
@@ -2102,7 +2099,6 @@ class RoomSearchTestCase(unittest.HomeserverTestCase):
     hijack_auth = False
 
     def prepare(self, reactor: MemoryReactor, clock: Clock, hs: HomeServer) -> None:
-
         # Register the user who does the searching
         self.user_id2 = self.register_user("user", "pass")
         self.access_token = self.login("user", "pass")
@@ -2195,7 +2191,6 @@ class RoomSearchTestCase(unittest.HomeserverTestCase):
 
 
 class PublicRoomsRestrictedTestCase(unittest.HomeserverTestCase):
-
     servlets = [
         synapse.rest.admin.register_servlets_for_client_rest_resource,
         room.register_servlets,
@@ -2203,7 +2198,6 @@ class PublicRoomsRestrictedTestCase(unittest.HomeserverTestCase):
     ]
 
     def make_homeserver(self, reactor: MemoryReactor, clock: Clock) -> HomeServer:
-
         self.url = b"/_matrix/client/r0/publicRooms"
 
         config = self.default_config()
@@ -2225,7 +2219,6 @@ class PublicRoomsRestrictedTestCase(unittest.HomeserverTestCase):
 
 
 class PublicRoomsRoomTypeFilterTestCase(unittest.HomeserverTestCase):
-
     servlets = [
         synapse.rest.admin.register_servlets_for_client_rest_resource,
         room.register_servlets,
@@ -2233,7 +2226,6 @@ class PublicRoomsRoomTypeFilterTestCase(unittest.HomeserverTestCase):
     ]
 
     def make_homeserver(self, reactor: MemoryReactor, clock: Clock) -> HomeServer:
-
         config = self.default_config()
         config["allow_public_rooms_without_auth"] = True
         self.hs = self.setup_test_homeserver(config=config)
@@ -2414,7 +2406,6 @@ class PublicRoomsTestRemoteSearchFallbackTestCase(unittest.HomeserverTestCase):
 
 
 class PerRoomProfilesForbiddenTestCase(unittest.HomeserverTestCase):
-
     servlets = [
         synapse.rest.admin.register_servlets_for_client_rest_resource,
         room.register_servlets,
@@ -2983,7 +2974,6 @@ class RelationsTestCase(PaginationTestCase):
 
 
 class ContextTestCase(unittest.HomeserverTestCase):
-
     servlets = [
         synapse.rest.admin.register_servlets_for_client_rest_resource,
         room.register_servlets,
@@ -3359,7 +3349,6 @@ class RoomCanonicalAliasTestCase(unittest.HomeserverTestCase):
 
 
 class ThreepidInviteTestCase(unittest.HomeserverTestCase):
-
     servlets = [
         admin.register_servlets,
         login.register_servlets,
@@ -3438,7 +3427,8 @@ class ThreepidInviteTestCase(unittest.HomeserverTestCase):
         """
         Test allowing/blocking threepid invites with a spam-check module.
 
-        In this test, we use the more recent API in which callbacks return a `Union[Codes, Literal["NOT_SPAM"]]`."""
+        In this test, we use the more recent API in which callbacks return a `Union[Codes, Literal["NOT_SPAM"]]`.
+        """
         # Mock a few functions to prevent the test from failing due to failing to talk to
         # a remote IS. We keep the mock for make_and_store_3pid_invite around so we
         # can check its call_count later on during the test.
diff --git a/tests/rest/client/test_sync.py b/tests/rest/client/test_sync.py
index b9047194dd..9c876c7a32 100644
--- a/tests/rest/client/test_sync.py
+++ b/tests/rest/client/test_sync.py
@@ -41,7 +41,6 @@ from tests.server import TimedOutException
 
 
 class FilterTestCase(unittest.HomeserverTestCase):
-
     user_id = "@apple:test"
     servlets = [
         synapse.rest.admin.register_servlets_for_client_rest_resource,
@@ -191,7 +190,6 @@ class SyncFilterTestCase(unittest.HomeserverTestCase):
 
 
 class SyncTypingTests(unittest.HomeserverTestCase):
-
     servlets = [
         synapse.rest.admin.register_servlets_for_client_rest_resource,
         room.register_servlets,
@@ -892,7 +890,6 @@ class DeviceListSyncTestCase(unittest.HomeserverTestCase):
 
 
 class ExcludeRoomTestCase(unittest.HomeserverTestCase):
-
     servlets = [
         synapse.rest.admin.register_servlets,
         login.register_servlets,
diff --git a/tests/rest/client/test_third_party_rules.py b/tests/rest/client/test_third_party_rules.py
index 5fa3440691..c0f93f898a 100644
--- a/tests/rest/client/test_third_party_rules.py
+++ b/tests/rest/client/test_third_party_rules.py
@@ -137,6 +137,7 @@ class ThirdPartyRulesTestCase(unittest.FederatingHomeserverTestCase):
         """Tests that a forbidden event is forbidden from being sent, but an allowed one
         can be sent.
         """
+
         # patch the rules module with a Mock which will return False for some event
         # types
         async def check(
@@ -243,6 +244,7 @@ class ThirdPartyRulesTestCase(unittest.FederatingHomeserverTestCase):
 
     def test_modify_event(self) -> None:
         """The module can return a modified version of the event"""
+
         # first patch the event checker so that it will modify the event
         async def check(
             ev: EventBase, state: StateMap[EventBase]
@@ -275,6 +277,7 @@ class ThirdPartyRulesTestCase(unittest.FederatingHomeserverTestCase):
 
     def test_message_edit(self) -> None:
         """Ensure that the module doesn't cause issues with edited messages."""
+
         # first patch the event checker so that it will modify the event
         async def check(
             ev: EventBase, state: StateMap[EventBase]
diff --git a/tests/rest/media/test_media_retention.py b/tests/rest/media/test_media_retention.py
index 23f227aed6..b59d9dfd4d 100644
--- a/tests/rest/media/test_media_retention.py
+++ b/tests/rest/media/test_media_retention.py
@@ -31,7 +31,6 @@ from tests.utils import MockClock
 
 
 class MediaRetentionTestCase(unittest.HomeserverTestCase):
-
     ONE_DAY_IN_MS = 24 * 60 * 60 * 1000
     THIRTY_DAYS_IN_MS = 30 * ONE_DAY_IN_MS
 
diff --git a/tests/rest/media/v1/test_media_storage.py b/tests/rest/media/v1/test_media_storage.py
index 17a3b06a8e..8ed27179c4 100644
--- a/tests/rest/media/v1/test_media_storage.py
+++ b/tests/rest/media/v1/test_media_storage.py
@@ -52,7 +52,6 @@ from tests.utils import default_config
 
 
 class MediaStorageTests(unittest.HomeserverTestCase):
-
     needs_threadpool = True
 
     def prepare(self, reactor: MemoryReactor, clock: Clock, hs: HomeServer) -> None:
@@ -207,7 +206,6 @@ class MediaRepoTests(unittest.HomeserverTestCase):
     user_id = "@test:user"
 
     def make_homeserver(self, reactor: MemoryReactor, clock: Clock) -> HomeServer:
-
         self.fetches: List[
             Tuple[
                 "Deferred[Tuple[bytes, Tuple[int, Dict[bytes, List[bytes]]]]]",
@@ -268,7 +266,6 @@ class MediaRepoTests(unittest.HomeserverTestCase):
         return hs
 
     def prepare(self, reactor: MemoryReactor, clock: Clock, hs: HomeServer) -> None:
-
         media_resource = hs.get_media_repository_resource()
         self.download_resource = media_resource.children[b"download"]
         self.thumbnail_resource = media_resource.children[b"thumbnail"]
diff --git a/tests/rest/media/v1/test_url_preview.py b/tests/rest/media/v1/test_url_preview.py
index 2c321f8d04..6fcf60ce19 100644
--- a/tests/rest/media/v1/test_url_preview.py
+++ b/tests/rest/media/v1/test_url_preview.py
@@ -58,7 +58,6 @@ class URLPreviewTests(unittest.HomeserverTestCase):
     )
 
     def make_homeserver(self, reactor: MemoryReactor, clock: Clock) -> HomeServer:
-
         config = self.default_config()
         config["url_preview_enabled"] = True
         config["max_spider_size"] = 9999999
@@ -118,7 +117,6 @@ class URLPreviewTests(unittest.HomeserverTestCase):
         return hs
 
     def prepare(self, reactor: MemoryReactor, clock: Clock, hs: HomeServer) -> None:
-
         self.media_repo = hs.get_media_repository_resource()
         self.preview_url = self.media_repo.children[b"preview_url"]
 
@@ -133,7 +131,6 @@ class URLPreviewTests(unittest.HomeserverTestCase):
                 addressTypes: Optional[Sequence[Type[IAddress]]] = None,
                 transportSemantics: str = "TCP",
             ) -> IResolutionReceiver:
-
                 resolution = HostResolution(hostName)
                 resolutionReceiver.resolutionBegan(resolution)
                 if hostName not in self.lookups:
-- 
cgit 1.5.1


From 682151a464f688768d5bd8308e16bd4024ad2e57 Mon Sep 17 00:00:00 2001
From: Patrick Cloke <clokep@users.noreply.github.com>
Date: Thu, 23 Feb 2023 16:08:53 -0500
Subject: Do not fail completely if oEmbed autodiscovery fails. (#15092)

Previously if an autodiscovered oEmbed request failed (e.g. the
oEmbed endpoint is down or does not exist) then the entire URL
preview would fail. Instead we now return everything we can, even
if this additional request fails.
---
 changelog.d/15092.bugfix                      |  1 +
 synapse/rest/media/v1/preview_url_resource.py | 33 ++++++++++++++------
 tests/rest/media/v1/test_url_preview.py       | 44 +++++++++++++++++++++++++--
 3 files changed, 65 insertions(+), 13 deletions(-)
 create mode 100644 changelog.d/15092.bugfix

(limited to 'tests/rest')

diff --git a/changelog.d/15092.bugfix b/changelog.d/15092.bugfix
new file mode 100644
index 0000000000..67509c5c69
--- /dev/null
+++ b/changelog.d/15092.bugfix
@@ -0,0 +1 @@
+Fix a long-standing bug where a URL preview would break if the discovered oEmbed failed to download.
diff --git a/synapse/rest/media/v1/preview_url_resource.py b/synapse/rest/media/v1/preview_url_resource.py
index a8f6fd6b35..4a594ab9d8 100644
--- a/synapse/rest/media/v1/preview_url_resource.py
+++ b/synapse/rest/media/v1/preview_url_resource.py
@@ -163,6 +163,10 @@ class PreviewUrlResource(DirectServeJsonResource):
        7. Stores the result in the database cache.
     4. Returns the result.
 
+    If any additional requests (e.g. from oEmbed autodiscovery, step 5.3 or
+    image thumbnailing, step 5.4 or 6.4) fail then the URL preview as a whole
+    does not fail. As much information as possible is returned.
+
     The in-memory cache expires after 1 hour.
 
     Expired entries in the database cache (and their associated media files) are
@@ -364,16 +368,25 @@ class PreviewUrlResource(DirectServeJsonResource):
                 oembed_url = self._oembed.autodiscover_from_html(tree)
                 og_from_oembed: JsonDict = {}
                 if oembed_url:
-                    oembed_info = await self._handle_url(
-                        oembed_url, user, allow_data_urls=True
-                    )
-                    (
-                        og_from_oembed,
-                        author_name,
-                        expiration_ms,
-                    ) = await self._handle_oembed_response(
-                        url, oembed_info, expiration_ms
-                    )
+                    try:
+                        oembed_info = await self._handle_url(
+                            oembed_url, user, allow_data_urls=True
+                        )
+                    except Exception as e:
+                        # Fetching the oEmbed info failed, don't block the entire URL preview.
+                        logger.warning(
+                            "oEmbed fetch failed during URL preview: %s errored with %s",
+                            oembed_url,
+                            e,
+                        )
+                    else:
+                        (
+                            og_from_oembed,
+                            author_name,
+                            expiration_ms,
+                        ) = await self._handle_oembed_response(
+                            url, oembed_info, expiration_ms
+                        )
 
                 # Parse Open Graph information from the HTML in case the oEmbed
                 # response failed or is incomplete.
diff --git a/tests/rest/media/v1/test_url_preview.py b/tests/rest/media/v1/test_url_preview.py
index 6fcf60ce19..2acfccec61 100644
--- a/tests/rest/media/v1/test_url_preview.py
+++ b/tests/rest/media/v1/test_url_preview.py
@@ -657,7 +657,7 @@ class URLPreviewTests(unittest.HomeserverTestCase):
         """If the preview image doesn't exist, ensure some data is returned."""
         self.lookups["matrix.org"] = [(IPv4Address, "10.1.2.3")]
 
-        end_content = (
+        result = (
             b"""<html><body><img src="http://cdn.matrix.org/foo.jpg"></body></html>"""
         )
 
@@ -678,8 +678,8 @@ class URLPreviewTests(unittest.HomeserverTestCase):
                 b"HTTP/1.0 200 OK\r\nContent-Length: %d\r\n"
                 b'Content-Type: text/html; charset="utf8"\r\n\r\n'
             )
-            % (len(end_content),)
-            + end_content
+            % (len(result),)
+            + result
         )
 
         self.pump()
@@ -688,6 +688,44 @@ class URLPreviewTests(unittest.HomeserverTestCase):
         # The image should not be in the result.
         self.assertNotIn("og:image", channel.json_body)
 
+    def test_oembed_failure(self) -> None:
+        """If the autodiscovered oEmbed URL fails, ensure some data is returned."""
+        self.lookups["matrix.org"] = [(IPv4Address, "10.1.2.3")]
+
+        result = b"""
+        <title>oEmbed Autodiscovery Fail</title>
+        <link rel="alternate" type="application/json+oembed"
+            href="http://example.com/oembed?url=http%3A%2F%2Fmatrix.org&format=json"
+            title="matrixdotorg" />
+        """
+
+        channel = self.make_request(
+            "GET",
+            "preview_url?url=http://matrix.org",
+            shorthand=False,
+            await_result=False,
+        )
+        self.pump()
+
+        client = self.reactor.tcpClients[0][2].buildProtocol(None)
+        server = AccumulatingProtocol()
+        server.makeConnection(FakeTransport(client, self.reactor))
+        client.makeConnection(FakeTransport(server, self.reactor))
+        client.dataReceived(
+            (
+                b"HTTP/1.0 200 OK\r\nContent-Length: %d\r\n"
+                b'Content-Type: text/html; charset="utf8"\r\n\r\n'
+            )
+            % (len(result),)
+            + result
+        )
+
+        self.pump()
+        self.assertEqual(channel.code, 200)
+
+        # The title parsed from the HTML should still be in the result.
+        self.assertEqual(channel.json_body["og:title"], "oEmbed Autodiscovery Fail")
+
     def test_data_url(self) -> None:
         """
         Requesting to preview a data URL is not supported.
-- 
cgit 1.5.1


From 1c95ddd09bbc46046a3412e7bb03a87aa3b6f65a Mon Sep 17 00:00:00 2001
From: Shay <hillerys@element.io>
Date: Fri, 24 Feb 2023 13:15:29 -0800
Subject: Batch up storing state groups when creating new room (#14918)

---
 changelog.d/14918.misc                      |   1 +
 synapse/events/snapshot.py                  |  49 +++++++++++
 synapse/handlers/message.py                 |  16 ++--
 synapse/handlers/room.py                    |  37 ++++----
 synapse/handlers/room_batch.py              |   4 +-
 synapse/handlers/room_member.py             |  13 ++-
 synapse/storage/databases/state/store.py    | 119 ++++++++++++++++++++++++++
 tests/handlers/test_message.py              |  25 ++++--
 tests/handlers/test_register.py             |   3 +-
 tests/push/test_bulk_push_rule_evaluator.py |  13 +--
 tests/rest/client/test_rooms.py             |   4 +-
 tests/storage/test_event_chain.py           |   6 +-
 tests/storage/test_state.py                 | 126 ++++++++++++++++++++++++++++
 tests/unittest.py                           |   4 +-
 14 files changed, 371 insertions(+), 49 deletions(-)
 create mode 100644 changelog.d/14918.misc

(limited to 'tests/rest')

diff --git a/changelog.d/14918.misc b/changelog.d/14918.misc
new file mode 100644
index 0000000000..828794354a
--- /dev/null
+++ b/changelog.d/14918.misc
@@ -0,0 +1 @@
+Batch up storing state groups when creating a new room.
\ No newline at end of file
diff --git a/synapse/events/snapshot.py b/synapse/events/snapshot.py
index e0d82ad81c..a91a5d1e3c 100644
--- a/synapse/events/snapshot.py
+++ b/synapse/events/snapshot.py
@@ -23,6 +23,7 @@ from synapse.types import JsonDict, StateMap
 
 if TYPE_CHECKING:
     from synapse.storage.controllers import StorageControllers
+    from synapse.storage.databases import StateGroupDataStore
     from synapse.storage.databases.main import DataStore
     from synapse.types.state import StateFilter
 
@@ -348,6 +349,54 @@ class UnpersistedEventContext(UnpersistedEventContextBase):
     partial_state: bool
     state_map_before_event: Optional[StateMap[str]] = None
 
+    @classmethod
+    async def batch_persist_unpersisted_contexts(
+        cls,
+        events_and_context: List[Tuple[EventBase, "UnpersistedEventContextBase"]],
+        room_id: str,
+        last_known_state_group: int,
+        datastore: "StateGroupDataStore",
+    ) -> List[Tuple[EventBase, EventContext]]:
+        """
+        Takes a list of events and their associated unpersisted contexts and persists
+        the unpersisted contexts, returning a list of events and persisted contexts.
+        Note that all the events must be in a linear chain (ie a <- b <- c).
+
+        Args:
+            events_and_context: A list of events and their unpersisted contexts
+            room_id: the room_id for the events
+            last_known_state_group: the last persisted state group
+            datastore: a state datastore
+        """
+        amended_events_and_context = await datastore.store_state_deltas_for_batched(
+            events_and_context, room_id, last_known_state_group
+        )
+
+        events_and_persisted_context = []
+        for event, unpersisted_context in amended_events_and_context:
+            if event.is_state():
+                context = EventContext(
+                    storage=unpersisted_context._storage,
+                    state_group=unpersisted_context.state_group_after_event,
+                    state_group_before_event=unpersisted_context.state_group_before_event,
+                    state_delta_due_to_event=unpersisted_context.state_delta_due_to_event,
+                    partial_state=unpersisted_context.partial_state,
+                    prev_group=unpersisted_context.state_group_before_event,
+                    delta_ids=unpersisted_context.state_delta_due_to_event,
+                )
+            else:
+                context = EventContext(
+                    storage=unpersisted_context._storage,
+                    state_group=unpersisted_context.state_group_after_event,
+                    state_group_before_event=unpersisted_context.state_group_before_event,
+                    state_delta_due_to_event=unpersisted_context.state_delta_due_to_event,
+                    partial_state=unpersisted_context.partial_state,
+                    prev_group=unpersisted_context.prev_group_for_state_group_before_event,
+                    delta_ids=unpersisted_context.delta_ids_to_state_group_before_event,
+                )
+            events_and_persisted_context.append((event, context))
+        return events_and_persisted_context
+
     async def get_prev_state_ids(
         self, state_filter: Optional["StateFilter"] = None
     ) -> StateMap[str]:
diff --git a/synapse/handlers/message.py b/synapse/handlers/message.py
index aa90d0000d..e433d6b01f 100644
--- a/synapse/handlers/message.py
+++ b/synapse/handlers/message.py
@@ -574,7 +574,7 @@ class EventCreationHandler:
         state_map: Optional[StateMap[str]] = None,
         for_batch: bool = False,
         current_state_group: Optional[int] = None,
-    ) -> Tuple[EventBase, EventContext]:
+    ) -> Tuple[EventBase, UnpersistedEventContextBase]:
         """
         Given a dict from a client, create a new event. If bool for_batch is true, will
         create an event using the prev_event_ids, and will create an event context for
@@ -721,8 +721,6 @@ class EventCreationHandler:
             current_state_group=current_state_group,
         )
 
-        context = await unpersisted_context.persist(event)
-
         # In an ideal world we wouldn't need the second part of this condition. However,
         # this behaviour isn't spec'd yet, meaning we should be able to deactivate this
         # behaviour. Another reason is that this code is also evaluated each time a new
@@ -739,7 +737,7 @@ class EventCreationHandler:
                 assert state_map is not None
                 prev_event_id = state_map.get((EventTypes.Member, event.sender))
             else:
-                prev_state_ids = await context.get_prev_state_ids(
+                prev_state_ids = await unpersisted_context.get_prev_state_ids(
                     StateFilter.from_types([(EventTypes.Member, None)])
                 )
                 prev_event_id = prev_state_ids.get((EventTypes.Member, event.sender))
@@ -764,8 +762,7 @@ class EventCreationHandler:
                 )
 
         self.validator.validate_new(event, self.config)
-
-        return event, context
+        return event, unpersisted_context
 
     async def _is_exempt_from_privacy_policy(
         self, builder: EventBuilder, requester: Requester
@@ -1005,7 +1002,7 @@ class EventCreationHandler:
         max_retries = 5
         for i in range(max_retries):
             try:
-                event, context = await self.create_event(
+                event, unpersisted_context = await self.create_event(
                     requester,
                     event_dict,
                     txn_id=txn_id,
@@ -1016,6 +1013,7 @@ class EventCreationHandler:
                     historical=historical,
                     depth=depth,
                 )
+                context = await unpersisted_context.persist(event)
 
                 assert self.hs.is_mine_id(event.sender), "User must be our own: %s" % (
                     event.sender,
@@ -1190,7 +1188,6 @@ class EventCreationHandler:
         if for_batch:
             assert prev_event_ids is not None
             assert state_map is not None
-            assert current_state_group is not None
             auth_ids = self._event_auth_handler.compute_auth_events(builder, state_map)
             event = await builder.build(
                 prev_event_ids=prev_event_ids, auth_event_ids=auth_ids, depth=depth
@@ -2046,7 +2043,7 @@ class EventCreationHandler:
                 max_retries = 5
                 for i in range(max_retries):
                     try:
-                        event, context = await self.create_event(
+                        event, unpersisted_context = await self.create_event(
                             requester,
                             {
                                 "type": EventTypes.Dummy,
@@ -2055,6 +2052,7 @@ class EventCreationHandler:
                                 "sender": user_id,
                             },
                         )
+                        context = await unpersisted_context.persist(event)
 
                         event.internal_metadata.proactively_send = False
 
diff --git a/synapse/handlers/room.py b/synapse/handlers/room.py
index a26ec02284..b1784638f4 100644
--- a/synapse/handlers/room.py
+++ b/synapse/handlers/room.py
@@ -51,6 +51,7 @@ from synapse.api.filtering import Filter
 from synapse.api.room_versions import KNOWN_ROOM_VERSIONS, RoomVersion
 from synapse.event_auth import validate_event_for_room_version
 from synapse.events import EventBase
+from synapse.events.snapshot import UnpersistedEventContext
 from synapse.events.utils import copy_and_fixup_power_levels_contents
 from synapse.handlers.relations import BundledAggregations
 from synapse.module_api import NOT_SPAM
@@ -211,7 +212,7 @@ class RoomCreationHandler:
                 # the required power level to send the tombstone event.
                 (
                     tombstone_event,
-                    tombstone_context,
+                    tombstone_unpersisted_context,
                 ) = await self.event_creation_handler.create_event(
                     requester,
                     {
@@ -225,6 +226,9 @@ class RoomCreationHandler:
                         },
                     },
                 )
+                tombstone_context = await tombstone_unpersisted_context.persist(
+                    tombstone_event
+                )
                 validate_event_for_room_version(tombstone_event)
                 await self._event_auth_handler.check_auth_rules_from_context(
                     tombstone_event
@@ -1092,7 +1096,7 @@ class RoomCreationHandler:
             content: JsonDict,
             for_batch: bool,
             **kwargs: Any,
-        ) -> Tuple[EventBase, synapse.events.snapshot.EventContext]:
+        ) -> Tuple[EventBase, synapse.events.snapshot.UnpersistedEventContextBase]:
             """
             Creates an event and associated event context.
             Args:
@@ -1111,20 +1115,23 @@ class RoomCreationHandler:
 
             event_dict = create_event_dict(etype, content, **kwargs)
 
-            new_event, new_context = await self.event_creation_handler.create_event(
+            (
+                new_event,
+                new_unpersisted_context,
+            ) = await self.event_creation_handler.create_event(
                 creator,
                 event_dict,
                 prev_event_ids=prev_event,
                 depth=depth,
                 state_map=state_map,
                 for_batch=for_batch,
-                current_state_group=current_state_group,
             )
+
             depth += 1
             prev_event = [new_event.event_id]
             state_map[(new_event.type, new_event.state_key)] = new_event.event_id
 
-            return new_event, new_context
+            return new_event, new_unpersisted_context
 
         try:
             config = self._presets_dict[preset_config]
@@ -1134,10 +1141,10 @@ class RoomCreationHandler:
             )
 
         creation_content.update({"creator": creator_id})
-        creation_event, creation_context = await create_event(
+        creation_event, unpersisted_creation_context = await create_event(
             EventTypes.Create, creation_content, False
         )
-
+        creation_context = await unpersisted_creation_context.persist(creation_event)
         logger.debug("Sending %s in new room", EventTypes.Member)
         ev = await self.event_creation_handler.handle_new_client_event(
             requester=creator,
@@ -1181,7 +1188,6 @@ class RoomCreationHandler:
             power_event, power_context = await create_event(
                 EventTypes.PowerLevels, pl_content, True
             )
-            current_state_group = power_context._state_group
             events_to_send.append((power_event, power_context))
         else:
             power_level_content: JsonDict = {
@@ -1230,14 +1236,12 @@ class RoomCreationHandler:
                 power_level_content,
                 True,
             )
-            current_state_group = pl_context._state_group
             events_to_send.append((pl_event, pl_context))
 
         if room_alias and (EventTypes.CanonicalAlias, "") not in initial_state:
             room_alias_event, room_alias_context = await create_event(
                 EventTypes.CanonicalAlias, {"alias": room_alias.to_string()}, True
             )
-            current_state_group = room_alias_context._state_group
             events_to_send.append((room_alias_event, room_alias_context))
 
         if (EventTypes.JoinRules, "") not in initial_state:
@@ -1246,7 +1250,6 @@ class RoomCreationHandler:
                 {"join_rule": config["join_rules"]},
                 True,
             )
-            current_state_group = join_rules_context._state_group
             events_to_send.append((join_rules_event, join_rules_context))
 
         if (EventTypes.RoomHistoryVisibility, "") not in initial_state:
@@ -1255,7 +1258,6 @@ class RoomCreationHandler:
                 {"history_visibility": config["history_visibility"]},
                 True,
             )
-            current_state_group = visibility_context._state_group
             events_to_send.append((visibility_event, visibility_context))
 
         if config["guest_can_join"]:
@@ -1265,14 +1267,12 @@ class RoomCreationHandler:
                     {EventContentFields.GUEST_ACCESS: GuestAccess.CAN_JOIN},
                     True,
                 )
-                current_state_group = guest_access_context._state_group
                 events_to_send.append((guest_access_event, guest_access_context))
 
         for (etype, state_key), content in initial_state.items():
             event, context = await create_event(
                 etype, content, True, state_key=state_key
             )
-            current_state_group = context._state_group
             events_to_send.append((event, context))
 
         if config["encrypted"]:
@@ -1284,9 +1284,16 @@ class RoomCreationHandler:
             )
             events_to_send.append((encryption_event, encryption_context))
 
+        datastore = self.hs.get_datastores().state
+        events_and_context = (
+            await UnpersistedEventContext.batch_persist_unpersisted_contexts(
+                events_to_send, room_id, current_state_group, datastore
+            )
+        )
+
         last_event = await self.event_creation_handler.handle_new_client_event(
             creator,
-            events_to_send,
+            events_and_context,
             ignore_shadow_ban=True,
             ratelimit=False,
         )
diff --git a/synapse/handlers/room_batch.py b/synapse/handlers/room_batch.py
index 5d4ca0e2d2..bf9df60218 100644
--- a/synapse/handlers/room_batch.py
+++ b/synapse/handlers/room_batch.py
@@ -327,7 +327,7 @@ class RoomBatchHandler:
             # Mark all events as historical
             event_dict["content"][EventContentFields.MSC2716_HISTORICAL] = True
 
-            event, context = await self.event_creation_handler.create_event(
+            event, unpersisted_context = await self.event_creation_handler.create_event(
                 await self.create_requester_for_user_id_from_app_service(
                     ev["sender"], app_service_requester.app_service
                 ),
@@ -345,7 +345,7 @@ class RoomBatchHandler:
                 historical=True,
                 depth=inherited_depth,
             )
-
+            context = await unpersisted_context.persist(event)
             assert context._state_group
 
             # Normally this is done when persisting the event but we have to
diff --git a/synapse/handlers/room_member.py b/synapse/handlers/room_member.py
index a965c7ec76..de7476f300 100644
--- a/synapse/handlers/room_member.py
+++ b/synapse/handlers/room_member.py
@@ -414,7 +414,10 @@ class RoomMemberHandler(metaclass=abc.ABCMeta):
         max_retries = 5
         for i in range(max_retries):
             try:
-                event, context = await self.event_creation_handler.create_event(
+                (
+                    event,
+                    unpersisted_context,
+                ) = await self.event_creation_handler.create_event(
                     requester,
                     {
                         "type": EventTypes.Member,
@@ -435,7 +438,7 @@ class RoomMemberHandler(metaclass=abc.ABCMeta):
                     outlier=outlier,
                     historical=historical,
                 )
-
+                context = await unpersisted_context.persist(event)
                 prev_state_ids = await context.get_prev_state_ids(
                     StateFilter.from_types([(EventTypes.Member, None)])
                 )
@@ -1944,7 +1947,10 @@ class RoomMemberMasterHandler(RoomMemberHandler):
         max_retries = 5
         for i in range(max_retries):
             try:
-                event, context = await self.event_creation_handler.create_event(
+                (
+                    event,
+                    unpersisted_context,
+                ) = await self.event_creation_handler.create_event(
                     requester,
                     event_dict,
                     txn_id=txn_id,
@@ -1952,6 +1958,7 @@ class RoomMemberMasterHandler(RoomMemberHandler):
                     auth_event_ids=auth_event_ids,
                     outlier=True,
                 )
+                context = await unpersisted_context.persist(event)
                 event.internal_metadata.out_of_band_membership = True
 
                 result_event = (
diff --git a/synapse/storage/databases/state/store.py b/synapse/storage/databases/state/store.py
index 89b1faa6c8..bf4cdfdf29 100644
--- a/synapse/storage/databases/state/store.py
+++ b/synapse/storage/databases/state/store.py
@@ -18,6 +18,8 @@ from typing import TYPE_CHECKING, Collection, Dict, Iterable, List, Optional, Se
 import attr
 
 from synapse.api.constants import EventTypes
+from synapse.events import EventBase
+from synapse.events.snapshot import UnpersistedEventContext, UnpersistedEventContextBase
 from synapse.storage._base import SQLBaseStore
 from synapse.storage.database import (
     DatabasePool,
@@ -401,6 +403,123 @@ class StateGroupDataStore(StateBackgroundUpdateStore, SQLBaseStore):
                 fetched_keys=non_member_types,
             )
 
+    async def store_state_deltas_for_batched(
+        self,
+        events_and_context: List[Tuple[EventBase, UnpersistedEventContextBase]],
+        room_id: str,
+        prev_group: int,
+    ) -> List[Tuple[EventBase, UnpersistedEventContext]]:
+        """Generate and store state deltas for a group of events and contexts created to be
+        batch persisted. Note that all the events must be in a linear chain (ie a <- b <- c).
+
+        Args:
+            events_and_context: the events to generate and store state groups for,
+            and their associated contexts
+            room_id: the id of the room the events were created for
+            prev_group: the state group of the last event persisted before the batched events
+            were created
+        """
+
+        def insert_deltas_group_txn(
+            txn: LoggingTransaction,
+            events_and_context: List[Tuple[EventBase, UnpersistedEventContext]],
+            prev_group: int,
+        ) -> List[Tuple[EventBase, UnpersistedEventContext]]:
+            """Generate and store state groups for the provided events and contexts.
+
+            Requires that we have the state as a delta from the last persisted state group.
+
+            Returns:
+                The given events and contexts, with state groups assigned to each context
+            """
+            is_in_db = self.db_pool.simple_select_one_onecol_txn(
+                txn,
+                table="state_groups",
+                keyvalues={"id": prev_group},
+                retcol="id",
+                allow_none=True,
+            )
+            if not is_in_db:
+                raise Exception(
+                    "Trying to persist state with unpersisted prev_group: %r"
+                    % (prev_group,)
+                )
+
+            num_state_groups = sum(
+                1 for event, _ in events_and_context if event.is_state()
+            )
+
+            state_groups = self._state_group_seq_gen.get_next_mult_txn(
+                txn, num_state_groups
+            )
+
+            sg_before = prev_group
+            state_group_iter = iter(state_groups)
+            for event, context in events_and_context:
+                if not event.is_state():
+                    context.state_group_after_event = sg_before
+                    context.state_group_before_event = sg_before
+                    continue
+
+                sg_after = next(state_group_iter)
+                context.state_group_after_event = sg_after
+                context.state_group_before_event = sg_before
+                context.state_delta_due_to_event = {
+                    (event.type, event.state_key): event.event_id
+                }
+                sg_before = sg_after
+
+            self.db_pool.simple_insert_many_txn(
+                txn,
+                table="state_groups",
+                keys=("id", "room_id", "event_id"),
+                values=[
+                    (context.state_group_after_event, room_id, event.event_id)
+                    for event, context in events_and_context
+                    if event.is_state()
+                ],
+            )
+
+            self.db_pool.simple_insert_many_txn(
+                txn,
+                table="state_group_edges",
+                keys=("state_group", "prev_state_group"),
+                values=[
+                    (
+                        context.state_group_after_event,
+                        context.state_group_before_event,
+                    )
+                    for event, context in events_and_context
+                    if event.is_state()
+                ],
+            )
+
+            self.db_pool.simple_insert_many_txn(
+                txn,
+                table="state_groups_state",
+                keys=("state_group", "room_id", "type", "state_key", "event_id"),
+                values=[
+                    (
+                        context.state_group_after_event,
+                        room_id,
+                        key[0],
+                        key[1],
+                        state_id,
+                    )
+                    for event, context in events_and_context
+                    if context.state_delta_due_to_event is not None
+                    for key, state_id in context.state_delta_due_to_event.items()
+                ],
+            )
+            return events_and_context
+
+        return await self.db_pool.runInteraction(
+            "store_state_deltas_for_batched.insert_deltas_group",
+            insert_deltas_group_txn,
+            events_and_context,
+            prev_group,
+        )
+
     async def store_state_group(
         self,
         event_id: str,
diff --git a/tests/handlers/test_message.py b/tests/handlers/test_message.py
index 69d384442f..9691d66b48 100644
--- a/tests/handlers/test_message.py
+++ b/tests/handlers/test_message.py
@@ -18,7 +18,7 @@ from twisted.test.proto_helpers import MemoryReactor
 
 from synapse.api.constants import EventTypes
 from synapse.events import EventBase
-from synapse.events.snapshot import EventContext
+from synapse.events.snapshot import EventContext, UnpersistedEventContextBase
 from synapse.rest import admin
 from synapse.rest.client import login, room
 from synapse.server import HomeServer
@@ -79,7 +79,9 @@ class EventCreationTestCase(unittest.HomeserverTestCase):
 
         return memberEvent, memberEventContext
 
-    def _create_duplicate_event(self, txn_id: str) -> Tuple[EventBase, EventContext]:
+    def _create_duplicate_event(
+        self, txn_id: str
+    ) -> Tuple[EventBase, UnpersistedEventContextBase]:
         """Create a new event with the given transaction ID. All events produced
         by this method will be considered duplicates.
         """
@@ -107,7 +109,8 @@ class EventCreationTestCase(unittest.HomeserverTestCase):
 
         txn_id = "something_suitably_random"
 
-        event1, context = self._create_duplicate_event(txn_id)
+        event1, unpersisted_context = self._create_duplicate_event(txn_id)
+        context = self.get_success(unpersisted_context.persist(event1))
 
         ret_event1 = self.get_success(
             self.handler.handle_new_client_event(
@@ -119,7 +122,8 @@ class EventCreationTestCase(unittest.HomeserverTestCase):
 
         self.assertEqual(event1.event_id, ret_event1.event_id)
 
-        event2, context = self._create_duplicate_event(txn_id)
+        event2, unpersisted_context = self._create_duplicate_event(txn_id)
+        context = self.get_success(unpersisted_context.persist(event2))
 
         # We want to test that the deduplication at the persit event end works,
         # so we want to make sure we test with different events.
@@ -140,7 +144,9 @@ class EventCreationTestCase(unittest.HomeserverTestCase):
 
         # Let's test that calling `persist_event` directly also does the right
         # thing.
-        event3, context = self._create_duplicate_event(txn_id)
+        event3, unpersisted_context = self._create_duplicate_event(txn_id)
+        context = self.get_success(unpersisted_context.persist(event3))
+
         self.assertNotEqual(event1.event_id, event3.event_id)
 
         ret_event3, event_pos3, _ = self.get_success(
@@ -154,7 +160,8 @@ class EventCreationTestCase(unittest.HomeserverTestCase):
 
         # Let's test that calling `persist_events` directly also does the right
         # thing.
-        event4, context = self._create_duplicate_event(txn_id)
+        event4, unpersisted_context = self._create_duplicate_event(txn_id)
+        context = self.get_success(unpersisted_context.persist(event4))
         self.assertNotEqual(event1.event_id, event3.event_id)
 
         events, _ = self.get_success(
@@ -174,8 +181,10 @@ class EventCreationTestCase(unittest.HomeserverTestCase):
         txn_id = "something_else_suitably_random"
 
         # Create two duplicate events to persist at the same time
-        event1, context1 = self._create_duplicate_event(txn_id)
-        event2, context2 = self._create_duplicate_event(txn_id)
+        event1, unpersisted_context1 = self._create_duplicate_event(txn_id)
+        context1 = self.get_success(unpersisted_context1.persist(event1))
+        event2, unpersisted_context2 = self._create_duplicate_event(txn_id)
+        context2 = self.get_success(unpersisted_context2.persist(event2))
 
         # Ensure their event IDs are different to start with
         self.assertNotEqual(event1.event_id, event2.event_id)
diff --git a/tests/handlers/test_register.py b/tests/handlers/test_register.py
index 1db99b3c00..aff1ec4758 100644
--- a/tests/handlers/test_register.py
+++ b/tests/handlers/test_register.py
@@ -507,7 +507,7 @@ class RegistrationTestCase(unittest.HomeserverTestCase):
         # Lower the permissions of the inviter.
         event_creation_handler = self.hs.get_event_creation_handler()
         requester = create_requester(inviter)
-        event, context = self.get_success(
+        event, unpersisted_context = self.get_success(
             event_creation_handler.create_event(
                 requester,
                 {
@@ -519,6 +519,7 @@ class RegistrationTestCase(unittest.HomeserverTestCase):
                 },
             )
         )
+        context = self.get_success(unpersisted_context.persist(event))
         self.get_success(
             event_creation_handler.handle_new_client_event(
                 requester, events_and_context=[(event, context)]
diff --git a/tests/push/test_bulk_push_rule_evaluator.py b/tests/push/test_bulk_push_rule_evaluator.py
index dce6899e78..1458076a90 100644
--- a/tests/push/test_bulk_push_rule_evaluator.py
+++ b/tests/push/test_bulk_push_rule_evaluator.py
@@ -130,7 +130,7 @@ class TestBulkPushRuleEvaluator(HomeserverTestCase):
 
         # Create a new message event, and try to evaluate it under the dodgy
         # power level event.
-        event, context = self.get_success(
+        event, unpersisted_context = self.get_success(
             self.event_creation_handler.create_event(
                 self.requester,
                 {
@@ -145,6 +145,7 @@ class TestBulkPushRuleEvaluator(HomeserverTestCase):
                 prev_event_ids=[pl_event_id],
             )
         )
+        context = self.get_success(unpersisted_context.persist(event))
 
         bulk_evaluator = BulkPushRuleEvaluator(self.hs)
         # should not raise
@@ -170,7 +171,7 @@ class TestBulkPushRuleEvaluator(HomeserverTestCase):
         """Ensure that push rules are not calculated when disabled in the config"""
 
         # Create a new message event which should cause a notification.
-        event, context = self.get_success(
+        event, unpersisted_context = self.get_success(
             self.event_creation_handler.create_event(
                 self.requester,
                 {
@@ -184,6 +185,7 @@ class TestBulkPushRuleEvaluator(HomeserverTestCase):
                 },
             )
         )
+        context = self.get_success(unpersisted_context.persist(event))
 
         bulk_evaluator = BulkPushRuleEvaluator(self.hs)
         # Mock the method which calculates push rules -- we do this instead of
@@ -200,7 +202,7 @@ class TestBulkPushRuleEvaluator(HomeserverTestCase):
     ) -> bool:
         """Returns true iff the `mentions` trigger an event push action."""
         # Create a new message event which should cause a notification.
-        event, context = self.get_success(
+        event, unpersisted_context = self.get_success(
             self.event_creation_handler.create_event(
                 self.requester,
                 {
@@ -211,7 +213,7 @@ class TestBulkPushRuleEvaluator(HomeserverTestCase):
                 },
             )
         )
-
+        context = self.get_success(unpersisted_context.persist(event))
         # Execute the push rule machinery.
         self.get_success(bulk_evaluator.action_for_events_by_user([(event, context)]))
 
@@ -390,7 +392,7 @@ class TestBulkPushRuleEvaluator(HomeserverTestCase):
         bulk_evaluator = BulkPushRuleEvaluator(self.hs)
 
         # Create & persist an event to use as the parent of the relation.
-        event, context = self.get_success(
+        event, unpersisted_context = self.get_success(
             self.event_creation_handler.create_event(
                 self.requester,
                 {
@@ -404,6 +406,7 @@ class TestBulkPushRuleEvaluator(HomeserverTestCase):
                 },
             )
         )
+        context = self.get_success(unpersisted_context.persist(event))
         self.get_success(
             self.event_creation_handler.handle_new_client_event(
                 self.requester, events_and_context=[(event, context)]
diff --git a/tests/rest/client/test_rooms.py b/tests/rest/client/test_rooms.py
index 4dd763096d..a4900703c4 100644
--- a/tests/rest/client/test_rooms.py
+++ b/tests/rest/client/test_rooms.py
@@ -713,7 +713,7 @@ class RoomsCreateTestCase(RoomBase):
         self.assertEqual(HTTPStatus.OK, channel.code, channel.result)
         self.assertTrue("room_id" in channel.json_body)
         assert channel.resource_usage is not None
-        self.assertEqual(33, channel.resource_usage.db_txn_count)
+        self.assertEqual(30, channel.resource_usage.db_txn_count)
 
     def test_post_room_initial_state(self) -> None:
         # POST with initial_state config key, expect new room id
@@ -726,7 +726,7 @@ class RoomsCreateTestCase(RoomBase):
         self.assertEqual(HTTPStatus.OK, channel.code, channel.result)
         self.assertTrue("room_id" in channel.json_body)
         assert channel.resource_usage is not None
-        self.assertEqual(36, channel.resource_usage.db_txn_count)
+        self.assertEqual(32, channel.resource_usage.db_txn_count)
 
     def test_post_room_visibility_key(self) -> None:
         # POST with visibility config key, expect new room id
diff --git a/tests/storage/test_event_chain.py b/tests/storage/test_event_chain.py
index 73d11e7786..e39b63edac 100644
--- a/tests/storage/test_event_chain.py
+++ b/tests/storage/test_event_chain.py
@@ -522,7 +522,7 @@ class EventChainBackgroundUpdateTestCase(HomeserverTestCase):
         latest_event_ids = self.get_success(
             self.store.get_prev_events_for_room(room_id)
         )
-        event, context = self.get_success(
+        event, unpersisted_context = self.get_success(
             event_handler.create_event(
                 self.requester,
                 {
@@ -535,6 +535,7 @@ class EventChainBackgroundUpdateTestCase(HomeserverTestCase):
                 prev_event_ids=latest_event_ids,
             )
         )
+        context = self.get_success(unpersisted_context.persist(event))
         self.get_success(
             event_handler.handle_new_client_event(
                 self.requester, events_and_context=[(event, context)]
@@ -544,7 +545,7 @@ class EventChainBackgroundUpdateTestCase(HomeserverTestCase):
         assert state_ids1 is not None
         state1 = set(state_ids1.values())
 
-        event, context = self.get_success(
+        event, unpersisted_context = self.get_success(
             event_handler.create_event(
                 self.requester,
                 {
@@ -557,6 +558,7 @@ class EventChainBackgroundUpdateTestCase(HomeserverTestCase):
                 prev_event_ids=latest_event_ids,
             )
         )
+        context = self.get_success(unpersisted_context.persist(event))
         self.get_success(
             event_handler.handle_new_client_event(
                 self.requester, events_and_context=[(event, context)]
diff --git a/tests/storage/test_state.py b/tests/storage/test_state.py
index e82c03f597..62aed6af0a 100644
--- a/tests/storage/test_state.py
+++ b/tests/storage/test_state.py
@@ -496,3 +496,132 @@ class StateStoreTestCase(HomeserverTestCase):
 
         self.assertEqual(is_all, True)
         self.assertDictEqual({(e5.type, e5.state_key): e5.event_id}, state_dict)
+
+    def test_batched_state_group_storing(self) -> None:
+        """Check that `store_state_deltas_for_batched` persists a state group
+        (with its delta and an edge to the previous group) for state events only.
+        """
+        creation_event = self.inject_state_event(
+            self.room, self.u_alice, EventTypes.Create, "", {}
+        )
+        state_to_event = self.get_success(
+            self.storage.state.get_state_groups(
+                self.room.to_string(), [creation_event.event_id]
+            )
+        )
+        current_state_group = list(state_to_event.keys())[0]
+
+        # create some unpersisted events and event contexts to store against room
+        events_and_context = []
+        builder = self.event_builder_factory.for_room_version(
+            RoomVersions.V1,
+            {
+                "type": EventTypes.Name,
+                "sender": self.u_alice.to_string(),
+                "state_key": "",
+                "room_id": self.room.to_string(),
+                "content": {"name": "first rename of room"},
+            },
+        )
+
+        event1, unpersisted_context1 = self.get_success(
+            self.event_creation_handler.create_new_client_event(builder)
+        )
+        events_and_context.append((event1, unpersisted_context1))
+
+        builder2 = self.event_builder_factory.for_room_version(
+            RoomVersions.V1,
+            {
+                "type": EventTypes.JoinRules,
+                "sender": self.u_alice.to_string(),
+                "state_key": "",
+                "room_id": self.room.to_string(),
+                "content": {"join_rule": "private"},
+            },
+        )
+
+        event2, unpersisted_context2 = self.get_success(
+            self.event_creation_handler.create_new_client_event(builder2)
+        )
+        events_and_context.append((event2, unpersisted_context2))
+
+        builder3 = self.event_builder_factory.for_room_version(
+            RoomVersions.V1,
+            {
+                "type": EventTypes.Message,
+                "sender": self.u_alice.to_string(),
+                "room_id": self.room.to_string(),
+                "content": {"body": "hello from event 3", "msgtype": "m.text"},
+            },
+        )
+
+        event3, unpersisted_context3 = self.get_success(
+            self.event_creation_handler.create_new_client_event(builder3)
+        )
+        events_and_context.append((event3, unpersisted_context3))
+
+        builder4 = self.event_builder_factory.for_room_version(
+            RoomVersions.V1,
+            {
+                "type": EventTypes.JoinRules,
+                "sender": self.u_alice.to_string(),
+                "state_key": "",
+                "room_id": self.room.to_string(),
+                "content": {"join_rule": "public"},
+            },
+        )
+
+        event4, unpersisted_context4 = self.get_success(
+            self.event_creation_handler.create_new_client_event(builder4)
+        )
+        events_and_context.append((event4, unpersisted_context4))
+
+        processed_events_and_context = self.get_success(
+            self.hs.get_datastores().state.store_state_deltas_for_batched(
+                events_and_context, self.room.to_string(), current_state_group
+            )
+        )
+
+        # check that only state events are in state_groups, and all state events are in state_groups
+        res = self.get_success(
+            self.store.db_pool.simple_select_list(
+                table="state_groups",
+                keyvalues=None,
+                retcols=("event_id",),
+            )
+        )
+
+        # Each row is a dict, so gather the `event_id` values first; `assertNotIn`
+        # against the row itself would only inspect its keys and never fail.
+        events = [result.get("event_id") for result in res]
+        self.assertNotIn(event3.event_id, events)
+
+        for event, _ in processed_events_and_context:
+            if event.is_state():
+                self.assertIn(event.event_id, events)
+
+        # check that each unique state has state group in state_groups_state and that the
+        # type/state key is correct, and check that each state event's state group
+        # has an entry and prev event in state_group_edges
+        for event, context in processed_events_and_context:
+            if event.is_state():
+                state = self.get_success(
+                    self.store.db_pool.simple_select_list(
+                        table="state_groups_state",
+                        keyvalues={"state_group": context.state_group_after_event},
+                        retcols=("type", "state_key"),
+                    )
+                )
+                self.assertEqual(event.type, state[0].get("type"))
+                self.assertEqual(event.state_key, state[0].get("state_key"))
+
+                groups = self.get_success(
+                    self.store.db_pool.simple_select_list(
+                        table="state_group_edges",
+                        keyvalues={"state_group": str(context.state_group_after_event)},
+                        retcols=("*",),
+                    )
+                )
+                self.assertEqual(
+                    context.state_group_before_event, groups[0].get("prev_state_group")
+                )
diff --git a/tests/unittest.py b/tests/unittest.py
index b21e7f1221..f9160faa1d 100644
--- a/tests/unittest.py
+++ b/tests/unittest.py
@@ -723,7 +723,7 @@ class HomeserverTestCase(TestCase):
         event_creator = self.hs.get_event_creation_handler()
         requester = create_requester(user)
 
-        event, context = self.get_success(
+        event, unpersisted_context = self.get_success(
             event_creator.create_event(
                 requester,
                 {
@@ -735,7 +735,7 @@ class HomeserverTestCase(TestCase):
                 prev_event_ids=prev_event_ids,
             )
         )
-
+        context = self.get_success(unpersisted_context.persist(event))
         if soft_failed:
             event.internal_metadata.soft_failed = True
 
-- 
cgit 1.5.1


From 4fc8875876374ec8f97a3b3cc344a4e3abcf769f Mon Sep 17 00:00:00 2001
From: Patrick Cloke <clokep@users.noreply.github.com>
Date: Mon, 27 Feb 2023 08:26:05 -0500
Subject: Refactor media modules. (#15146)

* Removes the `v1` directory from `tests.rest.media.v1`.
* Moves the non-REST code from `synapse.rest.media.v1` to `synapse.media`.
* Flattens the `v1` directory from `synapse.rest.media`, but leaves compatibility
  with 3rd party media repositories and spam checkers.
---
 changelog.d/15146.misc                             |    1 +
 synapse/_scripts/move_remote_media_to_new_store.py |    2 +-
 synapse/config/repository.py                       |   12 +-
 synapse/events/spamcheck.py                        |    4 +-
 synapse/media/_base.py                             |  479 ++++++++
 synapse/media/filepath.py                          |  410 +++++++
 synapse/media/media_repository.py                  | 1038 ++++++++++++++++
 synapse/media/media_storage.py                     |  374 ++++++
 synapse/media/oembed.py                            |  265 +++++
 synapse/media/preview_html.py                      |  501 ++++++++
 synapse/media/storage_provider.py                  |  181 +++
 synapse/media/thumbnailer.py                       |  221 ++++
 synapse/rest/media/config_resource.py              |   41 +
 synapse/rest/media/download_resource.py            |   75 ++
 synapse/rest/media/media_repository_resource.py    |   93 ++
 synapse/rest/media/preview_url_resource.py         |  869 ++++++++++++++
 synapse/rest/media/thumbnail_resource.py           |  554 +++++++++
 synapse/rest/media/upload_resource.py              |  108 ++
 synapse/rest/media/v1/_base.py                     |  470 +-------
 synapse/rest/media/v1/config_resource.py           |   41 -
 synapse/rest/media/v1/download_resource.py         |   76 --
 synapse/rest/media/v1/filepath.py                  |  410 -------
 synapse/rest/media/v1/media_repository.py          | 1112 ------------------
 synapse/rest/media/v1/media_storage.py             |  365 +-----
 synapse/rest/media/v1/oembed.py                    |  265 -----
 synapse/rest/media/v1/preview_html.py              |  501 --------
 synapse/rest/media/v1/preview_url_resource.py      |  871 --------------
 synapse/rest/media/v1/storage_provider.py          |  172 +--
 synapse/rest/media/v1/thumbnail_resource.py        |  555 ---------
 synapse/rest/media/v1/thumbnailer.py               |  221 ----
 synapse/rest/media/v1/upload_resource.py           |  108 --
 synapse/server.py                                  |    6 +-
 tests/media/__init__.py                            |   13 +
 tests/media/test_base.py                           |   38 +
 tests/media/test_filepath.py                       |  595 ++++++++++
 tests/media/test_html_preview.py                   |  542 +++++++++
 tests/media/test_media_storage.py                  |  792 +++++++++++++
 tests/media/test_oembed.py                         |  162 +++
 tests/rest/admin/test_media.py                     |    2 +-
 tests/rest/admin/test_user.py                      |    2 +-
 tests/rest/media/test_url_preview.py               | 1234 ++++++++++++++++++++
 tests/rest/media/v1/__init__.py                    |   13 -
 tests/rest/media/v1/test_base.py                   |   38 -
 tests/rest/media/v1/test_filepath.py               |  595 ----------
 tests/rest/media/v1/test_html_preview.py           |  542 ---------
 tests/rest/media/v1/test_media_storage.py          |  792 -------------
 tests/rest/media/v1/test_oembed.py                 |  162 ---
 tests/rest/media/v1/test_url_preview.py            | 1234 --------------------
 48 files changed, 8612 insertions(+), 8545 deletions(-)
 create mode 100644 changelog.d/15146.misc
 create mode 100644 synapse/media/_base.py
 create mode 100644 synapse/media/filepath.py
 create mode 100644 synapse/media/media_repository.py
 create mode 100644 synapse/media/media_storage.py
 create mode 100644 synapse/media/oembed.py
 create mode 100644 synapse/media/preview_html.py
 create mode 100644 synapse/media/storage_provider.py
 create mode 100644 synapse/media/thumbnailer.py
 create mode 100644 synapse/rest/media/config_resource.py
 create mode 100644 synapse/rest/media/download_resource.py
 create mode 100644 synapse/rest/media/media_repository_resource.py
 create mode 100644 synapse/rest/media/preview_url_resource.py
 create mode 100644 synapse/rest/media/thumbnail_resource.py
 create mode 100644 synapse/rest/media/upload_resource.py
 delete mode 100644 synapse/rest/media/v1/config_resource.py
 delete mode 100644 synapse/rest/media/v1/download_resource.py
 delete mode 100644 synapse/rest/media/v1/filepath.py
 delete mode 100644 synapse/rest/media/v1/media_repository.py
 delete mode 100644 synapse/rest/media/v1/oembed.py
 delete mode 100644 synapse/rest/media/v1/preview_html.py
 delete mode 100644 synapse/rest/media/v1/preview_url_resource.py
 delete mode 100644 synapse/rest/media/v1/thumbnail_resource.py
 delete mode 100644 synapse/rest/media/v1/thumbnailer.py
 delete mode 100644 synapse/rest/media/v1/upload_resource.py
 create mode 100644 tests/media/__init__.py
 create mode 100644 tests/media/test_base.py
 create mode 100644 tests/media/test_filepath.py
 create mode 100644 tests/media/test_html_preview.py
 create mode 100644 tests/media/test_media_storage.py
 create mode 100644 tests/media/test_oembed.py
 create mode 100644 tests/rest/media/test_url_preview.py
 delete mode 100644 tests/rest/media/v1/__init__.py
 delete mode 100644 tests/rest/media/v1/test_base.py
 delete mode 100644 tests/rest/media/v1/test_filepath.py
 delete mode 100644 tests/rest/media/v1/test_html_preview.py
 delete mode 100644 tests/rest/media/v1/test_media_storage.py
 delete mode 100644 tests/rest/media/v1/test_oembed.py
 delete mode 100644 tests/rest/media/v1/test_url_preview.py

(limited to 'tests/rest')

diff --git a/changelog.d/15146.misc b/changelog.d/15146.misc
new file mode 100644
index 0000000000..8de5f95239
--- /dev/null
+++ b/changelog.d/15146.misc
@@ -0,0 +1 @@
+Refactor the media modules.
diff --git a/synapse/_scripts/move_remote_media_to_new_store.py b/synapse/_scripts/move_remote_media_to_new_store.py
index 819afaaca6..0dd36bee20 100755
--- a/synapse/_scripts/move_remote_media_to_new_store.py
+++ b/synapse/_scripts/move_remote_media_to_new_store.py
@@ -37,7 +37,7 @@ import os
 import shutil
 import sys
 
-from synapse.rest.media.v1.filepath import MediaFilePaths
+from synapse.media.filepath import MediaFilePaths
 
 logger = logging.getLogger()
 
diff --git a/synapse/config/repository.py b/synapse/config/repository.py
index 2da40c09f0..ecb3edbe3a 100644
--- a/synapse/config/repository.py
+++ b/synapse/config/repository.py
@@ -178,11 +178,13 @@ class ContentRepositoryConfig(Config):
         for i, provider_config in enumerate(storage_providers):
             # We special case the module "file_system" so as not to need to
             # expose FileStorageProviderBackend
-            if provider_config["module"] == "file_system":
-                provider_config["module"] = (
-                    "synapse.rest.media.v1.storage_provider"
-                    ".FileStorageProviderBackend"
-                )
+            if (
+                provider_config["module"] == "file_system"
+                or provider_config["module"] == "synapse.rest.media.v1.storage_provider"
+            ):
+                provider_config[
+                    "module"
+                ] = "synapse.media.storage_provider.FileStorageProviderBackend"
 
             provider_class, parsed_config = load_module(
                 provider_config, ("media_storage_providers", "<item %i>" % i)
diff --git a/synapse/events/spamcheck.py b/synapse/events/spamcheck.py
index 623a2c71ea..765c15bb51 100644
--- a/synapse/events/spamcheck.py
+++ b/synapse/events/spamcheck.py
@@ -33,8 +33,8 @@ from typing_extensions import Literal
 import synapse
 from synapse.api.errors import Codes
 from synapse.logging.opentracing import trace
-from synapse.rest.media.v1._base import FileInfo
-from synapse.rest.media.v1.media_storage import ReadableFileWrapper
+from synapse.media._base import FileInfo
+from synapse.media.media_storage import ReadableFileWrapper
 from synapse.spam_checker_api import RegistrationBehaviour
 from synapse.types import JsonDict, RoomAlias, UserProfile
 from synapse.util.async_helpers import delay_cancellation, maybe_awaitable
diff --git a/synapse/media/_base.py b/synapse/media/_base.py
new file mode 100644
index 0000000000..ef8334ae25
--- /dev/null
+++ b/synapse/media/_base.py
@@ -0,0 +1,479 @@
+# Copyright 2014-2016 OpenMarket Ltd
+# Copyright 2019-2021 The Matrix.org Foundation C.I.C.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import logging
+import os
+import urllib
+from abc import ABC, abstractmethod
+from types import TracebackType
+from typing import Awaitable, Dict, Generator, List, Optional, Tuple, Type
+
+import attr
+
+from twisted.internet.interfaces import IConsumer
+from twisted.protocols.basic import FileSender
+from twisted.web.server import Request
+
+from synapse.api.errors import Codes, SynapseError, cs_error
+from synapse.http.server import finish_request, respond_with_json
+from synapse.http.site import SynapseRequest
+from synapse.logging.context import make_deferred_yieldable
+from synapse.util.stringutils import is_ascii, parse_and_validate_server_name
+
+logger = logging.getLogger(__name__)
+
+# list all text content types that will have the charset default to UTF-8 when
+# none is given
+TEXT_CONTENT_TYPES = [
+    "text/css",
+    "text/csv",
+    "text/html",
+    "text/calendar",
+    "text/plain",
+    "text/javascript",
+    "application/json",
+    "application/ld+json",
+    "application/rtf",
+    "image/svg+xml",
+    "text/xml",
+]
+
+
+def parse_media_id(request: Request) -> Tuple[str, str, Optional[str]]:
+    """Parses the server name, media ID and optional file name from the request URI
+
+    Also performs some rough validation on the server name.
+
+    Args:
+        request: The `Request`.
+
+    Returns:
+        A tuple containing the parsed server name, media ID and optional file name.
+
+    Raises:
+        SynapseError(404): if parsing or validation fail for any reason
+    """
+    try:
+        # The type on postpath seems incorrect in Twisted 21.2.0.
+        postpath: List[bytes] = request.postpath  # type: ignore
+        assert postpath
+
+        # This allows users to append e.g. /test.png to the URL. Useful for
+        # clients that parse the URL to see content type.
+        server_name_bytes, media_id_bytes = postpath[:2]
+        server_name = server_name_bytes.decode("utf-8")
+        media_id = media_id_bytes.decode("utf8")
+
+        # Validate the server name, raising if invalid
+        parse_and_validate_server_name(server_name)
+
+        file_name = None
+        if len(postpath) > 2:
+            try:
+                file_name = urllib.parse.unquote(postpath[-1].decode("utf-8"))
+            except UnicodeDecodeError:
+                pass
+        return server_name, media_id, file_name
+    except Exception:
+        raise SynapseError(
+            404, "Invalid media id token %r" % (request.postpath,), Codes.UNKNOWN
+        )
+
+
+def respond_404(request: SynapseRequest) -> None:
+    respond_with_json(
+        request,
+        404,
+        cs_error("Not found %r" % (request.postpath,), code=Codes.NOT_FOUND),
+        send_cors=True,
+    )
+
+
+async def respond_with_file(
+    request: SynapseRequest,
+    media_type: str,
+    file_path: str,
+    file_size: Optional[int] = None,
+    upload_name: Optional[str] = None,
+) -> None:
+    logger.debug("Responding with %r", file_path)
+
+    if os.path.isfile(file_path):
+        if file_size is None:
+            stat = os.stat(file_path)
+            file_size = stat.st_size
+
+        add_file_headers(request, media_type, file_size, upload_name)
+
+        with open(file_path, "rb") as f:
+            await make_deferred_yieldable(FileSender().beginFileTransfer(f, request))
+
+        finish_request(request)
+    else:
+        respond_404(request)
+
+
+def add_file_headers(
+    request: Request,
+    media_type: str,
+    file_size: Optional[int],
+    upload_name: Optional[str],
+) -> None:
+    """Adds the correct response headers in preparation for responding with the
+    media.
+
+    Args:
+        request
+        media_type: The media/content type.
+        file_size: Size in bytes of the media, if known.
+        upload_name: The name of the requested file, if any.
+    """
+
+    def _quote(x: str) -> str:
+        return urllib.parse.quote(x.encode("utf-8"))
+
+    # Default to a UTF-8 charset for text content types.
+    # ex, uses UTF-8 for 'text/css' but not 'text/css; charset=UTF-16'
+    if media_type.lower() in TEXT_CONTENT_TYPES:
+        content_type = media_type + "; charset=UTF-8"
+    else:
+        content_type = media_type
+
+    request.setHeader(b"Content-Type", content_type.encode("UTF-8"))
+    if upload_name:
+        # RFC6266 section 4.1 [1] defines both `filename` and `filename*`.
+        #
+        # `filename` is defined to be a `value`, which is defined by RFC2616
+        # section 3.6 [2] to be a `token` or a `quoted-string`, where a `token`
+        # is (essentially) a single US-ASCII word, and a `quoted-string` is a
+        # US-ASCII string surrounded by double-quotes, using backslash as an
+        # escape character. Note that %-encoding is *not* permitted.
+        #
+        # `filename*` is defined to be an `ext-value`, which is defined in
+        # RFC5987 section 3.2.1 [3] to be `charset "'" [ language ] "'" value-chars`,
+        # where `value-chars` is essentially a %-encoded string in the given charset.
+        #
+        # [1]: https://tools.ietf.org/html/rfc6266#section-4.1
+        # [2]: https://tools.ietf.org/html/rfc2616#section-3.6
+        # [3]: https://tools.ietf.org/html/rfc5987#section-3.2.1
+
+        # We avoid the quoted-string version of `filename`, because (a) synapse didn't
+        # correctly interpret those as of 0.99.2 and (b) they are a bit of a pain and we
+        # may as well just do the filename* version.
+        if _can_encode_filename_as_token(upload_name):
+            disposition = "inline; filename=%s" % (upload_name,)
+        else:
+            disposition = "inline; filename*=utf-8''%s" % (_quote(upload_name),)
+
+        request.setHeader(b"Content-Disposition", disposition.encode("ascii"))
+
+    # cache for at least a day.
+    # XXX: we might want to turn this off for data we don't want to
+    # recommend caching as it's sensitive or private - or at least
+    # select private. don't bother setting Expires as all our
+    # clients are smart enough to be happy with Cache-Control
+    request.setHeader(b"Cache-Control", b"public,max-age=86400,s-maxage=86400")
+    if file_size is not None:
+        request.setHeader(b"Content-Length", b"%d" % (file_size,))
+
+    # Tell web crawlers to not index, archive, or follow links in media. This
+    # should help to prevent things in the media repo from showing up in web
+    # search results.
+    request.setHeader(b"X-Robots-Tag", "noindex, nofollow, noarchive, noimageindex")
+
+
+# separators as defined in RFC2616. SP and HT are handled separately.
+# see _can_encode_filename_as_token.
+_FILENAME_SEPARATOR_CHARS = {
+    "(",
+    ")",
+    "<",
+    ">",
+    "@",
+    ",",
+    ";",
+    ":",
+    "\\",
+    '"',
+    "/",
+    "[",
+    "]",
+    "?",
+    "=",
+    "{",
+    "}",
+}
+
+
+def _can_encode_filename_as_token(x: str) -> bool:
+    for c in x:
+        # from RFC2616:
+        #
+        #        token          = 1*<any CHAR except CTLs or separators>
+        #
+        #        separators     = "(" | ")" | "<" | ">" | "@"
+        #                       | "," | ";" | ":" | "\" | <">
+        #                       | "/" | "[" | "]" | "?" | "="
+        #                       | "{" | "}" | SP | HT
+        #
+        #        CHAR           = <any US-ASCII character (octets 0 - 127)>
+        #
+        #        CTL            = <any US-ASCII control character
+        #                         (octets 0 - 31) and DEL (127)>
+        #
+        if ord(c) >= 127 or ord(c) <= 32 or c in _FILENAME_SEPARATOR_CHARS:
+            return False
+    return True
+
+
+async def respond_with_responder(
+    request: SynapseRequest,
+    responder: "Optional[Responder]",
+    media_type: str,
+    file_size: Optional[int],
+    upload_name: Optional[str] = None,
+) -> None:
+    """Responds to the request with given responder. If responder is None then
+    returns 404.
+
+    Args:
+        request
+        responder
+        media_type: The media/content type.
+        file_size: Size in bytes of the media. If not known it should be None
+        upload_name: The name of the requested file, if any.
+    """
+    if not responder:
+        respond_404(request)
+        return
+
+    # If we have a responder we *must* use it as a context manager.
+    with responder:
+        if request._disconnected:
+            logger.warning(
+                "Not sending response to request %s, already disconnected.", request
+            )
+            return
+
+        logger.debug("Responding to media request with responder %s", responder)
+        add_file_headers(request, media_type, file_size, upload_name)
+        try:
+            await responder.write_to_consumer(request)
+        except Exception as e:
+            # The majority of the time this will be due to the client having gone
+            # away. Unfortunately, Twisted simply throws a generic exception at us
+            # in that case.
+            logger.warning("Failed to write to consumer: %s %s", type(e), e)
+
+            # Unregister the producer, if it has one, so Twisted doesn't complain
+            if request.producer:
+                request.unregisterProducer()
+
+    finish_request(request)
+
+
+class Responder(ABC):
+    """Represents a response that can be streamed to the requester.
+
+    Responder is a context manager which *must* be used, so that any resources
+    held can be cleaned up.
+    """
+
+    @abstractmethod
+    def write_to_consumer(self, consumer: IConsumer) -> Awaitable:
+        """Stream response into consumer
+
+        Args:
+            consumer: The consumer to stream into.
+
+        Returns:
+            Resolves once the response has finished being written
+        """
+        raise NotImplementedError()
+
+    def __enter__(self) -> None:  # noqa: B027
+        pass
+
+    def __exit__(  # noqa: B027
+        self,
+        exc_type: Optional[Type[BaseException]],
+        exc_val: Optional[BaseException],
+        exc_tb: Optional[TracebackType],
+    ) -> None:
+        pass
+
+
+@attr.s(slots=True, frozen=True, auto_attribs=True)
+class ThumbnailInfo:
+    """Details about a generated thumbnail."""
+
+    width: int
+    height: int
+    method: str
+    # Content type of thumbnail, e.g. image/png
+    type: str
+    # The size of the media file, in bytes.
+    length: Optional[int] = None
+
+
+@attr.s(slots=True, frozen=True, auto_attribs=True)
+class FileInfo:
+    """Details about a requested/uploaded file."""
+
+    # The server name where the media originated from, or None if local.
+    server_name: Optional[str]
+    # The local ID of the file. For local files this is the same as the media_id
+    file_id: str
+    # If the file is for the url preview cache
+    url_cache: bool = False
+    # Whether the file is a thumbnail or not.
+    thumbnail: Optional[ThumbnailInfo] = None
+
+    # The below properties exist to maintain compatibility with third-party modules.
+    @property
+    def thumbnail_width(self) -> Optional[int]:
+        if not self.thumbnail:
+            return None
+        return self.thumbnail.width
+
+    @property
+    def thumbnail_height(self) -> Optional[int]:
+        if not self.thumbnail:
+            return None
+        return self.thumbnail.height
+
+    @property
+    def thumbnail_method(self) -> Optional[str]:
+        if not self.thumbnail:
+            return None
+        return self.thumbnail.method
+
+    @property
+    def thumbnail_type(self) -> Optional[str]:
+        if not self.thumbnail:
+            return None
+        return self.thumbnail.type
+
+    @property
+    def thumbnail_length(self) -> Optional[int]:
+        if not self.thumbnail:
+            return None
+        return self.thumbnail.length
+
+
+def get_filename_from_headers(headers: Dict[bytes, List[bytes]]) -> Optional[str]:
+    """
+    Get the filename of the downloaded file by inspecting the
+    Content-Disposition HTTP header.
+
+    Args:
+        headers: The HTTP request headers.
+
+    Returns:
+        The filename, or None.
+    """
+    content_disposition = headers.get(b"Content-Disposition", [b""])
+
+    # No header, bail out.
+    if not content_disposition[0]:
+        return None
+
+    _, params = _parse_header(content_disposition[0])
+
+    upload_name = None
+
+    # First check if there is a valid UTF-8 filename
+    upload_name_utf8 = params.get(b"filename*", None)
+    if upload_name_utf8:
+        if upload_name_utf8.lower().startswith(b"utf-8''"):
+            upload_name_utf8 = upload_name_utf8[7:]
+            # We have a filename*= section. This MUST be ASCII, and any UTF-8
+            # bytes are %-quoted.
+            try:
+                # Once it is decoded, we can then unquote the %-encoded
+                # parts strictly into a unicode string.
+                upload_name = urllib.parse.unquote(
+                    upload_name_utf8.decode("ascii"), errors="strict"
+                )
+            except UnicodeDecodeError:
+                # Incorrect UTF-8.
+                pass
+
+    # If there isn't one, check for an ASCII name.
+    if not upload_name:
+        upload_name_ascii = params.get(b"filename", None)
+        if upload_name_ascii and is_ascii(upload_name_ascii):
+            upload_name = upload_name_ascii.decode("ascii")
+
+    # This may be None here, indicating we did not find a matching name.
+    return upload_name
+
+
+def _parse_header(line: bytes) -> Tuple[bytes, Dict[bytes, bytes]]:
+    """Parse a Content-type like header.
+
+    Cargo-culted from `cgi`, but works on bytes rather than strings.
+
+    Args:
+        line: header to be parsed
+
+    Returns:
+        The main content-type, followed by the parameter dictionary
+    """
+    parts = _parseparam(b";" + line)
+    key = next(parts)
+    pdict = {}
+    for p in parts:
+        i = p.find(b"=")
+        if i >= 0:
+            name = p[:i].strip().lower()
+            value = p[i + 1 :].strip()
+
+            # strip double-quotes
+            if len(value) >= 2 and value[0:1] == value[-1:] == b'"':
+                value = value[1:-1]
+                value = value.replace(b"\\\\", b"\\").replace(b'\\"', b'"')
+            pdict[name] = value
+
+    return key, pdict
+
+
+def _parseparam(s: bytes) -> Generator[bytes, None, None]:
+    """Generator which splits the input on ;, respecting double-quoted sequences
+
+    Cargo-culted from `cgi`, but works on bytes rather than strings.
+
+    Args:
+        s: header to be parsed
+
+    Returns:
+        The split input
+    """
+    while s[:1] == b";":
+        s = s[1:]
+
+        # look for the next ;
+        end = s.find(b";")
+
+        # if there is an odd number of " marks between here and the next ;, skip to the
+        # next ; instead
+        while end > 0 and (s.count(b'"', 0, end) - s.count(b'\\"', 0, end)) % 2:
+            end = s.find(b";", end + 1)
+
+        if end < 0:
+            end = len(s)
+        f = s[:end]
+        yield f.strip()
+        s = s[end:]
diff --git a/synapse/media/filepath.py b/synapse/media/filepath.py
new file mode 100644
index 0000000000..1f6441c412
--- /dev/null
+++ b/synapse/media/filepath.py
@@ -0,0 +1,410 @@
+# Copyright 2014-2016 OpenMarket Ltd
+# Copyright 2020-2021 The Matrix.org Foundation C.I.C.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import functools
+import os
+import re
+import string
+from typing import Any, Callable, List, TypeVar, Union, cast
+
+# Matches media IDs that start with a `dddd-dd-dd` date prefix, e.g.
+# `2017-09-28-fsdRDt24DS234dsf`. IDs which do not match are split into
+# two-character directory prefixes instead.
+NEW_FORMAT_ID_RE = re.compile(r"^\d\d\d\d-\d\d-\d\d")
+
+
+F = TypeVar("F", bound=Callable[..., str])
+
+
+def _wrap_in_base_path(func: F) -> F:
+    """Turn a method returning a relative path into one returning that path
+    joined onto the primary media store's base path (`self.base_path`).
+    """
+
+    @functools.wraps(func)
+    def _with_base_path(self: "MediaFilePaths", *args: Any, **kwargs: Any) -> str:
+        relative_path = func(self, *args, **kwargs)
+        base = self.base_path
+        return os.path.join(base, relative_path)
+
+    return cast(F, _with_base_path)
+
+
+GetPathMethod = TypeVar(
+    "GetPathMethod", bound=Union[Callable[..., str], Callable[..., List[str]]]
+)
+
+
+def _wrap_with_jail_check(relative: bool) -> Callable[[GetPathMethod], GetPathMethod]:
+    """Builds a decorator which checks that the path(s) returned by a method stay
+    inside the media store directory.
+
+    The decorated method may return either a single path or a list of paths; its
+    return value is passed through unchanged when the check succeeds.
+
+    This is a secondary safety net: it is only expected to fail if the decorated
+    method forgot to call `_validate_path_component`, or if
+    `_validate_path_component` is buggy.
+
+    Args:
+        relative: Whether the decorated method returns paths relative to the media
+            store directory. If so, they are joined onto `self.base_path` before
+            being checked.
+
+    Returns:
+        A decorator for path-returning methods. The decorated method raises a
+        `ValueError` if any returned path lies outside the media store directory.
+    """
+
+    def _decorate(func: GetPathMethod) -> GetPathMethod:
+        @functools.wraps(func)
+        def _checked(
+            self: "MediaFilePaths", *args: Any, **kwargs: Any
+        ) -> Union[str, List[str]]:
+            result = func(self, *args, **kwargs)
+            candidates = result if isinstance(result, list) else [result]
+
+            for candidate in candidates:
+                # Build the path that will ultimately be used. Whether `candidate`
+                # is relative to the media store directory cannot be guessed from
+                # the path itself, since the media store directory may be a
+                # relative path too; hence the explicit `relative` flag.
+                full_path = (
+                    os.path.join(self.base_path, candidate) if relative else candidate
+                )
+
+                # `normpath` removes `../`s and `./`s, after which
+                # `os.path.commonpath` tells us whether the path is inside the
+                # media store directory.
+                resolved = os.path.normpath(full_path)
+                jail = self.normalized_base_path
+                if os.path.commonpath([resolved, jail]) != jail:
+                    # Either the path escapes the media store directory, or
+                    # `self.base_path` is `.`, which is an unlikely configuration.
+                    raise ValueError(f"Invalid media store path: {full_path!r}")
+
+                # Caveat: `normpath` turns `a/b/c/../c` into `a/b/c`, yet the two
+                # differ when `a/b/c` is a symlink, so this check can let a
+                # restricted subset of untrustworthy paths through. That is
+                # tolerable because the primary defence is
+                # `_validate_path_component`.
+                #
+                # `os.path.realpath` would resolve symlinks, but breaks down when
+                # there are symlinks inside the media store: eg. if `url_store/`
+                # is symlinked elsewhere, its canonical path no longer matches
+                # that of the main media store directory.
+
+            return result
+
+        return cast(GetPathMethod, _checked)
+
+    return _decorate
+
+
+ALLOWED_CHARACTERS = (
+    set(string.ascii_letters)
+    | set(string.digits)
+    | set("_-")
+    # Domain names, IPv6 addresses and ports in server names
+    | set(".[]:")
+)
+FORBIDDEN_NAMES = {
+    os.path.curdir,  # "." for the current platform
+    os.path.pardir,  # ".." for the current platform
+    "",
+}
+
+
+def _validate_path_component(name: str) -> str:
+    """Ensures that a string is safe to use as a single path component.
+
+    A component is accepted when it consists solely of `ALLOWED_CHARACTERS` and is
+    not one of the `FORBIDDEN_NAMES`.
+
+    Args:
+        name: The candidate path component.
+
+    Returns:
+        `name`, unchanged, if it is valid.
+
+    Raises:
+        ValueError: If `name` cannot be safely used as a path component.
+    """
+    if ALLOWED_CHARACTERS.issuperset(name) and name not in FORBIDDEN_NAMES:
+        return name
+
+    raise ValueError(f"Invalid path component: {name!r}")
+
+
+class MediaFilePaths:
+    """Computes where media files and thumbnails are stored on disk.
+
+    Many of the getters have a `*_rel` variant, which returns a path relative to
+    the base media store path, next to a variant without the suffix which joins
+    that path onto the primary base path. The relative variants are mainly used
+    when we want to write to the backup media store (when one is configured).
+
+    Every path component derived from an argument goes through
+    `_validate_path_component`, and every getter is additionally wrapped with
+    `_wrap_with_jail_check`.
+    """
+
+    def __init__(self, primary_base_path: str):
+        self.base_path = primary_base_path
+        self.normalized_base_path = os.path.normpath(primary_base_path)
+
+        # Paths can only be validated correctly if neither of the platform's path
+        # separators is an allowed character: refuse to initialize otherwise.
+        for separator in (os.path.sep, os.path.altsep):
+            assert separator not in ALLOWED_CHARACTERS
+        # Windows paths have all sorts of weirdness which
+        # `_validate_path_component` does not consider. In any case, the remote
+        # media store can't work correctly for certain homeservers there, since
+        # ":"s aren't allowed in paths.
+        assert os.name == "posix"
+
+    @staticmethod
+    def _sharded_components(identifier: str) -> List[str]:
+        """Validates and returns the `ab`, `cd` and `rest` pieces of `abcdrest`."""
+        return [
+            _validate_path_component(identifier[0:2]),
+            _validate_path_component(identifier[2:4]),
+            _validate_path_component(identifier[4:]),
+        ]
+
+    @staticmethod
+    def _url_cache_components(media_id: str) -> List[str]:
+        """Validates and returns the directory components of a URL cache media ID.
+
+        New-format IDs have the form <DATE><RANDOM_STRING>,
+        e.g. 2017-09-28-fsdRDt24DS234dsf, and are split into the date and whatever
+        follows the character after it. Other IDs are split like any other ID.
+        """
+        if NEW_FORMAT_ID_RE.match(media_id):
+            return [
+                _validate_path_component(media_id[:10]),
+                _validate_path_component(media_id[11:]),
+            ]
+        return MediaFilePaths._sharded_components(media_id)
+
+    @staticmethod
+    def _thumbnail_file_name(
+        width: int, height: int, content_type: str, method: str
+    ) -> str:
+        """Builds the (not yet validated) file name of a thumbnail."""
+        top_level_type, sub_type = content_type.split("/")
+        return "%i-%i-%s-%s-%s" % (width, height, top_level_type, sub_type, method)
+
+    @_wrap_with_jail_check(relative=True)
+    def local_media_filepath_rel(self, media_id: str) -> str:
+        """Relative path of the local media file with the given ID."""
+        return os.path.join("local_content", *self._sharded_components(media_id))
+
+    local_media_filepath = _wrap_in_base_path(local_media_filepath_rel)
+
+    @_wrap_with_jail_check(relative=True)
+    def local_media_thumbnail_rel(
+        self, media_id: str, width: int, height: int, content_type: str, method: str
+    ) -> str:
+        """Relative path of one thumbnail of a piece of local media."""
+        file_name = self._thumbnail_file_name(width, height, content_type, method)
+        return os.path.join(
+            "local_thumbnails",
+            *self._sharded_components(media_id),
+            _validate_path_component(file_name),
+        )
+
+    local_media_thumbnail = _wrap_in_base_path(local_media_thumbnail_rel)
+
+    @_wrap_with_jail_check(relative=False)
+    def local_media_thumbnail_dir(self, media_id: str) -> str:
+        """
+        Retrieve the local store path of thumbnails of a given media_id
+
+        Args:
+            media_id: The media ID to query.
+        Returns:
+            Path of local_thumbnails from media_id
+        """
+        return os.path.join(
+            self.base_path, "local_thumbnails", *self._sharded_components(media_id)
+        )
+
+    @_wrap_with_jail_check(relative=True)
+    def remote_media_filepath_rel(self, server_name: str, file_id: str) -> str:
+        """Relative path of a remote media file, keyed by server and file ID."""
+        return os.path.join(
+            "remote_content",
+            _validate_path_component(server_name),
+            *self._sharded_components(file_id),
+        )
+
+    remote_media_filepath = _wrap_in_base_path(remote_media_filepath_rel)
+
+    @_wrap_with_jail_check(relative=True)
+    def remote_media_thumbnail_rel(
+        self,
+        server_name: str,
+        file_id: str,
+        width: int,
+        height: int,
+        content_type: str,
+        method: str,
+    ) -> str:
+        """Relative path of one thumbnail of a piece of remote media."""
+        file_name = self._thumbnail_file_name(width, height, content_type, method)
+        return os.path.join(
+            "remote_thumbnail",
+            _validate_path_component(server_name),
+            *self._sharded_components(file_id),
+            _validate_path_component(file_name),
+        )
+
+    remote_media_thumbnail = _wrap_in_base_path(remote_media_thumbnail_rel)
+
+    # Legacy path that was used to store thumbnails previously: its file name
+    # lacks the thumbnailing method.
+    # Should be removed after some time, when most of the thumbnails are stored
+    # using the new path.
+    @_wrap_with_jail_check(relative=True)
+    def remote_media_thumbnail_rel_legacy(
+        self, server_name: str, file_id: str, width: int, height: int, content_type: str
+    ) -> str:
+        """Relative path of a remote thumbnail under the legacy naming scheme."""
+        top_level_type, sub_type = content_type.split("/")
+        file_name = "%i-%i-%s-%s" % (width, height, top_level_type, sub_type)
+        return os.path.join(
+            "remote_thumbnail",
+            _validate_path_component(server_name),
+            *self._sharded_components(file_id),
+            _validate_path_component(file_name),
+        )
+
+    @_wrap_with_jail_check(relative=False)
+    def remote_media_thumbnail_dir(self, server_name: str, file_id: str) -> str:
+        """Path of the directory holding the thumbnails of a piece of remote media."""
+        return os.path.join(
+            self.base_path,
+            "remote_thumbnail",
+            _validate_path_component(server_name),
+            *self._sharded_components(file_id),
+        )
+
+    @_wrap_with_jail_check(relative=True)
+    def url_cache_filepath_rel(self, media_id: str) -> str:
+        """Relative path of a URL cache file."""
+        return os.path.join("url_cache", *self._url_cache_components(media_id))
+
+    url_cache_filepath = _wrap_in_base_path(url_cache_filepath_rel)
+
+    @_wrap_with_jail_check(relative=False)
+    def url_cache_filepath_dirs_to_delete(self, media_id: str) -> List[str]:
+        "The dirs to try and remove if we delete the media_id file"
+        if NEW_FORMAT_ID_RE.match(media_id):
+            date_dir = _validate_path_component(media_id[:10])
+            return [os.path.join(self.base_path, "url_cache", date_dir)]
+
+        # Only the two directory prefixes are needed (and validated) here; the
+        # remainder of the ID names the file itself.
+        first_dir = _validate_path_component(media_id[0:2])
+        second_dir = _validate_path_component(media_id[2:4])
+        return [
+            os.path.join(self.base_path, "url_cache", first_dir, second_dir),
+            os.path.join(self.base_path, "url_cache", first_dir),
+        ]
+
+    @_wrap_with_jail_check(relative=True)
+    def url_cache_thumbnail_rel(
+        self, media_id: str, width: int, height: int, content_type: str, method: str
+    ) -> str:
+        """Relative path of one thumbnail of a URL cache entry."""
+        file_name = self._thumbnail_file_name(width, height, content_type, method)
+        return os.path.join(
+            "url_cache_thumbnails",
+            *self._url_cache_components(media_id),
+            _validate_path_component(file_name),
+        )
+
+    url_cache_thumbnail = _wrap_in_base_path(url_cache_thumbnail_rel)
+
+    @_wrap_with_jail_check(relative=True)
+    def url_cache_thumbnail_directory_rel(self, media_id: str) -> str:
+        """Relative path of the directory holding a URL cache entry's thumbnails."""
+        return os.path.join(
+            "url_cache_thumbnails", *self._url_cache_components(media_id)
+        )
+
+    url_cache_thumbnail_directory = _wrap_in_base_path(
+        url_cache_thumbnail_directory_rel
+    )
+
+    @_wrap_with_jail_check(relative=False)
+    def url_cache_thumbnail_dirs_to_delete(self, media_id: str) -> List[str]:
+        "The dirs to try and remove if we delete the media_id thumbnails"
+        components = self._url_cache_components(media_id)
+
+        # Deepest directory first, followed by each of its parents up to, but not
+        # including, `url_cache_thumbnails` itself.
+        return [
+            os.path.join(self.base_path, "url_cache_thumbnails", *components[:depth])
+            for depth in range(len(components), 0, -1)
+        ]
diff --git a/synapse/media/media_repository.py b/synapse/media/media_repository.py
new file mode 100644
index 0000000000..b81e3c2b0c
--- /dev/null
+++ b/synapse/media/media_repository.py
@@ -0,0 +1,1038 @@
+# Copyright 2014-2016 OpenMarket Ltd
+# Copyright 2018-2021 The Matrix.org Foundation C.I.C.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import errno
+import logging
+import os
+import shutil
+from io import BytesIO
+from typing import IO, TYPE_CHECKING, Dict, List, Optional, Set, Tuple
+
+from matrix_common.types.mxc_uri import MXCUri
+
+import twisted.internet.error
+import twisted.web.http
+from twisted.internet.defer import Deferred
+
+from synapse.api.errors import (
+    FederationDeniedError,
+    HttpResponseException,
+    NotFoundError,
+    RequestSendFailed,
+    SynapseError,
+)
+from synapse.config.repository import ThumbnailRequirement
+from synapse.http.site import SynapseRequest
+from synapse.logging.context import defer_to_thread
+from synapse.media._base import (
+    FileInfo,
+    Responder,
+    ThumbnailInfo,
+    get_filename_from_headers,
+    respond_404,
+    respond_with_responder,
+)
+from synapse.media.filepath import MediaFilePaths
+from synapse.media.media_storage import MediaStorage
+from synapse.media.storage_provider import StorageProviderWrapper
+from synapse.media.thumbnailer import Thumbnailer, ThumbnailError
+from synapse.metrics.background_process_metrics import run_as_background_process
+from synapse.types import UserID
+from synapse.util.async_helpers import Linearizer
+from synapse.util.retryutils import NotRetryingDestination
+from synapse.util.stringutils import random_string
+
+if TYPE_CHECKING:
+    from synapse.server import HomeServer
+
+logger = logging.getLogger(__name__)
+
+# How often (in milliseconds) to run the background job to update the "recently
+# accessed" attribute of local and remote media.
+UPDATE_RECENTLY_ACCESSED_TS = 60 * 1000  # 1 minute
+# How often (in milliseconds) to run the background job to check for local and
+# remote media that should be purged according to the configured media retention
+# settings.
+MEDIA_RETENTION_CHECK_PERIOD_MS = 60 * 60 * 1000  # 1 hour
+
+
+class MediaRepository:
+    def __init__(self, hs: "HomeServer"):
+        """Set up the media repository from the homeserver and its configuration.
+
+        Besides storing configuration, this applies the image pixel limit to
+        `Thumbnailer`, instantiates the configured storage providers, and
+        registers the looping background jobs.
+
+        Args:
+            hs: The homeserver, from which the auth handler, federation HTTP
+                client, clock, main datastore and configuration are taken.
+        """
+        self.hs = hs
+        self.auth = hs.get_auth()
+        self.client = hs.get_federation_http_client()
+        self.clock = hs.get_clock()
+        self.server_name = hs.hostname
+        self.store = hs.get_datastores().main
+        self.max_upload_size = hs.config.media.max_upload_size
+        self.max_image_pixels = hs.config.media.max_image_pixels
+
+        Thumbnailer.set_limits(self.max_image_pixels)
+
+        self.primary_base_path: str = hs.config.media.media_store_path
+        self.filepaths: MediaFilePaths = MediaFilePaths(self.primary_base_path)
+
+        self.dynamic_thumbnails = hs.config.media.dynamic_thumbnails
+        self.thumbnail_requirements = hs.config.media.thumbnail_requirements
+
+        # Used to ensure that the same remote media is not downloaded multiple
+        # times concurrently.
+        self.remote_media_linearizer = Linearizer(name="media_remote")
+
+        # Media accessed since the last run of `_update_recently_accessed`:
+        # (server_name, media_id) pairs for remote media, media IDs for local media.
+        self.recently_accessed_remotes: Set[Tuple[str, str]] = set()
+        self.recently_accessed_locals: Set[str] = set()
+
+        self.federation_domain_whitelist = (
+            hs.config.federation.federation_domain_whitelist
+        )
+
+        # List of StorageProviders where we should search for media and
+        # potentially upload to.
+        storage_providers = []
+
+        for (
+            clz,
+            provider_config,
+            wrapper_config,
+        ) in hs.config.media.media_storage_providers:
+            # Instantiate the configured backend, then wrap it with the
+            # local/remote/synchronous storage settings from its wrapper config.
+            backend = clz(hs, provider_config)
+            provider = StorageProviderWrapper(
+                backend,
+                store_local=wrapper_config.store_local,
+                store_remote=wrapper_config.store_remote,
+                store_synchronous=wrapper_config.store_synchronous,
+            )
+            storage_providers.append(provider)
+
+        self.media_storage = MediaStorage(
+            self.hs, self.primary_base_path, self.filepaths, storage_providers
+        )
+
+        # Periodically hand the "recently accessed" sets over to the datastore.
+        self.clock.looping_call(
+            self._start_update_recently_accessed, UPDATE_RECENTLY_ACCESSED_TS
+        )
+
+        # Media retention configuration options
+        self._media_retention_local_media_lifetime_ms = (
+            hs.config.media.media_retention_local_media_lifetime_ms
+        )
+        self._media_retention_remote_media_lifetime_ms = (
+            hs.config.media.media_retention_remote_media_lifetime_ms
+        )
+
+        # Check whether local or remote media retention is configured
+        if (
+            hs.config.media.media_retention_local_media_lifetime_ms is not None
+            or hs.config.media.media_retention_remote_media_lifetime_ms is not None
+        ):
+            # Run the background job to apply media retention rules routinely,
+            # once every `MEDIA_RETENTION_CHECK_PERIOD_MS`.
+            self.clock.looping_call(
+                self._start_apply_media_retention_rules,
+                MEDIA_RETENTION_CHECK_PERIOD_MS,
+            )
+
+    def _start_update_recently_accessed(self) -> Deferred:
+        """Run `_update_recently_accessed` as a background process."""
+        description = "update_recently_accessed_media"
+        return run_as_background_process(description, self._update_recently_accessed)
+
+    def _start_apply_media_retention_rules(self) -> Deferred:
+        """Run `_apply_media_retention_rules` as a background process."""
+        description = "apply_media_retention_rules"
+        return run_as_background_process(
+            description, self._apply_media_retention_rules
+        )
+
+    async def _update_recently_accessed(self) -> None:
+        """Hand the accumulated "recently accessed" media over to the datastore.
+
+        Both in-memory sets are swapped for fresh empty ones first; their previous
+        contents are then passed to `update_cached_last_access_time` along with
+        the current clock time.
+        """
+        remote_media, self.recently_accessed_remotes = (
+            self.recently_accessed_remotes,
+            set(),
+        )
+        local_media, self.recently_accessed_locals = (
+            self.recently_accessed_locals,
+            set(),
+        )
+
+        now_ms = self.clock.time_msec()
+        await self.store.update_cached_last_access_time(
+            local_media, remote_media, now_ms
+        )
+
+    def mark_recently_accessed(self, server_name: Optional[str], media_id: str) -> None:
+        """Record that the given media has just been accessed.
+
+        Args:
+            server_name: Origin server of the media. A falsy value (such as None)
+                means the media is local.
+            media_id: The media ID of the content
+        """
+        if not server_name:
+            self.recently_accessed_locals.add(media_id)
+            return
+
+        self.recently_accessed_remotes.add((server_name, media_id))
+
+    async def create_content(
+        self,
+        media_type: str,
+        upload_name: Optional[str],
+        content: IO,
+        content_length: int,
+        auth_user: UserID,
+    ) -> MXCUri:
+        """Store content uploaded by a local user and return its mxc URI.
+
+        The content is written to media storage under a freshly generated media
+        ID, recorded in the datastore, and thumbnails are generated before the
+        URI is returned.
+
+        Args:
+            media_type: The content type of the file.
+            upload_name: The name of the file, if provided.
+            content: A file like object that is the content to store
+            content_length: The length of the content
+            auth_user: The user_id of the uploader
+
+        Returns:
+            The mxc url of the stored content
+        """
+        media_id = random_string(24)
+
+        stored_path = await self.media_storage.store_file(
+            content, FileInfo(server_name=None, file_id=media_id)
+        )
+        logger.info("Stored local media in file %r", stored_path)
+
+        now_ms = self.clock.time_msec()
+        await self.store.store_local_media(
+            media_id=media_id,
+            media_type=media_type,
+            time_now_ms=now_ms,
+            upload_name=upload_name,
+            media_length=content_length,
+            user_id=auth_user,
+        )
+
+        # For local media, the media ID doubles as the file ID.
+        await self._generate_thumbnails(None, media_id, media_id, media_type)
+
+        return MXCUri(self.server_name, media_id)
+
+    async def get_local_media(
+        self, request: SynapseRequest, media_id: str, name: Optional[str]
+    ) -> None:
+        """Serve a piece of local media, or respond with a 404.
+
+        A 404 is sent when the datastore has no entry for the media, or when the
+        media is quarantined.
+
+        Args:
+            request: The incoming request.
+            media_id: The media ID of the content. (This is the same as
+                the file_id for local content.)
+            name: Optional name that, if specified, will be used as
+                the filename in the Content-Disposition header of the response.
+
+        Returns:
+            Resolves once a response has successfully been written to request
+        """
+        media_info = await self.store.get_local_media(media_id)
+        if not media_info or media_info["quarantined_by"]:
+            respond_404(request)
+            return
+
+        self.mark_recently_accessed(None, media_id)
+
+        # Fall back to a generic content type when none was recorded.
+        media_type = media_info["media_type"] or "application/octet-stream"
+        media_length = media_info["media_length"]
+        # A name supplied by the caller wins over the one stored at upload time.
+        upload_name = name or media_info["upload_name"]
+        is_url_cache = bool(media_info["url_cache"])
+
+        responder = await self.media_storage.fetch_media(
+            FileInfo(None, media_id, url_cache=is_url_cache)
+        )
+        await respond_with_responder(
+            request, responder, media_type, media_length, upload_name
+        )
+
+    async def get_remote_media(
+        self,
+        request: SynapseRequest,
+        server_name: str,
+        media_id: str,
+        name: Optional[str],
+    ) -> None:
+        """Serve a piece of remote media, or respond with a 404.
+
+        Args:
+            request: The incoming request.
+            server_name: Remote server_name where the media originated.
+            media_id: The media ID of the content (as defined by the remote server).
+            name: Optional name that, if specified, will be used as
+                the filename in the Content-Disposition header of the response.
+
+        Returns:
+            Resolves once a response has successfully been written to request
+
+        Raises:
+            FederationDeniedError: if a federation domain whitelist is configured
+                and `server_name` is not on it.
+        """
+        whitelist = self.federation_domain_whitelist
+        if whitelist is not None and server_name not in whitelist:
+            raise FederationDeniedError(server_name)
+
+        self.mark_recently_accessed(server_name, media_id)
+
+        # Linearize on (server, media ID) so that we don't try and download the
+        # same remote media multiple times concurrently.
+        async with self.remote_media_linearizer.queue((server_name, media_id)):
+            responder, media_info = await self._get_remote_media_impl(
+                server_name, media_id
+            )
+
+        if not responder:
+            respond_404(request)
+            return
+
+        # We deliberately stream the file outside the lock
+        await respond_with_responder(
+            request,
+            responder,
+            media_info["media_type"],
+            media_info["media_length"],
+            name or media_info["upload_name"],
+        )
+
+    async def get_remote_media_info(self, server_name: str, media_id: str) -> dict:
+        """Fetch the media info of a remote file, downloading the file if
+        necessary.
+
+        Args:
+            server_name: Remote server_name where the media originated.
+            media_id: The media ID of the content (as defined by the remote server).
+
+        Returns:
+            The media info of the file
+
+        Raises:
+            FederationDeniedError: if a federation domain whitelist is configured
+                and `server_name` is not on it.
+        """
+        whitelist = self.federation_domain_whitelist
+        if whitelist is not None and server_name not in whitelist:
+            raise FederationDeniedError(server_name)
+
+        # Linearize on (server, media ID) so that we don't try and download the
+        # same remote media multiple times concurrently.
+        async with self.remote_media_linearizer.queue((server_name, media_id)):
+            responder, media_info = await self._get_remote_media_impl(
+                server_name, media_id
+            )
+
+        if responder:
+            # Only the info is wanted here, but the responder must still be
+            # entered and exited so that it releases its resources.
+            with responder:
+                pass
+
+        return media_info
+
+    async def _get_remote_media_impl(
+        self, server_name: str, media_id: str
+    ) -> Tuple[Optional[Responder], dict]:
+        """Looks for media in local cache, if not there then attempt to
+        download from remote server.
+
+        Args:
+            server_name: Remote server_name where the media originated.
+            media_id: The media ID of the content (as defined by the
+                remote server).
+
+        Returns:
+            A tuple of responder and the media info of the file. The responder is
+            whatever `media_storage.fetch_media` returned for the file.
+
+        Raises:
+            NotFoundError: if the datastore has an entry for the media and it is
+                quarantined.
+            SynapseError: re-raised unchanged if the download fails with one.
+        """
+        media_info = await self.store.get_cached_remote_media(server_name, media_id)
+
+        # file_id is the ID we use to track the file locally. If we've already
+        # seen the file then reuse the existing ID, otherwise generate a new
+        # one.
+
+        # If we have an entry in the DB, try and look for it
+        if media_info:
+            file_id = media_info["filesystem_id"]
+            file_info = FileInfo(server_name, file_id)
+
+            if media_info["quarantined_by"]:
+                logger.info("Media is quarantined")
+                raise NotFoundError()
+
+            # Fall back to a generic content type when none was recorded.
+            if not media_info["media_type"]:
+                media_info["media_type"] = "application/octet-stream"
+
+            responder = await self.media_storage.fetch_media(file_info)
+            if responder:
+                return responder, media_info
+
+        # Failed to find the file anywhere, lets download it.
+
+        try:
+            media_info = await self._download_remote_file(
+                server_name,
+                media_id,
+            )
+        except SynapseError:
+            # Let SynapseErrors through untouched, skipping the fallback below.
+            raise
+        except Exception as e:
+            # An exception may be because we downloaded media in another
+            # process, so let's check if we magically have the media.
+            media_info = await self.store.get_cached_remote_media(server_name, media_id)
+            if not media_info:
+                raise e
+
+        file_id = media_info["filesystem_id"]
+        if not media_info["media_type"]:
+            media_info["media_type"] = "application/octet-stream"
+        file_info = FileInfo(server_name, file_id)
+
+        # We generate thumbnails even if another process downloaded the media
+        # as a) it's conceivable that the other download request dies before it
+        # generates thumbnails, but mainly b) we want to be sure the thumbnails
+        # have finished being generated before responding to the client,
+        # otherwise they'll request thumbnails and get a 404 if they're not
+        # ready yet.
+        await self._generate_thumbnails(
+            server_name, media_id, file_id, media_info["media_type"]
+        )
+
+        responder = await self.media_storage.fetch_media(file_info)
+        return responder, media_info
+
+    async def _download_remote_file(
+        self,
+        server_name: str,
+        media_id: str,
+    ) -> dict:
+        """Attempt to download the remote file from the given server name,
+        storing it under a newly generated local file ID.
+
+        Args:
+            server_name: Originating server
+            media_id: The media ID of the content (as defined by the
+                remote server). This is different than the file_id, which is
+                locally generated by this method.
+
+        Returns:
+            The media info of the file (media_type, media_length,
+            upload_name, created_ts and filesystem_id).
+        """
+
+        file_id = random_string(24)
+
+        file_info = FileInfo(server_name=server_name, file_id=file_id)
+
+        with self.media_storage.store_into_file(file_info) as (f, fname, finish):
+            request_path = "/".join(
+                ("/_matrix/media/r0/download", server_name, media_id)
+            )
+            try:
+                length, headers = await self.client.get_file(
+                    server_name,
+                    request_path,
+                    output_stream=f,
+                    max_size=self.max_upload_size,
+                    args={
+                        # tell the remote server to 404 if it doesn't
+                        # recognise the server_name, to make sure we don't
+                        # end up with a routing loop.
+                        "allow_remote": "false"
+                    },
+                )
+            except RequestSendFailed as e:
+                logger.warning(
+                    "Request failed fetching remote media %s/%s: %r",
+                    server_name,
+                    media_id,
+                    e,
+                )
+                raise SynapseError(502, "Failed to fetch remote media")
+
+            except HttpResponseException as e:
+                logger.warning(
+                    "HTTP error fetching remote media %s/%s: %s",
+                    server_name,
+                    media_id,
+                    e.response,
+                )
+                if e.code == twisted.web.http.NOT_FOUND:
+                    raise e.to_synapse_error()
+                raise SynapseError(502, "Failed to fetch remote media")
+
+            except SynapseError:
+                logger.warning(
+                    "Failed to fetch remote media %s/%s", server_name, media_id
+                )
+                raise
+            except NotRetryingDestination:
+                logger.warning("Not retrying destination %r", server_name)
+                raise SynapseError(502, "Failed to fetch remote media")
+            except Exception:
+                logger.exception(
+                    "Failed to fetch remote media %s/%s", server_name, media_id
+                )
+                raise SynapseError(502, "Failed to fetch remote media")
+
+            await finish()
+
+            if b"Content-Type" in headers:
+                media_type = headers[b"Content-Type"][0].decode("ascii")
+            else:
+                media_type = "application/octet-stream"
+            upload_name = get_filename_from_headers(headers)
+            time_now_ms = self.clock.time_msec()
+
+            # Multiple remote media download requests can race (when using
+            # multiple media repos), so this may throw a violation constraint
+            # exception. If it does we'll delete the newly downloaded file from
+            # disk (as we're in the ctx manager).
+            #
+            # However: we've already called `finish()` so we may have also
+            # written to the storage providers. This is preferable to the
+            # alternative where we call `finish()` *after* this, where we could
+            # end up having an entry in the DB but fail to write the files to
+            # the storage providers.
+            await self.store.store_cached_remote_media(
+                origin=server_name,
+                media_id=media_id,
+                media_type=media_type,
+                time_now_ms=time_now_ms,
+                upload_name=upload_name,
+                media_length=length,
+                filesystem_id=file_id,
+            )
+
+        logger.info("Stored remote media in file %r", fname)
+
+        media_info = {
+            "media_type": media_type,
+            "media_length": length,
+            "upload_name": upload_name,
+            "created_ts": time_now_ms,
+            "filesystem_id": file_id,
+        }
+
+        return media_info
+
+    def _get_thumbnail_requirements(
+        self, media_type: str
+    ) -> Tuple[ThumbnailRequirement, ...]:
+        """Look up thumbnail sizes for `media_type`, ignoring any parameters."""
+        semicolon = media_type.find(";")
+        base_type = media_type[:semicolon] if semicolon > 0 else media_type
+        return self.thumbnail_requirements.get(base_type, ())
+
+    def _generate_thumbnail(
+        self,
+        thumbnailer: Thumbnailer,
+        t_width: int,
+        t_height: int,
+        t_method: str,
+        t_type: str,
+    ) -> Optional[BytesIO]:
+        """Produce one thumbnail; None if too large or `t_method` is unknown."""
+        src_width, src_height = thumbnailer.width, thumbnailer.height
+        # Skip images whose pixel count reaches the configured limit.
+        if src_width * src_height >= self.max_image_pixels:
+            logger.info(
+                "Image too large to thumbnail %r x %r > %r",
+                src_width,
+                src_height,
+                self.max_image_pixels,
+            )
+            return None
+
+        if thumbnailer.transpose_method is not None:
+            src_width, src_height = thumbnailer.transpose()
+
+        if t_method == "crop":
+            return thumbnailer.crop(t_width, t_height, t_type)
+        if t_method != "scale":
+            return None
+
+        # Never scale beyond the (possibly transposed) source dimensions.
+        t_width, t_height = thumbnailer.aspect(t_width, t_height)
+        t_width, t_height = min(src_width, t_width), min(src_height, t_height)
+        return thumbnailer.scale(t_width, t_height, t_type)
+
+    async def generate_local_exact_thumbnail(
+        self,
+        media_id: str,
+        t_width: int,
+        t_height: int,
+        t_method: str,
+        t_type: str,
+        url_cache: bool,
+    ) -> Optional[str]:
+        """Generate and store one exact-size thumbnail for local media.
+
+        Returns:
+            The path of the stored thumbnail, or None if it could not be made.
+        """
+        source_info = FileInfo(None, media_id, url_cache=url_cache)
+        input_path = await self.media_storage.ensure_media_is_in_local_cache(
+            source_info
+        )
+
+        try:
+            thumbnailer = Thumbnailer(input_path)
+        except ThumbnailError as e:
+            logger.warning(
+                "Unable to generate a thumbnail for local media %s using a method of %s and type of %s: %s",
+                media_id,
+                t_method,
+                t_type,
+                e,
+            )
+            return None
+
+        with thumbnailer:
+            t_byte_source = await defer_to_thread(
+                self.hs.get_reactor(),
+                self._generate_thumbnail,
+                thumbnailer,
+                t_width,
+                t_height,
+                t_method,
+                t_type,
+            )
+
+        if not t_byte_source:
+            # Could not generate thumbnail.
+            return None
+
+        try:
+            thumbnail_info = ThumbnailInfo(
+                width=t_width, height=t_height, method=t_method, type=t_type
+            )
+            file_info = FileInfo(
+                server_name=None,
+                file_id=media_id,
+                url_cache=url_cache,
+                thumbnail=thumbnail_info,
+            )
+            output_path = await self.media_storage.store_file(
+                t_byte_source, file_info
+            )
+        finally:
+            t_byte_source.close()
+
+        logger.info("Stored thumbnail in file %r", output_path)
+        t_len = os.path.getsize(output_path)
+        await self.store.store_local_thumbnail(
+            media_id, t_width, t_height, t_type, t_method, t_len
+        )
+        return output_path
+
+    async def generate_remote_exact_thumbnail(
+        self,
+        server_name: str,
+        file_id: str,
+        media_id: str,
+        t_width: int,
+        t_height: int,
+        t_method: str,
+        t_type: str,
+    ) -> Optional[str]:
+        """Generate and store one exact-size thumbnail for remote media.
+
+        Returns:
+            The path of the stored thumbnail, or None if it could not be made.
+        """
+        source_info = FileInfo(server_name, file_id)
+        input_path = await self.media_storage.ensure_media_is_in_local_cache(
+            source_info
+        )
+
+        try:
+            thumbnailer = Thumbnailer(input_path)
+        except ThumbnailError as e:
+            logger.warning(
+                "Unable to generate a thumbnail for remote media %s from %s using a method of %s and type of %s: %s",
+                media_id,
+                server_name,
+                t_method,
+                t_type,
+                e,
+            )
+            return None
+
+        with thumbnailer:
+            t_byte_source = await defer_to_thread(
+                self.hs.get_reactor(),
+                self._generate_thumbnail,
+                thumbnailer,
+                t_width,
+                t_height,
+                t_method,
+                t_type,
+            )
+
+        if not t_byte_source:
+            # Could not generate thumbnail.
+            return None
+
+        try:
+            thumbnail_info = ThumbnailInfo(
+                width=t_width, height=t_height, method=t_method, type=t_type
+            )
+            file_info = FileInfo(
+                server_name=server_name,
+                file_id=file_id,
+                thumbnail=thumbnail_info,
+            )
+            output_path = await self.media_storage.store_file(
+                t_byte_source, file_info
+            )
+        finally:
+            t_byte_source.close()
+
+        logger.info("Stored thumbnail in file %r", output_path)
+        t_len = os.path.getsize(output_path)
+        await self.store.store_remote_media_thumbnail(
+            server_name,
+            media_id,
+            file_id,
+            t_width,
+            t_height,
+            t_type,
+            t_method,
+            t_len,
+        )
+        return output_path
+
+    async def _generate_thumbnails(
+        self,
+        server_name: Optional[str],
+        media_id: str,
+        file_id: str,
+        media_type: str,
+        url_cache: bool = False,
+    ) -> Optional[dict]:
+        """Generate and store thumbnails for an image.
+
+        Args:
+            server_name: The server name if remote media, else None if local
+            media_id: The media ID of the content. (This is the same as
+                the file_id for local content)
+            file_id: Local file ID
+            media_type: The content type of the file
+            url_cache: If we are thumbnailing images downloaded for the URL cache,
+                used exclusively by the url previewer
+
+        Returns:
+            Dict with "width" and "height" keys of original image or None if the
+            media cannot be thumbnailed.
+        """
+        requirements = self._get_thumbnail_requirements(media_type)
+        if not requirements:
+            return None
+
+        input_path = await self.media_storage.ensure_media_is_in_local_cache(
+            FileInfo(server_name, file_id, url_cache=url_cache)
+        )
+        # NOTE(review): the warning below says "remote media" even for local media.
+        try:
+            thumbnailer = Thumbnailer(input_path)
+        except ThumbnailError as e:
+            logger.warning(
+                "Unable to generate thumbnails for remote media %s from %s of type %s: %s",
+                media_id,
+                server_name,
+                media_type,
+                e,
+            )
+            return None
+
+        with thumbnailer:
+            m_width = thumbnailer.width
+            m_height = thumbnailer.height
+
+            if m_width * m_height >= self.max_image_pixels:
+                logger.info(
+                    "Image too large to thumbnail %r x %r > %r",
+                    m_width,
+                    m_height,
+                    self.max_image_pixels,
+                )
+                return None
+
+            if thumbnailer.transpose_method is not None:
+                m_width, m_height = await defer_to_thread(
+                    self.hs.get_reactor(), thumbnailer.transpose
+                )
+
+            # Deduplicate by (width, height, type): a crop never replaces an existing
+            # entry (setdefault), while a scale always overwrites one with the same key.
+            thumbnails: Dict[Tuple[int, int, str], str] = {}
+            for requirement in requirements:
+                if requirement.method == "crop":
+                    thumbnails.setdefault(
+                        (requirement.width, requirement.height, requirement.media_type),
+                        requirement.method,
+                    )
+                elif requirement.method == "scale":
+                    t_width, t_height = thumbnailer.aspect(
+                        requirement.width, requirement.height
+                    )
+                    t_width = min(m_width, t_width)
+                    t_height = min(m_height, t_height)
+                    thumbnails[
+                        (t_width, t_height, requirement.media_type)
+                    ] = requirement.method
+
+            # Now generate the thumbnail for each remaining entry and store it
+            for (t_width, t_height, t_type), t_method in thumbnails.items():
+                # Generate the thumbnail
+                if t_method == "crop":
+                    t_byte_source = await defer_to_thread(
+                        self.hs.get_reactor(),
+                        thumbnailer.crop,
+                        t_width,
+                        t_height,
+                        t_type,
+                    )
+                elif t_method == "scale":
+                    t_byte_source = await defer_to_thread(
+                        self.hs.get_reactor(),
+                        thumbnailer.scale,
+                        t_width,
+                        t_height,
+                        t_type,
+                    )
+                else:
+                    logger.error("Unrecognized method: %r", t_method)
+                    continue
+
+                if not t_byte_source:
+                    continue
+
+                file_info = FileInfo(
+                    server_name=server_name,
+                    file_id=file_id,
+                    url_cache=url_cache,
+                    thumbnail=ThumbnailInfo(
+                        width=t_width,
+                        height=t_height,
+                        method=t_method,
+                        type=t_type,
+                    ),
+                )
+
+                with self.media_storage.store_into_file(file_info) as (
+                    f,
+                    fname,
+                    finish,
+                ):
+                    try:
+                        await self.media_storage.write_to_file(t_byte_source, f)
+                        await finish()
+                    finally:
+                        t_byte_source.close()
+
+                    t_len = os.path.getsize(fname)
+
+                    # Write to database
+                    if server_name:
+                        # Multiple remote media download requests can race (when
+                        # using multiple media repos), so this may throw a violation
+                        # constraint exception. If it does we'll delete the newly
+                        # generated thumbnail from disk (as we're in the ctx
+                        # manager).
+                        #
+                        # However: we've already called `finish()` so we may have
+                        # also written to the storage providers. This is preferable
+                        # to the alternative where we call `finish()` *after* this,
+                        # where we could end up having an entry in the DB but fail
+                        # to write the files to the storage providers.
+                        try:
+                            await self.store.store_remote_media_thumbnail(
+                                server_name,
+                                media_id,
+                                file_id,
+                                t_width,
+                                t_height,
+                                t_type,
+                                t_method,
+                                t_len,
+                            )
+                        except Exception as e:
+                            thumbnail_exists = (
+                                await self.store.get_remote_media_thumbnail(
+                                    server_name,
+                                    media_id,
+                                    t_width,
+                                    t_height,
+                                    t_type,
+                                )
+                            )
+                            if not thumbnail_exists:
+                                raise e
+                    else:
+                        await self.store.store_local_thumbnail(
+                            media_id, t_width, t_height, t_type, t_method, t_len
+                        )
+
+        return {"width": m_width, "height": m_height}
+
+    async def _apply_media_retention_rules(self) -> None:
+        """
+        Purge old local and remote media according to the media retention rules
+        defined in the homeserver config.
+
+        Remote media is purged first, then local media. Each step is skipped
+        when its configured lifetime is None, and each uses a threshold of
+        "now minus lifetime" in milliseconds.
+        """
+        remote_lifetime_ms = self._media_retention_remote_media_lifetime_ms
+        if remote_lifetime_ms is not None:
+            # Calculate a threshold timestamp derived from the configured lifetime.
+            # Any media that has not been accessed since this timestamp will be
+            # removed.
+            remote_cutoff_ms = self.clock.time_msec() - remote_lifetime_ms
+
+            logger.info(
+                f"Purging remote media last accessed before {remote_cutoff_ms}"
+            )
+
+            await self.delete_old_remote_media(before_ts=remote_cutoff_ms)
+
+        # And now do the same for local media. The threshold is computed only
+        # at this point, i.e. after any remote purge above has completed.
+        local_lifetime_ms = self._media_retention_local_media_lifetime_ms
+        if local_lifetime_ms is not None:
+            local_cutoff_ms = self.clock.time_msec() - local_lifetime_ms
+
+            logger.info(
+                f"Purging local media last accessed before {local_cutoff_ms}"
+            )
+
+            # Media still used in profiles, quarantined media and protected
+            # media are all left in place by this automatic purge.
+            await self.delete_old_local_media(
+                before_ts=local_cutoff_ms,
+                keep_profiles=True,
+                delete_quarantined_media=False,
+                delete_protected_media=False,
+            )
+
+    async def delete_old_remote_media(self, before_ts: int) -> Dict[str, int]:
+        """Delete non-quarantined remote media selected by `before_ts`.
+
+        Returns:
+            A dict whose "deleted" entry counts the media items removed.
+        """
+        old_media = await self.store.get_remote_media_ids(
+            before_ts, include_quarantined_media=False
+        )
+        deleted = 0
+        for media in old_media:
+            origin, media_id = media["media_origin"], media["media_id"]
+            file_id = media["filesystem_id"]
+            key = (origin, media_id)
+
+            logger.info("Deleting: %r", key)
+
+            # TODO: Should we delete from the backup store
+            async with self.remote_media_linearizer.queue(key):
+                full_path = self.filepaths.remote_media_filepath(origin, file_id)
+                try:
+                    os.remove(full_path)
+                except OSError as e:
+                    logger.warning("Failed to remove file: %r", full_path)
+                    # A file that is already missing is fine; other errors skip it.
+                    if e.errno != errno.ENOENT:
+                        continue
+
+                thumbnail_dir = self.filepaths.remote_media_thumbnail_dir(
+                    origin, file_id
+                )
+                shutil.rmtree(thumbnail_dir, ignore_errors=True)
+
+                await self.store.delete_remote_media(origin, media_id)
+                deleted += 1
+
+        return {"deleted": deleted}
+
+    async def delete_local_media_ids(
+        self, media_ids: List[str]
+    ) -> Tuple[List[str], int]:
+        """
+        Delete the given local media IDs from this server.
+
+        Args:
+            media_ids: The media IDs to delete.
+        Returns:
+            A tuple of (list of deleted media IDs, total deleted media IDs).
+        """
+        return await self._remove_local_media_from_disk(media_ids)
+
+    async def delete_old_local_media(
+        self,
+        before_ts: int,
+        size_gt: int = 0,
+        keep_profiles: bool = True,
+        delete_quarantined_media: bool = False,
+        delete_protected_media: bool = False,
+    ) -> Tuple[List[str], int]:
+        """
+        Delete local media from this server by size and timestamp. Removes
+        media files, any thumbnails and cached URLs.
+
+        Args:
+            before_ts: Unix timestamp in ms.
+                Files that were last used before this timestamp will be deleted.
+            size_gt: Size of the media in bytes. Files that are larger will be deleted.
+            keep_profiles: If True, files still used in image data (e.g. user profile,
+                room avatar) are kept. If False these files will be deleted.
+            delete_quarantined_media: If True, media marked as quarantined will be deleted.
+            delete_protected_media: If True, media marked as protected will be deleted.
+
+        Returns:
+            A tuple of (list of deleted media IDs, total deleted media IDs).
+        """
+        old_media = await self.store.get_local_media_ids(
+            before_ts,
+            size_gt,
+            keep_profiles,
+            include_quarantined_media=delete_quarantined_media,
+            include_protected_media=delete_protected_media,
+        )
+        return await self._remove_local_media_from_disk(old_media)
+
+    async def _remove_local_media_from_disk(
+        self, media_ids: List[str]
+    ) -> Tuple[List[str], int]:
+        """
+        Delete local media from this server. Removes media files, any
+        thumbnails and cached URLs.
+
+        Args:
+            media_ids: List of media_id to delete
+        Returns:
+            A tuple of (list of deleted media IDs, total deleted media IDs).
+        """
+        deleted_ids = []
+        for media_id in media_ids:
+            logger.info("Deleting media with ID '%s'", media_id)
+            media_path = self.filepaths.local_media_filepath(media_id)
+            try:
+                os.remove(media_path)
+            except OSError as e:
+                logger.warning("Failed to remove file: %r: %s", media_path, e)
+                # A missing file is tolerated; any other error skips this ID.
+                if e.errno != errno.ENOENT:
+                    continue
+
+            thumbnails_path = self.filepaths.local_media_thumbnail_dir(media_id)
+            shutil.rmtree(thumbnails_path, ignore_errors=True)
+
+            # NOTE(review): local media goes through delete_remote_media - confirm.
+            await self.store.delete_remote_media(self.server_name, media_id)
+
+            await self.store.delete_url_cache((media_id,))
+            await self.store.delete_url_cache_media((media_id,))
+
+            deleted_ids.append(media_id)
+
+        return deleted_ids, len(deleted_ids)
diff --git a/synapse/media/media_storage.py b/synapse/media/media_storage.py
new file mode 100644
index 0000000000..a7e22a91e1
--- /dev/null
+++ b/synapse/media/media_storage.py
@@ -0,0 +1,374 @@
+# Copyright 2018-2021 The Matrix.org Foundation C.I.C.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import contextlib
+import logging
+import os
+import shutil
+from types import TracebackType
+from typing import (
+    IO,
+    TYPE_CHECKING,
+    Any,
+    Awaitable,
+    BinaryIO,
+    Callable,
+    Generator,
+    Optional,
+    Sequence,
+    Tuple,
+    Type,
+)
+
+import attr
+
+from twisted.internet.defer import Deferred
+from twisted.internet.interfaces import IConsumer
+from twisted.protocols.basic import FileSender
+
+import synapse
+from synapse.api.errors import NotFoundError
+from synapse.logging.context import defer_to_thread, make_deferred_yieldable
+from synapse.util import Clock
+from synapse.util.file_consumer import BackgroundFileConsumer
+
+from ._base import FileInfo, Responder
+from .filepath import MediaFilePaths
+
+if TYPE_CHECKING:
+    from synapse.media.storage_provider import StorageProvider
+    from synapse.server import HomeServer
+
+logger = logging.getLogger(__name__)
+
+
+class MediaStorage:
+    """Responsible for storing/fetching files from local sources.
+
+    Args:
+        hs
+        local_media_directory: Base path where we store media on disk
+        filepaths
+        storage_providers: List of StorageProvider that are used to fetch and store files.
+    """
+
+    def __init__(
+        self,
+        hs: "HomeServer",
+        local_media_directory: str,
+        filepaths: MediaFilePaths,
+        storage_providers: Sequence["StorageProvider"],
+    ):
+        self.hs = hs
+        self.reactor = hs.get_reactor()  # used when deferring file I/O to threads
+        self.local_media_directory = local_media_directory  # base dir for media files
+        self.filepaths = filepaths
+        self.storage_providers = storage_providers  # tried in order on fetch/store
+        self.spam_checker = hs.get_spam_checker()  # consulted when a write finishes
+        self.clock = hs.get_clock()
+
+    async def store_file(self, source: IO, file_info: FileInfo) -> str:
+        """Store `source` in the on-disk media store and in every configured
+        storage provider.
+
+        Args:
+            source: A readable file-like object holding the content to store
+            file_info: Info about the file to store
+
+        Returns:
+            The path of the file written in the primary media store
+        """
+        with self.store_into_file(file_info) as (out_file, path, finish):
+            # Copy into the primary store first; finish() must be awaited after a
+            # successful write (see store_into_file).
+            await self.write_to_file(source, out_file)
+            await finish()
+
+        return path
+
+    async def write_to_file(self, source: IO, output: IO) -> None:
+        """Asynchronously write `source` to `output` using `defer_to_thread`."""
+        await defer_to_thread(self.reactor, _write_file_synchronously, source, output)
+
+    @contextlib.contextmanager
+    def store_into_file(
+        self, file_info: FileInfo
+    ) -> Generator[Tuple[BinaryIO, str, Callable[[], Awaitable[None]]], None, None]:
+        """Context manager used to get a file like object to write into, as
+        described by file_info.
+
+        Actually yields a 3-tuple (file, fname, finish_cb), where file is a file
+        like object that can be written to, fname is the absolute path of file
+        on disk, and finish_cb is a function that returns an awaitable.
+
+        fname can be used to read the contents from after upload, e.g. to
+        generate thumbnails.
+
+        finish_cb must be called and waited on after the file has been
+        successfully written to, and not called if there was an error. If the
+        block exits normally without finish_cb completing, an Exception is raised.
+
+        Args:
+            file_info: Info about the file to store
+
+        Example:
+
+            with media_storage.store_into_file(info) as (f, fname, finish_cb):
+                # .. write into f ...
+                await finish_cb()
+        """
+
+        path = self._file_info_to_path(file_info)
+        fname = os.path.join(self.local_media_directory, path)
+
+        dirname = os.path.dirname(fname)
+        os.makedirs(dirname, exist_ok=True)
+
+        finished_called = [False]
+
+        try:
+            with open(fname, "wb") as f:
+                # Defined inside the `with` so that it can flush and close `f`.
+                async def finish() -> None:
+                    # Ensure that all writes have been flushed and close the
+                    # file.
+                    f.flush()
+                    f.close()
+
+                    spam_check = await self.spam_checker.check_media_file_for_spam(
+                        ReadableFileWrapper(self.clock, fname), file_info
+                    )
+                    if spam_check != synapse.module_api.NOT_SPAM:
+                        logger.info("Blocking media due to spam checker")
+                        # Note that we'll delete the stored media, due to the
+                        # try/except below. The media also won't be stored in
+                        # the DB.
+                        # We currently ignore any additional field returned by
+                        # the spam-check API.
+                        raise SpamMediaException(errcode=spam_check[0])
+
+                    for provider in self.storage_providers:
+                        await provider.store_file(path, file_info)
+
+                    finished_called[0] = True
+
+                yield f, fname, finish
+        except Exception as e:
+            try:
+                os.remove(fname)
+            except Exception:
+                pass
+
+            raise e from None
+        # A one-element list is used so that finish() can set the flag.
+        if not finished_called[0]:
+            raise Exception("Finished callback not called")
+
+    async def fetch_media(self, file_info: FileInfo) -> Optional[Responder]:
+        """Look for the media described by file_info, first on local disk and
+        then in each configured storage provider.
+
+        Args:
+            file_info: Describes the media (or thumbnail) to fetch
+
+        Returns:
+            A Responder for the first copy found, or None if there is none.
+        """
+        candidate_paths = [self._file_info_to_path(file_info)]
+        # fallback for remote thumbnails with no method in the filename
+        if file_info.thumbnail and file_info.server_name:
+            candidate_paths.append(
+                self.filepaths.remote_media_thumbnail_rel_legacy(
+                    server_name=file_info.server_name,
+                    file_id=file_info.file_id,
+                    width=file_info.thumbnail.width,
+                    height=file_info.thumbnail.height,
+                    content_type=file_info.thumbnail.type,
+                )
+            )
+
+        for rel_path in candidate_paths:
+            local_path = os.path.join(self.local_media_directory, rel_path)
+            if not os.path.exists(local_path):
+                logger.debug("local file %s did not exist", local_path)
+                continue
+            logger.debug("responding with local file %s", local_path)
+            return FileResponder(open(local_path, "rb"))
+
+        for provider in self.storage_providers:
+            for rel_path in candidate_paths:
+                res: Any = await provider.fetch(rel_path, file_info)
+                if res:
+                    logger.debug("Streaming %s from %s", rel_path, provider)
+                    return res
+                logger.debug("%s not found on %s", rel_path, provider)
+
+        return None
+
+    async def ensure_media_is_in_local_cache(self, file_info: FileInfo) -> str:
+        """Ensure the given file is in the local cache, downloading it if needed.
+
+        Args:
+            file_info: Describes the file to look for
+        Returns:
+            Full path to local file
+        Raises:
+            NotFoundError: if no storage provider returned the file
+        """
+        path = self._file_info_to_path(file_info)
+        local_path = os.path.join(self.local_media_directory, path)
+        if os.path.exists(local_path):
+            return local_path
+
+        # Fallback for paths without method names
+        # Should be removed in the future
+        if file_info.thumbnail and file_info.server_name:
+            legacy_path = self.filepaths.remote_media_thumbnail_rel_legacy(
+                server_name=file_info.server_name,
+                file_id=file_info.file_id,
+                width=file_info.thumbnail.width,
+                height=file_info.thumbnail.height,
+                content_type=file_info.thumbnail.type,
+            )
+            legacy_local_path = os.path.join(self.local_media_directory, legacy_path)
+            if os.path.exists(legacy_local_path):
+                return legacy_local_path
+
+        dirname = os.path.dirname(local_path)
+        os.makedirs(dirname, exist_ok=True)
+        # NOTE(review): a failed fetch may leave a partial file at local_path; confirm.
+        for provider in self.storage_providers:
+            res: Any = await provider.fetch(path, file_info)
+            if res:
+                with res:
+                    consumer = BackgroundFileConsumer(
+                        open(local_path, "wb"), self.reactor
+                    )
+                    await res.write_to_consumer(consumer)
+                    await consumer.wait()
+                return local_path
+
+        raise NotFoundError()
+
+    def _file_info_to_path(self, file_info: FileInfo) -> str:
+        """Converts file_info into a path relative to the base media directory.
+
+        Checks for URL cache media first, then remote media (server_name is
+        set), then local media; thumbnails of each get their own path.
+        """
+        if file_info.url_cache:
+            if file_info.thumbnail:
+                return self.filepaths.url_cache_thumbnail_rel(
+                    media_id=file_info.file_id,
+                    width=file_info.thumbnail.width,
+                    height=file_info.thumbnail.height,
+                    content_type=file_info.thumbnail.type,
+                    method=file_info.thumbnail.method,
+                )
+            return self.filepaths.url_cache_filepath_rel(file_info.file_id)
+
+        if file_info.server_name:
+            if file_info.thumbnail:
+                return self.filepaths.remote_media_thumbnail_rel(
+                    server_name=file_info.server_name,
+                    file_id=file_info.file_id,
+                    width=file_info.thumbnail.width,
+                    height=file_info.thumbnail.height,
+                    content_type=file_info.thumbnail.type,
+                    method=file_info.thumbnail.method,
+                )
+            return self.filepaths.remote_media_filepath_rel(
+                file_info.server_name, file_info.file_id
+            )
+
+        if file_info.thumbnail:
+            return self.filepaths.local_media_thumbnail_rel(
+                media_id=file_info.file_id,
+                width=file_info.thumbnail.width,
+                height=file_info.thumbnail.height,
+                content_type=file_info.thumbnail.type,
+                method=file_info.thumbnail.method,
+            )
+        return self.filepaths.local_media_filepath_rel(file_info.file_id)
+
+
+def _write_file_synchronously(source: IO, dest: IO) -> None:
+    """Copy the whole of `source` into the file like `dest` synchronously.
+    Should be called from a thread.
+
+    Args:
+        source: A file like object that's to be read from.
+        dest: A file like object to be written to.
+    """
+    source.seek(0)  # Ensure we read from the start of the file
+    shutil.copyfileobj(source, dest)
+
+
+class FileResponder(Responder):
+    """Wraps an open file that can be sent to a request.
+
+    Args:
+        open_file: A file like object to be streamed to the client,
+            is closed when the responder's context manager exits.
+    """
+
+    def __init__(self, open_file: IO):
+        self.open_file = open_file
+
+    def write_to_consumer(self, consumer: IConsumer) -> Deferred:
+        return make_deferred_yieldable(
+            FileSender().beginFileTransfer(self.open_file, consumer)
+        )
+
+    def __exit__(
+        self,
+        exc_type: Optional[Type[BaseException]],
+        exc_val: Optional[BaseException],
+        exc_tb: Optional[TracebackType],
+    ) -> None:
+        self.open_file.close()
+
+
+class SpamMediaException(NotFoundError):
+    """The media was blocked by a spam checker. This is a NotFoundError, so we
+    simply 404 the request (in the same way as if it was quarantined).
+    """
+
+
+@attr.s(slots=True, auto_attribs=True)
+class ReadableFileWrapper:
+    """Wrapper that allows reading a file in chunks, yielding to the reactor,
+    and writing to a callback.
+
+    This is a simplified `FileSender` that passes each chunk to a callback
+    rather than writing it to an `IConsumer`.
+    """
+
+    CHUNK_SIZE = 2**14  # Bytes read from the file at a time (16 KiB).
+
+    clock: Clock  # Used to yield to the reactor between chunks.
+    path: str  # Path of the file to read.
+
+    async def write_chunks_to(self, callback: Callable[[bytes], object]) -> None:
+        """Reads the file in chunks and calls the callback with each chunk."""
+
+        with open(self.path, "rb") as file:
+            while True:
+                chunk = file.read(self.CHUNK_SIZE)
+                if not chunk:
+                    break
+
+                callback(chunk)
+
+                # We yield to the reactor by sleeping for 0 seconds.
+                await self.clock.sleep(0)
diff --git a/synapse/media/oembed.py b/synapse/media/oembed.py
new file mode 100644
index 0000000000..c0eaf04be5
--- /dev/null
+++ b/synapse/media/oembed.py
@@ -0,0 +1,265 @@
+#  Copyright 2021 The Matrix.org Foundation C.I.C.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+import html
+import logging
+import urllib.parse
+from typing import TYPE_CHECKING, List, Optional
+
+import attr
+
+from synapse.media.preview_html import parse_html_description
+from synapse.types import JsonDict
+from synapse.util import json_decoder
+
+if TYPE_CHECKING:
+    from lxml import etree
+
+    from synapse.server import HomeServer
+
+logger = logging.getLogger(__name__)
+
+
+@attr.s(slots=True, frozen=True, auto_attribs=True)
+class OEmbedResult:
+    # The Open Graph result (converted from the oEmbed result).
+    open_graph_result: JsonDict
+    # The author_name of the oEmbed result, or None if it isn't a string.
+    author_name: Optional[str]
+    # Number of milliseconds to cache the content (the oEmbed cache_age times 1000).
+    #
+    # This will be None if no valid cache_age is provided in the oEmbed response
+    # (or if the oEmbed response cannot be turned into an Open Graph response).
+    cache_age: Optional[int]
+
+
+class OEmbedProvider:
+    """
+    A helper for accessing oEmbed content.
+
+    It can be used to check if a URL should be accessed via oEmbed and for
+    requesting/parsing oEmbed content.
+    """
+
+    def __init__(self, hs: "HomeServer"):
+        self._oembed_patterns = {}  # URL pattern -> oEmbed API endpoint.
+        for oembed_endpoint in hs.config.oembed.oembed_patterns:
+            api_endpoint = oembed_endpoint.api_endpoint
+
+            # Only JSON is supported at the moment. This could be declared in
+            # the formats field. Otherwise, if the endpoint ends in .xml assume
+            # it doesn't support JSON.
+            if (
+                oembed_endpoint.formats is not None
+                and "json" not in oembed_endpoint.formats
+            ) or api_endpoint.endswith(".xml"):
+                logger.info(
+                    "Ignoring oEmbed endpoint due to not supporting JSON: %s",
+                    api_endpoint,
+                )
+                continue
+
+            # Iterate through each URL pattern and point it to the endpoint.
+            for pattern in oembed_endpoint.url_patterns:
+                self._oembed_patterns[pattern] = api_endpoint
+
+    def get_oembed_url(self, url: str) -> Optional[str]:
+        """
+        Check whether the URL should be downloaded as oEmbed content instead.
+
+        Args:
+            url: The URL to check.
+
+        Returns:
+            A URL to use instead or None if the original URL should be used.
+        """
+        for url_pattern, endpoint in self._oembed_patterns.items():
+            if url_pattern.fullmatch(url):
+                # TODO Specify max height / width.
+
+                # Note that only the JSON format is supported, some endpoints want
+                # this in the URL, others want it as an argument.
+                endpoint = endpoint.replace("{format}", "json")
+
+                args = {"url": url, "format": "json"}
+                query_str = urllib.parse.urlencode(args, True)
+                return f"{endpoint}?{query_str}"
+
+        # No match.
+        return None
+
+    def autodiscover_from_html(self, tree: "etree.Element") -> Optional[str]:
+        """
+        Search an HTML document for oEmbed autodiscovery information.
+
+        Args:
+            tree: The parsed HTML body.
+
+        Returns:
+            The URL to use for oEmbed information, or None if no URL was found.
+        """
+        # Search for link elements with the proper rel and type attributes.
+        for tag in tree.xpath(
+            "//link[@rel='alternate'][@type='application/json+oembed']"
+        ):
+            if "href" in tag.attrib:
+                return tag.attrib["href"]
+
+        # Some providers (e.g. Flickr) use alternative instead of alternate.
+        for tag in tree.xpath(
+            "//link[@rel='alternative'][@type='application/json+oembed']"
+        ):
+            if "href" in tag.attrib:
+                return tag.attrib["href"]
+
+        return None
+
+    def parse_oembed_response(self, url: str, raw_body: bytes) -> OEmbedResult:
+        """
+        Parse the oEmbed response into an Open Graph response.
+
+        Args:
+            url: The URL being previewed (not the one which was requested).
+            raw_body: The oEmbed response as JSON encoded as bytes.
+
+        Returns:
+            The Open Graph data, author and cache age (empty if unparseable).
+        """
+        try:
+            # oEmbed responses *must* be UTF-8 encoded JSON objects per the spec.
+            oembed = json_decoder.decode(raw_body.decode("utf-8"))
+            if not isinstance(oembed, dict):
+                raise ValueError("oEmbed response is not a JSON object")
+        except ValueError:
+            return OEmbedResult({}, None, None)
+
+        # The version is a required string field, but not always provided,
+        # or sometimes provided as a float. Be lenient.
+        oembed_version = oembed.get("version", "1.0")
+        if oembed_version != "1.0" and oembed_version != 1:
+            return OEmbedResult({}, None, None)
+
+        # Attempt to parse the cache age, if possible.
+        try:
+            cache_age = int(oembed.get("cache_age")) * 1000
+        except (TypeError, ValueError):
+            # If the cache age cannot be parsed (e.g. wrong type or invalid
+            # string), ignore it.
+            cache_age = None
+
+        # The oEmbed response converted to Open Graph.
+        open_graph_response: JsonDict = {"og:url": url}
+
+        title = oembed.get("title")
+        if title and isinstance(title, str):
+            # A common WordPress plug-in seems to incorrectly escape entities
+            # in the oEmbed response.
+            open_graph_response["og:title"] = html.unescape(title)
+
+        author_name = oembed.get("author_name")
+        if not isinstance(author_name, str):
+            author_name = None
+
+        # Use the provider name as the site name.
+        provider_name = oembed.get("provider_name")
+        if provider_name and isinstance(provider_name, str):
+            open_graph_response["og:site_name"] = provider_name
+
+        # If a thumbnail exists, use it. Note that dimensions will be calculated later.
+        thumbnail_url = oembed.get("thumbnail_url")
+        if thumbnail_url and isinstance(thumbnail_url, str):
+            open_graph_response["og:image"] = thumbnail_url
+
+        # Process each type separately.
+        oembed_type = oembed.get("type")
+        if oembed_type == "rich":
+            html_str = oembed.get("html")
+            if isinstance(html_str, str):
+                calc_description_and_urls(open_graph_response, html_str)
+
+        elif oembed_type == "photo":
+            # If this is a photo, use the full image, not the thumbnail.
+            photo_url = oembed.get("url")
+            if photo_url and isinstance(photo_url, str):
+                open_graph_response["og:image"] = photo_url
+
+        elif oembed_type == "video":
+            open_graph_response["og:type"] = "video.other"
+            html_str = oembed.get("html")
+            if html_str and isinstance(html_str, str):
+                calc_description_and_urls(open_graph_response, html_str)
+            for size in ("width", "height"):
+                val = oembed.get(size)
+                if type(val) is int:
+                    open_graph_response[f"og:video:{size}"] = val
+
+        elif oembed_type == "link":
+            open_graph_response["og:type"] = "website"
+
+        else:
+            logger.warning("Unknown oEmbed type: %s", oembed_type)
+
+        return OEmbedResult(open_graph_response, author_name, cache_age)
+
+
+def _fetch_urls(tree: "etree.Element", tag_name: str) -> List[str]:
+    """Return the `src` attribute of every `tag_name` element which has one."""
+    # The query selects `tag_name` elements which are the child of any element.
+    matching_tags = tree.xpath("//*/" + tag_name)
+    attribute_maps = (node.attrib for node in matching_tags)
+    return [attrs["src"] for attrs in attribute_maps if "src" in attrs]
+
+
+def calc_description_and_urls(open_graph_response: JsonDict, html_body: str) -> None:
+    """
+    Calculate a description and media URLs for an HTML document.
+
+    This uses lxml to parse the HTML document. If the body is empty, or no
+    tree can be parsed from it, open_graph_response is left untouched.
+
+    Args:
+        open_graph_response: The current Open Graph summary. This is updated with additional fields.
+        html_body: The HTML document, as a string.
+
+    Returns:
+        None: the results are written into open_graph_response.
+    """
+    # If there's no body, nothing useful is going to be found.
+    if not html_body:
+        return
+
+    from lxml import etree
+
+    # Create an HTML parser which attempts to recover from broken HTML.
+    parser = etree.HTMLParser(recover=True, encoding="utf-8")
+
+    # Attempt to parse the body. Note that errors raised here are not caught.
+    tree = etree.fromstring(html_body, parser)
+
+    # The data was successfully parsed, but no tree was found.
+    if tree is None:
+        return
+
+    # Find interesting URLs (images, videos, embeds); keep any existing og:image.
+    if "og:image" not in open_graph_response:
+        image_urls = _fetch_urls(tree, "img")
+        if image_urls:
+            open_graph_response["og:image"] = image_urls[0]
+
+    video_urls = _fetch_urls(tree, "video") + _fetch_urls(tree, "embed")
+    if video_urls:
+        open_graph_response["og:video"] = video_urls[0]
+
+    description = parse_html_description(tree)
+    if description:
+        open_graph_response["og:description"] = description
diff --git a/synapse/media/preview_html.py b/synapse/media/preview_html.py
new file mode 100644
index 0000000000..516d0434f0
--- /dev/null
+++ b/synapse/media/preview_html.py
@@ -0,0 +1,501 @@
+# Copyright 2021 The Matrix.org Foundation C.I.C.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import codecs
+import logging
+import re
+from typing import (
+    TYPE_CHECKING,
+    Callable,
+    Dict,
+    Generator,
+    Iterable,
+    List,
+    Optional,
+    Set,
+    Union,
+)
+
+if TYPE_CHECKING:
+    from lxml import etree
+
+logger = logging.getLogger(__name__)
+
+_charset_match = re.compile(  # A charset declared inside a <meta> tag.
+    rb'<\s*meta[^>]*charset\s*=\s*"?([a-z0-9_-]+)"?', flags=re.I
+)
+_xml_encoding_match = re.compile(  # The encoding of an <?xml ...?> declaration.
+    rb'\s*<\s*\?\s*xml[^>]*encoding="([a-z0-9_-]+)"', flags=re.I
+)
+_content_type_match = re.compile(r'.*; *charset="?(.*?)"?(;|$)', flags=re.I)
+
+# Elements with these ARIA roles aren't meant for display.
+ARIA_ROLES_TO_IGNORE = {"directory", "menu", "menubar", "toolbar"}
+
+
+def _normalise_encoding(encoding: str) -> Optional[str]:
+    """Use the Python codec's name as the normalised entry, or None if unknown."""
+    try:
+        return codecs.lookup(encoding).name
+    except LookupError:
+        return None
+
+
+def _get_html_media_encodings(
+    body: bytes, content_type: Optional[str]
+) -> Iterable[str]:
+    """
+    Get potential encoding of the body based on the (presumably) HTML body or the content-type header.
+
+    The precedence used for finding a character encoding is:
+
+    1. <meta> tag with a charset declared.
+    2. The XML document's character encoding attribute.
+    3. The Content-Type header.
+    4. Fallback to utf-8.
+    5. Fallback to windows-1252.
+
+    This roughly follows the algorithm used by BeautifulSoup's bs4.dammit.EncodingDetector.
+
+    Args:
+        body: The HTML document, as bytes.
+        content_type: The Content-Type header.
+
+    Returns:
+        The candidate encodings, most preferred first and without duplicates.
+    """
+    # There's no point in returning an encoding more than once.
+    attempted_encodings: Set[str] = set()
+
+    # Limit searches to the first 1kb, since it ought to be at the top.
+    body_start = body[:1024]
+
+    # Check if it has an encoding set in a meta tag.
+    match = _charset_match.search(body_start)
+    if match:
+        encoding = _normalise_encoding(match.group(1).decode("ascii"))
+        if encoding:
+            attempted_encodings.add(encoding)
+            yield encoding
+
+    # TODO Support <meta http-equiv="Content-Type" content="text/html; charset=utf-8"/>
+
+    # Check if it starts with an XML declaration which has an encoding.
+    match = _xml_encoding_match.match(body_start)
+    if match:
+        encoding = _normalise_encoding(match.group(1).decode("ascii"))
+        if encoding and encoding not in attempted_encodings:
+            attempted_encodings.add(encoding)
+            yield encoding
+
+    # Check the HTTP Content-Type header for a character set.
+    if content_type:
+        content_match = _content_type_match.match(content_type)
+        if content_match:
+            encoding = _normalise_encoding(content_match.group(1))
+            if encoding and encoding not in attempted_encodings:
+                attempted_encodings.add(encoding)
+                yield encoding
+
+    # Finally, fallback to UTF-8, then windows-1252.
+    for fallback in ("utf-8", "cp1252"):
+        if fallback not in attempted_encodings:
+            yield fallback
+
+
+def decode_body(
+    body: bytes, uri: str, content_type: Optional[str] = None
+) -> Optional["etree.Element"]:
+    """
+    Parse the HTML document with lxml, using the first encoding able to decode it.
+
+    Args:
+        body: The HTML document, as bytes.
+        uri: The URI used to download the body (only used for logging).
+        content_type: The Content-Type header.
+
+    Returns:
+        The parsed HTML body, or None if it is empty or could not be decoded.
+    """
+    # If there's no body, nothing useful is going to be found.
+    if not body:
+        return None
+
+    # The idea here is that multiple encodings are tried until one works.
+    # Unfortunately the result is never used and then LXML will decode the string
+    # again with the found encoding.
+    for encoding in _get_html_media_encodings(body, content_type):
+        try:
+            body.decode(encoding)
+        except Exception:
+            pass  # This encoding doesn't work, try the next candidate.
+        else:
+            break
+    else:  # No candidate encoding was able to decode the body.
+        logger.warning("Unable to decode HTML body for %s", uri)
+        return None
+
+    from lxml import etree
+
+    # Create an HTML parser which uses the encoding found above.
+    parser = etree.HTMLParser(recover=True, encoding=encoding)
+
+    # Attempt to parse the body. Returns None if the body was successfully
+    # parsed, but no tree was found.
+    return etree.fromstring(body, parser)
+
+
+def _get_meta_tags(
+    tree: "etree.Element",
+    property: str,
+    prefix: str,
+    property_mapper: Optional[Callable[[str], Optional[str]]] = None,
+) -> Dict[str, Optional[str]]:
+    """
+    Search for meta tags prefixed with a particular string.
+
+    Args:
+        tree: The parsed HTML document.
+        property: The name of the property which contains the tag name, e.g.
+            "property" for Open Graph.
+        prefix: The prefix on the property to search for, e.g. "og" for Open Graph.
+        property_mapper: An optional callable to map the property to the Open Graph
+            form. Can return None for a key to ignore that key.
+
+    Returns:
+        A map of tag name to value, or an empty map if there are too many tags.
+    """
+    results: Dict[str, Optional[str]] = {}
+    for tag in tree.xpath(
+        f"//*/meta[starts-with(@{property}, '{prefix}:')][@content][not(@content='')]"
+    ):
+        # If there are more than 50 such tags, give up and return nothing.
+        if len(results) >= 50:
+            logger.warning(
+                "Skipping parsing of Open Graph for page with too many '%s:' tags",
+                prefix,
+            )
+            return {}
+
+        key = tag.attrib[property]
+        if property_mapper:
+            key = property_mapper(key)
+            # None is a special value used to ignore a value.
+            if key is None:
+                continue
+
+        results[key] = tag.attrib["content"]  # A repeated key keeps its last value.
+
+    return results
+
+
+def _map_twitter_to_open_graph(key: str) -> Optional[str]:
+    """
+    Translate a Twitter card property name into its Open Graph equivalent.
+
+    Args:
+        key: The Twitter card property name (starts with "twitter:").
+
+    Returns:
+        The matching Open Graph property name (starts with "og:"), or None if
+        the property has no Open Graph analogue and should be ignored.
+    """
+    # These Twitter card properties have no Open Graph counterpart.
+    if key in ("twitter:card", "twitter:creator"):
+        return None
+    # The site handle is exposed as the Open Graph site name; every other
+    # property just has its leading "twitter" (7 characters) swapped for "og".
+    renamed = "og:site_name" if key == "twitter:site" else "og" + key[7:]
+    return renamed
+
+
+def parse_html_to_open_graph(tree: "etree.Element") -> Dict[str, Optional[str]]:
+    """
+    Parse the HTML document into an Open Graph response.
+
+    This uses lxml to search the HTML document for Open Graph data, then for
+    Twitter Card data, and synthesizes the title, image and description if absent.
+
+    Args:
+        tree: The parsed HTML document.
+
+    Returns:
+        The Open Graph response as a dictionary; og:title may map to None.
+    """
+
+    # Search for Open Graph (og:) meta tags, e.g.:
+    #
+    # "og:type"         : "video",
+    # "og:url"          : "https://www.youtube.com/watch?v=LXDBoHyjmtw",
+    # "og:site_name"    : "YouTube",
+    # "og:video:type"   : "application/x-shockwave-flash",
+    # "og:description"  : "Fun stuff happening here",
+    # "og:title"        : "RemoteJam - Matrix team hack for Disrupt Europe Hackathon",
+    # "og:image"        : "https://i.ytimg.com/vi/LXDBoHyjmtw/maxresdefault.jpg",
+    # "og:video:url"    : "http://www.youtube.com/v/LXDBoHyjmtw?version=3&autohide=1",
+    # "og:video:width"  : "1280",
+    # "og:video:height" : "720",
+    # "og:video:secure_url": "https://www.youtube.com/v/LXDBoHyjmtw?version=3",
+
+    og = _get_meta_tags(tree, "property", "og")
+
+    # TODO: Search for properties specific to the different Open Graph types,
+    # such as article: meta tags, e.g.:
+    #
+    # "article:publisher" : "https://www.facebook.com/thethudonline" />
+    # "article:author" content="https://www.facebook.com/thethudonline" />
+    # "article:tag" content="baby" />
+    # "article:section" content="Breaking News" />
+    # "article:published_time" content="2016-03-31T19:58:24+00:00" />
+    # "article:modified_time" content="2016-04-01T18:31:53+00:00" />
+
+    # Search for Twitter Card (twitter:) meta tags, e.g.:
+    #
+    # "twitter:site"    : "@matrixdotorg"
+    # "twitter:creator" : "@matrixdotorg"
+    #
+    # Twitter cards tags also duplicate Open Graph tags.
+    #
+    # See https://developer.twitter.com/en/docs/twitter-for-websites/cards/guides/getting-started
+    twitter = _get_meta_tags(tree, "name", "twitter", _map_twitter_to_open_graph)
+    # Merge the Twitter values with the Open Graph values, but do not overwrite
+    # information from Open Graph tags.
+    for key, value in twitter.items():
+        if key not in og:
+            og[key] = value
+
+    if "og:title" not in og:
+        # Attempt to find a title from the title tag, or the biggest header on the page.
+        title = tree.xpath("((//title)[1] | (//h1)[1] | (//h2)[1] | (//h3)[1])/text()")
+        if title:
+            og["og:title"] = title[0].strip()
+        else:
+            og["og:title"] = None
+
+    if "og:image" not in og:
+        meta_image = tree.xpath(
+            "//*/meta[translate(@itemprop, 'IMAGE', 'image')='image'][not(@content='')]/@content[1]"
+        )
+        # If a meta image is found, use it.
+        if meta_image:
+            og["og:image"] = meta_image[0]
+        else:
+            # Try to find images which are larger than 10px by 10px.
+            #
+            # TODO: consider inlined CSS styles as well as width & height attribs
+            images = tree.xpath("//img[@src][number(@width)>10][number(@height)>10]")
+            images = sorted(  # Largest area (width * height) first.
+                images,
+                key=lambda i: (
+                    -1 * float(i.attrib["width"]) * float(i.attrib["height"])
+                ),
+            )
+            # If no images were found, try to find *any* images.
+            if not images:
+                images = tree.xpath("//img[@src][1]")
+            if images:
+                og["og:image"] = images[0].attrib["src"]
+
+            # Finally, fallback to the favicon if nothing else.
+            else:
+                favicons = tree.xpath("//link[@href][contains(@rel, 'icon')]/@href[1]")
+                if favicons:
+                    og["og:image"] = favicons[0]
+
+    if "og:description" not in og:
+        # Check the first meta description tag for content.
+        meta_description = tree.xpath(
+            "//*/meta[translate(@name, 'DESCRIPTION', 'description')='description'][not(@content='')]/@content[1]"
+        )
+        # If a meta description is found with content, use it.
+        if meta_description:
+            og["og:description"] = meta_description[0]
+        else:
+            og["og:description"] = parse_html_description(tree)
+    elif og["og:description"]:
+        # A non-empty description was provided by the page: shorten it if needed.
+        assert isinstance(og["og:description"], str)
+        og["og:description"] = summarize_paragraphs([og["og:description"]])
+
+    # TODO: delete the url downloads to stop diskfilling,
+    # as we only ever cared about its OG
+    return og
+
+
+def parse_html_description(tree: "etree.Element") -> Optional[str]:
+    """
+    Calculate a text description based on an HTML document.
+
+    Grabs any text nodes which are inside the <body/> tag, unless they are within
+    an HTML5 semantic markup tag (<header/>, <nav/>, <aside/>, <footer/>), or
+    within a <script/>, <noscript/>, <svg/>, <style/> or <img/> tag or a comment,
+    or within a tag whose content is usually only shown to old browsers
+    (<iframe/>, <video/>, <canvas/>, <picture/>).
+
+    This is a very very very coarse approximation to a plain text render of the page.
+
+    Args:
+        tree: The parsed HTML document.
+
+    Returns:
+        The plain text description, or None if one cannot be generated.
+    """
+    # We don't just use XPATH here as that is slow on some machines.
+
+    from lxml import etree
+
+    TAGS_TO_REMOVE = {
+        "header",
+        "nav",
+        "aside",
+        "footer",
+        "script",
+        "noscript",
+        "style",
+        "svg",
+        "iframe",
+        "video",
+        "canvas",
+        "img",
+        "picture",
+        etree.Comment,
+    }
+
+    # Collapse each run of whitespace in a text node into a single new line
+    # and strip its ends; the nodes are then summarized as paragraphs.
+    text_nodes = (
+        re.sub(r"\s+", "\n", el).strip()
+        for el in _iterate_over_text(tree.find("body"), TAGS_TO_REMOVE)
+    )
+    return summarize_paragraphs(text_nodes)
+
+
+def _iterate_over_text(
+    tree: Optional["etree.Element"],
+    tags_to_ignore: Set[Union[str, "etree.Comment"]],
+    stack_limit: int = 1024,
+) -> Generator[str, None, None]:
+    """Iterate over the tree returning text nodes in a depth first fashion,
+    skipping text nodes inside certain tags.
+
+    Args:
+        tree: The parent element to iterate. Can be None if there isn't one.
+        tags_to_ignore: Set of tags whose text and children are skipped.
+        stack_limit: Maximum stack size limit for depth-first traversal.
+            Nodes will be dropped if this limit is hit, which may truncate the
+            textual result.
+            Intended to limit the maximum working memory when generating a preview.
+    """
+
+    if tree is None:
+        return
+
+    # This is a stack whose items are elements to iterate over *or* strings
+    # to be returned.
+    elements: List[Union[str, "etree.Element"]] = [tree]
+    while elements:
+        el = elements.pop()
+
+        if isinstance(el, str):
+            yield el
+        elif el.tag not in tags_to_ignore:
+            # If the element's ARIA role isn't meant for display, ignore it.
+            if el.get("role") in ARIA_ROLES_TO_IGNORE:
+                continue
+
+            # el.text is the text before the first child, so we can immediately
+            # return it if the text exists.
+            if el.text:
+                yield el.text
+
+            # We add to the stack all the element's children, interspersed with
+            # each child's tail text (if it exists).
+            #
+            # We iterate in reverse order so that earlier pieces of text appear
+            # closer to the top of the stack.
+            for child in el.iterchildren(reversed=True):
+                if len(elements) > stack_limit:
+                    # We've hit our limit for working memory
+                    break
+
+                if child.tail:
+                    # The tail text of a node is text that comes *after* the node,
+                    # so we always include it even if we ignore the child node.
+                    elements.append(child.tail)
+
+                elements.append(child)
+
+
+def summarize_paragraphs(
+    text_nodes: Iterable[str], min_size: int = 200, max_size: int = 500
+) -> Optional[str]:
+    """
+    Try to get a summary respecting first paragraph and then word boundaries.
+
+    Args:
+        text_nodes: The paragraphs to summarize.
+        min_size: Stop adding paragraphs once this many characters are reached.
+        max_size: The maximum number of characters to include (before the ellipsis).
+
+    Returns:
+        A summary of the text nodes, or None if that was not possible.
+    """
+
+    # TODO: Respect sentences?
+
+    description = ""
+
+    # Keep adding paragraphs until we get to the MIN_SIZE.
+    for text_node in text_nodes:
+        if len(description) < min_size:
+            text_node = re.sub(r"[\t \r\n]+", " ", text_node)
+            description += text_node + "\n\n"
+        else:
+            break
+
+    description = description.strip()
+    description = re.sub(r"[\t ]+", " ", description)
+    description = re.sub(r"[\t \r\n]*[\r\n]+", "\n\n", description)
+
+    # If the concatenation of paragraphs to get above MIN_SIZE
+    # took us over MAX_SIZE, then we need to truncate mid paragraph
+    if len(description) > max_size:
+        new_desc = ""
+
+        # This splits the paragraph into words, but keeping the
+        # (preceding) whitespace intact so we can easily concat
+        # words back together.
+        for match in re.finditer(r"\s*\S+", description):
+            word = match.group()
+
+            # Keep adding words while the total length is less than
+            # MAX_SIZE.
+            if len(word) + len(new_desc) < max_size:
+                new_desc += word
+            else:
+                # At this point the next word *will* take us over
+                # MAX_SIZE, but we also want to ensure that its not
+                # a huge word. If it is add it anyway and we'll
+                # truncate later.
+                if len(new_desc) < min_size:
+                    new_desc += word
+                break
+
+        # Double check that we're not over the limit
+        if len(new_desc) > max_size:
+            new_desc = new_desc[:max_size]
+
+        # We always add an ellipsis because at the very least
+        # we chopped mid paragraph.
+        description = new_desc.strip() + "…"
+    return description if description else None
diff --git a/synapse/media/storage_provider.py b/synapse/media/storage_provider.py
new file mode 100644
index 0000000000..1c9b71d69c
--- /dev/null
+++ b/synapse/media/storage_provider.py
@@ -0,0 +1,181 @@
+# Copyright 2018-2021 The Matrix.org Foundation C.I.C.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import abc
+import logging
+import os
+import shutil
+from typing import TYPE_CHECKING, Callable, Optional
+
+from synapse.config._base import Config
+from synapse.logging.context import defer_to_thread, run_in_background
+from synapse.util.async_helpers import maybe_awaitable
+
+from ._base import FileInfo, Responder
+from .media_storage import FileResponder
+
+logger = logging.getLogger(__name__)
+
+if TYPE_CHECKING:
+    from synapse.server import HomeServer
+
+
+class StorageProvider(metaclass=abc.ABCMeta):
+    """A storage provider is a service that can store uploaded media and
+    retrieve them.
+    """
+
+    @abc.abstractmethod
+    async def store_file(self, path: str, file_info: FileInfo) -> None:
+        """Store the file described by file_info. The actual contents can be
+        retrieved by reading the file in file_info.upload_path.
+
+        Args:
+            path: Relative path of file in local cache
+            file_info: The metadata of the file.
+        """
+
+    @abc.abstractmethod
+    async def fetch(self, path: str, file_info: FileInfo) -> Optional[Responder]:
+        """Attempt to fetch the file described by file_info and return a
+        Responder for it.
+
+        Args:
+            path: Relative path of file in local cache
+            file_info: The metadata of the file.
+
+        Returns:
+            Returns a Responder if the provider has the file, otherwise returns None.
+        """
+
+
+class StorageProviderWrapper(StorageProvider):
+    """Wraps a storage provider and provides various config options
+
+    Args:
+        backend: The storage provider to wrap.
+        store_local: Whether to store new local files or not.
+        store_synchronous: Whether to wait for file to be successfully
+            uploaded, or to do the upload in the background.
+        store_remote: Whether remote media should be uploaded
+    """
+
+    def __init__(
+        self,
+        backend: StorageProvider,
+        store_local: bool,
+        store_synchronous: bool,
+        store_remote: bool,
+    ):
+        self.backend = backend
+        self.store_local = store_local
+        self.store_synchronous = store_synchronous
+        self.store_remote = store_remote
+
+    def __str__(self) -> str:
+        return "StorageProviderWrapper[%s]" % (self.backend,)
+
+    async def store_file(self, path: str, file_info: FileInfo) -> None:
+        if not file_info.server_name and not self.store_local:
+            return None
+
+        if file_info.server_name and not self.store_remote:
+            return None
+
+        if file_info.url_cache:
+            # The URL preview cache is short lived and not worth offloading or
+            # backing up.
+            return None
+
+        if self.store_synchronous:
+            # store_file is supposed to return an Awaitable, but guard
+            # against improper implementations.
+            await maybe_awaitable(self.backend.store_file(path, file_info))  # type: ignore
+        else:
+            # TODO: Handle errors.
+            async def store() -> None:
+                try:
+                    return await maybe_awaitable(
+                        self.backend.store_file(path, file_info)
+                    )
+                except Exception:
+                    logger.exception("Error storing file")
+
+            run_in_background(store)
+
+    async def fetch(self, path: str, file_info: FileInfo) -> Optional[Responder]:
+        if file_info.url_cache:
+            # Files in the URL preview cache definitely aren't stored here,
+            # so avoid any potentially slow I/O or network access.
+            return None
+
+        # fetch is supposed to return an Awaitable, but guard
+        # against improper implementations.
+        return await maybe_awaitable(self.backend.fetch(path, file_info))
+
+
+class FileStorageProviderBackend(StorageProvider):
+    """A storage provider that stores files in a directory on a filesystem.
+
+    Args:
+        hs
+        config: The config returned by `parse_config`.
+    """
+
+    def __init__(self, hs: "HomeServer", config: str):
+        self.hs = hs
+        self.cache_directory = hs.config.media.media_store_path
+        self.base_directory = config
+
+    def __str__(self) -> str:
+        return "FileStorageProviderBackend[%s]" % (self.base_directory,)
+
+    async def store_file(self, path: str, file_info: FileInfo) -> None:
+        """See StorageProvider.store_file"""
+
+        primary_fname = os.path.join(self.cache_directory, path)
+        backup_fname = os.path.join(self.base_directory, path)
+
+        dirname = os.path.dirname(backup_fname)
+        os.makedirs(dirname, exist_ok=True)
+
+        # mypy needs help inferring the type of the second parameter, which is generic
+        shutil_copyfile: Callable[[str, str], str] = shutil.copyfile
+        await defer_to_thread(
+            self.hs.get_reactor(),
+            shutil_copyfile,
+            primary_fname,
+            backup_fname,
+        )
+
+    async def fetch(self, path: str, file_info: FileInfo) -> Optional[Responder]:
+        """See StorageProvider.fetch"""
+
+        backup_fname = os.path.join(self.base_directory, path)
+        if os.path.isfile(backup_fname):
+            return FileResponder(open(backup_fname, "rb"))
+
+        return None
+
+    @staticmethod
+    def parse_config(config: dict) -> str:
+        """Called on startup to parse config supplied. This should parse
+        the config and raise if there is a problem.
+
+        The returned value is passed into the constructor.
+
+        In this case we only care about a single param, the directory, so let's
+        just pull that out.
+        """
+        return Config.ensure_directory(config["directory"])
diff --git a/synapse/media/thumbnailer.py b/synapse/media/thumbnailer.py
new file mode 100644
index 0000000000..f909a4fb9a
--- /dev/null
+++ b/synapse/media/thumbnailer.py
@@ -0,0 +1,221 @@
+# Copyright 2014-2016 OpenMarket Ltd
+# Copyright 2020-2021 The Matrix.org Foundation C.I.C.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import logging
+from io import BytesIO
+from types import TracebackType
+from typing import Optional, Tuple, Type
+
+from PIL import Image
+
+logger = logging.getLogger(__name__)
+
+EXIF_ORIENTATION_TAG = 0x0112
+EXIF_TRANSPOSE_MAPPINGS = {
+    2: Image.FLIP_LEFT_RIGHT,
+    3: Image.ROTATE_180,
+    4: Image.FLIP_TOP_BOTTOM,
+    5: Image.TRANSPOSE,
+    6: Image.ROTATE_270,
+    7: Image.TRANSVERSE,
+    8: Image.ROTATE_90,
+}
+
+
+class ThumbnailError(Exception):
+    """An error occurred generating a thumbnail."""
+
+
+class Thumbnailer:
+    FORMATS = {"image/jpeg": "JPEG", "image/png": "PNG"}
+
+    @staticmethod
+    def set_limits(max_image_pixels: int) -> None:
+        Image.MAX_IMAGE_PIXELS = max_image_pixels
+
+    def __init__(self, input_path: str):
+        # Have we closed the image?
+        self._closed = False
+
+        try:
+            self.image = Image.open(input_path)
+        except OSError as e:
+            # If an error occurs opening the image, a thumbnail won't be able to
+            # be generated.
+            raise ThumbnailError from e
+        except Image.DecompressionBombError as e:
+            # If an image decompression bomb error occurs opening the image,
+            # then the image exceeds the pixel limit and a thumbnail won't
+            # be able to be generated.
+            raise ThumbnailError from e
+
+        self.width, self.height = self.image.size
+        self.transpose_method = None
+        try:
+            # We don't use ImageOps.exif_transpose since it crashes with big EXIF
+            #
+            # Ignore safety: Pillow seems to acknowledge that this method is
+            # "private, experimental, but generally widely used". Pillow 6
+            # includes a public getexif() method (no underscore) that we might
+            # consider using instead when we can bump that dependency.
+            #
+            # At the time of writing, Debian buster (currently oldstable)
+            # provides version 5.4.1. It's expected to EOL in mid-2022, see
+            # https://wiki.debian.org/DebianReleases#Production_Releases
+            image_exif = self.image._getexif()  # type: ignore
+            if image_exif is not None:
+                image_orientation = image_exif.get(EXIF_ORIENTATION_TAG)
+                assert type(image_orientation) is int
+                self.transpose_method = EXIF_TRANSPOSE_MAPPINGS.get(image_orientation)
+        except Exception as e:
+            # A lot of parsing errors can happen when parsing EXIF
+            logger.info("Error parsing image EXIF information: %s", e)
+
+    def transpose(self) -> Tuple[int, int]:
+        """Transpose the image using its EXIF Orientation tag
+
+        Returns:
+            A tuple containing the new image size in pixels as (width, height).
+        """
+        if self.transpose_method is not None:
+            # Safety: `transpose` takes an int rather than e.g. an IntEnum.
+            # self.transpose_method is set above to be a value in
+            # EXIF_TRANSPOSE_MAPPINGS, and that only contains correct values.
+            with self.image:
+                self.image = self.image.transpose(self.transpose_method)  # type: ignore[arg-type]
+            self.width, self.height = self.image.size
+            self.transpose_method = None
+            # We don't need EXIF any more
+            self.image.info["exif"] = None
+        return self.image.size
+
+    def aspect(self, max_width: int, max_height: int) -> Tuple[int, int]:
+        """Calculate the largest size that preserves aspect ratio which
+        fits within the given rectangle::
+
+            (w_in / h_in) = (w_out / h_out)
+            w_out = max(min(w_max, h_max * (w_in / h_in)), 1)
+            h_out = max(min(h_max, w_max * (h_in / w_in)), 1)
+
+        Args:
+            max_width: The largest possible width.
+            max_height: The largest possible height.
+        """
+
+        if max_width * self.height < max_height * self.width:
+            return max_width, max((max_width * self.height) // self.width, 1)
+        else:
+            return max((max_height * self.width) // self.height, 1), max_height
+
+    def _resize(self, width: int, height: int) -> Image.Image:
+        # 1-bit or 8-bit color palette images need converting to RGB
+        # otherwise they will be scaled using nearest neighbour which
+        # looks awful.
+        #
+        # If the image has transparency, use RGBA instead.
+        if self.image.mode in ["1", "L", "P"]:
+            if self.image.info.get("transparency", None) is not None:
+                with self.image:
+                    self.image = self.image.convert("RGBA")
+            else:
+                with self.image:
+                    self.image = self.image.convert("RGB")
+        return self.image.resize((width, height), Image.ANTIALIAS)
+
+    def scale(self, width: int, height: int, output_type: str) -> BytesIO:
+        """Rescales the image to the given dimensions.
+
+        Returns:
+            The bytes of the encoded image ready to be written to disk
+        """
+        with self._resize(width, height) as scaled:
+            return self._encode_image(scaled, output_type)
+
+    def crop(self, width: int, height: int, output_type: str) -> BytesIO:
+        """Rescales and crops the image to the given dimensions preserving
+        aspect::
+            (w_in / h_in) = (w_scaled / h_scaled)
+            w_scaled = max(w_out, h_out * (w_in / h_in))
+            h_scaled = max(h_out, w_out * (h_in / w_in))
+
+        Args:
+            width: The width of the cropped output image.
+            height: The height of the cropped output image.
+
+        Returns:
+            The bytes of the encoded image ready to be written to disk
+        """
+        if width * self.height > height * self.width:
+            scaled_width = width
+            scaled_height = (width * self.height) // self.width
+            crop_top = (scaled_height - height) // 2
+            crop_bottom = height + crop_top
+            crop = (0, crop_top, width, crop_bottom)
+        else:
+            scaled_width = (height * self.width) // self.height
+            scaled_height = height
+            crop_left = (scaled_width - width) // 2
+            crop_right = width + crop_left
+            crop = (crop_left, 0, crop_right, height)
+
+        with self._resize(scaled_width, scaled_height) as scaled_image:
+            with scaled_image.crop(crop) as cropped:
+                return self._encode_image(cropped, output_type)
+
+    def _encode_image(self, output_image: Image.Image, output_type: str) -> BytesIO:
+        output_bytes_io = BytesIO()
+        fmt = self.FORMATS[output_type]
+        if fmt == "JPEG":
+            output_image = output_image.convert("RGB")
+        output_image.save(output_bytes_io, fmt, quality=80)
+        return output_bytes_io
+
+    def close(self) -> None:
+        """Closes the underlying image file.
+
+        Once closed no other functions can be called.
+
+        Can be called multiple times.
+        """
+
+        if self._closed:
+            return
+
+        self._closed = True
+
+        # Since this also runs from the finalizer, we need to handle `__init__`
+        # raising an exception before it can define `self.image`.
+        image = getattr(self, "image", None)
+        if image is None:
+            return
+
+        image.close()
+
+    def __enter__(self) -> "Thumbnailer":
+        """Make `Thumbnailer` a context manager that calls `close` on
+        `__exit__`.
+        """
+        return self
+
+    def __exit__(
+        self,
+        type: Optional[Type[BaseException]],
+        value: Optional[BaseException],
+        traceback: Optional[TracebackType],
+    ) -> None:
+        self.close()
+
+    def __del__(self) -> None:
+        # Make sure we actually do close the image, rather than leak data.
+        self.close()
diff --git a/synapse/rest/media/config_resource.py b/synapse/rest/media/config_resource.py
new file mode 100644
index 0000000000..a95804d327
--- /dev/null
+++ b/synapse/rest/media/config_resource.py
@@ -0,0 +1,41 @@
+# Copyright 2018 Will Hunt <will@half-shot.uk>
+# Copyright 2020-2021 The Matrix.org Foundation C.I.C.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+from typing import TYPE_CHECKING
+
+from synapse.http.server import DirectServeJsonResource, respond_with_json
+from synapse.http.site import SynapseRequest
+
+if TYPE_CHECKING:
+    from synapse.server import HomeServer
+
+
+class MediaConfigResource(DirectServeJsonResource):
+    isLeaf = True
+
+    def __init__(self, hs: "HomeServer"):
+        super().__init__()
+        config = hs.config
+        self.clock = hs.get_clock()
+        self.auth = hs.get_auth()
+        self.limits_dict = {"m.upload.size": config.media.max_upload_size}
+
+    async def _async_render_GET(self, request: SynapseRequest) -> None:
+        await self.auth.get_user_by_req(request)
+        respond_with_json(request, 200, self.limits_dict, send_cors=True)
+
+    async def _async_render_OPTIONS(self, request: SynapseRequest) -> None:
+        respond_with_json(request, 200, {}, send_cors=True)
diff --git a/synapse/rest/media/download_resource.py b/synapse/rest/media/download_resource.py
new file mode 100644
index 0000000000..8f270cf4cc
--- /dev/null
+++ b/synapse/rest/media/download_resource.py
@@ -0,0 +1,75 @@
+# Copyright 2014-2016 OpenMarket Ltd
+# Copyright 2020-2021 The Matrix.org Foundation C.I.C.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import logging
+from typing import TYPE_CHECKING
+
+from synapse.http.server import (
+    DirectServeJsonResource,
+    set_corp_headers,
+    set_cors_headers,
+)
+from synapse.http.servlet import parse_boolean
+from synapse.http.site import SynapseRequest
+from synapse.media._base import parse_media_id, respond_404
+
+if TYPE_CHECKING:
+    from synapse.media.media_repository import MediaRepository
+    from synapse.server import HomeServer
+
+logger = logging.getLogger(__name__)
+
+
+class DownloadResource(DirectServeJsonResource):
+    isLeaf = True
+
+    def __init__(self, hs: "HomeServer", media_repo: "MediaRepository"):
+        super().__init__()
+        self.media_repo = media_repo
+        self.server_name = hs.hostname
+
+    async def _async_render_GET(self, request: SynapseRequest) -> None:
+        set_cors_headers(request)
+        set_corp_headers(request)
+        request.setHeader(
+            b"Content-Security-Policy",
+            b"sandbox;"
+            b" default-src 'none';"
+            b" script-src 'none';"
+            b" plugin-types application/pdf;"
+            b" style-src 'unsafe-inline';"
+            b" media-src 'self';"
+            b" object-src 'self';",
+        )
+        # Limited non-standard form of CSP for IE11
+        request.setHeader(b"X-Content-Security-Policy", b"sandbox;")
+        request.setHeader(
+            b"Referrer-Policy",
+            b"no-referrer",
+        )
+        server_name, media_id, name = parse_media_id(request)
+        if server_name == self.server_name:
+            await self.media_repo.get_local_media(request, media_id, name)
+        else:
+            allow_remote = parse_boolean(request, "allow_remote", default=True)
+            if not allow_remote:
+                logger.info(
+                    "Rejecting request for remote media %s/%s due to allow_remote",
+                    server_name,
+                    media_id,
+                )
+                respond_404(request)
+                return
+
+            await self.media_repo.get_remote_media(request, server_name, media_id, name)
diff --git a/synapse/rest/media/media_repository_resource.py b/synapse/rest/media/media_repository_resource.py
new file mode 100644
index 0000000000..5ebaa3b032
--- /dev/null
+++ b/synapse/rest/media/media_repository_resource.py
@@ -0,0 +1,93 @@
+# Copyright 2014-2016 OpenMarket Ltd
+# Copyright 2018-2021 The Matrix.org Foundation C.I.C.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import TYPE_CHECKING
+
+from synapse.config._base import ConfigError
+from synapse.http.server import UnrecognizedRequestResource
+
+from .config_resource import MediaConfigResource
+from .download_resource import DownloadResource
+from .preview_url_resource import PreviewUrlResource
+from .thumbnail_resource import ThumbnailResource
+from .upload_resource import UploadResource
+
+if TYPE_CHECKING:
+    from synapse.server import HomeServer
+
+
+class MediaRepositoryResource(UnrecognizedRequestResource):
+    """File uploading and downloading.
+
+    Uploads are POSTed to a resource which returns a token which is used to GET
+    the download::
+
+        => POST /_matrix/media/r0/upload HTTP/1.1
+           Content-Type: <media-type>
+           Content-Length: <content-length>
+
+           <media>
+
+        <= HTTP/1.1 200 OK
+           Content-Type: application/json
+
+           { "content_uri": "mxc://<server-name>/<media-id>" }
+
+        => GET /_matrix/media/r0/download/<server-name>/<media-id> HTTP/1.1
+
+        <= HTTP/1.1 200 OK
+           Content-Type: <media-type>
+           Content-Disposition: attachment;filename=<upload-filename>
+
+           <media>
+
+    Clients can get thumbnails by supplying a desired width and height and
+    thumbnailing method::
+
+        => GET /_matrix/media/r0/thumbnail/<server_name>
+                /<media-id>?width=<w>&height=<h>&method=<m> HTTP/1.1
+
+        <= HTTP/1.1 200 OK
+           Content-Type: image/jpeg or image/png
+
+           <thumbnail>
+
+    The thumbnail methods are "crop" and "scale". "scale" tries to return an
+    image where either the width or the height is smaller than the requested
+    size. The client should then scale and letterbox the image if it needs to
+    fit within a given rectangle. "crop" tries to return an image where the
+    width and height are close to the requested size and the aspect matches
+    the requested size. The client should scale the image if it needs to fit
+    within a given rectangle.
+    """
+
+    def __init__(self, hs: "HomeServer"):
+        # If we're not configured to use it, raise if we somehow got here.
+        if not hs.config.media.can_load_media_repo:
+            raise ConfigError("Synapse is not configured to use a media repo.")
+
+        super().__init__()
+        media_repo = hs.get_media_repository()
+
+        self.putChild(b"upload", UploadResource(hs, media_repo))
+        self.putChild(b"download", DownloadResource(hs, media_repo))
+        self.putChild(
+            b"thumbnail", ThumbnailResource(hs, media_repo, media_repo.media_storage)
+        )
+        if hs.config.media.url_preview_enabled:
+            self.putChild(
+                b"preview_url",
+                PreviewUrlResource(hs, media_repo, media_repo.media_storage),
+            )
+        self.putChild(b"config", MediaConfigResource(hs))
diff --git a/synapse/rest/media/preview_url_resource.py b/synapse/rest/media/preview_url_resource.py
new file mode 100644
index 0000000000..7ada728757
--- /dev/null
+++ b/synapse/rest/media/preview_url_resource.py
@@ -0,0 +1,869 @@
+# Copyright 2016 OpenMarket Ltd
+# Copyright 2020-2021 The Matrix.org Foundation C.I.C.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import datetime
+import errno
+import fnmatch
+import logging
+import os
+import re
+import shutil
+import sys
+import traceback
+from typing import TYPE_CHECKING, BinaryIO, Iterable, Optional, Tuple
+from urllib.parse import urljoin, urlparse, urlsplit
+from urllib.request import urlopen
+
+import attr
+
+from twisted.internet.defer import Deferred
+from twisted.internet.error import DNSLookupError
+
+from synapse.api.errors import Codes, SynapseError
+from synapse.http.client import SimpleHttpClient
+from synapse.http.server import (
+    DirectServeJsonResource,
+    respond_with_json,
+    respond_with_json_bytes,
+)
+from synapse.http.servlet import parse_integer, parse_string
+from synapse.http.site import SynapseRequest
+from synapse.logging.context import make_deferred_yieldable, run_in_background
+from synapse.media._base import FileInfo, get_filename_from_headers
+from synapse.media.media_storage import MediaStorage
+from synapse.media.oembed import OEmbedProvider
+from synapse.media.preview_html import decode_body, parse_html_to_open_graph
+from synapse.metrics.background_process_metrics import run_as_background_process
+from synapse.types import JsonDict, UserID
+from synapse.util import json_encoder
+from synapse.util.async_helpers import ObservableDeferred
+from synapse.util.caches.expiringcache import ExpiringCache
+from synapse.util.stringutils import random_string
+
+if TYPE_CHECKING:
+    from synapse.media.media_repository import MediaRepository
+    from synapse.server import HomeServer
+
+logger = logging.getLogger(__name__)
+
+OG_TAG_NAME_MAXLEN = 50
+OG_TAG_VALUE_MAXLEN = 1000
+
+ONE_HOUR = 60 * 60 * 1000
+ONE_DAY = 24 * ONE_HOUR
+IMAGE_CACHE_EXPIRY_MS = 2 * ONE_DAY
+
+
+@attr.s(slots=True, frozen=True, auto_attribs=True)
+class DownloadResult:
+    length: int
+    uri: str
+    response_code: int
+    media_type: str
+    download_name: Optional[str]
+    expires: int
+    etag: Optional[str]
+
+
+@attr.s(slots=True, frozen=True, auto_attribs=True)
+class MediaInfo:
+    """
+    Information parsed from downloading media being previewed.
+    """
+
+    # The Content-Type header of the response.
+    media_type: str
+    # The length (in bytes) of the downloaded media.
+    media_length: int
+    # The media filename, according to the server. This is parsed from the
+    # returned headers, if possible.
+    download_name: Optional[str]
+    # The time of the preview.
+    created_ts_ms: int
+    # Information from the media storage provider about where the file is stored
+    # on disk.
+    filesystem_id: str
+    filename: str
+    # The URI being previewed.
+    uri: str
+    # The HTTP response code.
+    response_code: int
+    # The timestamp (in milliseconds) of when this preview expires.
+    expires: int
+    # The ETag header of the response.
+    etag: Optional[str]
+
+
+class PreviewUrlResource(DirectServeJsonResource):
+    """
+    The `GET /_matrix/media/r0/preview_url` endpoint provides a generic preview API
+    for URLs which outputs Open Graph (https://ogp.me/) responses (with some Matrix
+    specific additions).
+
+    This does have trade-offs compared to other designs:
+
+    * Pros:
+      * Simple and flexible; can be used by any clients at any point
+    * Cons:
+      * If each homeserver provides one of these independently, all the homeservers in a
+        room may needlessly DoS the target URI
+      * The URL metadata must be stored somewhere, rather than just using Matrix
+        itself to store the media.
+      * Matrix cannot be used to distribute the metadata between homeservers.
+
+    When Synapse is asked to preview a URL it does the following:
+
+    1. Checks against a URL blacklist (defined as `url_preview_url_blacklist` in the
+       config).
+    2. Checks the URL against an in-memory cache and returns the result if it exists. (This
+       is also used to de-duplicate processing of multiple in-flight requests at once.)
+    3. Kicks off a background process to generate a preview:
+       1. Checks URL and timestamp against the database cache and returns the result if it
+          has not expired and was successful (a 2xx return code).
+       2. Checks if the URL matches an oEmbed (https://oembed.com/) pattern. If it
+          does, update the URL to download.
+       3. Downloads the URL and stores it into a file via the media storage provider
+          and saves the local media metadata.
+       4. If the media is an image:
+          1. Generates thumbnails.
+          2. Generates an Open Graph response based on image properties.
+       5. If the media is HTML:
+          1. Decodes the HTML via the stored file.
+          2. Generates an Open Graph response from the HTML.
+          3. If a JSON oEmbed URL was found in the HTML via autodiscovery:
+             1. Downloads the URL and stores it into a file via the media storage provider
+                and saves the local media metadata.
+             2. Convert the oEmbed response to an Open Graph response.
+             3. Override any Open Graph data from the HTML with data from oEmbed.
+          4. If an image exists in the Open Graph response:
+             1. Downloads the URL and stores it into a file via the media storage
+                provider and saves the local media metadata.
+             2. Generates thumbnails.
+             3. Updates the Open Graph response based on image properties.
+       6. If the media is JSON and an oEmbed URL was found:
+          1. Convert the oEmbed response to an Open Graph response.
+          2. If a thumbnail or image is in the oEmbed response:
+             1. Downloads the URL and stores it into a file via the media storage
+                provider and saves the local media metadata.
+             2. Generates thumbnails.
+             3. Updates the Open Graph response based on image properties.
+       7. Stores the result in the database cache.
+    4. Returns the result.
+
+    If any additional request (e.g. from oEmbed autodiscovery, step 5.3, or
+    image thumbnailing, step 5.4 or 6.2) fails then the URL preview as a whole
+    does not fail. As much information as possible is returned.
+
+    The in-memory cache expires after 1 hour.
+
+    Expired entries in the database cache (and their associated media files) are
+    deleted every 10 seconds. The default expiration time is 1 hour from download.
+    """
+
+    isLeaf = True
+
+    def __init__(
+        self,
+        hs: "HomeServer",
+        media_repo: "MediaRepository",
+        media_storage: MediaStorage,
+    ):
+        super().__init__()
+
+        self.auth = hs.get_auth()
+        self.clock = hs.get_clock()
+        self.filepaths = media_repo.filepaths
+        self.max_spider_size = hs.config.media.max_spider_size
+        self.server_name = hs.hostname
+        self.store = hs.get_datastores().main
+        self.client = SimpleHttpClient(  # used to download the URLs being previewed
+            hs,
+            treq_args={"browser_like_redirects": True},
+            ip_whitelist=hs.config.media.url_preview_ip_range_whitelist,
+            ip_blacklist=hs.config.media.url_preview_ip_range_blacklist,
+            use_proxy=True,
+        )
+        self.media_repo = media_repo
+        self.primary_base_path = media_repo.primary_base_path
+        self.media_storage = media_storage
+
+        self._oembed = OEmbedProvider(hs)
+
+        # We run the background jobs if we're the instance specified (or no
+        # instance is specified, where we assume there is only one instance
+        # serving media).
+        instance_running_jobs = hs.config.media.media_instance_running_background_jobs
+        self._worker_run_media_background_jobs = (
+            instance_running_jobs is None
+            or instance_running_jobs == hs.get_instance_name()
+        )
+
+        self.url_preview_url_blacklist = hs.config.media.url_preview_url_blacklist
+        self.url_preview_accept_language = hs.config.media.url_preview_accept_language
+
+        # In-memory cache mapping URLs to an ObservableDeferred which resolves to
+        # the JSON-encoded OG metadata (or to the failure, if the preview failed).
+        self._cache: ExpiringCache[str, ObservableDeferred] = ExpiringCache(
+            cache_name="url_previews",
+            clock=self.clock,
+            # don't spider URLs more often than once an hour
+            expiry_ms=ONE_HOUR,
+        )
+
+        if self._worker_run_media_background_jobs:
+            self._cleaner_loop = self.clock.looping_call(
+                self._start_expire_url_cache_data, 10 * 1000  # i.e. every 10 seconds
+            )
+
+    async def _async_render_OPTIONS(self, request: SynapseRequest) -> None:
+        request.setHeader(b"Allow", b"OPTIONS, GET")  # the methods handled here
+        respond_with_json(request, 200, {}, send_cors=True)  # empty JSON body
+
+    async def _async_render_GET(self, request: SynapseRequest) -> None:
+        # XXX: if get_user_by_req fails, what should we do in an async render?
+        requester = await self.auth.get_user_by_req(request)
+        url = parse_string(request, "url", required=True)
+        ts = parse_integer(request, "ts")
+        if ts is None:  # no timestamp requested: preview the URL as of now
+            ts = self.clock.time_msec()
+
+        # XXX: we could move this into _do_preview if we wanted.
+        url_tuple = urlsplit(url)
+        for entry in self.url_preview_url_blacklist:
+            match = True  # an entry blocks the URL only if *all* its attributes match
+            for attrib in entry:
+                pattern = entry[attrib]
+                value = getattr(url_tuple, attrib)
+                logger.debug(
+                    "Matching attrib '%s' with value '%s' against pattern '%s'",
+                    attrib,
+                    value,
+                    pattern,
+                )
+
+                if value is None:
+                    match = False
+                    continue
+
+                # Some attributes might not be parsed as strings by urlsplit (such as the
+                # port, which is parsed as an int). Because we use match functions that
+                # expect strings, we want to make sure that's what we give them.
+                value_str = str(value)
+
+                if pattern.startswith("^"):  # leading "^": treat the pattern as a regex
+                    if not re.match(pattern, value_str):
+                        match = False
+                        continue
+                else:  # otherwise treat it as a shell-style glob
+                    if not fnmatch.fnmatch(value_str, pattern):
+                        match = False
+                        continue
+            if match:
+                logger.warning("URL %s blocked by url_blacklist entry %s", url, entry)
+                raise SynapseError(
+                    403, "URL blocked by url pattern blacklist entry", Codes.UNKNOWN
+                )
+
+        # the in-memory cache:
+        # * ensures that only one request is active at a time
+        # * takes load off the DB for the thundering herds
+        # * also caches any failures (unlike the DB) so we don't keep
+        #    requesting the same endpoint
+
+        observable = self._cache.get(url)
+
+        if not observable:
+            download = run_in_background(self._do_preview, url, requester.user, ts)
+            observable = ObservableDeferred(download, consumeErrors=True)
+            self._cache[url] = observable  # stored before awaiting, so later requests share it
+        else:
+            logger.info("Returning cached response")
+
+        og = await make_deferred_yieldable(observable.observe())
+        respond_with_json_bytes(request, 200, og, send_cors=True)
+
+    async def _do_preview(self, url: str, user: UserID, ts: int) -> bytes:
+        """Check the db, and download the URL and build a preview
+
+        Args:
+            url: The URL to preview.
+            user: The user requesting the preview.
+            ts: The timestamp requested for the preview.
+
+        Returns:
+            json-encoded og data
+        """
+        # check the URL cache in the DB (which will also provide us with
+        # historical previews, if we have any)
+        cache_result = await self.store.get_url_cache(url, ts)
+        if (
+            cache_result
+            and cache_result["expires_ts"] > ts
+            and cache_result["response_code"] // 100 == 2  # any 2xx, not just 200
+        ):
+            # It may be stored as text in the database, not as bytes (such as
+            # PostgreSQL). If so, encode it back before handing it on.
+            og = cache_result["og"]
+            if isinstance(og, str):
+                og = og.encode("utf8")
+            return og
+
+        # If this URL can be accessed via oEmbed, use that instead.
+        url_to_download = url
+        oembed_url = self._oembed.get_oembed_url(url)
+        if oembed_url:
+            url_to_download = oembed_url
+
+        media_info = await self._handle_url(url_to_download, user)
+
+        logger.debug("got media_info of '%s'", media_info)
+
+        # The number of milliseconds that the response should be considered valid.
+        expiration_ms = media_info.expires
+        author_name: Optional[str] = None
+
+        if _is_media(media_info.media_type):
+            file_id = media_info.filesystem_id
+            dims = await self.media_repo._generate_thumbnails(
+                None, file_id, file_id, media_info.media_type, url_cache=True
+            )
+
+            # define our OG response for this media
+            og = {
+                "og:description": media_info.download_name,
+                "og:image": f"mxc://{self.server_name}/{media_info.filesystem_id}",
+                "og:image:type": media_info.media_type,
+                "matrix:image:size": media_info.media_length,
+            }
+
+            if dims:
+                og["og:image:width"] = dims["width"]
+                og["og:image:height"] = dims["height"]
+            else:
+                logger.warning("Couldn't get dims for %s", url)
+
+        elif _is_html(media_info.media_type):
+            # TODO: somehow stop a big HTML tree from exploding synapse's RAM
+
+            with open(media_info.filename, "rb") as file:
+                body = file.read()
+
+            tree = decode_body(body, media_info.uri, media_info.media_type)
+            if tree is not None:
+                # Check if this HTML document points to oEmbed information and
+                # defer to that.
+                oembed_url = self._oembed.autodiscover_from_html(tree)
+                og_from_oembed: JsonDict = {}
+                if oembed_url:
+                    try:
+                        oembed_info = await self._handle_url(
+                            oembed_url, user, allow_data_urls=True
+                        )
+                    except Exception as e:
+                        # Fetching the oEmbed info failed, don't block the entire URL preview.
+                        logger.warning(
+                            "oEmbed fetch failed during URL preview: %s errored with %s",
+                            oembed_url,
+                            e,
+                        )
+                    else:
+                        (
+                            og_from_oembed,
+                            author_name,
+                            expiration_ms,
+                        ) = await self._handle_oembed_response(
+                            url, oembed_info, expiration_ms
+                        )
+
+                # Parse Open Graph information from the HTML in case the oEmbed
+                # response failed or is incomplete.
+                og_from_html = parse_html_to_open_graph(tree)
+
+                # Compile the Open Graph response by using the scraped
+                # information from the HTML and overlaying any information
+                # from the oEmbed response.
+                og = {**og_from_html, **og_from_oembed}
+
+                await self._precache_image_url(user, media_info, og)
+            else:
+                og = {}
+
+        elif oembed_url:
+            # Handle the oEmbed information.
+            og, author_name, expiration_ms = await self._handle_oembed_response(
+                url, media_info, expiration_ms
+            )
+            await self._precache_image_url(user, media_info, og)
+
+        else:
+            logger.warning("Failed to find any OG data in %s", url)
+            og = {}
+
+        # If we don't have a title but we have author_name, copy it as
+        # title
+        if not og.get("og:title") and author_name:
+            og["og:title"] = author_name
+
+        # filter out any stupidly long values
+        keys_to_remove = []
+        for k, v in og.items():
+            # values can be numeric as well as strings, hence the cast to str
+            if len(k) > OG_TAG_NAME_MAXLEN or len(str(v)) > OG_TAG_VALUE_MAXLEN:
+                logger.warning(
+                    "Pruning overlong tag %s from OG data", k[:OG_TAG_NAME_MAXLEN]
+                )
+                keys_to_remove.append(k)
+        for k in keys_to_remove:  # deleted afterwards: can't mutate og while iterating it
+            del og[k]
+
+        logger.debug("Calculated OG for %s as %s", url, og)
+
+        jsonog = json_encoder.encode(og)
+
+        # Cap the amount of time to consider a response valid.
+        expiration_ms = min(expiration_ms, ONE_DAY)
+
+        # store OG in history-aware DB cache
+        await self.store.store_url_cache(
+            url,
+            media_info.response_code,
+            media_info.etag,
+            media_info.created_ts_ms + expiration_ms,
+            jsonog,
+            media_info.filesystem_id,
+            media_info.created_ts_ms,
+        )
+
+        return jsonog.encode("utf8")
+
+    async def _download_url(self, url: str, output_stream: BinaryIO) -> DownloadResult:
+        """
+        Fetches a remote URL and parses the headers.
+
+        Args:
+             url: The URL to fetch.
+             output_stream: The stream to write the content to.
+
+        Returns:
+            A DownloadResult containing:
+                Media length, URL downloaded, the HTTP response code,
+                the media type, the downloaded file name, the number of
+                milliseconds the result is valid for, the etag header.
+        """
+
+        try:
+            logger.debug("Trying to get preview for url '%s'", url)
+            length, headers, uri, code = await self.client.get_file(
+                url,
+                output_stream=output_stream,
+                max_size=self.max_spider_size,
+                headers={
+                    b"Accept-Language": self.url_preview_accept_language,
+                    # Use a custom user agent for the preview because some sites will only return
+                    # Open Graph metadata to crawler user agents. Omit the Synapse version
+                    # string to avoid leaking information.
+                    b"User-Agent": [
+                        "Synapse (bot; +https://github.com/matrix-org/synapse)"
+                    ],
+                },
+                is_allowed_content_type=_is_previewable,  # HTML, images and JSON only
+            )
+        except SynapseError:
+            # Pass SynapseErrors through directly, so that the servlet
+            # handler will return a SynapseError to the client instead of
+            # blank data or a 500.
+            raise
+        except DNSLookupError:
+            # DNS lookup returned no results
+            # Note: This will also be the case if one of the resolved IP
+            # addresses is blacklisted
+            raise SynapseError(
+                502,
+                "DNS resolution failure during URL preview generation",
+                Codes.UNKNOWN,
+            )
+        except Exception as e:
+            # FIXME: pass through 404s and other error messages nicely
+            logger.warning("Error downloading %s: %r", url, e)
+
+            raise SynapseError(
+                500,
+                "Failed to download content: %s"
+                % (traceback.format_exception_only(sys.exc_info()[0], e),),
+                Codes.UNKNOWN,
+            )
+
+        if b"Content-Type" in headers:
+            media_type = headers[b"Content-Type"][0].decode("ascii")
+        else:  # no Content-Type header: fall back to a generic binary type
+            media_type = "application/octet-stream"
+
+        download_name = get_filename_from_headers(headers)
+
+        # FIXME: we should calculate a proper expiration based on the
+        # Cache-Control and Expire headers.  But for now, assume 1 hour.
+        expires = ONE_HOUR
+        etag = headers[b"ETag"][0].decode("ascii") if b"ETag" in headers else None
+
+        return DownloadResult(
+            length, uri, code, media_type, download_name, expires, etag
+        )
+
+    async def _parse_data_url(
+        self, url: str, output_stream: BinaryIO
+    ) -> DownloadResult:
+        """
+        Parses a data: URL.
+
+        Args:
+             url: The URL to parse.
+             output_stream: The stream to write the content to.
+
+        Returns:
+            A DownloadResult containing:
+                Media length, URL downloaded, the HTTP response code,
+                the media type, the downloaded file name, the number of
+                milliseconds the result is valid for, the etag header.
+        """
+
+        try:
+            logger.debug("Trying to parse data url '%s'", url)
+            with urlopen(url) as url_info:
+                # TODO: Can this be made more efficient? (It reads the whole payload at once.)
+                output_stream.write(url_info.read())
+        except Exception as e:
+            logger.warning("Error parsing data: URL %s: %r", url, e)
+
+            raise SynapseError(
+                500,
+                "Failed to parse data URL: %s"
+                % (traceback.format_exception_only(sys.exc_info()[0], e),),
+                Codes.UNKNOWN,
+            )
+
+        return DownloadResult(
+            # Read back the length that has been written.
+            length=output_stream.tell(),
+            uri=url,
+            # If it was parsed, consider this a 200 OK.
+            response_code=200,
+            # urlopen shoves the media-type from the data URL into the content type
+            # header object.
+            media_type=url_info.headers.get_content_type(),
+            # Some features are not supported by data: URLs.
+            download_name=None,
+            expires=ONE_HOUR,
+            etag=None,
+        )
+
+    async def _handle_url(
+        self, url: str, user: UserID, allow_data_urls: bool = False
+    ) -> MediaInfo:
+        """
+        Fetches content from a URL and parses the result to generate a MediaInfo.
+
+        It uses the media storage provider to persist the fetched content and
+        stores the mapping into the database.
+
+        Args:
+             url: The URL to fetch.
+             user: The user who has requested this URL.
+             allow_data_urls: True if data URLs should be allowed.
+
+        Returns:
+            A MediaInfo object describing the fetched content.
+        """
+
+        # TODO: we should probably honour robots.txt... except in practice
+        # we're most likely being explicitly triggered by a human rather than a
+        # bot, so are we really a robot?
+
+        file_id = datetime.date.today().isoformat() + "_" + random_string(16)
+
+        file_info = FileInfo(server_name=None, file_id=file_id, url_cache=True)
+
+        with self.media_storage.store_into_file(file_info) as (f, fname, finish):
+            if url.startswith("data:"):
+                if not allow_data_urls:
+                    raise SynapseError(
+                        500, "Previewing of data: URLs is forbidden", Codes.UNKNOWN
+                    )
+
+                download_result = await self._parse_data_url(url, f)
+            else:
+                download_result = await self._download_url(url, f)
+
+            await finish()
+
+        try:
+            time_now_ms = self.clock.time_msec()
+
+            await self.store.store_local_media(
+                media_id=file_id,
+                media_type=download_result.media_type,
+                time_now_ms=time_now_ms,
+                upload_name=download_result.download_name,
+                media_length=download_result.length,
+                user_id=user,
+                url_cache=url,
+            )
+
+        except Exception as e:
+            logger.error("Error handling downloaded %s: %r", url, e)
+            # TODO: we really ought to delete the downloaded file in this
+            # case, since we won't have recorded it in the db, and will
+            # therefore not expire it.
+            raise
+
+        return MediaInfo(
+            media_type=download_result.media_type,
+            media_length=download_result.length,
+            download_name=download_result.download_name,
+            created_ts_ms=time_now_ms,
+            filesystem_id=file_id,
+            filename=fname,
+            uri=download_result.uri,
+            response_code=download_result.response_code,
+            expires=download_result.expires,
+            etag=download_result.etag,
+        )
+
+    async def _precache_image_url(
+        self, user: UserID, media_info: MediaInfo, og: JsonDict
+    ) -> None:
+        """
+        Pre-cache the image (if one exists) for posterity.
+
+        Args:
+            user: The user requesting the preview.
+            media_info: The media being previewed.
+            og: The Open Graph dictionary. This is modified with image information.
+        """
+        # If there's no image or it is blank, there's nothing to do.
+        if "og:image" not in og:
+            return
+
+        # Remove the raw image URL, this will be replaced with an MXC URL, if successful.
+        image_url = og.pop("og:image")
+        if not image_url:
+            return
+
+        # The image URL from the HTML might be relative to the previewed page,
+        # convert it to a URL which can be requested directly.
+        url_parts = urlparse(image_url)
+        if url_parts.scheme != "data":  # data: URLs are left exactly as they are
+            image_url = urljoin(media_info.uri, image_url)
+
+        # FIXME: it might be cleaner to use the same flow as the main /preview_url
+        # request itself and benefit from the same caching etc.  But for now we
+        # just rely on the caching on the master request to speed things up.
+        try:
+            image_info = await self._handle_url(image_url, user, allow_data_urls=True)
+        except Exception as e:
+            # Pre-caching the image failed, don't block the entire URL preview.
+            logger.warning(
+                "Pre-caching image failed during URL preview: %s errored with %s",
+                image_url,
+                e,
+            )
+            return
+
+        if _is_media(image_info.media_type):  # otherwise og is left without an og:image
+            # TODO: make sure we don't choke on white-on-transparent images
+            file_id = image_info.filesystem_id
+            dims = await self.media_repo._generate_thumbnails(
+                None, file_id, file_id, image_info.media_type, url_cache=True
+            )
+            if dims:
+                og["og:image:width"] = dims["width"]
+                og["og:image:height"] = dims["height"]
+            else:
+                logger.warning("Couldn't get dims for %s", image_url)
+
+            og["og:image"] = f"mxc://{self.server_name}/{image_info.filesystem_id}"
+            og["og:image:type"] = image_info.media_type
+            og["matrix:image:size"] = image_info.media_length
+
+    async def _handle_oembed_response(
+        self, url: str, media_info: MediaInfo, expiration_ms: int
+    ) -> Tuple[JsonDict, Optional[str], int]:
+        """
+        Parse the downloaded oEmbed info.
+
+        Args:
+            url: The URL which is being previewed (not the one which was
+                requested).
+            media_info: The media being previewed.
+            expiration_ms: The length of time, in milliseconds, the media is valid for.
+
+        Returns:
+            A tuple of:
+                The Open Graph dictionary, if the oEmbed info can be parsed.
+                The author name if it could be retrieved from oEmbed.
+                The (possibly updated) length of time, in milliseconds, the media is valid for.
+        """
+        # If JSON was not returned, there's nothing to do.
+        if not _is_json(media_info.media_type):
+            return {}, None, expiration_ms
+
+        with open(media_info.filename, "rb") as file:  # the body was saved to disk earlier
+            body = file.read()
+
+        oembed_response = self._oembed.parse_oembed_response(url, body)
+        open_graph_result = oembed_response.open_graph_result
+
+        # Use the cache age from the oEmbed result, if one was given.
+        if open_graph_result and oembed_response.cache_age is not None:
+            expiration_ms = oembed_response.cache_age  # NOTE(review): assumed to be ms, confirm
+
+        return open_graph_result, oembed_response.author_name, expiration_ms
+
+    def _start_expire_url_cache_data(self) -> Deferred:  # target of the looping call
+        return run_as_background_process(
+            "expire_url_cache_data", self._expire_url_cache_data
+        )
+
+    async def _expire_url_cache_data(self) -> None:
+        """Clean up expired url cache content, media and thumbnails."""
+
+        assert self._worker_run_media_background_jobs
+
+        now = self.clock.time_msec()
+
+        logger.debug("Running url preview cache expiry")
+
+        def try_remove_parent_dirs(dirs: Iterable[str]) -> None:
+            """Attempt to remove the given chain of parent directories
+
+            Args:
+                dirs: The list of directory paths to delete, with children appearing
+                    before their parents.
+            """
+            for dir in dirs:
+                try:
+                    os.rmdir(dir)
+                except FileNotFoundError:
+                    # Already deleted, continue with deleting the rest
+                    pass
+                except OSError as e:
+                    # Failed: stop here, logging unless the directory was just non-empty
+                    if e.errno != errno.ENOTEMPTY:
+                        logger.warning(
+                            "Failed to remove media directory while clearing url preview cache: %r: %s",
+                            dir,
+                            e,
+                        )
+                    break
+
+        # First we delete expired url cache entries
+        media_ids = await self.store.get_expired_url_cache(now)
+
+        removed_media = []
+        for media_id in media_ids:
+            fname = self.filepaths.url_cache_filepath(media_id)
+            try:
+                os.remove(fname)
+            except FileNotFoundError:
+                pass  # Already gone, so there is nothing to remove
+            except OSError as e:
+                logger.warning(
+                    "Failed to remove media while clearing url preview cache: %r: %s",
+                    media_id,
+                    e,
+                )
+                continue  # not added to removed_media, so its DB entry is kept
+
+            removed_media.append(media_id)
+
+            dirs = self.filepaths.url_cache_filepath_dirs_to_delete(media_id)
+            try_remove_parent_dirs(dirs)
+
+        await self.store.delete_url_cache(removed_media)
+
+        if removed_media:
+            logger.debug(
+                "Deleted %d entries from url preview cache", len(removed_media)
+            )
+        else:
+            logger.debug("No entries removed from url preview cache")
+
+        # Now we delete old images associated with the url cache.
+        # These may be cached for a bit on the client (i.e., they
+        # may have a room open with a preview url thing open).
+        # So we wait a couple of days before deleting, just in case.
+        expire_before = now - IMAGE_CACHE_EXPIRY_MS
+        media_ids = await self.store.get_url_cache_media_before(expire_before)
+
+        removed_media = []
+        for media_id in media_ids:
+            fname = self.filepaths.url_cache_filepath(media_id)
+            try:
+                os.remove(fname)
+            except FileNotFoundError:
+                pass  # Already gone, so there is nothing to remove
+            except OSError as e:
+                logger.warning(
+                    "Failed to remove media from url preview cache: %r: %s", media_id, e
+                )
+                continue  # not added to removed_media, so its DB entry is kept
+
+            dirs = self.filepaths.url_cache_filepath_dirs_to_delete(media_id)
+            try_remove_parent_dirs(dirs)
+
+            thumbnail_dir = self.filepaths.url_cache_thumbnail_directory(media_id)
+            try:
+                shutil.rmtree(thumbnail_dir)
+            except FileNotFoundError:
+                pass  # No thumbnail directory, so there is nothing to remove
+            except OSError as e:
+                logger.warning(
+                    "Failed to remove media from url preview cache: %r: %s", media_id, e
+                )
+                continue  # not added to removed_media, so its DB entry is kept
+
+            removed_media.append(media_id)
+
+            dirs = self.filepaths.url_cache_thumbnail_dirs_to_delete(media_id)
+            # Note that one of the directories to be deleted has already been
+            # removed by the `rmtree` above.
+            try_remove_parent_dirs(dirs)
+
+        await self.store.delete_url_cache_media(removed_media)
+
+        if removed_media:
+            logger.debug("Deleted %d media from url preview cache", len(removed_media))
+        else:
+            logger.debug("No media removed from url preview cache")
+
+
+def _is_media(content_type: str) -> bool:  # "media" here means any image/* type
+    return content_type.lower().startswith("image/")
+
+
+def _is_html(content_type: str) -> bool:  # True for HTML and XHTML content types
+    content_type = content_type.lower()  # compare case-insensitively
+    return content_type.startswith("text/html") or content_type.startswith(
+        "application/xhtml"
+    )
+
+
+def _is_json(content_type: str) -> bool:  # prefix match, so "; charset=..." is accepted
+    return content_type.lower().startswith("application/json")
+
+
+def _is_previewable(content_type: str) -> bool:
+    """Returns True for content types for which we will perform URL preview (HTML,
+    images and JSON) and False otherwise."""
+
+    return _is_html(content_type) or _is_media(content_type) or _is_json(content_type)
diff --git a/synapse/rest/media/thumbnail_resource.py b/synapse/rest/media/thumbnail_resource.py
new file mode 100644
index 0000000000..4ee2a0dbda
--- /dev/null
+++ b/synapse/rest/media/thumbnail_resource.py
@@ -0,0 +1,554 @@
+# Copyright 2014-2016 OpenMarket Ltd
+# Copyright 2020-2021 The Matrix.org Foundation C.I.C.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+import logging
+from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple
+
+from synapse.api.errors import Codes, SynapseError, cs_error
+from synapse.config.repository import THUMBNAIL_SUPPORTED_MEDIA_FORMAT_MAP
+from synapse.http.server import (
+    DirectServeJsonResource,
+    respond_with_json,
+    set_corp_headers,
+    set_cors_headers,
+)
+from synapse.http.servlet import parse_integer, parse_string
+from synapse.http.site import SynapseRequest
+from synapse.media._base import (
+    FileInfo,
+    ThumbnailInfo,
+    parse_media_id,
+    respond_404,
+    respond_with_file,
+    respond_with_responder,
+)
+from synapse.media.media_storage import MediaStorage
+
+if TYPE_CHECKING:
+    from synapse.media.media_repository import MediaRepository
+    from synapse.server import HomeServer
+
+logger = logging.getLogger(__name__)
+
+
+class ThumbnailResource(DirectServeJsonResource):
+    isLeaf = True
+
+    def __init__(
+        self,
+        hs: "HomeServer",
+        media_repo: "MediaRepository",
+        media_storage: MediaStorage,
+    ):
+        super().__init__()
+
+        self.store = hs.get_datastores().main
+        self.media_repo = media_repo
+        self.media_storage = media_storage
+        self.dynamic_thumbnails = hs.config.media.dynamic_thumbnails
+        self.server_name = hs.hostname
+
+    async def _async_render_GET(self, request: SynapseRequest) -> None:
+        set_cors_headers(request)
+        set_corp_headers(request)
+        server_name, media_id, _ = parse_media_id(request)
+        width = parse_integer(request, "width", required=True)
+        height = parse_integer(request, "height", required=True)
+        method = parse_string(request, "method", "scale")
+        # TODO Parse the Accept header to get a prioritised list of thumbnail types.
+        m_type = "image/png"
+
+        if server_name == self.server_name:
+            if self.dynamic_thumbnails:
+                await self._select_or_generate_local_thumbnail(
+                    request, media_id, width, height, method, m_type
+                )
+            else:
+                await self._respond_local_thumbnail(
+                    request, media_id, width, height, method, m_type
+                )
+            self.media_repo.mark_recently_accessed(None, media_id)
+        else:
+            if self.dynamic_thumbnails:
+                await self._select_or_generate_remote_thumbnail(
+                    request, server_name, media_id, width, height, method, m_type
+                )
+            else:
+                await self._respond_remote_thumbnail(
+                    request, server_name, media_id, width, height, method, m_type
+                )
+            self.media_repo.mark_recently_accessed(server_name, media_id)
+
+    async def _respond_local_thumbnail(
+        self,
+        request: SynapseRequest,
+        media_id: str,
+        width: int,
+        height: int,
+        method: str,
+        m_type: str,
+    ) -> None:
+        media_info = await self.store.get_local_media(media_id)
+
+        if not media_info:
+            respond_404(request)
+            return
+        if media_info["quarantined_by"]:
+            logger.info("Media is quarantined")
+            respond_404(request)
+            return
+
+        thumbnail_infos = await self.store.get_local_media_thumbnails(media_id)
+        await self._select_and_respond_with_thumbnail(
+            request,
+            width,
+            height,
+            method,
+            m_type,
+            thumbnail_infos,
+            media_id,
+            media_id,
+            url_cache=bool(media_info["url_cache"]),
+            server_name=None,
+        )
+
+    async def _select_or_generate_local_thumbnail(
+        self,
+        request: SynapseRequest,
+        media_id: str,
+        desired_width: int,
+        desired_height: int,
+        desired_method: str,
+        desired_type: str,
+    ) -> None:
+        media_info = await self.store.get_local_media(media_id)
+
+        if not media_info:
+            respond_404(request)
+            return
+        if media_info["quarantined_by"]:
+            logger.info("Media is quarantined")
+            respond_404(request)
+            return
+
+        thumbnail_infos = await self.store.get_local_media_thumbnails(media_id)
+        for info in thumbnail_infos:
+            t_w = info["thumbnail_width"] == desired_width
+            t_h = info["thumbnail_height"] == desired_height
+            t_method = info["thumbnail_method"] == desired_method
+            t_type = info["thumbnail_type"] == desired_type
+
+            if t_w and t_h and t_method and t_type:
+                file_info = FileInfo(
+                    server_name=None,
+                    file_id=media_id,
+                    url_cache=media_info["url_cache"],
+                    thumbnail=ThumbnailInfo(
+                        width=info["thumbnail_width"],
+                        height=info["thumbnail_height"],
+                        type=info["thumbnail_type"],
+                        method=info["thumbnail_method"],
+                    ),
+                )
+
+                t_type = file_info.thumbnail_type
+                t_length = info["thumbnail_length"]
+
+                responder = await self.media_storage.fetch_media(file_info)
+                if responder:
+                    await respond_with_responder(request, responder, t_type, t_length)
+                    return
+
+        logger.debug("We don't have a thumbnail of that size. Generating")
+
+        # Okay, so we generate one.
+        file_path = await self.media_repo.generate_local_exact_thumbnail(
+            media_id,
+            desired_width,
+            desired_height,
+            desired_method,
+            desired_type,
+            url_cache=bool(media_info["url_cache"]),
+        )
+
+        if file_path:
+            await respond_with_file(request, desired_type, file_path)
+        else:
+            logger.warning("Failed to generate thumbnail")
+            raise SynapseError(400, "Failed to generate thumbnail.")
+
+    async def _select_or_generate_remote_thumbnail(
+        self,
+        request: SynapseRequest,
+        server_name: str,
+        media_id: str,
+        desired_width: int,
+        desired_height: int,
+        desired_method: str,
+        desired_type: str,
+    ) -> None:
+        media_info = await self.media_repo.get_remote_media_info(server_name, media_id)
+
+        thumbnail_infos = await self.store.get_remote_media_thumbnails(
+            server_name, media_id
+        )
+
+        file_id = media_info["filesystem_id"]
+
+        for info in thumbnail_infos:
+            t_w = info["thumbnail_width"] == desired_width
+            t_h = info["thumbnail_height"] == desired_height
+            t_method = info["thumbnail_method"] == desired_method
+            t_type = info["thumbnail_type"] == desired_type
+
+            if t_w and t_h and t_method and t_type:
+                file_info = FileInfo(
+                    server_name=server_name,
+                    file_id=media_info["filesystem_id"],
+                    thumbnail=ThumbnailInfo(
+                        width=info["thumbnail_width"],
+                        height=info["thumbnail_height"],
+                        type=info["thumbnail_type"],
+                        method=info["thumbnail_method"],
+                    ),
+                )
+
+                t_type = file_info.thumbnail_type
+                t_length = info["thumbnail_length"]
+
+                responder = await self.media_storage.fetch_media(file_info)
+                if responder:
+                    await respond_with_responder(request, responder, t_type, t_length)
+                    return
+
+        logger.debug("We don't have a thumbnail of that size. Generating")
+
+        # Okay, so we generate one.
+        file_path = await self.media_repo.generate_remote_exact_thumbnail(
+            server_name,
+            file_id,
+            media_id,
+            desired_width,
+            desired_height,
+            desired_method,
+            desired_type,
+        )
+
+        if file_path:
+            await respond_with_file(request, desired_type, file_path)
+        else:
+            logger.warning("Failed to generate thumbnail")
+            raise SynapseError(400, "Failed to generate thumbnail.")
+
+    async def _respond_remote_thumbnail(
+        self,
+        request: SynapseRequest,
+        server_name: str,
+        media_id: str,
+        width: int,
+        height: int,
+        method: str,
+        m_type: str,
+    ) -> None:
+        # TODO: Don't download the whole remote file
+        # We should proxy the thumbnail from the remote server instead of
+        # downloading the remote file and generating our own thumbnails.
+        media_info = await self.media_repo.get_remote_media_info(server_name, media_id)
+
+        thumbnail_infos = await self.store.get_remote_media_thumbnails(
+            server_name, media_id
+        )
+        await self._select_and_respond_with_thumbnail(
+            request,
+            width,
+            height,
+            method,
+            m_type,
+            thumbnail_infos,
+            media_id,
+            media_info["filesystem_id"],
+            url_cache=False,
+            server_name=server_name,
+        )
+
+    async def _select_and_respond_with_thumbnail(
+        self,
+        request: SynapseRequest,
+        desired_width: int,
+        desired_height: int,
+        desired_method: str,
+        desired_type: str,
+        thumbnail_infos: List[Dict[str, Any]],
+        media_id: str,
+        file_id: str,
+        url_cache: bool,
+        server_name: Optional[str] = None,
+    ) -> None:
+        """
+        Respond to a request with an appropriate thumbnail from the previously generated thumbnails.
+
+        Args:
+            request: The incoming request.
+            desired_width: The desired width, the returned thumbnail may be larger than this.
+            desired_height: The desired height, the returned thumbnail may be larger than this.
+            desired_method: The desired method used to generate the thumbnail.
+            desired_type: The desired content-type of the thumbnail.
+            thumbnail_infos: A list of dictionaries of candidate thumbnails.
+            file_id: The ID of the media that a thumbnail is being requested for.
+            url_cache: True if this is from a URL cache.
+            server_name: The server name, if this is a remote thumbnail.
+        """
+        logger.debug(
+            "_select_and_respond_with_thumbnail: media_id=%s desired=%sx%s (%s) thumbnail_infos=%s",
+            media_id,
+            desired_width,
+            desired_height,
+            desired_method,
+            thumbnail_infos,
+        )
+
+        # If `dynamic_thumbnails` is enabled, we expect Synapse to go down a
+        # different code path to handle it.
+        assert not self.dynamic_thumbnails
+
+        if thumbnail_infos:
+            file_info = self._select_thumbnail(
+                desired_width,
+                desired_height,
+                desired_method,
+                desired_type,
+                thumbnail_infos,
+                file_id,
+                url_cache,
+                server_name,
+            )
+            if not file_info:
+                logger.info("Couldn't find a thumbnail matching the desired inputs")
+                respond_404(request)
+                return
+
+            # The thumbnail property must exist.
+            assert file_info.thumbnail is not None
+
+            responder = await self.media_storage.fetch_media(file_info)
+            if responder:
+                await respond_with_responder(
+                    request,
+                    responder,
+                    file_info.thumbnail.type,
+                    file_info.thumbnail.length,
+                )
+                return
+
+            # If we can't find the thumbnail we regenerate it. This can happen
+            # if e.g. we've deleted the thumbnails but still have the original
+            # image somewhere.
+            #
+            # Since we have an entry for the thumbnail in the DB we a) know we
+            # have successfully generated the thumbnail in the past (so we
+            # don't need to worry about repeatedly failing to generate
+            # thumbnails), and b) have already calculated the appropriate
+            # width/height/method so we can just call the "generate exact"
+            # methods.
+
+            # First let's check that we do actually have the original image
+            # still. This will throw a 404 if we don't.
+            # TODO: We should refetch the thumbnails for remote media.
+            await self.media_storage.ensure_media_is_in_local_cache(
+                FileInfo(server_name, file_id, url_cache=url_cache)
+            )
+
+            if server_name:
+                await self.media_repo.generate_remote_exact_thumbnail(
+                    server_name,
+                    file_id=file_id,
+                    media_id=media_id,
+                    t_width=file_info.thumbnail.width,
+                    t_height=file_info.thumbnail.height,
+                    t_method=file_info.thumbnail.method,
+                    t_type=file_info.thumbnail.type,
+                )
+            else:
+                await self.media_repo.generate_local_exact_thumbnail(
+                    media_id=media_id,
+                    t_width=file_info.thumbnail.width,
+                    t_height=file_info.thumbnail.height,
+                    t_method=file_info.thumbnail.method,
+                    t_type=file_info.thumbnail.type,
+                    url_cache=url_cache,
+                )
+
+            responder = await self.media_storage.fetch_media(file_info)
+            await respond_with_responder(
+                request,
+                responder,
+                file_info.thumbnail.type,
+                file_info.thumbnail.length,
+            )
+        else:
+            # This might be because:
+            # 1. We can't create thumbnails for the given media (corrupted or
+            #    unsupported file type), or
+            # 2. The thumbnailing process never ran or errored out initially
+            #    when the media was first uploaded (these bugs should be
+            #    reported and fixed).
+            # Note that we don't attempt to generate a thumbnail now because
+            # `dynamic_thumbnails` is disabled.
+            logger.info("Failed to find any generated thumbnails")
+
+            respond_with_json(
+                request,
+                400,
+                cs_error(
+                    "Cannot find any thumbnails for the requested media (%r). This might mean the media is not a supported_media_format=(%s) or that thumbnailing failed for some other reason. (Dynamic thumbnails are disabled on this server.)"
+                    % (
+                        request.postpath,
+                        ", ".join(THUMBNAIL_SUPPORTED_MEDIA_FORMAT_MAP.keys()),
+                    ),
+                    code=Codes.UNKNOWN,
+                ),
+                send_cors=True,
+            )
+
+    def _select_thumbnail(
+        self,
+        desired_width: int,
+        desired_height: int,
+        desired_method: str,
+        desired_type: str,
+        thumbnail_infos: List[Dict[str, Any]],
+        file_id: str,
+        url_cache: bool,
+        server_name: Optional[str],
+    ) -> Optional[FileInfo]:
+        """
+        Choose an appropriate thumbnail from the previously generated thumbnails.
+
+        Args:
+            desired_width: The desired width, the returned thumbnail may be larger than this.
+            desired_height: The desired height, the returned thumbnail may be larger than this.
+            desired_method: The desired method used to generate the thumbnail.
+            desired_type: The desired content-type of the thumbnail.
+            thumbnail_infos: A list of dictionaries of candidate thumbnails.
+            file_id: The ID of the media that a thumbnail is being requested for.
+            url_cache: True if this is from a URL cache.
+            server_name: The server name, if this is a remote thumbnail.
+
+        Returns:
+             The thumbnail which best matches the desired parameters.
+        """
+        desired_method = desired_method.lower()
+
+        # The chosen thumbnail.
+        thumbnail_info = None
+
+        d_w = desired_width
+        d_h = desired_height
+
+        if desired_method == "crop":
+            # Thumbnails that match equal or larger sizes of desired width/height.
+            crop_info_list: List[Tuple[int, int, int, bool, int, Dict[str, Any]]] = []
+            # Other thumbnails.
+            crop_info_list2: List[Tuple[int, int, int, bool, int, Dict[str, Any]]] = []
+            for info in thumbnail_infos:
+                # Skip thumbnails generated with different methods.
+                if info["thumbnail_method"] != "crop":
+                    continue
+
+                t_w = info["thumbnail_width"]
+                t_h = info["thumbnail_height"]
+                aspect_quality = abs(d_w * t_h - d_h * t_w)
+                min_quality = 0 if d_w <= t_w and d_h <= t_h else 1
+                size_quality = abs((d_w - t_w) * (d_h - t_h))
+                type_quality = desired_type != info["thumbnail_type"]
+                length_quality = info["thumbnail_length"]
+                if t_w >= d_w or t_h >= d_h:
+                    crop_info_list.append(
+                        (
+                            aspect_quality,
+                            min_quality,
+                            size_quality,
+                            type_quality,
+                            length_quality,
+                            info,
+                        )
+                    )
+                else:
+                    crop_info_list2.append(
+                        (
+                            aspect_quality,
+                            min_quality,
+                            size_quality,
+                            type_quality,
+                            length_quality,
+                            info,
+                        )
+                    )
+            # Pick the most appropriate thumbnail. Some values of `desired_width` and
+            # `desired_height` may result in a tie, in which case we avoid comparing on
+            # the thumbnail info dictionary and pick the thumbnail that appears earlier
+            # in the list of candidates.
+            if crop_info_list:
+                thumbnail_info = min(crop_info_list, key=lambda t: t[:-1])[-1]
+            elif crop_info_list2:
+                thumbnail_info = min(crop_info_list2, key=lambda t: t[:-1])[-1]
+        elif desired_method == "scale":
+            # Thumbnails that match equal or larger sizes of desired width/height.
+            info_list: List[Tuple[int, bool, int, Dict[str, Any]]] = []
+            # Other thumbnails.
+            info_list2: List[Tuple[int, bool, int, Dict[str, Any]]] = []
+
+            for info in thumbnail_infos:
+                # Skip thumbnails generated with different methods.
+                if info["thumbnail_method"] != "scale":
+                    continue
+
+                t_w = info["thumbnail_width"]
+                t_h = info["thumbnail_height"]
+                size_quality = abs((d_w - t_w) * (d_h - t_h))
+                type_quality = desired_type != info["thumbnail_type"]
+                length_quality = info["thumbnail_length"]
+                if t_w >= d_w or t_h >= d_h:
+                    info_list.append((size_quality, type_quality, length_quality, info))
+                else:
+                    info_list2.append(
+                        (size_quality, type_quality, length_quality, info)
+                    )
+            # Pick the most appropriate thumbnail. Some values of `desired_width` and
+            # `desired_height` may result in a tie, in which case we avoid comparing on
+            # the thumbnail info dictionary and pick the thumbnail that appears earlier
+            # in the list of candidates.
+            if info_list:
+                thumbnail_info = min(info_list, key=lambda t: t[:-1])[-1]
+            elif info_list2:
+                thumbnail_info = min(info_list2, key=lambda t: t[:-1])[-1]
+
+        if thumbnail_info:
+            return FileInfo(
+                file_id=file_id,
+                url_cache=url_cache,
+                server_name=server_name,
+                thumbnail=ThumbnailInfo(
+                    width=thumbnail_info["thumbnail_width"],
+                    height=thumbnail_info["thumbnail_height"],
+                    type=thumbnail_info["thumbnail_type"],
+                    method=thumbnail_info["thumbnail_method"],
+                    length=thumbnail_info["thumbnail_length"],
+                ),
+            )
+
+        # No matching thumbnail was found.
+        return None
diff --git a/synapse/rest/media/upload_resource.py b/synapse/rest/media/upload_resource.py
new file mode 100644
index 0000000000..697348613b
--- /dev/null
+++ b/synapse/rest/media/upload_resource.py
@@ -0,0 +1,108 @@
+# Copyright 2014-2016 OpenMarket Ltd
+# Copyright 2020-2021 The Matrix.org Foundation C.I.C.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import logging
+from typing import IO, TYPE_CHECKING, Dict, List, Optional
+
+from synapse.api.errors import Codes, SynapseError
+from synapse.http.server import DirectServeJsonResource, respond_with_json
+from synapse.http.servlet import parse_bytes_from_args
+from synapse.http.site import SynapseRequest
+from synapse.media.media_storage import SpamMediaException
+
+if TYPE_CHECKING:
+    from synapse.media.media_repository import MediaRepository
+    from synapse.server import HomeServer
+
+logger = logging.getLogger(__name__)
+
+
+class UploadResource(DirectServeJsonResource):
+    isLeaf = True
+
+    def __init__(self, hs: "HomeServer", media_repo: "MediaRepository"):
+        super().__init__()
+
+        self.media_repo = media_repo
+        self.filepaths = media_repo.filepaths
+        self.store = hs.get_datastores().main
+        self.clock = hs.get_clock()
+        self.server_name = hs.hostname
+        self.auth = hs.get_auth()
+        self.max_upload_size = hs.config.media.max_upload_size
+        self.clock = hs.get_clock()
+
+    async def _async_render_OPTIONS(self, request: SynapseRequest) -> None:
+        respond_with_json(request, 200, {}, send_cors=True)
+
+    async def _async_render_POST(self, request: SynapseRequest) -> None:
+        requester = await self.auth.get_user_by_req(request)
+        raw_content_length = request.getHeader("Content-Length")
+        if raw_content_length is None:
+            raise SynapseError(msg="Request must specify a Content-Length", code=400)
+        try:
+            content_length = int(raw_content_length)
+        except ValueError:
+            raise SynapseError(msg="Content-Length value is invalid", code=400)
+        if content_length > self.max_upload_size:
+            raise SynapseError(
+                msg="Upload request body is too large",
+                code=413,
+                errcode=Codes.TOO_LARGE,
+            )
+
+        args: Dict[bytes, List[bytes]] = request.args  # type: ignore
+        upload_name_bytes = parse_bytes_from_args(args, "filename")
+        if upload_name_bytes:
+            try:
+                upload_name: Optional[str] = upload_name_bytes.decode("utf8")
+            except UnicodeDecodeError:
+                raise SynapseError(
+                    msg="Invalid UTF-8 filename parameter: %r" % (upload_name_bytes,),
+                    code=400,
+                )
+
+        # If the name is falsey (e.g. an empty byte string) ensure it is None.
+        else:
+            upload_name = None
+
+        headers = request.requestHeaders
+
+        if headers.hasHeader(b"Content-Type"):
+            content_type_headers = headers.getRawHeaders(b"Content-Type")
+            assert content_type_headers  # for mypy
+            media_type = content_type_headers[0].decode("ascii")
+        else:
+            media_type = "application/octet-stream"
+
+        # if headers.hasHeader(b"Content-Disposition"):
+        #     disposition = headers.getRawHeaders(b"Content-Disposition")[0]
+        # TODO(markjh): parse Content-Disposition
+
+        try:
+            content: IO = request.content  # type: ignore
+            content_uri = await self.media_repo.create_content(
+                media_type, upload_name, content, content_length, requester.user
+            )
+        except SpamMediaException:
+            # For uploading of media we want to respond with a 400, instead of
+            # the default 404, as that would just be confusing.
+            raise SynapseError(400, "Bad content")
+
+        logger.info("Uploaded content with URI '%s'", content_uri)
+
+        respond_with_json(
+            request, 200, {"content_uri": str(content_uri)}, send_cors=True
+        )
diff --git a/synapse/rest/media/v1/_base.py b/synapse/rest/media/v1/_base.py
index ef8334ae25..88427a5737 100644
--- a/synapse/rest/media/v1/_base.py
+++ b/synapse/rest/media/v1/_base.py
@@ -1,5 +1,4 @@
-# Copyright 2014-2016 OpenMarket Ltd
-# Copyright 2019-2021 The Matrix.org Foundation C.I.C.
+# Copyright 2023 The Matrix.org Foundation C.I.C.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -12,468 +11,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+#
 
-import logging
-import os
-import urllib
-from abc import ABC, abstractmethod
-from types import TracebackType
-from typing import Awaitable, Dict, Generator, List, Optional, Tuple, Type
-
-import attr
-
-from twisted.internet.interfaces import IConsumer
-from twisted.protocols.basic import FileSender
-from twisted.web.server import Request
-
-from synapse.api.errors import Codes, SynapseError, cs_error
-from synapse.http.server import finish_request, respond_with_json
-from synapse.http.site import SynapseRequest
-from synapse.logging.context import make_deferred_yieldable
-from synapse.util.stringutils import is_ascii, parse_and_validate_server_name
-
-logger = logging.getLogger(__name__)
-
-# list all text content types that will have the charset default to UTF-8 when
-# none is given
-TEXT_CONTENT_TYPES = [
-    "text/css",
-    "text/csv",
-    "text/html",
-    "text/calendar",
-    "text/plain",
-    "text/javascript",
-    "application/json",
-    "application/ld+json",
-    "application/rtf",
-    "image/svg+xml",
-    "text/xml",
-]
-
-
-def parse_media_id(request: Request) -> Tuple[str, str, Optional[str]]:
-    """Parses the server name, media ID and optional file name from the request URI
-
-    Also performs some rough validation on the server name.
-
-    Args:
-        request: The `Request`.
-
-    Returns:
-        A tuple containing the parsed server name, media ID and optional file name.
-
-    Raises:
-        SynapseError(404): if parsing or validation fail for any reason
-    """
-    try:
-        # The type on postpath seems incorrect in Twisted 21.2.0.
-        postpath: List[bytes] = request.postpath  # type: ignore
-        assert postpath
-
-        # This allows users to append e.g. /test.png to the URL. Useful for
-        # clients that parse the URL to see content type.
-        server_name_bytes, media_id_bytes = postpath[:2]
-        server_name = server_name_bytes.decode("utf-8")
-        media_id = media_id_bytes.decode("utf8")
-
-        # Validate the server name, raising if invalid
-        parse_and_validate_server_name(server_name)
-
-        file_name = None
-        if len(postpath) > 2:
-            try:
-                file_name = urllib.parse.unquote(postpath[-1].decode("utf-8"))
-            except UnicodeDecodeError:
-                pass
-        return server_name, media_id, file_name
-    except Exception:
-        raise SynapseError(
-            404, "Invalid media id token %r" % (request.postpath,), Codes.UNKNOWN
-        )
-
-
-def respond_404(request: SynapseRequest) -> None:
-    respond_with_json(
-        request,
-        404,
-        cs_error("Not found %r" % (request.postpath,), code=Codes.NOT_FOUND),
-        send_cors=True,
-    )
-
-
-async def respond_with_file(
-    request: SynapseRequest,
-    media_type: str,
-    file_path: str,
-    file_size: Optional[int] = None,
-    upload_name: Optional[str] = None,
-) -> None:
-    logger.debug("Responding with %r", file_path)
-
-    if os.path.isfile(file_path):
-        if file_size is None:
-            stat = os.stat(file_path)
-            file_size = stat.st_size
-
-        add_file_headers(request, media_type, file_size, upload_name)
-
-        with open(file_path, "rb") as f:
-            await make_deferred_yieldable(FileSender().beginFileTransfer(f, request))
-
-        finish_request(request)
-    else:
-        respond_404(request)
-
-
-def add_file_headers(
-    request: Request,
-    media_type: str,
-    file_size: Optional[int],
-    upload_name: Optional[str],
-) -> None:
-    """Adds the correct response headers in preparation for responding with the
-    media.
-
-    Args:
-        request
-        media_type: The media/content type.
-        file_size: Size in bytes of the media, if known.
-        upload_name: The name of the requested file, if any.
-    """
-
-    def _quote(x: str) -> str:
-        return urllib.parse.quote(x.encode("utf-8"))
-
-    # Default to a UTF-8 charset for text content types.
-    # ex, uses UTF-8 for 'text/css' but not 'text/css; charset=UTF-16'
-    if media_type.lower() in TEXT_CONTENT_TYPES:
-        content_type = media_type + "; charset=UTF-8"
-    else:
-        content_type = media_type
-
-    request.setHeader(b"Content-Type", content_type.encode("UTF-8"))
-    if upload_name:
-        # RFC6266 section 4.1 [1] defines both `filename` and `filename*`.
-        #
-        # `filename` is defined to be a `value`, which is defined by RFC2616
-        # section 3.6 [2] to be a `token` or a `quoted-string`, where a `token`
-        # is (essentially) a single US-ASCII word, and a `quoted-string` is a
-        # US-ASCII string surrounded by double-quotes, using backslash as an
-        # escape character. Note that %-encoding is *not* permitted.
-        #
-        # `filename*` is defined to be an `ext-value`, which is defined in
-        # RFC5987 section 3.2.1 [3] to be `charset "'" [ language ] "'" value-chars`,
-        # where `value-chars` is essentially a %-encoded string in the given charset.
-        #
-        # [1]: https://tools.ietf.org/html/rfc6266#section-4.1
-        # [2]: https://tools.ietf.org/html/rfc2616#section-3.6
-        # [3]: https://tools.ietf.org/html/rfc5987#section-3.2.1
-
-        # We avoid the quoted-string version of `filename`, because (a) synapse didn't
-        # correctly interpret those as of 0.99.2 and (b) they are a bit of a pain and we
-        # may as well just do the filename* version.
-        if _can_encode_filename_as_token(upload_name):
-            disposition = "inline; filename=%s" % (upload_name,)
-        else:
-            disposition = "inline; filename*=utf-8''%s" % (_quote(upload_name),)
-
-        request.setHeader(b"Content-Disposition", disposition.encode("ascii"))
-
-    # cache for at least a day.
-    # XXX: we might want to turn this off for data we don't want to
-    # recommend caching as it's sensitive or private - or at least
-    # select private. don't bother setting Expires as all our
-    # clients are smart enough to be happy with Cache-Control
-    request.setHeader(b"Cache-Control", b"public,max-age=86400,s-maxage=86400")
-    if file_size is not None:
-        request.setHeader(b"Content-Length", b"%d" % (file_size,))
-
-    # Tell web crawlers to not index, archive, or follow links in media. This
-    # should help to prevent things in the media repo from showing up in web
-    # search results.
-    request.setHeader(b"X-Robots-Tag", "noindex, nofollow, noarchive, noimageindex")
-
-
-# separators as defined in RFC2616. SP and HT are handled separately.
-# see _can_encode_filename_as_token.
-_FILENAME_SEPARATOR_CHARS = {
-    "(",
-    ")",
-    "<",
-    ">",
-    "@",
-    ",",
-    ";",
-    ":",
-    "\\",
-    '"',
-    "/",
-    "[",
-    "]",
-    "?",
-    "=",
-    "{",
-    "}",
-}
-
-
-def _can_encode_filename_as_token(x: str) -> bool:
-    for c in x:
-        # from RFC2616:
-        #
-        #        token          = 1*<any CHAR except CTLs or separators>
-        #
-        #        separators     = "(" | ")" | "<" | ">" | "@"
-        #                       | "," | ";" | ":" | "\" | <">
-        #                       | "/" | "[" | "]" | "?" | "="
-        #                       | "{" | "}" | SP | HT
-        #
-        #        CHAR           = <any US-ASCII character (octets 0 - 127)>
-        #
-        #        CTL            = <any US-ASCII control character
-        #                         (octets 0 - 31) and DEL (127)>
-        #
-        if ord(c) >= 127 or ord(c) <= 32 or c in _FILENAME_SEPARATOR_CHARS:
-            return False
-    return True
-
-
-async def respond_with_responder(
-    request: SynapseRequest,
-    responder: "Optional[Responder]",
-    media_type: str,
-    file_size: Optional[int],
-    upload_name: Optional[str] = None,
-) -> None:
-    """Responds to the request with given responder. If responder is None then
-    returns 404.
-
-    Args:
-        request
-        responder
-        media_type: The media/content type.
-        file_size: Size in bytes of the media. If not known it should be None
-        upload_name: The name of the requested file, if any.
-    """
-    if not responder:
-        respond_404(request)
-        return
-
-    # If we have a responder we *must* use it as a context manager.
-    with responder:
-        if request._disconnected:
-            logger.warning(
-                "Not sending response to request %s, already disconnected.", request
-            )
-            return
-
-        logger.debug("Responding to media request with responder %s", responder)
-        add_file_headers(request, media_type, file_size, upload_name)
-        try:
-            await responder.write_to_consumer(request)
-        except Exception as e:
-            # The majority of the time this will be due to the client having gone
-            # away. Unfortunately, Twisted simply throws a generic exception at us
-            # in that case.
-            logger.warning("Failed to write to consumer: %s %s", type(e), e)
-
-            # Unregister the producer, if it has one, so Twisted doesn't complain
-            if request.producer:
-                request.unregisterProducer()
-
-    finish_request(request)
-
-
-class Responder(ABC):
-    """Represents a response that can be streamed to the requester.
-
-    Responder is a context manager which *must* be used, so that any resources
-    held can be cleaned up.
-    """
-
-    @abstractmethod
-    def write_to_consumer(self, consumer: IConsumer) -> Awaitable:
-        """Stream response into consumer
-
-        Args:
-            consumer: The consumer to stream into.
-
-        Returns:
-            Resolves once the response has finished being written
-        """
-        raise NotImplementedError()
-
-    def __enter__(self) -> None:  # noqa: B027
-        pass
-
-    def __exit__(  # noqa: B027
-        self,
-        exc_type: Optional[Type[BaseException]],
-        exc_val: Optional[BaseException],
-        exc_tb: Optional[TracebackType],
-    ) -> None:
-        pass
-
-
-@attr.s(slots=True, frozen=True, auto_attribs=True)
-class ThumbnailInfo:
-    """Details about a generated thumbnail."""
-
-    width: int
-    height: int
-    method: str
-    # Content type of thumbnail, e.g. image/png
-    type: str
-    # The size of the media file, in bytes.
-    length: Optional[int] = None
-
-
-@attr.s(slots=True, frozen=True, auto_attribs=True)
-class FileInfo:
-    """Details about a requested/uploaded file."""
-
-    # The server name where the media originated from, or None if local.
-    server_name: Optional[str]
-    # The local ID of the file. For local files this is the same as the media_id
-    file_id: str
-    # If the file is for the url preview cache
-    url_cache: bool = False
-    # Whether the file is a thumbnail or not.
-    thumbnail: Optional[ThumbnailInfo] = None
-
-    # The below properties exist to maintain compatibility with third-party modules.
-    @property
-    def thumbnail_width(self) -> Optional[int]:
-        if not self.thumbnail:
-            return None
-        return self.thumbnail.width
-
-    @property
-    def thumbnail_height(self) -> Optional[int]:
-        if not self.thumbnail:
-            return None
-        return self.thumbnail.height
-
-    @property
-    def thumbnail_method(self) -> Optional[str]:
-        if not self.thumbnail:
-            return None
-        return self.thumbnail.method
-
-    @property
-    def thumbnail_type(self) -> Optional[str]:
-        if not self.thumbnail:
-            return None
-        return self.thumbnail.type
-
-    @property
-    def thumbnail_length(self) -> Optional[int]:
-        if not self.thumbnail:
-            return None
-        return self.thumbnail.length
-
-
-def get_filename_from_headers(headers: Dict[bytes, List[bytes]]) -> Optional[str]:
-    """
-    Get the filename of the downloaded file by inspecting the
-    Content-Disposition HTTP header.
-
-    Args:
-        headers: The HTTP request headers.
-
-    Returns:
-        The filename, or None.
-    """
-    content_disposition = headers.get(b"Content-Disposition", [b""])
-
-    # No header, bail out.
-    if not content_disposition[0]:
-        return None
-
-    _, params = _parse_header(content_disposition[0])
-
-    upload_name = None
-
-    # First check if there is a valid UTF-8 filename
-    upload_name_utf8 = params.get(b"filename*", None)
-    if upload_name_utf8:
-        if upload_name_utf8.lower().startswith(b"utf-8''"):
-            upload_name_utf8 = upload_name_utf8[7:]
-            # We have a filename*= section. This MUST be ASCII, and any UTF-8
-            # bytes are %-quoted.
-            try:
-                # Once it is decoded, we can then unquote the %-encoded
-                # parts strictly into a unicode string.
-                upload_name = urllib.parse.unquote(
-                    upload_name_utf8.decode("ascii"), errors="strict"
-                )
-            except UnicodeDecodeError:
-                # Incorrect UTF-8.
-                pass
-
-    # If there isn't check for an ascii name.
-    if not upload_name:
-        upload_name_ascii = params.get(b"filename", None)
-        if upload_name_ascii and is_ascii(upload_name_ascii):
-            upload_name = upload_name_ascii.decode("ascii")
-
-    # This may be None here, indicating we did not find a matching name.
-    return upload_name
-
-
-def _parse_header(line: bytes) -> Tuple[bytes, Dict[bytes, bytes]]:
-    """Parse a Content-type like header.
-
-    Cargo-culted from `cgi`, but works on bytes rather than strings.
-
-    Args:
-        line: header to be parsed
-
-    Returns:
-        The main content-type, followed by the parameter dictionary
-    """
-    parts = _parseparam(b";" + line)
-    key = next(parts)
-    pdict = {}
-    for p in parts:
-        i = p.find(b"=")
-        if i >= 0:
-            name = p[:i].strip().lower()
-            value = p[i + 1 :].strip()
-
-            # strip double-quotes
-            if len(value) >= 2 and value[0:1] == value[-1:] == b'"':
-                value = value[1:-1]
-                value = value.replace(b"\\\\", b"\\").replace(b'\\"', b'"')
-            pdict[name] = value
-
-    return key, pdict
-
-
-def _parseparam(s: bytes) -> Generator[bytes, None, None]:
-    """Generator which splits the input on ;, respecting double-quoted sequences
-
-    Cargo-culted from `cgi`, but works on bytes rather than strings.
-
-    Args:
-        s: header to be parsed
-
-    Returns:
-        The split input
-    """
-    while s[:1] == b";":
-        s = s[1:]
-
-        # look for the next ;
-        end = s.find(b";")
-
-        # if there is an odd number of " marks between here and the next ;, skip to the
-        # next ; instead
-        while end > 0 and (s.count(b'"', 0, end) - s.count(b'\\"', 0, end)) % 2:
-            end = s.find(b";", end + 1)
-
-        if end < 0:
-            end = len(s)
-        f = s[:end]
-        yield f.strip()
-        s = s[end:]
+# This exists purely for backwards compatibility with media providers and spam checkers.
+from synapse.media._base import FileInfo, Responder  # noqa: F401
diff --git a/synapse/rest/media/v1/config_resource.py b/synapse/rest/media/v1/config_resource.py
deleted file mode 100644
index a95804d327..0000000000
--- a/synapse/rest/media/v1/config_resource.py
+++ /dev/null
@@ -1,41 +0,0 @@
-# Copyright 2018 Will Hunt <will@half-shot.uk>
-# Copyright 2020-2021 The Matrix.org Foundation C.I.C.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-
-from typing import TYPE_CHECKING
-
-from synapse.http.server import DirectServeJsonResource, respond_with_json
-from synapse.http.site import SynapseRequest
-
-if TYPE_CHECKING:
-    from synapse.server import HomeServer
-
-
-class MediaConfigResource(DirectServeJsonResource):
-    isLeaf = True
-
-    def __init__(self, hs: "HomeServer"):
-        super().__init__()
-        config = hs.config
-        self.clock = hs.get_clock()
-        self.auth = hs.get_auth()
-        self.limits_dict = {"m.upload.size": config.media.max_upload_size}
-
-    async def _async_render_GET(self, request: SynapseRequest) -> None:
-        await self.auth.get_user_by_req(request)
-        respond_with_json(request, 200, self.limits_dict, send_cors=True)
-
-    async def _async_render_OPTIONS(self, request: SynapseRequest) -> None:
-        respond_with_json(request, 200, {}, send_cors=True)
diff --git a/synapse/rest/media/v1/download_resource.py b/synapse/rest/media/v1/download_resource.py
deleted file mode 100644
index 048a042692..0000000000
--- a/synapse/rest/media/v1/download_resource.py
+++ /dev/null
@@ -1,76 +0,0 @@
-# Copyright 2014-2016 OpenMarket Ltd
-# Copyright 2020-2021 The Matrix.org Foundation C.I.C.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import logging
-from typing import TYPE_CHECKING
-
-from synapse.http.server import (
-    DirectServeJsonResource,
-    set_corp_headers,
-    set_cors_headers,
-)
-from synapse.http.servlet import parse_boolean
-from synapse.http.site import SynapseRequest
-
-from ._base import parse_media_id, respond_404
-
-if TYPE_CHECKING:
-    from synapse.rest.media.v1.media_repository import MediaRepository
-    from synapse.server import HomeServer
-
-logger = logging.getLogger(__name__)
-
-
-class DownloadResource(DirectServeJsonResource):
-    isLeaf = True
-
-    def __init__(self, hs: "HomeServer", media_repo: "MediaRepository"):
-        super().__init__()
-        self.media_repo = media_repo
-        self.server_name = hs.hostname
-
-    async def _async_render_GET(self, request: SynapseRequest) -> None:
-        set_cors_headers(request)
-        set_corp_headers(request)
-        request.setHeader(
-            b"Content-Security-Policy",
-            b"sandbox;"
-            b" default-src 'none';"
-            b" script-src 'none';"
-            b" plugin-types application/pdf;"
-            b" style-src 'unsafe-inline';"
-            b" media-src 'self';"
-            b" object-src 'self';",
-        )
-        # Limited non-standard form of CSP for IE11
-        request.setHeader(b"X-Content-Security-Policy", b"sandbox;")
-        request.setHeader(
-            b"Referrer-Policy",
-            b"no-referrer",
-        )
-        server_name, media_id, name = parse_media_id(request)
-        if server_name == self.server_name:
-            await self.media_repo.get_local_media(request, media_id, name)
-        else:
-            allow_remote = parse_boolean(request, "allow_remote", default=True)
-            if not allow_remote:
-                logger.info(
-                    "Rejecting request for remote media %s/%s due to allow_remote",
-                    server_name,
-                    media_id,
-                )
-                respond_404(request)
-                return
-
-            await self.media_repo.get_remote_media(request, server_name, media_id, name)
diff --git a/synapse/rest/media/v1/filepath.py b/synapse/rest/media/v1/filepath.py
deleted file mode 100644
index 1f6441c412..0000000000
--- a/synapse/rest/media/v1/filepath.py
+++ /dev/null
@@ -1,410 +0,0 @@
-# Copyright 2014-2016 OpenMarket Ltd
-# Copyright 2020-2021 The Matrix.org Foundation C.I.C.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import functools
-import os
-import re
-import string
-from typing import Any, Callable, List, TypeVar, Union, cast
-
-NEW_FORMAT_ID_RE = re.compile(r"^\d\d\d\d-\d\d-\d\d")
-
-
-F = TypeVar("F", bound=Callable[..., str])
-
-
-def _wrap_in_base_path(func: F) -> F:
-    """Takes a function that returns a relative path and turns it into an
-    absolute path based on the location of the primary media store
-    """
-
-    @functools.wraps(func)
-    def _wrapped(self: "MediaFilePaths", *args: Any, **kwargs: Any) -> str:
-        path = func(self, *args, **kwargs)
-        return os.path.join(self.base_path, path)
-
-    return cast(F, _wrapped)
-
-
-GetPathMethod = TypeVar(
-    "GetPathMethod", bound=Union[Callable[..., str], Callable[..., List[str]]]
-)
-
-
-def _wrap_with_jail_check(relative: bool) -> Callable[[GetPathMethod], GetPathMethod]:
-    """Wraps a path-returning method to check that the returned path(s) do not escape
-    the media store directory.
-
-    The path-returning method may return either a single path, or a list of paths.
-
-    The check is not expected to ever fail, unless `func` is missing a call to
-    `_validate_path_component`, or `_validate_path_component` is buggy.
-
-    Args:
-        relative: A boolean indicating whether the wrapped method returns paths relative
-            to the media store directory.
-
-    Returns:
-        A method which will wrap a path-returning method, adding a check to ensure that
-        the returned path(s) lie within the media store directory. The check will raise
-        a `ValueError` if it fails.
-    """
-
-    def _wrap_with_jail_check_inner(func: GetPathMethod) -> GetPathMethod:
-        @functools.wraps(func)
-        def _wrapped(
-            self: "MediaFilePaths", *args: Any, **kwargs: Any
-        ) -> Union[str, List[str]]:
-            path_or_paths = func(self, *args, **kwargs)
-
-            if isinstance(path_or_paths, list):
-                paths_to_check = path_or_paths
-            else:
-                paths_to_check = [path_or_paths]
-
-            for path in paths_to_check:
-                # Construct the path that will ultimately be used.
-                # We cannot guess whether `path` is relative to the media store
-                # directory, since the media store directory may itself be a relative
-                # path.
-                if relative:
-                    path = os.path.join(self.base_path, path)
-                normalized_path = os.path.normpath(path)
-
-                # Now that `normpath` has eliminated `../`s and `./`s from the path,
-                # `os.path.commonpath` can be used to check whether it lies within the
-                # media store directory.
-                if (
-                    os.path.commonpath([normalized_path, self.normalized_base_path])
-                    != self.normalized_base_path
-                ):
-                    # The path resolves to outside the media store directory,
-                    # or `self.base_path` is `.`, which is an unlikely configuration.
-                    raise ValueError(f"Invalid media store path: {path!r}")
-
-                # Note that `os.path.normpath`/`abspath` has a subtle caveat:
-                # `a/b/c/../c` will normalize to `a/b/c`, but the former refers to a
-                # different path if `a/b/c` is a symlink. That is, the check above is
-                # not perfect and may allow a certain restricted subset of untrustworthy
-                # paths through. Since the check above is secondary to the main
-                # `_validate_path_component` checks, it's less important for it to be
-                # perfect.
-                #
-                # As an alternative, `os.path.realpath` will resolve symlinks, but
-                # proves problematic if there are symlinks inside the media store.
-                # eg. if `url_store/` is symlinked to elsewhere, its canonical path
-                # won't match that of the main media store directory.
-
-            return path_or_paths
-
-        return cast(GetPathMethod, _wrapped)
-
-    return _wrap_with_jail_check_inner
-
-
-ALLOWED_CHARACTERS = set(
-    string.ascii_letters
-    + string.digits
-    + "_-"
-    + ".[]:"  # Domain names, IPv6 addresses and ports in server names
-)
-FORBIDDEN_NAMES = {
-    "",
-    os.path.curdir,  # "." for the current platform
-    os.path.pardir,  # ".." for the current platform
-}
-
-
-def _validate_path_component(name: str) -> str:
-    """Checks that the given string can be safely used as a path component
-
-    Args:
-        name: The path component to check.
-
-    Returns:
-        The path component if valid.
-
-    Raises:
-        ValueError: If `name` cannot be safely used as a path component.
-    """
-    if not ALLOWED_CHARACTERS.issuperset(name) or name in FORBIDDEN_NAMES:
-        raise ValueError(f"Invalid path component: {name!r}")
-
-    return name
-
-
-class MediaFilePaths:
-    """Describes where files are stored on disk.
-
-    Most of the functions have a `*_rel` variant which returns a file path that
-    is relative to the base media store path. This is mainly used when we want
-    to write to the backup media store (when one is configured)
-    """
-
-    def __init__(self, primary_base_path: str):
-        self.base_path = primary_base_path
-        self.normalized_base_path = os.path.normpath(self.base_path)
-
-        # Refuse to initialize if paths cannot be validated correctly for the current
-        # platform.
-        assert os.path.sep not in ALLOWED_CHARACTERS
-        assert os.path.altsep not in ALLOWED_CHARACTERS
-        # On Windows, paths have all sorts of weirdness which `_validate_path_component`
-        # does not consider. In any case, the remote media store can't work correctly
-        # for certain homeservers there, since ":"s aren't allowed in paths.
-        assert os.name == "posix"
-
-    @_wrap_with_jail_check(relative=True)
-    def local_media_filepath_rel(self, media_id: str) -> str:
-        return os.path.join(
-            "local_content",
-            _validate_path_component(media_id[0:2]),
-            _validate_path_component(media_id[2:4]),
-            _validate_path_component(media_id[4:]),
-        )
-
-    local_media_filepath = _wrap_in_base_path(local_media_filepath_rel)
-
-    @_wrap_with_jail_check(relative=True)
-    def local_media_thumbnail_rel(
-        self, media_id: str, width: int, height: int, content_type: str, method: str
-    ) -> str:
-        top_level_type, sub_type = content_type.split("/")
-        file_name = "%i-%i-%s-%s-%s" % (width, height, top_level_type, sub_type, method)
-        return os.path.join(
-            "local_thumbnails",
-            _validate_path_component(media_id[0:2]),
-            _validate_path_component(media_id[2:4]),
-            _validate_path_component(media_id[4:]),
-            _validate_path_component(file_name),
-        )
-
-    local_media_thumbnail = _wrap_in_base_path(local_media_thumbnail_rel)
-
-    @_wrap_with_jail_check(relative=False)
-    def local_media_thumbnail_dir(self, media_id: str) -> str:
-        """
-        Retrieve the local store path of thumbnails of a given media_id
-
-        Args:
-            media_id: The media ID to query.
-        Returns:
-            Path of local_thumbnails from media_id
-        """
-        return os.path.join(
-            self.base_path,
-            "local_thumbnails",
-            _validate_path_component(media_id[0:2]),
-            _validate_path_component(media_id[2:4]),
-            _validate_path_component(media_id[4:]),
-        )
-
-    @_wrap_with_jail_check(relative=True)
-    def remote_media_filepath_rel(self, server_name: str, file_id: str) -> str:
-        return os.path.join(
-            "remote_content",
-            _validate_path_component(server_name),
-            _validate_path_component(file_id[0:2]),
-            _validate_path_component(file_id[2:4]),
-            _validate_path_component(file_id[4:]),
-        )
-
-    remote_media_filepath = _wrap_in_base_path(remote_media_filepath_rel)
-
-    @_wrap_with_jail_check(relative=True)
-    def remote_media_thumbnail_rel(
-        self,
-        server_name: str,
-        file_id: str,
-        width: int,
-        height: int,
-        content_type: str,
-        method: str,
-    ) -> str:
-        top_level_type, sub_type = content_type.split("/")
-        file_name = "%i-%i-%s-%s-%s" % (width, height, top_level_type, sub_type, method)
-        return os.path.join(
-            "remote_thumbnail",
-            _validate_path_component(server_name),
-            _validate_path_component(file_id[0:2]),
-            _validate_path_component(file_id[2:4]),
-            _validate_path_component(file_id[4:]),
-            _validate_path_component(file_name),
-        )
-
-    remote_media_thumbnail = _wrap_in_base_path(remote_media_thumbnail_rel)
-
-    # Legacy path that was used to store thumbnails previously.
-    # Should be removed after some time, when most of the thumbnails are stored
-    # using the new path.
-    @_wrap_with_jail_check(relative=True)
-    def remote_media_thumbnail_rel_legacy(
-        self, server_name: str, file_id: str, width: int, height: int, content_type: str
-    ) -> str:
-        top_level_type, sub_type = content_type.split("/")
-        file_name = "%i-%i-%s-%s" % (width, height, top_level_type, sub_type)
-        return os.path.join(
-            "remote_thumbnail",
-            _validate_path_component(server_name),
-            _validate_path_component(file_id[0:2]),
-            _validate_path_component(file_id[2:4]),
-            _validate_path_component(file_id[4:]),
-            _validate_path_component(file_name),
-        )
-
-    @_wrap_with_jail_check(relative=False)
-    def remote_media_thumbnail_dir(self, server_name: str, file_id: str) -> str:
-        return os.path.join(
-            self.base_path,
-            "remote_thumbnail",
-            _validate_path_component(server_name),
-            _validate_path_component(file_id[0:2]),
-            _validate_path_component(file_id[2:4]),
-            _validate_path_component(file_id[4:]),
-        )
-
-    @_wrap_with_jail_check(relative=True)
-    def url_cache_filepath_rel(self, media_id: str) -> str:
-        if NEW_FORMAT_ID_RE.match(media_id):
-            # Media id is of the form <DATE><RANDOM_STRING>
-            # E.g.: 2017-09-28-fsdRDt24DS234dsf
-            return os.path.join(
-                "url_cache",
-                _validate_path_component(media_id[:10]),
-                _validate_path_component(media_id[11:]),
-            )
-        else:
-            return os.path.join(
-                "url_cache",
-                _validate_path_component(media_id[0:2]),
-                _validate_path_component(media_id[2:4]),
-                _validate_path_component(media_id[4:]),
-            )
-
-    url_cache_filepath = _wrap_in_base_path(url_cache_filepath_rel)
-
-    @_wrap_with_jail_check(relative=False)
-    def url_cache_filepath_dirs_to_delete(self, media_id: str) -> List[str]:
-        "The dirs to try and remove if we delete the media_id file"
-        if NEW_FORMAT_ID_RE.match(media_id):
-            return [
-                os.path.join(
-                    self.base_path, "url_cache", _validate_path_component(media_id[:10])
-                )
-            ]
-        else:
-            return [
-                os.path.join(
-                    self.base_path,
-                    "url_cache",
-                    _validate_path_component(media_id[0:2]),
-                    _validate_path_component(media_id[2:4]),
-                ),
-                os.path.join(
-                    self.base_path, "url_cache", _validate_path_component(media_id[0:2])
-                ),
-            ]
-
-    @_wrap_with_jail_check(relative=True)
-    def url_cache_thumbnail_rel(
-        self, media_id: str, width: int, height: int, content_type: str, method: str
-    ) -> str:
-        # Media id is of the form <DATE><RANDOM_STRING>
-        # E.g.: 2017-09-28-fsdRDt24DS234dsf
-
-        top_level_type, sub_type = content_type.split("/")
-        file_name = "%i-%i-%s-%s-%s" % (width, height, top_level_type, sub_type, method)
-
-        if NEW_FORMAT_ID_RE.match(media_id):
-            return os.path.join(
-                "url_cache_thumbnails",
-                _validate_path_component(media_id[:10]),
-                _validate_path_component(media_id[11:]),
-                _validate_path_component(file_name),
-            )
-        else:
-            return os.path.join(
-                "url_cache_thumbnails",
-                _validate_path_component(media_id[0:2]),
-                _validate_path_component(media_id[2:4]),
-                _validate_path_component(media_id[4:]),
-                _validate_path_component(file_name),
-            )
-
-    url_cache_thumbnail = _wrap_in_base_path(url_cache_thumbnail_rel)
-
-    @_wrap_with_jail_check(relative=True)
-    def url_cache_thumbnail_directory_rel(self, media_id: str) -> str:
-        # Media id is of the form <DATE><RANDOM_STRING>
-        # E.g.: 2017-09-28-fsdRDt24DS234dsf
-
-        if NEW_FORMAT_ID_RE.match(media_id):
-            return os.path.join(
-                "url_cache_thumbnails",
-                _validate_path_component(media_id[:10]),
-                _validate_path_component(media_id[11:]),
-            )
-        else:
-            return os.path.join(
-                "url_cache_thumbnails",
-                _validate_path_component(media_id[0:2]),
-                _validate_path_component(media_id[2:4]),
-                _validate_path_component(media_id[4:]),
-            )
-
-    url_cache_thumbnail_directory = _wrap_in_base_path(
-        url_cache_thumbnail_directory_rel
-    )
-
-    @_wrap_with_jail_check(relative=False)
-    def url_cache_thumbnail_dirs_to_delete(self, media_id: str) -> List[str]:
-        "The dirs to try and remove if we delete the media_id thumbnails"
-        # Media id is of the form <DATE><RANDOM_STRING>
-        # E.g.: 2017-09-28-fsdRDt24DS234dsf
-        if NEW_FORMAT_ID_RE.match(media_id):
-            return [
-                os.path.join(
-                    self.base_path,
-                    "url_cache_thumbnails",
-                    _validate_path_component(media_id[:10]),
-                    _validate_path_component(media_id[11:]),
-                ),
-                os.path.join(
-                    self.base_path,
-                    "url_cache_thumbnails",
-                    _validate_path_component(media_id[:10]),
-                ),
-            ]
-        else:
-            return [
-                os.path.join(
-                    self.base_path,
-                    "url_cache_thumbnails",
-                    _validate_path_component(media_id[0:2]),
-                    _validate_path_component(media_id[2:4]),
-                    _validate_path_component(media_id[4:]),
-                ),
-                os.path.join(
-                    self.base_path,
-                    "url_cache_thumbnails",
-                    _validate_path_component(media_id[0:2]),
-                    _validate_path_component(media_id[2:4]),
-                ),
-                os.path.join(
-                    self.base_path,
-                    "url_cache_thumbnails",
-                    _validate_path_component(media_id[0:2]),
-                ),
-            ]
diff --git a/synapse/rest/media/v1/media_repository.py b/synapse/rest/media/v1/media_repository.py
deleted file mode 100644
index c70e1837af..0000000000
--- a/synapse/rest/media/v1/media_repository.py
+++ /dev/null
@@ -1,1112 +0,0 @@
-# Copyright 2014-2016 OpenMarket Ltd
-# Copyright 2018-2021 The Matrix.org Foundation C.I.C.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import errno
-import logging
-import os
-import shutil
-from io import BytesIO
-from typing import IO, TYPE_CHECKING, Dict, List, Optional, Set, Tuple
-
-from matrix_common.types.mxc_uri import MXCUri
-
-import twisted.internet.error
-import twisted.web.http
-from twisted.internet.defer import Deferred
-
-from synapse.api.errors import (
-    FederationDeniedError,
-    HttpResponseException,
-    NotFoundError,
-    RequestSendFailed,
-    SynapseError,
-)
-from synapse.config._base import ConfigError
-from synapse.config.repository import ThumbnailRequirement
-from synapse.http.server import UnrecognizedRequestResource
-from synapse.http.site import SynapseRequest
-from synapse.logging.context import defer_to_thread
-from synapse.metrics.background_process_metrics import run_as_background_process
-from synapse.types import UserID
-from synapse.util.async_helpers import Linearizer
-from synapse.util.retryutils import NotRetryingDestination
-from synapse.util.stringutils import random_string
-
-from ._base import (
-    FileInfo,
-    Responder,
-    ThumbnailInfo,
-    get_filename_from_headers,
-    respond_404,
-    respond_with_responder,
-)
-from .config_resource import MediaConfigResource
-from .download_resource import DownloadResource
-from .filepath import MediaFilePaths
-from .media_storage import MediaStorage
-from .preview_url_resource import PreviewUrlResource
-from .storage_provider import StorageProviderWrapper
-from .thumbnail_resource import ThumbnailResource
-from .thumbnailer import Thumbnailer, ThumbnailError
-from .upload_resource import UploadResource
-
-if TYPE_CHECKING:
-    from synapse.server import HomeServer
-
-logger = logging.getLogger(__name__)
-
-# How often to run the background job to update the "recently accessed"
-# attribute of local and remote media.
-UPDATE_RECENTLY_ACCESSED_TS = 60 * 1000  # 1 minute
-# How often to run the background job to check for local and remote media
-# that should be purged according to the configured media retention settings.
-MEDIA_RETENTION_CHECK_PERIOD_MS = 60 * 60 * 1000  # 1 hour
-
-
-class MediaRepository:
-    def __init__(self, hs: "HomeServer"):
-        self.hs = hs
-        self.auth = hs.get_auth()
-        self.client = hs.get_federation_http_client()
-        self.clock = hs.get_clock()
-        self.server_name = hs.hostname
-        self.store = hs.get_datastores().main
-        self.max_upload_size = hs.config.media.max_upload_size
-        self.max_image_pixels = hs.config.media.max_image_pixels
-
-        Thumbnailer.set_limits(self.max_image_pixels)
-
-        self.primary_base_path: str = hs.config.media.media_store_path
-        self.filepaths: MediaFilePaths = MediaFilePaths(self.primary_base_path)
-
-        self.dynamic_thumbnails = hs.config.media.dynamic_thumbnails
-        self.thumbnail_requirements = hs.config.media.thumbnail_requirements
-
-        self.remote_media_linearizer = Linearizer(name="media_remote")
-
-        self.recently_accessed_remotes: Set[Tuple[str, str]] = set()
-        self.recently_accessed_locals: Set[str] = set()
-
-        self.federation_domain_whitelist = (
-            hs.config.federation.federation_domain_whitelist
-        )
-
-        # List of StorageProviders where we should search for media and
-        # potentially upload to.
-        storage_providers = []
-
-        for (
-            clz,
-            provider_config,
-            wrapper_config,
-        ) in hs.config.media.media_storage_providers:
-            backend = clz(hs, provider_config)
-            provider = StorageProviderWrapper(
-                backend,
-                store_local=wrapper_config.store_local,
-                store_remote=wrapper_config.store_remote,
-                store_synchronous=wrapper_config.store_synchronous,
-            )
-            storage_providers.append(provider)
-
-        self.media_storage = MediaStorage(
-            self.hs, self.primary_base_path, self.filepaths, storage_providers
-        )
-
-        self.clock.looping_call(
-            self._start_update_recently_accessed, UPDATE_RECENTLY_ACCESSED_TS
-        )
-
-        # Media retention configuration options
-        self._media_retention_local_media_lifetime_ms = (
-            hs.config.media.media_retention_local_media_lifetime_ms
-        )
-        self._media_retention_remote_media_lifetime_ms = (
-            hs.config.media.media_retention_remote_media_lifetime_ms
-        )
-
-        # Check whether local or remote media retention is configured
-        if (
-            hs.config.media.media_retention_local_media_lifetime_ms is not None
-            or hs.config.media.media_retention_remote_media_lifetime_ms is not None
-        ):
-            # Run the background job to apply media retention rules routinely,
-            # with the duration between runs dictated by the homeserver config.
-            self.clock.looping_call(
-                self._start_apply_media_retention_rules,
-                MEDIA_RETENTION_CHECK_PERIOD_MS,
-            )
-
-    def _start_update_recently_accessed(self) -> Deferred:
-        return run_as_background_process(
-            "update_recently_accessed_media", self._update_recently_accessed
-        )
-
-    def _start_apply_media_retention_rules(self) -> Deferred:
-        return run_as_background_process(
-            "apply_media_retention_rules", self._apply_media_retention_rules
-        )
-
-    async def _update_recently_accessed(self) -> None:
-        remote_media = self.recently_accessed_remotes
-        self.recently_accessed_remotes = set()
-
-        local_media = self.recently_accessed_locals
-        self.recently_accessed_locals = set()
-
-        await self.store.update_cached_last_access_time(
-            local_media, remote_media, self.clock.time_msec()
-        )
-
-    def mark_recently_accessed(self, server_name: Optional[str], media_id: str) -> None:
-        """Mark the given media as recently accessed.
-
-        Args:
-            server_name: Origin server of media, or None if local
-            media_id: The media ID of the content
-        """
-        if server_name:
-            self.recently_accessed_remotes.add((server_name, media_id))
-        else:
-            self.recently_accessed_locals.add(media_id)
-
-    async def create_content(
-        self,
-        media_type: str,
-        upload_name: Optional[str],
-        content: IO,
-        content_length: int,
-        auth_user: UserID,
-    ) -> MXCUri:
-        """Store uploaded content for a local user and return the mxc URL
-
-        Args:
-            media_type: The content type of the file.
-            upload_name: The name of the file, if provided.
-            content: A file like object that is the content to store
-            content_length: The length of the content
-            auth_user: The user_id of the uploader
-
-        Returns:
-            The mxc url of the stored content
-        """
-
-        media_id = random_string(24)
-
-        file_info = FileInfo(server_name=None, file_id=media_id)
-
-        fname = await self.media_storage.store_file(content, file_info)
-
-        logger.info("Stored local media in file %r", fname)
-
-        await self.store.store_local_media(
-            media_id=media_id,
-            media_type=media_type,
-            time_now_ms=self.clock.time_msec(),
-            upload_name=upload_name,
-            media_length=content_length,
-            user_id=auth_user,
-        )
-
-        await self._generate_thumbnails(None, media_id, media_id, media_type)
-
-        return MXCUri(self.server_name, media_id)
-
-    async def get_local_media(
-        self, request: SynapseRequest, media_id: str, name: Optional[str]
-    ) -> None:
-        """Responds to requests for local media, if exists, or returns 404.
-
-        Args:
-            request: The incoming request.
-            media_id: The media ID of the content. (This is the same as
-                the file_id for local content.)
-            name: Optional name that, if specified, will be used as
-                the filename in the Content-Disposition header of the response.
-
-        Returns:
-            Resolves once a response has successfully been written to request
-        """
-        media_info = await self.store.get_local_media(media_id)
-        if not media_info or media_info["quarantined_by"]:
-            respond_404(request)
-            return
-
-        self.mark_recently_accessed(None, media_id)
-
-        media_type = media_info["media_type"]
-        if not media_type:
-            media_type = "application/octet-stream"
-        media_length = media_info["media_length"]
-        upload_name = name if name else media_info["upload_name"]
-        url_cache = media_info["url_cache"]
-
-        file_info = FileInfo(None, media_id, url_cache=bool(url_cache))
-
-        responder = await self.media_storage.fetch_media(file_info)
-        await respond_with_responder(
-            request, responder, media_type, media_length, upload_name
-        )
-
-    async def get_remote_media(
-        self,
-        request: SynapseRequest,
-        server_name: str,
-        media_id: str,
-        name: Optional[str],
-    ) -> None:
-        """Respond to requests for remote media.
-
-        Args:
-            request: The incoming request.
-            server_name: Remote server_name where the media originated.
-            media_id: The media ID of the content (as defined by the remote server).
-            name: Optional name that, if specified, will be used as
-                the filename in the Content-Disposition header of the response.
-
-        Returns:
-            Resolves once a response has successfully been written to request
-        """
-        if (
-            self.federation_domain_whitelist is not None
-            and server_name not in self.federation_domain_whitelist
-        ):
-            raise FederationDeniedError(server_name)
-
-        self.mark_recently_accessed(server_name, media_id)
-
-        # We linearize here to ensure that we don't try and download remote
-        # media multiple times concurrently
-        key = (server_name, media_id)
-        async with self.remote_media_linearizer.queue(key):
-            responder, media_info = await self._get_remote_media_impl(
-                server_name, media_id
-            )
-
-        # We deliberately stream the file outside the lock
-        if responder:
-            media_type = media_info["media_type"]
-            media_length = media_info["media_length"]
-            upload_name = name if name else media_info["upload_name"]
-            await respond_with_responder(
-                request, responder, media_type, media_length, upload_name
-            )
-        else:
-            respond_404(request)
-
-    async def get_remote_media_info(self, server_name: str, media_id: str) -> dict:
-        """Gets the media info associated with the remote file, downloading
-        if necessary.
-
-        Args:
-            server_name: Remote server_name where the media originated.
-            media_id: The media ID of the content (as defined by the remote server).
-
-        Returns:
-            The media info of the file
-        """
-        if (
-            self.federation_domain_whitelist is not None
-            and server_name not in self.federation_domain_whitelist
-        ):
-            raise FederationDeniedError(server_name)
-
-        # We linearize here to ensure that we don't try and download remote
-        # media multiple times concurrently
-        key = (server_name, media_id)
-        async with self.remote_media_linearizer.queue(key):
-            responder, media_info = await self._get_remote_media_impl(
-                server_name, media_id
-            )
-
-        # Ensure we actually use the responder so that it releases resources
-        if responder:
-            with responder:
-                pass
-
-        return media_info
-
-    async def _get_remote_media_impl(
-        self, server_name: str, media_id: str
-    ) -> Tuple[Optional[Responder], dict]:
-        """Looks for media in local cache, if not there then attempt to
-        download from remote server.
-
-        Args:
-            server_name: Remote server_name where the media originated.
-            media_id: The media ID of the content (as defined by the
-                remote server).
-
-        Returns:
-            A tuple of responder and the media info of the file.
-        """
-        media_info = await self.store.get_cached_remote_media(server_name, media_id)
-
-        # file_id is the ID we use to track the file locally. If we've already
-        # seen the file then reuse the existing ID, otherwise generate a new
-        # one.
-
-        # If we have an entry in the DB, try and look for it
-        if media_info:
-            file_id = media_info["filesystem_id"]
-            file_info = FileInfo(server_name, file_id)
-
-            if media_info["quarantined_by"]:
-                logger.info("Media is quarantined")
-                raise NotFoundError()
-
-            if not media_info["media_type"]:
-                media_info["media_type"] = "application/octet-stream"
-
-            responder = await self.media_storage.fetch_media(file_info)
-            if responder:
-                return responder, media_info
-
-        # Failed to find the file anywhere, lets download it.
-
-        try:
-            media_info = await self._download_remote_file(
-                server_name,
-                media_id,
-            )
-        except SynapseError:
-            raise
-        except Exception as e:
-            # An exception may be because we downloaded media in another
-            # process, so let's check if we magically have the media.
-            media_info = await self.store.get_cached_remote_media(server_name, media_id)
-            if not media_info:
-                raise e
-
-        file_id = media_info["filesystem_id"]
-        if not media_info["media_type"]:
-            media_info["media_type"] = "application/octet-stream"
-        file_info = FileInfo(server_name, file_id)
-
-        # We generate thumbnails even if another process downloaded the media
-        # as a) it's conceivable that the other download request dies before it
-        # generates thumbnails, but mainly b) we want to be sure the thumbnails
-        # have finished being generated before responding to the client,
-        # otherwise they'll request thumbnails and get a 404 if they're not
-        # ready yet.
-        await self._generate_thumbnails(
-            server_name, media_id, file_id, media_info["media_type"]
-        )
-
-        responder = await self.media_storage.fetch_media(file_info)
-        return responder, media_info
-
-    async def _download_remote_file(
-        self,
-        server_name: str,
-        media_id: str,
-    ) -> dict:
-        """Attempt to download the remote file from the given server name,
-        using the given file_id as the local id.
-
-        Args:
-            server_name: Originating server
-            media_id: The media ID of the content (as defined by the
-                remote server). This is different than the file_id, which is
-                locally generated.
-            file_id: Local file ID
-
-        Returns:
-            The media info of the file.
-        """
-
-        file_id = random_string(24)
-
-        file_info = FileInfo(server_name=server_name, file_id=file_id)
-
-        with self.media_storage.store_into_file(file_info) as (f, fname, finish):
-            request_path = "/".join(
-                ("/_matrix/media/r0/download", server_name, media_id)
-            )
-            try:
-                length, headers = await self.client.get_file(
-                    server_name,
-                    request_path,
-                    output_stream=f,
-                    max_size=self.max_upload_size,
-                    args={
-                        # tell the remote server to 404 if it doesn't
-                        # recognise the server_name, to make sure we don't
-                        # end up with a routing loop.
-                        "allow_remote": "false"
-                    },
-                )
-            except RequestSendFailed as e:
-                logger.warning(
-                    "Request failed fetching remote media %s/%s: %r",
-                    server_name,
-                    media_id,
-                    e,
-                )
-                raise SynapseError(502, "Failed to fetch remote media")
-
-            except HttpResponseException as e:
-                logger.warning(
-                    "HTTP error fetching remote media %s/%s: %s",
-                    server_name,
-                    media_id,
-                    e.response,
-                )
-                if e.code == twisted.web.http.NOT_FOUND:
-                    raise e.to_synapse_error()
-                raise SynapseError(502, "Failed to fetch remote media")
-
-            except SynapseError:
-                logger.warning(
-                    "Failed to fetch remote media %s/%s", server_name, media_id
-                )
-                raise
-            except NotRetryingDestination:
-                logger.warning("Not retrying destination %r", server_name)
-                raise SynapseError(502, "Failed to fetch remote media")
-            except Exception:
-                logger.exception(
-                    "Failed to fetch remote media %s/%s", server_name, media_id
-                )
-                raise SynapseError(502, "Failed to fetch remote media")
-
-            await finish()
-
-            if b"Content-Type" in headers:
-                media_type = headers[b"Content-Type"][0].decode("ascii")
-            else:
-                media_type = "application/octet-stream"
-            upload_name = get_filename_from_headers(headers)
-            time_now_ms = self.clock.time_msec()
-
-            # Multiple remote media download requests can race (when using
-            # multiple media repos), so this may throw a violation constraint
-            # exception. If it does we'll delete the newly downloaded file from
-            # disk (as we're in the ctx manager).
-            #
-            # However: we've already called `finish()` so we may have also
-            # written to the storage providers. This is preferable to the
-            # alternative where we call `finish()` *after* this, where we could
-            # end up having an entry in the DB but fail to write the files to
-            # the storage providers.
-            await self.store.store_cached_remote_media(
-                origin=server_name,
-                media_id=media_id,
-                media_type=media_type,
-                time_now_ms=self.clock.time_msec(),
-                upload_name=upload_name,
-                media_length=length,
-                filesystem_id=file_id,
-            )
-
-        logger.info("Stored remote media in file %r", fname)
-
-        media_info = {
-            "media_type": media_type,
-            "media_length": length,
-            "upload_name": upload_name,
-            "created_ts": time_now_ms,
-            "filesystem_id": file_id,
-        }
-
-        return media_info
-
-    def _get_thumbnail_requirements(
-        self, media_type: str
-    ) -> Tuple[ThumbnailRequirement, ...]:
-        scpos = media_type.find(";")
-        if scpos > 0:
-            media_type = media_type[:scpos]
-        return self.thumbnail_requirements.get(media_type, ())
-
-    def _generate_thumbnail(
-        self,
-        thumbnailer: Thumbnailer,
-        t_width: int,
-        t_height: int,
-        t_method: str,
-        t_type: str,
-    ) -> Optional[BytesIO]:
-        m_width = thumbnailer.width
-        m_height = thumbnailer.height
-
-        if m_width * m_height >= self.max_image_pixels:
-            logger.info(
-                "Image too large to thumbnail %r x %r > %r",
-                m_width,
-                m_height,
-                self.max_image_pixels,
-            )
-            return None
-
-        if thumbnailer.transpose_method is not None:
-            m_width, m_height = thumbnailer.transpose()
-
-        if t_method == "crop":
-            return thumbnailer.crop(t_width, t_height, t_type)
-        elif t_method == "scale":
-            t_width, t_height = thumbnailer.aspect(t_width, t_height)
-            t_width = min(m_width, t_width)
-            t_height = min(m_height, t_height)
-            return thumbnailer.scale(t_width, t_height, t_type)
-
-        return None
-
-    async def generate_local_exact_thumbnail(
-        self,
-        media_id: str,
-        t_width: int,
-        t_height: int,
-        t_method: str,
-        t_type: str,
-        url_cache: bool,
-    ) -> Optional[str]:
-        input_path = await self.media_storage.ensure_media_is_in_local_cache(
-            FileInfo(None, media_id, url_cache=url_cache)
-        )
-
-        try:
-            thumbnailer = Thumbnailer(input_path)
-        except ThumbnailError as e:
-            logger.warning(
-                "Unable to generate a thumbnail for local media %s using a method of %s and type of %s: %s",
-                media_id,
-                t_method,
-                t_type,
-                e,
-            )
-            return None
-
-        with thumbnailer:
-            t_byte_source = await defer_to_thread(
-                self.hs.get_reactor(),
-                self._generate_thumbnail,
-                thumbnailer,
-                t_width,
-                t_height,
-                t_method,
-                t_type,
-            )
-
-        if t_byte_source:
-            try:
-                file_info = FileInfo(
-                    server_name=None,
-                    file_id=media_id,
-                    url_cache=url_cache,
-                    thumbnail=ThumbnailInfo(
-                        width=t_width,
-                        height=t_height,
-                        method=t_method,
-                        type=t_type,
-                    ),
-                )
-
-                output_path = await self.media_storage.store_file(
-                    t_byte_source, file_info
-                )
-            finally:
-                t_byte_source.close()
-
-            logger.info("Stored thumbnail in file %r", output_path)
-
-            t_len = os.path.getsize(output_path)
-
-            await self.store.store_local_thumbnail(
-                media_id, t_width, t_height, t_type, t_method, t_len
-            )
-
-            return output_path
-
-        # Could not generate thumbnail.
-        return None
-
-    async def generate_remote_exact_thumbnail(
-        self,
-        server_name: str,
-        file_id: str,
-        media_id: str,
-        t_width: int,
-        t_height: int,
-        t_method: str,
-        t_type: str,
-    ) -> Optional[str]:
-        input_path = await self.media_storage.ensure_media_is_in_local_cache(
-            FileInfo(server_name, file_id)
-        )
-
-        try:
-            thumbnailer = Thumbnailer(input_path)
-        except ThumbnailError as e:
-            logger.warning(
-                "Unable to generate a thumbnail for remote media %s from %s using a method of %s and type of %s: %s",
-                media_id,
-                server_name,
-                t_method,
-                t_type,
-                e,
-            )
-            return None
-
-        with thumbnailer:
-            t_byte_source = await defer_to_thread(
-                self.hs.get_reactor(),
-                self._generate_thumbnail,
-                thumbnailer,
-                t_width,
-                t_height,
-                t_method,
-                t_type,
-            )
-
-        if t_byte_source:
-            try:
-                file_info = FileInfo(
-                    server_name=server_name,
-                    file_id=file_id,
-                    thumbnail=ThumbnailInfo(
-                        width=t_width,
-                        height=t_height,
-                        method=t_method,
-                        type=t_type,
-                    ),
-                )
-
-                output_path = await self.media_storage.store_file(
-                    t_byte_source, file_info
-                )
-            finally:
-                t_byte_source.close()
-
-            logger.info("Stored thumbnail in file %r", output_path)
-
-            t_len = os.path.getsize(output_path)
-
-            await self.store.store_remote_media_thumbnail(
-                server_name,
-                media_id,
-                file_id,
-                t_width,
-                t_height,
-                t_type,
-                t_method,
-                t_len,
-            )
-
-            return output_path
-
-        # Could not generate thumbnail.
-        return None
-
-    async def _generate_thumbnails(
-        self,
-        server_name: Optional[str],
-        media_id: str,
-        file_id: str,
-        media_type: str,
-        url_cache: bool = False,
-    ) -> Optional[dict]:
-        """Generate and store thumbnails for an image.
-
-        Args:
-            server_name: The server name if remote media, else None if local
-            media_id: The media ID of the content. (This is the same as
-                the file_id for local content)
-            file_id: Local file ID
-            media_type: The content type of the file
-            url_cache: If we are thumbnailing images downloaded for the URL cache,
-                used exclusively by the url previewer
-
-        Returns:
-            Dict with "width" and "height" keys of original image or None if the
-            media cannot be thumbnailed.
-        """
-        requirements = self._get_thumbnail_requirements(media_type)
-        if not requirements:
-            return None
-
-        input_path = await self.media_storage.ensure_media_is_in_local_cache(
-            FileInfo(server_name, file_id, url_cache=url_cache)
-        )
-
-        try:
-            thumbnailer = Thumbnailer(input_path)
-        except ThumbnailError as e:
-            logger.warning(
-                "Unable to generate thumbnails for remote media %s from %s of type %s: %s",
-                media_id,
-                server_name,
-                media_type,
-                e,
-            )
-            return None
-
-        with thumbnailer:
-            m_width = thumbnailer.width
-            m_height = thumbnailer.height
-
-            if m_width * m_height >= self.max_image_pixels:
-                logger.info(
-                    "Image too large to thumbnail %r x %r > %r",
-                    m_width,
-                    m_height,
-                    self.max_image_pixels,
-                )
-                return None
-
-            if thumbnailer.transpose_method is not None:
-                m_width, m_height = await defer_to_thread(
-                    self.hs.get_reactor(), thumbnailer.transpose
-                )
-
-            # We deduplicate the thumbnail sizes by ignoring the cropped versions if
-            # they have the same dimensions of a scaled one.
-            thumbnails: Dict[Tuple[int, int, str], str] = {}
-            for requirement in requirements:
-                if requirement.method == "crop":
-                    thumbnails.setdefault(
-                        (requirement.width, requirement.height, requirement.media_type),
-                        requirement.method,
-                    )
-                elif requirement.method == "scale":
-                    t_width, t_height = thumbnailer.aspect(
-                        requirement.width, requirement.height
-                    )
-                    t_width = min(m_width, t_width)
-                    t_height = min(m_height, t_height)
-                    thumbnails[
-                        (t_width, t_height, requirement.media_type)
-                    ] = requirement.method
-
-            # Now we generate the thumbnails for each dimension, store it
-            for (t_width, t_height, t_type), t_method in thumbnails.items():
-                # Generate the thumbnail
-                if t_method == "crop":
-                    t_byte_source = await defer_to_thread(
-                        self.hs.get_reactor(),
-                        thumbnailer.crop,
-                        t_width,
-                        t_height,
-                        t_type,
-                    )
-                elif t_method == "scale":
-                    t_byte_source = await defer_to_thread(
-                        self.hs.get_reactor(),
-                        thumbnailer.scale,
-                        t_width,
-                        t_height,
-                        t_type,
-                    )
-                else:
-                    logger.error("Unrecognized method: %r", t_method)
-                    continue
-
-                if not t_byte_source:
-                    continue
-
-                file_info = FileInfo(
-                    server_name=server_name,
-                    file_id=file_id,
-                    url_cache=url_cache,
-                    thumbnail=ThumbnailInfo(
-                        width=t_width,
-                        height=t_height,
-                        method=t_method,
-                        type=t_type,
-                    ),
-                )
-
-                with self.media_storage.store_into_file(file_info) as (
-                    f,
-                    fname,
-                    finish,
-                ):
-                    try:
-                        await self.media_storage.write_to_file(t_byte_source, f)
-                        await finish()
-                    finally:
-                        t_byte_source.close()
-
-                    t_len = os.path.getsize(fname)
-
-                    # Write to database
-                    if server_name:
-                        # Multiple remote media download requests can race (when
-                        # using multiple media repos), so this may throw a violation
-                        # constraint exception. If it does we'll delete the newly
-                        # generated thumbnail from disk (as we're in the ctx
-                        # manager).
-                        #
-                        # However: we've already called `finish()` so we may have
-                        # also written to the storage providers. This is preferable
-                        # to the alternative where we call `finish()` *after* this,
-                        # where we could end up having an entry in the DB but fail
-                        # to write the files to the storage providers.
-                        try:
-                            await self.store.store_remote_media_thumbnail(
-                                server_name,
-                                media_id,
-                                file_id,
-                                t_width,
-                                t_height,
-                                t_type,
-                                t_method,
-                                t_len,
-                            )
-                        except Exception as e:
-                            thumbnail_exists = (
-                                await self.store.get_remote_media_thumbnail(
-                                    server_name,
-                                    media_id,
-                                    t_width,
-                                    t_height,
-                                    t_type,
-                                )
-                            )
-                            if not thumbnail_exists:
-                                raise e
-                    else:
-                        await self.store.store_local_thumbnail(
-                            media_id, t_width, t_height, t_type, t_method, t_len
-                        )
-
-        return {"width": m_width, "height": m_height}
-
-    async def _apply_media_retention_rules(self) -> None:
-        """
-        Purge old local and remote media according to the media retention rules
-        defined in the homeserver config.
-        """
-        # Purge remote media
-        if self._media_retention_remote_media_lifetime_ms is not None:
-            # Calculate a threshold timestamp derived from the configured lifetime. Any
-            # media that has not been accessed since this timestamp will be removed.
-            remote_media_threshold_timestamp_ms = (
-                self.clock.time_msec() - self._media_retention_remote_media_lifetime_ms
-            )
-
-            logger.info(
-                "Purging remote media last accessed before"
-                f" {remote_media_threshold_timestamp_ms}"
-            )
-
-            await self.delete_old_remote_media(
-                before_ts=remote_media_threshold_timestamp_ms
-            )
-
-        # And now do the same for local media
-        if self._media_retention_local_media_lifetime_ms is not None:
-            # This works the same as the remote media threshold
-            local_media_threshold_timestamp_ms = (
-                self.clock.time_msec() - self._media_retention_local_media_lifetime_ms
-            )
-
-            logger.info(
-                "Purging local media last accessed before"
-                f" {local_media_threshold_timestamp_ms}"
-            )
-
-            await self.delete_old_local_media(
-                before_ts=local_media_threshold_timestamp_ms,
-                keep_profiles=True,
-                delete_quarantined_media=False,
-                delete_protected_media=False,
-            )
-
-    async def delete_old_remote_media(self, before_ts: int) -> Dict[str, int]:
-        old_media = await self.store.get_remote_media_ids(
-            before_ts, include_quarantined_media=False
-        )
-
-        deleted = 0
-
-        for media in old_media:
-            origin = media["media_origin"]
-            media_id = media["media_id"]
-            file_id = media["filesystem_id"]
-            key = (origin, media_id)
-
-            logger.info("Deleting: %r", key)
-
-            # TODO: Should we delete from the backup store
-
-            async with self.remote_media_linearizer.queue(key):
-                full_path = self.filepaths.remote_media_filepath(origin, file_id)
-                try:
-                    os.remove(full_path)
-                except OSError as e:
-                    logger.warning("Failed to remove file: %r", full_path)
-                    if e.errno == errno.ENOENT:
-                        pass
-                    else:
-                        continue
-
-                thumbnail_dir = self.filepaths.remote_media_thumbnail_dir(
-                    origin, file_id
-                )
-                shutil.rmtree(thumbnail_dir, ignore_errors=True)
-
-                await self.store.delete_remote_media(origin, media_id)
-                deleted += 1
-
-        return {"deleted": deleted}
-
-    async def delete_local_media_ids(
-        self, media_ids: List[str]
-    ) -> Tuple[List[str], int]:
-        """
-        Delete the given local or remote media ID from this server
-
-        Args:
-            media_id: The media ID to delete.
-        Returns:
-            A tuple of (list of deleted media IDs, total deleted media IDs).
-        """
-        return await self._remove_local_media_from_disk(media_ids)
-
-    async def delete_old_local_media(
-        self,
-        before_ts: int,
-        size_gt: int = 0,
-        keep_profiles: bool = True,
-        delete_quarantined_media: bool = False,
-        delete_protected_media: bool = False,
-    ) -> Tuple[List[str], int]:
-        """
-        Delete local or remote media from this server by size and timestamp. Removes
-        media files, any thumbnails and cached URLs.
-
-        Args:
-            before_ts: Unix timestamp in ms.
-                Files that were last used before this timestamp will be deleted.
-            size_gt: Size of the media in bytes. Files that are larger will be deleted.
-            keep_profiles: Switch to delete also files that are still used in image data
-                (e.g user profile, room avatar). If false these files will be deleted.
-            delete_quarantined_media: If True, media marked as quarantined will be deleted.
-            delete_protected_media: If True, media marked as protected will be deleted.
-
-        Returns:
-            A tuple of (list of deleted media IDs, total deleted media IDs).
-        """
-        old_media = await self.store.get_local_media_ids(
-            before_ts,
-            size_gt,
-            keep_profiles,
-            include_quarantined_media=delete_quarantined_media,
-            include_protected_media=delete_protected_media,
-        )
-        return await self._remove_local_media_from_disk(old_media)
-
-    async def _remove_local_media_from_disk(
-        self, media_ids: List[str]
-    ) -> Tuple[List[str], int]:
-        """
-        Delete local or remote media from this server. Removes media files,
-        any thumbnails and cached URLs.
-
-        Args:
-            media_ids: List of media_id to delete
-        Returns:
-            A tuple of (list of deleted media IDs, total deleted media IDs).
-        """
-        removed_media = []
-        for media_id in media_ids:
-            logger.info("Deleting media with ID '%s'", media_id)
-            full_path = self.filepaths.local_media_filepath(media_id)
-            try:
-                os.remove(full_path)
-            except OSError as e:
-                logger.warning("Failed to remove file: %r: %s", full_path, e)
-                if e.errno == errno.ENOENT:
-                    pass
-                else:
-                    continue
-
-            thumbnail_dir = self.filepaths.local_media_thumbnail_dir(media_id)
-            shutil.rmtree(thumbnail_dir, ignore_errors=True)
-
-            await self.store.delete_remote_media(self.server_name, media_id)
-
-            await self.store.delete_url_cache((media_id,))
-            await self.store.delete_url_cache_media((media_id,))
-
-            removed_media.append(media_id)
-
-        return removed_media, len(removed_media)
-
-
-class MediaRepositoryResource(UnrecognizedRequestResource):
-    """File uploading and downloading.
-
-    Uploads are POSTed to a resource which returns a token which is used to GET
-    the download::
-
-        => POST /_matrix/media/r0/upload HTTP/1.1
-           Content-Type: <media-type>
-           Content-Length: <content-length>
-
-           <media>
-
-        <= HTTP/1.1 200 OK
-           Content-Type: application/json
-
-           { "content_uri": "mxc://<server-name>/<media-id>" }
-
-        => GET /_matrix/media/r0/download/<server-name>/<media-id> HTTP/1.1
-
-        <= HTTP/1.1 200 OK
-           Content-Type: <media-type>
-           Content-Disposition: attachment;filename=<upload-filename>
-
-           <media>
-
-    Clients can get thumbnails by supplying a desired width and height and
-    thumbnailing method::
-
-        => GET /_matrix/media/r0/thumbnail/<server_name>
-                /<media-id>?width=<w>&height=<h>&method=<m> HTTP/1.1
-
-        <= HTTP/1.1 200 OK
-           Content-Type: image/jpeg or image/png
-
-           <thumbnail>
-
-    The thumbnail methods are "crop" and "scale". "scale" tries to return an
-    image where either the width or the height is smaller than the requested
-    size. The client should then scale and letterbox the image if it needs to
-    fit within a given rectangle. "crop" tries to return an image where the
-    width and height are close to the requested size and the aspect matches
-    the requested size. The client should scale the image if it needs to fit
-    within a given rectangle.
-    """
-
-    def __init__(self, hs: "HomeServer"):
-        # If we're not configured to use it, raise if we somehow got here.
-        if not hs.config.media.can_load_media_repo:
-            raise ConfigError("Synapse is not configured to use a media repo.")
-
-        super().__init__()
-        media_repo = hs.get_media_repository()
-
-        self.putChild(b"upload", UploadResource(hs, media_repo))
-        self.putChild(b"download", DownloadResource(hs, media_repo))
-        self.putChild(
-            b"thumbnail", ThumbnailResource(hs, media_repo, media_repo.media_storage)
-        )
-        if hs.config.media.url_preview_enabled:
-            self.putChild(
-                b"preview_url",
-                PreviewUrlResource(hs, media_repo, media_repo.media_storage),
-            )
-        self.putChild(b"config", MediaConfigResource(hs))
diff --git a/synapse/rest/media/v1/media_storage.py b/synapse/rest/media/v1/media_storage.py
index db25848744..11b0e8e231 100644
--- a/synapse/rest/media/v1/media_storage.py
+++ b/synapse/rest/media/v1/media_storage.py
@@ -1,4 +1,4 @@
-# Copyright 2018-2021 The Matrix.org Foundation C.I.C.
+# Copyright 2023 The Matrix.org Foundation C.I.C.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -11,364 +11,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-import contextlib
-import logging
-import os
-import shutil
-from types import TracebackType
-from typing import (
-    IO,
-    TYPE_CHECKING,
-    Any,
-    Awaitable,
-    BinaryIO,
-    Callable,
-    Generator,
-    Optional,
-    Sequence,
-    Tuple,
-    Type,
-)
-
-import attr
-
-from twisted.internet.defer import Deferred
-from twisted.internet.interfaces import IConsumer
-from twisted.protocols.basic import FileSender
-
-import synapse
-from synapse.api.errors import NotFoundError
-from synapse.logging.context import defer_to_thread, make_deferred_yieldable
-from synapse.util import Clock
-from synapse.util.file_consumer import BackgroundFileConsumer
-
-from ._base import FileInfo, Responder
-from .filepath import MediaFilePaths
-
-if TYPE_CHECKING:
-    from synapse.rest.media.v1.storage_provider import StorageProvider
-    from synapse.server import HomeServer
-
-logger = logging.getLogger(__name__)
-
-
-class MediaStorage:
-    """Responsible for storing/fetching files from local sources.
-
-    Args:
-        hs
-        local_media_directory: Base path where we store media on disk
-        filepaths
-        storage_providers: List of StorageProvider that are used to fetch and store files.
-    """
-
-    def __init__(
-        self,
-        hs: "HomeServer",
-        local_media_directory: str,
-        filepaths: MediaFilePaths,
-        storage_providers: Sequence["StorageProvider"],
-    ):
-        self.hs = hs
-        self.reactor = hs.get_reactor()
-        self.local_media_directory = local_media_directory
-        self.filepaths = filepaths
-        self.storage_providers = storage_providers
-        self.spam_checker = hs.get_spam_checker()
-        self.clock = hs.get_clock()
-
-    async def store_file(self, source: IO, file_info: FileInfo) -> str:
-        """Write `source` to the on disk media store, and also any other
-        configured storage providers
-
-        Args:
-            source: A file like object that should be written
-            file_info: Info about the file to store
-
-        Returns:
-            the file path written to in the primary media store
-        """
-
-        with self.store_into_file(file_info) as (f, fname, finish_cb):
-            # Write to the main repository
-            await self.write_to_file(source, f)
-            await finish_cb()
-
-        return fname
-
-    async def write_to_file(self, source: IO, output: IO) -> None:
-        """Asynchronously write the `source` to `output`."""
-        await defer_to_thread(self.reactor, _write_file_synchronously, source, output)
-
-    @contextlib.contextmanager
-    def store_into_file(
-        self, file_info: FileInfo
-    ) -> Generator[Tuple[BinaryIO, str, Callable[[], Awaitable[None]]], None, None]:
-        """Context manager used to get a file like object to write into, as
-        described by file_info.
-
-        Actually yields a 3-tuple (file, fname, finish_cb), where file is a file
-        like object that can be written to, fname is the absolute path of file
-        on disk, and finish_cb is a function that returns an awaitable.
-
-        fname can be used to read the contents from after upload, e.g. to
-        generate thumbnails.
-
-        finish_cb must be called and waited on after the file has been
-        successfully been written to. Should not be called if there was an
-        error.
-
-        Args:
-            file_info: Info about the file to store
-
-        Example:
-
-            with media_storage.store_into_file(info) as (f, fname, finish_cb):
-                # .. write into f ...
-                await finish_cb()
-        """
-
-        path = self._file_info_to_path(file_info)
-        fname = os.path.join(self.local_media_directory, path)
-
-        dirname = os.path.dirname(fname)
-        os.makedirs(dirname, exist_ok=True)
-
-        finished_called = [False]
-
-        try:
-            with open(fname, "wb") as f:
-
-                async def finish() -> None:
-                    # Ensure that all writes have been flushed and close the
-                    # file.
-                    f.flush()
-                    f.close()
-
-                    spam_check = await self.spam_checker.check_media_file_for_spam(
-                        ReadableFileWrapper(self.clock, fname), file_info
-                    )
-                    if spam_check != synapse.module_api.NOT_SPAM:
-                        logger.info("Blocking media due to spam checker")
-                        # Note that we'll delete the stored media, due to the
-                        # try/except below. The media also won't be stored in
-                        # the DB.
-                        # We currently ignore any additional field returned by
-                        # the spam-check API.
-                        raise SpamMediaException(errcode=spam_check[0])
-
-                    for provider in self.storage_providers:
-                        await provider.store_file(path, file_info)
-
-                    finished_called[0] = True
-
-                yield f, fname, finish
-        except Exception as e:
-            try:
-                os.remove(fname)
-            except Exception:
-                pass
-
-            raise e from None
-
-        if not finished_called:
-            raise Exception("Finished callback not called")
-
-    async def fetch_media(self, file_info: FileInfo) -> Optional[Responder]:
-        """Attempts to fetch media described by file_info from the local cache
-        and configured storage providers.
-
-        Args:
-            file_info
-
-        Returns:
-            Returns a Responder if the file was found, otherwise None.
-        """
-        paths = [self._file_info_to_path(file_info)]
-
-        # fallback for remote thumbnails with no method in the filename
-        if file_info.thumbnail and file_info.server_name:
-            paths.append(
-                self.filepaths.remote_media_thumbnail_rel_legacy(
-                    server_name=file_info.server_name,
-                    file_id=file_info.file_id,
-                    width=file_info.thumbnail.width,
-                    height=file_info.thumbnail.height,
-                    content_type=file_info.thumbnail.type,
-                )
-            )
-
-        for path in paths:
-            local_path = os.path.join(self.local_media_directory, path)
-            if os.path.exists(local_path):
-                logger.debug("responding with local file %s", local_path)
-                return FileResponder(open(local_path, "rb"))
-            logger.debug("local file %s did not exist", local_path)
-
-        for provider in self.storage_providers:
-            for path in paths:
-                res: Any = await provider.fetch(path, file_info)
-                if res:
-                    logger.debug("Streaming %s from %s", path, provider)
-                    return res
-                logger.debug("%s not found on %s", path, provider)
-
-        return None
-
-    async def ensure_media_is_in_local_cache(self, file_info: FileInfo) -> str:
-        """Ensures that the given file is in the local cache. Attempts to
-        download it from storage providers if it isn't.
-
-        Args:
-            file_info
-
-        Returns:
-            Full path to local file
-        """
-        path = self._file_info_to_path(file_info)
-        local_path = os.path.join(self.local_media_directory, path)
-        if os.path.exists(local_path):
-            return local_path
-
-        # Fallback for paths without method names
-        # Should be removed in the future
-        if file_info.thumbnail and file_info.server_name:
-            legacy_path = self.filepaths.remote_media_thumbnail_rel_legacy(
-                server_name=file_info.server_name,
-                file_id=file_info.file_id,
-                width=file_info.thumbnail.width,
-                height=file_info.thumbnail.height,
-                content_type=file_info.thumbnail.type,
-            )
-            legacy_local_path = os.path.join(self.local_media_directory, legacy_path)
-            if os.path.exists(legacy_local_path):
-                return legacy_local_path
-
-        dirname = os.path.dirname(local_path)
-        os.makedirs(dirname, exist_ok=True)
-
-        for provider in self.storage_providers:
-            res: Any = await provider.fetch(path, file_info)
-            if res:
-                with res:
-                    consumer = BackgroundFileConsumer(
-                        open(local_path, "wb"), self.reactor
-                    )
-                    await res.write_to_consumer(consumer)
-                    await consumer.wait()
-                return local_path
-
-        raise NotFoundError()
-
-    def _file_info_to_path(self, file_info: FileInfo) -> str:
-        """Converts file_info into a relative path.
-
-        The path is suitable for storing files under a directory, e.g. used to
-        store files on local FS under the base media repository directory.
-        """
-        if file_info.url_cache:
-            if file_info.thumbnail:
-                return self.filepaths.url_cache_thumbnail_rel(
-                    media_id=file_info.file_id,
-                    width=file_info.thumbnail.width,
-                    height=file_info.thumbnail.height,
-                    content_type=file_info.thumbnail.type,
-                    method=file_info.thumbnail.method,
-                )
-            return self.filepaths.url_cache_filepath_rel(file_info.file_id)
-
-        if file_info.server_name:
-            if file_info.thumbnail:
-                return self.filepaths.remote_media_thumbnail_rel(
-                    server_name=file_info.server_name,
-                    file_id=file_info.file_id,
-                    width=file_info.thumbnail.width,
-                    height=file_info.thumbnail.height,
-                    content_type=file_info.thumbnail.type,
-                    method=file_info.thumbnail.method,
-                )
-            return self.filepaths.remote_media_filepath_rel(
-                file_info.server_name, file_info.file_id
-            )
-
-        if file_info.thumbnail:
-            return self.filepaths.local_media_thumbnail_rel(
-                media_id=file_info.file_id,
-                width=file_info.thumbnail.width,
-                height=file_info.thumbnail.height,
-                content_type=file_info.thumbnail.type,
-                method=file_info.thumbnail.method,
-            )
-        return self.filepaths.local_media_filepath_rel(file_info.file_id)
-
-
-def _write_file_synchronously(source: IO, dest: IO) -> None:
-    """Write `source` to the file like `dest` synchronously. Should be called
-    from a thread.
-
-    Args:
-        source: A file like object that's to be written
-        dest: A file like object to be written to
-    """
-    source.seek(0)  # Ensure we read from the start of the file
-    shutil.copyfileobj(source, dest)
-
-
-class FileResponder(Responder):
-    """Wraps an open file that can be sent to a request.
-
-    Args:
-        open_file: A file like object to be streamed ot the client,
-            is closed when finished streaming.
-    """
-
-    def __init__(self, open_file: IO):
-        self.open_file = open_file
-
-    def write_to_consumer(self, consumer: IConsumer) -> Deferred:
-        return make_deferred_yieldable(
-            FileSender().beginFileTransfer(self.open_file, consumer)
-        )
-
-    def __exit__(
-        self,
-        exc_type: Optional[Type[BaseException]],
-        exc_val: Optional[BaseException],
-        exc_tb: Optional[TracebackType],
-    ) -> None:
-        self.open_file.close()
-
-
-class SpamMediaException(NotFoundError):
-    """The media was blocked by a spam checker, so we simply 404 the request (in
-    the same way as if it was quarantined).
-    """
-
-
-@attr.s(slots=True, auto_attribs=True)
-class ReadableFileWrapper:
-    """Wrapper that allows reading a file in chunks, yielding to the reactor,
-    and writing to a callback.
-
-    This is simplified `FileSender` that takes an IO object rather than an
-    `IConsumer`.
-    """
-
-    CHUNK_SIZE = 2**14
-
-    clock: Clock
-    path: str
-
-    async def write_chunks_to(self, callback: Callable[[bytes], object]) -> None:
-        """Reads the file in chunks and calls the callback with each chunk."""
-
-        with open(self.path, "rb") as file:
-            while True:
-                chunk = file.read(self.CHUNK_SIZE)
-                if not chunk:
-                    break
-
-                callback(chunk)
+#
 
-                # We yield to the reactor by sleeping for 0 seconds.
-                await self.clock.sleep(0)
+# This exists purely for backwards compatibility with spam checkers.
+from synapse.media.media_storage import ReadableFileWrapper  # noqa: F401
diff --git a/synapse/rest/media/v1/oembed.py b/synapse/rest/media/v1/oembed.py
deleted file mode 100644
index 7592aa5d47..0000000000
--- a/synapse/rest/media/v1/oembed.py
+++ /dev/null
@@ -1,265 +0,0 @@
-#  Copyright 2021 The Matrix.org Foundation C.I.C.
-#
-#  Licensed under the Apache License, Version 2.0 (the "License");
-#  you may not use this file except in compliance with the License.
-#  You may obtain a copy of the License at
-#
-#      http://www.apache.org/licenses/LICENSE-2.0
-#
-#  Unless required by applicable law or agreed to in writing, software
-#  distributed under the License is distributed on an "AS IS" BASIS,
-#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#  See the License for the specific language governing permissions and
-#  limitations under the License.
-import html
-import logging
-import urllib.parse
-from typing import TYPE_CHECKING, List, Optional
-
-import attr
-
-from synapse.rest.media.v1.preview_html import parse_html_description
-from synapse.types import JsonDict
-from synapse.util import json_decoder
-
-if TYPE_CHECKING:
-    from lxml import etree
-
-    from synapse.server import HomeServer
-
-logger = logging.getLogger(__name__)
-
-
-@attr.s(slots=True, frozen=True, auto_attribs=True)
-class OEmbedResult:
-    # The Open Graph result (converted from the oEmbed result).
-    open_graph_result: JsonDict
-    # The author_name of the oEmbed result
-    author_name: Optional[str]
-    # Number of milliseconds to cache the content, according to the oEmbed response.
-    #
-    # This will be None if no cache-age is provided in the oEmbed response (or
-    # if the oEmbed response cannot be turned into an Open Graph response).
-    cache_age: Optional[int]
-
-
-class OEmbedProvider:
-    """
-    A helper for accessing oEmbed content.
-
-    It can be used to check if a URL should be accessed via oEmbed and for
-    requesting/parsing oEmbed content.
-    """
-
-    def __init__(self, hs: "HomeServer"):
-        self._oembed_patterns = {}
-        for oembed_endpoint in hs.config.oembed.oembed_patterns:
-            api_endpoint = oembed_endpoint.api_endpoint
-
-            # Only JSON is supported at the moment. This could be declared in
-            # the formats field. Otherwise, if the endpoint ends in .xml assume
-            # it doesn't support JSON.
-            if (
-                oembed_endpoint.formats is not None
-                and "json" not in oembed_endpoint.formats
-            ) or api_endpoint.endswith(".xml"):
-                logger.info(
-                    "Ignoring oEmbed endpoint due to not supporting JSON: %s",
-                    api_endpoint,
-                )
-                continue
-
-            # Iterate through each URL pattern and point it to the endpoint.
-            for pattern in oembed_endpoint.url_patterns:
-                self._oembed_patterns[pattern] = api_endpoint
-
-    def get_oembed_url(self, url: str) -> Optional[str]:
-        """
-        Check whether the URL should be downloaded as oEmbed content instead.
-
-        Args:
-            url: The URL to check.
-
-        Returns:
-            A URL to use instead or None if the original URL should be used.
-        """
-        for url_pattern, endpoint in self._oembed_patterns.items():
-            if url_pattern.fullmatch(url):
-                # TODO Specify max height / width.
-
-                # Note that only the JSON format is supported, some endpoints want
-                # this in the URL, others want it as an argument.
-                endpoint = endpoint.replace("{format}", "json")
-
-                args = {"url": url, "format": "json"}
-                query_str = urllib.parse.urlencode(args, True)
-                return f"{endpoint}?{query_str}"
-
-        # No match.
-        return None
-
-    def autodiscover_from_html(self, tree: "etree.Element") -> Optional[str]:
-        """
-        Search an HTML document for oEmbed autodiscovery information.
-
-        Args:
-            tree: The parsed HTML body.
-
-        Returns:
-            The URL to use for oEmbed information, or None if no URL was found.
-        """
-        # Search for link elements with the proper rel and type attributes.
-        for tag in tree.xpath(
-            "//link[@rel='alternate'][@type='application/json+oembed']"
-        ):
-            if "href" in tag.attrib:
-                return tag.attrib["href"]
-
-        # Some providers (e.g. Flickr) use alternative instead of alternate.
-        for tag in tree.xpath(
-            "//link[@rel='alternative'][@type='application/json+oembed']"
-        ):
-            if "href" in tag.attrib:
-                return tag.attrib["href"]
-
-        return None
-
-    def parse_oembed_response(self, url: str, raw_body: bytes) -> OEmbedResult:
-        """
-        Parse the oEmbed response into an Open Graph response.
-
-        Args:
-            url: The URL which is being previewed (not the one which was
-                requested).
-            raw_body: The oEmbed response as JSON encoded as bytes.
-
-        Returns:
-            json-encoded Open Graph data
-        """
-
-        try:
-            # oEmbed responses *must* be UTF-8 according to the spec.
-            oembed = json_decoder.decode(raw_body.decode("utf-8"))
-        except ValueError:
-            return OEmbedResult({}, None, None)
-
-        # The version is a required string field, but not always provided,
-        # or sometimes provided as a float. Be lenient.
-        oembed_version = oembed.get("version", "1.0")
-        if oembed_version != "1.0" and oembed_version != 1:
-            return OEmbedResult({}, None, None)
-
-        # Attempt to parse the cache age, if possible.
-        try:
-            cache_age = int(oembed.get("cache_age")) * 1000
-        except (TypeError, ValueError):
-            # If the cache age cannot be parsed (e.g. wrong type or invalid
-            # string), ignore it.
-            cache_age = None
-
-        # The oEmbed response converted to Open Graph.
-        open_graph_response: JsonDict = {"og:url": url}
-
-        title = oembed.get("title")
-        if title and isinstance(title, str):
-            # A common WordPress plug-in seems to incorrectly escape entities
-            # in the oEmbed response.
-            open_graph_response["og:title"] = html.unescape(title)
-
-        author_name = oembed.get("author_name")
-        if not isinstance(author_name, str):
-            author_name = None
-
-        # Use the provider name and as the site.
-        provider_name = oembed.get("provider_name")
-        if provider_name and isinstance(provider_name, str):
-            open_graph_response["og:site_name"] = provider_name
-
-        # If a thumbnail exists, use it. Note that dimensions will be calculated later.
-        thumbnail_url = oembed.get("thumbnail_url")
-        if thumbnail_url and isinstance(thumbnail_url, str):
-            open_graph_response["og:image"] = thumbnail_url
-
-        # Process each type separately.
-        oembed_type = oembed.get("type")
-        if oembed_type == "rich":
-            html_str = oembed.get("html")
-            if isinstance(html_str, str):
-                calc_description_and_urls(open_graph_response, html_str)
-
-        elif oembed_type == "photo":
-            # If this is a photo, use the full image, not the thumbnail.
-            url = oembed.get("url")
-            if url and isinstance(url, str):
-                open_graph_response["og:image"] = url
-
-        elif oembed_type == "video":
-            open_graph_response["og:type"] = "video.other"
-            html_str = oembed.get("html")
-            if html_str and isinstance(html_str, str):
-                calc_description_and_urls(open_graph_response, oembed["html"])
-            for size in ("width", "height"):
-                val = oembed.get(size)
-                if type(val) is int:
-                    open_graph_response[f"og:video:{size}"] = val
-
-        elif oembed_type == "link":
-            open_graph_response["og:type"] = "website"
-
-        else:
-            logger.warning("Unknown oEmbed type: %s", oembed_type)
-
-        return OEmbedResult(open_graph_response, author_name, cache_age)
-
-
-def _fetch_urls(tree: "etree.Element", tag_name: str) -> List[str]:
-    results = []
-    for tag in tree.xpath("//*/" + tag_name):
-        if "src" in tag.attrib:
-            results.append(tag.attrib["src"])
-    return results
-
-
-def calc_description_and_urls(open_graph_response: JsonDict, html_body: str) -> None:
-    """
-    Calculate description for an HTML document.
-
-    This uses lxml to convert the HTML document into plaintext. If errors
-    occur during processing of the document, an empty response is returned.
-
-    Args:
-        open_graph_response: The current Open Graph summary. This is updated with additional fields.
-        html_body: The HTML document, as bytes.
-
-    Returns:
-        The summary
-    """
-    # If there's no body, nothing useful is going to be found.
-    if not html_body:
-        return
-
-    from lxml import etree
-
-    # Create an HTML parser. If this fails, log and return no metadata.
-    parser = etree.HTMLParser(recover=True, encoding="utf-8")
-
-    # Attempt to parse the body. If this fails, log and return no metadata.
-    tree = etree.fromstring(html_body, parser)
-
-    # The data was successfully parsed, but no tree was found.
-    if tree is None:
-        return
-
-    # Attempt to find interesting URLs (images, videos, embeds).
-    if "og:image" not in open_graph_response:
-        image_urls = _fetch_urls(tree, "img")
-        if image_urls:
-            open_graph_response["og:image"] = image_urls[0]
-
-    video_urls = _fetch_urls(tree, "video") + _fetch_urls(tree, "embed")
-    if video_urls:
-        open_graph_response["og:video"] = video_urls[0]
-
-    description = parse_html_description(tree)
-    if description:
-        open_graph_response["og:description"] = description
diff --git a/synapse/rest/media/v1/preview_html.py b/synapse/rest/media/v1/preview_html.py
deleted file mode 100644
index 516d0434f0..0000000000
--- a/synapse/rest/media/v1/preview_html.py
+++ /dev/null
@@ -1,501 +0,0 @@
-# Copyright 2021 The Matrix.org Foundation C.I.C.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import codecs
-import logging
-import re
-from typing import (
-    TYPE_CHECKING,
-    Callable,
-    Dict,
-    Generator,
-    Iterable,
-    List,
-    Optional,
-    Set,
-    Union,
-)
-
-if TYPE_CHECKING:
-    from lxml import etree
-
-logger = logging.getLogger(__name__)
-
-_charset_match = re.compile(
-    rb'<\s*meta[^>]*charset\s*=\s*"?([a-z0-9_-]+)"?', flags=re.I
-)
-_xml_encoding_match = re.compile(
-    rb'\s*<\s*\?\s*xml[^>]*encoding="([a-z0-9_-]+)"', flags=re.I
-)
-_content_type_match = re.compile(r'.*; *charset="?(.*?)"?(;|$)', flags=re.I)
-
-# Certain elements aren't meant for display.
-ARIA_ROLES_TO_IGNORE = {"directory", "menu", "menubar", "toolbar"}
-
-
-def _normalise_encoding(encoding: str) -> Optional[str]:
-    """Use the Python codec's name as the normalised entry."""
-    try:
-        return codecs.lookup(encoding).name
-    except LookupError:
-        return None
-
-
-def _get_html_media_encodings(
-    body: bytes, content_type: Optional[str]
-) -> Iterable[str]:
-    """
-    Get potential encoding of the body based on the (presumably) HTML body or the content-type header.
-
-    The precedence used for finding a character encoding is:
-
-    1. <meta> tag with a charset declared.
-    2. The XML document's character encoding attribute.
-    3. The Content-Type header.
-    4. Fallback to utf-8.
-    5. Fallback to windows-1252.
-
-    This roughly follows the algorithm used by BeautifulSoup's bs4.dammit.EncodingDetector.
-
-    Args:
-        body: The HTML document, as bytes.
-        content_type: The Content-Type header.
-
-    Returns:
-        The character encoding of the body, as a string.
-    """
-    # There's no point in returning an encoding more than once.
-    attempted_encodings: Set[str] = set()
-
-    # Limit searches to the first 1kb, since it ought to be at the top.
-    body_start = body[:1024]
-
-    # Check if it has an encoding set in a meta tag.
-    match = _charset_match.search(body_start)
-    if match:
-        encoding = _normalise_encoding(match.group(1).decode("ascii"))
-        if encoding:
-            attempted_encodings.add(encoding)
-            yield encoding
-
-    # TODO Support <meta http-equiv="Content-Type" content="text/html; charset=utf-8"/>
-
-    # Check if it has an XML document with an encoding.
-    match = _xml_encoding_match.match(body_start)
-    if match:
-        encoding = _normalise_encoding(match.group(1).decode("ascii"))
-        if encoding and encoding not in attempted_encodings:
-            attempted_encodings.add(encoding)
-            yield encoding
-
-    # Check the HTTP Content-Type header for a character set.
-    if content_type:
-        content_match = _content_type_match.match(content_type)
-        if content_match:
-            encoding = _normalise_encoding(content_match.group(1))
-            if encoding and encoding not in attempted_encodings:
-                attempted_encodings.add(encoding)
-                yield encoding
-
-    # Finally, fallback to UTF-8, then windows-1252.
-    for fallback in ("utf-8", "cp1252"):
-        if fallback not in attempted_encodings:
-            yield fallback
-
-
-def decode_body(
-    body: bytes, uri: str, content_type: Optional[str] = None
-) -> Optional["etree.Element"]:
-    """
-    This uses lxml to parse the HTML document.
-
-    Args:
-        body: The HTML document, as bytes.
-        uri: The URI used to download the body.
-        content_type: The Content-Type header.
-
-    Returns:
-        The parsed HTML body, or None if an error occurred during processed.
-    """
-    # If there's no body, nothing useful is going to be found.
-    if not body:
-        return None
-
-    # The idea here is that multiple encodings are tried until one works.
-    # Unfortunately the result is never used and then LXML will decode the string
-    # again with the found encoding.
-    for encoding in _get_html_media_encodings(body, content_type):
-        try:
-            body.decode(encoding)
-        except Exception:
-            pass
-        else:
-            break
-    else:
-        logger.warning("Unable to decode HTML body for %s", uri)
-        return None
-
-    from lxml import etree
-
-    # Create an HTML parser.
-    parser = etree.HTMLParser(recover=True, encoding=encoding)
-
-    # Attempt to parse the body. Returns None if the body was successfully
-    # parsed, but no tree was found.
-    return etree.fromstring(body, parser)
-
-
-def _get_meta_tags(
-    tree: "etree.Element",
-    property: str,
-    prefix: str,
-    property_mapper: Optional[Callable[[str], Optional[str]]] = None,
-) -> Dict[str, Optional[str]]:
-    """
-    Search for meta tags prefixed with a particular string.
-
-    Args:
-        tree: The parsed HTML document.
-        property: The name of the property which contains the tag name, e.g.
-            "property" for Open Graph.
-        prefix: The prefix on the property to search for, e.g. "og" for Open Graph.
-        property_mapper: An optional callable to map the property to the Open Graph
-            form. Can return None for a key to ignore that key.
-
-    Returns:
-        A map of tag name to value.
-    """
-    results: Dict[str, Optional[str]] = {}
-    for tag in tree.xpath(
-        f"//*/meta[starts-with(@{property}, '{prefix}:')][@content][not(@content='')]"
-    ):
-        # if we've got more than 50 tags, someone is taking the piss
-        if len(results) >= 50:
-            logger.warning(
-                "Skipping parsing of Open Graph for page with too many '%s:' tags",
-                prefix,
-            )
-            return {}
-
-        key = tag.attrib[property]
-        if property_mapper:
-            key = property_mapper(key)
-            # None is a special value used to ignore a value.
-            if key is None:
-                continue
-
-        results[key] = tag.attrib["content"]
-
-    return results
-
-
-def _map_twitter_to_open_graph(key: str) -> Optional[str]:
-    """
-    Map a Twitter card property to the analogous Open Graph property.
-
-    Args:
-        key: The Twitter card property (starts with "twitter:").
-
-    Returns:
-        The Open Graph property (starts with "og:") or None to have this property
-        be ignored.
-    """
-    # Twitter card properties with no analogous Open Graph property.
-    if key == "twitter:card" or key == "twitter:creator":
-        return None
-    if key == "twitter:site":
-        return "og:site_name"
-    # Otherwise, swap twitter to og.
-    return "og" + key[7:]
-
-
-def parse_html_to_open_graph(tree: "etree.Element") -> Dict[str, Optional[str]]:
-    """
-    Parse the HTML document into an Open Graph response.
-
-    This uses lxml to search the HTML document for Open Graph data (or
-    synthesizes it from the document).
-
-    Args:
-        tree: The parsed HTML document.
-
-    Returns:
-        The Open Graph response as a dictionary.
-    """
-
-    # Search for Open Graph (og:) meta tags, e.g.:
-    #
-    # "og:type"         : "video",
-    # "og:url"          : "https://www.youtube.com/watch?v=LXDBoHyjmtw",
-    # "og:site_name"    : "YouTube",
-    # "og:video:type"   : "application/x-shockwave-flash",
-    # "og:description"  : "Fun stuff happening here",
-    # "og:title"        : "RemoteJam - Matrix team hack for Disrupt Europe Hackathon",
-    # "og:image"        : "https://i.ytimg.com/vi/LXDBoHyjmtw/maxresdefault.jpg",
-    # "og:video:url"    : "http://www.youtube.com/v/LXDBoHyjmtw?version=3&autohide=1",
-    # "og:video:width"  : "1280"
-    # "og:video:height" : "720",
-    # "og:video:secure_url": "https://www.youtube.com/v/LXDBoHyjmtw?version=3",
-
-    og = _get_meta_tags(tree, "property", "og")
-
-    # TODO: Search for properties specific to the different Open Graph types,
-    # such as article: meta tags, e.g.:
-    #
-    # "article:publisher" : "https://www.facebook.com/thethudonline" />
-    # "article:author" content="https://www.facebook.com/thethudonline" />
-    # "article:tag" content="baby" />
-    # "article:section" content="Breaking News" />
-    # "article:published_time" content="2016-03-31T19:58:24+00:00" />
-    # "article:modified_time" content="2016-04-01T18:31:53+00:00" />
-
-    # Search for Twitter Card (twitter:) meta tags, e.g.:
-    #
-    # "twitter:site"    : "@matrixdotorg"
-    # "twitter:creator" : "@matrixdotorg"
-    #
-    # Twitter cards tags also duplicate Open Graph tags.
-    #
-    # See https://developer.twitter.com/en/docs/twitter-for-websites/cards/guides/getting-started
-    twitter = _get_meta_tags(tree, "name", "twitter", _map_twitter_to_open_graph)
-    # Merge the Twitter values with the Open Graph values, but do not overwrite
-    # information from Open Graph tags.
-    for key, value in twitter.items():
-        if key not in og:
-            og[key] = value
-
-    if "og:title" not in og:
-        # Attempt to find a title from the title tag, or the biggest header on the page.
-        title = tree.xpath("((//title)[1] | (//h1)[1] | (//h2)[1] | (//h3)[1])/text()")
-        if title:
-            og["og:title"] = title[0].strip()
-        else:
-            og["og:title"] = None
-
-    if "og:image" not in og:
-        meta_image = tree.xpath(
-            "//*/meta[translate(@itemprop, 'IMAGE', 'image')='image'][not(@content='')]/@content[1]"
-        )
-        # If a meta image is found, use it.
-        if meta_image:
-            og["og:image"] = meta_image[0]
-        else:
-            # Try to find images which are larger than 10px by 10px.
-            #
-            # TODO: consider inlined CSS styles as well as width & height attribs
-            images = tree.xpath("//img[@src][number(@width)>10][number(@height)>10]")
-            images = sorted(
-                images,
-                key=lambda i: (
-                    -1 * float(i.attrib["width"]) * float(i.attrib["height"])
-                ),
-            )
-            # If no images were found, try to find *any* images.
-            if not images:
-                images = tree.xpath("//img[@src][1]")
-            if images:
-                og["og:image"] = images[0].attrib["src"]
-
-            # Finally, fallback to the favicon if nothing else.
-            else:
-                favicons = tree.xpath("//link[@href][contains(@rel, 'icon')]/@href[1]")
-                if favicons:
-                    og["og:image"] = favicons[0]
-
-    if "og:description" not in og:
-        # Check the first meta description tag for content.
-        meta_description = tree.xpath(
-            "//*/meta[translate(@name, 'DESCRIPTION', 'description')='description'][not(@content='')]/@content[1]"
-        )
-        # If a meta description is found with content, use it.
-        if meta_description:
-            og["og:description"] = meta_description[0]
-        else:
-            og["og:description"] = parse_html_description(tree)
-    elif og["og:description"]:
-        # This must be a non-empty string at this point.
-        assert isinstance(og["og:description"], str)
-        og["og:description"] = summarize_paragraphs([og["og:description"]])
-
-    # TODO: delete the url downloads to stop diskfilling,
-    # as we only ever cared about its OG
-    return og
-
-
-def parse_html_description(tree: "etree.Element") -> Optional[str]:
-    """
-    Calculate a text description based on an HTML document.
-
-    Grabs any text nodes which are inside the <body/> tag, unless they are within
-    an HTML5 semantic markup tag (<header/>, <nav/>, <aside/>, <footer/>), or
-    if they are within a <script/>, <svg/> or <style/> tag, or if they are within
-    a tag whose content is usually only shown to old browsers
-    (<iframe/>, <video/>, <canvas/>, <picture/>).
-
-    This is a very very very coarse approximation to a plain text render of the page.
-
-    Args:
-        tree: The parsed HTML document.
-
-    Returns:
-        The plain text description, or None if one cannot be generated.
-    """
-    # We don't just use XPATH here as that is slow on some machines.
-
-    from lxml import etree
-
-    TAGS_TO_REMOVE = {
-        "header",
-        "nav",
-        "aside",
-        "footer",
-        "script",
-        "noscript",
-        "style",
-        "svg",
-        "iframe",
-        "video",
-        "canvas",
-        "img",
-        "picture",
-        etree.Comment,
-    }
-
-    # Split all the text nodes into paragraphs (by splitting on new
-    # lines)
-    text_nodes = (
-        re.sub(r"\s+", "\n", el).strip()
-        for el in _iterate_over_text(tree.find("body"), TAGS_TO_REMOVE)
-    )
-    return summarize_paragraphs(text_nodes)
-
-
-def _iterate_over_text(
-    tree: Optional["etree.Element"],
-    tags_to_ignore: Set[Union[str, "etree.Comment"]],
-    stack_limit: int = 1024,
-) -> Generator[str, None, None]:
-    """Iterate over the tree returning text nodes in a depth first fashion,
-    skipping text nodes inside certain tags.
-
-    Args:
-        tree: The parent element to iterate. Can be None if there isn't one.
-        tags_to_ignore: Set of tags to ignore
-        stack_limit: Maximum stack size limit for depth-first traversal.
-            Nodes will be dropped if this limit is hit, which may truncate the
-            textual result.
-            Intended to limit the maximum working memory when generating a preview.
-    """
-
-    if tree is None:
-        return
-
-    # This is a stack whose items are elements to iterate over *or* strings
-    # to be returned.
-    elements: List[Union[str, "etree.Element"]] = [tree]
-    while elements:
-        el = elements.pop()
-
-        if isinstance(el, str):
-            yield el
-        elif el.tag not in tags_to_ignore:
-            # If the element isn't meant for display, ignore it.
-            if el.get("role") in ARIA_ROLES_TO_IGNORE:
-                continue
-
-            # el.text is the text before the first child, so we can immediately
-            # return it if the text exists.
-            if el.text:
-                yield el.text
-
-            # We add to the stack all the element's children, interspersed with
-            # each child's tail text (if it exists).
-            #
-            # We iterate in reverse order so that earlier pieces of text appear
-            # closer to the top of the stack.
-            for child in el.iterchildren(reversed=True):
-                if len(elements) > stack_limit:
-                    # We've hit our limit for working memory
-                    break
-
-                if child.tail:
-                    # The tail text of a node is text that comes *after* the node,
-                    # so we always include it even if we ignore the child node.
-                    elements.append(child.tail)
-
-                elements.append(child)
-
-
-def summarize_paragraphs(
-    text_nodes: Iterable[str], min_size: int = 200, max_size: int = 500
-) -> Optional[str]:
-    """
-    Try to get a summary respecting first paragraph and then word boundaries.
-
-    Args:
-        text_nodes: The paragraphs to summarize.
-        min_size: The minimum number of words to include.
-        max_size: The maximum number of words to include.
-
-    Returns:
-        A summary of the text nodes, or None if that was not possible.
-    """
-
-    # TODO: Respect sentences?
-
-    description = ""
-
-    # Keep adding paragraphs until we get to the MIN_SIZE.
-    for text_node in text_nodes:
-        if len(description) < min_size:
-            text_node = re.sub(r"[\t \r\n]+", " ", text_node)
-            description += text_node + "\n\n"
-        else:
-            break
-
-    description = description.strip()
-    description = re.sub(r"[\t ]+", " ", description)
-    description = re.sub(r"[\t \r\n]*[\r\n]+", "\n\n", description)
-
-    # If the concatenation of paragraphs to get above MIN_SIZE
-    # took us over MAX_SIZE, then we need to truncate mid paragraph
-    if len(description) > max_size:
-        new_desc = ""
-
-        # This splits the paragraph into words, but keeping the
-        # (preceding) whitespace intact so we can easily concat
-        # words back together.
-        for match in re.finditer(r"\s*\S+", description):
-            word = match.group()
-
-            # Keep adding words while the total length is less than
-            # MAX_SIZE.
-            if len(word) + len(new_desc) < max_size:
-                new_desc += word
-            else:
-                # At this point the next word *will* take us over
-                # MAX_SIZE, but we also want to ensure that its not
-                # a huge word. If it is add it anyway and we'll
-                # truncate later.
-                if len(new_desc) < min_size:
-                    new_desc += word
-                break
-
-        # Double check that we're not over the limit
-        if len(new_desc) > max_size:
-            new_desc = new_desc[:max_size]
-
-        # We always add an ellipsis because at the very least
-        # we chopped mid paragraph.
-        description = new_desc.strip() + "…"
-    return description if description else None
diff --git a/synapse/rest/media/v1/preview_url_resource.py b/synapse/rest/media/v1/preview_url_resource.py
deleted file mode 100644
index 4a594ab9d8..0000000000
--- a/synapse/rest/media/v1/preview_url_resource.py
+++ /dev/null
@@ -1,871 +0,0 @@
-# Copyright 2016 OpenMarket Ltd
-# Copyright 2020-2021 The Matrix.org Foundation C.I.C.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import datetime
-import errno
-import fnmatch
-import logging
-import os
-import re
-import shutil
-import sys
-import traceback
-from typing import TYPE_CHECKING, BinaryIO, Iterable, Optional, Tuple
-from urllib.parse import urljoin, urlparse, urlsplit
-from urllib.request import urlopen
-
-import attr
-
-from twisted.internet.defer import Deferred
-from twisted.internet.error import DNSLookupError
-
-from synapse.api.errors import Codes, SynapseError
-from synapse.http.client import SimpleHttpClient
-from synapse.http.server import (
-    DirectServeJsonResource,
-    respond_with_json,
-    respond_with_json_bytes,
-)
-from synapse.http.servlet import parse_integer, parse_string
-from synapse.http.site import SynapseRequest
-from synapse.logging.context import make_deferred_yieldable, run_in_background
-from synapse.metrics.background_process_metrics import run_as_background_process
-from synapse.rest.media.v1._base import get_filename_from_headers
-from synapse.rest.media.v1.media_storage import MediaStorage
-from synapse.rest.media.v1.oembed import OEmbedProvider
-from synapse.rest.media.v1.preview_html import decode_body, parse_html_to_open_graph
-from synapse.types import JsonDict, UserID
-from synapse.util import json_encoder
-from synapse.util.async_helpers import ObservableDeferred
-from synapse.util.caches.expiringcache import ExpiringCache
-from synapse.util.stringutils import random_string
-
-from ._base import FileInfo
-
-if TYPE_CHECKING:
-    from synapse.rest.media.v1.media_repository import MediaRepository
-    from synapse.server import HomeServer
-
-logger = logging.getLogger(__name__)
-
-OG_TAG_NAME_MAXLEN = 50
-OG_TAG_VALUE_MAXLEN = 1000
-
-ONE_HOUR = 60 * 60 * 1000
-ONE_DAY = 24 * ONE_HOUR
-IMAGE_CACHE_EXPIRY_MS = 2 * ONE_DAY
-
-
-@attr.s(slots=True, frozen=True, auto_attribs=True)
-class DownloadResult:
-    length: int
-    uri: str
-    response_code: int
-    media_type: str
-    download_name: Optional[str]
-    expires: int
-    etag: Optional[str]
-
-
-@attr.s(slots=True, frozen=True, auto_attribs=True)
-class MediaInfo:
-    """
-    Information parsed from downloading media being previewed.
-    """
-
-    # The Content-Type header of the response.
-    media_type: str
-    # The length (in bytes) of the downloaded media.
-    media_length: int
-    # The media filename, according to the server. This is parsed from the
-    # returned headers, if possible.
-    download_name: Optional[str]
-    # The time of the preview.
-    created_ts_ms: int
-    # Information from the media storage provider about where the file is stored
-    # on disk.
-    filesystem_id: str
-    filename: str
-    # The URI being previewed.
-    uri: str
-    # The HTTP response code.
-    response_code: int
-    # The timestamp (in milliseconds) of when this preview expires.
-    expires: int
-    # The ETag header of the response.
-    etag: Optional[str]
-
-
-class PreviewUrlResource(DirectServeJsonResource):
-    """
-    The `GET /_matrix/media/r0/preview_url` endpoint provides a generic preview API
-    for URLs which outputs Open Graph (https://ogp.me/) responses (with some Matrix
-    specific additions).
-
-    This does have trade-offs compared to other designs:
-
-    * Pros:
-      * Simple and flexible; can be used by any clients at any point
-    * Cons:
-      * If each homeserver provides one of these independently, all the homeservers in a
-        room may needlessly DoS the target URI
-      * The URL metadata must be stored somewhere, rather than just using Matrix
-        itself to store the media.
-      * Matrix cannot be used to distribute the metadata between homeservers.
-
-    When Synapse is asked to preview a URL it does the following:
-
-    1. Checks against a URL blacklist (defined as `url_preview_url_blacklist` in the
-       config).
-    2. Checks the URL against an in-memory cache and returns the result if it exists. (This
-       is also used to de-duplicate processing of multiple in-flight requests at once.)
-    3. Kicks off a background process to generate a preview:
-       1. Checks URL and timestamp against the database cache and returns the result if it
-          has not expired and was successful (a 2xx return code).
-       2. Checks if the URL matches an oEmbed (https://oembed.com/) pattern. If it
-          does, update the URL to download.
-       3. Downloads the URL and stores it into a file via the media storage provider
-          and saves the local media metadata.
-       4. If the media is an image:
-          1. Generates thumbnails.
-          2. Generates an Open Graph response based on image properties.
-       5. If the media is HTML:
-          1. Decodes the HTML via the stored file.
-          2. Generates an Open Graph response from the HTML.
-          3. If a JSON oEmbed URL was found in the HTML via autodiscovery:
-             1. Downloads the URL and stores it into a file via the media storage provider
-                and saves the local media metadata.
-             2. Convert the oEmbed response to an Open Graph response.
-             3. Override any Open Graph data from the HTML with data from oEmbed.
-          4. If an image exists in the Open Graph response:
-             1. Downloads the URL and stores it into a file via the media storage
-                provider and saves the local media metadata.
-             2. Generates thumbnails.
-             3. Updates the Open Graph response based on image properties.
-       6. If the media is JSON and an oEmbed URL was found:
-          1. Convert the oEmbed response to an Open Graph response.
-          2. If a thumbnail or image is in the oEmbed response:
-             1. Downloads the URL and stores it into a file via the media storage
-                provider and saves the local media metadata.
-             2. Generates thumbnails.
-             3. Updates the Open Graph response based on image properties.
-       7. Stores the result in the database cache.
-    4. Returns the result.
-
-    If any additional requests (e.g. from oEmbed autodiscovery, step 5.3 or
-    image thumbnailing, step 5.4 or 6.4) fails then the URL preview as a whole
-    does not fail. As much information as possible is returned.
-
-    The in-memory cache expires after 1 hour.
-
-    Expired entries in the database cache (and their associated media files) are
-    deleted every 10 seconds. The default expiration time is 1 hour from download.
-    """
-
-    isLeaf = True
-
-    def __init__(
-        self,
-        hs: "HomeServer",
-        media_repo: "MediaRepository",
-        media_storage: MediaStorage,
-    ):
-        super().__init__()
-
-        self.auth = hs.get_auth()
-        self.clock = hs.get_clock()
-        self.filepaths = media_repo.filepaths
-        self.max_spider_size = hs.config.media.max_spider_size
-        self.server_name = hs.hostname
-        self.store = hs.get_datastores().main
-        self.client = SimpleHttpClient(
-            hs,
-            treq_args={"browser_like_redirects": True},
-            ip_whitelist=hs.config.media.url_preview_ip_range_whitelist,
-            ip_blacklist=hs.config.media.url_preview_ip_range_blacklist,
-            use_proxy=True,
-        )
-        self.media_repo = media_repo
-        self.primary_base_path = media_repo.primary_base_path
-        self.media_storage = media_storage
-
-        self._oembed = OEmbedProvider(hs)
-
-        # We run the background jobs if we're the instance specified (or no
-        # instance is specified, where we assume there is only one instance
-        # serving media).
-        instance_running_jobs = hs.config.media.media_instance_running_background_jobs
-        self._worker_run_media_background_jobs = (
-            instance_running_jobs is None
-            or instance_running_jobs == hs.get_instance_name()
-        )
-
-        self.url_preview_url_blacklist = hs.config.media.url_preview_url_blacklist
-        self.url_preview_accept_language = hs.config.media.url_preview_accept_language
-
-        # memory cache mapping urls to an ObservableDeferred returning
-        # JSON-encoded OG metadata
-        self._cache: ExpiringCache[str, ObservableDeferred] = ExpiringCache(
-            cache_name="url_previews",
-            clock=self.clock,
-            # don't spider URLs more often than once an hour
-            expiry_ms=ONE_HOUR,
-        )
-
-        if self._worker_run_media_background_jobs:
-            self._cleaner_loop = self.clock.looping_call(
-                self._start_expire_url_cache_data, 10 * 1000
-            )
-
-    async def _async_render_OPTIONS(self, request: SynapseRequest) -> None:
-        request.setHeader(b"Allow", b"OPTIONS, GET")
-        respond_with_json(request, 200, {}, send_cors=True)
-
-    async def _async_render_GET(self, request: SynapseRequest) -> None:
-        # XXX: if get_user_by_req fails, what should we do in an async render?
-        requester = await self.auth.get_user_by_req(request)
-        url = parse_string(request, "url", required=True)
-        ts = parse_integer(request, "ts")
-        if ts is None:
-            ts = self.clock.time_msec()
-
-        # XXX: we could move this into _do_preview if we wanted.
-        url_tuple = urlsplit(url)
-        for entry in self.url_preview_url_blacklist:
-            match = True
-            for attrib in entry:
-                pattern = entry[attrib]
-                value = getattr(url_tuple, attrib)
-                logger.debug(
-                    "Matching attrib '%s' with value '%s' against pattern '%s'",
-                    attrib,
-                    value,
-                    pattern,
-                )
-
-                if value is None:
-                    match = False
-                    continue
-
-                # Some attributes might not be parsed as strings by urlsplit (such as the
-                # port, which is parsed as an int). Because we use match functions that
-                # expect strings, we want to make sure that's what we give them.
-                value_str = str(value)
-
-                if pattern.startswith("^"):
-                    if not re.match(pattern, value_str):
-                        match = False
-                        continue
-                else:
-                    if not fnmatch.fnmatch(value_str, pattern):
-                        match = False
-                        continue
-            if match:
-                logger.warning("URL %s blocked by url_blacklist entry %s", url, entry)
-                raise SynapseError(
-                    403, "URL blocked by url pattern blacklist entry", Codes.UNKNOWN
-                )
-
-        # the in-memory cache:
-        # * ensures that only one request is active at a time
-        # * takes load off the DB for the thundering herds
-        # * also caches any failures (unlike the DB) so we don't keep
-        #    requesting the same endpoint
-
-        observable = self._cache.get(url)
-
-        if not observable:
-            download = run_in_background(self._do_preview, url, requester.user, ts)
-            observable = ObservableDeferred(download, consumeErrors=True)
-            self._cache[url] = observable
-        else:
-            logger.info("Returning cached response")
-
-        og = await make_deferred_yieldable(observable.observe())
-        respond_with_json_bytes(request, 200, og, send_cors=True)
-
-    async def _do_preview(self, url: str, user: UserID, ts: int) -> bytes:
-        """Check the db, and download the URL and build a preview
-
-        Args:
-            url: The URL to preview.
-            user: The user requesting the preview.
-            ts: The timestamp requested for the preview.
-
-        Returns:
-            json-encoded og data
-        """
-        # check the URL cache in the DB (which will also provide us with
-        # historical previews, if we have any)
-        cache_result = await self.store.get_url_cache(url, ts)
-        if (
-            cache_result
-            and cache_result["expires_ts"] > ts
-            and cache_result["response_code"] / 100 == 2
-        ):
-            # It may be stored as text in the database, not as bytes (such as
-            # PostgreSQL). If so, encode it back before handing it on.
-            og = cache_result["og"]
-            if isinstance(og, str):
-                og = og.encode("utf8")
-            return og
-
-        # If this URL can be accessed via oEmbed, use that instead.
-        url_to_download = url
-        oembed_url = self._oembed.get_oembed_url(url)
-        if oembed_url:
-            url_to_download = oembed_url
-
-        media_info = await self._handle_url(url_to_download, user)
-
-        logger.debug("got media_info of '%s'", media_info)
-
-        # The number of milliseconds that the response should be considered valid.
-        expiration_ms = media_info.expires
-        author_name: Optional[str] = None
-
-        if _is_media(media_info.media_type):
-            file_id = media_info.filesystem_id
-            dims = await self.media_repo._generate_thumbnails(
-                None, file_id, file_id, media_info.media_type, url_cache=True
-            )
-
-            og = {
-                "og:description": media_info.download_name,
-                "og:image": f"mxc://{self.server_name}/{media_info.filesystem_id}",
-                "og:image:type": media_info.media_type,
-                "matrix:image:size": media_info.media_length,
-            }
-
-            if dims:
-                og["og:image:width"] = dims["width"]
-                og["og:image:height"] = dims["height"]
-            else:
-                logger.warning("Couldn't get dims for %s" % url)
-
-            # define our OG response for this media
-        elif _is_html(media_info.media_type):
-            # TODO: somehow stop a big HTML tree from exploding synapse's RAM
-
-            with open(media_info.filename, "rb") as file:
-                body = file.read()
-
-            tree = decode_body(body, media_info.uri, media_info.media_type)
-            if tree is not None:
-                # Check if this HTML document points to oEmbed information and
-                # defer to that.
-                oembed_url = self._oembed.autodiscover_from_html(tree)
-                og_from_oembed: JsonDict = {}
-                if oembed_url:
-                    try:
-                        oembed_info = await self._handle_url(
-                            oembed_url, user, allow_data_urls=True
-                        )
-                    except Exception as e:
-                        # Fetching the oEmbed info failed, don't block the entire URL preview.
-                        logger.warning(
-                            "oEmbed fetch failed during URL preview: %s errored with %s",
-                            oembed_url,
-                            e,
-                        )
-                    else:
-                        (
-                            og_from_oembed,
-                            author_name,
-                            expiration_ms,
-                        ) = await self._handle_oembed_response(
-                            url, oembed_info, expiration_ms
-                        )
-
-                # Parse Open Graph information from the HTML in case the oEmbed
-                # response failed or is incomplete.
-                og_from_html = parse_html_to_open_graph(tree)
-
-                # Compile the Open Graph response by using the scraped
-                # information from the HTML and overlaying any information
-                # from the oEmbed response.
-                og = {**og_from_html, **og_from_oembed}
-
-                await self._precache_image_url(user, media_info, og)
-            else:
-                og = {}
-
-        elif oembed_url:
-            # Handle the oEmbed information.
-            og, author_name, expiration_ms = await self._handle_oembed_response(
-                url, media_info, expiration_ms
-            )
-            await self._precache_image_url(user, media_info, og)
-
-        else:
-            logger.warning("Failed to find any OG data in %s", url)
-            og = {}
-
-        # If we don't have a title but we have author_name, copy it as
-        # title
-        if not og.get("og:title") and author_name:
-            og["og:title"] = author_name
-
-        # filter out any stupidly long values
-        keys_to_remove = []
-        for k, v in og.items():
-            # values can be numeric as well as strings, hence the cast to str
-            if len(k) > OG_TAG_NAME_MAXLEN or len(str(v)) > OG_TAG_VALUE_MAXLEN:
-                logger.warning(
-                    "Pruning overlong tag %s from OG data", k[:OG_TAG_NAME_MAXLEN]
-                )
-                keys_to_remove.append(k)
-        for k in keys_to_remove:
-            del og[k]
-
-        logger.debug("Calculated OG for %s as %s", url, og)
-
-        jsonog = json_encoder.encode(og)
-
-        # Cap the amount of time to consider a response valid.
-        expiration_ms = min(expiration_ms, ONE_DAY)
-
-        # store OG in history-aware DB cache
-        await self.store.store_url_cache(
-            url,
-            media_info.response_code,
-            media_info.etag,
-            media_info.created_ts_ms + expiration_ms,
-            jsonog,
-            media_info.filesystem_id,
-            media_info.created_ts_ms,
-        )
-
-        return jsonog.encode("utf8")
-
-    async def _download_url(self, url: str, output_stream: BinaryIO) -> DownloadResult:
-        """
-        Fetches a remote URL and parses the headers.
-
-        Args:
-             url: The URL to fetch.
-             output_stream: The stream to write the content to.
-
-        Returns:
-            A tuple of:
-                Media length, URL downloaded, the HTTP response code,
-                the media type, the downloaded file name, the number of
-                milliseconds the result is valid for, the etag header.
-        """
-
-        try:
-            logger.debug("Trying to get preview for url '%s'", url)
-            length, headers, uri, code = await self.client.get_file(
-                url,
-                output_stream=output_stream,
-                max_size=self.max_spider_size,
-                headers={
-                    b"Accept-Language": self.url_preview_accept_language,
-                    # Use a custom user agent for the preview because some sites will only return
-                    # Open Graph metadata to crawler user agents. Omit the Synapse version
-                    # string to avoid leaking information.
-                    b"User-Agent": [
-                        "Synapse (bot; +https://github.com/matrix-org/synapse)"
-                    ],
-                },
-                is_allowed_content_type=_is_previewable,
-            )
-        except SynapseError:
-            # Pass SynapseErrors through directly, so that the servlet
-            # handler will return a SynapseError to the client instead of
-            # blank data or a 500.
-            raise
-        except DNSLookupError:
-            # DNS lookup returned no results
-            # Note: This will also be the case if one of the resolved IP
-            # addresses is blacklisted
-            raise SynapseError(
-                502,
-                "DNS resolution failure during URL preview generation",
-                Codes.UNKNOWN,
-            )
-        except Exception as e:
-            # FIXME: pass through 404s and other error messages nicely
-            logger.warning("Error downloading %s: %r", url, e)
-
-            raise SynapseError(
-                500,
-                "Failed to download content: %s"
-                % (traceback.format_exception_only(sys.exc_info()[0], e),),
-                Codes.UNKNOWN,
-            )
-
-        if b"Content-Type" in headers:
-            media_type = headers[b"Content-Type"][0].decode("ascii")
-        else:
-            media_type = "application/octet-stream"
-
-        download_name = get_filename_from_headers(headers)
-
-        # FIXME: we should calculate a proper expiration based on the
-        # Cache-Control and Expire headers.  But for now, assume 1 hour.
-        expires = ONE_HOUR
-        etag = headers[b"ETag"][0].decode("ascii") if b"ETag" in headers else None
-
-        return DownloadResult(
-            length, uri, code, media_type, download_name, expires, etag
-        )
-
-    async def _parse_data_url(
-        self, url: str, output_stream: BinaryIO
-    ) -> DownloadResult:
-        """
-        Parses a data: URL.
-
-        Args:
-             url: The URL to parse.
-             output_stream: The stream to write the content to.
-
-        Returns:
-            A tuple of:
-                Media length, URL downloaded, the HTTP response code,
-                the media type, the downloaded file name, the number of
-                milliseconds the result is valid for, the etag header.
-        """
-
-        try:
-            logger.debug("Trying to parse data url '%s'", url)
-            with urlopen(url) as url_info:
-                # TODO Can this be more efficient.
-                output_stream.write(url_info.read())
-        except Exception as e:
-            logger.warning("Error parsing data: URL %s: %r", url, e)
-
-            raise SynapseError(
-                500,
-                "Failed to parse data URL: %s"
-                % (traceback.format_exception_only(sys.exc_info()[0], e),),
-                Codes.UNKNOWN,
-            )
-
-        return DownloadResult(
-            # Read back the length that has been written.
-            length=output_stream.tell(),
-            uri=url,
-            # If it was parsed, consider this a 200 OK.
-            response_code=200,
-            # urlopen shoves the media-type from the data URL into the content type
-            # header object.
-            media_type=url_info.headers.get_content_type(),
-            # Some features are not supported by data: URLs.
-            download_name=None,
-            expires=ONE_HOUR,
-            etag=None,
-        )
-
-    async def _handle_url(
-        self, url: str, user: UserID, allow_data_urls: bool = False
-    ) -> MediaInfo:
-        """
-        Fetches content from a URL and parses the result to generate a MediaInfo.
-
-        It uses the media storage provider to persist the fetched content and
-        stores the mapping into the database.
-
-        Args:
-             url: The URL to fetch.
-             user: The user who ahs requested this URL.
-             allow_data_urls: True if data URLs should be allowed.
-
-        Returns:
-            A MediaInfo object describing the fetched content.
-        """
-
-        # TODO: we should probably honour robots.txt... except in practice
-        # we're most likely being explicitly triggered by a human rather than a
-        # bot, so are we really a robot?
-
-        file_id = datetime.date.today().isoformat() + "_" + random_string(16)
-
-        file_info = FileInfo(server_name=None, file_id=file_id, url_cache=True)
-
-        with self.media_storage.store_into_file(file_info) as (f, fname, finish):
-            if url.startswith("data:"):
-                if not allow_data_urls:
-                    raise SynapseError(
-                        500, "Previewing of data: URLs is forbidden", Codes.UNKNOWN
-                    )
-
-                download_result = await self._parse_data_url(url, f)
-            else:
-                download_result = await self._download_url(url, f)
-
-            await finish()
-
-        try:
-            time_now_ms = self.clock.time_msec()
-
-            await self.store.store_local_media(
-                media_id=file_id,
-                media_type=download_result.media_type,
-                time_now_ms=time_now_ms,
-                upload_name=download_result.download_name,
-                media_length=download_result.length,
-                user_id=user,
-                url_cache=url,
-            )
-
-        except Exception as e:
-            logger.error("Error handling downloaded %s: %r", url, e)
-            # TODO: we really ought to delete the downloaded file in this
-            # case, since we won't have recorded it in the db, and will
-            # therefore not expire it.
-            raise
-
-        return MediaInfo(
-            media_type=download_result.media_type,
-            media_length=download_result.length,
-            download_name=download_result.download_name,
-            created_ts_ms=time_now_ms,
-            filesystem_id=file_id,
-            filename=fname,
-            uri=download_result.uri,
-            response_code=download_result.response_code,
-            expires=download_result.expires,
-            etag=download_result.etag,
-        )
-
-    async def _precache_image_url(
-        self, user: UserID, media_info: MediaInfo, og: JsonDict
-    ) -> None:
-        """
-        Pre-cache the image (if one exists) for posterity
-
-        Args:
-            user: The user requesting the preview.
-            media_info: The media being previewed.
-            og: The Open Graph dictionary. This is modified with image information.
-        """
-        # If there's no image or it is blank, there's nothing to do.
-        if "og:image" not in og:
-            return
-
-        # Remove the raw image URL, this will be replaced with an MXC URL, if successful.
-        image_url = og.pop("og:image")
-        if not image_url:
-            return
-
-        # The image URL from the HTML might be relative to the previewed page,
-        # convert it to an URL which can be requested directly.
-        url_parts = urlparse(image_url)
-        if url_parts.scheme != "data":
-            image_url = urljoin(media_info.uri, image_url)
-
-        # FIXME: it might be cleaner to use the same flow as the main /preview_url
-        # request itself and benefit from the same caching etc.  But for now we
-        # just rely on the caching on the master request to speed things up.
-        try:
-            image_info = await self._handle_url(image_url, user, allow_data_urls=True)
-        except Exception as e:
-            # Pre-caching the image failed, don't block the entire URL preview.
-            logger.warning(
-                "Pre-caching image failed during URL preview: %s errored with %s",
-                image_url,
-                e,
-            )
-            return
-
-        if _is_media(image_info.media_type):
-            # TODO: make sure we don't choke on white-on-transparent images
-            file_id = image_info.filesystem_id
-            dims = await self.media_repo._generate_thumbnails(
-                None, file_id, file_id, image_info.media_type, url_cache=True
-            )
-            if dims:
-                og["og:image:width"] = dims["width"]
-                og["og:image:height"] = dims["height"]
-            else:
-                logger.warning("Couldn't get dims for %s", image_url)
-
-            og["og:image"] = f"mxc://{self.server_name}/{image_info.filesystem_id}"
-            og["og:image:type"] = image_info.media_type
-            og["matrix:image:size"] = image_info.media_length
-
-    async def _handle_oembed_response(
-        self, url: str, media_info: MediaInfo, expiration_ms: int
-    ) -> Tuple[JsonDict, Optional[str], int]:
-        """
-        Parse the downloaded oEmbed info.
-
-        Args:
-            url: The URL which is being previewed (not the one which was
-                requested).
-            media_info: The media being previewed.
-            expiration_ms: The length of time, in milliseconds, the media is valid for.
-
-        Returns:
-            A tuple of:
-                The Open Graph dictionary, if the oEmbed info can be parsed.
-                The author name if it could be retrieved from oEmbed.
-                The (possibly updated) length of time, in milliseconds, the media is valid for.
-        """
-        # If JSON was not returned, there's nothing to do.
-        if not _is_json(media_info.media_type):
-            return {}, None, expiration_ms
-
-        with open(media_info.filename, "rb") as file:
-            body = file.read()
-
-        oembed_response = self._oembed.parse_oembed_response(url, body)
-        open_graph_result = oembed_response.open_graph_result
-
-        # Use the cache age from the oEmbed result, if one was given.
-        if open_graph_result and oembed_response.cache_age is not None:
-            expiration_ms = oembed_response.cache_age
-
-        return open_graph_result, oembed_response.author_name, expiration_ms
-
-    def _start_expire_url_cache_data(self) -> Deferred:
-        return run_as_background_process(
-            "expire_url_cache_data", self._expire_url_cache_data
-        )
-
-    async def _expire_url_cache_data(self) -> None:
-        """Clean up expired url cache content, media and thumbnails."""
-
-        assert self._worker_run_media_background_jobs
-
-        now = self.clock.time_msec()
-
-        logger.debug("Running url preview cache expiry")
-
-        def try_remove_parent_dirs(dirs: Iterable[str]) -> None:
-            """Attempt to remove the given chain of parent directories
-
-            Args:
-                dirs: The list of directory paths to delete, with children appearing
-                    before their parents.
-            """
-            for dir in dirs:
-                try:
-                    os.rmdir(dir)
-                except FileNotFoundError:
-                    # Already deleted, continue with deleting the rest
-                    pass
-                except OSError as e:
-                    # Failed, skip deleting the rest of the parent dirs
-                    if e.errno != errno.ENOTEMPTY:
-                        logger.warning(
-                            "Failed to remove media directory while clearing url preview cache: %r: %s",
-                            dir,
-                            e,
-                        )
-                    break
-
-        # First we delete expired url cache entries
-        media_ids = await self.store.get_expired_url_cache(now)
-
-        removed_media = []
-        for media_id in media_ids:
-            fname = self.filepaths.url_cache_filepath(media_id)
-            try:
-                os.remove(fname)
-            except FileNotFoundError:
-                pass  # If the path doesn't exist, meh
-            except OSError as e:
-                logger.warning(
-                    "Failed to remove media while clearing url preview cache: %r: %s",
-                    media_id,
-                    e,
-                )
-                continue
-
-            removed_media.append(media_id)
-
-            dirs = self.filepaths.url_cache_filepath_dirs_to_delete(media_id)
-            try_remove_parent_dirs(dirs)
-
-        await self.store.delete_url_cache(removed_media)
-
-        if removed_media:
-            logger.debug(
-                "Deleted %d entries from url preview cache", len(removed_media)
-            )
-        else:
-            logger.debug("No entries removed from url preview cache")
-
-        # Now we delete old images associated with the url cache.
-        # These may be cached for a bit on the client (i.e., they
-        # may have a room open with a preview url thing open).
-        # So we wait a couple of days before deleting, just in case.
-        expire_before = now - IMAGE_CACHE_EXPIRY_MS
-        media_ids = await self.store.get_url_cache_media_before(expire_before)
-
-        removed_media = []
-        for media_id in media_ids:
-            fname = self.filepaths.url_cache_filepath(media_id)
-            try:
-                os.remove(fname)
-            except FileNotFoundError:
-                pass  # If the path doesn't exist, meh
-            except OSError as e:
-                logger.warning(
-                    "Failed to remove media from url preview cache: %r: %s", media_id, e
-                )
-                continue
-
-            dirs = self.filepaths.url_cache_filepath_dirs_to_delete(media_id)
-            try_remove_parent_dirs(dirs)
-
-            thumbnail_dir = self.filepaths.url_cache_thumbnail_directory(media_id)
-            try:
-                shutil.rmtree(thumbnail_dir)
-            except FileNotFoundError:
-                pass  # If the path doesn't exist, meh
-            except OSError as e:
-                logger.warning(
-                    "Failed to remove media from url preview cache: %r: %s", media_id, e
-                )
-                continue
-
-            removed_media.append(media_id)
-
-            dirs = self.filepaths.url_cache_thumbnail_dirs_to_delete(media_id)
-            # Note that one of the directories to be deleted has already been
-            # removed by the `rmtree` above.
-            try_remove_parent_dirs(dirs)
-
-        await self.store.delete_url_cache_media(removed_media)
-
-        if removed_media:
-            logger.debug("Deleted %d media from url preview cache", len(removed_media))
-        else:
-            logger.debug("No media removed from url preview cache")
-
-
-def _is_media(content_type: str) -> bool:
-    return content_type.lower().startswith("image/")
-
-
-def _is_html(content_type: str) -> bool:
-    content_type = content_type.lower()
-    return content_type.startswith("text/html") or content_type.startswith(
-        "application/xhtml"
-    )
-
-
-def _is_json(content_type: str) -> bool:
-    return content_type.lower().startswith("application/json")
-
-
-def _is_previewable(content_type: str) -> bool:
-    """Returns True for content types for which we will perform URL preview and False
-    otherwise."""
-
-    return _is_html(content_type) or _is_media(content_type) or _is_json(content_type)
diff --git a/synapse/rest/media/v1/storage_provider.py b/synapse/rest/media/v1/storage_provider.py
index 1c9b71d69c..d7653f30ae 100644
--- a/synapse/rest/media/v1/storage_provider.py
+++ b/synapse/rest/media/v1/storage_provider.py
@@ -1,4 +1,4 @@
-# Copyright 2018-2021 The Matrix.org Foundation C.I.C.
+# Copyright 2023 The Matrix.org Foundation C.I.C.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -11,171 +11,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+#
 
-import abc
-import logging
-import os
-import shutil
-from typing import TYPE_CHECKING, Callable, Optional
-
-from synapse.config._base import Config
-from synapse.logging.context import defer_to_thread, run_in_background
-from synapse.util.async_helpers import maybe_awaitable
-
-from ._base import FileInfo, Responder
-from .media_storage import FileResponder
-
-logger = logging.getLogger(__name__)
-
-if TYPE_CHECKING:
-    from synapse.server import HomeServer
-
-
-class StorageProvider(metaclass=abc.ABCMeta):
-    """A storage provider is a service that can store uploaded media and
-    retrieve them.
-    """
-
-    @abc.abstractmethod
-    async def store_file(self, path: str, file_info: FileInfo) -> None:
-        """Store the file described by file_info. The actual contents can be
-        retrieved by reading the file in file_info.upload_path.
-
-        Args:
-            path: Relative path of file in local cache
-            file_info: The metadata of the file.
-        """
-
-    @abc.abstractmethod
-    async def fetch(self, path: str, file_info: FileInfo) -> Optional[Responder]:
-        """Attempt to fetch the file described by file_info and stream it
-        into writer.
-
-        Args:
-            path: Relative path of file in local cache
-            file_info: The metadata of the file.
-
-        Returns:
-            Returns a Responder if the provider has the file, otherwise returns None.
-        """
-
-
-class StorageProviderWrapper(StorageProvider):
-    """Wraps a storage provider and provides various config options
-
-    Args:
-        backend: The storage provider to wrap.
-        store_local: Whether to store new local files or not.
-        store_synchronous: Whether to wait for file to be successfully
-            uploaded, or todo the upload in the background.
-        store_remote: Whether remote media should be uploaded
-    """
-
-    def __init__(
-        self,
-        backend: StorageProvider,
-        store_local: bool,
-        store_synchronous: bool,
-        store_remote: bool,
-    ):
-        self.backend = backend
-        self.store_local = store_local
-        self.store_synchronous = store_synchronous
-        self.store_remote = store_remote
-
-    def __str__(self) -> str:
-        return "StorageProviderWrapper[%s]" % (self.backend,)
-
-    async def store_file(self, path: str, file_info: FileInfo) -> None:
-        if not file_info.server_name and not self.store_local:
-            return None
-
-        if file_info.server_name and not self.store_remote:
-            return None
-
-        if file_info.url_cache:
-            # The URL preview cache is short lived and not worth offloading or
-            # backing up.
-            return None
-
-        if self.store_synchronous:
-            # store_file is supposed to return an Awaitable, but guard
-            # against improper implementations.
-            await maybe_awaitable(self.backend.store_file(path, file_info))  # type: ignore
-        else:
-            # TODO: Handle errors.
-            async def store() -> None:
-                try:
-                    return await maybe_awaitable(
-                        self.backend.store_file(path, file_info)
-                    )
-                except Exception:
-                    logger.exception("Error storing file")
-
-            run_in_background(store)
-
-    async def fetch(self, path: str, file_info: FileInfo) -> Optional[Responder]:
-        if file_info.url_cache:
-            # Files in the URL preview cache definitely aren't stored here,
-            # so avoid any potentially slow I/O or network access.
-            return None
-
-        # store_file is supposed to return an Awaitable, but guard
-        # against improper implementations.
-        return await maybe_awaitable(self.backend.fetch(path, file_info))
-
-
-class FileStorageProviderBackend(StorageProvider):
-    """A storage provider that stores files in a directory on a filesystem.
-
-    Args:
-        hs
-        config: The config returned by `parse_config`.
-    """
-
-    def __init__(self, hs: "HomeServer", config: str):
-        self.hs = hs
-        self.cache_directory = hs.config.media.media_store_path
-        self.base_directory = config
-
-    def __str__(self) -> str:
-        return "FileStorageProviderBackend[%s]" % (self.base_directory,)
-
-    async def store_file(self, path: str, file_info: FileInfo) -> None:
-        """See StorageProvider.store_file"""
-
-        primary_fname = os.path.join(self.cache_directory, path)
-        backup_fname = os.path.join(self.base_directory, path)
-
-        dirname = os.path.dirname(backup_fname)
-        os.makedirs(dirname, exist_ok=True)
-
-        # mypy needs help inferring the type of the second parameter, which is generic
-        shutil_copyfile: Callable[[str, str], str] = shutil.copyfile
-        await defer_to_thread(
-            self.hs.get_reactor(),
-            shutil_copyfile,
-            primary_fname,
-            backup_fname,
-        )
-
-    async def fetch(self, path: str, file_info: FileInfo) -> Optional[Responder]:
-        """See StorageProvider.fetch"""
-
-        backup_fname = os.path.join(self.base_directory, path)
-        if os.path.isfile(backup_fname):
-            return FileResponder(open(backup_fname, "rb"))
-
-        return None
-
-    @staticmethod
-    def parse_config(config: dict) -> str:
-        """Called on startup to parse config supplied. This should parse
-        the config and raise if there is a problem.
-
-        The returned value is passed into the constructor.
-
-        In this case we only care about a single param, the directory, so let's
-        just pull that out.
-        """
-        return Config.ensure_directory(config["directory"])
+# This exists purely for backwards compatibility with media providers.
+from synapse.media.storage_provider import StorageProvider  # noqa: F401
diff --git a/synapse/rest/media/v1/thumbnail_resource.py b/synapse/rest/media/v1/thumbnail_resource.py
deleted file mode 100644
index 3e720018b3..0000000000
--- a/synapse/rest/media/v1/thumbnail_resource.py
+++ /dev/null
@@ -1,555 +0,0 @@
-# Copyright 2014-2016 OpenMarket Ltd
-# Copyright 2020-2021 The Matrix.org Foundation C.I.C.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-
-import logging
-from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple
-
-from synapse.api.errors import Codes, SynapseError, cs_error
-from synapse.config.repository import THUMBNAIL_SUPPORTED_MEDIA_FORMAT_MAP
-from synapse.http.server import (
-    DirectServeJsonResource,
-    respond_with_json,
-    set_corp_headers,
-    set_cors_headers,
-)
-from synapse.http.servlet import parse_integer, parse_string
-from synapse.http.site import SynapseRequest
-from synapse.rest.media.v1.media_storage import MediaStorage
-
-from ._base import (
-    FileInfo,
-    ThumbnailInfo,
-    parse_media_id,
-    respond_404,
-    respond_with_file,
-    respond_with_responder,
-)
-
-if TYPE_CHECKING:
-    from synapse.rest.media.v1.media_repository import MediaRepository
-    from synapse.server import HomeServer
-
-logger = logging.getLogger(__name__)
-
-
-class ThumbnailResource(DirectServeJsonResource):
-    isLeaf = True
-
-    def __init__(
-        self,
-        hs: "HomeServer",
-        media_repo: "MediaRepository",
-        media_storage: MediaStorage,
-    ):
-        super().__init__()
-
-        self.store = hs.get_datastores().main
-        self.media_repo = media_repo
-        self.media_storage = media_storage
-        self.dynamic_thumbnails = hs.config.media.dynamic_thumbnails
-        self.server_name = hs.hostname
-
-    async def _async_render_GET(self, request: SynapseRequest) -> None:
-        set_cors_headers(request)
-        set_corp_headers(request)
-        server_name, media_id, _ = parse_media_id(request)
-        width = parse_integer(request, "width", required=True)
-        height = parse_integer(request, "height", required=True)
-        method = parse_string(request, "method", "scale")
-        # TODO Parse the Accept header to get an prioritised list of thumbnail types.
-        m_type = "image/png"
-
-        if server_name == self.server_name:
-            if self.dynamic_thumbnails:
-                await self._select_or_generate_local_thumbnail(
-                    request, media_id, width, height, method, m_type
-                )
-            else:
-                await self._respond_local_thumbnail(
-                    request, media_id, width, height, method, m_type
-                )
-            self.media_repo.mark_recently_accessed(None, media_id)
-        else:
-            if self.dynamic_thumbnails:
-                await self._select_or_generate_remote_thumbnail(
-                    request, server_name, media_id, width, height, method, m_type
-                )
-            else:
-                await self._respond_remote_thumbnail(
-                    request, server_name, media_id, width, height, method, m_type
-                )
-            self.media_repo.mark_recently_accessed(server_name, media_id)
-
-    async def _respond_local_thumbnail(
-        self,
-        request: SynapseRequest,
-        media_id: str,
-        width: int,
-        height: int,
-        method: str,
-        m_type: str,
-    ) -> None:
-        media_info = await self.store.get_local_media(media_id)
-
-        if not media_info:
-            respond_404(request)
-            return
-        if media_info["quarantined_by"]:
-            logger.info("Media is quarantined")
-            respond_404(request)
-            return
-
-        thumbnail_infos = await self.store.get_local_media_thumbnails(media_id)
-        await self._select_and_respond_with_thumbnail(
-            request,
-            width,
-            height,
-            method,
-            m_type,
-            thumbnail_infos,
-            media_id,
-            media_id,
-            url_cache=bool(media_info["url_cache"]),
-            server_name=None,
-        )
-
-    async def _select_or_generate_local_thumbnail(
-        self,
-        request: SynapseRequest,
-        media_id: str,
-        desired_width: int,
-        desired_height: int,
-        desired_method: str,
-        desired_type: str,
-    ) -> None:
-        media_info = await self.store.get_local_media(media_id)
-
-        if not media_info:
-            respond_404(request)
-            return
-        if media_info["quarantined_by"]:
-            logger.info("Media is quarantined")
-            respond_404(request)
-            return
-
-        thumbnail_infos = await self.store.get_local_media_thumbnails(media_id)
-        for info in thumbnail_infos:
-            t_w = info["thumbnail_width"] == desired_width
-            t_h = info["thumbnail_height"] == desired_height
-            t_method = info["thumbnail_method"] == desired_method
-            t_type = info["thumbnail_type"] == desired_type
-
-            if t_w and t_h and t_method and t_type:
-                file_info = FileInfo(
-                    server_name=None,
-                    file_id=media_id,
-                    url_cache=media_info["url_cache"],
-                    thumbnail=ThumbnailInfo(
-                        width=info["thumbnail_width"],
-                        height=info["thumbnail_height"],
-                        type=info["thumbnail_type"],
-                        method=info["thumbnail_method"],
-                    ),
-                )
-
-                t_type = file_info.thumbnail_type
-                t_length = info["thumbnail_length"]
-
-                responder = await self.media_storage.fetch_media(file_info)
-                if responder:
-                    await respond_with_responder(request, responder, t_type, t_length)
-                    return
-
-        logger.debug("We don't have a thumbnail of that size. Generating")
-
-        # Okay, so we generate one.
-        file_path = await self.media_repo.generate_local_exact_thumbnail(
-            media_id,
-            desired_width,
-            desired_height,
-            desired_method,
-            desired_type,
-            url_cache=bool(media_info["url_cache"]),
-        )
-
-        if file_path:
-            await respond_with_file(request, desired_type, file_path)
-        else:
-            logger.warning("Failed to generate thumbnail")
-            raise SynapseError(400, "Failed to generate thumbnail.")
-
-    async def _select_or_generate_remote_thumbnail(
-        self,
-        request: SynapseRequest,
-        server_name: str,
-        media_id: str,
-        desired_width: int,
-        desired_height: int,
-        desired_method: str,
-        desired_type: str,
-    ) -> None:
-        media_info = await self.media_repo.get_remote_media_info(server_name, media_id)
-
-        thumbnail_infos = await self.store.get_remote_media_thumbnails(
-            server_name, media_id
-        )
-
-        file_id = media_info["filesystem_id"]
-
-        for info in thumbnail_infos:
-            t_w = info["thumbnail_width"] == desired_width
-            t_h = info["thumbnail_height"] == desired_height
-            t_method = info["thumbnail_method"] == desired_method
-            t_type = info["thumbnail_type"] == desired_type
-
-            if t_w and t_h and t_method and t_type:
-                file_info = FileInfo(
-                    server_name=server_name,
-                    file_id=media_info["filesystem_id"],
-                    thumbnail=ThumbnailInfo(
-                        width=info["thumbnail_width"],
-                        height=info["thumbnail_height"],
-                        type=info["thumbnail_type"],
-                        method=info["thumbnail_method"],
-                    ),
-                )
-
-                t_type = file_info.thumbnail_type
-                t_length = info["thumbnail_length"]
-
-                responder = await self.media_storage.fetch_media(file_info)
-                if responder:
-                    await respond_with_responder(request, responder, t_type, t_length)
-                    return
-
-        logger.debug("We don't have a thumbnail of that size. Generating")
-
-        # Okay, so we generate one.
-        file_path = await self.media_repo.generate_remote_exact_thumbnail(
-            server_name,
-            file_id,
-            media_id,
-            desired_width,
-            desired_height,
-            desired_method,
-            desired_type,
-        )
-
-        if file_path:
-            await respond_with_file(request, desired_type, file_path)
-        else:
-            logger.warning("Failed to generate thumbnail")
-            raise SynapseError(400, "Failed to generate thumbnail.")
-
-    async def _respond_remote_thumbnail(
-        self,
-        request: SynapseRequest,
-        server_name: str,
-        media_id: str,
-        width: int,
-        height: int,
-        method: str,
-        m_type: str,
-    ) -> None:
-        # TODO: Don't download the whole remote file
-        # We should proxy the thumbnail from the remote server instead of
-        # downloading the remote file and generating our own thumbnails.
-        media_info = await self.media_repo.get_remote_media_info(server_name, media_id)
-
-        thumbnail_infos = await self.store.get_remote_media_thumbnails(
-            server_name, media_id
-        )
-        await self._select_and_respond_with_thumbnail(
-            request,
-            width,
-            height,
-            method,
-            m_type,
-            thumbnail_infos,
-            media_id,
-            media_info["filesystem_id"],
-            url_cache=False,
-            server_name=server_name,
-        )
-
-    async def _select_and_respond_with_thumbnail(
-        self,
-        request: SynapseRequest,
-        desired_width: int,
-        desired_height: int,
-        desired_method: str,
-        desired_type: str,
-        thumbnail_infos: List[Dict[str, Any]],
-        media_id: str,
-        file_id: str,
-        url_cache: bool,
-        server_name: Optional[str] = None,
-    ) -> None:
-        """
-        Respond to a request with an appropriate thumbnail from the previously generated thumbnails.
-
-        Args:
-            request: The incoming request.
-            desired_width: The desired width, the returned thumbnail may be larger than this.
-            desired_height: The desired height, the returned thumbnail may be larger than this.
-            desired_method: The desired method used to generate the thumbnail.
-            desired_type: The desired content-type of the thumbnail.
-            thumbnail_infos: A list of dictionaries of candidate thumbnails.
-            file_id: The ID of the media that a thumbnail is being requested for.
-            url_cache: True if this is from a URL cache.
-            server_name: The server name, if this is a remote thumbnail.
-        """
-        logger.debug(
-            "_select_and_respond_with_thumbnail: media_id=%s desired=%sx%s (%s) thumbnail_infos=%s",
-            media_id,
-            desired_width,
-            desired_height,
-            desired_method,
-            thumbnail_infos,
-        )
-
-        # If `dynamic_thumbnails` is enabled, we expect Synapse to go down a
-        # different code path to handle it.
-        assert not self.dynamic_thumbnails
-
-        if thumbnail_infos:
-            file_info = self._select_thumbnail(
-                desired_width,
-                desired_height,
-                desired_method,
-                desired_type,
-                thumbnail_infos,
-                file_id,
-                url_cache,
-                server_name,
-            )
-            if not file_info:
-                logger.info("Couldn't find a thumbnail matching the desired inputs")
-                respond_404(request)
-                return
-
-            # The thumbnail property must exist.
-            assert file_info.thumbnail is not None
-
-            responder = await self.media_storage.fetch_media(file_info)
-            if responder:
-                await respond_with_responder(
-                    request,
-                    responder,
-                    file_info.thumbnail.type,
-                    file_info.thumbnail.length,
-                )
-                return
-
-            # If we can't find the thumbnail we regenerate it. This can happen
-            # if e.g. we've deleted the thumbnails but still have the original
-            # image somewhere.
-            #
-            # Since we have an entry for the thumbnail in the DB we a) know we
-            # have have successfully generated the thumbnail in the past (so we
-            # don't need to worry about repeatedly failing to generate
-            # thumbnails), and b) have already calculated that appropriate
-            # width/height/method so we can just call the "generate exact"
-            # methods.
-
-            # First let's check that we do actually have the original image
-            # still. This will throw a 404 if we don't.
-            # TODO: We should refetch the thumbnails for remote media.
-            await self.media_storage.ensure_media_is_in_local_cache(
-                FileInfo(server_name, file_id, url_cache=url_cache)
-            )
-
-            if server_name:
-                await self.media_repo.generate_remote_exact_thumbnail(
-                    server_name,
-                    file_id=file_id,
-                    media_id=media_id,
-                    t_width=file_info.thumbnail.width,
-                    t_height=file_info.thumbnail.height,
-                    t_method=file_info.thumbnail.method,
-                    t_type=file_info.thumbnail.type,
-                )
-            else:
-                await self.media_repo.generate_local_exact_thumbnail(
-                    media_id=media_id,
-                    t_width=file_info.thumbnail.width,
-                    t_height=file_info.thumbnail.height,
-                    t_method=file_info.thumbnail.method,
-                    t_type=file_info.thumbnail.type,
-                    url_cache=url_cache,
-                )
-
-            responder = await self.media_storage.fetch_media(file_info)
-            await respond_with_responder(
-                request,
-                responder,
-                file_info.thumbnail.type,
-                file_info.thumbnail.length,
-            )
-        else:
-            # This might be because:
-            # 1. We can't create thumbnails for the given media (corrupted or
-            #    unsupported file type), or
-            # 2. The thumbnailing process never ran or errored out initially
-            #    when the media was first uploaded (these bugs should be
-            #    reported and fixed).
-            # Note that we don't attempt to generate a thumbnail now because
-            # `dynamic_thumbnails` is disabled.
-            logger.info("Failed to find any generated thumbnails")
-
-            respond_with_json(
-                request,
-                400,
-                cs_error(
-                    "Cannot find any thumbnails for the requested media (%r). This might mean the media is not a supported_media_format=(%s) or that thumbnailing failed for some other reason. (Dynamic thumbnails are disabled on this server.)"
-                    % (
-                        request.postpath,
-                        ", ".join(THUMBNAIL_SUPPORTED_MEDIA_FORMAT_MAP.keys()),
-                    ),
-                    code=Codes.UNKNOWN,
-                ),
-                send_cors=True,
-            )
-
-    def _select_thumbnail(
-        self,
-        desired_width: int,
-        desired_height: int,
-        desired_method: str,
-        desired_type: str,
-        thumbnail_infos: List[Dict[str, Any]],
-        file_id: str,
-        url_cache: bool,
-        server_name: Optional[str],
-    ) -> Optional[FileInfo]:
-        """
-        Choose an appropriate thumbnail from the previously generated thumbnails.
-
-        Args:
-            desired_width: The desired width, the returned thumbnail may be larger than this.
-            desired_height: The desired height, the returned thumbnail may be larger than this.
-            desired_method: The desired method used to generate the thumbnail.
-            desired_type: The desired content-type of the thumbnail.
-            thumbnail_infos: A list of dictionaries of candidate thumbnails.
-            file_id: The ID of the media that a thumbnail is being requested for.
-            url_cache: True if this is from a URL cache.
-            server_name: The server name, if this is a remote thumbnail.
-
-        Returns:
-             The thumbnail which best matches the desired parameters.
-        """
-        desired_method = desired_method.lower()
-
-        # The chosen thumbnail.
-        thumbnail_info = None
-
-        d_w = desired_width
-        d_h = desired_height
-
-        if desired_method == "crop":
-            # Thumbnails that match equal or larger sizes of desired width/height.
-            crop_info_list: List[Tuple[int, int, int, bool, int, Dict[str, Any]]] = []
-            # Other thumbnails.
-            crop_info_list2: List[Tuple[int, int, int, bool, int, Dict[str, Any]]] = []
-            for info in thumbnail_infos:
-                # Skip thumbnails generated with different methods.
-                if info["thumbnail_method"] != "crop":
-                    continue
-
-                t_w = info["thumbnail_width"]
-                t_h = info["thumbnail_height"]
-                aspect_quality = abs(d_w * t_h - d_h * t_w)
-                min_quality = 0 if d_w <= t_w and d_h <= t_h else 1
-                size_quality = abs((d_w - t_w) * (d_h - t_h))
-                type_quality = desired_type != info["thumbnail_type"]
-                length_quality = info["thumbnail_length"]
-                if t_w >= d_w or t_h >= d_h:
-                    crop_info_list.append(
-                        (
-                            aspect_quality,
-                            min_quality,
-                            size_quality,
-                            type_quality,
-                            length_quality,
-                            info,
-                        )
-                    )
-                else:
-                    crop_info_list2.append(
-                        (
-                            aspect_quality,
-                            min_quality,
-                            size_quality,
-                            type_quality,
-                            length_quality,
-                            info,
-                        )
-                    )
-            # Pick the most appropriate thumbnail. Some values of `desired_width` and
-            # `desired_height` may result in a tie, in which case we avoid comparing on
-            # the thumbnail info dictionary and pick the thumbnail that appears earlier
-            # in the list of candidates.
-            if crop_info_list:
-                thumbnail_info = min(crop_info_list, key=lambda t: t[:-1])[-1]
-            elif crop_info_list2:
-                thumbnail_info = min(crop_info_list2, key=lambda t: t[:-1])[-1]
-        elif desired_method == "scale":
-            # Thumbnails that match equal or larger sizes of desired width/height.
-            info_list: List[Tuple[int, bool, int, Dict[str, Any]]] = []
-            # Other thumbnails.
-            info_list2: List[Tuple[int, bool, int, Dict[str, Any]]] = []
-
-            for info in thumbnail_infos:
-                # Skip thumbnails generated with different methods.
-                if info["thumbnail_method"] != "scale":
-                    continue
-
-                t_w = info["thumbnail_width"]
-                t_h = info["thumbnail_height"]
-                size_quality = abs((d_w - t_w) * (d_h - t_h))
-                type_quality = desired_type != info["thumbnail_type"]
-                length_quality = info["thumbnail_length"]
-                if t_w >= d_w or t_h >= d_h:
-                    info_list.append((size_quality, type_quality, length_quality, info))
-                else:
-                    info_list2.append(
-                        (size_quality, type_quality, length_quality, info)
-                    )
-            # Pick the most appropriate thumbnail. Some values of `desired_width` and
-            # `desired_height` may result in a tie, in which case we avoid comparing on
-            # the thumbnail info dictionary and pick the thumbnail that appears earlier
-            # in the list of candidates.
-            if info_list:
-                thumbnail_info = min(info_list, key=lambda t: t[:-1])[-1]
-            elif info_list2:
-                thumbnail_info = min(info_list2, key=lambda t: t[:-1])[-1]
-
-        if thumbnail_info:
-            return FileInfo(
-                file_id=file_id,
-                url_cache=url_cache,
-                server_name=server_name,
-                thumbnail=ThumbnailInfo(
-                    width=thumbnail_info["thumbnail_width"],
-                    height=thumbnail_info["thumbnail_height"],
-                    type=thumbnail_info["thumbnail_type"],
-                    method=thumbnail_info["thumbnail_method"],
-                    length=thumbnail_info["thumbnail_length"],
-                ),
-            )
-
-        # No matching thumbnail was found.
-        return None
diff --git a/synapse/rest/media/v1/thumbnailer.py b/synapse/rest/media/v1/thumbnailer.py
deleted file mode 100644
index f909a4fb9a..0000000000
--- a/synapse/rest/media/v1/thumbnailer.py
+++ /dev/null
@@ -1,221 +0,0 @@
-# Copyright 2014-2016 OpenMarket Ltd
-# Copyright 2020-2021 The Matrix.org Foundation C.I.C.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import logging
-from io import BytesIO
-from types import TracebackType
-from typing import Optional, Tuple, Type
-
-from PIL import Image
-
-logger = logging.getLogger(__name__)
-
-EXIF_ORIENTATION_TAG = 0x0112
-EXIF_TRANSPOSE_MAPPINGS = {
-    2: Image.FLIP_LEFT_RIGHT,
-    3: Image.ROTATE_180,
-    4: Image.FLIP_TOP_BOTTOM,
-    5: Image.TRANSPOSE,
-    6: Image.ROTATE_270,
-    7: Image.TRANSVERSE,
-    8: Image.ROTATE_90,
-}
-
-
-class ThumbnailError(Exception):
-    """An error occurred generating a thumbnail."""
-
-
-class Thumbnailer:
-    FORMATS = {"image/jpeg": "JPEG", "image/png": "PNG"}
-
-    @staticmethod
-    def set_limits(max_image_pixels: int) -> None:
-        Image.MAX_IMAGE_PIXELS = max_image_pixels
-
-    def __init__(self, input_path: str):
-        # Have we closed the image?
-        self._closed = False
-
-        try:
-            self.image = Image.open(input_path)
-        except OSError as e:
-            # If an error occurs opening the image, a thumbnail won't be able to
-            # be generated.
-            raise ThumbnailError from e
-        except Image.DecompressionBombError as e:
-            # If an image decompression bomb error occurs opening the image,
-            # then the image exceeds the pixel limit and a thumbnail won't
-            # be able to be generated.
-            raise ThumbnailError from e
-
-        self.width, self.height = self.image.size
-        self.transpose_method = None
-        try:
-            # We don't use ImageOps.exif_transpose since it crashes with big EXIF
-            #
-            # Ignore safety: Pillow seems to acknowledge that this method is
-            # "private, experimental, but generally widely used". Pillow 6
-            # includes a public getexif() method (no underscore) that we might
-            # consider using instead when we can bump that dependency.
-            #
-            # At the time of writing, Debian buster (currently oldstable)
-            # provides version 5.4.1. It's expected to EOL in mid-2022, see
-            # https://wiki.debian.org/DebianReleases#Production_Releases
-            image_exif = self.image._getexif()  # type: ignore
-            if image_exif is not None:
-                image_orientation = image_exif.get(EXIF_ORIENTATION_TAG)
-                assert type(image_orientation) is int
-                self.transpose_method = EXIF_TRANSPOSE_MAPPINGS.get(image_orientation)
-        except Exception as e:
-            # A lot of parsing errors can happen when parsing EXIF
-            logger.info("Error parsing image EXIF information: %s", e)
-
-    def transpose(self) -> Tuple[int, int]:
-        """Transpose the image using its EXIF Orientation tag
-
-        Returns:
-            A tuple containing the new image size in pixels as (width, height).
-        """
-        if self.transpose_method is not None:
-            # Safety: `transpose` takes an int rather than e.g. an IntEnum.
-            # self.transpose_method is set above to be a value in
-            # EXIF_TRANSPOSE_MAPPINGS, and that only contains correct values.
-            with self.image:
-                self.image = self.image.transpose(self.transpose_method)  # type: ignore[arg-type]
-            self.width, self.height = self.image.size
-            self.transpose_method = None
-            # We don't need EXIF any more
-            self.image.info["exif"] = None
-        return self.image.size
-
-    def aspect(self, max_width: int, max_height: int) -> Tuple[int, int]:
-        """Calculate the largest size that preserves aspect ratio which
-        fits within the given rectangle::
-
-            (w_in / h_in) = (w_out / h_out)
-            w_out = max(min(w_max, h_max * (w_in / h_in)), 1)
-            h_out = max(min(h_max, w_max * (h_in / w_in)), 1)
-
-        Args:
-            max_width: The largest possible width.
-            max_height: The largest possible height.
-        """
-
-        if max_width * self.height < max_height * self.width:
-            return max_width, max((max_width * self.height) // self.width, 1)
-        else:
-            return max((max_height * self.width) // self.height, 1), max_height
-
-    def _resize(self, width: int, height: int) -> Image.Image:
-        # 1-bit or 8-bit color palette images need converting to RGB
-        # otherwise they will be scaled using nearest neighbour which
-        # looks awful.
-        #
-        # If the image has transparency, use RGBA instead.
-        if self.image.mode in ["1", "L", "P"]:
-            if self.image.info.get("transparency", None) is not None:
-                with self.image:
-                    self.image = self.image.convert("RGBA")
-            else:
-                with self.image:
-                    self.image = self.image.convert("RGB")
-        return self.image.resize((width, height), Image.ANTIALIAS)
-
-    def scale(self, width: int, height: int, output_type: str) -> BytesIO:
-        """Rescales the image to the given dimensions.
-
-        Returns:
-            The bytes of the encoded image ready to be written to disk
-        """
-        with self._resize(width, height) as scaled:
-            return self._encode_image(scaled, output_type)
-
-    def crop(self, width: int, height: int, output_type: str) -> BytesIO:
-        """Rescales and crops the image to the given dimensions preserving
-        aspect::
-            (w_in / h_in) = (w_scaled / h_scaled)
-            w_scaled = max(w_out, h_out * (w_in / h_in))
-            h_scaled = max(h_out, w_out * (h_in / w_in))
-
-        Args:
-            max_width: The largest possible width.
-            max_height: The largest possible height.
-
-        Returns:
-            The bytes of the encoded image ready to be written to disk
-        """
-        if width * self.height > height * self.width:
-            scaled_width = width
-            scaled_height = (width * self.height) // self.width
-            crop_top = (scaled_height - height) // 2
-            crop_bottom = height + crop_top
-            crop = (0, crop_top, width, crop_bottom)
-        else:
-            scaled_width = (height * self.width) // self.height
-            scaled_height = height
-            crop_left = (scaled_width - width) // 2
-            crop_right = width + crop_left
-            crop = (crop_left, 0, crop_right, height)
-
-        with self._resize(scaled_width, scaled_height) as scaled_image:
-            with scaled_image.crop(crop) as cropped:
-                return self._encode_image(cropped, output_type)
-
-    def _encode_image(self, output_image: Image.Image, output_type: str) -> BytesIO:
-        output_bytes_io = BytesIO()
-        fmt = self.FORMATS[output_type]
-        if fmt == "JPEG":
-            output_image = output_image.convert("RGB")
-        output_image.save(output_bytes_io, fmt, quality=80)
-        return output_bytes_io
-
-    def close(self) -> None:
-        """Closes the underlying image file.
-
-        Once closed no other functions can be called.
-
-        Can be called multiple times.
-        """
-
-        if self._closed:
-            return
-
-        self._closed = True
-
-        # Since we run this on the finalizer then we need to handle `__init__`
-        # raising an exception before it can define `self.image`.
-        image = getattr(self, "image", None)
-        if image is None:
-            return
-
-        image.close()
-
-    def __enter__(self) -> "Thumbnailer":
-        """Make `Thumbnailer` a context manager that calls `close` on
-        `__exit__`.
-        """
-        return self
-
-    def __exit__(
-        self,
-        type: Optional[Type[BaseException]],
-        value: Optional[BaseException],
-        traceback: Optional[TracebackType],
-    ) -> None:
-        self.close()
-
-    def __del__(self) -> None:
-        # Make sure we actually do close the image, rather than leak data.
-        self.close()
diff --git a/synapse/rest/media/v1/upload_resource.py b/synapse/rest/media/v1/upload_resource.py
deleted file mode 100644
index 97548b54e5..0000000000
--- a/synapse/rest/media/v1/upload_resource.py
+++ /dev/null
@@ -1,108 +0,0 @@
-# Copyright 2014-2016 OpenMarket Ltd
-# Copyright 2020-2021 The Matrix.org Foundation C.I.C.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import logging
-from typing import IO, TYPE_CHECKING, Dict, List, Optional
-
-from synapse.api.errors import Codes, SynapseError
-from synapse.http.server import DirectServeJsonResource, respond_with_json
-from synapse.http.servlet import parse_bytes_from_args
-from synapse.http.site import SynapseRequest
-from synapse.rest.media.v1.media_storage import SpamMediaException
-
-if TYPE_CHECKING:
-    from synapse.rest.media.v1.media_repository import MediaRepository
-    from synapse.server import HomeServer
-
-logger = logging.getLogger(__name__)
-
-
-class UploadResource(DirectServeJsonResource):
-    isLeaf = True
-
-    def __init__(self, hs: "HomeServer", media_repo: "MediaRepository"):
-        super().__init__()
-
-        self.media_repo = media_repo
-        self.filepaths = media_repo.filepaths
-        self.store = hs.get_datastores().main
-        self.clock = hs.get_clock()
-        self.server_name = hs.hostname
-        self.auth = hs.get_auth()
-        self.max_upload_size = hs.config.media.max_upload_size
-        self.clock = hs.get_clock()
-
-    async def _async_render_OPTIONS(self, request: SynapseRequest) -> None:
-        respond_with_json(request, 200, {}, send_cors=True)
-
-    async def _async_render_POST(self, request: SynapseRequest) -> None:
-        requester = await self.auth.get_user_by_req(request)
-        raw_content_length = request.getHeader("Content-Length")
-        if raw_content_length is None:
-            raise SynapseError(msg="Request must specify a Content-Length", code=400)
-        try:
-            content_length = int(raw_content_length)
-        except ValueError:
-            raise SynapseError(msg="Content-Length value is invalid", code=400)
-        if content_length > self.max_upload_size:
-            raise SynapseError(
-                msg="Upload request body is too large",
-                code=413,
-                errcode=Codes.TOO_LARGE,
-            )
-
-        args: Dict[bytes, List[bytes]] = request.args  # type: ignore
-        upload_name_bytes = parse_bytes_from_args(args, "filename")
-        if upload_name_bytes:
-            try:
-                upload_name: Optional[str] = upload_name_bytes.decode("utf8")
-            except UnicodeDecodeError:
-                raise SynapseError(
-                    msg="Invalid UTF-8 filename parameter: %r" % (upload_name_bytes,),
-                    code=400,
-                )
-
-        # If the name is falsey (e.g. an empty byte string) ensure it is None.
-        else:
-            upload_name = None
-
-        headers = request.requestHeaders
-
-        if headers.hasHeader(b"Content-Type"):
-            content_type_headers = headers.getRawHeaders(b"Content-Type")
-            assert content_type_headers  # for mypy
-            media_type = content_type_headers[0].decode("ascii")
-        else:
-            media_type = "application/octet-stream"
-
-        # if headers.hasHeader(b"Content-Disposition"):
-        #     disposition = headers.getRawHeaders(b"Content-Disposition")[0]
-        # TODO(markjh): parse content-dispostion
-
-        try:
-            content: IO = request.content  # type: ignore
-            content_uri = await self.media_repo.create_content(
-                media_type, upload_name, content, content_length, requester.user
-            )
-        except SpamMediaException:
-            # For uploading of media we want to respond with a 400, instead of
-            # the default 404, as that would just be confusing.
-            raise SynapseError(400, "Bad content")
-
-        logger.info("Uploaded content with URI '%s'", content_uri)
-
-        respond_with_json(
-            request, 200, {"content_uri": str(content_uri)}, send_cors=True
-        )
diff --git a/synapse/server.py b/synapse/server.py
index e5a3475247..a7c32e9a60 100644
--- a/synapse/server.py
+++ b/synapse/server.py
@@ -105,6 +105,7 @@ from synapse.handlers.typing import FollowerTypingHandler, TypingWriterHandler
 from synapse.handlers.user_directory import UserDirectoryHandler
 from synapse.http.client import InsecureInterceptableContextFactory, SimpleHttpClient
 from synapse.http.matrixfederationclient import MatrixFederationHttpClient
+from synapse.media.media_repository import MediaRepository
 from synapse.metrics.common_usage_metrics import CommonUsageMetricsManager
 from synapse.module_api import ModuleApi
 from synapse.notifier import Notifier, ReplicationNotifier
@@ -115,10 +116,7 @@ from synapse.replication.tcp.external_cache import ExternalCache
 from synapse.replication.tcp.handler import ReplicationCommandHandler
 from synapse.replication.tcp.resource import ReplicationStreamer
 from synapse.replication.tcp.streams import STREAMS_MAP, Stream
-from synapse.rest.media.v1.media_repository import (
-    MediaRepository,
-    MediaRepositoryResource,
-)
+from synapse.rest.media.media_repository_resource import MediaRepositoryResource
 from synapse.server_notices.server_notices_manager import ServerNoticesManager
 from synapse.server_notices.server_notices_sender import ServerNoticesSender
 from synapse.server_notices.worker_server_notices_sender import (
diff --git a/tests/media/__init__.py b/tests/media/__init__.py
new file mode 100644
index 0000000000..68910cbf5b
--- /dev/null
+++ b/tests/media/__init__.py
@@ -0,0 +1,13 @@
+#  Copyright 2023 The Matrix.org Foundation C.I.C.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
diff --git a/tests/media/test_base.py b/tests/media/test_base.py
new file mode 100644
index 0000000000..66498c744d
--- /dev/null
+++ b/tests/media/test_base.py
@@ -0,0 +1,38 @@
+# Copyright 2019 New Vector Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from synapse.media._base import get_filename_from_headers
+
+from tests import unittest
+
+
+class GetFileNameFromHeadersTests(unittest.TestCase):
+    # input -> expected result
+    TEST_CASES = {
+        b"inline; filename=abc.txt": "abc.txt",
+        b'inline; filename="azerty"': "azerty",
+        b'inline; filename="aze%20rty"': "aze%20rty",
+        b'inline; filename="aze"rty"': 'aze"rty',
+        b'inline; filename="azer;ty"': "azer;ty",
+        b"inline; filename*=utf-8''foo%C2%A3bar": "foo£bar",
+    }
+
+    def tests(self) -> None:
+        for hdr, expected in self.TEST_CASES.items():
+            res = get_filename_from_headers({b"Content-Disposition": [hdr]})
+            self.assertEqual(
+                res,
+                expected,
+                f"expected output for {hdr!r} to be {expected} but was {res}",
+            )
diff --git a/tests/media/test_filepath.py b/tests/media/test_filepath.py
new file mode 100644
index 0000000000..95e3b83d5a
--- /dev/null
+++ b/tests/media/test_filepath.py
@@ -0,0 +1,595 @@
+# Copyright 2021 The Matrix.org Foundation C.I.C.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import inspect
+import os
+from typing import Iterable
+
+from synapse.media.filepath import MediaFilePaths, _wrap_with_jail_check
+
+from tests import unittest
+
+
+class MediaFilePathsTestCase(unittest.TestCase):
+    def setUp(self) -> None:
+        super().setUp()
+
+        self.filepaths = MediaFilePaths("/media_store")
+
+    def test_local_media_filepath(self) -> None:
+        """Test local media paths"""
+        self.assertEqual(
+            self.filepaths.local_media_filepath_rel("GerZNDnDZVjsOtardLuwfIBg"),
+            "local_content/Ge/rZ/NDnDZVjsOtardLuwfIBg",
+        )
+        self.assertEqual(
+            self.filepaths.local_media_filepath("GerZNDnDZVjsOtardLuwfIBg"),
+            "/media_store/local_content/Ge/rZ/NDnDZVjsOtardLuwfIBg",
+        )
+
+    def test_local_media_thumbnail(self) -> None:
+        """Test local media thumbnail paths"""
+        self.assertEqual(
+            self.filepaths.local_media_thumbnail_rel(
+                "GerZNDnDZVjsOtardLuwfIBg", 800, 600, "image/jpeg", "scale"
+            ),
+            "local_thumbnails/Ge/rZ/NDnDZVjsOtardLuwfIBg/800-600-image-jpeg-scale",
+        )
+        self.assertEqual(
+            self.filepaths.local_media_thumbnail(
+                "GerZNDnDZVjsOtardLuwfIBg", 800, 600, "image/jpeg", "scale"
+            ),
+            "/media_store/local_thumbnails/Ge/rZ/NDnDZVjsOtardLuwfIBg/800-600-image-jpeg-scale",
+        )
+
+    def test_local_media_thumbnail_dir(self) -> None:
+        """Test local media thumbnail directory paths"""
+        self.assertEqual(
+            self.filepaths.local_media_thumbnail_dir("GerZNDnDZVjsOtardLuwfIBg"),
+            "/media_store/local_thumbnails/Ge/rZ/NDnDZVjsOtardLuwfIBg",
+        )
+
+    def test_remote_media_filepath(self) -> None:
+        """Test remote media paths"""
+        self.assertEqual(
+            self.filepaths.remote_media_filepath_rel(
+                "example.com", "GerZNDnDZVjsOtardLuwfIBg"
+            ),
+            "remote_content/example.com/Ge/rZ/NDnDZVjsOtardLuwfIBg",
+        )
+        self.assertEqual(
+            self.filepaths.remote_media_filepath(
+                "example.com", "GerZNDnDZVjsOtardLuwfIBg"
+            ),
+            "/media_store/remote_content/example.com/Ge/rZ/NDnDZVjsOtardLuwfIBg",
+        )
+
+    def test_remote_media_thumbnail(self) -> None:
+        """Test remote media thumbnail paths"""
+        self.assertEqual(
+            self.filepaths.remote_media_thumbnail_rel(
+                "example.com",
+                "GerZNDnDZVjsOtardLuwfIBg",
+                800,
+                600,
+                "image/jpeg",
+                "scale",
+            ),
+            "remote_thumbnail/example.com/Ge/rZ/NDnDZVjsOtardLuwfIBg/800-600-image-jpeg-scale",
+        )
+        self.assertEqual(
+            self.filepaths.remote_media_thumbnail(
+                "example.com",
+                "GerZNDnDZVjsOtardLuwfIBg",
+                800,
+                600,
+                "image/jpeg",
+                "scale",
+            ),
+            "/media_store/remote_thumbnail/example.com/Ge/rZ/NDnDZVjsOtardLuwfIBg/800-600-image-jpeg-scale",
+        )
+
+    def test_remote_media_thumbnail_legacy(self) -> None:
+        """Test old-style remote media thumbnail paths"""
+        self.assertEqual(
+            self.filepaths.remote_media_thumbnail_rel_legacy(
+                "example.com", "GerZNDnDZVjsOtardLuwfIBg", 800, 600, "image/jpeg"
+            ),
+            "remote_thumbnail/example.com/Ge/rZ/NDnDZVjsOtardLuwfIBg/800-600-image-jpeg",
+        )
+
+    def test_remote_media_thumbnail_dir(self) -> None:
+        """Test remote media thumbnail directory paths"""
+        self.assertEqual(
+            self.filepaths.remote_media_thumbnail_dir(
+                "example.com", "GerZNDnDZVjsOtardLuwfIBg"
+            ),
+            "/media_store/remote_thumbnail/example.com/Ge/rZ/NDnDZVjsOtardLuwfIBg",
+        )
+
+    def test_url_cache_filepath(self) -> None:
+        """Test URL cache paths"""
+        self.assertEqual(
+            self.filepaths.url_cache_filepath_rel("2020-01-02_GerZNDnDZVjsOtar"),
+            "url_cache/2020-01-02/GerZNDnDZVjsOtar",
+        )
+        self.assertEqual(
+            self.filepaths.url_cache_filepath("2020-01-02_GerZNDnDZVjsOtar"),
+            "/media_store/url_cache/2020-01-02/GerZNDnDZVjsOtar",
+        )
+
+    def test_url_cache_filepath_legacy(self) -> None:
+        """Test old-style URL cache paths"""
+        self.assertEqual(
+            self.filepaths.url_cache_filepath_rel("GerZNDnDZVjsOtardLuwfIBg"),
+            "url_cache/Ge/rZ/NDnDZVjsOtardLuwfIBg",
+        )
+        self.assertEqual(
+            self.filepaths.url_cache_filepath("GerZNDnDZVjsOtardLuwfIBg"),
+            "/media_store/url_cache/Ge/rZ/NDnDZVjsOtardLuwfIBg",
+        )
+
+    def test_url_cache_filepath_dirs_to_delete(self) -> None:
+        """Test URL cache cleanup paths"""
+        self.assertEqual(
+            self.filepaths.url_cache_filepath_dirs_to_delete(
+                "2020-01-02_GerZNDnDZVjsOtar"
+            ),
+            ["/media_store/url_cache/2020-01-02"],
+        )
+
+    def test_url_cache_filepath_dirs_to_delete_legacy(self) -> None:
+        """Test old-style URL cache cleanup paths"""
+        self.assertEqual(
+            self.filepaths.url_cache_filepath_dirs_to_delete(
+                "GerZNDnDZVjsOtardLuwfIBg"
+            ),
+            [
+                "/media_store/url_cache/Ge/rZ",
+                "/media_store/url_cache/Ge",
+            ],
+        )
+
+    def test_url_cache_thumbnail(self) -> None:
+        """Test URL cache thumbnail paths"""
+        self.assertEqual(
+            self.filepaths.url_cache_thumbnail_rel(
+                "2020-01-02_GerZNDnDZVjsOtar", 800, 600, "image/jpeg", "scale"
+            ),
+            "url_cache_thumbnails/2020-01-02/GerZNDnDZVjsOtar/800-600-image-jpeg-scale",
+        )
+        self.assertEqual(
+            self.filepaths.url_cache_thumbnail(
+                "2020-01-02_GerZNDnDZVjsOtar", 800, 600, "image/jpeg", "scale"
+            ),
+            "/media_store/url_cache_thumbnails/2020-01-02/GerZNDnDZVjsOtar/800-600-image-jpeg-scale",
+        )
+
+    def test_url_cache_thumbnail_legacy(self) -> None:
+        """Test old-style URL cache thumbnail paths"""
+        self.assertEqual(
+            self.filepaths.url_cache_thumbnail_rel(
+                "GerZNDnDZVjsOtardLuwfIBg", 800, 600, "image/jpeg", "scale"
+            ),
+            "url_cache_thumbnails/Ge/rZ/NDnDZVjsOtardLuwfIBg/800-600-image-jpeg-scale",
+        )
+        self.assertEqual(
+            self.filepaths.url_cache_thumbnail(
+                "GerZNDnDZVjsOtardLuwfIBg", 800, 600, "image/jpeg", "scale"
+            ),
+            "/media_store/url_cache_thumbnails/Ge/rZ/NDnDZVjsOtardLuwfIBg/800-600-image-jpeg-scale",
+        )
+
+    def test_url_cache_thumbnail_directory(self) -> None:
+        """Test URL cache thumbnail directory paths"""
+        self.assertEqual(
+            self.filepaths.url_cache_thumbnail_directory_rel(
+                "2020-01-02_GerZNDnDZVjsOtar"
+            ),
+            "url_cache_thumbnails/2020-01-02/GerZNDnDZVjsOtar",
+        )
+        self.assertEqual(
+            self.filepaths.url_cache_thumbnail_directory("2020-01-02_GerZNDnDZVjsOtar"),
+            "/media_store/url_cache_thumbnails/2020-01-02/GerZNDnDZVjsOtar",
+        )
+
+    def test_url_cache_thumbnail_directory_legacy(self) -> None:
+        """Test old-style URL cache thumbnail directory paths"""
+        self.assertEqual(
+            self.filepaths.url_cache_thumbnail_directory_rel(
+                "GerZNDnDZVjsOtardLuwfIBg"
+            ),
+            "url_cache_thumbnails/Ge/rZ/NDnDZVjsOtardLuwfIBg",
+        )
+        self.assertEqual(
+            self.filepaths.url_cache_thumbnail_directory("GerZNDnDZVjsOtardLuwfIBg"),
+            "/media_store/url_cache_thumbnails/Ge/rZ/NDnDZVjsOtardLuwfIBg",
+        )
+
+    def test_url_cache_thumbnail_dirs_to_delete(self) -> None:
+        """Test URL cache thumbnail cleanup paths"""
+        self.assertEqual(
+            self.filepaths.url_cache_thumbnail_dirs_to_delete(
+                "2020-01-02_GerZNDnDZVjsOtar"
+            ),
+            [
+                "/media_store/url_cache_thumbnails/2020-01-02/GerZNDnDZVjsOtar",
+                "/media_store/url_cache_thumbnails/2020-01-02",
+            ],
+        )
+
+    def test_url_cache_thumbnail_dirs_to_delete_legacy(self) -> None:
+        """Test old-style URL cache thumbnail cleanup paths"""
+        self.assertEqual(
+            self.filepaths.url_cache_thumbnail_dirs_to_delete(
+                "GerZNDnDZVjsOtardLuwfIBg"
+            ),
+            [
+                "/media_store/url_cache_thumbnails/Ge/rZ/NDnDZVjsOtardLuwfIBg",
+                "/media_store/url_cache_thumbnails/Ge/rZ",
+                "/media_store/url_cache_thumbnails/Ge",
+            ],
+        )
+
+    def test_server_name_validation(self) -> None:
+        """Test validation of server names"""
+        self._test_path_validation(
+            [
+                "remote_media_filepath_rel",
+                "remote_media_filepath",
+                "remote_media_thumbnail_rel",
+                "remote_media_thumbnail",
+                "remote_media_thumbnail_rel_legacy",
+                "remote_media_thumbnail_dir",
+            ],
+            parameter="server_name",
+            valid_values=[
+                "matrix.org",
+                "matrix.org:8448",
+                "matrix-federation.matrix.org",
+                "matrix-federation.matrix.org:8448",
+                "10.1.12.123",
+                "10.1.12.123:8448",
+                "[fd00:abcd::ffff]",
+                "[fd00:abcd::ffff]:8448",
+            ],
+            invalid_values=[
+                "/matrix.org",
+                "matrix.org/..",
+                "matrix.org\x00",
+                "",
+                ".",
+                "..",
+                "/",
+            ],
+        )
+
+    def test_file_id_validation(self) -> None:
+        """Test validation of local, remote and legacy URL cache file / media IDs"""
+        # File / media IDs get split into three parts to form paths, consisting of the
+        # first two characters, next two characters and rest of the ID.
+        valid_file_ids = [
+            "GerZNDnDZVjsOtardLuwfIBg",
+            # Unexpected, but produces an acceptable path:
+            "GerZN",  # "N" becomes the last directory
+        ]
+        invalid_file_ids = [
+            "/erZNDnDZVjsOtardLuwfIBg",
+            "Ge/ZNDnDZVjsOtardLuwfIBg",
+            "GerZ/DnDZVjsOtardLuwfIBg",
+            "GerZ/..",
+            "G\x00rZNDnDZVjsOtardLuwfIBg",
+            "Ger\x00NDnDZVjsOtardLuwfIBg",
+            "GerZNDnDZVjsOtardLuwfIBg\x00",
+            "",
+            "Ge",
+            "GerZ",
+            "GerZ.",
+            "..rZNDnDZVjsOtardLuwfIBg",
+            "Ge..NDnDZVjsOtardLuwfIBg",
+            "GerZ..",
+            "GerZ/",
+        ]
+
+        self._test_path_validation(
+            [
+                "local_media_filepath_rel",
+                "local_media_filepath",
+                "local_media_thumbnail_rel",
+                "local_media_thumbnail",
+                "local_media_thumbnail_dir",
+                # Legacy URL cache media IDs
+                "url_cache_filepath_rel",
+                "url_cache_filepath",
+                # `url_cache_filepath_dirs_to_delete` is tested below.
+                "url_cache_thumbnail_rel",
+                "url_cache_thumbnail",
+                "url_cache_thumbnail_directory_rel",
+                "url_cache_thumbnail_directory",
+                "url_cache_thumbnail_dirs_to_delete",
+            ],
+            parameter="media_id",
+            valid_values=valid_file_ids,
+            invalid_values=invalid_file_ids,
+        )
+
+        # `url_cache_filepath_dirs_to_delete` ignores what would be the last path
+        # component, so only the first 4 characters matter.
+        self._test_path_validation(
+            [
+                "url_cache_filepath_dirs_to_delete",
+            ],
+            parameter="media_id",
+            valid_values=valid_file_ids,
+            invalid_values=[
+                "/erZNDnDZVjsOtardLuwfIBg",
+                "Ge/ZNDnDZVjsOtardLuwfIBg",
+                "G\x00rZNDnDZVjsOtardLuwfIBg",
+                "Ger\x00NDnDZVjsOtardLuwfIBg",
+                "",
+                "Ge",
+                "..rZNDnDZVjsOtardLuwfIBg",
+                "Ge..NDnDZVjsOtardLuwfIBg",
+            ],
+        )
+
+        self._test_path_validation(
+            [
+                "remote_media_filepath_rel",
+                "remote_media_filepath",
+                "remote_media_thumbnail_rel",
+                "remote_media_thumbnail",
+                "remote_media_thumbnail_rel_legacy",
+                "remote_media_thumbnail_dir",
+            ],
+            parameter="file_id",
+            valid_values=valid_file_ids,
+            invalid_values=invalid_file_ids,
+        )
+
+    def test_url_cache_media_id_validation(self) -> None:
+        """Test validation of URL cache media IDs"""
+        self._test_path_validation(
+            [
+                "url_cache_filepath_rel",
+                "url_cache_filepath",
+                # `url_cache_filepath_dirs_to_delete` only cares about the date prefix
+                "url_cache_thumbnail_rel",
+                "url_cache_thumbnail",
+                "url_cache_thumbnail_directory_rel",
+                "url_cache_thumbnail_directory",
+                "url_cache_thumbnail_dirs_to_delete",
+            ],
+            parameter="media_id",
+            valid_values=[
+                "2020-01-02_GerZNDnDZVjsOtar",
+                "2020-01-02_G",  # Unexpected, but produces an acceptable path
+            ],
+            invalid_values=[
+                "2020-01-02",
+                "2020-01-02-",
+                "2020-01-02-.",
+                "2020-01-02-..",
+                "2020-01-02-/",
+                "2020-01-02-/GerZNDnDZVjsOtar",
+                "2020-01-02-GerZNDnDZVjsOtar/..",
+                "2020-01-02-GerZNDnDZVjsOtar\x00",
+            ],
+        )
+
+    def test_content_type_validation(self) -> None:
+        """Test validation of thumbnail content types"""
+        self._test_path_validation(
+            [
+                "local_media_thumbnail_rel",
+                "local_media_thumbnail",
+                "remote_media_thumbnail_rel",
+                "remote_media_thumbnail",
+                "remote_media_thumbnail_rel_legacy",
+                "url_cache_thumbnail_rel",
+                "url_cache_thumbnail",
+            ],
+            parameter="content_type",
+            valid_values=[
+                "image/jpeg",
+            ],
+            invalid_values=[
+                "",  # ValueError: not enough values to unpack
+                "image/jpeg/abc",  # ValueError: too many values to unpack
+                "image/jpeg\x00",
+            ],
+        )
+
+    def test_thumbnail_method_validation(self) -> None:
+        """Test validation of thumbnail methods"""
+        self._test_path_validation(
+            [
+                "local_media_thumbnail_rel",
+                "local_media_thumbnail",
+                "remote_media_thumbnail_rel",
+                "remote_media_thumbnail",
+                "url_cache_thumbnail_rel",
+                "url_cache_thumbnail",
+            ],
+            parameter="method",
+            valid_values=[
+                "crop",
+                "scale",
+            ],
+            invalid_values=[
+                "/scale",
+                "scale/..",
+                "scale\x00",
+                "/",
+            ],
+        )
+
+    def _test_path_validation(
+        self,
+        methods: Iterable[str],
+        parameter: str,
+        valid_values: Iterable[str],
+        invalid_values: Iterable[str],
+    ) -> None:
+        """Test that the specified methods validate the named parameter as expected
+
+        Args:
+            methods: The names of `MediaFilePaths` methods to test
+            parameter: The name of the parameter to test
+            valid_values: A list of parameter values that are expected to be accepted
+            invalid_values: A list of parameter values that are expected to be rejected
+
+        Raises:
+            AssertionError: If a value was accepted when it should have failed
+                validation.
+            ValueError: If a value failed validation when it should have been accepted.
+        """
+        for method in methods:
+            get_path = getattr(self.filepaths, method)
+
+            parameters = inspect.signature(get_path).parameters
+            kwargs = {
+                "server_name": "matrix.org",
+                "media_id": "GerZNDnDZVjsOtardLuwfIBg",
+                "file_id": "GerZNDnDZVjsOtardLuwfIBg",
+                "width": 800,
+                "height": 600,
+                "content_type": "image/jpeg",
+                "method": "scale",
+            }
+
+            if get_path.__name__.startswith("url_"):
+                kwargs["media_id"] = "2020-01-02_GerZNDnDZVjsOtar"
+
+            kwargs = {k: v for k, v in kwargs.items() if k in parameters}
+            kwargs.pop(parameter)
+
+            for value in valid_values:
+                kwargs[parameter] = value
+                get_path(**kwargs)
+                # No exception should be raised
+
+            for value in invalid_values:
+                with self.assertRaises(ValueError):
+                    kwargs[parameter] = value
+                    path_or_list = get_path(**kwargs)
+                    self.fail(
+                        f"{value!r} unexpectedly passed validation: "
+                        f"{method} returned {path_or_list!r}"
+                    )
+
+
+class MediaFilePathsJailTestCase(unittest.TestCase):
+    def _check_relative_path(self, filepaths: MediaFilePaths, path: str) -> None:
+        """Passes a relative path through the jail check.
+
+        Args:
+            filepaths: The `MediaFilePaths` instance.
+            path: A path relative to the media store directory.
+
+        Raises:
+            ValueError: If the jail check fails.
+        """
+
+        @_wrap_with_jail_check(relative=True)
+        def _make_relative_path(self: MediaFilePaths, path: str) -> str:
+            return path
+
+        _make_relative_path(filepaths, path)
+
+    def _check_absolute_path(self, filepaths: MediaFilePaths, path: str) -> None:
+        """Passes an absolute path through the jail check.
+
+        Args:
+            filepaths: The `MediaFilePaths` instance.
+            path: A path relative to the media store directory, made absolute here.
+
+        Raises:
+            ValueError: If the jail check fails.
+        """
+
+        @_wrap_with_jail_check(relative=False)
+        def _make_absolute_path(self: MediaFilePaths, path: str) -> str:
+            return os.path.join(self.base_path, path)
+
+        _make_absolute_path(filepaths, path)
+
+    def test_traversal_inside(self) -> None:
+        """Test the jail check for paths that stay within the media directory."""
+        # Despite the `../`s, these paths still lie within the media directory and it's
+        # expected for the jail check to allow them through.
+        # These paths ought to trip the other checks in place and should never be
+        # returned.
+        filepaths = MediaFilePaths("/media_store")
+        path = "url_cache/2020-01-02/../../GerZNDnDZVjsOtar"
+        self._check_relative_path(filepaths, path)
+        self._check_absolute_path(filepaths, path)
+
+    def test_traversal_outside(self) -> None:
+        """Test that the jail check fails for paths that escape the media directory."""
+        filepaths = MediaFilePaths("/media_store")
+        path = "url_cache/2020-01-02/../../../GerZNDnDZVjsOtar"
+        with self.assertRaises(ValueError):
+            self._check_relative_path(filepaths, path)
+        with self.assertRaises(ValueError):
+            self._check_absolute_path(filepaths, path)
+
+    def test_traversal_reentry(self) -> None:
+        """Test the jail check for paths that exit and re-enter the media directory."""
+        # These paths lie outside the media directory if it is a symlink, and inside
+        # otherwise. Ideally the check should fail, but this proves difficult.
+        # This test documents the behaviour for this edge case.
+        # These paths ought to trip the other checks in place and should never be
+        # returned.
+        filepaths = MediaFilePaths("/media_store")
+        path = "url_cache/2020-01-02/../../../media_store/GerZNDnDZVjsOtar"
+        self._check_relative_path(filepaths, path)
+        self._check_absolute_path(filepaths, path)
+
+    def test_symlink(self) -> None:
+        """Test that a symlink does not cause the jail check to fail."""
+        media_store_path = self.mktemp()
+
+        # symlink the media store directory
+        os.symlink("/mnt/synapse/media_store", media_store_path)
+
+        # Test that relative and absolute paths don't trip the check
+        # NB: `media_store_path` is a relative path
+        filepaths = MediaFilePaths(media_store_path)
+        self._check_relative_path(filepaths, "url_cache/2020-01-02/GerZNDnDZVjsOtar")
+        self._check_absolute_path(filepaths, "url_cache/2020-01-02/GerZNDnDZVjsOtar")
+
+        filepaths = MediaFilePaths(os.path.abspath(media_store_path))
+        self._check_relative_path(filepaths, "url_cache/2020-01-02/GerZNDnDZVjsOtar")
+        self._check_absolute_path(filepaths, "url_cache/2020-01-02/GerZNDnDZVjsOtar")
+
+    def test_symlink_subdirectory(self) -> None:
+        """Test that a symlinked subdirectory does not cause the jail check to fail."""
+        media_store_path = self.mktemp()
+        os.mkdir(media_store_path)
+
+        # symlink `url_cache/`
+        os.symlink(
+            "/mnt/synapse/media_store_url_cache",
+            os.path.join(media_store_path, "url_cache"),
+        )
+
+        # Test that relative and absolute paths don't trip the check
+        # NB: `media_store_path` is a relative path
+        filepaths = MediaFilePaths(media_store_path)
+        self._check_relative_path(filepaths, "url_cache/2020-01-02/GerZNDnDZVjsOtar")
+        self._check_absolute_path(filepaths, "url_cache/2020-01-02/GerZNDnDZVjsOtar")
+
+        filepaths = MediaFilePaths(os.path.abspath(media_store_path))
+        self._check_relative_path(filepaths, "url_cache/2020-01-02/GerZNDnDZVjsOtar")
+        self._check_absolute_path(filepaths, "url_cache/2020-01-02/GerZNDnDZVjsOtar")
diff --git a/tests/media/test_html_preview.py b/tests/media/test_html_preview.py
new file mode 100644
index 0000000000..e7da75db3e
--- /dev/null
+++ b/tests/media/test_html_preview.py
@@ -0,0 +1,542 @@
+# Copyright 2014-2016 OpenMarket Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from synapse.media.preview_html import (
+    _get_html_media_encodings,
+    decode_body,
+    parse_html_to_open_graph,
+    summarize_paragraphs,
+)
+
+from tests import unittest
+
+try:
+    import lxml
+except ImportError:
+    lxml = None
+
+
+class SummarizeTestCase(unittest.TestCase):
+    if not lxml:
+        skip = "url preview feature requires lxml"
+
+    def test_long_summarize(self) -> None:
+        example_paras = [
+            """Tromsø (Norwegian pronunciation: [ˈtrʊmsœ] ( listen); Northern Sami:
+            Romsa; Finnish: Tromssa[2] Kven: Tromssa) is a city and municipality in
+            Troms county, Norway. The administrative centre of the municipality is
+            the city of Tromsø. Outside of Norway, Tromso and Tromsö are
+            alternative spellings of the city.Tromsø is considered the northernmost
+            city in the world with a population above 50,000. The most populous town
+            north of it is Alta, Norway, with a population of 14,272 (2013).""",
+            """Tromsø lies in Northern Norway. The municipality has a population of
+            (2015) 72,066, but with an annual influx of students it has over 75,000
+            most of the year. It is the largest urban area in Northern Norway and the
+            third largest north of the Arctic Circle (following Murmansk and Norilsk).
+            Most of Tromsø, including the city centre, is located on the island of
+            Tromsøya, 350 kilometres (217 mi) north of the Arctic Circle. In 2012,
+            Tromsøya had a population of 36,088. Substantial parts of the urban area
+            are also situated on the mainland to the east, and on parts of Kvaløya—a
+            large island to the west. Tromsøya is connected to the mainland by the Tromsø
+            Bridge and the Tromsøysund Tunnel, and to the island of Kvaløya by the
+            Sandnessund Bridge. Tromsø Airport connects the city to many destinations
+            in Europe. The city is warmer than most other places located on the same
+            latitude, due to the warming effect of the Gulf Stream.""",
+            """The city centre of Tromsø contains the highest number of old wooden
+            houses in Northern Norway, the oldest house dating from 1789. The Arctic
+            Cathedral, a modern church from 1965, is probably the most famous landmark
+            in Tromsø. The city is a cultural centre for its region, with several
+            festivals taking place in the summer. Some of Norway's best-known
+             musicians, Torbjørn Brundtland and Svein Berge of the electronica duo
+             Röyksopp and Lene Marlin grew up and started their careers in Tromsø.
+             Noted electronic musician Geir Jenssen also hails from Tromsø.""",
+        ]
+
+        desc = summarize_paragraphs(example_paras, min_size=200, max_size=500)
+
+        self.assertEqual(
+            desc,
+            "Tromsø (Norwegian pronunciation: [ˈtrʊmsœ] ( listen); Northern Sami:"
+            " Romsa; Finnish: Tromssa[2] Kven: Tromssa) is a city and municipality in"
+            " Troms county, Norway. The administrative centre of the municipality is"
+            " the city of Tromsø. Outside of Norway, Tromso and Tromsö are"
+            " alternative spellings of the city.Tromsø is considered the northernmost"
+            " city in the world with a population above 50,000. The most populous town"
+            " north of it is Alta, Norway, with a population of 14,272 (2013).",
+        )
+
+        desc = summarize_paragraphs(example_paras[1:], min_size=200, max_size=500)
+
+        self.assertEqual(
+            desc,
+            "Tromsø lies in Northern Norway. The municipality has a population of"
+            " (2015) 72,066, but with an annual influx of students it has over 75,000"
+            " most of the year. It is the largest urban area in Northern Norway and the"
+            " third largest north of the Arctic Circle (following Murmansk and Norilsk)."
+            " Most of Tromsø, including the city centre, is located on the island of"
+            " Tromsøya, 350 kilometres (217 mi) north of the Arctic Circle. In 2012,"
+            " Tromsøya had a population of 36,088. Substantial parts of the urban…",
+        )
+
+    def test_short_summarize(self) -> None:
+        example_paras = [
+            "Tromsø (Norwegian pronunciation: [ˈtrʊmsœ] ( listen); Northern Sami:"
+            " Romsa; Finnish: Tromssa[2] Kven: Tromssa) is a city and municipality in"
+            " Troms county, Norway.",
+            "Tromsø lies in Northern Norway. The municipality has a population of"
+            " (2015) 72,066, but with an annual influx of students it has over 75,000"
+            " most of the year.",
+            "The city centre of Tromsø contains the highest number of old wooden"
+            " houses in Northern Norway, the oldest house dating from 1789. The Arctic"
+            " Cathedral, a modern church from 1965, is probably the most famous landmark"
+            " in Tromsø.",
+        ]
+
+        desc = summarize_paragraphs(example_paras, min_size=200, max_size=500)
+
+        self.assertEqual(
+            desc,
+            "Tromsø (Norwegian pronunciation: [ˈtrʊmsœ] ( listen); Northern Sami:"
+            " Romsa; Finnish: Tromssa[2] Kven: Tromssa) is a city and municipality in"
+            " Troms county, Norway.\n"
+            "\n"
+            "Tromsø lies in Northern Norway. The municipality has a population of"
+            " (2015) 72,066, but with an annual influx of students it has over 75,000"
+            " most of the year.",
+        )
+
+    def test_small_then_large_summarize(self) -> None:
+        example_paras = [
+            "Tromsø (Norwegian pronunciation: [ˈtrʊmsœ] ( listen); Northern Sami:"
+            " Romsa; Finnish: Tromssa[2] Kven: Tromssa) is a city and municipality in"
+            " Troms county, Norway.",
+            "Tromsø lies in Northern Norway. The municipality has a population of"
+            " (2015) 72,066, but with an annual influx of students it has over 75,000"
+            " most of the year."
+            " The city centre of Tromsø contains the highest number of old wooden"
+            " houses in Northern Norway, the oldest house dating from 1789. The Arctic"
+            " Cathedral, a modern church from 1965, is probably the most famous landmark"
+            " in Tromsø.",
+        ]
+
+        desc = summarize_paragraphs(example_paras, min_size=200, max_size=500)
+        self.assertEqual(
+            desc,
+            "Tromsø (Norwegian pronunciation: [ˈtrʊmsœ] ( listen); Northern Sami:"
+            " Romsa; Finnish: Tromssa[2] Kven: Tromssa) is a city and municipality in"
+            " Troms county, Norway.\n"
+            "\n"
+            "Tromsø lies in Northern Norway. The municipality has a population of"
+            " (2015) 72,066, but with an annual influx of students it has over 75,000"
+            " most of the year. The city centre of Tromsø contains the highest number"
+            " of old wooden houses in Northern Norway, the oldest house dating from"
+            " 1789. The Arctic Cathedral, a modern church from…",
+        )
+
+
+class OpenGraphFromHtmlTestCase(unittest.TestCase):
+    if not lxml:
+        skip = "url preview feature requires lxml"
+
+    def test_simple(self) -> None:
+        html = b"""
+        <html>
+        <head><title>Foo</title></head>
+        <body>
+        Some text.
+        </body>
+        </html>
+        """
+
+        tree = decode_body(html, "http://example.com/test.html")
+        og = parse_html_to_open_graph(tree)
+
+        self.assertEqual(og, {"og:title": "Foo", "og:description": "Some text."})
+
+    def test_comment(self) -> None:
+        html = b"""
+        <html>
+        <head><title>Foo</title></head>
+        <body>
+        <!-- HTML comment -->
+        Some text.
+        </body>
+        </html>
+        """
+
+        tree = decode_body(html, "http://example.com/test.html")
+        og = parse_html_to_open_graph(tree)
+
+        self.assertEqual(og, {"og:title": "Foo", "og:description": "Some text."})
+
+    def test_comment2(self) -> None:
+        html = b"""
+        <html>
+        <head><title>Foo</title></head>
+        <body>
+        Some text.
+        <!-- HTML comment -->
+        Some more text.
+        <p>Text</p>
+        More text
+        </body>
+        </html>
+        """
+
+        tree = decode_body(html, "http://example.com/test.html")
+        og = parse_html_to_open_graph(tree)
+
+        self.assertEqual(
+            og,
+            {
+                "og:title": "Foo",
+                "og:description": "Some text.\n\nSome more text.\n\nText\n\nMore text",
+            },
+        )
+
+    def test_script(self) -> None:
+        html = b"""
+        <html>
+        <head><title>Foo</title></head>
+        <body>
+        <script> (function() {})() </script>
+        Some text.
+        </body>
+        </html>
+        """
+
+        tree = decode_body(html, "http://example.com/test.html")
+        og = parse_html_to_open_graph(tree)
+
+        self.assertEqual(og, {"og:title": "Foo", "og:description": "Some text."})
+
+    def test_missing_title(self) -> None:
+        html = b"""
+        <html>
+        <body>
+        Some text.
+        </body>
+        </html>
+        """
+
+        tree = decode_body(html, "http://example.com/test.html")
+        og = parse_html_to_open_graph(tree)
+
+        self.assertEqual(og, {"og:title": None, "og:description": "Some text."})
+
+        # Another variant is a title with no content.
+        html = b"""
+        <html>
+        <head><title></title></head>
+        <body>
+        <h1>Title</h1>
+        </body>
+        </html>
+        """
+
+        tree = decode_body(html, "http://example.com/test.html")
+        og = parse_html_to_open_graph(tree)
+
+        self.assertEqual(og, {"og:title": "Title", "og:description": "Title"})
+
+    def test_h1_as_title(self) -> None:
+        html = b"""
+        <html>
+        <meta property="og:description" content="Some text."/>
+        <body>
+        <h1>Title</h1>
+        </body>
+        </html>
+        """
+
+        tree = decode_body(html, "http://example.com/test.html")
+        og = parse_html_to_open_graph(tree)
+
+        self.assertEqual(og, {"og:title": "Title", "og:description": "Some text."})
+
+    def test_empty_description(self) -> None:
+        """Description tags with empty content should be ignored."""
+        html = b"""
+        <html>
+        <meta property="og:description" content=""/>
+        <meta property="og:description"/>
+        <meta name="description" content=""/>
+        <meta name="description"/>
+        <meta name="description" content="Finally!"/>
+        <body>
+        <h1>Title</h1>
+        </body>
+        </html>
+        """
+
+        tree = decode_body(html, "http://example.com/test.html")
+        og = parse_html_to_open_graph(tree)
+
+        self.assertEqual(og, {"og:title": "Title", "og:description": "Finally!"})
+
+    def test_missing_title_and_broken_h1(self) -> None:
+        html = b"""
+        <html>
+        <body>
+        <h1><a href="foo"/></h1>
+        Some text.
+        </body>
+        </html>
+        """
+
+        tree = decode_body(html, "http://example.com/test.html")
+        og = parse_html_to_open_graph(tree)
+
+        self.assertEqual(og, {"og:title": None, "og:description": "Some text."})
+
+    def test_empty(self) -> None:
+        """Test a body with no data in it."""
+        html = b""
+        tree = decode_body(html, "http://example.com/test.html")
+        self.assertIsNone(tree)
+
+    def test_no_tree(self) -> None:
+        """A valid body with no tree in it."""
+        html = b"\x00"
+        tree = decode_body(html, "http://example.com/test.html")
+        self.assertIsNone(tree)
+
+    def test_xml(self) -> None:
+        """Test decoding XML and ensure it works properly."""
+        # Note that the strip() call is important to ensure the xml tag starts
+        # at the initial byte.
+        html = b"""
+        <?xml version="1.0" encoding="UTF-8"?>
+
+        <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
+        <html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
+        <head><title>Foo</title></head><body>Some text.</body></html>
+        """.strip()
+        tree = decode_body(html, "http://example.com/test.html")
+        og = parse_html_to_open_graph(tree)
+        self.assertEqual(og, {"og:title": "Foo", "og:description": "Some text."})
+
+    def test_invalid_encoding(self) -> None:
+        """An invalid character encoding should be ignored and treated as UTF-8, if possible."""
+        html = b"""
+        <html>
+        <head><title>Foo</title></head>
+        <body>
+        Some text.
+        </body>
+        </html>
+        """
+        tree = decode_body(html, "http://example.com/test.html", "invalid-encoding")
+        og = parse_html_to_open_graph(tree)
+        self.assertEqual(og, {"og:title": "Foo", "og:description": "Some text."})
+
+    def test_invalid_encoding2(self) -> None:
+        """A body which doesn't match the sent character encoding."""
+        # Note that this contains an invalid UTF-8 sequence in the title.
+        html = b"""
+        <html>
+        <head><title>\xff\xff Foo</title></head>
+        <body>
+        Some text.
+        </body>
+        </html>
+        """
+        tree = decode_body(html, "http://example.com/test.html")
+        og = parse_html_to_open_graph(tree)
+        self.assertEqual(og, {"og:title": "ÿÿ Foo", "og:description": "Some text."})
+
+    def test_windows_1252(self) -> None:
+        """A body which uses cp1252, but doesn't declare that."""
+        html = b"""
+        <html>
+        <head><title>\xf3</title></head>
+        <body>
+        Some text.
+        </body>
+        </html>
+        """
+        tree = decode_body(html, "http://example.com/test.html")
+        og = parse_html_to_open_graph(tree)
+        self.assertEqual(og, {"og:title": "ó", "og:description": "Some text."})
+
+    def test_twitter_tag(self) -> None:
+        """Twitter card tags should be used if nothing else is available."""
+        html = b"""
+        <html>
+        <meta name="twitter:card" content="summary">
+        <meta name="twitter:description" content="Description">
+        <meta name="twitter:site" content="@matrixdotorg">
+        </html>
+        """
+        tree = decode_body(html, "http://example.com/test.html")
+        og = parse_html_to_open_graph(tree)
+        self.assertEqual(
+            og,
+            {
+                "og:title": None,
+                "og:description": "Description",
+                "og:site_name": "@matrixdotorg",
+            },
+        )
+
+        # But they shouldn't override Open Graph values.
+        html = b"""
+        <html>
+        <meta name="twitter:card" content="summary">
+        <meta name="twitter:description" content="Description">
+        <meta property="og:description" content="Real Description">
+        <meta name="twitter:site" content="@matrixdotorg">
+        <meta property="og:site_name" content="matrix.org">
+        </html>
+        """
+        tree = decode_body(html, "http://example.com/test.html")
+        og = parse_html_to_open_graph(tree)
+        self.assertEqual(
+            og,
+            {
+                "og:title": None,
+                "og:description": "Real Description",
+                "og:site_name": "matrix.org",
+            },
+        )
+
+    def test_nested_nodes(self) -> None:
+        """A body with some nested nodes. Tests that we iterate over children
+        in the right order (and don't reverse the order of the text)."""
+        html = b"""
+        <a href="somewhere">Welcome <b>the bold <u>and underlined text <svg>
+        with a cheeky SVG</svg></u> and <strong>some</strong> tail text</b></a>
+        """
+        tree = decode_body(html, "http://example.com/test.html")
+        og = parse_html_to_open_graph(tree)
+        self.assertEqual(
+            og,
+            {
+                "og:title": None,
+                "og:description": "Welcome\n\nthe bold\n\nand underlined text\n\nand\n\nsome\n\ntail text",
+            },
+        )
+
+
+class MediaEncodingTestCase(unittest.TestCase):
+    def test_meta_charset(self) -> None:
+        """A character encoding is found via the meta tag."""
+        encodings = _get_html_media_encodings(
+            b"""
+        <html>
+        <head><meta charset="ascii">
+        </head>
+        </html>
+        """,
+            "text/html",
+        )
+        self.assertEqual(list(encodings), ["ascii", "utf-8", "cp1252"])
+
+        # A less well-formed version.
+        encodings = _get_html_media_encodings(
+            b"""
+        <html>
+        <head>< meta charset = ascii>
+        </head>
+        </html>
+        """,
+            "text/html",
+        )
+        self.assertEqual(list(encodings), ["ascii", "utf-8", "cp1252"])
+
+    def test_meta_charset_underscores(self) -> None:
+        """A character encoding containing an underscore is found via the meta tag."""
+        encodings = _get_html_media_encodings(
+            b"""
+        <html>
+        <head><meta charset="Shift_JIS">
+        </head>
+        </html>
+        """,
+            "text/html",
+        )
+        self.assertEqual(list(encodings), ["shift_jis", "utf-8", "cp1252"])
+
+    def test_xml_encoding(self) -> None:
+        """A character encoding is found via the XML declaration."""
+        encodings = _get_html_media_encodings(
+            b"""
+        <?xml version="1.0" encoding="ascii"?>
+        <html>
+        </html>
+        """,
+            "text/html",
+        )
+        self.assertEqual(list(encodings), ["ascii", "utf-8", "cp1252"])
+
+    def test_meta_xml_encoding(self) -> None:
+        """Meta tags take precedence over XML encoding."""
+        encodings = _get_html_media_encodings(
+            b"""
+        <?xml version="1.0" encoding="ascii"?>
+        <html>
+        <head><meta charset="UTF-16">
+        </head>
+        </html>
+        """,
+            "text/html",
+        )
+        self.assertEqual(list(encodings), ["utf-16", "ascii", "utf-8", "cp1252"])
+
+    def test_content_type(self) -> None:
+        """A character encoding is found via the Content-Type header."""
+        # Test a few variations of the header.
+        headers = (
+            'text/html; charset="ascii";',
+            "text/html;charset=ascii;",
+            'text/html;  charset="ascii"',
+            "text/html; charset=ascii",
+            'text/html; charset="ascii;',
+            'text/html; charset=ascii";',
+        )
+        for header in headers:
+            encodings = _get_html_media_encodings(b"", header)
+            self.assertEqual(list(encodings), ["ascii", "utf-8", "cp1252"])
+
+    def test_fallback(self) -> None:
+        """A character encoding cannot be found in the body or header."""
+        encodings = _get_html_media_encodings(b"", "text/html")
+        self.assertEqual(list(encodings), ["utf-8", "cp1252"])
+
+    def test_duplicates(self) -> None:
+        """Ensure each encoding is only attempted once."""
+        encodings = _get_html_media_encodings(
+            b"""
+        <?xml version="1.0" encoding="utf8"?>
+        <html>
+        <head><meta charset="UTF-8">
+        </head>
+        </html>
+        """,
+            'text/html; charset="UTF_8"',
+        )
+        self.assertEqual(list(encodings), ["utf-8", "cp1252"])
+
+    def test_unknown_invalid(self) -> None:
+        """A character encoding should be ignored if it is unknown or invalid."""
+        encodings = _get_html_media_encodings(
+            b"""
+        <html>
+        <head><meta charset="invalid">
+        </head>
+        </html>
+        """,
+            'text/html; charset="invalid"',
+        )
+        self.assertEqual(list(encodings), ["utf-8", "cp1252"])
diff --git a/tests/media/test_media_storage.py b/tests/media/test_media_storage.py
new file mode 100644
index 0000000000..870047d0f2
--- /dev/null
+++ b/tests/media/test_media_storage.py
@@ -0,0 +1,792 @@
+# Copyright 2018-2021 The Matrix.org Foundation C.I.C.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import os
+import shutil
+import tempfile
+from binascii import unhexlify
+from io import BytesIO
+from typing import Any, BinaryIO, ClassVar, Dict, List, Optional, Tuple, Union
+from unittest.mock import Mock
+from urllib import parse
+
+import attr
+from parameterized import parameterized, parameterized_class
+from PIL import Image as Image
+from typing_extensions import Literal
+
+from twisted.internet import defer
+from twisted.internet.defer import Deferred
+from twisted.test.proto_helpers import MemoryReactor
+
+from synapse.api.errors import Codes
+from synapse.events import EventBase
+from synapse.events.spamcheck import load_legacy_spam_checkers
+from synapse.http.types import QueryParams
+from synapse.logging.context import make_deferred_yieldable
+from synapse.media._base import FileInfo
+from synapse.media.filepath import MediaFilePaths
+from synapse.media.media_storage import MediaStorage, ReadableFileWrapper
+from synapse.media.storage_provider import FileStorageProviderBackend
+from synapse.module_api import ModuleApi
+from synapse.rest import admin
+from synapse.rest.client import login
+from synapse.server import HomeServer
+from synapse.types import JsonDict, RoomAlias
+from synapse.util import Clock
+
+from tests import unittest
+from tests.server import FakeChannel, FakeSite, make_request
+from tests.test_utils import SMALL_PNG
+from tests.utils import default_config
+
+
+class MediaStorageTests(unittest.HomeserverTestCase):
+    needs_threadpool = True
+
+    def prepare(self, reactor: MemoryReactor, clock: Clock, hs: HomeServer) -> None:
+        self.test_dir = tempfile.mkdtemp(prefix="synapse-tests-")
+        self.addCleanup(shutil.rmtree, self.test_dir)
+
+        self.primary_base_path = os.path.join(self.test_dir, "primary")
+        self.secondary_base_path = os.path.join(self.test_dir, "secondary")
+
+        hs.config.media.media_store_path = self.primary_base_path
+
+        storage_providers = [FileStorageProviderBackend(hs, self.secondary_base_path)]
+
+        self.filepaths = MediaFilePaths(self.primary_base_path)
+        self.media_storage = MediaStorage(
+            hs, self.primary_base_path, self.filepaths, storage_providers
+        )
+
+    def test_ensure_media_is_in_local_cache(self) -> None:
+        media_id = "some_media_id"
+        test_body = "Test\n"
+
+        # First we create a file that is in a storage provider but not in the
+        # local primary media store
+        rel_path = self.filepaths.local_media_filepath_rel(media_id)
+        secondary_path = os.path.join(self.secondary_base_path, rel_path)
+
+        os.makedirs(os.path.dirname(secondary_path))
+
+        with open(secondary_path, "w") as f:
+            f.write(test_body)
+
+        # Now we run ensure_media_is_in_local_cache, which should copy the file
+        # to the local cache.
+        file_info = FileInfo(None, media_id)
+
+        # This uses a real blocking threadpool so we have to wait for it to be
+        # actually done :/
+        x = defer.ensureDeferred(
+            self.media_storage.ensure_media_is_in_local_cache(file_info)
+        )
+
+        # Hotloop until the threadpool does its job...
+        self.wait_on_thread(x)
+
+        local_path = self.get_success(x)
+
+        self.assertTrue(os.path.exists(local_path))
+
+        # Asserts the file is under the expected local cache directory
+        self.assertEqual(
+            os.path.commonprefix([self.primary_base_path, local_path]),
+            self.primary_base_path,
+        )
+
+        with open(local_path) as f:
+            body = f.read()
+
+        self.assertEqual(test_body, body)
+
+
+@attr.s(auto_attribs=True, slots=True, frozen=True)
+class _TestImage:
+    """An image for testing thumbnailing with the expected results
+
+    Attributes:
+        data: The raw image to thumbnail
+        content_type: The type of the image as a content type, e.g. "image/png"
+        extension: The extension associated with the format, e.g. ".png"
+        expected_cropped: The expected bytes from cropped thumbnailing, or None if
+            test should just check for a valid image returned.
+        expected_scaled: The expected bytes from scaled thumbnailing, or None if
+            test should just check for a valid image returned.
+        expected_found: True if the file should exist on the server, or False if
+            a 404/400 is expected.
+        unable_to_thumbnail: True if we expect the thumbnailing to fail (400), or
+            False if the thumbnailing should succeed or a normal 404 is expected.
+    """
+
+    data: bytes
+    content_type: bytes
+    extension: bytes
+    expected_cropped: Optional[bytes] = None
+    expected_scaled: Optional[bytes] = None
+    expected_found: bool = True
+    unable_to_thumbnail: bool = False
+
+
+@parameterized_class(
+    ("test_image",),
+    [
+        # small png
+        (
+            _TestImage(
+                SMALL_PNG,
+                b"image/png",
+                b".png",
+                unhexlify(
+                    b"89504e470d0a1a0a0000000d4948445200000020000000200806"
+                    b"000000737a7af40000001a49444154789cedc101010000008220"
+                    b"ffaf6e484001000000ef0610200001194334ee0000000049454e"
+                    b"44ae426082"
+                ),
+                unhexlify(
+                    b"89504e470d0a1a0a0000000d4948445200000001000000010806"
+                    b"0000001f15c4890000000d49444154789c636060606000000005"
+                    b"0001a5f645400000000049454e44ae426082"
+                ),
+            ),
+        ),
+        # small png with transparency.
+        (
+            _TestImage(
+                unhexlify(
+                    b"89504e470d0a1a0a0000000d49484452000000010000000101000"
+                    b"00000376ef9240000000274524e5300010194fdae0000000a4944"
+                    b"4154789c636800000082008177cd72b60000000049454e44ae426"
+                    b"082"
+                ),
+                b"image/png",
+                b".png",
+                # Note that we don't check the output since it varies across
+                # different versions of Pillow.
+            ),
+        ),
+        # small lossless webp
+        (
+            _TestImage(
+                unhexlify(
+                    b"524946461a000000574542505650384c0d0000002f0000001007"
+                    b"1011118888fe0700"
+                ),
+                b"image/webp",
+                b".webp",
+            ),
+        ),
+        # an empty file
+        (
+            _TestImage(
+                b"",
+                b"image/gif",
+                b".gif",
+                expected_found=False,
+                unable_to_thumbnail=True,
+            ),
+        ),
+    ],
+)
+class MediaRepoTests(unittest.HomeserverTestCase):
+    test_image: ClassVar[_TestImage]
+    hijack_auth = True
+    user_id = "@test:user"
+
+    def make_homeserver(self, reactor: MemoryReactor, clock: Clock) -> HomeServer:
+        self.fetches: List[
+            Tuple[
+                "Deferred[Tuple[bytes, Tuple[int, Dict[bytes, List[bytes]]]]]",
+                str,
+                str,
+                Optional[QueryParams],
+            ]
+        ] = []
+
+        def get_file(
+            destination: str,
+            path: str,
+            output_stream: BinaryIO,
+            args: Optional[QueryParams] = None,
+            retry_on_dns_fail: bool = True,
+            max_size: Optional[int] = None,
+            ignore_backoff: bool = False,
+        ) -> "Deferred[Tuple[int, Dict[bytes, List[bytes]]]]":
+            """A mock for MatrixFederationHttpClient.get_file."""
+
+            def write_to(
+                r: Tuple[bytes, Tuple[int, Dict[bytes, List[bytes]]]]
+            ) -> Tuple[int, Dict[bytes, List[bytes]]]:
+                data, response = r
+                output_stream.write(data)
+                return response
+
+            d: Deferred[Tuple[bytes, Tuple[int, Dict[bytes, List[bytes]]]]] = Deferred()
+            self.fetches.append((d, destination, path, args))
+            # Note that this callback changes the value held by d.
+            d_after_callback = d.addCallback(write_to)
+            return make_deferred_yieldable(d_after_callback)
+
+        # Mock out the homeserver's MatrixFederationHttpClient
+        client = Mock()
+        client.get_file = get_file
+
+        self.storage_path = self.mktemp()
+        self.media_store_path = self.mktemp()
+        os.mkdir(self.storage_path)
+        os.mkdir(self.media_store_path)
+
+        config = self.default_config()
+        config["media_store_path"] = self.media_store_path
+        config["max_image_pixels"] = 2000000
+
+        provider_config = {
+            "module": "synapse.media.storage_provider.FileStorageProviderBackend",
+            "store_local": True,
+            "store_synchronous": False,
+            "store_remote": True,
+            "config": {"directory": self.storage_path},
+        }
+        config["media_storage_providers"] = [provider_config]
+
+        hs = self.setup_test_homeserver(config=config, federation_http_client=client)
+
+        return hs
+
+    def prepare(self, reactor: MemoryReactor, clock: Clock, hs: HomeServer) -> None:
+        media_resource = hs.get_media_repository_resource()
+        self.download_resource = media_resource.children[b"download"]
+        self.thumbnail_resource = media_resource.children[b"thumbnail"]
+        self.store = hs.get_datastores().main
+        self.media_repo = hs.get_media_repository()
+
+        self.media_id = "example.com/12345"
+
+    def _req(
+        self, content_disposition: Optional[bytes], include_content_type: bool = True
+    ) -> FakeChannel:
+        channel = make_request(
+            self.reactor,
+            FakeSite(self.download_resource, self.reactor),
+            "GET",
+            self.media_id,
+            shorthand=False,
+            await_result=False,
+        )
+        self.pump()
+
+        # We've made one fetch, to example.com, using the media URL, and asking
+        # the other server not to do a remote fetch
+        self.assertEqual(len(self.fetches), 1)
+        self.assertEqual(self.fetches[0][1], "example.com")
+        self.assertEqual(
+            self.fetches[0][2], "/_matrix/media/r0/download/" + self.media_id
+        )
+        self.assertEqual(self.fetches[0][3], {"allow_remote": "false"})
+
+        headers = {
+            b"Content-Length": [b"%d" % (len(self.test_image.data))],
+        }
+
+        if include_content_type:
+            headers[b"Content-Type"] = [self.test_image.content_type]
+
+        if content_disposition:
+            headers[b"Content-Disposition"] = [content_disposition]
+
+        self.fetches[0][0].callback(
+            (self.test_image.data, (len(self.test_image.data), headers))
+        )
+
+        self.pump()
+        self.assertEqual(channel.code, 200)
+
+        return channel
+
+    def test_handle_missing_content_type(self) -> None:
+        channel = self._req(
+            b"inline; filename=out" + self.test_image.extension,
+            include_content_type=False,
+        )
+        headers = channel.headers
+        self.assertEqual(channel.code, 200)
+        self.assertEqual(
+            headers.getRawHeaders(b"Content-Type"), [b"application/octet-stream"]
+        )
+
+    def test_disposition_filename_ascii(self) -> None:
+        """
+        If the filename is filename=<ascii> then Synapse will decode it as an
+        ASCII string, and use filename= in the response.
+        """
+        channel = self._req(b"inline; filename=out" + self.test_image.extension)
+
+        headers = channel.headers
+        self.assertEqual(
+            headers.getRawHeaders(b"Content-Type"), [self.test_image.content_type]
+        )
+        self.assertEqual(
+            headers.getRawHeaders(b"Content-Disposition"),
+            [b"inline; filename=out" + self.test_image.extension],
+        )
+
+    def test_disposition_filenamestar_utf8escaped(self) -> None:
+        """
+        If the filename is filename*=utf-8''<utf8 escaped> then Synapse will
+        correctly decode it as the UTF-8 string, and use filename* in the
+        response.
+        """
+        filename = parse.quote("\u2603".encode()).encode("ascii")
+        channel = self._req(
+            b"inline; filename*=utf-8''" + filename + self.test_image.extension
+        )
+
+        headers = channel.headers
+        self.assertEqual(
+            headers.getRawHeaders(b"Content-Type"), [self.test_image.content_type]
+        )
+        self.assertEqual(
+            headers.getRawHeaders(b"Content-Disposition"),
+            [b"inline; filename*=utf-8''" + filename + self.test_image.extension],
+        )
+
+    def test_disposition_none(self) -> None:
+        """
+        If there is no filename, no Content-Disposition header is passed on in
+        the response.
+        """
+        channel = self._req(None)
+
+        headers = channel.headers
+        self.assertEqual(
+            headers.getRawHeaders(b"Content-Type"), [self.test_image.content_type]
+        )
+        self.assertEqual(headers.getRawHeaders(b"Content-Disposition"), None)
+
+    def test_thumbnail_crop(self) -> None:
+        """Test that a cropped remote thumbnail is available."""
+        self._test_thumbnail(
+            "crop",
+            self.test_image.expected_cropped,
+            expected_found=self.test_image.expected_found,
+            unable_to_thumbnail=self.test_image.unable_to_thumbnail,
+        )
+
+    def test_thumbnail_scale(self) -> None:
+        """Test that a scaled remote thumbnail is available."""
+        self._test_thumbnail(
+            "scale",
+            self.test_image.expected_scaled,
+            expected_found=self.test_image.expected_found,
+            unable_to_thumbnail=self.test_image.unable_to_thumbnail,
+        )
+
+    def test_invalid_type(self) -> None:
+        """An invalid thumbnail type is never available."""
+        self._test_thumbnail(
+            "invalid",
+            None,
+            expected_found=False,
+            unable_to_thumbnail=self.test_image.unable_to_thumbnail,
+        )
+
+    @unittest.override_config(
+        {"thumbnail_sizes": [{"width": 32, "height": 32, "method": "scale"}]}
+    )
+    def test_no_thumbnail_crop(self) -> None:
+        """
+        Override the config to generate only scaled thumbnails, but request a cropped one.
+        """
+        self._test_thumbnail(
+            "crop",
+            None,
+            expected_found=False,
+            unable_to_thumbnail=self.test_image.unable_to_thumbnail,
+        )
+
+    @unittest.override_config(
+        {"thumbnail_sizes": [{"width": 32, "height": 32, "method": "crop"}]}
+    )
+    def test_no_thumbnail_scale(self) -> None:
+        """
+        Override the config to generate only cropped thumbnails, but request a scaled one.
+        """
+        self._test_thumbnail(
+            "scale",
+            None,
+            expected_found=False,
+            unable_to_thumbnail=self.test_image.unable_to_thumbnail,
+        )
+
+    def test_thumbnail_repeated_thumbnail(self) -> None:
+        """Test that fetching the same thumbnail again works, and that after
+        deleting the on-disk thumbnail a further request regenerates it.
+        """
+        self._test_thumbnail(
+            "scale",
+            self.test_image.expected_scaled,
+            expected_found=self.test_image.expected_found,
+            unable_to_thumbnail=self.test_image.unable_to_thumbnail,
+        )
+
+        if not self.test_image.expected_found:
+            return
+
+        # Fetching again should work, without re-requesting the image from the
+        # remote.
+        params = "?width=32&height=32&method=scale"
+        channel = make_request(
+            self.reactor,
+            FakeSite(self.thumbnail_resource, self.reactor),
+            "GET",
+            self.media_id + params,
+            shorthand=False,
+            await_result=False,
+        )
+        self.pump()
+
+        self.assertEqual(channel.code, 200)
+        if self.test_image.expected_scaled:
+            self.assertEqual(
+                channel.result["body"],
+                self.test_image.expected_scaled,
+                channel.result["body"],
+            )
+
+        # Deleting the thumbnail on disk then re-requesting it should work as
+        # Synapse should regenerate missing thumbnails.
+        origin, media_id = self.media_id.split("/")
+        info = self.get_success(self.store.get_cached_remote_media(origin, media_id))
+        assert info is not None
+        file_id = info["filesystem_id"]
+
+        thumbnail_dir = self.media_repo.filepaths.remote_media_thumbnail_dir(
+            origin, file_id
+        )
+        shutil.rmtree(thumbnail_dir, ignore_errors=True)
+
+        channel = make_request(
+            self.reactor,
+            FakeSite(self.thumbnail_resource, self.reactor),
+            "GET",
+            self.media_id + params,
+            shorthand=False,
+            await_result=False,
+        )
+        self.pump()
+
+        self.assertEqual(channel.code, 200)
+        if self.test_image.expected_scaled:
+            self.assertEqual(
+                channel.result["body"],
+                self.test_image.expected_scaled,
+                channel.result["body"],
+            )
+
+    def _test_thumbnail(
+        self,
+        method: str,
+        expected_body: Optional[bytes],
+        expected_found: bool,
+        unable_to_thumbnail: bool = False,
+    ) -> None:
+        """Test the given thumbnailing method works as expected.
+
+        Args:
+            method: The thumbnailing method to use (crop, scale).
+            expected_body: The expected bytes from thumbnailing, or None if
+                test should just check for a valid image.
+            expected_found: True if the file should exist on the server, or False if
+                a 404/400 is expected.
+            unable_to_thumbnail: True if we expect the thumbnailing to fail (400), or
+                False if the thumbnailing should succeed or a normal 404 is expected.
+        """
+
+        params = "?width=32&height=32&method=" + method
+        channel = make_request(
+            self.reactor,
+            FakeSite(self.thumbnail_resource, self.reactor),
+            "GET",
+            self.media_id + params,
+            shorthand=False,
+            await_result=False,
+        )
+        self.pump()
+
+        headers = {
+            b"Content-Length": [b"%d" % (len(self.test_image.data))],
+            b"Content-Type": [self.test_image.content_type],
+        }
+        self.fetches[0][0].callback(
+            (self.test_image.data, (len(self.test_image.data), headers))
+        )
+        self.pump()
+
+        if expected_found:
+            self.assertEqual(channel.code, 200)
+
+            self.assertEqual(
+                channel.headers.getRawHeaders(b"Cross-Origin-Resource-Policy"),
+                [b"cross-origin"],
+            )
+
+            if expected_body is not None:
+                self.assertEqual(
+                    channel.result["body"], expected_body, channel.result["body"]
+                )
+            else:
+                # ensure that the result is at least some valid image
+                Image.open(BytesIO(channel.result["body"]))
+        elif unable_to_thumbnail:
+            # A 400 with a JSON body.
+            self.assertEqual(channel.code, 400)
+            self.assertEqual(
+                channel.json_body,
+                {
+                    "errcode": "M_UNKNOWN",
+                    "error": "Cannot find any thumbnails for the requested media ([b'example.com', b'12345']). This might mean the media is not a supported_media_format=(image/jpeg, image/jpg, image/webp, image/gif, image/png) or that thumbnailing failed for some other reason. (Dynamic thumbnails are disabled on this server.)",
+                },
+            )
+        else:
+            # A 404 with a JSON body.
+            self.assertEqual(channel.code, 404)
+            self.assertEqual(
+                channel.json_body,
+                {
+                    "errcode": "M_NOT_FOUND",
+                    "error": "Not found [b'example.com', b'12345']",
+                },
+            )
+
+    @parameterized.expand([("crop", 16), ("crop", 64), ("scale", 16), ("scale", 64)])
+    def test_same_quality(self, method: str, desired_size: int) -> None:
+        """Test that choosing between thumbnails with the same quality rating succeeds.
+
+        We are not particular about which thumbnail is chosen."""
+        self.assertIsNotNone(
+            self.thumbnail_resource._select_thumbnail(
+                desired_width=desired_size,
+                desired_height=desired_size,
+                desired_method=method,
+                desired_type=self.test_image.content_type,
+                # Provide two identical thumbnails which are guaranteed to have the same
+                # quality rating.
+                thumbnail_infos=[
+                    {
+                        "thumbnail_width": 32,
+                        "thumbnail_height": 32,
+                        "thumbnail_method": method,
+                        "thumbnail_type": self.test_image.content_type,
+                        "thumbnail_length": 256,
+                        "filesystem_id": f"thumbnail1{self.test_image.extension.decode()}",
+                    },
+                    {
+                        "thumbnail_width": 32,
+                        "thumbnail_height": 32,
+                        "thumbnail_method": method,
+                        "thumbnail_type": self.test_image.content_type,
+                        "thumbnail_length": 256,
+                        "filesystem_id": f"thumbnail2{self.test_image.extension.decode()}",
+                    },
+                ],
+                file_id=f"image{self.test_image.extension.decode()}",
+                url_cache=None,
+                server_name=None,
+            )
+        )
+
+    def test_x_robots_tag_header(self) -> None:
+        """
+        Tests that the `X-Robots-Tag` header is present, which informs web crawlers
+        to not index, archive, or follow links in media.
+        """
+        channel = self._req(b"inline; filename=out" + self.test_image.extension)
+
+        headers = channel.headers
+        self.assertEqual(
+            headers.getRawHeaders(b"X-Robots-Tag"),
+            [b"noindex, nofollow, noarchive, noimageindex"],
+        )
+
+    def test_cross_origin_resource_policy_header(self) -> None:
+        """
+        Test that the Cross-Origin-Resource-Policy header is set to "cross-origin"
+        allowing web clients to embed media from the downloads API.
+        """
+        channel = self._req(b"inline; filename=out" + self.test_image.extension)
+
+        headers = channel.headers
+
+        self.assertEqual(
+            headers.getRawHeaders(b"Cross-Origin-Resource-Policy"),
+            [b"cross-origin"],
+        )
+
+
+class TestSpamCheckerLegacy:
+    """A spam checker module that rejects all media that includes the bytes
+    `evil`.
+
+    Uses the legacy Spam-Checker API.
+    """
+
+    def __init__(self, config: Dict[str, Any], api: ModuleApi) -> None:
+        self.config = config
+        self.api = api
+
+    @staticmethod
+    def parse_config(config: Dict[str, Any]) -> Dict[str, Any]:
+        return config
+
+    async def check_event_for_spam(self, event: EventBase) -> Union[bool, str]:
+        return False  # allow all events
+
+    async def user_may_invite(
+        self,
+        inviter_userid: str,
+        invitee_userid: str,
+        room_id: str,
+    ) -> bool:
+        return True  # allow all invites
+
+    async def user_may_create_room(self, userid: str) -> bool:
+        return True  # allow all room creations
+
+    async def user_may_create_room_alias(
+        self, userid: str, room_alias: RoomAlias
+    ) -> bool:
+        return True  # allow all room aliases
+
+    async def user_may_publish_room(self, userid: str, room_id: str) -> bool:
+        return True  # allow publishing of all rooms
+
+    async def check_media_file_for_spam(
+        self, file_wrapper: ReadableFileWrapper, file_info: FileInfo
+    ) -> bool:
+        buf = BytesIO()
+        await file_wrapper.write_chunks_to(buf.write)
+
+        return b"evil" in buf.getvalue()
+
+
+class SpamCheckerTestCaseLegacy(unittest.HomeserverTestCase):
+    servlets = [
+        login.register_servlets,
+        admin.register_servlets,
+    ]
+
+    def prepare(self, reactor: MemoryReactor, clock: Clock, hs: HomeServer) -> None:
+        self.user = self.register_user("user", "pass")
+        self.tok = self.login("user", "pass")
+
+        # Allow for uploading and downloading to/from the media repo
+        self.media_repo = hs.get_media_repository_resource()
+        self.download_resource = self.media_repo.children[b"download"]
+        self.upload_resource = self.media_repo.children[b"upload"]
+
+        load_legacy_spam_checkers(hs)
+
+    def default_config(self) -> Dict[str, Any]:
+        config = default_config("test")
+
+        config.update(
+            {
+                "spam_checker": [
+                    {
+                        "module": TestSpamCheckerLegacy.__module__
+                        + ".TestSpamCheckerLegacy",
+                        "config": {},
+                    }
+                ]
+            }
+        )
+
+        return config
+
+    def test_upload_innocent(self) -> None:
+        """Attempt to upload some innocent data that should be allowed."""
+        self.helper.upload_media(
+            self.upload_resource, SMALL_PNG, tok=self.tok, expect_code=200
+        )
+
+    def test_upload_ban(self) -> None:
+        """Attempt to upload some data that includes bytes "evil", which should
+        get rejected by the spam checker.
+        """
+
+        data = b"Some evil data"
+
+        self.helper.upload_media(
+            self.upload_resource, data, tok=self.tok, expect_code=400
+        )
+
+
+EVIL_DATA = b"Some evil data"
+EVIL_DATA_EXPERIMENT = b"Some evil data to trigger the experimental tuple API"
+
+
+class SpamCheckerTestCase(unittest.HomeserverTestCase):
+    servlets = [
+        login.register_servlets,
+        admin.register_servlets,
+    ]
+
+    def prepare(self, reactor: MemoryReactor, clock: Clock, hs: HomeServer) -> None:
+        self.user = self.register_user("user", "pass")
+        self.tok = self.login("user", "pass")
+
+        # Allow for uploading and downloading to/from the media repo
+        self.media_repo = hs.get_media_repository_resource()
+        self.download_resource = self.media_repo.children[b"download"]
+        self.upload_resource = self.media_repo.children[b"upload"]
+
+        hs.get_module_api().register_spam_checker_callbacks(
+            check_media_file_for_spam=self.check_media_file_for_spam
+        )
+
+    async def check_media_file_for_spam(
+        self, file_wrapper: ReadableFileWrapper, file_info: FileInfo
+    ) -> Union[Codes, Literal["NOT_SPAM"], Tuple[Codes, JsonDict]]:
+        buf = BytesIO()
+        await file_wrapper.write_chunks_to(buf.write)
+
+        if buf.getvalue() == EVIL_DATA:
+            return Codes.FORBIDDEN
+        elif buf.getvalue() == EVIL_DATA_EXPERIMENT:
+            return (Codes.FORBIDDEN, {})
+        else:
+            return "NOT_SPAM"
+
+    def test_upload_innocent(self) -> None:
+        """Attempt to upload some innocent data that should be allowed."""
+        self.helper.upload_media(
+            self.upload_resource, SMALL_PNG, tok=self.tok, expect_code=200
+        )
+
+    def test_upload_ban(self) -> None:
+        """Attempt to upload data matching EVIL_DATA and EVIL_DATA_EXPERIMENT,
+        each of which should get rejected by the spam checker.
+        """
+
+        self.helper.upload_media(
+            self.upload_resource, EVIL_DATA, tok=self.tok, expect_code=400
+        )
+
+        self.helper.upload_media(
+            self.upload_resource,
+            EVIL_DATA_EXPERIMENT,
+            tok=self.tok,
+            expect_code=400,
+        )
diff --git a/tests/media/test_oembed.py b/tests/media/test_oembed.py
new file mode 100644
index 0000000000..c8bf8421da
--- /dev/null
+++ b/tests/media/test_oembed.py
@@ -0,0 +1,162 @@
+#  Copyright 2021 The Matrix.org Foundation C.I.C.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+
+import json
+
+from parameterized import parameterized
+
+from twisted.test.proto_helpers import MemoryReactor
+
+from synapse.media.oembed import OEmbedProvider, OEmbedResult
+from synapse.server import HomeServer
+from synapse.types import JsonDict
+from synapse.util import Clock
+
+from tests.unittest import HomeserverTestCase
+
+try:
+    import lxml
+except ImportError:
+    lxml = None
+
+
+class OEmbedTests(HomeserverTestCase):
+    if not lxml:
+        skip = "url preview feature requires lxml"
+
+    def prepare(self, reactor: MemoryReactor, clock: Clock, hs: HomeServer) -> None:
+        self.oembed = OEmbedProvider(hs)
+
+    def parse_response(self, response: JsonDict) -> OEmbedResult:
+        return self.oembed.parse_oembed_response(
+            "https://test", json.dumps(response).encode("utf-8")
+        )
+
+    def test_version(self) -> None:
+        """Accept versions that are similar to 1.0 as a string or int (or missing)."""
+        for version in ("1.0", 1.0, 1):
+            result = self.parse_response({"version": version})
+            # An empty Open Graph response is an error, ensure the URL is included.
+            self.assertIn("og:url", result.open_graph_result)
+
+        # A missing version should be treated as 1.0.
+        result = self.parse_response({"type": "link"})
+        self.assertIn("og:url", result.open_graph_result)
+
+        # Invalid versions should be rejected.
+        for version in ("2.0", "1", 1.1, 0, None, {}, []):
+            result = self.parse_response({"version": version, "type": "link"})
+            # An invalid version is rejected, so the Open Graph result is empty.
+            self.assertEqual({}, result.open_graph_result)
+
+    def test_cache_age(self) -> None:
+        """Ensure a cache-age is parsed properly."""
+        # Correct-ish cache ages are allowed.
+        for cache_age in ("1", 1.0, 1):
+            result = self.parse_response({"cache_age": cache_age})
+            self.assertEqual(result.cache_age, 1000)
+
+        # Invalid cache ages are ignored.
+        for cache_age in ("invalid", {}):
+            result = self.parse_response({"cache_age": cache_age})
+            self.assertIsNone(result.cache_age)
+
+        # Cache age is optional.
+        result = self.parse_response({})
+        self.assertIsNone(result.cache_age)
+
+    @parameterized.expand(
+        [
+            ("title", "title"),
+            ("provider_name", "site_name"),
+            ("thumbnail_url", "image"),
+        ],
+        name_func=lambda func, num, p: f"{func.__name__}_{p.args[0]}",
+    )
+    def test_property(self, oembed_property: str, open_graph_property: str) -> None:
+        """Test properties which must be strings."""
+        result = self.parse_response({oembed_property: "test"})
+        self.assertIn(f"og:{open_graph_property}", result.open_graph_result)
+        self.assertEqual(result.open_graph_result[f"og:{open_graph_property}"], "test")
+
+        result = self.parse_response({oembed_property: 1})
+        self.assertNotIn(f"og:{open_graph_property}", result.open_graph_result)
+
+    def test_author_name(self) -> None:
+        """Test the author_name property."""
+        result = self.parse_response({"author_name": "test"})
+        self.assertEqual(result.author_name, "test")
+
+        result = self.parse_response({"author_name": 1})
+        self.assertIsNone(result.author_name)
+
+    def test_rich(self) -> None:
+        """Test a type of rich."""
+        result = self.parse_response({"html": "test<img src='foo'>", "type": "rich"})
+        self.assertIn("og:description", result.open_graph_result)
+        self.assertIn("og:image", result.open_graph_result)
+        self.assertEqual(result.open_graph_result["og:description"], "test")
+        self.assertEqual(result.open_graph_result["og:image"], "foo")
+
+        result = self.parse_response({"type": "rich"})
+        self.assertNotIn("og:description", result.open_graph_result)
+
+        result = self.parse_response({"html": 1, "type": "rich"})
+        self.assertNotIn("og:description", result.open_graph_result)
+
+    def test_photo(self) -> None:
+        """Test a type of photo."""
+        result = self.parse_response({"url": "test", "type": "photo"})
+        self.assertIn("og:image", result.open_graph_result)
+        self.assertEqual(result.open_graph_result["og:image"], "test")
+
+        result = self.parse_response({"type": "photo"})
+        self.assertNotIn("og:image", result.open_graph_result)
+
+        result = self.parse_response({"url": 1, "type": "photo"})
+        self.assertNotIn("og:image", result.open_graph_result)
+
+    def test_video(self) -> None:
+        """Test a type of video."""
+        result = self.parse_response({"html": "test", "type": "video"})
+        self.assertIn("og:type", result.open_graph_result)
+        self.assertEqual(result.open_graph_result["og:type"], "video.other")
+        self.assertIn("og:description", result.open_graph_result)
+        self.assertEqual(result.open_graph_result["og:description"], "test")
+
+        result = self.parse_response({"type": "video"})
+        self.assertIn("og:type", result.open_graph_result)
+        self.assertEqual(result.open_graph_result["og:type"], "video.other")
+        self.assertNotIn("og:description", result.open_graph_result)
+
+        result = self.parse_response({"url": 1, "type": "video"})
+        self.assertIn("og:type", result.open_graph_result)
+        self.assertEqual(result.open_graph_result["og:type"], "video.other")
+        self.assertNotIn("og:description", result.open_graph_result)
+
+    def test_link(self) -> None:
+        """Test type of link."""
+        result = self.parse_response({"type": "link"})
+        self.assertIn("og:type", result.open_graph_result)
+        self.assertEqual(result.open_graph_result["og:type"], "website")
+
+    def test_title_html_entities(self) -> None:
+        """Test HTML entities in title"""
+        result = self.parse_response(
+            {"title": "Why JSON isn&#8217;t a Good Configuration Language"}
+        )
+        self.assertEqual(
+            result.open_graph_result["og:title"],
+            "Why JSON isn’t a Good Configuration Language",
+        )
diff --git a/tests/rest/admin/test_media.py b/tests/rest/admin/test_media.py
index f41319a5b6..6d04911d67 100644
--- a/tests/rest/admin/test_media.py
+++ b/tests/rest/admin/test_media.py
@@ -20,8 +20,8 @@ from twisted.test.proto_helpers import MemoryReactor
 
 import synapse.rest.admin
 from synapse.api.errors import Codes
+from synapse.media.filepath import MediaFilePaths
 from synapse.rest.client import login, profile, room
-from synapse.rest.media.v1.filepath import MediaFilePaths
 from synapse.server import HomeServer
 from synapse.util import Clock
 
diff --git a/tests/rest/admin/test_user.py b/tests/rest/admin/test_user.py
index f5b213219f..4b8f889a71 100644
--- a/tests/rest/admin/test_user.py
+++ b/tests/rest/admin/test_user.py
@@ -28,8 +28,8 @@ import synapse.rest.admin
 from synapse.api.constants import ApprovalNoticeMedium, LoginType, UserTypes
 from synapse.api.errors import Codes, HttpResponseException, ResourceLimitError
 from synapse.api.room_versions import RoomVersions
+from synapse.media.filepath import MediaFilePaths
 from synapse.rest.client import devices, login, logout, profile, register, room, sync
-from synapse.rest.media.v1.filepath import MediaFilePaths
 from synapse.server import HomeServer
 from synapse.types import JsonDict, UserID, create_requester
 from synapse.util import Clock
diff --git a/tests/rest/media/test_url_preview.py b/tests/rest/media/test_url_preview.py
new file mode 100644
index 0000000000..e91dc581c2
--- /dev/null
+++ b/tests/rest/media/test_url_preview.py
@@ -0,0 +1,1234 @@
+# Copyright 2018 New Vector Ltd
+# Copyright 2021 The Matrix.org Foundation C.I.C.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import base64
+import json
+import os
+import re
+from typing import Any, Dict, Optional, Sequence, Tuple, Type
+from urllib.parse import quote, urlencode
+
+from twisted.internet._resolver import HostResolution
+from twisted.internet.address import IPv4Address, IPv6Address
+from twisted.internet.error import DNSLookupError
+from twisted.internet.interfaces import IAddress, IResolutionReceiver
+from twisted.test.proto_helpers import AccumulatingProtocol, MemoryReactor
+
+from synapse.config.oembed import OEmbedEndpointConfig
+from synapse.rest.media.media_repository_resource import MediaRepositoryResource
+from synapse.rest.media.preview_url_resource import IMAGE_CACHE_EXPIRY_MS
+from synapse.server import HomeServer
+from synapse.types import JsonDict
+from synapse.util import Clock
+from synapse.util.stringutils import parse_and_validate_mxc_uri
+
+from tests import unittest
+from tests.server import FakeTransport
+from tests.test_utils import SMALL_PNG
+from tests.utils import MockClock
+
+try:
+    import lxml
+except ImportError:
+    lxml = None
+
+
+class URLPreviewTests(unittest.HomeserverTestCase):
+    if not lxml:
+        skip = "url preview feature requires lxml"
+
+    hijack_auth = True
+    user_id = "@test:user"
+    end_content = (
+        b"<html><head>"
+        b'<meta property="og:title" content="~matrix~" />'
+        b'<meta property="og:description" content="hi" />'
+        b"</head></html>"
+    )
+
+    def make_homeserver(self, reactor: MemoryReactor, clock: Clock) -> HomeServer:
+        config = self.default_config()
+        config["url_preview_enabled"] = True
+        config["max_spider_size"] = 9999999
+        config["url_preview_ip_range_blacklist"] = (
+            "192.168.1.1",
+            "1.0.0.0/8",
+            "3fff:ffff:ffff:ffff:ffff:ffff:ffff:ffff",
+            "2001:800::/21",
+        )
+        config["url_preview_ip_range_whitelist"] = ("1.1.1.1",)
+        config["url_preview_accept_language"] = [
+            "en-UK",
+            "en-US;q=0.9",
+            "fr;q=0.8",
+            "*;q=0.7",
+        ]
+
+        self.storage_path = self.mktemp()
+        self.media_store_path = self.mktemp()
+        os.mkdir(self.storage_path)
+        os.mkdir(self.media_store_path)
+        config["media_store_path"] = self.media_store_path
+
+        provider_config = {
+            "module": "synapse.media.storage_provider.FileStorageProviderBackend",
+            "store_local": True,
+            "store_synchronous": False,
+            "store_remote": True,
+            "config": {"directory": self.storage_path},
+        }
+
+        config["media_storage_providers"] = [provider_config]
+
+        hs = self.setup_test_homeserver(config=config)
+
+        # After the hs is created, modify the parsed oEmbed config (to avoid
+        # messing with files).
+        #
+        # Note that HTTP URLs are used to avoid having to deal with TLS in tests.
+        hs.config.oembed.oembed_patterns = [
+            OEmbedEndpointConfig(
+                api_endpoint="http://publish.twitter.com/oembed",
+                url_patterns=[
+                    re.compile(r"http://twitter\.com/.+/status/.+"),
+                ],
+                formats=None,
+            ),
+            OEmbedEndpointConfig(
+                api_endpoint="http://www.hulu.com/api/oembed.{format}",
+                url_patterns=[
+                    re.compile(r"http://www\.hulu\.com/watch/.+"),
+                ],
+                formats=["json"],
+            ),
+        ]
+
+        return hs
+
+    def prepare(self, reactor: MemoryReactor, clock: Clock, hs: HomeServer) -> None:
+        self.media_repo = hs.get_media_repository_resource()
+        self.preview_url = self.media_repo.children[b"preview_url"]
+
+        self.lookups: Dict[str, Any] = {}
+
+        class Resolver:
+            def resolveHostName(
+                _self,
+                resolutionReceiver: IResolutionReceiver,
+                hostName: str,
+                portNumber: int = 0,
+                addressTypes: Optional[Sequence[Type[IAddress]]] = None,
+                transportSemantics: str = "TCP",
+            ) -> IResolutionReceiver:
+                resolution = HostResolution(hostName)
+                resolutionReceiver.resolutionBegan(resolution)
+                if hostName not in self.lookups:
+                    raise DNSLookupError("OH NO")
+
+                for i in self.lookups[hostName]:
+                    resolutionReceiver.addressResolved(i[0]("TCP", i[1], portNumber))
+                resolutionReceiver.resolutionComplete()
+                return resolutionReceiver
+
+        self.reactor.nameResolver = Resolver()  # type: ignore[assignment]
+
+    def create_test_resource(self) -> MediaRepositoryResource:
+        return self.hs.get_media_repository_resource()
+
+    def _assert_small_png(self, json_body: JsonDict) -> None:
+        """Assert properties from the SMALL_PNG test image."""
+        self.assertTrue(json_body["og:image"].startswith("mxc://"))
+        self.assertEqual(json_body["og:image:height"], 1)
+        self.assertEqual(json_body["og:image:width"], 1)
+        self.assertEqual(json_body["og:image:type"], "image/png")
+        self.assertEqual(json_body["matrix:image:size"], 67)
+
+    def test_cache_returns_correct_type(self) -> None:
+        self.lookups["matrix.org"] = [(IPv4Address, "10.1.2.3")]
+
+        channel = self.make_request(
+            "GET",
+            "preview_url?url=http://matrix.org",
+            shorthand=False,
+            await_result=False,
+        )
+        self.pump()
+
+        client = self.reactor.tcpClients[0][2].buildProtocol(None)
+        server = AccumulatingProtocol()
+        server.makeConnection(FakeTransport(client, self.reactor))
+        client.makeConnection(FakeTransport(server, self.reactor))
+        client.dataReceived(
+            b"HTTP/1.0 200 OK\r\nContent-Length: %d\r\nContent-Type: text/html\r\n\r\n"
+            % (len(self.end_content),)
+            + self.end_content
+        )
+
+        self.pump()
+        self.assertEqual(channel.code, 200)
+        self.assertEqual(
+            channel.json_body, {"og:title": "~matrix~", "og:description": "hi"}
+        )
+
+        # Check the cache returns the correct response
+        channel = self.make_request(
+            "GET", "preview_url?url=http://matrix.org", shorthand=False
+        )
+
+        # Check the cache response has the same content
+        self.assertEqual(channel.code, 200)
+        self.assertEqual(
+            channel.json_body, {"og:title": "~matrix~", "og:description": "hi"}
+        )
+
+        # Clear the in-memory cache
+        self.assertIn("http://matrix.org", self.preview_url._cache)
+        self.preview_url._cache.pop("http://matrix.org")
+        self.assertNotIn("http://matrix.org", self.preview_url._cache)
+
+        # Check the database cache returns the correct response
+        channel = self.make_request(
+            "GET", "preview_url?url=http://matrix.org", shorthand=False
+        )
+
+        # Check the cache response has the same content
+        self.assertEqual(channel.code, 200)
+        self.assertEqual(
+            channel.json_body, {"og:title": "~matrix~", "og:description": "hi"}
+        )
+
+    def test_non_ascii_preview_httpequiv(self) -> None:
+        self.lookups["matrix.org"] = [(IPv4Address, "10.1.2.3")]
+
+        end_content = (
+            b"<html><head>"
+            b'<meta http-equiv="Content-Type" content="text/html; charset=windows-1251"/>'
+            b'<meta property="og:title" content="\xe4\xea\xe0" />'
+            b'<meta property="og:description" content="hi" />'
+            b"</head></html>"
+        )
+
+        channel = self.make_request(
+            "GET",
+            "preview_url?url=http://matrix.org",
+            shorthand=False,
+            await_result=False,
+        )
+        self.pump()
+
+        client = self.reactor.tcpClients[0][2].buildProtocol(None)
+        server = AccumulatingProtocol()
+        server.makeConnection(FakeTransport(client, self.reactor))
+        client.makeConnection(FakeTransport(server, self.reactor))
+        client.dataReceived(
+            (
+                b"HTTP/1.0 200 OK\r\nContent-Length: %d\r\n"
+                b'Content-Type: text/html; charset="utf8"\r\n\r\n'
+            )
+            % (len(end_content),)
+            + end_content
+        )
+
+        self.pump()
+        self.assertEqual(channel.code, 200)
+        self.assertEqual(channel.json_body["og:title"], "\u0434\u043a\u0430")
+
+    def test_video_rejected(self) -> None:
+        self.lookups["matrix.org"] = [(IPv4Address, "10.1.2.3")]
+
+        end_content = b"anything"
+
+        channel = self.make_request(
+            "GET",
+            "preview_url?url=http://matrix.org",
+            shorthand=False,
+            await_result=False,
+        )
+        self.pump()
+
+        client = self.reactor.tcpClients[0][2].buildProtocol(None)
+        server = AccumulatingProtocol()
+        server.makeConnection(FakeTransport(client, self.reactor))
+        client.makeConnection(FakeTransport(server, self.reactor))
+        client.dataReceived(
+            (
+                b"HTTP/1.0 200 OK\r\nContent-Length: %d\r\n"
+                b"Content-Type: video/mp4\r\n\r\n"
+            )
+            % (len(end_content))
+            + end_content
+        )
+
+        self.pump()
+        self.assertEqual(channel.code, 502)
+        self.assertEqual(
+            channel.json_body,
+            {
+                "errcode": "M_UNKNOWN",
+                "error": "Requested file's content type not allowed for this operation: video/mp4",
+            },
+        )
+
+    def test_audio_rejected(self) -> None:
+        self.lookups["matrix.org"] = [(IPv4Address, "10.1.2.3")]
+
+        end_content = b"anything"
+
+        channel = self.make_request(
+            "GET",
+            "preview_url?url=http://matrix.org",
+            shorthand=False,
+            await_result=False,
+        )
+        self.pump()
+
+        client = self.reactor.tcpClients[0][2].buildProtocol(None)
+        server = AccumulatingProtocol()
+        server.makeConnection(FakeTransport(client, self.reactor))
+        client.makeConnection(FakeTransport(server, self.reactor))
+        client.dataReceived(
+            (
+                b"HTTP/1.0 200 OK\r\nContent-Length: %d\r\n"
+                b"Content-Type: audio/aac\r\n\r\n"
+            )
+            % (len(end_content))
+            + end_content
+        )
+
+        self.pump()
+        self.assertEqual(channel.code, 502)
+        self.assertEqual(
+            channel.json_body,
+            {
+                "errcode": "M_UNKNOWN",
+                "error": "Requested file's content type not allowed for this operation: audio/aac",
+            },
+        )
+
+    def test_non_ascii_preview_content_type(self) -> None:
+        self.lookups["matrix.org"] = [(IPv4Address, "10.1.2.3")]
+
+        end_content = (
+            b"<html><head>"
+            b'<meta property="og:title" content="\xe4\xea\xe0" />'
+            b'<meta property="og:description" content="hi" />'
+            b"</head></html>"
+        )
+
+        channel = self.make_request(
+            "GET",
+            "preview_url?url=http://matrix.org",
+            shorthand=False,
+            await_result=False,
+        )
+        self.pump()
+
+        client = self.reactor.tcpClients[0][2].buildProtocol(None)
+        server = AccumulatingProtocol()
+        server.makeConnection(FakeTransport(client, self.reactor))
+        client.makeConnection(FakeTransport(server, self.reactor))
+        client.dataReceived(
+            (
+                b"HTTP/1.0 200 OK\r\nContent-Length: %d\r\n"
+                b'Content-Type: text/html; charset="windows-1251"\r\n\r\n'
+            )
+            % (len(end_content),)
+            + end_content
+        )
+
+        self.pump()
+        self.assertEqual(channel.code, 200)
+        self.assertEqual(channel.json_body["og:title"], "\u0434\u043a\u0430")
+
+    def test_overlong_title(self) -> None:
+        self.lookups["matrix.org"] = [(IPv4Address, "10.1.2.3")]
+
+        end_content = (
+            b"<html><head>"
+            b"<title>" + b"x" * 2000 + b"</title>"
+            b'<meta property="og:description" content="hi" />'
+            b"</head></html>"
+        )
+
+        channel = self.make_request(
+            "GET",
+            "preview_url?url=http://matrix.org",
+            shorthand=False,
+            await_result=False,
+        )
+        self.pump()
+
+        client = self.reactor.tcpClients[0][2].buildProtocol(None)
+        server = AccumulatingProtocol()
+        server.makeConnection(FakeTransport(client, self.reactor))
+        client.makeConnection(FakeTransport(server, self.reactor))
+        client.dataReceived(
+            (
+                b"HTTP/1.0 200 OK\r\nContent-Length: %d\r\n"
+                b'Content-Type: text/html; charset="windows-1251"\r\n\r\n'
+            )
+            % (len(end_content),)
+            + end_content
+        )
+
+        self.pump()
+        self.assertEqual(channel.code, 200)
+        res = channel.json_body
+        # We should only see the `og:description` field, as `title` is too long and should be stripped out
+        self.assertCountEqual(["og:description"], res.keys())
+
+    def test_ipaddr(self) -> None:
+        """
+        A hostname resolving to a non-blacklisted IP address can be previewed.
+        """
+        self.lookups["example.com"] = [(IPv4Address, "10.1.2.3")]
+
+        channel = self.make_request(
+            "GET",
+            "preview_url?url=http://example.com",
+            shorthand=False,
+            await_result=False,
+        )
+        self.pump()
+
+        client = self.reactor.tcpClients[0][2].buildProtocol(None)
+        server = AccumulatingProtocol()
+        server.makeConnection(FakeTransport(client, self.reactor))
+        client.makeConnection(FakeTransport(server, self.reactor))
+        client.dataReceived(
+            b"HTTP/1.0 200 OK\r\nContent-Length: %d\r\nContent-Type: text/html\r\n\r\n"
+            % (len(self.end_content),)
+            + self.end_content
+        )
+
+        self.pump()
+        self.assertEqual(channel.code, 200)
+        self.assertEqual(
+            channel.json_body, {"og:title": "~matrix~", "og:description": "hi"}
+        )
+
+    def test_blacklisted_ip_specific(self) -> None:
+        """
+        Blacklisted IP addresses, found via DNS, are not spidered.
+        """
+        self.lookups["example.com"] = [(IPv4Address, "192.168.1.1")]
+
+        channel = self.make_request(
+            "GET", "preview_url?url=http://example.com", shorthand=False
+        )
+
+        # No requests made.
+        self.assertEqual(len(self.reactor.tcpClients), 0)
+        self.assertEqual(channel.code, 502)
+        self.assertEqual(
+            channel.json_body,
+            {
+                "errcode": "M_UNKNOWN",
+                "error": "DNS resolution failure during URL preview generation",
+            },
+        )
+
+    def test_blacklisted_ip_range(self) -> None:
+        """
+        Blacklisted IP ranges, IPs found over DNS, are not spidered.
+        """
+        self.lookups["example.com"] = [(IPv4Address, "1.1.1.2")]
+
+        channel = self.make_request(
+            "GET", "preview_url?url=http://example.com", shorthand=False
+        )
+
+        self.assertEqual(channel.code, 502)
+        self.assertEqual(
+            channel.json_body,
+            {
+                "errcode": "M_UNKNOWN",
+                "error": "DNS resolution failure during URL preview generation",
+            },
+        )
+
+    def test_blacklisted_ip_specific_direct(self) -> None:
+        """
+        Blacklisted IP addresses, accessed directly, are not spidered.
+        """
+        channel = self.make_request(
+            "GET", "preview_url?url=http://192.168.1.1", shorthand=False
+        )
+
+        # No requests made.
+        self.assertEqual(len(self.reactor.tcpClients), 0)
+        self.assertEqual(
+            channel.json_body,
+            {
+                "errcode": "M_UNKNOWN",
+                "error": "IP address blocked by IP blacklist entry",
+            },
+        )
+        self.assertEqual(channel.code, 403)
+
+    def test_blacklisted_ip_range_direct(self) -> None:
+        """
+        Blacklisted IP ranges, accessed directly, are not spidered.
+        """
+        channel = self.make_request(
+            "GET", "preview_url?url=http://1.1.1.2", shorthand=False
+        )
+
+        self.assertEqual(channel.code, 403)
+        self.assertEqual(
+            channel.json_body,
+            {
+                "errcode": "M_UNKNOWN",
+                "error": "IP address blocked by IP blacklist entry",
+            },
+        )
+
+    def test_blacklisted_ip_range_whitelisted_ip(self) -> None:
+        """
+        Blacklisted but then subsequently whitelisted IP addresses can be
+        spidered.
+        """
+        self.lookups["example.com"] = [(IPv4Address, "1.1.1.1")]
+
+        channel = self.make_request(
+            "GET",
+            "preview_url?url=http://example.com",
+            shorthand=False,
+            await_result=False,
+        )
+        self.pump()
+
+        client = self.reactor.tcpClients[0][2].buildProtocol(None)
+
+        server = AccumulatingProtocol()
+        server.makeConnection(FakeTransport(client, self.reactor))
+        client.makeConnection(FakeTransport(server, self.reactor))
+
+        client.dataReceived(
+            b"HTTP/1.0 200 OK\r\nContent-Length: %d\r\nContent-Type: text/html\r\n\r\n"
+            % (len(self.end_content),)
+            + self.end_content
+        )
+
+        self.pump()
+        self.assertEqual(channel.code, 200)
+        self.assertEqual(
+            channel.json_body, {"og:title": "~matrix~", "og:description": "hi"}
+        )
+
+    def test_blacklisted_ip_with_external_ip(self) -> None:
+        """
+        If a hostname resolves to a blacklisted IP, even if there's a
+        non-blacklisted one, it will be rejected.
+        """
+        # Hardcode the URL resolving to the IP we want.
+        self.lookups["example.com"] = [
+            (IPv4Address, "1.1.1.2"),
+            (IPv4Address, "10.1.2.3"),
+        ]
+
+        channel = self.make_request(
+            "GET", "preview_url?url=http://example.com", shorthand=False
+        )
+        self.assertEqual(channel.code, 502)
+        self.assertEqual(
+            channel.json_body,
+            {
+                "errcode": "M_UNKNOWN",
+                "error": "DNS resolution failure during URL preview generation",
+            },
+        )
+
+    def test_blacklisted_ipv6_specific(self) -> None:
+        """
+        Blacklisted IPv6 addresses, found via DNS, are not spidered.
+        """
+        self.lookups["example.com"] = [
+            (IPv6Address, "3fff:ffff:ffff:ffff:ffff:ffff:ffff:ffff")
+        ]
+
+        channel = self.make_request(
+            "GET", "preview_url?url=http://example.com", shorthand=False
+        )
+
+        # No requests made.
+        self.assertEqual(len(self.reactor.tcpClients), 0)
+        self.assertEqual(channel.code, 502)
+        self.assertEqual(
+            channel.json_body,
+            {
+                "errcode": "M_UNKNOWN",
+                "error": "DNS resolution failure during URL preview generation",
+            },
+        )
+
+    def test_blacklisted_ipv6_range(self) -> None:
+        """
+        Blacklisted IPv6 ranges, IPs found over DNS, are not spidered.
+        """
+        self.lookups["example.com"] = [(IPv6Address, "2001:800::1")]
+
+        channel = self.make_request(
+            "GET", "preview_url?url=http://example.com", shorthand=False
+        )
+
+        self.assertEqual(channel.code, 502)
+        self.assertEqual(
+            channel.json_body,
+            {
+                "errcode": "M_UNKNOWN",
+                "error": "DNS resolution failure during URL preview generation",
+            },
+        )
+
+    def test_OPTIONS(self) -> None:
+        """
+        An OPTIONS request returns a 200 response with an empty JSON body.
+        """
+        channel = self.make_request(
+            "OPTIONS", "preview_url?url=http://example.com", shorthand=False
+        )
+        self.assertEqual(channel.code, 200)
+        self.assertEqual(channel.json_body, {})
+
+    def test_accept_language_config_option(self) -> None:
+        """
+        Accept-Language header is sent to the remote server
+        """
+        self.lookups["example.com"] = [(IPv4Address, "10.1.2.3")]
+
+        # Build and make a request to the server
+        channel = self.make_request(
+            "GET",
+            "preview_url?url=http://example.com",
+            shorthand=False,
+            await_result=False,
+        )
+        self.pump()
+
+        # Extract Synapse's tcp client
+        client = self.reactor.tcpClients[0][2].buildProtocol(None)
+
+        # Build a fake remote server to reply with
+        server = AccumulatingProtocol()
+
+        # Connect the two together
+        server.makeConnection(FakeTransport(client, self.reactor))
+        client.makeConnection(FakeTransport(server, self.reactor))
+
+        # Tell Synapse that it has received some data from the remote server
+        client.dataReceived(
+            b"HTTP/1.0 200 OK\r\nContent-Length: %d\r\nContent-Type: text/html\r\n\r\n"
+            % (len(self.end_content),)
+            + self.end_content
+        )
+
+        # Move the reactor along until we get a response on our original channel
+        self.pump()
+        self.assertEqual(channel.code, 200)
+        self.assertEqual(
+            channel.json_body, {"og:title": "~matrix~", "og:description": "hi"}
+        )
+
+        # Check that the server received the Accept-Language header as part
+        # of the request from Synapse
+        self.assertIn(
+            (
+                b"Accept-Language: en-UK\r\n"
+                b"Accept-Language: en-US;q=0.9\r\n"
+                b"Accept-Language: fr;q=0.8\r\n"
+                b"Accept-Language: *;q=0.7"
+            ),
+            server.data,
+        )
+
+    def test_nonexistent_image(self) -> None:
+        """If the preview image doesn't exist, ensure some data is returned."""
+        self.lookups["matrix.org"] = [(IPv4Address, "10.1.2.3")]
+
+        result = (
+            b"""<html><body><img src="http://cdn.matrix.org/foo.jpg"></body></html>"""
+        )
+
+        channel = self.make_request(
+            "GET",
+            "preview_url?url=http://matrix.org",
+            shorthand=False,
+            await_result=False,
+        )
+        self.pump()
+
+        client = self.reactor.tcpClients[0][2].buildProtocol(None)
+        server = AccumulatingProtocol()
+        server.makeConnection(FakeTransport(client, self.reactor))
+        client.makeConnection(FakeTransport(server, self.reactor))
+        client.dataReceived(
+            (
+                b"HTTP/1.0 200 OK\r\nContent-Length: %d\r\n"
+                b'Content-Type: text/html; charset="utf8"\r\n\r\n'
+            )
+            % (len(result),)
+            + result
+        )
+
+        self.pump()
+        self.assertEqual(channel.code, 200)
+
+        # The image should not be in the result.
+        self.assertNotIn("og:image", channel.json_body)
+
+    def test_oembed_failure(self) -> None:
+        """If the autodiscovered oEmbed URL fails, ensure some data is returned."""
+        self.lookups["matrix.org"] = [(IPv4Address, "10.1.2.3")]
+
+        result = b"""
+        <title>oEmbed Autodiscovery Fail</title>
+        <link rel="alternate" type="application/json+oembed"
+            href="http://example.com/oembed?url=http%3A%2F%2Fmatrix.org&format=json"
+            title="matrixdotorg" />
+        """
+
+        channel = self.make_request(
+            "GET",
+            "preview_url?url=http://matrix.org",
+            shorthand=False,
+            await_result=False,
+        )
+        self.pump()
+
+        client = self.reactor.tcpClients[0][2].buildProtocol(None)
+        server = AccumulatingProtocol()
+        server.makeConnection(FakeTransport(client, self.reactor))
+        client.makeConnection(FakeTransport(server, self.reactor))
+        client.dataReceived(
+            (
+                b"HTTP/1.0 200 OK\r\nContent-Length: %d\r\n"
+                b'Content-Type: text/html; charset="utf8"\r\n\r\n'
+            )
+            % (len(result),)
+            + result
+        )
+
+        self.pump()
+        self.assertEqual(channel.code, 200)
+
+        # The title from the HTML should still be in the result.
+        self.assertEqual(channel.json_body["og:title"], "oEmbed Autodiscovery Fail")
+
+    def test_data_url(self) -> None:
+        """
+        Requesting to preview a data URL is not supported.
+        """
+        self.lookups["matrix.org"] = [(IPv4Address, "10.1.2.3")]
+
+        data = base64.b64encode(SMALL_PNG).decode()
+
+        query_params = urlencode(
+            {
+                "url": f'<html><head><img src="data:image/png;base64,{data}" /></head></html>'
+            }
+        )
+
+        channel = self.make_request(
+            "GET",
+            f"preview_url?{query_params}",
+            shorthand=False,
+        )
+        self.pump()
+
+        self.assertEqual(channel.code, 500)
+
+    def test_inline_data_url(self) -> None:
+        """
+        An inline image (as a data URL) should be parsed properly.
+        """
+        self.lookups["matrix.org"] = [(IPv4Address, "10.1.2.3")]
+
+        data = base64.b64encode(SMALL_PNG)
+
+        end_content = (
+            b"<html><head>" b'<img src="data:image/png;base64,%s" />' b"</head></html>"
+        ) % (data,)
+
+        channel = self.make_request(
+            "GET",
+            "preview_url?url=http://matrix.org",
+            shorthand=False,
+            await_result=False,
+        )
+        self.pump()
+
+        client = self.reactor.tcpClients[0][2].buildProtocol(None)
+        server = AccumulatingProtocol()
+        server.makeConnection(FakeTransport(client, self.reactor))
+        client.makeConnection(FakeTransport(server, self.reactor))
+        client.dataReceived(
+            (
+                b"HTTP/1.0 200 OK\r\nContent-Length: %d\r\n"
+                b'Content-Type: text/html; charset="utf8"\r\n\r\n'
+            )
+            % (len(end_content),)
+            + end_content
+        )
+
+        self.pump()
+        self.assertEqual(channel.code, 200)
+        self._assert_small_png(channel.json_body)
+
+    def test_oembed_photo(self) -> None:
+        """Test an oEmbed endpoint which returns a 'photo' type which redirects the preview to a new URL."""
+        self.lookups["publish.twitter.com"] = [(IPv4Address, "10.1.2.3")]
+        self.lookups["cdn.twitter.com"] = [(IPv4Address, "10.1.2.3")]
+
+        result = {
+            "version": "1.0",
+            "type": "photo",
+            "url": "http://cdn.twitter.com/matrixdotorg",
+        }
+        oembed_content = json.dumps(result).encode("utf-8")
+
+        channel = self.make_request(
+            "GET",
+            "preview_url?url=http://twitter.com/matrixdotorg/status/12345",
+            shorthand=False,
+            await_result=False,
+        )
+        self.pump()
+
+        client = self.reactor.tcpClients[0][2].buildProtocol(None)
+        server = AccumulatingProtocol()
+        server.makeConnection(FakeTransport(client, self.reactor))
+        client.makeConnection(FakeTransport(server, self.reactor))
+        client.dataReceived(
+            (
+                b"HTTP/1.0 200 OK\r\nContent-Length: %d\r\n"
+                b'Content-Type: application/json; charset="utf8"\r\n\r\n'
+            )
+            % (len(oembed_content),)
+            + oembed_content
+        )
+
+        self.pump()
+
+        # Ensure a second request is made to the photo URL.
+        client = self.reactor.tcpClients[1][2].buildProtocol(None)
+        server = AccumulatingProtocol()
+        server.makeConnection(FakeTransport(client, self.reactor))
+        client.makeConnection(FakeTransport(server, self.reactor))
+        client.dataReceived(
+            (
+                b"HTTP/1.0 200 OK\r\nContent-Length: %d\r\n"
+                b"Content-Type: image/png\r\n\r\n"
+            )
+            % (len(SMALL_PNG),)
+            + SMALL_PNG
+        )
+
+        self.pump()
+
+        # Ensure the URL is what was requested.
+        self.assertIn(b"/matrixdotorg", server.data)
+
+        self.assertEqual(channel.code, 200)
+        body = channel.json_body
+        self.assertEqual(body["og:url"], "http://twitter.com/matrixdotorg/status/12345")
+        self._assert_small_png(body)
+
+    def test_oembed_rich(self) -> None:
+        """Test an oEmbed endpoint which returns HTML content via the 'rich' type."""
+        self.lookups["publish.twitter.com"] = [(IPv4Address, "10.1.2.3")]
+
+        result = {
+            "version": "1.0",
+            "type": "rich",
+            # Note that this provides the author, not the title.
+            "author_name": "Alice",
+            "html": "<div>Content Preview</div>",
+        }
+        end_content = json.dumps(result).encode("utf-8")
+
+        channel = self.make_request(
+            "GET",
+            "preview_url?url=http://twitter.com/matrixdotorg/status/12345",
+            shorthand=False,
+            await_result=False,
+        )
+        self.pump()
+
+        client = self.reactor.tcpClients[0][2].buildProtocol(None)
+        server = AccumulatingProtocol()
+        server.makeConnection(FakeTransport(client, self.reactor))
+        client.makeConnection(FakeTransport(server, self.reactor))
+        client.dataReceived(
+            (
+                b"HTTP/1.0 200 OK\r\nContent-Length: %d\r\n"
+                b'Content-Type: application/json; charset="utf8"\r\n\r\n'
+            )
+            % (len(end_content),)
+            + end_content
+        )
+
+        self.pump()
+        self.assertEqual(channel.code, 200)
+        body = channel.json_body
+        self.assertEqual(
+            body,
+            {
+                "og:url": "http://twitter.com/matrixdotorg/status/12345",
+                "og:title": "Alice",
+                "og:description": "Content Preview",
+            },
+        )
+
+    def test_oembed_format(self) -> None:
+        """Test an oEmbed endpoint which requires the format in the URL."""
+        self.lookups["www.hulu.com"] = [(IPv4Address, "10.1.2.3")]
+
+        result = {
+            "version": "1.0",
+            "type": "rich",
+            "html": "<div>Content Preview</div>",
+        }
+        end_content = json.dumps(result).encode("utf-8")
+
+        channel = self.make_request(
+            "GET",
+            "preview_url?url=http://www.hulu.com/watch/12345",
+            shorthand=False,
+            await_result=False,
+        )
+        self.pump()
+
+        client = self.reactor.tcpClients[0][2].buildProtocol(None)
+        server = AccumulatingProtocol()
+        server.makeConnection(FakeTransport(client, self.reactor))
+        client.makeConnection(FakeTransport(server, self.reactor))
+        client.dataReceived(
+            (
+                b"HTTP/1.0 200 OK\r\nContent-Length: %d\r\n"
+                b'Content-Type: application/json; charset="utf8"\r\n\r\n'
+            )
+            % (len(end_content),)
+            + end_content
+        )
+
+        self.pump()
+
+        # The {format} should have been turned into json.
+        self.assertIn(b"/api/oembed.json", server.data)
+        # A URL parameter of format=json should be provided.
+        self.assertIn(b"format=json", server.data)
+
+        self.assertEqual(channel.code, 200)
+        body = channel.json_body
+        self.assertEqual(
+            body,
+            {
+                "og:url": "http://www.hulu.com/watch/12345",
+                "og:description": "Content Preview",
+            },
+        )
+
+    def test_oembed_autodiscovery(self) -> None:
+        """
+        Autodiscovery works by finding the link in the HTML response and then requesting an oEmbed URL.
+        1. Request a preview of a URL which is not known to the oEmbed code.
+        2. It returns HTML including a link to an oEmbed preview.
+        3. The oEmbed preview is requested and returns a URL for an image.
+        4. The image is requested for thumbnailing.
+        """
+        # This is a little cheesy in that we use the www subdomain (which isn't in the
+        # list of oEmbed patterns) to get a "raw" HTML response.
+        self.lookups["www.twitter.com"] = [(IPv4Address, "10.1.2.3")]
+        self.lookups["publish.twitter.com"] = [(IPv4Address, "10.1.2.3")]
+        self.lookups["cdn.twitter.com"] = [(IPv4Address, "10.1.2.3")]
+
+        result = b"""
+        <link rel="alternate" type="application/json+oembed"
+            href="http://publish.twitter.com/oembed?url=http%3A%2F%2Fcdn.twitter.com%2Fmatrixdotorg%2Fstatus%2F12345&format=json"
+            title="matrixdotorg" />
+        """
+
+        channel = self.make_request(
+            "GET",
+            "preview_url?url=http://www.twitter.com/matrixdotorg/status/12345",
+            shorthand=False,
+            await_result=False,
+        )
+        self.pump()
+
+        client = self.reactor.tcpClients[0][2].buildProtocol(None)
+        server = AccumulatingProtocol()
+        server.makeConnection(FakeTransport(client, self.reactor))
+        client.makeConnection(FakeTransport(server, self.reactor))
+        client.dataReceived(
+            (
+                b"HTTP/1.0 200 OK\r\nContent-Length: %d\r\n"
+                b'Content-Type: text/html; charset="utf8"\r\n\r\n'
+            )
+            % (len(result),)
+            + result
+        )
+
+        self.pump()
+
+        # The oEmbed response.
+        result2 = {
+            "version": "1.0",
+            "type": "photo",
+            "url": "http://cdn.twitter.com/matrixdotorg",
+        }
+        oembed_content = json.dumps(result2).encode("utf-8")
+
+        # Ensure a second request is made to the oEmbed URL.
+        client = self.reactor.tcpClients[1][2].buildProtocol(None)
+        server = AccumulatingProtocol()
+        server.makeConnection(FakeTransport(client, self.reactor))
+        client.makeConnection(FakeTransport(server, self.reactor))
+        client.dataReceived(
+            (
+                b"HTTP/1.0 200 OK\r\nContent-Length: %d\r\n"
+                b'Content-Type: application/json; charset="utf8"\r\n\r\n'
+            )
+            % (len(oembed_content),)
+            + oembed_content
+        )
+
+        self.pump()
+
+        # Ensure the URL is what was requested.
+        self.assertIn(b"/oembed?", server.data)
+
+        # Ensure a third request is made to the photo URL.
+        client = self.reactor.tcpClients[2][2].buildProtocol(None)
+        server = AccumulatingProtocol()
+        server.makeConnection(FakeTransport(client, self.reactor))
+        client.makeConnection(FakeTransport(server, self.reactor))
+        client.dataReceived(
+            (
+                b"HTTP/1.0 200 OK\r\nContent-Length: %d\r\n"
+                b"Content-Type: image/png\r\n\r\n"
+            )
+            % (len(SMALL_PNG),)
+            + SMALL_PNG
+        )
+
+        self.pump()
+
+        # Ensure the URL is what was requested.
+        self.assertIn(b"/matrixdotorg", server.data)
+
+        self.assertEqual(channel.code, 200)
+        body = channel.json_body
+        self.assertEqual(
+            body["og:url"], "http://www.twitter.com/matrixdotorg/status/12345"
+        )
+        self._assert_small_png(body)
+
+    def _download_image(self) -> Tuple[str, str]:
+        """Downloads an image into the URL cache.
+        Returns:
+            A (host, media_id) tuple representing the MXC URI of the image.
+        """
+        self.lookups["cdn.twitter.com"] = [(IPv4Address, "10.1.2.3")]
+
+        channel = self.make_request(
+            "GET",
+            "preview_url?url=http://cdn.twitter.com/matrixdotorg",
+            shorthand=False,
+            await_result=False,
+        )
+        self.pump()
+
+        client = self.reactor.tcpClients[0][2].buildProtocol(None)
+        server = AccumulatingProtocol()
+        server.makeConnection(FakeTransport(client, self.reactor))
+        client.makeConnection(FakeTransport(server, self.reactor))
+        client.dataReceived(
+            b"HTTP/1.0 200 OK\r\nContent-Length: %d\r\nContent-Type: image/png\r\n\r\n"
+            % (len(SMALL_PNG),)
+            + SMALL_PNG
+        )
+
+        self.pump()
+        self.assertEqual(channel.code, 200)
+        body = channel.json_body
+        mxc_uri = body["og:image"]
+        host, _port, media_id = parse_and_validate_mxc_uri(mxc_uri)
+        self.assertIsNone(_port)
+        return host, media_id
+
+    def test_storage_providers_exclude_files(self) -> None:
+        """Test that files are not stored in or fetched from storage providers."""
+        host, media_id = self._download_image()
+
+        rel_file_path = self.preview_url.filepaths.url_cache_filepath_rel(media_id)
+        media_store_path = os.path.join(self.media_store_path, rel_file_path)
+        storage_provider_path = os.path.join(self.storage_path, rel_file_path)
+
+        # Check storage
+        self.assertTrue(os.path.isfile(media_store_path))
+        self.assertFalse(
+            os.path.isfile(storage_provider_path),
+            "URL cache file was unexpectedly stored in a storage provider",
+        )
+
+        # Check fetching
+        channel = self.make_request(
+            "GET",
+            f"download/{host}/{media_id}",
+            shorthand=False,
+            await_result=False,
+        )
+        self.pump()
+        self.assertEqual(channel.code, 200)
+
+        # Move cached file into the storage provider
+        os.makedirs(os.path.dirname(storage_provider_path), exist_ok=True)
+        os.rename(media_store_path, storage_provider_path)
+
+        channel = self.make_request(
+            "GET",
+            f"download/{host}/{media_id}",
+            shorthand=False,
+            await_result=False,
+        )
+        self.pump()
+        self.assertEqual(
+            channel.code,
+            404,
+            "URL cache file was unexpectedly retrieved from a storage provider",
+        )
+
+    def test_storage_providers_exclude_thumbnails(self) -> None:
+        """Test that thumbnails are not stored in or fetched from storage providers."""
+        host, media_id = self._download_image()
+
+        rel_thumbnail_path = (
+            self.preview_url.filepaths.url_cache_thumbnail_directory_rel(media_id)
+        )
+        media_store_thumbnail_path = os.path.join(
+            self.media_store_path, rel_thumbnail_path
+        )
+        storage_provider_thumbnail_path = os.path.join(
+            self.storage_path, rel_thumbnail_path
+        )
+
+        # Check storage
+        self.assertTrue(os.path.isdir(media_store_thumbnail_path))
+        self.assertFalse(
+            os.path.isdir(storage_provider_thumbnail_path),
+            "URL cache thumbnails were unexpectedly stored in a storage provider",
+        )
+
+        # Check fetching
+        channel = self.make_request(
+            "GET",
+            f"thumbnail/{host}/{media_id}?width=32&height=32&method=scale",
+            shorthand=False,
+            await_result=False,
+        )
+        self.pump()
+        self.assertEqual(channel.code, 200)
+
+        # Remove the original, otherwise thumbnails will regenerate
+        rel_file_path = self.preview_url.filepaths.url_cache_filepath_rel(media_id)
+        media_store_path = os.path.join(self.media_store_path, rel_file_path)
+        os.remove(media_store_path)
+
+        # Move cached thumbnails into the storage provider
+        os.makedirs(os.path.dirname(storage_provider_thumbnail_path), exist_ok=True)
+        os.rename(media_store_thumbnail_path, storage_provider_thumbnail_path)
+
+        channel = self.make_request(
+            "GET",
+            f"thumbnail/{host}/{media_id}?width=32&height=32&method=scale",
+            shorthand=False,
+            await_result=False,
+        )
+        self.pump()
+        self.assertEqual(
+            channel.code,
+            404,
+            "URL cache thumbnail was unexpectedly retrieved from a storage provider",
+        )
+
+    def test_cache_expiry(self) -> None:
+        """Test that URL cache files and thumbnails are cleaned up properly on expiry."""
+        self.preview_url.clock = MockClock()
+
+        _host, media_id = self._download_image()
+
+        file_path = self.preview_url.filepaths.url_cache_filepath(media_id)
+        file_dirs = self.preview_url.filepaths.url_cache_filepath_dirs_to_delete(
+            media_id
+        )
+        thumbnail_dir = self.preview_url.filepaths.url_cache_thumbnail_directory(
+            media_id
+        )
+        thumbnail_dirs = self.preview_url.filepaths.url_cache_thumbnail_dirs_to_delete(
+            media_id
+        )
+
+        self.assertTrue(os.path.isfile(file_path))
+        self.assertTrue(os.path.isdir(thumbnail_dir))
+
+        self.preview_url.clock.advance_time_msec(IMAGE_CACHE_EXPIRY_MS + 1)
+        self.get_success(self.preview_url._expire_url_cache_data())
+
+        for path in [file_path] + file_dirs + [thumbnail_dir] + thumbnail_dirs:
+            self.assertFalse(
+                os.path.exists(path),
+                f"{os.path.relpath(path, self.media_store_path)} was not deleted",
+            )
+
+    @unittest.override_config({"url_preview_url_blacklist": [{"port": "*"}]})
+    def test_blacklist_port(self) -> None:
+        """Tests that blacklisting URLs with a port makes previewing such URLs
+        fail with a 403 error and doesn't impact other previews.
+        """
+        self.lookups["matrix.org"] = [(IPv4Address, "10.1.2.3")]
+
+        bad_url = quote("http://matrix.org:8888/foo")
+        good_url = quote("http://matrix.org/foo")
+
+        channel = self.make_request(
+            "GET",
+            "preview_url?url=" + bad_url,
+            shorthand=False,
+            await_result=False,
+        )
+        self.pump()
+        self.assertEqual(channel.code, 403, channel.result)
+
+        channel = self.make_request(
+            "GET",
+            "preview_url?url=" + good_url,
+            shorthand=False,
+            await_result=False,
+        )
+        self.pump()
+
+        client = self.reactor.tcpClients[0][2].buildProtocol(None)
+        server = AccumulatingProtocol()
+        server.makeConnection(FakeTransport(client, self.reactor))
+        client.makeConnection(FakeTransport(server, self.reactor))
+        client.dataReceived(
+            b"HTTP/1.0 200 OK\r\nContent-Length: %d\r\nContent-Type: text/html\r\n\r\n"
+            % (len(self.end_content),)
+            + self.end_content
+        )
+
+        self.pump()
+        self.assertEqual(channel.code, 200)
diff --git a/tests/rest/media/v1/__init__.py b/tests/rest/media/v1/__init__.py
deleted file mode 100644
index b1ee10cfcc..0000000000
--- a/tests/rest/media/v1/__init__.py
+++ /dev/null
@@ -1,13 +0,0 @@
-# Copyright 2018 New Vector Ltd
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
diff --git a/tests/rest/media/v1/test_base.py b/tests/rest/media/v1/test_base.py
deleted file mode 100644
index c73179151a..0000000000
--- a/tests/rest/media/v1/test_base.py
+++ /dev/null
@@ -1,38 +0,0 @@
-# Copyright 2019 New Vector Ltd
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from synapse.rest.media.v1._base import get_filename_from_headers
-
-from tests import unittest
-
-
-class GetFileNameFromHeadersTests(unittest.TestCase):
-    # input -> expected result
-    TEST_CASES = {
-        b"inline; filename=abc.txt": "abc.txt",
-        b'inline; filename="azerty"': "azerty",
-        b'inline; filename="aze%20rty"': "aze%20rty",
-        b'inline; filename="aze"rty"': 'aze"rty',
-        b'inline; filename="azer;ty"': "azer;ty",
-        b"inline; filename*=utf-8''foo%C2%A3bar": "foo£bar",
-    }
-
-    def tests(self) -> None:
-        for hdr, expected in self.TEST_CASES.items():
-            res = get_filename_from_headers({b"Content-Disposition": [hdr]})
-            self.assertEqual(
-                res,
-                expected,
-                f"expected output for {hdr!r} to be {expected} but was {res}",
-            )
diff --git a/tests/rest/media/v1/test_filepath.py b/tests/rest/media/v1/test_filepath.py
deleted file mode 100644
index 43e6f0f70a..0000000000
--- a/tests/rest/media/v1/test_filepath.py
+++ /dev/null
@@ -1,595 +0,0 @@
-# Copyright 2021 The Matrix.org Foundation C.I.C.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import inspect
-import os
-from typing import Iterable
-
-from synapse.rest.media.v1.filepath import MediaFilePaths, _wrap_with_jail_check
-
-from tests import unittest
-
-
-class MediaFilePathsTestCase(unittest.TestCase):
-    def setUp(self) -> None:
-        super().setUp()
-
-        self.filepaths = MediaFilePaths("/media_store")
-
-    def test_local_media_filepath(self) -> None:
-        """Test local media paths"""
-        self.assertEqual(
-            self.filepaths.local_media_filepath_rel("GerZNDnDZVjsOtardLuwfIBg"),
-            "local_content/Ge/rZ/NDnDZVjsOtardLuwfIBg",
-        )
-        self.assertEqual(
-            self.filepaths.local_media_filepath("GerZNDnDZVjsOtardLuwfIBg"),
-            "/media_store/local_content/Ge/rZ/NDnDZVjsOtardLuwfIBg",
-        )
-
-    def test_local_media_thumbnail(self) -> None:
-        """Test local media thumbnail paths"""
-        self.assertEqual(
-            self.filepaths.local_media_thumbnail_rel(
-                "GerZNDnDZVjsOtardLuwfIBg", 800, 600, "image/jpeg", "scale"
-            ),
-            "local_thumbnails/Ge/rZ/NDnDZVjsOtardLuwfIBg/800-600-image-jpeg-scale",
-        )
-        self.assertEqual(
-            self.filepaths.local_media_thumbnail(
-                "GerZNDnDZVjsOtardLuwfIBg", 800, 600, "image/jpeg", "scale"
-            ),
-            "/media_store/local_thumbnails/Ge/rZ/NDnDZVjsOtardLuwfIBg/800-600-image-jpeg-scale",
-        )
-
-    def test_local_media_thumbnail_dir(self) -> None:
-        """Test local media thumbnail directory paths"""
-        self.assertEqual(
-            self.filepaths.local_media_thumbnail_dir("GerZNDnDZVjsOtardLuwfIBg"),
-            "/media_store/local_thumbnails/Ge/rZ/NDnDZVjsOtardLuwfIBg",
-        )
-
-    def test_remote_media_filepath(self) -> None:
-        """Test remote media paths"""
-        self.assertEqual(
-            self.filepaths.remote_media_filepath_rel(
-                "example.com", "GerZNDnDZVjsOtardLuwfIBg"
-            ),
-            "remote_content/example.com/Ge/rZ/NDnDZVjsOtardLuwfIBg",
-        )
-        self.assertEqual(
-            self.filepaths.remote_media_filepath(
-                "example.com", "GerZNDnDZVjsOtardLuwfIBg"
-            ),
-            "/media_store/remote_content/example.com/Ge/rZ/NDnDZVjsOtardLuwfIBg",
-        )
-
-    def test_remote_media_thumbnail(self) -> None:
-        """Test remote media thumbnail paths"""
-        self.assertEqual(
-            self.filepaths.remote_media_thumbnail_rel(
-                "example.com",
-                "GerZNDnDZVjsOtardLuwfIBg",
-                800,
-                600,
-                "image/jpeg",
-                "scale",
-            ),
-            "remote_thumbnail/example.com/Ge/rZ/NDnDZVjsOtardLuwfIBg/800-600-image-jpeg-scale",
-        )
-        self.assertEqual(
-            self.filepaths.remote_media_thumbnail(
-                "example.com",
-                "GerZNDnDZVjsOtardLuwfIBg",
-                800,
-                600,
-                "image/jpeg",
-                "scale",
-            ),
-            "/media_store/remote_thumbnail/example.com/Ge/rZ/NDnDZVjsOtardLuwfIBg/800-600-image-jpeg-scale",
-        )
-
-    def test_remote_media_thumbnail_legacy(self) -> None:
-        """Test old-style remote media thumbnail paths"""
-        self.assertEqual(
-            self.filepaths.remote_media_thumbnail_rel_legacy(
-                "example.com", "GerZNDnDZVjsOtardLuwfIBg", 800, 600, "image/jpeg"
-            ),
-            "remote_thumbnail/example.com/Ge/rZ/NDnDZVjsOtardLuwfIBg/800-600-image-jpeg",
-        )
-
-    def test_remote_media_thumbnail_dir(self) -> None:
-        """Test remote media thumbnail directory paths"""
-        self.assertEqual(
-            self.filepaths.remote_media_thumbnail_dir(
-                "example.com", "GerZNDnDZVjsOtardLuwfIBg"
-            ),
-            "/media_store/remote_thumbnail/example.com/Ge/rZ/NDnDZVjsOtardLuwfIBg",
-        )
-
-    def test_url_cache_filepath(self) -> None:
-        """Test URL cache paths"""
-        self.assertEqual(
-            self.filepaths.url_cache_filepath_rel("2020-01-02_GerZNDnDZVjsOtar"),
-            "url_cache/2020-01-02/GerZNDnDZVjsOtar",
-        )
-        self.assertEqual(
-            self.filepaths.url_cache_filepath("2020-01-02_GerZNDnDZVjsOtar"),
-            "/media_store/url_cache/2020-01-02/GerZNDnDZVjsOtar",
-        )
-
-    def test_url_cache_filepath_legacy(self) -> None:
-        """Test old-style URL cache paths"""
-        self.assertEqual(
-            self.filepaths.url_cache_filepath_rel("GerZNDnDZVjsOtardLuwfIBg"),
-            "url_cache/Ge/rZ/NDnDZVjsOtardLuwfIBg",
-        )
-        self.assertEqual(
-            self.filepaths.url_cache_filepath("GerZNDnDZVjsOtardLuwfIBg"),
-            "/media_store/url_cache/Ge/rZ/NDnDZVjsOtardLuwfIBg",
-        )
-
-    def test_url_cache_filepath_dirs_to_delete(self) -> None:
-        """Test URL cache cleanup paths"""
-        self.assertEqual(
-            self.filepaths.url_cache_filepath_dirs_to_delete(
-                "2020-01-02_GerZNDnDZVjsOtar"
-            ),
-            ["/media_store/url_cache/2020-01-02"],
-        )
-
-    def test_url_cache_filepath_dirs_to_delete_legacy(self) -> None:
-        """Test old-style URL cache cleanup paths"""
-        self.assertEqual(
-            self.filepaths.url_cache_filepath_dirs_to_delete(
-                "GerZNDnDZVjsOtardLuwfIBg"
-            ),
-            [
-                "/media_store/url_cache/Ge/rZ",
-                "/media_store/url_cache/Ge",
-            ],
-        )
-
-    def test_url_cache_thumbnail(self) -> None:
-        """Test URL cache thumbnail paths"""
-        self.assertEqual(
-            self.filepaths.url_cache_thumbnail_rel(
-                "2020-01-02_GerZNDnDZVjsOtar", 800, 600, "image/jpeg", "scale"
-            ),
-            "url_cache_thumbnails/2020-01-02/GerZNDnDZVjsOtar/800-600-image-jpeg-scale",
-        )
-        self.assertEqual(
-            self.filepaths.url_cache_thumbnail(
-                "2020-01-02_GerZNDnDZVjsOtar", 800, 600, "image/jpeg", "scale"
-            ),
-            "/media_store/url_cache_thumbnails/2020-01-02/GerZNDnDZVjsOtar/800-600-image-jpeg-scale",
-        )
-
-    def test_url_cache_thumbnail_legacy(self) -> None:
-        """Test old-style URL cache thumbnail paths"""
-        self.assertEqual(
-            self.filepaths.url_cache_thumbnail_rel(
-                "GerZNDnDZVjsOtardLuwfIBg", 800, 600, "image/jpeg", "scale"
-            ),
-            "url_cache_thumbnails/Ge/rZ/NDnDZVjsOtardLuwfIBg/800-600-image-jpeg-scale",
-        )
-        self.assertEqual(
-            self.filepaths.url_cache_thumbnail(
-                "GerZNDnDZVjsOtardLuwfIBg", 800, 600, "image/jpeg", "scale"
-            ),
-            "/media_store/url_cache_thumbnails/Ge/rZ/NDnDZVjsOtardLuwfIBg/800-600-image-jpeg-scale",
-        )
-
-    def test_url_cache_thumbnail_directory(self) -> None:
-        """Test URL cache thumbnail directory paths"""
-        self.assertEqual(
-            self.filepaths.url_cache_thumbnail_directory_rel(
-                "2020-01-02_GerZNDnDZVjsOtar"
-            ),
-            "url_cache_thumbnails/2020-01-02/GerZNDnDZVjsOtar",
-        )
-        self.assertEqual(
-            self.filepaths.url_cache_thumbnail_directory("2020-01-02_GerZNDnDZVjsOtar"),
-            "/media_store/url_cache_thumbnails/2020-01-02/GerZNDnDZVjsOtar",
-        )
-
-    def test_url_cache_thumbnail_directory_legacy(self) -> None:
-        """Test old-style URL cache thumbnail directory paths"""
-        self.assertEqual(
-            self.filepaths.url_cache_thumbnail_directory_rel(
-                "GerZNDnDZVjsOtardLuwfIBg"
-            ),
-            "url_cache_thumbnails/Ge/rZ/NDnDZVjsOtardLuwfIBg",
-        )
-        self.assertEqual(
-            self.filepaths.url_cache_thumbnail_directory("GerZNDnDZVjsOtardLuwfIBg"),
-            "/media_store/url_cache_thumbnails/Ge/rZ/NDnDZVjsOtardLuwfIBg",
-        )
-
-    def test_url_cache_thumbnail_dirs_to_delete(self) -> None:
-        """Test URL cache thumbnail cleanup paths"""
-        self.assertEqual(
-            self.filepaths.url_cache_thumbnail_dirs_to_delete(
-                "2020-01-02_GerZNDnDZVjsOtar"
-            ),
-            [
-                "/media_store/url_cache_thumbnails/2020-01-02/GerZNDnDZVjsOtar",
-                "/media_store/url_cache_thumbnails/2020-01-02",
-            ],
-        )
-
-    def test_url_cache_thumbnail_dirs_to_delete_legacy(self) -> None:
-        """Test old-style URL cache thumbnail cleanup paths"""
-        self.assertEqual(
-            self.filepaths.url_cache_thumbnail_dirs_to_delete(
-                "GerZNDnDZVjsOtardLuwfIBg"
-            ),
-            [
-                "/media_store/url_cache_thumbnails/Ge/rZ/NDnDZVjsOtardLuwfIBg",
-                "/media_store/url_cache_thumbnails/Ge/rZ",
-                "/media_store/url_cache_thumbnails/Ge",
-            ],
-        )
-
-    def test_server_name_validation(self) -> None:
-        """Test validation of server names"""
-        self._test_path_validation(
-            [
-                "remote_media_filepath_rel",
-                "remote_media_filepath",
-                "remote_media_thumbnail_rel",
-                "remote_media_thumbnail",
-                "remote_media_thumbnail_rel_legacy",
-                "remote_media_thumbnail_dir",
-            ],
-            parameter="server_name",
-            valid_values=[
-                "matrix.org",
-                "matrix.org:8448",
-                "matrix-federation.matrix.org",
-                "matrix-federation.matrix.org:8448",
-                "10.1.12.123",
-                "10.1.12.123:8448",
-                "[fd00:abcd::ffff]",
-                "[fd00:abcd::ffff]:8448",
-            ],
-            invalid_values=[
-                "/matrix.org",
-                "matrix.org/..",
-                "matrix.org\x00",
-                "",
-                ".",
-                "..",
-                "/",
-            ],
-        )
-
-    def test_file_id_validation(self) -> None:
-        """Test validation of local, remote and legacy URL cache file / media IDs"""
-        # File / media IDs get split into three parts to form paths, consisting of the
-        # first two characters, next two characters and rest of the ID.
-        valid_file_ids = [
-            "GerZNDnDZVjsOtardLuwfIBg",
-            # Unexpected, but produces an acceptable path:
-            "GerZN",  # "N" becomes the last directory
-        ]
-        invalid_file_ids = [
-            "/erZNDnDZVjsOtardLuwfIBg",
-            "Ge/ZNDnDZVjsOtardLuwfIBg",
-            "GerZ/DnDZVjsOtardLuwfIBg",
-            "GerZ/..",
-            "G\x00rZNDnDZVjsOtardLuwfIBg",
-            "Ger\x00NDnDZVjsOtardLuwfIBg",
-            "GerZNDnDZVjsOtardLuwfIBg\x00",
-            "",
-            "Ge",
-            "GerZ",
-            "GerZ.",
-            "..rZNDnDZVjsOtardLuwfIBg",
-            "Ge..NDnDZVjsOtardLuwfIBg",
-            "GerZ..",
-            "GerZ/",
-        ]
-
-        self._test_path_validation(
-            [
-                "local_media_filepath_rel",
-                "local_media_filepath",
-                "local_media_thumbnail_rel",
-                "local_media_thumbnail",
-                "local_media_thumbnail_dir",
-                # Legacy URL cache media IDs
-                "url_cache_filepath_rel",
-                "url_cache_filepath",
-                # `url_cache_filepath_dirs_to_delete` is tested below.
-                "url_cache_thumbnail_rel",
-                "url_cache_thumbnail",
-                "url_cache_thumbnail_directory_rel",
-                "url_cache_thumbnail_directory",
-                "url_cache_thumbnail_dirs_to_delete",
-            ],
-            parameter="media_id",
-            valid_values=valid_file_ids,
-            invalid_values=invalid_file_ids,
-        )
-
-        # `url_cache_filepath_dirs_to_delete` ignores what would be the last path
-        # component, so only the first 4 characters matter.
-        self._test_path_validation(
-            [
-                "url_cache_filepath_dirs_to_delete",
-            ],
-            parameter="media_id",
-            valid_values=valid_file_ids,
-            invalid_values=[
-                "/erZNDnDZVjsOtardLuwfIBg",
-                "Ge/ZNDnDZVjsOtardLuwfIBg",
-                "G\x00rZNDnDZVjsOtardLuwfIBg",
-                "Ger\x00NDnDZVjsOtardLuwfIBg",
-                "",
-                "Ge",
-                "..rZNDnDZVjsOtardLuwfIBg",
-                "Ge..NDnDZVjsOtardLuwfIBg",
-            ],
-        )
-
-        self._test_path_validation(
-            [
-                "remote_media_filepath_rel",
-                "remote_media_filepath",
-                "remote_media_thumbnail_rel",
-                "remote_media_thumbnail",
-                "remote_media_thumbnail_rel_legacy",
-                "remote_media_thumbnail_dir",
-            ],
-            parameter="file_id",
-            valid_values=valid_file_ids,
-            invalid_values=invalid_file_ids,
-        )
-
-    def test_url_cache_media_id_validation(self) -> None:
-        """Test validation of URL cache media IDs"""
-        self._test_path_validation(
-            [
-                "url_cache_filepath_rel",
-                "url_cache_filepath",
-                # `url_cache_filepath_dirs_to_delete` only cares about the date prefix
-                "url_cache_thumbnail_rel",
-                "url_cache_thumbnail",
-                "url_cache_thumbnail_directory_rel",
-                "url_cache_thumbnail_directory",
-                "url_cache_thumbnail_dirs_to_delete",
-            ],
-            parameter="media_id",
-            valid_values=[
-                "2020-01-02_GerZNDnDZVjsOtar",
-                "2020-01-02_G",  # Unexpected, but produces an acceptable path
-            ],
-            invalid_values=[
-                "2020-01-02",
-                "2020-01-02-",
-                "2020-01-02-.",
-                "2020-01-02-..",
-                "2020-01-02-/",
-                "2020-01-02-/GerZNDnDZVjsOtar",
-                "2020-01-02-GerZNDnDZVjsOtar/..",
-                "2020-01-02-GerZNDnDZVjsOtar\x00",
-            ],
-        )
-
-    def test_content_type_validation(self) -> None:
-        """Test validation of thumbnail content types"""
-        self._test_path_validation(
-            [
-                "local_media_thumbnail_rel",
-                "local_media_thumbnail",
-                "remote_media_thumbnail_rel",
-                "remote_media_thumbnail",
-                "remote_media_thumbnail_rel_legacy",
-                "url_cache_thumbnail_rel",
-                "url_cache_thumbnail",
-            ],
-            parameter="content_type",
-            valid_values=[
-                "image/jpeg",
-            ],
-            invalid_values=[
-                "",  # ValueError: not enough values to unpack
-                "image/jpeg/abc",  # ValueError: too many values to unpack
-                "image/jpeg\x00",
-            ],
-        )
-
-    def test_thumbnail_method_validation(self) -> None:
-        """Test validation of thumbnail methods"""
-        self._test_path_validation(
-            [
-                "local_media_thumbnail_rel",
-                "local_media_thumbnail",
-                "remote_media_thumbnail_rel",
-                "remote_media_thumbnail",
-                "url_cache_thumbnail_rel",
-                "url_cache_thumbnail",
-            ],
-            parameter="method",
-            valid_values=[
-                "crop",
-                "scale",
-            ],
-            invalid_values=[
-                "/scale",
-                "scale/..",
-                "scale\x00",
-                "/",
-            ],
-        )
-
-    def _test_path_validation(
-        self,
-        methods: Iterable[str],
-        parameter: str,
-        valid_values: Iterable[str],
-        invalid_values: Iterable[str],
-    ) -> None:
-        """Test that the specified methods validate the named parameter as expected
-
-        Args:
-            methods: The names of `MediaFilePaths` methods to test
-            parameter: The name of the parameter to test
-            valid_values: A list of parameter values that are expected to be accepted
-            invalid_values: A list of parameter values that are expected to be rejected
-
-        Raises:
-            AssertionError: If a value was accepted when it should have failed
-                validation.
-            ValueError: If a value failed validation when it should have been accepted.
-        """
-        for method in methods:
-            get_path = getattr(self.filepaths, method)
-
-            parameters = inspect.signature(get_path).parameters
-            kwargs = {
-                "server_name": "matrix.org",
-                "media_id": "GerZNDnDZVjsOtardLuwfIBg",
-                "file_id": "GerZNDnDZVjsOtardLuwfIBg",
-                "width": 800,
-                "height": 600,
-                "content_type": "image/jpeg",
-                "method": "scale",
-            }
-
-            if get_path.__name__.startswith("url_"):
-                kwargs["media_id"] = "2020-01-02_GerZNDnDZVjsOtar"
-
-            kwargs = {k: v for k, v in kwargs.items() if k in parameters}
-            kwargs.pop(parameter)
-
-            for value in valid_values:
-                kwargs[parameter] = value
-                get_path(**kwargs)
-                # No exception should be raised
-
-            for value in invalid_values:
-                with self.assertRaises(ValueError):
-                    kwargs[parameter] = value
-                    path_or_list = get_path(**kwargs)
-                    self.fail(
-                        f"{value!r} unexpectedly passed validation: "
-                        f"{method} returned {path_or_list!r}"
-                    )
-
-
-class MediaFilePathsJailTestCase(unittest.TestCase):
-    def _check_relative_path(self, filepaths: MediaFilePaths, path: str) -> None:
-        """Passes a relative path through the jail check.
-
-        Args:
-            filepaths: The `MediaFilePaths` instance.
-            path: A path relative to the media store directory.
-
-        Raises:
-            ValueError: If the jail check fails.
-        """
-
-        @_wrap_with_jail_check(relative=True)
-        def _make_relative_path(self: MediaFilePaths, path: str) -> str:
-            return path
-
-        _make_relative_path(filepaths, path)
-
-    def _check_absolute_path(self, filepaths: MediaFilePaths, path: str) -> None:
-        """Passes an absolute path through the jail check.
-
-        Args:
-            filepaths: The `MediaFilePaths` instance.
-            path: A path relative to the media store directory.
-
-        Raises:
-            ValueError: If the jail check fails.
-        """
-
-        @_wrap_with_jail_check(relative=False)
-        def _make_absolute_path(self: MediaFilePaths, path: str) -> str:
-            return os.path.join(self.base_path, path)
-
-        _make_absolute_path(filepaths, path)
-
-    def test_traversal_inside(self) -> None:
-        """Test the jail check for paths that stay within the media directory."""
-        # Despite the `../`s, these paths still lie within the media directory and it's
-        # expected for the jail check to allow them through.
-        # These paths ought to trip the other checks in place and should never be
-        # returned.
-        filepaths = MediaFilePaths("/media_store")
-        path = "url_cache/2020-01-02/../../GerZNDnDZVjsOtar"
-        self._check_relative_path(filepaths, path)
-        self._check_absolute_path(filepaths, path)
-
-    def test_traversal_outside(self) -> None:
-        """Test that the jail check fails for paths that escape the media directory."""
-        filepaths = MediaFilePaths("/media_store")
-        path = "url_cache/2020-01-02/../../../GerZNDnDZVjsOtar"
-        with self.assertRaises(ValueError):
-            self._check_relative_path(filepaths, path)
-        with self.assertRaises(ValueError):
-            self._check_absolute_path(filepaths, path)
-
-    def test_traversal_reentry(self) -> None:
-        """Test the jail check for paths that exit and re-enter the media directory."""
-        # These paths lie outside the media directory if it is a symlink, and inside
-        # otherwise. Ideally the check should fail, but this proves difficult.
-        # This test documents the behaviour for this edge case.
-        # These paths ought to trip the other checks in place and should never be
-        # returned.
-        filepaths = MediaFilePaths("/media_store")
-        path = "url_cache/2020-01-02/../../../media_store/GerZNDnDZVjsOtar"
-        self._check_relative_path(filepaths, path)
-        self._check_absolute_path(filepaths, path)
-
-    def test_symlink(self) -> None:
-        """Test that a symlink does not cause the jail check to fail."""
-        media_store_path = self.mktemp()
-
-        # symlink the media store directory
-        os.symlink("/mnt/synapse/media_store", media_store_path)
-
-        # Test that relative and absolute paths don't trip the check
-        # NB: `media_store_path` is a relative path
-        filepaths = MediaFilePaths(media_store_path)
-        self._check_relative_path(filepaths, "url_cache/2020-01-02/GerZNDnDZVjsOtar")
-        self._check_absolute_path(filepaths, "url_cache/2020-01-02/GerZNDnDZVjsOtar")
-
-        filepaths = MediaFilePaths(os.path.abspath(media_store_path))
-        self._check_relative_path(filepaths, "url_cache/2020-01-02/GerZNDnDZVjsOtar")
-        self._check_absolute_path(filepaths, "url_cache/2020-01-02/GerZNDnDZVjsOtar")
-
-    def test_symlink_subdirectory(self) -> None:
-        """Test that a symlinked subdirectory does not cause the jail check to fail."""
-        media_store_path = self.mktemp()
-        os.mkdir(media_store_path)
-
-        # symlink `url_cache/`
-        os.symlink(
-            "/mnt/synapse/media_store_url_cache",
-            os.path.join(media_store_path, "url_cache"),
-        )
-
-        # Test that relative and absolute paths don't trip the check
-        # NB: `media_store_path` is a relative path
-        filepaths = MediaFilePaths(media_store_path)
-        self._check_relative_path(filepaths, "url_cache/2020-01-02/GerZNDnDZVjsOtar")
-        self._check_absolute_path(filepaths, "url_cache/2020-01-02/GerZNDnDZVjsOtar")
-
-        filepaths = MediaFilePaths(os.path.abspath(media_store_path))
-        self._check_relative_path(filepaths, "url_cache/2020-01-02/GerZNDnDZVjsOtar")
-        self._check_absolute_path(filepaths, "url_cache/2020-01-02/GerZNDnDZVjsOtar")
diff --git a/tests/rest/media/v1/test_html_preview.py b/tests/rest/media/v1/test_html_preview.py
deleted file mode 100644
index 1062081a06..0000000000
--- a/tests/rest/media/v1/test_html_preview.py
+++ /dev/null
@@ -1,542 +0,0 @@
-# Copyright 2014-2016 OpenMarket Ltd
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from synapse.rest.media.v1.preview_html import (
-    _get_html_media_encodings,
-    decode_body,
-    parse_html_to_open_graph,
-    summarize_paragraphs,
-)
-
-from tests import unittest
-
-try:
-    import lxml
-except ImportError:
-    lxml = None
-
-
-class SummarizeTestCase(unittest.TestCase):
-    if not lxml:
-        skip = "url preview feature requires lxml"
-
-    def test_long_summarize(self) -> None:
-        example_paras = [
-            """Tromsø (Norwegian pronunciation: [ˈtrʊmsœ] ( listen); Northern Sami:
-            Romsa; Finnish: Tromssa[2] Kven: Tromssa) is a city and municipality in
-            Troms county, Norway. The administrative centre of the municipality is
-            the city of Tromsø. Outside of Norway, Tromso and Tromsö are
-            alternative spellings of the city.Tromsø is considered the northernmost
-            city in the world with a population above 50,000. The most populous town
-            north of it is Alta, Norway, with a population of 14,272 (2013).""",
-            """Tromsø lies in Northern Norway. The municipality has a population of
-            (2015) 72,066, but with an annual influx of students it has over 75,000
-            most of the year. It is the largest urban area in Northern Norway and the
-            third largest north of the Arctic Circle (following Murmansk and Norilsk).
-            Most of Tromsø, including the city centre, is located on the island of
-            Tromsøya, 350 kilometres (217 mi) north of the Arctic Circle. In 2012,
-            Tromsøya had a population of 36,088. Substantial parts of the urban area
-            are also situated on the mainland to the east, and on parts of Kvaløya—a
-            large island to the west. Tromsøya is connected to the mainland by the Tromsø
-            Bridge and the Tromsøysund Tunnel, and to the island of Kvaløya by the
-            Sandnessund Bridge. Tromsø Airport connects the city to many destinations
-            in Europe. The city is warmer than most other places located on the same
-            latitude, due to the warming effect of the Gulf Stream.""",
-            """The city centre of Tromsø contains the highest number of old wooden
-            houses in Northern Norway, the oldest house dating from 1789. The Arctic
-            Cathedral, a modern church from 1965, is probably the most famous landmark
-            in Tromsø. The city is a cultural centre for its region, with several
-            festivals taking place in the summer. Some of Norway's best-known
-             musicians, Torbjørn Brundtland and Svein Berge of the electronica duo
-             Röyksopp and Lene Marlin grew up and started their careers in Tromsø.
-             Noted electronic musician Geir Jenssen also hails from Tromsø.""",
-        ]
-
-        desc = summarize_paragraphs(example_paras, min_size=200, max_size=500)
-
-        self.assertEqual(
-            desc,
-            "Tromsø (Norwegian pronunciation: [ˈtrʊmsœ] ( listen); Northern Sami:"
-            " Romsa; Finnish: Tromssa[2] Kven: Tromssa) is a city and municipality in"
-            " Troms county, Norway. The administrative centre of the municipality is"
-            " the city of Tromsø. Outside of Norway, Tromso and Tromsö are"
-            " alternative spellings of the city.Tromsø is considered the northernmost"
-            " city in the world with a population above 50,000. The most populous town"
-            " north of it is Alta, Norway, with a population of 14,272 (2013).",
-        )
-
-        desc = summarize_paragraphs(example_paras[1:], min_size=200, max_size=500)
-
-        self.assertEqual(
-            desc,
-            "Tromsø lies in Northern Norway. The municipality has a population of"
-            " (2015) 72,066, but with an annual influx of students it has over 75,000"
-            " most of the year. It is the largest urban area in Northern Norway and the"
-            " third largest north of the Arctic Circle (following Murmansk and Norilsk)."
-            " Most of Tromsø, including the city centre, is located on the island of"
-            " Tromsøya, 350 kilometres (217 mi) north of the Arctic Circle. In 2012,"
-            " Tromsøya had a population of 36,088. Substantial parts of the urban…",
-        )
-
-    def test_short_summarize(self) -> None:
-        example_paras = [
-            "Tromsø (Norwegian pronunciation: [ˈtrʊmsœ] ( listen); Northern Sami:"
-            " Romsa; Finnish: Tromssa[2] Kven: Tromssa) is a city and municipality in"
-            " Troms county, Norway.",
-            "Tromsø lies in Northern Norway. The municipality has a population of"
-            " (2015) 72,066, but with an annual influx of students it has over 75,000"
-            " most of the year.",
-            "The city centre of Tromsø contains the highest number of old wooden"
-            " houses in Northern Norway, the oldest house dating from 1789. The Arctic"
-            " Cathedral, a modern church from 1965, is probably the most famous landmark"
-            " in Tromsø.",
-        ]
-
-        desc = summarize_paragraphs(example_paras, min_size=200, max_size=500)
-
-        self.assertEqual(
-            desc,
-            "Tromsø (Norwegian pronunciation: [ˈtrʊmsœ] ( listen); Northern Sami:"
-            " Romsa; Finnish: Tromssa[2] Kven: Tromssa) is a city and municipality in"
-            " Troms county, Norway.\n"
-            "\n"
-            "Tromsø lies in Northern Norway. The municipality has a population of"
-            " (2015) 72,066, but with an annual influx of students it has over 75,000"
-            " most of the year.",
-        )
-
-    def test_small_then_large_summarize(self) -> None:
-        example_paras = [
-            "Tromsø (Norwegian pronunciation: [ˈtrʊmsœ] ( listen); Northern Sami:"
-            " Romsa; Finnish: Tromssa[2] Kven: Tromssa) is a city and municipality in"
-            " Troms county, Norway.",
-            "Tromsø lies in Northern Norway. The municipality has a population of"
-            " (2015) 72,066, but with an annual influx of students it has over 75,000"
-            " most of the year."
-            " The city centre of Tromsø contains the highest number of old wooden"
-            " houses in Northern Norway, the oldest house dating from 1789. The Arctic"
-            " Cathedral, a modern church from 1965, is probably the most famous landmark"
-            " in Tromsø.",
-        ]
-
-        desc = summarize_paragraphs(example_paras, min_size=200, max_size=500)
-        self.assertEqual(
-            desc,
-            "Tromsø (Norwegian pronunciation: [ˈtrʊmsœ] ( listen); Northern Sami:"
-            " Romsa; Finnish: Tromssa[2] Kven: Tromssa) is a city and municipality in"
-            " Troms county, Norway.\n"
-            "\n"
-            "Tromsø lies in Northern Norway. The municipality has a population of"
-            " (2015) 72,066, but with an annual influx of students it has over 75,000"
-            " most of the year. The city centre of Tromsø contains the highest number"
-            " of old wooden houses in Northern Norway, the oldest house dating from"
-            " 1789. The Arctic Cathedral, a modern church from…",
-        )
-
-
-class OpenGraphFromHtmlTestCase(unittest.TestCase):
-    if not lxml:
-        skip = "url preview feature requires lxml"
-
-    def test_simple(self) -> None:
-        html = b"""
-        <html>
-        <head><title>Foo</title></head>
-        <body>
-        Some text.
-        </body>
-        </html>
-        """
-
-        tree = decode_body(html, "http://example.com/test.html")
-        og = parse_html_to_open_graph(tree)
-
-        self.assertEqual(og, {"og:title": "Foo", "og:description": "Some text."})
-
-    def test_comment(self) -> None:
-        html = b"""
-        <html>
-        <head><title>Foo</title></head>
-        <body>
-        <!-- HTML comment -->
-        Some text.
-        </body>
-        </html>
-        """
-
-        tree = decode_body(html, "http://example.com/test.html")
-        og = parse_html_to_open_graph(tree)
-
-        self.assertEqual(og, {"og:title": "Foo", "og:description": "Some text."})
-
-    def test_comment2(self) -> None:
-        html = b"""
-        <html>
-        <head><title>Foo</title></head>
-        <body>
-        Some text.
-        <!-- HTML comment -->
-        Some more text.
-        <p>Text</p>
-        More text
-        </body>
-        </html>
-        """
-
-        tree = decode_body(html, "http://example.com/test.html")
-        og = parse_html_to_open_graph(tree)
-
-        self.assertEqual(
-            og,
-            {
-                "og:title": "Foo",
-                "og:description": "Some text.\n\nSome more text.\n\nText\n\nMore text",
-            },
-        )
-
-    def test_script(self) -> None:
-        html = b"""
-        <html>
-        <head><title>Foo</title></head>
-        <body>
-        <script> (function() {})() </script>
-        Some text.
-        </body>
-        </html>
-        """
-
-        tree = decode_body(html, "http://example.com/test.html")
-        og = parse_html_to_open_graph(tree)
-
-        self.assertEqual(og, {"og:title": "Foo", "og:description": "Some text."})
-
-    def test_missing_title(self) -> None:
-        html = b"""
-        <html>
-        <body>
-        Some text.
-        </body>
-        </html>
-        """
-
-        tree = decode_body(html, "http://example.com/test.html")
-        og = parse_html_to_open_graph(tree)
-
-        self.assertEqual(og, {"og:title": None, "og:description": "Some text."})
-
-        # Another variant is a title with no content.
-        html = b"""
-        <html>
-        <head><title></title></head>
-        <body>
-        <h1>Title</h1>
-        </body>
-        </html>
-        """
-
-        tree = decode_body(html, "http://example.com/test.html")
-        og = parse_html_to_open_graph(tree)
-
-        self.assertEqual(og, {"og:title": "Title", "og:description": "Title"})
-
-    def test_h1_as_title(self) -> None:
-        html = b"""
-        <html>
-        <meta property="og:description" content="Some text."/>
-        <body>
-        <h1>Title</h1>
-        </body>
-        </html>
-        """
-
-        tree = decode_body(html, "http://example.com/test.html")
-        og = parse_html_to_open_graph(tree)
-
-        self.assertEqual(og, {"og:title": "Title", "og:description": "Some text."})
-
-    def test_empty_description(self) -> None:
-        """Description tags with empty content should be ignored."""
-        html = b"""
-        <html>
-        <meta property="og:description" content=""/>
-        <meta property="og:description"/>
-        <meta name="description" content=""/>
-        <meta name="description"/>
-        <meta name="description" content="Finally!"/>
-        <body>
-        <h1>Title</h1>
-        </body>
-        </html>
-        """
-
-        tree = decode_body(html, "http://example.com/test.html")
-        og = parse_html_to_open_graph(tree)
-
-        self.assertEqual(og, {"og:title": "Title", "og:description": "Finally!"})
-
-    def test_missing_title_and_broken_h1(self) -> None:
-        html = b"""
-        <html>
-        <body>
-        <h1><a href="foo"/></h1>
-        Some text.
-        </body>
-        </html>
-        """
-
-        tree = decode_body(html, "http://example.com/test.html")
-        og = parse_html_to_open_graph(tree)
-
-        self.assertEqual(og, {"og:title": None, "og:description": "Some text."})
-
-    def test_empty(self) -> None:
-        """Test a body with no data in it."""
-        html = b""
-        tree = decode_body(html, "http://example.com/test.html")
-        self.assertIsNone(tree)
-
-    def test_no_tree(self) -> None:
-        """A valid body with no tree in it."""
-        html = b"\x00"
-        tree = decode_body(html, "http://example.com/test.html")
-        self.assertIsNone(tree)
-
-    def test_xml(self) -> None:
-        """Test decoding XML and ensure it works properly."""
-        # Note that the strip() call is important to ensure the xml tag starts
-        # at the initial byte.
-        html = b"""
-        <?xml version="1.0" encoding="UTF-8"?>
-
-        <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
-        <html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
-        <head><title>Foo</title></head><body>Some text.</body></html>
-        """.strip()
-        tree = decode_body(html, "http://example.com/test.html")
-        og = parse_html_to_open_graph(tree)
-        self.assertEqual(og, {"og:title": "Foo", "og:description": "Some text."})
-
-    def test_invalid_encoding(self) -> None:
-        """An invalid character encoding should be ignored and treated as UTF-8, if possible."""
-        html = b"""
-        <html>
-        <head><title>Foo</title></head>
-        <body>
-        Some text.
-        </body>
-        </html>
-        """
-        tree = decode_body(html, "http://example.com/test.html", "invalid-encoding")
-        og = parse_html_to_open_graph(tree)
-        self.assertEqual(og, {"og:title": "Foo", "og:description": "Some text."})
-
-    def test_invalid_encoding2(self) -> None:
-        """A body which doesn't match the sent character encoding."""
-        # Note that this contains an invalid UTF-8 sequence in the title.
-        html = b"""
-        <html>
-        <head><title>\xff\xff Foo</title></head>
-        <body>
-        Some text.
-        </body>
-        </html>
-        """
-        tree = decode_body(html, "http://example.com/test.html")
-        og = parse_html_to_open_graph(tree)
-        self.assertEqual(og, {"og:title": "ÿÿ Foo", "og:description": "Some text."})
-
-    def test_windows_1252(self) -> None:
-        """A body which uses cp1252, but doesn't declare that."""
-        html = b"""
-        <html>
-        <head><title>\xf3</title></head>
-        <body>
-        Some text.
-        </body>
-        </html>
-        """
-        tree = decode_body(html, "http://example.com/test.html")
-        og = parse_html_to_open_graph(tree)
-        self.assertEqual(og, {"og:title": "ó", "og:description": "Some text."})
-
-    def test_twitter_tag(self) -> None:
-        """Twitter card tags should be used if nothing else is available."""
-        html = b"""
-        <html>
-        <meta name="twitter:card" content="summary">
-        <meta name="twitter:description" content="Description">
-        <meta name="twitter:site" content="@matrixdotorg">
-        </html>
-        """
-        tree = decode_body(html, "http://example.com/test.html")
-        og = parse_html_to_open_graph(tree)
-        self.assertEqual(
-            og,
-            {
-                "og:title": None,
-                "og:description": "Description",
-                "og:site_name": "@matrixdotorg",
-            },
-        )
-
-        # But they shouldn't override Open Graph values.
-        html = b"""
-        <html>
-        <meta name="twitter:card" content="summary">
-        <meta name="twitter:description" content="Description">
-        <meta property="og:description" content="Real Description">
-        <meta name="twitter:site" content="@matrixdotorg">
-        <meta property="og:site_name" content="matrix.org">
-        </html>
-        """
-        tree = decode_body(html, "http://example.com/test.html")
-        og = parse_html_to_open_graph(tree)
-        self.assertEqual(
-            og,
-            {
-                "og:title": None,
-                "og:description": "Real Description",
-                "og:site_name": "matrix.org",
-            },
-        )
-
-    def test_nested_nodes(self) -> None:
-        """A body with some nested nodes. Tests that we iterate over children
-        in the right order (and don't reverse the order of the text)."""
-        html = b"""
-        <a href="somewhere">Welcome <b>the bold <u>and underlined text <svg>
-        with a cheeky SVG</svg></u> and <strong>some</strong> tail text</b></a>
-        """
-        tree = decode_body(html, "http://example.com/test.html")
-        og = parse_html_to_open_graph(tree)
-        self.assertEqual(
-            og,
-            {
-                "og:title": None,
-                "og:description": "Welcome\n\nthe bold\n\nand underlined text\n\nand\n\nsome\n\ntail text",
-            },
-        )
-
-
-class MediaEncodingTestCase(unittest.TestCase):
-    def test_meta_charset(self) -> None:
-        """A character encoding is found via the meta tag."""
-        encodings = _get_html_media_encodings(
-            b"""
-        <html>
-        <head><meta charset="ascii">
-        </head>
-        </html>
-        """,
-            "text/html",
-        )
-        self.assertEqual(list(encodings), ["ascii", "utf-8", "cp1252"])
-
-        # A less well-formed version.
-        encodings = _get_html_media_encodings(
-            b"""
-        <html>
-        <head>< meta charset = ascii>
-        </head>
-        </html>
-        """,
-            "text/html",
-        )
-        self.assertEqual(list(encodings), ["ascii", "utf-8", "cp1252"])
-
-    def test_meta_charset_underscores(self) -> None:
-        """A character encoding contains underscore."""
-        encodings = _get_html_media_encodings(
-            b"""
-        <html>
-        <head><meta charset="Shift_JIS">
-        </head>
-        </html>
-        """,
-            "text/html",
-        )
-        self.assertEqual(list(encodings), ["shift_jis", "utf-8", "cp1252"])
-
-    def test_xml_encoding(self) -> None:
-        """A character encoding is found via the meta tag."""
-        encodings = _get_html_media_encodings(
-            b"""
-        <?xml version="1.0" encoding="ascii"?>
-        <html>
-        </html>
-        """,
-            "text/html",
-        )
-        self.assertEqual(list(encodings), ["ascii", "utf-8", "cp1252"])
-
-    def test_meta_xml_encoding(self) -> None:
-        """Meta tags take precedence over XML encoding."""
-        encodings = _get_html_media_encodings(
-            b"""
-        <?xml version="1.0" encoding="ascii"?>
-        <html>
-        <head><meta charset="UTF-16">
-        </head>
-        </html>
-        """,
-            "text/html",
-        )
-        self.assertEqual(list(encodings), ["utf-16", "ascii", "utf-8", "cp1252"])
-
-    def test_content_type(self) -> None:
-        """A character encoding is found via the Content-Type header."""
-        # Test a few variations of the header.
-        headers = (
-            'text/html; charset="ascii";',
-            "text/html;charset=ascii;",
-            'text/html;  charset="ascii"',
-            "text/html; charset=ascii",
-            'text/html; charset="ascii;',
-            'text/html; charset=ascii";',
-        )
-        for header in headers:
-            encodings = _get_html_media_encodings(b"", header)
-            self.assertEqual(list(encodings), ["ascii", "utf-8", "cp1252"])
-
-    def test_fallback(self) -> None:
-        """A character encoding cannot be found in the body or header."""
-        encodings = _get_html_media_encodings(b"", "text/html")
-        self.assertEqual(list(encodings), ["utf-8", "cp1252"])
-
-    def test_duplicates(self) -> None:
-        """Ensure each encoding is only attempted once."""
-        encodings = _get_html_media_encodings(
-            b"""
-        <?xml version="1.0" encoding="utf8"?>
-        <html>
-        <head><meta charset="UTF-8">
-        </head>
-        </html>
-        """,
-            'text/html; charset="UTF_8"',
-        )
-        self.assertEqual(list(encodings), ["utf-8", "cp1252"])
-
-    def test_unknown_invalid(self) -> None:
-        """A character encoding should be ignored if it is unknown or invalid."""
-        encodings = _get_html_media_encodings(
-            b"""
-        <html>
-        <head><meta charset="invalid">
-        </head>
-        </html>
-        """,
-            'text/html; charset="invalid"',
-        )
-        self.assertEqual(list(encodings), ["utf-8", "cp1252"])
diff --git a/tests/rest/media/v1/test_media_storage.py b/tests/rest/media/v1/test_media_storage.py
deleted file mode 100644
index 8ed27179c4..0000000000
--- a/tests/rest/media/v1/test_media_storage.py
+++ /dev/null
@@ -1,792 +0,0 @@
-# Copyright 2018-2021 The Matrix.org Foundation C.I.C.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import os
-import shutil
-import tempfile
-from binascii import unhexlify
-from io import BytesIO
-from typing import Any, BinaryIO, ClassVar, Dict, List, Optional, Tuple, Union
-from unittest.mock import Mock
-from urllib import parse
-
-import attr
-from parameterized import parameterized, parameterized_class
-from PIL import Image as Image
-from typing_extensions import Literal
-
-from twisted.internet import defer
-from twisted.internet.defer import Deferred
-from twisted.test.proto_helpers import MemoryReactor
-
-from synapse.api.errors import Codes
-from synapse.events import EventBase
-from synapse.events.spamcheck import load_legacy_spam_checkers
-from synapse.http.types import QueryParams
-from synapse.logging.context import make_deferred_yieldable
-from synapse.module_api import ModuleApi
-from synapse.rest import admin
-from synapse.rest.client import login
-from synapse.rest.media.v1._base import FileInfo
-from synapse.rest.media.v1.filepath import MediaFilePaths
-from synapse.rest.media.v1.media_storage import MediaStorage, ReadableFileWrapper
-from synapse.rest.media.v1.storage_provider import FileStorageProviderBackend
-from synapse.server import HomeServer
-from synapse.types import JsonDict, RoomAlias
-from synapse.util import Clock
-
-from tests import unittest
-from tests.server import FakeChannel, FakeSite, make_request
-from tests.test_utils import SMALL_PNG
-from tests.utils import default_config
-
-
-class MediaStorageTests(unittest.HomeserverTestCase):
-    needs_threadpool = True
-
-    def prepare(self, reactor: MemoryReactor, clock: Clock, hs: HomeServer) -> None:
-        self.test_dir = tempfile.mkdtemp(prefix="synapse-tests-")
-        self.addCleanup(shutil.rmtree, self.test_dir)
-
-        self.primary_base_path = os.path.join(self.test_dir, "primary")
-        self.secondary_base_path = os.path.join(self.test_dir, "secondary")
-
-        hs.config.media.media_store_path = self.primary_base_path
-
-        storage_providers = [FileStorageProviderBackend(hs, self.secondary_base_path)]
-
-        self.filepaths = MediaFilePaths(self.primary_base_path)
-        self.media_storage = MediaStorage(
-            hs, self.primary_base_path, self.filepaths, storage_providers
-        )
-
-    def test_ensure_media_is_in_local_cache(self) -> None:
-        media_id = "some_media_id"
-        test_body = "Test\n"
-
-        # First we create a file that is in a storage provider but not in the
-        # local primary media store
-        rel_path = self.filepaths.local_media_filepath_rel(media_id)
-        secondary_path = os.path.join(self.secondary_base_path, rel_path)
-
-        os.makedirs(os.path.dirname(secondary_path))
-
-        with open(secondary_path, "w") as f:
-            f.write(test_body)
-
-        # Now we run ensure_media_is_in_local_cache, which should copy the file
-        # to the local cache.
-        file_info = FileInfo(None, media_id)
-
-        # This uses a real blocking threadpool so we have to wait for it to be
-        # actually done :/
-        x = defer.ensureDeferred(
-            self.media_storage.ensure_media_is_in_local_cache(file_info)
-        )
-
-        # Hotloop until the threadpool does its job...
-        self.wait_on_thread(x)
-
-        local_path = self.get_success(x)
-
-        self.assertTrue(os.path.exists(local_path))
-
-        # Asserts the file is under the expected local cache directory
-        self.assertEqual(
-            os.path.commonprefix([self.primary_base_path, local_path]),
-            self.primary_base_path,
-        )
-
-        with open(local_path) as f:
-            body = f.read()
-
-        self.assertEqual(test_body, body)
-
-
-@attr.s(auto_attribs=True, slots=True, frozen=True)
-class _TestImage:
-    """An image for testing thumbnailing with the expected results
-
-    Attributes:
-        data: The raw image to thumbnail
-        content_type: The type of the image as a content type, e.g. "image/png"
-        extension: The extension associated with the format, e.g. ".png"
-        expected_cropped: The expected bytes from cropped thumbnailing, or None if
-            test should just check for success.
-        expected_scaled: The expected bytes from scaled thumbnailing, or None if
-            test should just check for a valid image returned.
-        expected_found: True if the file should exist on the server, or False if
-            a 404/400 is expected.
-        unable_to_thumbnail: True if we expect the thumbnailing to fail (400), or
-            False if the thumbnailing should succeed or a normal 404 is expected.
-    """
-
-    data: bytes
-    content_type: bytes
-    extension: bytes
-    expected_cropped: Optional[bytes] = None
-    expected_scaled: Optional[bytes] = None
-    expected_found: bool = True
-    unable_to_thumbnail: bool = False
-
-
-@parameterized_class(
-    ("test_image",),
-    [
-        # small png
-        (
-            _TestImage(
-                SMALL_PNG,
-                b"image/png",
-                b".png",
-                unhexlify(
-                    b"89504e470d0a1a0a0000000d4948445200000020000000200806"
-                    b"000000737a7af40000001a49444154789cedc101010000008220"
-                    b"ffaf6e484001000000ef0610200001194334ee0000000049454e"
-                    b"44ae426082"
-                ),
-                unhexlify(
-                    b"89504e470d0a1a0a0000000d4948445200000001000000010806"
-                    b"0000001f15c4890000000d49444154789c636060606000000005"
-                    b"0001a5f645400000000049454e44ae426082"
-                ),
-            ),
-        ),
-        # small png with transparency.
-        (
-            _TestImage(
-                unhexlify(
-                    b"89504e470d0a1a0a0000000d49484452000000010000000101000"
-                    b"00000376ef9240000000274524e5300010194fdae0000000a4944"
-                    b"4154789c636800000082008177cd72b60000000049454e44ae426"
-                    b"082"
-                ),
-                b"image/png",
-                b".png",
-                # Note that we don't check the output since it varies across
-                # different versions of Pillow.
-            ),
-        ),
-        # small lossless webp
-        (
-            _TestImage(
-                unhexlify(
-                    b"524946461a000000574542505650384c0d0000002f0000001007"
-                    b"1011118888fe0700"
-                ),
-                b"image/webp",
-                b".webp",
-            ),
-        ),
-        # an empty file
-        (
-            _TestImage(
-                b"",
-                b"image/gif",
-                b".gif",
-                expected_found=False,
-                unable_to_thumbnail=True,
-            ),
-        ),
-    ],
-)
-class MediaRepoTests(unittest.HomeserverTestCase):
-    test_image: ClassVar[_TestImage]
-    hijack_auth = True
-    user_id = "@test:user"
-
-    def make_homeserver(self, reactor: MemoryReactor, clock: Clock) -> HomeServer:
-        self.fetches: List[
-            Tuple[
-                "Deferred[Tuple[bytes, Tuple[int, Dict[bytes, List[bytes]]]]]",
-                str,
-                str,
-                Optional[QueryParams],
-            ]
-        ] = []
-
-        def get_file(
-            destination: str,
-            path: str,
-            output_stream: BinaryIO,
-            args: Optional[QueryParams] = None,
-            retry_on_dns_fail: bool = True,
-            max_size: Optional[int] = None,
-            ignore_backoff: bool = False,
-        ) -> "Deferred[Tuple[int, Dict[bytes, List[bytes]]]]":
-            """A mock for MatrixFederationHttpClient.get_file."""
-
-            def write_to(
-                r: Tuple[bytes, Tuple[int, Dict[bytes, List[bytes]]]]
-            ) -> Tuple[int, Dict[bytes, List[bytes]]]:
-                data, response = r
-                output_stream.write(data)
-                return response
-
-            d: Deferred[Tuple[bytes, Tuple[int, Dict[bytes, List[bytes]]]]] = Deferred()
-            self.fetches.append((d, destination, path, args))
-            # Note that this callback changes the value held by d.
-            d_after_callback = d.addCallback(write_to)
-            return make_deferred_yieldable(d_after_callback)
-
-        # Mock out the homeserver's MatrixFederationHttpClient
-        client = Mock()
-        client.get_file = get_file
-
-        self.storage_path = self.mktemp()
-        self.media_store_path = self.mktemp()
-        os.mkdir(self.storage_path)
-        os.mkdir(self.media_store_path)
-
-        config = self.default_config()
-        config["media_store_path"] = self.media_store_path
-        config["max_image_pixels"] = 2000000
-
-        provider_config = {
-            "module": "synapse.rest.media.v1.storage_provider.FileStorageProviderBackend",
-            "store_local": True,
-            "store_synchronous": False,
-            "store_remote": True,
-            "config": {"directory": self.storage_path},
-        }
-        config["media_storage_providers"] = [provider_config]
-
-        hs = self.setup_test_homeserver(config=config, federation_http_client=client)
-
-        return hs
-
-    def prepare(self, reactor: MemoryReactor, clock: Clock, hs: HomeServer) -> None:
-        media_resource = hs.get_media_repository_resource()
-        self.download_resource = media_resource.children[b"download"]
-        self.thumbnail_resource = media_resource.children[b"thumbnail"]
-        self.store = hs.get_datastores().main
-        self.media_repo = hs.get_media_repository()
-
-        self.media_id = "example.com/12345"
-
-    def _req(
-        self, content_disposition: Optional[bytes], include_content_type: bool = True
-    ) -> FakeChannel:
-        channel = make_request(
-            self.reactor,
-            FakeSite(self.download_resource, self.reactor),
-            "GET",
-            self.media_id,
-            shorthand=False,
-            await_result=False,
-        )
-        self.pump()
-
-        # We've made one fetch, to example.com, using the media URL, and asking
-        # the other server not to do a remote fetch
-        self.assertEqual(len(self.fetches), 1)
-        self.assertEqual(self.fetches[0][1], "example.com")
-        self.assertEqual(
-            self.fetches[0][2], "/_matrix/media/r0/download/" + self.media_id
-        )
-        self.assertEqual(self.fetches[0][3], {"allow_remote": "false"})
-
-        headers = {
-            b"Content-Length": [b"%d" % (len(self.test_image.data))],
-        }
-
-        if include_content_type:
-            headers[b"Content-Type"] = [self.test_image.content_type]
-
-        if content_disposition:
-            headers[b"Content-Disposition"] = [content_disposition]
-
-        self.fetches[0][0].callback(
-            (self.test_image.data, (len(self.test_image.data), headers))
-        )
-
-        self.pump()
-        self.assertEqual(channel.code, 200)
-
-        return channel
-
-    def test_handle_missing_content_type(self) -> None:
-        channel = self._req(
-            b"inline; filename=out" + self.test_image.extension,
-            include_content_type=False,
-        )
-        headers = channel.headers
-        self.assertEqual(channel.code, 200)
-        self.assertEqual(
-            headers.getRawHeaders(b"Content-Type"), [b"application/octet-stream"]
-        )
-
-    def test_disposition_filename_ascii(self) -> None:
-        """
-        If the filename is filename=<ascii> then Synapse will decode it as an
-        ASCII string, and use filename= in the response.
-        """
-        channel = self._req(b"inline; filename=out" + self.test_image.extension)
-
-        headers = channel.headers
-        self.assertEqual(
-            headers.getRawHeaders(b"Content-Type"), [self.test_image.content_type]
-        )
-        self.assertEqual(
-            headers.getRawHeaders(b"Content-Disposition"),
-            [b"inline; filename=out" + self.test_image.extension],
-        )
-
-    def test_disposition_filenamestar_utf8escaped(self) -> None:
-        """
-        If the filename is filename=*utf8''<utf8 escaped> then Synapse will
-        correctly decode it as the UTF-8 string, and use filename* in the
-        response.
-        """
-        filename = parse.quote("\u2603".encode()).encode("ascii")
-        channel = self._req(
-            b"inline; filename*=utf-8''" + filename + self.test_image.extension
-        )
-
-        headers = channel.headers
-        self.assertEqual(
-            headers.getRawHeaders(b"Content-Type"), [self.test_image.content_type]
-        )
-        self.assertEqual(
-            headers.getRawHeaders(b"Content-Disposition"),
-            [b"inline; filename*=utf-8''" + filename + self.test_image.extension],
-        )
-
-    def test_disposition_none(self) -> None:
-        """
-        If there is no filename, one isn't passed on in the Content-Disposition
-        of the request.
-        """
-        channel = self._req(None)
-
-        headers = channel.headers
-        self.assertEqual(
-            headers.getRawHeaders(b"Content-Type"), [self.test_image.content_type]
-        )
-        self.assertEqual(headers.getRawHeaders(b"Content-Disposition"), None)
-
-    def test_thumbnail_crop(self) -> None:
-        """Test that a cropped remote thumbnail is available."""
-        self._test_thumbnail(
-            "crop",
-            self.test_image.expected_cropped,
-            expected_found=self.test_image.expected_found,
-            unable_to_thumbnail=self.test_image.unable_to_thumbnail,
-        )
-
-    def test_thumbnail_scale(self) -> None:
-        """Test that a scaled remote thumbnail is available."""
-        self._test_thumbnail(
-            "scale",
-            self.test_image.expected_scaled,
-            expected_found=self.test_image.expected_found,
-            unable_to_thumbnail=self.test_image.unable_to_thumbnail,
-        )
-
-    def test_invalid_type(self) -> None:
-        """An invalid thumbnail type is never available."""
-        self._test_thumbnail(
-            "invalid",
-            None,
-            expected_found=False,
-            unable_to_thumbnail=self.test_image.unable_to_thumbnail,
-        )
-
-    @unittest.override_config(
-        {"thumbnail_sizes": [{"width": 32, "height": 32, "method": "scale"}]}
-    )
-    def test_no_thumbnail_crop(self) -> None:
-        """
-        Override the config to generate only scaled thumbnails, but request a cropped one.
-        """
-        self._test_thumbnail(
-            "crop",
-            None,
-            expected_found=False,
-            unable_to_thumbnail=self.test_image.unable_to_thumbnail,
-        )
-
-    @unittest.override_config(
-        {"thumbnail_sizes": [{"width": 32, "height": 32, "method": "crop"}]}
-    )
-    def test_no_thumbnail_scale(self) -> None:
-        """
-        Override the config to generate only cropped thumbnails, but request a scaled one.
-        """
-        self._test_thumbnail(
-            "scale",
-            None,
-            expected_found=False,
-            unable_to_thumbnail=self.test_image.unable_to_thumbnail,
-        )
-
-    def test_thumbnail_repeated_thumbnail(self) -> None:
-        """Test that fetching the same thumbnail works, and deleting the on disk
-        thumbnail regenerates it.
-        """
-        self._test_thumbnail(
-            "scale",
-            self.test_image.expected_scaled,
-            expected_found=self.test_image.expected_found,
-            unable_to_thumbnail=self.test_image.unable_to_thumbnail,
-        )
-
-        if not self.test_image.expected_found:
-            return
-
-        # Fetching again should work, without re-requesting the image from the
-        # remote.
-        params = "?width=32&height=32&method=scale"
-        channel = make_request(
-            self.reactor,
-            FakeSite(self.thumbnail_resource, self.reactor),
-            "GET",
-            self.media_id + params,
-            shorthand=False,
-            await_result=False,
-        )
-        self.pump()
-
-        self.assertEqual(channel.code, 200)
-        if self.test_image.expected_scaled:
-            self.assertEqual(
-                channel.result["body"],
-                self.test_image.expected_scaled,
-                channel.result["body"],
-            )
-
-        # Deleting the thumbnail on disk then re-requesting it should work as
-        # Synapse should regenerate missing thumbnails.
-        origin, media_id = self.media_id.split("/")
-        info = self.get_success(self.store.get_cached_remote_media(origin, media_id))
-        assert info is not None
-        file_id = info["filesystem_id"]
-
-        thumbnail_dir = self.media_repo.filepaths.remote_media_thumbnail_dir(
-            origin, file_id
-        )
-        shutil.rmtree(thumbnail_dir, ignore_errors=True)
-
-        channel = make_request(
-            self.reactor,
-            FakeSite(self.thumbnail_resource, self.reactor),
-            "GET",
-            self.media_id + params,
-            shorthand=False,
-            await_result=False,
-        )
-        self.pump()
-
-        self.assertEqual(channel.code, 200)
-        if self.test_image.expected_scaled:
-            self.assertEqual(
-                channel.result["body"],
-                self.test_image.expected_scaled,
-                channel.result["body"],
-            )
-
-    def _test_thumbnail(
-        self,
-        method: str,
-        expected_body: Optional[bytes],
-        expected_found: bool,
-        unable_to_thumbnail: bool = False,
-    ) -> None:
-        """Test the given thumbnailing method works as expected.
-
-        Args:
-            method: The thumbnailing method to use (crop, scale).
-            expected_body: The expected bytes from thumbnailing, or None if
-                test should just check for a valid image.
-            expected_found: True if the file should exist on the server, or False if
-                a 404/400 is expected.
-            unable_to_thumbnail: True if we expect the thumbnailing to fail (400), or
-                False if the thumbnailing should succeed or a normal 404 is expected.
-        """
-
-        params = "?width=32&height=32&method=" + method
-        channel = make_request(
-            self.reactor,
-            FakeSite(self.thumbnail_resource, self.reactor),
-            "GET",
-            self.media_id + params,
-            shorthand=False,
-            await_result=False,
-        )
-        self.pump()
-
-        headers = {
-            b"Content-Length": [b"%d" % (len(self.test_image.data))],
-            b"Content-Type": [self.test_image.content_type],
-        }
-        self.fetches[0][0].callback(
-            (self.test_image.data, (len(self.test_image.data), headers))
-        )
-        self.pump()
-
-        if expected_found:
-            self.assertEqual(channel.code, 200)
-
-            self.assertEqual(
-                channel.headers.getRawHeaders(b"Cross-Origin-Resource-Policy"),
-                [b"cross-origin"],
-            )
-
-            if expected_body is not None:
-                self.assertEqual(
-                    channel.result["body"], expected_body, channel.result["body"]
-                )
-            else:
-                # ensure that the result is at least some valid image
-                Image.open(BytesIO(channel.result["body"]))
-        elif unable_to_thumbnail:
-            # A 400 with a JSON body.
-            self.assertEqual(channel.code, 400)
-            self.assertEqual(
-                channel.json_body,
-                {
-                    "errcode": "M_UNKNOWN",
-                    "error": "Cannot find any thumbnails for the requested media ([b'example.com', b'12345']). This might mean the media is not a supported_media_format=(image/jpeg, image/jpg, image/webp, image/gif, image/png) or that thumbnailing failed for some other reason. (Dynamic thumbnails are disabled on this server.)",
-                },
-            )
-        else:
-            # A 404 with a JSON body.
-            self.assertEqual(channel.code, 404)
-            self.assertEqual(
-                channel.json_body,
-                {
-                    "errcode": "M_NOT_FOUND",
-                    "error": "Not found [b'example.com', b'12345']",
-                },
-            )
-
-    @parameterized.expand([("crop", 16), ("crop", 64), ("scale", 16), ("scale", 64)])
-    def test_same_quality(self, method: str, desired_size: int) -> None:
-        """Test that choosing between thumbnails with the same quality rating succeeds.
-
-        We are not particular about which thumbnail is chosen."""
-        self.assertIsNotNone(
-            self.thumbnail_resource._select_thumbnail(
-                desired_width=desired_size,
-                desired_height=desired_size,
-                desired_method=method,
-                desired_type=self.test_image.content_type,
-                # Provide two identical thumbnails which are guaranteed to have the same
-                # quality rating.
-                thumbnail_infos=[
-                    {
-                        "thumbnail_width": 32,
-                        "thumbnail_height": 32,
-                        "thumbnail_method": method,
-                        "thumbnail_type": self.test_image.content_type,
-                        "thumbnail_length": 256,
-                        "filesystem_id": f"thumbnail1{self.test_image.extension.decode()}",
-                    },
-                    {
-                        "thumbnail_width": 32,
-                        "thumbnail_height": 32,
-                        "thumbnail_method": method,
-                        "thumbnail_type": self.test_image.content_type,
-                        "thumbnail_length": 256,
-                        "filesystem_id": f"thumbnail2{self.test_image.extension.decode()}",
-                    },
-                ],
-                file_id=f"image{self.test_image.extension.decode()}",
-                url_cache=None,
-                server_name=None,
-            )
-        )
-
-    def test_x_robots_tag_header(self) -> None:
-        """
-        Tests that the `X-Robots-Tag` header is present, which informs web crawlers
-        to not index, archive, or follow links in media.
-        """
-        channel = self._req(b"inline; filename=out" + self.test_image.extension)
-
-        headers = channel.headers
-        self.assertEqual(
-            headers.getRawHeaders(b"X-Robots-Tag"),
-            [b"noindex, nofollow, noarchive, noimageindex"],
-        )
-
-    def test_cross_origin_resource_policy_header(self) -> None:
-        """
-        Test that the Cross-Origin-Resource-Policy header is set to "cross-origin"
-        allowing web clients to embed media from the downloads API.
-        """
-        channel = self._req(b"inline; filename=out" + self.test_image.extension)
-
-        headers = channel.headers
-
-        self.assertEqual(
-            headers.getRawHeaders(b"Cross-Origin-Resource-Policy"),
-            [b"cross-origin"],
-        )
-
-
-class TestSpamCheckerLegacy:
-    """A spam checker module that rejects all media that includes the bytes
-    `evil`.
-
-    Uses the legacy Spam-Checker API.
-    """
-
-    def __init__(self, config: Dict[str, Any], api: ModuleApi) -> None:
-        self.config = config
-        self.api = api
-
-    @staticmethod
-    def parse_config(config: Dict[str, Any]) -> Dict[str, Any]:
-        return config
-
-    async def check_event_for_spam(self, event: EventBase) -> Union[bool, str]:
-        return False  # allow all events
-
-    async def user_may_invite(
-        self,
-        inviter_userid: str,
-        invitee_userid: str,
-        room_id: str,
-    ) -> bool:
-        return True  # allow all invites
-
-    async def user_may_create_room(self, userid: str) -> bool:
-        return True  # allow all room creations
-
-    async def user_may_create_room_alias(
-        self, userid: str, room_alias: RoomAlias
-    ) -> bool:
-        return True  # allow all room aliases
-
-    async def user_may_publish_room(self, userid: str, room_id: str) -> bool:
-        return True  # allow publishing of all rooms
-
-    async def check_media_file_for_spam(
-        self, file_wrapper: ReadableFileWrapper, file_info: FileInfo
-    ) -> bool:
-        buf = BytesIO()
-        await file_wrapper.write_chunks_to(buf.write)
-
-        return b"evil" in buf.getvalue()
-
-
-class SpamCheckerTestCaseLegacy(unittest.HomeserverTestCase):
-    servlets = [
-        login.register_servlets,
-        admin.register_servlets,
-    ]
-
-    def prepare(self, reactor: MemoryReactor, clock: Clock, hs: HomeServer) -> None:
-        self.user = self.register_user("user", "pass")
-        self.tok = self.login("user", "pass")
-
-        # Allow for uploading and downloading to/from the media repo
-        self.media_repo = hs.get_media_repository_resource()
-        self.download_resource = self.media_repo.children[b"download"]
-        self.upload_resource = self.media_repo.children[b"upload"]
-
-        load_legacy_spam_checkers(hs)
-
-    def default_config(self) -> Dict[str, Any]:
-        config = default_config("test")
-
-        config.update(
-            {
-                "spam_checker": [
-                    {
-                        "module": TestSpamCheckerLegacy.__module__
-                        + ".TestSpamCheckerLegacy",
-                        "config": {},
-                    }
-                ]
-            }
-        )
-
-        return config
-
-    def test_upload_innocent(self) -> None:
-        """Attempt to upload some innocent data that should be allowed."""
-        self.helper.upload_media(
-            self.upload_resource, SMALL_PNG, tok=self.tok, expect_code=200
-        )
-
-    def test_upload_ban(self) -> None:
-        """Attempt to upload some data that includes bytes "evil", which should
-        get rejected by the spam checker.
-        """
-
-        data = b"Some evil data"
-
-        self.helper.upload_media(
-            self.upload_resource, data, tok=self.tok, expect_code=400
-        )
-
-
-EVIL_DATA = b"Some evil data"
-EVIL_DATA_EXPERIMENT = b"Some evil data to trigger the experimental tuple API"
-
-
-class SpamCheckerTestCase(unittest.HomeserverTestCase):
-    servlets = [
-        login.register_servlets,
-        admin.register_servlets,
-    ]
-
-    def prepare(self, reactor: MemoryReactor, clock: Clock, hs: HomeServer) -> None:
-        self.user = self.register_user("user", "pass")
-        self.tok = self.login("user", "pass")
-
-        # Allow for uploading and downloading to/from the media repo
-        self.media_repo = hs.get_media_repository_resource()
-        self.download_resource = self.media_repo.children[b"download"]
-        self.upload_resource = self.media_repo.children[b"upload"]
-
-        hs.get_module_api().register_spam_checker_callbacks(
-            check_media_file_for_spam=self.check_media_file_for_spam
-        )
-
-    async def check_media_file_for_spam(
-        self, file_wrapper: ReadableFileWrapper, file_info: FileInfo
-    ) -> Union[Codes, Literal["NOT_SPAM"], Tuple[Codes, JsonDict]]:
-        buf = BytesIO()
-        await file_wrapper.write_chunks_to(buf.write)
-
-        if buf.getvalue() == EVIL_DATA:
-            return Codes.FORBIDDEN
-        elif buf.getvalue() == EVIL_DATA_EXPERIMENT:
-            return (Codes.FORBIDDEN, {})
-        else:
-            return "NOT_SPAM"
-
-    def test_upload_innocent(self) -> None:
-        """Attempt to upload some innocent data that should be allowed."""
-        self.helper.upload_media(
-            self.upload_resource, SMALL_PNG, tok=self.tok, expect_code=200
-        )
-
-    def test_upload_ban(self) -> None:
-        """Attempt to upload some data that includes bytes "evil", which should
-        get rejected by the spam checker.
-        """
-
-        self.helper.upload_media(
-            self.upload_resource, EVIL_DATA, tok=self.tok, expect_code=400
-        )
-
-        self.helper.upload_media(
-            self.upload_resource,
-            EVIL_DATA_EXPERIMENT,
-            tok=self.tok,
-            expect_code=400,
-        )
diff --git a/tests/rest/media/v1/test_oembed.py b/tests/rest/media/v1/test_oembed.py
deleted file mode 100644
index 3f7f1dbab9..0000000000
--- a/tests/rest/media/v1/test_oembed.py
+++ /dev/null
@@ -1,162 +0,0 @@
-#  Copyright 2021 The Matrix.org Foundation C.I.C.
-#
-#  Licensed under the Apache License, Version 2.0 (the "License");
-#  you may not use this file except in compliance with the License.
-#  You may obtain a copy of the License at
-#
-#      http://www.apache.org/licenses/LICENSE-2.0
-#
-#  Unless required by applicable law or agreed to in writing, software
-#  distributed under the License is distributed on an "AS IS" BASIS,
-#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#  See the License for the specific language governing permissions and
-#  limitations under the License.
-
-import json
-
-from parameterized import parameterized
-
-from twisted.test.proto_helpers import MemoryReactor
-
-from synapse.rest.media.v1.oembed import OEmbedProvider, OEmbedResult
-from synapse.server import HomeServer
-from synapse.types import JsonDict
-from synapse.util import Clock
-
-from tests.unittest import HomeserverTestCase
-
-try:
-    import lxml
-except ImportError:
-    lxml = None
-
-
-class OEmbedTests(HomeserverTestCase):
-    if not lxml:
-        skip = "url preview feature requires lxml"
-
-    def prepare(self, reactor: MemoryReactor, clock: Clock, hs: HomeServer) -> None:
-        self.oembed = OEmbedProvider(hs)
-
-    def parse_response(self, response: JsonDict) -> OEmbedResult:
-        return self.oembed.parse_oembed_response(
-            "https://test", json.dumps(response).encode("utf-8")
-        )
-
-    def test_version(self) -> None:
-        """Accept versions that are similar to 1.0 as a string or int (or missing)."""
-        for version in ("1.0", 1.0, 1):
-            result = self.parse_response({"version": version})
-            # An empty Open Graph response is an error, ensure the URL is included.
-            self.assertIn("og:url", result.open_graph_result)
-
-        # A missing version should be treated as 1.0.
-        result = self.parse_response({"type": "link"})
-        self.assertIn("og:url", result.open_graph_result)
-
-        # Invalid versions should be rejected.
-        for version in ("2.0", "1", 1.1, 0, None, {}, []):
-            result = self.parse_response({"version": version, "type": "link"})
-            # An empty Open Graph response is an error, ensure the URL is included.
-            self.assertEqual({}, result.open_graph_result)
-
-    def test_cache_age(self) -> None:
-        """Ensure a cache-age is parsed properly."""
-        # Correct-ish cache ages are allowed.
-        for cache_age in ("1", 1.0, 1):
-            result = self.parse_response({"cache_age": cache_age})
-            self.assertEqual(result.cache_age, 1000)
-
-        # Invalid cache ages are ignored.
-        for cache_age in ("invalid", {}):
-            result = self.parse_response({"cache_age": cache_age})
-            self.assertIsNone(result.cache_age)
-
-        # Cache age is optional.
-        result = self.parse_response({})
-        self.assertIsNone(result.cache_age)
-
-    @parameterized.expand(
-        [
-            ("title", "title"),
-            ("provider_name", "site_name"),
-            ("thumbnail_url", "image"),
-        ],
-        name_func=lambda func, num, p: f"{func.__name__}_{p.args[0]}",
-    )
-    def test_property(self, oembed_property: str, open_graph_property: str) -> None:
-        """Test properties which must be strings."""
-        result = self.parse_response({oembed_property: "test"})
-        self.assertIn(f"og:{open_graph_property}", result.open_graph_result)
-        self.assertEqual(result.open_graph_result[f"og:{open_graph_property}"], "test")
-
-        result = self.parse_response({oembed_property: 1})
-        self.assertNotIn(f"og:{open_graph_property}", result.open_graph_result)
-
-    def test_author_name(self) -> None:
-        """Test the author_name property."""
-        result = self.parse_response({"author_name": "test"})
-        self.assertEqual(result.author_name, "test")
-
-        result = self.parse_response({"author_name": 1})
-        self.assertIsNone(result.author_name)
-
-    def test_rich(self) -> None:
-        """Test a type of rich."""
-        result = self.parse_response({"html": "test<img src='foo'>", "type": "rich"})
-        self.assertIn("og:description", result.open_graph_result)
-        self.assertIn("og:image", result.open_graph_result)
-        self.assertEqual(result.open_graph_result["og:description"], "test")
-        self.assertEqual(result.open_graph_result["og:image"], "foo")
-
-        result = self.parse_response({"type": "rich"})
-        self.assertNotIn("og:description", result.open_graph_result)
-
-        result = self.parse_response({"html": 1, "type": "rich"})
-        self.assertNotIn("og:description", result.open_graph_result)
-
-    def test_photo(self) -> None:
-        """Test a type of photo."""
-        result = self.parse_response({"url": "test", "type": "photo"})
-        self.assertIn("og:image", result.open_graph_result)
-        self.assertEqual(result.open_graph_result["og:image"], "test")
-
-        result = self.parse_response({"type": "photo"})
-        self.assertNotIn("og:image", result.open_graph_result)
-
-        result = self.parse_response({"url": 1, "type": "photo"})
-        self.assertNotIn("og:image", result.open_graph_result)
-
-    def test_video(self) -> None:
-        """Test a type of video."""
-        result = self.parse_response({"html": "test", "type": "video"})
-        self.assertIn("og:type", result.open_graph_result)
-        self.assertEqual(result.open_graph_result["og:type"], "video.other")
-        self.assertIn("og:description", result.open_graph_result)
-        self.assertEqual(result.open_graph_result["og:description"], "test")
-
-        result = self.parse_response({"type": "video"})
-        self.assertIn("og:type", result.open_graph_result)
-        self.assertEqual(result.open_graph_result["og:type"], "video.other")
-        self.assertNotIn("og:description", result.open_graph_result)
-
-        result = self.parse_response({"url": 1, "type": "video"})
-        self.assertIn("og:type", result.open_graph_result)
-        self.assertEqual(result.open_graph_result["og:type"], "video.other")
-        self.assertNotIn("og:description", result.open_graph_result)
-
-    def test_link(self) -> None:
-        """Test type of link."""
-        result = self.parse_response({"type": "link"})
-        self.assertIn("og:type", result.open_graph_result)
-        self.assertEqual(result.open_graph_result["og:type"], "website")
-
-    def test_title_html_entities(self) -> None:
-        """Test HTML entities in title"""
-        result = self.parse_response(
-            {"title": "Why JSON isn&#8217;t a Good Configuration Language"}
-        )
-        self.assertEqual(
-            result.open_graph_result["og:title"],
-            "Why JSON isn’t a Good Configuration Language",
-        )
diff --git a/tests/rest/media/v1/test_url_preview.py b/tests/rest/media/v1/test_url_preview.py
deleted file mode 100644
index 2acfccec61..0000000000
--- a/tests/rest/media/v1/test_url_preview.py
+++ /dev/null
@@ -1,1234 +0,0 @@
-# Copyright 2018 New Vector Ltd
-# Copyright 2021 The Matrix.org Foundation C.I.C.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import base64
-import json
-import os
-import re
-from typing import Any, Dict, Optional, Sequence, Tuple, Type
-from urllib.parse import quote, urlencode
-
-from twisted.internet._resolver import HostResolution
-from twisted.internet.address import IPv4Address, IPv6Address
-from twisted.internet.error import DNSLookupError
-from twisted.internet.interfaces import IAddress, IResolutionReceiver
-from twisted.test.proto_helpers import AccumulatingProtocol, MemoryReactor
-
-from synapse.config.oembed import OEmbedEndpointConfig
-from synapse.rest.media.v1.media_repository import MediaRepositoryResource
-from synapse.rest.media.v1.preview_url_resource import IMAGE_CACHE_EXPIRY_MS
-from synapse.server import HomeServer
-from synapse.types import JsonDict
-from synapse.util import Clock
-from synapse.util.stringutils import parse_and_validate_mxc_uri
-
-from tests import unittest
-from tests.server import FakeTransport
-from tests.test_utils import SMALL_PNG
-from tests.utils import MockClock
-
-try:
-    import lxml
-except ImportError:
-    lxml = None
-
-
-class URLPreviewTests(unittest.HomeserverTestCase):
-    if not lxml:
-        skip = "url preview feature requires lxml"
-
-    hijack_auth = True
-    user_id = "@test:user"
-    end_content = (
-        b"<html><head>"
-        b'<meta property="og:title" content="~matrix~" />'
-        b'<meta property="og:description" content="hi" />'
-        b"</head></html>"
-    )
-
-    def make_homeserver(self, reactor: MemoryReactor, clock: Clock) -> HomeServer:
-        config = self.default_config()
-        config["url_preview_enabled"] = True
-        config["max_spider_size"] = 9999999
-        config["url_preview_ip_range_blacklist"] = (
-            "192.168.1.1",
-            "1.0.0.0/8",
-            "3fff:ffff:ffff:ffff:ffff:ffff:ffff:ffff",
-            "2001:800::/21",
-        )
-        config["url_preview_ip_range_whitelist"] = ("1.1.1.1",)
-        config["url_preview_accept_language"] = [
-            "en-UK",
-            "en-US;q=0.9",
-            "fr;q=0.8",
-            "*;q=0.7",
-        ]
-
-        self.storage_path = self.mktemp()
-        self.media_store_path = self.mktemp()
-        os.mkdir(self.storage_path)
-        os.mkdir(self.media_store_path)
-        config["media_store_path"] = self.media_store_path
-
-        provider_config = {
-            "module": "synapse.rest.media.v1.storage_provider.FileStorageProviderBackend",
-            "store_local": True,
-            "store_synchronous": False,
-            "store_remote": True,
-            "config": {"directory": self.storage_path},
-        }
-
-        config["media_storage_providers"] = [provider_config]
-
-        hs = self.setup_test_homeserver(config=config)
-
-        # After the hs is created, modify the parsed oEmbed config (to avoid
-        # messing with files).
-        #
-        # Note that HTTP URLs are used to avoid having to deal with TLS in tests.
-        hs.config.oembed.oembed_patterns = [
-            OEmbedEndpointConfig(
-                api_endpoint="http://publish.twitter.com/oembed",
-                url_patterns=[
-                    re.compile(r"http://twitter\.com/.+/status/.+"),
-                ],
-                formats=None,
-            ),
-            OEmbedEndpointConfig(
-                api_endpoint="http://www.hulu.com/api/oembed.{format}",
-                url_patterns=[
-                    re.compile(r"http://www\.hulu\.com/watch/.+"),
-                ],
-                formats=["json"],
-            ),
-        ]
-
-        return hs
-
-    def prepare(self, reactor: MemoryReactor, clock: Clock, hs: HomeServer) -> None:
-        self.media_repo = hs.get_media_repository_resource()
-        self.preview_url = self.media_repo.children[b"preview_url"]
-
-        self.lookups: Dict[str, Any] = {}
-
-        class Resolver:
-            def resolveHostName(
-                _self,
-                resolutionReceiver: IResolutionReceiver,
-                hostName: str,
-                portNumber: int = 0,
-                addressTypes: Optional[Sequence[Type[IAddress]]] = None,
-                transportSemantics: str = "TCP",
-            ) -> IResolutionReceiver:
-                resolution = HostResolution(hostName)
-                resolutionReceiver.resolutionBegan(resolution)
-                if hostName not in self.lookups:
-                    raise DNSLookupError("OH NO")
-
-                for i in self.lookups[hostName]:
-                    resolutionReceiver.addressResolved(i[0]("TCP", i[1], portNumber))
-                resolutionReceiver.resolutionComplete()
-                return resolutionReceiver
-
-        self.reactor.nameResolver = Resolver()  # type: ignore[assignment]
-
-    def create_test_resource(self) -> MediaRepositoryResource:
-        return self.hs.get_media_repository_resource()
-
-    def _assert_small_png(self, json_body: JsonDict) -> None:
-        """Assert properties from the SMALL_PNG test image."""
-        self.assertTrue(json_body["og:image"].startswith("mxc://"))
-        self.assertEqual(json_body["og:image:height"], 1)
-        self.assertEqual(json_body["og:image:width"], 1)
-        self.assertEqual(json_body["og:image:type"], "image/png")
-        self.assertEqual(json_body["matrix:image:size"], 67)
-
-    def test_cache_returns_correct_type(self) -> None:
-        self.lookups["matrix.org"] = [(IPv4Address, "10.1.2.3")]
-
-        channel = self.make_request(
-            "GET",
-            "preview_url?url=http://matrix.org",
-            shorthand=False,
-            await_result=False,
-        )
-        self.pump()
-
-        client = self.reactor.tcpClients[0][2].buildProtocol(None)
-        server = AccumulatingProtocol()
-        server.makeConnection(FakeTransport(client, self.reactor))
-        client.makeConnection(FakeTransport(server, self.reactor))
-        client.dataReceived(
-            b"HTTP/1.0 200 OK\r\nContent-Length: %d\r\nContent-Type: text/html\r\n\r\n"
-            % (len(self.end_content),)
-            + self.end_content
-        )
-
-        self.pump()
-        self.assertEqual(channel.code, 200)
-        self.assertEqual(
-            channel.json_body, {"og:title": "~matrix~", "og:description": "hi"}
-        )
-
-        # Check the cache returns the correct response
-        channel = self.make_request(
-            "GET", "preview_url?url=http://matrix.org", shorthand=False
-        )
-
-        # Check the cache response has the same content
-        self.assertEqual(channel.code, 200)
-        self.assertEqual(
-            channel.json_body, {"og:title": "~matrix~", "og:description": "hi"}
-        )
-
-        # Clear the in-memory cache
-        self.assertIn("http://matrix.org", self.preview_url._cache)
-        self.preview_url._cache.pop("http://matrix.org")
-        self.assertNotIn("http://matrix.org", self.preview_url._cache)
-
-        # Check the database cache returns the correct response
-        channel = self.make_request(
-            "GET", "preview_url?url=http://matrix.org", shorthand=False
-        )
-
-        # Check the cache response has the same content
-        self.assertEqual(channel.code, 200)
-        self.assertEqual(
-            channel.json_body, {"og:title": "~matrix~", "og:description": "hi"}
-        )
-
-    def test_non_ascii_preview_httpequiv(self) -> None:
-        self.lookups["matrix.org"] = [(IPv4Address, "10.1.2.3")]
-
-        end_content = (
-            b"<html><head>"
-            b'<meta http-equiv="Content-Type" content="text/html; charset=windows-1251"/>'
-            b'<meta property="og:title" content="\xe4\xea\xe0" />'
-            b'<meta property="og:description" content="hi" />'
-            b"</head></html>"
-        )
-
-        channel = self.make_request(
-            "GET",
-            "preview_url?url=http://matrix.org",
-            shorthand=False,
-            await_result=False,
-        )
-        self.pump()
-
-        client = self.reactor.tcpClients[0][2].buildProtocol(None)
-        server = AccumulatingProtocol()
-        server.makeConnection(FakeTransport(client, self.reactor))
-        client.makeConnection(FakeTransport(server, self.reactor))
-        client.dataReceived(
-            (
-                b"HTTP/1.0 200 OK\r\nContent-Length: %d\r\n"
-                b'Content-Type: text/html; charset="utf8"\r\n\r\n'
-            )
-            % (len(end_content),)
-            + end_content
-        )
-
-        self.pump()
-        self.assertEqual(channel.code, 200)
-        self.assertEqual(channel.json_body["og:title"], "\u0434\u043a\u0430")
-
-    def test_video_rejected(self) -> None:
-        self.lookups["matrix.org"] = [(IPv4Address, "10.1.2.3")]
-
-        end_content = b"anything"
-
-        channel = self.make_request(
-            "GET",
-            "preview_url?url=http://matrix.org",
-            shorthand=False,
-            await_result=False,
-        )
-        self.pump()
-
-        client = self.reactor.tcpClients[0][2].buildProtocol(None)
-        server = AccumulatingProtocol()
-        server.makeConnection(FakeTransport(client, self.reactor))
-        client.makeConnection(FakeTransport(server, self.reactor))
-        client.dataReceived(
-            (
-                b"HTTP/1.0 200 OK\r\nContent-Length: %d\r\n"
-                b"Content-Type: video/mp4\r\n\r\n"
-            )
-            % (len(end_content))
-            + end_content
-        )
-
-        self.pump()
-        self.assertEqual(channel.code, 502)
-        self.assertEqual(
-            channel.json_body,
-            {
-                "errcode": "M_UNKNOWN",
-                "error": "Requested file's content type not allowed for this operation: video/mp4",
-            },
-        )
-
-    def test_audio_rejected(self) -> None:
-        self.lookups["matrix.org"] = [(IPv4Address, "10.1.2.3")]
-
-        end_content = b"anything"
-
-        channel = self.make_request(
-            "GET",
-            "preview_url?url=http://matrix.org",
-            shorthand=False,
-            await_result=False,
-        )
-        self.pump()
-
-        client = self.reactor.tcpClients[0][2].buildProtocol(None)
-        server = AccumulatingProtocol()
-        server.makeConnection(FakeTransport(client, self.reactor))
-        client.makeConnection(FakeTransport(server, self.reactor))
-        client.dataReceived(
-            (
-                b"HTTP/1.0 200 OK\r\nContent-Length: %d\r\n"
-                b"Content-Type: audio/aac\r\n\r\n"
-            )
-            % (len(end_content))
-            + end_content
-        )
-
-        self.pump()
-        self.assertEqual(channel.code, 502)
-        self.assertEqual(
-            channel.json_body,
-            {
-                "errcode": "M_UNKNOWN",
-                "error": "Requested file's content type not allowed for this operation: audio/aac",
-            },
-        )
-
-    def test_non_ascii_preview_content_type(self) -> None:
-        self.lookups["matrix.org"] = [(IPv4Address, "10.1.2.3")]
-
-        end_content = (
-            b"<html><head>"
-            b'<meta property="og:title" content="\xe4\xea\xe0" />'
-            b'<meta property="og:description" content="hi" />'
-            b"</head></html>"
-        )
-
-        channel = self.make_request(
-            "GET",
-            "preview_url?url=http://matrix.org",
-            shorthand=False,
-            await_result=False,
-        )
-        self.pump()
-
-        client = self.reactor.tcpClients[0][2].buildProtocol(None)
-        server = AccumulatingProtocol()
-        server.makeConnection(FakeTransport(client, self.reactor))
-        client.makeConnection(FakeTransport(server, self.reactor))
-        client.dataReceived(
-            (
-                b"HTTP/1.0 200 OK\r\nContent-Length: %d\r\n"
-                b'Content-Type: text/html; charset="windows-1251"\r\n\r\n'
-            )
-            % (len(end_content),)
-            + end_content
-        )
-
-        self.pump()
-        self.assertEqual(channel.code, 200)
-        self.assertEqual(channel.json_body["og:title"], "\u0434\u043a\u0430")
-
-    def test_overlong_title(self) -> None:
-        self.lookups["matrix.org"] = [(IPv4Address, "10.1.2.3")]
-
-        end_content = (
-            b"<html><head>"
-            b"<title>" + b"x" * 2000 + b"</title>"
-            b'<meta property="og:description" content="hi" />'
-            b"</head></html>"
-        )
-
-        channel = self.make_request(
-            "GET",
-            "preview_url?url=http://matrix.org",
-            shorthand=False,
-            await_result=False,
-        )
-        self.pump()
-
-        client = self.reactor.tcpClients[0][2].buildProtocol(None)
-        server = AccumulatingProtocol()
-        server.makeConnection(FakeTransport(client, self.reactor))
-        client.makeConnection(FakeTransport(server, self.reactor))
-        client.dataReceived(
-            (
-                b"HTTP/1.0 200 OK\r\nContent-Length: %d\r\n"
-                b'Content-Type: text/html; charset="windows-1251"\r\n\r\n'
-            )
-            % (len(end_content),)
-            + end_content
-        )
-
-        self.pump()
-        self.assertEqual(channel.code, 200)
-        res = channel.json_body
-        # We should only see the `og:description` field, as `title` is too long and should be stripped out
-        self.assertCountEqual(["og:description"], res.keys())
-
-    def test_ipaddr(self) -> None:
-        """
-        IP addresses can be previewed directly.
-        """
-        self.lookups["example.com"] = [(IPv4Address, "10.1.2.3")]
-
-        channel = self.make_request(
-            "GET",
-            "preview_url?url=http://example.com",
-            shorthand=False,
-            await_result=False,
-        )
-        self.pump()
-
-        client = self.reactor.tcpClients[0][2].buildProtocol(None)
-        server = AccumulatingProtocol()
-        server.makeConnection(FakeTransport(client, self.reactor))
-        client.makeConnection(FakeTransport(server, self.reactor))
-        client.dataReceived(
-            b"HTTP/1.0 200 OK\r\nContent-Length: %d\r\nContent-Type: text/html\r\n\r\n"
-            % (len(self.end_content),)
-            + self.end_content
-        )
-
-        self.pump()
-        self.assertEqual(channel.code, 200)
-        self.assertEqual(
-            channel.json_body, {"og:title": "~matrix~", "og:description": "hi"}
-        )
-
-    def test_blacklisted_ip_specific(self) -> None:
-        """
-        Blacklisted IP addresses, found via DNS, are not spidered.
-        """
-        self.lookups["example.com"] = [(IPv4Address, "192.168.1.1")]
-
-        channel = self.make_request(
-            "GET", "preview_url?url=http://example.com", shorthand=False
-        )
-
-        # No requests made.
-        self.assertEqual(len(self.reactor.tcpClients), 0)
-        self.assertEqual(channel.code, 502)
-        self.assertEqual(
-            channel.json_body,
-            {
-                "errcode": "M_UNKNOWN",
-                "error": "DNS resolution failure during URL preview generation",
-            },
-        )
-
-    def test_blacklisted_ip_range(self) -> None:
-        """
-        Blacklisted IP ranges, IPs found over DNS, are not spidered.
-        """
-        self.lookups["example.com"] = [(IPv4Address, "1.1.1.2")]
-
-        channel = self.make_request(
-            "GET", "preview_url?url=http://example.com", shorthand=False
-        )
-
-        self.assertEqual(channel.code, 502)
-        self.assertEqual(
-            channel.json_body,
-            {
-                "errcode": "M_UNKNOWN",
-                "error": "DNS resolution failure during URL preview generation",
-            },
-        )
-
-    def test_blacklisted_ip_specific_direct(self) -> None:
-        """
-        Blacklisted IP addresses, accessed directly, are not spidered.
-        """
-        channel = self.make_request(
-            "GET", "preview_url?url=http://192.168.1.1", shorthand=False
-        )
-
-        # No requests made.
-        self.assertEqual(len(self.reactor.tcpClients), 0)
-        self.assertEqual(
-            channel.json_body,
-            {
-                "errcode": "M_UNKNOWN",
-                "error": "IP address blocked by IP blacklist entry",
-            },
-        )
-        self.assertEqual(channel.code, 403)
-
-    def test_blacklisted_ip_range_direct(self) -> None:
-        """
-        Blacklisted IP ranges, accessed directly, are not spidered.
-        """
-        channel = self.make_request(
-            "GET", "preview_url?url=http://1.1.1.2", shorthand=False
-        )
-
-        self.assertEqual(channel.code, 403)
-        self.assertEqual(
-            channel.json_body,
-            {
-                "errcode": "M_UNKNOWN",
-                "error": "IP address blocked by IP blacklist entry",
-            },
-        )
-
-    def test_blacklisted_ip_range_whitelisted_ip(self) -> None:
-        """
-        Blacklisted but then subsequently whitelisted IP addresses can be
-        spidered.
-        """
-        self.lookups["example.com"] = [(IPv4Address, "1.1.1.1")]
-
-        channel = self.make_request(
-            "GET",
-            "preview_url?url=http://example.com",
-            shorthand=False,
-            await_result=False,
-        )
-        self.pump()
-
-        client = self.reactor.tcpClients[0][2].buildProtocol(None)
-
-        server = AccumulatingProtocol()
-        server.makeConnection(FakeTransport(client, self.reactor))
-        client.makeConnection(FakeTransport(server, self.reactor))
-
-        client.dataReceived(
-            b"HTTP/1.0 200 OK\r\nContent-Length: %d\r\nContent-Type: text/html\r\n\r\n"
-            % (len(self.end_content),)
-            + self.end_content
-        )
-
-        self.pump()
-        self.assertEqual(channel.code, 200)
-        self.assertEqual(
-            channel.json_body, {"og:title": "~matrix~", "og:description": "hi"}
-        )
-
-    def test_blacklisted_ip_with_external_ip(self) -> None:
-        """
-        If a hostname resolves a blacklisted IP, even if there's a
-        non-blacklisted one, it will be rejected.
-        """
-        # Hardcode the URL resolving to the IP we want.
-        self.lookups["example.com"] = [
-            (IPv4Address, "1.1.1.2"),
-            (IPv4Address, "10.1.2.3"),
-        ]
-
-        channel = self.make_request(
-            "GET", "preview_url?url=http://example.com", shorthand=False
-        )
-        self.assertEqual(channel.code, 502)
-        self.assertEqual(
-            channel.json_body,
-            {
-                "errcode": "M_UNKNOWN",
-                "error": "DNS resolution failure during URL preview generation",
-            },
-        )
-
-    def test_blacklisted_ipv6_specific(self) -> None:
-        """
-        Blacklisted IP addresses, found via DNS, are not spidered.
-        """
-        self.lookups["example.com"] = [
-            (IPv6Address, "3fff:ffff:ffff:ffff:ffff:ffff:ffff:ffff")
-        ]
-
-        channel = self.make_request(
-            "GET", "preview_url?url=http://example.com", shorthand=False
-        )
-
-        # No requests made.
-        self.assertEqual(len(self.reactor.tcpClients), 0)
-        self.assertEqual(channel.code, 502)
-        self.assertEqual(
-            channel.json_body,
-            {
-                "errcode": "M_UNKNOWN",
-                "error": "DNS resolution failure during URL preview generation",
-            },
-        )
-
-    def test_blacklisted_ipv6_range(self) -> None:
-        """
-        Blacklisted IP ranges, IPs found over DNS, are not spidered.
-        """
-        self.lookups["example.com"] = [(IPv6Address, "2001:800::1")]
-
-        channel = self.make_request(
-            "GET", "preview_url?url=http://example.com", shorthand=False
-        )
-
-        self.assertEqual(channel.code, 502)
-        self.assertEqual(
-            channel.json_body,
-            {
-                "errcode": "M_UNKNOWN",
-                "error": "DNS resolution failure during URL preview generation",
-            },
-        )
-
-    def test_OPTIONS(self) -> None:
-        """
-        OPTIONS returns the OPTIONS.
-        """
-        channel = self.make_request(
-            "OPTIONS", "preview_url?url=http://example.com", shorthand=False
-        )
-        self.assertEqual(channel.code, 200)
-        self.assertEqual(channel.json_body, {})
-
-    def test_accept_language_config_option(self) -> None:
-        """
-        Accept-Language header is sent to the remote server
-        """
-        self.lookups["example.com"] = [(IPv4Address, "10.1.2.3")]
-
-        # Build and make a request to the server
-        channel = self.make_request(
-            "GET",
-            "preview_url?url=http://example.com",
-            shorthand=False,
-            await_result=False,
-        )
-        self.pump()
-
-        # Extract Synapse's tcp client
-        client = self.reactor.tcpClients[0][2].buildProtocol(None)
-
-        # Build a fake remote server to reply with
-        server = AccumulatingProtocol()
-
-        # Connect the two together
-        server.makeConnection(FakeTransport(client, self.reactor))
-        client.makeConnection(FakeTransport(server, self.reactor))
-
-        # Tell Synapse that it has received some data from the remote server
-        client.dataReceived(
-            b"HTTP/1.0 200 OK\r\nContent-Length: %d\r\nContent-Type: text/html\r\n\r\n"
-            % (len(self.end_content),)
-            + self.end_content
-        )
-
-        # Move the reactor along until we get a response on our original channel
-        self.pump()
-        self.assertEqual(channel.code, 200)
-        self.assertEqual(
-            channel.json_body, {"og:title": "~matrix~", "og:description": "hi"}
-        )
-
-        # Check that the server received the Accept-Language header as part
-        # of the request from Synapse
-        self.assertIn(
-            (
-                b"Accept-Language: en-UK\r\n"
-                b"Accept-Language: en-US;q=0.9\r\n"
-                b"Accept-Language: fr;q=0.8\r\n"
-                b"Accept-Language: *;q=0.7"
-            ),
-            server.data,
-        )
-
-    def test_nonexistent_image(self) -> None:
-        """If the preview image doesn't exist, ensure some data is returned."""
-        self.lookups["matrix.org"] = [(IPv4Address, "10.1.2.3")]
-
-        result = (
-            b"""<html><body><img src="http://cdn.matrix.org/foo.jpg"></body></html>"""
-        )
-
-        channel = self.make_request(
-            "GET",
-            "preview_url?url=http://matrix.org",
-            shorthand=False,
-            await_result=False,
-        )
-        self.pump()
-
-        client = self.reactor.tcpClients[0][2].buildProtocol(None)
-        server = AccumulatingProtocol()
-        server.makeConnection(FakeTransport(client, self.reactor))
-        client.makeConnection(FakeTransport(server, self.reactor))
-        client.dataReceived(
-            (
-                b"HTTP/1.0 200 OK\r\nContent-Length: %d\r\n"
-                b'Content-Type: text/html; charset="utf8"\r\n\r\n'
-            )
-            % (len(result),)
-            + result
-        )
-
-        self.pump()
-        self.assertEqual(channel.code, 200)
-
-        # The image should not be in the result.
-        self.assertNotIn("og:image", channel.json_body)
-
-    def test_oembed_failure(self) -> None:
-        """If the autodiscovered oEmbed URL fails, ensure some data is returned."""
-        self.lookups["matrix.org"] = [(IPv4Address, "10.1.2.3")]
-
-        result = b"""
-        <title>oEmbed Autodiscovery Fail</title>
-        <link rel="alternate" type="application/json+oembed"
-            href="http://example.com/oembed?url=http%3A%2F%2Fmatrix.org&format=json"
-            title="matrixdotorg" />
-        """
-
-        channel = self.make_request(
-            "GET",
-            "preview_url?url=http://matrix.org",
-            shorthand=False,
-            await_result=False,
-        )
-        self.pump()
-
-        client = self.reactor.tcpClients[0][2].buildProtocol(None)
-        server = AccumulatingProtocol()
-        server.makeConnection(FakeTransport(client, self.reactor))
-        client.makeConnection(FakeTransport(server, self.reactor))
-        client.dataReceived(
-            (
-                b"HTTP/1.0 200 OK\r\nContent-Length: %d\r\n"
-                b'Content-Type: text/html; charset="utf8"\r\n\r\n'
-            )
-            % (len(result),)
-            + result
-        )
-
-        self.pump()
-        self.assertEqual(channel.code, 200)
-
-        # The image should not be in the result.
-        self.assertEqual(channel.json_body["og:title"], "oEmbed Autodiscovery Fail")
-
-    def test_data_url(self) -> None:
-        """
-        Requesting to preview a data URL is not supported.
-        """
-        self.lookups["matrix.org"] = [(IPv4Address, "10.1.2.3")]
-
-        data = base64.b64encode(SMALL_PNG).decode()
-
-        query_params = urlencode(
-            {
-                "url": f'<html><head><img src="data:image/png;base64,{data}" /></head></html>'
-            }
-        )
-
-        channel = self.make_request(
-            "GET",
-            f"preview_url?{query_params}",
-            shorthand=False,
-        )
-        self.pump()
-
-        self.assertEqual(channel.code, 500)
-
-    def test_inline_data_url(self) -> None:
-        """
-        An inline image (as a data URL) should be parsed properly.
-        """
-        self.lookups["matrix.org"] = [(IPv4Address, "10.1.2.3")]
-
-        data = base64.b64encode(SMALL_PNG)
-
-        end_content = (
-            b"<html><head>" b'<img src="data:image/png;base64,%s" />' b"</head></html>"
-        ) % (data,)
-
-        channel = self.make_request(
-            "GET",
-            "preview_url?url=http://matrix.org",
-            shorthand=False,
-            await_result=False,
-        )
-        self.pump()
-
-        client = self.reactor.tcpClients[0][2].buildProtocol(None)
-        server = AccumulatingProtocol()
-        server.makeConnection(FakeTransport(client, self.reactor))
-        client.makeConnection(FakeTransport(server, self.reactor))
-        client.dataReceived(
-            (
-                b"HTTP/1.0 200 OK\r\nContent-Length: %d\r\n"
-                b'Content-Type: text/html; charset="utf8"\r\n\r\n'
-            )
-            % (len(end_content),)
-            + end_content
-        )
-
-        self.pump()
-        self.assertEqual(channel.code, 200)
-        self._assert_small_png(channel.json_body)
-
-    def test_oembed_photo(self) -> None:
-        """Test an oEmbed endpoint which returns a 'photo' type which redirects the preview to a new URL."""
-        self.lookups["publish.twitter.com"] = [(IPv4Address, "10.1.2.3")]
-        self.lookups["cdn.twitter.com"] = [(IPv4Address, "10.1.2.3")]
-
-        result = {
-            "version": "1.0",
-            "type": "photo",
-            "url": "http://cdn.twitter.com/matrixdotorg",
-        }
-        oembed_content = json.dumps(result).encode("utf-8")
-
-        channel = self.make_request(
-            "GET",
-            "preview_url?url=http://twitter.com/matrixdotorg/status/12345",
-            shorthand=False,
-            await_result=False,
-        )
-        self.pump()
-
-        client = self.reactor.tcpClients[0][2].buildProtocol(None)
-        server = AccumulatingProtocol()
-        server.makeConnection(FakeTransport(client, self.reactor))
-        client.makeConnection(FakeTransport(server, self.reactor))
-        client.dataReceived(
-            (
-                b"HTTP/1.0 200 OK\r\nContent-Length: %d\r\n"
-                b'Content-Type: application/json; charset="utf8"\r\n\r\n'
-            )
-            % (len(oembed_content),)
-            + oembed_content
-        )
-
-        self.pump()
-
-        # Ensure a second request is made to the photo URL.
-        client = self.reactor.tcpClients[1][2].buildProtocol(None)
-        server = AccumulatingProtocol()
-        server.makeConnection(FakeTransport(client, self.reactor))
-        client.makeConnection(FakeTransport(server, self.reactor))
-        client.dataReceived(
-            (
-                b"HTTP/1.0 200 OK\r\nContent-Length: %d\r\n"
-                b"Content-Type: image/png\r\n\r\n"
-            )
-            % (len(SMALL_PNG),)
-            + SMALL_PNG
-        )
-
-        self.pump()
-
-        # Ensure the URL is what was requested.
-        self.assertIn(b"/matrixdotorg", server.data)
-
-        self.assertEqual(channel.code, 200)
-        body = channel.json_body
-        self.assertEqual(body["og:url"], "http://twitter.com/matrixdotorg/status/12345")
-        self._assert_small_png(body)
-
-    def test_oembed_rich(self) -> None:
-        """Test an oEmbed endpoint which returns HTML content via the 'rich' type."""
-        self.lookups["publish.twitter.com"] = [(IPv4Address, "10.1.2.3")]
-
-        result = {
-            "version": "1.0",
-            "type": "rich",
-            # Note that this provides the author, not the title.
-            "author_name": "Alice",
-            "html": "<div>Content Preview</div>",
-        }
-        end_content = json.dumps(result).encode("utf-8")
-
-        channel = self.make_request(
-            "GET",
-            "preview_url?url=http://twitter.com/matrixdotorg/status/12345",
-            shorthand=False,
-            await_result=False,
-        )
-        self.pump()
-
-        client = self.reactor.tcpClients[0][2].buildProtocol(None)
-        server = AccumulatingProtocol()
-        server.makeConnection(FakeTransport(client, self.reactor))
-        client.makeConnection(FakeTransport(server, self.reactor))
-        client.dataReceived(
-            (
-                b"HTTP/1.0 200 OK\r\nContent-Length: %d\r\n"
-                b'Content-Type: application/json; charset="utf8"\r\n\r\n'
-            )
-            % (len(end_content),)
-            + end_content
-        )
-
-        self.pump()
-        self.assertEqual(channel.code, 200)
-        body = channel.json_body
-        self.assertEqual(
-            body,
-            {
-                "og:url": "http://twitter.com/matrixdotorg/status/12345",
-                "og:title": "Alice",
-                "og:description": "Content Preview",
-            },
-        )
-
-    def test_oembed_format(self) -> None:
-        """Test an oEmbed endpoint which requires the format in the URL."""
-        self.lookups["www.hulu.com"] = [(IPv4Address, "10.1.2.3")]
-
-        result = {
-            "version": "1.0",
-            "type": "rich",
-            "html": "<div>Content Preview</div>",
-        }
-        end_content = json.dumps(result).encode("utf-8")
-
-        channel = self.make_request(
-            "GET",
-            "preview_url?url=http://www.hulu.com/watch/12345",
-            shorthand=False,
-            await_result=False,
-        )
-        self.pump()
-
-        client = self.reactor.tcpClients[0][2].buildProtocol(None)
-        server = AccumulatingProtocol()
-        server.makeConnection(FakeTransport(client, self.reactor))
-        client.makeConnection(FakeTransport(server, self.reactor))
-        client.dataReceived(
-            (
-                b"HTTP/1.0 200 OK\r\nContent-Length: %d\r\n"
-                b'Content-Type: application/json; charset="utf8"\r\n\r\n'
-            )
-            % (len(end_content),)
-            + end_content
-        )
-
-        self.pump()
-
-        # The {format} should have been turned into json.
-        self.assertIn(b"/api/oembed.json", server.data)
-        # A URL parameter of format=json should be provided.
-        self.assertIn(b"format=json", server.data)
-
-        self.assertEqual(channel.code, 200)
-        body = channel.json_body
-        self.assertEqual(
-            body,
-            {
-                "og:url": "http://www.hulu.com/watch/12345",
-                "og:description": "Content Preview",
-            },
-        )
-
-    def test_oembed_autodiscovery(self) -> None:
-        """
-        Autodiscovery works by finding the link in the HTML response and then requesting an oEmbed URL.
-        1. Request a preview of a URL which is not known to the oEmbed code.
-        2. It returns HTML including a link to an oEmbed preview.
-        3. The oEmbed preview is requested and returns a URL for an image.
-        4. The image is requested for thumbnailing.
-        """
-        # This is a little cheesy in that we use the www subdomain (which isn't the
-        # list of oEmbed patterns) to get "raw" HTML response.
-        self.lookups["www.twitter.com"] = [(IPv4Address, "10.1.2.3")]
-        self.lookups["publish.twitter.com"] = [(IPv4Address, "10.1.2.3")]
-        self.lookups["cdn.twitter.com"] = [(IPv4Address, "10.1.2.3")]
-
-        result = b"""
-        <link rel="alternate" type="application/json+oembed"
-            href="http://publish.twitter.com/oembed?url=http%3A%2F%2Fcdn.twitter.com%2Fmatrixdotorg%2Fstatus%2F12345&format=json"
-            title="matrixdotorg" />
-        """
-
-        channel = self.make_request(
-            "GET",
-            "preview_url?url=http://www.twitter.com/matrixdotorg/status/12345",
-            shorthand=False,
-            await_result=False,
-        )
-        self.pump()
-
-        client = self.reactor.tcpClients[0][2].buildProtocol(None)
-        server = AccumulatingProtocol()
-        server.makeConnection(FakeTransport(client, self.reactor))
-        client.makeConnection(FakeTransport(server, self.reactor))
-        client.dataReceived(
-            (
-                b"HTTP/1.0 200 OK\r\nContent-Length: %d\r\n"
-                b'Content-Type: text/html; charset="utf8"\r\n\r\n'
-            )
-            % (len(result),)
-            + result
-        )
-
-        self.pump()
-
-        # The oEmbed response.
-        result2 = {
-            "version": "1.0",
-            "type": "photo",
-            "url": "http://cdn.twitter.com/matrixdotorg",
-        }
-        oembed_content = json.dumps(result2).encode("utf-8")
-
-        # Ensure a second request is made to the oEmbed URL.
-        client = self.reactor.tcpClients[1][2].buildProtocol(None)
-        server = AccumulatingProtocol()
-        server.makeConnection(FakeTransport(client, self.reactor))
-        client.makeConnection(FakeTransport(server, self.reactor))
-        client.dataReceived(
-            (
-                b"HTTP/1.0 200 OK\r\nContent-Length: %d\r\n"
-                b'Content-Type: application/json; charset="utf8"\r\n\r\n'
-            )
-            % (len(oembed_content),)
-            + oembed_content
-        )
-
-        self.pump()
-
-        # Ensure the URL is what was requested.
-        self.assertIn(b"/oembed?", server.data)
-
-        # Ensure a third request is made to the photo URL.
-        client = self.reactor.tcpClients[2][2].buildProtocol(None)
-        server = AccumulatingProtocol()
-        server.makeConnection(FakeTransport(client, self.reactor))
-        client.makeConnection(FakeTransport(server, self.reactor))
-        client.dataReceived(
-            (
-                b"HTTP/1.0 200 OK\r\nContent-Length: %d\r\n"
-                b"Content-Type: image/png\r\n\r\n"
-            )
-            % (len(SMALL_PNG),)
-            + SMALL_PNG
-        )
-
-        self.pump()
-
-        # Ensure the URL is what was requested.
-        self.assertIn(b"/matrixdotorg", server.data)
-
-        self.assertEqual(channel.code, 200)
-        body = channel.json_body
-        self.assertEqual(
-            body["og:url"], "http://www.twitter.com/matrixdotorg/status/12345"
-        )
-        self._assert_small_png(body)
-
-    def _download_image(self) -> Tuple[str, str]:
-        """Downloads an image into the URL cache.
-        Returns:
-            A (host, media_id) tuple representing the MXC URI of the image.
-        """
-        self.lookups["cdn.twitter.com"] = [(IPv4Address, "10.1.2.3")]
-
-        channel = self.make_request(
-            "GET",
-            "preview_url?url=http://cdn.twitter.com/matrixdotorg",
-            shorthand=False,
-            await_result=False,
-        )
-        self.pump()
-
-        client = self.reactor.tcpClients[0][2].buildProtocol(None)
-        server = AccumulatingProtocol()
-        server.makeConnection(FakeTransport(client, self.reactor))
-        client.makeConnection(FakeTransport(server, self.reactor))
-        client.dataReceived(
-            b"HTTP/1.0 200 OK\r\nContent-Length: %d\r\nContent-Type: image/png\r\n\r\n"
-            % (len(SMALL_PNG),)
-            + SMALL_PNG
-        )
-
-        self.pump()
-        self.assertEqual(channel.code, 200)
-        body = channel.json_body
-        mxc_uri = body["og:image"]
-        host, _port, media_id = parse_and_validate_mxc_uri(mxc_uri)
-        self.assertIsNone(_port)
-        return host, media_id
-
-    def test_storage_providers_exclude_files(self) -> None:
-        """Test that files are not stored in or fetched from storage providers."""
-        host, media_id = self._download_image()
-
-        rel_file_path = self.preview_url.filepaths.url_cache_filepath_rel(media_id)
-        media_store_path = os.path.join(self.media_store_path, rel_file_path)
-        storage_provider_path = os.path.join(self.storage_path, rel_file_path)
-
-        # Check storage
-        self.assertTrue(os.path.isfile(media_store_path))
-        self.assertFalse(
-            os.path.isfile(storage_provider_path),
-            "URL cache file was unexpectedly stored in a storage provider",
-        )
-
-        # Check fetching
-        channel = self.make_request(
-            "GET",
-            f"download/{host}/{media_id}",
-            shorthand=False,
-            await_result=False,
-        )
-        self.pump()
-        self.assertEqual(channel.code, 200)
-
-        # Move cached file into the storage provider
-        os.makedirs(os.path.dirname(storage_provider_path), exist_ok=True)
-        os.rename(media_store_path, storage_provider_path)
-
-        channel = self.make_request(
-            "GET",
-            f"download/{host}/{media_id}",
-            shorthand=False,
-            await_result=False,
-        )
-        self.pump()
-        self.assertEqual(
-            channel.code,
-            404,
-            "URL cache file was unexpectedly retrieved from a storage provider",
-        )
-
-    def test_storage_providers_exclude_thumbnails(self) -> None:
-        """Test that thumbnails are not stored in or fetched from storage providers."""
-        host, media_id = self._download_image()
-
-        rel_thumbnail_path = (
-            self.preview_url.filepaths.url_cache_thumbnail_directory_rel(media_id)
-        )
-        media_store_thumbnail_path = os.path.join(
-            self.media_store_path, rel_thumbnail_path
-        )
-        storage_provider_thumbnail_path = os.path.join(
-            self.storage_path, rel_thumbnail_path
-        )
-
-        # Check storage
-        self.assertTrue(os.path.isdir(media_store_thumbnail_path))
-        self.assertFalse(
-            os.path.isdir(storage_provider_thumbnail_path),
-            "URL cache thumbnails were unexpectedly stored in a storage provider",
-        )
-
-        # Check fetching
-        channel = self.make_request(
-            "GET",
-            f"thumbnail/{host}/{media_id}?width=32&height=32&method=scale",
-            shorthand=False,
-            await_result=False,
-        )
-        self.pump()
-        self.assertEqual(channel.code, 200)
-
-        # Remove the original, otherwise thumbnails will regenerate
-        rel_file_path = self.preview_url.filepaths.url_cache_filepath_rel(media_id)
-        media_store_path = os.path.join(self.media_store_path, rel_file_path)
-        os.remove(media_store_path)
-
-        # Move cached thumbnails into the storage provider
-        os.makedirs(os.path.dirname(storage_provider_thumbnail_path), exist_ok=True)
-        os.rename(media_store_thumbnail_path, storage_provider_thumbnail_path)
-
-        channel = self.make_request(
-            "GET",
-            f"thumbnail/{host}/{media_id}?width=32&height=32&method=scale",
-            shorthand=False,
-            await_result=False,
-        )
-        self.pump()
-        self.assertEqual(
-            channel.code,
-            404,
-            "URL cache thumbnail was unexpectedly retrieved from a storage provider",
-        )
-
-    def test_cache_expiry(self) -> None:
-        """Test that URL cache files and thumbnails are cleaned up properly on expiry."""
-        self.preview_url.clock = MockClock()
-
-        _host, media_id = self._download_image()
-
-        file_path = self.preview_url.filepaths.url_cache_filepath(media_id)
-        file_dirs = self.preview_url.filepaths.url_cache_filepath_dirs_to_delete(
-            media_id
-        )
-        thumbnail_dir = self.preview_url.filepaths.url_cache_thumbnail_directory(
-            media_id
-        )
-        thumbnail_dirs = self.preview_url.filepaths.url_cache_thumbnail_dirs_to_delete(
-            media_id
-        )
-
-        self.assertTrue(os.path.isfile(file_path))
-        self.assertTrue(os.path.isdir(thumbnail_dir))
-
-        self.preview_url.clock.advance_time_msec(IMAGE_CACHE_EXPIRY_MS + 1)
-        self.get_success(self.preview_url._expire_url_cache_data())
-
-        for path in [file_path] + file_dirs + [thumbnail_dir] + thumbnail_dirs:
-            self.assertFalse(
-                os.path.exists(path),
-                f"{os.path.relpath(path, self.media_store_path)} was not deleted",
-            )
-
-    @unittest.override_config({"url_preview_url_blacklist": [{"port": "*"}]})
-    def test_blacklist_port(self) -> None:
-        """Tests that blacklisting URLs with a port makes previewing such URLs
-        fail with a 403 error and doesn't impact other previews.
-        """
-        self.lookups["matrix.org"] = [(IPv4Address, "10.1.2.3")]
-
-        bad_url = quote("http://matrix.org:8888/foo")
-        good_url = quote("http://matrix.org/foo")
-
-        channel = self.make_request(
-            "GET",
-            "preview_url?url=" + bad_url,
-            shorthand=False,
-            await_result=False,
-        )
-        self.pump()
-        self.assertEqual(channel.code, 403, channel.result)
-
-        channel = self.make_request(
-            "GET",
-            "preview_url?url=" + good_url,
-            shorthand=False,
-            await_result=False,
-        )
-        self.pump()
-
-        client = self.reactor.tcpClients[0][2].buildProtocol(None)
-        server = AccumulatingProtocol()
-        server.makeConnection(FakeTransport(client, self.reactor))
-        client.makeConnection(FakeTransport(server, self.reactor))
-        client.dataReceived(
-            b"HTTP/1.0 200 OK\r\nContent-Length: %d\r\nContent-Type: text/html\r\n\r\n"
-            % (len(self.end_content),)
-            + self.end_content
-        )
-
-        self.pump()
-        self.assertEqual(channel.code, 200)
-- 
cgit 1.5.1


From b40657314e03583f45ad49504711698a70735313 Mon Sep 17 00:00:00 2001
From: Andrew Morgan <1342360+anoadragon453@users.noreply.github.com>
Date: Mon, 27 Feb 2023 14:19:19 +0000
Subject: Add module API callbacks for adding and deleting local 3PID
 associations (#15044)

---
 changelog.d/15044.feature                      |   1 +
 docs/modules/third_party_rules_callbacks.md    |  45 ++++++++-
 docs/upgrade.md                                |  24 +++++
 synapse/events/third_party_rules.py            |  63 +++++++++++++
 synapse/handlers/auth.py                       |  49 ++++++----
 synapse/handlers/deactivate_account.py         |  20 ++--
 synapse/module_api/__init__.py                 |  10 ++
 synapse/rest/admin/users.py                    |  11 ++-
 synapse/rest/client/account.py                 |   9 +-
 synapse/storage/databases/main/registration.py |  13 ---
 tests/push/test_email.py                       |   6 +-
 tests/rest/client/test_third_party_rules.py    | 121 +++++++++++++++++++++++++
 12 files changed, 324 insertions(+), 48 deletions(-)
 create mode 100644 changelog.d/15044.feature

(limited to 'tests/rest')

diff --git a/changelog.d/15044.feature b/changelog.d/15044.feature
new file mode 100644
index 0000000000..91e5cda8c3
--- /dev/null
+++ b/changelog.d/15044.feature
@@ -0,0 +1 @@
+Add two new Third Party Rules module API callbacks: [`on_add_user_third_party_identifier`](https://matrix-org.github.io/synapse/v1.79/modules/third_party_rules_callbacks.html#on_add_user_third_party_identifier) and [`on_remove_user_third_party_identifier`](https://matrix-org.github.io/synapse/v1.79/modules/third_party_rules_callbacks.html#on_remove_user_third_party_identifier).
\ No newline at end of file
diff --git a/docs/modules/third_party_rules_callbacks.md b/docs/modules/third_party_rules_callbacks.md
index 888e43bd10..4a27d976fb 100644
--- a/docs/modules/third_party_rules_callbacks.md
+++ b/docs/modules/third_party_rules_callbacks.md
@@ -254,6 +254,11 @@ If multiple modules implement this callback, Synapse runs them all in order.
 
 _First introduced in Synapse v1.56.0_
 
+**<span style="color:red">
+This callback is deprecated in favour of the `on_add_user_third_party_identifier` callback, which
+features the same functionality. The only difference is in name.
+</span>**
+
 ```python
 async def on_threepid_bind(user_id: str, medium: str, address: str) -> None:
 ```
@@ -268,6 +273,44 @@ server_.
 
 If multiple modules implement this callback, Synapse runs them all in order.
 
+### `on_add_user_third_party_identifier`
+
+_First introduced in Synapse v1.79.0_
+
+```python
+async def on_add_user_third_party_identifier(user_id: str, medium: str, address: str) -> None:
+```
+
+Called after successfully creating an association between a user and a third-party identifier
+(email address, phone number). The module is given the Matrix ID of the user the
+association is for, as well as the medium (`email` or `msisdn`) and address of the
+third-party identifier (e.g. an email address).
+
+Note that this callback is _not_ called if a user attempts to bind their third-party identifier
+to an identity server (via a call to [`POST
+/_matrix/client/v3/account/3pid/bind`](https://spec.matrix.org/v1.5/client-server-api/#post_matrixclientv3account3pidbind)).
+
+If multiple modules implement this callback, Synapse runs them all in order.
+
+### `on_remove_user_third_party_identifier`
+
+_First introduced in Synapse v1.79.0_
+
+```python
+async def on_remove_user_third_party_identifier(user_id: str, medium: str, address: str) -> None:
+```
+
+Called after successfully removing an association between a user and a third-party identifier
+(email address, phone number). The module is given the Matrix ID of the user the
+association is for, as well as the medium (`email` or `msisdn`) and address of the
+third-party identifier (e.g. an email address).
+
+Note that this callback is _not_ called if a user attempts to unbind their third-party
+identifier from an identity server (via a call to [`POST
+/_matrix/client/v3/account/3pid/unbind`](https://spec.matrix.org/v1.5/client-server-api/#post_matrixclientv3account3pidunbind)).
+
+If multiple modules implement this callback, Synapse runs them all in order.
+
 ## Example
 
 The example below is a module that implements the third-party rules callback
@@ -300,4 +343,4 @@ class EventCensorer:
         )
         event_dict["content"] = new_event_content
         return event_dict
-```
+```
\ No newline at end of file
diff --git a/docs/upgrade.md b/docs/upgrade.md
index 15167b8c58..f06e874054 100644
--- a/docs/upgrade.md
+++ b/docs/upgrade.md
@@ -88,6 +88,30 @@ process, for example:
     dpkg -i matrix-synapse-py3_1.3.0+stretch1_amd64.deb
     ```
 
+# Upgrading to v1.79.0
+
+## The `on_threepid_bind` module callback method has been deprecated
+
+Synapse v1.79.0 deprecates the
+[`on_threepid_bind`](modules/third_party_rules_callbacks.md#on_threepid_bind)
+"third-party rules" Synapse module callback method in favour of a new module method,
+[`on_add_user_third_party_identifier`](modules/third_party_rules_callbacks.md#on_add_user_third_party_identifier).
+`on_threepid_bind` will be removed in a future version of Synapse. You should check whether any Synapse
+modules in use in your deployment are making use of `on_threepid_bind`, and update them where possible.
+
+The arguments and functionality of the new method are the same.
+
+The justification behind the name change is that the old method's name, `on_threepid_bind`, was
+misleading. A user is considered to "bind" their third-party ID to their Matrix ID only if they
+do so via an [identity server](https://spec.matrix.org/latest/identity-service-api/)
+(so that users on other homeservers may find them). But this method was not called in that case -
+it was only called when a user added a third-party identifier on the local homeserver.
+
+Module developers may also be interested in the related
+[`on_remove_user_third_party_identifier`](modules/third_party_rules_callbacks.md#on_remove_user_third_party_identifier)
+module callback method that was also added in Synapse v1.79.0. This new method is called when a
+user removes a third-party identifier from their account.
+
 # Upgrading to v1.78.0
 
 ## Deprecate the `/_synapse/admin/v1/media/<server_name>/delete` admin API
diff --git a/synapse/events/third_party_rules.py b/synapse/events/third_party_rules.py
index 9a25ed419b..3e4d52c8d8 100644
--- a/synapse/events/third_party_rules.py
+++ b/synapse/events/third_party_rules.py
@@ -45,6 +45,8 @@ CHECK_CAN_DEACTIVATE_USER_CALLBACK = Callable[[str, bool], Awaitable[bool]]
 ON_PROFILE_UPDATE_CALLBACK = Callable[[str, ProfileInfo, bool, bool], Awaitable]
 ON_USER_DEACTIVATION_STATUS_CHANGED_CALLBACK = Callable[[str, bool, bool], Awaitable]
 ON_THREEPID_BIND_CALLBACK = Callable[[str, str, str], Awaitable]
+ON_ADD_USER_THIRD_PARTY_IDENTIFIER_CALLBACK = Callable[[str, str, str], Awaitable]
+ON_REMOVE_USER_THIRD_PARTY_IDENTIFIER_CALLBACK = Callable[[str, str, str], Awaitable]
 
 
 def load_legacy_third_party_event_rules(hs: "HomeServer") -> None:
@@ -172,6 +174,12 @@ class ThirdPartyEventRules:
             ON_USER_DEACTIVATION_STATUS_CHANGED_CALLBACK
         ] = []
         self._on_threepid_bind_callbacks: List[ON_THREEPID_BIND_CALLBACK] = []
+        self._on_add_user_third_party_identifier_callbacks: List[
+            ON_ADD_USER_THIRD_PARTY_IDENTIFIER_CALLBACK
+        ] = []
+        self._on_remove_user_third_party_identifier_callbacks: List[
+            ON_REMOVE_USER_THIRD_PARTY_IDENTIFIER_CALLBACK
+        ] = []
 
     def register_third_party_rules_callbacks(
         self,
@@ -191,6 +199,12 @@ class ThirdPartyEventRules:
             ON_USER_DEACTIVATION_STATUS_CHANGED_CALLBACK
         ] = None,
         on_threepid_bind: Optional[ON_THREEPID_BIND_CALLBACK] = None,
+        on_add_user_third_party_identifier: Optional[
+            ON_ADD_USER_THIRD_PARTY_IDENTIFIER_CALLBACK
+        ] = None,
+        on_remove_user_third_party_identifier: Optional[
+            ON_REMOVE_USER_THIRD_PARTY_IDENTIFIER_CALLBACK
+        ] = None,
     ) -> None:
         """Register callbacks from modules for each hook."""
         if check_event_allowed is not None:
@@ -228,6 +242,11 @@ class ThirdPartyEventRules:
         if on_threepid_bind is not None:
             self._on_threepid_bind_callbacks.append(on_threepid_bind)
 
+        if on_add_user_third_party_identifier is not None:
+            self._on_add_user_third_party_identifier_callbacks.append(
+                on_add_user_third_party_identifier
+            )
+
     async def check_event_allowed(
         self,
         event: EventBase,
@@ -511,6 +530,9 @@ class ThirdPartyEventRules:
         local homeserver, not when it's created on an identity server (and then kept track
         of so that it can be unbound on the same IS later on).
 
+        THIS MODULE CALLBACK METHOD HAS BEEN DEPRECATED. Please use the
+        `on_add_user_third_party_identifier` callback method instead.
+
         Args:
             user_id: the user being associated with the threepid.
             medium: the threepid's medium.
@@ -523,3 +545,44 @@ class ThirdPartyEventRules:
                 logger.exception(
                     "Failed to run module API callback %s: %s", callback, e
                 )
+
+    async def on_add_user_third_party_identifier(
+        self, user_id: str, medium: str, address: str
+    ) -> None:
+        """Called when an association between a user's Matrix ID and a third-party ID
+        (email, phone number) has successfully been registered on the homeserver.
+
+        Args:
+            user_id: The User ID included in the association.
+            medium: The medium of the third-party ID (email, msisdn).
+            address: The address of the third-party ID (i.e. an email address).
+        """
+        for callback in self._on_add_user_third_party_identifier_callbacks:
+            try:
+                await callback(user_id, medium, address)
+            except Exception as e:
+                logger.exception(
+                    "Failed to run module API callback %s: %s", callback, e
+                )
+
+    async def on_remove_user_third_party_identifier(
+        self, user_id: str, medium: str, address: str
+    ) -> None:
+        """Called when an association between a user's Matrix ID and a third-party ID
+        (email, phone number) has been successfully removed on the homeserver.
+
+        This is called *after* any known bindings on identity servers for this
+        association have been removed.
+
+        Args:
+            user_id: The User ID included in the removed association.
+            medium: The medium of the third-party ID (email, msisdn).
+            address: The address of the third-party ID (i.e. an email address).
+        """
+        for callback in self._on_remove_user_third_party_identifier_callbacks:
+            try:
+                await callback(user_id, medium, address)
+            except Exception as e:
+                logger.exception(
+                    "Failed to run module API callback %s: %s", callback, e
+                )
diff --git a/synapse/handlers/auth.py b/synapse/handlers/auth.py
index b12bc4c9a3..308e38edea 100644
--- a/synapse/handlers/auth.py
+++ b/synapse/handlers/auth.py
@@ -1542,6 +1542,17 @@ class AuthHandler:
     async def add_threepid(
         self, user_id: str, medium: str, address: str, validated_at: int
     ) -> None:
+        """
+        Adds an association between a user's Matrix ID and a third-party ID (email,
+        phone number).
+
+        Args:
+            user_id: The ID of the user to associate.
+            medium: The medium of the third-party ID (email, msisdn).
+            address: The address of the third-party ID (i.e. an email address).
+            validated_at: The timestamp in ms of when the validation that the user owns
+                this third-party ID occurred.
+        """
         # check if medium has a valid value
         if medium not in ["email", "msisdn"]:
             raise SynapseError(
@@ -1566,42 +1577,44 @@ class AuthHandler:
             user_id, medium, address, validated_at, self.hs.get_clock().time_msec()
         )
 
+        # Inform Synapse modules that a 3PID association has been created.
+        await self._third_party_rules.on_add_user_third_party_identifier(
+            user_id, medium, address
+        )
+
+        # Deprecated method for informing Synapse modules that a 3PID association
+        # has successfully been created.
         await self._third_party_rules.on_threepid_bind(user_id, medium, address)
 
-    async def delete_threepid(
-        self, user_id: str, medium: str, address: str, id_server: Optional[str] = None
-    ) -> bool:
-        """Attempts to unbind the 3pid on the identity servers and deletes it
-        from the local database.
+    async def delete_local_threepid(
+        self, user_id: str, medium: str, address: str
+    ) -> None:
+        """Deletes an association between a third-party ID and a user ID from the local
+        database. This method does not unbind the association from any identity servers.
+
+        If `medium` is 'email' and a pusher is associated with this third-party ID, the
+        pusher will also be deleted.
 
         Args:
             user_id: ID of user to remove the 3pid from.
             medium: The medium of the 3pid being removed: "email" or "msisdn".
             address: The 3pid address to remove.
-            id_server: Use the given identity server when unbinding
-                any threepids. If None then will attempt to unbind using the
-                identity server specified when binding (if known).
-
-        Returns:
-            Returns True if successfully unbound the 3pid on
-            the identity server, False if identity server doesn't support the
-            unbind API.
         """
-
         # 'Canonicalise' email addresses as per above
         if medium == "email":
             address = canonicalise_email(address)
 
-        result = await self.hs.get_identity_handler().try_unbind_threepid(
-            user_id, medium, address, id_server
+        await self.store.user_delete_threepid(user_id, medium, address)
+
+        # Inform Synapse modules that a 3PID association has been deleted.
+        await self._third_party_rules.on_remove_user_third_party_identifier(
+            user_id, medium, address
         )
 
-        await self.store.user_delete_threepid(user_id, medium, address)
         if medium == "email":
             await self.store.delete_pusher_by_app_id_pushkey_user_id(
                 app_id="m.email", pushkey=address, user_id=user_id
             )
-        return result
 
     async def hash(self, password: str) -> str:
         """Computes a secure hash of password.
diff --git a/synapse/handlers/deactivate_account.py b/synapse/handlers/deactivate_account.py
index d24f649382..d31263c717 100644
--- a/synapse/handlers/deactivate_account.py
+++ b/synapse/handlers/deactivate_account.py
@@ -100,26 +100,28 @@ class DeactivateAccountHandler:
         # unbinding
         identity_server_supports_unbinding = True
 
-        # Retrieve the 3PIDs this user has bound to an identity server
-        threepids = await self.store.user_get_bound_threepids(user_id)
-
-        for threepid in threepids:
+        # Attempt to unbind any known bound threepids to this account from identity
+        # server(s).
+        bound_threepids = await self.store.user_get_bound_threepids(user_id)
+        for threepid in bound_threepids:
             try:
                 result = await self._identity_handler.try_unbind_threepid(
                     user_id, threepid["medium"], threepid["address"], id_server
                 )
-                identity_server_supports_unbinding &= result
             except Exception:
                 # Do we want this to be a fatal error or should we carry on?
                 logger.exception("Failed to remove threepid from ID server")
                 raise SynapseError(400, "Failed to remove threepid from ID server")
-            await self.store.user_delete_threepid(
+
+            identity_server_supports_unbinding &= result
+
+        # Remove any local threepid associations for this account.
+        local_threepids = await self.store.user_get_threepids(user_id)
+        for threepid in local_threepids:
+            await self._auth_handler.delete_local_threepid(
                 user_id, threepid["medium"], threepid["address"]
             )
 
-        # Remove all 3PIDs this user has bound to the homeserver
-        await self.store.user_delete_threepids(user_id)
-
         # delete any devices belonging to the user, which will also
         # delete corresponding access tokens.
         await self._device_handler.delete_all_devices_for_user(user_id)
diff --git a/synapse/module_api/__init__.py b/synapse/module_api/__init__.py
index 1964276a54..424239e3df 100644
--- a/synapse/module_api/__init__.py
+++ b/synapse/module_api/__init__.py
@@ -64,9 +64,11 @@ from synapse.events.third_party_rules import (
     CHECK_EVENT_ALLOWED_CALLBACK,
     CHECK_THREEPID_CAN_BE_INVITED_CALLBACK,
     CHECK_VISIBILITY_CAN_BE_MODIFIED_CALLBACK,
+    ON_ADD_USER_THIRD_PARTY_IDENTIFIER_CALLBACK,
     ON_CREATE_ROOM_CALLBACK,
     ON_NEW_EVENT_CALLBACK,
     ON_PROFILE_UPDATE_CALLBACK,
+    ON_REMOVE_USER_THIRD_PARTY_IDENTIFIER_CALLBACK,
     ON_THREEPID_BIND_CALLBACK,
     ON_USER_DEACTIVATION_STATUS_CHANGED_CALLBACK,
 )
@@ -357,6 +359,12 @@ class ModuleApi:
             ON_USER_DEACTIVATION_STATUS_CHANGED_CALLBACK
         ] = None,
         on_threepid_bind: Optional[ON_THREEPID_BIND_CALLBACK] = None,
+        on_add_user_third_party_identifier: Optional[
+            ON_ADD_USER_THIRD_PARTY_IDENTIFIER_CALLBACK
+        ] = None,
+        on_remove_user_third_party_identifier: Optional[
+            ON_REMOVE_USER_THIRD_PARTY_IDENTIFIER_CALLBACK
+        ] = None,
     ) -> None:
         """Registers callbacks for third party event rules capabilities.
 
@@ -373,6 +381,8 @@ class ModuleApi:
             on_profile_update=on_profile_update,
             on_user_deactivation_status_changed=on_user_deactivation_status_changed,
             on_threepid_bind=on_threepid_bind,
+            on_add_user_third_party_identifier=on_add_user_third_party_identifier,
+            on_remove_user_third_party_identifier=on_remove_user_third_party_identifier,
         )
 
     def register_presence_router_callbacks(
diff --git a/synapse/rest/admin/users.py b/synapse/rest/admin/users.py
index 7cc4db20d6..357e9a574d 100644
--- a/synapse/rest/admin/users.py
+++ b/synapse/rest/admin/users.py
@@ -304,13 +304,20 @@ class UserRestServletV2(RestServlet):
                 # remove old threepids
                 for medium, address in del_threepids:
                     try:
-                        await self.auth_handler.delete_threepid(
-                            user_id, medium, address, None
+                        # Attempt to remove any known bindings of this third-party ID
+                        # and user ID from identity servers.
+                        await self.hs.get_identity_handler().try_unbind_threepid(
+                            user_id, medium, address, id_server=None
                         )
                     except Exception:
                         logger.exception("Failed to remove threepids")
                         raise SynapseError(500, "Failed to remove threepids")
 
+                    # Delete the local association of this user ID and third-party ID.
+                    await self.auth_handler.delete_local_threepid(
+                        user_id, medium, address
+                    )
+
                 # add new threepids
                 current_time = self.hs.get_clock().time_msec()
                 for medium, address in add_threepids:
diff --git a/synapse/rest/client/account.py b/synapse/rest/client/account.py
index 662f5bf762..484d7440a4 100644
--- a/synapse/rest/client/account.py
+++ b/synapse/rest/client/account.py
@@ -768,7 +768,9 @@ class ThreepidDeleteRestServlet(RestServlet):
         user_id = requester.user.to_string()
 
         try:
-            ret = await self.auth_handler.delete_threepid(
+            # Attempt to remove any known bindings of this third-party ID
+            # and user ID from identity servers.
+            ret = await self.hs.get_identity_handler().try_unbind_threepid(
                 user_id, body.medium, body.address, body.id_server
             )
         except Exception:
@@ -783,6 +785,11 @@ class ThreepidDeleteRestServlet(RestServlet):
         else:
             id_server_unbind_result = "no-support"
 
+        # Delete the local association of this user ID and third-party ID.
+        await self.auth_handler.delete_local_threepid(
+            user_id, body.medium, body.address
+        )
+
         return 200, {"id_server_unbind_result": id_server_unbind_result}
 
 
diff --git a/synapse/storage/databases/main/registration.py b/synapse/storage/databases/main/registration.py
index 9a55e17624..717237e024 100644
--- a/synapse/storage/databases/main/registration.py
+++ b/synapse/storage/databases/main/registration.py
@@ -1002,19 +1002,6 @@ class RegistrationWorkerStore(CacheInvalidationWorkerStore):
             desc="user_delete_threepid",
         )
 
-    async def user_delete_threepids(self, user_id: str) -> None:
-        """Delete all threepid this user has bound
-
-        Args:
-             user_id: The user id to delete all threepids of
-
-        """
-        await self.db_pool.simple_delete(
-            "user_threepids",
-            keyvalues={"user_id": user_id},
-            desc="user_delete_threepids",
-        )
-
     async def add_user_bound_threepid(
         self, user_id: str, medium: str, address: str, id_server: str
     ) -> None:
diff --git a/tests/push/test_email.py b/tests/push/test_email.py
index 0a3aca5c50..4ea5472eb4 100644
--- a/tests/push/test_email.py
+++ b/tests/push/test_email.py
@@ -369,10 +369,8 @@ class EmailPusherTests(HomeserverTestCase):
 
         # disassociate the user's email address
         self.get_success(
-            self.auth_handler.delete_threepid(
-                user_id=self.user_id,
-                medium="email",
-                address="a@example.com",
+            self.auth_handler.delete_local_threepid(
+                user_id=self.user_id, medium="email", address="a@example.com"
             )
         )
 
diff --git a/tests/rest/client/test_third_party_rules.py b/tests/rest/client/test_third_party_rules.py
index c0f93f898a..3b99513707 100644
--- a/tests/rest/client/test_third_party_rules.py
+++ b/tests/rest/client/test_third_party_rules.py
@@ -934,3 +934,124 @@ class ThirdPartyRulesTestCase(unittest.FederatingHomeserverTestCase):
 
         # Check that the mock was called with the right parameters
         self.assertEqual(args, (user_id, "email", "foo@example.com"))
+
+    def test_on_add_and_remove_user_third_party_identifier(self) -> None:
+        """Tests that the on_add_user_third_party_identifier and
+        on_remove_user_third_party_identifier module callbacks are called
+        just after associating and removing a 3PID to/from an account.
+        """
+        # Register each mock on the callback list for the hook it stands in for.
+        third_party_rules = self.hs.get_third_party_event_rules()
+        on_add_user_third_party_identifier_callback_mock = Mock(
+            return_value=make_awaitable(None)
+        )
+        on_remove_user_third_party_identifier_callback_mock = Mock(
+            return_value=make_awaitable(None)
+        )
+        third_party_rules._on_add_user_third_party_identifier_callbacks.append(
+            on_add_user_third_party_identifier_callback_mock
+        )
+        third_party_rules._on_remove_user_third_party_identifier_callbacks.append(
+            on_remove_user_third_party_identifier_callback_mock
+        )
+
+        # Register an admin user.
+        self.register_user("admin", "password", admin=True)
+        admin_tok = self.login("admin", "password")
+
+        # Also register a normal user we can modify.
+        user_id = self.register_user("user", "password")
+
+        # Add a 3PID to the user.
+        channel = self.make_request(
+            "PUT",
+            "/_synapse/admin/v2/users/%s" % user_id,
+            {
+                "threepids": [
+                    {
+                        "medium": "email",
+                        "address": "foo@example.com",
+                    },
+                ],
+            },
+            access_token=admin_tok,
+        )
+
+        # Check that the mocked add callback was called with the appropriate
+        # 3PID details.
+        self.assertEqual(channel.code, 200, channel.json_body)
+        on_add_user_third_party_identifier_callback_mock.assert_called_once()
+        args = on_add_user_third_party_identifier_callback_mock.call_args[0]
+        self.assertEqual(args, (user_id, "email", "foo@example.com"))
+
+        # Now remove the 3PID from the user
+        channel = self.make_request(
+            "PUT",
+            "/_synapse/admin/v2/users/%s" % user_id,
+            {
+                "threepids": [],
+            },
+            access_token=admin_tok,
+        )
+
+        # Check that the mocked remove callback was called with the appropriate
+        # 3PID details.
+        self.assertEqual(channel.code, 200, channel.json_body)
+        on_remove_user_third_party_identifier_callback_mock.assert_called_once()
+        args = on_remove_user_third_party_identifier_callback_mock.call_args[0]
+        self.assertEqual(args, (user_id, "email", "foo@example.com"))
+
+    def test_on_remove_user_third_party_identifier_is_called_on_deactivate(
+        self,
+    ) -> None:
+        """Tests that the on_remove_user_third_party_identifier module callback is called
+        when a user is deactivated and their third-party ID associations are deleted.
+        """
+        # Pretend to be a Synapse module and register the removal callback as a mock.
+        third_party_rules = self.hs.get_third_party_event_rules()
+        on_remove_user_third_party_identifier_callback_mock = Mock(
+            return_value=make_awaitable(None)
+        )
+        third_party_rules._on_remove_user_third_party_identifier_callbacks.append(
+            on_remove_user_third_party_identifier_callback_mock
+        )
+
+        # Register an admin user.
+        self.register_user("admin", "password", admin=True)
+        admin_tok = self.login("admin", "password")
+
+        # Also register a normal user we can modify.
+        user_id = self.register_user("user", "password")
+
+        # Add a 3PID to the user.
+        channel = self.make_request(
+            "PUT",
+            "/_synapse/admin/v2/users/%s" % user_id,
+            {
+                "threepids": [
+                    {
+                        "medium": "email",
+                        "address": "foo@example.com",
+                    },
+                ],
+            },
+            access_token=admin_tok,
+        )
+        self.assertEqual(channel.code, 200, channel.json_body)
+
+        # Now deactivate the user.
+        channel = self.make_request(
+            "PUT",
+            "/_synapse/admin/v2/users/%s" % user_id,
+            {
+                "deactivated": True,
+            },
+            access_token=admin_tok,
+        )
+
+        # Check that the mocked remove callback was called with the appropriate
+        # 3PID details.
+        self.assertEqual(channel.code, 200, channel.json_body)
+        on_remove_user_third_party_identifier_callback_mock.assert_called_once()
+        args = on_remove_user_third_party_identifier_callback_mock.call_args[0]
+        self.assertEqual(args, (user_id, "email", "foo@example.com"))
-- 
cgit 1.5.1


From 93f7955eba50c827f96e1b2e8e44ef22a98cecc4 Mon Sep 17 00:00:00 2001
From: Dirk Klimpel <5740567+dklimpel@users.noreply.github.com>
Date: Tue, 28 Feb 2023 13:09:10 +0100
Subject: Admin API endpoint to delete a reported event (#15116)

* Admin api to delete event report

* lint +  tests

* newsfile

* Apply suggestions from code review

Co-authored-by: David Robertson <david.m.robertson1@gmail.com>

* revert changes - move to WorkerStore

* update unit test

* Note that timestamp is in milliseconds

---------

Co-authored-by: David Robertson <david.m.robertson1@gmail.com>
---
 changelog.d/15116.feature              |   1 +
 docs/admin_api/event_reports.md        |  14 ++++
 synapse/rest/admin/event_reports.py    |  41 ++++++++--
 synapse/storage/databases/main/room.py |  36 ++++++++-
 tests/rest/admin/test_event_reports.py | 143 ++++++++++++++++++++++++++++++++-
 5 files changed, 224 insertions(+), 11 deletions(-)
 create mode 100644 changelog.d/15116.feature

(limited to 'tests/rest')

diff --git a/changelog.d/15116.feature b/changelog.d/15116.feature
new file mode 100644
index 0000000000..087d8dc7f1
--- /dev/null
+++ b/changelog.d/15116.feature
@@ -0,0 +1 @@
+Add an [admin API](https://matrix-org.github.io/synapse/latest/usage/administration/admin_api/index.html) to delete a [specific event report](https://spec.matrix.org/v1.6/client-server-api/#reporting-content).
\ No newline at end of file
diff --git a/docs/admin_api/event_reports.md b/docs/admin_api/event_reports.md
index beec8bb7ef..83f7dc37f4 100644
--- a/docs/admin_api/event_reports.md
+++ b/docs/admin_api/event_reports.md
@@ -169,3 +169,17 @@ The following fields are returned in the JSON response body:
 * `canonical_alias`: string - The canonical alias of the room. `null` if the room does not
   have a canonical alias set.
 * `event_json`: object - Details of the original event that was reported.
+
+# Delete a specific event report
+
+This API deletes a specific event report. If the request is successful, the response body
+will be an empty JSON object.
+
+The api is:
+```
+DELETE /_synapse/admin/v1/event_reports/<report_id>
+```
+
+**URL parameters:**
+
+* `report_id`: string - The ID of the event report.
diff --git a/synapse/rest/admin/event_reports.py b/synapse/rest/admin/event_reports.py
index a3beb74e2c..c546ef7e23 100644
--- a/synapse/rest/admin/event_reports.py
+++ b/synapse/rest/admin/event_reports.py
@@ -53,11 +53,11 @@ class EventReportsRestServlet(RestServlet):
     PATTERNS = admin_patterns("/event_reports$")
 
     def __init__(self, hs: "HomeServer"):
-        self.auth = hs.get_auth()
-        self.store = hs.get_datastores().main
+        self._auth = hs.get_auth()
+        self._store = hs.get_datastores().main
 
     async def on_GET(self, request: SynapseRequest) -> Tuple[int, JsonDict]:
-        await assert_requester_is_admin(self.auth, request)
+        await assert_requester_is_admin(self._auth, request)
 
         start = parse_integer(request, "from", default=0)
         limit = parse_integer(request, "limit", default=100)
@@ -79,7 +79,7 @@ class EventReportsRestServlet(RestServlet):
                 errcode=Codes.INVALID_PARAM,
             )
 
-        event_reports, total = await self.store.get_event_reports_paginate(
+        event_reports, total = await self._store.get_event_reports_paginate(
             start, limit, direction, user_id, room_id
         )
         ret = {"event_reports": event_reports, "total": total}
@@ -108,13 +108,13 @@ class EventReportDetailRestServlet(RestServlet):
     PATTERNS = admin_patterns("/event_reports/(?P<report_id>[^/]*)$")
 
     def __init__(self, hs: "HomeServer"):
-        self.auth = hs.get_auth()
-        self.store = hs.get_datastores().main
+        self._auth = hs.get_auth()
+        self._store = hs.get_datastores().main
 
     async def on_GET(
         self, request: SynapseRequest, report_id: str
     ) -> Tuple[int, JsonDict]:
-        await assert_requester_is_admin(self.auth, request)
+        await assert_requester_is_admin(self._auth, request)
 
         message = (
             "The report_id parameter must be a string representing a positive integer."
@@ -131,8 +131,33 @@ class EventReportDetailRestServlet(RestServlet):
                 HTTPStatus.BAD_REQUEST, message, errcode=Codes.INVALID_PARAM
             )
 
-        ret = await self.store.get_event_report(resolved_report_id)
+        ret = await self._store.get_event_report(resolved_report_id)
         if not ret:
             raise NotFoundError("Event report not found")
 
         return HTTPStatus.OK, ret
+
+    async def on_DELETE(
+        self, request: SynapseRequest, report_id: str
+    ) -> Tuple[int, JsonDict]:
+        await assert_requester_is_admin(self._auth, request)
+
+        message = (
+            "The report_id parameter must be a string representing a positive integer."
+        )
+        try:
+            resolved_report_id = int(report_id)
+        except ValueError:
+            raise SynapseError(
+                HTTPStatus.BAD_REQUEST, message, errcode=Codes.INVALID_PARAM
+            )
+
+        if resolved_report_id < 0:
+            raise SynapseError(
+                HTTPStatus.BAD_REQUEST, message, errcode=Codes.INVALID_PARAM
+            )
+
+        if await self._store.delete_event_report(resolved_report_id):
+            return HTTPStatus.OK, {}
+
+        raise NotFoundError("Event report not found")
diff --git a/synapse/storage/databases/main/room.py b/synapse/storage/databases/main/room.py
index 39f89291b2..a2e9519cb6 100644
--- a/synapse/storage/databases/main/room.py
+++ b/synapse/storage/databases/main/room.py
@@ -1417,6 +1417,27 @@ class RoomWorkerStore(CacheInvalidationWorkerStore):
             get_un_partial_stated_rooms_from_stream_txn,
         )
 
+    async def delete_event_report(self, report_id: int) -> bool:
+        """Remove an event report from database.
+
+        Args:
+            report_id: Report to delete
+
+        Returns:
+            Whether the report was successfully deleted or not.
+        """
+        try:
+            await self.db_pool.simple_delete_one(
+                table="event_reports",
+                keyvalues={"id": report_id},
+                desc="delete_event_report",
+            )
+        except StoreError:
+            # Deletion failed because report does not exist
+            return False
+
+        return True
+
 
 class _BackgroundUpdates:
     REMOVE_TOMESTONED_ROOMS_BG_UPDATE = "remove_tombstoned_rooms_from_directory"
@@ -2139,7 +2160,19 @@ class RoomStore(RoomBackgroundUpdateStore, RoomWorkerStore):
         reason: Optional[str],
         content: JsonDict,
         received_ts: int,
-    ) -> None:
+    ) -> int:
+        """Add an event report
+
+        Args:
+            room_id: Room that contains the reported event.
+            event_id: The reported event.
+            user_id: User who reports the event.
+            reason: Description that the user specifies.
+            content: Report request body (score and reason).
+            received_ts: Time when the user submitted the report (milliseconds).
+        Returns:
+            Id of the event report.
+        """
         next_id = self._event_reports_id_gen.get_next()
         await self.db_pool.simple_insert(
             table="event_reports",
@@ -2154,6 +2187,7 @@ class RoomStore(RoomBackgroundUpdateStore, RoomWorkerStore):
             },
             desc="add_event_report",
         )
+        return next_id
 
     async def get_event_report(self, report_id: int) -> Optional[Dict[str, Any]]:
         """Retrieve an event report
diff --git a/tests/rest/admin/test_event_reports.py b/tests/rest/admin/test_event_reports.py
index 233eba3516..f189b07769 100644
--- a/tests/rest/admin/test_event_reports.py
+++ b/tests/rest/admin/test_event_reports.py
@@ -78,7 +78,7 @@ class EventReportsTestCase(unittest.HomeserverTestCase):
         """
         Try to get an event report without authentication.
         """
-        channel = self.make_request("GET", self.url, b"{}")
+        channel = self.make_request("GET", self.url, {})
 
         self.assertEqual(401, channel.code, msg=channel.json_body)
         self.assertEqual(Codes.MISSING_TOKEN, channel.json_body["errcode"])
@@ -473,7 +473,7 @@ class EventReportDetailTestCase(unittest.HomeserverTestCase):
         """
         Try to get event report without authentication.
         """
-        channel = self.make_request("GET", self.url, b"{}")
+        channel = self.make_request("GET", self.url, {})
 
         self.assertEqual(401, channel.code, msg=channel.json_body)
         self.assertEqual(Codes.MISSING_TOKEN, channel.json_body["errcode"])
@@ -599,3 +599,142 @@ class EventReportDetailTestCase(unittest.HomeserverTestCase):
         self.assertIn("room_id", content["event_json"])
         self.assertIn("sender", content["event_json"])
         self.assertIn("content", content["event_json"])
+
+
+class DeleteEventReportTestCase(unittest.HomeserverTestCase):
+    servlets = [
+        synapse.rest.admin.register_servlets,
+        login.register_servlets,
+    ]
+
+    def prepare(self, reactor: MemoryReactor, clock: Clock, hs: HomeServer) -> None:
+        self._store = hs.get_datastores().main
+
+        self.admin_user = self.register_user("admin", "pass", admin=True)
+        self.admin_user_tok = self.login("admin", "pass")
+
+        self.other_user = self.register_user("user", "pass")
+        self.other_user_tok = self.login("user", "pass")
+
+        # create report
+        event_id = self.get_success(
+            self._store.add_event_report(
+                "room_id",
+                "event_id",
+                self.other_user,
+                "this makes me sad",
+                {},
+                self.clock.time_msec(),
+            )
+        )
+
+        self.url = f"/_synapse/admin/v1/event_reports/{event_id}"
+
+    def test_no_auth(self) -> None:
+        """
+        Try to delete event report without authentication.
+        """
+        channel = self.make_request("DELETE", self.url)
+
+        self.assertEqual(401, channel.code, msg=channel.json_body)
+        self.assertEqual(Codes.MISSING_TOKEN, channel.json_body["errcode"])
+
+    def test_requester_is_no_admin(self) -> None:
+        """
+        If the user is not a server admin, an error 403 is returned.
+        """
+
+        channel = self.make_request(
+            "DELETE",
+            self.url,
+            access_token=self.other_user_tok,
+        )
+
+        self.assertEqual(403, channel.code, msg=channel.json_body)
+        self.assertEqual(Codes.FORBIDDEN, channel.json_body["errcode"])
+
+    def test_delete_success(self) -> None:
+        """
+        Testing deleting a report.
+        """
+
+        channel = self.make_request(
+            "DELETE",
+            self.url,
+            access_token=self.admin_user_tok,
+        )
+
+        self.assertEqual(200, channel.code, msg=channel.json_body)
+        self.assertEqual({}, channel.json_body)
+
+        channel = self.make_request(
+            "GET",
+            self.url,
+            access_token=self.admin_user_tok,
+        )
+
+        # check that report was deleted
+        self.assertEqual(404, channel.code, msg=channel.json_body)
+        self.assertEqual(Codes.NOT_FOUND, channel.json_body["errcode"])
+
+    def test_invalid_report_id(self) -> None:
+        """
+        Testing that an invalid `report_id` returns a 400.
+        """
+
+        # `report_id` is negative
+        channel = self.make_request(
+            "DELETE",
+            "/_synapse/admin/v1/event_reports/-123",
+            access_token=self.admin_user_tok,
+        )
+
+        self.assertEqual(400, channel.code, msg=channel.json_body)
+        self.assertEqual(Codes.INVALID_PARAM, channel.json_body["errcode"])
+        self.assertEqual(
+            "The report_id parameter must be a string representing a positive integer.",
+            channel.json_body["error"],
+        )
+
+        # `report_id` is a non-numerical string
+        channel = self.make_request(
+            "DELETE",
+            "/_synapse/admin/v1/event_reports/abcdef",
+            access_token=self.admin_user_tok,
+        )
+
+        self.assertEqual(400, channel.code, msg=channel.json_body)
+        self.assertEqual(Codes.INVALID_PARAM, channel.json_body["errcode"])
+        self.assertEqual(
+            "The report_id parameter must be a string representing a positive integer.",
+            channel.json_body["error"],
+        )
+
+        # `report_id` is undefined
+        channel = self.make_request(
+            "DELETE",
+            "/_synapse/admin/v1/event_reports/",
+            access_token=self.admin_user_tok,
+        )
+
+        self.assertEqual(400, channel.code, msg=channel.json_body)
+        self.assertEqual(Codes.INVALID_PARAM, channel.json_body["errcode"])
+        self.assertEqual(
+            "The report_id parameter must be a string representing a positive integer.",
+            channel.json_body["error"],
+        )
+
+    def test_report_id_not_found(self) -> None:
+        """
+        Testing that a non-existent `report_id` returns a 404.
+        """
+
+        channel = self.make_request(
+            "DELETE",
+            "/_synapse/admin/v1/event_reports/123",
+            access_token=self.admin_user_tok,
+        )
+
+        self.assertEqual(404, channel.code, msg=channel.json_body)
+        self.assertEqual(Codes.NOT_FOUND, channel.json_body["errcode"])
+        self.assertEqual("Event report not found", channel.json_body["error"])
-- 
cgit 1.5.1


From 2b78981736f9004f99b1760e3e77b234f92755a7 Mon Sep 17 00:00:00 2001
From: Richard van der Hoff <1389908+richvdh@users.noreply.github.com>
Date: Tue, 28 Feb 2023 18:49:28 +0000
Subject: Remove support for aggregating reactions (#15172)

It turns out that no clients rely on server-side aggregation of `m.annotation`
relationships: it's just not very useful as currently implemented.

It's also non-trivial to calculate.

I want to remove it from MSC2677, so to keep the implementation in line, let's
remove it here.
---
 changelog.d/15172.feature                          |   1 +
 synapse/events/utils.py                            |   5 -
 synapse/handlers/relations.py                      |  76 +--------
 synapse/storage/databases/main/cache.py            |   3 -
 synapse/storage/databases/main/events.py           |   4 -
 .../storage/databases/main/events_bg_updates.py    |   3 -
 synapse/storage/databases/main/relations.py        | 137 ----------------
 tests/rest/client/test_relations.py                | 178 ++++-----------------
 8 files changed, 30 insertions(+), 377 deletions(-)
 create mode 100644 changelog.d/15172.feature

(limited to 'tests/rest')

diff --git a/changelog.d/15172.feature b/changelog.d/15172.feature
new file mode 100644
index 0000000000..3f789edb7f
--- /dev/null
+++ b/changelog.d/15172.feature
@@ -0,0 +1 @@
+Remove support for server-side aggregation of reactions.
diff --git a/synapse/events/utils.py b/synapse/events/utils.py
index ebf8c7ed83..eaa6cad4af 100644
--- a/synapse/events/utils.py
+++ b/synapse/events/utils.py
@@ -516,11 +516,6 @@ class EventClientSerializer:
         # being serialized.
         serialized_aggregations = {}
 
-        if event_aggregations.annotations:
-            serialized_aggregations[
-                RelationTypes.ANNOTATION
-            ] = event_aggregations.annotations
-
         if event_aggregations.references:
             serialized_aggregations[
                 RelationTypes.REFERENCE
diff --git a/synapse/handlers/relations.py b/synapse/handlers/relations.py
index 0fb15391e0..553053b694 100644
--- a/synapse/handlers/relations.py
+++ b/synapse/handlers/relations.py
@@ -60,13 +60,12 @@ class BundledAggregations:
     Some values require additional processing during serialization.
     """
 
-    annotations: Optional[JsonDict] = None
     references: Optional[JsonDict] = None
     replace: Optional[EventBase] = None
     thread: Optional[_ThreadAggregation] = None
 
     def __bool__(self) -> bool:
-        return bool(self.annotations or self.references or self.replace or self.thread)
+        return bool(self.references or self.replace or self.thread)
 
 
 class RelationsHandler:
@@ -227,67 +226,6 @@ class RelationsHandler:
                     e.msg,
                 )
 
-    async def get_annotations_for_events(
-        self, event_ids: Collection[str], ignored_users: FrozenSet[str] = frozenset()
-    ) -> Dict[str, List[JsonDict]]:
-        """Get a list of annotations to the given events, grouped by event type and
-        aggregation key, sorted by count.
-
-        This is used e.g. to get the what and how many reactions have happened
-        on an event.
-
-        Args:
-            event_ids: Fetch events that relate to these event IDs.
-            ignored_users: The users ignored by the requesting user.
-
-        Returns:
-            A map of event IDs to a list of groups of annotations that match.
-            Each entry is a dict with `type`, `key` and `count` fields.
-        """
-        # Get the base results for all users.
-        full_results = await self._main_store.get_aggregation_groups_for_events(
-            event_ids
-        )
-
-        # Avoid additional logic if there are no ignored users.
-        if not ignored_users:
-            return {
-                event_id: results
-                for event_id, results in full_results.items()
-                if results
-            }
-
-        # Then subtract off the results for any ignored users.
-        ignored_results = await self._main_store.get_aggregation_groups_for_users(
-            [event_id for event_id, results in full_results.items() if results],
-            ignored_users,
-        )
-
-        filtered_results = {}
-        for event_id, results in full_results.items():
-            # If no annotations, skip.
-            if not results:
-                continue
-
-            # If there are not ignored results for this event, copy verbatim.
-            if event_id not in ignored_results:
-                filtered_results[event_id] = results
-                continue
-
-            # Otherwise, subtract out the ignored results.
-            event_ignored_results = ignored_results[event_id]
-            for result in results:
-                key = (result["type"], result["key"])
-                if key in event_ignored_results:
-                    # Ensure to not modify the cache.
-                    result = result.copy()
-                    result["count"] -= event_ignored_results[key]
-                    if result["count"] <= 0:
-                        continue
-                filtered_results.setdefault(event_id, []).append(result)
-
-        return filtered_results
-
     async def get_references_for_events(
         self, event_ids: Collection[str], ignored_users: FrozenSet[str] = frozenset()
     ) -> Dict[str, List[_RelatedEvent]]:
@@ -531,17 +469,6 @@ class RelationsHandler:
                 # (as that is what makes it part of the thread).
                 relations_by_id[latest_thread_event.event_id] = RelationTypes.THREAD
 
-        async def _fetch_annotations() -> None:
-            """Fetch any annotations (ie, reactions) to bundle with this event."""
-            annotations_by_event_id = await self.get_annotations_for_events(
-                events_by_id.keys(), ignored_users=ignored_users
-            )
-            for event_id, annotations in annotations_by_event_id.items():
-                if annotations:
-                    results.setdefault(event_id, BundledAggregations()).annotations = {
-                        "chunk": annotations
-                    }
-
         async def _fetch_references() -> None:
             """Fetch any references to bundle with this event."""
             references_by_event_id = await self.get_references_for_events(
@@ -575,7 +502,6 @@ class RelationsHandler:
         await make_deferred_yieldable(
             gather_results(
                 (
-                    run_in_background(_fetch_annotations),
                     run_in_background(_fetch_references),
                     run_in_background(_fetch_edits),
                 )
diff --git a/synapse/storage/databases/main/cache.py b/synapse/storage/databases/main/cache.py
index 5b66431691..096dec7f87 100644
--- a/synapse/storage/databases/main/cache.py
+++ b/synapse/storage/databases/main/cache.py
@@ -266,9 +266,6 @@ class CacheInvalidationWorkerStore(SQLBaseStore):
         if relates_to:
             self._attempt_to_invalidate_cache("get_relations_for_event", (relates_to,))
             self._attempt_to_invalidate_cache("get_references_for_event", (relates_to,))
-            self._attempt_to_invalidate_cache(
-                "get_aggregation_groups_for_event", (relates_to,)
-            )
             self._attempt_to_invalidate_cache("get_applicable_edit", (relates_to,))
             self._attempt_to_invalidate_cache("get_thread_summary", (relates_to,))
             self._attempt_to_invalidate_cache("get_thread_participated", (relates_to,))
diff --git a/synapse/storage/databases/main/events.py b/synapse/storage/databases/main/events.py
index 73b8aea16c..a8a4ed4436 100644
--- a/synapse/storage/databases/main/events.py
+++ b/synapse/storage/databases/main/events.py
@@ -2024,10 +2024,6 @@ class PersistEventsStore:
         self.store._invalidate_cache_and_stream(
             txn, self.store.get_relations_for_event, (redacted_relates_to,)
         )
-        if rel_type == RelationTypes.ANNOTATION:
-            self.store._invalidate_cache_and_stream(
-                txn, self.store.get_aggregation_groups_for_event, (redacted_relates_to,)
-            )
         if rel_type == RelationTypes.REFERENCE:
             self.store._invalidate_cache_and_stream(
                 txn, self.store.get_references_for_event, (redacted_relates_to,)
diff --git a/synapse/storage/databases/main/events_bg_updates.py b/synapse/storage/databases/main/events_bg_updates.py
index 0a275e6ce6..daef3685b0 100644
--- a/synapse/storage/databases/main/events_bg_updates.py
+++ b/synapse/storage/databases/main/events_bg_updates.py
@@ -1219,9 +1219,6 @@ class EventsBackgroundUpdatesStore(SQLBaseStore):
                     self._invalidate_cache_and_stream(  # type: ignore[attr-defined]
                         txn, self.get_relations_for_event, cache_tuple  # type: ignore[attr-defined]
                     )
-                    self._invalidate_cache_and_stream(  # type: ignore[attr-defined]
-                        txn, self.get_aggregation_groups_for_event, cache_tuple  # type: ignore[attr-defined]
-                    )
                     self._invalidate_cache_and_stream(  # type: ignore[attr-defined]
                         txn, self.get_thread_summary, cache_tuple  # type: ignore[attr-defined]
                     )
diff --git a/synapse/storage/databases/main/relations.py b/synapse/storage/databases/main/relations.py
index fa3266c081..bc3a83919c 100644
--- a/synapse/storage/databases/main/relations.py
+++ b/synapse/storage/databases/main/relations.py
@@ -397,143 +397,6 @@ class RelationsWorkerStore(SQLBaseStore):
         )
         return result is not None
 
-    @cached()
-    async def get_aggregation_groups_for_event(
-        self, event_id: str
-    ) -> Sequence[JsonDict]:
-        raise NotImplementedError()
-
-    @cachedList(
-        cached_method_name="get_aggregation_groups_for_event", list_name="event_ids"
-    )
-    async def get_aggregation_groups_for_events(
-        self, event_ids: Collection[str]
-    ) -> Mapping[str, Optional[List[JsonDict]]]:
-        """Get a list of annotations on the given events, grouped by event type and
-        aggregation key, sorted by count.
-
-        This is used e.g. to get the what and how many reactions have happend
-        on an event.
-
-        Args:
-            event_ids: Fetch events that relate to these event IDs.
-
-        Returns:
-            A map of event IDs to a list of groups of annotations that match.
-            Each entry is a dict with `type`, `key` and `count` fields.
-        """
-        # The number of entries to return per event ID.
-        limit = 5
-
-        clause, args = make_in_list_sql_clause(
-            self.database_engine, "relates_to_id", event_ids
-        )
-        args.append(RelationTypes.ANNOTATION)
-
-        sql = f"""
-            SELECT
-                relates_to_id,
-                annotation.type,
-                aggregation_key,
-                COUNT(DISTINCT annotation.sender)
-            FROM events AS annotation
-            INNER JOIN event_relations USING (event_id)
-            INNER JOIN events AS parent ON
-                parent.event_id = relates_to_id
-                AND parent.room_id = annotation.room_id
-            WHERE
-                {clause}
-                AND relation_type = ?
-            GROUP BY relates_to_id, annotation.type, aggregation_key
-            ORDER BY relates_to_id, COUNT(*) DESC
-        """
-
-        def _get_aggregation_groups_for_events_txn(
-            txn: LoggingTransaction,
-        ) -> Mapping[str, List[JsonDict]]:
-            txn.execute(sql, args)
-
-            result: Dict[str, List[JsonDict]] = {}
-            for event_id, type, key, count in cast(
-                List[Tuple[str, str, str, int]], txn
-            ):
-                event_results = result.setdefault(event_id, [])
-
-                # Limit the number of results per event ID.
-                if len(event_results) == limit:
-                    continue
-
-                event_results.append({"type": type, "key": key, "count": count})
-
-            return result
-
-        return await self.db_pool.runInteraction(
-            "get_aggregation_groups_for_events", _get_aggregation_groups_for_events_txn
-        )
-
-    async def get_aggregation_groups_for_users(
-        self, event_ids: Collection[str], users: FrozenSet[str]
-    ) -> Dict[str, Dict[Tuple[str, str], int]]:
-        """Fetch the partial aggregations for an event for specific users.
-
-        This is used, in conjunction with get_aggregation_groups_for_event, to
-        remove information from the results for ignored users.
-
-        Args:
-            event_ids: Fetch events that relate to these event IDs.
-            users: The users to fetch information for.
-
-        Returns:
-            A map of event ID to a map of (event type, aggregation key) to a
-            count of users.
-        """
-
-        if not users:
-            return {}
-
-        events_sql, args = make_in_list_sql_clause(
-            self.database_engine, "relates_to_id", event_ids
-        )
-
-        users_sql, users_args = make_in_list_sql_clause(
-            self.database_engine, "annotation.sender", users
-        )
-        args.extend(users_args)
-        args.append(RelationTypes.ANNOTATION)
-
-        sql = f"""
-            SELECT
-                relates_to_id,
-                annotation.type,
-                aggregation_key,
-                COUNT(DISTINCT annotation.sender)
-            FROM events AS annotation
-            INNER JOIN event_relations USING (event_id)
-            INNER JOIN events AS parent ON
-                parent.event_id = relates_to_id
-                AND parent.room_id = annotation.room_id
-            WHERE {events_sql} AND {users_sql} AND relation_type = ?
-            GROUP BY relates_to_id, annotation.type, aggregation_key
-            ORDER BY relates_to_id, COUNT(*) DESC
-        """
-
-        def _get_aggregation_groups_for_users_txn(
-            txn: LoggingTransaction,
-        ) -> Dict[str, Dict[Tuple[str, str], int]]:
-            txn.execute(sql, args)
-
-            result: Dict[str, Dict[Tuple[str, str], int]] = {}
-            for event_id, type, key, count in cast(
-                List[Tuple[str, str, str, int]], txn
-            ):
-                result.setdefault(event_id, {})[(type, key)] = count
-
-            return result
-
-        return await self.db_pool.runInteraction(
-            "get_aggregation_groups_for_users", _get_aggregation_groups_for_users_txn
-        )
-
     @cached()
     async def get_references_for_event(self, event_id: str) -> List[JsonDict]:
         raise NotImplementedError()
diff --git a/tests/rest/client/test_relations.py b/tests/rest/client/test_relations.py
index c8a6911d5e..a8a0a16141 100644
--- a/tests/rest/client/test_relations.py
+++ b/tests/rest/client/test_relations.py
@@ -1080,48 +1080,6 @@ class BundledAggregationsTestCase(BaseRelationsTestCase):
         ]
         assert_bundle(self._find_event_in_chunk(chunk))
 
-    def test_annotation(self) -> None:
-        """
-        Test that annotations get correctly bundled.
-        """
-        # Setup by sending a variety of relations.
-        self._send_relation(RelationTypes.ANNOTATION, "m.reaction", "a")
-        self._send_relation(
-            RelationTypes.ANNOTATION, "m.reaction", "a", access_token=self.user2_token
-        )
-        self._send_relation(RelationTypes.ANNOTATION, "m.reaction", "b")
-
-        def assert_annotations(bundled_aggregations: JsonDict) -> None:
-            self.assertEqual(
-                {
-                    "chunk": [
-                        {"type": "m.reaction", "key": "a", "count": 2},
-                        {"type": "m.reaction", "key": "b", "count": 1},
-                    ]
-                },
-                bundled_aggregations,
-            )
-
-        self._test_bundled_aggregations(RelationTypes.ANNOTATION, assert_annotations, 7)
-
-    def test_annotation_to_annotation(self) -> None:
-        """Any relation to an annotation should be ignored."""
-        channel = self._send_relation(RelationTypes.ANNOTATION, "m.reaction", "a")
-        event_id = channel.json_body["event_id"]
-        self._send_relation(
-            RelationTypes.ANNOTATION, "m.reaction", "b", parent_id=event_id
-        )
-
-        # Fetch the initial annotation event to see if it has bundled aggregations.
-        channel = self.make_request(
-            "GET",
-            f"/_matrix/client/v3/rooms/{self.room}/event/{event_id}",
-            access_token=self.user_token,
-        )
-        self.assertEquals(200, channel.code, channel.json_body)
-        # The first annotationt should not have any bundled aggregations.
-        self.assertNotIn("m.relations", channel.json_body["unsigned"])
-
     def test_reference(self) -> None:
         """
         Test that references get correctly bundled.
@@ -1138,7 +1096,7 @@ class BundledAggregationsTestCase(BaseRelationsTestCase):
                 bundled_aggregations,
             )
 
-        self._test_bundled_aggregations(RelationTypes.REFERENCE, assert_annotations, 7)
+        self._test_bundled_aggregations(RelationTypes.REFERENCE, assert_annotations, 6)
 
     def test_thread(self) -> None:
         """
@@ -1183,7 +1141,7 @@ class BundledAggregationsTestCase(BaseRelationsTestCase):
 
         # The "user" sent the root event and is making queries for the bundled
         # aggregations: they have participated.
-        self._test_bundled_aggregations(RelationTypes.THREAD, _gen_assert(True), 7)
+        self._test_bundled_aggregations(RelationTypes.THREAD, _gen_assert(True), 6)
         # The "user2" sent replies in the thread and is making queries for the
         # bundled aggregations: they have participated.
         #
@@ -1208,9 +1166,10 @@ class BundledAggregationsTestCase(BaseRelationsTestCase):
         channel = self._send_relation(RelationTypes.THREAD, "m.room.test")
         thread_2 = channel.json_body["event_id"]
 
-        self._send_relation(
-            RelationTypes.ANNOTATION, "m.reaction", "a", parent_id=thread_2
+        channel = self._send_relation(
+            RelationTypes.REFERENCE, "org.matrix.test", parent_id=thread_2
         )
+        reference_event_id = channel.json_body["event_id"]
 
         def assert_thread(bundled_aggregations: JsonDict) -> None:
             self.assertEqual(2, bundled_aggregations.get("count"))
@@ -1235,17 +1194,15 @@ class BundledAggregationsTestCase(BaseRelationsTestCase):
             self.assert_dict(
                 {
                     "m.relations": {
-                        RelationTypes.ANNOTATION: {
-                            "chunk": [
-                                {"type": "m.reaction", "key": "a", "count": 1},
-                            ]
+                        RelationTypes.REFERENCE: {
+                            "chunk": [{"event_id": reference_event_id}]
                         },
                     }
                 },
                 bundled_aggregations["latest_event"].get("unsigned"),
             )
 
-        self._test_bundled_aggregations(RelationTypes.THREAD, assert_thread, 7)
+        self._test_bundled_aggregations(RelationTypes.THREAD, assert_thread, 6)
 
     def test_nested_thread(self) -> None:
         """
@@ -1363,10 +1320,11 @@ class BundledAggregationsTestCase(BaseRelationsTestCase):
         channel = self._send_relation(RelationTypes.THREAD, "m.room.test")
         thread_id = channel.json_body["event_id"]
 
-        # Annotate the thread.
-        self._send_relation(
-            RelationTypes.ANNOTATION, "m.reaction", "a", parent_id=thread_id
+        # Make a reference to the thread.
+        channel = self._send_relation(
+            RelationTypes.REFERENCE, "org.matrix.test", parent_id=thread_id
         )
+        reference_event_id = channel.json_body["event_id"]
 
         channel = self.make_request(
             "GET",
@@ -1377,9 +1335,7 @@ class BundledAggregationsTestCase(BaseRelationsTestCase):
         self.assertEqual(
             channel.json_body["unsigned"].get("m.relations"),
             {
-                RelationTypes.ANNOTATION: {
-                    "chunk": [{"count": 1, "key": "a", "type": "m.reaction"}]
-                },
+                RelationTypes.REFERENCE: {"chunk": [{"event_id": reference_event_id}]},
             },
         )
 
@@ -1396,9 +1352,7 @@ class BundledAggregationsTestCase(BaseRelationsTestCase):
         self.assertEqual(
             thread_message["unsigned"].get("m.relations"),
             {
-                RelationTypes.ANNOTATION: {
-                    "chunk": [{"count": 1, "key": "a", "type": "m.reaction"}]
-                },
+                RelationTypes.REFERENCE: {"chunk": [{"event_id": reference_event_id}]},
             },
         )
 
@@ -1410,7 +1364,8 @@ class BundledAggregationsTestCase(BaseRelationsTestCase):
         Note that the spec allows for a server to return additional fields beyond
         what is specified.
         """
-        self._send_relation(RelationTypes.ANNOTATION, "m.reaction", "a")
+        channel = self._send_relation(RelationTypes.REFERENCE, "org.matrix.test")
+        reference_event_id = channel.json_body["event_id"]
 
         # Note that the sync filter does not include "unsigned" as a field.
         filter = urllib.parse.quote_plus(
@@ -1428,7 +1383,12 @@ class BundledAggregationsTestCase(BaseRelationsTestCase):
 
         # Ensure there's bundled aggregations on it.
         self.assertIn("unsigned", parent_event)
-        self.assertIn("m.relations", parent_event["unsigned"])
+        self.assertEqual(
+            parent_event["unsigned"].get("m.relations"),
+            {
+                RelationTypes.REFERENCE: {"chunk": [{"event_id": reference_event_id}]},
+            },
+        )
 
 
 class RelationIgnoredUserTestCase(BaseRelationsTestCase):
@@ -1475,53 +1435,8 @@ class RelationIgnoredUserTestCase(BaseRelationsTestCase):
 
         return before_aggregations[relation_type], after_aggregations[relation_type]
 
-    def test_annotation(self) -> None:
-        """Annotations should ignore"""
-        # Send 2 from us, 2 from the to be ignored user.
-        allowed_event_ids = []
-        ignored_event_ids = []
-        channel = self._send_relation(RelationTypes.ANNOTATION, "m.reaction", key="a")
-        allowed_event_ids.append(channel.json_body["event_id"])
-        channel = self._send_relation(RelationTypes.ANNOTATION, "m.reaction", key="b")
-        allowed_event_ids.append(channel.json_body["event_id"])
-        channel = self._send_relation(
-            RelationTypes.ANNOTATION,
-            "m.reaction",
-            key="a",
-            access_token=self.user2_token,
-        )
-        ignored_event_ids.append(channel.json_body["event_id"])
-        channel = self._send_relation(
-            RelationTypes.ANNOTATION,
-            "m.reaction",
-            key="c",
-            access_token=self.user2_token,
-        )
-        ignored_event_ids.append(channel.json_body["event_id"])
-
-        before_aggregations, after_aggregations = self._test_ignored_user(
-            RelationTypes.ANNOTATION, allowed_event_ids, ignored_event_ids
-        )
-
-        self.assertCountEqual(
-            before_aggregations["chunk"],
-            [
-                {"type": "m.reaction", "key": "a", "count": 2},
-                {"type": "m.reaction", "key": "b", "count": 1},
-                {"type": "m.reaction", "key": "c", "count": 1},
-            ],
-        )
-
-        self.assertCountEqual(
-            after_aggregations["chunk"],
-            [
-                {"type": "m.reaction", "key": "a", "count": 1},
-                {"type": "m.reaction", "key": "b", "count": 1},
-            ],
-        )
-
     def test_reference(self) -> None:
-        """Annotations should ignore"""
+        """Aggregations should exclude reference relations from ignored users"""
         channel = self._send_relation(RelationTypes.REFERENCE, "m.room.test")
         allowed_event_ids = [channel.json_body["event_id"]]
 
@@ -1544,7 +1459,7 @@ class RelationIgnoredUserTestCase(BaseRelationsTestCase):
         )
 
     def test_thread(self) -> None:
-        """Annotations should ignore"""
+        """Aggregations should exclude thread relations from ignored users"""
         channel = self._send_relation(RelationTypes.THREAD, "m.room.test")
         allowed_event_ids = [channel.json_body["event_id"]]
 
@@ -1618,43 +1533,6 @@ class RelationRedactionTestCase(BaseRelationsTestCase):
             for t in threads
         ]
 
-    def test_redact_relation_annotation(self) -> None:
-        """
-        Test that annotations of an event are properly handled after the
-        annotation is redacted.
-
-        The redacted relation should not be included in bundled aggregations or
-        the response to relations.
-        """
-        channel = self._send_relation(RelationTypes.ANNOTATION, "m.reaction", "a")
-        to_redact_event_id = channel.json_body["event_id"]
-
-        channel = self._send_relation(
-            RelationTypes.ANNOTATION, "m.reaction", "a", access_token=self.user2_token
-        )
-        unredacted_event_id = channel.json_body["event_id"]
-
-        # Both relations should exist.
-        event_ids = self._get_related_events()
-        relations = self._get_bundled_aggregations()
-        self.assertCountEqual(event_ids, [to_redact_event_id, unredacted_event_id])
-        self.assertEquals(
-            relations["m.annotation"],
-            {"chunk": [{"type": "m.reaction", "key": "a", "count": 2}]},
-        )
-
-        # Redact one of the reactions.
-        self._redact(to_redact_event_id)
-
-        # The unredacted relation should still exist.
-        event_ids = self._get_related_events()
-        relations = self._get_bundled_aggregations()
-        self.assertEquals(event_ids, [unredacted_event_id])
-        self.assertEquals(
-            relations["m.annotation"],
-            {"chunk": [{"type": "m.reaction", "key": "a", "count": 1}]},
-        )
-
     def test_redact_relation_thread(self) -> None:
         """
         Test that thread replies are properly handled after the thread reply redacted.
@@ -1775,14 +1653,14 @@ class RelationRedactionTestCase(BaseRelationsTestCase):
         is redacted.
         """
         # Add a relation
-        channel = self._send_relation(RelationTypes.ANNOTATION, "m.reaction", key="👍")
+        channel = self._send_relation(RelationTypes.REFERENCE, "org.matrix.test")
         related_event_id = channel.json_body["event_id"]
 
         # The relations should exist.
         event_ids = self._get_related_events()
         relations = self._get_bundled_aggregations()
         self.assertEqual(len(event_ids), 1)
-        self.assertIn(RelationTypes.ANNOTATION, relations)
+        self.assertIn(RelationTypes.REFERENCE, relations)
 
         # Redact the original event.
         self._redact(self.parent_id)
@@ -1792,8 +1670,8 @@ class RelationRedactionTestCase(BaseRelationsTestCase):
         relations = self._get_bundled_aggregations()
         self.assertEquals(event_ids, [related_event_id])
         self.assertEquals(
-            relations["m.annotation"],
-            {"chunk": [{"type": "m.reaction", "key": "👍", "count": 1}]},
+            relations[RelationTypes.REFERENCE],
+            {"chunk": [{"event_id": related_event_id}]},
         )
 
     def test_redact_parent_thread(self) -> None:
-- 
cgit 1.5.1


From 916b8061d20dc0902b7f2d42d994efc20300e9e7 Mon Sep 17 00:00:00 2001
From: Hugh Nimmo-Smith <hughns@users.noreply.github.com>
Date: Thu, 2 Mar 2023 10:34:59 +0000
Subject: Implementation of MSC3967: Don't require UIA for initial upload of
 cross signing keys (#15077)

---
 changelog.d/15077.feature      |   1 +
 synapse/config/experimental.py |   3 +
 synapse/handlers/e2e_keys.py   |  14 ++++
 synapse/rest/client/keys.py    |  32 +++++++---
 tests/rest/client/test_keys.py | 141 +++++++++++++++++++++++++++++++++++++++++
 5 files changed, 182 insertions(+), 9 deletions(-)
 create mode 100644 changelog.d/15077.feature

(limited to 'tests/rest')

diff --git a/changelog.d/15077.feature b/changelog.d/15077.feature
new file mode 100644
index 0000000000..384e751056
--- /dev/null
+++ b/changelog.d/15077.feature
@@ -0,0 +1 @@
+Experimental support for MSC3967 to not require UIA for setting up cross-signing on first use.
diff --git a/synapse/config/experimental.py b/synapse/config/experimental.py
index bc38fae0b6..7c81f055b6 100644
--- a/synapse/config/experimental.py
+++ b/synapse/config/experimental.py
@@ -194,3 +194,6 @@ class ExperimentalConfig(Config):
         self.msc3966_exact_event_property_contains = experimental.get(
             "msc3966_exact_event_property_contains", False
         )
+
+        # MSC3967: Do not require UIA when first uploading cross signing keys
+        self.msc3967_enabled = experimental.get("msc3967_enabled", False)
diff --git a/synapse/handlers/e2e_keys.py b/synapse/handlers/e2e_keys.py
index 43cbece21b..4e9c8d8db0 100644
--- a/synapse/handlers/e2e_keys.py
+++ b/synapse/handlers/e2e_keys.py
@@ -1301,6 +1301,20 @@ class E2eKeysHandler:
 
         return desired_key_data
 
+    async def is_cross_signing_set_up_for_user(self, user_id: str) -> bool:
+        """Checks if the user has cross-signing set up
+
+        Args:
+            user_id: The user to check
+
+        Returns:
+            True if the user has cross-signing set up, False otherwise
+        """
+        existing_master_key = await self.store.get_e2e_cross_signing_key(
+            user_id, "master"
+        )
+        return existing_master_key is not None
+
 
 def _check_cross_signing_key(
     key: JsonDict, user_id: str, key_type: str, signing_key: Optional[VerifyKey] = None
diff --git a/synapse/rest/client/keys.py b/synapse/rest/client/keys.py
index 7873b363c0..32bb8b9a91 100644
--- a/synapse/rest/client/keys.py
+++ b/synapse/rest/client/keys.py
@@ -312,15 +312,29 @@ class SigningKeyUploadServlet(RestServlet):
         user_id = requester.user.to_string()
         body = parse_json_object_from_request(request)
 
-        await self.auth_handler.validate_user_via_ui_auth(
-            requester,
-            request,
-            body,
-            "add a device signing key to your account",
-            # Allow skipping of UI auth since this is frequently called directly
-            # after login and it is silly to ask users to re-auth immediately.
-            can_skip_ui_auth=True,
-        )
+        if self.hs.config.experimental.msc3967_enabled:
+            if await self.e2e_keys_handler.is_cross_signing_set_up_for_user(user_id):
+                # If we already have a master key then cross signing is set up and we require UIA to reset
+                await self.auth_handler.validate_user_via_ui_auth(
+                    requester,
+                    request,
+                    body,
+                    "reset the device signing key on your account",
+                    # Do not allow skipping of UI auth.
+                    can_skip_ui_auth=False,
+                )
+            # Otherwise we don't require UIA since we are setting up cross signing for the first time
+        else:
+            # Previous behaviour is to always require UIA but allow it to be skipped
+            await self.auth_handler.validate_user_via_ui_auth(
+                requester,
+                request,
+                body,
+                "add a device signing key to your account",
+                # Allow skipping of UI auth since this is frequently called directly
+                # after login and it is silly to ask users to re-auth immediately.
+                can_skip_ui_auth=True,
+            )
 
         result = await self.e2e_keys_handler.upload_signing_keys_for_user(user_id, body)
         return 200, result
diff --git a/tests/rest/client/test_keys.py b/tests/rest/client/test_keys.py
index 741fecea77..8ee5489057 100644
--- a/tests/rest/client/test_keys.py
+++ b/tests/rest/client/test_keys.py
@@ -14,12 +14,21 @@
 
 from http import HTTPStatus
 
+from signedjson.key import (
+    encode_verify_key_base64,
+    generate_signing_key,
+    get_verify_key,
+)
+from signedjson.sign import sign_json
+
 from synapse.api.errors import Codes
 from synapse.rest import admin
 from synapse.rest.client import keys, login
+from synapse.types import JsonDict
 
 from tests import unittest
 from tests.http.server._base import make_request_with_cancellation_test
+from tests.unittest import override_config
 
 
 class KeyQueryTestCase(unittest.HomeserverTestCase):
@@ -118,3 +127,135 @@ class KeyQueryTestCase(unittest.HomeserverTestCase):
 
         self.assertEqual(200, channel.code, msg=channel.result["body"])
         self.assertIn(bob, channel.json_body["device_keys"])
+
+    def make_device_keys(self, user_id: str, device_id: str) -> JsonDict:
+        # We only generate a master key to simplify the test.
+        master_signing_key = generate_signing_key(device_id)
+        master_verify_key = encode_verify_key_base64(get_verify_key(master_signing_key))
+
+        return {
+            "master_key": sign_json(
+                {
+                    "user_id": user_id,
+                    "usage": ["master"],
+                    "keys": {"ed25519:" + master_verify_key: master_verify_key},
+                },
+                user_id,
+                master_signing_key,
+            ),
+        }
+
+    def test_device_signing_with_uia(self) -> None:
+        """Device signing key upload requires UIA."""
+        password = "wonderland"
+        device_id = "ABCDEFGHI"
+        alice_id = self.register_user("alice", password)
+        alice_token = self.login("alice", password, device_id=device_id)
+
+        content = self.make_device_keys(alice_id, device_id)
+
+        channel = self.make_request(
+            "POST",
+            "/_matrix/client/v3/keys/device_signing/upload",
+            content,
+            alice_token,
+        )
+
+        self.assertEqual(channel.code, HTTPStatus.UNAUTHORIZED, channel.result)
+        # Grab the session
+        session = channel.json_body["session"]
+        # Ensure that flows are what is expected.
+        self.assertIn({"stages": ["m.login.password"]}, channel.json_body["flows"])
+
+        # add UI auth
+        content["auth"] = {
+            "type": "m.login.password",
+            "identifier": {"type": "m.id.user", "user": alice_id},
+            "password": password,
+            "session": session,
+        }
+
+        channel = self.make_request(
+            "POST",
+            "/_matrix/client/v3/keys/device_signing/upload",
+            content,
+            alice_token,
+        )
+
+        self.assertEqual(channel.code, HTTPStatus.OK, channel.result)
+
+    @override_config({"ui_auth": {"session_timeout": "15m"}})
+    def test_device_signing_with_uia_session_timeout(self) -> None:
+        """Device signing key upload requires UIA but passes with grace period."""
+        password = "wonderland"
+        device_id = "ABCDEFGHI"
+        alice_id = self.register_user("alice", password)
+        alice_token = self.login("alice", password, device_id=device_id)
+
+        content = self.make_device_keys(alice_id, device_id)
+
+        channel = self.make_request(
+            "POST",
+            "/_matrix/client/v3/keys/device_signing/upload",
+            content,
+            alice_token,
+        )
+
+        self.assertEqual(channel.code, HTTPStatus.OK, channel.result)
+
+    @override_config(
+        {
+            "experimental_features": {"msc3967_enabled": True},
+            "ui_auth": {"session_timeout": "15s"},
+        }
+    )
+    def test_device_signing_with_msc3967(self) -> None:
+        """Device signing key follows MSC3967 behaviour when enabled."""
+        password = "wonderland"
+        device_id = "ABCDEFGHI"
+        alice_id = self.register_user("alice", password)
+        alice_token = self.login("alice", password, device_id=device_id)
+
+        keys1 = self.make_device_keys(alice_id, device_id)
+
+        # Initial request should succeed as no existing keys are present.
+        channel = self.make_request(
+            "POST",
+            "/_matrix/client/v3/keys/device_signing/upload",
+            keys1,
+            alice_token,
+        )
+        self.assertEqual(channel.code, HTTPStatus.OK, channel.result)
+
+        keys2 = self.make_device_keys(alice_id, device_id)
+
+        # Subsequent request should require UIA as keys already exist even though session_timeout is set.
+        channel = self.make_request(
+            "POST",
+            "/_matrix/client/v3/keys/device_signing/upload",
+            keys2,
+            alice_token,
+        )
+        self.assertEqual(channel.code, HTTPStatus.UNAUTHORIZED, channel.result)
+
+        # Grab the session
+        session = channel.json_body["session"]
+        # Ensure that flows are what is expected.
+        self.assertIn({"stages": ["m.login.password"]}, channel.json_body["flows"])
+
+        # add UI auth
+        keys2["auth"] = {
+            "type": "m.login.password",
+            "identifier": {"type": "m.id.user", "user": alice_id},
+            "password": password,
+            "session": session,
+        }
+
+        # Request should complete
+        channel = self.make_request(
+            "POST",
+            "/_matrix/client/v3/keys/device_signing/upload",
+            keys2,
+            alice_token,
+        )
+        self.assertEqual(channel.code, HTTPStatus.OK, channel.result)
-- 
cgit 1.5.1


From 05e0a4089a013979e5d0642f6a0f1d22ad865ee1 Mon Sep 17 00:00:00 2001
From: Patrick Cloke <clokep@users.noreply.github.com>
Date: Mon, 6 Mar 2023 09:43:01 -0500
Subject: Stop applying edits to event contents (MSC3925). (#15193)

Enables MSC3925 support by default, which:

* Includes the full edit event in the bundled aggregations of an
  edited event.
* Stops modifying the original event's content to return the new
  content from the edit event.

This is a backwards-incompatible change that is considered to be
"correct" by the spec.
---
 changelog.d/15193.bugfix            |  1 +
 synapse/config/experimental.py      |  3 --
 synapse/events/utils.py             | 57 ++---------------------------------
 synapse/rest/client/room.py         |  2 +-
 synapse/server.py                   |  2 +-
 tests/rest/client/test_relations.py | 59 +++++++------------------------------
 6 files changed, 15 insertions(+), 109 deletions(-)
 create mode 100644 changelog.d/15193.bugfix

(limited to 'tests/rest')

diff --git a/changelog.d/15193.bugfix b/changelog.d/15193.bugfix
new file mode 100644
index 0000000000..ca781e9631
--- /dev/null
+++ b/changelog.d/15193.bugfix
@@ -0,0 +1 @@
+Stop applying edits when bundling aggregations, per [MSC3925](https://github.com/matrix-org/matrix-spec-proposals/pull/3925).
diff --git a/synapse/config/experimental.py b/synapse/config/experimental.py
index 9c58cee2c8..489f2601ac 100644
--- a/synapse/config/experimental.py
+++ b/synapse/config/experimental.py
@@ -166,9 +166,6 @@ class ExperimentalConfig(Config):
         # MSC3391: Removing account data.
         self.msc3391_enabled = experimental.get("msc3391_enabled", False)
 
-        # MSC3925: do not replace events with their edits
-        self.msc3925_inhibit_edit = experimental.get("msc3925_inhibit_edit", False)
-
         # MSC3873: Disambiguate event_match keys.
         self.msc3873_escape_event_match_key = experimental.get(
             "msc3873_escape_event_match_key", False
diff --git a/synapse/events/utils.py b/synapse/events/utils.py
index eaa6cad4af..45f46949a1 100644
--- a/synapse/events/utils.py
+++ b/synapse/events/utils.py
@@ -39,7 +39,6 @@ from synapse.api.constants import (
 from synapse.api.errors import Codes, SynapseError
 from synapse.api.room_versions import RoomVersion
 from synapse.types import JsonDict
-from synapse.util.frozenutils import unfreeze
 
 from . import EventBase
 
@@ -403,14 +402,6 @@ class EventClientSerializer:
     clients.
     """
 
-    def __init__(self, inhibit_replacement_via_edits: bool = False):
-        """
-        Args:
-            inhibit_replacement_via_edits: If this is set to True, then events are
-               never replaced by their edits.
-        """
-        self._inhibit_replacement_via_edits = inhibit_replacement_via_edits
-
     def serialize_event(
         self,
         event: Union[JsonDict, EventBase],
@@ -418,7 +409,6 @@ class EventClientSerializer:
         *,
         config: SerializeEventConfig = _DEFAULT_SERIALIZE_EVENT_CONFIG,
         bundle_aggregations: Optional[Dict[str, "BundledAggregations"]] = None,
-        apply_edits: bool = True,
     ) -> JsonDict:
         """Serializes a single event.
 
@@ -428,10 +418,7 @@ class EventClientSerializer:
             config: Event serialization config
             bundle_aggregations: A map from event_id to the aggregations to be bundled
                into the event.
-            apply_edits: Whether the content of the event should be modified to reflect
-               any replacement in `bundle_aggregations[<event_id>].replace`.
-               See also the `inhibit_replacement_via_edits` constructor arg: if that is
-               set to True, then this argument is ignored.
+
         Returns:
             The serialized event
         """
@@ -450,38 +437,10 @@ class EventClientSerializer:
                     config,
                     bundle_aggregations,
                     serialized_event,
-                    apply_edits=apply_edits,
                 )
 
         return serialized_event
 
-    def _apply_edit(
-        self, orig_event: EventBase, serialized_event: JsonDict, edit: EventBase
-    ) -> None:
-        """Replace the content, preserving existing relations of the serialized event.
-
-        Args:
-            orig_event: The original event.
-            serialized_event: The original event, serialized. This is modified.
-            edit: The event which edits the above.
-        """
-
-        # Ensure we take copies of the edit content, otherwise we risk modifying
-        # the original event.
-        edit_content = edit.content.copy()
-
-        # Unfreeze the event content if necessary, so that we may modify it below
-        edit_content = unfreeze(edit_content)
-        serialized_event["content"] = edit_content.get("m.new_content", {})
-
-        # Check for existing relations
-        relates_to = orig_event.content.get("m.relates_to")
-        if relates_to:
-            # Keep the relations, ensuring we use a dict copy of the original
-            serialized_event["content"]["m.relates_to"] = relates_to.copy()
-        else:
-            serialized_event["content"].pop("m.relates_to", None)
-
     def _inject_bundled_aggregations(
         self,
         event: EventBase,
@@ -489,7 +448,6 @@ class EventClientSerializer:
         config: SerializeEventConfig,
         bundled_aggregations: Dict[str, "BundledAggregations"],
         serialized_event: JsonDict,
-        apply_edits: bool,
     ) -> None:
         """Potentially injects bundled aggregations into the unsigned portion of the serialized event.
 
@@ -504,9 +462,6 @@ class EventClientSerializer:
                 While serializing the bundled aggregations this map may be searched
                 again for additional events in a recursive manner.
             serialized_event: The serialized event which may be modified.
-            apply_edits: Whether the content of the event should be modified to reflect
-               any replacement in `aggregations.replace` (subject to the
-               `inhibit_replacement_via_edits` constructor arg).
         """
 
         # We have already checked that aggregations exist for this event.
@@ -522,11 +477,6 @@ class EventClientSerializer:
             ] = event_aggregations.references
 
         if event_aggregations.replace:
-            # If there is an edit, optionally apply it to the event.
-            edit = event_aggregations.replace
-            if apply_edits and not self._inhibit_replacement_via_edits:
-                self._apply_edit(event, serialized_event, edit)
-
             # Include information about it in the relations dict.
             #
             # Matrix spec v1.5 (https://spec.matrix.org/v1.5/client-server-api/#server-side-aggregation-of-mreplace-relationships)
@@ -534,10 +484,7 @@ class EventClientSerializer:
             # `sender` of the edit; however MSC3925 proposes extending it to the whole
             # of the edit, which is what we do here.
             serialized_aggregations[RelationTypes.REPLACE] = self.serialize_event(
-                edit,
-                time_now,
-                config=config,
-                apply_edits=False,
+                event_aggregations.replace, time_now, config=config
             )
 
         # Include any threaded replies to this event.
diff --git a/synapse/rest/client/room.py b/synapse/rest/client/room.py
index 45aee3d3fe..c5af07816a 100644
--- a/synapse/rest/client/room.py
+++ b/synapse/rest/client/room.py
@@ -818,7 +818,7 @@ class RoomEventServlet(RestServlet):
             # per MSC2676, /rooms/{roomId}/event/{eventId}, should return the
             # *original* event, rather than the edited version
             event_dict = self._event_serializer.serialize_event(
-                event, time_now, bundle_aggregations=aggregations, apply_edits=False
+                event, time_now, bundle_aggregations=aggregations
             )
             return 200, event_dict
 
diff --git a/synapse/server.py b/synapse/server.py
index a7c32e9a60..df80fc1beb 100644
--- a/synapse/server.py
+++ b/synapse/server.py
@@ -743,7 +743,7 @@ class HomeServer(metaclass=abc.ABCMeta):
 
     @cache_in_self
     def get_event_client_serializer(self) -> EventClientSerializer:
-        return EventClientSerializer(self.config.experimental.msc3925_inhibit_edit)
+        return EventClientSerializer()
 
     @cache_in_self
     def get_password_policy_handler(self) -> PasswordPolicyHandler:
diff --git a/tests/rest/client/test_relations.py b/tests/rest/client/test_relations.py
index a8a0a16141..fbbbcb23f1 100644
--- a/tests/rest/client/test_relations.py
+++ b/tests/rest/client/test_relations.py
@@ -30,7 +30,6 @@ from tests import unittest
 from tests.server import FakeChannel
 from tests.test_utils import make_awaitable
 from tests.test_utils.event_injection import inject_event
-from tests.unittest import override_config
 
 
 class BaseRelationsTestCase(unittest.HomeserverTestCase):
@@ -403,7 +402,7 @@ class RelationsTestCase(BaseRelationsTestCase):
 
     def test_edit(self) -> None:
         """Test that a simple edit works."""
-
+        orig_body = {"body": "Hi!", "msgtype": "m.text"}
         new_body = {"msgtype": "m.text", "body": "I've been edited!"}
         edit_event_content = {
             "msgtype": "m.text",
@@ -424,9 +423,7 @@ class RelationsTestCase(BaseRelationsTestCase):
             access_token=self.user_token,
         )
         self.assertEqual(200, channel.code, channel.json_body)
-        self.assertEqual(
-            channel.json_body["content"], {"body": "Hi!", "msgtype": "m.text"}
-        )
+        self.assertEqual(channel.json_body["content"], orig_body)
         self._assert_edit_bundle(channel.json_body, edit_event_id, edit_event_content)
 
         # Request the room messages.
@@ -443,7 +440,7 @@ class RelationsTestCase(BaseRelationsTestCase):
         )
 
         # Request the room context.
-        # /context should return the edited event.
+        # /context should return the event.
         channel = self.make_request(
             "GET",
             f"/rooms/{self.room}/context/{self.parent_id}",
@@ -453,7 +450,7 @@ class RelationsTestCase(BaseRelationsTestCase):
         self._assert_edit_bundle(
             channel.json_body["event"], edit_event_id, edit_event_content
         )
-        self.assertEqual(channel.json_body["event"]["content"], new_body)
+        self.assertEqual(channel.json_body["event"]["content"], orig_body)
 
         # Request sync, but limit the timeline so it becomes limited (and includes
         # bundled aggregations).
@@ -491,45 +488,11 @@ class RelationsTestCase(BaseRelationsTestCase):
             edit_event_content,
         )
 
-    @override_config({"experimental_features": {"msc3925_inhibit_edit": True}})
-    def test_edit_inhibit_replace(self) -> None:
-        """
-        If msc3925_inhibit_edit is enabled, then the original event should not be
-        replaced.
-        """
-
-        new_body = {"msgtype": "m.text", "body": "I've been edited!"}
-        edit_event_content = {
-            "msgtype": "m.text",
-            "body": "foo",
-            "m.new_content": new_body,
-        }
-        channel = self._send_relation(
-            RelationTypes.REPLACE,
-            "m.room.message",
-            content=edit_event_content,
-        )
-        edit_event_id = channel.json_body["event_id"]
-
-        # /context should return the *original* event.
-        channel = self.make_request(
-            "GET",
-            f"/rooms/{self.room}/context/{self.parent_id}",
-            access_token=self.user_token,
-        )
-        self.assertEqual(200, channel.code, channel.json_body)
-        self.assertEqual(
-            channel.json_body["event"]["content"], {"body": "Hi!", "msgtype": "m.text"}
-        )
-        self._assert_edit_bundle(
-            channel.json_body["event"], edit_event_id, edit_event_content
-        )
-
     def test_multi_edit(self) -> None:
         """Test that multiple edits, including attempts by people who
         shouldn't be allowed, are correctly handled.
         """
-
+        orig_body = orig_body = {"body": "Hi!", "msgtype": "m.text"}
         self._send_relation(
             RelationTypes.REPLACE,
             "m.room.message",
@@ -570,7 +533,7 @@ class RelationsTestCase(BaseRelationsTestCase):
         )
         self.assertEqual(200, channel.code, channel.json_body)
 
-        self.assertEqual(channel.json_body["event"]["content"], new_body)
+        self.assertEqual(channel.json_body["event"]["content"], orig_body)
         self._assert_edit_bundle(
             channel.json_body["event"], edit_event_id, edit_event_content
         )
@@ -642,6 +605,7 @@ class RelationsTestCase(BaseRelationsTestCase):
 
     def test_edit_edit(self) -> None:
         """Test that an edit cannot be edited."""
+        orig_body = {"body": "Hi!", "msgtype": "m.text"}
         new_body = {"msgtype": "m.text", "body": "Initial edit"}
         edit_event_content = {
             "msgtype": "m.text",
@@ -675,14 +639,12 @@ class RelationsTestCase(BaseRelationsTestCase):
             access_token=self.user_token,
         )
         self.assertEqual(200, channel.code, channel.json_body)
-        self.assertEqual(
-            channel.json_body["content"], {"body": "Hi!", "msgtype": "m.text"}
-        )
+        self.assertEqual(channel.json_body["content"], orig_body)
 
         # The relations information should not include the edit to the edit.
         self._assert_edit_bundle(channel.json_body, edit_event_id, edit_event_content)
 
-        # /context should return the event updated for the *first* edit
+        # /context should return the bundled edit for the *first* edit
         # (The edit to the edit should be ignored.)
         channel = self.make_request(
             "GET",
@@ -690,7 +652,7 @@ class RelationsTestCase(BaseRelationsTestCase):
             access_token=self.user_token,
         )
         self.assertEqual(200, channel.code, channel.json_body)
-        self.assertEqual(channel.json_body["event"]["content"], new_body)
+        self.assertEqual(channel.json_body["event"]["content"], orig_body)
         self._assert_edit_bundle(
             channel.json_body["event"], edit_event_id, edit_event_content
         )
@@ -1287,7 +1249,6 @@ class BundledAggregationsTestCase(BaseRelationsTestCase):
         thread_summary = relations_dict[RelationTypes.THREAD]
         self.assertIn("latest_event", thread_summary)
         latest_event_in_thread = thread_summary["latest_event"]
-        self.assertEqual(latest_event_in_thread["content"]["body"], "I've been edited!")
         # The latest event in the thread should have the edit appear under the
         # bundled aggregations.
         self.assertDictContainsSubset(
-- 
cgit 1.5.1