diff --git a/changelog.d/10911.bugfix b/changelog.d/10911.bugfix
new file mode 100644
index 0000000000..96e36bb15a
--- /dev/null
+++ b/changelog.d/10911.bugfix
@@ -0,0 +1 @@
+Avoid storing URL cache files in storage providers. Server admins may safely delete the `url_cache/` and `url_cache_thumbnails/` directories from any configured storage providers to reclaim space.
diff --git a/docs/upgrade.md b/docs/upgrade.md
index f9b832cb3f..a8221372df 100644
--- a/docs/upgrade.md
+++ b/docs/upgrade.md
@@ -85,6 +85,13 @@ process, for example:
dpkg -i matrix-synapse-py3_1.3.0+stretch1_amd64.deb
```
+# Upgrading to v1.44.0
+
+## The URL preview cache is no longer mirrored to storage providers
+The `url_cache/` and `url_cache_thumbnails/` directories in the media store are
+no longer mirrored to storage providers. These two directories can be safely
+deleted from any configured storage providers to reclaim space.
+
# Upgrading to v1.43.0
## The spaces summary APIs can now be handled by workers
diff --git a/synapse/rest/media/v1/filepath.py b/synapse/rest/media/v1/filepath.py
index 39bbe4e874..08bd85f664 100644
--- a/synapse/rest/media/v1/filepath.py
+++ b/synapse/rest/media/v1/filepath.py
@@ -195,23 +195,24 @@ class MediaFilePaths:
url_cache_thumbnail = _wrap_in_base_path(url_cache_thumbnail_rel)
- def url_cache_thumbnail_directory(self, media_id: str) -> str:
+ def url_cache_thumbnail_directory_rel(self, media_id: str) -> str:
# Media id is of the form <DATE><RANDOM_STRING>
# E.g.: 2017-09-28-fsdRDt24DS234dsf
if NEW_FORMAT_ID_RE.match(media_id):
- return os.path.join(
- self.base_path, "url_cache_thumbnails", media_id[:10], media_id[11:]
- )
+ return os.path.join("url_cache_thumbnails", media_id[:10], media_id[11:])
else:
return os.path.join(
- self.base_path,
"url_cache_thumbnails",
media_id[0:2],
media_id[2:4],
media_id[4:],
)
+ url_cache_thumbnail_directory = _wrap_in_base_path(
+ url_cache_thumbnail_directory_rel
+ )
+
def url_cache_thumbnail_dirs_to_delete(self, media_id: str) -> List[str]:
"The dirs to try and remove if we delete the media_id thumbnails"
# Media id is of the form <DATE><RANDOM_STRING>
diff --git a/synapse/rest/media/v1/preview_url_resource.py b/synapse/rest/media/v1/preview_url_resource.py
index 0b0c4d6469..79a42b2455 100644
--- a/synapse/rest/media/v1/preview_url_resource.py
+++ b/synapse/rest/media/v1/preview_url_resource.py
@@ -485,7 +485,6 @@ class PreviewUrlResource(DirectServeJsonResource):
async def _expire_url_cache_data(self) -> None:
"""Clean up expired url cache content, media and thumbnails."""
- # TODO: Delete from backup media store
assert self._worker_run_media_background_jobs
diff --git a/synapse/rest/media/v1/storage_provider.py b/synapse/rest/media/v1/storage_provider.py
index da78fcee5e..18bf977d3d 100644
--- a/synapse/rest/media/v1/storage_provider.py
+++ b/synapse/rest/media/v1/storage_provider.py
@@ -93,6 +93,11 @@ class StorageProviderWrapper(StorageProvider):
if file_info.server_name and not self.store_remote:
return None
+ if file_info.url_cache:
+ # The URL preview cache is short lived and not worth offloading or
+ # backing up.
+ return None
+
if self.store_synchronous:
# store_file is supposed to return an Awaitable, but guard
# against improper implementations.
@@ -110,6 +115,11 @@ class StorageProviderWrapper(StorageProvider):
run_in_background(store)
async def fetch(self, path: str, file_info: FileInfo) -> Optional[Responder]:
+ if file_info.url_cache:
+ # Files in the URL preview cache definitely aren't stored here,
+ # so avoid any potentially slow I/O or network access.
+ return None
+
# store_file is supposed to return an Awaitable, but guard
# against improper implementations.
return await maybe_awaitable(self.backend.fetch(path, file_info))
diff --git a/tests/rest/media/v1/test_url_preview.py b/tests/rest/media/v1/test_url_preview.py
index d83dfacfed..4d09b5d07e 100644
--- a/tests/rest/media/v1/test_url_preview.py
+++ b/tests/rest/media/v1/test_url_preview.py
@@ -21,6 +21,7 @@ from twisted.internet.error import DNSLookupError
from twisted.test.proto_helpers import AccumulatingProtocol
from synapse.config.oembed import OEmbedEndpointConfig
+from synapse.util.stringutils import parse_and_validate_mxc_uri
from tests import unittest
from tests.server import FakeTransport
@@ -721,3 +722,132 @@ class URLPreviewTests(unittest.HomeserverTestCase):
"og:description": "Content Preview",
},
)
+
+ def _download_image(self):
+ """Downloads an image into the URL cache.
+
+ Returns:
+ A (host, media_id) tuple representing the MXC URI of the image.
+ """
+ self.lookups["cdn.twitter.com"] = [(IPv4Address, "10.1.2.3")]
+
+ channel = self.make_request(
+ "GET",
+ "preview_url?url=http://cdn.twitter.com/matrixdotorg",
+ shorthand=False,
+ await_result=False,
+ )
+ self.pump()
+
+ client = self.reactor.tcpClients[0][2].buildProtocol(None)
+ server = AccumulatingProtocol()
+ server.makeConnection(FakeTransport(client, self.reactor))
+ client.makeConnection(FakeTransport(server, self.reactor))
+ client.dataReceived(
+ b"HTTP/1.0 200 OK\r\nContent-Length: %d\r\nContent-Type: image/png\r\n\r\n"
+ % (len(SMALL_PNG),)
+ + SMALL_PNG
+ )
+
+ self.pump()
+ self.assertEqual(channel.code, 200)
+ body = channel.json_body
+ mxc_uri = body["og:image"]
+ host, _port, media_id = parse_and_validate_mxc_uri(mxc_uri)
+ self.assertIsNone(_port)
+ return host, media_id
+
+ def test_storage_providers_exclude_files(self):
+ """Test that files are not stored in or fetched from storage providers."""
+ host, media_id = self._download_image()
+
+ rel_file_path = self.preview_url.filepaths.url_cache_filepath_rel(media_id)
+ media_store_path = os.path.join(self.media_store_path, rel_file_path)
+ storage_provider_path = os.path.join(self.storage_path, rel_file_path)
+
+ # Check storage
+ self.assertTrue(os.path.isfile(media_store_path))
+ self.assertFalse(
+ os.path.isfile(storage_provider_path),
+ "URL cache file was unexpectedly stored in a storage provider",
+ )
+
+ # Check fetching
+ channel = self.make_request(
+ "GET",
+ f"download/{host}/{media_id}",
+ shorthand=False,
+ await_result=False,
+ )
+ self.pump()
+ self.assertEqual(channel.code, 200)
+
+ # Move cached file into the storage provider
+ os.makedirs(os.path.dirname(storage_provider_path), exist_ok=True)
+ os.rename(media_store_path, storage_provider_path)
+
+ channel = self.make_request(
+ "GET",
+ f"download/{host}/{media_id}",
+ shorthand=False,
+ await_result=False,
+ )
+ self.pump()
+ self.assertEqual(
+ channel.code,
+ 404,
+ "URL cache file was unexpectedly retrieved from a storage provider",
+ )
+
+ def test_storage_providers_exclude_thumbnails(self):
+ """Test that thumbnails are not stored in or fetched from storage providers."""
+ host, media_id = self._download_image()
+
+ rel_thumbnail_path = (
+ self.preview_url.filepaths.url_cache_thumbnail_directory_rel(media_id)
+ )
+ media_store_thumbnail_path = os.path.join(
+ self.media_store_path, rel_thumbnail_path
+ )
+ storage_provider_thumbnail_path = os.path.join(
+ self.storage_path, rel_thumbnail_path
+ )
+
+ # Check storage
+ self.assertTrue(os.path.isdir(media_store_thumbnail_path))
+ self.assertFalse(
+ os.path.isdir(storage_provider_thumbnail_path),
+ "URL cache thumbnails were unexpectedly stored in a storage provider",
+ )
+
+ # Check fetching
+ channel = self.make_request(
+ "GET",
+ f"thumbnail/{host}/{media_id}?width=32&height=32&method=scale",
+ shorthand=False,
+ await_result=False,
+ )
+ self.pump()
+ self.assertEqual(channel.code, 200)
+
+ # Remove the original, otherwise thumbnails will regenerate
+ rel_file_path = self.preview_url.filepaths.url_cache_filepath_rel(media_id)
+ media_store_path = os.path.join(self.media_store_path, rel_file_path)
+ os.remove(media_store_path)
+
+ # Move cached thumbnails into the storage provider
+ os.makedirs(os.path.dirname(storage_provider_thumbnail_path), exist_ok=True)
+ os.rename(media_store_thumbnail_path, storage_provider_thumbnail_path)
+
+ channel = self.make_request(
+ "GET",
+ f"thumbnail/{host}/{media_id}?width=32&height=32&method=scale",
+ shorthand=False,
+ await_result=False,
+ )
+ self.pump()
+ self.assertEqual(
+ channel.code,
+ 404,
+ "URL cache thumbnail was unexpectedly retrieved from a storage provider",
+ )
|