From d17295e5c3de642ba2c4e47f1bb2be7b2e4c9c06 Mon Sep 17 00:00:00 2001 From: Will Hunt Date: Thu, 27 Mar 2025 17:26:34 +0000 Subject: Store hashes of media files, and allow quarantining by hash. (#18277) This PR makes a few radical changes to media. This now stores the SHA256 hash of each file stored in the database (excluding thumbnails, more on that later). If a set of media is quarantined, any additional uploads of the same file contents or any other files with the same hash will be quarantined at the same time. Currently this does NOT: - De-duplicate media, although a future extension could be to do that. - Run any background jobs to identify the hashes of older files. This could also be a future extension, though the value of doing so is limited to combat the abuse of recent media. - Hash thumbnails. It's assumed that thumbnails are parented to some form of media, so you'd likely be wanting to quarantine the media and the thumbnail at the same time. --- tests/rest/admin/test_admin.py | 21 +++++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) (limited to 'tests/rest/admin/test_admin.py') diff --git a/tests/rest/admin/test_admin.py b/tests/rest/admin/test_admin.py index 5483f8f37f..fc2a6c569b 100644 --- a/tests/rest/admin/test_admin.py +++ b/tests/rest/admin/test_admin.py @@ -20,7 +20,7 @@ # import urllib.parse -from typing import Dict +from typing import Dict, cast from parameterized import parameterized @@ -32,6 +32,7 @@ from synapse.http.server import JsonResource from synapse.rest.admin import VersionServlet from synapse.rest.client import login, media, room from synapse.server import HomeServer +from synapse.types import UserID from synapse.util import Clock from tests import unittest @@ -227,10 +228,25 @@ class QuarantineMediaTestCase(unittest.HomeserverTestCase): # Upload some media response_1 = self.helper.upload_media(SMALL_PNG, tok=non_admin_user_tok) response_2 = self.helper.upload_media(SMALL_PNG, tok=non_admin_user_tok) + response_3 = self.helper.upload_media(SMALL_PNG, tok=non_admin_user_tok) # Extract media IDs server_and_media_id_1 = response_1["content_uri"][6:] server_and_media_id_2 = response_2["content_uri"][6:] + server_and_media_id_3 = response_3["content_uri"][6:] + + # Remove the hash from the media to simulate historic media. + self.get_success( + self.hs.get_datastores().main.update_local_media( + media_id=server_and_media_id_3.split("/")[1], + media_type="image/png", + upload_name=None, + media_length=123, + user_id=UserID.from_string(non_admin_user), + # Hack to force some media to have no hash. + sha256=cast(str, None), + ) + ) # Quarantine all media by this user url = "/_synapse/admin/v1/user/%s/media/quarantine" % urllib.parse.quote( @@ -244,12 +260,13 @@ class QuarantineMediaTestCase(unittest.HomeserverTestCase): self.pump(1.0) self.assertEqual(200, channel.code, msg=channel.json_body) self.assertEqual( - channel.json_body, {"num_quarantined": 2}, "Expected 2 quarantined items" + channel.json_body, {"num_quarantined": 3}, "Expected 3 quarantined items" ) # Attempt to access each piece of media self._ensure_quarantined(admin_user_tok, server_and_media_id_1) self._ensure_quarantined(admin_user_tok, server_and_media_id_2) + self._ensure_quarantined(admin_user_tok, server_and_media_id_3) def test_cannot_quarantine_safe_media(self) -> None: self.register_user("user_admin", "pass", admin=True) -- cgit 1.5.1