From 9ccb4226ba0824a1468c4e8f0abe91aca3381862 Mon Sep 17 00:00:00 2001 From: Erik Johnston Date: Thu, 28 Sep 2017 12:18:06 +0100 Subject: Delete expired url cache data --- synapse/rest/media/v1/filepath.py | 43 ++++++++++++- synapse/rest/media/v1/preview_url_resource.py | 90 ++++++++++++++++++++++++++- 2 files changed, 129 insertions(+), 4 deletions(-) (limited to 'synapse/rest/media') diff --git a/synapse/rest/media/v1/filepath.py b/synapse/rest/media/v1/filepath.py index d92b7ff337..c5d43209f9 100644 --- a/synapse/rest/media/v1/filepath.py +++ b/synapse/rest/media/v1/filepath.py @@ -73,19 +73,58 @@ class MediaFilePaths(object): ) def url_cache_filepath(self, media_id): + # Media id is of the form + # E.g.: 2017-09-28-fsdRDt24DS234dsf return os.path.join( self.base_path, "url_cache", - media_id[0:2], media_id[2:4], media_id[4:] + media_id[:10], media_id[11:] ) + def url_cache_filepath_dirs_to_delete(self, media_id): + "The dirs to try and remove if we delete the media_id file" + return [ + os.path.join( + self.base_path, "url_cache", + media_id[:10], + ), + ] + def url_cache_thumbnail(self, media_id, width, height, content_type, method): + # Media id is of the form + # E.g.: 2017-09-28-fsdRDt24DS234dsf + top_level_type, sub_type = content_type.split("/") file_name = "%i-%i-%s-%s-%s" % ( width, height, top_level_type, sub_type, method ) + return os.path.join( self.base_path, "url_cache_thumbnails", - media_id[0:2], media_id[2:4], media_id[4:], + media_id[:10], media_id[11:], file_name ) + + def url_cache_thumbnail_directory(self, media_id): + # Media id is of the form + # E.g.: 2017-09-28-fsdRDt24DS234dsf + + return os.path.join( + self.base_path, "url_cache_thumbnails", + media_id[:10], media_id[11:], + ) + + def url_cache_thumbnail_dirs_to_delete(self, media_id): + "The dirs to try and remove if we delete the media_id thumbnails" + # Media id is of the form + # E.g.: 2017-09-28-fsdRDt24DS234dsf + return [ + os.path.join( + self.base_path, "url_cache_thumbnails", + media_id[:10], media_id[11:], + ), + os.path.join( + self.base_path, "url_cache_thumbnails", + media_id[:10], + ), + ] diff --git a/synapse/rest/media/v1/preview_url_resource.py b/synapse/rest/media/v1/preview_url_resource.py index b81a336c5d..c5ba83ddfd 100644 --- a/synapse/rest/media/v1/preview_url_resource.py +++ b/synapse/rest/media/v1/preview_url_resource.py @@ -36,6 +36,9 @@ import cgi import ujson as json import urlparse import itertools +import datetime +import errno +import shutil import logging logger = logging.getLogger(__name__) @@ -70,6 +73,10 @@ class PreviewUrlResource(Resource): self.downloads = {} + self._cleaner_loop = self.clock.looping_call( + self._expire_url_cache_data, 30 * 10000 + ) + def render_GET(self, request): self._async_render_GET(request) return NOT_DONE_YET @@ -253,8 +260,7 @@ class PreviewUrlResource(Resource): # we're most likely being explicitly triggered by a human rather than a # bot, so are we really a robot? - # XXX: horrible duplication with base_resource's _download_remote_file() - file_id = random_string(24) + file_id = datetime.date.today().isoformat() + '_' + random_string(16) fname = self.filepaths.url_cache_filepath(file_id) self.media_repo._makedirs(fname) @@ -328,6 +334,86 @@ class PreviewUrlResource(Resource): "etag": headers["ETag"][0] if "ETag" in headers else None, }) + @defer.inlineCallbacks + def _expire_url_cache_data(self): + """Clean up expired url cache content, media and thumbnails. + """ + now = self.clock.time_msec() + + # First we delete expired url cache entries + media_ids = yield self.store.get_expired_url_cache(now) + + removed_media = [] + for media_id in media_ids: + fname = self.filepaths.url_cache_filepath(media_id) + try: + os.remove(fname) + except OSError as e: + # If the path doesn't exist, meh + if e.errno != errno.ENOENT: + logger.warn("Failed to remove media: %r: %s", media_id, e) + continue + + removed_media.append(media_id) + + try: + dirs = self.filepaths.url_cache_filepath_dirs_to_delete(media_id) + for dir in dirs: + os.rmdir(dir) + except: + pass + + yield self.store.delete_url_cache(removed_media) + + logger.info("Deleted %d entries from url cache", len(removed_media)) + + # Now we delete old images associated with the url cache. + # These may be cached for a bit on the client (i.e., they + # may have a room open with a preview url thing open). + # So we wait a couple of days before deleting, just in case. + expire_before = now - 2 * 24 * 60 * 60 * 1000 + yield self.store.get_url_cache_media_before(expire_before) + + removed_media = [] + for media_id in media_ids: + fname = self.filepaths.url_cache_filepath(media_id) + try: + os.remove(fname) + except OSError as e: + # If the path doesn't exist, meh + if e.errno != errno.ENOENT: + logger.warn("Failed to remove media: %r: %s", media_id, e) + continue + + try: + dirs = self.filepaths.url_cache_filepath_dirs_to_delete(media_id) + for dir in dirs: + os.rmdir(dir) + except: + pass + + thumbnail_dir = self.filepaths.url_cache_thumbnail_directory(media_id) + try: + shutil.rmtree(thumbnail_dir) + except OSError as e: + # If the path doesn't exist, meh + if e.errno != errno.ENOENT: + logger.warn("Failed to remove media: %r: %s", media_id, e) + continue + + removed_media.append(media_id) + + try: + dirs = self.filepaths.url_cache_thumbnail_dirs_to_delete(media_id) + for dir in dirs: + os.rmdir(dir) + except: + pass + + yield self.store.delete_url_cache_media(removed_media) + + logger.info("Deleted %d media from url cache", len(removed_media)) + def decode_and_calc_og(body, media_uri, request_encoding=None): from lxml import etree -- cgit 1.4.1 From ae79764fe55ab15156b4f28658326bd2c9c0b937 Mon Sep 17 00:00:00 2001 From: Erik Johnston Date: Thu, 28 Sep 2017 12:37:53 +0100 Subject: Change expires column to expires_ts --- synapse/rest/media/v1/preview_url_resource.py | 4 ++-- synapse/storage/media_repository.py | 14 +++++++------- .../storage/schema/delta/44/expire_url_cache.sql | 21 ++++++++++++++++++++- 3 files changed, 29 insertions(+), 10 deletions(-) (limited to 'synapse/rest/media') diff --git a/synapse/rest/media/v1/preview_url_resource.py b/synapse/rest/media/v1/preview_url_resource.py index c5ba83ddfd..6f896ffb53 100644 --- a/synapse/rest/media/v1/preview_url_resource.py +++ b/synapse/rest/media/v1/preview_url_resource.py @@ -137,7 +137,7 @@ class PreviewUrlResource(Resource): cache_result = yield self.store.get_url_cache(url, ts) if ( cache_result and - cache_result["download_ts"] + cache_result["expires"] > ts and + cache_result["expires_ts"] > ts and cache_result["response_code"] / 100 == 2 ): respond_with_json_bytes( @@ -246,7 +246,7 @@ class PreviewUrlResource(Resource): url, media_info["response_code"], media_info["etag"], - media_info["expires"], + media_info["expires"] + media_info["created_ts"], json.dumps(og), media_info["filesystem_id"], media_info["created_ts"], diff --git a/synapse/storage/media_repository.py b/synapse/storage/media_repository.py index 5cca14ccb2..b8a0dd0762 100644 --- a/synapse/storage/media_repository.py +++ b/synapse/storage/media_repository.py @@ -62,7 +62,7 @@ class MediaRepositoryStore(SQLBaseStore): def get_url_cache_txn(txn): # get the most recently cached result (relative to the given ts) sql = ( - "SELECT response_code, etag, expires, og, media_id, download_ts" + "SELECT response_code, etag, expires_ts, og, media_id, download_ts" " FROM local_media_repository_url_cache" " WHERE url = ? AND download_ts <= ?" " ORDER BY download_ts DESC LIMIT 1" @@ -74,7 +74,7 @@ class MediaRepositoryStore(SQLBaseStore): # ...or if we've requested a timestamp older than the oldest # copy in the cache, return the oldest copy (if any) sql = ( - "SELECT response_code, etag, expires, og, media_id, download_ts" + "SELECT response_code, etag, expires_ts, og, media_id, download_ts" " FROM local_media_repository_url_cache" " WHERE url = ? AND download_ts > ?" " ORDER BY download_ts ASC LIMIT 1" @@ -86,14 +86,14 @@ class MediaRepositoryStore(SQLBaseStore): return None return dict(zip(( - 'response_code', 'etag', 'expires', 'og', 'media_id', 'download_ts' + 'response_code', 'etag', 'expires_ts', 'og', 'media_id', 'download_ts' ), row)) return self.runInteraction( "get_url_cache", get_url_cache_txn ) - def store_url_cache(self, url, response_code, etag, expires, og, media_id, + def store_url_cache(self, url, response_code, etag, expires_ts, og, media_id, download_ts): return self._simple_insert( "local_media_repository_url_cache", @@ -101,7 +101,7 @@ class MediaRepositoryStore(SQLBaseStore): "url": url, "response_code": response_code, "etag": etag, - "expires": expires, + "expires_ts": expires_ts, "og": og, "media_id": media_id, "download_ts": download_ts, @@ -242,8 +242,8 @@ class MediaRepositoryStore(SQLBaseStore): def get_expired_url_cache(self, now_ts): sql = ( "SELECT media_id FROM local_media_repository_url_cache" - " WHERE download_ts + expires < ?" - " ORDER BY download_ts + expires ASC" + " WHERE expires_ts < ?" + " ORDER BY expires_ts ASC" " LIMIT 100" ) diff --git a/synapse/storage/schema/delta/44/expire_url_cache.sql b/synapse/storage/schema/delta/44/expire_url_cache.sql index 997e790b6d..9475d53e84 100644 --- a/synapse/storage/schema/delta/44/expire_url_cache.sql +++ b/synapse/storage/schema/delta/44/expire_url_cache.sql @@ -14,4 +14,23 @@ */ CREATE INDEX local_media_repository_url_idx ON local_media_repository(created_ts) WHERE url_cache IS NOT NULL; -CREATE INDEX local_media_repository_url_cache_expires_idx ON local_media_repository_url_cache((download_ts + expires)); + +-- we need to change `expires` to `expires_ts` so that we can index on it. SQLite doesn't support +-- indices on expressions until 3.9. +CREATE TABLE local_media_repository_url_cache_new( + url TEXT, + response_code INTEGER, + etag TEXT, + expires_ts BIGINT, + og TEXT, + media_id TEXT, + download_ts BIGINT +); + +INSERT INTO local_media_repository_url_cache_new + SELECT url, response_code, etag, expires + download_ts, og, media_id, download_ts FROM local_media_repository_url_cache; + +DROP TABLE local_media_repository_url_cache; +ALTER TABLE local_media_repository_url_cache_new RENAME TO local_media_repository_url_cache; + +CREATE INDEX local_media_repository_url_cache_expires_idx ON local_media_repository_url_cache(expires_ts); -- cgit 1.4.1 From ace807908602cb955fc7a2cae63dc6e64bf90cc5 Mon Sep 17 00:00:00 2001 From: Erik Johnston Date: Thu, 28 Sep 2017 12:52:51 +0100 Subject: Support new and old style media id formats --- synapse/rest/media/v1/filepath.py | 112 +++++++++++++++++++++++++++----------- 1 file changed, 81 insertions(+), 31 deletions(-) (limited to 'synapse/rest/media') diff --git a/synapse/rest/media/v1/filepath.py b/synapse/rest/media/v1/filepath.py index c5d43209f9..d5cec10127 100644 --- a/synapse/rest/media/v1/filepath.py +++ b/synapse/rest/media/v1/filepath.py @@ -14,6 +14,9 @@ # limitations under the License. import os +import re + +NEW_FORMAT_ID_RE = re.compile(r"^\d\d\d\d-\d\d-\d\d") class MediaFilePaths(object): @@ -73,21 +76,39 @@ class MediaFilePaths(object): ) def url_cache_filepath(self, media_id): - # Media id is of the form - # E.g.: 2017-09-28-fsdRDt24DS234dsf - return os.path.join( - self.base_path, "url_cache", - media_id[:10], media_id[11:] - ) + if NEW_FORMAT_ID_RE.match(media_id): + # Media id is of the form + # E.g.: 2017-09-28-fsdRDt24DS234dsf + return os.path.join( + self.base_path, "url_cache", + media_id[:10], media_id[11:] + ) + else: + return os.path.join( + self.base_path, "url_cache", + media_id[0:2], media_id[2:4], media_id[4:], + ) def url_cache_filepath_dirs_to_delete(self, media_id): "The dirs to try and remove if we delete the media_id file" - return [ - os.path.join( - self.base_path, "url_cache", - media_id[:10], - ), - ] + if NEW_FORMAT_ID_RE.match(media_id): + return [ + os.path.join( + self.base_path, "url_cache", + media_id[:10], + ), + ] + else: + return [ + os.path.join( + self.base_path, "url_cache", + media_id[0:2], media_id[2:4], + ), + os.path.join( + self.base_path, "url_cache", + media_id[0:2], + ), + ] def url_cache_thumbnail(self, media_id, width, height, content_type, method): @@ -99,32 +120,61 @@ class MediaFilePaths(object): width, height, top_level_type, sub_type, method ) - return os.path.join( - self.base_path, "url_cache_thumbnails", - media_id[:10], media_id[11:], - file_name - ) + if NEW_FORMAT_ID_RE.match(media_id): + return os.path.join( + self.base_path, "url_cache_thumbnails", + media_id[:10], media_id[11:], + file_name + ) + else: + return os.path.join( + self.base_path, "url_cache_thumbnails", + media_id[0:2], media_id[2:4], media_id[4:], + file_name + ) def url_cache_thumbnail_directory(self, media_id): # Media id is of the form # E.g.: 2017-09-28-fsdRDt24DS234dsf - return os.path.join( - self.base_path, "url_cache_thumbnails", - media_id[:10], media_id[11:], - ) + if NEW_FORMAT_ID_RE.match(media_id): + return os.path.join( + self.base_path, "url_cache_thumbnails", + media_id[:10], media_id[11:], + ) + else: + return os.path.join( + self.base_path, "url_cache_thumbnails", + media_id[0:2], media_id[2:4], media_id[4:], + ) def url_cache_thumbnail_dirs_to_delete(self, media_id): "The dirs to try and remove if we delete the media_id thumbnails" # Media id is of the form # E.g.: 2017-09-28-fsdRDt24DS234dsf - return [ - os.path.join( - self.base_path, "url_cache_thumbnails", - media_id[:10], media_id[11:], - ), - os.path.join( - self.base_path, "url_cache_thumbnails", - media_id[:10], - ), - ] + if NEW_FORMAT_ID_RE.match(media_id): + return [ + os.path.join( + self.base_path, "url_cache_thumbnails", + media_id[:10], media_id[11:], + ), + os.path.join( + self.base_path, "url_cache_thumbnails", + media_id[:10], + ), + ] + else: + return [ + os.path.join( + self.base_path, "url_cache_thumbnails", + media_id[0:2], media_id[2:4], media_id[4:], + ), + os.path.join( + self.base_path, "url_cache_thumbnails", + media_id[0:2], media_id[2:4], + ), + os.path.join( + self.base_path, "url_cache_thumbnails", + media_id[0:2], + ), + ] -- cgit 1.4.1 From 5f501ec7e2645abe232bd6bab407ac863e3250c2 Mon Sep 17 00:00:00 2001 From: Erik Johnston Date: Thu, 28 Sep 2017 12:59:01 +0100 Subject: Fix typo in url cache expiry timer --- synapse/rest/media/v1/preview_url_resource.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'synapse/rest/media') diff --git a/synapse/rest/media/v1/preview_url_resource.py b/synapse/rest/media/v1/preview_url_resource.py index 6f896ffb53..1616809e8f 100644 --- a/synapse/rest/media/v1/preview_url_resource.py +++ b/synapse/rest/media/v1/preview_url_resource.py @@ -74,7 +74,7 @@ class PreviewUrlResource(Resource): self.downloads = {} self._cleaner_loop = self.clock.looping_call( - self._expire_url_cache_data, 30 * 10000 + self._expire_url_cache_data, 30 * 1000 ) def render_GET(self, request): -- cgit 1.4.1 From e1e7d76cf16858d998884f19b141f90a0415d297 Mon Sep 17 00:00:00 2001 From: Erik Johnston Date: Thu, 28 Sep 2017 13:55:29 +0100 Subject: Actually assign result to variable --- synapse/rest/media/v1/preview_url_resource.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'synapse/rest/media') diff --git a/synapse/rest/media/v1/preview_url_resource.py b/synapse/rest/media/v1/preview_url_resource.py index 1616809e8f..0123369a7f 100644 --- a/synapse/rest/media/v1/preview_url_resource.py +++ b/synapse/rest/media/v1/preview_url_resource.py @@ -372,7 +372,7 @@ class PreviewUrlResource(Resource): # may have a room open with a preview url thing open). # So we wait a couple of days before deleting, just in case. expire_before = now - 2 * 24 * 60 * 60 * 1000 - yield self.store.get_url_cache_media_before(expire_before) + media_ids = yield self.store.get_url_cache_media_before(expire_before) removed_media = [] for media_id in media_ids: -- cgit 1.4.1 From 7cc483aa0ef9e51bd3839768e44b449cf6d24136 Mon Sep 17 00:00:00 2001 From: Erik Johnston Date: Thu, 28 Sep 2017 13:56:53 +0100 Subject: Clear up expired url cache every 10s --- synapse/rest/media/v1/preview_url_resource.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'synapse/rest/media') diff --git a/synapse/rest/media/v1/preview_url_resource.py b/synapse/rest/media/v1/preview_url_resource.py index 0123369a7f..2300c263e0 100644 --- a/synapse/rest/media/v1/preview_url_resource.py +++ b/synapse/rest/media/v1/preview_url_resource.py @@ -74,7 +74,7 @@ class PreviewUrlResource(Resource): self.downloads = {} self._cleaner_loop = self.clock.looping_call( - self._expire_url_cache_data, 30 * 1000 + self._expire_url_cache_data, 10 * 1000 ) def render_GET(self, request): -- cgit 1.4.1 From d5694ac5fa3266a777fa171f33bebc0d7477c12a Mon Sep 17 00:00:00 2001 From: Erik Johnston Date: Thu, 28 Sep 2017 16:08:08 +0100 Subject: Only log if we've removed media --- synapse/rest/media/v1/preview_url_resource.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'synapse/rest/media') diff --git a/synapse/rest/media/v1/preview_url_resource.py b/synapse/rest/media/v1/preview_url_resource.py index 2300c263e0..895b480d5c 100644 --- a/synapse/rest/media/v1/preview_url_resource.py +++ b/synapse/rest/media/v1/preview_url_resource.py @@ -365,7 +365,8 @@ class PreviewUrlResource(Resource): yield self.store.delete_url_cache(removed_media) - logger.info("Deleted %d entries from url cache", len(removed_media)) + if removed_media: + logger.info("Deleted %d entries from url cache", len(removed_media)) # Now we delete old images associated with the url cache. # These may be cached for a bit on the client (i.e., they @@ -412,7 +413,8 @@ class PreviewUrlResource(Resource): yield self.store.delete_url_cache_media(removed_media) - logger.info("Deleted %d media from url cache", len(removed_media)) + if removed_media: + logger.info("Deleted %d media from url cache", len(removed_media)) def decode_and_calc_og(body, media_uri, request_encoding=None): -- cgit 1.4.1