summary refs log tree commit diff
path: root/synapse/rest/media/v1
diff options
context:
space:
mode:
Diffstat (limited to 'synapse/rest/media/v1')
-rw-r--r--synapse/rest/media/v1/filepath.py176
-rw-r--r--synapse/rest/media/v1/media_repository.py371
-rw-r--r--synapse/rest/media/v1/preview_url_resource.py113
-rw-r--r--synapse/rest/media/v1/thumbnailer.py25
-rw-r--r--synapse/rest/media/v1/upload_resource.py2
5 files changed, 478 insertions, 209 deletions
diff --git a/synapse/rest/media/v1/filepath.py b/synapse/rest/media/v1/filepath.py
index d92b7ff337..d5164e47e0 100644
--- a/synapse/rest/media/v1/filepath.py
+++ b/synapse/rest/media/v1/filepath.py
@@ -14,78 +14,200 @@
 # limitations under the License.
 
 import os
+import re
+import functools
+
+NEW_FORMAT_ID_RE = re.compile(r"^\d\d\d\d-\d\d-\d\d")
+
+
+def _wrap_in_base_path(func):
+    """Takes a function that returns a relative path and turns it into an
+    absolute path based on the location of the primary media store
+    """
+    @functools.wraps(func)
+    def _wrapped(self, *args, **kwargs):
+        path = func(self, *args, **kwargs)
+        return os.path.join(self.base_path, path)
+
+    return _wrapped
 
 
 class MediaFilePaths(object):
+    """Describes where files are stored on disk.
 
-    def __init__(self, base_path):
-        self.base_path = base_path
+    Most of the functions have a `*_rel` variant which returns a file path that
+    is relative to the base media store path. This is mainly used when we want
+    to write to the backup media store (when one is configured)
+    """
 
-    def default_thumbnail(self, default_top_level, default_sub_type, width,
-                          height, content_type, method):
+    def __init__(self, primary_base_path):
+        self.base_path = primary_base_path
+
+    def default_thumbnail_rel(self, default_top_level, default_sub_type, width,
+                              height, content_type, method):
         top_level_type, sub_type = content_type.split("/")
         file_name = "%i-%i-%s-%s-%s" % (
             width, height, top_level_type, sub_type, method
         )
         return os.path.join(
-            self.base_path, "default_thumbnails", default_top_level,
+            "default_thumbnails", default_top_level,
             default_sub_type, file_name
         )
 
-    def local_media_filepath(self, media_id):
+    default_thumbnail = _wrap_in_base_path(default_thumbnail_rel)
+
+    def local_media_filepath_rel(self, media_id):
         return os.path.join(
-            self.base_path, "local_content",
+            "local_content",
             media_id[0:2], media_id[2:4], media_id[4:]
         )
 
-    def local_media_thumbnail(self, media_id, width, height, content_type,
-                              method):
+    local_media_filepath = _wrap_in_base_path(local_media_filepath_rel)
+
+    def local_media_thumbnail_rel(self, media_id, width, height, content_type,
+                                  method):
         top_level_type, sub_type = content_type.split("/")
         file_name = "%i-%i-%s-%s-%s" % (
             width, height, top_level_type, sub_type, method
         )
         return os.path.join(
-            self.base_path, "local_thumbnails",
+            "local_thumbnails",
             media_id[0:2], media_id[2:4], media_id[4:],
             file_name
         )
 
-    def remote_media_filepath(self, server_name, file_id):
+    local_media_thumbnail = _wrap_in_base_path(local_media_thumbnail_rel)
+
+    def remote_media_filepath_rel(self, server_name, file_id):
         return os.path.join(
-            self.base_path, "remote_content", server_name,
+            "remote_content", server_name,
             file_id[0:2], file_id[2:4], file_id[4:]
         )
 
-    def remote_media_thumbnail(self, server_name, file_id, width, height,
-                               content_type, method):
+    remote_media_filepath = _wrap_in_base_path(remote_media_filepath_rel)
+
+    def remote_media_thumbnail_rel(self, server_name, file_id, width, height,
+                                   content_type, method):
         top_level_type, sub_type = content_type.split("/")
         file_name = "%i-%i-%s-%s" % (width, height, top_level_type, sub_type)
         return os.path.join(
-            self.base_path, "remote_thumbnail", server_name,
+            "remote_thumbnail", server_name,
             file_id[0:2], file_id[2:4], file_id[4:],
             file_name
         )
 
+    remote_media_thumbnail = _wrap_in_base_path(remote_media_thumbnail_rel)
+
     def remote_media_thumbnail_dir(self, server_name, file_id):
         return os.path.join(
             self.base_path, "remote_thumbnail", server_name,
             file_id[0:2], file_id[2:4], file_id[4:],
         )
 
-    def url_cache_filepath(self, media_id):
-        return os.path.join(
-            self.base_path, "url_cache",
-            media_id[0:2], media_id[2:4], media_id[4:]
-        )
+    def url_cache_filepath_rel(self, media_id):
+        if NEW_FORMAT_ID_RE.match(media_id):
+            # Media id is of the form <DATE><RANDOM_STRING>
+            # E.g.: 2017-09-28-fsdRDt24DS234dsf
+            return os.path.join(
+                "url_cache",
+                media_id[:10], media_id[11:]
+            )
+        else:
+            return os.path.join(
+                "url_cache",
+                media_id[0:2], media_id[2:4], media_id[4:],
+            )
+
+    url_cache_filepath = _wrap_in_base_path(url_cache_filepath_rel)
+
+    def url_cache_filepath_dirs_to_delete(self, media_id):
+        "The dirs to try and remove if we delete the media_id file"
+        if NEW_FORMAT_ID_RE.match(media_id):
+            return [
+                os.path.join(
+                    self.base_path, "url_cache",
+                    media_id[:10],
+                ),
+            ]
+        else:
+            return [
+                os.path.join(
+                    self.base_path, "url_cache",
+                    media_id[0:2], media_id[2:4],
+                ),
+                os.path.join(
+                    self.base_path, "url_cache",
+                    media_id[0:2],
+                ),
+            ]
+
+    def url_cache_thumbnail_rel(self, media_id, width, height, content_type,
+                                method):
+        # Media id is of the form <DATE><RANDOM_STRING>
+        # E.g.: 2017-09-28-fsdRDt24DS234dsf
 
-    def url_cache_thumbnail(self, media_id, width, height, content_type,
-                            method):
         top_level_type, sub_type = content_type.split("/")
         file_name = "%i-%i-%s-%s-%s" % (
             width, height, top_level_type, sub_type, method
         )
-        return os.path.join(
-            self.base_path, "url_cache_thumbnails",
-            media_id[0:2], media_id[2:4], media_id[4:],
-            file_name
-        )
+
+        if NEW_FORMAT_ID_RE.match(media_id):
+            return os.path.join(
+                "url_cache_thumbnails",
+                media_id[:10], media_id[11:],
+                file_name
+            )
+        else:
+            return os.path.join(
+                "url_cache_thumbnails",
+                media_id[0:2], media_id[2:4], media_id[4:],
+                file_name
+            )
+
+    url_cache_thumbnail = _wrap_in_base_path(url_cache_thumbnail_rel)
+
+    def url_cache_thumbnail_directory(self, media_id):
+        # Media id is of the form <DATE><RANDOM_STRING>
+        # E.g.: 2017-09-28-fsdRDt24DS234dsf
+
+        if NEW_FORMAT_ID_RE.match(media_id):
+            return os.path.join(
+                self.base_path, "url_cache_thumbnails",
+                media_id[:10], media_id[11:],
+            )
+        else:
+            return os.path.join(
+                self.base_path, "url_cache_thumbnails",
+                media_id[0:2], media_id[2:4], media_id[4:],
+            )
+
+    def url_cache_thumbnail_dirs_to_delete(self, media_id):
+        "The dirs to try and remove if we delete the media_id thumbnails"
+        # Media id is of the form <DATE><RANDOM_STRING>
+        # E.g.: 2017-09-28-fsdRDt24DS234dsf
+        if NEW_FORMAT_ID_RE.match(media_id):
+            return [
+                os.path.join(
+                    self.base_path, "url_cache_thumbnails",
+                    media_id[:10], media_id[11:],
+                ),
+                os.path.join(
+                    self.base_path, "url_cache_thumbnails",
+                    media_id[:10],
+                ),
+            ]
+        else:
+            return [
+                os.path.join(
+                    self.base_path, "url_cache_thumbnails",
+                    media_id[0:2], media_id[2:4], media_id[4:],
+                ),
+                os.path.join(
+                    self.base_path, "url_cache_thumbnails",
+                    media_id[0:2], media_id[2:4],
+                ),
+                os.path.join(
+                    self.base_path, "url_cache_thumbnails",
+                    media_id[0:2],
+                ),
+            ]
diff --git a/synapse/rest/media/v1/media_repository.py b/synapse/rest/media/v1/media_repository.py
index 0ea1248ce6..6b50b45b1f 100644
--- a/synapse/rest/media/v1/media_repository.py
+++ b/synapse/rest/media/v1/media_repository.py
@@ -33,7 +33,7 @@ from synapse.api.errors import SynapseError, HttpResponseException, \
 
 from synapse.util.async import Linearizer
 from synapse.util.stringutils import is_ascii
-from synapse.util.logcontext import preserve_context_over_fn
+from synapse.util.logcontext import make_deferred_yieldable, preserve_fn
 from synapse.util.retryutils import NotRetryingDestination
 
 import os
@@ -59,7 +59,14 @@ class MediaRepository(object):
         self.store = hs.get_datastore()
         self.max_upload_size = hs.config.max_upload_size
         self.max_image_pixels = hs.config.max_image_pixels
-        self.filepaths = MediaFilePaths(hs.config.media_store_path)
+
+        self.primary_base_path = hs.config.media_store_path
+        self.filepaths = MediaFilePaths(self.primary_base_path)
+
+        self.backup_base_path = hs.config.backup_media_store_path
+
+        self.synchronous_backup_media_store = hs.config.synchronous_backup_media_store
+
         self.dynamic_thumbnails = hs.config.dynamic_thumbnails
         self.thumbnail_requirements = hs.config.thumbnail_requirements
 
@@ -87,18 +94,86 @@ class MediaRepository(object):
         if not os.path.exists(dirname):
             os.makedirs(dirname)
 
+    @staticmethod
+    def _write_file_synchronously(source, fname):
+        """Write `source` to the path `fname` synchronously. Should be called
+        from a thread.
+
+        Args:
+            source: A file like object to be written
+            fname (str): Path to write to
+        """
+        MediaRepository._makedirs(fname)
+        source.seek(0)  # Ensure we read from the start of the file
+        with open(fname, "wb") as f:
+            shutil.copyfileobj(source, f)
+
+    @defer.inlineCallbacks
+    def write_to_file_and_backup(self, source, path):
+        """Write `source` to the on disk media store, and also the backup store
+        if configured.
+
+        Args:
+            source: A file like object that should be written
+            path (str): Relative path to write file to
+
+        Returns:
+            Deferred[str]: the file path written to in the primary media store
+        """
+        fname = os.path.join(self.primary_base_path, path)
+
+        # Write to the main repository
+        yield make_deferred_yieldable(threads.deferToThread(
+            self._write_file_synchronously, source, fname,
+        ))
+
+        # Write to backup repository
+        yield self.copy_to_backup(path)
+
+        defer.returnValue(fname)
+
+    @defer.inlineCallbacks
+    def copy_to_backup(self, path):
+        """Copy a file from the primary to backup media store, if configured.
+
+        Args:
+            path(str): Relative path to write file to
+        """
+        if self.backup_base_path:
+            primary_fname = os.path.join(self.primary_base_path, path)
+            backup_fname = os.path.join(self.backup_base_path, path)
+
+            # We can either wait for successful writing to the backup repository
+            # or write in the background and immediately return
+            if self.synchronous_backup_media_store:
+                yield make_deferred_yieldable(threads.deferToThread(
+                    shutil.copyfile, primary_fname, backup_fname,
+                ))
+            else:
+                preserve_fn(threads.deferToThread)(
+                    shutil.copyfile, primary_fname, backup_fname,
+                )
+
     @defer.inlineCallbacks
     def create_content(self, media_type, upload_name, content, content_length,
                        auth_user):
+        """Store uploaded content for a local user and return the mxc URL
+
+        Args:
+            media_type(str): The content type of the file
+            upload_name(str): The name of the file
+            content: A file like object that is the content to store
+            content_length(int): The length of the content
+            auth_user(str): The user_id of the uploader
+
+        Returns:
+            Deferred[str]: The mxc url of the stored content
+        """
         media_id = random_string(24)
 
-        fname = self.filepaths.local_media_filepath(media_id)
-        self._makedirs(fname)
-
-        # This shouldn't block for very long because the content will have
-        # already been uploaded at this point.
-        with open(fname, "wb") as f:
-            f.write(content)
+        fname = yield self.write_to_file_and_backup(
+            content, self.filepaths.local_media_filepath_rel(media_id)
+        )
 
         logger.info("Stored local media in file %r", fname)
 
@@ -115,7 +190,7 @@ class MediaRepository(object):
             "media_length": content_length,
         }
 
-        yield self._generate_local_thumbnails(media_id, media_info)
+        yield self._generate_thumbnails(None, media_id, media_info)
 
         defer.returnValue("mxc://%s/%s" % (self.server_name, media_id))
 
@@ -148,9 +223,10 @@ class MediaRepository(object):
     def _download_remote_file(self, server_name, media_id):
         file_id = random_string(24)
 
-        fname = self.filepaths.remote_media_filepath(
+        fpath = self.filepaths.remote_media_filepath_rel(
             server_name, file_id
         )
+        fname = os.path.join(self.primary_base_path, fpath)
         self._makedirs(fname)
 
         try:
@@ -192,6 +268,8 @@ class MediaRepository(object):
                                      server_name, media_id)
                     raise SynapseError(502, "Failed to fetch remote media")
 
+            yield self.copy_to_backup(fpath)
+
             media_type = headers["Content-Type"][0]
             time_now_ms = self.clock.time_msec()
 
@@ -244,7 +322,7 @@ class MediaRepository(object):
             "filesystem_id": file_id,
         }
 
-        yield self._generate_remote_thumbnails(
+        yield self._generate_thumbnails(
             server_name, media_id, media_info
         )
 
@@ -253,9 +331,8 @@ class MediaRepository(object):
     def _get_thumbnail_requirements(self, media_type):
         return self.thumbnail_requirements.get(media_type, ())
 
-    def _generate_thumbnail(self, input_path, t_path, t_width, t_height,
+    def _generate_thumbnail(self, thumbnailer, t_width, t_height,
                             t_method, t_type):
-        thumbnailer = Thumbnailer(input_path)
         m_width = thumbnailer.width
         m_height = thumbnailer.height
 
@@ -267,72 +344,105 @@ class MediaRepository(object):
             return
 
         if t_method == "crop":
-            t_len = thumbnailer.crop(t_path, t_width, t_height, t_type)
+            t_byte_source = thumbnailer.crop(t_width, t_height, t_type)
         elif t_method == "scale":
             t_width, t_height = thumbnailer.aspect(t_width, t_height)
             t_width = min(m_width, t_width)
             t_height = min(m_height, t_height)
-            t_len = thumbnailer.scale(t_path, t_width, t_height, t_type)
+            t_byte_source = thumbnailer.scale(t_width, t_height, t_type)
         else:
-            t_len = None
+            t_byte_source = None
 
-        return t_len
+        return t_byte_source
 
     @defer.inlineCallbacks
     def generate_local_exact_thumbnail(self, media_id, t_width, t_height,
                                        t_method, t_type):
         input_path = self.filepaths.local_media_filepath(media_id)
 
-        t_path = self.filepaths.local_media_thumbnail(
-            media_id, t_width, t_height, t_type, t_method
-        )
-        self._makedirs(t_path)
-
-        t_len = yield preserve_context_over_fn(
-            threads.deferToThread,
+        thumbnailer = Thumbnailer(input_path)
+        t_byte_source = yield make_deferred_yieldable(threads.deferToThread(
             self._generate_thumbnail,
-            input_path, t_path, t_width, t_height, t_method, t_type
-        )
+            thumbnailer, t_width, t_height, t_method, t_type
+        ))
+
+        if t_byte_source:
+            try:
+                output_path = yield self.write_to_file_and_backup(
+                    t_byte_source,
+                    self.filepaths.local_media_thumbnail_rel(
+                        media_id, t_width, t_height, t_type, t_method
+                    )
+                )
+            finally:
+                t_byte_source.close()
+
+            logger.info("Stored thumbnail in file %r", output_path)
+
+            t_len = os.path.getsize(output_path)
 
-        if t_len:
             yield self.store.store_local_thumbnail(
                 media_id, t_width, t_height, t_type, t_method, t_len
             )
 
-            defer.returnValue(t_path)
+            defer.returnValue(output_path)
 
     @defer.inlineCallbacks
     def generate_remote_exact_thumbnail(self, server_name, file_id, media_id,
                                         t_width, t_height, t_method, t_type):
         input_path = self.filepaths.remote_media_filepath(server_name, file_id)
 
-        t_path = self.filepaths.remote_media_thumbnail(
-            server_name, file_id, t_width, t_height, t_type, t_method
-        )
-        self._makedirs(t_path)
-
-        t_len = yield preserve_context_over_fn(
-            threads.deferToThread,
+        thumbnailer = Thumbnailer(input_path)
+        t_byte_source = yield make_deferred_yieldable(threads.deferToThread(
             self._generate_thumbnail,
-            input_path, t_path, t_width, t_height, t_method, t_type
-        )
+            thumbnailer, t_width, t_height, t_method, t_type
+        ))
+
+        if t_byte_source:
+            try:
+                output_path = yield self.write_to_file_and_backup(
+                    t_byte_source,
+                    self.filepaths.remote_media_thumbnail_rel(
+                        server_name, file_id, t_width, t_height, t_type, t_method
+                    )
+                )
+            finally:
+                t_byte_source.close()
+
+            logger.info("Stored thumbnail in file %r", output_path)
+
+            t_len = os.path.getsize(output_path)
 
-        if t_len:
             yield self.store.store_remote_media_thumbnail(
                 server_name, media_id, file_id,
                 t_width, t_height, t_type, t_method, t_len
             )
 
-            defer.returnValue(t_path)
+            defer.returnValue(output_path)
 
     @defer.inlineCallbacks
-    def _generate_local_thumbnails(self, media_id, media_info, url_cache=False):
+    def _generate_thumbnails(self, server_name, media_id, media_info, url_cache=False):
+        """Generate and store thumbnails for an image.
+
+        Args:
+            server_name(str|None): The server name if remote media, else None if local
+            media_id(str)
+            media_info(dict)
+            url_cache(bool): If we are thumbnailing images downloaded for the URL cache,
+                used exclusively by the url previewer
+
+        Returns:
+            Deferred[dict]: Dict with "width" and "height" keys of original image
+        """
         media_type = media_info["media_type"]
+        file_id = media_info.get("filesystem_id")
         requirements = self._get_thumbnail_requirements(media_type)
         if not requirements:
             return
 
-        if url_cache:
+        if server_name:
+            input_path = self.filepaths.remote_media_filepath(server_name, file_id)
+        elif url_cache:
             input_path = self.filepaths.url_cache_filepath(media_id)
         else:
             input_path = self.filepaths.local_media_filepath(media_id)
@@ -348,135 +458,72 @@ class MediaRepository(object):
             )
             return
 
-        local_thumbnails = []
-
-        def generate_thumbnails():
-            scales = set()
-            crops = set()
-            for r_width, r_height, r_method, r_type in requirements:
-                if r_method == "scale":
-                    t_width, t_height = thumbnailer.aspect(r_width, r_height)
-                    scales.add((
-                        min(m_width, t_width), min(m_height, t_height), r_type,
-                    ))
-                elif r_method == "crop":
-                    crops.add((r_width, r_height, r_type))
-
-            for t_width, t_height, t_type in scales:
-                t_method = "scale"
-                if url_cache:
-                    t_path = self.filepaths.url_cache_thumbnail(
-                        media_id, t_width, t_height, t_type, t_method
-                    )
-                else:
-                    t_path = self.filepaths.local_media_thumbnail(
-                        media_id, t_width, t_height, t_type, t_method
-                    )
-                self._makedirs(t_path)
-                t_len = thumbnailer.scale(t_path, t_width, t_height, t_type)
+        # We deduplicate the thumbnail sizes by ignoring the cropped versions if
+        # they have the same dimensions of a scaled one.
+        thumbnails = {}
+        for r_width, r_height, r_method, r_type in requirements:
+            if r_method == "crop":
+                thumbnails.setdefault((r_width, r_height, r_type), r_method)
+            elif r_method == "scale":
+                t_width, t_height = thumbnailer.aspect(r_width, r_height)
+                t_width = min(m_width, t_width)
+                t_height = min(m_height, t_height)
+                thumbnails[(t_width, t_height, r_type)] = r_method
+
+        # Now we generate the thumbnails for each dimension, store it
+        for (t_width, t_height, t_type), t_method in thumbnails.iteritems():
+            # Work out the correct file name for thumbnail
+            if server_name:
+                file_path = self.filepaths.remote_media_thumbnail_rel(
+                    server_name, file_id, t_width, t_height, t_type, t_method
+                )
+            elif url_cache:
+                file_path = self.filepaths.url_cache_thumbnail_rel(
+                    media_id, t_width, t_height, t_type, t_method
+                )
+            else:
+                file_path = self.filepaths.local_media_thumbnail_rel(
+                    media_id, t_width, t_height, t_type, t_method
+                )
 
-                local_thumbnails.append((
-                    media_id, t_width, t_height, t_type, t_method, t_len
+            # Generate the thumbnail
+            if t_method == "crop":
+                t_byte_source = yield make_deferred_yieldable(threads.deferToThread(
+                    thumbnailer.crop,
+                    t_width, t_height, t_type,
                 ))
-
-            for t_width, t_height, t_type in crops:
-                if (t_width, t_height, t_type) in scales:
-                    # If the aspect ratio of the cropped thumbnail matches a purely
-                    # scaled one then there is no point in calculating a separate
-                    # thumbnail.
-                    continue
-                t_method = "crop"
-                if url_cache:
-                    t_path = self.filepaths.url_cache_thumbnail(
-                        media_id, t_width, t_height, t_type, t_method
-                    )
-                else:
-                    t_path = self.filepaths.local_media_thumbnail(
-                        media_id, t_width, t_height, t_type, t_method
-                    )
-                self._makedirs(t_path)
-                t_len = thumbnailer.crop(t_path, t_width, t_height, t_type)
-                local_thumbnails.append((
-                    media_id, t_width, t_height, t_type, t_method, t_len
+            elif t_method == "scale":
+                t_byte_source = yield make_deferred_yieldable(threads.deferToThread(
+                    thumbnailer.scale,
+                    t_width, t_height, t_type,
                 ))
+            else:
+                logger.error("Unrecognized method: %r", t_method)
+                continue
 
-        yield preserve_context_over_fn(threads.deferToThread, generate_thumbnails)
-
-        for l in local_thumbnails:
-            yield self.store.store_local_thumbnail(*l)
-
-        defer.returnValue({
-            "width": m_width,
-            "height": m_height,
-        })
-
-    @defer.inlineCallbacks
-    def _generate_remote_thumbnails(self, server_name, media_id, media_info):
-        media_type = media_info["media_type"]
-        file_id = media_info["filesystem_id"]
-        requirements = self._get_thumbnail_requirements(media_type)
-        if not requirements:
-            return
+            if not t_byte_source:
+                continue
 
-        remote_thumbnails = []
+            try:
+                # Write to disk
+                output_path = yield self.write_to_file_and_backup(
+                    t_byte_source, file_path,
+                )
+            finally:
+                t_byte_source.close()
 
-        input_path = self.filepaths.remote_media_filepath(server_name, file_id)
-        thumbnailer = Thumbnailer(input_path)
-        m_width = thumbnailer.width
-        m_height = thumbnailer.height
+            t_len = os.path.getsize(output_path)
 
-        def generate_thumbnails():
-            if m_width * m_height >= self.max_image_pixels:
-                logger.info(
-                    "Image too large to thumbnail %r x %r > %r",
-                    m_width, m_height, self.max_image_pixels
-                )
-                return
-
-            scales = set()
-            crops = set()
-            for r_width, r_height, r_method, r_type in requirements:
-                if r_method == "scale":
-                    t_width, t_height = thumbnailer.aspect(r_width, r_height)
-                    scales.add((
-                        min(m_width, t_width), min(m_height, t_height), r_type,
-                    ))
-                elif r_method == "crop":
-                    crops.add((r_width, r_height, r_type))
-
-            for t_width, t_height, t_type in scales:
-                t_method = "scale"
-                t_path = self.filepaths.remote_media_thumbnail(
-                    server_name, file_id, t_width, t_height, t_type, t_method
-                )
-                self._makedirs(t_path)
-                t_len = thumbnailer.scale(t_path, t_width, t_height, t_type)
-                remote_thumbnails.append([
+            # Write to database
+            if server_name:
+                yield self.store.store_remote_media_thumbnail(
                     server_name, media_id, file_id,
                     t_width, t_height, t_type, t_method, t_len
-                ])
-
-            for t_width, t_height, t_type in crops:
-                if (t_width, t_height, t_type) in scales:
-                    # If the aspect ratio of the cropped thumbnail matches a purely
-                    # scaled one then there is no point in calculating a separate
-                    # thumbnail.
-                    continue
-                t_method = "crop"
-                t_path = self.filepaths.remote_media_thumbnail(
-                    server_name, file_id, t_width, t_height, t_type, t_method
                 )
-                self._makedirs(t_path)
-                t_len = thumbnailer.crop(t_path, t_width, t_height, t_type)
-                remote_thumbnails.append([
-                    server_name, media_id, file_id,
-                    t_width, t_height, t_type, t_method, t_len
-                ])
-
-        yield preserve_context_over_fn(threads.deferToThread, generate_thumbnails)
-
-        for r in remote_thumbnails:
-            yield self.store.store_remote_media_thumbnail(*r)
+            else:
+                yield self.store.store_local_thumbnail(
+                    media_id, t_width, t_height, t_type, t_method, t_len
+                )
 
         defer.returnValue({
             "width": m_width,
@@ -497,6 +544,8 @@ class MediaRepository(object):
 
             logger.info("Deleting: %r", key)
 
+            # TODO: Should we delete from the backup store
+
             with (yield self.remote_media_linearizer.queue(key)):
                 full_path = self.filepaths.remote_media_filepath(origin, file_id)
                 try:
diff --git a/synapse/rest/media/v1/preview_url_resource.py b/synapse/rest/media/v1/preview_url_resource.py
index b81a336c5d..2a3e37fdf4 100644
--- a/synapse/rest/media/v1/preview_url_resource.py
+++ b/synapse/rest/media/v1/preview_url_resource.py
@@ -36,6 +36,9 @@ import cgi
 import ujson as json
 import urlparse
 import itertools
+import datetime
+import errno
+import shutil
 
 import logging
 logger = logging.getLogger(__name__)
@@ -56,6 +59,7 @@ class PreviewUrlResource(Resource):
         self.store = hs.get_datastore()
         self.client = SpiderHttpClient(hs)
         self.media_repo = media_repo
+        self.primary_base_path = media_repo.primary_base_path
 
         self.url_preview_url_blacklist = hs.config.url_preview_url_blacklist
 
@@ -70,6 +74,10 @@ class PreviewUrlResource(Resource):
 
         self.downloads = {}
 
+        self._cleaner_loop = self.clock.looping_call(
+            self._expire_url_cache_data, 10 * 1000
+        )
+
     def render_GET(self, request):
         self._async_render_GET(request)
         return NOT_DONE_YET
@@ -130,7 +138,7 @@ class PreviewUrlResource(Resource):
         cache_result = yield self.store.get_url_cache(url, ts)
         if (
             cache_result and
-            cache_result["download_ts"] + cache_result["expires"] > ts and
+            cache_result["expires_ts"] > ts and
             cache_result["response_code"] / 100 == 2
         ):
             respond_with_json_bytes(
@@ -163,8 +171,8 @@ class PreviewUrlResource(Resource):
         logger.debug("got media_info of '%s'" % media_info)
 
         if _is_media(media_info['media_type']):
-            dims = yield self.media_repo._generate_local_thumbnails(
-                media_info['filesystem_id'], media_info, url_cache=True,
+            dims = yield self.media_repo._generate_thumbnails(
+                None, media_info['filesystem_id'], media_info, url_cache=True,
             )
 
             og = {
@@ -209,8 +217,8 @@ class PreviewUrlResource(Resource):
 
                 if _is_media(image_info['media_type']):
                     # TODO: make sure we don't choke on white-on-transparent images
-                    dims = yield self.media_repo._generate_local_thumbnails(
-                        image_info['filesystem_id'], image_info, url_cache=True,
+                    dims = yield self.media_repo._generate_thumbnails(
+                        None, image_info['filesystem_id'], image_info, url_cache=True,
                     )
                     if dims:
                         og["og:image:width"] = dims['width']
@@ -239,7 +247,7 @@ class PreviewUrlResource(Resource):
             url,
             media_info["response_code"],
             media_info["etag"],
-            media_info["expires"],
+            media_info["expires"] + media_info["created_ts"],
             json.dumps(og),
             media_info["filesystem_id"],
             media_info["created_ts"],
@@ -253,10 +261,10 @@ class PreviewUrlResource(Resource):
         # we're most likely being explicitly triggered by a human rather than a
         # bot, so are we really a robot?
 
-        # XXX: horrible duplication with base_resource's _download_remote_file()
-        file_id = random_string(24)
+        file_id = datetime.date.today().isoformat() + '_' + random_string(16)
 
-        fname = self.filepaths.url_cache_filepath(file_id)
+        fpath = self.filepaths.url_cache_filepath_rel(file_id)
+        fname = os.path.join(self.primary_base_path, fpath)
         self.media_repo._makedirs(fname)
 
         try:
@@ -267,6 +275,8 @@ class PreviewUrlResource(Resource):
                 )
                 # FIXME: pass through 404s and other error messages nicely
 
+            yield self.media_repo.copy_to_backup(fpath)
+
             media_type = headers["Content-Type"][0]
             time_now_ms = self.clock.time_msec()
 
@@ -328,6 +338,91 @@ class PreviewUrlResource(Resource):
             "etag": headers["ETag"][0] if "ETag" in headers else None,
         })
 
+    @defer.inlineCallbacks
+    def _expire_url_cache_data(self):
+        """Clean up expired url cache content, media and thumbnails.
+        """
+
+        # TODO: Delete from backup media store
+
+        now = self.clock.time_msec()
+
+        # First we delete expired url cache entries
+        media_ids = yield self.store.get_expired_url_cache(now)
+
+        removed_media = []
+        for media_id in media_ids:
+            fname = self.filepaths.url_cache_filepath(media_id)
+            try:
+                os.remove(fname)
+            except OSError as e:
+                # If the path doesn't exist, meh
+                if e.errno != errno.ENOENT:
+                    logger.warn("Failed to remove media: %r: %s", media_id, e)
+                    continue
+
+            removed_media.append(media_id)
+
+            try:
+                dirs = self.filepaths.url_cache_filepath_dirs_to_delete(media_id)
+                for dir in dirs:
+                    os.rmdir(dir)
+            except:
+                pass
+
+        yield self.store.delete_url_cache(removed_media)
+
+        if removed_media:
+            logger.info("Deleted %d entries from url cache", len(removed_media))
+
+        # Now we delete old images associated with the url cache.
+        # These may be cached for a bit on the client (i.e., they
+        # may have a room open with a preview url thing open).
+        # So we wait a couple of days before deleting, just in case.
+        expire_before = now - 2 * 24 * 60 * 60 * 1000
+        media_ids = yield self.store.get_url_cache_media_before(expire_before)
+
+        removed_media = []
+        for media_id in media_ids:
+            fname = self.filepaths.url_cache_filepath(media_id)
+            try:
+                os.remove(fname)
+            except OSError as e:
+                # If the path doesn't exist, meh
+                if e.errno != errno.ENOENT:
+                    logger.warn("Failed to remove media: %r: %s", media_id, e)
+                    continue
+
+            try:
+                dirs = self.filepaths.url_cache_filepath_dirs_to_delete(media_id)
+                for dir in dirs:
+                    os.rmdir(dir)
+            except:
+                pass
+
+            thumbnail_dir = self.filepaths.url_cache_thumbnail_directory(media_id)
+            try:
+                shutil.rmtree(thumbnail_dir)
+            except OSError as e:
+                # If the path doesn't exist, meh
+                if e.errno != errno.ENOENT:
+                    logger.warn("Failed to remove media: %r: %s", media_id, e)
+                    continue
+
+            removed_media.append(media_id)
+
+            try:
+                dirs = self.filepaths.url_cache_thumbnail_dirs_to_delete(media_id)
+                for dir in dirs:
+                    os.rmdir(dir)
+            except:
+                pass
+
+        yield self.store.delete_url_cache_media(removed_media)
+
+        if removed_media:
+            logger.info("Deleted %d media from url cache", len(removed_media))
+
 
 def decode_and_calc_og(body, media_uri, request_encoding=None):
     from lxml import etree
diff --git a/synapse/rest/media/v1/thumbnailer.py b/synapse/rest/media/v1/thumbnailer.py
index 3868d4f65f..e1ee535b9a 100644
--- a/synapse/rest/media/v1/thumbnailer.py
+++ b/synapse/rest/media/v1/thumbnailer.py
@@ -50,12 +50,16 @@ class Thumbnailer(object):
         else:
             return ((max_height * self.width) // self.height, max_height)
 
-    def scale(self, output_path, width, height, output_type):
-        """Rescales the image to the given dimensions"""
+    def scale(self, width, height, output_type):
+        """Rescales the image to the given dimensions.
+
+        Returns:
+            BytesIO: the bytes of the encoded image ready to be written to disk
+        """
         scaled = self.image.resize((width, height), Image.ANTIALIAS)
-        return self.save_image(scaled, output_type, output_path)
+        return self._encode_image(scaled, output_type)
 
-    def crop(self, output_path, width, height, output_type):
+    def crop(self, width, height, output_type):
         """Rescales and crops the image to the given dimensions preserving
         aspect::
             (w_in / h_in) = (w_scaled / h_scaled)
@@ -65,6 +69,9 @@ class Thumbnailer(object):
         Args:
             max_width: The largest possible width.
             max_height: The larget possible height.
+
+        Returns:
+            BytesIO: the bytes of the encoded image ready to be written to disk
         """
         if width * self.height > height * self.width:
             scaled_height = (width * self.height) // self.width
@@ -82,13 +89,9 @@ class Thumbnailer(object):
             crop_left = (scaled_width - width) // 2
             crop_right = width + crop_left
             cropped = scaled_image.crop((crop_left, 0, crop_right, height))
-        return self.save_image(cropped, output_type, output_path)
+        return self._encode_image(cropped, output_type)
 
-    def save_image(self, output_image, output_type, output_path):
+    def _encode_image(self, output_image, output_type):
         output_bytes_io = BytesIO()
         output_image.save(output_bytes_io, self.FORMATS[output_type], quality=80)
-        output_bytes = output_bytes_io.getvalue()
-        with open(output_path, "wb") as output_file:
-            output_file.write(output_bytes)
-        logger.info("Stored thumbnail in file %r", output_path)
-        return len(output_bytes)
+        return output_bytes_io
diff --git a/synapse/rest/media/v1/upload_resource.py b/synapse/rest/media/v1/upload_resource.py
index 4ab33f73bf..f6f498cdc5 100644
--- a/synapse/rest/media/v1/upload_resource.py
+++ b/synapse/rest/media/v1/upload_resource.py
@@ -93,7 +93,7 @@ class UploadResource(Resource):
         # TODO(markjh): parse content-dispostion
 
         content_uri = yield self.media_repo.create_content(
-            media_type, upload_name, request.content.read(),
+            media_type, upload_name, request.content,
             content_length, requester.user
         )