summary refs log tree commit diff
path: root/synapse/rest/media/v1
diff options
context:
space:
mode:
Diffstat (limited to 'synapse/rest/media/v1')
-rw-r--r-- synapse/rest/media/v1/_base.py 94
-rw-r--r-- synapse/rest/media/v1/download_resource.py 3
-rw-r--r-- synapse/rest/media/v1/media_repository.py 125
-rw-r--r-- synapse/rest/media/v1/media_storage.py 1
-rw-r--r-- synapse/rest/media/v1/preview_url_resource.py 114
-rw-r--r-- synapse/rest/media/v1/storage_provider.py 6
-rw-r--r-- synapse/rest/media/v1/thumbnail_resource.py 72
-rw-r--r-- synapse/rest/media/v1/thumbnailer.py 19
-rw-r--r-- synapse/rest/media/v1/upload_resource.py 8
9 files changed, 247 insertions, 195 deletions
diff --git a/synapse/rest/media/v1/_base.py b/synapse/rest/media/v1/_base.py
index 5fefee4dde..3689777266 100644
--- a/synapse/rest/media/v1/_base.py
+++ b/synapse/rest/media/v1/_base.py
@@ -17,7 +17,6 @@
 import logging
 import os
 
-from six import PY3
 from six.moves import urllib
 
 from twisted.internet import defer
@@ -30,6 +29,22 @@ from synapse.util.stringutils import is_ascii
 
 logger = logging.getLogger(__name__)
 
+# list all text content types that will have the charset default to UTF-8 when
+# none is given
+TEXT_CONTENT_TYPES = [
+    "text/css",
+    "text/csv",
+    "text/html",
+    "text/calendar",
+    "text/plain",
+    "text/javascript",
+    "application/json",
+    "application/ld+json",
+    "application/rtf",
+    "image/svg+xml",
+    "text/xml",
+]
+
 
 def parse_media_id(request):
     try:
@@ -96,7 +111,14 @@ def add_file_headers(request, media_type, file_size, upload_name):
     def _quote(x):
         return urllib.parse.quote(x.encode("utf-8"))
 
-    request.setHeader(b"Content-Type", media_type.encode("UTF-8"))
+    # Default to a UTF-8 charset for text content types.
+    # ex, uses UTF-8 for 'text/css' but not 'text/css; charset=UTF-16'
+    if media_type.lower() in TEXT_CONTENT_TYPES:
+        content_type = media_type + "; charset=UTF-8"
+    else:
+        content_type = media_type
+
+    request.setHeader(b"Content-Type", content_type.encode("UTF-8"))
     if upload_name:
         # RFC6266 section 4.1 [1] defines both `filename` and `filename*`.
         #
@@ -135,27 +157,25 @@ def add_file_headers(request, media_type, file_size, upload_name):
 
 # separators as defined in RFC2616. SP and HT are handled separately.
 # see _can_encode_filename_as_token.
-_FILENAME_SEPARATOR_CHARS = set(
-    (
-        "(",
-        ")",
-        "<",
-        ">",
-        "@",
-        ",",
-        ";",
-        ":",
-        "\\",
-        '"',
-        "/",
-        "[",
-        "]",
-        "?",
-        "=",
-        "{",
-        "}",
-    )
-)
+_FILENAME_SEPARATOR_CHARS = {
+    "(",
+    ")",
+    "<",
+    ">",
+    "@",
+    ",",
+    ";",
+    ":",
+    "\\",
+    '"',
+    "/",
+    "[",
+    "]",
+    "?",
+    "=",
+    "{",
+    "}",
+}
 
 
 def _can_encode_filename_as_token(x):
@@ -195,7 +215,7 @@ def respond_with_responder(request, responder, media_type, file_size, upload_nam
         respond_404(request)
         return
 
-    logger.debug("Responding to media request with responder %s")
+    logger.debug("Responding to media request with responder %s", responder)
     add_file_headers(request, media_type, file_size, upload_name)
     try:
         with responder:
@@ -303,23 +323,15 @@ def get_filename_from_headers(headers):
             upload_name_utf8 = upload_name_utf8[7:]
             # We have a filename*= section. This MUST be ASCII, and any UTF-8
             # bytes are %-quoted.
-            if PY3:
-                try:
-                    # Once it is decoded, we can then unquote the %-encoded
-                    # parts strictly into a unicode string.
-                    upload_name = urllib.parse.unquote(
-                        upload_name_utf8.decode("ascii"), errors="strict"
-                    )
-                except UnicodeDecodeError:
-                    # Incorrect UTF-8.
-                    pass
-            else:
-                # On Python 2, we first unquote the %-encoded parts and then
-                # decode it strictly using UTF-8.
-                try:
-                    upload_name = urllib.parse.unquote(upload_name_utf8).decode("utf8")
-                except UnicodeDecodeError:
-                    pass
+            try:
+                # Once it is decoded, we can then unquote the %-encoded
+                # parts strictly into a unicode string.
+                upload_name = urllib.parse.unquote(
+                    upload_name_utf8.decode("ascii"), errors="strict"
+                )
+            except UnicodeDecodeError:
+                # Incorrect UTF-8.
+                pass
 
     # If there isn't check for an ascii name.
     if not upload_name:
diff --git a/synapse/rest/media/v1/download_resource.py b/synapse/rest/media/v1/download_resource.py
index 66a01559e1..24d3ae5bbc 100644
--- a/synapse/rest/media/v1/download_resource.py
+++ b/synapse/rest/media/v1/download_resource.py
@@ -50,6 +50,9 @@ class DownloadResource(DirectServeResource):
             b" media-src 'self';"
             b" object-src 'self';",
         )
+        request.setHeader(
+            b"Referrer-Policy", b"no-referrer",
+        )
         server_name, media_id, name = parse_media_id(request)
         if server_name == self.server_name:
             await self.media_repo.get_local_media(request, media_id, name)
diff --git a/synapse/rest/media/v1/media_repository.py b/synapse/rest/media/v1/media_repository.py
index b972e152a9..fd10d42f2f 100644
--- a/synapse/rest/media/v1/media_repository.py
+++ b/synapse/rest/media/v1/media_repository.py
@@ -18,12 +18,12 @@ import errno
 import logging
 import os
 import shutil
+from typing import Dict, Tuple
 
 from six import iteritems
 
 import twisted.internet.error
 import twisted.web.http
-from twisted.internet import defer
 from twisted.web.resource import Resource
 
 from synapse.api.errors import (
@@ -113,15 +113,14 @@ class MediaRepository(object):
             "update_recently_accessed_media", self._update_recently_accessed
         )
 
-    @defer.inlineCallbacks
-    def _update_recently_accessed(self):
+    async def _update_recently_accessed(self):
         remote_media = self.recently_accessed_remotes
         self.recently_accessed_remotes = set()
 
         local_media = self.recently_accessed_locals
         self.recently_accessed_locals = set()
 
-        yield self.store.update_cached_last_access_time(
+        await self.store.update_cached_last_access_time(
             local_media, remote_media, self.clock.time_msec()
         )
 
@@ -137,8 +136,7 @@ class MediaRepository(object):
         else:
             self.recently_accessed_locals.add(media_id)
 
-    @defer.inlineCallbacks
-    def create_content(
+    async def create_content(
         self, media_type, upload_name, content, content_length, auth_user
     ):
         """Store uploaded content for a local user and return the mxc URL
@@ -157,11 +155,11 @@ class MediaRepository(object):
 
         file_info = FileInfo(server_name=None, file_id=media_id)
 
-        fname = yield self.media_storage.store_file(content, file_info)
+        fname = await self.media_storage.store_file(content, file_info)
 
         logger.info("Stored local media in file %r", fname)
 
-        yield self.store.store_local_media(
+        await self.store.store_local_media(
             media_id=media_id,
             media_type=media_type,
             time_now_ms=self.clock.time_msec(),
@@ -170,12 +168,11 @@ class MediaRepository(object):
             user_id=auth_user,
         )
 
-        yield self._generate_thumbnails(None, media_id, media_id, media_type)
+        await self._generate_thumbnails(None, media_id, media_id, media_type)
 
         return "mxc://%s/%s" % (self.server_name, media_id)
 
-    @defer.inlineCallbacks
-    def get_local_media(self, request, media_id, name):
+    async def get_local_media(self, request, media_id, name):
         """Responds to reqests for local media, if exists, or returns 404.
 
         Args:
@@ -189,7 +186,7 @@ class MediaRepository(object):
             Deferred: Resolves once a response has successfully been written
                 to request
         """
-        media_info = yield self.store.get_local_media(media_id)
+        media_info = await self.store.get_local_media(media_id)
         if not media_info or media_info["quarantined_by"]:
             respond_404(request)
             return
@@ -203,13 +200,12 @@ class MediaRepository(object):
 
         file_info = FileInfo(None, media_id, url_cache=url_cache)
 
-        responder = yield self.media_storage.fetch_media(file_info)
-        yield respond_with_responder(
+        responder = await self.media_storage.fetch_media(file_info)
+        await respond_with_responder(
             request, responder, media_type, media_length, upload_name
         )
 
-    @defer.inlineCallbacks
-    def get_remote_media(self, request, server_name, media_id, name):
+    async def get_remote_media(self, request, server_name, media_id, name):
         """Respond to requests for remote media.
 
         Args:
@@ -235,8 +231,8 @@ class MediaRepository(object):
         # We linearize here to ensure that we don't try and download remote
         # media multiple times concurrently
         key = (server_name, media_id)
-        with (yield self.remote_media_linearizer.queue(key)):
-            responder, media_info = yield self._get_remote_media_impl(
+        with (await self.remote_media_linearizer.queue(key)):
+            responder, media_info = await self._get_remote_media_impl(
                 server_name, media_id
             )
 
@@ -245,14 +241,13 @@ class MediaRepository(object):
             media_type = media_info["media_type"]
             media_length = media_info["media_length"]
             upload_name = name if name else media_info["upload_name"]
-            yield respond_with_responder(
+            await respond_with_responder(
                 request, responder, media_type, media_length, upload_name
             )
         else:
             respond_404(request)
 
-    @defer.inlineCallbacks
-    def get_remote_media_info(self, server_name, media_id):
+    async def get_remote_media_info(self, server_name, media_id):
         """Gets the media info associated with the remote file, downloading
         if necessary.
 
@@ -273,8 +268,8 @@ class MediaRepository(object):
         # We linearize here to ensure that we don't try and download remote
         # media multiple times concurrently
         key = (server_name, media_id)
-        with (yield self.remote_media_linearizer.queue(key)):
-            responder, media_info = yield self._get_remote_media_impl(
+        with (await self.remote_media_linearizer.queue(key)):
+            responder, media_info = await self._get_remote_media_impl(
                 server_name, media_id
             )
 
@@ -285,8 +280,7 @@ class MediaRepository(object):
 
         return media_info
 
-    @defer.inlineCallbacks
-    def _get_remote_media_impl(self, server_name, media_id):
+    async def _get_remote_media_impl(self, server_name, media_id):
         """Looks for media in local cache, if not there then attempt to
         download from remote server.
 
@@ -298,7 +292,7 @@ class MediaRepository(object):
         Returns:
             Deferred[(Responder, media_info)]
         """
-        media_info = yield self.store.get_cached_remote_media(server_name, media_id)
+        media_info = await self.store.get_cached_remote_media(server_name, media_id)
 
         # file_id is the ID we use to track the file locally. If we've already
         # seen the file then reuse the existing ID, otherwise genereate a new
@@ -316,19 +310,18 @@ class MediaRepository(object):
                 logger.info("Media is quarantined")
                 raise NotFoundError()
 
-            responder = yield self.media_storage.fetch_media(file_info)
+            responder = await self.media_storage.fetch_media(file_info)
             if responder:
                 return responder, media_info
 
         # Failed to find the file anywhere, lets download it.
 
-        media_info = yield self._download_remote_file(server_name, media_id, file_id)
+        media_info = await self._download_remote_file(server_name, media_id, file_id)
 
-        responder = yield self.media_storage.fetch_media(file_info)
+        responder = await self.media_storage.fetch_media(file_info)
         return responder, media_info
 
-    @defer.inlineCallbacks
-    def _download_remote_file(self, server_name, media_id, file_id):
+    async def _download_remote_file(self, server_name, media_id, file_id):
         """Attempt to download the remote file from the given server name,
         using the given file_id as the local id.
 
@@ -350,7 +343,7 @@ class MediaRepository(object):
                 ("/_matrix/media/v1/download", server_name, media_id)
             )
             try:
-                length, headers = yield self.client.get_file(
+                length, headers = await self.client.get_file(
                     server_name,
                     request_path,
                     output_stream=f,
@@ -363,7 +356,7 @@ class MediaRepository(object):
                     },
                 )
             except RequestSendFailed as e:
-                logger.warn(
+                logger.warning(
                     "Request failed fetching remote media %s/%s: %r",
                     server_name,
                     media_id,
@@ -372,7 +365,7 @@ class MediaRepository(object):
                 raise SynapseError(502, "Failed to fetch remote media")
 
             except HttpResponseException as e:
-                logger.warn(
+                logger.warning(
                     "HTTP error fetching remote media %s/%s: %s",
                     server_name,
                     media_id,
@@ -383,10 +376,12 @@ class MediaRepository(object):
                 raise SynapseError(502, "Failed to fetch remote media")
 
             except SynapseError:
-                logger.warn("Failed to fetch remote media %s/%s", server_name, media_id)
+                logger.warning(
+                    "Failed to fetch remote media %s/%s", server_name, media_id
+                )
                 raise
             except NotRetryingDestination:
-                logger.warn("Not retrying destination %r", server_name)
+                logger.warning("Not retrying destination %r", server_name)
                 raise SynapseError(502, "Failed to fetch remote media")
             except Exception:
                 logger.exception(
@@ -394,7 +389,7 @@ class MediaRepository(object):
                 )
                 raise SynapseError(502, "Failed to fetch remote media")
 
-            yield finish()
+            await finish()
 
         media_type = headers[b"Content-Type"][0].decode("ascii")
         upload_name = get_filename_from_headers(headers)
@@ -402,7 +397,7 @@ class MediaRepository(object):
 
         logger.info("Stored remote media in file %r", fname)
 
-        yield self.store.store_cached_remote_media(
+        await self.store.store_cached_remote_media(
             origin=server_name,
             media_id=media_id,
             media_type=media_type,
@@ -420,7 +415,7 @@ class MediaRepository(object):
             "filesystem_id": file_id,
         }
 
-        yield self._generate_thumbnails(server_name, media_id, file_id, media_type)
+        await self._generate_thumbnails(server_name, media_id, file_id, media_type)
 
         return media_info
 
@@ -455,16 +450,15 @@ class MediaRepository(object):
 
         return t_byte_source
 
-    @defer.inlineCallbacks
-    def generate_local_exact_thumbnail(
+    async def generate_local_exact_thumbnail(
         self, media_id, t_width, t_height, t_method, t_type, url_cache
     ):
-        input_path = yield self.media_storage.ensure_media_is_in_local_cache(
+        input_path = await self.media_storage.ensure_media_is_in_local_cache(
             FileInfo(None, media_id, url_cache=url_cache)
         )
 
         thumbnailer = Thumbnailer(input_path)
-        t_byte_source = yield defer_to_thread(
+        t_byte_source = await defer_to_thread(
             self.hs.get_reactor(),
             self._generate_thumbnail,
             thumbnailer,
@@ -487,7 +481,7 @@ class MediaRepository(object):
                     thumbnail_type=t_type,
                 )
 
-                output_path = yield self.media_storage.store_file(
+                output_path = await self.media_storage.store_file(
                     t_byte_source, file_info
                 )
             finally:
@@ -497,22 +491,21 @@ class MediaRepository(object):
 
             t_len = os.path.getsize(output_path)
 
-            yield self.store.store_local_thumbnail(
+            await self.store.store_local_thumbnail(
                 media_id, t_width, t_height, t_type, t_method, t_len
             )
 
             return output_path
 
-    @defer.inlineCallbacks
-    def generate_remote_exact_thumbnail(
+    async def generate_remote_exact_thumbnail(
         self, server_name, file_id, media_id, t_width, t_height, t_method, t_type
     ):
-        input_path = yield self.media_storage.ensure_media_is_in_local_cache(
+        input_path = await self.media_storage.ensure_media_is_in_local_cache(
             FileInfo(server_name, file_id, url_cache=False)
         )
 
         thumbnailer = Thumbnailer(input_path)
-        t_byte_source = yield defer_to_thread(
+        t_byte_source = await defer_to_thread(
             self.hs.get_reactor(),
             self._generate_thumbnail,
             thumbnailer,
@@ -534,7 +527,7 @@ class MediaRepository(object):
                     thumbnail_type=t_type,
                 )
 
-                output_path = yield self.media_storage.store_file(
+                output_path = await self.media_storage.store_file(
                     t_byte_source, file_info
                 )
             finally:
@@ -544,7 +537,7 @@ class MediaRepository(object):
 
             t_len = os.path.getsize(output_path)
 
-            yield self.store.store_remote_media_thumbnail(
+            await self.store.store_remote_media_thumbnail(
                 server_name,
                 media_id,
                 file_id,
@@ -557,8 +550,7 @@ class MediaRepository(object):
 
             return output_path
 
-    @defer.inlineCallbacks
-    def _generate_thumbnails(
+    async def _generate_thumbnails(
         self, server_name, media_id, file_id, media_type, url_cache=False
     ):
         """Generate and store thumbnails for an image.
@@ -579,7 +571,7 @@ class MediaRepository(object):
         if not requirements:
             return
 
-        input_path = yield self.media_storage.ensure_media_is_in_local_cache(
+        input_path = await self.media_storage.ensure_media_is_in_local_cache(
             FileInfo(server_name, file_id, url_cache=url_cache)
         )
 
@@ -597,13 +589,13 @@ class MediaRepository(object):
             return
 
         if thumbnailer.transpose_method is not None:
-            m_width, m_height = yield defer_to_thread(
+            m_width, m_height = await defer_to_thread(
                 self.hs.get_reactor(), thumbnailer.transpose
             )
 
         # We deduplicate the thumbnail sizes by ignoring the cropped versions if
         # they have the same dimensions of a scaled one.
-        thumbnails = {}
+        thumbnails = {}  # type: Dict[Tuple[int, int, str], str]
         for r_width, r_height, r_method, r_type in requirements:
             if r_method == "crop":
                 thumbnails.setdefault((r_width, r_height, r_type), r_method)
@@ -617,11 +609,11 @@ class MediaRepository(object):
         for (t_width, t_height, t_type), t_method in iteritems(thumbnails):
             # Generate the thumbnail
             if t_method == "crop":
-                t_byte_source = yield defer_to_thread(
+                t_byte_source = await defer_to_thread(
                     self.hs.get_reactor(), thumbnailer.crop, t_width, t_height, t_type
                 )
             elif t_method == "scale":
-                t_byte_source = yield defer_to_thread(
+                t_byte_source = await defer_to_thread(
                     self.hs.get_reactor(), thumbnailer.scale, t_width, t_height, t_type
                 )
             else:
@@ -643,7 +635,7 @@ class MediaRepository(object):
                     url_cache=url_cache,
                 )
 
-                output_path = yield self.media_storage.store_file(
+                output_path = await self.media_storage.store_file(
                     t_byte_source, file_info
                 )
             finally:
@@ -653,7 +645,7 @@ class MediaRepository(object):
 
             # Write to database
             if server_name:
-                yield self.store.store_remote_media_thumbnail(
+                await self.store.store_remote_media_thumbnail(
                     server_name,
                     media_id,
                     file_id,
@@ -664,15 +656,14 @@ class MediaRepository(object):
                     t_len,
                 )
             else:
-                yield self.store.store_local_thumbnail(
+                await self.store.store_local_thumbnail(
                     media_id, t_width, t_height, t_type, t_method, t_len
                 )
 
         return {"width": m_width, "height": m_height}
 
-    @defer.inlineCallbacks
-    def delete_old_remote_media(self, before_ts):
-        old_media = yield self.store.get_remote_media_before(before_ts)
+    async def delete_old_remote_media(self, before_ts):
+        old_media = await self.store.get_remote_media_before(before_ts)
 
         deleted = 0
 
@@ -686,12 +677,12 @@ class MediaRepository(object):
 
             # TODO: Should we delete from the backup store
 
-            with (yield self.remote_media_linearizer.queue(key)):
+            with (await self.remote_media_linearizer.queue(key)):
                 full_path = self.filepaths.remote_media_filepath(origin, file_id)
                 try:
                     os.remove(full_path)
                 except OSError as e:
-                    logger.warn("Failed to remove file: %r", full_path)
+                    logger.warning("Failed to remove file: %r", full_path)
                     if e.errno == errno.ENOENT:
                         pass
                     else:
@@ -702,7 +693,7 @@ class MediaRepository(object):
                 )
                 shutil.rmtree(thumbnail_dir, ignore_errors=True)
 
-                yield self.store.delete_remote_media(origin, media_id)
+                await self.store.delete_remote_media(origin, media_id)
                 deleted += 1
 
         return {"deleted": deleted}
diff --git a/synapse/rest/media/v1/media_storage.py b/synapse/rest/media/v1/media_storage.py
index 3b87717a5a..683a79c966 100644
--- a/synapse/rest/media/v1/media_storage.py
+++ b/synapse/rest/media/v1/media_storage.py
@@ -148,6 +148,7 @@ class MediaStorage(object):
         for provider in self.storage_providers:
             res = yield provider.fetch(path, file_info)
             if res:
+                logger.debug("Streaming %s from %s", path, provider)
                 return res
 
         return None
diff --git a/synapse/rest/media/v1/preview_url_resource.py b/synapse/rest/media/v1/preview_url_resource.py
index 7a56cd4b6c..f206605727 100644
--- a/synapse/rest/media/v1/preview_url_resource.py
+++ b/synapse/rest/media/v1/preview_url_resource.py
@@ -23,6 +23,7 @@ import re
 import shutil
 import sys
 import traceback
+from typing import Dict, Optional
 
 import six
 from six import string_types
@@ -56,6 +57,9 @@ logger = logging.getLogger(__name__)
 _charset_match = re.compile(br"<\s*meta[^>]*charset\s*=\s*([a-z0-9-]+)", flags=re.I)
 _content_type_match = re.compile(r'.*; *charset="?(.*?)"?(;|$)', flags=re.I)
 
+OG_TAG_NAME_MAXLEN = 50
+OG_TAG_VALUE_MAXLEN = 1000
+
 
 class PreviewUrlResource(DirectServeResource):
     isLeaf = True
@@ -74,12 +78,15 @@ class PreviewUrlResource(DirectServeResource):
             treq_args={"browser_like_redirects": True},
             ip_whitelist=hs.config.url_preview_ip_range_whitelist,
             ip_blacklist=hs.config.url_preview_ip_range_blacklist,
+            http_proxy=os.getenvb(b"http_proxy"),
+            https_proxy=os.getenvb(b"HTTPS_PROXY"),
         )
         self.media_repo = media_repo
         self.primary_base_path = media_repo.primary_base_path
         self.media_storage = media_storage
 
         self.url_preview_url_blacklist = hs.config.url_preview_url_blacklist
+        self.url_preview_accept_language = hs.config.url_preview_accept_language
 
         # memory cache mapping urls to an ObservableDeferred returning
         # JSON-encoded OG metadata
@@ -117,8 +124,10 @@ class PreviewUrlResource(DirectServeResource):
                 pattern = entry[attrib]
                 value = getattr(url_tuple, attrib)
                 logger.debug(
-                    ("Matching attrib '%s' with value '%s' against" " pattern '%s'")
-                    % (attrib, value, pattern)
+                    "Matching attrib '%s' with value '%s' against pattern '%s'",
+                    attrib,
+                    value,
+                    pattern,
                 )
 
                 if value is None:
@@ -134,7 +143,7 @@ class PreviewUrlResource(DirectServeResource):
                         match = False
                         continue
             if match:
-                logger.warn("URL %s blocked by url_blacklist entry %s", url, entry)
+                logger.warning("URL %s blocked by url_blacklist entry %s", url, entry)
                 raise SynapseError(
                     403, "URL blocked by url pattern blacklist entry", Codes.UNKNOWN
                 )
@@ -157,8 +166,7 @@ class PreviewUrlResource(DirectServeResource):
         og = await make_deferred_yieldable(defer.maybeDeferred(observable.observe))
         respond_with_json_bytes(request, 200, og, send_cors=True)
 
-    @defer.inlineCallbacks
-    def _do_preview(self, url, user, ts):
+    async def _do_preview(self, url, user, ts):
         """Check the db, and download the URL and build a preview
 
         Args:
@@ -167,11 +175,11 @@ class PreviewUrlResource(DirectServeResource):
             ts (int):
 
         Returns:
-            Deferred[str]: json-encoded og data
+            Deferred[bytes]: json-encoded og data
         """
         # check the URL cache in the DB (which will also provide us with
         # historical previews, if we have any)
-        cache_result = yield self.store.get_url_cache(url, ts)
+        cache_result = await self.store.get_url_cache(url, ts)
         if (
             cache_result
             and cache_result["expires_ts"] > ts
@@ -184,13 +192,13 @@ class PreviewUrlResource(DirectServeResource):
                 og = og.encode("utf8")
             return og
 
-        media_info = yield self._download_url(url, user)
+        media_info = await self._download_url(url, user)
 
-        logger.debug("got media_info of '%s'" % media_info)
+        logger.debug("got media_info of '%s'", media_info)
 
         if _is_media(media_info["media_type"]):
             file_id = media_info["filesystem_id"]
-            dims = yield self.media_repo._generate_thumbnails(
+            dims = await self.media_repo._generate_thumbnails(
                 None, file_id, file_id, media_info["media_type"], url_cache=True
             )
 
@@ -206,7 +214,7 @@ class PreviewUrlResource(DirectServeResource):
                 og["og:image:width"] = dims["width"]
                 og["og:image:height"] = dims["height"]
             else:
-                logger.warn("Couldn't get dims for %s" % url)
+                logger.warning("Couldn't get dims for %s" % url)
 
             # define our OG response for this media
         elif _is_html(media_info["media_type"]):
@@ -230,8 +238,8 @@ class PreviewUrlResource(DirectServeResource):
             # If we don't find a match, we'll look at the HTTP Content-Type, and
             # if that doesn't exist, we'll fall back to UTF-8.
             if not encoding:
-                match = _content_type_match.match(media_info["media_type"])
-                encoding = match.group(1) if match else "utf-8"
+                content_match = _content_type_match.match(media_info["media_type"])
+                encoding = content_match.group(1) if content_match else "utf-8"
 
             og = decode_and_calc_og(body, media_info["uri"], encoding)
 
@@ -240,21 +248,21 @@ class PreviewUrlResource(DirectServeResource):
             # request itself and benefit from the same caching etc.  But for now we
             # just rely on the caching on the master request to speed things up.
             if "og:image" in og and og["og:image"]:
-                image_info = yield self._download_url(
+                image_info = await self._download_url(
                     _rebase_url(og["og:image"], media_info["uri"]), user
                 )
 
                 if _is_media(image_info["media_type"]):
                     # TODO: make sure we don't choke on white-on-transparent images
                     file_id = image_info["filesystem_id"]
-                    dims = yield self.media_repo._generate_thumbnails(
+                    dims = await self.media_repo._generate_thumbnails(
                         None, file_id, file_id, image_info["media_type"], url_cache=True
                     )
                     if dims:
                         og["og:image:width"] = dims["width"]
                         og["og:image:height"] = dims["height"]
                     else:
-                        logger.warn("Couldn't get dims for %s" % og["og:image"])
+                        logger.warning("Couldn't get dims for %s", og["og:image"])
 
                     og["og:image"] = "mxc://%s/%s" % (
                         self.server_name,
@@ -265,15 +273,27 @@ class PreviewUrlResource(DirectServeResource):
                 else:
                     del og["og:image"]
         else:
-            logger.warn("Failed to find any OG data in %s", url)
+            logger.warning("Failed to find any OG data in %s", url)
             og = {}
 
-        logger.debug("Calculated OG for %s as %s" % (url, og))
+        # filter out any stupidly long values
+        keys_to_remove = []
+        for k, v in og.items():
+            # values can be numeric as well as strings, hence the cast to str
+            if len(k) > OG_TAG_NAME_MAXLEN or len(str(v)) > OG_TAG_VALUE_MAXLEN:
+                logger.warning(
+                    "Pruning overlong tag %s from OG data", k[:OG_TAG_NAME_MAXLEN]
+                )
+                keys_to_remove.append(k)
+        for k in keys_to_remove:
+            del og[k]
+
+        logger.debug("Calculated OG for %s as %s", url, og)
 
-        jsonog = json.dumps(og).encode("utf8")
+        jsonog = json.dumps(og)
 
         # store OG in history-aware DB cache
-        yield self.store.store_url_cache(
+        await self.store.store_url_cache(
             url,
             media_info["response_code"],
             media_info["etag"],
@@ -283,10 +303,9 @@ class PreviewUrlResource(DirectServeResource):
             media_info["created_ts"],
         )
 
-        return jsonog
+        return jsonog.encode("utf8")
 
-    @defer.inlineCallbacks
-    def _download_url(self, url, user):
+    async def _download_url(self, url, user):
         # TODO: we should probably honour robots.txt... except in practice
         # we're most likely being explicitly triggered by a human rather than a
         # bot, so are we really a robot?
@@ -297,9 +316,12 @@ class PreviewUrlResource(DirectServeResource):
 
         with self.media_storage.store_into_file(file_info) as (f, fname, finish):
             try:
-                logger.debug("Trying to get url '%s'" % url)
-                length, headers, uri, code = yield self.client.get_file(
-                    url, output_stream=f, max_size=self.max_spider_size
+                logger.debug("Trying to get preview for url '%s'", url)
+                length, headers, uri, code = await self.client.get_file(
+                    url,
+                    output_stream=f,
+                    max_size=self.max_spider_size,
+                    headers={"Accept-Language": self.url_preview_accept_language},
                 )
             except SynapseError:
                 # Pass SynapseErrors through directly, so that the servlet
@@ -317,7 +339,7 @@ class PreviewUrlResource(DirectServeResource):
                 )
             except Exception as e:
                 # FIXME: pass through 404s and other error messages nicely
-                logger.warn("Error downloading %s: %r", url, e)
+                logger.warning("Error downloading %s: %r", url, e)
 
                 raise SynapseError(
                     500,
@@ -325,7 +347,7 @@ class PreviewUrlResource(DirectServeResource):
                     % (traceback.format_exception_only(sys.exc_info()[0], e),),
                     Codes.UNKNOWN,
                 )
-            yield finish()
+            await finish()
 
         try:
             if b"Content-Type" in headers:
@@ -336,7 +358,7 @@ class PreviewUrlResource(DirectServeResource):
 
             download_name = get_filename_from_headers(headers)
 
-            yield self.store.store_local_media(
+            await self.store.store_local_media(
                 media_id=file_id,
                 media_type=media_type,
                 time_now_ms=self.clock.time_msec(),
@@ -373,22 +395,21 @@ class PreviewUrlResource(DirectServeResource):
             "expire_url_cache_data", self._expire_url_cache_data
         )
 
-    @defer.inlineCallbacks
-    def _expire_url_cache_data(self):
+    async def _expire_url_cache_data(self):
         """Clean up expired url cache content, media and thumbnails.
         """
         # TODO: Delete from backup media store
 
         now = self.clock.time_msec()
 
-        logger.info("Running url preview cache expiry")
+        logger.debug("Running url preview cache expiry")
 
-        if not (yield self.store.has_completed_background_updates()):
+        if not (await self.store.db.updates.has_completed_background_updates()):
             logger.info("Still running DB updates; skipping expiry")
             return
 
         # First we delete expired url cache entries
-        media_ids = yield self.store.get_expired_url_cache(now)
+        media_ids = await self.store.get_expired_url_cache(now)
 
         removed_media = []
         for media_id in media_ids:
@@ -398,7 +419,7 @@ class PreviewUrlResource(DirectServeResource):
             except OSError as e:
                 # If the path doesn't exist, meh
                 if e.errno != errno.ENOENT:
-                    logger.warn("Failed to remove media: %r: %s", media_id, e)
+                    logger.warning("Failed to remove media: %r: %s", media_id, e)
                     continue
 
             removed_media.append(media_id)
@@ -410,17 +431,19 @@ class PreviewUrlResource(DirectServeResource):
             except Exception:
                 pass
 
-        yield self.store.delete_url_cache(removed_media)
+        await self.store.delete_url_cache(removed_media)
 
         if removed_media:
             logger.info("Deleted %d entries from url cache", len(removed_media))
+        else:
+            logger.debug("No entries removed from url cache")
 
         # Now we delete old images associated with the url cache.
         # These may be cached for a bit on the client (i.e., they
         # may have a room open with a preview url thing open).
         # So we wait a couple of days before deleting, just in case.
         expire_before = now - 2 * 24 * 60 * 60 * 1000
-        media_ids = yield self.store.get_url_cache_media_before(expire_before)
+        media_ids = await self.store.get_url_cache_media_before(expire_before)
 
         removed_media = []
         for media_id in media_ids:
@@ -430,7 +453,7 @@ class PreviewUrlResource(DirectServeResource):
             except OSError as e:
                 # If the path doesn't exist, meh
                 if e.errno != errno.ENOENT:
-                    logger.warn("Failed to remove media: %r: %s", media_id, e)
+                    logger.warning("Failed to remove media: %r: %s", media_id, e)
                     continue
 
             try:
@@ -446,7 +469,7 @@ class PreviewUrlResource(DirectServeResource):
             except OSError as e:
                 # If the path doesn't exist, meh
                 if e.errno != errno.ENOENT:
-                    logger.warn("Failed to remove media: %r: %s", media_id, e)
+                    logger.warning("Failed to remove media: %r: %s", media_id, e)
                     continue
 
             removed_media.append(media_id)
@@ -458,9 +481,12 @@ class PreviewUrlResource(DirectServeResource):
             except Exception:
                 pass
 
-        yield self.store.delete_url_cache_media(removed_media)
+        await self.store.delete_url_cache_media(removed_media)
 
-        logger.info("Deleted %d media from url cache", len(removed_media))
+        if removed_media:
+            logger.info("Deleted %d media from url cache", len(removed_media))
+        else:
+            logger.debug("No media removed from url cache")
 
 
 def decode_and_calc_og(body, media_uri, request_encoding=None):
@@ -499,9 +525,13 @@ def _calc_og(tree, media_uri):
     # "og:video:height" : "720",
     # "og:video:secure_url": "https://www.youtube.com/v/LXDBoHyjmtw?version=3",
 
-    og = {}
+    og = {}  # type: Dict[str, Optional[str]]
     for tag in tree.xpath("//*/meta[starts-with(@property, 'og:')]"):
         if "content" in tag.attrib:
+            # if we've got more than 50 tags, someone is taking the piss
+            if len(og) >= 50:
+                logger.warning("Skipping OG for page with too many 'og:' tags")
+                return {}
             og[tag.attrib["property"]] = tag.attrib["content"]
 
     # TODO: grab article: meta tags too, e.g.:
diff --git a/synapse/rest/media/v1/storage_provider.py b/synapse/rest/media/v1/storage_provider.py
index 37687ea7f4..858680be26 100644
--- a/synapse/rest/media/v1/storage_provider.py
+++ b/synapse/rest/media/v1/storage_provider.py
@@ -77,6 +77,9 @@ class StorageProviderWrapper(StorageProvider):
         self.store_synchronous = store_synchronous
         self.store_remote = store_remote
 
+    def __str__(self):
+        return "StorageProviderWrapper[%s]" % (self.backend,)
+
     def store_file(self, path, file_info):
         if not file_info.server_name and not self.store_local:
             return defer.succeed(None)
@@ -114,6 +117,9 @@ class FileStorageProviderBackend(StorageProvider):
         self.cache_directory = hs.config.media_store_path
         self.base_directory = config
 
+    def __str__(self):
+        return "FileStorageProviderBackend[%s]" % (self.base_directory,)
+
     def store_file(self, path, file_info):
         """See StorageProvider.store_file"""
 
diff --git a/synapse/rest/media/v1/thumbnail_resource.py b/synapse/rest/media/v1/thumbnail_resource.py
index 08329884ac..0b87220234 100644
--- a/synapse/rest/media/v1/thumbnail_resource.py
+++ b/synapse/rest/media/v1/thumbnail_resource.py
@@ -16,8 +16,6 @@
 
 import logging
 
-from twisted.internet import defer
-
 from synapse.http.server import (
     DirectServeResource,
     set_cors_headers,
@@ -79,11 +77,10 @@ class ThumbnailResource(DirectServeResource):
                 )
             self.media_repo.mark_recently_accessed(server_name, media_id)
 
-    @defer.inlineCallbacks
-    def _respond_local_thumbnail(
+    async def _respond_local_thumbnail(
         self, request, media_id, width, height, method, m_type
     ):
-        media_info = yield self.store.get_local_media(media_id)
+        media_info = await self.store.get_local_media(media_id)
 
         if not media_info:
             respond_404(request)
@@ -93,7 +90,7 @@ class ThumbnailResource(DirectServeResource):
             respond_404(request)
             return
 
-        thumbnail_infos = yield self.store.get_local_media_thumbnails(media_id)
+        thumbnail_infos = await self.store.get_local_media_thumbnails(media_id)
 
         if thumbnail_infos:
             thumbnail_info = self._select_thumbnail(
@@ -114,14 +111,13 @@ class ThumbnailResource(DirectServeResource):
             t_type = file_info.thumbnail_type
             t_length = thumbnail_info["thumbnail_length"]
 
-            responder = yield self.media_storage.fetch_media(file_info)
-            yield respond_with_responder(request, responder, t_type, t_length)
+            responder = await self.media_storage.fetch_media(file_info)
+            await respond_with_responder(request, responder, t_type, t_length)
         else:
             logger.info("Couldn't find any generated thumbnails")
             respond_404(request)
 
-    @defer.inlineCallbacks
-    def _select_or_generate_local_thumbnail(
+    async def _select_or_generate_local_thumbnail(
         self,
         request,
         media_id,
@@ -130,7 +126,7 @@ class ThumbnailResource(DirectServeResource):
         desired_method,
         desired_type,
     ):
-        media_info = yield self.store.get_local_media(media_id)
+        media_info = await self.store.get_local_media(media_id)
 
         if not media_info:
             respond_404(request)
@@ -140,7 +136,7 @@ class ThumbnailResource(DirectServeResource):
             respond_404(request)
             return
 
-        thumbnail_infos = yield self.store.get_local_media_thumbnails(media_id)
+        thumbnail_infos = await self.store.get_local_media_thumbnails(media_id)
         for info in thumbnail_infos:
             t_w = info["thumbnail_width"] == desired_width
             t_h = info["thumbnail_height"] == desired_height
@@ -162,15 +158,15 @@ class ThumbnailResource(DirectServeResource):
                 t_type = file_info.thumbnail_type
                 t_length = info["thumbnail_length"]
 
-                responder = yield self.media_storage.fetch_media(file_info)
+                responder = await self.media_storage.fetch_media(file_info)
                 if responder:
-                    yield respond_with_responder(request, responder, t_type, t_length)
+                    await respond_with_responder(request, responder, t_type, t_length)
                     return
 
         logger.debug("We don't have a thumbnail of that size. Generating")
 
         # Okay, so we generate one.
-        file_path = yield self.media_repo.generate_local_exact_thumbnail(
+        file_path = await self.media_repo.generate_local_exact_thumbnail(
             media_id,
             desired_width,
             desired_height,
@@ -180,13 +176,12 @@ class ThumbnailResource(DirectServeResource):
         )
 
         if file_path:
-            yield respond_with_file(request, desired_type, file_path)
+            await respond_with_file(request, desired_type, file_path)
         else:
-            logger.warn("Failed to generate thumbnail")
+            logger.warning("Failed to generate thumbnail")
             respond_404(request)
 
-    @defer.inlineCallbacks
-    def _select_or_generate_remote_thumbnail(
+    async def _select_or_generate_remote_thumbnail(
         self,
         request,
         server_name,
@@ -196,9 +191,9 @@ class ThumbnailResource(DirectServeResource):
         desired_method,
         desired_type,
     ):
-        media_info = yield self.media_repo.get_remote_media_info(server_name, media_id)
+        media_info = await self.media_repo.get_remote_media_info(server_name, media_id)
 
-        thumbnail_infos = yield self.store.get_remote_media_thumbnails(
+        thumbnail_infos = await self.store.get_remote_media_thumbnails(
             server_name, media_id
         )
 
@@ -224,15 +219,15 @@ class ThumbnailResource(DirectServeResource):
                 t_type = file_info.thumbnail_type
                 t_length = info["thumbnail_length"]
 
-                responder = yield self.media_storage.fetch_media(file_info)
+                responder = await self.media_storage.fetch_media(file_info)
                 if responder:
-                    yield respond_with_responder(request, responder, t_type, t_length)
+                    await respond_with_responder(request, responder, t_type, t_length)
                     return
 
         logger.debug("We don't have a thumbnail of that size. Generating")
 
         # Okay, so we generate one.
-        file_path = yield self.media_repo.generate_remote_exact_thumbnail(
+        file_path = await self.media_repo.generate_remote_exact_thumbnail(
             server_name,
             file_id,
             media_id,
@@ -243,21 +238,20 @@ class ThumbnailResource(DirectServeResource):
         )
 
         if file_path:
-            yield respond_with_file(request, desired_type, file_path)
+            await respond_with_file(request, desired_type, file_path)
         else:
-            logger.warn("Failed to generate thumbnail")
+            logger.warning("Failed to generate thumbnail")
             respond_404(request)
 
-    @defer.inlineCallbacks
-    def _respond_remote_thumbnail(
+    async def _respond_remote_thumbnail(
         self, request, server_name, media_id, width, height, method, m_type
     ):
         # TODO: Don't download the whole remote file
         # We should proxy the thumbnail from the remote server instead of
         # downloading the remote file and generating our own thumbnails.
-        media_info = yield self.media_repo.get_remote_media_info(server_name, media_id)
+        media_info = await self.media_repo.get_remote_media_info(server_name, media_id)
 
-        thumbnail_infos = yield self.store.get_remote_media_thumbnails(
+        thumbnail_infos = await self.store.get_remote_media_thumbnails(
             server_name, media_id
         )
 
@@ -278,8 +272,8 @@ class ThumbnailResource(DirectServeResource):
             t_type = file_info.thumbnail_type
             t_length = thumbnail_info["thumbnail_length"]
 
-            responder = yield self.media_storage.fetch_media(file_info)
-            yield respond_with_responder(request, responder, t_type, t_length)
+            responder = await self.media_storage.fetch_media(file_info)
+            await respond_with_responder(request, responder, t_type, t_length)
         else:
             logger.info("Failed to find any generated thumbnails")
             respond_404(request)
@@ -296,8 +290,8 @@ class ThumbnailResource(DirectServeResource):
         d_h = desired_height
 
         if desired_method.lower() == "crop":
-            info_list = []
-            info_list2 = []
+            crop_info_list = []
+            crop_info_list2 = []
             for info in thumbnail_infos:
                 t_w = info["thumbnail_width"]
                 t_h = info["thumbnail_height"]
@@ -309,7 +303,7 @@ class ThumbnailResource(DirectServeResource):
                     type_quality = desired_type != info["thumbnail_type"]
                     length_quality = info["thumbnail_length"]
                     if t_w >= d_w or t_h >= d_h:
-                        info_list.append(
+                        crop_info_list.append(
                             (
                                 aspect_quality,
                                 min_quality,
@@ -320,7 +314,7 @@ class ThumbnailResource(DirectServeResource):
                             )
                         )
                     else:
-                        info_list2.append(
+                        crop_info_list2.append(
                             (
                                 aspect_quality,
                                 min_quality,
@@ -330,10 +324,10 @@ class ThumbnailResource(DirectServeResource):
                                 info,
                             )
                         )
-            if info_list:
-                return min(info_list)[-1]
+            if crop_info_list:
+                return min(crop_info_list)[-1]
             else:
-                return min(info_list2)[-1]
+                return min(crop_info_list2)[-1]
         else:
             info_list = []
             info_list2 = []
diff --git a/synapse/rest/media/v1/thumbnailer.py b/synapse/rest/media/v1/thumbnailer.py
index c995d7e043..c234ea7421 100644
--- a/synapse/rest/media/v1/thumbnailer.py
+++ b/synapse/rest/media/v1/thumbnailer.py
@@ -82,13 +82,21 @@ class Thumbnailer(object):
         else:
             return (max_height * self.width) // self.height, max_height
 
+    def _resize(self, width, height):
+        # 1-bit or 8-bit color palette images need converting to RGB
+        # otherwise they will be scaled using nearest neighbour which
+        # looks awful
+        if self.image.mode in ["1", "P"]:
+            self.image = self.image.convert("RGB")
+        return self.image.resize((width, height), Image.ANTIALIAS)
+
     def scale(self, width, height, output_type):
         """Rescales the image to the given dimensions.
 
         Returns:
             BytesIO: the bytes of the encoded image ready to be written to disk
         """
-        scaled = self.image.resize((width, height), Image.ANTIALIAS)
+        scaled = self._resize(width, height)
         return self._encode_image(scaled, output_type)
 
     def crop(self, width, height, output_type):
@@ -107,13 +115,13 @@ class Thumbnailer(object):
         """
         if width * self.height > height * self.width:
             scaled_height = (width * self.height) // self.width
-            scaled_image = self.image.resize((width, scaled_height), Image.ANTIALIAS)
+            scaled_image = self._resize(width, scaled_height)
             crop_top = (scaled_height - height) // 2
             crop_bottom = height + crop_top
             cropped = scaled_image.crop((0, crop_top, width, crop_bottom))
         else:
             scaled_width = (height * self.width) // self.height
-            scaled_image = self.image.resize((scaled_width, height), Image.ANTIALIAS)
+            scaled_image = self._resize(scaled_width, height)
             crop_left = (scaled_width - width) // 2
             crop_right = width + crop_left
             cropped = scaled_image.crop((crop_left, 0, crop_right, height))
@@ -121,5 +129,8 @@ class Thumbnailer(object):
 
     def _encode_image(self, output_image, output_type):
         output_bytes_io = BytesIO()
-        output_image.save(output_bytes_io, self.FORMATS[output_type], quality=80)
+        fmt = self.FORMATS[output_type]
+        if fmt == "JPEG":
+            output_image = output_image.convert("RGB")
+        output_image.save(output_bytes_io, fmt, quality=80)
         return output_bytes_io
diff --git a/synapse/rest/media/v1/upload_resource.py b/synapse/rest/media/v1/upload_resource.py
index 5d76bbdf68..83d005812d 100644
--- a/synapse/rest/media/v1/upload_resource.py
+++ b/synapse/rest/media/v1/upload_resource.py
@@ -17,7 +17,7 @@ import logging
 
 from twisted.web.server import NOT_DONE_YET
 
-from synapse.api.errors import SynapseError
+from synapse.api.errors import Codes, SynapseError
 from synapse.http.server import (
     DirectServeResource,
     respond_with_json,
@@ -56,7 +56,11 @@ class UploadResource(DirectServeResource):
         if content_length is None:
             raise SynapseError(msg="Request must specify a Content-Length", code=400)
         if int(content_length) > self.max_upload_size:
-            raise SynapseError(msg="Upload request body is too large", code=413)
+            raise SynapseError(
+                msg="Upload request body is too large",
+                code=413,
+                errcode=Codes.TOO_LARGE,
+            )
 
         upload_name = parse_string(request, b"filename", encoding=None)
         if upload_name: