diff --git a/changelog.d/4176.bugfix b/changelog.d/4176.bugfix
new file mode 100644
index 0000000000..3846f8a27b
--- /dev/null
+++ b/changelog.d/4176.bugfix
@@ -0,0 +1 @@
+The media repository now no longer fails to decode UTF-8 filenames when downloading remote media.
diff --git a/synapse/rest/media/v1/_base.py b/synapse/rest/media/v1/_base.py
index 76e479afa3..efe42a429d 100644
--- a/synapse/rest/media/v1/_base.py
+++ b/synapse/rest/media/v1/_base.py
@@ -16,6 +16,7 @@
import logging
import os
+from six import PY3
from six.moves import urllib
from twisted.internet import defer
@@ -48,26 +49,21 @@ def parse_media_id(request):
return server_name, media_id, file_name
except Exception:
raise SynapseError(
- 404,
- "Invalid media id token %r" % (request.postpath,),
- Codes.UNKNOWN,
+ 404, "Invalid media id token %r" % (request.postpath,), Codes.UNKNOWN
)
def respond_404(request):
respond_with_json(
- request, 404,
- cs_error(
- "Not found %r" % (request.postpath,),
- code=Codes.NOT_FOUND,
- ),
- send_cors=True
+ request,
+ 404,
+ cs_error("Not found %r" % (request.postpath,), code=Codes.NOT_FOUND),
+ send_cors=True,
)
@defer.inlineCallbacks
-def respond_with_file(request, media_type, file_path,
- file_size=None, upload_name=None):
+def respond_with_file(request, media_type, file_path, file_size=None, upload_name=None):
logger.debug("Responding with %r", file_path)
if os.path.isfile(file_path):
@@ -97,31 +93,26 @@ def add_file_headers(request, media_type, file_size, upload_name):
file_size (int): Size in bytes of the media, if known.
upload_name (str): The name of the requested file, if any.
"""
+
def _quote(x):
return urllib.parse.quote(x.encode("utf-8"))
request.setHeader(b"Content-Type", media_type.encode("UTF-8"))
if upload_name:
if is_ascii(upload_name):
- disposition = ("inline; filename=%s" % (_quote(upload_name),)).encode("ascii")
+ disposition = "inline; filename=%s" % (_quote(upload_name),)
else:
- disposition = (
- "inline; filename*=utf-8''%s" % (_quote(upload_name),)).encode("ascii")
+ disposition = "inline; filename*=utf-8''%s" % (_quote(upload_name),)
- request.setHeader(b"Content-Disposition", disposition)
+ request.setHeader(b"Content-Disposition", disposition.encode('ascii'))
# cache for at least a day.
# XXX: we might want to turn this off for data we don't want to
# recommend caching as it's sensitive or private - or at least
# select private. don't bother setting Expires as all our
# clients are smart enough to be happy with Cache-Control
- request.setHeader(
- b"Cache-Control", b"public,max-age=86400,s-maxage=86400"
- )
-
- request.setHeader(
- b"Content-Length", b"%d" % (file_size,)
- )
+ request.setHeader(b"Cache-Control", b"public,max-age=86400,s-maxage=86400")
+ request.setHeader(b"Content-Length", b"%d" % (file_size,))
@defer.inlineCallbacks
@@ -153,6 +144,7 @@ class Responder(object):
Responder is a context manager which *must* be used, so that any resources
held can be cleaned up.
"""
+
def write_to_consumer(self, consumer):
"""Stream response into consumer
@@ -186,9 +178,18 @@ class FileInfo(object):
thumbnail_method (str)
thumbnail_type (str): Content type of thumbnail, e.g. image/png
"""
- def __init__(self, server_name, file_id, url_cache=False,
- thumbnail=False, thumbnail_width=None, thumbnail_height=None,
- thumbnail_method=None, thumbnail_type=None):
+
+ def __init__(
+ self,
+ server_name,
+ file_id,
+ url_cache=False,
+ thumbnail=False,
+ thumbnail_width=None,
+ thumbnail_height=None,
+ thumbnail_method=None,
+ thumbnail_type=None,
+ ):
self.server_name = server_name
self.file_id = file_id
self.url_cache = url_cache
@@ -197,3 +198,74 @@ class FileInfo(object):
self.thumbnail_height = thumbnail_height
self.thumbnail_method = thumbnail_method
self.thumbnail_type = thumbnail_type
+
+
+def get_filename_from_headers(headers):
+ """
+ Get the filename of the downloaded file by inspecting the
+ Content-Disposition HTTP header.
+
+ Args:
+ headers (twisted.web.http_headers.Headers): The HTTP
+ request headers.
+
+ Returns:
+ A Unicode string of the filename, or None.
+ """
+ content_disposition = headers.get(b"Content-Disposition", [b''])
+
+ # No header, bail out.
+ if not content_disposition[0]:
+ return
+
+ # dict of unicode: bytes, corresponding to the key value sections of the
+ # Content-Disposition header.
+ params = {}
+ parts = content_disposition[0].split(b";")
+ for i in parts:
+ # Split into key-value pairs, if able
+ # We don't care about things like `inline`, so throw it out
+ if b"=" not in i:
+ continue
+
+ key, value = i.strip().split(b"=")
+ params[key.decode('ascii')] = value
+
+ upload_name = None
+
+ # First check if there is a valid UTF-8 filename
+ upload_name_utf8 = params.get("filename*", None)
+ if upload_name_utf8:
+ if upload_name_utf8.lower().startswith(b"utf-8''"):
+ upload_name_utf8 = upload_name_utf8[7:]
+ # We have a filename*= section. This MUST be ASCII, and any UTF-8
+ # bytes are %-quoted.
+ if PY3:
+ try:
+ # Once it is decoded, we can then unquote the %-encoded
+ # parts strictly into a unicode string.
+ upload_name = urllib.parse.unquote(
+ upload_name_utf8.decode('ascii'), errors="strict"
+ )
+ except UnicodeDecodeError:
+ # Incorrect UTF-8.
+ pass
+ else:
+ # On Python 2, we first unquote the %-encoded parts and then
+ # decode it strictly using UTF-8.
+ try:
+ upload_name = urllib.parse.unquote(upload_name_utf8).decode('utf8')
+ except UnicodeDecodeError:
+ pass
+
+ # If there isn't check for an ascii name.
+ if not upload_name:
+ upload_name_ascii = params.get("filename", None)
+ if upload_name_ascii and is_ascii(upload_name_ascii):
+ # Make sure there's no %-quoted bytes. If there is, reject it as
+ # non-valid ASCII.
+ if b"%" not in upload_name_ascii:
+ upload_name = upload_name_ascii.decode('ascii')
+
+ # This may be None here, indicating we did not find a matching name.
+ return upload_name
diff --git a/synapse/rest/media/v1/media_repository.py b/synapse/rest/media/v1/media_repository.py
index d6c5f07af0..e117836e9a 100644
--- a/synapse/rest/media/v1/media_repository.py
+++ b/synapse/rest/media/v1/media_repository.py
@@ -14,14 +14,12 @@
# See the License for the specific language governing permissions and
# limitations under the License.
-import cgi
import errno
import logging
import os
import shutil
-from six import PY3, iteritems
-from six.moves.urllib import parse as urlparse
+from six import iteritems
import twisted.internet.error
import twisted.web.http
@@ -34,14 +32,18 @@ from synapse.api.errors import (
NotFoundError,
SynapseError,
)
-from synapse.http.matrixfederationclient import MatrixFederationHttpClient
from synapse.metrics.background_process_metrics import run_as_background_process
from synapse.util import logcontext
from synapse.util.async_helpers import Linearizer
from synapse.util.retryutils import NotRetryingDestination
-from synapse.util.stringutils import is_ascii, random_string
+from synapse.util.stringutils import random_string
-from ._base import FileInfo, respond_404, respond_with_responder
+from ._base import (
+ FileInfo,
+ get_filename_from_headers,
+ respond_404,
+ respond_with_responder,
+)
from .config_resource import MediaConfigResource
from .download_resource import DownloadResource
from .filepath import MediaFilePaths
@@ -62,7 +64,7 @@ class MediaRepository(object):
def __init__(self, hs):
self.hs = hs
self.auth = hs.get_auth()
- self.client = MatrixFederationHttpClient(hs)
+ self.client = hs.get_http_client()
self.clock = hs.get_clock()
self.server_name = hs.hostname
self.store = hs.get_datastore()
@@ -397,39 +399,9 @@ class MediaRepository(object):
yield finish()
media_type = headers[b"Content-Type"][0].decode('ascii')
-
+ upload_name = get_filename_from_headers(headers)
time_now_ms = self.clock.time_msec()
- content_disposition = headers.get(b"Content-Disposition", None)
- if content_disposition:
- _, params = cgi.parse_header(content_disposition[0].decode('ascii'),)
- upload_name = None
-
- # First check if there is a valid UTF-8 filename
- upload_name_utf8 = params.get("filename*", None)
- if upload_name_utf8:
- if upload_name_utf8.lower().startswith("utf-8''"):
- upload_name = upload_name_utf8[7:]
-
- # If there isn't check for an ascii name.
- if not upload_name:
- upload_name_ascii = params.get("filename", None)
- if upload_name_ascii and is_ascii(upload_name_ascii):
- upload_name = upload_name_ascii
-
- if upload_name:
- if PY3:
- upload_name = urlparse.unquote(upload_name)
- else:
- upload_name = urlparse.unquote(upload_name.encode('ascii'))
- try:
- if isinstance(upload_name, bytes):
- upload_name = upload_name.decode("utf-8")
- except UnicodeDecodeError:
- upload_name = None
- else:
- upload_name = None
-
logger.info("Stored remote media in file %r", fname)
yield self.store.store_cached_remote_media(
diff --git a/synapse/rest/media/v1/preview_url_resource.py b/synapse/rest/media/v1/preview_url_resource.py
index 9b15699e4d..d0ecf241b6 100644
--- a/synapse/rest/media/v1/preview_url_resource.py
+++ b/synapse/rest/media/v1/preview_url_resource.py
@@ -13,7 +13,6 @@
# See the License for the specific language governing permissions and
# limitations under the License.
-import cgi
import datetime
import errno
import fnmatch
@@ -44,10 +43,11 @@ from synapse.http.server import (
)
from synapse.http.servlet import parse_integer, parse_string
from synapse.metrics.background_process_metrics import run_as_background_process
+from synapse.rest.media.v1._base import get_filename_from_headers
from synapse.util.async_helpers import ObservableDeferred
from synapse.util.caches.expiringcache import ExpiringCache
from synapse.util.logcontext import make_deferred_yieldable, run_in_background
-from synapse.util.stringutils import is_ascii, random_string
+from synapse.util.stringutils import random_string
from ._base import FileInfo
@@ -336,31 +336,7 @@ class PreviewUrlResource(Resource):
media_type = "application/octet-stream"
time_now_ms = self.clock.time_msec()
- content_disposition = headers.get(b"Content-Disposition", None)
- if content_disposition:
- _, params = cgi.parse_header(content_disposition[0],)
- download_name = None
-
- # First check if there is a valid UTF-8 filename
- download_name_utf8 = params.get("filename*", None)
- if download_name_utf8:
- if download_name_utf8.lower().startswith("utf-8''"):
- download_name = download_name_utf8[7:]
-
- # If there isn't check for an ascii name.
- if not download_name:
- download_name_ascii = params.get("filename", None)
- if download_name_ascii and is_ascii(download_name_ascii):
- download_name = download_name_ascii
-
- if download_name:
- download_name = urlparse.unquote(download_name)
- try:
- download_name = download_name.decode("utf-8")
- except UnicodeDecodeError:
- download_name = None
- else:
- download_name = None
+ download_name = get_filename_from_headers(headers)
yield self.store.store_local_media(
media_id=file_id,
diff --git a/tests/rest/media/v1/test_media_storage.py b/tests/rest/media/v1/test_media_storage.py
index a86901c2d8..fd131e3454 100644
--- a/tests/rest/media/v1/test_media_storage.py
+++ b/tests/rest/media/v1/test_media_storage.py
@@ -17,15 +17,20 @@
import os
import shutil
import tempfile
+from binascii import unhexlify
from mock import Mock
+from six.moves.urllib import parse
from twisted.internet import defer, reactor
+from twisted.internet.defer import Deferred
+from synapse.config.repository import MediaStorageProviderConfig
from synapse.rest.media.v1._base import FileInfo
from synapse.rest.media.v1.filepath import MediaFilePaths
from synapse.rest.media.v1.media_storage import MediaStorage
from synapse.rest.media.v1.storage_provider import FileStorageProviderBackend
+from synapse.util.module_loader import load_module
from tests import unittest
@@ -83,3 +88,143 @@ class MediaStorageTests(unittest.TestCase):
body = f.read()
self.assertEqual(test_body, body)
+
+
+class MediaRepoTests(unittest.HomeserverTestCase):
+
+ hijack_auth = True
+ user_id = "@test:user"
+
+ def make_homeserver(self, reactor, clock):
+
+ self.fetches = []
+
+ def get_file(destination, path, output_stream, args=None, max_size=None):
+ """
+ Returns tuple[int,dict,str,int] of file length, response headers,
+ absolute URI, and response code.
+ """
+
+ def write_to(r):
+ data, response = r
+ output_stream.write(data)
+ return response
+
+ d = Deferred()
+ d.addCallback(write_to)
+ self.fetches.append((d, destination, path, args))
+ return d
+
+ client = Mock()
+ client.get_file = get_file
+
+ self.storage_path = self.mktemp()
+ os.mkdir(self.storage_path)
+
+ config = self.default_config()
+ config.media_store_path = self.storage_path
+ config.thumbnail_requirements = {}
+ config.max_image_pixels = 2000000
+
+ provider_config = {
+ "module": "synapse.rest.media.v1.storage_provider.FileStorageProviderBackend",
+ "store_local": True,
+ "store_synchronous": False,
+ "store_remote": True,
+ "config": {"directory": self.storage_path},
+ }
+
+ loaded = list(load_module(provider_config)) + [
+ MediaStorageProviderConfig(False, False, False)
+ ]
+
+ config.media_storage_providers = [loaded]
+
+ hs = self.setup_test_homeserver(config=config, http_client=client)
+
+ return hs
+
+ def prepare(self, reactor, clock, hs):
+
+ self.media_repo = hs.get_media_repository_resource()
+ self.download_resource = self.media_repo.children[b'download']
+
+ # smol png
+ self.end_content = unhexlify(
+ b"89504e470d0a1a0a0000000d4948445200000001000000010806"
+ b"0000001f15c4890000000a49444154789c63000100000500010d"
+ b"0a2db40000000049454e44ae426082"
+ )
+
+ def _req(self, content_disposition):
+
+ request, channel = self.make_request(
+ "GET", "example.com/12345", shorthand=False
+ )
+ request.render(self.download_resource)
+ self.pump()
+
+ # We've made one fetch, to example.com, using the media URL, and asking
+ # the other server not to do a remote fetch
+ self.assertEqual(len(self.fetches), 1)
+ self.assertEqual(self.fetches[0][1], "example.com")
+ self.assertEqual(
+ self.fetches[0][2], "/_matrix/media/v1/download/example.com/12345"
+ )
+ self.assertEqual(self.fetches[0][3], {"allow_remote": "false"})
+
+ headers = {
+ b"Content-Length": [b"%d" % (len(self.end_content))],
+ b"Content-Type": [b'image/png'],
+ }
+ if content_disposition:
+ headers[b"Content-Disposition"] = [content_disposition]
+
+ self.fetches[0][0].callback(
+ (self.end_content, (len(self.end_content), headers))
+ )
+
+ self.pump()
+ self.assertEqual(channel.code, 200)
+
+ return channel
+
+ def test_disposition_filename_ascii(self):
+ """
+ If the filename is filename=<ascii> then Synapse will decode it as an
+ ASCII string, and use filename= in the response.
+ """
+ channel = self._req(b"inline; filename=out.png")
+
+ headers = channel.headers
+ self.assertEqual(headers.getRawHeaders(b"Content-Type"), [b"image/png"])
+ self.assertEqual(
+ headers.getRawHeaders(b"Content-Disposition"), [b"inline; filename=out.png"]
+ )
+
+ def test_disposition_filenamestar_utf8escaped(self):
+ """
+ If the filename is filename=*utf8''<utf8 escaped> then Synapse will
+ correctly decode it as the UTF-8 string, and use filename* in the
+ response.
+ """
+ filename = parse.quote(u"\u2603".encode('utf8')).encode('ascii')
+ channel = self._req(b"inline; filename*=utf-8''" + filename + b".png")
+
+ headers = channel.headers
+ self.assertEqual(headers.getRawHeaders(b"Content-Type"), [b"image/png"])
+ self.assertEqual(
+ headers.getRawHeaders(b"Content-Disposition"),
+ [b"inline; filename*=utf-8''" + filename + b".png"],
+ )
+
+ def test_disposition_none(self):
+ """
+ If there is no filename, one isn't passed on in the Content-Disposition
+ of the request.
+ """
+ channel = self._req(None)
+
+ headers = channel.headers
+ self.assertEqual(headers.getRawHeaders(b"Content-Type"), [b"image/png"])
+ self.assertEqual(headers.getRawHeaders(b"Content-Disposition"), None)
diff --git a/tests/server.py b/tests/server.py
index 7919a1f124..ceec2f2d4e 100644
--- a/tests/server.py
+++ b/tests/server.py
@@ -14,6 +14,8 @@ from twisted.internet.error import DNSLookupError
from twisted.internet.interfaces import IReactorPluggableNameResolver
from twisted.python.failure import Failure
from twisted.test.proto_helpers import MemoryReactorClock
+from twisted.web.http import unquote
+from twisted.web.http_headers import Headers
from synapse.http.site import SynapseRequest
from synapse.util import Clock
@@ -50,6 +52,15 @@ class FakeChannel(object):
raise Exception("No result yet.")
return int(self.result["code"])
+ @property
+ def headers(self):
+ if not self.result:
+ raise Exception("No result yet.")
+ h = Headers()
+ for i in self.result["headers"]:
+ h.addRawHeader(*i)
+ return h
+
def writeHeaders(self, version, code, reason, headers):
self.result["version"] = version
self.result["code"] = code
@@ -152,6 +163,9 @@ def make_request(
path = b"/_matrix/client/r0/" + path
path = path.replace(b"//", b"/")
+ if not path.startswith(b"/"):
+ path = b"/" + path
+
if isinstance(content, text_type):
content = content.encode('utf8')
@@ -161,6 +175,7 @@ def make_request(
req = request(site, channel)
req.process = lambda: b""
req.content = BytesIO(content)
+ req.postpath = list(map(unquote, path[1:].split(b'/')))
if access_token:
req.requestHeaders.addRawHeader(
|