Fix parsing of Content-Disposition headers (#4763)

* Fix parsing of Content-Disposition headers TIL: filenames in content-dispostion headers can contain semicolons, and aren't %-encoded. * fix python2 incompatibility * Fix docstrings
author: Richard van der Hoff <1389908+richvdh@users.noreply.github.com> 2019-02-27 22:29:10 +0000
committer: Amber Brown <hawkowl@atleastfornow.net> 2019-02-27 14:29:10 -0800
commit: 68f47d6744ac2b4c6ac8b59b8c52a537a5072b4c (patch)
tree: 1c90996cf1aace9c99ee6ed882a2171466686a52
parent: Move from TravisCI to BuildKite (#4752) (diff)
download: synapse-68f47d6744ac2b4c6ac8b59b8c52a537a5072b4c.tar.xz
3 files changed, 111 insertions, 20 deletions
diff --git a/changelog.d/4763.bugfix b/changelog.d/4763.bugfix
new file mode 100644
index 0000000000..213ea44b70
--- /dev/null
+++ b/changelog.d/4763.bugfix
@@ -0,0 +1 @@
+Fix parsing of Content-Disposition headers on remote media requests and URL previews.
diff --git a/synapse/rest/media/v1/_base.py b/synapse/rest/media/v1/_base.py
index d16a30acd8..fece1ef0b8 100644
--- a/synapse/rest/media/v1/_base.py
+++ b/synapse/rest/media/v1/_base.py
@@ -1,5 +1,6 @@
 # -*- coding: utf-8 -*-
 # Copyright 2014-2016 OpenMarket Ltd
+# Copyright 2019 New Vector Ltd.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -213,8 +214,7 @@ def get_filename_from_headers(headers):
     Content-Disposition HTTP header.
 
     Args:
-        headers (twisted.web.http_headers.Headers): The HTTP
-            request headers.
+        headers (dict[bytes, list[bytes]]): The HTTP request headers.
 
     Returns:
         A Unicode string of the filename, or None.
@@ -225,23 +225,12 @@ def get_filename_from_headers(headers):
     if not content_disposition[0]:
         return
 
-    # dict of unicode: bytes, corresponding to the key value sections of the
-    # Content-Disposition header.
-    params = {}
-    parts = content_disposition[0].split(b";")
-    for i in parts:
-        # Split into key-value pairs, if able
-        # We don't care about things like `inline`, so throw it out
-        if b"=" not in i:
-            continue
-
-        key, value = i.strip().split(b"=")
-        params[key.decode('ascii')] = value
+    _, params = _parse_header(content_disposition[0])
 
     upload_name = None
 
     # First check if there is a valid UTF-8 filename
-    upload_name_utf8 = params.get("filename*", None)
+    upload_name_utf8 = params.get(b"filename*", None)
     if upload_name_utf8:
         if upload_name_utf8.lower().startswith(b"utf-8''"):
             upload_name_utf8 = upload_name_utf8[7:]
@@ -267,12 +256,68 @@ def get_filename_from_headers(headers):
 
     # If there isn't check for an ascii name.
     if not upload_name:
-        upload_name_ascii = params.get("filename", None)
+        upload_name_ascii = params.get(b"filename", None)
         if upload_name_ascii and is_ascii(upload_name_ascii):
-            # Make sure there's no %-quoted bytes. If there is, reject it as
-            # non-valid ASCII.
-            if b"%" not in upload_name_ascii:
-                upload_name = upload_name_ascii.decode('ascii')
+            upload_name = upload_name_ascii.decode('ascii')
 
     # This may be None here, indicating we did not find a matching name.
     return upload_name
+
+
+def _parse_header(line):
+    """Parse a Content-type like header.
+
+    Cargo-culted from `cgi`, but works on bytes rather than strings.
+
+    Args:
+        line (bytes): header to be parsed
+
+    Returns:
+        Tuple[bytes, dict[bytes, bytes]]:
+            the main content-type, followed by the parameter dictionary
+    """
+    parts = _parseparam(b';' + line)
+    key = next(parts)
+    pdict = {}
+    for p in parts:
+        i = p.find(b'=')
+        if i >= 0:
+            name = p[:i].strip().lower()
+            value = p[i + 1:].strip()
+
+            # strip double-quotes
+            if len(value) >= 2 and value[0:1] == value[-1:] == b'"':
+                value = value[1:-1]
+                value = value.replace(b'\\\\', b'\\').replace(b'\\"', b'"')
+            pdict[name] = value
+
+    return key, pdict
+
+
+def _parseparam(s):
+    """Generator which splits the input on ;, respecting double-quoted sequences
+
+    Cargo-culted from `cgi`, but works on bytes rather than strings.
+
+    Args:
+        s (bytes): header to be parsed
+
+    Returns:
+        Iterable[bytes]: the split input
+    """
+    while s[:1] == b';':
+        s = s[1:]
+
+        # look for the next ;
+        end = s.find(b';')
+
+        # if there is an odd number of " marks between here and the next ;, skip to the
+        # next ; instead
+        while end > 0 and (s.count(b'"', 0, end) - s.count(b'\\"', 0, end)) % 2:
+            end = s.find(b';', end + 1)
+
+        if end < 0:
+            end = len(s)
+        f = s[:end]
+        yield f.strip()
+        s = s[end:]
diff --git a/tests/rest/media/v1/test_base.py b/tests/rest/media/v1/test_base.py
new file mode 100644
index 0000000000..af8f74eb42
--- /dev/null
+++ b/tests/rest/media/v1/test_base.py
@@ -0,0 +1,45 @@
+# -*- coding: utf-8 -*-
+# Copyright 2019 New Vector Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from synapse.rest.media.v1._base import get_filename_from_headers
+
+from tests import unittest
+
+
+class GetFileNameFromHeadersTests(unittest.TestCase):
+    # input -> expected result
+    TEST_CASES = {
+        b"inline; filename=abc.txt": u"abc.txt",
+        b'inline; filename="azerty"': u"azerty",
+        b'inline; filename="aze%20rty"': u"aze%20rty",
+        b'inline; filename="aze\"rty"': u'aze"rty',
+        b'inline; filename="azer;ty"': u"azer;ty",
+
+        b"inline; filename*=utf-8''foo%C2%A3bar": u"foo£bar",
+    }
+
+    def tests(self):
+        for hdr, expected in self.TEST_CASES.items():
+            res = get_filename_from_headers(
+                {
+                    b'Content-Disposition': [hdr],
+                },
+            )
+            self.assertEqual(
+                res, expected,
+                "expected output for %s to be %s but was %s" % (
+                    hdr, expected, res,
+                )
+            )
author	Richard van der Hoff <1389908+richvdh@users.noreply.github.com>	2019-02-27 22:29:10 +0000
committer	Amber Brown <hawkowl@atleastfornow.net>	2019-02-27 14:29:10 -0800
commit	68f47d6744ac2b4c6ac8b59b8c52a537a5072b4c (patch)
tree	1c90996cf1aace9c99ee6ed882a2171466686a52
parent	Move from TravisCI to BuildKite (#4752) (diff)
download	synapse-68f47d6744ac2b4c6ac8b59b8c52a537a5072b4c.tar.xz