summary refs log tree commit diff
path: root/synapse/rest/media/v1/_base.py
diff options
context:
space:
mode:
Diffstat (limited to 'synapse/rest/media/v1/_base.py')
-rw-r--r--synapse/rest/media/v1/_base.py94
1 files changed, 53 insertions, 41 deletions
diff --git a/synapse/rest/media/v1/_base.py b/synapse/rest/media/v1/_base.py
index 5fefee4dde..3689777266 100644
--- a/synapse/rest/media/v1/_base.py
+++ b/synapse/rest/media/v1/_base.py
@@ -17,7 +17,6 @@
 import logging
 import os
 
-from six import PY3
 from six.moves import urllib
 
 from twisted.internet import defer
@@ -30,6 +29,22 @@ from synapse.util.stringutils import is_ascii
 
 logger = logging.getLogger(__name__)
 
+# list all text content types that will have the charset default to UTF-8 when
+# none is given
+TEXT_CONTENT_TYPES = [
+    "text/css",
+    "text/csv",
+    "text/html",
+    "text/calendar",
+    "text/plain",
+    "text/javascript",
+    "application/json",
+    "application/ld+json",
+    "application/rtf",
+    "image/svg+xml",
+    "text/xml",
+]
+
 
 def parse_media_id(request):
     try:
@@ -96,7 +111,14 @@ def add_file_headers(request, media_type, file_size, upload_name):
     def _quote(x):
         return urllib.parse.quote(x.encode("utf-8"))
 
-    request.setHeader(b"Content-Type", media_type.encode("UTF-8"))
+    # Default to a UTF-8 charset for text content types.
+    # ex, uses UTF-8 for 'text/css' but not 'text/css; charset=UTF-16'
+    if media_type.lower() in TEXT_CONTENT_TYPES:
+        content_type = media_type + "; charset=UTF-8"
+    else:
+        content_type = media_type
+
+    request.setHeader(b"Content-Type", content_type.encode("UTF-8"))
     if upload_name:
         # RFC6266 section 4.1 [1] defines both `filename` and `filename*`.
         #
@@ -135,27 +157,25 @@ def add_file_headers(request, media_type, file_size, upload_name):
 
 # separators as defined in RFC2616. SP and HT are handled separately.
 # see _can_encode_filename_as_token.
-_FILENAME_SEPARATOR_CHARS = set(
-    (
-        "(",
-        ")",
-        "<",
-        ">",
-        "@",
-        ",",
-        ";",
-        ":",
-        "\\",
-        '"',
-        "/",
-        "[",
-        "]",
-        "?",
-        "=",
-        "{",
-        "}",
-    )
-)
+_FILENAME_SEPARATOR_CHARS = {
+    "(",
+    ")",
+    "<",
+    ">",
+    "@",
+    ",",
+    ";",
+    ":",
+    "\\",
+    '"',
+    "/",
+    "[",
+    "]",
+    "?",
+    "=",
+    "{",
+    "}",
+}
 
 
 def _can_encode_filename_as_token(x):
@@ -195,7 +215,7 @@ def respond_with_responder(request, responder, media_type, file_size, upload_nam
         respond_404(request)
         return
 
-    logger.debug("Responding to media request with responder %s")
+    logger.debug("Responding to media request with responder %s", responder)
     add_file_headers(request, media_type, file_size, upload_name)
     try:
         with responder:
@@ -303,23 +323,15 @@ def get_filename_from_headers(headers):
             upload_name_utf8 = upload_name_utf8[7:]
             # We have a filename*= section. This MUST be ASCII, and any UTF-8
             # bytes are %-quoted.
-            if PY3:
-                try:
-                    # Once it is decoded, we can then unquote the %-encoded
-                    # parts strictly into a unicode string.
-                    upload_name = urllib.parse.unquote(
-                        upload_name_utf8.decode("ascii"), errors="strict"
-                    )
-                except UnicodeDecodeError:
-                    # Incorrect UTF-8.
-                    pass
-            else:
-                # On Python 2, we first unquote the %-encoded parts and then
-                # decode it strictly using UTF-8.
-                try:
-                    upload_name = urllib.parse.unquote(upload_name_utf8).decode("utf8")
-                except UnicodeDecodeError:
-                    pass
+            try:
+                # Once it is decoded, we can then unquote the %-encoded
+                # parts strictly into a unicode string.
+                upload_name = urllib.parse.unquote(
+                    upload_name_utf8.decode("ascii"), errors="strict"
+                )
+            except UnicodeDecodeError:
+                # Incorrect UTF-8.
+                pass
 
     # If there isn't check for an ascii name.
     if not upload_name: