Diffstat (limited to 'synapse/rest/media')
 -rw-r--r--  synapse/rest/media/v1/media_storage.py         59
 -rw-r--r--  synapse/rest/media/v1/preview_url_resource.py   74
 -rw-r--r--  synapse/rest/media/v1/upload_resource.py        12
 3 files changed, 118 insertions(+), 27 deletions(-)
diff --git a/synapse/rest/media/v1/media_storage.py b/synapse/rest/media/v1/media_storage.py
index 89cdd605aa..aba6d689a8 100644
--- a/synapse/rest/media/v1/media_storage.py
+++ b/synapse/rest/media/v1/media_storage.py
@@ -16,13 +16,17 @@ import contextlib
 import logging
 import os
 import shutil
-from typing import IO, TYPE_CHECKING, Any, Optional, Sequence
+from typing import IO, TYPE_CHECKING, Any, Callable, Optional, Sequence
+
+import attr
 
 from twisted.internet.defer import Deferred
 from twisted.internet.interfaces import IConsumer
 from twisted.protocols.basic import FileSender
 
+from synapse.api.errors import NotFoundError
 from synapse.logging.context import defer_to_thread, make_deferred_yieldable
+from synapse.util import Clock
 from synapse.util.file_consumer import BackgroundFileConsumer
 
 from ._base import FileInfo, Responder
@@ -58,6 +62,8 @@ class MediaStorage:
         self.local_media_directory = local_media_directory
         self.filepaths = filepaths
         self.storage_providers = storage_providers
+        self.spam_checker = hs.get_spam_checker()
+        self.clock = hs.get_clock()
 
     async def store_file(self, source: IO, file_info: FileInfo) -> str:
         """Write `source` to the on disk media store, and also any other
@@ -127,18 +133,29 @@ class MediaStorage:
                     f.flush()
                     f.close()
 
+                    spam = await self.spam_checker.check_media_file_for_spam(
+                        ReadableFileWrapper(self.clock, fname), file_info
+                    )
+                    if spam:
+                        logger.info("Blocking media due to spam checker")
+                        # Note that we'll delete the stored media, due to the
+                        # try/except below. The media also won't be stored in
+                        # the DB.
+                        raise SpamMediaException()
+
                     for provider in self.storage_providers:
                         await provider.store_file(path, file_info)
 
                     finished_called[0] = True
 
                 yield f, fname, finish
-        except Exception:
+        except Exception as e:
             try:
                 os.remove(fname)
             except Exception:
                 pass
-            raise
+
+            raise e from None
 
         if not finished_called:
             raise Exception("Finished callback not called")
@@ -302,3 +319,39 @@ class FileResponder(Responder):
 
     def __exit__(self, exc_type, exc_val, exc_tb):
         self.open_file.close()
+
+
+class SpamMediaException(NotFoundError):
+    """The media was blocked by a spam checker, so we simply 404 the request (in
+    the same way as if it was quarantined).
+    """
+
+
+@attr.s(slots=True)
+class ReadableFileWrapper:
+    """Wrapper that allows reading a file in chunks, yielding to the reactor,
+    and writing to a callback.
+
+    This is simplified `FileSender` that takes an IO object rather than an
+    `IConsumer`.
+    """
+
+    CHUNK_SIZE = 2 ** 14
+
+    clock = attr.ib(type=Clock)
+    path = attr.ib(type=str)
+
+    async def write_chunks_to(self, callback: Callable[[bytes], None]):
+        """Reads the file in chunks and calls the callback with each chunk.
+        """
+
+        with open(self.path, "rb") as file:
+            while True:
+                chunk = file.read(self.CHUNK_SIZE)
+                if not chunk:
+                    break
+
+                callback(chunk)
+
+                # We yield to the reactor by sleeping for 0 seconds.
+                await self.clock.sleep(0)
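
The `check_media_file_for_spam` hook added above receives a `ReadableFileWrapper` and the `FileInfo`; a truthy return value makes `MediaStorage` discard the file and raise `SpamMediaException`. As a rough sketch of how a spam-checker module could consume the new `write_chunks_to` API, consider the following; the module name, config key and hash-based check are assumptions for illustration, not part of this change:

    import hashlib

    class ExampleMediaSpamChecker:
        def __init__(self, config, api):
            # Hypothetical config option listing banned file digests.
            self._banned_sha256 = set(config.get("banned_sha256", []))

        async def check_media_file_for_spam(self, file_wrapper, file_info) -> bool:
            # `file_wrapper` is the ReadableFileWrapper defined above; stream its
            # chunks into a hash without loading the whole file into memory.
            digest = hashlib.sha256()
            await file_wrapper.write_chunks_to(digest.update)

            # Returning True causes the stored file to be deleted and a
            # SpamMediaException to be raised (404 on download, 400 on upload).
            return digest.hexdigest() in self._banned_sha256
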
diff --git a/synapse/rest/media/v1/preview_url_resource.py b/synapse/rest/media/v1/preview_url_resource.py
index bf3be653aa..3e4566464b 100644
--- a/synapse/rest/media/v1/preview_url_resource.py
+++ b/synapse/rest/media/v1/preview_url_resource.py
@@ -58,7 +58,10 @@ if TYPE_CHECKING:
 
 logger = logging.getLogger(__name__)
 
-_charset_match = re.compile(br"<\s*meta[^>]*charset\s*=\s*([a-z0-9-]+)", flags=re.I)
+_charset_match = re.compile(br'<\s*meta[^>]*charset\s*=\s*"?([a-z0-9-]+)"?', flags=re.I)
+_xml_encoding_match = re.compile(
+    br'\s*<\s*\?\s*xml[^>]*encoding="([a-z0-9-]+)"', flags=re.I
+)
 _content_type_match = re.compile(r'.*; *charset="?(.*?)"?(;|$)', flags=re.I)
 
 OG_TAG_NAME_MAXLEN = 50
@@ -146,8 +149,7 @@ class PreviewUrlResource(DirectServeJsonResource):
             treq_args={"browser_like_redirects": True},
             ip_whitelist=hs.config.url_preview_ip_range_whitelist,
             ip_blacklist=hs.config.url_preview_ip_range_blacklist,
-            http_proxy=os.getenvb(b"http_proxy"),
-            https_proxy=os.getenvb(b"HTTPS_PROXY"),
+            use_proxy=True,
         )
         self.media_repo = media_repo
         self.primary_base_path = media_repo.primary_base_path
@@ -300,24 +302,7 @@ class PreviewUrlResource(DirectServeJsonResource):
             with open(media_info["filename"], "rb") as file:
                 body = file.read()
 
-            encoding = None
-
-            # Let's try and figure out if it has an encoding set in a meta tag.
-            # Limit it to the first 1kb, since it ought to be in the meta tags
-            # at the top.
-            match = _charset_match.search(body[:1000])
-
-            # If we find a match, it should take precedence over the
-            # Content-Type header, so set it here.
-            if match:
-                encoding = match.group(1).decode("ascii")
-
-            # If we don't find a match, we'll look at the HTTP Content-Type, and
-            # if that doesn't exist, we'll fall back to UTF-8.
-            if not encoding:
-                content_match = _content_type_match.match(media_info["media_type"])
-                encoding = content_match.group(1) if content_match else "utf-8"
-
+            encoding = get_html_media_encoding(body, media_info["media_type"])
             og = decode_and_calc_og(body, media_info["uri"], encoding)
 
             # pre-cache the image for posterity
@@ -689,6 +674,48 @@ class PreviewUrlResource(DirectServeJsonResource):
             logger.debug("No media removed from url cache")
 
 
+def get_html_media_encoding(body: bytes, content_type: str) -> str:
+    """
+    Get the encoding of the body based on the (presumably) HTML body or media_type.
+
+    The precedence used for finding a character encoding is:
+
+    1. meta tag with a charset declared.
+    2. The XML document's character encoding attribute.
+    3. The Content-Type header.
+    4. Fallback to UTF-8.
+
+    Args:
+        body: The HTML document, as bytes.
+        content_type: The Content-Type header.
+
+    Returns:
+        The character encoding of the body, as a string.
+    """
+    # Limit searches to the first 1kb, since it ought to be at the top.
+    body_start = body[:1024]
+
+    # Let's try and figure out if it has an encoding set in a meta tag.
+    match = _charset_match.search(body_start)
+    if match:
+        return match.group(1).decode("ascii")
+
+    # TODO Support <meta http-equiv="Content-Type" content="text/html; charset=utf-8"/>
+
+    # If we didn't find a match, see if it an XML document with an encoding.
+    match = _xml_encoding_match.match(body_start)
+    if match:
+        return match.group(1).decode("ascii")
+
+    # If we don't find a match, we'll look at the HTTP Content-Type, and
+    # if that doesn't exist, we'll fall back to UTF-8.
+    content_match = _content_type_match.match(content_type)
+    if content_match:
+        return content_match.group(1)
+
+    return "utf-8"
+
+
 def decode_and_calc_og(
     body: bytes, media_uri: str, request_encoding: Optional[str] = None
 ) -> Dict[str, Optional[str]]:
@@ -725,6 +752,11 @@ def decode_and_calc_og(
     def _attempt_calc_og(body_attempt: Union[bytes, str]) -> Dict[str, Optional[str]]:
         # Attempt to parse the body. If this fails, log and return no metadata.
         tree = etree.fromstring(body_attempt, parser)
+
+        # The data was successfully parsed, but no tree was found.
+        if tree is None:
+            return {}
+
         return _calc_og(tree, media_uri)
 
     # Attempt to parse the body. If this fails, log and return no metadata.
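
For reference, the precedence implemented by the new `get_html_media_encoding` helper can be seen with a few worked inputs; these are illustrative calls, not test cases from the change:

    # 1. A <meta> charset wins, even when the Content-Type header disagrees.
    get_html_media_encoding(
        b'<html><head><meta charset="windows-1251"></head></html>',
        'text/html; charset="utf-8"',
    )  # -> "windows-1251"

    # 2. Failing that, an XML encoding declaration is used.
    get_html_media_encoding(
        b'<?xml version="1.0" encoding="ISO-8859-1"?><html></html>', "text/html"
    )  # -> "ISO-8859-1"

    # 3. Failing that, the charset parameter of the Content-Type header.
    get_html_media_encoding(b"<html></html>", 'text/html; charset="ascii"')  # -> "ascii"

    # 4. Otherwise, UTF-8.
    get_html_media_encoding(b"<html></html>", "text/html")  # -> "utf-8"
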
diff --git a/synapse/rest/media/v1/upload_resource.py b/synapse/rest/media/v1/upload_resource.py
index 6da76ae994..1136277794 100644
--- a/synapse/rest/media/v1/upload_resource.py
+++ b/synapse/rest/media/v1/upload_resource.py
@@ -22,6 +22,7 @@ from twisted.web.http import Request
 from synapse.api.errors import Codes, SynapseError
 from synapse.http.server import DirectServeJsonResource, respond_with_json
 from synapse.http.servlet import parse_string
+from synapse.rest.media.v1.media_storage import SpamMediaException
 
 if TYPE_CHECKING:
     from synapse.app.homeserver import HomeServer
@@ -86,9 +87,14 @@ class UploadResource(DirectServeJsonResource):
         # if headers.hasHeader(b"Content-Disposition"):
         #     disposition = headers.getRawHeaders(b"Content-Disposition")[0]
         # TODO(markjh): parse content-dispostion
 
-        content_uri = await self.media_repo.create_content(
-            media_type, upload_name, request.content, content_length, requester.user
-        )
+        try:
+            content_uri = await self.media_repo.create_content(
+                media_type, upload_name, request.content, content_length, requester.user
+            )
+        except SpamMediaException:
+            # For uploading of media we want to respond with a 400, instead of
+            # the default 404, as that would just be confusing.
+            raise SynapseError(400, "Bad content")
 
         logger.info("Uploaded content with URI %r", content_uri)
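
From a client's perspective, the upload change means a file rejected by the spam checker now fails with a 400 and the message "Bad content", while downloads of blocked media continue to 404 as if the media were quarantined. A rough sketch of how this could be exercised in a Synapse-style unit test follows; the helper names and the way the spam checker is stubbed are assumptions, not part of the diff:

    def test_upload_blocked_by_spam_checker(self):
        # Assumes a spam checker that flags every file has been registered on
        # the test homeserver (e.g. a stubbed check_media_file_for_spam).
        channel = self.make_request(
            "POST",
            "/_matrix/media/r0/upload?filename=blocked.png",
            content=b"some media bytes",
            access_token=self.tok,
        )

        # The upload path catches SpamMediaException and re-raises a 400,
        # rather than letting the default 404 confuse the uploader.
        self.assertEqual(channel.code, 400)
        self.assertEqual(channel.json_body["error"], "Bad content")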