diff options
author | Patrick Cloke <clokep@users.noreply.github.com> | 2020-07-27 07:50:44 -0400 |
---|---|---|
committer | GitHub <noreply@github.com> | 2020-07-27 07:50:44 -0400 |
commit | 3fc8fdd150e2471d6e96b842e364d9421066f4ba (patch) | |
tree | 3c8fa3a533c8efc0ee01bde28cc44cbbe21ffba1 /synapse | |
parent | Convert state resolution to async/await (#7942) (diff) | |
download | synapse-3fc8fdd150e2471d6e96b842e364d9421066f4ba.tar.xz |
Support oEmbed for media previews. (#7920)
Fixes previews of Twitter URLs by using their oEmbed endpoint to grab content.
Diffstat (limited to 'synapse')
-rw-r--r-- | synapse/rest/media/v1/preview_url_resource.py | 265 |
1 files changed, 220 insertions, 45 deletions
diff --git a/synapse/rest/media/v1/preview_url_resource.py b/synapse/rest/media/v1/preview_url_resource.py index e52c86c798..13d1a6d2ed 100644 --- a/synapse/rest/media/v1/preview_url_resource.py +++ b/synapse/rest/media/v1/preview_url_resource.py @@ -26,6 +26,7 @@ import traceback from typing import Dict, Optional from urllib import parse as urlparse +import attr from canonicaljson import json from twisted.internet import defer @@ -56,6 +57,65 @@ _content_type_match = re.compile(r'.*; *charset="?(.*?)"?(;|$)', flags=re.I) OG_TAG_NAME_MAXLEN = 50 OG_TAG_VALUE_MAXLEN = 1000 +ONE_HOUR = 60 * 60 * 1000 + +# A map of globs to API endpoints. +_oembed_globs = { + # Twitter. + "https://publish.twitter.com/oembed": [ + "https://twitter.com/*/status/*", + "https://*.twitter.com/*/status/*", + "https://twitter.com/*/moments/*", + "https://*.twitter.com/*/moments/*", + # Include the HTTP versions too. + "http://twitter.com/*/status/*", + "http://*.twitter.com/*/status/*", + "http://twitter.com/*/moments/*", + "http://*.twitter.com/*/moments/*", + ], +} +# Convert the globs to regular expressions. +_oembed_patterns = {} +for endpoint, globs in _oembed_globs.items(): + for glob in globs: + # Convert the glob into a sane regular expression to match against. The + # rules followed will be slightly different for the domain portion vs. + # the rest. + # + # 1. The scheme must be one of HTTP / HTTPS (and have no globs). + # 2. The domain can have globs, but we limit it to characters that can + # reasonably be a domain part. + # TODO: This does not attempt to handle Unicode domain names. + # 3. Other parts allow a glob to be any one, or more, characters. + results = urlparse.urlparse(glob) + + # Ensure the scheme does not have wildcards (and is a sane scheme). + if results.scheme not in {"http", "https"}: + raise ValueError("Insecure oEmbed glob scheme: %s" % (results.scheme,)) + + pattern = urlparse.urlunparse( + [ + results.scheme, + re.escape(results.netloc).replace("\\*", "[a-zA-Z0-9_-]+"), + ] + + [re.escape(part).replace("\\*", ".+") for part in results[2:]] + ) + _oembed_patterns[re.compile(pattern)] = endpoint + + +@attr.s +class OEmbedResult: + # Either HTML content or URL must be provided. + html = attr.ib(type=Optional[str]) + url = attr.ib(type=Optional[str]) + title = attr.ib(type=Optional[str]) + # Number of seconds to cache the content. + cache_age = attr.ib(type=int) + + +class OEmbedError(Exception): + """An error occurred processing the oEmbed object.""" + class PreviewUrlResource(DirectServeJsonResource): isLeaf = True @@ -99,7 +159,7 @@ class PreviewUrlResource(DirectServeJsonResource): cache_name="url_previews", clock=self.clock, # don't spider URLs more often than once an hour - expiry_ms=60 * 60 * 1000, + expiry_ms=ONE_HOUR, ) if self._worker_run_media_background_jobs: @@ -310,6 +370,87 @@ class PreviewUrlResource(DirectServeJsonResource): return jsonog.encode("utf8") + def _get_oembed_url(self, url: str) -> Optional[str]: + """ + Check whether the URL should be downloaded as oEmbed content instead. + + Params: + url: The URL to check. + + Returns: + A URL to use instead or None if the original URL should be used. + """ + for url_pattern, endpoint in _oembed_patterns.items(): + if url_pattern.fullmatch(url): + return endpoint + + # No match. + return None + + async def _get_oembed_content(self, endpoint: str, url: str) -> OEmbedResult: + """ + Request content from an oEmbed endpoint. + + Params: + endpoint: The oEmbed API endpoint. + url: The URL to pass to the API. + + Returns: + An object representing the metadata returned. + + Raises: + OEmbedError if fetching or parsing of the oEmbed information fails. + """ + try: + logger.debug("Trying to get oEmbed content for url '%s'", url) + result = await self.client.get_json( + endpoint, + # TODO Specify max height / width. + # Note that only the JSON format is supported. + args={"url": url}, + ) + + # Ensure there's a version of 1.0. + if result.get("version") != "1.0": + raise OEmbedError("Invalid version: %s" % (result.get("version"),)) + + oembed_type = result.get("type") + + # Ensure the cache age is None or an int. + cache_age = result.get("cache_age") + if cache_age: + cache_age = int(cache_age) + + oembed_result = OEmbedResult(None, None, result.get("title"), cache_age) + + # HTML content. + if oembed_type == "rich": + oembed_result.html = result.get("html") + return oembed_result + + if oembed_type == "photo": + oembed_result.url = result.get("url") + return oembed_result + + # TODO Handle link and video types. + + if "thumbnail_url" in result: + oembed_result.url = result.get("thumbnail_url") + return oembed_result + + raise OEmbedError("Incompatible oEmbed information.") + + except OEmbedError as e: + # Trap OEmbedErrors first so we can directly re-raise them. + logger.warning("Error parsing oEmbed metadata from %s: %r", url, e) + raise + + except Exception as e: + # Trap any exception and let the code follow as usual. + # FIXME: pass through 404s and other error messages nicely + logger.warning("Error downloading oEmbed metadata from %s: %r", url, e) + raise OEmbedError() from e + async def _download_url(self, url, user): # TODO: we should probably honour robots.txt... except in practice # we're most likely being explicitly triggered by a human rather than a @@ -319,54 +460,90 @@ class PreviewUrlResource(DirectServeJsonResource): file_info = FileInfo(server_name=None, file_id=file_id, url_cache=True) - with self.media_storage.store_into_file(file_info) as (f, fname, finish): + # If this URL can be accessed via oEmbed, use that instead. + url_to_download = url + oembed_url = self._get_oembed_url(url) + if oembed_url: + # The result might be a new URL to download, or it might be HTML content. try: - logger.debug("Trying to get preview for url '%s'", url) - length, headers, uri, code = await self.client.get_file( - url, - output_stream=f, - max_size=self.max_spider_size, - headers={"Accept-Language": self.url_preview_accept_language}, - ) - except SynapseError: - # Pass SynapseErrors through directly, so that the servlet - # handler will return a SynapseError to the client instead of - # blank data or a 500. - raise - except DNSLookupError: - # DNS lookup returned no results - # Note: This will also be the case if one of the resolved IP - # addresses is blacklisted - raise SynapseError( - 502, - "DNS resolution failure during URL preview generation", - Codes.UNKNOWN, - ) - except Exception as e: - # FIXME: pass through 404s and other error messages nicely - logger.warning("Error downloading %s: %r", url, e) + oembed_result = await self._get_oembed_content(oembed_url, url) + if oembed_result.url: + url_to_download = oembed_result.url + elif oembed_result.html: + url_to_download = None + except OEmbedError: + # If an error occurs, try doing a normal preview. + pass - raise SynapseError( - 500, - "Failed to download content: %s" - % (traceback.format_exception_only(sys.exc_info()[0], e),), - Codes.UNKNOWN, - ) - await finish() + if url_to_download: + with self.media_storage.store_into_file(file_info) as (f, fname, finish): + try: + logger.debug("Trying to get preview for url '%s'", url_to_download) + length, headers, uri, code = await self.client.get_file( + url_to_download, + output_stream=f, + max_size=self.max_spider_size, + headers={"Accept-Language": self.url_preview_accept_language}, + ) + except SynapseError: + # Pass SynapseErrors through directly, so that the servlet + # handler will return a SynapseError to the client instead of + # blank data or a 500. + raise + except DNSLookupError: + # DNS lookup returned no results + # Note: This will also be the case if one of the resolved IP + # addresses is blacklisted + raise SynapseError( + 502, + "DNS resolution failure during URL preview generation", + Codes.UNKNOWN, + ) + except Exception as e: + # FIXME: pass through 404s and other error messages nicely + logger.warning("Error downloading %s: %r", url_to_download, e) + + raise SynapseError( + 500, + "Failed to download content: %s" + % (traceback.format_exception_only(sys.exc_info()[0], e),), + Codes.UNKNOWN, + ) + await finish() + + if b"Content-Type" in headers: + media_type = headers[b"Content-Type"][0].decode("ascii") + else: + media_type = "application/octet-stream" + + download_name = get_filename_from_headers(headers) + + # FIXME: we should calculate a proper expiration based on the + # Cache-Control and Expire headers. But for now, assume 1 hour. + expires = ONE_HOUR + etag = headers["ETag"][0] if "ETag" in headers else None + else: + html_bytes = oembed_result.html.encode("utf-8") # type: ignore + with self.media_storage.store_into_file(file_info) as (f, fname, finish): + f.write(html_bytes) + await finish() + + media_type = "text/html" + download_name = oembed_result.title + length = len(html_bytes) + # If a specific cache age was not given, assume 1 hour. + expires = oembed_result.cache_age or ONE_HOUR + uri = oembed_url + code = 200 + etag = None try: - if b"Content-Type" in headers: - media_type = headers[b"Content-Type"][0].decode("ascii") - else: - media_type = "application/octet-stream" time_now_ms = self.clock.time_msec() - download_name = get_filename_from_headers(headers) - await self.store.store_local_media( media_id=file_id, media_type=media_type, - time_now_ms=self.clock.time_msec(), + time_now_ms=time_now_ms, upload_name=download_name, media_length=length, user_id=user, @@ -389,10 +566,8 @@ class PreviewUrlResource(DirectServeJsonResource): "filename": fname, "uri": uri, "response_code": code, - # FIXME: we should calculate a proper expiration based on the - # Cache-Control and Expire headers. But for now, assume 1 hour. - "expires": 60 * 60 * 1000, - "etag": headers["ETag"][0] if "ETag" in headers else None, + "expires": expires, + "etag": etag, } def _start_expire_url_cache_data(self): @@ -449,7 +624,7 @@ class PreviewUrlResource(DirectServeJsonResource): # These may be cached for a bit on the client (i.e., they # may have a room open with a preview url thing open). # So we wait a couple of days before deleting, just in case. - expire_before = now - 2 * 24 * 60 * 60 * 1000 + expire_before = now - 2 * 24 * ONE_HOUR media_ids = await self.store.get_url_cache_media_before(expire_before) removed_media = [] |