From e2481dbe9325321d460037a2efe9b9ea2ac78057 Mon Sep 17 00:00:00 2001 From: Patrick Cloke Date: Tue, 31 Aug 2021 18:37:07 -0400 Subject: Allow configuration of the oEmbed URLs. (#10714) This adds configuration options (under an `oembed` section) to configure which URLs are matched to use oEmbed for URL previews. --- synapse/rest/media/v1/oembed.py | 135 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 135 insertions(+) create mode 100644 synapse/rest/media/v1/oembed.py (limited to 'synapse/rest/media/v1/oembed.py') diff --git a/synapse/rest/media/v1/oembed.py b/synapse/rest/media/v1/oembed.py new file mode 100644 index 0000000000..afe41823e4 --- /dev/null +++ b/synapse/rest/media/v1/oembed.py @@ -0,0 +1,135 @@ +# Copyright 2021 The Matrix.org Foundation C.I.C. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import logging +from typing import TYPE_CHECKING, Optional + +import attr + +from synapse.http.client import SimpleHttpClient + +if TYPE_CHECKING: + from synapse.server import HomeServer + +logger = logging.getLogger(__name__) + + +@attr.s(slots=True, auto_attribs=True) +class OEmbedResult: + # Either HTML content or URL must be provided. + html: Optional[str] + url: Optional[str] + title: Optional[str] + # Number of seconds to cache the content. + cache_age: int + + +class OEmbedError(Exception): + """An error occurred processing the oEmbed object.""" + + +class OEmbedProvider: + """ + A helper for accessing oEmbed content. + + It can be used to check if a URL should be accessed via oEmbed and for + requesting/parsing oEmbed content. + """ + + def __init__(self, hs: "HomeServer", client: SimpleHttpClient): + self._oembed_patterns = {} + for oembed_endpoint in hs.config.oembed.oembed_patterns: + for pattern in oembed_endpoint.url_patterns: + self._oembed_patterns[pattern] = oembed_endpoint.api_endpoint + self._client = client + + def get_oembed_url(self, url: str) -> Optional[str]: + """ + Check whether the URL should be downloaded as oEmbed content instead. + + Args: + url: The URL to check. + + Returns: + A URL to use instead or None if the original URL should be used. + """ + for url_pattern, endpoint in self._oembed_patterns.items(): + if url_pattern.fullmatch(url): + return endpoint + + # No match. + return None + + async def get_oembed_content(self, endpoint: str, url: str) -> OEmbedResult: + """ + Request content from an oEmbed endpoint. + + Args: + endpoint: The oEmbed API endpoint. + url: The URL to pass to the API. + + Returns: + An object representing the metadata returned. + + Raises: + OEmbedError if fetching or parsing of the oEmbed information fails. + """ + try: + logger.debug("Trying to get oEmbed content for url '%s'", url) + result = await self._client.get_json( + endpoint, + # TODO Specify max height / width. + # Note that only the JSON format is supported. + args={"url": url}, + ) + + # Ensure there's a version of 1.0. + if result.get("version") != "1.0": + raise OEmbedError("Invalid version: %s" % (result.get("version"),)) + + oembed_type = result.get("type") + + # Ensure the cache age is None or an int. + cache_age = result.get("cache_age") + if cache_age: + cache_age = int(cache_age) + + oembed_result = OEmbedResult(None, None, result.get("title"), cache_age) + + # HTML content. + if oembed_type == "rich": + oembed_result.html = result.get("html") + return oembed_result + + if oembed_type == "photo": + oembed_result.url = result.get("url") + return oembed_result + + # TODO Handle link and video types. + + if "thumbnail_url" in result: + oembed_result.url = result.get("thumbnail_url") + return oembed_result + + raise OEmbedError("Incompatible oEmbed information.") + + except OEmbedError as e: + # Trap OEmbedErrors first so we can directly re-raise them. + logger.warning("Error parsing oEmbed metadata from %s: %r", url, e) + raise + + except Exception as e: + # Trap any exception and let the code follow as usual. + # FIXME: pass through 404s and other error messages nicely + logger.warning("Error downloading oEmbed metadata from %s: %r", url, e) + raise OEmbedError() from e -- cgit 1.5.1 From 580a15e039f47c2a0038754bba9d96883866a9f2 Mon Sep 17 00:00:00 2001 From: Patrick Cloke Date: Wed, 8 Sep 2021 07:17:52 -0400 Subject: Request JSON for oEmbed requests (and ignore XML only providers). (#10759) This adds the format to the request arguments / URL to ensure that JSON data is returned (which is all that Synapse supports). This also adds additional error checking / filtering to the configuration file to ignore XML-only providers. --- changelog.d/10759.feature | 1 + synapse/config/oembed.py | 24 +++++++++++--- synapse/rest/media/v1/oembed.py | 26 ++++++++++++++-- tests/rest/media/v1/test_url_preview.py | 55 ++++++++++++++++++++++++++++++++- 4 files changed, 98 insertions(+), 8 deletions(-) create mode 100644 changelog.d/10759.feature (limited to 'synapse/rest/media/v1/oembed.py') diff --git a/changelog.d/10759.feature b/changelog.d/10759.feature new file mode 100644 index 0000000000..7d18f5c133 --- /dev/null +++ b/changelog.d/10759.feature @@ -0,0 +1 @@ +Allow configuration of the oEmbed URLs used for URL previews. diff --git a/synapse/config/oembed.py b/synapse/config/oembed.py index 09267b5eef..ea6ace4729 100644 --- a/synapse/config/oembed.py +++ b/synapse/config/oembed.py @@ -13,7 +13,7 @@ # limitations under the License. import json import re -from typing import Any, Dict, Iterable, List, Pattern +from typing import Any, Dict, Iterable, List, Optional, Pattern from urllib import parse as urlparse import attr @@ -31,6 +31,8 @@ class OEmbedEndpointConfig: api_endpoint: str # The patterns to match. url_patterns: List[Pattern] + # The supported formats. + formats: Optional[List[str]] class OembedConfig(Config): @@ -93,11 +95,22 @@ class OembedConfig(Config): # might have multiple patterns to match. for endpoint in provider["endpoints"]: api_endpoint = endpoint["url"] + + # The API endpoint must be an HTTP(S) URL. + results = urlparse.urlparse(api_endpoint) + if results.scheme not in {"http", "https"}: + raise ConfigError( + f"Unsupported oEmbed scheme ({results.scheme}) for endpoint {api_endpoint}", + config_path, + ) + patterns = [ self._glob_to_pattern(glob, config_path) for glob in endpoint["schemes"] ] - yield OEmbedEndpointConfig(api_endpoint, patterns) + yield OEmbedEndpointConfig( + api_endpoint, patterns, endpoint.get("formats") + ) def _glob_to_pattern(self, glob: str, config_path: Iterable[str]) -> Pattern: """ @@ -114,9 +127,12 @@ class OembedConfig(Config): """ results = urlparse.urlparse(glob) - # Ensure the scheme does not have wildcards (and is a sane scheme). + # The scheme must be HTTP(S) (and cannot contain wildcards). if results.scheme not in {"http", "https"}: - raise ConfigError(f"Insecure oEmbed scheme: {results.scheme}", config_path) + raise ConfigError( + f"Unsupported oEmbed scheme ({results.scheme}) for pattern: {glob}", + config_path, + ) pattern = urlparse.urlunparse( [ diff --git a/synapse/rest/media/v1/oembed.py b/synapse/rest/media/v1/oembed.py index afe41823e4..2e6706dbfa 100644 --- a/synapse/rest/media/v1/oembed.py +++ b/synapse/rest/media/v1/oembed.py @@ -49,8 +49,24 @@ class OEmbedProvider: def __init__(self, hs: "HomeServer", client: SimpleHttpClient): self._oembed_patterns = {} for oembed_endpoint in hs.config.oembed.oembed_patterns: + api_endpoint = oembed_endpoint.api_endpoint + + # Only JSON is supported at the moment. This could be declared in + # the formats field. Otherwise, if the endpoint ends in .xml assume + # it doesn't support JSON. + if ( + oembed_endpoint.formats is not None + and "json" not in oembed_endpoint.formats + ) or api_endpoint.endswith(".xml"): + logger.info( + "Ignoring oEmbed endpoint due to not supporting JSON: %s", + api_endpoint, + ) + continue + + # Iterate through each URL pattern and point it to the endpoint. for pattern in oembed_endpoint.url_patterns: - self._oembed_patterns[pattern] = oembed_endpoint.api_endpoint + self._oembed_patterns[pattern] = api_endpoint self._client = client def get_oembed_url(self, url: str) -> Optional[str]: @@ -86,11 +102,15 @@ class OEmbedProvider: """ try: logger.debug("Trying to get oEmbed content for url '%s'", url) + + # Note that only the JSON format is supported, some endpoints want + # this in the URL, others want it as an argument. + endpoint = endpoint.replace("{format}", "json") + result = await self._client.get_json( endpoint, # TODO Specify max height / width. - # Note that only the JSON format is supported. - args={"url": url}, + args={"url": url, "format": "json"}, ) # Ensure there's a version of 1.0. diff --git a/tests/rest/media/v1/test_url_preview.py b/tests/rest/media/v1/test_url_preview.py index 7fa9027227..9f6fbfe6de 100644 --- a/tests/rest/media/v1/test_url_preview.py +++ b/tests/rest/media/v1/test_url_preview.py @@ -92,7 +92,15 @@ class URLPreviewTests(unittest.HomeserverTestCase): url_patterns=[ re.compile(r"http://twitter\.com/.+/status/.+"), ], - ) + formats=None, + ), + OEmbedEndpointConfig( + api_endpoint="http://www.hulu.com/api/oembed.{format}", + url_patterns=[ + re.compile(r"http://www\.hulu\.com/watch/.+"), + ], + formats=["json"], + ), ] return hs @@ -656,3 +664,48 @@ class URLPreviewTests(unittest.HomeserverTestCase): channel.json_body, {"og:title": None, "og:description": "Content Preview"}, ) + + def test_oembed_format(self): + """Test an oEmbed endpoint which requires the format in the URL.""" + self.lookups["www.hulu.com"] = [(IPv4Address, "10.1.2.3")] + + result = { + "version": "1.0", + "type": "rich", + "html": "
Content Preview
", + } + end_content = json.dumps(result).encode("utf-8") + + channel = self.make_request( + "GET", + "preview_url?url=http://www.hulu.com/watch/12345", + shorthand=False, + await_result=False, + ) + self.pump() + + client = self.reactor.tcpClients[0][2].buildProtocol(None) + server = AccumulatingProtocol() + server.makeConnection(FakeTransport(client, self.reactor)) + client.makeConnection(FakeTransport(server, self.reactor)) + client.dataReceived( + ( + b"HTTP/1.0 200 OK\r\nContent-Length: %d\r\n" + b'Content-Type: application/json; charset="utf8"\r\n\r\n' + ) + % (len(end_content),) + + end_content + ) + + self.pump() + + # The {format} should have been turned into json. + self.assertIn(b"/api/oembed.json", server.data) + # A URL parameter of format=json should be provided. + self.assertIn(b"format=json", server.data) + + self.assertEqual(channel.code, 200) + self.assertEqual( + channel.json_body, + {"og:title": None, "og:description": "Content Preview"}, + ) -- cgit 1.5.1