From 7dd0c1730a1ea5962a77b9bbb883c1690b25b686 Mon Sep 17 00:00:00 2001 From: Matthew Hodgson Date: Sun, 24 Jan 2016 18:47:27 -0500 Subject: initial WIP of a tentative preview_url endpoint - incomplete, untested, experimental, etc. just putting it here for safekeeping for now --- docs/url_previews.rst | 74 +++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 74 insertions(+) create mode 100644 docs/url_previews.rst (limited to 'docs/url_previews.rst') diff --git a/docs/url_previews.rst b/docs/url_previews.rst new file mode 100644 index 0000000000..1dc6ee0c45 --- /dev/null +++ b/docs/url_previews.rst @@ -0,0 +1,74 @@ +URL Previews +============ + +Design notes on a URL previewing service for Matrix: + +Options are: + + 1. Have an AS which listens for URLs, downloads them, and inserts an event that describes their metadata. + * Pros: + * Decouples the implementation entirely from Synapse. + * Uses existing Matrix events & content repo to store the metadata. + * Cons: + * Which AS should provide this service for a room, and why should you trust it? + * Doesn't work well with E2E; you'd have to cut the AS into every room + * the AS would end up subscribing to every room anyway. + + 2. Have a generic preview API (nothing to do with Matrix) that provides a previewing service: + * Pros: + * Simple and flexible; can be used by any clients at any point + * Cons: + * If each HS provides one of these independently, all the HSes in a room may needlessly DoS the target URI + * We need somewhere to store the URL metadata rather than just using Matrix itself + * We can't piggyback on matrix to distribute the metadata between HSes. + + 3. Make the synapse of the sending user responsible for spidering the URL and inserting an event asynchronously which describes the metadata. + * Pros: + * Works transparently for all clients + * Piggy-backs nicely on using Matrix for distributing the metadata. + * No confusion as to which AS + * Cons: + * Doesn't work with E2E + * We might want to decouple the implementation of the spider from the HS, given spider behaviour can be quite complicated and evolve much more rapidly than the HS. It's more like a bot than a core part of the server. + + 4. Make the sending client use the preview API and insert the event itself when successful. + * Pros: + * Works well with E2E + * No custom server functionality + * Lets the client customise the preview that they send (like on FB) + * Cons: + * Entirely specific to the sending client, whereas it'd be nice if /any/ URL was correctly previewed if clients support it. + + 5. Have the option of specifying a shared (centralised) previewing service used by a room, to avoid all the different HSes in the room DoSing the target. + +Best solution is probably a combination of both 2 and 4. + * Sending clients do their best to create and send a preview at the point of sending the message, perhaps delaying the message until the preview is computed? (This also lets the user validate the preview before sending) + * Receiving clients have the option of going and creating their own preview if one doesn't arrive soon enough (or if the original sender didn't create one) + +This is a bit magical though in that the preview could come from two entirely different sources - the sending HS or your local one. However, this can always be exposed to users: "Generate your own URL previews if none are available?" + +This is tantamount also to senders calculating their own thumbnails for sending in advance of the main content - we are trusting the sender not to lie about the content in the thumbnail. Whereas currently thumbnails are calculated by the receiving homeserver to avoid this attack. + +However, this kind of phishing attack does exist whether we let senders pick their thumbnails or not, in that a malicious sender can send normal text messages around the attachment claiming it to be legitimate. We could rely on (future) reputation/abuse management to punish users who phish (be it with bogus metadata or bogus descriptions). Bogus metadata is particularly bad though, especially if it's avoidable. + +As a first cut, let's do #2 and have the receiver hit the API to calculate its own previews (as it does currently for image thumbnails). We can then extend/optimise this to option 4 as a special extra if needed. + +API +--- + +GET /_matrix/media/r0/previewUrl?url=http://wherever.com +200 OK +{ + "og:type" : "article" + "og:url" : "https://twitter.com/matrixdotorg/status/684074366691356672" + "og:title" : "Matrix on Twitter" + "og:image" : "https://pbs.twimg.com/profile_images/500400952029888512/yI0qtFi7_400x400.png" + "og:description" : "“Synapse 0.12 is out! Lots of polishing, performance & bugfixes: /sync API, /r0 prefix, fulltext search, 3PID invites https://t.co/5alhXLLEGP”" + "og:site_name" : "Twitter" +} + +* Downloads the URL + * If HTML, just stores it in RAM and parses it for OG meta tags + * Download any media OG meta tags to the media repo, and refer to them in the OG via mxc:// URIs. + * If a media filetype we know we can thumbnail: store it on disk, and hand it to the thumbnailer. Generate OG meta tags from the thumbnailer contents. + * Otherwise, don't bother downloading further. -- cgit 1.4.1 From 64b4aead15927be56d7433250462c03f2d1f4565 Mon Sep 17 00:00:00 2001 From: Matthew Hodgson Date: Tue, 29 Mar 2016 03:13:25 +0100 Subject: make it work --- docs/url_previews.rst | 2 +- synapse/http/client.py | 3 +- synapse/rest/media/v1/base_resource.py | 1 + synapse/rest/media/v1/preview_url_resource.py | 131 +++++++++++++++----------- 4 files changed, 80 insertions(+), 57 deletions(-) (limited to 'docs/url_previews.rst') diff --git a/docs/url_previews.rst b/docs/url_previews.rst index 1dc6ee0c45..634d9d907f 100644 --- a/docs/url_previews.rst +++ b/docs/url_previews.rst @@ -56,7 +56,7 @@ As a first cut, let's do #2 and have the receiver hit the API to calculate its o API --- -GET /_matrix/media/r0/previewUrl?url=http://wherever.com +GET /_matrix/media/r0/preview_url?url=http://wherever.com 200 OK { "og:type" : "article" diff --git a/synapse/http/client.py b/synapse/http/client.py index a735300db0..cfdea91b57 100644 --- a/synapse/http/client.py +++ b/synapse/http/client.py @@ -26,6 +26,7 @@ from twisted.web.client import ( Agent, readBody, FileBodyProducer, PartialDownloadError, ) from twisted.web.http_headers import Headers +from twisted.web._newclient import ResponseDone from StringIO import StringIO @@ -266,7 +267,7 @@ class SimpleHttpClient(object): headers = dict(response.headers.getAllRawHeaders()) - if headers['Content-Length'] > max_size: + if 'Content-Length' in headers and headers['Content-Length'] > max_size: logger.warn("Requested URL is too large > %r bytes" % (self.max_size,)) # XXX: do we want to explicitly drop the connection here somehow? if so, how? raise # what should we be raising here? diff --git a/synapse/rest/media/v1/base_resource.py b/synapse/rest/media/v1/base_resource.py index 58ef91c0b8..2b1938dc8e 100644 --- a/synapse/rest/media/v1/base_resource.py +++ b/synapse/rest/media/v1/base_resource.py @@ -72,6 +72,7 @@ class BaseMediaResource(Resource): self.store = hs.get_datastore() self.max_upload_size = hs.config.max_upload_size self.max_image_pixels = hs.config.max_image_pixels + self.max_spider_size = hs.config.max_spider_size self.filepaths = filepaths self.version_string = hs.version_string self.downloads = {} diff --git a/synapse/rest/media/v1/preview_url_resource.py b/synapse/rest/media/v1/preview_url_resource.py index 5c8e20e23c..408b103367 100644 --- a/synapse/rest/media/v1/preview_url_resource.py +++ b/synapse/rest/media/v1/preview_url_resource.py @@ -12,26 +12,28 @@ # See the License for the specific language governing permissions and # limitations under the License. +from .base_resource import BaseMediaResource +from synapse.api.errors import Codes from twisted.web.resource import Resource +from twisted.web.server import NOT_DONE_YET from twisted.internet import defer from lxml import html +from synapse.util.stringutils import random_string from synapse.http.client import SimpleHttpClient -from synapse.http.server import request_handler, respond_with_json_bytes +from synapse.http.server import request_handler, respond_with_json, respond_with_json_bytes + +import os import ujson as json import logging logger = logging.getLogger(__name__) -class PreviewUrlResource(Resource): +class PreviewUrlResource(BaseMediaResource): isLeaf = True def __init__(self, hs, filepaths): - Resource.__init__(self) + BaseMediaResource.__init__(self, hs, filepaths) self.client = SimpleHttpClient(hs) - self.filepaths = filepaths - self.max_spider_size = hs.config.max_spider_size - self.server_name = hs.hostname - self.clock = hs.get_clock() def render_GET(self, request): self._async_render_GET(request) @@ -40,57 +42,76 @@ class PreviewUrlResource(Resource): @request_handler @defer.inlineCallbacks def _async_render_GET(self, request): - url = request.args.get("url") try: + # XXX: if get_user_by_req fails, what should we do in an async render? + requester = yield self.auth.get_user_by_req(request) + url = request.args.get("url")[0] + # TODO: keep track of whether there's an ongoing request for this preview # and block and return their details if there is one. - media_info = self._download_url(url) + media_info = yield self._download_url(url, requester.user) + + logger.warn("got media_info of '%s'" % media_info) + + if self._is_media(media_info['media_type']): + dims = yield self._generate_local_thumbnails( + media_info.filesystem_id, media_info + ) + + og = { + "og:description" : media_info.download_name, + "og:image" : "mxc://%s/%s" % (self.server_name, media_info.filesystem_id), + "og:image:type" : media_info['media_type'], + "og:image:width" : dims.width, + "og:image:height" : dims.height, + } + + # define our OG response for this media + elif self._is_html(media_info['media_type']): + tree = html.parse(media_info['filename']) + logger.warn(html.tostring(tree)) + + # suck it up into lxml and define our OG response. + # if we see any URLs in the OG response, then spider them + # (although the client could choose to do this by asking for previews of those URLs to avoid DoSing the server) + + # "og:type" : "article" + # "og:url" : "https://twitter.com/matrixdotorg/status/684074366691356672" + # "og:title" : "Matrix on Twitter" + # "og:image" : "https://pbs.twimg.com/profile_images/500400952029888512/yI0qtFi7_400x400.png" + # "og:description" : "Synapse 0.12 is out! Lots of polishing, performance & bugfixes: /sync API, /r0 prefix, fulltext search, 3PID invites https://t.co/5alhXLLEGP" + # "og:site_name" : "Twitter" + + og = {} + for tag in tree.xpath("//*/meta[starts-with(@property, 'og:')]"): + og[tag.attrib['property']] = tag.attrib['content'] + + # TODO: store our OG details in a cache (and expire them when stale) + # TODO: delete the content to stop diskfilling, as we only ever cared about its OG + else: + logger.warn("Failed to find any OG data in %s", url) + og = {} + + respond_with_json_bytes(request, 200, json.dumps(og), send_cors=True) except: - os.remove(fname) + # XXX: if we don't explicitly respond here, the request never returns. + # isn't this what server.py's wrapper is meant to be doing for us? + respond_with_json( + request, + 500, + { + "error": "Internal server error", + "errcode": Codes.UNKNOWN, + }, + send_cors=True + ) raise - if self._is_media(media_type): - dims = yield self._generate_local_thumbnails( - media_info.filesystem_id, media_info - ) - - og = { - "og:description" : media_info.download_name, - "og:image" : "mxc://%s/%s" % (self.server_name, media_info.filesystem_id), - "og:image:type" : media_info.media_type, - "og:image:width" : dims.width, - "og:image:height" : dims.height, - } - - # define our OG response for this media - elif self._is_html(media_type): - tree = html.parse(media_info.filename) - - # suck it up into lxml and define our OG response. - # if we see any URLs in the OG response, then spider them - # (although the client could choose to do this by asking for previews of those URLs to avoid DoSing the server) - - # "og:type" : "article" - # "og:url" : "https://twitter.com/matrixdotorg/status/684074366691356672" - # "og:title" : "Matrix on Twitter" - # "og:image" : "https://pbs.twimg.com/profile_images/500400952029888512/yI0qtFi7_400x400.png" - # "og:description" : "Synapse 0.12 is out! Lots of polishing, performance & bugfixes: /sync API, /r0 prefix, fulltext search, 3PID invites https://t.co/5alhXLLEGP" - # "og:site_name" : "Twitter" - - og = {} - for tag in tree.xpath("//*/meta[starts-with(@property, 'og:')]"): - og[tag.attrib['property']] = tag.attrib['content'] - - # TODO: store our OG details in a cache (and expire them when stale) - # TODO: delete the content to stop diskfilling, as we only ever cared about its OG - - respond_with_json_bytes(request, 200, json.dumps(og), send_cors=True) - - def _download_url(url): - requester = yield self.auth.get_user_by_req(request) + @defer.inlineCallbacks + def _download_url(self, url, user): # XXX: horrible duplication with base_resource's _download_remote_file() file_id = random_string(24) @@ -99,6 +120,7 @@ class PreviewUrlResource(Resource): try: with open(fname, "wb") as f: + logger.warn("Trying to get url '%s'" % url) length, headers = yield self.client.get_file( url, output_stream=f, max_size=self.max_spider_size, ) @@ -137,14 +159,14 @@ class PreviewUrlResource(Resource): time_now_ms=self.clock.time_msec(), upload_name=download_name, media_length=length, - user_id=requester.user, + user_id=user, ) except: os.remove(fname) raise - yield ({ + defer.returnValue({ "media_type": media_type, "media_length": length, "download_name": download_name, @@ -152,14 +174,13 @@ class PreviewUrlResource(Resource): "filesystem_id": file_id, "filename": fname, }) - return - def _is_media(content_type): + def _is_media(self, content_type): if content_type.lower().startswith("image/"): return True - def _is_html(content_type): + def _is_html(self, content_type): content_type = content_type.lower() - if (content_type == "text/html" or + if (content_type.startswith("text/html") or content_type.startswith("application/xhtml")): return True -- cgit 1.4.1