From 7dd0c1730a1ea5962a77b9bbb883c1690b25b686 Mon Sep 17 00:00:00 2001 From: Matthew Hodgson Date: Sun, 24 Jan 2016 18:47:27 -0500 Subject: initial WIP of a tentative preview_url endpoint - incomplete, untested, experimental, etc. just putting it here for safekeeping for now --- synapse/rest/media/v1/media_repository.py | 3 + synapse/rest/media/v1/preview_url_resource.py | 164 ++++++++++++++++++++++++++ 2 files changed, 167 insertions(+) create mode 100644 synapse/rest/media/v1/preview_url_resource.py (limited to 'synapse/rest') diff --git a/synapse/rest/media/v1/media_repository.py b/synapse/rest/media/v1/media_repository.py index 7dfb027dd1..8f3491b91c 100644 --- a/synapse/rest/media/v1/media_repository.py +++ b/synapse/rest/media/v1/media_repository.py @@ -17,6 +17,7 @@ from .upload_resource import UploadResource from .download_resource import DownloadResource from .thumbnail_resource import ThumbnailResource from .identicon_resource import IdenticonResource +from .preview_url_resource import PreviewUrlResource from .filepath import MediaFilePaths from twisted.web.resource import Resource @@ -78,3 +79,5 @@ class MediaRepositoryResource(Resource): self.putChild("download", DownloadResource(hs, filepaths)) self.putChild("thumbnail", ThumbnailResource(hs, filepaths)) self.putChild("identicon", IdenticonResource()) + self.putChild("preview_url", PreviewUrlResource(hs, filepaths)) + diff --git a/synapse/rest/media/v1/preview_url_resource.py b/synapse/rest/media/v1/preview_url_resource.py new file mode 100644 index 0000000000..fb8ab3096f --- /dev/null +++ b/synapse/rest/media/v1/preview_url_resource.py @@ -0,0 +1,164 @@ +# Copyright 2016 OpenMarket Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from twisted.web.resource import Resource +from lxml import html +from synapse.http.client import SimpleHttpClient +from synapse.http.server import respond_with_json_bytes +from simplejson import json + +import logging +logger = logging.getLogger(__name__) + +class PreviewUrlResource(Resource): + isLeaf = True + + def __init__(self, hs, filepaths): + Resource.__init__(self) + self.client = SimpleHttpClient(hs) + self.filepaths = filepaths + self.max_spider_size = hs.config.max_spider_size + self.server_name = hs.hostname + self.clock = hs.get_clock() + + def render_GET(self, request): + self._async_render_GET(request) + return NOT_DONE_YET + + @request_handler + @defer.inlineCallbacks + def _async_render_GET(self, request): + url = request.args.get("url") + + try: + # TODO: keep track of whether there's an ongoing request for this preview + # and block and return their details if there is one. + + media_info = self._download_url(url) + except: + os.remove(fname) + raise + + if self._is_media(media_type): + dims = yield self._generate_local_thumbnails( + media_info.filesystem_id, media_info + ) + + og = { + "og:description" : media_info.download_name, + "og:image" : "mxc://%s/%s" % (self.server_name, media_info.filesystem_id), + "og:image:type" : media_info.media_type, + "og:image:width" : dims.width, + "og:image:height" : dims.height, + } + + # define our OG response for this media + elif self._is_html(media_type): + tree = html.parse(media_info.filename) + + # suck it up into lxml and define our OG response. + # if we see any URLs in the OG response, then spider them + # (although the client could choose to do this by asking for previews of those URLs to avoid DoSing the server) + + # "og:type" : "article" + # "og:url" : "https://twitter.com/matrixdotorg/status/684074366691356672" + # "og:title" : "Matrix on Twitter" + # "og:image" : "https://pbs.twimg.com/profile_images/500400952029888512/yI0qtFi7_400x400.png" + # "og:description" : "“Synapse 0.12 is out! Lots of polishing, performance &amp; bugfixes: /sync API, /r0 prefix, fulltext search, 3PID invites https://t.co/5alhXLLEGP”" + # "og:site_name" : "Twitter" + + og = {} + for tag in tree.xpath("//*/meta[starts-with(@property, 'og:')]"): + og[tag.attrib['property']] = tag.attrib['content'] + + # TODO: store our OG details in a cache (and expire them when stale) + # TODO: delete the content to stop diskfilling, as we only ever cared about its OG + + respond_with_json_bytes(request, 200, json.dumps(og), send_cors=True) + + def _download_url(url): + requester = yield self.auth.get_user_by_req(request) + + # XXX: horrible duplication with base_resource's _download_remote_file() + file_id = random_string(24) + + fname = self.filepaths.local_media_filepath(file_id) + self._makedirs(fname) + + try: + with open(fname, "wb") as f: + length, headers = yield self.client.get_file( + url, output_stream=f, max_size=self.max_spider_size, + ) + media_type = headers["Content-Type"][0] + time_now_ms = self.clock.time_msec() + + content_disposition = headers.get("Content-Disposition", None) + if content_disposition: + _, params = cgi.parse_header(content_disposition[0],) + download_name = None + + # First check if there is a valid UTF-8 filename + download_name_utf8 = params.get("filename*", None) + if download_name_utf8: + if download_name_utf8.lower().startswith("utf-8''"): + download_name = download_name_utf8[7:] + + # If there isn't check for an ascii name. + if not download_name: + download_name_ascii = params.get("filename", None) + if download_name_ascii and is_ascii(download_name_ascii): + download_name = download_name_ascii + + if download_name: + download_name = urlparse.unquote(download_name) + try: + download_name = download_name.decode("utf-8") + except UnicodeDecodeError: + download_name = None + else: + download_name = None + + yield self.store.store_local_media( + media_id=fname, + media_type=media_type, + time_now_ms=self.clock.time_msec(), + upload_name=download_name, + media_length=length, + user_id=requester.user, + ) + + except: + os.remove(fname) + raise + + return { + "media_type": media_type, + "media_length": length, + "download_name": download_name, + "created_ts": time_now_ms, + "filesystem_id": file_id, + "filename": fname, + } + + + def _is_media(content_type): + if content_type.lower().startswith("image/"): + return True + + def _is_html(content_type): + content_type = content_type.lower() + if content_type == "text/html" or + content_type.startswith("application/xhtml"): + return True -- cgit 1.5.1 From 191c7bef6bbb80f66f66e95387940c3bb6b5a0cf Mon Sep 17 00:00:00 2001 From: Mark Haines Date: Thu, 24 Mar 2016 17:47:31 +0000 Subject: Deduplicate identical /sync requests --- synapse/handlers/sync.py | 16 +++++++++++- synapse/rest/client/v2_alpha/sync.py | 3 +++ synapse/util/caches/response_cache.py | 46 +++++++++++++++++++++++++++++++++++ 3 files changed, 64 insertions(+), 1 deletion(-) create mode 100644 synapse/util/caches/response_cache.py (limited to 'synapse/rest') diff --git a/synapse/handlers/sync.py b/synapse/handlers/sync.py index 1f6fde8e8a..48ab5707e1 100644 --- a/synapse/handlers/sync.py +++ b/synapse/handlers/sync.py @@ -20,6 +20,7 @@ from synapse.api.constants import Membership, EventTypes from synapse.util import unwrapFirstError from synapse.util.logcontext import LoggingContext, preserve_fn from synapse.util.metrics import Measure +from synapse.util.caches.response_cache import ResponseCache from synapse.push.clientformat import format_push_rules_for_user from twisted.internet import defer @@ -35,6 +36,7 @@ SyncConfig = collections.namedtuple("SyncConfig", [ "user", "filter_collection", "is_guest", + "request_key", ]) @@ -136,8 +138,8 @@ class SyncHandler(BaseHandler): super(SyncHandler, self).__init__(hs) self.event_sources = hs.get_event_sources() self.clock = hs.get_clock() + self.response_cache = ResponseCache() - @defer.inlineCallbacks def wait_for_sync_for_user(self, sync_config, since_token=None, timeout=0, full_state=False): """Get the sync for a client if we have new data for it now. Otherwise @@ -146,7 +148,19 @@ class SyncHandler(BaseHandler): Returns: A Deferred SyncResult. """ + result = self.response_cache.get(sync_config.request_key) + if not result: + result = self.response_cache.set( + sync_config.request_key, + self._wait_for_sync_for_user( + sync_config, since_token, timeout, full_state + ) + ) + return result + @defer.inlineCallbacks + def _wait_for_sync_for_user(self, sync_config, since_token, timeout, + full_state): context = LoggingContext.current_context() if context: if since_token is None: diff --git a/synapse/rest/client/v2_alpha/sync.py b/synapse/rest/client/v2_alpha/sync.py index de4a020ad4..c5785d7074 100644 --- a/synapse/rest/client/v2_alpha/sync.py +++ b/synapse/rest/client/v2_alpha/sync.py @@ -115,6 +115,8 @@ class SyncRestServlet(RestServlet): ) ) + request_key = (user, timeout, since, filter_id, full_state) + if filter_id: if filter_id.startswith('{'): try: @@ -134,6 +136,7 @@ class SyncRestServlet(RestServlet): user=user, filter_collection=filter, is_guest=requester.is_guest, + request_key=request_key, ) if since is not None: diff --git a/synapse/util/caches/response_cache.py b/synapse/util/caches/response_cache.py new file mode 100644 index 0000000000..1c2e344269 --- /dev/null +++ b/synapse/util/caches/response_cache.py @@ -0,0 +1,46 @@ +# -*- coding: utf-8 -*- +# Copyright 2016 OpenMarket Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from synapse.util.async import ObservableDeferred + + +class ResponseCache(object): + """ + This caches a deferred response. Until the deferred completes it will be + returned from the cache. This means that if the client retries the request + while the response is still being computed, that original response will be + used rather than trying to compute a new response. + """ + + def __init__(self): + self.pending_result_cache = {} # Request that haven't finished yet. + + def get(self, key): + result = self.pending_result_cache.get(key) + if result is not None: + return result.observe() + else: + return None + + def set(self, key, deferred): + result = ObservableDeferred(deferred) + self.pending_result_cache[key] = result + + def remove(r): + self.pending_result_cache.pop(key, None) + return r + + result.addBoth(remove) + return result.observe() -- cgit 1.5.1 From dd4287ca5d0c3e3df566748e0dd6ab36398f64b4 Mon Sep 17 00:00:00 2001 From: Matthew Hodgson Date: Tue, 29 Mar 2016 02:07:57 +0100 Subject: make it build --- synapse/http/client.py | 2 +- synapse/python_dependencies.py | 1 + synapse/rest/media/v1/preview_url_resource.py | 17 +++++++++-------- 3 files changed, 11 insertions(+), 9 deletions(-) (limited to 'synapse/rest') diff --git a/synapse/http/client.py b/synapse/http/client.py index 127690e534..a735300db0 100644 --- a/synapse/http/client.py +++ b/synapse/http/client.py @@ -21,7 +21,7 @@ import synapse.metrics from canonicaljson import encode_canonical_json -from twisted.internet import defer, reactor, ssl +from twisted.internet import defer, reactor, ssl, protocol from twisted.web.client import ( Agent, readBody, FileBodyProducer, PartialDownloadError, ) diff --git a/synapse/python_dependencies.py b/synapse/python_dependencies.py index 0a6043ae8d..d12ef15043 100644 --- a/synapse/python_dependencies.py +++ b/synapse/python_dependencies.py @@ -36,6 +36,7 @@ REQUIREMENTS = { "blist": ["blist"], "pysaml2>=3.0.0,<4.0.0": ["saml2>=3.0.0,<4.0.0"], "pymacaroons-pynacl": ["pymacaroons"], + "lxml>=3.6.0": ["lxml"], } CONDITIONAL_REQUIREMENTS = { "web_client": { diff --git a/synapse/rest/media/v1/preview_url_resource.py b/synapse/rest/media/v1/preview_url_resource.py index fb8ab3096f..5c8e20e23c 100644 --- a/synapse/rest/media/v1/preview_url_resource.py +++ b/synapse/rest/media/v1/preview_url_resource.py @@ -13,10 +13,11 @@ # limitations under the License. from twisted.web.resource import Resource +from twisted.internet import defer from lxml import html from synapse.http.client import SimpleHttpClient -from synapse.http.server import respond_with_json_bytes -from simplejson import json +from synapse.http.server import request_handler, respond_with_json_bytes +import ujson as json import logging logger = logging.getLogger(__name__) @@ -75,7 +76,7 @@ class PreviewUrlResource(Resource): # "og:url" : "https://twitter.com/matrixdotorg/status/684074366691356672" # "og:title" : "Matrix on Twitter" # "og:image" : "https://pbs.twimg.com/profile_images/500400952029888512/yI0qtFi7_400x400.png" - # "og:description" : "“Synapse 0.12 is out! Lots of polishing, performance &amp; bugfixes: /sync API, /r0 prefix, fulltext search, 3PID invites https://t.co/5alhXLLEGP”" + # "og:description" : "Synapse 0.12 is out! Lots of polishing, performance &amp; bugfixes: /sync API, /r0 prefix, fulltext search, 3PID invites https://t.co/5alhXLLEGP" # "og:site_name" : "Twitter" og = {} @@ -143,15 +144,15 @@ class PreviewUrlResource(Resource): os.remove(fname) raise - return { + yield ({ "media_type": media_type, "media_length": length, "download_name": download_name, "created_ts": time_now_ms, "filesystem_id": file_id, "filename": fname, - } - + }) + return def _is_media(content_type): if content_type.lower().startswith("image/"): @@ -159,6 +160,6 @@ class PreviewUrlResource(Resource): def _is_html(content_type): content_type = content_type.lower() - if content_type == "text/html" or - content_type.startswith("application/xhtml"): + if (content_type == "text/html" or + content_type.startswith("application/xhtml")): return True -- cgit 1.5.1 From 64b4aead15927be56d7433250462c03f2d1f4565 Mon Sep 17 00:00:00 2001 From: Matthew Hodgson Date: Tue, 29 Mar 2016 03:13:25 +0100 Subject: make it work --- docs/url_previews.rst | 2 +- synapse/http/client.py | 3 +- synapse/rest/media/v1/base_resource.py | 1 + synapse/rest/media/v1/preview_url_resource.py | 131 +++++++++++++++----------- 4 files changed, 80 insertions(+), 57 deletions(-) (limited to 'synapse/rest') diff --git a/docs/url_previews.rst b/docs/url_previews.rst index 1dc6ee0c45..634d9d907f 100644 --- a/docs/url_previews.rst +++ b/docs/url_previews.rst @@ -56,7 +56,7 @@ As a first cut, let's do #2 and have the receiver hit the API to calculate its o API --- -GET /_matrix/media/r0/previewUrl?url=http://wherever.com +GET /_matrix/media/r0/preview_url?url=http://wherever.com 200 OK { "og:type" : "article" diff --git a/synapse/http/client.py b/synapse/http/client.py index a735300db0..cfdea91b57 100644 --- a/synapse/http/client.py +++ b/synapse/http/client.py @@ -26,6 +26,7 @@ from twisted.web.client import ( Agent, readBody, FileBodyProducer, PartialDownloadError, ) from twisted.web.http_headers import Headers +from twisted.web._newclient import ResponseDone from StringIO import StringIO @@ -266,7 +267,7 @@ class SimpleHttpClient(object): headers = dict(response.headers.getAllRawHeaders()) - if headers['Content-Length'] > max_size: + if 'Content-Length' in headers and headers['Content-Length'] > max_size: logger.warn("Requested URL is too large > %r bytes" % (self.max_size,)) # XXX: do we want to explicitly drop the connection here somehow? if so, how? raise # what should we be raising here? diff --git a/synapse/rest/media/v1/base_resource.py b/synapse/rest/media/v1/base_resource.py index 58ef91c0b8..2b1938dc8e 100644 --- a/synapse/rest/media/v1/base_resource.py +++ b/synapse/rest/media/v1/base_resource.py @@ -72,6 +72,7 @@ class BaseMediaResource(Resource): self.store = hs.get_datastore() self.max_upload_size = hs.config.max_upload_size self.max_image_pixels = hs.config.max_image_pixels + self.max_spider_size = hs.config.max_spider_size self.filepaths = filepaths self.version_string = hs.version_string self.downloads = {} diff --git a/synapse/rest/media/v1/preview_url_resource.py b/synapse/rest/media/v1/preview_url_resource.py index 5c8e20e23c..408b103367 100644 --- a/synapse/rest/media/v1/preview_url_resource.py +++ b/synapse/rest/media/v1/preview_url_resource.py @@ -12,26 +12,28 @@ # See the License for the specific language governing permissions and # limitations under the License. +from .base_resource import BaseMediaResource +from synapse.api.errors import Codes from twisted.web.resource import Resource +from twisted.web.server import NOT_DONE_YET from twisted.internet import defer from lxml import html +from synapse.util.stringutils import random_string from synapse.http.client import SimpleHttpClient -from synapse.http.server import request_handler, respond_with_json_bytes +from synapse.http.server import request_handler, respond_with_json, respond_with_json_bytes + +import os import ujson as json import logging logger = logging.getLogger(__name__) -class PreviewUrlResource(Resource): +class PreviewUrlResource(BaseMediaResource): isLeaf = True def __init__(self, hs, filepaths): - Resource.__init__(self) + BaseMediaResource.__init__(self, hs, filepaths) self.client = SimpleHttpClient(hs) - self.filepaths = filepaths - self.max_spider_size = hs.config.max_spider_size - self.server_name = hs.hostname - self.clock = hs.get_clock() def render_GET(self, request): self._async_render_GET(request) @@ -40,57 +42,76 @@ class PreviewUrlResource(Resource): @request_handler @defer.inlineCallbacks def _async_render_GET(self, request): - url = request.args.get("url") try: + # XXX: if get_user_by_req fails, what should we do in an async render? + requester = yield self.auth.get_user_by_req(request) + url = request.args.get("url")[0] + # TODO: keep track of whether there's an ongoing request for this preview # and block and return their details if there is one. - media_info = self._download_url(url) + media_info = yield self._download_url(url, requester.user) + + logger.warn("got media_info of '%s'" % media_info) + + if self._is_media(media_info['media_type']): + dims = yield self._generate_local_thumbnails( + media_info.filesystem_id, media_info + ) + + og = { + "og:description" : media_info.download_name, + "og:image" : "mxc://%s/%s" % (self.server_name, media_info.filesystem_id), + "og:image:type" : media_info['media_type'], + "og:image:width" : dims.width, + "og:image:height" : dims.height, + } + + # define our OG response for this media + elif self._is_html(media_info['media_type']): + tree = html.parse(media_info['filename']) + logger.warn(html.tostring(tree)) + + # suck it up into lxml and define our OG response. + # if we see any URLs in the OG response, then spider them + # (although the client could choose to do this by asking for previews of those URLs to avoid DoSing the server) + + # "og:type" : "article" + # "og:url" : "https://twitter.com/matrixdotorg/status/684074366691356672" + # "og:title" : "Matrix on Twitter" + # "og:image" : "https://pbs.twimg.com/profile_images/500400952029888512/yI0qtFi7_400x400.png" + # "og:description" : "Synapse 0.12 is out! Lots of polishing, performance &amp; bugfixes: /sync API, /r0 prefix, fulltext search, 3PID invites https://t.co/5alhXLLEGP" + # "og:site_name" : "Twitter" + + og = {} + for tag in tree.xpath("//*/meta[starts-with(@property, 'og:')]"): + og[tag.attrib['property']] = tag.attrib['content'] + + # TODO: store our OG details in a cache (and expire them when stale) + # TODO: delete the content to stop diskfilling, as we only ever cared about its OG + else: + logger.warn("Failed to find any OG data in %s", url) + og = {} + + respond_with_json_bytes(request, 200, json.dumps(og), send_cors=True) except: - os.remove(fname) + # XXX: if we don't explicitly respond here, the request never returns. + # isn't this what server.py's wrapper is meant to be doing for us? + respond_with_json( + request, + 500, + { + "error": "Internal server error", + "errcode": Codes.UNKNOWN, + }, + send_cors=True + ) raise - if self._is_media(media_type): - dims = yield self._generate_local_thumbnails( - media_info.filesystem_id, media_info - ) - - og = { - "og:description" : media_info.download_name, - "og:image" : "mxc://%s/%s" % (self.server_name, media_info.filesystem_id), - "og:image:type" : media_info.media_type, - "og:image:width" : dims.width, - "og:image:height" : dims.height, - } - - # define our OG response for this media - elif self._is_html(media_type): - tree = html.parse(media_info.filename) - - # suck it up into lxml and define our OG response. - # if we see any URLs in the OG response, then spider them - # (although the client could choose to do this by asking for previews of those URLs to avoid DoSing the server) - - # "og:type" : "article" - # "og:url" : "https://twitter.com/matrixdotorg/status/684074366691356672" - # "og:title" : "Matrix on Twitter" - # "og:image" : "https://pbs.twimg.com/profile_images/500400952029888512/yI0qtFi7_400x400.png" - # "og:description" : "Synapse 0.12 is out! Lots of polishing, performance &amp; bugfixes: /sync API, /r0 prefix, fulltext search, 3PID invites https://t.co/5alhXLLEGP" - # "og:site_name" : "Twitter" - - og = {} - for tag in tree.xpath("//*/meta[starts-with(@property, 'og:')]"): - og[tag.attrib['property']] = tag.attrib['content'] - - # TODO: store our OG details in a cache (and expire them when stale) - # TODO: delete the content to stop diskfilling, as we only ever cared about its OG - - respond_with_json_bytes(request, 200, json.dumps(og), send_cors=True) - - def _download_url(url): - requester = yield self.auth.get_user_by_req(request) + @defer.inlineCallbacks + def _download_url(self, url, user): # XXX: horrible duplication with base_resource's _download_remote_file() file_id = random_string(24) @@ -99,6 +120,7 @@ class PreviewUrlResource(Resource): try: with open(fname, "wb") as f: + logger.warn("Trying to get url '%s'" % url) length, headers = yield self.client.get_file( url, output_stream=f, max_size=self.max_spider_size, ) @@ -137,14 +159,14 @@ class PreviewUrlResource(Resource): time_now_ms=self.clock.time_msec(), upload_name=download_name, media_length=length, - user_id=requester.user, + user_id=user, ) except: os.remove(fname) raise - yield ({ + defer.returnValue({ "media_type": media_type, "media_length": length, "download_name": download_name, @@ -152,14 +174,13 @@ class PreviewUrlResource(Resource): "filesystem_id": file_id, "filename": fname, }) - return - def _is_media(content_type): + def _is_media(self, content_type): if content_type.lower().startswith("image/"): return True - def _is_html(content_type): + def _is_html(self, content_type): content_type = content_type.lower() - if (content_type == "text/html" or + if (content_type.startswith("text/html") or content_type.startswith("application/xhtml")): return True -- cgit 1.5.1 From 19038582d3957eef2b662d28035361ecf9d3a84e Mon Sep 17 00:00:00 2001 From: Matthew Hodgson Date: Tue, 29 Mar 2016 03:14:16 +0100 Subject: debug --- synapse/rest/media/v1/preview_url_resource.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'synapse/rest') diff --git a/synapse/rest/media/v1/preview_url_resource.py b/synapse/rest/media/v1/preview_url_resource.py index 408b103367..4f7c9e3d1b 100644 --- a/synapse/rest/media/v1/preview_url_resource.py +++ b/synapse/rest/media/v1/preview_url_resource.py @@ -53,7 +53,7 @@ class PreviewUrlResource(BaseMediaResource): media_info = yield self._download_url(url, requester.user) - logger.warn("got media_info of '%s'" % media_info) + logger.debug("got media_info of '%s'" % media_info) if self._is_media(media_info['media_type']): dims = yield self._generate_local_thumbnails( @@ -71,7 +71,6 @@ class PreviewUrlResource(BaseMediaResource): # define our OG response for this media elif self._is_html(media_info['media_type']): tree = html.parse(media_info['filename']) - logger.warn(html.tostring(tree)) # suck it up into lxml and define our OG response. # if we see any URLs in the OG response, then spider them @@ -120,7 +119,7 @@ class PreviewUrlResource(BaseMediaResource): try: with open(fname, "wb") as f: - logger.warn("Trying to get url '%s'" % url) + logger.debug("Trying to get url '%s'" % url) length, headers = yield self.client.get_file( url, output_stream=f, max_size=self.max_spider_size, ) -- cgit 1.5.1 From ae5831d30354c713cd1693f3b74cf048de7428a7 Mon Sep 17 00:00:00 2001 From: Matthew Hodgson Date: Tue, 29 Mar 2016 03:32:55 +0100 Subject: fix bugs --- synapse/rest/media/v1/preview_url_resource.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) (limited to 'synapse/rest') diff --git a/synapse/rest/media/v1/preview_url_resource.py b/synapse/rest/media/v1/preview_url_resource.py index 4f7c9e3d1b..b999944e86 100644 --- a/synapse/rest/media/v1/preview_url_resource.py +++ b/synapse/rest/media/v1/preview_url_resource.py @@ -57,15 +57,15 @@ class PreviewUrlResource(BaseMediaResource): if self._is_media(media_info['media_type']): dims = yield self._generate_local_thumbnails( - media_info.filesystem_id, media_info + media_info['filesystem_id'], media_info ) og = { - "og:description" : media_info.download_name, - "og:image" : "mxc://%s/%s" % (self.server_name, media_info.filesystem_id), + "og:description" : media_info['download_name'], + "og:image" : "mxc://%s/%s" % (self.server_name, media_info['filesystem_id']), "og:image:type" : media_info['media_type'], - "og:image:width" : dims.width, - "og:image:height" : dims.height, + "og:image:width" : dims['width'], + "og:image:height" : dims['height'], } # define our OG response for this media @@ -123,6 +123,7 @@ class PreviewUrlResource(BaseMediaResource): length, headers = yield self.client.get_file( url, output_stream=f, max_size=self.max_spider_size, ) + # FIXME: handle 404s sanely - don't spider an error page media_type = headers["Content-Type"][0] time_now_ms = self.clock.time_msec() -- cgit 1.5.1 From 3f9948a069498e9966166a0fa581bdbf872d4ad3 Mon Sep 17 00:00:00 2001 From: Niklas Riekenbrauck Date: Mon, 28 Mar 2016 21:33:40 +0200 Subject: Add JWT support --- synapse/config/homeserver.py | 3 ++- synapse/config/jwt.py | 37 ++++++++++++++++++++++++++++ synapse/python_dependencies.py | 1 + synapse/rest/client/v1/login.py | 53 +++++++++++++++++++++++++++++++++++++++++ 4 files changed, 93 insertions(+), 1 deletion(-) create mode 100644 synapse/config/jwt.py (limited to 'synapse/rest') diff --git a/synapse/config/homeserver.py b/synapse/config/homeserver.py index a08c170f1d..acf74c8761 100644 --- a/synapse/config/homeserver.py +++ b/synapse/config/homeserver.py @@ -29,13 +29,14 @@ from .key import KeyConfig from .saml2 import SAML2Config from .cas import CasConfig from .password import PasswordConfig +from .jwt import JWTConfig class HomeServerConfig(TlsConfig, ServerConfig, DatabaseConfig, LoggingConfig, RatelimitConfig, ContentRepositoryConfig, CaptchaConfig, VoipConfig, RegistrationConfig, MetricsConfig, ApiConfig, AppServiceConfig, KeyConfig, SAML2Config, CasConfig, - PasswordConfig,): + JWTConfig, PasswordConfig,): pass diff --git a/synapse/config/jwt.py b/synapse/config/jwt.py new file mode 100644 index 0000000000..4cb092bbec --- /dev/null +++ b/synapse/config/jwt.py @@ -0,0 +1,37 @@ +# -*- coding: utf-8 -*- +# Copyright 2015 Niklas Riekenbrauck +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from ._base import Config + + +class JWTConfig(Config): + def read_config(self, config): + jwt_config = config.get("jwt_config", None) + if jwt_config: + self.jwt_enabled = jwt_config.get("enabled", False) + self.jwt_secret = jwt_config["secret"] + self.jwt_algorithm = jwt_config["algorithm"] + else: + self.jwt_enabled = False + self.jwt_secret = None + self.jwt_algorithm = None + + def default_config(self, **kwargs): + return """\ + # jwt_config: + # enabled: true + # secret: "a secret" + # algorithm: "HS256" + """ diff --git a/synapse/python_dependencies.py b/synapse/python_dependencies.py index 0a6043ae8d..cf1414b4db 100644 --- a/synapse/python_dependencies.py +++ b/synapse/python_dependencies.py @@ -36,6 +36,7 @@ REQUIREMENTS = { "blist": ["blist"], "pysaml2>=3.0.0,<4.0.0": ["saml2>=3.0.0,<4.0.0"], "pymacaroons-pynacl": ["pymacaroons"], + "pyjwt": ["jwt"], } CONDITIONAL_REQUIREMENTS = { "web_client": { diff --git a/synapse/rest/client/v1/login.py b/synapse/rest/client/v1/login.py index fe593d07ce..d14ce3efa2 100644 --- a/synapse/rest/client/v1/login.py +++ b/synapse/rest/client/v1/login.py @@ -33,6 +33,9 @@ from saml2.client import Saml2Client import xml.etree.ElementTree as ET +import jwt +from jwt.exceptions import InvalidTokenError + logger = logging.getLogger(__name__) @@ -43,12 +46,16 @@ class LoginRestServlet(ClientV1RestServlet): SAML2_TYPE = "m.login.saml2" CAS_TYPE = "m.login.cas" TOKEN_TYPE = "m.login.token" + JWT_TYPE = "m.login.jwt" def __init__(self, hs): super(LoginRestServlet, self).__init__(hs) self.idp_redirect_url = hs.config.saml2_idp_redirect_url self.password_enabled = hs.config.password_enabled self.saml2_enabled = hs.config.saml2_enabled + self.jwt_enabled = hs.config.jwt_enabled + self.jwt_secret = hs.config.jwt_secret + self.jwt_algorithm = hs.config.jwt_algorithm self.cas_enabled = hs.config.cas_enabled self.cas_server_url = hs.config.cas_server_url self.cas_required_attributes = hs.config.cas_required_attributes @@ -57,6 +64,8 @@ class LoginRestServlet(ClientV1RestServlet): def on_GET(self, request): flows = [] + if self.jwt_enabled: + flows.append({"type": LoginRestServlet.JWT_TYPE}) if self.saml2_enabled: flows.append({"type": LoginRestServlet.SAML2_TYPE}) if self.cas_enabled: @@ -98,6 +107,10 @@ class LoginRestServlet(ClientV1RestServlet): "uri": "%s%s" % (self.idp_redirect_url, relay_state) } defer.returnValue((200, result)) + elif self.jwt_enabled and (login_submission["type"] == + LoginRestServlet.JWT_TYPE): + result = yield self.do_jwt_login(login_submission) + defer.returnValue(result) # TODO Delete this after all CAS clients switch to token login instead elif self.cas_enabled and (login_submission["type"] == LoginRestServlet.CAS_TYPE): @@ -209,6 +222,46 @@ class LoginRestServlet(ClientV1RestServlet): defer.returnValue((200, result)) + @defer.inlineCallbacks + def do_jwt_login(self, login_submission): + token = login_submission['token'] + if token is None: + raise LoginError(401, "Unauthorized", errcode=Codes.UNAUTHORIZED) + + try: + payload = jwt.decode(token, self.jwt_secret, algorithms=[self.jwt_algorithm]) + except InvalidTokenError: + raise LoginError(401, "Invalid JWT", errcode=Codes.UNAUTHORIZED) + + user = payload['user'] + if user is None: + raise LoginError(401, "Invalid JWT", errcode=Codes.UNAUTHORIZED) + + user_id = UserID.create(user, self.hs.hostname).to_string() + auth_handler = self.handlers.auth_handler + user_exists = yield auth_handler.does_user_exist(user_id) + if user_exists: + user_id, access_token, refresh_token = ( + yield auth_handler.get_login_tuple_for_user_id(user_id) + ) + result = { + "user_id": user_id, # may have changed + "access_token": access_token, + "refresh_token": refresh_token, + "home_server": self.hs.hostname, + } + else: + user_id, access_token = ( + yield self.handlers.registration_handler.register(localpart=user) + ) + result = { + "user_id": user_id, # may have changed + "access_token": access_token, + "home_server": self.hs.hostname, + } + + defer.returnValue((200, result)) + # TODO Delete this after all CAS clients switch to token login instead def parse_cas_response(self, cas_response_body): root = ET.fromstring(cas_response_body) -- cgit 1.5.1 From fddb6fddc1f1e70ab79d8d4ed276f722ab2ea058 Mon Sep 17 00:00:00 2001 From: Erik Johnston Date: Wed, 30 Mar 2016 10:54:01 +0100 Subject: Require user to have left room to forget room This dramatically simplifies the forget API code - in particular it no longer generates a leave event. --- synapse/handlers/room.py | 22 ++++++++++++++++------ synapse/rest/client/v1/room.py | 38 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 54 insertions(+), 6 deletions(-) (limited to 'synapse/rest') diff --git a/synapse/handlers/room.py b/synapse/handlers/room.py index 133183a257..1d4c2c39a1 100644 --- a/synapse/handlers/room.py +++ b/synapse/handlers/room.py @@ -416,8 +416,6 @@ class RoomMemberHandler(BaseHandler): effective_membership_state = action if action in ["kick", "unban"]: effective_membership_state = "leave" - elif action == "forget": - effective_membership_state = "leave" if third_party_signed is not None: replication = self.hs.get_replication_layer() @@ -473,9 +471,6 @@ class RoomMemberHandler(BaseHandler): remote_room_hosts=remote_room_hosts, ) - if action == "forget": - yield self.forget(requester.user, room_id) - @defer.inlineCallbacks def send_membership_event( self, @@ -935,8 +930,23 @@ class RoomMemberHandler(BaseHandler): display_name = data["display_name"] defer.returnValue((token, public_keys, fallback_public_key, display_name)) + @defer.inlineCallbacks def forget(self, user, room_id): - return self.store.forget(user.to_string(), room_id) + user_id = user.to_string() + + member = yield self.state_handler.get_current_state( + room_id=room_id, + event_type=EventTypes.Member, + state_key=user_id + ) + membership = member.membership if member else None + + if membership is not None and membership != Membership.LEAVE: + raise SynapseError(400, "User %s in room %s" % ( + user_id, room_id + )) + + yield self.store.forget(user_id, room_id) class RoomListHandler(BaseHandler): diff --git a/synapse/rest/client/v1/room.py b/synapse/rest/client/v1/room.py index a1fa7daf79..ccb6e3c45e 100644 --- a/synapse/rest/client/v1/room.py +++ b/synapse/rest/client/v1/room.py @@ -405,6 +405,43 @@ class RoomEventContext(ClientV1RestServlet): defer.returnValue((200, results)) +class RoomForgetRestServlet(ClientV1RestServlet): + def register(self, http_server): + # /rooms/$roomid/[invite|join|leave] + PATTERNS = ("/rooms/(?P[^/]*)/forget") + register_txn_path(self, PATTERNS, http_server) + + @defer.inlineCallbacks + def on_POST(self, request, room_id, txn_id=None): + requester = yield self.auth.get_user_by_req( + request, + allow_guest=False, + ) + + yield self.handlers.room_member_handler.forget( + user=requester.user, + room_id=room_id, + ) + + defer.returnValue((200, {})) + + @defer.inlineCallbacks + def on_PUT(self, request, room_id, txn_id): + try: + defer.returnValue( + self.txns.get_client_transaction(request, txn_id) + ) + except KeyError: + pass + + response = yield self.on_POST( + request, room_id, txn_id + ) + + self.txns.store_client_transaction(request, txn_id, response) + defer.returnValue(response) + + # TODO: Needs unit testing class RoomMembershipRestServlet(ClientV1RestServlet): @@ -624,6 +661,7 @@ def register_servlets(hs, http_server): RoomMemberListRestServlet(hs).register(http_server) RoomMessageListRestServlet(hs).register(http_server) JoinRoomAliasServlet(hs).register(http_server) + RoomForgetRestServlet(hs).register(http_server) RoomMembershipRestServlet(hs).register(http_server) RoomSendEventRestServlet(hs).register(http_server) PublicRoomListRestServlet(hs).register(http_server) -- cgit 1.5.1 From 08a8514b7a05bf2b6d1f8a5d8a3b8985c78ade9e Mon Sep 17 00:00:00 2001 From: Erik Johnston Date: Wed, 30 Mar 2016 15:05:33 +0100 Subject: Remove spurious comment --- synapse/rest/client/v1/room.py | 1 - 1 file changed, 1 deletion(-) (limited to 'synapse/rest') diff --git a/synapse/rest/client/v1/room.py b/synapse/rest/client/v1/room.py index ccb6e3c45e..b223fb7e5f 100644 --- a/synapse/rest/client/v1/room.py +++ b/synapse/rest/client/v1/room.py @@ -407,7 +407,6 @@ class RoomEventContext(ClientV1RestServlet): class RoomForgetRestServlet(ClientV1RestServlet): def register(self, http_server): - # /rooms/$roomid/[invite|join|leave] PATTERNS = ("/rooms/(?P[^/]*)/forget") register_txn_path(self, PATTERNS, http_server) -- cgit 1.5.1 From a8a5dd3b44a4526307502bd621ee0bd43c87c77f Mon Sep 17 00:00:00 2001 From: Matthew Hodgson Date: Thu, 31 Mar 2016 01:55:21 +0100 Subject: handle requests with missing content-length headers (e.g. YouTube) --- synapse/http/client.py | 33 +++++++++++++++++++++------ synapse/rest/media/v1/preview_url_resource.py | 4 ++-- 2 files changed, 28 insertions(+), 9 deletions(-) (limited to 'synapse/rest') diff --git a/synapse/http/client.py b/synapse/http/client.py index 71b2e3375e..30f31a915d 100644 --- a/synapse/http/client.py +++ b/synapse/http/client.py @@ -23,8 +23,9 @@ from canonicaljson import encode_canonical_json from twisted.internet import defer, reactor, ssl, protocol from twisted.web.client import ( - RedirectAgent, Agent, readBody, FileBodyProducer, PartialDownloadError, + BrowserLikeRedirectAgent, Agent, readBody, FileBodyProducer, PartialDownloadError, ) +from twisted.web.http import PotentialDataLoss from twisted.web.http_headers import Headers from twisted.web._newclient import ResponseDone @@ -59,11 +60,11 @@ class SimpleHttpClient(object): # The default context factory in Twisted 14.0.0 (which we require) is # BrowserLikePolicyForHTTPS which will do regular cert validation # 'like a browser' - self.agent = RedirectAgent(Agent( + self.agent = Agent( reactor, connectTimeout=15, contextFactory=hs.get_http_client_context_factory() - )) + ) self.user_agent = hs.version_string if hs.config.user_agent_suffix: self.user_agent = "%s %s" % (self.user_agent, hs.config.user_agent_suffix,) @@ -253,10 +254,6 @@ class SimpleHttpClient(object): headers. """ - def body_callback(method, url_bytes, headers_dict): - self.sign_request(destination, method, url_bytes, headers_dict) - return None - response = yield self.request( "GET", url.encode("ascii"), @@ -309,6 +306,10 @@ class _ReadBodyToFileProtocol(protocol.Protocol): def connectionLost(self, reason): if reason.check(ResponseDone): self.deferred.callback(self.length) + elif reason.check(PotentialDataLoss): + # stolen from https://github.com/twisted/treq/pull/49/files + # http://twistedmatrix.com/trac/ticket/4840 + self.deferred.callback(self.length) else: self.deferred.errback(reason) @@ -350,6 +351,24 @@ class CaptchaServerHttpClient(SimpleHttpClient): # twisted dislikes google's response, no content length. defer.returnValue(e.response) +class SpiderHttpClient(SimpleHttpClient): + """ + Separate HTTP client for spidering arbitrary URLs. + Special in that it follows retries and has a UA that looks + like a browser. + + used by the preview_url endpoint in the content repo. + """ + def __init__(self, hs): + SimpleHttpClient.__init__(self, hs) + # clobber the base class's agent and UA: + self.agent = BrowserLikeRedirectAgent(Agent( + reactor, + connectTimeout=15, + contextFactory=hs.get_http_client_context_factory() + )) + # Look like Chrome for now + #self.user_agent = ("Mozilla/5.0 (%s) (KHTML, like Gecko) Chrome Safari" % hs.version_string) def encode_urlencode_args(args): return {k: encode_urlencode_arg(v) for k, v in args.items()} diff --git a/synapse/rest/media/v1/preview_url_resource.py b/synapse/rest/media/v1/preview_url_resource.py index b999944e86..ca2529cc10 100644 --- a/synapse/rest/media/v1/preview_url_resource.py +++ b/synapse/rest/media/v1/preview_url_resource.py @@ -19,7 +19,7 @@ from twisted.web.server import NOT_DONE_YET from twisted.internet import defer from lxml import html from synapse.util.stringutils import random_string -from synapse.http.client import SimpleHttpClient +from synapse.http.client import SpiderHttpClient from synapse.http.server import request_handler, respond_with_json, respond_with_json_bytes import os @@ -33,7 +33,7 @@ class PreviewUrlResource(BaseMediaResource): def __init__(self, hs, filepaths): BaseMediaResource.__init__(self, hs, filepaths) - self.client = SimpleHttpClient(hs) + self.client = SpiderHttpClient(hs) def render_GET(self, request): self._async_render_GET(request) -- cgit 1.5.1 From bb9a2ca87c280e1c6ff6740ee9d2764e1b5226a5 Mon Sep 17 00:00:00 2001 From: Matthew Hodgson Date: Thu, 31 Mar 2016 14:15:09 +0100 Subject: synthesise basig OG metadata from pages lacking it --- synapse/rest/media/v1/preview_url_resource.py | 47 +++++++++++++++++++++++++++ 1 file changed, 47 insertions(+) (limited to 'synapse/rest') diff --git a/synapse/rest/media/v1/preview_url_resource.py b/synapse/rest/media/v1/preview_url_resource.py index ca2529cc10..b1d5cabfaa 100644 --- a/synapse/rest/media/v1/preview_url_resource.py +++ b/synapse/rest/media/v1/preview_url_resource.py @@ -23,6 +23,7 @@ from synapse.http.client import SpiderHttpClient from synapse.http.server import request_handler, respond_with_json, respond_with_json_bytes import os +import re import ujson as json import logging @@ -70,6 +71,7 @@ class PreviewUrlResource(BaseMediaResource): # define our OG response for this media elif self._is_html(media_info['media_type']): + # TODO: somehow stop a big HTML tree from exploding synapse's RAM tree = html.parse(media_info['filename']) # suck it up into lxml and define our OG response. @@ -82,17 +84,58 @@ class PreviewUrlResource(BaseMediaResource): # "og:image" : "https://pbs.twimg.com/profile_images/500400952029888512/yI0qtFi7_400x400.png" # "og:description" : "Synapse 0.12 is out! Lots of polishing, performance &amp; bugfixes: /sync API, /r0 prefix, fulltext search, 3PID invites https://t.co/5alhXLLEGP" # "og:site_name" : "Twitter" + + # or: + + # "og:type" : "video", + # "og:url" : "https://www.youtube.com/watch?v=LXDBoHyjmtw", + # "og:site_name" : "YouTube", + # "og:video:type" : "application/x-shockwave-flash", + # "og:description" : " ", + # "og:title" : "RemoteJam - Matrix team hack for Disrupt Europe Hackathon", + # "og:image" : "https://i.ytimg.com/vi/LXDBoHyjmtw/maxresdefault.jpg", + # "og:video:url" : "http://www.youtube.com/v/LXDBoHyjmtw?version=3&autohide=1", + # "og:video:width" : "1280" + # "og:video:height" : "720", + # "og:video:secure_url": "https://www.youtube.com/v/LXDBoHyjmtw?version=3&autohide=1", og = {} for tag in tree.xpath("//*/meta[starts-with(@property, 'og:')]"): og[tag.attrib['property']] = tag.attrib['content'] + if not og: + # do some basic spidering of the HTML + title = tree.xpath("(//title)[1] | (//h1)[1] | (//h2)[1] | (//h3)[1]") + og['og:title'] = title[0].text if title else None + + images = tree.xpath("//img") + big_images = [ i for i in images if ( + 'width' in i and 'height' in i and + i.attrib['width'] > 64 and i.attrib['height'] > 64 + )] or images + og['og:image'] = images[0].attrib['src'] if images else None + + text_nodes = tree.xpath("//h1/text() | //h2/text() | //h3/text() | //p/text() | //div/text() | //span/text() | //a/text()") + text = '' + for text_node in text_nodes: + if len(text) < 1024: + text += text_node + ' ' + else: + break + text = re.sub(r'[\t ]+', ' ', text) + text = re.sub(r'[\t \r\n]*[\r\n]+', '\n', text) + text = text.strip()[:1024] + og['og:description'] = text if text else None + + # TODO: turn any OG media URLs into mxc URLs to capture and thumbnail them too # TODO: store our OG details in a cache (and expire them when stale) # TODO: delete the content to stop diskfilling, as we only ever cared about its OG else: logger.warn("Failed to find any OG data in %s", url) og = {} + logger.warn(og) + respond_with_json_bytes(request, 200, json.dumps(og), send_cors=True) except: # XXX: if we don't explicitly respond here, the request never returns. @@ -111,6 +154,10 @@ class PreviewUrlResource(BaseMediaResource): @defer.inlineCallbacks def _download_url(self, url, user): + # TODO: we should probably honour robots.txt... except in practice + # we're most likely being explicitly triggered by a human rather than a + # bot, so are we really a robot? + # XXX: horrible duplication with base_resource's _download_remote_file() file_id = random_string(24) -- cgit 1.5.1 From 72550c3803e5020aa377f8d10c0c20afd4273c0d Mon Sep 17 00:00:00 2001 From: Matthew Hodgson Date: Thu, 31 Mar 2016 15:14:14 +0100 Subject: prevent choking on invalid utf-8, and handle image thumbnailing smarter --- synapse/rest/media/v1/preview_url_resource.py | 23 ++++++++++++++++++----- 1 file changed, 18 insertions(+), 5 deletions(-) (limited to 'synapse/rest') diff --git a/synapse/rest/media/v1/preview_url_resource.py b/synapse/rest/media/v1/preview_url_resource.py index b1d5cabfaa..04d02ee427 100644 --- a/synapse/rest/media/v1/preview_url_resource.py +++ b/synapse/rest/media/v1/preview_url_resource.py @@ -72,7 +72,15 @@ class PreviewUrlResource(BaseMediaResource): # define our OG response for this media elif self._is_html(media_info['media_type']): # TODO: somehow stop a big HTML tree from exploding synapse's RAM - tree = html.parse(media_info['filename']) + + # XXX: can't work out how to make lxml ignore UTF8 decoding errors + # so slurp as a string at this point. + file = open(media_info['filename']) + body = file.read() + file.close() + # FIXME: we shouldn't be forcing utf-8 if the page isn't actually utf-8... + tree = html.fromstring(body.decode('utf-8','ignore')) + # tree = html.parse(media_info['filename']) # suck it up into lxml and define our OG response. # if we see any URLs in the OG response, then spider them @@ -108,14 +116,19 @@ class PreviewUrlResource(BaseMediaResource): title = tree.xpath("(//title)[1] | (//h1)[1] | (//h2)[1] | (//h3)[1]") og['og:title'] = title[0].text if title else None - images = tree.xpath("//img") + images = [ i for i in tree.xpath("//img") if 'src' in i.attrib ] big_images = [ i for i in images if ( - 'width' in i and 'height' in i and + 'width' in i.attrib and 'height' in i.attrib and i.attrib['width'] > 64 and i.attrib['height'] > 64 - )] or images - og['og:image'] = images[0].attrib['src'] if images else None + )] + big_images = big_images.sort(key=lambda i: (-1 * int(i.attrib['width']) * int(i.attrib['height']))) + images = big_images if big_images else images + + if images: + og['og:image'] = images[0].attrib['src'] text_nodes = tree.xpath("//h1/text() | //h2/text() | //h3/text() | //p/text() | //div/text() | //span/text() | //a/text()") + # text_nodes = tree.xpath("//h1/text() | //h2/text() | //h3/text() | //p/text() | //div/text()") text = '' for text_node in text_nodes: if len(text) < 1024: -- cgit 1.5.1 From 683e564815be5f7852c417cbab06876db6122401 Mon Sep 17 00:00:00 2001 From: Matthew Hodgson Date: Thu, 31 Mar 2016 23:52:58 +0100 Subject: handle spidered relative images correctly --- synapse/http/client.py | 2 +- synapse/rest/media/v1/preview_url_resource.py | 14 ++++++++++++-- 2 files changed, 13 insertions(+), 3 deletions(-) (limited to 'synapse/rest') diff --git a/synapse/http/client.py b/synapse/http/client.py index 219b734268..1b6f7cb795 100644 --- a/synapse/http/client.py +++ b/synapse/http/client.py @@ -282,7 +282,7 @@ class SimpleHttpClient(object): logger.exception("Failed to download body") raise - defer.returnValue((length, headers)) + defer.returnValue((length, headers, response.request.absoluteURI)) # XXX: FIXME: This is horribly copy-pasted from matrixfederationclient. diff --git a/synapse/rest/media/v1/preview_url_resource.py b/synapse/rest/media/v1/preview_url_resource.py index 04d02ee427..bae3905a43 100644 --- a/synapse/rest/media/v1/preview_url_resource.py +++ b/synapse/rest/media/v1/preview_url_resource.py @@ -18,6 +18,7 @@ from twisted.web.resource import Resource from twisted.web.server import NOT_DONE_YET from twisted.internet import defer from lxml import html +from urlparse import urlparse, urlunparse from synapse.util.stringutils import random_string from synapse.http.client import SpiderHttpClient from synapse.http.server import request_handler, respond_with_json, respond_with_json_bytes @@ -125,7 +126,14 @@ class PreviewUrlResource(BaseMediaResource): images = big_images if big_images else images if images: - og['og:image'] = images[0].attrib['src'] + base = list(urlparse(media_info['uri'])) + src = list(urlparse(images[0].attrib['src'])) + if not src[0] and not src[1]: + src[0] = base[0] + src[1] = base[1] + if not src[2].startswith('/'): + src[2] = re.sub(r'/[^/]+$', '/', base[2]) + src[2] + og['og:image'] = urlunparse(src) text_nodes = tree.xpath("//h1/text() | //h2/text() | //h3/text() | //p/text() | //div/text() | //span/text() | //a/text()") # text_nodes = tree.xpath("//h1/text() | //h2/text() | //h3/text() | //p/text() | //div/text()") @@ -140,6 +148,7 @@ class PreviewUrlResource(BaseMediaResource): text = text.strip()[:1024] og['og:description'] = text if text else None + # TODO: extract a favicon? # TODO: turn any OG media URLs into mxc URLs to capture and thumbnail them too # TODO: store our OG details in a cache (and expire them when stale) # TODO: delete the content to stop diskfilling, as we only ever cared about its OG @@ -180,7 +189,7 @@ class PreviewUrlResource(BaseMediaResource): try: with open(fname, "wb") as f: logger.debug("Trying to get url '%s'" % url) - length, headers = yield self.client.get_file( + length, headers, uri = yield self.client.get_file( url, output_stream=f, max_size=self.max_spider_size, ) # FIXME: handle 404s sanely - don't spider an error page @@ -233,6 +242,7 @@ class PreviewUrlResource(BaseMediaResource): "created_ts": time_now_ms, "filesystem_id": file_id, "filename": fname, + "uri": uri, }) def _is_media(self, content_type): -- cgit 1.5.1 From c60b751694bbeb82105eb828d41c0b5c26d5e195 Mon Sep 17 00:00:00 2001 From: Matthew Hodgson Date: Fri, 1 Apr 2016 02:17:48 +0100 Subject: fix assorted redirect, unicode and screenscraping bugs --- synapse/rest/media/v1/preview_url_resource.py | 174 ++++++++++++++------------ 1 file changed, 96 insertions(+), 78 deletions(-) (limited to 'synapse/rest') diff --git a/synapse/rest/media/v1/preview_url_resource.py b/synapse/rest/media/v1/preview_url_resource.py index bae3905a43..a7ffe593b1 100644 --- a/synapse/rest/media/v1/preview_url_resource.py +++ b/synapse/rest/media/v1/preview_url_resource.py @@ -74,84 +74,93 @@ class PreviewUrlResource(BaseMediaResource): elif self._is_html(media_info['media_type']): # TODO: somehow stop a big HTML tree from exploding synapse's RAM - # XXX: can't work out how to make lxml ignore UTF8 decoding errors - # so slurp as a string at this point. - file = open(media_info['filename']) - body = file.read() - file.close() - # FIXME: we shouldn't be forcing utf-8 if the page isn't actually utf-8... - tree = html.fromstring(body.decode('utf-8','ignore')) - # tree = html.parse(media_info['filename']) - - # suck it up into lxml and define our OG response. - # if we see any URLs in the OG response, then spider them - # (although the client could choose to do this by asking for previews of those URLs to avoid DoSing the server) - - # "og:type" : "article" - # "og:url" : "https://twitter.com/matrixdotorg/status/684074366691356672" - # "og:title" : "Matrix on Twitter" - # "og:image" : "https://pbs.twimg.com/profile_images/500400952029888512/yI0qtFi7_400x400.png" - # "og:description" : "Synapse 0.12 is out! Lots of polishing, performance &amp; bugfixes: /sync API, /r0 prefix, fulltext search, 3PID invites https://t.co/5alhXLLEGP" - # "og:site_name" : "Twitter" - - # or: - - # "og:type" : "video", - # "og:url" : "https://www.youtube.com/watch?v=LXDBoHyjmtw", - # "og:site_name" : "YouTube", - # "og:video:type" : "application/x-shockwave-flash", - # "og:description" : " ", - # "og:title" : "RemoteJam - Matrix team hack for Disrupt Europe Hackathon", - # "og:image" : "https://i.ytimg.com/vi/LXDBoHyjmtw/maxresdefault.jpg", - # "og:video:url" : "http://www.youtube.com/v/LXDBoHyjmtw?version=3&autohide=1", - # "og:video:width" : "1280" - # "og:video:height" : "720", - # "og:video:secure_url": "https://www.youtube.com/v/LXDBoHyjmtw?version=3&autohide=1", - - og = {} - for tag in tree.xpath("//*/meta[starts-with(@property, 'og:')]"): - og[tag.attrib['property']] = tag.attrib['content'] - - if not og: - # do some basic spidering of the HTML - title = tree.xpath("(//title)[1] | (//h1)[1] | (//h2)[1] | (//h3)[1]") - og['og:title'] = title[0].text if title else None - - images = [ i for i in tree.xpath("//img") if 'src' in i.attrib ] - big_images = [ i for i in images if ( - 'width' in i.attrib and 'height' in i.attrib and - i.attrib['width'] > 64 and i.attrib['height'] > 64 - )] - big_images = big_images.sort(key=lambda i: (-1 * int(i.attrib['width']) * int(i.attrib['height']))) - images = big_images if big_images else images - - if images: - base = list(urlparse(media_info['uri'])) - src = list(urlparse(images[0].attrib['src'])) - if not src[0] and not src[1]: - src[0] = base[0] - src[1] = base[1] - if not src[2].startswith('/'): - src[2] = re.sub(r'/[^/]+$', '/', base[2]) + src[2] - og['og:image'] = urlunparse(src) - - text_nodes = tree.xpath("//h1/text() | //h2/text() | //h3/text() | //p/text() | //div/text() | //span/text() | //a/text()") - # text_nodes = tree.xpath("//h1/text() | //h2/text() | //h3/text() | //p/text() | //div/text()") - text = '' - for text_node in text_nodes: - if len(text) < 1024: - text += text_node + ' ' + def _calc_og(): + # suck it up into lxml and define our OG response. + # if we see any URLs in the OG response, then spider them + # (although the client could choose to do this by asking for previews of those URLs to avoid DoSing the server) + + # "og:type" : "article" + # "og:url" : "https://twitter.com/matrixdotorg/status/684074366691356672" + # "og:title" : "Matrix on Twitter" + # "og:image" : "https://pbs.twimg.com/profile_images/500400952029888512/yI0qtFi7_400x400.png" + # "og:description" : "Synapse 0.12 is out! Lots of polishing, performance &amp; bugfixes: /sync API, /r0 prefix, fulltext search, 3PID invites https://t.co/5alhXLLEGP" + # "og:site_name" : "Twitter" + + # or: + + # "og:type" : "video", + # "og:url" : "https://www.youtube.com/watch?v=LXDBoHyjmtw", + # "og:site_name" : "YouTube", + # "og:video:type" : "application/x-shockwave-flash", + # "og:description" : " ", + # "og:title" : "RemoteJam - Matrix team hack for Disrupt Europe Hackathon", + # "og:image" : "https://i.ytimg.com/vi/LXDBoHyjmtw/maxresdefault.jpg", + # "og:video:url" : "http://www.youtube.com/v/LXDBoHyjmtw?version=3&autohide=1", + # "og:video:width" : "1280" + # "og:video:height" : "720", + # "og:video:secure_url": "https://www.youtube.com/v/LXDBoHyjmtw?version=3&autohide=1", + + og = {} + for tag in tree.xpath("//*/meta[starts-with(@property, 'og:')]"): + og[tag.attrib['property']] = tag.attrib['content'] + + if 'og:title' not in og: + # do some basic spidering of the HTML + title = tree.xpath("(//title)[1] | (//h1)[1] | (//h2)[1] | (//h3)[1]") + og['og:title'] = title[0].text if title else None + + + if 'og:image' not in og: + meta_image = tree.xpath("//*/meta[@itemprop='image']/@content"); + if meta_image: + og['og:image'] = self._rebase_url(meta_image[0], media_info['uri']) else: - break - text = re.sub(r'[\t ]+', ' ', text) - text = re.sub(r'[\t \r\n]*[\r\n]+', '\n', text) - text = text.strip()[:1024] - og['og:description'] = text if text else None - - # TODO: extract a favicon? - # TODO: turn any OG media URLs into mxc URLs to capture and thumbnail them too - # TODO: store our OG details in a cache (and expire them when stale) - # TODO: delete the content to stop diskfilling, as we only ever cared about its OG + images = [ i for i in tree.xpath("//img") if 'src' in i.attrib ] + big_images = [ i for i in images if ( + 'width' in i.attrib and 'height' in i.attrib and + i.attrib['width'] > 64 and i.attrib['height'] > 64 + )] + big_images = big_images.sort(key=lambda i: (-1 * int(i.attrib['width']) * int(i.attrib['height']))) + images = big_images if big_images else images + + if images: + og['og:image'] = self._rebase_url(images[0].attrib['src'], media_info['uri']) + + if 'og:description' not in og: + meta_description = tree.xpath("//*/meta[@name='description']/@content"); + if meta_description: + og['og:description'] = meta_description[0] + else: + text_nodes = tree.xpath("//h1/text() | //h2/text() | //h3/text() | //p/text() | //div/text() | //span/text() | //a/text()") + # text_nodes = tree.xpath("//h1/text() | //h2/text() | //h3/text() | //p/text() | //div/text()") + text = '' + for text_node in text_nodes: + if len(text) < 500: + text += text_node + ' ' + else: + break + text = re.sub(r'[\t ]+', ' ', text) + text = re.sub(r'[\t \r\n]*[\r\n]+', '\n', text) + text = text.strip()[:500] + og['og:description'] = text if text else None + + # TODO: extract a favicon? + # TODO: turn any OG media URLs into mxc URLs to capture and thumbnail them too + # TODO: store our OG details in a cache (and expire them when stale) + # TODO: delete the content to stop diskfilling, as we only ever cared about its OG + return og + + try: + tree = html.parse(media_info['filename']) + og = _calc_og() + except UnicodeDecodeError: + # XXX: evil evil bodge + file = open(media_info['filename']) + body = file.read() + file.close() + tree = html.fromstring(body.decode('utf-8','ignore')) + og = _calc_og() + else: logger.warn("Failed to find any OG data in %s", url) og = {} @@ -173,6 +182,15 @@ class PreviewUrlResource(BaseMediaResource): ) raise + def _rebase_url(self, url, base): + base = list(urlparse(base)) + url = list(urlparse(url)) + if not url[0] and not url[1]: + url[0] = base[0] + url[1] = base[1] + if not url[2].startswith('/'): + url[2] = re.sub(r'/[^/]+$', '/', base[2]) + url[2] + return urlunparse(url) @defer.inlineCallbacks def _download_url(self, url, user): @@ -223,7 +241,7 @@ class PreviewUrlResource(BaseMediaResource): download_name = None yield self.store.store_local_media( - media_id=fname, + media_id=file_id, media_type=media_type, time_now_ms=self.clock.time_msec(), upload_name=download_name, -- cgit 1.5.1 From 2a37467fa1358eb41513893efe44cbd294dca36c Mon Sep 17 00:00:00 2001 From: Mark Haines Date: Fri, 1 Apr 2016 16:08:59 +0100 Subject: Use google style doc strings. pycharm supports them so there is no need to use the other format. Might as well convert the existing strings to reduce the risk of people accidentally cargo culting the wrong doc string format. --- setup.cfg | 3 ++ synapse/handlers/_base.py | 27 +++++++----- synapse/handlers/auth.py | 26 +++++++---- synapse/handlers/federation.py | 23 +++++----- synapse/handlers/room_member.py | 48 ++++++++++----------- synapse/handlers/sync.py | 49 +++++++++++++-------- synapse/http/servlet.py | 81 ++++++++++++++++++++++------------- synapse/notifier.py | 15 ++++--- synapse/push/baserules.py | 8 ++-- synapse/rest/client/v2_alpha/sync.py | 79 ++++++++++++++++++---------------- synapse/state.py | 19 ++++---- synapse/storage/event_push_actions.py | 5 ++- synapse/storage/registration.py | 15 ++++--- synapse/storage/state.py | 13 +++--- 14 files changed, 242 insertions(+), 169 deletions(-) (limited to 'synapse/rest') diff --git a/setup.cfg b/setup.cfg index f8cc13c840..5ebce1c56b 100644 --- a/setup.cfg +++ b/setup.cfg @@ -17,3 +17,6 @@ ignore = [flake8] max-line-length = 90 ignore = W503 ; W503 requires that binary operators be at the end, not start, of lines. Erik doesn't like it. + +[pep8] +max-line-length = 90 diff --git a/synapse/handlers/_base.py b/synapse/handlers/_base.py index 90eabb6eb7..5601ecea6e 100644 --- a/synapse/handlers/_base.py +++ b/synapse/handlers/_base.py @@ -41,8 +41,9 @@ class BaseHandler(object): """ Common base class for the event handlers. - :type store: synapse.storage.events.StateStore - :type state_handler: synapse.state.StateHandler + Attributes: + store (synapse.storage.events.StateStore): + state_handler (synapse.state.StateHandler): """ def __init__(self, hs): @@ -65,11 +66,12 @@ class BaseHandler(object): """ Returns dict of user_id -> list of events that user is allowed to see. - :param (str, bool) user_tuples: (user id, is_peeking) for each - user to be checked. is_peeking should be true if: - * the user is not currently a member of the room, and: - * the user has not been a member of the room since the given - events + Args: + user_tuples (str, bool): (user id, is_peeking) for each user to be + checked. is_peeking should be true if: + * the user is not currently a member of the room, and: + * the user has not been a member of the room since the + given events """ forgotten = yield defer.gatherResults([ self.store.who_forgot_in_room( @@ -165,13 +167,16 @@ class BaseHandler(object): """ Check which events a user is allowed to see - :param str user_id: user id to be checked - :param [synapse.events.EventBase] events: list of events to be checked - :param bool is_peeking should be True if: + Args: + user_id(str): user id to be checked + events([synapse.events.EventBase]): list of events to be checked + is_peeking(bool): should be True if: * the user is not currently a member of the room, and: * the user has not been a member of the room since the given events - :rtype [synapse.events.EventBase] + + Returns: + [synapse.events.EventBase] """ types = ( (EventTypes.RoomHistoryVisibility, ""), diff --git a/synapse/handlers/auth.py b/synapse/handlers/auth.py index 82d458b424..d5d6faa85f 100644 --- a/synapse/handlers/auth.py +++ b/synapse/handlers/auth.py @@ -163,9 +163,13 @@ class AuthHandler(BaseHandler): def get_session_id(self, clientdict): """ Gets the session ID for a client given the client dictionary - :param clientdict: The dictionary sent by the client in the request - :return: The string session ID the client sent. If the client did not - send a session ID, returns None. + + Args: + clientdict: The dictionary sent by the client in the request + + Returns: + str|None: The string session ID the client sent. If the client did + not send a session ID, returns None. """ sid = None if clientdict and 'auth' in clientdict: @@ -179,9 +183,11 @@ class AuthHandler(BaseHandler): Store a key-value pair into the sessions data associated with this request. This data is stored server-side and cannot be modified by the client. - :param session_id: (string) The ID of this session as returned from check_auth - :param key: (string) The key to store the data under - :param value: (any) The data to store + + Args: + session_id (string): The ID of this session as returned from check_auth + key (string): The key to store the data under + value (any): The data to store """ sess = self._get_session_info(session_id) sess.setdefault('serverdict', {})[key] = value @@ -190,9 +196,11 @@ class AuthHandler(BaseHandler): def get_session_data(self, session_id, key, default=None): """ Retrieve data stored with set_session_data - :param session_id: (string) The ID of this session as returned from check_auth - :param key: (string) The key to store the data under - :param default: (any) Value to return if the key has not been set + + Args: + session_id (string): The ID of this session as returned from check_auth + key (string): The key to store the data under + default (any): Value to return if the key has not been set """ sess = self._get_session_info(session_id) return sess.setdefault('serverdict', {}).get(key, default) diff --git a/synapse/handlers/federation.py b/synapse/handlers/federation.py index 4a35344d32..092802b973 100644 --- a/synapse/handlers/federation.py +++ b/synapse/handlers/federation.py @@ -1706,13 +1706,15 @@ class FederationHandler(BaseHandler): def _check_signature(self, event, auth_events): """ Checks that the signature in the event is consistent with its invite. - :param event (Event): The m.room.member event to check - :param auth_events (dict<(event type, state_key), event>) - :raises - AuthError if signature didn't match any keys, or key has been + Args: + event (Event): The m.room.member event to check + auth_events (dict<(event type, state_key), event>): + + Raises: + AuthError: if signature didn't match any keys, or key has been revoked, - SynapseError if a transient error meant a key couldn't be checked + SynapseError: if a transient error meant a key couldn't be checked for revocation. """ signed = event.content["third_party_invite"]["signed"] @@ -1754,12 +1756,13 @@ class FederationHandler(BaseHandler): """ Checks whether public_key has been revoked. - :param public_key (str): base-64 encoded public key. - :param url (str): Key revocation URL. + Args: + public_key (str): base-64 encoded public key. + url (str): Key revocation URL. - :raises - AuthError if they key has been revoked. - SynapseError if a transient error meant a key couldn't be checked + Raises: + AuthError: if they key has been revoked. + SynapseError: if a transient error meant a key couldn't be checked for revocation. """ try: diff --git a/synapse/handlers/room_member.py b/synapse/handlers/room_member.py index 5fdbd3adcc..01f833c371 100644 --- a/synapse/handlers/room_member.py +++ b/synapse/handlers/room_member.py @@ -411,7 +411,7 @@ class RoomMemberHandler(BaseHandler): address (str): The third party identifier (e.g. "foo@example.com"). Returns: - (str) the matrix ID of the 3pid, or None if it is not recognized. + str: the matrix ID of the 3pid, or None if it is not recognized. """ try: data = yield self.hs.get_simple_http_client().get_json( @@ -545,29 +545,29 @@ class RoomMemberHandler(BaseHandler): """ Asks an identity server for a third party invite. - :param id_server (str): hostname + optional port for the identity server. - :param medium (str): The literal string "email". - :param address (str): The third party address being invited. - :param room_id (str): The ID of the room to which the user is invited. - :param inviter_user_id (str): The user ID of the inviter. - :param room_alias (str): An alias for the room, for cosmetic - notifications. - :param room_avatar_url (str): The URL of the room's avatar, for cosmetic - notifications. - :param room_join_rules (str): The join rules of the email - (e.g. "public"). - :param room_name (str): The m.room.name of the room. - :param inviter_display_name (str): The current display name of the - inviter. - :param inviter_avatar_url (str): The URL of the inviter's avatar. - - :return: A deferred tuple containing: - token (str): The token which must be signed to prove authenticity. - public_keys ([{"public_key": str, "key_validity_url": str}]): - public_key is a base64-encoded ed25519 public key. - fallback_public_key: One element from public_keys. - display_name (str): A user-friendly name to represent the invited - user. + Args: + id_server (str): hostname + optional port for the identity server. + medium (str): The literal string "email". + address (str): The third party address being invited. + room_id (str): The ID of the room to which the user is invited. + inviter_user_id (str): The user ID of the inviter. + room_alias (str): An alias for the room, for cosmetic notifications. + room_avatar_url (str): The URL of the room's avatar, for cosmetic + notifications. + room_join_rules (str): The join rules of the email (e.g. "public"). + room_name (str): The m.room.name of the room. + inviter_display_name (str): The current display name of the + inviter. + inviter_avatar_url (str): The URL of the inviter's avatar. + + Returns: + A deferred tuple containing: + token (str): The token which must be signed to prove authenticity. + public_keys ([{"public_key": str, "key_validity_url": str}]): + public_key is a base64-encoded ed25519 public key. + fallback_public_key: One element from public_keys. + display_name (str): A user-friendly name to represent the invited + user. """ is_url = "%s%s/_matrix/identity/api/v1/store-invite" % ( diff --git a/synapse/handlers/sync.py b/synapse/handlers/sync.py index 48ab5707e1..20a0626574 100644 --- a/synapse/handlers/sync.py +++ b/synapse/handlers/sync.py @@ -671,7 +671,8 @@ class SyncHandler(BaseHandler): def load_filtered_recents(self, room_id, sync_config, now_token, since_token=None, recents=None, newly_joined_room=False): """ - :returns a Deferred TimelineBatch + Returns: + a Deferred TimelineBatch """ with Measure(self.clock, "load_filtered_recents"): filtering_factor = 2 @@ -838,8 +839,11 @@ class SyncHandler(BaseHandler): """ Get the room state after the given event - :param synapse.events.EventBase event: event of interest - :return: A Deferred map from ((type, state_key)->Event) + Args: + event(synapse.events.EventBase): event of interest + + Returns: + A Deferred map from ((type, state_key)->Event) """ state = yield self.store.get_state_for_event(event.event_id) if event.is_state(): @@ -850,9 +854,13 @@ class SyncHandler(BaseHandler): @defer.inlineCallbacks def get_state_at(self, room_id, stream_position): """ Get the room state at a particular stream position - :param str room_id: room for which to get state - :param StreamToken stream_position: point at which to get state - :returns: A Deferred map from ((type, state_key)->Event) + + Args: + room_id(str): room for which to get state + stream_position(StreamToken): point at which to get state + + Returns: + A Deferred map from ((type, state_key)->Event) """ last_events, token = yield self.store.get_recent_events_for_room( room_id, end_token=stream_position.room_key, limit=1, @@ -873,15 +881,18 @@ class SyncHandler(BaseHandler): """ Works out the differnce in state between the start of the timeline and the previous sync. - :param str room_id - :param TimelineBatch batch: The timeline batch for the room that will - be sent to the user. - :param sync_config - :param str since_token: Token of the end of the previous batch. May be None. - :param str now_token: Token of the end of the current batch. - :param bool full_state: Whether to force returning the full state. + Args: + room_id(str): + batch(synapse.handlers.sync.TimelineBatch): The timeline batch for + the room that will be sent to the user. + sync_config(synapse.handlers.sync.SyncConfig): + since_token(str|None): Token of the end of the previous batch. May + be None. + now_token(str): Token of the end of the current batch. + full_state(bool): Whether to force returning the full state. - :returns A new event dictionary + Returns: + A deferred new event dictionary """ # TODO(mjark) Check if the state events were received by the server # after the previous sync, since we need to include those state @@ -953,11 +964,13 @@ class SyncHandler(BaseHandler): Check if the user has just joined the given room (so should be given the full state) - :param sync_config: - :param dict[(str,str), synapse.events.FrozenEvent] state_delta: the - difference in state since the last sync + Args: + sync_config(synapse.handlers.sync.SyncConfig): + state_delta(dict[(str,str), synapse.events.FrozenEvent]): the + difference in state since the last sync - :returns A deferred Tuple (state_delta, limited) + Returns: + A deferred Tuple (state_delta, limited) """ join_event = state_delta.get(( EventTypes.Member, sync_config.user.to_string()), None) diff --git a/synapse/http/servlet.py b/synapse/http/servlet.py index 1c8bd8666f..e41afeab8e 100644 --- a/synapse/http/servlet.py +++ b/synapse/http/servlet.py @@ -26,14 +26,19 @@ logger = logging.getLogger(__name__) def parse_integer(request, name, default=None, required=False): """Parse an integer parameter from the request string - :param request: the twisted HTTP request. - :param name (str): the name of the query parameter. - :param default: value to use if the parameter is absent, defaults to None. - :param required (bool): whether to raise a 400 SynapseError if the - parameter is absent, defaults to False. - :return: An int value or the default. - :raises - SynapseError if the parameter is absent and required, or if the + Args: + request: the twisted HTTP request. + name (str): the name of the query parameter. + default (int|None): value to use if the parameter is absent, defaults + to None. + required (bool): whether to raise a 400 SynapseError if the + parameter is absent, defaults to False. + + Returns: + int|None: An int value or the default. + + Raises: + SynapseError: if the parameter is absent and required, or if the parameter is present and not an integer. """ if name in request.args: @@ -53,14 +58,19 @@ def parse_integer(request, name, default=None, required=False): def parse_boolean(request, name, default=None, required=False): """Parse a boolean parameter from the request query string - :param request: the twisted HTTP request. - :param name (str): the name of the query parameter. - :param default: value to use if the parameter is absent, defaults to None. - :param required (bool): whether to raise a 400 SynapseError if the - parameter is absent, defaults to False. - :return: A bool value or the default. - :raises - SynapseError if the parameter is absent and required, or if the + Args: + request: the twisted HTTP request. + name (str): the name of the query parameter. + default (bool|None): value to use if the parameter is absent, defaults + to None. + required (bool): whether to raise a 400 SynapseError if the + parameter is absent, defaults to False. + + Returns: + bool|None: A bool value or the default. + + Raises: + SynapseError: if the parameter is absent and required, or if the parameter is present and not one of "true" or "false". """ @@ -88,15 +98,20 @@ def parse_string(request, name, default=None, required=False, allowed_values=None, param_type="string"): """Parse a string parameter from the request query string. - :param request: the twisted HTTP request. - :param name (str): the name of the query parameter. - :param default: value to use if the parameter is absent, defaults to None. - :param required (bool): whether to raise a 400 SynapseError if the - parameter is absent, defaults to False. - :param allowed_values (list): List of allowed values for the string, - or None if any value is allowed, defaults to None - :return: A string value or the default. - :raises + Args: + request: the twisted HTTP request. + name (str): the name of the query parameter. + default (str|None): value to use if the parameter is absent, defaults + to None. + required (bool): whether to raise a 400 SynapseError if the + parameter is absent, defaults to False. + allowed_values (list[str]): List of allowed values for the string, + or None if any value is allowed, defaults to None + + Returns: + str|None: A string value or the default. + + Raises: SynapseError if the parameter is absent and required, or if the parameter is present, must be one of a list of allowed values and is not one of those allowed values. @@ -122,9 +137,13 @@ def parse_string(request, name, default=None, required=False, def parse_json_value_from_request(request): """Parse a JSON value from the body of a twisted HTTP request. - :param request: the twisted HTTP request. - :returns: The JSON value. - :raises + Args: + request: the twisted HTTP request. + + Returns: + The JSON value. + + Raises: SynapseError if the request body couldn't be decoded as JSON. """ try: @@ -143,8 +162,10 @@ def parse_json_value_from_request(request): def parse_json_object_from_request(request): """Parse a JSON object from the body of a twisted HTTP request. - :param request: the twisted HTTP request. - :raises + Args: + request: the twisted HTTP request. + + Raises: SynapseError if the request body couldn't be decoded as JSON or if it wasn't a JSON object. """ diff --git a/synapse/notifier.py b/synapse/notifier.py index f00cd8c588..6af7a8f424 100644 --- a/synapse/notifier.py +++ b/synapse/notifier.py @@ -503,13 +503,14 @@ class Notifier(object): def wait_for_replication(self, callback, timeout): """Wait for an event to happen. - :param callback: - Gets called whenever an event happens. If this returns a truthy - value then ``wait_for_replication`` returns, otherwise it waits - for another event. - :param int timeout: - How many milliseconds to wait for callback return a truthy value. - :returns: + Args: + callback: Gets called whenever an event happens. If this returns a + truthy value then ``wait_for_replication`` returns, otherwise + it waits for another event. + timeout: How many milliseconds to wait for callback return a truthy + value. + + Returns: A deferred that resolves with the value returned by the callback. """ listener = _NotificationListener(None) diff --git a/synapse/push/baserules.py b/synapse/push/baserules.py index 792af70eb7..6add94beeb 100644 --- a/synapse/push/baserules.py +++ b/synapse/push/baserules.py @@ -19,9 +19,11 @@ import copy def list_with_base_rules(rawrules): """Combine the list of rules set by the user with the default push rules - :param list rawrules: The rules the user has modified or set. - :returns: A new list with the rules set by the user combined with the - defaults. + Args: + rawrules(list): The rules the user has modified or set. + + Returns: + A new list with the rules set by the user combined with the defaults. """ ruleslist = [] diff --git a/synapse/rest/client/v2_alpha/sync.py b/synapse/rest/client/v2_alpha/sync.py index c5785d7074..60d3dc4030 100644 --- a/synapse/rest/client/v2_alpha/sync.py +++ b/synapse/rest/client/v2_alpha/sync.py @@ -199,15 +199,17 @@ class SyncRestServlet(RestServlet): """ Encode the joined rooms in a sync result - :param list[synapse.handlers.sync.JoinedSyncResult] rooms: list of sync - results for rooms this user is joined to - :param int time_now: current time - used as a baseline for age - calculations - :param int token_id: ID of the user's auth token - used for namespacing - of transaction IDs - - :return: the joined rooms list, in our response format - :rtype: dict[str, dict[str, object]] + Args: + rooms(list[synapse.handlers.sync.JoinedSyncResult]): list of sync + results for rooms this user is joined to + time_now(int): current time - used as a baseline for age + calculations + token_id(int): ID of the user's auth token - used for namespacing + of transaction IDs + + Returns: + dict[str, dict[str, object]]: the joined rooms list, in our + response format """ joined = {} for room in rooms: @@ -221,15 +223,17 @@ class SyncRestServlet(RestServlet): """ Encode the invited rooms in a sync result - :param list[synapse.handlers.sync.InvitedSyncResult] rooms: list of - sync results for rooms this user is joined to - :param int time_now: current time - used as a baseline for age - calculations - :param int token_id: ID of the user's auth token - used for namespacing + Args: + rooms(list[synapse.handlers.sync.InvitedSyncResult]): list of + sync results for rooms this user is joined to + time_now(int): current time - used as a baseline for age + calculations + token_id(int): ID of the user's auth token - used for namespacing of transaction IDs - :return: the invited rooms list, in our response format - :rtype: dict[str, dict[str, object]] + Returns: + dict[str, dict[str, object]]: the invited rooms list, in our + response format """ invited = {} for room in rooms: @@ -251,15 +255,17 @@ class SyncRestServlet(RestServlet): """ Encode the archived rooms in a sync result - :param list[synapse.handlers.sync.ArchivedSyncResult] rooms: list of - sync results for rooms this user is joined to - :param int time_now: current time - used as a baseline for age - calculations - :param int token_id: ID of the user's auth token - used for namespacing - of transaction IDs - - :return: the invited rooms list, in our response format - :rtype: dict[str, dict[str, object]] + Args: + rooms (list[synapse.handlers.sync.ArchivedSyncResult]): list of + sync results for rooms this user is joined to + time_now(int): current time - used as a baseline for age + calculations + token_id(int): ID of the user's auth token - used for namespacing + of transaction IDs + + Returns: + dict[str, dict[str, object]]: The invited rooms list, in our + response format """ joined = {} for room in rooms: @@ -272,17 +278,18 @@ class SyncRestServlet(RestServlet): @staticmethod def encode_room(room, time_now, token_id, joined=True): """ - :param JoinedSyncResult|ArchivedSyncResult room: sync result for a - single room - :param int time_now: current time - used as a baseline for age - calculations - :param int token_id: ID of the user's auth token - used for namespacing - of transaction IDs - :param joined: True if the user is joined to this room - will mean - we handle ephemeral events - - :return: the room, encoded in our response format - :rtype: dict[str, object] + Args: + room (JoinedSyncResult|ArchivedSyncResult): sync result for a + single room + time_now (int): current time - used as a baseline for age + calculations + token_id (int): ID of the user's auth token - used for namespacing + of transaction IDs + joined (bool): True if the user is joined to this room - will mean + we handle ephemeral events + + Returns: + dict[str, object]: the room, encoded in our response format """ def serialize(event): # TODO(mjark): Respect formatting requirements in the filter. diff --git a/synapse/state.py b/synapse/state.py index 41d32e664a..4a9e148de7 100644 --- a/synapse/state.py +++ b/synapse/state.py @@ -86,7 +86,8 @@ class StateHandler(object): If `event_type` is specified, then the method returns only the one event (or None) with that `event_type` and `state_key`. - :returns map from (type, state_key) to event + Returns: + map from (type, state_key) to event """ event_ids = yield self.store.get_latest_event_ids_in_room(room_id) @@ -176,10 +177,11 @@ class StateHandler(object): """ Given a list of event_ids this method fetches the state at each event, resolves conflicts between them and returns them. - :returns a Deferred tuple of (`state_group`, `state`, `prev_state`). - `state_group` is the name of a state group if one and only one is - involved. `state` is a map from (type, state_key) to event, and - `prev_state` is a list of event ids. + Returns: + a Deferred tuple of (`state_group`, `state`, `prev_state`). + `state_group` is the name of a state group if one and only one is + involved. `state` is a map from (type, state_key) to event, and + `prev_state` is a list of event ids. """ logger.debug("resolve_state_groups event_ids %s", event_ids) @@ -251,9 +253,10 @@ class StateHandler(object): def _resolve_events(self, state_sets, event_type=None, state_key=""): """ - :returns a tuple (new_state, prev_states). new_state is a map - from (type, state_key) to event. prev_states is a list of event_ids. - :rtype: (dict[(str, str), synapse.events.FrozenEvent], list[str]) + Returns + (dict[(str, str), synapse.events.FrozenEvent], list[str]): a tuple + (new_state, prev_states). new_state is a map from (type, state_key) + to event. prev_states is a list of event_ids. """ with Measure(self.clock, "state._resolve_events"): state = {} diff --git a/synapse/storage/event_push_actions.py b/synapse/storage/event_push_actions.py index dc5830450a..3933b6e2c5 100644 --- a/synapse/storage/event_push_actions.py +++ b/synapse/storage/event_push_actions.py @@ -26,8 +26,9 @@ logger = logging.getLogger(__name__) class EventPushActionsStore(SQLBaseStore): def _set_push_actions_for_event_and_users_txn(self, txn, event, tuples): """ - :param event: the event set actions for - :param tuples: list of tuples of (user_id, actions) + Args: + event: the event set actions for + tuples: list of tuples of (user_id, actions) """ values = [] for uid, actions in tuples: diff --git a/synapse/storage/registration.py b/synapse/storage/registration.py index bd4eb88a92..d46a963bb8 100644 --- a/synapse/storage/registration.py +++ b/synapse/storage/registration.py @@ -458,12 +458,15 @@ class RegistrationStore(SQLBaseStore): """ Gets the 3pid's guest access token if exists, else saves access_token. - :param medium (str): Medium of the 3pid. Must be "email". - :param address (str): 3pid address. - :param access_token (str): The access token to persist if none is - already persisted. - :param inviter_user_id (str): User ID of the inviter. - :return (deferred str): Whichever access token is persisted at the end + Args: + medium (str): Medium of the 3pid. Must be "email". + address (str): 3pid address. + access_token (str): The access token to persist if none is + already persisted. + inviter_user_id (str): User ID of the inviter. + + Returns: + deferred str: Whichever access token is persisted at the end of this function call. """ def insert(txn): diff --git a/synapse/storage/state.py b/synapse/storage/state.py index 7fc9a4f264..f84fd0e30a 100644 --- a/synapse/storage/state.py +++ b/synapse/storage/state.py @@ -249,11 +249,14 @@ class StateStore(SQLBaseStore): """ Get the state dict corresponding to a particular event - :param str event_id: event whose state should be returned - :param list[(str, str)]|None types: List of (type, state_key) tuples - which are used to filter the state fetched. May be None, which - matches any key - :return: a deferred dict from (type, state_key) -> state_event + Args: + event_id(str): event whose state should be returned + types(list[(str, str)]|None): List of (type, state_key) tuples + which are used to filter the state fetched. May be None, which + matches any key + + Returns: + A deferred dict from (type, state_key) -> state_event """ state_map = yield self.get_state_for_events([event_id], types) defer.returnValue(state_map[event_id]) -- cgit 1.5.1 From 5fd07da76473f7a361db4b16b58fc4c21acc4af0 Mon Sep 17 00:00:00 2001 From: Matthew Hodgson Date: Sat, 2 Apr 2016 00:35:49 +0100 Subject: refactor calc_og; spider image URLs; fix xpath; add a (broken) expiringcache; loads of other fixes --- synapse/rest/media/v1/preview_url_resource.py | 202 +++++++++++++++----------- 1 file changed, 121 insertions(+), 81 deletions(-) (limited to 'synapse/rest') diff --git a/synapse/rest/media/v1/preview_url_resource.py b/synapse/rest/media/v1/preview_url_resource.py index a7ffe593b1..1273472dab 100644 --- a/synapse/rest/media/v1/preview_url_resource.py +++ b/synapse/rest/media/v1/preview_url_resource.py @@ -20,6 +20,7 @@ from twisted.internet import defer from lxml import html from urlparse import urlparse, urlunparse from synapse.util.stringutils import random_string +from synapse.util.caches.expiringcache import ExpiringCache from synapse.http.client import SpiderHttpClient from synapse.http.server import request_handler, respond_with_json, respond_with_json_bytes @@ -36,6 +37,12 @@ class PreviewUrlResource(BaseMediaResource): def __init__(self, hs, filepaths): BaseMediaResource.__init__(self, hs, filepaths) self.client = SpiderHttpClient(hs) + self.cache = ExpiringCache( + cache_name = "url_previews", + clock = self.clock, + expiry_ms = 60*60*1000, # don't spider URLs more often than once an hour + ) + self.cache.start() def render_GET(self, request): self._async_render_GET(request) @@ -50,6 +57,11 @@ class PreviewUrlResource(BaseMediaResource): requester = yield self.auth.get_user_by_req(request) url = request.args.get("url")[0] + if self.cache: + og = self.cache.get(url) + respond_with_json_bytes(request, 200, json.dumps(og), send_cors=True) + return + # TODO: keep track of whether there's an ongoing request for this preview # and block and return their details if there is one. @@ -74,98 +86,25 @@ class PreviewUrlResource(BaseMediaResource): elif self._is_html(media_info['media_type']): # TODO: somehow stop a big HTML tree from exploding synapse's RAM - def _calc_og(): - # suck it up into lxml and define our OG response. - # if we see any URLs in the OG response, then spider them - # (although the client could choose to do this by asking for previews of those URLs to avoid DoSing the server) - - # "og:type" : "article" - # "og:url" : "https://twitter.com/matrixdotorg/status/684074366691356672" - # "og:title" : "Matrix on Twitter" - # "og:image" : "https://pbs.twimg.com/profile_images/500400952029888512/yI0qtFi7_400x400.png" - # "og:description" : "Synapse 0.12 is out! Lots of polishing, performance &amp; bugfixes: /sync API, /r0 prefix, fulltext search, 3PID invites https://t.co/5alhXLLEGP" - # "og:site_name" : "Twitter" - - # or: - - # "og:type" : "video", - # "og:url" : "https://www.youtube.com/watch?v=LXDBoHyjmtw", - # "og:site_name" : "YouTube", - # "og:video:type" : "application/x-shockwave-flash", - # "og:description" : " ", - # "og:title" : "RemoteJam - Matrix team hack for Disrupt Europe Hackathon", - # "og:image" : "https://i.ytimg.com/vi/LXDBoHyjmtw/maxresdefault.jpg", - # "og:video:url" : "http://www.youtube.com/v/LXDBoHyjmtw?version=3&autohide=1", - # "og:video:width" : "1280" - # "og:video:height" : "720", - # "og:video:secure_url": "https://www.youtube.com/v/LXDBoHyjmtw?version=3&autohide=1", - - og = {} - for tag in tree.xpath("//*/meta[starts-with(@property, 'og:')]"): - og[tag.attrib['property']] = tag.attrib['content'] - - if 'og:title' not in og: - # do some basic spidering of the HTML - title = tree.xpath("(//title)[1] | (//h1)[1] | (//h2)[1] | (//h3)[1]") - og['og:title'] = title[0].text if title else None - - - if 'og:image' not in og: - meta_image = tree.xpath("//*/meta[@itemprop='image']/@content"); - if meta_image: - og['og:image'] = self._rebase_url(meta_image[0], media_info['uri']) - else: - images = [ i for i in tree.xpath("//img") if 'src' in i.attrib ] - big_images = [ i for i in images if ( - 'width' in i.attrib and 'height' in i.attrib and - i.attrib['width'] > 64 and i.attrib['height'] > 64 - )] - big_images = big_images.sort(key=lambda i: (-1 * int(i.attrib['width']) * int(i.attrib['height']))) - images = big_images if big_images else images - - if images: - og['og:image'] = self._rebase_url(images[0].attrib['src'], media_info['uri']) - - if 'og:description' not in og: - meta_description = tree.xpath("//*/meta[@name='description']/@content"); - if meta_description: - og['og:description'] = meta_description[0] - else: - text_nodes = tree.xpath("//h1/text() | //h2/text() | //h3/text() | //p/text() | //div/text() | //span/text() | //a/text()") - # text_nodes = tree.xpath("//h1/text() | //h2/text() | //h3/text() | //p/text() | //div/text()") - text = '' - for text_node in text_nodes: - if len(text) < 500: - text += text_node + ' ' - else: - break - text = re.sub(r'[\t ]+', ' ', text) - text = re.sub(r'[\t \r\n]*[\r\n]+', '\n', text) - text = text.strip()[:500] - og['og:description'] = text if text else None - - # TODO: extract a favicon? - # TODO: turn any OG media URLs into mxc URLs to capture and thumbnail them too - # TODO: store our OG details in a cache (and expire them when stale) - # TODO: delete the content to stop diskfilling, as we only ever cared about its OG - return og - try: tree = html.parse(media_info['filename']) - og = _calc_og() + og = yield self._calc_og(tree, media_info, requester) except UnicodeDecodeError: # XXX: evil evil bodge file = open(media_info['filename']) body = file.read() file.close() tree = html.fromstring(body.decode('utf-8','ignore')) - og = _calc_og() + og = yield self._calc_og(tree, media_info, requester) else: logger.warn("Failed to find any OG data in %s", url) og = {} - logger.warn(og) + if self.cache: + self.cache[url] = og + + logger.warn(og); respond_with_json_bytes(request, 200, json.dumps(og), send_cors=True) except: @@ -182,11 +121,112 @@ class PreviewUrlResource(BaseMediaResource): ) raise + @defer.inlineCallbacks + def _calc_og(self, tree, media_info, requester): + # suck our tree into lxml and define our OG response. + + # if we see any image URLs in the OG response, then spider them + # (although the client could choose to do this by asking for previews of those URLs to avoid DoSing the server) + + # "og:type" : "article" + # "og:url" : "https://twitter.com/matrixdotorg/status/684074366691356672" + # "og:title" : "Matrix on Twitter" + # "og:image" : "https://pbs.twimg.com/profile_images/500400952029888512/yI0qtFi7_400x400.png" + # "og:description" : "Synapse 0.12 is out! Lots of polishing, performance &amp; bugfixes: /sync API, /r0 prefix, fulltext search, 3PID invites https://t.co/5alhXLLEGP" + # "og:site_name" : "Twitter" + + # or: + + # "og:type" : "video", + # "og:url" : "https://www.youtube.com/watch?v=LXDBoHyjmtw", + # "og:site_name" : "YouTube", + # "og:video:type" : "application/x-shockwave-flash", + # "og:description" : " ", + # "og:title" : "RemoteJam - Matrix team hack for Disrupt Europe Hackathon", + # "og:image" : "https://i.ytimg.com/vi/LXDBoHyjmtw/maxresdefault.jpg", + # "og:video:url" : "http://www.youtube.com/v/LXDBoHyjmtw?version=3&autohide=1", + # "og:video:width" : "1280" + # "og:video:height" : "720", + # "og:video:secure_url": "https://www.youtube.com/v/LXDBoHyjmtw?version=3&autohide=1", + + og = {} + for tag in tree.xpath("//*/meta[starts-with(@property, 'og:')]"): + og[tag.attrib['property']] = tag.attrib['content'] + + # TODO: grab article: meta tags too, e.g.: + + # + # + # + # + # + # + + if 'og:title' not in og: + # do some basic spidering of the HTML + title = tree.xpath("(//title)[1] | (//h1)[1] | (//h2)[1] | (//h3)[1]") + og['og:title'] = title[0].text.strip() if title else None + + + if 'og:image' not in og: + # TODO: extract a favicon failing all else + meta_image = tree.xpath("//*/meta[@itemprop='image']/@content"); + if meta_image: + og['og:image'] = self._rebase_url(meta_image[0], media_info['uri']) + else: + images = tree.xpath("//img[@src][number(@width)>10][number(@height)>10]") + images = sorted(images, key=lambda i: (-1 * int(i.attrib['width']) * int(i.attrib['height']))) + if not images: + images = tree.xpath("//img[@src]") + if images: + og['og:image'] = self._rebase_url(images[0].attrib['src'], media_info['uri']) + + # pre-cache the image for posterity + if 'og:image' in og and og['og:image']: + image_info = yield self._download_url(og['og:image'], requester.user) + + if self._is_media(image_info['media_type']): + # TODO: make sure we don't choke on white-on-transparent images + dims = yield self._generate_local_thumbnails( + image_info['filesystem_id'], image_info + ) + og["og:image"] = "mxc://%s/%s" % (self.server_name, image_info['filesystem_id']) + og["og:image:type"] = image_info['media_type'] + og["og:image:width"] = dims['width'] + og["og:image:height"] = dims['height'] + else: + del og["og:image"] + + if 'og:description' not in og: + meta_description = tree.xpath("//*/meta[@name='description']/@content"); + if meta_description: + og['og:description'] = meta_description[0] + else: + # text_nodes = tree.xpath("//h1/text() | //h2/text() | //h3/text() | //p/text() | //div/text() | //span/text() | //a/text()") + text_nodes = tree.xpath("//text()[not(ancestor::header | ancestor::nav | ancestor::aside | " + + "ancestor::footer | ancestor::script | ancestor::style)]" + + "[ancestor::body]") + text = '' + for text_node in text_nodes: + if len(text) < 500: + text += text_node + ' ' + else: + break + text = re.sub(r'[\t ]+', ' ', text) + text = re.sub(r'[\t \r\n]*[\r\n]+', '\n', text) + text = text.strip()[:500] + og['og:description'] = text if text else None + + # TODO: persist a cache mapping { url, etag } -> { og, mxc of url (if we bother keeping it around), age } + # TODO: delete the url downloads to stop diskfilling, as we only ever cared about its OG + defer.returnValue(og); + def _rebase_url(self, url, base): base = list(urlparse(base)) url = list(urlparse(url)) - if not url[0] and not url[1]: - url[0] = base[0] + if not url[0]: + url[0] = base[0] or "http" + if not url[1]: url[1] = base[1] if not url[2].startswith('/'): url[2] = re.sub(r'/[^/]+$', '/', base[2]) + url[2] -- cgit 1.5.1 From b26e8604f168b0f1ecc095bd0d6a717128361a41 Mon Sep 17 00:00:00 2001 From: Matthew Hodgson Date: Sat, 2 Apr 2016 01:35:44 +0100 Subject: make meta comparisons case insensitive --- synapse/rest/media/v1/preview_url_resource.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'synapse/rest') diff --git a/synapse/rest/media/v1/preview_url_resource.py b/synapse/rest/media/v1/preview_url_resource.py index 1273472dab..77757548bd 100644 --- a/synapse/rest/media/v1/preview_url_resource.py +++ b/synapse/rest/media/v1/preview_url_resource.py @@ -170,7 +170,7 @@ class PreviewUrlResource(BaseMediaResource): if 'og:image' not in og: # TODO: extract a favicon failing all else - meta_image = tree.xpath("//*/meta[@itemprop='image']/@content"); + meta_image = tree.xpath("//*/meta[translate(@itemprop, 'IMAGE', 'image')='image']/@content"); if meta_image: og['og:image'] = self._rebase_url(meta_image[0], media_info['uri']) else: @@ -198,7 +198,7 @@ class PreviewUrlResource(BaseMediaResource): del og["og:image"] if 'og:description' not in og: - meta_description = tree.xpath("//*/meta[@name='description']/@content"); + meta_description = tree.xpath("//*/meta[translate(@name, 'DESCRIPTION', 'description')='description']/@content"); if meta_description: og['og:description'] = meta_description[0] else: -- cgit 1.5.1 From 5037ee0d37f7e5c7a62f5af5ceef5363701e3202 Mon Sep 17 00:00:00 2001 From: Matthew Hodgson Date: Sat, 2 Apr 2016 02:29:57 +0100 Subject: handle missing dimensions without crashing --- synapse/rest/media/v1/preview_url_resource.py | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) (limited to 'synapse/rest') diff --git a/synapse/rest/media/v1/preview_url_resource.py b/synapse/rest/media/v1/preview_url_resource.py index 77757548bd..3ffdafce09 100644 --- a/synapse/rest/media/v1/preview_url_resource.py +++ b/synapse/rest/media/v1/preview_url_resource.py @@ -78,10 +78,14 @@ class PreviewUrlResource(BaseMediaResource): "og:description" : media_info['download_name'], "og:image" : "mxc://%s/%s" % (self.server_name, media_info['filesystem_id']), "og:image:type" : media_info['media_type'], - "og:image:width" : dims['width'], - "og:image:height" : dims['height'], } + if dims: + og["og:image:width"] = dims['width'] + og["og:image:height"] = dims['height'] + else: + logger.warn("Couldn't get dims for %s" % url) + # define our OG response for this media elif self._is_html(media_info['media_type']): # TODO: somehow stop a big HTML tree from exploding synapse's RAM @@ -174,6 +178,7 @@ class PreviewUrlResource(BaseMediaResource): if meta_image: og['og:image'] = self._rebase_url(meta_image[0], media_info['uri']) else: + # TODO: consider inlined CSS styles as well as width & height attribs images = tree.xpath("//img[@src][number(@width)>10][number(@height)>10]") images = sorted(images, key=lambda i: (-1 * int(i.attrib['width']) * int(i.attrib['height']))) if not images: @@ -190,10 +195,14 @@ class PreviewUrlResource(BaseMediaResource): dims = yield self._generate_local_thumbnails( image_info['filesystem_id'], image_info ) + if dims: + og["og:image:width"] = dims['width'] + og["og:image:height"] = dims['height'] + else: + logger.warn("Couldn't get dims for %s" % og["og:image"]) + og["og:image"] = "mxc://%s/%s" % (self.server_name, image_info['filesystem_id']) og["og:image:type"] = image_info['media_type'] - og["og:image:width"] = dims['width'] - og["og:image:height"] = dims['height'] else: del og["og:image"] -- cgit 1.5.1 From 2c838f6459db35ad9812a83184d85a06ca5d940a Mon Sep 17 00:00:00 2001 From: Matthew Hodgson Date: Sat, 2 Apr 2016 02:30:07 +0100 Subject: pass back SVGs as their own thumbnails --- synapse/rest/media/v1/thumbnail_resource.py | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) (limited to 'synapse/rest') diff --git a/synapse/rest/media/v1/thumbnail_resource.py b/synapse/rest/media/v1/thumbnail_resource.py index ab52499785..1e71738bc4 100644 --- a/synapse/rest/media/v1/thumbnail_resource.py +++ b/synapse/rest/media/v1/thumbnail_resource.py @@ -72,6 +72,11 @@ class ThumbnailResource(BaseMediaResource): self._respond_404(request) return + if media_info["media_type"] == "image/svg+xml": + file_path = self.filepaths.local_media_filepath(media_id) + yield self._respond_with_file(request, media_info["media_type"], file_path) + return + thumbnail_infos = yield self.store.get_local_media_thumbnails(media_id) if thumbnail_infos: @@ -103,6 +108,11 @@ class ThumbnailResource(BaseMediaResource): self._respond_404(request) return + if media_info["media_type"] == "image/svg+xml": + file_path = self.filepaths.local_media_filepath(media_id) + yield self._respond_with_file(request, media_info["media_type"], file_path) + return + thumbnail_infos = yield self.store.get_local_media_thumbnails(media_id) for info in thumbnail_infos: t_w = info["thumbnail_width"] == desired_width @@ -138,6 +148,11 @@ class ThumbnailResource(BaseMediaResource): desired_method, desired_type): media_info = yield self._get_remote_media(server_name, media_id) + if media_info["media_type"] == "image/svg+xml": + file_path = self.filepaths.remote_media_filepath(server_name, media_id) + yield self._respond_with_file(request, media_info["media_type"], file_path) + return + thumbnail_infos = yield self.store.get_remote_media_thumbnails( server_name, media_id, ) @@ -181,6 +196,11 @@ class ThumbnailResource(BaseMediaResource): # We should proxy the thumbnail from the remote server instead. media_info = yield self._get_remote_media(server_name, media_id) + if media_info["media_type"] == "image/svg+xml": + file_path = self.filepaths.remote_media_filepath(server_name, media_id) + yield self._respond_with_file(request, media_info["media_type"], file_path) + return + thumbnail_infos = yield self.store.get_remote_media_thumbnails( server_name, media_id, ) -- cgit 1.5.1 From 93771579610d723488486f40622d6c99ed061d7f Mon Sep 17 00:00:00 2001 From: Matthew Hodgson Date: Sat, 2 Apr 2016 02:31:45 +0100 Subject: how was _respond_default_thumbnail ever meant to work? --- synapse/rest/media/v1/thumbnail_resource.py | 2 ++ 1 file changed, 2 insertions(+) (limited to 'synapse/rest') diff --git a/synapse/rest/media/v1/thumbnail_resource.py b/synapse/rest/media/v1/thumbnail_resource.py index 1e71738bc4..513b445688 100644 --- a/synapse/rest/media/v1/thumbnail_resource.py +++ b/synapse/rest/media/v1/thumbnail_resource.py @@ -228,6 +228,8 @@ class ThumbnailResource(BaseMediaResource): @defer.inlineCallbacks def _respond_default_thumbnail(self, request, media_info, width, height, method, m_type): + # XXX: how is this meant to work? store.get_default_thumbnails + # appears to always return [] so won't this always 404? media_type = media_info["media_type"] top_level_type = media_type.split("/")[0] sub_type = media_type.split("/")[-1].split(";")[0] -- cgit 1.5.1 From d1b154a10fc0f71fb36010f784ca6570f845c8d5 Mon Sep 17 00:00:00 2001 From: Matthew Hodgson Date: Sat, 2 Apr 2016 03:06:39 +0100 Subject: support gzip compression, and don't pass through error msgs --- synapse/http/client.py | 11 ++++++++--- synapse/rest/media/v1/preview_url_resource.py | 5 +++-- 2 files changed, 11 insertions(+), 5 deletions(-) (limited to 'synapse/rest') diff --git a/synapse/http/client.py b/synapse/http/client.py index 1b6f7cb795..b21bf17378 100644 --- a/synapse/http/client.py +++ b/synapse/http/client.py @@ -23,7 +23,8 @@ from canonicaljson import encode_canonical_json from twisted.internet import defer, reactor, ssl, protocol from twisted.web.client import ( - BrowserLikeRedirectAgent, Agent, readBody, FileBodyProducer, PartialDownloadError, + BrowserLikeRedirectAgent, ContentDecoderAgent, GzipDecoder, Agent, + readBody, FileBodyProducer, PartialDownloadError, ) from twisted.web.http import PotentialDataLoss from twisted.web.http_headers import Headers @@ -269,6 +270,10 @@ class SimpleHttpClient(object): # XXX: do we want to explicitly drop the connection here somehow? if so, how? raise # what should we be raising here? + if response.code > 299: + logger.warn("Got %d when downloading %s" % (response.code, url)) + raise + # TODO: if our Content-Type is HTML or something, just read the first # N bytes into RAM rather than saving it all to disk only to read it # straight back in again @@ -366,11 +371,11 @@ class SpiderHttpClient(SimpleHttpClient): def __init__(self, hs): SimpleHttpClient.__init__(self, hs) # clobber the base class's agent and UA: - self.agent = BrowserLikeRedirectAgent(Agent( + self.agent = ContentDecoderAgent(BrowserLikeRedirectAgent(Agent( reactor, connectTimeout=15, contextFactory=hs.get_http_client_context_factory() - )) + )), [('gzip', GzipDecoder)]) # Look like Chrome for now #self.user_agent = ("Mozilla/5.0 (%s) (KHTML, like Gecko) Chrome Safari" % hs.version_string) diff --git a/synapse/rest/media/v1/preview_url_resource.py b/synapse/rest/media/v1/preview_url_resource.py index 3ffdafce09..162e09ba71 100644 --- a/synapse/rest/media/v1/preview_url_resource.py +++ b/synapse/rest/media/v1/preview_url_resource.py @@ -200,7 +200,7 @@ class PreviewUrlResource(BaseMediaResource): og["og:image:height"] = dims['height'] else: logger.warn("Couldn't get dims for %s" % og["og:image"]) - + og["og:image"] = "mxc://%s/%s" % (self.server_name, image_info['filesystem_id']) og["og:image:type"] = image_info['media_type'] else: @@ -259,7 +259,8 @@ class PreviewUrlResource(BaseMediaResource): length, headers, uri = yield self.client.get_file( url, output_stream=f, max_size=self.max_spider_size, ) - # FIXME: handle 404s sanely - don't spider an error page + # FIXME: pass through 404s and other error messages nicely + media_type = headers["Content-Type"][0] time_now_ms = self.clock.time_msec() -- cgit 1.5.1 From 7426c86eb88a7abef9af7ba544ccd709b25e8304 Mon Sep 17 00:00:00 2001 From: Matthew Hodgson Date: Sun, 3 Apr 2016 00:31:57 +0100 Subject: add a persistent cache of URL lookups, and fix up the in-memory one to work --- synapse/http/client.py | 6 +- synapse/rest/media/v1/preview_url_resource.py | 64 ++++++++++++++++++---- synapse/storage/media_repository.py | 54 +++++++++++++++++- .../delta/30/local_media_repository_url_cache.sql | 27 +++++++++ 4 files changed, 137 insertions(+), 14 deletions(-) create mode 100644 synapse/storage/schema/delta/30/local_media_repository_url_cache.sql (limited to 'synapse/rest') diff --git a/synapse/http/client.py b/synapse/http/client.py index b21bf17378..f42a36ffa6 100644 --- a/synapse/http/client.py +++ b/synapse/http/client.py @@ -251,8 +251,8 @@ class SimpleHttpClient(object): url (str): The URL to GET output_stream (file): File to write the response body to. Returns: - A (int,dict) tuple of the file length and a dict of the response - headers. + A (int,dict,string,int) tuple of the file length, dict of the response + headers, absolute URI of the response and HTTP response code. """ response = yield self.request( @@ -287,7 +287,7 @@ class SimpleHttpClient(object): logger.exception("Failed to download body") raise - defer.returnValue((length, headers, response.request.absoluteURI)) + defer.returnValue((length, headers, response.request.absoluteURI, response.code)) # XXX: FIXME: This is horribly copy-pasted from matrixfederationclient. diff --git a/synapse/rest/media/v1/preview_url_resource.py b/synapse/rest/media/v1/preview_url_resource.py index 162e09ba71..86341cc4cc 100644 --- a/synapse/rest/media/v1/preview_url_resource.py +++ b/synapse/rest/media/v1/preview_url_resource.py @@ -37,6 +37,8 @@ class PreviewUrlResource(BaseMediaResource): def __init__(self, hs, filepaths): BaseMediaResource.__init__(self, hs, filepaths) self.client = SpiderHttpClient(hs) + + # simple memory cache mapping urls to OG metadata self.cache = ExpiringCache( cache_name = "url_previews", clock = self.clock, @@ -56,17 +58,41 @@ class PreviewUrlResource(BaseMediaResource): # XXX: if get_user_by_req fails, what should we do in an async render? requester = yield self.auth.get_user_by_req(request) url = request.args.get("url")[0] - - if self.cache: - og = self.cache.get(url) - respond_with_json_bytes(request, 200, json.dumps(og), send_cors=True) - return + ts = request.args.get("ts")[0] if "ts" in request.args else self.clock.time_msec() # TODO: keep track of whether there's an ongoing request for this preview # and block and return their details if there is one. + # first check the memory cache - good to handle all the clients on this + # HS thundering away to preview the same URL at the same time. + try: + og = self.cache[url] + respond_with_json_bytes(request, 200, json.dumps(og), send_cors=True) + return + except: + pass + + # then check the URL cache in the DB (which will also provide us with + # historical previews, if we have any) + cache_result = yield self.store.get_url_cache(url, ts) + if ( + cache_result and + cache_result["download_ts"] + cache_result["expires"] > ts and + cache_result["response_code"] / 100 == 2 + ): + respond_with_json_bytes( + request, 200, cache_result["og"].encode('utf-8'), + send_cors=True + ) + return + media_info = yield self._download_url(url, requester.user) + # FIXME: we should probably update our cache now anyway, so that + # even if the OG calculation raises, we don't keep hammering on the + # remote server. For now, leave it uncached to aid debugging OG + # calculation problems + logger.debug("got media_info of '%s'" % media_info) if self._is_media(media_info['media_type']): @@ -105,10 +131,21 @@ class PreviewUrlResource(BaseMediaResource): logger.warn("Failed to find any OG data in %s", url) og = {} - if self.cache: - self.cache[url] = og + logger.debug("Calculated OG for %s as %s" % (url, og)); + + # store OG in ephemeral in-memory cache + self.cache[url] = og - logger.warn(og); + # store OG in history-aware DB cache + yield self.store.store_url_cache( + url, + media_info["response_code"], + media_info["etag"], + media_info["expires"], + json.dumps(og), + media_info["filesystem_id"], + media_info["created_ts"], + ) respond_with_json_bytes(request, 200, json.dumps(og), send_cors=True) except: @@ -187,6 +224,9 @@ class PreviewUrlResource(BaseMediaResource): og['og:image'] = self._rebase_url(images[0].attrib['src'], media_info['uri']) # pre-cache the image for posterity + # FIXME: it might be cleaner to use the same flow as the main /preview_url request itself + # and benefit from the same caching etc. But for now we just rely on the caching + # of the master request to speed things up. if 'og:image' in og and og['og:image']: image_info = yield self._download_url(og['og:image'], requester.user) @@ -226,7 +266,6 @@ class PreviewUrlResource(BaseMediaResource): text = text.strip()[:500] og['og:description'] = text if text else None - # TODO: persist a cache mapping { url, etag } -> { og, mxc of url (if we bother keeping it around), age } # TODO: delete the url downloads to stop diskfilling, as we only ever cared about its OG defer.returnValue(og); @@ -256,7 +295,7 @@ class PreviewUrlResource(BaseMediaResource): try: with open(fname, "wb") as f: logger.debug("Trying to get url '%s'" % url) - length, headers, uri = yield self.client.get_file( + length, headers, uri, code = yield self.client.get_file( url, output_stream=f, max_size=self.max_spider_size, ) # FIXME: pass through 404s and other error messages nicely @@ -311,6 +350,11 @@ class PreviewUrlResource(BaseMediaResource): "filesystem_id": file_id, "filename": fname, "uri": uri, + "response_code": code, + # FIXME: we should calculate a proper expiration based on the + # Cache-Control and Expire headers. But for now, assume 1 hour. + "expires": 60 * 60 * 1000, + "etag": headers["ETag"] if "ETag" in headers else None, }) def _is_media(self, content_type): diff --git a/synapse/storage/media_repository.py b/synapse/storage/media_repository.py index 9d3ba32478..bb002081ae 100644 --- a/synapse/storage/media_repository.py +++ b/synapse/storage/media_repository.py @@ -25,7 +25,7 @@ class MediaRepositoryStore(SQLBaseStore): def get_local_media(self, media_id): """Get the metadata for a local piece of media Returns: - None if the meia_id doesn't exist. + None if the media_id doesn't exist. """ return self._simple_select_one( "local_media_repository", @@ -50,6 +50,58 @@ class MediaRepositoryStore(SQLBaseStore): desc="store_local_media", ) + def get_url_cache(self, url, ts): + """Get the media_id and ts for a cached URL as of the given timestamp + Returns: + None if the URL isn't cached. + """ + def get_url_cache_txn(txn): + # get the most recently cached result (relative to the given ts) + sql = ( + "SELECT response_code, etag, expires, og, media_id, max(download_ts)" + " FROM local_media_repository_url_cache" + " WHERE url = ? AND download_ts <= ?" + ) + txn.execute(sql, (url, ts)) + row = txn.fetchone() + + if not row[3]: + # ...or if we've requested a timestamp older than the oldest + # copy in the cache, return the oldest copy (if any) + sql = ( + "SELECT response_code, etag, expires, og, media_id, min(download_ts)" + " FROM local_media_repository_url_cache" + " WHERE url = ? AND download_ts > ?" + ) + txn.execute(sql, (url, ts)) + row = txn.fetchone() + + if not row[3]: + return None + + return dict(zip(( + 'response_code', 'etag', 'expires', 'og', 'media_id', 'download_ts' + ), row)) + + return self.runInteraction( + "get_url_cache", get_url_cache_txn + ) + + def store_url_cache(self, url, response_code, etag, expires, og, media_id, download_ts): + return self._simple_insert( + "local_media_repository_url_cache", + { + "url": url, + "response_code": response_code, + "etag": etag, + "expires": expires, + "og": og, + "media_id": media_id, + "download_ts": download_ts, + }, + desc="store_url_cache", + ) + def get_local_media_thumbnails(self, media_id): return self._simple_select_list( "local_media_repository_thumbnails", diff --git a/synapse/storage/schema/delta/30/local_media_repository_url_cache.sql b/synapse/storage/schema/delta/30/local_media_repository_url_cache.sql new file mode 100644 index 0000000000..9efb4280eb --- /dev/null +++ b/synapse/storage/schema/delta/30/local_media_repository_url_cache.sql @@ -0,0 +1,27 @@ +/* Copyright 2016 OpenMarket Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +CREATE TABLE local_media_repository_url_cache( + url TEXT, -- the URL being cached + response_code INTEGER, -- the HTTP response code of this download attempt + etag TEXT, -- the etag header of this response + expires INTEGER, -- the number of ms this response was valid for + og TEXT, -- cache of the OG metadata of this URL as JSON + media_id TEXT, -- the media_id, if any, of the URL's content in the repo + download_ts BIGINT -- the timestamp of this download attempt +); + +CREATE INDEX local_media_repository_url_cache_by_url_download_ts + ON local_media_repository_url_cache(url, download_ts); -- cgit 1.5.1 From b09e29a03ca95c577215acbe8d5037d6337e1af3 Mon Sep 17 00:00:00 2001 From: Matthew Hodgson Date: Sun, 3 Apr 2016 00:47:40 +0100 Subject: Ensure only one download for a given URL is active at a time --- synapse/rest/media/v1/preview_url_resource.py | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) (limited to 'synapse/rest') diff --git a/synapse/rest/media/v1/preview_url_resource.py b/synapse/rest/media/v1/preview_url_resource.py index 86341cc4cc..c20de57991 100644 --- a/synapse/rest/media/v1/preview_url_resource.py +++ b/synapse/rest/media/v1/preview_url_resource.py @@ -23,6 +23,7 @@ from synapse.util.stringutils import random_string from synapse.util.caches.expiringcache import ExpiringCache from synapse.http.client import SpiderHttpClient from synapse.http.server import request_handler, respond_with_json, respond_with_json_bytes +from synapse.util.async import ObservableDeferred import os import re @@ -46,6 +47,8 @@ class PreviewUrlResource(BaseMediaResource): ) self.cache.start() + self.downloads = {} + def render_GET(self, request): self._async_render_GET(request) return NOT_DONE_YET @@ -86,7 +89,21 @@ class PreviewUrlResource(BaseMediaResource): ) return - media_info = yield self._download_url(url, requester.user) + # Ensure only one download for a given URL is active at a time + download = self.downloads.get(url) + if download is None: + download = self._download_url(url, requester.user) + download = ObservableDeferred( + download, + consumeErrors=True + ) + self.downloads[url] = download + + @download.addBoth + def callback(media_info): + del self.downloads[key] + return media_info + media_info = yield download.observe() # FIXME: we should probably update our cache now anyway, so that # even if the OG calculation raises, we don't keep hammering on the -- cgit 1.5.1 From 110780b18b029c5b6f1c34f7b4e027b88ea8b8ce Mon Sep 17 00:00:00 2001 From: Matthew Hodgson Date: Sun, 3 Apr 2016 00:48:31 +0100 Subject: remove stale todo --- synapse/rest/media/v1/preview_url_resource.py | 3 --- 1 file changed, 3 deletions(-) (limited to 'synapse/rest') diff --git a/synapse/rest/media/v1/preview_url_resource.py b/synapse/rest/media/v1/preview_url_resource.py index c20de57991..582dd20fa6 100644 --- a/synapse/rest/media/v1/preview_url_resource.py +++ b/synapse/rest/media/v1/preview_url_resource.py @@ -63,9 +63,6 @@ class PreviewUrlResource(BaseMediaResource): url = request.args.get("url")[0] ts = request.args.get("ts")[0] if "ts" in request.args else self.clock.time_msec() - # TODO: keep track of whether there's an ongoing request for this preview - # and block and return their details if there is one. - # first check the memory cache - good to handle all the clients on this # HS thundering away to preview the same URL at the same time. try: -- cgit 1.5.1 From c3916462f68df84df29ad924c07f8e83c0143fcc Mon Sep 17 00:00:00 2001 From: Matthew Hodgson Date: Sun, 3 Apr 2016 01:33:12 +0100 Subject: rebase all image URLs --- synapse/rest/media/v1/preview_url_resource.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'synapse/rest') diff --git a/synapse/rest/media/v1/preview_url_resource.py b/synapse/rest/media/v1/preview_url_resource.py index 582dd20fa6..31ce2b5831 100644 --- a/synapse/rest/media/v1/preview_url_resource.py +++ b/synapse/rest/media/v1/preview_url_resource.py @@ -235,14 +235,14 @@ class PreviewUrlResource(BaseMediaResource): if not images: images = tree.xpath("//img[@src]") if images: - og['og:image'] = self._rebase_url(images[0].attrib['src'], media_info['uri']) + og['og:image'] = images[0].attrib['src'] # pre-cache the image for posterity # FIXME: it might be cleaner to use the same flow as the main /preview_url request itself # and benefit from the same caching etc. But for now we just rely on the caching # of the master request to speed things up. if 'og:image' in og and og['og:image']: - image_info = yield self._download_url(og['og:image'], requester.user) + image_info = yield self._download_url(self._rebase_url(og['og:image'], media_info['uri']), requester.user) if self._is_media(image_info['media_type']): # TODO: make sure we don't choke on white-on-transparent images @@ -286,9 +286,9 @@ class PreviewUrlResource(BaseMediaResource): def _rebase_url(self, url, base): base = list(urlparse(base)) url = list(urlparse(url)) - if not url[0]: + if not url[0]: # fix up schema url[0] = base[0] or "http" - if not url[1]: + if not url[1]: # fix up hostname url[1] = base[1] if not url[2].startswith('/'): url[2] = re.sub(r'/[^/]+$', '/', base[2]) + url[2] -- cgit 1.5.1 From eab4d462f8e5d17c5ca7592d1ea15d8e4771a00c Mon Sep 17 00:00:00 2001 From: Matthew Hodgson Date: Sun, 3 Apr 2016 02:02:46 +0100 Subject: fix etag typing error. fix timestamp typing error --- synapse/rest/media/v1/preview_url_resource.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'synapse/rest') diff --git a/synapse/rest/media/v1/preview_url_resource.py b/synapse/rest/media/v1/preview_url_resource.py index 31ce2b5831..7c69c01a6c 100644 --- a/synapse/rest/media/v1/preview_url_resource.py +++ b/synapse/rest/media/v1/preview_url_resource.py @@ -61,7 +61,7 @@ class PreviewUrlResource(BaseMediaResource): # XXX: if get_user_by_req fails, what should we do in an async render? requester = yield self.auth.get_user_by_req(request) url = request.args.get("url")[0] - ts = request.args.get("ts")[0] if "ts" in request.args else self.clock.time_msec() + ts = int(request.args.get("ts")[0]) if "ts" in request.args else self.clock.time_msec() # first check the memory cache - good to handle all the clients on this # HS thundering away to preview the same URL at the same time. @@ -368,7 +368,7 @@ class PreviewUrlResource(BaseMediaResource): # FIXME: we should calculate a proper expiration based on the # Cache-Control and Expire headers. But for now, assume 1 hour. "expires": 60 * 60 * 1000, - "etag": headers["ETag"] if "ETag" in headers else None, + "etag": headers["ETag"][0] if "ETag" in headers else None, }) def _is_media(self, content_type): -- cgit 1.5.1 From 8b98a7e8c37f0fae09f33a6d93953584288ed394 Mon Sep 17 00:00:00 2001 From: Matthew Hodgson Date: Sun, 3 Apr 2016 12:56:29 +0100 Subject: pep8 --- synapse/http/client.py | 14 ++- synapse/rest/media/v1/media_repository.py | 1 - synapse/rest/media/v1/preview_url_resource.py | 127 +++++++++++++++----------- synapse/storage/media_repository.py | 3 +- 4 files changed, 85 insertions(+), 60 deletions(-) (limited to 'synapse/rest') diff --git a/synapse/http/client.py b/synapse/http/client.py index f42a36ffa6..442b4bb73d 100644 --- a/synapse/http/client.py +++ b/synapse/http/client.py @@ -15,7 +15,9 @@ from OpenSSL import SSL from OpenSSL.SSL import VERIFY_NONE -from synapse.api.errors import CodeMessageException +from synapse.api.errors import ( + CodeMessageException, SynapseError, Codes, +) from synapse.util.logcontext import preserve_context_over_fn import synapse.metrics @@ -268,7 +270,7 @@ class SimpleHttpClient(object): if 'Content-Length' in headers and headers['Content-Length'] > max_size: logger.warn("Requested URL is too large > %r bytes" % (self.max_size,)) # XXX: do we want to explicitly drop the connection here somehow? if so, how? - raise # what should we be raising here? + raise # what should we be raising here? if response.code > 299: logger.warn("Got %d when downloading %s" % (response.code, url)) @@ -331,6 +333,7 @@ def _readBodyToFile(response, stream, max_size): response.deliverBody(_ReadBodyToFileProtocol(stream, d, max_size)) return d + class CaptchaServerHttpClient(SimpleHttpClient): """ Separate HTTP client for talking to google's captcha servers @@ -360,6 +363,7 @@ class CaptchaServerHttpClient(SimpleHttpClient): # twisted dislikes google's response, no content length. defer.returnValue(e.response) + class SpiderHttpClient(SimpleHttpClient): """ Separate HTTP client for spidering arbitrary URLs. @@ -376,8 +380,10 @@ class SpiderHttpClient(SimpleHttpClient): connectTimeout=15, contextFactory=hs.get_http_client_context_factory() )), [('gzip', GzipDecoder)]) - # Look like Chrome for now - #self.user_agent = ("Mozilla/5.0 (%s) (KHTML, like Gecko) Chrome Safari" % hs.version_string) + # We could look like Chrome: + # self.user_agent = ("Mozilla/5.0 (%s) (KHTML, like Gecko) + # Chrome Safari" % hs.version_string) + def encode_urlencode_args(args): return {k: encode_urlencode_arg(v) for k, v in args.items()} diff --git a/synapse/rest/media/v1/media_repository.py b/synapse/rest/media/v1/media_repository.py index 8f3491b91c..11f672aeab 100644 --- a/synapse/rest/media/v1/media_repository.py +++ b/synapse/rest/media/v1/media_repository.py @@ -80,4 +80,3 @@ class MediaRepositoryResource(Resource): self.putChild("thumbnail", ThumbnailResource(hs, filepaths)) self.putChild("identicon", IdenticonResource()) self.putChild("preview_url", PreviewUrlResource(hs, filepaths)) - diff --git a/synapse/rest/media/v1/preview_url_resource.py b/synapse/rest/media/v1/preview_url_resource.py index 7c69c01a6c..29db5c7fce 100644 --- a/synapse/rest/media/v1/preview_url_resource.py +++ b/synapse/rest/media/v1/preview_url_resource.py @@ -13,25 +13,31 @@ # limitations under the License. from .base_resource import BaseMediaResource -from synapse.api.errors import Codes -from twisted.web.resource import Resource + from twisted.web.server import NOT_DONE_YET from twisted.internet import defer from lxml import html from urlparse import urlparse, urlunparse + +from synapse.api.errors import Codes from synapse.util.stringutils import random_string from synapse.util.caches.expiringcache import ExpiringCache from synapse.http.client import SpiderHttpClient -from synapse.http.server import request_handler, respond_with_json, respond_with_json_bytes +from synapse.http.server import ( + request_handler, respond_with_json, respond_with_json_bytes +) from synapse.util.async import ObservableDeferred +from synapse.util.stringutils import is_ascii import os import re +import cgi import ujson as json import logging logger = logging.getLogger(__name__) + class PreviewUrlResource(BaseMediaResource): isLeaf = True @@ -41,9 +47,10 @@ class PreviewUrlResource(BaseMediaResource): # simple memory cache mapping urls to OG metadata self.cache = ExpiringCache( - cache_name = "url_previews", - clock = self.clock, - expiry_ms = 60*60*1000, # don't spider URLs more often than once an hour + cache_name="url_previews", + clock=self.clock, + # don't spider URLs more often than once an hour + expiry_ms=60 * 60 * 1000, ) self.cache.start() @@ -56,12 +63,15 @@ class PreviewUrlResource(BaseMediaResource): @request_handler @defer.inlineCallbacks def _async_render_GET(self, request): - + try: # XXX: if get_user_by_req fails, what should we do in an async render? requester = yield self.auth.get_user_by_req(request) url = request.args.get("url")[0] - ts = int(request.args.get("ts")[0]) if "ts" in request.args else self.clock.time_msec() + if "ts" in request.args: + ts = int(request.args.get("ts")[0]) + else: + ts = self.clock.time_msec() # first check the memory cache - good to handle all the clients on this # HS thundering away to preview the same URL at the same time. @@ -98,7 +108,7 @@ class PreviewUrlResource(BaseMediaResource): @download.addBoth def callback(media_info): - del self.downloads[key] + del self.downloads[url] return media_info media_info = yield download.observe() @@ -111,13 +121,15 @@ class PreviewUrlResource(BaseMediaResource): if self._is_media(media_info['media_type']): dims = yield self._generate_local_thumbnails( - media_info['filesystem_id'], media_info - ) + media_info['filesystem_id'], media_info + ) og = { - "og:description" : media_info['download_name'], - "og:image" : "mxc://%s/%s" % (self.server_name, media_info['filesystem_id']), - "og:image:type" : media_info['media_type'], + "og:description": media_info['download_name'], + "og:image": "mxc://%s/%s" % ( + self.server_name, media_info['filesystem_id'] + ), + "og:image:type": media_info['media_type'], } if dims: @@ -138,14 +150,14 @@ class PreviewUrlResource(BaseMediaResource): file = open(media_info['filename']) body = file.read() file.close() - tree = html.fromstring(body.decode('utf-8','ignore')) + tree = html.fromstring(body.decode('utf-8', 'ignore')) og = yield self._calc_og(tree, media_info, requester) else: logger.warn("Failed to find any OG data in %s", url) og = {} - logger.debug("Calculated OG for %s as %s" % (url, og)); + logger.debug("Calculated OG for %s as %s" % (url, og)) # store OG in ephemeral in-memory cache self.cache[url] = og @@ -181,28 +193,20 @@ class PreviewUrlResource(BaseMediaResource): # suck our tree into lxml and define our OG response. # if we see any image URLs in the OG response, then spider them - # (although the client could choose to do this by asking for previews of those URLs to avoid DoSing the server) - - # "og:type" : "article" - # "og:url" : "https://twitter.com/matrixdotorg/status/684074366691356672" - # "og:title" : "Matrix on Twitter" - # "og:image" : "https://pbs.twimg.com/profile_images/500400952029888512/yI0qtFi7_400x400.png" - # "og:description" : "Synapse 0.12 is out! Lots of polishing, performance &amp; bugfixes: /sync API, /r0 prefix, fulltext search, 3PID invites https://t.co/5alhXLLEGP" - # "og:site_name" : "Twitter" - - # or: + # (although the client could choose to do this by asking for previews of those + # URLs to avoid DoSing the server) # "og:type" : "video", # "og:url" : "https://www.youtube.com/watch?v=LXDBoHyjmtw", # "og:site_name" : "YouTube", # "og:video:type" : "application/x-shockwave-flash", - # "og:description" : " ", + # "og:description" : "Fun stuff happening here", # "og:title" : "RemoteJam - Matrix team hack for Disrupt Europe Hackathon", # "og:image" : "https://i.ytimg.com/vi/LXDBoHyjmtw/maxresdefault.jpg", # "og:video:url" : "http://www.youtube.com/v/LXDBoHyjmtw?version=3&autohide=1", # "og:video:width" : "1280" # "og:video:height" : "720", - # "og:video:secure_url": "https://www.youtube.com/v/LXDBoHyjmtw?version=3&autohide=1", + # "og:video:secure_url": "https://www.youtube.com/v/LXDBoHyjmtw?version=3", og = {} for tag in tree.xpath("//*/meta[starts-with(@property, 'og:')]"): @@ -210,64 +214,76 @@ class PreviewUrlResource(BaseMediaResource): # TODO: grab article: meta tags too, e.g.: - # - # - # - # - # - # + # "article:publisher" : "https://www.facebook.com/thethudonline" /> + # "article:author" content="https://www.facebook.com/thethudonline" /> + # "article:tag" content="baby" /> + # "article:section" content="Breaking News" /> + # "article:published_time" content="2016-03-31T19:58:24+00:00" /> + # "article:modified_time" content="2016-04-01T18:31:53+00:00" /> if 'og:title' not in og: # do some basic spidering of the HTML title = tree.xpath("(//title)[1] | (//h1)[1] | (//h2)[1] | (//h3)[1]") og['og:title'] = title[0].text.strip() if title else None - if 'og:image' not in og: # TODO: extract a favicon failing all else - meta_image = tree.xpath("//*/meta[translate(@itemprop, 'IMAGE', 'image')='image']/@content"); + meta_image = tree.xpath( + "//*/meta[translate(@itemprop, 'IMAGE', 'image')='image']/@content" + ) if meta_image: og['og:image'] = self._rebase_url(meta_image[0], media_info['uri']) else: # TODO: consider inlined CSS styles as well as width & height attribs images = tree.xpath("//img[@src][number(@width)>10][number(@height)>10]") - images = sorted(images, key=lambda i: (-1 * int(i.attrib['width']) * int(i.attrib['height']))) + images = sorted(images, key=lambda i: ( + -1 * int(i.attrib['width']) * int(i.attrib['height']) + )) if not images: images = tree.xpath("//img[@src]") if images: og['og:image'] = images[0].attrib['src'] # pre-cache the image for posterity - # FIXME: it might be cleaner to use the same flow as the main /preview_url request itself - # and benefit from the same caching etc. But for now we just rely on the caching - # of the master request to speed things up. + # FIXME: it might be cleaner to use the same flow as the main /preview_url request + # itself and benefit from the same caching etc. But for now we just rely on the + # caching on the master request to speed things up. if 'og:image' in og and og['og:image']: - image_info = yield self._download_url(self._rebase_url(og['og:image'], media_info['uri']), requester.user) + image_info = yield self._download_url( + self._rebase_url(og['og:image'], media_info['uri']), requester.user + ) if self._is_media(image_info['media_type']): # TODO: make sure we don't choke on white-on-transparent images dims = yield self._generate_local_thumbnails( - image_info['filesystem_id'], image_info - ) + image_info['filesystem_id'], image_info + ) if dims: og["og:image:width"] = dims['width'] og["og:image:height"] = dims['height'] else: logger.warn("Couldn't get dims for %s" % og["og:image"]) - og["og:image"] = "mxc://%s/%s" % (self.server_name, image_info['filesystem_id']) + og["og:image"] = "mxc://%s/%s" % ( + self.server_name, image_info['filesystem_id'] + ) og["og:image:type"] = image_info['media_type'] else: del og["og:image"] if 'og:description' not in og: - meta_description = tree.xpath("//*/meta[translate(@name, 'DESCRIPTION', 'description')='description']/@content"); + meta_description = tree.xpath( + "//*/meta" + "[translate(@name, 'DESCRIPTION', 'description')='description']" + "/@content") if meta_description: og['og:description'] = meta_description[0] else: - # text_nodes = tree.xpath("//h1/text() | //h2/text() | //h3/text() | //p/text() | //div/text() | //span/text() | //a/text()") - text_nodes = tree.xpath("//text()[not(ancestor::header | ancestor::nav | ancestor::aside | " + - "ancestor::footer | ancestor::script | ancestor::style)]" + + # text_nodes = tree.xpath("//h1/text() | //h2/text() | //h3/text() | " + # "//p/text() | //div/text() | //span/text() | //a/text()") + text_nodes = tree.xpath("//text()[not(ancestor::header | ancestor::nav | " + "ancestor::aside | ancestor::footer | " + "ancestor::script | ancestor::style)]" + "[ancestor::body]") text = '' for text_node in text_nodes: @@ -280,15 +296,16 @@ class PreviewUrlResource(BaseMediaResource): text = text.strip()[:500] og['og:description'] = text if text else None - # TODO: delete the url downloads to stop diskfilling, as we only ever cared about its OG - defer.returnValue(og); + # TODO: delete the url downloads to stop diskfilling, + # as we only ever cared about its OG + defer.returnValue(og) def _rebase_url(self, url, base): base = list(urlparse(base)) url = list(urlparse(url)) - if not url[0]: # fix up schema + if not url[0]: # fix up schema url[0] = base[0] or "http" - if not url[1]: # fix up hostname + if not url[1]: # fix up hostname url[1] = base[1] if not url[2].startswith('/'): url[2] = re.sub(r'/[^/]+$', '/', base[2]) + url[2] @@ -377,6 +394,8 @@ class PreviewUrlResource(BaseMediaResource): def _is_html(self, content_type): content_type = content_type.lower() - if (content_type.startswith("text/html") or - content_type.startswith("application/xhtml")): + if ( + content_type.startswith("text/html") or + content_type.startswith("application/xhtml") + ): return True diff --git a/synapse/storage/media_repository.py b/synapse/storage/media_repository.py index bb002081ae..c9dd20eed8 100644 --- a/synapse/storage/media_repository.py +++ b/synapse/storage/media_repository.py @@ -87,7 +87,8 @@ class MediaRepositoryStore(SQLBaseStore): "get_url_cache", get_url_cache_txn ) - def store_url_cache(self, url, response_code, etag, expires, og, media_id, download_ts): + def store_url_cache(self, url, response_code, etag, expires, og, media_id, + download_ts): return self._simple_insert( "local_media_repository_url_cache", { -- cgit 1.5.1 From 0834b152fb05e110428a4834a2e5dc51b6f7d327 Mon Sep 17 00:00:00 2001 From: Matthew Hodgson Date: Sun, 3 Apr 2016 12:59:27 +0100 Subject: char encoding --- synapse/rest/media/v1/preview_url_resource.py | 1 + 1 file changed, 1 insertion(+) (limited to 'synapse/rest') diff --git a/synapse/rest/media/v1/preview_url_resource.py b/synapse/rest/media/v1/preview_url_resource.py index 29db5c7fce..ff522c5fb8 100644 --- a/synapse/rest/media/v1/preview_url_resource.py +++ b/synapse/rest/media/v1/preview_url_resource.py @@ -1,3 +1,4 @@ +# -*- coding: utf-8 -*- # Copyright 2016 OpenMarket Ltd # # Licensed under the Apache License, Version 2.0 (the "License"); -- cgit 1.5.1 From cf51c4120e79a59a798fcf88c5c7d9f95dc6e76d Mon Sep 17 00:00:00 2001 From: Matthew Hodgson Date: Sun, 3 Apr 2016 23:57:05 +0100 Subject: report image size (bytewise) in OG meta --- synapse/rest/media/v1/preview_url_resource.py | 2 ++ 1 file changed, 2 insertions(+) (limited to 'synapse/rest') diff --git a/synapse/rest/media/v1/preview_url_resource.py b/synapse/rest/media/v1/preview_url_resource.py index ff522c5fb8..f5ec32d8f2 100644 --- a/synapse/rest/media/v1/preview_url_resource.py +++ b/synapse/rest/media/v1/preview_url_resource.py @@ -131,6 +131,7 @@ class PreviewUrlResource(BaseMediaResource): self.server_name, media_info['filesystem_id'] ), "og:image:type": media_info['media_type'], + "matrix:image:size": media_info['media_length'], } if dims: @@ -269,6 +270,7 @@ class PreviewUrlResource(BaseMediaResource): self.server_name, image_info['filesystem_id'] ) og["og:image:type"] = image_info['media_type'] + og["matrix:image:size"] = image_info['media_length'] else: del og["og:image"] -- cgit 1.5.1 From dafef5a688b8684232346a26a789a2da600ec58e Mon Sep 17 00:00:00 2001 From: Matthew Hodgson Date: Fri, 8 Apr 2016 18:37:15 +0100 Subject: Add url_preview_enabled config option to turn on/off preview_url endpoint. defaults to off. Add url_preview_ip_range_blacklist to let admins specify internal IP ranges that must not be spidered. Add url_preview_url_blacklist to let admins specify URL patterns that must not be spidered. Implement a custom SpiderEndpoint and associated support classes to implement url_preview_ip_range_blacklist Add commentary and generally address PR feedback --- synapse/config/repository.py | 77 +++++++++++++++++++++++++-- synapse/http/client.py | 44 +++++++++++++-- synapse/http/endpoint.py | 35 +++++++++++- synapse/python_dependencies.py | 7 ++- synapse/rest/media/v1/media_repository.py | 7 ++- synapse/rest/media/v1/preview_url_resource.py | 75 ++++++++++++++++++++------ 6 files changed, 214 insertions(+), 31 deletions(-) (limited to 'synapse/rest') diff --git a/synapse/config/repository.py b/synapse/config/repository.py index f4ab705701..da1007d767 100644 --- a/synapse/config/repository.py +++ b/synapse/config/repository.py @@ -16,6 +16,8 @@ from ._base import Config from collections import namedtuple +import sys + ThumbnailRequirement = namedtuple( "ThumbnailRequirement", ["width", "height", "method", "media_type"] ) @@ -23,7 +25,7 @@ ThumbnailRequirement = namedtuple( def parse_thumbnail_requirements(thumbnail_sizes): """ Takes a list of dictionaries with "width", "height", and "method" keys - and creates a map from image media types to the thumbnail size, thumnailing + and creates a map from image media types to the thumbnail size, thumbnailing method, and thumbnail media type to precalculate Args: @@ -60,6 +62,18 @@ class ContentRepositoryConfig(Config): self.thumbnail_requirements = parse_thumbnail_requirements( config["thumbnail_sizes"] ) + self.url_preview_enabled = config["url_preview_enabled"] + if self.url_preview_enabled: + try: + from netaddr import IPSet + if "url_preview_ip_range_blacklist" in config: + self.url_preview_ip_range_blacklist = IPSet( + config["url_preview_ip_range_blacklist"] + ) + if "url_preview_url_blacklist" in config: + self.url_preview_url_blacklist = config["url_preview_url_blacklist"] + except ImportError: + sys.stderr.write("\nmissing netaddr dep - disabling preview_url API\n") def default_config(self, **kwargs): media_store = self.default_path("media_store") @@ -74,9 +88,6 @@ class ContentRepositoryConfig(Config): # The largest allowed upload size in bytes max_upload_size: "10M" - # The largest allowed URL preview spidering size in bytes - max_spider_size: "10M" - # Maximum number of pixels that will be thumbnailed max_image_pixels: "32M" @@ -104,4 +115,62 @@ class ContentRepositoryConfig(Config): - width: 800 height: 600 method: scale + + # Is the preview URL API enabled? If enabled, you *must* specify + # an explicit url_preview_ip_range_blacklist of IPs that the spider is + # denied from accessing. + url_preview_enabled: False + + # List of IP address CIDR ranges that the URL preview spider is denied + # from accessing. There are no defaults: you must explicitly + # specify a list for URL previewing to work. You should specify any + # internal services in your network that you do not want synapse to try + # to connect to, otherwise anyone in any Matrix room could cause your + # synapse to issue arbitrary GET requests to your internal services, + # causing serious security issues. + # + # url_preview_ip_range_blacklist: + # - '127.0.0.0/8' + # - '10.0.0.0/8' + # - '172.16.0.0/12' + # - '192.168.0.0/16' + + # Optional list of URL matches that the URL preview spider is + # denied from accessing. You should use url_preview_ip_range_blacklist + # in preference to this, otherwise someone could define a public DNS + # entry that points to a private IP address and circumvent the blacklist. + # This is more useful if you know there is an entire shape of URL that + # you know that will never want synapse to try to spider. + # + # Each list entry is a dictionary of url component attributes as returned + # by urlparse.urlsplit as applied to the absolute form of the URL. See + # https://docs.python.org/2/library/urlparse.html#urlparse.urlsplit + # The values of the dictionary are treated as an filename match pattern + # applied to that component of URLs, unless they start with a ^ in which + # case they are treated as a regular expression match. If all the + # specified component matches for a given list item succeed, the URL is + # blacklisted. + # + # url_preview_url_blacklist: + # # blacklist any URL with a username in its URI + # - username: '*'' + # + # # blacklist all *.google.com URLs + # - netloc: 'google.com' + # - netloc: '*.google.com' + # + # # blacklist all plain HTTP URLs + # - scheme: 'http' + # + # # blacklist http(s)://www.acme.com/foo + # - netloc: 'www.acme.com' + # path: '/foo' + # + # # blacklist any URL with a literal IPv4 address + # - netloc: '^[0-9]+\.[0-9]+\.[0-9]+\.[0-9]+$' + + # The largest allowed URL preview spidering size in bytes + max_spider_size: "10M" + + """ % locals() diff --git a/synapse/http/client.py b/synapse/http/client.py index 442b4bb73d..3b8ffcd3ef 100644 --- a/synapse/http/client.py +++ b/synapse/http/client.py @@ -20,10 +20,12 @@ from synapse.api.errors import ( ) from synapse.util.logcontext import preserve_context_over_fn import synapse.metrics +from synapse.http.endpoint import SpiderEndpoint from canonicaljson import encode_canonical_json from twisted.internet import defer, reactor, ssl, protocol +from twisted.internet.endpoints import SSL4ClientEndpoint, TCP4ClientEndpoint from twisted.web.client import ( BrowserLikeRedirectAgent, ContentDecoderAgent, GzipDecoder, Agent, readBody, FileBodyProducer, PartialDownloadError, @@ -364,6 +366,35 @@ class CaptchaServerHttpClient(SimpleHttpClient): defer.returnValue(e.response) +class SpiderEndpointFactory(object): + def __init__(self, hs): + self.blacklist = hs.config.url_preview_ip_range_blacklist + self.policyForHTTPS = hs.get_http_client_context_factory() + + def endpointForURI(self, uri): + logger.info("Getting endpoint for %s", uri.toBytes()) + if uri.scheme == "http": + return SpiderEndpoint( + reactor, uri.host, uri.port, self.blacklist, + endpoint=TCP4ClientEndpoint, + endpoint_kw_args={ + 'timeout': 15 + }, + ) + elif uri.scheme == "https": + tlsPolicy = self.policyForHTTPS.creatorForNetloc(uri.host, uri.port) + return SpiderEndpoint( + reactor, uri.host, uri.port, self.blacklist, + endpoint=SSL4ClientEndpoint, + endpoint_kw_args={ + 'sslContextFactory': tlsPolicy, + 'timeout': 15 + }, + ) + else: + logger.warn("Can't get endpoint for unrecognised scheme %s", uri.scheme) + + class SpiderHttpClient(SimpleHttpClient): """ Separate HTTP client for spidering arbitrary URLs. @@ -375,11 +406,14 @@ class SpiderHttpClient(SimpleHttpClient): def __init__(self, hs): SimpleHttpClient.__init__(self, hs) # clobber the base class's agent and UA: - self.agent = ContentDecoderAgent(BrowserLikeRedirectAgent(Agent( - reactor, - connectTimeout=15, - contextFactory=hs.get_http_client_context_factory() - )), [('gzip', GzipDecoder)]) + self.agent = ContentDecoderAgent( + BrowserLikeRedirectAgent( + Agent.usingEndpointFactory( + reactor, + SpiderEndpointFactory(hs) + ) + ), [('gzip', GzipDecoder)] + ) # We could look like Chrome: # self.user_agent = ("Mozilla/5.0 (%s) (KHTML, like Gecko) # Chrome Safari" % hs.version_string) diff --git a/synapse/http/endpoint.py b/synapse/http/endpoint.py index 4775f6707d..de5c762f50 100644 --- a/synapse/http/endpoint.py +++ b/synapse/http/endpoint.py @@ -74,6 +74,37 @@ def matrix_federation_endpoint(reactor, destination, ssl_context_factory=None, return transport_endpoint(reactor, domain, port, **endpoint_kw_args) +class SpiderEndpoint(object): + """An endpoint which refuses to connect to blacklisted IP addresses + Implements twisted.internet.interfaces.IStreamClientEndpoint. + """ + def __init__(self, reactor, host, port, blacklist, + endpoint=TCP4ClientEndpoint, endpoint_kw_args={}): + self.reactor = reactor + self.host = host + self.port = port + self.blacklist = blacklist + self.endpoint = endpoint + self.endpoint_kw_args = endpoint_kw_args + + @defer.inlineCallbacks + def connect(self, protocolFactory): + address = yield self.reactor.resolve(self.host) + + from netaddr import IPAddress + if IPAddress(address) in self.blacklist: + raise ConnectError( + "Refusing to spider blacklisted IP address %s" % address + ) + + logger.info("Connecting to %s:%s", address, self.port) + endpoint = self.endpoint( + self.reactor, address, self.port, **self.endpoint_kw_args + ) + connection = yield endpoint.connect(protocolFactory) + defer.returnValue(connection) + + class SRVClientEndpoint(object): """An endpoint which looks up SRV records for a service. Cycles through the list of servers starting with each call to connect @@ -118,7 +149,7 @@ class SRVClientEndpoint(object): return self.default_server else: raise ConnectError( - "Not server available for %s", self.service_name + "Not server available for %s" % self.service_name ) min_priority = self.servers[0].priority @@ -166,7 +197,7 @@ def resolve_service(service_name, dns_client=client, cache=SERVER_CACHE): and answers[0].type == dns.SRV and answers[0].payload and answers[0].payload.target == dns.Name('.')): - raise ConnectError("Service %s unavailable", service_name) + raise ConnectError("Service %s unavailable" % service_name) for answer in answers: if answer.type != dns.SRV or not answer.payload: diff --git a/synapse/python_dependencies.py b/synapse/python_dependencies.py index 86b8331760..1adbdd9421 100644 --- a/synapse/python_dependencies.py +++ b/synapse/python_dependencies.py @@ -36,13 +36,16 @@ REQUIREMENTS = { "blist": ["blist"], "pysaml2>=3.0.0,<4.0.0": ["saml2>=3.0.0,<4.0.0"], "pymacaroons-pynacl": ["pymacaroons"], - "lxml>=3.6.0": ["lxml"], "pyjwt": ["jwt"], } CONDITIONAL_REQUIREMENTS = { "web_client": { "matrix_angular_sdk>=0.6.8": ["syweb>=0.6.8"], - } + }, + "preview_url": { + "lxml>=3.6.0": ["lxml"], + "netaddr>=0.7.18": ["netaddr"], + }, } diff --git a/synapse/rest/media/v1/media_repository.py b/synapse/rest/media/v1/media_repository.py index 11f672aeab..97b7e84af9 100644 --- a/synapse/rest/media/v1/media_repository.py +++ b/synapse/rest/media/v1/media_repository.py @@ -79,4 +79,9 @@ class MediaRepositoryResource(Resource): self.putChild("download", DownloadResource(hs, filepaths)) self.putChild("thumbnail", ThumbnailResource(hs, filepaths)) self.putChild("identicon", IdenticonResource()) - self.putChild("preview_url", PreviewUrlResource(hs, filepaths)) + if hs.config.url_preview_enabled: + try: + self.putChild("preview_url", PreviewUrlResource(hs, filepaths)) + except Exception as e: + logger.warn("Failed to mount preview_url") + logger.exception(e) diff --git a/synapse/rest/media/v1/preview_url_resource.py b/synapse/rest/media/v1/preview_url_resource.py index f5ec32d8f2..faa88deb6e 100644 --- a/synapse/rest/media/v1/preview_url_resource.py +++ b/synapse/rest/media/v1/preview_url_resource.py @@ -17,34 +17,52 @@ from .base_resource import BaseMediaResource from twisted.web.server import NOT_DONE_YET from twisted.internet import defer -from lxml import html -from urlparse import urlparse, urlunparse +from urlparse import urlparse, urlsplit, urlunparse -from synapse.api.errors import Codes from synapse.util.stringutils import random_string from synapse.util.caches.expiringcache import ExpiringCache from synapse.http.client import SpiderHttpClient from synapse.http.server import ( - request_handler, respond_with_json, respond_with_json_bytes + request_handler, respond_with_json_bytes ) from synapse.util.async import ObservableDeferred from synapse.util.stringutils import is_ascii import os import re +import fnmatch import cgi import ujson as json import logging logger = logging.getLogger(__name__) +try: + from lxml import html +except ImportError: + pass + class PreviewUrlResource(BaseMediaResource): isLeaf = True def __init__(self, hs, filepaths): + if not html: + logger.warn("Disabling PreviewUrlResource as lxml not available") + raise + + if not hasattr(hs.config, "url_preview_ip_range_blacklist"): + logger.warn( + "For security, you must specify an explicit target IP address " + "blacklist in url_preview_ip_range_blacklist for url previewing " + "to work" + ) + raise + BaseMediaResource.__init__(self, hs, filepaths) self.client = SpiderHttpClient(hs) + if hasattr(hs.config, "url_preview_url_blacklist"): + self.url_preview_url_blacklist = hs.config.url_preview_url_blacklist # simple memory cache mapping urls to OG metadata self.cache = ExpiringCache( @@ -74,6 +92,36 @@ class PreviewUrlResource(BaseMediaResource): else: ts = self.clock.time_msec() + # impose the URL pattern blacklist + if hasattr(self, "url_preview_url_blacklist"): + url_tuple = urlsplit(url) + for entry in self.url_preview_url_blacklist: + match = True + for attrib in entry: + pattern = entry[attrib] + value = getattr(url_tuple, attrib) + logger.debug("Matching attrib '%s' with value '%s' against pattern '%s'" % ( + attrib, value, pattern + )) + + if value is None: + match = False + continue + + if pattern.startswith('^'): + if not re.match(pattern, getattr(url_tuple, attrib)): + match = False + continue + else: + if not fnmatch.fnmatch(getattr(url_tuple, attrib), pattern): + match = False + continue + if match: + logger.warn( + "URL %s blocked by url_blacklist entry %s", url, entry + ) + raise + # first check the memory cache - good to handle all the clients on this # HS thundering away to preview the same URL at the same time. try: @@ -177,17 +225,6 @@ class PreviewUrlResource(BaseMediaResource): respond_with_json_bytes(request, 200, json.dumps(og), send_cors=True) except: - # XXX: if we don't explicitly respond here, the request never returns. - # isn't this what server.py's wrapper is meant to be doing for us? - respond_with_json( - request, - 500, - { - "error": "Internal server error", - "errcode": Codes.UNKNOWN, - }, - send_cors=True - ) raise @defer.inlineCallbacks @@ -282,8 +319,12 @@ class PreviewUrlResource(BaseMediaResource): if meta_description: og['og:description'] = meta_description[0] else: - # text_nodes = tree.xpath("//h1/text() | //h2/text() | //h3/text() | " - # "//p/text() | //div/text() | //span/text() | //a/text()") + # grab any text nodes which are inside the tag... + # unless they are within an HTML5 semantic markup tag... + #
,