author    | Matthew Hodgson <matthew@matrix.org> | 2016-04-02 00:35:49 +0100
committer | Matthew Hodgson <matthew@matrix.org> | 2016-04-02 00:35:49 +0100
commit    | 5fd07da76473f7a361db4b16b58fc4c21acc4af0 (patch)
tree      | 34988a1522cfa14816a0a27b435b536c5cda3501 /synapse
parent    | fix assorted redirect, unicode and screenscraping bugs (diff)
download  | synapse-5fd07da76473f7a361db4b16b58fc4c21acc4af0.tar.xz
refactor calc_og; spider image URLs; fix xpath; add a (broken) expiringcache; loads of other fixes
Diffstat (limited to 'synapse')
-rw-r--r-- | synapse/rest/media/v1/preview_url_resource.py | 202
1 file changed, 121 insertions, 81 deletions
```diff
diff --git a/synapse/rest/media/v1/preview_url_resource.py b/synapse/rest/media/v1/preview_url_resource.py
index a7ffe593b1..1273472dab 100644
--- a/synapse/rest/media/v1/preview_url_resource.py
+++ b/synapse/rest/media/v1/preview_url_resource.py
@@ -20,6 +20,7 @@ from twisted.internet import defer
 from lxml import html
 from urlparse import urlparse, urlunparse
 from synapse.util.stringutils import random_string
+from synapse.util.caches.expiringcache import ExpiringCache
 from synapse.http.client import SpiderHttpClient
 from synapse.http.server import request_handler, respond_with_json, respond_with_json_bytes
@@ -36,6 +37,12 @@ class PreviewUrlResource(BaseMediaResource):
     def __init__(self, hs, filepaths):
         BaseMediaResource.__init__(self, hs, filepaths)
         self.client = SpiderHttpClient(hs)
+        self.cache = ExpiringCache(
+            cache_name = "url_previews",
+            clock = self.clock,
+            expiry_ms = 60*60*1000, # don't spider URLs more often than once an hour
+        )
+        self.cache.start()
 
     def render_GET(self, request):
         self._async_render_GET(request)
@@ -50,6 +57,11 @@ class PreviewUrlResource(BaseMediaResource):
             requester = yield self.auth.get_user_by_req(request)
             url = request.args.get("url")[0]
 
+            if self.cache:
+                og = self.cache.get(url)
+                respond_with_json_bytes(request, 200, json.dumps(og), send_cors=True)
+                return
+
             # TODO: keep track of whether there's an ongoing request for this preview
             # and block and return their details if there is one.
@@ -74,98 +86,25 @@ class PreviewUrlResource(BaseMediaResource):
             elif self._is_html(media_info['media_type']):
                 # TODO: somehow stop a big HTML tree from exploding synapse's RAM
 
-                def _calc_og():
-                    # suck it up into lxml and define our OG response.
-                    # if we see any URLs in the OG response, then spider them
-                    # (although the client could choose to do this by asking for previews of those URLs to avoid DoSing the server)
-
-                    # "og:type" : "article"
-                    # "og:url" : "https://twitter.com/matrixdotorg/status/684074366691356672"
-                    # "og:title" : "Matrix on Twitter"
-                    # "og:image" : "https://pbs.twimg.com/profile_images/500400952029888512/yI0qtFi7_400x400.png"
-                    # "og:description" : "Synapse 0.12 is out! Lots of polishing, performance &amp; bugfixes: /sync API, /r0 prefix, fulltext search, 3PID invites https://t.co/5alhXLLEGP"
-                    # "og:site_name" : "Twitter"
-
-                    # or:
-
-                    # "og:type" : "video",
-                    # "og:url" : "https://www.youtube.com/watch?v=LXDBoHyjmtw",
-                    # "og:site_name" : "YouTube",
-                    # "og:video:type" : "application/x-shockwave-flash",
-                    # "og:description" : " ",
-                    # "og:title" : "RemoteJam - Matrix team hack for Disrupt Europe Hackathon",
-                    # "og:image" : "https://i.ytimg.com/vi/LXDBoHyjmtw/maxresdefault.jpg",
-                    # "og:video:url" : "http://www.youtube.com/v/LXDBoHyjmtw?version=3&autohide=1",
-                    # "og:video:width" : "1280"
-                    # "og:video:height" : "720",
-                    # "og:video:secure_url": "https://www.youtube.com/v/LXDBoHyjmtw?version=3&autohide=1",
-
-                    og = {}
-                    for tag in tree.xpath("//*/meta[starts-with(@property, 'og:')]"):
-                        og[tag.attrib['property']] = tag.attrib['content']
-
-                    if 'og:title' not in og:
-                        # do some basic spidering of the HTML
-                        title = tree.xpath("(//title)[1] | (//h1)[1] | (//h2)[1] | (//h3)[1]")
-                        og['og:title'] = title[0].text if title else None
-
-
-                    if 'og:image' not in og:
-                        meta_image = tree.xpath("//*/meta[@itemprop='image']/@content");
-                        if meta_image:
-                            og['og:image'] = self._rebase_url(meta_image[0], media_info['uri'])
-                        else:
-                            images = [ i for i in tree.xpath("//img") if 'src' in i.attrib ]
-                            big_images = [ i for i in images if (
-                                'width' in i.attrib and 'height' in i.attrib and
-                                i.attrib['width'] > 64 and i.attrib['height'] > 64
-                            )]
-                            big_images = big_images.sort(key=lambda i: (-1 * int(i.attrib['width']) * int(i.attrib['height'])))
-                            images = big_images if big_images else images
-
-                            if images:
-                                og['og:image'] = self._rebase_url(images[0].attrib['src'], media_info['uri'])
-
-                    if 'og:description' not in og:
-                        meta_description = tree.xpath("//*/meta[@name='description']/@content");
-                        if meta_description:
-                            og['og:description'] = meta_description[0]
-                        else:
-                            text_nodes = tree.xpath("//h1/text() | //h2/text() | //h3/text() | //p/text() | //div/text() | //span/text() | //a/text()")
-                            # text_nodes = tree.xpath("//h1/text() | //h2/text() | //h3/text() | //p/text() | //div/text()")
-                            text = ''
-                            for text_node in text_nodes:
-                                if len(text) < 500:
-                                    text += text_node + ' '
-                                else:
-                                    break
-                            text = re.sub(r'[\t ]+', ' ', text)
-                            text = re.sub(r'[\t \r\n]*[\r\n]+', '\n', text)
-                            text = text.strip()[:500]
-                            og['og:description'] = text if text else None
-
-                    # TODO: extract a favicon?
-                    # TODO: turn any OG media URLs into mxc URLs to capture and thumbnail them too
-                    # TODO: store our OG details in a cache (and expire them when stale)
-                    # TODO: delete the content to stop diskfilling, as we only ever cared about its OG
-                    return og
-
                 try:
                     tree = html.parse(media_info['filename'])
-                    og = _calc_og()
+                    og = yield self._calc_og(tree, media_info, requester)
                 except UnicodeDecodeError:
                     # XXX: evil evil bodge
                     file = open(media_info['filename'])
                     body = file.read()
                     file.close()
                     tree = html.fromstring(body.decode('utf-8','ignore'))
-                    og = _calc_og()
+                    og = yield self._calc_og(tree, media_info, requester)
 
             else:
                 logger.warn("Failed to find any OG data in %s", url)
                 og = {}
 
-            logger.warn(og)
+            if self.cache:
+                self.cache[url] = og
+
+            logger.warn(og);
 
             respond_with_json_bytes(request, 200, json.dumps(og), send_cors=True)
         except:
@@ -182,11 +121,112 @@ class PreviewUrlResource(BaseMediaResource):
             )
             raise
 
+    @defer.inlineCallbacks
+    def _calc_og(self, tree, media_info, requester):
+        # suck our tree into lxml and define our OG response.
+
+        # if we see any image URLs in the OG response, then spider them
+        # (although the client could choose to do this by asking for previews of those URLs to avoid DoSing the server)
+
+        # "og:type" : "article"
+        # "og:url" : "https://twitter.com/matrixdotorg/status/684074366691356672"
+        # "og:title" : "Matrix on Twitter"
+        # "og:image" : "https://pbs.twimg.com/profile_images/500400952029888512/yI0qtFi7_400x400.png"
+        # "og:description" : "Synapse 0.12 is out! Lots of polishing, performance &amp; bugfixes: /sync API, /r0 prefix, fulltext search, 3PID invites https://t.co/5alhXLLEGP"
+        # "og:site_name" : "Twitter"
+
+        # or:
+
+        # "og:type" : "video",
+        # "og:url" : "https://www.youtube.com/watch?v=LXDBoHyjmtw",
+        # "og:site_name" : "YouTube",
+        # "og:video:type" : "application/x-shockwave-flash",
+        # "og:description" : " ",
+        # "og:title" : "RemoteJam - Matrix team hack for Disrupt Europe Hackathon",
+        # "og:image" : "https://i.ytimg.com/vi/LXDBoHyjmtw/maxresdefault.jpg",
+        # "og:video:url" : "http://www.youtube.com/v/LXDBoHyjmtw?version=3&autohide=1",
+        # "og:video:width" : "1280"
+        # "og:video:height" : "720",
+        # "og:video:secure_url": "https://www.youtube.com/v/LXDBoHyjmtw?version=3&autohide=1",
+
+        og = {}
+        for tag in tree.xpath("//*/meta[starts-with(@property, 'og:')]"):
+            og[tag.attrib['property']] = tag.attrib['content']
+
+        # TODO: grab article: meta tags too, e.g.:
+
+        # <meta property="article:publisher" content="https://www.facebook.com/thethudonline" />
+        # <meta property="article:author" content="https://www.facebook.com/thethudonline" />
+        # <meta property="article:tag" content="baby" />
+        # <meta property="article:section" content="Breaking News" />
+        # <meta property="article:published_time" content="2016-03-31T19:58:24+00:00" />
+        # <meta property="article:modified_time" content="2016-04-01T18:31:53+00:00" />
+
+        if 'og:title' not in og:
+            # do some basic spidering of the HTML
+            title = tree.xpath("(//title)[1] | (//h1)[1] | (//h2)[1] | (//h3)[1]")
+            og['og:title'] = title[0].text.strip() if title else None
+
+
+        if 'og:image' not in og:
+            # TODO: extract a favicon failing all else
+            meta_image = tree.xpath("//*/meta[@itemprop='image']/@content");
+            if meta_image:
+                og['og:image'] = self._rebase_url(meta_image[0], media_info['uri'])
+            else:
+                images = tree.xpath("//img[@src][number(@width)>10][number(@height)>10]")
+                images = sorted(images, key=lambda i: (-1 * int(i.attrib['width']) * int(i.attrib['height'])))
+                if not images:
+                    images = tree.xpath("//img[@src]")
+                if images:
+                    og['og:image'] = self._rebase_url(images[0].attrib['src'], media_info['uri'])
+
+        # pre-cache the image for posterity
+        if 'og:image' in og and og['og:image']:
+            image_info = yield self._download_url(og['og:image'], requester.user)
+
+            if self._is_media(image_info['media_type']):
+                # TODO: make sure we don't choke on white-on-transparent images
+                dims = yield self._generate_local_thumbnails(
+                    image_info['filesystem_id'], image_info
+                )
+                og["og:image"] = "mxc://%s/%s" % (self.server_name, image_info['filesystem_id'])
+                og["og:image:type"] = image_info['media_type']
+                og["og:image:width"] = dims['width']
+                og["og:image:height"] = dims['height']
+            else:
+                del og["og:image"]
+
+        if 'og:description' not in og:
+            meta_description = tree.xpath("//*/meta[@name='description']/@content");
+            if meta_description:
+                og['og:description'] = meta_description[0]
+            else:
+                # text_nodes = tree.xpath("//h1/text() | //h2/text() | //h3/text() | //p/text() | //div/text() | //span/text() | //a/text()")
+                text_nodes = tree.xpath("//text()[not(ancestor::header | ancestor::nav | ancestor::aside | " +
+                                        "ancestor::footer | ancestor::script | ancestor::style)]" +
+                                        "[ancestor::body]")
+                text = ''
+                for text_node in text_nodes:
+                    if len(text) < 500:
+                        text += text_node + ' '
+                    else:
+                        break
+                text = re.sub(r'[\t ]+', ' ', text)
+                text = re.sub(r'[\t \r\n]*[\r\n]+', '\n', text)
+                text = text.strip()[:500]
+                og['og:description'] = text if text else None
+
+        # TODO: persist a cache mapping { url, etag } -> { og, mxc of url (if we bother keeping it around), age }
+        # TODO: delete the url downloads to stop diskfilling, as we only ever cared about its OG
+        defer.returnValue(og);
+
     def _rebase_url(self, url, base):
         base = list(urlparse(base))
         url = list(urlparse(url))
-        if not url[0] and not url[1]:
-            url[0] = base[0]
+        if not url[0]:
+            url[0] = base[0] or "http"
+        if not url[1]:
             url[1] = base[1]
             if not url[2].startswith('/'):
                 url[2] = re.sub(r'/[^/]+$', '/', base[2]) + url[2]
```