diff --git a/synapse/rest/media/v1/media_repository.py b/synapse/rest/media/v1/media_repository.py
index 11f672aeab..97b7e84af9 100644
--- a/synapse/rest/media/v1/media_repository.py
+++ b/synapse/rest/media/v1/media_repository.py
@@ -79,4 +79,9 @@ class MediaRepositoryResource(Resource):
self.putChild("download", DownloadResource(hs, filepaths))
self.putChild("thumbnail", ThumbnailResource(hs, filepaths))
self.putChild("identicon", IdenticonResource())
- self.putChild("preview_url", PreviewUrlResource(hs, filepaths))
+ if hs.config.url_preview_enabled:
+ try:
+ self.putChild("preview_url", PreviewUrlResource(hs, filepaths))
+        except Exception:
+            # url previews are optional; log (with traceback) and keep starting up
+            logger.exception("Failed to mount preview_url")
diff --git a/synapse/rest/media/v1/preview_url_resource.py b/synapse/rest/media/v1/preview_url_resource.py
index f5ec32d8f2..faa88deb6e 100644
--- a/synapse/rest/media/v1/preview_url_resource.py
+++ b/synapse/rest/media/v1/preview_url_resource.py
@@ -17,34 +17,52 @@ from .base_resource import BaseMediaResource
from twisted.web.server import NOT_DONE_YET
from twisted.internet import defer
-from lxml import html
-from urlparse import urlparse, urlunparse
+from urlparse import urlparse, urlsplit, urlunparse
-from synapse.api.errors import Codes
from synapse.util.stringutils import random_string
from synapse.util.caches.expiringcache import ExpiringCache
from synapse.http.client import SpiderHttpClient
from synapse.http.server import (
- request_handler, respond_with_json, respond_with_json_bytes
+ request_handler, respond_with_json_bytes
)
from synapse.util.async import ObservableDeferred
from synapse.util.stringutils import is_ascii
import os
import re
+import fnmatch
import cgi
import ujson as json
import logging
logger = logging.getLogger(__name__)
+try:
+ from lxml import html
+except ImportError:
+    html = None  # sentinel: lets the availability check below be a clean boolean test
+
class PreviewUrlResource(BaseMediaResource):
isLeaf = True
def __init__(self, hs, filepaths):
+ if not html:
+ logger.warn("Disabling PreviewUrlResource as lxml not available")
+            raise RuntimeError("Disabled PreviewUrlResource because lxml is not available")
+
+ if not hasattr(hs.config, "url_preview_ip_range_blacklist"):
+ logger.warn(
+ "For security, you must specify an explicit target IP address "
+ "blacklist in url_preview_ip_range_blacklist for url previewing "
+ "to work"
+ )
+            raise RuntimeError("url_preview_ip_range_blacklist not specified in config")
+
BaseMediaResource.__init__(self, hs, filepaths)
self.client = SpiderHttpClient(hs)
+ if hasattr(hs.config, "url_preview_url_blacklist"):
+ self.url_preview_url_blacklist = hs.config.url_preview_url_blacklist
# simple memory cache mapping urls to OG metadata
self.cache = ExpiringCache(
@@ -74,6 +92,36 @@ class PreviewUrlResource(BaseMediaResource):
else:
ts = self.clock.time_msec()
+ # impose the URL pattern blacklist
+ if hasattr(self, "url_preview_url_blacklist"):
+ url_tuple = urlsplit(url)
+ for entry in self.url_preview_url_blacklist:
+ match = True
+ for attrib in entry:
+ pattern = entry[attrib]
+ value = getattr(url_tuple, attrib)
+ logger.debug("Matching attrib '%s' with value '%s' against pattern '%s'" % (
+ attrib, value, pattern
+ ))
+
+                    if value is None:
+                        match = False
+                        break
+
+                    if pattern.startswith('^'):
+                        if not re.match(pattern, value):
+                            match = False
+                            break
+                    else:
+                        if not fnmatch.fnmatch(value, pattern):
+                            match = False
+                            break
+ if match:
+ logger.warn(
+ "URL %s blocked by url_blacklist entry %s", url, entry
+ )
+                    raise Exception("URL blocked by url pattern blacklist entry")
+
# first check the memory cache - good to handle all the clients on this
# HS thundering away to preview the same URL at the same time.
try:
@@ -177,17 +225,6 @@ class PreviewUrlResource(BaseMediaResource):
respond_with_json_bytes(request, 200, json.dumps(og), send_cors=True)
except:
- # XXX: if we don't explicitly respond here, the request never returns.
- # isn't this what server.py's wrapper is meant to be doing for us?
- respond_with_json(
- request,
- 500,
- {
- "error": "Internal server error",
- "errcode": Codes.UNKNOWN,
- },
- send_cors=True
- )
raise
@defer.inlineCallbacks
@@ -282,8 +319,12 @@ class PreviewUrlResource(BaseMediaResource):
if meta_description:
og['og:description'] = meta_description[0]
else:
- # text_nodes = tree.xpath("//h1/text() | //h2/text() | //h3/text() | "
- # "//p/text() | //div/text() | //span/text() | //a/text()")
+ # grab any text nodes which are inside the <body/> tag...
+ # unless they are within an HTML5 semantic markup tag...
+ # <header/>, <nav/>, <aside/>, <footer/>
+ # ...or if they are within a <script/> or <style/> tag.
+ # This is a very very very coarse approximation to a plain text
+ # render of the page.
text_nodes = tree.xpath("//text()[not(ancestor::header | ancestor::nav | "
"ancestor::aside | ancestor::footer | "
"ancestor::script | ancestor::style)]" +
|