-rw-r--r--  synapse/config/repository.py                    18
-rw-r--r--  synapse/http/client.py                           5
-rw-r--r--  synapse/http/endpoint.py                        17
-rw-r--r--  synapse/rest/media/v1/preview_url_resource.py   64
4 files changed, 61 insertions(+), 43 deletions(-)
diff --git a/synapse/config/repository.py b/synapse/config/repository.py
index d61e525e62..8810079848 100644
--- a/synapse/config/repository.py
+++ b/synapse/config/repository.py
@@ -100,8 +100,13 @@ class ContentRepositoryConfig(Config):
                     "to work"
                 )
 
-            if "url_preview_url_blacklist" in config:
-                self.url_preview_url_blacklist = config["url_preview_url_blacklist"]
+            self.url_preview_ip_range_whitelist = IPSet(
+                config.get("url_preview_ip_range_whitelist", ())
+            )
+
+            self.url_preview_url_blacklist = config.get(
+                "url_preview_url_blacklist", ()
+            )
 
     def default_config(self, **kwargs):
         media_store = self.default_path("media_store")
@@ -162,6 +167,15 @@ class ContentRepositoryConfig(Config):
         # - '10.0.0.0/8'
         # - '172.16.0.0/12'
         # - '192.168.0.0/16'
+        #
+        # List of IP address CIDR ranges that the URL preview spider is allowed
+        # to access even if they are specified in url_preview_ip_range_blacklist.
+        # This is useful for specifying exceptions to wide-ranging blacklisted
+        # target IP ranges, e.g. to enable URL previews for a specific private
+        # website which is only visible in your network.
+        #
+        # url_preview_ip_range_whitelist:
+        # - '192.168.1.1'
 
         # Optional list of URL matches that the URL preview spider is
         # denied from accessing.  You should use url_preview_ip_range_blacklist
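
Taken together, the whitelist only matters for addresses the blacklist would otherwise reject. A minimal sketch of that precedence, using netaddr just as the patch does (the ranges below are illustrative, not shipped defaults):

    from netaddr import IPAddress, IPSet

    blacklist = IPSet(['10.0.0.0/8'])   # url_preview_ip_range_blacklist
    whitelist = IPSet(['10.1.2.3'])     # url_preview_ip_range_whitelist

    def spider_allowed(address):
        # Mirrors the check SpiderEndpoint.connect() performs below: a
        # blacklisted address is still allowed if it is also whitelisted.
        ip = IPAddress(address)
        return not (ip in blacklist and ip not in whitelist)

    assert spider_allowed('10.1.2.3')       # whitelisted exception
    assert not spider_allowed('10.9.9.9')   # caught by the blacklist
    assert spider_allowed('93.184.216.34')  # never blacklisted
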
diff --git a/synapse/http/client.py b/synapse/http/client.py
index 902ae7a203..c7fa692435 100644
--- a/synapse/http/client.py
+++ b/synapse/http/client.py
@@ -380,13 +380,14 @@ class CaptchaServerHttpClient(SimpleHttpClient):
 class SpiderEndpointFactory(object):
     def __init__(self, hs):
         self.blacklist = hs.config.url_preview_ip_range_blacklist
+        self.whitelist = hs.config.url_preview_ip_range_whitelist
         self.policyForHTTPS = hs.get_http_client_context_factory()
 
     def endpointForURI(self, uri):
         logger.info("Getting endpoint for %s", uri.toBytes())
         if uri.scheme == "http":
             return SpiderEndpoint(
-                reactor, uri.host, uri.port, self.blacklist,
+                reactor, uri.host, uri.port, self.blacklist, self.whitelist,
                 endpoint=TCP4ClientEndpoint,
                 endpoint_kw_args={
                     'timeout': 15
@@ -395,7 +396,7 @@ class SpiderEndpointFactory(object):
         elif uri.scheme == "https":
             tlsPolicy = self.policyForHTTPS.creatorForNetloc(uri.host, uri.port)
             return SpiderEndpoint(
-                reactor, uri.host, uri.port, self.blacklist,
+                reactor, uri.host, uri.port, self.blacklist, self.whitelist,
                 endpoint=SSL4ClientEndpoint,
                 endpoint_kw_args={
                     'sslContextFactory': tlsPolicy,
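
The factory above only decides how connections are made; it is the spidering HTTP client that consults it for every request. A hedged sketch of the usual wiring (Agent.usingEndpointFactory is stock Twisted API; the hs argument is the Synapse HomeServer object the factory's constructor expects, and SpiderHttpClient's real internals may differ):

    from twisted.internet import reactor
    from twisted.web.client import Agent

    def build_spider_agent(hs):
        # Every request issued through this agent obtains its connection
        # via SpiderEndpointFactory.endpointForURI(), so the IP range
        # check runs before any socket is opened.
        return Agent.usingEndpointFactory(reactor, SpiderEndpointFactory(hs))
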
diff --git a/synapse/http/endpoint.py b/synapse/http/endpoint.py
index a456dc19da..442696d393 100644
--- a/synapse/http/endpoint.py
+++ b/synapse/http/endpoint.py
@@ -79,12 +79,14 @@ class SpiderEndpoint(object):
-    """An endpoint which refuses to connect to blacklisted IP addresses
+    """An endpoint which refuses to connect to blacklisted IP addresses,
+    unless the address is also whitelisted.
     Implements twisted.internet.interfaces.IStreamClientEndpoint.
     """
-    def __init__(self, reactor, host, port, blacklist,
+    def __init__(self, reactor, host, port, blacklist, whitelist,
                  endpoint=TCP4ClientEndpoint, endpoint_kw_args={}):
         self.reactor = reactor
         self.host = host
         self.port = port
         self.blacklist = blacklist
+        self.whitelist = whitelist
         self.endpoint = endpoint
         self.endpoint_kw_args = endpoint_kw_args
 
@@ -93,10 +95,13 @@
         address = yield self.reactor.resolve(self.host)
 
         from netaddr import IPAddress
-        if IPAddress(address) in self.blacklist:
-            raise ConnectError(
-                "Refusing to spider blacklisted IP address %s" % address
-            )
+        ip_address = IPAddress(address)
+
+        if ip_address in self.blacklist:
+            if self.whitelist is None or ip_address not in self.whitelist:
+                raise ConnectError(
+                    "Refusing to spider blacklisted IP address %s" % address
+                )
 
         logger.info("Connecting to %s:%s", address, self.port)
         endpoint = self.endpoint(
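
To make the connect()-time behaviour concrete, here is a hypothetical construction of the endpoint (the hostname and ranges are invented; the keyword names match the signature above):

    from netaddr import IPSet
    from twisted.internet import reactor
    from twisted.internet.endpoints import TCP4ClientEndpoint

    endpoint = SpiderEndpoint(
        reactor, 'intranet.example', 80,
        blacklist=IPSet(['10.0.0.0/8']),
        whitelist=IPSet(['10.1.2.3']),
        endpoint=TCP4ClientEndpoint,
    )
    # connect() first resolves 'intranet.example'. If it resolves to
    # 10.1.2.3 the whitelist overrides the blacklist and the connection
    # proceeds; any other 10.x.x.x address raises ConnectError before a
    # socket is ever opened.
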
diff --git a/synapse/rest/media/v1/preview_url_resource.py b/synapse/rest/media/v1/preview_url_resource.py
index dc1e5fbdb3..37dd1de899 100644
--- a/synapse/rest/media/v1/preview_url_resource.py
+++ b/synapse/rest/media/v1/preview_url_resource.py
@@ -56,8 +56,7 @@ class PreviewUrlResource(Resource):
         self.client = SpiderHttpClient(hs)
         self.media_repo = media_repo
 
-        if hasattr(hs.config, "url_preview_url_blacklist"):
-            self.url_preview_url_blacklist = hs.config.url_preview_url_blacklist
+        self.url_preview_url_blacklist = hs.config.url_preview_url_blacklist
 
         # simple memory cache mapping urls to OG metadata
         self.cache = ExpiringCache(
@@ -86,39 +85,38 @@
         else:
             ts = self.clock.time_msec()
 
-        # impose the URL pattern blacklist
-        if hasattr(self, "url_preview_url_blacklist"):
-            url_tuple = urlparse.urlsplit(url)
-            for entry in self.url_preview_url_blacklist:
-                match = True
-                for attrib in entry:
-                    pattern = entry[attrib]
-                    value = getattr(url_tuple, attrib)
-                    logger.debug((
-                        "Matching attrib '%s' with value '%s' against"
-                        " pattern '%s'"
-                    ) % (attrib, value, pattern))
-
-                    if value is None:
+        # impose the URL pattern blacklist
+        url_tuple = urlparse.urlsplit(url)
+        for entry in self.url_preview_url_blacklist:
+            match = True
+            for attrib in entry:
+                pattern = entry[attrib]
+                value = getattr(url_tuple, attrib)
+                logger.debug((
+                    "Matching attrib '%s' with value '%s' against"
+                    " pattern '%s'"
+                ) % (attrib, value, pattern))
+
+                if value is None:
+                    match = False
+                    continue
+
+                if pattern.startswith('^'):
+                    if not re.match(pattern, getattr(url_tuple, attrib)):
                         match = False
                         continue
-
-                    if pattern.startswith('^'):
-                        if not re.match(pattern, getattr(url_tuple, attrib)):
-                            match = False
-                            continue
-                    else:
-                        if not fnmatch.fnmatch(getattr(url_tuple, attrib), pattern):
-                            match = False
-                            continue
-                if match:
-                    logger.warn(
-                        "URL %s blocked by url_blacklist entry %s", url, entry
-                    )
-                    raise SynapseError(
-                        403, "URL blocked by url pattern blacklist entry",
-                        Codes.UNKNOWN
-                    )
+                else:
+                    if not fnmatch.fnmatch(getattr(url_tuple, attrib), pattern):
+                        match = False
+                        continue
+            if match:
+                logger.warn(
+                    "URL %s blocked by url_blacklist entry %s", url, entry
+                )
+                raise SynapseError(
+                    403, "URL blocked by url pattern blacklist entry",
+                    Codes.UNKNOWN
+                )
 
         # first check the memory cache - good to handle all the clients on this
         # HS thundering away to preview the same URL at the same time.
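
For reference, a self-contained re-creation of the matching rules implemented by the loop above (the blacklist entries here are hypothetical; the attribute names are those of urlparse.urlsplit() results). A pattern starting with '^' is treated as a regex via re.match, anything else as a shell glob via fnmatch, and every attribute in an entry must match for the URL to be rejected:

    import fnmatch
    import re
    import urlparse  # Python 2, as in the module above

    url_preview_url_blacklist = [
        {'netloc': '*.example.com'},               # glob on the host
        {'scheme': 'http', 'path': '^/private/'},  # regex on the path
    ]

    def is_blocked(url):
        parts = urlparse.urlsplit(url)
        for entry in url_preview_url_blacklist:
            if all(
                re.match(pattern, getattr(parts, attrib) or '')
                if pattern.startswith('^')
                else fnmatch.fnmatch(getattr(parts, attrib) or '', pattern)
                for attrib, pattern in entry.items()
            ):
                return True
        return False

    assert is_blocked('https://www.example.com/page')  # host glob
    assert is_blocked('http://host.test/private/x')    # scheme + path
    assert not is_blocked('https://matrix.org/')
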