diff options
author | Matthew Hodgson <matthew@matrix.org> | 2016-04-08 18:37:15 +0100 |
---|---|---|
committer | Matthew Hodgson <matthew@matrix.org> | 2016-04-08 18:37:15 +0100 |
commit | dafef5a688b8684232346a26a789a2da600ec58e (patch) | |
tree | f2f25fd4d55dec7c0785c670d56af458e647ba2e /synapse/http | |
parent | Merge branch 'develop' into matthew/preview_urls (diff) | |
download | synapse-dafef5a688b8684232346a26a789a2da600ec58e.tar.xz |
Add url_preview_enabled config option to turn on/off preview_url endpoint. defaults to off.
Add url_preview_ip_range_blacklist to let admins specify internal IP ranges that must not be spidered. Add url_preview_url_blacklist to let admins specify URL patterns that must not be spidered. Implement a custom SpiderEndpoint and associated support classes to implement url_preview_ip_range_blacklist Add commentary and generally address PR feedback
Diffstat (limited to 'synapse/http')
-rw-r--r-- | synapse/http/client.py | 44 | ||||
-rw-r--r-- | synapse/http/endpoint.py | 35 |
2 files changed, 72 insertions, 7 deletions
diff --git a/synapse/http/client.py b/synapse/http/client.py index 442b4bb73d..3b8ffcd3ef 100644 --- a/synapse/http/client.py +++ b/synapse/http/client.py @@ -20,10 +20,12 @@ from synapse.api.errors import ( ) from synapse.util.logcontext import preserve_context_over_fn import synapse.metrics +from synapse.http.endpoint import SpiderEndpoint from canonicaljson import encode_canonical_json from twisted.internet import defer, reactor, ssl, protocol +from twisted.internet.endpoints import SSL4ClientEndpoint, TCP4ClientEndpoint from twisted.web.client import ( BrowserLikeRedirectAgent, ContentDecoderAgent, GzipDecoder, Agent, readBody, FileBodyProducer, PartialDownloadError, @@ -364,6 +366,35 @@ class CaptchaServerHttpClient(SimpleHttpClient): defer.returnValue(e.response) +class SpiderEndpointFactory(object): + def __init__(self, hs): + self.blacklist = hs.config.url_preview_ip_range_blacklist + self.policyForHTTPS = hs.get_http_client_context_factory() + + def endpointForURI(self, uri): + logger.info("Getting endpoint for %s", uri.toBytes()) + if uri.scheme == "http": + return SpiderEndpoint( + reactor, uri.host, uri.port, self.blacklist, + endpoint=TCP4ClientEndpoint, + endpoint_kw_args={ + 'timeout': 15 + }, + ) + elif uri.scheme == "https": + tlsPolicy = self.policyForHTTPS.creatorForNetloc(uri.host, uri.port) + return SpiderEndpoint( + reactor, uri.host, uri.port, self.blacklist, + endpoint=SSL4ClientEndpoint, + endpoint_kw_args={ + 'sslContextFactory': tlsPolicy, + 'timeout': 15 + }, + ) + else: + logger.warn("Can't get endpoint for unrecognised scheme %s", uri.scheme) + + class SpiderHttpClient(SimpleHttpClient): """ Separate HTTP client for spidering arbitrary URLs. @@ -375,11 +406,14 @@ class SpiderHttpClient(SimpleHttpClient): def __init__(self, hs): SimpleHttpClient.__init__(self, hs) # clobber the base class's agent and UA: - self.agent = ContentDecoderAgent(BrowserLikeRedirectAgent(Agent( - reactor, - connectTimeout=15, - contextFactory=hs.get_http_client_context_factory() - )), [('gzip', GzipDecoder)]) + self.agent = ContentDecoderAgent( + BrowserLikeRedirectAgent( + Agent.usingEndpointFactory( + reactor, + SpiderEndpointFactory(hs) + ) + ), [('gzip', GzipDecoder)] + ) # We could look like Chrome: # self.user_agent = ("Mozilla/5.0 (%s) (KHTML, like Gecko) # Chrome Safari" % hs.version_string) diff --git a/synapse/http/endpoint.py b/synapse/http/endpoint.py index 4775f6707d..de5c762f50 100644 --- a/synapse/http/endpoint.py +++ b/synapse/http/endpoint.py @@ -74,6 +74,37 @@ def matrix_federation_endpoint(reactor, destination, ssl_context_factory=None, return transport_endpoint(reactor, domain, port, **endpoint_kw_args) +class SpiderEndpoint(object): + """An endpoint which refuses to connect to blacklisted IP addresses + Implements twisted.internet.interfaces.IStreamClientEndpoint. + """ + def __init__(self, reactor, host, port, blacklist, + endpoint=TCP4ClientEndpoint, endpoint_kw_args={}): + self.reactor = reactor + self.host = host + self.port = port + self.blacklist = blacklist + self.endpoint = endpoint + self.endpoint_kw_args = endpoint_kw_args + + @defer.inlineCallbacks + def connect(self, protocolFactory): + address = yield self.reactor.resolve(self.host) + + from netaddr import IPAddress + if IPAddress(address) in self.blacklist: + raise ConnectError( + "Refusing to spider blacklisted IP address %s" % address + ) + + logger.info("Connecting to %s:%s", address, self.port) + endpoint = self.endpoint( + self.reactor, address, self.port, **self.endpoint_kw_args + ) + connection = yield endpoint.connect(protocolFactory) + defer.returnValue(connection) + + class SRVClientEndpoint(object): """An endpoint which looks up SRV records for a service. Cycles through the list of servers starting with each call to connect @@ -118,7 +149,7 @@ class SRVClientEndpoint(object): return self.default_server else: raise ConnectError( - "Not server available for %s", self.service_name + "Not server available for %s" % self.service_name ) min_priority = self.servers[0].priority @@ -166,7 +197,7 @@ def resolve_service(service_name, dns_client=client, cache=SERVER_CACHE): and answers[0].type == dns.SRV and answers[0].payload and answers[0].payload.target == dns.Name('.')): - raise ConnectError("Service %s unavailable", service_name) + raise ConnectError("Service %s unavailable" % service_name) for answer in answers: if answer.type != dns.SRV or not answer.payload: |