summary refs log tree commit diff
diff options
context:
space:
mode:
authorMatthew Hodgson <matthew@matrix.org>2016-03-31 01:55:21 +0100
committerMatthew Hodgson <matthew@matrix.org>2016-03-31 01:55:21 +0100
commita8a5dd3b44a4526307502bd621ee0bd43c87c77f (patch)
tree131834f160f29c89a9ec68b6143de149d6d49273
parentspell out more packages (diff)
downloadsynapse-a8a5dd3b44a4526307502bd621ee0bd43c87c77f.tar.xz
handle requests with missing content-length headers (e.g. YouTube)
-rw-r--r--synapse/http/client.py33
-rw-r--r--synapse/rest/media/v1/preview_url_resource.py4
2 files changed, 28 insertions, 9 deletions
diff --git a/synapse/http/client.py b/synapse/http/client.py
index 71b2e3375e..30f31a915d 100644
--- a/synapse/http/client.py
+++ b/synapse/http/client.py
@@ -23,8 +23,9 @@ from canonicaljson import encode_canonical_json
 
 from twisted.internet import defer, reactor, ssl, protocol
 from twisted.web.client import (
-    RedirectAgent, Agent, readBody, FileBodyProducer, PartialDownloadError,
+    BrowserLikeRedirectAgent, Agent, readBody, FileBodyProducer, PartialDownloadError,
 )
+from twisted.web.http import PotentialDataLoss
 from twisted.web.http_headers import Headers
 from twisted.web._newclient import ResponseDone
 
@@ -59,11 +60,11 @@ class SimpleHttpClient(object):
         # The default context factory in Twisted 14.0.0 (which we require) is
         # BrowserLikePolicyForHTTPS which will do regular cert validation
         # 'like a browser'
-        self.agent = RedirectAgent(Agent(
+        self.agent = Agent(
             reactor,
             connectTimeout=15,
             contextFactory=hs.get_http_client_context_factory()
-        ))
+        )
         self.user_agent = hs.version_string
         if hs.config.user_agent_suffix:
             self.user_agent = "%s %s" % (self.user_agent, hs.config.user_agent_suffix,)
@@ -253,10 +254,6 @@ class SimpleHttpClient(object):
             headers.
         """
 
-        def body_callback(method, url_bytes, headers_dict):
-            self.sign_request(destination, method, url_bytes, headers_dict)
-            return None
-
         response = yield self.request(
             "GET",
             url.encode("ascii"),
@@ -309,6 +306,10 @@ class _ReadBodyToFileProtocol(protocol.Protocol):
     def connectionLost(self, reason):
         if reason.check(ResponseDone):
             self.deferred.callback(self.length)
+        elif reason.check(PotentialDataLoss):
+            # stolen from https://github.com/twisted/treq/pull/49/files
+            # http://twistedmatrix.com/trac/ticket/4840
+            self.deferred.callback(self.length)
         else:
             self.deferred.errback(reason)
 
@@ -350,6 +351,24 @@ class CaptchaServerHttpClient(SimpleHttpClient):
             # twisted dislikes google's response, no content length.
             defer.returnValue(e.response)
 
+class SpiderHttpClient(SimpleHttpClient):
+    """
+    Separate HTTP client for spidering arbitrary URLs.
+    Special in that it follows retries and has a UA that looks
+    like a browser.
+
+    used by the preview_url endpoint in the content repo.
+    """
+    def __init__(self, hs):
+        SimpleHttpClient.__init__(self, hs)
+        # clobber the base class's agent and UA:
+        self.agent = BrowserLikeRedirectAgent(Agent(
+            reactor,
+            connectTimeout=15,
+            contextFactory=hs.get_http_client_context_factory()
+        ))
+        # Look like Chrome for now
+        #self.user_agent = ("Mozilla/5.0 (%s) (KHTML, like Gecko) Chrome Safari" % hs.version_string)
 
 def encode_urlencode_args(args):
     return {k: encode_urlencode_arg(v) for k, v in args.items()}
diff --git a/synapse/rest/media/v1/preview_url_resource.py b/synapse/rest/media/v1/preview_url_resource.py
index b999944e86..ca2529cc10 100644
--- a/synapse/rest/media/v1/preview_url_resource.py
+++ b/synapse/rest/media/v1/preview_url_resource.py
@@ -19,7 +19,7 @@ from twisted.web.server import NOT_DONE_YET
 from twisted.internet import defer
 from lxml import html
 from synapse.util.stringutils import random_string
-from synapse.http.client import SimpleHttpClient
+from synapse.http.client import SpiderHttpClient
 from synapse.http.server import request_handler, respond_with_json, respond_with_json_bytes
 
 import os
@@ -33,7 +33,7 @@ class PreviewUrlResource(BaseMediaResource):
 
     def __init__(self, hs, filepaths):
         BaseMediaResource.__init__(self, hs, filepaths)
-        self.client = SimpleHttpClient(hs)
+        self.client = SpiderHttpClient(hs)
 
     def render_GET(self, request):
         self._async_render_GET(request)