3 files changed, 166 insertions, 43 deletions
diff --git a/synapse/rest/media/v1/filepath.py b/synapse/rest/media/v1/filepath.py
index 422ab86fb3..0137458f71 100644
--- a/synapse/rest/media/v1/filepath.py
+++ b/synapse/rest/media/v1/filepath.py
@@ -65,3 +65,9 @@ class MediaFilePaths(object):
             file_id[0:2], file_id[2:4], file_id[4:],
             file_name
         )
+
+    def remote_media_thumbnail_dir(self, server_name, file_id):
+        return os.path.join(
+            self.base_path, "remote_thumbnail", server_name,
+            file_id[0:2], file_id[2:4], file_id[4:],
+        )
diff --git a/synapse/rest/media/v1/media_repository.py b/synapse/rest/media/v1/media_repository.py
index d96bf9afe2..692e078419 100644
--- a/synapse/rest/media/v1/media_repository.py
+++ b/synapse/rest/media/v1/media_repository.py
@@ -26,14 +26,17 @@ from .thumbnailer import Thumbnailer
 
 from synapse.http.matrixfederationclient import MatrixFederationHttpClient
 from synapse.util.stringutils import random_string
+from synapse.api.errors import SynapseError
 
 from twisted.internet import defer, threads
 
-from synapse.util.async import ObservableDeferred
+from synapse.util.async import Linearizer
 from synapse.util.stringutils import is_ascii
 from synapse.util.logcontext import preserve_context_over_fn
 
 import os
+import errno
+import shutil
 
 import cgi
 import logging
@@ -42,8 +45,11 @@ import urlparse
 logger = logging.getLogger(__name__)
 
 
+UPDATE_RECENTLY_ACCESSED_REMOTES_TS = 60 * 1000
+
+
 class MediaRepository(object):
-    def __init__(self, hs, filepaths):
+    def __init__(self, hs):
         self.auth = hs.get_auth()
         self.client = MatrixFederationHttpClient(hs)
         self.clock = hs.get_clock()
@@ -51,11 +57,28 @@ class MediaRepository(object):
         self.store = hs.get_datastore()
         self.max_upload_size = hs.config.max_upload_size
         self.max_image_pixels = hs.config.max_image_pixels
-        self.filepaths = filepaths
-        self.downloads = {}
+        self.filepaths = MediaFilePaths(hs.config.media_store_path)
         self.dynamic_thumbnails = hs.config.dynamic_thumbnails
         self.thumbnail_requirements = hs.config.thumbnail_requirements
 
+        self.remote_media_linearizer = Linearizer()
+
+        self.recently_accessed_remotes = set()
+
+        self.clock.looping_call(
+            self._update_recently_accessed_remotes,
+            UPDATE_RECENTLY_ACCESSED_REMOTES_TS
+        )
+
+    @defer.inlineCallbacks
+    def _update_recently_accessed_remotes(self):
+        media = self.recently_accessed_remotes
+        self.recently_accessed_remotes = set()
+
+        yield self.store.update_cached_last_access_time(
+            media, self.clock.time_msec()
+        )
+
     @staticmethod
     def _makedirs(filepath):
         dirname = os.path.dirname(filepath)
@@ -92,22 +115,12 @@ class MediaRepository(object):
 
         defer.returnValue("mxc://%s/%s" % (self.server_name, media_id))
 
+    @defer.inlineCallbacks
     def get_remote_media(self, server_name, media_id):
         key = (server_name, media_id)
-        download = self.downloads.get(key)
-        if download is None:
-            download = self._get_remote_media_impl(server_name, media_id)
-            download = ObservableDeferred(
-                download,
-                consumeErrors=True
-            )
-            self.downloads[key] = download
-
-            @download.addBoth
-            def callback(media_info):
-                del self.downloads[key]
-                return media_info
-        return download.observe()
+        with (yield self.remote_media_linearizer.queue(key)):
+            media_info = yield self._get_remote_media_impl(server_name, media_id)
+        defer.returnValue(media_info)
 
     @defer.inlineCallbacks
     def _get_remote_media_impl(self, server_name, media_id):
@@ -118,6 +131,11 @@ class MediaRepository(object):
             media_info = yield self._download_remote_file(
                 server_name, media_id
             )
+        else:
+            self.recently_accessed_remotes.add((server_name, media_id))
+            yield self.store.update_cached_last_access_time(
+                [(server_name, media_id)], self.clock.time_msec()
+            )
         defer.returnValue(media_info)
 
     @defer.inlineCallbacks
@@ -134,10 +152,15 @@ class MediaRepository(object):
                 request_path = "/".join((
                     "/_matrix/media/v1/download", server_name, media_id,
                 ))
-                length, headers = yield self.client.get_file(
-                    server_name, request_path, output_stream=f,
-                    max_size=self.max_upload_size,
-                )
+                try:
+                    length, headers = yield self.client.get_file(
+                        server_name, request_path, output_stream=f,
+                        max_size=self.max_upload_size,
+                    )
+                except Exception as e:
+                    logger.warn("Failed to fetch remoted media %r", e)
+                    raise SynapseError(502, "Failed to fetch remoted media")
+
             media_type = headers["Content-Type"][0]
             time_now_ms = self.clock.time_msec()
 
@@ -410,6 +433,41 @@ class MediaRepository(object):
             "height": m_height,
         })
 
+    @defer.inlineCallbacks
+    def delete_old_remote_media(self, before_ts):
+        old_media = yield self.store.get_remote_media_before(before_ts)
+
+        deleted = 0
+
+        for media in old_media:
+            origin = media["media_origin"]
+            media_id = media["media_id"]
+            file_id = media["filesystem_id"]
+            key = (origin, media_id)
+
+            logger.info("Deleting: %r", key)
+
+            with (yield self.remote_media_linearizer.queue(key)):
+                full_path = self.filepaths.remote_media_filepath(origin, file_id)
+                try:
+                    os.remove(full_path)
+                except OSError as e:
+                    logger.warn("Failed to remove file: %r", full_path)
+                    if e.errno == errno.ENOENT:
+                        pass
+                    else:
+                        continue
+
+                thumbnail_dir = self.filepaths.remote_media_thumbnail_dir(
+                    origin, file_id
+                )
+                shutil.rmtree(thumbnail_dir, ignore_errors=True)
+
+                yield self.store.delete_remote_media(origin, media_id)
+                deleted += 1
+
+        defer.returnValue({"deleted": deleted})
+
 
 class MediaRepositoryResource(Resource):
     """File uploading and downloading.
@@ -458,9 +516,8 @@ class MediaRepositoryResource(Resource):
 
     def __init__(self, hs):
         Resource.__init__(self)
-        filepaths = MediaFilePaths(hs.config.media_store_path)
 
-        media_repo = MediaRepository(hs, filepaths)
+        media_repo = hs.get_media_repository()
 
         self.putChild("upload", UploadResource(hs, media_repo))
         self.putChild("download", DownloadResource(hs, media_repo))
diff --git a/synapse/rest/media/v1/preview_url_resource.py b/synapse/rest/media/v1/preview_url_resource.py
index 37dd1de899..bdd0e60c5b 100644
--- a/synapse/rest/media/v1/preview_url_resource.py
+++ b/synapse/rest/media/v1/preview_url_resource.py
@@ -29,6 +29,8 @@ from synapse.http.server import (
 from synapse.util.async import ObservableDeferred
 from synapse.util.stringutils import is_ascii
 
+from copy import deepcopy
+
 import os
 import re
 import fnmatch
@@ -252,7 +254,8 @@ class PreviewUrlResource(Resource):
 
         og = {}
         for tag in tree.xpath("//*/meta[starts-with(@property, 'og:')]"):
-            og[tag.attrib['property']] = tag.attrib['content']
+            if 'content' in tag.attrib:
+                og[tag.attrib['property']] = tag.attrib['content']
 
         # TODO: grab article: meta tags too, e.g.:
 
@@ -279,7 +282,7 @@ class PreviewUrlResource(Resource):
                 # TODO: consider inlined CSS styles as well as width & height attribs
                 images = tree.xpath("//img[@src][number(@width)>10][number(@height)>10]")
                 images = sorted(images, key=lambda i: (
-                    -1 * int(i.attrib['width']) * int(i.attrib['height'])
+                    -1 * float(i.attrib['width']) * float(i.attrib['height'])
                 ))
                 if not images:
                     images = tree.xpath("//img[@src]")
@@ -287,9 +290,9 @@ class PreviewUrlResource(Resource):
                     og['og:image'] = images[0].attrib['src']
 
         # pre-cache the image for posterity
-        # FIXME: it might be cleaner to use the same flow as the main /preview_url request
-        # itself and benefit from the same caching etc.  But for now we just rely on the
-        # caching on the master request to speed things up.
+        # FIXME: it might be cleaner to use the same flow as the main /preview_url
+        # request itself and benefit from the same caching etc.  But for now we
+        # just rely on the caching on the master request to speed things up.
         if 'og:image' in og and og['og:image']:
             image_info = yield self._download_url(
                 self._rebase_url(og['og:image'], media_info['uri']), requester.user
@@ -328,20 +331,24 @@ class PreviewUrlResource(Resource):
                 # ...or if they are within a <script/> or <style/> tag.
                 # This is a very very very coarse approximation to a plain text
                 # render of the page.
-                text_nodes = tree.xpath("//text()[not(ancestor::header | ancestor::nav | "
-                                        "ancestor::aside | ancestor::footer | "
-                                        "ancestor::script | ancestor::style)]" +
-                                        "[ancestor::body]")
-                text = ''
-                for text_node in text_nodes:
-                    if len(text) < 500:
-                        text += text_node + ' '
-                    else:
-                        break
-                text = re.sub(r'[\t ]+', ' ', text)
-                text = re.sub(r'[\t \r\n]*[\r\n]+', '\n', text)
-                text = text.strip()[:500]
-                og['og:description'] = text if text else None
+
+                # We don't just use XPATH here as that is slow on some machines.
+
+                # We clone `tree` as we modify it.
+                cloned_tree = deepcopy(tree.find("body"))
+
+                TAGS_TO_REMOVE = ("header", "nav", "aside", "footer", "script", "style",)
+                for el in cloned_tree.iter(TAGS_TO_REMOVE):
+                    el.getparent().remove(el)
+
+                # Split all the text nodes into paragraphs (by splitting on new
+                # lines)
+                text_nodes = (
+                    re.sub(r'\s+', '\n', el.text).strip()
+                    for el in cloned_tree.iter()
+                    if el.text and isinstance(el.tag, basestring)  # Removes comments
+                )
+                og['og:description'] = summarize_paragraphs(text_nodes)
 
         # TODO: delete the url downloads to stop diskfilling,
         # as we only ever cared about its OG
@@ -449,3 +456,56 @@ class PreviewUrlResource(Resource):
             content_type.startswith("application/xhtml")
         ):
             return True
+
+
+def summarize_paragraphs(text_nodes, min_size=200, max_size=500):
+    # Try to get a summary of between 200 and 500 words, respecting
+    # first paragraph and then word boundaries.
+    # TODO: Respect sentences?
+
+    description = ''
+
+    # Keep adding paragraphs until we get to the MIN_SIZE.
+    for text_node in text_nodes:
+        if len(description) < min_size:
+            text_node = re.sub(r'[\t \r\n]+', ' ', text_node)
+            description += text_node + '\n\n'
+        else:
+            break
+
+    description = description.strip()
+    description = re.sub(r'[\t ]+', ' ', description)
+    description = re.sub(r'[\t \r\n]*[\r\n]+', '\n\n', description)
+
+    # If the concatenation of paragraphs to get above MIN_SIZE
+    # took us over MAX_SIZE, then we need to truncate mid paragraph
+    if len(description) > max_size:
+        new_desc = ""
+
+        # This splits the paragraph into words, but keeping the
+        # (preceeding) whitespace intact so we can easily concat
+        # words back together.
+        for match in re.finditer("\s*\S+", description):
+            word = match.group()
+
+            # Keep adding words while the total length is less than
+            # MAX_SIZE.
+            if len(word) + len(new_desc) < max_size:
+                new_desc += word
+            else:
+                # At this point the next word *will* take us over
+                # MAX_SIZE, but we also want to ensure that its not
+                # a huge word. If it is add it anyway and we'll
+                # truncate later.
+                if len(new_desc) < min_size:
+                    new_desc += word
+                break
+
+        # Double check that we're not over the limit
+        if len(new_desc) > max_size:
+            new_desc = new_desc[:max_size]
+
+        # We always add an ellipsis because at the very least
+        # we chopped mid paragraph.
+        description = new_desc.strip() + "…"
+    return description if description else None