diff options
Diffstat (limited to 'synapse/rest/media/v1')
-rw-r--r-- | synapse/rest/media/v1/filepath.py | 6 | ||||
-rw-r--r-- | synapse/rest/media/v1/media_repository.py | 105 | ||||
-rw-r--r-- | synapse/rest/media/v1/preview_url_resource.py | 98 |
3 files changed, 166 insertions, 43 deletions
diff --git a/synapse/rest/media/v1/filepath.py b/synapse/rest/media/v1/filepath.py index 422ab86fb3..0137458f71 100644 --- a/synapse/rest/media/v1/filepath.py +++ b/synapse/rest/media/v1/filepath.py @@ -65,3 +65,9 @@ class MediaFilePaths(object): file_id[0:2], file_id[2:4], file_id[4:], file_name ) + + def remote_media_thumbnail_dir(self, server_name, file_id): + return os.path.join( + self.base_path, "remote_thumbnail", server_name, + file_id[0:2], file_id[2:4], file_id[4:], + ) diff --git a/synapse/rest/media/v1/media_repository.py b/synapse/rest/media/v1/media_repository.py index d96bf9afe2..692e078419 100644 --- a/synapse/rest/media/v1/media_repository.py +++ b/synapse/rest/media/v1/media_repository.py @@ -26,14 +26,17 @@ from .thumbnailer import Thumbnailer from synapse.http.matrixfederationclient import MatrixFederationHttpClient from synapse.util.stringutils import random_string +from synapse.api.errors import SynapseError from twisted.internet import defer, threads -from synapse.util.async import ObservableDeferred +from synapse.util.async import Linearizer from synapse.util.stringutils import is_ascii from synapse.util.logcontext import preserve_context_over_fn import os +import errno +import shutil import cgi import logging @@ -42,8 +45,11 @@ import urlparse logger = logging.getLogger(__name__) +UPDATE_RECENTLY_ACCESSED_REMOTES_TS = 60 * 1000 + + class MediaRepository(object): - def __init__(self, hs, filepaths): + def __init__(self, hs): self.auth = hs.get_auth() self.client = MatrixFederationHttpClient(hs) self.clock = hs.get_clock() @@ -51,11 +57,28 @@ class MediaRepository(object): self.store = hs.get_datastore() self.max_upload_size = hs.config.max_upload_size self.max_image_pixels = hs.config.max_image_pixels - self.filepaths = filepaths - self.downloads = {} + self.filepaths = MediaFilePaths(hs.config.media_store_path) self.dynamic_thumbnails = hs.config.dynamic_thumbnails self.thumbnail_requirements = hs.config.thumbnail_requirements + self.remote_media_linearizer = Linearizer() + + self.recently_accessed_remotes = set() + + self.clock.looping_call( + self._update_recently_accessed_remotes, + UPDATE_RECENTLY_ACCESSED_REMOTES_TS + ) + + @defer.inlineCallbacks + def _update_recently_accessed_remotes(self): + media = self.recently_accessed_remotes + self.recently_accessed_remotes = set() + + yield self.store.update_cached_last_access_time( + media, self.clock.time_msec() + ) + @staticmethod def _makedirs(filepath): dirname = os.path.dirname(filepath) @@ -92,22 +115,12 @@ class MediaRepository(object): defer.returnValue("mxc://%s/%s" % (self.server_name, media_id)) + @defer.inlineCallbacks def get_remote_media(self, server_name, media_id): key = (server_name, media_id) - download = self.downloads.get(key) - if download is None: - download = self._get_remote_media_impl(server_name, media_id) - download = ObservableDeferred( - download, - consumeErrors=True - ) - self.downloads[key] = download - - @download.addBoth - def callback(media_info): - del self.downloads[key] - return media_info - return download.observe() + with (yield self.remote_media_linearizer.queue(key)): + media_info = yield self._get_remote_media_impl(server_name, media_id) + defer.returnValue(media_info) @defer.inlineCallbacks def _get_remote_media_impl(self, server_name, media_id): @@ -118,6 +131,11 @@ class MediaRepository(object): media_info = yield self._download_remote_file( server_name, media_id ) + else: + self.recently_accessed_remotes.add((server_name, media_id)) + yield self.store.update_cached_last_access_time( + [(server_name, media_id)], self.clock.time_msec() + ) defer.returnValue(media_info) @defer.inlineCallbacks @@ -134,10 +152,15 @@ class MediaRepository(object): request_path = "/".join(( "/_matrix/media/v1/download", server_name, media_id, )) - length, headers = yield self.client.get_file( - server_name, request_path, output_stream=f, - max_size=self.max_upload_size, - ) + try: + length, headers = yield self.client.get_file( + server_name, request_path, output_stream=f, + max_size=self.max_upload_size, + ) + except Exception as e: + logger.warn("Failed to fetch remoted media %r", e) + raise SynapseError(502, "Failed to fetch remoted media") + media_type = headers["Content-Type"][0] time_now_ms = self.clock.time_msec() @@ -410,6 +433,41 @@ class MediaRepository(object): "height": m_height, }) + @defer.inlineCallbacks + def delete_old_remote_media(self, before_ts): + old_media = yield self.store.get_remote_media_before(before_ts) + + deleted = 0 + + for media in old_media: + origin = media["media_origin"] + media_id = media["media_id"] + file_id = media["filesystem_id"] + key = (origin, media_id) + + logger.info("Deleting: %r", key) + + with (yield self.remote_media_linearizer.queue(key)): + full_path = self.filepaths.remote_media_filepath(origin, file_id) + try: + os.remove(full_path) + except OSError as e: + logger.warn("Failed to remove file: %r", full_path) + if e.errno == errno.ENOENT: + pass + else: + continue + + thumbnail_dir = self.filepaths.remote_media_thumbnail_dir( + origin, file_id + ) + shutil.rmtree(thumbnail_dir, ignore_errors=True) + + yield self.store.delete_remote_media(origin, media_id) + deleted += 1 + + defer.returnValue({"deleted": deleted}) + class MediaRepositoryResource(Resource): """File uploading and downloading. @@ -458,9 +516,8 @@ class MediaRepositoryResource(Resource): def __init__(self, hs): Resource.__init__(self) - filepaths = MediaFilePaths(hs.config.media_store_path) - media_repo = MediaRepository(hs, filepaths) + media_repo = hs.get_media_repository() self.putChild("upload", UploadResource(hs, media_repo)) self.putChild("download", DownloadResource(hs, media_repo)) diff --git a/synapse/rest/media/v1/preview_url_resource.py b/synapse/rest/media/v1/preview_url_resource.py index 37dd1de899..bdd0e60c5b 100644 --- a/synapse/rest/media/v1/preview_url_resource.py +++ b/synapse/rest/media/v1/preview_url_resource.py @@ -29,6 +29,8 @@ from synapse.http.server import ( from synapse.util.async import ObservableDeferred from synapse.util.stringutils import is_ascii +from copy import deepcopy + import os import re import fnmatch @@ -252,7 +254,8 @@ class PreviewUrlResource(Resource): og = {} for tag in tree.xpath("//*/meta[starts-with(@property, 'og:')]"): - og[tag.attrib['property']] = tag.attrib['content'] + if 'content' in tag.attrib: + og[tag.attrib['property']] = tag.attrib['content'] # TODO: grab article: meta tags too, e.g.: @@ -279,7 +282,7 @@ class PreviewUrlResource(Resource): # TODO: consider inlined CSS styles as well as width & height attribs images = tree.xpath("//img[@src][number(@width)>10][number(@height)>10]") images = sorted(images, key=lambda i: ( - -1 * int(i.attrib['width']) * int(i.attrib['height']) + -1 * float(i.attrib['width']) * float(i.attrib['height']) )) if not images: images = tree.xpath("//img[@src]") @@ -287,9 +290,9 @@ class PreviewUrlResource(Resource): og['og:image'] = images[0].attrib['src'] # pre-cache the image for posterity - # FIXME: it might be cleaner to use the same flow as the main /preview_url request - # itself and benefit from the same caching etc. But for now we just rely on the - # caching on the master request to speed things up. + # FIXME: it might be cleaner to use the same flow as the main /preview_url + # request itself and benefit from the same caching etc. But for now we + # just rely on the caching on the master request to speed things up. if 'og:image' in og and og['og:image']: image_info = yield self._download_url( self._rebase_url(og['og:image'], media_info['uri']), requester.user @@ -328,20 +331,24 @@ class PreviewUrlResource(Resource): # ...or if they are within a <script/> or <style/> tag. # This is a very very very coarse approximation to a plain text # render of the page. - text_nodes = tree.xpath("//text()[not(ancestor::header | ancestor::nav | " - "ancestor::aside | ancestor::footer | " - "ancestor::script | ancestor::style)]" + - "[ancestor::body]") - text = '' - for text_node in text_nodes: - if len(text) < 500: - text += text_node + ' ' - else: - break - text = re.sub(r'[\t ]+', ' ', text) - text = re.sub(r'[\t \r\n]*[\r\n]+', '\n', text) - text = text.strip()[:500] - og['og:description'] = text if text else None + + # We don't just use XPATH here as that is slow on some machines. + + # We clone `tree` as we modify it. + cloned_tree = deepcopy(tree.find("body")) + + TAGS_TO_REMOVE = ("header", "nav", "aside", "footer", "script", "style",) + for el in cloned_tree.iter(TAGS_TO_REMOVE): + el.getparent().remove(el) + + # Split all the text nodes into paragraphs (by splitting on new + # lines) + text_nodes = ( + re.sub(r'\s+', '\n', el.text).strip() + for el in cloned_tree.iter() + if el.text and isinstance(el.tag, basestring) # Removes comments + ) + og['og:description'] = summarize_paragraphs(text_nodes) # TODO: delete the url downloads to stop diskfilling, # as we only ever cared about its OG @@ -449,3 +456,56 @@ class PreviewUrlResource(Resource): content_type.startswith("application/xhtml") ): return True + + +def summarize_paragraphs(text_nodes, min_size=200, max_size=500): + # Try to get a summary of between 200 and 500 words, respecting + # first paragraph and then word boundaries. + # TODO: Respect sentences? + + description = '' + + # Keep adding paragraphs until we get to the MIN_SIZE. + for text_node in text_nodes: + if len(description) < min_size: + text_node = re.sub(r'[\t \r\n]+', ' ', text_node) + description += text_node + '\n\n' + else: + break + + description = description.strip() + description = re.sub(r'[\t ]+', ' ', description) + description = re.sub(r'[\t \r\n]*[\r\n]+', '\n\n', description) + + # If the concatenation of paragraphs to get above MIN_SIZE + # took us over MAX_SIZE, then we need to truncate mid paragraph + if len(description) > max_size: + new_desc = "" + + # This splits the paragraph into words, but keeping the + # (preceeding) whitespace intact so we can easily concat + # words back together. + for match in re.finditer("\s*\S+", description): + word = match.group() + + # Keep adding words while the total length is less than + # MAX_SIZE. + if len(word) + len(new_desc) < max_size: + new_desc += word + else: + # At this point the next word *will* take us over + # MAX_SIZE, but we also want to ensure that its not + # a huge word. If it is add it anyway and we'll + # truncate later. + if len(new_desc) < min_size: + new_desc += word + break + + # Double check that we're not over the limit + if len(new_desc) > max_size: + new_desc = new_desc[:max_size] + + # We always add an ellipsis because at the very least + # we chopped mid paragraph. + description = new_desc.strip() + "…" + return description if description else None |