prevent choking on invalid utf-8, and handle image thumbnailing smarter

author: Matthew Hodgson <matthew@matrix.org> 2016-03-31 15:14:14 +0100
committer: Matthew Hodgson <matthew@matrix.org> 2016-03-31 15:14:14 +0100
commit: 72550c3803e5020aa377f8d10c0c20afd4273c0d (patch)
tree: 1fa814f6fbf7db37cb0ce3f92ebe1007d38b74ff /synapse/rest/media
parent: synthesise basic OG metadata from pages lacking it (diff)
download: synapse-72550c3803e5020aa377f8d10c0c20afd4273c0d.tar.xz
1 files changed, 18 insertions, 5 deletions
diff --git a/synapse/rest/media/v1/preview_url_resource.py b/synapse/rest/media/v1/preview_url_resource.py
index b1d5cabfaa..04d02ee427 100644
--- a/synapse/rest/media/v1/preview_url_resource.py
+++ b/synapse/rest/media/v1/preview_url_resource.py
@@ -72,7 +72,15 @@ class PreviewUrlResource(BaseMediaResource):
                 # define our OG response for this media
             elif self._is_html(media_info['media_type']):
                 # TODO: somehow stop a big HTML tree from exploding synapse's RAM
-                tree = html.parse(media_info['filename'])
+
+                # XXX: can't work out how to make lxml ignore UTF8 decoding errors
+                # so slurp as a string at this point.
+                file = open(media_info['filename'])
+                body = file.read()
+                file.close()
+                # FIXME: we shouldn't be forcing utf-8 if the page isn't actually utf-8...
+                tree = html.fromstring(body.decode('utf-8','ignore'))
+                # tree = html.parse(media_info['filename'])
 
                 # suck it up into lxml and define our OG response.
                 # if we see any URLs in the OG response, then spider them
@@ -108,14 +116,19 @@ class PreviewUrlResource(BaseMediaResource):
                     title = tree.xpath("(//title)[1] | (//h1)[1] | (//h2)[1] | (//h3)[1]")
                     og['og:title'] = title[0].text if title else None
 
-                    images = tree.xpath("//img")
+                    images = [ i for i in tree.xpath("//img") if 'src' in i.attrib ]
                     big_images = [ i for i in images if (
-                        'width' in i and 'height' in i and
+                        'width' in i.attrib and 'height' in i.attrib and
                         i.attrib['width'] > 64 and i.attrib['height'] > 64
-                    )] or images
-                    og['og:image'] = images[0].attrib['src'] if images else None
+                    )]
+                    big_images = big_images.sort(key=lambda i: (-1 * int(i.attrib['width']) * int(i.attrib['height'])))
+                    images = big_images if big_images else images
+
+                    if images:
+                        og['og:image'] = images[0].attrib['src']
 
                     text_nodes = tree.xpath("//h1/text() | //h2/text() | //h3/text() | //p/text() | //div/text() | //span/text() | //a/text()")
+                    # text_nodes = tree.xpath("//h1/text() | //h2/text() | //h3/text() | //p/text() | //div/text()")
                     text = ''
                     for text_node in text_nodes:
                         if len(text) < 1024:
author	Matthew Hodgson <matthew@matrix.org>	2016-03-31 15:14:14 +0100
committer	Matthew Hodgson <matthew@matrix.org>	2016-03-31 15:14:14 +0100
commit	72550c3803e5020aa377f8d10c0c20afd4273c0d (patch)
tree	1fa814f6fbf7db37cb0ce3f92ebe1007d38b74ff /synapse/rest/media
parent	synthesise basic OG metadata from pages lacking it (diff)
download	synapse-72550c3803e5020aa377f8d10c0c20afd4273c0d.tar.xz