From 01df5bacac3aa0e8356fed889ea0b69c4c044535 Mon Sep 17 00:00:00 2001
From: Patrick Cloke <clokep@users.noreply.github.com>
Date: Fri, 3 Jun 2022 12:09:12 -0400
Subject: Improve URL previews for some pages (#12951)

* Skip `og` and `meta` tags where the value is empty.
* Fall back to the favicon if there are no other images.
* Ignore tags meant for navigation.
---
 changelog.d/12951.feature                |  1 +
 synapse/rest/media/v1/preview_html.py    | 52 +++++++++++++++++++++-----------
 tests/rest/media/v1/test_html_preview.py | 37 ++++++++++++++++++++++-
 3 files changed, 72 insertions(+), 18 deletions(-)
 create mode 100644 changelog.d/12951.feature

diff --git a/changelog.d/12951.feature b/changelog.d/12951.feature
new file mode 100644
index 0000000000..f885be9fe4
--- /dev/null
+++ b/changelog.d/12951.feature
@@ -0,0 +1 @@
+Improve URL previews for pages with empty elements.
diff --git a/synapse/rest/media/v1/preview_html.py b/synapse/rest/media/v1/preview_html.py
index 13ec7ab533..ed8f21a483 100644
--- a/synapse/rest/media/v1/preview_html.py
+++ b/synapse/rest/media/v1/preview_html.py
@@ -30,6 +30,9 @@ _xml_encoding_match = re.compile(
 )
 _content_type_match = re.compile(r'.*; *charset="?(.*?)"?(;|$)', flags=re.I)
 
+# Certain elements aren't meant for display.
+ARIA_ROLES_TO_IGNORE = {"directory", "menu", "menubar", "toolbar"}
+
 
 def _normalise_encoding(encoding: str) -> Optional[str]:
     """Use the Python codec's name as the normalised entry."""
@@ -174,13 +177,15 @@ def parse_html_to_open_graph(tree: "etree.Element") -> Dict[str, Optional[str]]:
     # "og:video:secure_url": "https://www.youtube.com/v/LXDBoHyjmtw?version=3",
 
     og: Dict[str, Optional[str]] = {}
-    for tag in tree.xpath("//*/meta[starts-with(@property, 'og:')]"):
-        if "content" in tag.attrib:
-            # if we've got more than 50 tags, someone is taking the piss
-            if len(og) >= 50:
-                logger.warning("Skipping OG for page with too many 'og:' tags")
-                return {}
-            og[tag.attrib["property"]] = tag.attrib["content"]
+    for tag in tree.xpath(
+        "//*/meta[starts-with(@property, 'og:')][@content][not(@content='')]"
+    ):
+        # if we've got more than 50 tags, someone is taking the piss
+        if len(og) >= 50:
+            logger.warning("Skipping OG for page with too many 'og:' tags")
+            return {}
+
+        og[tag.attrib["property"]] = tag.attrib["content"]
 
     # TODO: grab article: meta tags too, e.g.:
 
@@ -192,21 +197,23 @@ def parse_html_to_open_graph(tree: "etree.Element") -> Dict[str, Optional[str]]:
     # "article:modified_time" content="2016-04-01T18:31:53+00:00" />
 
     if "og:title" not in og:
-        # do some basic spidering of the HTML
-        title = tree.xpath("(//title)[1] | (//h1)[1] | (//h2)[1] | (//h3)[1]")
-        if title and title[0].text is not None:
-            og["og:title"] = title[0].text.strip()
+        # Attempt to find a title from the title tag, or the biggest header on the page.
+        title = tree.xpath("((//title)[1] | (//h1)[1] | (//h2)[1] | (//h3)[1])/text()")
+        if title:
+            og["og:title"] = title[0].strip()
         else:
             og["og:title"] = None
 
     if "og:image" not in og:
-        # TODO: extract a favicon failing all else
         meta_image = tree.xpath(
-            "//*/meta[translate(@itemprop, 'IMAGE', 'image')='image']/@content"
+            "//*/meta[translate(@itemprop, 'IMAGE', 'image')='image'][not(@content='')]/@content[1]"
         )
+        # If a meta image is found, use it.
         if meta_image:
             og["og:image"] = meta_image[0]
         else:
+            # Try to find images which are larger than 10px by 10px.
+            #
             # TODO: consider inlined CSS styles as well as width & height attribs
             images = tree.xpath("//img[@src][number(@width)>10][number(@height)>10]")
             images = sorted(
@@ -215,17 +222,24 @@ def parse_html_to_open_graph(tree: "etree.Element") -> Dict[str, Optional[str]]:
                     -1 * float(i.attrib["width"]) * float(i.attrib["height"])
                 ),
             )
+            # If no images were found, try to find *any* images.
             if not images:
-                images = tree.xpath("//img[@src]")
+                images = tree.xpath("//img[@src][1]")
             if images:
                 og["og:image"] = images[0].attrib["src"]
 
+            # Finally, fallback to the favicon if nothing else.
+            else:
+                favicons = tree.xpath("//link[@href][contains(@rel, 'icon')]/@href[1]")
+                if favicons:
+                    og["og:image"] = favicons[0]
+
     if "og:description" not in og:
+        # Check the first meta description tag for content.
         meta_description = tree.xpath(
-            "//*/meta"
-            "[translate(@name, 'DESCRIPTION', 'description')='description']"
-            "/@content"
+            "//*/meta[translate(@name, 'DESCRIPTION', 'description')='description'][not(@content='')]/@content[1]"
         )
+        # If a meta description is found with content, use it.
         if meta_description:
             og["og:description"] = meta_description[0]
         else:
@@ -306,6 +320,10 @@ def _iterate_over_text(
         if isinstance(el, str):
             yield el
         elif el.tag not in tags_to_ignore:
+            # If the element isn't meant for display, ignore it.
+            if el.get("role") in ARIA_ROLES_TO_IGNORE:
+                continue
+
             # el.text is the text before the first child, so we can immediately
             # return it if the text exists.
             if el.text:
diff --git a/tests/rest/media/v1/test_html_preview.py b/tests/rest/media/v1/test_html_preview.py
index 62e308814d..ea9e5889bf 100644
--- a/tests/rest/media/v1/test_html_preview.py
+++ b/tests/rest/media/v1/test_html_preview.py
@@ -145,7 +145,7 @@ class SummarizeTestCase(unittest.TestCase):
         )
 
 
-class CalcOgTestCase(unittest.TestCase):
+class OpenGraphFromHtmlTestCase(unittest.TestCase):
     if not lxml:
         skip = "url preview feature requires lxml"
 
@@ -235,6 +235,21 @@ class CalcOgTestCase(unittest.TestCase):
 
         self.assertEqual(og, {"og:title": None, "og:description": "Some text."})
 
+        # Another variant is a title with no content.
+        html = b"""
+        <html>
+        <head><title></title></head>
+        <body>
+        <h1>Title</h1>
+        </body>
+        </html>
+        """
+
+        tree = decode_body(html, "http://example.com/test.html")
+        og = parse_html_to_open_graph(tree)
+
+        self.assertEqual(og, {"og:title": "Title", "og:description": "Title"})
+
     def test_h1_as_title(self) -> None:
         html = b"""
         <html>
@@ -250,6 +265,26 @@ class CalcOgTestCase(unittest.TestCase):
 
         self.assertEqual(og, {"og:title": "Title", "og:description": "Some text."})
 
+    def test_empty_description(self) -> None:
+        """Description tags with empty content should be ignored."""
+        html = b"""
+        <html>
+        <meta property="og:description" content=""/>
+        <meta property="og:description"/>
+        <meta name="description" content=""/>
+        <meta name="description"/>
+        <meta name="description" content="Finally!"/>
+        <body>
+        <h1>Title</h1>
+        </body>
+        </html>
+        """
+
+        tree = decode_body(html, "http://example.com/test.html")
+        og = parse_html_to_open_graph(tree)
+
+        self.assertEqual(og, {"og:title": "Title", "og:description": "Finally!"})
+
     def test_missing_title_and_broken_h1(self) -> None:
         html = b"""
         <html>
-- 
cgit 1.5.1