summary refs log tree commit diff
path: root/synapse/rest/media/v1/preview_html.py
diff options
context:
space:
mode:
author: Patrick Cloke <clokep@users.noreply.github.com> 2022-03-16 07:21:36 -0400
committer: GitHub <noreply@github.com> 2022-03-16 07:21:36 -0400
commit: 4587b35929d22731644a11120a9e7d6a9c3bc304 (patch)
tree: 7c7f269f750021b51842d12130980748637f1e1e /synapse/rest/media/v1/preview_html.py
parent: Use the ignored_users table to test event visibility & sync. (#12225) (diff)
download: synapse-4587b35929d22731644a11120a9e7d6a9c3bc304.tar.xz
Clean-up logic for rebasing URLs during URL preview. (#12219)
By using urljoin from the standard library and reducing the number
of places URLs are rebased.
Diffstat (limited to 'synapse/rest/media/v1/preview_html.py')
-rw-r--r-- synapse/rest/media/v1/preview_html.py 39
1 files changed, 2 insertions, 37 deletions
diff --git a/synapse/rest/media/v1/preview_html.py b/synapse/rest/media/v1/preview_html.py
index 872a9e72e8..4cc9c66fbe 100644
--- a/synapse/rest/media/v1/preview_html.py
+++ b/synapse/rest/media/v1/preview_html.py
@@ -16,7 +16,6 @@ import itertools
 import logging
 import re
 from typing import TYPE_CHECKING, Dict, Generator, Iterable, Optional, Set, Union
-from urllib import parse as urlparse
 
 if TYPE_CHECKING:
     from lxml import etree
@@ -144,9 +143,7 @@ def decode_body(
     return etree.fromstring(body, parser)
 
 
-def parse_html_to_open_graph(
-    tree: "etree.Element", media_uri: str
-) -> Dict[str, Optional[str]]:
+def parse_html_to_open_graph(tree: "etree.Element") -> Dict[str, Optional[str]]:
     """
     Parse the HTML document into an Open Graph response.
 
@@ -155,7 +152,6 @@ def parse_html_to_open_graph(
 
     Args:
         tree: The parsed HTML document.
-        media_url: The URI used to download the body.
 
     Returns:
         The Open Graph response as a dictionary.
@@ -209,7 +205,7 @@ def parse_html_to_open_graph(
             "//*/meta[translate(@itemprop, 'IMAGE', 'image')='image']/@content"
         )
         if meta_image:
-            og["og:image"] = rebase_url(meta_image[0], media_uri)
+            og["og:image"] = meta_image[0]
         else:
             # TODO: consider inlined CSS styles as well as width & height attribs
             images = tree.xpath("//img[@src][number(@width)>10][number(@height)>10]")
@@ -320,37 +316,6 @@ def _iterate_over_text(
             )
 
 
-def rebase_url(url: str, base: str) -> str:
-    """
-    Resolves a potentially relative `url` against an absolute `base` URL.
-
-    For example:
-
-        >>> rebase_url("subpage", "https://example.com/foo/")
-        'https://example.com/foo/subpage'
-        >>> rebase_url("sibling", "https://example.com/foo")
-        'https://example.com/sibling'
-        >>> rebase_url("/bar", "https://example.com/foo/")
-        'https://example.com/bar'
-        >>> rebase_url("https://alice.com/a/", "https://example.com/foo/")
-        'https://alice.com/a'
-    """
-    base_parts = urlparse.urlparse(base)
-    # Convert the parsed URL to a list for (potential) modification.
-    url_parts = list(urlparse.urlparse(url))
-    # Add a scheme, if one does not exist.
-    if not url_parts[0]:
-        url_parts[0] = base_parts.scheme or "http"
-    # Fix up the hostname, if this is not a data URL.
-    if url_parts[0] != "data" and not url_parts[1]:
-        url_parts[1] = base_parts.netloc
-        # If the path does not start with a /, nest it under the base path's last
-        # directory.
-        if not url_parts[2].startswith("/"):
-            url_parts[2] = re.sub(r"/[^/]+$", "/", base_parts.path) + url_parts[2]
-    return urlparse.urlunparse(url_parts)
-
-
 def summarize_paragraphs(
     text_nodes: Iterable[str], min_size: int = 200, max_size: int = 500
 ) -> Optional[str]: