From 807efd26aec9b65c6a2f02d10fd139095a5b3387 Mon Sep 17 00:00:00 2001
From: Patrick Cloke <clokep@users.noreply.github.com>
Date: Mon, 24 Jan 2022 08:58:18 -0500
Subject: Support rendering previews with data: URLs in them (#11767)

Images which are data URLs will no longer break URL
previews and will properly be "downloaded" and
thumbnailed.
---
 changelog.d/11767.bugfix                      |   1 +
 synapse/rest/media/v1/preview_html.py         |  31 +-
 synapse/rest/media/v1/preview_url_resource.py | 224 ++++++++----
 tests/rest/media/v1/test_html_preview.py      | 481 ++++++++++++++++++++++++++
 tests/rest/media/v1/test_url_preview.py       |  81 ++++-
 tests/server.py                               |   2 +-
 tests/test_preview.py                         | 449 ------------------------
 7 files changed, 747 insertions(+), 522 deletions(-)
 create mode 100644 changelog.d/11767.bugfix
 create mode 100644 tests/rest/media/v1/test_html_preview.py
 delete mode 100644 tests/test_preview.py

diff --git a/changelog.d/11767.bugfix b/changelog.d/11767.bugfix
new file mode 100644
index 0000000000..3e344747f4
--- /dev/null
+++ b/changelog.d/11767.bugfix
@@ -0,0 +1 @@
+Fix a long-standing bug when previewing Reddit URLs which do not contain an image.
diff --git a/synapse/rest/media/v1/preview_html.py b/synapse/rest/media/v1/preview_html.py
index 30b067dd42..872a9e72e8 100644
--- a/synapse/rest/media/v1/preview_html.py
+++ b/synapse/rest/media/v1/preview_html.py
@@ -321,14 +321,33 @@ def _iterate_over_text(
 
 
 def rebase_url(url: str, base: str) -> str:
-    base_parts = list(urlparse.urlparse(base))
+    """
+    Resolves a potentially relative `url` against an absolute `base` URL.
+
+    For example:
+
+        >>> rebase_url("subpage", "https://example.com/foo/")
+        'https://example.com/foo/subpage'
+        >>> rebase_url("sibling", "https://example.com/foo")
+        'https://example.com/sibling'
+        >>> rebase_url("/bar", "https://example.com/foo/")
+        'https://example.com/bar'
+        >>> rebase_url("https://alice.com/a/", "https://example.com/foo/")
+        'https://alice.com/a'
+    """
+    base_parts = urlparse.urlparse(base)
+    # Convert the parsed URL to a list for (potential) modification.
     url_parts = list(urlparse.urlparse(url))
-    if not url_parts[0]:  # fix up schema
-        url_parts[0] = base_parts[0] or "http"
-    if not url_parts[1]:  # fix up hostname
-        url_parts[1] = base_parts[1]
+    # Add a scheme, if one does not exist.
+    if not url_parts[0]:
+        url_parts[0] = base_parts.scheme or "http"
+    # Fix up the hostname, if this is not a data URL.
+    if url_parts[0] != "data" and not url_parts[1]:
+        url_parts[1] = base_parts.netloc
+        # If the path does not start with a /, nest it under the base path's last
+        # directory.
         if not url_parts[2].startswith("/"):
-            url_parts[2] = re.sub(r"/[^/]+$", "/", base_parts[2]) + url_parts[2]
+            url_parts[2] = re.sub(r"/[^/]+$", "/", base_parts.path) + url_parts[2]
     return urlparse.urlunparse(url_parts)
 
 
diff --git a/synapse/rest/media/v1/preview_url_resource.py b/synapse/rest/media/v1/preview_url_resource.py
index e8881bc870..efd84ced8f 100644
--- a/synapse/rest/media/v1/preview_url_resource.py
+++ b/synapse/rest/media/v1/preview_url_resource.py
@@ -21,8 +21,9 @@ import re
 import shutil
 import sys
 import traceback
-from typing import TYPE_CHECKING, Iterable, Optional, Tuple
+from typing import TYPE_CHECKING, BinaryIO, Iterable, Optional, Tuple
 from urllib import parse as urlparse
+from urllib.request import urlopen
 
 import attr
 
@@ -70,6 +71,17 @@ ONE_DAY = 24 * ONE_HOUR
 IMAGE_CACHE_EXPIRY_MS = 2 * ONE_DAY
 
 
+@attr.s(slots=True, frozen=True, auto_attribs=True)
+class DownloadResult:
+    length: int
+    uri: str
+    response_code: int
+    media_type: str
+    download_name: Optional[str]
+    expires: int
+    etag: Optional[str]
+
+
 @attr.s(slots=True, frozen=True, auto_attribs=True)
 class MediaInfo:
     """
@@ -256,7 +268,7 @@ class PreviewUrlResource(DirectServeJsonResource):
         if oembed_url:
             url_to_download = oembed_url
 
-        media_info = await self._download_url(url_to_download, user)
+        media_info = await self._handle_url(url_to_download, user)
 
         logger.debug("got media_info of '%s'", media_info)
 
@@ -297,7 +309,9 @@ class PreviewUrlResource(DirectServeJsonResource):
                 oembed_url = self._oembed.autodiscover_from_html(tree)
                 og_from_oembed: JsonDict = {}
                 if oembed_url:
-                    oembed_info = await self._download_url(oembed_url, user)
+                    oembed_info = await self._handle_url(
+                        oembed_url, user, allow_data_urls=True
+                    )
                     (
                         og_from_oembed,
                         author_name,
@@ -367,7 +381,135 @@ class PreviewUrlResource(DirectServeJsonResource):
 
         return jsonog.encode("utf8")
 
-    async def _download_url(self, url: str, user: UserID) -> MediaInfo:
+    async def _download_url(self, url: str, output_stream: BinaryIO) -> DownloadResult:
+        """
+        Fetches a remote URL and parses the headers.
+
+        Args:
+             url: The URL to fetch.
+             output_stream: The stream to write the content to.
+
+        Returns:
+            A tuple of:
+                Media length, URL downloaded, the HTTP response code,
+                the media type, the downloaded file name, the number of
+                milliseconds the result is valid for, the etag header.
+        """
+
+        try:
+            logger.debug("Trying to get preview for url '%s'", url)
+            length, headers, uri, code = await self.client.get_file(
+                url,
+                output_stream=output_stream,
+                max_size=self.max_spider_size,
+                headers={"Accept-Language": self.url_preview_accept_language},
+            )
+        except SynapseError:
+            # Pass SynapseErrors through directly, so that the servlet
+            # handler will return a SynapseError to the client instead of
+            # blank data or a 500.
+            raise
+        except DNSLookupError:
+            # DNS lookup returned no results
+            # Note: This will also be the case if one of the resolved IP
+            # addresses is blacklisted
+            raise SynapseError(
+                502,
+                "DNS resolution failure during URL preview generation",
+                Codes.UNKNOWN,
+            )
+        except Exception as e:
+            # FIXME: pass through 404s and other error messages nicely
+            logger.warning("Error downloading %s: %r", url, e)
+
+            raise SynapseError(
+                500,
+                "Failed to download content: %s"
+                % (traceback.format_exception_only(sys.exc_info()[0], e),),
+                Codes.UNKNOWN,
+            )
+
+        if b"Content-Type" in headers:
+            media_type = headers[b"Content-Type"][0].decode("ascii")
+        else:
+            media_type = "application/octet-stream"
+
+        download_name = get_filename_from_headers(headers)
+
+        # FIXME: we should calculate a proper expiration based on the
+        # Cache-Control and Expire headers.  But for now, assume 1 hour.
+        expires = ONE_HOUR
+        etag = headers[b"ETag"][0].decode("ascii") if b"ETag" in headers else None
+
+        return DownloadResult(
+            length, uri, code, media_type, download_name, expires, etag
+        )
+
+    async def _parse_data_url(
+        self, url: str, output_stream: BinaryIO
+    ) -> DownloadResult:
+        """
+        Parses a data: URL.
+
+        Args:
+             url: The URL to parse.
+             output_stream: The stream to write the content to.
+
+        Returns:
+            A tuple of:
+                Media length, URL downloaded, the HTTP response code,
+                the media type, the downloaded file name, the number of
+                milliseconds the result is valid for, the etag header.
+        """
+
+        try:
+            logger.debug("Trying to parse data url '%s'", url)
+            with urlopen(url) as url_info:
+                # TODO Can this be more efficient.
+                output_stream.write(url_info.read())
+        except Exception as e:
+            logger.warning("Error parsing data: URL %s: %r", url, e)
+
+            raise SynapseError(
+                500,
+                "Failed to parse data URL: %s"
+                % (traceback.format_exception_only(sys.exc_info()[0], e),),
+                Codes.UNKNOWN,
+            )
+
+        return DownloadResult(
+            # Read back the length that has been written.
+            length=output_stream.tell(),
+            uri=url,
+            # If it was parsed, consider this a 200 OK.
+            response_code=200,
+            # urlopen shoves the media-type from the data URL into the content type
+            # header object.
+            media_type=url_info.headers.get_content_type(),
+            # Some features are not supported by data: URLs.
+            download_name=None,
+            expires=ONE_HOUR,
+            etag=None,
+        )
+
+    async def _handle_url(
+        self, url: str, user: UserID, allow_data_urls: bool = False
+    ) -> MediaInfo:
+        """
+        Fetches content from a URL and parses the result to generate a MediaInfo.
+
+        It uses the media storage provider to persist the fetched content and
+        stores the mapping into the database.
+
+        Args:
+             url: The URL to fetch.
+             user: The user who ahs requested this URL.
+             allow_data_urls: True if data URLs should be allowed.
+
+        Returns:
+            A MediaInfo object describing the fetched content.
+        """
+
         # TODO: we should probably honour robots.txt... except in practice
         # we're most likely being explicitly triggered by a human rather than a
         # bot, so are we really a robot?
@@ -377,61 +519,27 @@ class PreviewUrlResource(DirectServeJsonResource):
         file_info = FileInfo(server_name=None, file_id=file_id, url_cache=True)
 
         with self.media_storage.store_into_file(file_info) as (f, fname, finish):
-            try:
-                logger.debug("Trying to get preview for url '%s'", url)
-                length, headers, uri, code = await self.client.get_file(
-                    url,
-                    output_stream=f,
-                    max_size=self.max_spider_size,
-                    headers={"Accept-Language": self.url_preview_accept_language},
-                )
-            except SynapseError:
-                # Pass SynapseErrors through directly, so that the servlet
-                # handler will return a SynapseError to the client instead of
-                # blank data or a 500.
-                raise
-            except DNSLookupError:
-                # DNS lookup returned no results
-                # Note: This will also be the case if one of the resolved IP
-                # addresses is blacklisted
-                raise SynapseError(
-                    502,
-                    "DNS resolution failure during URL preview generation",
-                    Codes.UNKNOWN,
-                )
-            except Exception as e:
-                # FIXME: pass through 404s and other error messages nicely
-                logger.warning("Error downloading %s: %r", url, e)
-
-                raise SynapseError(
-                    500,
-                    "Failed to download content: %s"
-                    % (traceback.format_exception_only(sys.exc_info()[0], e),),
-                    Codes.UNKNOWN,
-                )
-            await finish()
+            if url.startswith("data:"):
+                if not allow_data_urls:
+                    raise SynapseError(
+                        500, "Previewing of data: URLs is forbidden", Codes.UNKNOWN
+                    )
 
-            if b"Content-Type" in headers:
-                media_type = headers[b"Content-Type"][0].decode("ascii")
+                download_result = await self._parse_data_url(url, f)
             else:
-                media_type = "application/octet-stream"
+                download_result = await self._download_url(url, f)
 
-            download_name = get_filename_from_headers(headers)
-
-            # FIXME: we should calculate a proper expiration based on the
-            # Cache-Control and Expire headers.  But for now, assume 1 hour.
-            expires = ONE_HOUR
-            etag = headers[b"ETag"][0].decode("ascii") if b"ETag" in headers else None
+            await finish()
 
         try:
             time_now_ms = self.clock.time_msec()
 
             await self.store.store_local_media(
                 media_id=file_id,
-                media_type=media_type,
+                media_type=download_result.media_type,
                 time_now_ms=time_now_ms,
-                upload_name=download_name,
-                media_length=length,
+                upload_name=download_result.download_name,
+                media_length=download_result.length,
                 user_id=user,
                 url_cache=url,
             )
@@ -444,16 +552,16 @@ class PreviewUrlResource(DirectServeJsonResource):
             raise
 
         return MediaInfo(
-            media_type=media_type,
-            media_length=length,
-            download_name=download_name,
+            media_type=download_result.media_type,
+            media_length=download_result.length,
+            download_name=download_result.download_name,
             created_ts_ms=time_now_ms,
             filesystem_id=file_id,
             filename=fname,
-            uri=uri,
-            response_code=code,
-            expires=expires,
-            etag=etag,
+            uri=download_result.uri,
+            response_code=download_result.response_code,
+            expires=download_result.expires,
+            etag=download_result.etag,
         )
 
     async def _precache_image_url(
@@ -474,8 +582,8 @@ class PreviewUrlResource(DirectServeJsonResource):
         # FIXME: it might be cleaner to use the same flow as the main /preview_url
         # request itself and benefit from the same caching etc.  But for now we
         # just rely on the caching on the master request to speed things up.
-        image_info = await self._download_url(
-            rebase_url(og["og:image"], media_info.uri), user
+        image_info = await self._handle_url(
+            rebase_url(og["og:image"], media_info.uri), user, allow_data_urls=True
         )
 
         if _is_media(image_info.media_type):
diff --git a/tests/rest/media/v1/test_html_preview.py b/tests/rest/media/v1/test_html_preview.py
new file mode 100644
index 0000000000..a4b57e3d1f
--- /dev/null
+++ b/tests/rest/media/v1/test_html_preview.py
@@ -0,0 +1,481 @@
+# Copyright 2014-2016 OpenMarket Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from synapse.rest.media.v1.preview_html import (
+    _get_html_media_encodings,
+    decode_body,
+    parse_html_to_open_graph,
+    rebase_url,
+    summarize_paragraphs,
+)
+
+from tests import unittest
+
+try:
+    import lxml
+except ImportError:
+    lxml = None
+
+
+class SummarizeTestCase(unittest.TestCase):
+    if not lxml:
+        skip = "url preview feature requires lxml"
+
+    def test_long_summarize(self):
+        example_paras = [
+            """Tromsø (Norwegian pronunciation: [ˈtrʊmsœ] ( listen); Northern Sami:
+            Romsa; Finnish: Tromssa[2] Kven: Tromssa) is a city and municipality in
+            Troms county, Norway. The administrative centre of the municipality is
+            the city of Tromsø. Outside of Norway, Tromso and Tromsö are
+            alternative spellings of the city.Tromsø is considered the northernmost
+            city in the world with a population above 50,000. The most populous town
+            north of it is Alta, Norway, with a population of 14,272 (2013).""",
+            """Tromsø lies in Northern Norway. The municipality has a population of
+            (2015) 72,066, but with an annual influx of students it has over 75,000
+            most of the year. It is the largest urban area in Northern Norway and the
+            third largest north of the Arctic Circle (following Murmansk and Norilsk).
+            Most of Tromsø, including the city centre, is located on the island of
+            Tromsøya, 350 kilometres (217 mi) north of the Arctic Circle. In 2012,
+            Tromsøya had a population of 36,088. Substantial parts of the urban area
+            are also situated on the mainland to the east, and on parts of Kvaløya—a
+            large island to the west. Tromsøya is connected to the mainland by the Tromsø
+            Bridge and the Tromsøysund Tunnel, and to the island of Kvaløya by the
+            Sandnessund Bridge. Tromsø Airport connects the city to many destinations
+            in Europe. The city is warmer than most other places located on the same
+            latitude, due to the warming effect of the Gulf Stream.""",
+            """The city centre of Tromsø contains the highest number of old wooden
+            houses in Northern Norway, the oldest house dating from 1789. The Arctic
+            Cathedral, a modern church from 1965, is probably the most famous landmark
+            in Tromsø. The city is a cultural centre for its region, with several
+            festivals taking place in the summer. Some of Norway's best-known
+             musicians, Torbjørn Brundtland and Svein Berge of the electronica duo
+             Röyksopp and Lene Marlin grew up and started their careers in Tromsø.
+             Noted electronic musician Geir Jenssen also hails from Tromsø.""",
+        ]
+
+        desc = summarize_paragraphs(example_paras, min_size=200, max_size=500)
+
+        self.assertEqual(
+            desc,
+            "Tromsø (Norwegian pronunciation: [ˈtrʊmsœ] ( listen); Northern Sami:"
+            " Romsa; Finnish: Tromssa[2] Kven: Tromssa) is a city and municipality in"
+            " Troms county, Norway. The administrative centre of the municipality is"
+            " the city of Tromsø. Outside of Norway, Tromso and Tromsö are"
+            " alternative spellings of the city.Tromsø is considered the northernmost"
+            " city in the world with a population above 50,000. The most populous town"
+            " north of it is Alta, Norway, with a population of 14,272 (2013).",
+        )
+
+        desc = summarize_paragraphs(example_paras[1:], min_size=200, max_size=500)
+
+        self.assertEqual(
+            desc,
+            "Tromsø lies in Northern Norway. The municipality has a population of"
+            " (2015) 72,066, but with an annual influx of students it has over 75,000"
+            " most of the year. It is the largest urban area in Northern Norway and the"
+            " third largest north of the Arctic Circle (following Murmansk and Norilsk)."
+            " Most of Tromsø, including the city centre, is located on the island of"
+            " Tromsøya, 350 kilometres (217 mi) north of the Arctic Circle. In 2012,"
+            " Tromsøya had a population of 36,088. Substantial parts of the urban…",
+        )
+
+    def test_short_summarize(self):
+        example_paras = [
+            "Tromsø (Norwegian pronunciation: [ˈtrʊmsœ] ( listen); Northern Sami:"
+            " Romsa; Finnish: Tromssa[2] Kven: Tromssa) is a city and municipality in"
+            " Troms county, Norway.",
+            "Tromsø lies in Northern Norway. The municipality has a population of"
+            " (2015) 72,066, but with an annual influx of students it has over 75,000"
+            " most of the year.",
+            "The city centre of Tromsø contains the highest number of old wooden"
+            " houses in Northern Norway, the oldest house dating from 1789. The Arctic"
+            " Cathedral, a modern church from 1965, is probably the most famous landmark"
+            " in Tromsø.",
+        ]
+
+        desc = summarize_paragraphs(example_paras, min_size=200, max_size=500)
+
+        self.assertEqual(
+            desc,
+            "Tromsø (Norwegian pronunciation: [ˈtrʊmsœ] ( listen); Northern Sami:"
+            " Romsa; Finnish: Tromssa[2] Kven: Tromssa) is a city and municipality in"
+            " Troms county, Norway.\n"
+            "\n"
+            "Tromsø lies in Northern Norway. The municipality has a population of"
+            " (2015) 72,066, but with an annual influx of students it has over 75,000"
+            " most of the year.",
+        )
+
+    def test_small_then_large_summarize(self):
+        example_paras = [
+            "Tromsø (Norwegian pronunciation: [ˈtrʊmsœ] ( listen); Northern Sami:"
+            " Romsa; Finnish: Tromssa[2] Kven: Tromssa) is a city and municipality in"
+            " Troms county, Norway.",
+            "Tromsø lies in Northern Norway. The municipality has a population of"
+            " (2015) 72,066, but with an annual influx of students it has over 75,000"
+            " most of the year."
+            " The city centre of Tromsø contains the highest number of old wooden"
+            " houses in Northern Norway, the oldest house dating from 1789. The Arctic"
+            " Cathedral, a modern church from 1965, is probably the most famous landmark"
+            " in Tromsø.",
+        ]
+
+        desc = summarize_paragraphs(example_paras, min_size=200, max_size=500)
+        self.assertEqual(
+            desc,
+            "Tromsø (Norwegian pronunciation: [ˈtrʊmsœ] ( listen); Northern Sami:"
+            " Romsa; Finnish: Tromssa[2] Kven: Tromssa) is a city and municipality in"
+            " Troms county, Norway.\n"
+            "\n"
+            "Tromsø lies in Northern Norway. The municipality has a population of"
+            " (2015) 72,066, but with an annual influx of students it has over 75,000"
+            " most of the year. The city centre of Tromsø contains the highest number"
+            " of old wooden houses in Northern Norway, the oldest house dating from"
+            " 1789. The Arctic Cathedral, a modern church from…",
+        )
+
+
+class CalcOgTestCase(unittest.TestCase):
+    if not lxml:
+        skip = "url preview feature requires lxml"
+
+    def test_simple(self):
+        html = b"""
+        <html>
+        <head><title>Foo</title></head>
+        <body>
+        Some text.
+        </body>
+        </html>
+        """
+
+        tree = decode_body(html, "http://example.com/test.html")
+        og = parse_html_to_open_graph(tree, "http://example.com/test.html")
+
+        self.assertEqual(og, {"og:title": "Foo", "og:description": "Some text."})
+
+    def test_comment(self):
+        html = b"""
+        <html>
+        <head><title>Foo</title></head>
+        <body>
+        <!-- HTML comment -->
+        Some text.
+        </body>
+        </html>
+        """
+
+        tree = decode_body(html, "http://example.com/test.html")
+        og = parse_html_to_open_graph(tree, "http://example.com/test.html")
+
+        self.assertEqual(og, {"og:title": "Foo", "og:description": "Some text."})
+
+    def test_comment2(self):
+        html = b"""
+        <html>
+        <head><title>Foo</title></head>
+        <body>
+        Some text.
+        <!-- HTML comment -->
+        Some more text.
+        <p>Text</p>
+        More text
+        </body>
+        </html>
+        """
+
+        tree = decode_body(html, "http://example.com/test.html")
+        og = parse_html_to_open_graph(tree, "http://example.com/test.html")
+
+        self.assertEqual(
+            og,
+            {
+                "og:title": "Foo",
+                "og:description": "Some text.\n\nSome more text.\n\nText\n\nMore text",
+            },
+        )
+
+    def test_script(self):
+        html = b"""
+        <html>
+        <head><title>Foo</title></head>
+        <body>
+        <script> (function() {})() </script>
+        Some text.
+        </body>
+        </html>
+        """
+
+        tree = decode_body(html, "http://example.com/test.html")
+        og = parse_html_to_open_graph(tree, "http://example.com/test.html")
+
+        self.assertEqual(og, {"og:title": "Foo", "og:description": "Some text."})
+
+    def test_missing_title(self):
+        html = b"""
+        <html>
+        <body>
+        Some text.
+        </body>
+        </html>
+        """
+
+        tree = decode_body(html, "http://example.com/test.html")
+        og = parse_html_to_open_graph(tree, "http://example.com/test.html")
+
+        self.assertEqual(og, {"og:title": None, "og:description": "Some text."})
+
+    def test_h1_as_title(self):
+        html = b"""
+        <html>
+        <meta property="og:description" content="Some text."/>
+        <body>
+        <h1>Title</h1>
+        </body>
+        </html>
+        """
+
+        tree = decode_body(html, "http://example.com/test.html")
+        og = parse_html_to_open_graph(tree, "http://example.com/test.html")
+
+        self.assertEqual(og, {"og:title": "Title", "og:description": "Some text."})
+
+    def test_missing_title_and_broken_h1(self):
+        html = b"""
+        <html>
+        <body>
+        <h1><a href="foo"/></h1>
+        Some text.
+        </body>
+        </html>
+        """
+
+        tree = decode_body(html, "http://example.com/test.html")
+        og = parse_html_to_open_graph(tree, "http://example.com/test.html")
+
+        self.assertEqual(og, {"og:title": None, "og:description": "Some text."})
+
+    def test_empty(self):
+        """Test a body with no data in it."""
+        html = b""
+        tree = decode_body(html, "http://example.com/test.html")
+        self.assertIsNone(tree)
+
+    def test_no_tree(self):
+        """A valid body with no tree in it."""
+        html = b"\x00"
+        tree = decode_body(html, "http://example.com/test.html")
+        self.assertIsNone(tree)
+
+    def test_xml(self):
+        """Test decoding XML and ensure it works properly."""
+        # Note that the strip() call is important to ensure the xml tag starts
+        # at the initial byte.
+        html = b"""
+        <?xml version="1.0" encoding="UTF-8"?>
+
+        <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
+        <html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
+        <head><title>Foo</title></head><body>Some text.</body></html>
+        """.strip()
+        tree = decode_body(html, "http://example.com/test.html")
+        og = parse_html_to_open_graph(tree, "http://example.com/test.html")
+        self.assertEqual(og, {"og:title": "Foo", "og:description": "Some text."})
+
+    def test_invalid_encoding(self):
+        """An invalid character encoding should be ignored and treated as UTF-8, if possible."""
+        html = b"""
+        <html>
+        <head><title>Foo</title></head>
+        <body>
+        Some text.
+        </body>
+        </html>
+        """
+        tree = decode_body(html, "http://example.com/test.html", "invalid-encoding")
+        og = parse_html_to_open_graph(tree, "http://example.com/test.html")
+        self.assertEqual(og, {"og:title": "Foo", "og:description": "Some text."})
+
+    def test_invalid_encoding2(self):
+        """A body which doesn't match the sent character encoding."""
+        # Note that this contains an invalid UTF-8 sequence in the title.
+        html = b"""
+        <html>
+        <head><title>\xff\xff Foo</title></head>
+        <body>
+        Some text.
+        </body>
+        </html>
+        """
+        tree = decode_body(html, "http://example.com/test.html")
+        og = parse_html_to_open_graph(tree, "http://example.com/test.html")
+        self.assertEqual(og, {"og:title": "ÿÿ Foo", "og:description": "Some text."})
+
+    def test_windows_1252(self):
+        """A body which uses cp1252, but doesn't declare that."""
+        html = b"""
+        <html>
+        <head><title>\xf3</title></head>
+        <body>
+        Some text.
+        </body>
+        </html>
+        """
+        tree = decode_body(html, "http://example.com/test.html")
+        og = parse_html_to_open_graph(tree, "http://example.com/test.html")
+        self.assertEqual(og, {"og:title": "ó", "og:description": "Some text."})
+
+
+class MediaEncodingTestCase(unittest.TestCase):
+    def test_meta_charset(self):
+        """A character encoding is found via the meta tag."""
+        encodings = _get_html_media_encodings(
+            b"""
+        <html>
+        <head><meta charset="ascii">
+        </head>
+        </html>
+        """,
+            "text/html",
+        )
+        self.assertEqual(list(encodings), ["ascii", "utf-8", "cp1252"])
+
+        # A less well-formed version.
+        encodings = _get_html_media_encodings(
+            b"""
+        <html>
+        <head>< meta charset = ascii>
+        </head>
+        </html>
+        """,
+            "text/html",
+        )
+        self.assertEqual(list(encodings), ["ascii", "utf-8", "cp1252"])
+
+    def test_meta_charset_underscores(self):
+        """A character encoding contains underscore."""
+        encodings = _get_html_media_encodings(
+            b"""
+        <html>
+        <head><meta charset="Shift_JIS">
+        </head>
+        </html>
+        """,
+            "text/html",
+        )
+        self.assertEqual(list(encodings), ["shift_jis", "utf-8", "cp1252"])
+
+    def test_xml_encoding(self):
+        """A character encoding is found via the meta tag."""
+        encodings = _get_html_media_encodings(
+            b"""
+        <?xml version="1.0" encoding="ascii"?>
+        <html>
+        </html>
+        """,
+            "text/html",
+        )
+        self.assertEqual(list(encodings), ["ascii", "utf-8", "cp1252"])
+
+    def test_meta_xml_encoding(self):
+        """Meta tags take precedence over XML encoding."""
+        encodings = _get_html_media_encodings(
+            b"""
+        <?xml version="1.0" encoding="ascii"?>
+        <html>
+        <head><meta charset="UTF-16">
+        </head>
+        </html>
+        """,
+            "text/html",
+        )
+        self.assertEqual(list(encodings), ["utf-16", "ascii", "utf-8", "cp1252"])
+
+    def test_content_type(self):
+        """A character encoding is found via the Content-Type header."""
+        # Test a few variations of the header.
+        headers = (
+            'text/html; charset="ascii";',
+            "text/html;charset=ascii;",
+            'text/html;  charset="ascii"',
+            "text/html; charset=ascii",
+            'text/html; charset="ascii;',
+            'text/html; charset=ascii";',
+        )
+        for header in headers:
+            encodings = _get_html_media_encodings(b"", header)
+            self.assertEqual(list(encodings), ["ascii", "utf-8", "cp1252"])
+
+    def test_fallback(self):
+        """A character encoding cannot be found in the body or header."""
+        encodings = _get_html_media_encodings(b"", "text/html")
+        self.assertEqual(list(encodings), ["utf-8", "cp1252"])
+
+    def test_duplicates(self):
+        """Ensure each encoding is only attempted once."""
+        encodings = _get_html_media_encodings(
+            b"""
+        <?xml version="1.0" encoding="utf8"?>
+        <html>
+        <head><meta charset="UTF-8">
+        </head>
+        </html>
+        """,
+            'text/html; charset="UTF_8"',
+        )
+        self.assertEqual(list(encodings), ["utf-8", "cp1252"])
+
+    def test_unknown_invalid(self):
+        """A character encoding should be ignored if it is unknown or invalid."""
+        encodings = _get_html_media_encodings(
+            b"""
+        <html>
+        <head><meta charset="invalid">
+        </head>
+        </html>
+        """,
+            'text/html; charset="invalid"',
+        )
+        self.assertEqual(list(encodings), ["utf-8", "cp1252"])
+
+
+class RebaseUrlTestCase(unittest.TestCase):
+    def test_relative(self):
+        """Relative URLs should be resolved based on the context of the base URL."""
+        self.assertEqual(
+            rebase_url("subpage", "https://example.com/foo/"),
+            "https://example.com/foo/subpage",
+        )
+        self.assertEqual(
+            rebase_url("sibling", "https://example.com/foo"),
+            "https://example.com/sibling",
+        )
+        self.assertEqual(
+            rebase_url("/bar", "https://example.com/foo/"),
+            "https://example.com/bar",
+        )
+
+    def test_absolute(self):
+        """Absolute URLs should not be modified."""
+        self.assertEqual(
+            rebase_url("https://alice.com/a/", "https://example.com/foo/"),
+            "https://alice.com/a/",
+        )
+
+    def test_data(self):
+        """Data URLs should not be modified."""
+        self.assertEqual(
+            rebase_url("data:,Hello%2C%20World%21", "https://example.com/foo/"),
+            "data:,Hello%2C%20World%21",
+        )
diff --git a/tests/rest/media/v1/test_url_preview.py b/tests/rest/media/v1/test_url_preview.py
index 16e904f15b..53f6186213 100644
--- a/tests/rest/media/v1/test_url_preview.py
+++ b/tests/rest/media/v1/test_url_preview.py
@@ -12,9 +12,11 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+import base64
 import json
 import os
 import re
+from urllib.parse import urlencode
 
 from twisted.internet._resolver import HostResolution
 from twisted.internet.address import IPv4Address, IPv6Address
@@ -23,6 +25,7 @@ from twisted.test.proto_helpers import AccumulatingProtocol
 
 from synapse.config.oembed import OEmbedEndpointConfig
 from synapse.rest.media.v1.preview_url_resource import IMAGE_CACHE_EXPIRY_MS
+from synapse.types import JsonDict
 from synapse.util.stringutils import parse_and_validate_mxc_uri
 
 from tests import unittest
@@ -142,6 +145,14 @@ class URLPreviewTests(unittest.HomeserverTestCase):
     def create_test_resource(self):
         return self.hs.get_media_repository_resource()
 
+    def _assert_small_png(self, json_body: JsonDict) -> None:
+        """Assert properties from the SMALL_PNG test image."""
+        self.assertTrue(json_body["og:image"].startswith("mxc://"))
+        self.assertEqual(json_body["og:image:height"], 1)
+        self.assertEqual(json_body["og:image:width"], 1)
+        self.assertEqual(json_body["og:image:type"], "image/png")
+        self.assertEqual(json_body["matrix:image:size"], 67)
+
     def test_cache_returns_correct_type(self):
         self.lookups["matrix.org"] = [(IPv4Address, "10.1.2.3")]
 
@@ -569,6 +580,66 @@ class URLPreviewTests(unittest.HomeserverTestCase):
             server.data,
         )
 
+    def test_data_url(self):
+        """
+        Requesting to preview a data URL is not supported.
+        """
+        self.lookups["matrix.org"] = [(IPv4Address, "10.1.2.3")]
+
+        data = base64.b64encode(SMALL_PNG).decode()
+
+        query_params = urlencode(
+            {
+                "url": f'<html><head><img src="data:image/png;base64,{data}" /></head></html>'
+            }
+        )
+
+        channel = self.make_request(
+            "GET",
+            f"preview_url?{query_params}",
+            shorthand=False,
+        )
+        self.pump()
+
+        self.assertEqual(channel.code, 500)
+
+    def test_inline_data_url(self):
+        """
+        An inline image (as a data URL) should be parsed properly.
+        """
+        self.lookups["matrix.org"] = [(IPv4Address, "10.1.2.3")]
+
+        data = base64.b64encode(SMALL_PNG)
+
+        end_content = (
+            b"<html><head>" b'<img src="data:image/png;base64,%s" />' b"</head></html>"
+        ) % (data,)
+
+        channel = self.make_request(
+            "GET",
+            "preview_url?url=http://matrix.org",
+            shorthand=False,
+            await_result=False,
+        )
+        self.pump()
+
+        client = self.reactor.tcpClients[0][2].buildProtocol(None)
+        server = AccumulatingProtocol()
+        server.makeConnection(FakeTransport(client, self.reactor))
+        client.makeConnection(FakeTransport(server, self.reactor))
+        client.dataReceived(
+            (
+                b"HTTP/1.0 200 OK\r\nContent-Length: %d\r\n"
+                b'Content-Type: text/html; charset="utf8"\r\n\r\n'
+            )
+            % (len(end_content),)
+            + end_content
+        )
+
+        self.pump()
+        self.assertEqual(channel.code, 200)
+        self._assert_small_png(channel.json_body)
+
     def test_oembed_photo(self):
         """Test an oEmbed endpoint which returns a 'photo' type which redirects the preview to a new URL."""
         self.lookups["publish.twitter.com"] = [(IPv4Address, "10.1.2.3")]
@@ -626,10 +697,7 @@ class URLPreviewTests(unittest.HomeserverTestCase):
         self.assertEqual(channel.code, 200)
         body = channel.json_body
         self.assertEqual(body["og:url"], "http://twitter.com/matrixdotorg/status/12345")
-        self.assertTrue(body["og:image"].startswith("mxc://"))
-        self.assertEqual(body["og:image:height"], 1)
-        self.assertEqual(body["og:image:width"], 1)
-        self.assertEqual(body["og:image:type"], "image/png")
+        self._assert_small_png(body)
 
     def test_oembed_rich(self):
         """Test an oEmbed endpoint which returns HTML content via the 'rich' type."""
@@ -820,10 +888,7 @@ class URLPreviewTests(unittest.HomeserverTestCase):
         self.assertEqual(
             body["og:url"], "http://www.twitter.com/matrixdotorg/status/12345"
         )
-        self.assertTrue(body["og:image"].startswith("mxc://"))
-        self.assertEqual(body["og:image:height"], 1)
-        self.assertEqual(body["og:image:width"], 1)
-        self.assertEqual(body["og:image:type"], "image/png")
+        self._assert_small_png(body)
 
     def _download_image(self):
         """Downloads an image into the URL cache.
diff --git a/tests/server.py b/tests/server.py
index a0cd14ea45..82990c2eb9 100644
--- a/tests/server.py
+++ b/tests/server.py
@@ -313,7 +313,7 @@ def make_request(
     req = request(channel, site)
     req.content = BytesIO(content)
     # Twisted expects to be at the end of the content when parsing the request.
-    req.content.seek(SEEK_END)
+    req.content.seek(0, SEEK_END)
 
     if access_token:
         req.requestHeaders.addRawHeader(
diff --git a/tests/test_preview.py b/tests/test_preview.py
deleted file mode 100644
index 46e02f483f..0000000000
--- a/tests/test_preview.py
+++ /dev/null
@@ -1,449 +0,0 @@
-# Copyright 2014-2016 OpenMarket Ltd
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from synapse.rest.media.v1.preview_html import (
-    _get_html_media_encodings,
-    decode_body,
-    parse_html_to_open_graph,
-    summarize_paragraphs,
-)
-
-from . import unittest
-
-try:
-    import lxml
-except ImportError:
-    lxml = None
-
-
-class SummarizeTestCase(unittest.TestCase):
-    if not lxml:
-        skip = "url preview feature requires lxml"
-
-    def test_long_summarize(self):
-        example_paras = [
-            """Tromsø (Norwegian pronunciation: [ˈtrʊmsœ] ( listen); Northern Sami:
-            Romsa; Finnish: Tromssa[2] Kven: Tromssa) is a city and municipality in
-            Troms county, Norway. The administrative centre of the municipality is
-            the city of Tromsø. Outside of Norway, Tromso and Tromsö are
-            alternative spellings of the city.Tromsø is considered the northernmost
-            city in the world with a population above 50,000. The most populous town
-            north of it is Alta, Norway, with a population of 14,272 (2013).""",
-            """Tromsø lies in Northern Norway. The municipality has a population of
-            (2015) 72,066, but with an annual influx of students it has over 75,000
-            most of the year. It is the largest urban area in Northern Norway and the
-            third largest north of the Arctic Circle (following Murmansk and Norilsk).
-            Most of Tromsø, including the city centre, is located on the island of
-            Tromsøya, 350 kilometres (217 mi) north of the Arctic Circle. In 2012,
-            Tromsøya had a population of 36,088. Substantial parts of the urban area
-            are also situated on the mainland to the east, and on parts of Kvaløya—a
-            large island to the west. Tromsøya is connected to the mainland by the Tromsø
-            Bridge and the Tromsøysund Tunnel, and to the island of Kvaløya by the
-            Sandnessund Bridge. Tromsø Airport connects the city to many destinations
-            in Europe. The city is warmer than most other places located on the same
-            latitude, due to the warming effect of the Gulf Stream.""",
-            """The city centre of Tromsø contains the highest number of old wooden
-            houses in Northern Norway, the oldest house dating from 1789. The Arctic
-            Cathedral, a modern church from 1965, is probably the most famous landmark
-            in Tromsø. The city is a cultural centre for its region, with several
-            festivals taking place in the summer. Some of Norway's best-known
-             musicians, Torbjørn Brundtland and Svein Berge of the electronica duo
-             Röyksopp and Lene Marlin grew up and started their careers in Tromsø.
-             Noted electronic musician Geir Jenssen also hails from Tromsø.""",
-        ]
-
-        desc = summarize_paragraphs(example_paras, min_size=200, max_size=500)
-
-        self.assertEqual(
-            desc,
-            "Tromsø (Norwegian pronunciation: [ˈtrʊmsœ] ( listen); Northern Sami:"
-            " Romsa; Finnish: Tromssa[2] Kven: Tromssa) is a city and municipality in"
-            " Troms county, Norway. The administrative centre of the municipality is"
-            " the city of Tromsø. Outside of Norway, Tromso and Tromsö are"
-            " alternative spellings of the city.Tromsø is considered the northernmost"
-            " city in the world with a population above 50,000. The most populous town"
-            " north of it is Alta, Norway, with a population of 14,272 (2013).",
-        )
-
-        desc = summarize_paragraphs(example_paras[1:], min_size=200, max_size=500)
-
-        self.assertEqual(
-            desc,
-            "Tromsø lies in Northern Norway. The municipality has a population of"
-            " (2015) 72,066, but with an annual influx of students it has over 75,000"
-            " most of the year. It is the largest urban area in Northern Norway and the"
-            " third largest north of the Arctic Circle (following Murmansk and Norilsk)."
-            " Most of Tromsø, including the city centre, is located on the island of"
-            " Tromsøya, 350 kilometres (217 mi) north of the Arctic Circle. In 2012,"
-            " Tromsøya had a population of 36,088. Substantial parts of the urban…",
-        )
-
-    def test_short_summarize(self):
-        example_paras = [
-            "Tromsø (Norwegian pronunciation: [ˈtrʊmsœ] ( listen); Northern Sami:"
-            " Romsa; Finnish: Tromssa[2] Kven: Tromssa) is a city and municipality in"
-            " Troms county, Norway.",
-            "Tromsø lies in Northern Norway. The municipality has a population of"
-            " (2015) 72,066, but with an annual influx of students it has over 75,000"
-            " most of the year.",
-            "The city centre of Tromsø contains the highest number of old wooden"
-            " houses in Northern Norway, the oldest house dating from 1789. The Arctic"
-            " Cathedral, a modern church from 1965, is probably the most famous landmark"
-            " in Tromsø.",
-        ]
-
-        desc = summarize_paragraphs(example_paras, min_size=200, max_size=500)
-
-        self.assertEqual(
-            desc,
-            "Tromsø (Norwegian pronunciation: [ˈtrʊmsœ] ( listen); Northern Sami:"
-            " Romsa; Finnish: Tromssa[2] Kven: Tromssa) is a city and municipality in"
-            " Troms county, Norway.\n"
-            "\n"
-            "Tromsø lies in Northern Norway. The municipality has a population of"
-            " (2015) 72,066, but with an annual influx of students it has over 75,000"
-            " most of the year.",
-        )
-
-    def test_small_then_large_summarize(self):
-        example_paras = [
-            "Tromsø (Norwegian pronunciation: [ˈtrʊmsœ] ( listen); Northern Sami:"
-            " Romsa; Finnish: Tromssa[2] Kven: Tromssa) is a city and municipality in"
-            " Troms county, Norway.",
-            "Tromsø lies in Northern Norway. The municipality has a population of"
-            " (2015) 72,066, but with an annual influx of students it has over 75,000"
-            " most of the year."
-            " The city centre of Tromsø contains the highest number of old wooden"
-            " houses in Northern Norway, the oldest house dating from 1789. The Arctic"
-            " Cathedral, a modern church from 1965, is probably the most famous landmark"
-            " in Tromsø.",
-        ]
-
-        desc = summarize_paragraphs(example_paras, min_size=200, max_size=500)
-        self.assertEqual(
-            desc,
-            "Tromsø (Norwegian pronunciation: [ˈtrʊmsœ] ( listen); Northern Sami:"
-            " Romsa; Finnish: Tromssa[2] Kven: Tromssa) is a city and municipality in"
-            " Troms county, Norway.\n"
-            "\n"
-            "Tromsø lies in Northern Norway. The municipality has a population of"
-            " (2015) 72,066, but with an annual influx of students it has over 75,000"
-            " most of the year. The city centre of Tromsø contains the highest number"
-            " of old wooden houses in Northern Norway, the oldest house dating from"
-            " 1789. The Arctic Cathedral, a modern church from…",
-        )
-
-
-class CalcOgTestCase(unittest.TestCase):
-    if not lxml:
-        skip = "url preview feature requires lxml"
-
-    def test_simple(self):
-        html = b"""
-        <html>
-        <head><title>Foo</title></head>
-        <body>
-        Some text.
-        </body>
-        </html>
-        """
-
-        tree = decode_body(html, "http://example.com/test.html")
-        og = parse_html_to_open_graph(tree, "http://example.com/test.html")
-
-        self.assertEqual(og, {"og:title": "Foo", "og:description": "Some text."})
-
-    def test_comment(self):
-        html = b"""
-        <html>
-        <head><title>Foo</title></head>
-        <body>
-        <!-- HTML comment -->
-        Some text.
-        </body>
-        </html>
-        """
-
-        tree = decode_body(html, "http://example.com/test.html")
-        og = parse_html_to_open_graph(tree, "http://example.com/test.html")
-
-        self.assertEqual(og, {"og:title": "Foo", "og:description": "Some text."})
-
-    def test_comment2(self):
-        html = b"""
-        <html>
-        <head><title>Foo</title></head>
-        <body>
-        Some text.
-        <!-- HTML comment -->
-        Some more text.
-        <p>Text</p>
-        More text
-        </body>
-        </html>
-        """
-
-        tree = decode_body(html, "http://example.com/test.html")
-        og = parse_html_to_open_graph(tree, "http://example.com/test.html")
-
-        self.assertEqual(
-            og,
-            {
-                "og:title": "Foo",
-                "og:description": "Some text.\n\nSome more text.\n\nText\n\nMore text",
-            },
-        )
-
-    def test_script(self):
-        html = b"""
-        <html>
-        <head><title>Foo</title></head>
-        <body>
-        <script> (function() {})() </script>
-        Some text.
-        </body>
-        </html>
-        """
-
-        tree = decode_body(html, "http://example.com/test.html")
-        og = parse_html_to_open_graph(tree, "http://example.com/test.html")
-
-        self.assertEqual(og, {"og:title": "Foo", "og:description": "Some text."})
-
-    def test_missing_title(self):
-        html = b"""
-        <html>
-        <body>
-        Some text.
-        </body>
-        </html>
-        """
-
-        tree = decode_body(html, "http://example.com/test.html")
-        og = parse_html_to_open_graph(tree, "http://example.com/test.html")
-
-        self.assertEqual(og, {"og:title": None, "og:description": "Some text."})
-
-    def test_h1_as_title(self):
-        html = b"""
-        <html>
-        <meta property="og:description" content="Some text."/>
-        <body>
-        <h1>Title</h1>
-        </body>
-        </html>
-        """
-
-        tree = decode_body(html, "http://example.com/test.html")
-        og = parse_html_to_open_graph(tree, "http://example.com/test.html")
-
-        self.assertEqual(og, {"og:title": "Title", "og:description": "Some text."})
-
-    def test_missing_title_and_broken_h1(self):
-        html = b"""
-        <html>
-        <body>
-        <h1><a href="foo"/></h1>
-        Some text.
-        </body>
-        </html>
-        """
-
-        tree = decode_body(html, "http://example.com/test.html")
-        og = parse_html_to_open_graph(tree, "http://example.com/test.html")
-
-        self.assertEqual(og, {"og:title": None, "og:description": "Some text."})
-
-    def test_empty(self):
-        """Test a body with no data in it."""
-        html = b""
-        tree = decode_body(html, "http://example.com/test.html")
-        self.assertIsNone(tree)
-
-    def test_no_tree(self):
-        """A valid body with no tree in it."""
-        html = b"\x00"
-        tree = decode_body(html, "http://example.com/test.html")
-        self.assertIsNone(tree)
-
-    def test_xml(self):
-        """Test decoding XML and ensure it works properly."""
-        # Note that the strip() call is important to ensure the xml tag starts
-        # at the initial byte.
-        html = b"""
-        <?xml version="1.0" encoding="UTF-8"?>
-
-        <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
-        <html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
-        <head><title>Foo</title></head><body>Some text.</body></html>
-        """.strip()
-        tree = decode_body(html, "http://example.com/test.html")
-        og = parse_html_to_open_graph(tree, "http://example.com/test.html")
-        self.assertEqual(og, {"og:title": "Foo", "og:description": "Some text."})
-
-    def test_invalid_encoding(self):
-        """An invalid character encoding should be ignored and treated as UTF-8, if possible."""
-        html = b"""
-        <html>
-        <head><title>Foo</title></head>
-        <body>
-        Some text.
-        </body>
-        </html>
-        """
-        tree = decode_body(html, "http://example.com/test.html", "invalid-encoding")
-        og = parse_html_to_open_graph(tree, "http://example.com/test.html")
-        self.assertEqual(og, {"og:title": "Foo", "og:description": "Some text."})
-
-    def test_invalid_encoding2(self):
-        """A body which doesn't match the sent character encoding."""
-        # Note that this contains an invalid UTF-8 sequence in the title.
-        html = b"""
-        <html>
-        <head><title>\xff\xff Foo</title></head>
-        <body>
-        Some text.
-        </body>
-        </html>
-        """
-        tree = decode_body(html, "http://example.com/test.html")
-        og = parse_html_to_open_graph(tree, "http://example.com/test.html")
-        self.assertEqual(og, {"og:title": "ÿÿ Foo", "og:description": "Some text."})
-
-    def test_windows_1252(self):
-        """A body which uses cp1252, but doesn't declare that."""
-        html = b"""
-        <html>
-        <head><title>\xf3</title></head>
-        <body>
-        Some text.
-        </body>
-        </html>
-        """
-        tree = decode_body(html, "http://example.com/test.html")
-        og = parse_html_to_open_graph(tree, "http://example.com/test.html")
-        self.assertEqual(og, {"og:title": "ó", "og:description": "Some text."})
-
-
-class MediaEncodingTestCase(unittest.TestCase):
-    def test_meta_charset(self):
-        """A character encoding is found via the meta tag."""
-        encodings = _get_html_media_encodings(
-            b"""
-        <html>
-        <head><meta charset="ascii">
-        </head>
-        </html>
-        """,
-            "text/html",
-        )
-        self.assertEqual(list(encodings), ["ascii", "utf-8", "cp1252"])
-
-        # A less well-formed version.
-        encodings = _get_html_media_encodings(
-            b"""
-        <html>
-        <head>< meta charset = ascii>
-        </head>
-        </html>
-        """,
-            "text/html",
-        )
-        self.assertEqual(list(encodings), ["ascii", "utf-8", "cp1252"])
-
-    def test_meta_charset_underscores(self):
-        """A character encoding contains underscore."""
-        encodings = _get_html_media_encodings(
-            b"""
-        <html>
-        <head><meta charset="Shift_JIS">
-        </head>
-        </html>
-        """,
-            "text/html",
-        )
-        self.assertEqual(list(encodings), ["shift_jis", "utf-8", "cp1252"])
-
-    def test_xml_encoding(self):
-        """A character encoding is found via the meta tag."""
-        encodings = _get_html_media_encodings(
-            b"""
-        <?xml version="1.0" encoding="ascii"?>
-        <html>
-        </html>
-        """,
-            "text/html",
-        )
-        self.assertEqual(list(encodings), ["ascii", "utf-8", "cp1252"])
-
-    def test_meta_xml_encoding(self):
-        """Meta tags take precedence over XML encoding."""
-        encodings = _get_html_media_encodings(
-            b"""
-        <?xml version="1.0" encoding="ascii"?>
-        <html>
-        <head><meta charset="UTF-16">
-        </head>
-        </html>
-        """,
-            "text/html",
-        )
-        self.assertEqual(list(encodings), ["utf-16", "ascii", "utf-8", "cp1252"])
-
-    def test_content_type(self):
-        """A character encoding is found via the Content-Type header."""
-        # Test a few variations of the header.
-        headers = (
-            'text/html; charset="ascii";',
-            "text/html;charset=ascii;",
-            'text/html;  charset="ascii"',
-            "text/html; charset=ascii",
-            'text/html; charset="ascii;',
-            'text/html; charset=ascii";',
-        )
-        for header in headers:
-            encodings = _get_html_media_encodings(b"", header)
-            self.assertEqual(list(encodings), ["ascii", "utf-8", "cp1252"])
-
-    def test_fallback(self):
-        """A character encoding cannot be found in the body or header."""
-        encodings = _get_html_media_encodings(b"", "text/html")
-        self.assertEqual(list(encodings), ["utf-8", "cp1252"])
-
-    def test_duplicates(self):
-        """Ensure each encoding is only attempted once."""
-        encodings = _get_html_media_encodings(
-            b"""
-        <?xml version="1.0" encoding="utf8"?>
-        <html>
-        <head><meta charset="UTF-8">
-        </head>
-        </html>
-        """,
-            'text/html; charset="UTF_8"',
-        )
-        self.assertEqual(list(encodings), ["utf-8", "cp1252"])
-
-    def test_unknown_invalid(self):
-        """A character encoding should be ignored if it is unknown or invalid."""
-        encodings = _get_html_media_encodings(
-            b"""
-        <html>
-        <head><meta charset="invalid">
-        </head>
-        </html>
-        """,
-            'text/html; charset="invalid"',
-        )
-        self.assertEqual(list(encodings), ["utf-8", "cp1252"])
-- 
cgit 1.5.1