author    | Patrick Cloke <patrickc@matrix.org> | 2021-12-09 15:27:37 -0500
committer | Patrick Cloke <patrickc@matrix.org> | 2022-05-24 13:18:58 -0400
commit    | 6235ed2656fa68a45e8b5cda99fe257811988956 (patch)
tree      | baa7051142165f848694f6de3b619b708c1731b3
parent    | Use BeautifulSoup instead of LXML directly. (diff)
download  | synapse-6235ed2656fa68a45e8b5cda99fe257811988956.tar.xz
Remove dead code.
-rw-r--r-- | synapse/rest/media/v1/preview_html.py         |  83
-rw-r--r-- | synapse/rest/media/v1/preview_url_resource.py |   2
-rw-r--r-- | tests/rest/media/v1/test_html_preview.py      | 128
3 files changed, 3 insertions, 210 deletions
diff --git a/synapse/rest/media/v1/preview_html.py b/synapse/rest/media/v1/preview_html.py
index 1866df60bb..c9711956f1 100644
--- a/synapse/rest/media/v1/preview_html.py
+++ b/synapse/rest/media/v1/preview_html.py
@@ -11,11 +11,10 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-import codecs
 import itertools
 import logging
 import re
-from typing import TYPE_CHECKING, Dict, Generator, Iterable, Iterator, Optional, Set
+from typing import TYPE_CHECKING, Dict, Generator, Iterable, Iterator, Optional

 if TYPE_CHECKING:
     from bs4 import BeautifulSoup
@@ -23,88 +22,10 @@ if TYPE_CHECKING:
 logger = logging.getLogger(__name__)

-_charset_match = re.compile(
-    rb'<\s*meta[^>]*charset\s*=\s*"?([a-z0-9_-]+)"?', flags=re.I
-)
-_xml_encoding_match = re.compile(
-    rb'\s*<\s*\?\s*xml[^>]*encoding="([a-z0-9_-]+)"', flags=re.I
-)
 _content_type_match = re.compile(r'.*; *charset="?(.*?)"?(;|$)', flags=re.I)


-def _normalise_encoding(encoding: str) -> Optional[str]:
-    """Use the Python codec's name as the normalised entry."""
-    try:
-        return codecs.lookup(encoding).name
-    except LookupError:
-        return None
-
-
-def _get_html_media_encodings(
-    body: bytes, content_type: Optional[str]
-) -> Iterable[str]:
-    """
-    Get potential encoding of the body based on the (presumably) HTML body or the content-type header.
-
-    The precedence used for finding a character encoding is:
-
-    1. <meta> tag with a charset declared.
-    2. The XML document's character encoding attribute.
-    3. The Content-Type header.
-    4. Fallback to utf-8.
-    5. Fallback to windows-1252.
-
-    This roughly follows the algorithm used by BeautifulSoup's bs4.dammit.EncodingDetector.
-
-    Args:
-        body: The HTML document, as bytes.
-        content_type: The Content-Type header.
-
-    Returns:
-        The character encoding of the body, as a string.
-    """
-    # There's no point in returning an encoding more than once.
-    attempted_encodings: Set[str] = set()
-
-    # Limit searches to the first 1kb, since it ought to be at the top.
-    body_start = body[:1024]
-
-    # Check if it has an encoding set in a meta tag.
-    match = _charset_match.search(body_start)
-    if match:
-        encoding = _normalise_encoding(match.group(1).decode("ascii"))
-        if encoding:
-            attempted_encodings.add(encoding)
-            yield encoding
-
-    # TODO Support <meta http-equiv="Content-Type" content="text/html; charset=utf-8"/>
-
-    # Check if it has an XML document with an encoding.
-    match = _xml_encoding_match.match(body_start)
-    if match:
-        encoding = _normalise_encoding(match.group(1).decode("ascii"))
-        if encoding and encoding not in attempted_encodings:
-            attempted_encodings.add(encoding)
-            yield encoding
-
-    # Check the HTTP Content-Type header for a character set.
-    if content_type:
-        content_match = _content_type_match.match(content_type)
-        if content_match:
-            encoding = _normalise_encoding(content_match.group(1))
-            if encoding and encoding not in attempted_encodings:
-                attempted_encodings.add(encoding)
-                yield encoding
-
-    # Finally, fallback to UTF-8, then windows-1252.
-    for fallback in ("utf-8", "cp1252"):
-        if fallback not in attempted_encodings:
-            yield fallback
-
-
-def decode_body(
-    body: bytes, uri: str, content_type: Optional[str] = None
-) -> Optional["BeautifulSoup"]:
+def decode_body(body: bytes, uri: str) -> Optional["BeautifulSoup"]:
     """
     This uses BeautifulSoup to parse the HTML document.
diff --git a/synapse/rest/media/v1/preview_url_resource.py b/synapse/rest/media/v1/preview_url_resource.py
index 2b2db63bf7..e406f6c704 100644
--- a/synapse/rest/media/v1/preview_url_resource.py
+++ b/synapse/rest/media/v1/preview_url_resource.py
@@ -303,7 +303,7 @@ class PreviewUrlResource(DirectServeJsonResource):
             with open(media_info.filename, "rb") as file:
                 body = file.read()

-            tree = decode_body(body, media_info.uri, media_info.media_type)
+            tree = decode_body(body, media_info.uri)
             if tree is not None:
                 # Check if this HTML document points to oEmbed information and
                 # defer to that.
diff --git a/tests/rest/media/v1/test_html_preview.py b/tests/rest/media/v1/test_html_preview.py
index 62e308814d..5e600450dc 100644
--- a/tests/rest/media/v1/test_html_preview.py
+++ b/tests/rest/media/v1/test_html_preview.py
@@ -13,7 +13,6 @@
 # limitations under the License.

 from synapse.rest.media.v1.preview_html import (
-    _get_html_media_encodings,
     decode_body,
     parse_html_to_open_graph,
     summarize_paragraphs,
@@ -292,20 +291,6 @@ class CalcOgTestCase(unittest.TestCase):
         og = parse_html_to_open_graph(tree)
         self.assertEqual(og, {"og:title": "Foo", "og:description": "Some text."})

-    def test_invalid_encoding(self) -> None:
-        """An invalid character encoding should be ignored and treated as UTF-8, if possible."""
-        html = b"""
-        <html>
-        <head><title>Foo</title></head>
-        <body>
-        Some text.
-        </body>
-        </html>
-        """
-        tree = decode_body(html, "http://example.com/test.html", "invalid-encoding")
-        og = parse_html_to_open_graph(tree)
-        self.assertEqual(og, {"og:title": "Foo", "og:description": "Some text."})
-
     def test_invalid_encoding2(self) -> None:
         """A body which doesn't match the sent character encoding."""
         # Note that this contains an invalid UTF-8 sequence in the title.
@@ -334,116 +319,3 @@
         tree = decode_body(html, "http://example.com/test.html")
         og = parse_html_to_open_graph(tree)
         self.assertEqual(og, {"og:title": "ó", "og:description": "Some text."})
-
-
-class MediaEncodingTestCase(unittest.TestCase):
-    def test_meta_charset(self) -> None:
-        """A character encoding is found via the meta tag."""
-        encodings = _get_html_media_encodings(
-            b"""
-        <html>
-        <head><meta charset="ascii">
-        </head>
-        </html>
-        """,
-            "text/html",
-        )
-        self.assertEqual(list(encodings), ["ascii", "utf-8", "cp1252"])
-
-        # A less well-formed version.
-        encodings = _get_html_media_encodings(
-            b"""
-        <html>
-        <head>< meta charset = ascii>
-        </head>
-        </html>
-        """,
-            "text/html",
-        )
-        self.assertEqual(list(encodings), ["ascii", "utf-8", "cp1252"])
-
-    def test_meta_charset_underscores(self) -> None:
-        """A character encoding contains underscore."""
-        encodings = _get_html_media_encodings(
-            b"""
-        <html>
-        <head><meta charset="Shift_JIS">
-        </head>
-        </html>
-        """,
-            "text/html",
-        )
-        self.assertEqual(list(encodings), ["shift_jis", "utf-8", "cp1252"])
-
-    def test_xml_encoding(self) -> None:
-        """A character encoding is found via the meta tag."""
-        encodings = _get_html_media_encodings(
-            b"""
-        <?xml version="1.0" encoding="ascii"?>
-        <html>
-        </html>
-        """,
-            "text/html",
-        )
-        self.assertEqual(list(encodings), ["ascii", "utf-8", "cp1252"])
-
-    def test_meta_xml_encoding(self) -> None:
-        """Meta tags take precedence over XML encoding."""
-        encodings = _get_html_media_encodings(
-            b"""
-        <?xml version="1.0" encoding="ascii"?>
-        <html>
-        <head><meta charset="UTF-16">
-        </head>
-        </html>
-        """,
-            "text/html",
-        )
-        self.assertEqual(list(encodings), ["utf-16", "ascii", "utf-8", "cp1252"])
-
-    def test_content_type(self) -> None:
-        """A character encoding is found via the Content-Type header."""
-        # Test a few variations of the header.
-        headers = (
-            'text/html; charset="ascii";',
-            "text/html;charset=ascii;",
-            'text/html; charset="ascii"',
-            "text/html; charset=ascii",
-            'text/html; charset="ascii;',
-            'text/html; charset=ascii";',
-        )
-        for header in headers:
-            encodings = _get_html_media_encodings(b"", header)
-            self.assertEqual(list(encodings), ["ascii", "utf-8", "cp1252"])
-
-    def test_fallback(self) -> None:
-        """A character encoding cannot be found in the body or header."""
-        encodings = _get_html_media_encodings(b"", "text/html")
-        self.assertEqual(list(encodings), ["utf-8", "cp1252"])
-
-    def test_duplicates(self) -> None:
-        """Ensure each encoding is only attempted once."""
-        encodings = _get_html_media_encodings(
-            b"""
-        <?xml version="1.0" encoding="utf8"?>
-        <html>
-        <head><meta charset="UTF-8">
-        </head>
-        </html>
-        """,
-            'text/html; charset="UTF_8"',
-        )
-        self.assertEqual(list(encodings), ["utf-8", "cp1252"])
-
-    def test_unknown_invalid(self) -> None:
-        """A character encoding should be ignored if it is unknown or invalid."""
-        encodings = _get_html_media_encodings(
-            b"""
-        <html>
-        <head><meta charset="invalid">
-        </head>
-        </html>
-        """,
-            'text/html; charset="invalid"',
-        )
-        self.assertEqual(list(encodings), ["utf-8", "cp1252"])
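Why this is dead code: the parent commit moved `decode_body` onto BeautifulSoup, and the removed `_get_html_media_encodings` docstring notes that it "roughly follows the algorithm used by BeautifulSoup's bs4.dammit.EncodingDetector". Once BeautifulSoup is handed the raw bytes it performs that charset sniffing itself, which is why the Content-Type hint could be dropped from `decode_body` and its caller. The following is a minimal sketch of that behaviour; it is not code from this commit, and the sample document and the `html.parser` builder are illustrative assumptions.

```python
# Sketch only: bs4 detecting a declared charset from raw bytes, i.e. the
# work that Synapse's hand-rolled encoding detection duplicated.
from bs4 import BeautifulSoup
from bs4.dammit import UnicodeDammit

# Illustrative document: declares windows-1252 and contains a 0xE9 byte ("é").
html = (
    b'<html><head><meta charset="windows-1252">'
    b"<title>Caf\xe9</title></head><body>Some text.</body></html>"
)

# UnicodeDammit is the detector BeautifulSoup uses internally; with is_html=True
# it checks the declared <meta> charset (and XML declarations/BOMs) before guessing.
dammit = UnicodeDammit(html, is_html=True)
print(dammit.original_encoding)  # windows-1252

# Passing the bytes straight to BeautifulSoup therefore decodes the title
# correctly without any Content-Type hint from the caller.
soup = BeautifulSoup(html, "html.parser")
print(soup.title.string)  # Café
```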