From 10332c175c829a21b6565bc5e865bca154ffb084 Mon Sep 17 00:00:00 2001 From: David Teller Date: Mon, 18 Jan 2021 15:02:22 +0100 Subject: New API /_synapse/admin/rooms/{roomId}/context/{eventId} Signed-off-by: David Teller --- tests/rest/admin/test_room.py | 48 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 48 insertions(+) (limited to 'tests') diff --git a/tests/rest/admin/test_room.py b/tests/rest/admin/test_room.py index a0f32c5512..7e89eb4793 100644 --- a/tests/rest/admin/test_room.py +++ b/tests/rest/admin/test_room.py @@ -1430,6 +1430,54 @@ class JoinAliasRoomTestCase(unittest.HomeserverTestCase): self.assertEquals(200, int(channel.result["code"]), msg=channel.result["body"]) self.assertEqual(private_room_id, channel.json_body["joined_rooms"][0]) + def test_context(self): + """ + Test that, as admin, we can find the context of an event without having joined the room. + """ + + # Create a room. We're not part of it. + user_id = self.register_user("test", "test") + user_tok = self.login("test", "test") + room_id = self.helper.create_room_as(user_id, tok=user_tok) + + # Populate the room with events. + events = [] + for i in range(30): + events.append( + self.helper.send_event( + room_id, "com.example.test", content={"index": i}, tok=user_tok + ) + ) + + # Now let's fetch the context for this room. + midway = (len(events) - 1) // 2 + channel = self.make_request( + "GET", + "/_synapse/admin/v1/rooms/%s/context/%s" + % (room_id, events[midway]["event_id"]), + access_token=self.admin_user_tok, + ) + self.assertEquals(200, int(channel.result["code"]), msg=channel.result["body"]) + self.assertEquals( + channel.json_body["event"]["event_id"], events[midway]["event_id"] + ) + + for i, found_event in enumerate(channel.json_body["events_before"]): + for j, posted_event in enumerate(events): + if found_event["event_id"] == posted_event["event_id"]: + self.assertTrue(j < midway) + break + else: + self.fail("Event %s from events_before not found" % j) + + for i, found_event in enumerate(channel.json_body["events_after"]): + for j, posted_event in enumerate(events): + if found_event["event_id"] == posted_event["event_id"]: + self.assertTrue(j > midway) + break + else: + self.fail("Event %s from events_after not found" % j) + class MakeRoomAdminTestCase(unittest.HomeserverTestCase): servlets = [ -- cgit 1.5.1 From b859919acc3ad6c61ba26a3c9b1e36c75a30c3fe Mon Sep 17 00:00:00 2001 From: David Teller Date: Thu, 28 Jan 2021 12:18:07 +0100 Subject: FIXUP: Now testing that the user is admin! --- changelog.d/9150.feature | 5 +---- synapse/rest/admin/rooms.py | 3 ++- tests/rest/admin/test_room.py | 36 +++++++++++++++++++++++++++++++++++- 3 files changed, 38 insertions(+), 6 deletions(-) (limited to 'tests') diff --git a/changelog.d/9150.feature b/changelog.d/9150.feature index 86c4fd3d72..48a8148dee 100644 --- a/changelog.d/9150.feature +++ b/changelog.d/9150.feature @@ -1,4 +1 @@ -New API /_synapse/admin/rooms/{roomId}/context/{eventId} - -This API mirrors /_matrix/client/r0/rooms/{roomId}/context/{eventId} but lets administrators -inspect rooms. Designed to annotate abuse reports with context. +New API /_synapse/admin/rooms/{roomId}/context/{eventId}. diff --git a/synapse/rest/admin/rooms.py b/synapse/rest/admin/rooms.py index 6539655289..4393197549 100644 --- a/synapse/rest/admin/rooms.py +++ b/synapse/rest/admin/rooms.py @@ -578,7 +578,8 @@ class RoomEventContextServlet(RestServlet): self.auth = hs.get_auth() async def on_GET(self, request, room_id, event_id): - requester = await self.auth.get_user_by_req(request, allow_guest=True) + requester = await self.auth.get_user_by_req(request, allow_guest=False) + await assert_user_is_admin(self.auth, requester.user) limit = parse_integer(request, "limit", default=10) diff --git a/tests/rest/admin/test_room.py b/tests/rest/admin/test_room.py index 7e89eb4793..fd201993d3 100644 --- a/tests/rest/admin/test_room.py +++ b/tests/rest/admin/test_room.py @@ -1430,7 +1430,41 @@ class JoinAliasRoomTestCase(unittest.HomeserverTestCase): self.assertEquals(200, int(channel.result["code"]), msg=channel.result["body"]) self.assertEqual(private_room_id, channel.json_body["joined_rooms"][0]) - def test_context(self): + def test_context_as_non_admin(self): + """ + Test that, without being admin, one cannot use the context admin API + """ + # Create a room. + user_id = self.register_user("test", "test") + user_tok = self.login("test", "test") + + self.register_user("test_2", "test") + user_tok_2 = self.login("test_2", "test") + + room_id = self.helper.create_room_as(user_id, tok=user_tok) + + # Populate the room with events. + events = [] + for i in range(30): + events.append( + self.helper.send_event( + room_id, "com.example.test", content={"index": i}, tok=user_tok + ) + ) + + # Now attempt to find the context using the admin API without being admin. + midway = (len(events) - 1) // 2 + for tok in [user_tok, user_tok_2]: + channel = self.make_request( + "GET", + "/_synapse/admin/v1/rooms/%s/context/%s" + % (room_id, events[midway]["event_id"]), + access_token=tok, + ) + self.assertEquals(403, int(channel.result["code"]), msg=channel.result["body"]) + self.assertEqual(Codes.FORBIDDEN, channel.json_body["errcode"]) + + def test_context_as_admin(self): """ Test that, as admin, we can find the context of an event without having joined the room. """ -- cgit 1.5.1 From 31d072aea0a37ad5408995359b89080b5280f57d Mon Sep 17 00:00:00 2001 From: David Teller Date: Thu, 28 Jan 2021 16:53:40 +0100 Subject: FIXUP: linter --- synapse/handlers/room.py | 2 +- synapse/rest/admin/rooms.py | 5 +++++ tests/rest/admin/test_room.py | 4 +++- 3 files changed, 9 insertions(+), 2 deletions(-) (limited to 'tests') diff --git a/synapse/handlers/room.py b/synapse/handlers/room.py index e039cea024..99fd063084 100644 --- a/synapse/handlers/room.py +++ b/synapse/handlers/room.py @@ -53,7 +53,7 @@ from synapse.types import ( create_requester, ) from synapse.util import stringutils -from synapse.util.async_helpers import Linearizer, maybe_awaitable +from synapse.util.async_helpers import Linearizer from synapse.util.caches.response_cache import ResponseCache from synapse.util.stringutils import parse_and_validate_server_name from synapse.visibility import filter_events_for_client diff --git a/synapse/rest/admin/rooms.py b/synapse/rest/admin/rooms.py index 641e325581..c673ff07e7 100644 --- a/synapse/rest/admin/rooms.py +++ b/synapse/rest/admin/rooms.py @@ -15,9 +15,11 @@ import logging from http import HTTPStatus from typing import TYPE_CHECKING, List, Optional, Tuple +from urllib import parse as urlparse from synapse.api.constants import EventTypes, JoinRules, Membership from synapse.api.errors import AuthError, Codes, NotFoundError, SynapseError +from synapse.api.filtering import Filter from synapse.http.servlet import ( RestServlet, assert_params_in_dict, @@ -33,6 +35,7 @@ from synapse.rest.admin._base import ( ) from synapse.storage.databases.main.room import RoomSortOrder from synapse.types import JsonDict, RoomAlias, RoomID, UserID, create_requester +from synapse.util import json_decoder if TYPE_CHECKING: from synapse.server import HomeServer @@ -567,6 +570,7 @@ class ForwardExtremitiesRestServlet(RestServlet): extremities = await self.store.get_forward_extremities_for_room(room_id) return 200, {"count": len(extremities), "results": extremities} + class RoomEventContextServlet(RestServlet): """ Provide the context for an event. @@ -574,6 +578,7 @@ class RoomEventContextServlet(RestServlet): an abuse report and understand what happened during and immediately prior to this event. """ + PATTERNS = admin_patterns("/rooms/(?P[^/]*)/context/(?P[^/]*)$") def __init__(self, hs): diff --git a/tests/rest/admin/test_room.py b/tests/rest/admin/test_room.py index fd201993d3..f4f5569a89 100644 --- a/tests/rest/admin/test_room.py +++ b/tests/rest/admin/test_room.py @@ -1461,7 +1461,9 @@ class JoinAliasRoomTestCase(unittest.HomeserverTestCase): % (room_id, events[midway]["event_id"]), access_token=tok, ) - self.assertEquals(403, int(channel.result["code"]), msg=channel.result["body"]) + self.assertEquals( + 403, int(channel.result["code"]), msg=channel.result["body"] + ) self.assertEqual(Codes.FORBIDDEN, channel.json_body["errcode"]) def test_context_as_admin(self): -- cgit 1.5.1 From 0963d39ea66be95d44f8f4ae29fa6f33fc6740b4 Mon Sep 17 00:00:00 2001 From: Patrick Cloke Date: Mon, 8 Feb 2021 12:33:30 -0500 Subject: Handle additional errors when previewing URLs. (#9333) * Handle the case of lxml not finding a document tree. * Parse the document encoding from the XML tag. --- changelog.d/9333.bugfix | 1 + synapse/rest/media/v1/preview_url_resource.py | 71 +++++++++++++----- tests/test_preview.py | 103 +++++++++++++++++++++++--- 3 files changed, 145 insertions(+), 30 deletions(-) create mode 100644 changelog.d/9333.bugfix (limited to 'tests') diff --git a/changelog.d/9333.bugfix b/changelog.d/9333.bugfix new file mode 100644 index 0000000000..c34ba378c5 --- /dev/null +++ b/changelog.d/9333.bugfix @@ -0,0 +1 @@ +Fix additional errors when previewing URLs: "AttributeError 'NoneType' object has no attribute 'xpath'" and "ValueError: Unicode strings with encoding declaration are not supported. Please use bytes input or XML fragments without declaration.". diff --git a/synapse/rest/media/v1/preview_url_resource.py b/synapse/rest/media/v1/preview_url_resource.py index bf3be653aa..ae53b1d23f 100644 --- a/synapse/rest/media/v1/preview_url_resource.py +++ b/synapse/rest/media/v1/preview_url_resource.py @@ -58,7 +58,10 @@ if TYPE_CHECKING: logger = logging.getLogger(__name__) -_charset_match = re.compile(br"<\s*meta[^>]*charset\s*=\s*([a-z0-9-]+)", flags=re.I) +_charset_match = re.compile(br'<\s*meta[^>]*charset\s*=\s*"?([a-z0-9-]+)"?', flags=re.I) +_xml_encoding_match = re.compile( + br'\s*<\s*\?\s*xml[^>]*encoding="([a-z0-9-]+)"', flags=re.I +) _content_type_match = re.compile(r'.*; *charset="?(.*?)"?(;|$)', flags=re.I) OG_TAG_NAME_MAXLEN = 50 @@ -300,24 +303,7 @@ class PreviewUrlResource(DirectServeJsonResource): with open(media_info["filename"], "rb") as file: body = file.read() - encoding = None - - # Let's try and figure out if it has an encoding set in a meta tag. - # Limit it to the first 1kb, since it ought to be in the meta tags - # at the top. - match = _charset_match.search(body[:1000]) - - # If we find a match, it should take precedence over the - # Content-Type header, so set it here. - if match: - encoding = match.group(1).decode("ascii") - - # If we don't find a match, we'll look at the HTTP Content-Type, and - # if that doesn't exist, we'll fall back to UTF-8. - if not encoding: - content_match = _content_type_match.match(media_info["media_type"]) - encoding = content_match.group(1) if content_match else "utf-8" - + encoding = get_html_media_encoding(body, media_info["media_type"]) og = decode_and_calc_og(body, media_info["uri"], encoding) # pre-cache the image for posterity @@ -689,6 +675,48 @@ class PreviewUrlResource(DirectServeJsonResource): logger.debug("No media removed from url cache") +def get_html_media_encoding(body: bytes, content_type: str) -> str: + """ + Get the encoding of the body based on the (presumably) HTML body or media_type. + + The precedence used for finding a character encoding is: + + 1. meta tag with a charset declared. + 2. The XML document's character encoding attribute. + 3. The Content-Type header. + 4. Fallback to UTF-8. + + Args: + body: The HTML document, as bytes. + content_type: The Content-Type header. + + Returns: + The character encoding of the body, as a string. + """ + # Limit searches to the first 1kb, since it ought to be at the top. + body_start = body[:1024] + + # Let's try and figure out if it has an encoding set in a meta tag. + match = _charset_match.search(body_start) + if match: + return match.group(1).decode("ascii") + + # TODO Support + + # If we didn't find a match, see if it an XML document with an encoding. + match = _xml_encoding_match.match(body_start) + if match: + return match.group(1).decode("ascii") + + # If we don't find a match, we'll look at the HTTP Content-Type, and + # if that doesn't exist, we'll fall back to UTF-8. + content_match = _content_type_match.match(content_type) + if content_match: + return content_match.group(1) + + return "utf-8" + + def decode_and_calc_og( body: bytes, media_uri: str, request_encoding: Optional[str] = None ) -> Dict[str, Optional[str]]: @@ -725,6 +753,11 @@ def decode_and_calc_og( def _attempt_calc_og(body_attempt: Union[bytes, str]) -> Dict[str, Optional[str]]: # Attempt to parse the body. If this fails, log and return no metadata. tree = etree.fromstring(body_attempt, parser) + + # The data was successfully parsed, but no tree was found. + if tree is None: + return {} + return _calc_og(tree, media_uri) # Attempt to parse the body. If this fails, log and return no metadata. diff --git a/tests/test_preview.py b/tests/test_preview.py index 0c6cbbd921..ea83299918 100644 --- a/tests/test_preview.py +++ b/tests/test_preview.py @@ -15,6 +15,7 @@ from synapse.rest.media.v1.preview_url_resource import ( decode_and_calc_og, + get_html_media_encoding, summarize_paragraphs, ) @@ -26,7 +27,7 @@ except ImportError: lxml = None -class PreviewTestCase(unittest.TestCase): +class SummarizeTestCase(unittest.TestCase): if not lxml: skip = "url preview feature requires lxml" @@ -144,12 +145,12 @@ class PreviewTestCase(unittest.TestCase): ) -class PreviewUrlTestCase(unittest.TestCase): +class CalcOgTestCase(unittest.TestCase): if not lxml: skip = "url preview feature requires lxml" def test_simple(self): - html = """ + html = b""" Foo @@ -163,7 +164,7 @@ class PreviewUrlTestCase(unittest.TestCase): self.assertEqual(og, {"og:title": "Foo", "og:description": "Some text."}) def test_comment(self): - html = """ + html = b""" Foo @@ -178,7 +179,7 @@ class PreviewUrlTestCase(unittest.TestCase): self.assertEqual(og, {"og:title": "Foo", "og:description": "Some text."}) def test_comment2(self): - html = """ + html = b""" Foo @@ -202,7 +203,7 @@ class PreviewUrlTestCase(unittest.TestCase): ) def test_script(self): - html = """ + html = b""" Foo @@ -217,7 +218,7 @@ class PreviewUrlTestCase(unittest.TestCase): self.assertEqual(og, {"og:title": "Foo", "og:description": "Some text."}) def test_missing_title(self): - html = """ + html = b""" Some text. @@ -230,7 +231,7 @@ class PreviewUrlTestCase(unittest.TestCase): self.assertEqual(og, {"og:title": None, "og:description": "Some text."}) def test_h1_as_title(self): - html = """ + html = b""" @@ -244,7 +245,7 @@ class PreviewUrlTestCase(unittest.TestCase): self.assertEqual(og, {"og:title": "Title", "og:description": "Some text."}) def test_missing_title_and_broken_h1(self): - html = """ + html = b"""

@@ -258,13 +259,20 @@ class PreviewUrlTestCase(unittest.TestCase): self.assertEqual(og, {"og:title": None, "og:description": "Some text."}) def test_empty(self): - html = "" + """Test a body with no data in it.""" + html = b"" + og = decode_and_calc_og(html, "http://example.com/test.html") + self.assertEqual(og, {}) + + def test_no_tree(self): + """A valid body with no tree in it.""" + html = b"\x00" og = decode_and_calc_og(html, "http://example.com/test.html") self.assertEqual(og, {}) def test_invalid_encoding(self): """An invalid character encoding should be ignored and treated as UTF-8, if possible.""" - html = """ + html = b""" Foo @@ -290,3 +298,76 @@ class PreviewUrlTestCase(unittest.TestCase): """ og = decode_and_calc_og(html, "http://example.com/test.html") self.assertEqual(og, {"og:title": "ÿÿ Foo", "og:description": "Some text."}) + + +class MediaEncodingTestCase(unittest.TestCase): + def test_meta_charset(self): + """A character encoding is found via the meta tag.""" + encoding = get_html_media_encoding( + b""" + + + + + """, + "text/html", + ) + self.assertEqual(encoding, "ascii") + + # A less well-formed version. + encoding = get_html_media_encoding( + b""" + + < meta charset = ascii> + + + """, + "text/html", + ) + self.assertEqual(encoding, "ascii") + + def test_xml_encoding(self): + """A character encoding is found via the meta tag.""" + encoding = get_html_media_encoding( + b""" + + + + """, + "text/html", + ) + self.assertEqual(encoding, "ascii") + + def test_meta_xml_encoding(self): + """Meta tags take precedence over XML encoding.""" + encoding = get_html_media_encoding( + b""" + + + + + + """, + "text/html", + ) + self.assertEqual(encoding, "UTF-16") + + def test_content_type(self): + """A character encoding is found via the Content-Type header.""" + # Test a few variations of the header. + headers = ( + 'text/html; charset="ascii";', + "text/html;charset=ascii;", + 'text/html; charset="ascii"', + "text/html; charset=ascii", + 'text/html; charset="ascii;', + 'text/html; charset=ascii";', + ) + for header in headers: + encoding = get_html_media_encoding(b"", header) + self.assertEqual(encoding, "ascii") + + def test_fallback(self): + """A character encoding cannot be found in the body or header.""" + encoding = get_html_media_encoding(b"", "text/html") + self.assertEqual(encoding, "utf-8") -- cgit 1.5.1