summary refs log tree commit diff
diff options
context:
space:
mode:
author Jeyachandran Rathnam <jai.rathnem@gmail.com> 2023-01-09 09:22:02 -0500
committer GitHub <noreply@github.com> 2023-01-09 14:22:02 +0000
commit babeeb4e7a6f5b5c643b837bf724d674805546f6 (patch)
tree ab7413393109e7b3cd4d65173b73f713ef2e4d11
parent Improve /sync performance of when passing filters with empty arrays. (#14786) (diff)
download synapse-babeeb4e7a6f5b5c643b837bf724d674805546f6.tar.xz
Unescape HTML entities in oEmbed titles. (#14781)
It doesn't seem valid that HTML entities should appear in
the title field of oEmbed responses, but a popular WordPress
plug-in seems to do it.

There should be no harm in unescaping these.
-rw-r--r-- changelog.d/14781.misc 1
-rw-r--r-- synapse/rest/media/v1/oembed.py 15
-rw-r--r-- tests/rest/media/v1/test_oembed.py 10
3 files changed, 20 insertions, 6 deletions
diff --git a/changelog.d/14781.misc b/changelog.d/14781.misc
new file mode 100644
index 0000000000..04f565b410
--- /dev/null
+++ b/changelog.d/14781.misc
@@ -0,0 +1 @@
+Unescape HTML entities in URL preview titles making use of oEmbed responses.
diff --git a/synapse/rest/media/v1/oembed.py b/synapse/rest/media/v1/oembed.py
index 827afd868d..a3738a6250 100644
--- a/synapse/rest/media/v1/oembed.py
+++ b/synapse/rest/media/v1/oembed.py
@@ -11,6 +11,7 @@
 #  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 #  See the License for the specific language governing permissions and
 #  limitations under the License.
+import html
 import logging
 import urllib.parse
 from typing import TYPE_CHECKING, List, Optional
@@ -161,7 +162,9 @@ class OEmbedProvider:
 
         title = oembed.get("title")
         if title and isinstance(title, str):
-            open_graph_response["og:title"] = title
+            # A common WordPress plug-in seems to incorrectly escape entities
+            # in the oEmbed response.
+            open_graph_response["og:title"] = html.unescape(title)
 
         author_name = oembed.get("author_name")
         if not isinstance(author_name, str):
@@ -180,9 +183,9 @@ class OEmbedProvider:
         # Process each type separately.
         oembed_type = oembed.get("type")
         if oembed_type == "rich":
-            html = oembed.get("html")
-            if isinstance(html, str):
-                calc_description_and_urls(open_graph_response, html)
+            html_str = oembed.get("html")
+            if isinstance(html_str, str):
+                calc_description_and_urls(open_graph_response, html_str)
 
         elif oembed_type == "photo":
             # If this is a photo, use the full image, not the thumbnail.
@@ -192,8 +195,8 @@ class OEmbedProvider:
 
         elif oembed_type == "video":
             open_graph_response["og:type"] = "video.other"
-            html = oembed.get("html")
-            if html and isinstance(html, str):
+            html_str = oembed.get("html")
+            if html_str and isinstance(html_str, str):
                 calc_description_and_urls(open_graph_response, oembed["html"])
             for size in ("width", "height"):
                 val = oembed.get(size)
diff --git a/tests/rest/media/v1/test_oembed.py b/tests/rest/media/v1/test_oembed.py
index 319ae8b1cc..3f7f1dbab9 100644
--- a/tests/rest/media/v1/test_oembed.py
+++ b/tests/rest/media/v1/test_oembed.py
@@ -150,3 +150,13 @@ class OEmbedTests(HomeserverTestCase):
         result = self.parse_response({"type": "link"})
         self.assertIn("og:type", result.open_graph_result)
         self.assertEqual(result.open_graph_result["og:type"], "website")
+
+    def test_title_html_entities(self) -> None:
+        """Test HTML entities in title"""
+        result = self.parse_response(
+            {"title": "Why JSON isn&#8217;t a Good Configuration Language"}
+        )
+        self.assertEqual(
+            result.open_graph_result["og:title"],
+            "Why JSON isn’t a Good Configuration Language",
+        )