Unescape HTML entities in oEmbed titles. (#14781)

It does not seem valid for HTML entities to appear in the title field of an oEmbed response, but a popular WordPress plug-in appears to emit them anyway. Unescaping them should do no harm.
author: Jeyachandran Rathnam <jai.rathnem@gmail.com> 2023-01-09 09:22:02 -0500
committer: GitHub <noreply@github.com> 2023-01-09 14:22:02 +0000
commit: babeeb4e7a6f5b5c643b837bf724d674805546f6 (patch)
tree: ab7413393109e7b3cd4d65173b73f713ef2e4d11
parent: Improve /sync performance of when passing filters with empty arrays. (#14786) (diff)
download: synapse-babeeb4e7a6f5b5c643b837bf724d674805546f6.tar.xz
3 files changed, 20 insertions, 6 deletions
diff --git a/changelog.d/14781.misc b/changelog.d/14781.misc
new file mode 100644
index 0000000000..04f565b410
--- /dev/null
+++ b/changelog.d/14781.misc
@@ -0,0 +1 @@
+Unescape HTML entities in URL preview titles making use of oEmbed responses.
diff --git a/synapse/rest/media/v1/oembed.py b/synapse/rest/media/v1/oembed.py
index 827afd868d..a3738a6250 100644
--- a/synapse/rest/media/v1/oembed.py
+++ b/synapse/rest/media/v1/oembed.py
@@ -11,6 +11,7 @@
 #  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 #  See the License for the specific language governing permissions and
 #  limitations under the License.
+import html
 import logging
 import urllib.parse
 from typing import TYPE_CHECKING, List, Optional
@@ -161,7 +162,9 @@ class OEmbedProvider:
 
         title = oembed.get("title")
         if title and isinstance(title, str):
-            open_graph_response["og:title"] = title
+            # A common WordPress plug-in seems to incorrectly escape entities
+            # in the oEmbed response.
+            open_graph_response["og:title"] = html.unescape(title)
 
         author_name = oembed.get("author_name")
         if not isinstance(author_name, str):
@@ -180,9 +183,9 @@ class OEmbedProvider:
         # Process each type separately.
         oembed_type = oembed.get("type")
         if oembed_type == "rich":
-            html = oembed.get("html")
-            if isinstance(html, str):
-                calc_description_and_urls(open_graph_response, html)
+            html_str = oembed.get("html")
+            if isinstance(html_str, str):
+                calc_description_and_urls(open_graph_response, html_str)
 
         elif oembed_type == "photo":
             # If this is a photo, use the full image, not the thumbnail.
@@ -192,8 +195,8 @@ class OEmbedProvider:
 
         elif oembed_type == "video":
             open_graph_response["og:type"] = "video.other"
-            html = oembed.get("html")
-            if html and isinstance(html, str):
+            html_str = oembed.get("html")
+            if html_str and isinstance(html_str, str):
                 calc_description_and_urls(open_graph_response, oembed["html"])
             for size in ("width", "height"):
                 val = oembed.get(size)
diff --git a/tests/rest/media/v1/test_oembed.py b/tests/rest/media/v1/test_oembed.py
index 319ae8b1cc..3f7f1dbab9 100644
--- a/tests/rest/media/v1/test_oembed.py
+++ b/tests/rest/media/v1/test_oembed.py
@@ -150,3 +150,13 @@ class OEmbedTests(HomeserverTestCase):
         result = self.parse_response({"type": "link"})
         self.assertIn("og:type", result.open_graph_result)
         self.assertEqual(result.open_graph_result["og:type"], "website")
+
+    def test_title_html_entities(self) -> None:
+        """Test HTML entities in title"""
+        result = self.parse_response(
+            {"title": "Why JSON isn&#8217;t a Good Configuration Language"}
+        )
+        self.assertEqual(
+            result.open_graph_result["og:title"],
+            "Why JSON isn’t a Good Configuration Language",
+        )
author	Jeyachandran Rathnam <jai.rathnem@gmail.com>	2023-01-09 09:22:02 -0500
committer	GitHub <noreply@github.com>	2023-01-09 14:22:02 +0000
commit	babeeb4e7a6f5b5c643b837bf724d674805546f6 (patch)
tree	ab7413393109e7b3cd4d65173b73f713ef2e4d11
parent	Improve /sync performance of when passing filters with empty arrays. (#14786) (diff)
download	synapse-babeeb4e7a6f5b5c643b837bf724d674805546f6.tar.xz