summary refs log tree commit diff
path: root/tests/test_preview.py
blob: a883d707df2814aa40114554dfa41f9729496382 (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
# -*- coding: utf-8 -*-
# Copyright 2014-2016 OpenMarket Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from synapse.rest.media.v1.preview_url_resource import (
    decode_and_calc_og,
    summarize_paragraphs,
)

from . import unittest


class PreviewTestCase(unittest.TestCase):
    """Exercise ``summarize_paragraphs`` with ``min_size=200, max_size=500``."""

    def test_long_summarize(self):
        """Hard-wrapped, long paragraphs are flattened and cut when too long."""
        paragraphs = [
            """Tromsø (Norwegian pronunciation: [ˈtrʊmsœ] ( listen); Northern Sami:
            Romsa; Finnish: Tromssa[2] Kven: Tromssa) is a city and municipality in
            Troms county, Norway. The administrative centre of the municipality is
            the city of Tromsø. Outside of Norway, Tromso and Tromsö are
            alternative spellings of the city.Tromsø is considered the northernmost
            city in the world with a population above 50,000. The most populous town
            north of it is Alta, Norway, with a population of 14,272 (2013).""",
            """Tromsø lies in Northern Norway. The municipality has a population of
            (2015) 72,066, but with an annual influx of students it has over 75,000
            most of the year. It is the largest urban area in Northern Norway and the
            third largest north of the Arctic Circle (following Murmansk and Norilsk).
            Most of Tromsø, including the city centre, is located on the island of
            Tromsøya, 350 kilometres (217 mi) north of the Arctic Circle. In 2012,
            Tromsøya had a population of 36,088. Substantial parts of the urban area
            are also situated on the mainland to the east, and on parts of Kvaløya—a
            large island to the west. Tromsøya is connected to the mainland by the Tromsø
            Bridge and the Tromsøysund Tunnel, and to the island of Kvaløya by the
            Sandnessund Bridge. Tromsø Airport connects the city to many destinations
            in Europe. The city is warmer than most other places located on the same
            latitude, due to the warming effect of the Gulf Stream.""",
            """The city centre of Tromsø contains the highest number of old wooden
            houses in Northern Norway, the oldest house dating from 1789. The Arctic
            Cathedral, a modern church from 1965, is probably the most famous landmark
            in Tromsø. The city is a cultural centre for its region, with several
            festivals taking place in the summer. Some of Norway's best-known
             musicians, Torbjørn Brundtland and Svein Berge of the electronica duo
             Röyksopp and Lene Marlin grew up and started their careers in Tromsø.
             Noted electronic musician Geir Jenssen also hails from Tromsø.""",
        ]

        # With all three paragraphs as input, the expected summary is the first
        # paragraph only, each line break plus indentation reduced to one space.
        summary = summarize_paragraphs(paragraphs, min_size=200, max_size=500)
        expected_first_only = (
            "Tromsø (Norwegian pronunciation: [ˈtrʊmsœ] ( listen); Northern Sami:"
            " Romsa; Finnish: Tromssa[2] Kven: Tromssa) is a city and municipality in"
            " Troms county, Norway. The administrative centre of the municipality is"
            " the city of Tromsø. Outside of Norway, Tromso and Tromsö are"
            " alternative spellings of the city.Tromsø is considered the northernmost"
            " city in the world with a population above 50,000. The most populous town"
            " north of it is Alta, Norway, with a population of 14,272 (2013)."
        )
        self.assertEqual(summary, expected_first_only)

        # Dropping the first paragraph puts the longest one at the front; the
        # expected summary is then a prefix of it terminated by "…".
        summary = summarize_paragraphs(paragraphs[1:], min_size=200, max_size=500)
        expected_truncated = (
            "Tromsø lies in Northern Norway. The municipality has a population of"
            " (2015) 72,066, but with an annual influx of students it has over 75,000"
            " most of the year. It is the largest urban area in Northern Norway and the"
            " third largest north of the Arctic Circle (following Murmansk and Norilsk)."
            " Most of Tromsø, including the city centre, is located on the island of"
            " Tromsøya, 350 kilometres (217 mi) north of the Arctic Circle. In 2012,"
            " Tromsøya had a population of 36,088. Substantial parts of the urban…"
        )
        self.assertEqual(summary, expected_truncated)

    def test_short_summarize(self):
        """Three short paragraphs: the first two are joined by a blank line."""
        paragraphs = [
            "Tromsø (Norwegian pronunciation: [ˈtrʊmsœ] ( listen); Northern Sami:"
            " Romsa; Finnish: Tromssa[2] Kven: Tromssa) is a city and municipality in"
            " Troms county, Norway.",
            "Tromsø lies in Northern Norway. The municipality has a population of"
            " (2015) 72,066, but with an annual influx of students it has over 75,000"
            " most of the year.",
            "The city centre of Tromsø contains the highest number of old wooden"
            " houses in Northern Norway, the oldest house dating from 1789. The Arctic"
            " Cathedral, a modern church from 1965, is probably the most famous landmark"
            " in Tromsø.",
        ]

        summary = summarize_paragraphs(paragraphs, min_size=200, max_size=500)

        # The third paragraph is not part of the expected summary.
        expected_two_paragraphs = (
            "Tromsø (Norwegian pronunciation: [ˈtrʊmsœ] ( listen); Northern Sami:"
            " Romsa; Finnish: Tromssa[2] Kven: Tromssa) is a city and municipality in"
            " Troms county, Norway.\n"
            "\n"
            "Tromsø lies in Northern Norway. The municipality has a population of"
            " (2015) 72,066, but with an annual influx of students it has over 75,000"
            " most of the year."
        )
        self.assertEqual(summary, expected_two_paragraphs)

    def test_small_then_large_summarize(self):
        """A short paragraph followed by a long one: the long one is cut with "…"."""
        paragraphs = [
            "Tromsø (Norwegian pronunciation: [ˈtrʊmsœ] ( listen); Northern Sami:"
            " Romsa; Finnish: Tromssa[2] Kven: Tromssa) is a city and municipality in"
            " Troms county, Norway.",
            "Tromsø lies in Northern Norway. The municipality has a population of"
            " (2015) 72,066, but with an annual influx of students it has over 75,000"
            " most of the year."
            " The city centre of Tromsø contains the highest number of old wooden"
            " houses in Northern Norway, the oldest house dating from 1789. The Arctic"
            " Cathedral, a modern church from 1965, is probably the most famous landmark"
            " in Tromsø.",
        ]

        summary = summarize_paragraphs(paragraphs, min_size=200, max_size=500)

        # The first paragraph is expected in full; the second is expected to
        # stop part-way through its last sentence with an ellipsis character.
        expected_with_cut = (
            "Tromsø (Norwegian pronunciation: [ˈtrʊmsœ] ( listen); Northern Sami:"
            " Romsa; Finnish: Tromssa[2] Kven: Tromssa) is a city and municipality in"
            " Troms county, Norway.\n"
            "\n"
            "Tromsø lies in Northern Norway. The municipality has a population of"
            " (2015) 72,066, but with an annual influx of students it has over 75,000"
            " most of the year. The city centre of Tromsø contains the highest number"
            " of old wooden houses in Northern Norway, the oldest house dating from"
            " 1789. The Arctic Cathedral, a modern church from…"
        )
        self.assertEqual(summary, expected_with_cut)


class PreviewUrlTestCase(unittest.TestCase):
    """Exercise ``decode_and_calc_og`` against small inline HTML documents."""

    def _calc_og(self, markup):
        """Return ``decode_and_calc_og`` output for *markup* at the example URL."""
        return decode_and_calc_og(markup, "http://example.com/test.html")

    def test_simple(self):
        """A ``<title>`` plus plain body text give the title and description."""
        page = """
        <html>
        <head><title>Foo</title></head>
        <body>
        Some text.
        </body>
        </html>
        """

        result = self._calc_og(page)

        expected = {"og:title": "Foo", "og:description": "Some text."}
        self.assertEqual(result, expected)

    def test_comment(self):
        """An HTML comment ahead of the text is absent from the description."""
        page = """
        <html>
        <head><title>Foo</title></head>
        <body>
        <!-- HTML comment -->
        Some text.
        </body>
        </html>
        """

        result = self._calc_og(page)

        expected = {"og:title": "Foo", "og:description": "Some text."}
        self.assertEqual(result, expected)

    def test_comment2(self):
        """Text around a comment and a ``<p>`` is joined with blank lines."""
        page = """
        <html>
        <head><title>Foo</title></head>
        <body>
        Some text.
        <!-- HTML comment -->
        Some more text.
        <p>Text</p>
        More text
        </body>
        </html>
        """

        result = self._calc_og(page)

        expected = {
            "og:title": "Foo",
            "og:description": "Some text.\n\nSome more text.\n\nText\n\nMore text",
        }
        self.assertEqual(result, expected)

    def test_script(self):
        """The contents of a ``<script>`` tag are absent from the description."""
        page = """
        <html>
        <head><title>Foo</title></head>
        <body>
        <script> (function() {})() </script>
        Some text.
        </body>
        </html>
        """

        result = self._calc_og(page)

        expected = {"og:title": "Foo", "og:description": "Some text."}
        self.assertEqual(result, expected)

    def test_missing_title(self):
        """Without ``<title>`` or ``<h1>`` the title is expected to be ``None``."""
        page = """
        <html>
        <body>
        Some text.
        </body>
        </html>
        """

        result = self._calc_og(page)

        expected = {"og:title": None, "og:description": "Some text."}
        self.assertEqual(result, expected)

    def test_h1_as_title(self):
        """With no ``<title>``, the ``<h1>`` text is expected as the title."""
        # The description here comes from the og:description meta tag.
        page = """
        <html>
        <meta property="og:description" content="Some text."/>
        <body>
        <h1>Title</h1>
        </body>
        </html>
        """

        result = self._calc_og(page)

        expected = {"og:title": "Title", "og:description": "Some text."}
        self.assertEqual(result, expected)

    def test_missing_title_and_broken_h1(self):
        """An ``<h1>`` holding only an empty link gives a ``None`` title."""
        page = """
        <html>
        <body>
        <h1><a href="foo"/></h1>
        Some text.
        </body>
        </html>
        """

        result = self._calc_og(page)

        expected = {"og:title": None, "og:description": "Some text."}
        self.assertEqual(result, expected)

    def test_empty(self):
        """An empty document is expected to produce an empty dict."""
        result = self._calc_og("")
        self.assertEqual(result, {})