diff --git a/synapse/rest/media/v1/preview_html.py b/synapse/rest/media/v1/preview_html.py
index c826a13093..afe4e29758 100644
--- a/synapse/rest/media/v1/preview_html.py
+++ b/synapse/rest/media/v1/preview_html.py
@@ -12,7 +12,6 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import codecs
-import itertools
import logging
import re
from typing import (
@@ -21,7 +20,7 @@ from typing import (
Dict,
Generator,
Iterable,
- Optional,
+ List, Optional,
Set,
Union,
)
@@ -354,7 +353,7 @@ def parse_html_description(tree: "etree.Element") -> Optional[str]:
from lxml import etree
- TAGS_TO_REMOVE = (
+ TAGS_TO_REMOVE = {
"header",
"nav",
"aside",
@@ -369,31 +368,42 @@ def parse_html_description(tree: "etree.Element") -> Optional[str]:
"img",
"picture",
etree.Comment,
- )
+ }
# Split all the text nodes into paragraphs (by splitting on new
# lines)
text_nodes = (
re.sub(r"\s+", "\n", el).strip()
- for el in _iterate_over_text(tree.find("body"), *TAGS_TO_REMOVE)
+ for el in _iterate_over_text(tree.find("body"), TAGS_TO_REMOVE)
)
return summarize_paragraphs(text_nodes)
def _iterate_over_text(
- tree: "etree.Element", *tags_to_ignore: Union[str, "etree.Comment"]
+ tree: Optional["etree.Element"],
+ tags_to_ignore: Set[Union[str, "etree.Comment"]],
+ stack_limit: int = 1024,
) -> Generator[str, None, None]:
"""Iterate over the tree returning text nodes in a depth first fashion,
skipping text nodes inside certain tags.
+
+ Args:
+ tree: The parent element to iterate. Can be None if there isn't one.
+ tags_to_ignore: Set of tags (names or etree.Comment) whose inner text is skipped.
+ stack_limit: Soft cap on the size of the depth-first traversal stack.
+ Once the stack holds more than this many items, the remaining children
+ of the current element are dropped, which may truncate the textual result.
+ Intended to bound the working memory used when generating a preview.
"""
- # This is basically a stack that we extend using itertools.chain.
- # This will either consist of an element to iterate over *or* a string
+
+ if tree is None:
+ return
+
+ # This is a stack whose items are elements to iterate over *or* strings
# to be returned.
- elements = iter([tree])
- while True:
- el = next(elements, None)
- if el is None:
- return
+ elements: List[Union[str, "etree.Element"]] = [tree]
+ while elements:
+ el = elements.pop()
if isinstance(el, str):
yield el
@@ -407,17 +417,22 @@ def _iterate_over_text(
if el.text:
yield el.text
- # We add to the stack all the elements children, interspersed with
- # each child's tail text (if it exists). The tail text of a node
- # is text that comes *after* the node, so we always include it even
- # if we ignore the child node.
- elements = itertools.chain(
- itertools.chain.from_iterable( # Basically a flatmap
- [child, child.tail] if child.tail else [child]
- for child in el.iterchildren()
- ),
- elements,
- )
+ # We add to the stack all the element's children, interspersed with
+ # each child's tail text (if it exists).
+ #
+ # We iterate in reverse order so that earlier pieces of text end up
+ # closer to the top of the stack and are therefore popped first.
+ for child in el.iterchildren(reversed=True):
+ if len(elements) > stack_limit:
+ # We've hit our limit for working memory
+ break
+
+ if child.tail:
+ # The tail text of a node is text that comes *after* the node,
+ # so we always include it even if we ignore the child node.
+ elements.append(child.tail)
+
+ elements.append(child)
def summarize_paragraphs(
|