From dbf350c12291279c0be56cb82922c2fae1c87eb2 Mon Sep 17 00:00:00 2001 From: Marcel Date: Sun, 27 Nov 2022 16:34:06 +0100 Subject: [PATCH] [parsing] return unclosed matched tags --- test/test_parsing.py | 9 +++++---- yt_dlp/parsing.py | 4 ++-- 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/test/test_parsing.py b/test/test_parsing.py index 880c41a34..588711518 100644 --- a/test/test_parsing.py +++ b/test/test_parsing.py @@ -218,8 +218,9 @@ def test_get_element_text_and_html_by_tag_malformed(self): get_element_text_and_html_by_tag('malnested_b', html), (f'{inner_text}', f'{inner_text}')) + self.assertEqual( + get_element_text_and_html_by_tag('orphan', f'{html}'), ('', '')) self.assertIsNone(get_element_text_and_html_by_tag('orphan', f'{html}')) - self.assertIsNone(get_element_text_and_html_by_tag('orphan', f'{html}')) def test_strict_html_parsing(self): class StrictTagParser(HTMLTagParser): @@ -244,13 +245,13 @@ def test_relaxed_html_parsing(self): parser = HTMLTagParser() self.assertEqual(parser.taglist('

', reset=True), []) - self.assertEqual(parser.taglist('

', reset=True), []) + self.assertEqual(parser.taglist('

', reset=True), [Tag('div'), Tag('p')]) tags = parser.taglist('

', reset=True) - self.assertEqual(tags, [Tag('p'), Tag('div')]) + self.assertEqual(tags, [Tag('div'), Tag('p')]) tags = parser.taglist('

/p>

', reset=True) - self.assertEqual(tags, [Tag('div')]) + self.assertEqual(tags, [Tag('div'), Tag('p')]) tags = parser.taglist('

paragraph

', reset=True) self.assertEqual(tags, [Tag('div'), Tag('p')]) diff --git a/yt_dlp/parsing.py b/yt_dlp/parsing.py index 8fbb4db14..5ecd6b75c 100644 --- a/yt_dlp/parsing.py +++ b/yt_dlp/parsing.py @@ -190,7 +190,7 @@ def handle_starttag(self, tag, attrs): tag_obj = self.Tag(tag, string=self.rawdata, attrs=attrs) tag_obj.openrange(self._offset, len(tag_text)) if tag_is_open: - nesting = [] + nesting = [tag_obj] self._nestedtags[-1].append(nesting) self._nestedtags.append(nesting) else: @@ -218,7 +218,7 @@ def handle_endtag(self, tag): if isinstance(tag_obj, self.Tag): close_idx = self.rawdata.find('>', self._offset) + 1 tag_obj.closerange(self._offset, close_idx - self._offset) - self._nestedtags.pop().insert(0, tag_obj) + self._nestedtags.pop() self.callback(tag_obj) except ValueError as exc: if isinstance(exc, compat_HTMLParseError):