mirror of
https://github.com/yt-dlp/yt-dlp.git
synced 2025-03-09 12:50:23 -05:00
[parsing] return unclosed matched tags
This commit is contained in:
parent
8451074b50
commit
dbf350c122
2 changed files with 7 additions and 6 deletions
|
@ -218,8 +218,9 @@ def test_get_element_text_and_html_by_tag_malformed(self):
|
||||||
get_element_text_and_html_by_tag('malnested_b', html),
|
get_element_text_and_html_by_tag('malnested_b', html),
|
||||||
(f'{inner_text}</malnested_a>',
|
(f'{inner_text}</malnested_a>',
|
||||||
f'<malnested_b>{inner_text}</malnested_a></malnested_b>'))
|
f'<malnested_b>{inner_text}</malnested_a></malnested_b>'))
|
||||||
|
self.assertEqual(
|
||||||
|
get_element_text_and_html_by_tag('orphan', f'<orphan>{html}'), ('', '<orphan>'))
|
||||||
self.assertIsNone(get_element_text_and_html_by_tag('orphan', f'{html}</orphan>'))
|
self.assertIsNone(get_element_text_and_html_by_tag('orphan', f'{html}</orphan>'))
|
||||||
self.assertIsNone(get_element_text_and_html_by_tag('orphan', f'<orphan>{html}'))
|
|
||||||
|
|
||||||
def test_strict_html_parsing(self):
|
def test_strict_html_parsing(self):
|
||||||
class StrictTagParser(HTMLTagParser):
|
class StrictTagParser(HTMLTagParser):
|
||||||
|
@ -244,13 +245,13 @@ def test_relaxed_html_parsing(self):
|
||||||
parser = HTMLTagParser()
|
parser = HTMLTagParser()
|
||||||
|
|
||||||
self.assertEqual(parser.taglist('</p>', reset=True), [])
|
self.assertEqual(parser.taglist('</p>', reset=True), [])
|
||||||
self.assertEqual(parser.taglist('<div><p>', reset=True), [])
|
self.assertEqual(parser.taglist('<div><p>', reset=True), [Tag('div'), Tag('p')])
|
||||||
|
|
||||||
tags = parser.taglist('<div><p></div></p>', reset=True)
|
tags = parser.taglist('<div><p></div></p>', reset=True)
|
||||||
self.assertEqual(tags, [Tag('p'), Tag('div')])
|
self.assertEqual(tags, [Tag('div'), Tag('p')])
|
||||||
|
|
||||||
tags = parser.taglist('<div><p>/p></div>', reset=True)
|
tags = parser.taglist('<div><p>/p></div>', reset=True)
|
||||||
self.assertEqual(tags, [Tag('div')])
|
self.assertEqual(tags, [Tag('div'), Tag('p')])
|
||||||
|
|
||||||
tags = parser.taglist('<div><p>paragraph</p<ignored></div>', reset=True)
|
tags = parser.taglist('<div><p>paragraph</p<ignored></div>', reset=True)
|
||||||
self.assertEqual(tags, [Tag('div'), Tag('p')])
|
self.assertEqual(tags, [Tag('div'), Tag('p')])
|
||||||
|
|
|
@ -190,7 +190,7 @@ def handle_starttag(self, tag, attrs):
|
||||||
tag_obj = self.Tag(tag, string=self.rawdata, attrs=attrs)
|
tag_obj = self.Tag(tag, string=self.rawdata, attrs=attrs)
|
||||||
tag_obj.openrange(self._offset, len(tag_text))
|
tag_obj.openrange(self._offset, len(tag_text))
|
||||||
if tag_is_open:
|
if tag_is_open:
|
||||||
nesting = []
|
nesting = [tag_obj]
|
||||||
self._nestedtags[-1].append(nesting)
|
self._nestedtags[-1].append(nesting)
|
||||||
self._nestedtags.append(nesting)
|
self._nestedtags.append(nesting)
|
||||||
else:
|
else:
|
||||||
|
@ -218,7 +218,7 @@ def handle_endtag(self, tag):
|
||||||
if isinstance(tag_obj, self.Tag):
|
if isinstance(tag_obj, self.Tag):
|
||||||
close_idx = self.rawdata.find('>', self._offset) + 1
|
close_idx = self.rawdata.find('>', self._offset) + 1
|
||||||
tag_obj.closerange(self._offset, close_idx - self._offset)
|
tag_obj.closerange(self._offset, close_idx - self._offset)
|
||||||
self._nestedtags.pop().insert(0, tag_obj)
|
self._nestedtags.pop()
|
||||||
self.callback(tag_obj)
|
self.callback(tag_obj)
|
||||||
except ValueError as exc:
|
except ValueError as exc:
|
||||||
if isinstance(exc, compat_HTMLParseError):
|
if isinstance(exc, compat_HTMLParseError):
|
||||||
|
|
Loading…
Reference in a new issue