1
0
Fork 0
mirror of https://github.com/yt-dlp/yt-dlp.git synced 2025-03-09 12:50:23 -05:00

[parsing] return unclosed matched tags

This commit is contained in:
Marcel 2022-11-27 16:34:06 +01:00
parent 8451074b50
commit dbf350c122
No known key found for this signature in database
GPG key ID: 7813C97693AD6AAE
2 changed files with 7 additions and 6 deletions

View file

@@ -218,8 +218,9 @@ def test_get_element_text_and_html_by_tag_malformed(self):
get_element_text_and_html_by_tag('malnested_b', html), get_element_text_and_html_by_tag('malnested_b', html),
(f'{inner_text}</malnested_a>', (f'{inner_text}</malnested_a>',
f'<malnested_b>{inner_text}</malnested_a></malnested_b>')) f'<malnested_b>{inner_text}</malnested_a></malnested_b>'))
self.assertEqual(
get_element_text_and_html_by_tag('orphan', f'<orphan>{html}'), ('', '<orphan>'))
self.assertIsNone(get_element_text_and_html_by_tag('orphan', f'{html}</orphan>')) self.assertIsNone(get_element_text_and_html_by_tag('orphan', f'{html}</orphan>'))
self.assertIsNone(get_element_text_and_html_by_tag('orphan', f'<orphan>{html}'))
def test_strict_html_parsing(self): def test_strict_html_parsing(self):
class StrictTagParser(HTMLTagParser): class StrictTagParser(HTMLTagParser):
@@ -244,13 +245,13 @@ def test_relaxed_html_parsing(self):
parser = HTMLTagParser() parser = HTMLTagParser()
self.assertEqual(parser.taglist('</p>', reset=True), []) self.assertEqual(parser.taglist('</p>', reset=True), [])
self.assertEqual(parser.taglist('<div><p>', reset=True), []) self.assertEqual(parser.taglist('<div><p>', reset=True), [Tag('div'), Tag('p')])
tags = parser.taglist('<div><p></div></p>', reset=True) tags = parser.taglist('<div><p></div></p>', reset=True)
self.assertEqual(tags, [Tag('p'), Tag('div')]) self.assertEqual(tags, [Tag('div'), Tag('p')])
tags = parser.taglist('<div><p>/p></div>', reset=True) tags = parser.taglist('<div><p>/p></div>', reset=True)
self.assertEqual(tags, [Tag('div')]) self.assertEqual(tags, [Tag('div'), Tag('p')])
tags = parser.taglist('<div><p>paragraph</p<ignored></div>', reset=True) tags = parser.taglist('<div><p>paragraph</p<ignored></div>', reset=True)
self.assertEqual(tags, [Tag('div'), Tag('p')]) self.assertEqual(tags, [Tag('div'), Tag('p')])

View file

@@ -190,7 +190,7 @@ def handle_starttag(self, tag, attrs):
tag_obj = self.Tag(tag, string=self.rawdata, attrs=attrs) tag_obj = self.Tag(tag, string=self.rawdata, attrs=attrs)
tag_obj.openrange(self._offset, len(tag_text)) tag_obj.openrange(self._offset, len(tag_text))
if tag_is_open: if tag_is_open:
nesting = [] nesting = [tag_obj]
self._nestedtags[-1].append(nesting) self._nestedtags[-1].append(nesting)
self._nestedtags.append(nesting) self._nestedtags.append(nesting)
else: else:
@@ -218,7 +218,7 @@ def handle_endtag(self, tag):
if isinstance(tag_obj, self.Tag): if isinstance(tag_obj, self.Tag):
close_idx = self.rawdata.find('>', self._offset) + 1 close_idx = self.rawdata.find('>', self._offset) + 1
tag_obj.closerange(self._offset, close_idx - self._offset) tag_obj.closerange(self._offset, close_idx - self._offset)
self._nestedtags.pop().insert(0, tag_obj) self._nestedtags.pop()
self.callback(tag_obj) self.callback(tag_obj)
except ValueError as exc: except ValueError as exc:
if isinstance(exc, compat_HTMLParseError): if isinstance(exc, compat_HTMLParseError):