', reset=True)
+ with self.assertRaisesRegex(compat_HTMLParseError, "malnested closing tag 'div', expected after '
', reset=True)
+ with self.assertRaisesRegex(compat_HTMLParseError, "malnested closing tag 'div', expected after ''"):
+ parser.taglist('
', reset=True)
+ with self.assertRaisesRegex(compat_HTMLParseError, "malformed closing tag 'p<<'"):
+ parser.taglist('
', reset=True)
+ with self.assertRaisesRegex(compat_HTMLParseError, "stray closing tag 'img'"):
+ parser.taglist('
![]()
must be empty', reset=True)
+
+ def test_relaxed_html_parsing(self):
+ Tag = HTMLTagParser.Tag
+ parser = HTMLTagParser()
+
+ self.assertEqual(parser.taglist('', reset=True), [])
+
+ tags = parser.taglist('
', reset=True)
+ self.assertEqual(tags, [Tag('div'), Tag('p')])
+ self.assertEqual(tags[0].text_and_html(), ('', '
'))
+ self.assertEqual(tags[1].text_and_html(), ('', '
'))
+
+ tags = parser.taglist('
', reset=True)
+ self.assertEqual(tags, [Tag('div'), Tag('p')])
+ self.assertEqual(tags[0].text_and_html(), ('
', '
'))
+ self.assertEqual(tags[1].text_and_html(), ('
', '
'))
+
+ tags = parser.taglist('
', reset=True)
+ self.assertEqual(tags, [Tag('div'), Tag('p')])
+ self.assertEqual(tags[0].text_and_html(), ('
/p>', '
'))
+ self.assertEqual(tags[1].text_and_html(), ('', '
'))
+
+ tags = parser.taglist('
', reset=True)
+ self.assertEqual(tags, [Tag('div'), Tag('p')])
+ self.assertEqual(tags[0].text_and_html(),
+ ('
paragraph
', '
'))
+ self.assertEqual(tags[1].text_and_html(), ('paragraph', '
paragraph
'))
+
+ tags = parser.taglist('
![]()
must be empty', reset=True)
+ self.assertEqual(tags, [Tag('img')])
+ self.assertEqual(tags[0].text_and_html(), ('', '
![]()
'))
+
+ def test_compliant_html_parsing(self):
+ # certain elements don't need to be closed (see HTMLTagParser.VOID_TAGS)
+ Tag = HTMLTagParser.Tag
+ html = '''
+ no error without closing tag:
![]()
+ self closing is ok:
![]()
+ '''
+ parser = HTMLTagParser()
+ tags = parser.taglist(html, reset=True)
+ self.assertEqual(tags, [Tag('img'), Tag('img')])
+
+ # don't get fooled by '>' in attributes
+ html = '''
![]()
'''
+ tags = parser.taglist(html, reset=True)
+ self.assertEqual(tags[0].text_and_html(), ('', html))
+
+ def test_tag_return_order(self):
+ Tag = HTMLTagParser.Tag
+ html = '''
+
+
+
+
+
+
+
+
+
+
+
+
+
+ '''
+ parser = HTMLTagParser()
+ tags = parser.taglist(html, reset=True)
+ self.assertEqual(
+ str(tags), str([Tag('t0'), Tag('t1'), Tag('t2'), Tag('t3'), Tag('t4'),
+ Tag('t5'), Tag('t6'), Tag('t7'), Tag('t8')]))
+
+ tags = parser.taglist(html, reset=True, depth_first=True)
+ self.assertEqual(
+ str(tags), str([Tag('t3'), Tag('t4'), Tag('t2'), Tag('t1'), Tag('t6'),
+ Tag('t5'), Tag('t0'), Tag('t8'), Tag('t7')]))
+
+ # return tags in nested order
+ tags = parser.taglist(html, reset=True, depth_first=None)
+ self.assertEqual(
+ str(tags), str([
+ [Tag('t0'),
+ [Tag('t1'),
+ [Tag('t2'), [Tag('t3')], [Tag('t4')]]],
+ [Tag('t5'), [Tag('t6')]]],
+ [Tag('t7'), [Tag('t8')]]]))
+
+ def test_html_ignored_ranges(self):
+ def mark_comments(_string, char='^', nochar='-'):
+ cmts = HTMLIgnoreRanges(_string)
+ return "".join(char if _idx in cmts else nochar for _idx in range(len(_string)))
+
+ html_string = '''
+ no comments in this line
+ ---------------------------------------------------------------------
+
+ ----^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^---
+ before after
+ -----------^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^-----------
+ this is a leftover comment --> and end
+ ----------------^^^^^^^^^^^----------------^^^^^^^^^^^^^^^^^---------
+
+ --------^^^^^^^^^^^^^-----------------------------^^^^^^^^^^---------
+ '''
+
+ lines = textwrap.dedent(html_string).strip().splitlines()
+ for line, marker in zip(lines[0::2], lines[1::2]):
+ self.assertEqual((line, mark_comments(line)), (line, marker))
+
+ # yet we must be able to match script elements
+ test_string = ''''''
+ items = get_element_text_and_html_by_tag('script', test_string)
+ self.assertEqual(items, ("var foo = 'bar';", test_string))
diff --git a/yt_dlp/parsing.py b/yt_dlp/parsing.py
new file mode 100644
index 000000000..72d7e448b
--- /dev/null
+++ b/yt_dlp/parsing.py
@@ -0,0 +1,348 @@
+import collections
+import contextlib
+import itertools
+import re
+from html.parser import HTMLParser
+
+from .compat import compat_HTMLParseError
+from .utils import orderedSet
+
+
+class HTMLIgnoreRanges:
+ """check if an offset is within CDATA content elements (script, style) or XML comments
+
+ note:
+ * given offsets must be in increasing order
+ * no detection of nested constructs (e.g. comments within script tags)
+
+ usage:
+ ranges = HTMLIgnoreRanges(html)
+ if offset in ranges:
+ ...
+ """
+ REGEX = re.compile(r'', '--!>'):
+ return offset < self._last_match.start()
+ return offset >= self._last_match.end()
+
+
+class HTMLTagParser(HTMLParser):
+ """HTML parser which returns found elements as instances of 'Tag'
+ when STRICT=True can raise compat_HTMLParseError() on malformed HTML elements
+
+ usage:
+ parser = HTMLTagParser()
+ for tag_obj in parser.taglist(html):
+ tag_obj.text_and_html()
+
+ """
+
+ STRICT = False
+ ANY_TAG_REGEX = re.compile(r'''<(?:"[^"]*"|'[^']*'|[^"'>])*?>''')
+ VOID_TAGS = {
+ 'area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input',
+ 'keygen', 'link', 'meta', 'param', 'source', 'track', 'wbr',
+ }
+
+ class Tag:
+ __slots__ = 'name', 'string', 'attrs', '_openrange', '_closerange'
+
+ def __init__(self, name, *, string='', attrs=()):
+ self.name = name
+ self.string = string
+ self.attrs = tuple(attrs)
+ self._openrange = None
+ self._closerange = None
+
+ def __str__(self):
+ return self.name
+
+ def __repr__(self):
+ return f'{self.__class__.__name__}({str(self)!r})'
+
+ def __eq__(self, other):
+ return self.name == other
+
+ def openrange(self, offset, startlen=0):
+ if isinstance(offset, slice):
+ self._openrange = offset
+ else:
+ self._openrange = slice(offset, offset + startlen)
+
+ def closerange(self, offset, stoplen=0):
+ if isinstance(offset, slice):
+ self._closerange = offset
+ else:
+ self._closerange = slice(offset, offset + stoplen)
+
+ def opentag(self):
+ return self.string[self._openrange] if self._openrange else ''
+
+ def html(self):
+ if not self._openrange:
+ return ''
+ if self._closerange:
+ return self.string[self._openrange.start:self._closerange.stop]
+ return self.string[self._openrange]
+
+ def text(self):
+ if self._openrange and self._closerange:
+ return self.string[self._openrange.stop:self._closerange.start]
+ return ''
+
+ def text_and_html(self):
+ return self.text(), self.html()
+
+ class AbortException(Exception):
+ pass
+
+ def __init__(self):
+ self.tagstack = collections.deque()
+ self._nestedtags = [[]]
+ super().__init__()
+ self._offset = self.offset
+
+ def predicate(self, tag, attrs):
+ """ return True for every encountered opening tag that should be processed """
+ return True
+
+ def callback(self, tag_obj):
+ """ this will be called when the requested tag is closed """
+
+ def reset(self):
+ super().reset()
+ self.tagstack.clear()
+
+ def taglist(self, data, reset=True, depth_first=False):
+ """ parse data and return found tag objects
+ @param data: html string
+ @param reset: reset state
+ @param depth_first: return order: as opened (False), as closed (True), nested (None)
+ @return: list of Tag objects
+ """
+ def flatten(_list, first=True):
+ rlist = _list if first or not depth_first else itertools.chain(_list[1:], _list[:1])
+ for item in rlist:
+ if isinstance(item, list):
+ yield from flatten(item, first=False)
+ else:
+ yield item
+
+ if reset:
+ self.reset()
+ with contextlib.suppress(HTMLTagParser.AbortException):
+ self.feed(data)
+ if self.STRICT and self.tagstack:
+ orphans = ', '.join(map(repr, map(str, orderedSet(self.tagstack, lazy=True))))
+ raise compat_HTMLParseError(f'unclosed tag {orphans}')
+ taglist = self._nestedtags[0] if depth_first is None else list(flatten(self._nestedtags[0]))
+ self._nestedtags = [[]]
+ return taglist
+
+ def updatepos(self, i, j):
+ offset = self._offset = super().updatepos(i, j)
+ return offset
+
+ def handle_starttag(self, tag, attrs):
+ try:
+ # we use internal variable for performance reasons
+ tag_text = getattr(self, '_HTMLParser__starttag_text')
+ except AttributeError:
+ tag_text = HTMLTagParser.ANY_TAG_REGEX.match(self.rawdata[self._offset:]).group()
+
+ tag_obj = tag
+ tag_is_open = not (tag_text.endswith('/>') or tag in self.VOID_TAGS)
+ if self.predicate(tag, attrs):
+ tag_obj = self.Tag(tag, string=self.rawdata, attrs=attrs)
+ tag_obj.openrange(self._offset, len(tag_text))
+ nesting = [tag_obj]
+ self._nestedtags[-1].append(nesting)
+ if tag_is_open:
+ self._nestedtags.append(nesting)
+ else:
+ self.callback(tag_obj)
+ if tag_is_open:
+ self.tagstack.appendleft(tag_obj)
+
+ handle_startendtag = handle_starttag
+
+ def handle_endtag(self, tag):
+ if '<' in tag:
+ if self.STRICT:
+ raise compat_HTMLParseError(f'malformed closing tag {tag!r}')
+ tag = tag[:tag.index('<')]
+
+ try:
+ idx = self.tagstack.index(tag)
+ if self.STRICT and idx:
+ open_tags = ''.join(f'{tag}>' for tag in itertools.islice(self.tagstack, idx))
+ raise compat_HTMLParseError(
+ f'malnested closing tag {tag!r}, expected after {open_tags!r}')
+ tag_obj = self.tagstack[idx]
+ self.tagstack.remove(tag)
+ if isinstance(tag_obj, self.Tag):
+ tag_obj.closerange(slice(self._offset, self.rawdata.find('>', self._offset) + 1))
+ self._nestedtags.pop()
+ self.callback(tag_obj)
+ except ValueError as exc:
+ if isinstance(exc, compat_HTMLParseError):
+ raise
+ if self.STRICT:
+ raise compat_HTMLParseError(f'stray closing tag {tag!r}') from exc
+
+
+class MatchingElementParser(HTMLTagParser):
+ """ optimized version of HTMLTagParser
+ """
+ def __init__(self, matchfunc):
+ super().__init__()
+ self.matchfunc = matchfunc
+ self.found_none = True
+
+ def reset(self):
+ super().reset()
+ self.found_none = True
+
+ def callback(self, tag_obj):
+ raise self.AbortException()
+
+ def predicate(self, tag, attrs):
+ if self.found_none and self.matchfunc(tag, attrs):
+ self.found_none = False
+ return True
+ return False
+
+ @staticmethod
+ def class_value_regex(class_name):
+ return rf'[\w\s\-]*(?"'\\]|"[^"\\]*"|'[^'\\]*')*)?
+ \s{re.escape(attribute)}\s*=\s*(?P<_q>['"])(?-x:{value_regex})(?P=_q)
+ '''
+
+ @classmethod
+ def iter_tags(cls, regex, html, *, matchfunc):
+ ignored = HTMLIgnoreRanges(html)
+ parser = cls(matchfunc)
+ for match in re.finditer(regex, html):
+ if match.start() not in ignored:
+ yield from parser.taglist(html[match.start():], reset=True)
+
+ @classmethod
+ def tags_by_name(cls, tag, html):
+ def matchfunc(tag_str, _attrs):
+ return tag_str == tag
+
+ tag_regex = rf'''<\s*(?i:{re.escape(tag)})(?:\s(?:[^>"'\\]|"[^"\\]*"|'[^'\\]*')*)?>'''
+ yield from cls.iter_tags(tag_regex, html, matchfunc=matchfunc)
+
+ @classmethod
+ def tags_by_attribute(cls, attribute, value, html, *, tag=r'[\w:.-]+', escape_value=True):
+ def matchfunc(_tag_str, attrs):
+ return any(attr == attribute and re.fullmatch(value, value_str)
+ for attr, value_str in attrs)
+
+ tag_regex = cls.matching_tag_regex(tag, attribute, value, escape_value)
+ yield from cls.iter_tags(tag_regex, html, matchfunc=matchfunc)
+
+ @classmethod
+ def extract_attributes(cls, html):
+ attr_dict = {}
+
+ def matchfunc(_tag, attrs):
+ attr_dict.update(attrs)
+ raise cls.AbortException()
+
+ with contextlib.suppress(cls.AbortException):
+ cls(matchfunc).feed(html)
+
+ return attr_dict
+
+ @classmethod
+ def get_elements_text_and_html_by_tag(cls, tag, html):
+ return [tag.text_and_html() for tag in cls.tags_by_name(tag, html)]
+
+ @classmethod
+ def get_element_text_and_html_by_tag(cls, tag, html):
+ tag = next(cls.tags_by_name(tag, html), None)
+ return tag and tag.text_and_html()
+
+ @classmethod
+ def get_elements_text_and_html_by_attribute(cls, *args, **kwargs):
+ return [tag.text_and_html() for tag in cls.tags_by_attribute(*args, **kwargs)]
+
+ @classmethod
+ def get_elements_by_attribute(cls, *args, **kwargs):
+ return [tag.text() for tag in cls.tags_by_attribute(*args, **kwargs)]
+
+ @classmethod
+ def get_elements_html_by_attribute(cls, *args, **kwargs):
+ return [tag.html() for tag in cls.tags_by_attribute(*args, **kwargs)]
+
+ @classmethod
+ def get_element_by_attribute(cls, *args, **kwargs):
+ tag = next(cls.tags_by_attribute(*args, **kwargs), None)
+ return tag and tag.text()
+
+ @classmethod
+ def get_element_html_by_attribute(cls, *args, **kwargs):
+ tag = next(cls.tags_by_attribute(*args, **kwargs), None)
+ return tag and tag.html()
+
+ @classmethod
+ def get_elements_by_class(cls, class_name, html):
+ value = cls.class_value_regex(class_name)
+ return [tag.text() for tag
+ in cls.tags_by_attribute('class', value, html, escape_value=False)]
+
+ @classmethod
+ def get_elements_html_by_class(cls, class_name, html):
+ value = cls.class_value_regex(class_name)
+ return [tag.html() for tag
+ in cls.tags_by_attribute('class', value, html, escape_value=False)]
+
+ @classmethod
+ def get_elements_text_and_html_by_class(cls, class_name, html):
+ value = cls.class_value_regex(class_name)
+ return [tag.text_and_html() for tag
+ in cls.tags_by_attribute('class', value, html, escape_value=False)]
+
+ @classmethod
+ def get_element_html_by_class(cls, class_name, html):
+ value = cls.class_value_regex(class_name)
+ tag = next(cls.tags_by_attribute('class', value, html, escape_value=False), None)
+ return tag and tag.html()
+
+ @classmethod
+ def get_element_by_class(cls, class_name, html):
+ value = cls.class_value_regex(class_name)
+ tag = next(cls.tags_by_attribute('class', value, html, escape_value=False), None)
+ return tag and tag.text()
diff --git a/yt_dlp/utils/_utils.py b/yt_dlp/utils/_utils.py
index 4093c238c..c2c5b5f81 100644
--- a/yt_dlp/utils/_utils.py
+++ b/yt_dlp/utils/_utils.py
@@ -410,17 +410,13 @@ def close(self):
pass
def handle_starttag(self, tag, _):
- self.tagstack.append(tag)
+ self.tagstack.appendleft(tag)
def handle_endtag(self, tag):
if not self.tagstack:
raise compat_HTMLParseError('no tags in the stack')
- while self.tagstack:
- inner_tag = self.tagstack.pop()
- if inner_tag == tag:
- break
- else:
- raise compat_HTMLParseError(f'matching opening tag for closing {tag} tag not found')
+ with contextlib.suppress(ValueError):
+ self.tagstack.remove(tag)
if not self.tagstack:
raise self.HTMLBreakOnClosingTagException
@@ -454,6 +450,8 @@ def find_or_raise(haystack, needle, exc):
next_closing_tag_end = next_closing_tag_start + len(closing_tag)
try:
parser.feed(html[offset:offset + next_closing_tag_end])
+ if tag not in parser.tagstack:
+ raise HTMLBreakOnClosingTagParser.HTMLBreakOnClosingTagException()
offset += next_closing_tag_end
except HTMLBreakOnClosingTagParser.HTMLBreakOnClosingTagException:
return html[content_start:offset + next_closing_tag_start], \