mirror of
https://github.com/yt-dlp/yt-dlp.git
synced 2025-03-09 12:50:23 -05:00
[parsing] search for case-insensitive tag names
This commit is contained in:
parent
6169b3eca8
commit
65f91148fc
2 changed files with 6 additions and 2 deletions
|
@@ -222,6 +222,10 @@ def test_get_element_text_and_html_by_tag_malformed(self):
|
||||||
get_element_text_and_html_by_tag('orphan', f'<orphan>{html}'), ('', '<orphan>'))
|
get_element_text_and_html_by_tag('orphan', f'<orphan>{html}'), ('', '<orphan>'))
|
||||||
self.assertIsNone(get_element_text_and_html_by_tag('orphan', f'{html}</orphan>'))
|
self.assertIsNone(get_element_text_and_html_by_tag('orphan', f'{html}</orphan>'))
|
||||||
|
|
||||||
|
# ignore case on tags
|
||||||
|
ci_html = f'<SpAn>{html}</sPaN>'
|
||||||
|
self.assertEqual(get_element_text_and_html_by_tag('span', ci_html), (html, ci_html))
|
||||||
|
|
||||||
def test_strict_html_parsing(self):
|
def test_strict_html_parsing(self):
|
||||||
class StrictTagParser(HTMLTagParser):
|
class StrictTagParser(HTMLTagParser):
|
||||||
STRICT = True
|
STRICT = True
|
||||||
|
|
|
@@ -245,7 +245,7 @@ def matching_tag_regex(tag, attribute, value_regex, escape=True):
|
||||||
value_regex = re.escape(value_regex)
|
value_regex = re.escape(value_regex)
|
||||||
|
|
||||||
return rf'''(?x)
|
return rf'''(?x)
|
||||||
<(?:{tag})
|
<(?i:{tag})
|
||||||
(?:\s(?:[^>"'\\]|"[^"\\]*"|'[^'\\]*')*)?
|
(?:\s(?:[^>"'\\]|"[^"\\]*"|'[^'\\]*')*)?
|
||||||
\s{re.escape(attribute)}\s*=\s*(?P<_q>['"])(?-x:{value_regex})(?P=_q)
|
\s{re.escape(attribute)}\s*=\s*(?P<_q>['"])(?-x:{value_regex})(?P=_q)
|
||||||
'''
|
'''
|
||||||
|
@@ -263,7 +263,7 @@ def tags_by_name(cls, tag, html):
|
||||||
def matchfunc(tag_str, _attrs):
|
def matchfunc(tag_str, _attrs):
|
||||||
return tag_str == tag
|
return tag_str == tag
|
||||||
|
|
||||||
tag_regex = rf'''<\s*{re.escape(tag)}(?:\s(?:[^>"'\\]|"[^"\\]*"|'[^'\\]*')*)?>'''
|
tag_regex = rf'''<\s*(?i:{re.escape(tag)})(?:\s(?:[^>"'\\]|"[^"\\]*"|'[^'\\]*')*)?>'''
|
||||||
yield from cls.iter_tags(tag_regex, html, matchfunc=matchfunc)
|
yield from cls.iter_tags(tag_regex, html, matchfunc=matchfunc)
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
|
|
Loading…
Reference in a new issue