From d9195b5133946d53f13434849433c4760243b040 Mon Sep 17 00:00:00 2001 From: Allen <64094914+allendema@users.noreply.github.com> Date: Wed, 4 Dec 2024 23:10:45 +0100 Subject: [PATCH 1/3] [core] utils: fix get_element_by_* --- test/test_utils.py | 20 ++++++++++++++++++++ yt_dlp/utils/_utils.py | 2 +- 2 files changed, 21 insertions(+), 1 deletion(-) diff --git a/test/test_utils.py b/test/test_utils.py index b3de14198..2e26d224a 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -1769,6 +1769,10 @@ def test_get_element_html_by_class(self): ''' + GET_ELEMENT_BY_ATTRIBUTE_TEST_STRING_UPPERCASE = ''' + + ''' + def test_get_element_by_attribute(self): html = self.GET_ELEMENT_BY_CLASS_TEST_STRING @@ -1780,6 +1784,10 @@ def test_get_element_by_attribute(self): self.assertEqual(get_element_by_attribute('itemprop', 'author', html), 'foo') + html = self.GET_ELEMENT_BY_ATTRIBUTE_TEST_STRING_UPPERCASE + + self.assertEqual(get_element_by_attribute('itemprop', 'author', html), 'foo') + def test_get_element_html_by_attribute(self): html = self.GET_ELEMENT_BY_CLASS_TEST_STRING @@ -1851,6 +1859,11 @@ def test_get_elements_text_and_html_by_attribute(self): GET_ELEMENT_BY_TAG_RES_INNERSPAN_HTML = GET_ELEMENT_BY_TAG_TEST_STRING.strip()[78:119] GET_ELEMENT_BY_TAG_RES_INNERSPAN_TEXT = GET_ELEMENT_BY_TAG_RES_INNERSPAN_HTML[6:-7] + + GET_ELEMENT_BY_TAG_TEST_STRING_UPPERCASE = ''' + nice + ''' + def test_get_element_text_and_html_by_tag(self): html = self.GET_ELEMENT_BY_TAG_TEST_STRING @@ -1860,8 +1873,15 @@ def test_get_element_text_and_html_by_tag(self): self.assertEqual( get_element_text_and_html_by_tag('span', html), (self.GET_ELEMENT_BY_TAG_RES_INNERSPAN_TEXT, self.GET_ELEMENT_BY_TAG_RES_INNERSPAN_HTML)) + self.assertRaises(compat_HTMLParseError, get_element_text_and_html_by_tag, 'article', html) + html = self.GET_ELEMENT_BY_TAG_TEST_STRING_UPPERCASE + + self.assertEqual( + get_element_text_and_html_by_tag('SPAN', html), + ('nice', html.strip()), html) + def 
test_iri_to_uri(self): self.assertEqual( iri_to_uri('https://www.google.com/search?q=foo&ie=utf-8&oe=utf-8&client=firefox-b'), diff --git a/yt_dlp/utils/_utils.py b/yt_dlp/utils/_utils.py index 699bf1e7f..9aae60584 100644 --- a/yt_dlp/utils/_utils.py +++ b/yt_dlp/utils/_utils.py @@ -442,7 +442,7 @@ def find_or_raise(haystack, needle, exc): content_start += whole_start + 1 with HTMLBreakOnClosingTagParser() as parser: parser.feed(html[whole_start:content_start]) - if not parser.tagstack or parser.tagstack[0] != tag: + if not parser.tagstack or parser.tagstack[0] != tag.lower(): raise compat_HTMLParseError(f'parser did not match opening {tag} tag') offset = content_start while offset < len(html): From ec3a0927c4c9c10c062e0953dd8fd1a6df2c0263 Mon Sep 17 00:00:00 2001 From: Allen <64094914+allendema@users.noreply.github.com> Date: Wed, 4 Dec 2024 23:30:05 +0100 Subject: [PATCH 2/3] fixup! [core] utils: fix get_element_by_* --- test/test_utils.py | 1 - 1 file changed, 1 deletion(-) diff --git a/test/test_utils.py b/test/test_utils.py index 2e26d224a..e8ef15e01 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -1859,7 +1859,6 @@ def test_get_elements_text_and_html_by_attribute(self): GET_ELEMENT_BY_TAG_RES_INNERSPAN_HTML = GET_ELEMENT_BY_TAG_TEST_STRING.strip()[78:119] GET_ELEMENT_BY_TAG_RES_INNERSPAN_TEXT = GET_ELEMENT_BY_TAG_RES_INNERSPAN_HTML[6:-7] - GET_ELEMENT_BY_TAG_TEST_STRING_UPPERCASE = ''' nice ''' From c49aa772cc0ae6f8cb92c56315bbf01b45c61a8e Mon Sep 17 00:00:00 2001 From: Allen <64094914+allendema@users.noreply.github.com> Date: Thu, 5 Dec 2024 00:43:15 +0100 Subject: [PATCH 3/3] [core] utils: fix some cases --- test/test_utils.py | 15 ++++----------- yt_dlp/utils/_utils.py | 10 +++++++--- 2 files changed, 11 insertions(+), 14 deletions(-) diff --git a/test/test_utils.py b/test/test_utils.py index e8ef15e01..896cf9f32 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -1846,7 +1846,7 @@ def 
test_get_elements_text_and_html_by_attribute(self): random text lorem ipsum

this should be returned - this should also be returned + this should also be returned
this should also be returned
@@ -1859,10 +1859,6 @@ def test_get_elements_text_and_html_by_attribute(self): GET_ELEMENT_BY_TAG_RES_INNERSPAN_HTML = GET_ELEMENT_BY_TAG_TEST_STRING.strip()[78:119] GET_ELEMENT_BY_TAG_RES_INNERSPAN_TEXT = GET_ELEMENT_BY_TAG_RES_INNERSPAN_HTML[6:-7] - GET_ELEMENT_BY_TAG_TEST_STRING_UPPERCASE = ''' - nice - ''' - def test_get_element_text_and_html_by_tag(self): html = self.GET_ELEMENT_BY_TAG_TEST_STRING @@ -1872,14 +1868,11 @@ def test_get_element_text_and_html_by_tag(self): self.assertEqual( get_element_text_and_html_by_tag('span', html), (self.GET_ELEMENT_BY_TAG_RES_INNERSPAN_TEXT, self.GET_ELEMENT_BY_TAG_RES_INNERSPAN_HTML)) - - self.assertRaises(compat_HTMLParseError, get_element_text_and_html_by_tag, 'article', html) - - html = self.GET_ELEMENT_BY_TAG_TEST_STRING_UPPERCASE - self.assertEqual( get_element_text_and_html_by_tag('SPAN', html), - ('nice', html.strip()), html) + (self.GET_ELEMENT_BY_TAG_RES_INNERSPAN_TEXT, self.GET_ELEMENT_BY_TAG_RES_INNERSPAN_HTML)) + + self.assertRaises(compat_HTMLParseError, get_element_text_and_html_by_tag, 'article', html) def test_iri_to_uri(self): self.assertEqual( diff --git a/yt_dlp/utils/_utils.py b/yt_dlp/utils/_utils.py index 9aae60584..d600a522a 100644 --- a/yt_dlp/utils/_utils.py +++ b/yt_dlp/utils/_utils.py @@ -430,10 +430,14 @@ def get_element_text_and_html_by_tag(tag, html): return its' content (text) and the whole element (html) """ def find_or_raise(haystack, needle, exc): - try: + with contextlib.suppress(ValueError): return haystack.index(needle) - except ValueError: - raise exc + + with contextlib.suppress(ValueError): + return haystack.index(needle.upper()) + + raise exc + closing_tag = f'</{tag}>' whole_start = find_or_raise( html, f'<{tag}', compat_HTMLParseError(f'opening {tag} tag not found'))