diff --git a/.github/workflows/codeql.yml b/.github/workflows/codeql.yml index 170a6ac19..910c409ef 100644 --- a/.github/workflows/codeql.yml +++ b/.github/workflows/codeql.yml @@ -33,7 +33,7 @@ jobs: # Initializes the CodeQL tools for scanning. - name: Initialize CodeQL - uses: github/codeql-action/init@v2 + uses: github/codeql-action/init@v3 with: languages: ${{ matrix.language }} # If you wish to specify custom queries, you can do so here or in a config file. @@ -47,7 +47,7 @@ jobs: # Autobuild attempts to build any compiled languages (C/C++, C#, Go, Java, or Swift). # If this step fails, then you should remove it and run the build manually (see below) - name: Autobuild - uses: github/codeql-action/autobuild@v2 + uses: github/codeql-action/autobuild@v3 # ℹ️ Command-line programs to run using the OS shell. # 📚 See https://docs.github.com/en/actions/using-workflows/workflow-syntax-for-github-actions#jobsjob_idstepsrun @@ -60,6 +60,6 @@ jobs: # ./location_of_script_within_repo/buildscript.sh - name: Perform CodeQL Analysis - uses: github/codeql-action/analyze@v2 + uses: github/codeql-action/analyze@v3 with: category: "/language:${{matrix.language}}" diff --git a/CONTRIBUTORS b/CONTRIBUTORS index 7376b1801..0b90c0e6c 100644 --- a/CONTRIBUTORS +++ b/CONTRIBUTORS @@ -736,3 +736,9 @@ NecroRomnt pjrobertson subsense test20140 +arantius +entourage8 +lfavole +mp3butcher +slipinthedove +YoshiTabletopGamer diff --git a/Changelog.md b/Changelog.md index 3232c158b..9c544feb9 100644 --- a/Changelog.md +++ b/Changelog.md @@ -4,6 +4,49 @@ # Changelog # To create a release, dispatch the https://github.com/yt-dlp/yt-dlp/actions/workflows/release.yml workflow on master --> +### 2025.02.19 + +#### Core changes +- **jsinterp** + - [Add `js_number_to_string`](https://github.com/yt-dlp/yt-dlp/commit/0d9f061d38c3a4da61972e2adad317079f2f1c84) ([#12110](https://github.com/yt-dlp/yt-dlp/issues/12110)) by [Grub4K](https://github.com/Grub4K) + - [Improve zeroise](https://github.com/yt-dlp/yt-dlp/commit/4ca8c44a073d5aa3a3e3112c35b2b23d6ce25ac6) ([#12313](https://github.com/yt-dlp/yt-dlp/issues/12313)) by [seproDev](https://github.com/seproDev) + +#### Extractor changes +- **acast**: [Support shows.acast.com URLs](https://github.com/yt-dlp/yt-dlp/commit/57c717fee4bfbc9309845bbb48901b72e4b69304) ([#12223](https://github.com/yt-dlp/yt-dlp/issues/12223)) by [barsnick](https://github.com/barsnick) +- **cwtv** + - [Fix extractor](https://github.com/yt-dlp/yt-dlp/commit/18a28514e306e822eab4f3a79c76d515bf076406) ([#12207](https://github.com/yt-dlp/yt-dlp/issues/12207)) by [arantius](https://github.com/arantius) + - movie: [Add extractor](https://github.com/yt-dlp/yt-dlp/commit/03c3d705778c07739e0034b51490877cffdc0983) ([#12227](https://github.com/yt-dlp/yt-dlp/issues/12227)) by [bashonly](https://github.com/bashonly) +- **digiview**: [Add extractor](https://github.com/yt-dlp/yt-dlp/commit/f53553087d3fde9dcd61d6e9f98caf09db1d8ef2) ([#9902](https://github.com/yt-dlp/yt-dlp/issues/9902)) by [lfavole](https://github.com/lfavole) +- **dropbox**: [Fix extraction](https://github.com/yt-dlp/yt-dlp/commit/861aeec449c8f3c062d962945b234ff0341f61f3) ([#12228](https://github.com/yt-dlp/yt-dlp/issues/12228)) by [bashonly](https://github.com/bashonly) +- **francetv** + - site + - [Fix extractor](https://github.com/yt-dlp/yt-dlp/commit/817483ccc68aed6049ed9c4a2ffae44ca82d2b1c) ([#12236](https://github.com/yt-dlp/yt-dlp/issues/12236)) by [bashonly](https://github.com/bashonly) + - [Fix livestream 
extraction](https://github.com/yt-dlp/yt-dlp/commit/1295bbedd45fa8d9bc3f7a194864ae280297848e) ([#12316](https://github.com/yt-dlp/yt-dlp/issues/12316)) by [bashonly](https://github.com/bashonly) +- **francetvinfo.fr**: [Fix extractor](https://github.com/yt-dlp/yt-dlp/commit/5c4c2ddfaa47988b4d50c1ad4988badc0b4f30c2) ([#12402](https://github.com/yt-dlp/yt-dlp/issues/12402)) by [bashonly](https://github.com/bashonly) +- **gem.cbc.ca**: [Fix extractors](https://github.com/yt-dlp/yt-dlp/commit/5271ef48c6f61c145e03e18e960995d2e651d205) ([#12404](https://github.com/yt-dlp/yt-dlp/issues/12404)) by [bashonly](https://github.com/bashonly), [dirkf](https://github.com/dirkf) +- **generic**: [Extract `live_status` for DASH manifest URLs](https://github.com/yt-dlp/yt-dlp/commit/19edaa44fcd375f54e63d6227b092f5252d3e889) ([#12256](https://github.com/yt-dlp/yt-dlp/issues/12256)) by [mp3butcher](https://github.com/mp3butcher) +- **globo**: [Fix extractor](https://github.com/yt-dlp/yt-dlp/commit/f8d0161455f00add65585ca1a476a7b5d56f5f96) ([#11795](https://github.com/yt-dlp/yt-dlp/issues/11795)) by [slipinthedove](https://github.com/slipinthedove), [YoshiTabletopGamer](https://github.com/YoshiTabletopGamer) +- **goplay**: [Fix extractor](https://github.com/yt-dlp/yt-dlp/commit/d59f14a0a7a8b55e6bf468237def62b73ab4a517) ([#12237](https://github.com/yt-dlp/yt-dlp/issues/12237)) by [alard](https://github.com/alard) +- **pbs**: [Support www.thirteen.org URLs](https://github.com/yt-dlp/yt-dlp/commit/9fb8ab2ff67fb699f60cce09163a580976e90c0e) ([#11191](https://github.com/yt-dlp/yt-dlp/issues/11191)) by [rohieb](https://github.com/rohieb) +- **reddit**: [Bypass gated subreddit warning](https://github.com/yt-dlp/yt-dlp/commit/6ca23ffaa4663cb552f937f0b1e9769b66db11bd) ([#12335](https://github.com/yt-dlp/yt-dlp/issues/12335)) by [bashonly](https://github.com/bashonly) +- **twitter**: [Fix syndication token generation](https://github.com/yt-dlp/yt-dlp/commit/14cd7f3443c6da4d49edaefcc12da9dee86e243e) ([#12107](https://github.com/yt-dlp/yt-dlp/issues/12107)) by [Grub4K](https://github.com/Grub4K), [pjrobertson](https://github.com/pjrobertson) +- **youtube** + - [Retry on more critical requests](https://github.com/yt-dlp/yt-dlp/commit/d48e612609d012abbea3785be4d26d78a014abb2) ([#12339](https://github.com/yt-dlp/yt-dlp/issues/12339)) by [coletdjnz](https://github.com/coletdjnz) + - [nsig workaround for `tce` player JS](https://github.com/yt-dlp/yt-dlp/commit/ec17fb16e8d69d4e3e10fb73bf3221be8570dfee) ([#12401](https://github.com/yt-dlp/yt-dlp/issues/12401)) by [bashonly](https://github.com/bashonly) +- **zdf**: [Extract more metadata](https://github.com/yt-dlp/yt-dlp/commit/241ace4f104d50fdf7638f9203927aefcf57a1f7) ([#9565](https://github.com/yt-dlp/yt-dlp/issues/9565)) by [StefanLobbenmeier](https://github.com/StefanLobbenmeier) (With fixes in [e7882b6](https://github.com/yt-dlp/yt-dlp/commit/e7882b682b959e476d8454911655b3e9b14c79b2) by [bashonly](https://github.com/bashonly)) + +#### Downloader changes +- **hls** + - [Fix `BYTERANGE` logic](https://github.com/yt-dlp/yt-dlp/commit/10b7ff68e98f17655e31952f6e17120b2d7dda96) ([#11972](https://github.com/yt-dlp/yt-dlp/issues/11972)) by [entourage8](https://github.com/entourage8) + - [Support `--write-pages` for m3u8 media playlists](https://github.com/yt-dlp/yt-dlp/commit/be69468752ff598cacee57bb80533deab2367a5d) ([#12333](https://github.com/yt-dlp/yt-dlp/issues/12333)) by [bashonly](https://github.com/bashonly) + - [Support `hls_media_playlist_data` format 
field](https://github.com/yt-dlp/yt-dlp/commit/c987be0acb6872c6561f28aa28171e803393d851) ([#12322](https://github.com/yt-dlp/yt-dlp/issues/12322)) by [bashonly](https://github.com/bashonly) + +#### Misc. changes +- [Improve Issue/PR templates](https://github.com/yt-dlp/yt-dlp/commit/517ddf3c3f12560ab93e3d36244dc82db9f97818) ([#11499](https://github.com/yt-dlp/yt-dlp/issues/11499)) by [seproDev](https://github.com/seproDev) (With fixes in [4ecb833](https://github.com/yt-dlp/yt-dlp/commit/4ecb833472c90e078567b561fb7c089f1aa9587b) by [bashonly](https://github.com/bashonly)) +- **cleanup**: Miscellaneous: [4985a40](https://github.com/yt-dlp/yt-dlp/commit/4985a4041770eaa0016271809a1fd950dc809a55) by [dirkf](https://github.com/dirkf), [Grub4K](https://github.com/Grub4K), [StefanLobbenmeier](https://github.com/StefanLobbenmeier) +- **docs**: [Add note to `supportedsites.md`](https://github.com/yt-dlp/yt-dlp/commit/01a63629a21781458dcbd38779898e117678f5ff) ([#12382](https://github.com/yt-dlp/yt-dlp/issues/12382)) by [seproDev](https://github.com/seproDev) +- **test**: download: [Validate and sort info dict fields](https://github.com/yt-dlp/yt-dlp/commit/208163447408c78673b08c172beafe5c310fb167) ([#12299](https://github.com/yt-dlp/yt-dlp/issues/12299)) by [bashonly](https://github.com/bashonly), [pzhlkj6612](https://github.com/pzhlkj6612) + ### 2025.01.26 #### Core changes diff --git a/README.md b/README.md index 592468020..634cff5a5 100644 --- a/README.md +++ b/README.md @@ -337,10 +337,11 @@ ## General Options: --plugin-dirs PATH Path to an additional directory to search for plugins. This option can be used multiple times to add multiple directories. - Note that this currently only works for - extractor plugins; postprocessor plugins can - only be loaded from the default plugin - directories + Use "default" to search the default plugin + directories (default) + --no-plugin-dirs Clear plugin directories to search, + including defaults and those provided by + previous --plugin-dirs --flat-playlist Do not extract a playlist's URL result entries; some entry metadata may be missing and downloading may be bypassed @@ -1525,7 +1526,7 @@ ## Sorting Formats - `hasvid`: Gives priority to formats that have a video stream - `hasaud`: Gives priority to formats that have an audio stream - `ie_pref`: The format preference - - `lang`: The language preference + - `lang`: The language preference as determined by the extractor (e.g. 
original language preferred over audio description) - `quality`: The quality of the format - `source`: The preference of the source - `proto`: Protocol used for download (`https`/`ftps` > `http`/`ftp` > `m3u8_native`/`m3u8` > `http_dash_segments`> `websocket_frag` > `mms`/`rtsp` > `f4f`/`f4m`) diff --git a/devscripts/make_lazy_extractors.py b/devscripts/make_lazy_extractors.py index d288d8429..0ce773e82 100644 --- a/devscripts/make_lazy_extractors.py +++ b/devscripts/make_lazy_extractors.py @@ -10,6 +10,9 @@ from inspect import getsource from devscripts.utils import get_filename_args, read_file, write_file +from yt_dlp.extractor import import_extractors +from yt_dlp.extractor.common import InfoExtractor, SearchInfoExtractor +from yt_dlp.globals import extractors NO_ATTR = object() STATIC_CLASS_PROPERTIES = [ @@ -38,8 +41,7 @@ def main(): lazy_extractors_filename = get_filename_args(default_outfile='yt_dlp/extractor/lazy_extractors.py') - from yt_dlp.extractor.extractors import _ALL_CLASSES - from yt_dlp.extractor.common import InfoExtractor, SearchInfoExtractor + import_extractors() DummyInfoExtractor = type('InfoExtractor', (InfoExtractor,), {'IE_NAME': NO_ATTR}) module_src = '\n'.join(( @@ -47,7 +49,7 @@ def main(): ' _module = None', *extra_ie_code(DummyInfoExtractor), '\nclass LazyLoadSearchExtractor(LazyLoadExtractor):\n pass\n', - *build_ies(_ALL_CLASSES, (InfoExtractor, SearchInfoExtractor), DummyInfoExtractor), + *build_ies(list(extractors.value.values()), (InfoExtractor, SearchInfoExtractor), DummyInfoExtractor), )) write_file(lazy_extractors_filename, f'{module_src}\n') @@ -73,7 +75,7 @@ def build_ies(ies, bases, attr_base): if ie in ies: names.append(ie.__name__) - yield f'\n_ALL_CLASSES = [{", ".join(names)}]' + yield '\n_CLASS_LOOKUP = {%s}' % ', '.join(f'{name!r}: {name}' for name in names) def sort_ies(ies, ignored_bases): diff --git a/devscripts/make_supportedsites.py b/devscripts/make_supportedsites.py index 01548ef97..145f6d47f 100644 --- a/devscripts/make_supportedsites.py +++ b/devscripts/make_supportedsites.py @@ -10,10 +10,21 @@ from devscripts.utils import get_filename_args, write_file from yt_dlp.extractor import list_extractor_classes +TEMPLATE = '''\ +# Supported sites + +Below is a list of all extractors that are currently included with yt-dlp. +If a site is not listed here, it might still be supported by yt-dlp's embed extraction or generic extractor. +Not all sites listed here are guaranteed to work; websites are constantly changing and sometimes this breaks yt-dlp's support for them. +The only reliable way to check if a site is supported is to try it. 
+ +{ie_list} +''' + def main(): out = '\n'.join(ie.description() for ie in list_extractor_classes() if ie.IE_DESC is not False) - write_file(get_filename_args(), f'# Supported sites\n{out}\n') + write_file(get_filename_args(), TEMPLATE.format(ie_list=out)) if __name__ == '__main__': diff --git a/devscripts/run_tests.py b/devscripts/run_tests.py index eb614fe59..ebb3500b6 100755 --- a/devscripts/run_tests.py +++ b/devscripts/run_tests.py @@ -25,7 +25,8 @@ def parse_args(): def run_tests(*tests, pattern=None, ci=False): - run_core = 'core' in tests or (not pattern and not tests) + # XXX: hatch uses `tests` if no arguments are passed + run_core = 'core' in tests or 'tests' in tests or (not pattern and not tests) run_download = 'download' in tests pytest_args = args.pytest_args or os.getenv('HATCH_TEST_ARGS', '') diff --git a/pyproject.toml b/pyproject.toml index 5eb9a9644..2a0008a45 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -384,6 +384,7 @@ select = [ "W391", "W504", ] +exclude = "*/extractor/lazy_extractors.py,*venv*,*/test/testdata/sigs/player-*.js,.idea,.vscode" [tool.pytest.ini_options] addopts = "-ra -v --strict-markers" diff --git a/supportedsites.md b/supportedsites.md index 70909ef00..0924c88ff 100644 --- a/supportedsites.md +++ b/supportedsites.md @@ -1,4 +1,10 @@ # Supported sites + +Below is a list of all extractors that are currently included with yt-dlp. +If a site is not listed here, it might still be supported by yt-dlp's embed extraction or generic extractor. +Not all sites listed here are guaranteed to work; websites are constantly changing and sometimes this breaks yt-dlp's support for them. +The only reliable way to check if a site is supported is to try it. + - **17live** - **17live:clip** - **1News**: 1news.co.nz article videos @@ -314,7 +320,8 @@ # Supported sites - **curiositystream**: [*curiositystream*](## "netrc machine") - **curiositystream:collections**: [*curiositystream*](## "netrc machine") - **curiositystream:series**: [*curiositystream*](## "netrc machine") - - **CWTV** + - **cwtv** + - **cwtv:movie** - **Cybrary**: [*cybrary*](## "netrc machine") - **CybraryCourse**: [*cybrary*](## "netrc machine") - **DacastPlaylist** @@ -349,6 +356,7 @@ # Supported sites - **DigitalConcertHall**: [*digitalconcerthall*](## "netrc machine") DigitalConcertHall extractor - **DigitallySpeaking** - **Digiteka** + - **Digiview** - **DiscogsReleasePlaylist** - **DiscoveryLife** - **DiscoveryNetworksDe** @@ -465,9 +473,9 @@ # Supported sites - **fptplay**: fptplay.vn - **FranceCulture** - **FranceInter** - - **FranceTV** + - **francetv** + - **francetv:site** - **francetvinfo.fr** - - **FranceTVSite** - **Freesound** - **freespeech.org** - **freetv:series** @@ -499,7 +507,7 @@ # Supported sites - **GediDigital** - **gem.cbc.ca**: [*cbcgem*](## "netrc machine") - **gem.cbc.ca:live** - - **gem.cbc.ca:playlist** + - **gem.cbc.ca:playlist**: [*cbcgem*](## "netrc machine") - **Genius** - **GeniusLyrics** - **Germanupa**: germanupa.de diff --git a/test/helper.py b/test/helper.py index 193019019..4169af799 100644 --- a/test/helper.py +++ b/test/helper.py @@ -101,87 +101,109 @@ def getwebpagetestcases(): md5 = lambda s: hashlib.md5(s.encode()).hexdigest() -def expect_value(self, got, expected, field): - if isinstance(expected, str) and expected.startswith('re:'): - match_str = expected[len('re:'):] - match_rex = re.compile(match_str) +def _iter_differences(got, expected, field): + if isinstance(expected, str): + op, _, val = expected.partition(':') + if op in ('mincount', 
'maxcount', 'count'): + if not isinstance(got, (list, dict)): + yield field, f'expected either {list.__name__} or {dict.__name__}, got {type(got).__name__}' + return - self.assertTrue( - isinstance(got, str), - f'Expected a {str.__name__} object, but got {type(got).__name__} for field {field}') - self.assertTrue( - match_rex.match(got), - f'field {field} (value: {got!r}) should match {match_str!r}') - elif isinstance(expected, str) and expected.startswith('startswith:'): - start_str = expected[len('startswith:'):] - self.assertTrue( - isinstance(got, str), - f'Expected a {str.__name__} object, but got {type(got).__name__} for field {field}') - self.assertTrue( - got.startswith(start_str), - f'field {field} (value: {got!r}) should start with {start_str!r}') - elif isinstance(expected, str) and expected.startswith('contains:'): - contains_str = expected[len('contains:'):] - self.assertTrue( - isinstance(got, str), - f'Expected a {str.__name__} object, but got {type(got).__name__} for field {field}') - self.assertTrue( - contains_str in got, - f'field {field} (value: {got!r}) should contain {contains_str!r}') - elif isinstance(expected, type): - self.assertTrue( - isinstance(got, expected), - f'Expected type {expected!r} for field {field}, but got value {got!r} of type {type(got)!r}') - elif isinstance(expected, dict) and isinstance(got, dict): - expect_dict(self, got, expected) - elif isinstance(expected, list) and isinstance(got, list): - self.assertEqual( - len(expected), len(got), - f'Expect a list of length {len(expected)}, but got a list of length {len(got)} for field {field}') - for index, (item_got, item_expected) in enumerate(zip(got, expected)): - type_got = type(item_got) - type_expected = type(item_expected) - self.assertEqual( - type_expected, type_got, - f'Type mismatch for list item at index {index} for field {field}, ' - f'expected {type_expected!r}, got {type_got!r}') - expect_value(self, item_got, item_expected, field) - else: - if isinstance(expected, str) and expected.startswith('md5:'): - self.assertTrue( - isinstance(got, str), - f'Expected field {field} to be a unicode object, but got value {got!r} of type {type(got)!r}') - got = 'md5:' + md5(got) - elif isinstance(expected, str) and re.match(r'^(?:min|max)?count:\d+', expected): - self.assertTrue( - isinstance(got, (list, dict)), - f'Expected field {field} to be a list or a dict, but it is of type {type(got).__name__}') - op, _, expected_num = expected.partition(':') - expected_num = int(expected_num) + expected_num = int(val) + got_num = len(got) if op == 'mincount': - assert_func = assertGreaterEqual - msg_tmpl = 'Expected %d items in field %s, but only got %d' - elif op == 'maxcount': - assert_func = assertLessEqual - msg_tmpl = 'Expected maximum %d items in field %s, but got %d' - elif op == 'count': - assert_func = assertEqual - msg_tmpl = 'Expected exactly %d items in field %s, but got %d' - else: - assert False - assert_func( - self, len(got), expected_num, - msg_tmpl % (expected_num, field, len(got))) + if got_num < expected_num: + yield field, f'expected at least {val} items, got {got_num}' + return + + if op == 'maxcount': + if got_num > expected_num: + yield field, f'expected at most {val} items, got {got_num}' + return + + assert op == 'count' + if got_num != expected_num: + yield field, f'expected exactly {val} items, got {got_num}' return - self.assertEqual( - expected, got, - f'Invalid value for field {field}, expected {expected!r}, got {got!r}') + + if not isinstance(got, str): + yield field, f'expected 
{str.__name__}, got {type(got).__name__}' + return + + if op == 're': + if not re.match(val, got): + yield field, f'should match {val!r}, got {got!r}' + return + + if op == 'startswith': + if not got.startswith(val): + yield field, f'should start with {val!r}, got {got!r}' + return + + if op == 'contains': + if val not in got: + yield field, f'should contain {val!r}, got {got!r}' + return + + if op == 'md5': + hash_val = md5(got) + if hash_val != val: + yield field, f'expected hash {val}, got {hash_val}' + return + + if got != expected: + yield field, f'expected {expected!r}, got {got!r}' + return + + if isinstance(expected, dict) and isinstance(got, dict): + for key, expected_val in expected.items(): + if key not in got: + yield field, f'missing key: {key!r}' + continue + + field_name = key if field is None else f'{field}.{key}' + yield from _iter_differences(got[key], expected_val, field_name) + return + + if isinstance(expected, type): + if not isinstance(got, expected): + yield field, f'expected {expected.__name__}, got {type(got).__name__}' + return + + if isinstance(expected, list) and isinstance(got, list): + # TODO: clever diffing algorithm lmao + if len(expected) != len(got): + yield field, f'expected length of {len(expected)}, got {len(got)}' + return + + for index, (got_val, expected_val) in enumerate(zip(got, expected)): + field_name = str(index) if field is None else f'{field}.{index}' + yield from _iter_differences(got_val, expected_val, field_name) + return + + if got != expected: + yield field, f'expected {expected!r}, got {got!r}' + + +def _expect_value(message, got, expected, field): + mismatches = list(_iter_differences(got, expected, field)) + if not mismatches: + return + + fields = [field for field, _ in mismatches if field is not None] + return ''.join(( + message, f' ({", ".join(fields)})' if fields else '', + *(f'\n\t{field}: {message}' for field, message in mismatches))) + + +def expect_value(self, got, expected, field): + if message := _expect_value('values differ', got, expected, field): + self.fail(message) def expect_dict(self, got_dict, expected_dict): - for info_field, expected in expected_dict.items(): - got = got_dict.get(info_field) - expect_value(self, got, expected, info_field) + if message := _expect_value('dictionaries differ', got_dict, expected_dict, None): + self.fail(message) def sanitize_got_info_dict(got_dict): diff --git a/test/test_YoutubeDL.py b/test/test_YoutubeDL.py index 17e081bc6..708a04f92 100644 --- a/test/test_YoutubeDL.py +++ b/test/test_YoutubeDL.py @@ -6,6 +6,8 @@ import unittest from unittest.mock import patch +from yt_dlp.globals import all_plugins_loaded + sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) @@ -1427,6 +1429,12 @@ def check_for_cookie_header(result): self.assertFalse(result.get('cookies'), msg='Cookies set in cookies field for wrong domain') self.assertFalse(ydl.cookiejar.get_cookie_header(fmt['url']), msg='Cookies set in cookiejar for wrong domain') + def test_load_plugins_compat(self): + # Should try to reload plugins if they haven't already been loaded + all_plugins_loaded.value = False + FakeYDL().close() + assert all_plugins_loaded.value + if __name__ == '__main__': unittest.main() diff --git a/test/test_plugins.py b/test/test_plugins.py index 77545d136..195726b18 100644 --- a/test/test_plugins.py +++ b/test/test_plugins.py @@ -10,22 +10,71 @@ sys.path.append(str(TEST_DATA_DIR)) importlib.invalidate_caches() -from yt_dlp.utils import Config -from yt_dlp.plugins import 
PACKAGE_NAME, directories, load_plugins +from yt_dlp.plugins import ( + PACKAGE_NAME, + PluginSpec, + directories, + load_plugins, + load_all_plugins, + register_plugin_spec, +) + +from yt_dlp.globals import ( + extractors, + postprocessors, + plugin_dirs, + plugin_ies, + plugin_pps, + all_plugins_loaded, + plugin_specs, +) + + +EXTRACTOR_PLUGIN_SPEC = PluginSpec( + module_name='extractor', + suffix='IE', + destination=extractors, + plugin_destination=plugin_ies, +) + +POSTPROCESSOR_PLUGIN_SPEC = PluginSpec( + module_name='postprocessor', + suffix='PP', + destination=postprocessors, + plugin_destination=plugin_pps, +) + + +def reset_plugins(): + plugin_ies.value = {} + plugin_pps.value = {} + plugin_dirs.value = ['default'] + plugin_specs.value = {} + all_plugins_loaded.value = False + # Clearing override plugins is probably difficult + for module_name in tuple(sys.modules): + for plugin_type in ('extractor', 'postprocessor'): + if module_name.startswith(f'{PACKAGE_NAME}.{plugin_type}.'): + del sys.modules[module_name] + + importlib.invalidate_caches() class TestPlugins(unittest.TestCase): TEST_PLUGIN_DIR = TEST_DATA_DIR / PACKAGE_NAME + def setUp(self): + reset_plugins() + + def tearDown(self): + reset_plugins() + def test_directories_containing_plugins(self): self.assertIn(self.TEST_PLUGIN_DIR, map(Path, directories())) def test_extractor_classes(self): - for module_name in tuple(sys.modules): - if module_name.startswith(f'{PACKAGE_NAME}.extractor'): - del sys.modules[module_name] - plugins_ie = load_plugins('extractor', 'IE') + plugins_ie = load_plugins(EXTRACTOR_PLUGIN_SPEC) self.assertIn(f'{PACKAGE_NAME}.extractor.normal', sys.modules.keys()) self.assertIn('NormalPluginIE', plugins_ie.keys()) @@ -35,17 +84,29 @@ def test_extractor_classes(self): f'{PACKAGE_NAME}.extractor._ignore' in sys.modules, 'loaded module beginning with underscore') self.assertNotIn('IgnorePluginIE', plugins_ie.keys()) + self.assertNotIn('IgnorePluginIE', plugin_ies.value) # Don't load extractors with underscore prefix self.assertNotIn('_IgnoreUnderscorePluginIE', plugins_ie.keys()) + self.assertNotIn('_IgnoreUnderscorePluginIE', plugin_ies.value) # Don't load extractors not specified in __all__ (if supplied) self.assertNotIn('IgnoreNotInAllPluginIE', plugins_ie.keys()) + self.assertNotIn('IgnoreNotInAllPluginIE', plugin_ies.value) self.assertIn('InAllPluginIE', plugins_ie.keys()) + self.assertIn('InAllPluginIE', plugin_ies.value) + + # Don't load override extractors + self.assertNotIn('OverrideGenericIE', plugins_ie.keys()) + self.assertNotIn('OverrideGenericIE', plugin_ies.value) + self.assertNotIn('_UnderscoreOverrideGenericIE', plugins_ie.keys()) + self.assertNotIn('_UnderscoreOverrideGenericIE', plugin_ies.value) def test_postprocessor_classes(self): - plugins_pp = load_plugins('postprocessor', 'PP') + plugins_pp = load_plugins(POSTPROCESSOR_PLUGIN_SPEC) self.assertIn('NormalPluginPP', plugins_pp.keys()) + self.assertIn(f'{PACKAGE_NAME}.postprocessor.normal', sys.modules.keys()) + self.assertIn('NormalPluginPP', plugin_pps.value) def test_importing_zipped_module(self): zip_path = TEST_DATA_DIR / 'zipped_plugins.zip' @@ -58,10 +119,10 @@ def test_importing_zipped_module(self): package = importlib.import_module(f'{PACKAGE_NAME}.{plugin_type}') self.assertIn(zip_path / PACKAGE_NAME / plugin_type, map(Path, package.__path__)) - plugins_ie = load_plugins('extractor', 'IE') + plugins_ie = load_plugins(EXTRACTOR_PLUGIN_SPEC) self.assertIn('ZippedPluginIE', plugins_ie.keys()) - plugins_pp = 
load_plugins('postprocessor', 'PP') + plugins_pp = load_plugins(POSTPROCESSOR_PLUGIN_SPEC) self.assertIn('ZippedPluginPP', plugins_pp.keys()) finally: @@ -69,23 +130,116 @@ def test_importing_zipped_module(self): os.remove(zip_path) importlib.invalidate_caches() # reset the import caches - def test_plugin_dirs(self): - # Internal plugin dirs hack for CLI --plugin-dirs - # To be replaced with proper system later - custom_plugin_dir = TEST_DATA_DIR / 'plugin_packages' - Config._plugin_dirs = [str(custom_plugin_dir)] - importlib.invalidate_caches() # reset the import caches + def test_reloading_plugins(self): + reload_plugins_path = TEST_DATA_DIR / 'reload_plugins' + load_plugins(EXTRACTOR_PLUGIN_SPEC) + load_plugins(POSTPROCESSOR_PLUGIN_SPEC) + # Remove default folder and add reload_plugin path + sys.path.remove(str(TEST_DATA_DIR)) + sys.path.append(str(reload_plugins_path)) + importlib.invalidate_caches() try: - package = importlib.import_module(f'{PACKAGE_NAME}.extractor') - self.assertIn(custom_plugin_dir / 'testpackage' / PACKAGE_NAME / 'extractor', map(Path, package.__path__)) + for plugin_type in ('extractor', 'postprocessor'): + package = importlib.import_module(f'{PACKAGE_NAME}.{plugin_type}') + self.assertIn(reload_plugins_path / PACKAGE_NAME / plugin_type, map(Path, package.__path__)) - plugins_ie = load_plugins('extractor', 'IE') - self.assertIn('PackagePluginIE', plugins_ie.keys()) + plugins_ie = load_plugins(EXTRACTOR_PLUGIN_SPEC) + self.assertIn('NormalPluginIE', plugins_ie.keys()) + self.assertTrue( + plugins_ie['NormalPluginIE'].REPLACED, + msg='Reloading has not replaced original extractor plugin') + self.assertTrue( + extractors.value['NormalPluginIE'].REPLACED, + msg='Reloading has not replaced original extractor plugin globally') + + plugins_pp = load_plugins(POSTPROCESSOR_PLUGIN_SPEC) + self.assertIn('NormalPluginPP', plugins_pp.keys()) + self.assertTrue(plugins_pp['NormalPluginPP'].REPLACED, + msg='Reloading has not replaced original postprocessor plugin') + self.assertTrue( + postprocessors.value['NormalPluginPP'].REPLACED, + msg='Reloading has not replaced original postprocessor plugin globally') finally: - Config._plugin_dirs = [] - importlib.invalidate_caches() # reset the import caches + sys.path.remove(str(reload_plugins_path)) + sys.path.append(str(TEST_DATA_DIR)) + importlib.invalidate_caches() + + def test_extractor_override_plugin(self): + load_plugins(EXTRACTOR_PLUGIN_SPEC) + + from yt_dlp.extractor.generic import GenericIE + + self.assertEqual(GenericIE.TEST_FIELD, 'override') + self.assertEqual(GenericIE.SECONDARY_TEST_FIELD, 'underscore-override') + + self.assertEqual(GenericIE.IE_NAME, 'generic+override+underscore-override') + importlib.invalidate_caches() + # test that loading a second time doesn't wrap a second time + load_plugins(EXTRACTOR_PLUGIN_SPEC) + from yt_dlp.extractor.generic import GenericIE + self.assertEqual(GenericIE.IE_NAME, 'generic+override+underscore-override') + + def test_load_all_plugin_types(self): + + # no plugin specs registered + load_all_plugins() + + self.assertNotIn(f'{PACKAGE_NAME}.extractor.normal', sys.modules.keys()) + self.assertNotIn(f'{PACKAGE_NAME}.postprocessor.normal', sys.modules.keys()) + + register_plugin_spec(EXTRACTOR_PLUGIN_SPEC) + register_plugin_spec(POSTPROCESSOR_PLUGIN_SPEC) + load_all_plugins() + self.assertTrue(all_plugins_loaded.value) + + self.assertIn(f'{PACKAGE_NAME}.extractor.normal', sys.modules.keys()) + self.assertIn(f'{PACKAGE_NAME}.postprocessor.normal', sys.modules.keys()) + + def 
test_no_plugin_dirs(self): + register_plugin_spec(EXTRACTOR_PLUGIN_SPEC) + register_plugin_spec(POSTPROCESSOR_PLUGIN_SPEC) + + plugin_dirs.value = [] + load_all_plugins() + + self.assertNotIn(f'{PACKAGE_NAME}.extractor.normal', sys.modules.keys()) + self.assertNotIn(f'{PACKAGE_NAME}.postprocessor.normal', sys.modules.keys()) + + def test_set_plugin_dirs(self): + custom_plugin_dir = str(TEST_DATA_DIR / 'plugin_packages') + plugin_dirs.value = [custom_plugin_dir] + + load_plugins(EXTRACTOR_PLUGIN_SPEC) + + self.assertIn(f'{PACKAGE_NAME}.extractor.package', sys.modules.keys()) + self.assertIn('PackagePluginIE', plugin_ies.value) + + def test_invalid_plugin_dir(self): + plugin_dirs.value = ['invalid_dir'] + with self.assertRaises(ValueError): + load_plugins(EXTRACTOR_PLUGIN_SPEC) + + def test_append_plugin_dirs(self): + custom_plugin_dir = str(TEST_DATA_DIR / 'plugin_packages') + + self.assertEqual(plugin_dirs.value, ['default']) + plugin_dirs.value.append(custom_plugin_dir) + self.assertEqual(plugin_dirs.value, ['default', custom_plugin_dir]) + + load_plugins(EXTRACTOR_PLUGIN_SPEC) + + self.assertIn(f'{PACKAGE_NAME}.extractor.package', sys.modules.keys()) + self.assertIn('PackagePluginIE', plugin_ies.value) + + def test_get_plugin_spec(self): + register_plugin_spec(EXTRACTOR_PLUGIN_SPEC) + register_plugin_spec(POSTPROCESSOR_PLUGIN_SPEC) + + self.assertEqual(plugin_specs.value.get('extractor'), EXTRACTOR_PLUGIN_SPEC) + self.assertEqual(plugin_specs.value.get('postprocessor'), POSTPROCESSOR_PLUGIN_SPEC) + self.assertIsNone(plugin_specs.value.get('invalid')) if __name__ == '__main__': diff --git a/test/testdata/plugin_packages/testpackage/yt_dlp_plugins/extractor/package.py b/test/testdata/plugin_packages/testpackage/yt_dlp_plugins/extractor/package.py index b860300d8..39020fef9 100644 --- a/test/testdata/plugin_packages/testpackage/yt_dlp_plugins/extractor/package.py +++ b/test/testdata/plugin_packages/testpackage/yt_dlp_plugins/extractor/package.py @@ -2,4 +2,5 @@ class PackagePluginIE(InfoExtractor): + _VALID_URL = 'package' pass diff --git a/test/testdata/reload_plugins/yt_dlp_plugins/extractor/normal.py b/test/testdata/reload_plugins/yt_dlp_plugins/extractor/normal.py new file mode 100644 index 000000000..6b927077f --- /dev/null +++ b/test/testdata/reload_plugins/yt_dlp_plugins/extractor/normal.py @@ -0,0 +1,10 @@ +from yt_dlp.extractor.common import InfoExtractor + + +class NormalPluginIE(InfoExtractor): + _VALID_URL = 'normal' + REPLACED = True + + +class _IgnoreUnderscorePluginIE(InfoExtractor): + pass diff --git a/test/testdata/reload_plugins/yt_dlp_plugins/postprocessor/normal.py b/test/testdata/reload_plugins/yt_dlp_plugins/postprocessor/normal.py new file mode 100644 index 000000000..5e44ba2b5 --- /dev/null +++ b/test/testdata/reload_plugins/yt_dlp_plugins/postprocessor/normal.py @@ -0,0 +1,5 @@ +from yt_dlp.postprocessor.common import PostProcessor + + +class NormalPluginPP(PostProcessor): + REPLACED = True diff --git a/test/testdata/yt_dlp_plugins/extractor/ignore.py b/test/testdata/yt_dlp_plugins/extractor/ignore.py index 816a16aa2..dca111a37 100644 --- a/test/testdata/yt_dlp_plugins/extractor/ignore.py +++ b/test/testdata/yt_dlp_plugins/extractor/ignore.py @@ -6,6 +6,7 @@ class IgnoreNotInAllPluginIE(InfoExtractor): class InAllPluginIE(InfoExtractor): + _VALID_URL = 'inallpluginie' pass diff --git a/test/testdata/yt_dlp_plugins/extractor/normal.py b/test/testdata/yt_dlp_plugins/extractor/normal.py index b09009bdc..996b2936f 100644 --- 
a/test/testdata/yt_dlp_plugins/extractor/normal.py +++ b/test/testdata/yt_dlp_plugins/extractor/normal.py @@ -2,8 +2,10 @@ class NormalPluginIE(InfoExtractor): - pass + _VALID_URL = 'normalpluginie' + REPLACED = False class _IgnoreUnderscorePluginIE(InfoExtractor): + _VALID_URL = 'ignoreunderscorepluginie' pass diff --git a/test/testdata/yt_dlp_plugins/extractor/override.py b/test/testdata/yt_dlp_plugins/extractor/override.py new file mode 100644 index 000000000..766dc32e1 --- /dev/null +++ b/test/testdata/yt_dlp_plugins/extractor/override.py @@ -0,0 +1,5 @@ +from yt_dlp.extractor.generic import GenericIE + + +class OverrideGenericIE(GenericIE, plugin_name='override'): + TEST_FIELD = 'override' diff --git a/test/testdata/yt_dlp_plugins/extractor/overridetwo.py b/test/testdata/yt_dlp_plugins/extractor/overridetwo.py new file mode 100644 index 000000000..826184c64 --- /dev/null +++ b/test/testdata/yt_dlp_plugins/extractor/overridetwo.py @@ -0,0 +1,5 @@ +from yt_dlp.extractor.generic import GenericIE + + +class _UnderscoreOverrideGenericIE(GenericIE, plugin_name='underscore-override'): + SECONDARY_TEST_FIELD = 'underscore-override' diff --git a/test/testdata/yt_dlp_plugins/postprocessor/normal.py b/test/testdata/yt_dlp_plugins/postprocessor/normal.py index 315b85a48..1e94d7b8b 100644 --- a/test/testdata/yt_dlp_plugins/postprocessor/normal.py +++ b/test/testdata/yt_dlp_plugins/postprocessor/normal.py @@ -2,4 +2,4 @@ class NormalPluginPP(PostProcessor): - pass + REPLACED = False diff --git a/test/testdata/zipped_plugins/yt_dlp_plugins/extractor/zipped.py b/test/testdata/zipped_plugins/yt_dlp_plugins/extractor/zipped.py index 01542e0d8..c5140bb02 100644 --- a/test/testdata/zipped_plugins/yt_dlp_plugins/extractor/zipped.py +++ b/test/testdata/zipped_plugins/yt_dlp_plugins/extractor/zipped.py @@ -2,4 +2,5 @@ class ZippedPluginIE(InfoExtractor): + _VALID_URL = 'zippedpluginie' pass diff --git a/yt_dlp/YoutubeDL.py b/yt_dlp/YoutubeDL.py index 3293a9076..8790b326b 100644 --- a/yt_dlp/YoutubeDL.py +++ b/yt_dlp/YoutubeDL.py @@ -30,9 +30,18 @@ from .cookies import CookieLoadError, LenientSimpleCookie, load_cookies from .downloader import FFmpegFD, get_suitable_downloader, shorten_protocol_name from .downloader.rtmp import rtmpdump_version -from .extractor import gen_extractor_classes, get_info_extractor +from .extractor import gen_extractor_classes, get_info_extractor, import_extractors from .extractor.common import UnsupportedURLIE from .extractor.openload import PhantomJSwrapper +from .globals import ( + IN_CLI, + LAZY_EXTRACTORS, + plugin_ies, + plugin_ies_overrides, + plugin_pps, + all_plugins_loaded, + plugin_dirs, +) from .minicurses import format_text from .networking import HEADRequest, Request, RequestDirector from .networking.common import _REQUEST_HANDLERS, _RH_PREFERENCES @@ -44,8 +53,7 @@ network_exceptions, ) from .networking.impersonate import ImpersonateRequestHandler -from .plugins import directories as plugin_directories -from .postprocessor import _PLUGIN_CLASSES as plugin_pps +from .plugins import directories as plugin_directories, load_all_plugins from .postprocessor import ( EmbedThumbnailPP, FFmpegFixupDuplicateMoovPP, @@ -157,7 +165,7 @@ write_json_file, write_string, ) -from .utils._utils import _UnsafeExtensionError, _YDLLogger +from .utils._utils import _UnsafeExtensionError, _YDLLogger, _ProgressState from .utils.networking import ( HTTPHeaderDict, clean_headers, @@ -642,20 +650,23 @@ def __init__(self, params=None, auto_init=True): self.cache = Cache(self) 
self.__header_cookies = [] - stdout = sys.stderr if self.params.get('logtostderr') else sys.stdout - self._out_files = Namespace( - out=stdout, - error=sys.stderr, - screen=sys.stderr if self.params.get('quiet') else stdout, - console=None if os.name == 'nt' else next( - filter(supports_terminal_sequences, (sys.stderr, sys.stdout)), None), - ) + # compat for API: load plugins if they have not already + if not all_plugins_loaded.value: + load_all_plugins() try: windows_enable_vt_mode() except Exception as e: self.write_debug(f'Failed to enable VT mode: {e}') + stdout = sys.stderr if self.params.get('logtostderr') else sys.stdout + self._out_files = Namespace( + out=stdout, + error=sys.stderr, + screen=sys.stderr if self.params.get('quiet') else stdout, + console=next(filter(supports_terminal_sequences, (sys.stderr, sys.stdout)), None), + ) + if self.params.get('no_color'): if self.params.get('color') is not None: self.params.setdefault('_warnings', []).append( @@ -956,21 +967,22 @@ def to_stderr(self, message, only_once=False): self._write_string(f'{self._bidi_workaround(message)}\n', self._out_files.error, only_once=only_once) def _send_console_code(self, code): - if os.name == 'nt' or not self._out_files.console: - return + if not supports_terminal_sequences(self._out_files.console): + return False self._write_string(code, self._out_files.console) + return True - def to_console_title(self, message): - if not self.params.get('consoletitle', False): + def to_console_title(self, message=None, progress_state=None, percent=None): + if not self.params.get('consoletitle'): return - message = remove_terminal_sequences(message) - if os.name == 'nt': - if ctypes.windll.kernel32.GetConsoleWindow(): - # c_wchar_p() might not be necessary if `message` is - # already of type unicode() - ctypes.windll.kernel32.SetConsoleTitleW(ctypes.c_wchar_p(message)) - else: - self._send_console_code(f'\033]0;{message}\007') + + if message: + success = self._send_console_code(f'\033]0;{remove_terminal_sequences(message)}\007') + if not success and os.name == 'nt' and ctypes.windll.kernel32.GetConsoleWindow(): + ctypes.windll.kernel32.SetConsoleTitleW(message) + + if isinstance(progress_state, _ProgressState): + self._send_console_code(progress_state.get_ansi_escape(percent)) def save_console_title(self): if not self.params.get('consoletitle') or self.params.get('simulate'): @@ -984,6 +996,7 @@ def restore_console_title(self): def __enter__(self): self.save_console_title() + self.to_console_title(progress_state=_ProgressState.INDETERMINATE) return self def save_cookies(self): @@ -992,6 +1005,7 @@ def save_cookies(self): def __exit__(self, *args): self.restore_console_title() + self.to_console_title(progress_state=_ProgressState.HIDDEN) self.close() def close(self): @@ -3993,15 +4007,6 @@ def print_debug_header(self): if not self.params.get('verbose'): return - from . import _IN_CLI # Must be delayed import - - # These imports can be slow. 
So import them only as needed - from .extractor.extractors import _LAZY_LOADER - from .extractor.extractors import ( - _PLUGIN_CLASSES as plugin_ies, - _PLUGIN_OVERRIDES as plugin_ie_overrides, - ) - def get_encoding(stream): ret = str(getattr(stream, 'encoding', f'missing ({type(stream).__name__})')) additional_info = [] @@ -4040,17 +4045,18 @@ def get_encoding(stream): _make_label(ORIGIN, CHANNEL.partition('@')[2] or __version__, __version__), f'[{RELEASE_GIT_HEAD[:9]}]' if RELEASE_GIT_HEAD else '', '' if source == 'unknown' else f'({source})', - '' if _IN_CLI else 'API' if klass == YoutubeDL else f'API:{self.__module__}.{klass.__qualname__}', + '' if IN_CLI.value else 'API' if klass == YoutubeDL else f'API:{self.__module__}.{klass.__qualname__}', delim=' ')) - if not _IN_CLI: + if not IN_CLI.value: write_debug(f'params: {self.params}') - if not _LAZY_LOADER: - if os.environ.get('YTDLP_NO_LAZY_EXTRACTORS'): - write_debug('Lazy loading extractors is forcibly disabled') - else: - write_debug('Lazy loading extractors is disabled') + import_extractors() + lazy_extractors = LAZY_EXTRACTORS.value + if lazy_extractors is None: + write_debug('Lazy loading extractors is disabled') + elif not lazy_extractors: + write_debug('Lazy loading extractors is forcibly disabled') if self.params['compat_opts']: write_debug('Compatibility options: {}'.format(', '.join(self.params['compat_opts']))) @@ -4079,24 +4085,27 @@ def get_encoding(stream): write_debug(f'Proxy map: {self.proxies}') write_debug(f'Request Handlers: {", ".join(rh.RH_NAME for rh in self._request_director.handlers.values())}') - if os.environ.get('YTDLP_NO_PLUGINS'): - write_debug('Plugins are forcibly disabled') - return - for plugin_type, plugins in {'Extractor': plugin_ies, 'Post-Processor': plugin_pps}.items(): - display_list = ['{}{}'.format( - klass.__name__, '' if klass.__name__ == name else f' as {name}') - for name, klass in plugins.items()] + for plugin_type, plugins in (('Extractor', plugin_ies), ('Post-Processor', plugin_pps)): + display_list = [ + klass.__name__ if klass.__name__ == name else f'{klass.__name__} as {name}' + for name, klass in plugins.value.items()] if plugin_type == 'Extractor': display_list.extend(f'{plugins[-1].IE_NAME.partition("+")[2]} ({parent.__name__})' - for parent, plugins in plugin_ie_overrides.items()) + for parent, plugins in plugin_ies_overrides.value.items()) if not display_list: continue write_debug(f'{plugin_type} Plugins: {", ".join(sorted(display_list))}') - plugin_dirs = plugin_directories() - if plugin_dirs: - write_debug(f'Plugin directories: {plugin_dirs}') + plugin_dirs_msg = 'none' + if not plugin_dirs.value: + plugin_dirs_msg = 'none (disabled)' + else: + found_plugin_directories = plugin_directories() + if found_plugin_directories: + plugin_dirs_msg = ', '.join(found_plugin_directories) + + write_debug(f'Plugin directories: {plugin_dirs_msg}') @functools.cached_property def proxies(self): diff --git a/yt_dlp/__init__.py b/yt_dlp/__init__.py index c76fe2748..7d8f10047 100644 --- a/yt_dlp/__init__.py +++ b/yt_dlp/__init__.py @@ -19,7 +19,9 @@ from .extractor import list_extractor_classes from .extractor.adobepass import MSO_INFO from .networking.impersonate import ImpersonateTarget +from .globals import IN_CLI, plugin_dirs from .options import parseOpts +from .plugins import load_all_plugins as _load_all_plugins from .postprocessor import ( FFmpegExtractAudioPP, FFmpegMergerPP, @@ -33,7 +35,6 @@ ) from .update import Updater from .utils import ( - Config, NO_DEFAULT, POSTPROCESS_WHEN, 
DateRange, @@ -66,8 +67,6 @@ from .utils._utils import _UnsafeExtensionError from .YoutubeDL import YoutubeDL -_IN_CLI = False - def _exit(status=0, *args): for msg in args: @@ -295,18 +294,20 @@ def parse_sleep_func(expr): raise ValueError(f'invalid {key} retry sleep expression {expr!r}') # Bytes - def validate_bytes(name, value): + def validate_bytes(name, value, strict_positive=False): if value is None: return None numeric_limit = parse_bytes(value) - validate(numeric_limit is not None, 'rate limit', value) + validate(numeric_limit is not None, name, value) + if strict_positive: + validate_positive(name, numeric_limit, True) return numeric_limit - opts.ratelimit = validate_bytes('rate limit', opts.ratelimit) + opts.ratelimit = validate_bytes('rate limit', opts.ratelimit, True) opts.throttledratelimit = validate_bytes('throttled rate limit', opts.throttledratelimit) opts.min_filesize = validate_bytes('min filesize', opts.min_filesize) opts.max_filesize = validate_bytes('max filesize', opts.max_filesize) - opts.buffersize = validate_bytes('buffer size', opts.buffersize) + opts.buffersize = validate_bytes('buffer size', opts.buffersize, True) opts.http_chunk_size = validate_bytes('http chunk size', opts.http_chunk_size) # Output templates @@ -431,6 +432,9 @@ def metadataparser_actions(f): } # Other options + if opts.plugin_dirs is None: + opts.plugin_dirs = ['default'] + if opts.playlist_items is not None: try: tuple(PlaylistEntries.parse_playlist_items(opts.playlist_items)) @@ -971,11 +976,6 @@ def _real_main(argv=None): parser, opts, all_urls, ydl_opts = parse_options(argv) - # HACK: Set the plugin dirs early on - # TODO(coletdjnz): remove when plugin globals system is implemented - if opts.plugin_dirs is not None: - Config._plugin_dirs = list(map(expand_path, opts.plugin_dirs)) - # Dump user agent if opts.dump_user_agent: ua = traverse_obj(opts.headers, 'User-Agent', casesense=False, default=std_headers['User-Agent']) @@ -990,6 +990,11 @@ def _real_main(argv=None): if opts.ffmpeg_location: FFmpegPostProcessor._ffmpeg_location.set(opts.ffmpeg_location) + # load all plugins into the global lookup + plugin_dirs.value = opts.plugin_dirs + if plugin_dirs.value: + _load_all_plugins() + with YoutubeDL(ydl_opts) as ydl: pre_process = opts.update_self or opts.rm_cachedir actual_use = all_urls or opts.load_info_filename @@ -1089,8 +1094,7 @@ def make_row(target, handler): def main(argv=None): - global _IN_CLI - _IN_CLI = True + IN_CLI.value = True try: _exit(*variadic(_real_main(argv))) except (CookieLoadError, DownloadError): diff --git a/yt_dlp/downloader/__init__.py b/yt_dlp/downloader/__init__.py index 51a9f28f0..1b12bd4be 100644 --- a/yt_dlp/downloader/__init__.py +++ b/yt_dlp/downloader/__init__.py @@ -35,6 +35,7 @@ def get_suitable_downloader(info_dict, params={}, default=NO_DEFAULT, protocol=N from .rtsp import RtspFD from .websocket import WebSocketFragmentFD from .youtube_live_chat import YoutubeLiveChatFD +from .bunnycdn import BunnyCdnFD PROTOCOL_MAP = { 'rtmp': RtmpFD, @@ -55,6 +56,7 @@ def get_suitable_downloader(info_dict, params={}, default=NO_DEFAULT, protocol=N 'websocket_frag': WebSocketFragmentFD, 'youtube_live_chat': YoutubeLiveChatFD, 'youtube_live_chat_replay': YoutubeLiveChatFD, + 'bunnycdn': BunnyCdnFD, } diff --git a/yt_dlp/downloader/bunnycdn.py b/yt_dlp/downloader/bunnycdn.py new file mode 100644 index 000000000..e787f698a --- /dev/null +++ b/yt_dlp/downloader/bunnycdn.py @@ -0,0 +1,50 @@ +import hashlib +import random +import 
threading + +from .common import FileDownloader +from . import HlsFD +from ..networking import Request +from ..networking.exceptions import network_exceptions + + +class BunnyCdnFD(FileDownloader): + """ + Downloads from BunnyCDN with required pings + Note, this is not a part of public API, and will be removed without notice. + DO NOT USE + """ + + def real_download(self, filename, info_dict): + self.to_screen(f'[{self.FD_NAME}] Downloading from BunnyCDN') + + fd = HlsFD(self.ydl, self.params) + + stop_event = threading.Event() + ping_thread = threading.Thread(target=self.ping_thread, args=(stop_event,), kwargs=info_dict['_bunnycdn_ping_data']) + ping_thread.start() + + try: + return fd.real_download(filename, info_dict) + finally: + stop_event.set() + + def ping_thread(self, stop_event, url, headers, secret, context_id): + # Site sends ping every 4 seconds, but this throttles the download. Pinging every 2 seconds seems to work. + ping_interval = 2 + # Hard coded resolution as it doesn't seem to matter + res = 1080 + paused = 'false' + current_time = 0 + + while not stop_event.wait(ping_interval): + current_time += ping_interval + + time = current_time + round(random.random(), 6) + md5_hash = hashlib.md5(f'{secret}_{context_id}_{time}_{paused}_{res}'.encode()).hexdigest() + ping_url = f'{url}?hash={md5_hash}&time={time}&paused={paused}&resolution={res}' + + try: + self.ydl.urlopen(Request(ping_url, headers=headers)).read() + except network_exceptions as e: + self.to_screen(f'[{self.FD_NAME}] Ping failed: {e}') diff --git a/yt_dlp/downloader/common.py b/yt_dlp/downloader/common.py index e8dcb37cc..bb9303f8a 100644 --- a/yt_dlp/downloader/common.py +++ b/yt_dlp/downloader/common.py @@ -31,6 +31,7 @@ timetuple_from_msec, try_call, ) +from ..utils._utils import _ProgressState class FileDownloader: @@ -333,7 +334,7 @@ def _report_progress_status(self, s, default_template): progress_dict), s.get('progress_idx') or 0) self.to_console_title(self.ydl.evaluate_outtmpl( progress_template.get('download-title') or 'yt-dlp %(progress._default_template)s', - progress_dict)) + progress_dict), _ProgressState.from_dict(s), s.get('_percent')) def _format_progress(self, *args, **kwargs): return self.ydl._format_text( @@ -357,6 +358,7 @@ def with_fields(*tups, default=''): '_speed_str': self.format_speed(speed).strip(), '_total_bytes_str': _format_bytes('total_bytes'), '_elapsed_str': self.format_seconds(s.get('elapsed')), + '_percent': 100.0, '_percent_str': self.format_percent(100), }) self._report_progress_status(s, join_nonempty( @@ -375,13 +377,15 @@ def with_fields(*tups, default=''): return self._progress_delta_time += update_delta + progress = try_call( + lambda: 100 * s['downloaded_bytes'] / s['total_bytes'], + lambda: 100 * s['downloaded_bytes'] / s['total_bytes_estimate'], + lambda: s['downloaded_bytes'] == 0 and 0) s.update({ '_eta_str': self.format_eta(s.get('eta')).strip(), '_speed_str': self.format_speed(s.get('speed')), - '_percent_str': self.format_percent(try_call( - lambda: 100 * s['downloaded_bytes'] / s['total_bytes'], - lambda: 100 * s['downloaded_bytes'] / s['total_bytes_estimate'], - lambda: s['downloaded_bytes'] == 0 and 0)), + '_percent': progress, + '_percent_str': self.format_percent(progress), '_total_bytes_str': _format_bytes('total_bytes'), '_total_bytes_estimate_str': _format_bytes('total_bytes_estimate'), '_downloaded_bytes_str': _format_bytes('downloaded_bytes'), diff --git a/yt_dlp/downloader/hls.py b/yt_dlp/downloader/hls.py index 7a47f8f83..1f36a07f5 100644 --- 
a/yt_dlp/downloader/hls.py +++ b/yt_dlp/downloader/hls.py @@ -16,6 +16,7 @@ update_url_query, urljoin, ) +from ..utils._utils import _request_dump_filename class HlsFD(FragmentFD): @@ -80,7 +81,15 @@ def real_download(self, filename, info_dict): self.to_screen(f'[{self.FD_NAME}] Downloading m3u8 manifest') urlh = self.ydl.urlopen(self._prepare_url(info_dict, man_url)) man_url = urlh.url - s = urlh.read().decode('utf-8', 'ignore') + s_bytes = urlh.read() + if self.params.get('write_pages'): + dump_filename = _request_dump_filename( + man_url, info_dict['id'], None, + trim_length=self.params.get('trim_file_name')) + self.to_screen(f'[{self.FD_NAME}] Saving request to {dump_filename}') + with open(dump_filename, 'wb') as outf: + outf.write(s_bytes) + s = s_bytes.decode('utf-8', 'ignore') can_download, message = self.can_download(s, info_dict, self.params.get('allow_unplayable_formats')), None if can_download: diff --git a/yt_dlp/extractor/__init__.py b/yt_dlp/extractor/__init__.py index 6bfa4bd7b..a090e942d 100644 --- a/yt_dlp/extractor/__init__.py +++ b/yt_dlp/extractor/__init__.py @@ -1,16 +1,25 @@ from ..compat.compat_utils import passthrough_module +from ..globals import extractors as _extractors_context +from ..globals import plugin_ies as _plugin_ies_context +from ..plugins import PluginSpec, register_plugin_spec passthrough_module(__name__, '.extractors') del passthrough_module +register_plugin_spec(PluginSpec( + module_name='extractor', + suffix='IE', + destination=_extractors_context, + plugin_destination=_plugin_ies_context, +)) + def gen_extractor_classes(): """ Return a list of supported extractors. The order does matter; the first extractor matched is the one handling the URL. """ - from .extractors import _ALL_CLASSES - - return _ALL_CLASSES + import_extractors() + return list(_extractors_context.value.values()) def gen_extractors(): @@ -37,6 +46,9 @@ def list_extractors(age_limit=None): def get_info_extractor(ie_name): """Returns the info extractor class with the given ie_name""" - from . import extractors + import_extractors() + return _extractors_context.value[f'{ie_name}IE'] - return getattr(extractors, f'{ie_name}IE') + +def import_extractors(): + from . 
import extractors  # noqa: F401 diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py index c937dfe13..9a49bcb30 100644 --- a/yt_dlp/extractor/_extractors.py +++ b/yt_dlp/extractor/_extractors.py @@ -312,6 +312,7 @@ ) from .bundesliga import BundesligaIE from .bundestag import BundestagIE +from .bunnycdn import BunnyCdnIE from .businessinsider import BusinessInsiderIE from .buzzfeed import BuzzFeedIE from .byutv import BYUtvIE diff --git a/yt_dlp/extractor/bunnycdn.py b/yt_dlp/extractor/bunnycdn.py new file mode 100644 index 000000000..d78753384 --- /dev/null +++ b/yt_dlp/extractor/bunnycdn.py @@ -0,0 +1,178 @@ +import json + +from .common import InfoExtractor +from ..networking import HEADRequest +from ..utils import ( + ExtractorError, + extract_attributes, + int_or_none, + parse_qs, + smuggle_url, + unsmuggle_url, + url_or_none, + urlhandle_detect_ext, +) +from ..utils.traversal import find_element, traverse_obj + + +class BunnyCdnIE(InfoExtractor): + _VALID_URL = r'https?://(?:iframe\.mediadelivery\.net|video\.bunnycdn\.com)/(?:embed|play)/(?P<library_id>\d+)/(?P<id>[\da-f-]+)' + _EMBED_REGEX = [rf'<iframe[^>]+src=[\'"](?P<url>{_VALID_URL}[^\'"]*)[\'"]'] + _TESTS = [{ + 'url': 'https://iframe.mediadelivery.net/embed/113933/e73edec1-e381-4c8b-ae73-717a140e0924', + 'info_dict': { + 'id': 'e73edec1-e381-4c8b-ae73-717a140e0924', + 'ext': 'mp4', + 'title': 'mistress morgana (3).mp4', + 'description': '', + 'timestamp': 1693251673, + 'thumbnail': r're:^https?://.*\.b-cdn\.net/e73edec1-e381-4c8b-ae73-717a140e0924/thumbnail\.jpg', + 'duration': 7.0, + 'upload_date': '20230828', + }, + 'params': {'skip_download': True}, + }, { + 'url': 'https://iframe.mediadelivery.net/play/136145/32e34c4b-0d72-437c-9abb-05e67657da34', + 'info_dict': { + 'id': '32e34c4b-0d72-437c-9abb-05e67657da34', + 'ext': 'mp4', + 'timestamp': 1691145748, + 'thumbnail': r're:^https?://.*\.b-cdn\.net/32e34c4b-0d72-437c-9abb-05e67657da34/thumbnail_9172dc16\.jpg', + 'duration': 106.0, + 'description': 'md5:981a3e899a5c78352b21ed8b2f1efd81', + 'upload_date': '20230804', + 'title': 'Sanela ist Teil der #arbeitsmarktkraft', + }, + 'params': {'skip_download': True}, + }, { + # Stream requires activation and pings + 'url': 'https://iframe.mediadelivery.net/embed/200867/2e8545ec-509d-4571-b855-4cf0235ccd75', + 'info_dict': { + 'id': '2e8545ec-509d-4571-b855-4cf0235ccd75', + 'ext': 'mp4', + 'timestamp': 1708497752, + 'title': 'netflix part 1', + 'duration': 3959.0, + 'description': '', + 'upload_date': '20240221', + 'thumbnail': r're:^https?://.*\.b-cdn\.net/2e8545ec-509d-4571-b855-4cf0235ccd75/thumbnail\.jpg', + }, + 'params': {'skip_download': True}, + }] + _WEBPAGE_TESTS = [{ + # Stream requires Referer + 'url': 'https://conword.io/', + 'info_dict': { + 'id': '3a5d863e-9cd6-447e-b6ef-e289af50b349', + 'ext': 'mp4', + 'title': 'Conword bei der Stadt Köln und Stadt Dortmund', + 'description': '', + 'upload_date': '20231031', + 'duration': 31.0, + 'thumbnail': 'https://video.watchuh.com/3a5d863e-9cd6-447e-b6ef-e289af50b349/thumbnail.jpg', + 'timestamp': 1698783879, + }, + 'params': {'skip_download': True}, + }, { + # URL requires token and expires + 'url': 'https://www.stockphotos.com/video/moscow-subway-the-train-is-arriving-at-the-park-kultury-station-10017830', + 'info_dict': { + 'id': '0b02fa20-4e8c-4140-8f87-f64d820a3386', + 'ext': 'mp4', + 'thumbnail': r're:^https?://.*\.b-cdn\.net/0b02fa20-4e8c-4140-8f87-f64d820a3386/thumbnail\.jpg', + 'title': 'Moscow subway. 
The train is arriving at the Park Kultury station.', + 'upload_date': '20240531', + 'duration': 18.0, + 'timestamp': 1717152269, + 'description': '', + }, + 'params': {'skip_download': True}, + }] + + @classmethod + def _extract_embed_urls(cls, url, webpage): + for embed_url in super()._extract_embed_urls(url, webpage): + yield smuggle_url(embed_url, {'Referer': url}) + + def _real_extract(self, url): + url, smuggled_data = unsmuggle_url(url, {}) + + video_id, library_id = self._match_valid_url(url).group('id', 'library_id') + webpage = self._download_webpage( + f'https://iframe.mediadelivery.net/embed/{library_id}/{video_id}', video_id, + headers=traverse_obj(smuggled_data, {'Referer': 'Referer'}), + query=traverse_obj(parse_qs(url), {'token': 'token', 'expires': 'expires'})) + + if (html_title := self._html_extract_title(webpage, default=None)) == '403': + raise ExtractorError( + 'This video is inaccessible. Setting a Referer header ' + 'might be required to access the video', expected=True) + elif html_title == '404': + raise ExtractorError('This video does not exist', expected=True) + + headers = {'Referer': url} + + info = traverse_obj(self._parse_html5_media_entries(url, webpage, video_id, _headers=headers), 0) or {} + formats = info.get('formats') or [] + subtitles = info.get('subtitles') or {} + + original_url = self._search_regex( + r'(?:var|const|let)\s+originalUrl\s*=\s*["\']([^"\']+)["\']', webpage, 'original url', default=None) + if url_or_none(original_url): + urlh = self._request_webpage( + HEADRequest(original_url), video_id=video_id, note='Checking original', + headers=headers, fatal=False, expected_status=(403, 404)) + if urlh and urlh.status == 200: + formats.append({ + 'url': original_url, + 'format_id': 'source', + 'quality': 1, + 'http_headers': headers, + 'ext': urlhandle_detect_ext(urlh, default='mp4'), + 'filesize': int_or_none(urlh.get_header('Content-Length')), + }) + + # MediaCage Streams require activation and pings + src_url = self._search_regex( + r'\.setAttribute\([\'"]src[\'"],\s*[\'"]([^\'"]+)[\'"]\)', webpage, 'src url', default=None) + activation_url = self._search_regex( + r'loadUrl\([\'"]([^\'"]+/activate)[\'"]', webpage, 'activation url', default=None) + ping_url = self._search_regex( + r'loadUrl\([\'"]([^\'"]+/ping)[\'"]', webpage, 'ping url', default=None) + secret = traverse_obj(parse_qs(src_url), ('secret', 0)) + context_id = traverse_obj(parse_qs(src_url), ('contextId', 0)) + ping_data = {} + if src_url and activation_url and ping_url and secret and context_id: + self._download_webpage( + activation_url, video_id, headers=headers, note='Downloading activation data') + + fmts, subs = self._extract_m3u8_formats_and_subtitles( + src_url, video_id, 'mp4', headers=headers, m3u8_id='hls', fatal=False) + for fmt in fmts: + fmt.update({ + 'protocol': 'bunnycdn', + 'http_headers': headers, + }) + formats.extend(fmts) + self._merge_subtitles(subs, target=subtitles) + + ping_data = { + '_bunnycdn_ping_data': { + 'url': ping_url, + 'headers': headers, + 'secret': secret, + 'context_id': context_id, + }, + } + + return { + 'id': video_id, + 'formats': formats, + 'subtitles': subtitles, + **traverse_obj(webpage, ({find_element(id='main-video', html=True)}, {extract_attributes}, { + 'title': ('data-plyr-config', {json.loads}, 'title', {str}), + 'thumbnail': ('data-poster', {url_or_none}), + })), + **ping_data, + **self._search_json_ld(webpage, video_id, fatal=False), + } diff --git a/yt_dlp/extractor/cbc.py b/yt_dlp/extractor/cbc.py index c0cf3da3d..e63c608fc 
100644 --- a/yt_dlp/extractor/cbc.py +++ b/yt_dlp/extractor/cbc.py @@ -14,16 +14,18 @@ js_to_json, mimetype2ext, orderedSet, + parse_age_limit, parse_iso8601, replace_extension, smuggle_url, strip_or_none, - traverse_obj, try_get, + unified_timestamp, update_url, url_basename, url_or_none, ) +from ..utils.traversal import require, traverse_obj, trim_str class CBCIE(InfoExtractor): @@ -516,9 +518,43 @@ def entries(): return self.playlist_result(entries(), playlist_id) -class CBCGemIE(InfoExtractor): +class CBCGemBaseIE(InfoExtractor): + _NETRC_MACHINE = 'cbcgem' + _GEO_COUNTRIES = ['CA'] + + def _call_show_api(self, item_id, display_id=None): + return self._download_json( + f'https://services.radio-canada.ca/ott/catalog/v2/gem/show/{item_id}', + display_id or item_id, query={'device': 'web'}) + + def _extract_item_info(self, item_info): + episode_number = None + title = traverse_obj(item_info, ('title', {str})) + if title and (mobj := re.match(r'(?P<episode>\d+)\. (?P<title>.+)', title)): + episode_number = int_or_none(mobj.group('episode')) + title = mobj.group('title') + + return { + 'episode_number': episode_number, + **traverse_obj(item_info, { + 'id': ('url', {str}), + 'episode_id': ('url', {str}), + 'description': ('description', {str}), + 'thumbnail': ('images', 'card', 'url', {url_or_none}, {update_url(query=None)}), + 'episode_number': ('episodeNumber', {int_or_none}), + 'duration': ('metadata', 'duration', {int_or_none}), + 'release_timestamp': ('metadata', 'airDate', {unified_timestamp}), + 'timestamp': ('metadata', 'availabilityDate', {unified_timestamp}), + 'age_limit': ('metadata', 'rating', {trim_str(start='C')}, {parse_age_limit}), + }), + 'episode': title, + 'title': title, + } + + +class CBCGemIE(CBCGemBaseIE): IE_NAME = 'gem.cbc.ca' - _VALID_URL = r'https?://gem\.cbc\.ca/(?:media/)?(?P<id>[0-9a-z-]+/s[0-9]+[a-z][0-9]+)' + _VALID_URL = r'https?://gem\.cbc\.ca/(?:media/)?(?P<id>[0-9a-z-]+/s(?P<season>[0-9]+)[a-z][0-9]+)' _TESTS = [{ # This is a normal, public, TV show video 'url': 'https://gem.cbc.ca/media/schitts-creek/s06e01', @@ -529,7 +565,7 @@ class CBCGemIE(InfoExtractor): 'description': 'md5:929868d20021c924020641769eb3e7f1', 'thumbnail': r're:https://images\.radio-canada\.ca/[^#?]+/cbc_schitts_creek_season_06e01_thumbnail_v01\.jpg', 'duration': 1324, - 'categories': ['comedy'], + 'genres': ['Comédie et humour'], 'series': 'Schitt\'s Creek', 'season': 'Season 6', 'season_number': 6, @@ -537,9 +573,10 @@ 'episode_number': 1, 'episode_id': 'schitts-creek/s06e01', 'upload_date': '20210618', - 'timestamp': 1623988800, + 'timestamp': 1623974400, 'release_date': '20200107', - 'release_timestamp': 1578427200, + 'release_timestamp': 1578355200, + 'age_limit': 14, }, 'params': {'format': 'bv'}, }, { @@ -557,12 +594,13 @@ 'episode_number': 1, 'episode': 'The Cup Runneth Over', 'episode_id': 'schitts-creek/s01e01', - 'duration': 1309, - 'categories': ['comedy'], + 'duration': 1308, + 'genres': ['Comédie et humour'], 'upload_date': '20210617', - 'timestamp': 1623902400, - 'release_date': '20151124', - 'release_timestamp': 1448323200, + 'timestamp': 1623888000, + 'release_date': '20151123', + 'release_timestamp': 1448236800, + 'age_limit': 14, }, 'params': {'format': 'bv'}, }, { @@ -570,9 +608,7 @@ 'only_matching': True, }] - _GEO_COUNTRIES = ['CA'] _TOKEN_API_KEY = '3f4beddd-2061-49b0-ae80-6f1f2ed65b37' - _NETRC_MACHINE = 'cbcgem' _claims_token = None def _new_claims_token(self, email, password): @@ 
-634,10 +670,12 @@ def _real_initialize(self): self._claims_token = self.cache.load(self._NETRC_MACHINE, 'claims_token') def _real_extract(self, url): - video_id = self._match_id(url) - video_info = self._download_json( - f'https://services.radio-canada.ca/ott/cbc-api/v2/assets/{video_id}', - video_id, expected_status=426) + video_id, season_number = self._match_valid_url(url).group('id', 'season') + video_info = self._call_show_api(video_id) + item_info = traverse_obj(video_info, ( + 'content', ..., 'lineups', ..., 'items', + lambda _, v: v['url'] == video_id, any, {require('item info')})) + media_id = item_info['idMedia'] email, password = self._get_login_info() if email and password: @@ -645,7 +683,20 @@ def _real_extract(self, url): headers = {'x-claims-token': claims_token} else: headers = {} - m3u8_info = self._download_json(video_info['playSession']['url'], video_id, headers=headers) + + m3u8_info = self._download_json( + 'https://services.radio-canada.ca/media/validation/v2/', + video_id, headers=headers, query={ + 'appCode': 'gem', + 'connectionType': 'hd', + 'deviceType': 'ipad', + 'multibitrate': 'true', + 'output': 'json', + 'tech': 'hls', + 'manifestVersion': '2', + 'manifestType': 'desktop', + 'idMedia': media_id, + }) if m3u8_info.get('errorCode') == 1: self.raise_geo_restricted(countries=['CA']) @@ -671,26 +722,20 @@ def _real_extract(self, url): fmt['preference'] = -2 return { + 'season_number': int_or_none(season_number), + **traverse_obj(video_info, { + 'series': ('title', {str}), + 'season_number': ('structuredMetadata', 'partofSeason', 'seasonNumber', {int_or_none}), + 'genres': ('structuredMetadata', 'genre', ..., {str}), + }), + **self._extract_item_info(item_info), 'id': video_id, 'episode_id': video_id, 'formats': formats, - **traverse_obj(video_info, { - 'title': ('title', {str}), - 'episode': ('title', {str}), - 'description': ('description', {str}), - 'thumbnail': ('image', {url_or_none}), - 'series': ('series', {str}), - 'season_number': ('season', {int_or_none}), - 'episode_number': ('episode', {int_or_none}), - 'duration': ('duration', {int_or_none}), - 'categories': ('category', {str}, all), - 'release_timestamp': ('airDate', {int_or_none(scale=1000)}), - 'timestamp': ('availableDate', {int_or_none(scale=1000)}), - }), } -class CBCGemPlaylistIE(InfoExtractor): +class CBCGemPlaylistIE(CBCGemBaseIE): IE_NAME = 'gem.cbc.ca:playlist' _VALID_URL = r'https?://gem\.cbc\.ca/(?:media/)?(?P<id>(?P<show>[0-9a-z-]+)/s(?P<season>[0-9]+))/?(?:[?#]|$)' _TESTS = [{ @@ -700,70 +745,35 @@ class CBCGemPlaylistIE(InfoExtractor): 'info_dict': { 'id': 'schitts-creek/s06', 'title': 'Season 6', - 'description': 'md5:6a92104a56cbeb5818cc47884d4326a2', 'series': 'Schitt\'s Creek', 'season_number': 6, 'season': 'Season 6', - 'thumbnail': 'https://images.radio-canada.ca/v1/synps-cbc/season/perso/cbc_schitts_creek_season_06_carousel_v03.jpg?impolicy=ott&im=Resize=(_Size_)&quality=75', }, }, { 'url': 'https://gem.cbc.ca/schitts-creek/s06', 'only_matching': True, }] - _API_BASE = 'https://services.radio-canada.ca/ott/cbc-api/v2/shows/' + + def _entries(self, season_info): + for episode in traverse_obj(season_info, ('items', lambda _, v: v['url'])): + yield self.url_result( + f'https://gem.cbc.ca/media/{episode["url"]}', CBCGemIE, + **self._extract_item_info(episode)) def _real_extract(self, url): - match = self._match_valid_url(url) - season_id = match.group('id') - show = match.group('show') - show_info = self._download_json(self._API_BASE + show, season_id, expected_status=426) - season 
= int(match.group('season')) + season_id, show, season = self._match_valid_url(url).group('id', 'show', 'season') + show_info = self._call_show_api(show, display_id=season_id) + season_info = traverse_obj(show_info, ( + 'content', ..., 'lineups', + lambda _, v: v['seasonNumber'] == int(season), any, {require('season info')})) - season_info = next((s for s in show_info['seasons'] if s.get('season') == season), None) - - if season_info is None: - raise ExtractorError(f'Couldn\'t find season {season} of {show}') - - episodes = [] - for episode in season_info['assets']: - episodes.append({ - '_type': 'url_transparent', - 'ie_key': 'CBCGem', - 'url': 'https://gem.cbc.ca/media/' + episode['id'], - 'id': episode['id'], - 'title': episode.get('title'), - 'description': episode.get('description'), - 'thumbnail': episode.get('image'), - 'series': episode.get('series'), - 'season_number': episode.get('season'), - 'season': season_info['title'], - 'season_id': season_info.get('id'), - 'episode_number': episode.get('episode'), - 'episode': episode.get('title'), - 'episode_id': episode['id'], - 'duration': episode.get('duration'), - 'categories': [episode.get('category')], - }) - - thumbnail = None - tn_uri = season_info.get('image') - # the-national was observed to use a "data:image/png;base64" - # URI for their 'image' value. The image was 1x1, and is - # probably just a placeholder, so it is ignored. - if tn_uri is not None and not tn_uri.startswith('data:'): - thumbnail = tn_uri - - return { - '_type': 'playlist', - 'entries': episodes, - 'id': season_id, - 'title': season_info['title'], - 'description': season_info.get('description'), - 'thumbnail': thumbnail, - 'series': show_info.get('title'), - 'season_number': season_info.get('season'), - 'season': season_info['title'], - } + return self.playlist_result( + self._entries(season_info), season_id, + **traverse_obj(season_info, { + 'title': ('title', {str}), + 'season': ('title', {str}), + 'season_number': ('seasonNumber', {int_or_none}), + }), series=traverse_obj(show_info, ('title', {str}))) class CBCGemLiveIE(InfoExtractor): diff --git a/yt_dlp/extractor/common.py b/yt_dlp/extractor/common.py index 3e7734ce1..b816d788f 100644 --- a/yt_dlp/extractor/common.py +++ b/yt_dlp/extractor/common.py @@ -2,7 +2,6 @@ import collections import functools import getpass -import hashlib import http.client import http.cookiejar import http.cookies @@ -30,6 +29,7 @@ from ..cookies import LenientSimpleCookie from ..downloader.f4m import get_base_url, remove_encrypted_media from ..downloader.hls import HlsFD +from ..globals import plugin_ies_overrides from ..networking import HEADRequest, Request from ..networking.exceptions import ( HTTPError, @@ -78,7 +78,6 @@ parse_iso8601, parse_m3u8_attributes, parse_resolution, - sanitize_filename, sanitize_url, smuggle_url, str_or_none, @@ -100,6 +99,7 @@ xpath_text, xpath_with_ns, ) +from ..utils._utils import _request_dump_filename class InfoExtractor: @@ -1022,23 +1022,6 @@ def __check_blocked(self, content): 'Visit http://blocklist.rkn.gov.ru/ for a block reason.', expected=True) - def _request_dump_filename(self, url, video_id, data=None): - if data is not None: - data = hashlib.md5(data).hexdigest() - basen = join_nonempty(video_id, data, url, delim='_') - trim_length = self.get_param('trim_file_name') or 240 - if len(basen) > trim_length: - h = '___' + hashlib.md5(basen.encode()).hexdigest() - basen = basen[:trim_length - len(h)] + h - filename = sanitize_filename(f'{basen}.dump', restricted=True) - # Working around 
MAX_PATH limitation on Windows (see - # http://msdn.microsoft.com/en-us/library/windows/desktop/aa365247(v=vs.85).aspx) - if os.name == 'nt': - absfilepath = os.path.abspath(filename) - if len(absfilepath) > 259: - filename = fR'\\?\{absfilepath}' - return filename - def __decode_webpage(self, webpage_bytes, encoding, headers): if not encoding: encoding = self._guess_encoding_from_content(headers.get('Content-Type', ''), webpage_bytes) @@ -1067,7 +1050,9 @@ def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errno if self.get_param('write_pages'): if isinstance(url_or_request, Request): data = self._create_request(url_or_request, data).data - filename = self._request_dump_filename(urlh.url, video_id, data) + filename = _request_dump_filename( + urlh.url, video_id, data, + trim_length=self.get_param('trim_file_name')) self.to_screen(f'Saving request to {filename}') with open(filename, 'wb') as outf: outf.write(webpage_bytes) @@ -1128,7 +1113,9 @@ def download_content(self, url_or_request, video_id, note=note, errnote=errnote, impersonate=None, require_impersonation=False): if self.get_param('load_pages'): url_or_request = self._create_request(url_or_request, data, headers, query) - filename = self._request_dump_filename(url_or_request.url, video_id, url_or_request.data) + filename = _request_dump_filename( + url_or_request.url, video_id, url_or_request.data, + trim_length=self.get_param('trim_file_name')) self.to_screen(f'Loading request from {filename}') try: with open(filename, 'rb') as dumpf: @@ -3968,14 +3955,18 @@ def _extract_url(cls, webpage): # TODO: Remove def __init_subclass__(cls, *, plugin_name=None, **kwargs): if plugin_name: mro = inspect.getmro(cls) - super_class = cls.__wrapped__ = mro[mro.index(cls) + 1] - cls.PLUGIN_NAME, cls.ie_key = plugin_name, super_class.ie_key - cls.IE_NAME = f'{super_class.IE_NAME}+{plugin_name}' + next_mro_class = super_class = mro[mro.index(cls) + 1] + while getattr(super_class, '__wrapped__', None): super_class = super_class.__wrapped__ - setattr(sys.modules[super_class.__module__], super_class.__name__, cls) - _PLUGIN_OVERRIDES[super_class].append(cls) + if not any(override.PLUGIN_NAME == plugin_name for override in plugin_ies_overrides.value[super_class]): + cls.__wrapped__ = next_mro_class + cls.PLUGIN_NAME, cls.ie_key = plugin_name, next_mro_class.ie_key + cls.IE_NAME = f'{next_mro_class.IE_NAME}+{plugin_name}' + + setattr(sys.modules[super_class.__module__], super_class.__name__, cls) + plugin_ies_overrides.value[super_class].append(cls) return super().__init_subclass__(**kwargs) @@ -4031,6 +4022,3 @@ class UnsupportedURLIE(InfoExtractor): def _real_extract(self, url): raise UnsupportedError(url) - - -_PLUGIN_OVERRIDES = collections.defaultdict(list) diff --git a/yt_dlp/extractor/dreisat.py b/yt_dlp/extractor/dreisat.py index 4b0a269b9..376ff672d 100644 --- a/yt_dlp/extractor/dreisat.py +++ b/yt_dlp/extractor/dreisat.py @@ -1,10 +1,24 @@ -from .zdf import ZDFIE +from .zdf import ZDFBaseIE -class DreiSatIE(ZDFIE): # XXX: Do not subclass from concrete IE +class DreiSatIE(ZDFBaseIE): IE_NAME = '3sat' _VALID_URL = r'https?://(?:www\.)?3sat\.de/(?:[^/]+/)*(?P<id>[^/?#&]+)\.html' _TESTS = [{ + 'url': 'https://www.3sat.de/dokumentation/reise/traumziele-suedostasiens-die-philippinen-und-vietnam-102.html', + 'info_dict': { + 'id': '231124_traumziele_philippinen_und_vietnam_dokreise', + 'ext': 'mp4', + 'title': 'Traumziele Südostasiens (1/2): Die Philippinen und Vietnam', + 'description': 'md5:26329ce5197775b596773b939354079d', 
+ 'duration': 2625.0, + 'thumbnail': 'https://www.3sat.de/assets/traumziele-suedostasiens-die-philippinen-und-vietnam-100~2400x1350?cb=1699870351148', + 'episode': 'Traumziele Südostasiens (1/2): Die Philippinen und Vietnam', + 'episode_id': 'POS_cc7ff51c-98cf-4d12-b99d-f7a551de1c95', + 'timestamp': 1738593000, + 'upload_date': '20250203', + }, + }, { # Same as https://www.zdf.de/dokumentation/ab-18/10-wochen-sommer-102.html 'url': 'https://www.3sat.de/film/ab-18/10-wochen-sommer-108.html', 'md5': '0aff3e7bc72c8813f5e0fae333316a1d', @@ -17,6 +31,7 @@ class DreiSatIE(ZDFIE): # XXX: Do not subclass from concrete IE 'timestamp': 1608604200, 'upload_date': '20201222', }, + 'skip': '410 Gone', }, { 'url': 'https://www.3sat.de/gesellschaft/schweizweit/waidmannsheil-100.html', 'info_dict': { @@ -30,6 +45,7 @@ class DreiSatIE(ZDFIE): # XXX: Do not subclass from concrete IE 'params': { 'skip_download': True, }, + 'skip': '404 Not Found', }, { # Same as https://www.zdf.de/filme/filme-sonstige/der-hauptmann-112.html 'url': 'https://www.3sat.de/film/spielfilm/der-hauptmann-100.html', @@ -39,3 +55,14 @@ class DreiSatIE(ZDFIE): # XXX: Do not subclass from concrete IE 'url': 'https://www.3sat.de/wissen/nano/nano-21-mai-2019-102.html', 'only_matching': True, }] + + def _real_extract(self, url): + video_id = self._match_id(url) + + webpage = self._download_webpage(url, video_id, fatal=False) + if webpage: + player = self._extract_player(webpage, url, fatal=False) + if player: + return self._extract_regular(url, player, video_id) + + return self._extract_mobile(video_id) diff --git a/yt_dlp/extractor/extractors.py b/yt_dlp/extractor/extractors.py index baa69d242..050bed2da 100644 --- a/yt_dlp/extractor/extractors.py +++ b/yt_dlp/extractor/extractors.py @@ -1,28 +1,35 @@ -import contextlib +import inspect import os -from ..plugins import load_plugins +from ..globals import LAZY_EXTRACTORS +from ..globals import extractors as _extractors_context -# NB: Must be before other imports so that plugins can be correctly injected -_PLUGIN_CLASSES = load_plugins('extractor', 'IE') - -_LAZY_LOADER = False +_CLASS_LOOKUP = None if not os.environ.get('YTDLP_NO_LAZY_EXTRACTORS'): - with contextlib.suppress(ImportError): - from .lazy_extractors import * # noqa: F403 - from .lazy_extractors import _ALL_CLASSES - _LAZY_LOADER = True + try: + from .lazy_extractors import _CLASS_LOOKUP + LAZY_EXTRACTORS.value = True + except ImportError: + LAZY_EXTRACTORS.value = False -if not _LAZY_LOADER: - from ._extractors import * # noqa: F403 - _ALL_CLASSES = [ # noqa: F811 - klass - for name, klass in globals().items() +if not _CLASS_LOOKUP: + from . 
import _extractors + + _CLASS_LOOKUP = { + name: value + for name, value in inspect.getmembers(_extractors) if name.endswith('IE') and name != 'GenericIE' - ] - _ALL_CLASSES.append(GenericIE) # noqa: F405 + } + _CLASS_LOOKUP['GenericIE'] = _extractors.GenericIE -globals().update(_PLUGIN_CLASSES) -_ALL_CLASSES[:0] = _PLUGIN_CLASSES.values() +# We want to append to the main lookup +_current = _extractors_context.value +for name, ie in _CLASS_LOOKUP.items(): + _current.setdefault(name, ie) -from .common import _PLUGIN_OVERRIDES # noqa: F401 + +def __getattr__(name): + value = _CLASS_LOOKUP.get(name) + if not value: + raise AttributeError(f'module {__name__} has no attribute {name}') + return value diff --git a/yt_dlp/extractor/francetv.py b/yt_dlp/extractor/francetv.py index c6036b306..5c9f8e36d 100644 --- a/yt_dlp/extractor/francetv.py +++ b/yt_dlp/extractor/francetv.py @@ -9,6 +9,7 @@ ExtractorError, clean_html, determine_ext, + extract_attributes, filter_dict, format_field, int_or_none, @@ -18,7 +19,7 @@ unsmuggle_url, url_or_none, ) -from ..utils.traversal import traverse_obj +from ..utils.traversal import find_element, traverse_obj class FranceTVBaseInfoExtractor(InfoExtractor): @@ -460,11 +461,16 @@ def _real_extract(self, url): self.url_result(dailymotion_url, DailymotionIE.ie_key()) for dailymotion_url in dailymotion_urls]) - video_id = self._search_regex( - (r'player\.load[^;]+src:\s*["\']([^"\']+)', - r'id-video=([^@]+@[^"]+)', - r'<a[^>]+href="(?:https?:)?//videos\.francetv\.fr/video/([^@]+@[^"]+)"', - r'(?:data-id|<figure[^<]+\bid)=["\']([\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})'), - webpage, 'video id') + video_id = ( + traverse_obj(webpage, ( + {find_element(tag='button', attr='data-cy', value='francetv-player-wrapper', html=True)}, + {extract_attributes}, 'id')) + or self._search_regex( + (r'player\.load[^;]+src:\s*["\']([^"\']+)', + r'id-video=([^@]+@[^"]+)', + r'<a[^>]+href="(?:https?:)?//videos\.francetv\.fr/video/([^@]+@[^"]+)"', + r'(?:data-id|<figure[^<]+\bid)=["\']([\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})'), + webpage, 'video id') + ) return self._make_url_result(video_id, url=url) diff --git a/yt_dlp/extractor/instagram.py b/yt_dlp/extractor/instagram.py index 39354bfc8..94a685cd8 100644 --- a/yt_dlp/extractor/instagram.py +++ b/yt_dlp/extractor/instagram.py @@ -695,7 +695,7 @@ def _query_vars_for(data): class InstagramStoryIE(InstagramBaseIE): - _VALID_URL = r'https?://(?:www\.)?instagram\.com/stories/(?P<user>[^/]+)/(?P<id>\d+)' + _VALID_URL = r'https?://(?:www\.)?instagram\.com/stories/(?P<user>[^/?#]+)(?:/(?P<id>\d+))?' 
IE_NAME = 'instagram:story' _TESTS = [{ @@ -705,25 +705,38 @@ class InstagramStoryIE(InstagramBaseIE): 'title': 'Rare', }, 'playlist_mincount': 50, + }, { + 'url': 'https://www.instagram.com/stories/fruits_zipper/3570766765028588805/', + 'only_matching': True, + }, { + 'url': 'https://www.instagram.com/stories/fruits_zipper', + 'only_matching': True, }] def _real_extract(self, url): - username, story_id = self._match_valid_url(url).groups() - story_info = self._download_webpage(url, story_id) - user_info = self._search_json(r'"user":', story_info, 'user info', story_id, fatal=False) + username, story_id = self._match_valid_url(url).group('user', 'id') + if username == 'highlights' and not story_id: # story id is only mandatory for highlights + raise ExtractorError('Input URL is missing a highlight ID', expected=True) + display_id = story_id or username + story_info = self._download_webpage(url, display_id) + user_info = self._search_json(r'"user":', story_info, 'user info', display_id, fatal=False) if not user_info: self.raise_login_required('This content is unreachable') user_id = traverse_obj(user_info, 'pk', 'id', expected_type=str) - story_info_url = user_id if username != 'highlights' else f'highlight:{story_id}' - if not story_info_url: # user id is only mandatory for non-highlights - raise ExtractorError('Unable to extract user id') + if username == 'highlights': + story_info_url = f'highlight:{story_id}' + else: + if not user_id: # user id is only mandatory for non-highlights + raise ExtractorError('Unable to extract user id') + story_info_url = user_id videos = traverse_obj(self._download_json( f'{self._API_BASE_URL}/feed/reels_media/?reel_ids={story_info_url}', - story_id, errnote=False, fatal=False, headers=self._api_headers), 'reels') + display_id, errnote=False, fatal=False, headers=self._api_headers), 'reels') if not videos: self.raise_login_required('You need to log in to access this content') + user_info = traverse_obj(videos, (user_id, 'user', {dict})) or {} full_name = traverse_obj(videos, (f'highlight:{story_id}', 'user', 'full_name'), (user_id, 'user', 'full_name')) story_title = traverse_obj(videos, (f'highlight:{story_id}', 'title')) @@ -733,6 +746,7 @@ def _real_extract(self, url): highlights = traverse_obj(videos, (f'highlight:{story_id}', 'items'), (user_id, 'items')) info_data = [] for highlight in highlights: + highlight.setdefault('user', {}).update(user_info) highlight_data = self._extract_product(highlight) if highlight_data.get('formats'): info_data.append({ @@ -740,4 +754,7 @@ def _real_extract(self, url): 'uploader_id': user_id, **filter_dict(highlight_data), }) + if username != 'highlights' and story_id and not self._yes_playlist(username, story_id): + return traverse_obj(info_data, (lambda _, v: v['id'] == _pk_to_id(story_id), any)) + return self.playlist_result(info_data, playlist_id=story_id, playlist_title=story_title) diff --git a/yt_dlp/extractor/niconico.py b/yt_dlp/extractor/niconico.py index 2f04de9e2..77a22777b 100644 --- a/yt_dlp/extractor/niconico.py +++ b/yt_dlp/extractor/niconico.py @@ -13,11 +13,13 @@ ExtractorError, OnDemandPagedList, clean_html, + determine_ext, float_or_none, int_or_none, join_nonempty, parse_duration, parse_iso8601, + parse_qs, parse_resolution, qualities, remove_start, @@ -1033,6 +1035,7 @@ def _real_extract(self, url): thumbnails.append({ 'id': f'{name}_{width}x{height}', 'url': img_url, + 'ext': traverse_obj(parse_qs(img_url), ('image', 0, {determine_ext(default_ext='jpg')})), **res, }) diff --git 
a/yt_dlp/extractor/pbs.py b/yt_dlp/extractor/pbs.py index 2f839a2e9..53b199415 100644 --- a/yt_dlp/extractor/pbs.py +++ b/yt_dlp/extractor/pbs.py @@ -501,7 +501,7 @@ def _extract_webpage(self, url): r"div\s*:\s*'videoembed'\s*,\s*mediaid\s*:\s*'(\d+)'", # frontline video embed r'class="coveplayerid">([^<]+)<', # coveplayer r'<section[^>]+data-coveid="(\d+)"', # coveplayer from http://www.pbs.org/wgbh/frontline/film/real-csi/ - r'\bclass="passportcoveplayer"[^>]+\bdata-media="(\d+)', # https://www.thirteen.org/programs/the-woodwrights-shop/who-wrote-the-book-of-sloyd-fggvvq/ + r'\sclass="passportcoveplayer"[^>]*\sdata-media="(\d+)', # https://www.thirteen.org/programs/the-woodwrights-shop/who-wrote-the-book-of-sloyd-fggvvq/ r'<input type="hidden" id="pbs_video_id_[0-9]+" value="([0-9]+)"/>', # jwplayer r"(?s)window\.PBS\.playerConfig\s*=\s*{.*?id\s*:\s*'([0-9]+)',", r'<div[^>]+\bdata-cove-id=["\'](\d+)"', # http://www.pbs.org/wgbh/roadshow/watch/episode/2105-indianapolis-hour-2/ diff --git a/yt_dlp/extractor/soundcloud.py b/yt_dlp/extractor/soundcloud.py index eafa306f2..c70940a60 100644 --- a/yt_dlp/extractor/soundcloud.py +++ b/yt_dlp/extractor/soundcloud.py @@ -52,7 +52,8 @@ class SoundcloudBaseIE(InfoExtractor): _API_VERIFY_AUTH_TOKEN = 'https://api-auth.soundcloud.com/connect/session%s' _HEADERS = {} - _IMAGE_REPL_RE = r'-([0-9a-z]+)\.jpg' + _IMAGE_REPL_RE = r'-[0-9a-z]+\.(?P<ext>jpg|png)' + _TAGS_RE = re.compile(r'"([^"]+)"|([^ ]+)') _ARTWORK_MAP = { 'mini': 16, @@ -331,12 +332,14 @@ def invalid_url(url): thumbnails = [] artwork_url = info.get('artwork_url') thumbnail = artwork_url or user.get('avatar_url') - if isinstance(thumbnail, str): - if re.search(self._IMAGE_REPL_RE, thumbnail): + if url_or_none(thumbnail): + if mobj := re.search(self._IMAGE_REPL_RE, thumbnail): for image_id, size in self._ARTWORK_MAP.items(): + # Soundcloud serves JPEG regardless of URL's ext *except* for "original" thumb + ext = mobj.group('ext') if image_id == 'original' else 'jpg' i = { 'id': image_id, - 'url': re.sub(self._IMAGE_REPL_RE, f'-{image_id}.jpg', thumbnail), + 'url': re.sub(self._IMAGE_REPL_RE, f'-{image_id}.{ext}', thumbnail), } if image_id == 'tiny' and not artwork_url: size = 18 @@ -372,6 +375,7 @@ def extract_count(key): 'comment_count': extract_count('comment'), 'repost_count': extract_count('reposts'), 'genres': traverse_obj(info, ('genre', {str}, filter, all, filter)), + 'tags': traverse_obj(info, ('tag_list', {self._TAGS_RE.findall}, ..., ..., filter)), 'artists': traverse_obj(info, ('publisher_metadata', 'artist', {str}, filter, all, filter)), 'formats': formats if not extract_flat else None, } @@ -425,6 +429,7 @@ class SoundcloudIE(SoundcloudBaseIE): 'repost_count': int, 'thumbnail': 'https://i1.sndcdn.com/artworks-000031955188-rwb18x-original.jpg', 'uploader_url': 'https://soundcloud.com/ethmusic', + 'tags': 'count:14', }, }, # geo-restricted @@ -440,7 +445,7 @@ class SoundcloudIE(SoundcloudBaseIE): 'uploader_id': '9615865', 'timestamp': 1337635207, 'upload_date': '20120521', - 'duration': 227.155, + 'duration': 227.103, 'license': 'all-rights-reserved', 'view_count': int, 'like_count': int, @@ -450,6 +455,7 @@ class SoundcloudIE(SoundcloudBaseIE): 'thumbnail': 'https://i1.sndcdn.com/artworks-v8bFHhXm7Au6-0-original.jpg', 'genres': ['Alternative'], 'artists': ['The Royal Concept'], + 'tags': [], }, }, # private link @@ -475,6 +481,7 @@ class SoundcloudIE(SoundcloudBaseIE): 'uploader_url': 'https://soundcloud.com/jaimemf', 'thumbnail': 
'https://a1.sndcdn.com/images/default_avatar_large.png', 'genres': ['youtubedl'], + 'tags': [], }, }, # private link (alt format) @@ -500,15 +507,16 @@ class SoundcloudIE(SoundcloudBaseIE): 'uploader_url': 'https://soundcloud.com/jaimemf', 'thumbnail': 'https://a1.sndcdn.com/images/default_avatar_large.png', 'genres': ['youtubedl'], + 'tags': [], }, }, # downloadable song { 'url': 'https://soundcloud.com/the80m/the-following', - 'md5': '9ffcddb08c87d74fb5808a3c183a1d04', + 'md5': 'ecb87d7705d5f53e6c02a63760573c75', # wav: '9ffcddb08c87d74fb5808a3c183a1d04' 'info_dict': { 'id': '343609555', - 'ext': 'wav', + 'ext': 'opus', # wav original available with auth 'title': 'The Following', 'track': 'The Following', 'description': '', @@ -526,15 +534,18 @@ class SoundcloudIE(SoundcloudBaseIE): 'view_count': int, 'genres': ['Dance & EDM'], 'artists': ['80M'], + 'tags': ['80M', 'EDM', 'Dance', 'Music'], }, + 'expected_warnings': ['Original download format is only available for registered users'], }, # private link, downloadable format + # tags with spaces (e.g. "Uplifting Trance", "Ori Uplift") { 'url': 'https://soundcloud.com/oriuplift/uponly-238-no-talking-wav/s-AyZUd', - 'md5': '64a60b16e617d41d0bef032b7f55441e', + 'md5': '2e1530d0e9986a833a67cb34fc90ece0', # wav: '64a60b16e617d41d0bef032b7f55441e' 'info_dict': { 'id': '340344461', - 'ext': 'wav', + 'ext': 'opus', # wav original available with auth 'title': 'Uplifting Only 238 [No Talking] (incl. Alex Feed Guestmix) (Aug 31, 2017) [wav]', 'track': 'Uplifting Only 238 [No Talking] (incl. Alex Feed Guestmix) (Aug 31, 2017) [wav]', 'description': 'md5:fa20ee0fca76a3d6df8c7e57f3715366', @@ -552,7 +563,9 @@ class SoundcloudIE(SoundcloudBaseIE): 'uploader_url': 'https://soundcloud.com/oriuplift', 'genres': ['Trance'], 'artists': ['Ori Uplift'], + 'tags': ['Orchestral', 'Emotional', 'Uplifting Trance', 'Trance', 'Ori Uplift', 'UpOnly'], }, + 'expected_warnings': ['Original download format is only available for registered users'], }, # no album art, use avatar pic for thumbnail { @@ -577,6 +590,7 @@ class SoundcloudIE(SoundcloudBaseIE): 'repost_count': int, 'uploader_url': 'https://soundcloud.com/garyvee', 'artists': ['MadReal'], + 'tags': [], }, 'params': { 'skip_download': True, @@ -604,8 +618,47 @@ class SoundcloudIE(SoundcloudBaseIE): 'repost_count': int, 'genres': ['Piano'], 'uploader_url': 'https://soundcloud.com/giovannisarani', + 'tags': 'count:10', }, }, + # .png "original" artwork, 160kbps m4a HLS format + { + 'url': 'https://soundcloud.com/skorxh/audio-dealer', + 'info_dict': { + 'id': '2011421339', + 'ext': 'm4a', + 'title': 'audio dealer', + 'description': '', + 'uploader': '$KORCH', + 'uploader_id': '150292288', + 'uploader_url': 'https://soundcloud.com/skorxh', + 'comment_count': int, + 'view_count': int, + 'like_count': int, + 'repost_count': int, + 'duration': 213.469, + 'tags': [], + 'artists': ['$KORXH'], + 'track': 'audio dealer', + 'timestamp': 1737143201, + 'upload_date': '20250117', + 'license': 'all-rights-reserved', + 'thumbnail': 'https://i1.sndcdn.com/artworks-a1wKGMYNreDLTMrT-fGjRiw-original.png', + 'thumbnails': [ + {'id': 'mini', 'url': 'https://i1.sndcdn.com/artworks-a1wKGMYNreDLTMrT-fGjRiw-mini.jpg'}, + {'id': 'tiny', 'url': 'https://i1.sndcdn.com/artworks-a1wKGMYNreDLTMrT-fGjRiw-tiny.jpg'}, + {'id': 'small', 'url': 'https://i1.sndcdn.com/artworks-a1wKGMYNreDLTMrT-fGjRiw-small.jpg'}, + {'id': 'badge', 'url': 'https://i1.sndcdn.com/artworks-a1wKGMYNreDLTMrT-fGjRiw-badge.jpg'}, + {'id': 't67x67', 'url': 
'https://i1.sndcdn.com/artworks-a1wKGMYNreDLTMrT-fGjRiw-t67x67.jpg'}, + {'id': 'large', 'url': 'https://i1.sndcdn.com/artworks-a1wKGMYNreDLTMrT-fGjRiw-large.jpg'}, + {'id': 't300x300', 'url': 'https://i1.sndcdn.com/artworks-a1wKGMYNreDLTMrT-fGjRiw-t300x300.jpg'}, + {'id': 'crop', 'url': 'https://i1.sndcdn.com/artworks-a1wKGMYNreDLTMrT-fGjRiw-crop.jpg'}, + {'id': 't500x500', 'url': 'https://i1.sndcdn.com/artworks-a1wKGMYNreDLTMrT-fGjRiw-t500x500.jpg'}, + {'id': 'original', 'url': 'https://i1.sndcdn.com/artworks-a1wKGMYNreDLTMrT-fGjRiw-original.png'}, + ], + }, + 'params': {'skip_download': 'm3u8', 'format': 'hls_aac_160k'}, + }, { # AAC HQ format available (account with active subscription needed) 'url': 'https://soundcloud.com/wandw/the-chainsmokers-ft-daya-dont-let-me-down-ww-remix-1', diff --git a/yt_dlp/extractor/sovietscloset.py b/yt_dlp/extractor/sovietscloset.py index 773ddd344..d35214aa8 100644 --- a/yt_dlp/extractor/sovietscloset.py +++ b/yt_dlp/extractor/sovietscloset.py @@ -1,5 +1,6 @@ +from .bunnycdn import BunnyCdnIE from .common import InfoExtractor -from ..utils import try_get, unified_timestamp +from ..utils import make_archive_id, try_get, unified_timestamp class SovietsClosetBaseIE(InfoExtractor): @@ -43,7 +44,7 @@ class SovietsClosetIE(SovietsClosetBaseIE): 'url': 'https://sovietscloset.com/video/1337', 'md5': 'bd012b04b261725510ca5383074cdd55', 'info_dict': { - 'id': '1337', + 'id': '2f0cfbf4-3588-43a9-a7d6-7c9ea3755e67', 'ext': 'mp4', 'title': 'The Witcher #13', 'thumbnail': r're:^https?://.*\.b-cdn\.net/2f0cfbf4-3588-43a9-a7d6-7c9ea3755e67/thumbnail\.jpg$', @@ -55,20 +56,23 @@ class SovietsClosetIE(SovietsClosetBaseIE): 'upload_date': '20170413', 'uploader_id': 'SovietWomble', 'uploader_url': 'https://www.twitch.tv/SovietWomble', - 'duration': 7007, + 'duration': 7008, 'was_live': True, 'availability': 'public', 'series': 'The Witcher', 'season': 'Misc', 'episode_number': 13, 'episode': 'Episode 13', + 'creators': ['SovietWomble'], + 'description': '', + '_old_archive_ids': ['sovietscloset 1337'], }, }, { 'url': 'https://sovietscloset.com/video/1105', 'md5': '89fa928f183893cb65a0b7be846d8a90', 'info_dict': { - 'id': '1105', + 'id': 'c0e5e76f-3a93-40b4-bf01-12343c2eec5d', 'ext': 'mp4', 'title': 'Arma 3 - Zeus Games #5', 'uploader': 'SovietWomble', @@ -80,39 +84,20 @@ class SovietsClosetIE(SovietsClosetBaseIE): 'upload_date': '20160420', 'uploader_id': 'SovietWomble', 'uploader_url': 'https://www.twitch.tv/SovietWomble', - 'duration': 8804, + 'duration': 8805, 'was_live': True, 'availability': 'public', 'series': 'Arma 3', 'season': 'Zeus Games', 'episode_number': 5, 'episode': 'Episode 5', + 'creators': ['SovietWomble'], + 'description': '', + '_old_archive_ids': ['sovietscloset 1105'], }, }, ] - def _extract_bunnycdn_iframe(self, video_id, bunnycdn_id): - iframe = self._download_webpage( - f'https://iframe.mediadelivery.net/embed/5105/{bunnycdn_id}', - video_id, note='Downloading BunnyCDN iframe', headers=self.MEDIADELIVERY_REFERER) - - m3u8_url = self._search_regex(r'(https?://.*?\.m3u8)', iframe, 'm3u8 url') - thumbnail_url = self._search_regex(r'(https?://.*?thumbnail\.jpg)', iframe, 'thumbnail url') - - m3u8_formats = self._extract_m3u8_formats(m3u8_url, video_id, headers=self.MEDIADELIVERY_REFERER) - - if not m3u8_formats: - duration = None - else: - duration = self._extract_m3u8_vod_duration( - m3u8_formats[0]['url'], video_id, headers=self.MEDIADELIVERY_REFERER) - - return { - 'formats': m3u8_formats, - 'thumbnail': thumbnail_url, - 'duration': duration, - } - 
def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) @@ -122,13 +107,13 @@ def _real_extract(self, url): stream = self.parse_nuxt_jsonp(f'{static_assets_base}/video/{video_id}/payload.js', video_id, 'video')['stream'] - return { + return self.url_result( + f'https://iframe.mediadelivery.net/embed/5105/{stream["bunnyId"]}', ie=BunnyCdnIE, url_transparent=True, **self.video_meta( video_id=video_id, game_name=stream['game']['name'], category_name=try_get(stream, lambda x: x['subcategory']['name'], str), episode_number=stream.get('number'), stream_date=stream.get('date')), - **self._extract_bunnycdn_iframe(video_id, stream['bunnyId']), - } + _old_archive_ids=[make_archive_id(self, video_id)]) class SovietsClosetPlaylistIE(SovietsClosetBaseIE): diff --git a/yt_dlp/extractor/tiktok.py b/yt_dlp/extractor/tiktok.py index 9e53b3407..19336252b 100644 --- a/yt_dlp/extractor/tiktok.py +++ b/yt_dlp/extractor/tiktok.py @@ -249,6 +249,12 @@ def _extract_web_data_and_status(self, url, video_id, fatal=True): elif fatal: raise ExtractorError('Unable to extract webpage video data') + if not traverse_obj(video_data, ('video', {dict})) and traverse_obj(video_data, ('isContentClassified', {bool})): + message = 'This post may not be comfortable for some audiences. Log in for access' + if fatal: + self.raise_login_required(message) + self.report_warning(f'{message}. {self._login_hint()}', video_id=video_id) + return video_data, status def _get_subtitles(self, aweme_detail, aweme_id, user_name): @@ -895,8 +901,12 @@ def _real_extract(self, url): if video_data and status == 0: return self._parse_aweme_video_web(video_data, url, video_id) - elif status == 10216: - raise ExtractorError('This video is private', expected=True) + elif status in (10216, 10222): + # 10216: private post; 10222: private account + self.raise_login_required( + 'You do not have permission to view this post. 
Log into an account that has access') + elif status == 10204: + raise ExtractorError('Your IP address is blocked from accessing this post', expected=True) raise ExtractorError(f'Video not available, status code {status}', video_id=video_id) diff --git a/yt_dlp/extractor/wsj.py b/yt_dlp/extractor/wsj.py index b6b656f7d..7cf46141c 100644 --- a/yt_dlp/extractor/wsj.py +++ b/yt_dlp/extractor/wsj.py @@ -100,8 +100,8 @@ def _real_extract(self, url): class WSJArticleIE(InfoExtractor): - _VALID_URL = r'(?i)https?://(?:www\.)?wsj\.com/articles/(?P<id>[^/?#&]+)' - _TEST = { + _VALID_URL = r'(?i)https?://(?:www\.)?wsj\.com/(?:articles|opinion)/(?P<id>[^/?#&]+)' + _TESTS = [{ 'url': 'https://www.wsj.com/articles/dont-like-china-no-pandas-for-you-1490366939?', 'info_dict': { 'id': '4B13FA62-1D8C-45DB-8EA1-4105CB20B362', @@ -110,11 +110,20 @@ class WSJArticleIE(InfoExtractor): 'uploader_id': 'ralcaraz', 'title': 'Bao Bao the Panda Leaves for China', }, - } + }, { + 'url': 'https://www.wsj.com/opinion/hamas-hostages-caskets-bibas-family-israel-gaza-29da083b', + 'info_dict': { + 'id': 'CE68D629-8DB8-4CD3-B30A-92112C102054', + 'ext': 'mp4', + 'upload_date': '20241007', + 'uploader_id': 'Tinnes, David', + 'title': 'WSJ Opinion: "Get the Jew": The Crown Heights Riot Revisited', + }, + }] def _real_extract(self, url): article_id = self._match_id(url) - webpage = self._download_webpage(url, article_id) + webpage = self._download_webpage(url, article_id, impersonate=True) video_id = self._search_regex( r'(?:id=["\']video|video-|iframe\.html\?guid=|data-src=["\'])([a-fA-F0-9-]{36})', webpage, 'video id') diff --git a/yt_dlp/extractor/youtube.py b/yt_dlp/extractor/youtube.py index 738ade1b0..4dc80fe28 100644 --- a/yt_dlp/extractor/youtube.py +++ b/yt_dlp/extractor/youtube.py @@ -857,6 +857,18 @@ def generate_api_headers( } return filter_dict(headers) + def _download_webpage_with_retries(self, *args, retry_fatal=False, retry_on_status=None, **kwargs): + for retry in self.RetryManager(fatal=retry_fatal): + try: + return self._download_webpage(*args, **kwargs) + except ExtractorError as e: + if isinstance(e.cause, network_exceptions): + if not isinstance(e.cause, HTTPError) or e.cause.status not in (retry_on_status or (403, 429)): + retry.error = e + continue + self._error_or_warning(e, fatal=retry_fatal) + break + def _download_ytcfg(self, client, video_id): url = { 'web': 'https://www.youtube.com', @@ -866,8 +878,8 @@ def _download_ytcfg(self, client, video_id): }.get(client) if not url: return {} - webpage = self._download_webpage( - url, video_id, fatal=False, note=f'Downloading {client.replace("_", " ").strip()} client config', + webpage = self._download_webpage_with_retries( + url, video_id, note=f'Downloading {client.replace("_", " ").strip()} client config', headers=traverse_obj(self._get_default_ytcfg(client), { 'User-Agent': ('INNERTUBE_CONTEXT', 'client', 'userAgent', {str}), })) @@ -3135,15 +3147,22 @@ def _extract_player_url(self, *ytcfgs, webpage=None): get_all=False, expected_type=str) if not player_url: return + # TODO: Add proper support for the 'tce' variant players + # See https://github.com/yt-dlp/yt-dlp/issues/12398 + if '/player_ias_tce.vflset/' in player_url: + self.write_debug(f'Modifying tce player URL: {player_url}') + player_url = player_url.replace('/player_ias_tce.vflset/', '/player_ias.vflset/') return urljoin('https://www.youtube.com', player_url) def _download_player_url(self, video_id, fatal=False): - res = self._download_webpage( + iframe_webpage = 
self._download_webpage_with_retries( 'https://www.youtube.com/iframe_api', - note='Downloading iframe API JS', video_id=video_id, fatal=fatal) - if res: + note='Downloading iframe API JS', + video_id=video_id, retry_fatal=fatal) + + if iframe_webpage: player_version = self._search_regex( - r'player\\?/([0-9a-fA-F]{8})\\?/', res, 'player version', fatal=fatal) + r'player\\?/([0-9a-fA-F]{8})\\?/', iframe_webpage, 'player version', fatal=fatal) if player_version: return f'https://www.youtube.com/s/player/{player_version}/player_ias.vflset/en_US/base.js' @@ -3369,7 +3388,7 @@ def _fixup_n_function_code(self, argnames, code): def _extract_n_function_code(self, video_id, player_url): player_id = self._extract_player_info(player_url) - func_code = self.cache.load('youtube-nsig', player_id, min_ver='2024.07.09') + func_code = self.cache.load('youtube-nsig', player_id, min_ver='2025.02.19') jscode = func_code or self._load_player(video_id, player_url) jsi = JSInterpreter(jscode) @@ -4556,8 +4575,7 @@ def _download_player_responses(self, url, smuggled_data, video_id, webpage_url): pp = self._configuration_arg('player_params', [None], casesense=True)[0] if pp: query['pp'] = pp - webpage = self._download_webpage( - webpage_url, video_id, fatal=False, query=query) + webpage = self._download_webpage_with_retries(webpage_url, video_id, query=query) master_ytcfg = self.extract_ytcfg(video_id, webpage) or self._get_default_ytcfg() diff --git a/yt_dlp/extractor/zdf.py b/yt_dlp/extractor/zdf.py index b64a88f6c..4bef1017c 100644 --- a/yt_dlp/extractor/zdf.py +++ b/yt_dlp/extractor/zdf.py @@ -137,6 +137,116 @@ def _extract_player(self, webpage, video_id, fatal=True): group='json'), video_id) + def _extract_entry(self, url, player, content, video_id): + title = content.get('title') or content['teaserHeadline'] + + t = content['mainVideoContent']['http://zdf.de/rels/target'] + ptmd_path = traverse_obj(t, ( + (('streams', 'default'), None), + ('http://zdf.de/rels/streams/ptmd', 'http://zdf.de/rels/streams/ptmd-template'), + ), get_all=False) + if not ptmd_path: + raise ExtractorError('Could not extract ptmd_path') + + info = self._extract_ptmd( + urljoin(url, ptmd_path.replace('{playerId}', 'android_native_5')), video_id, player['apiToken'], url) + + thumbnails = [] + layouts = try_get( + content, lambda x: x['teaserImageRef']['layouts'], dict) + if layouts: + for layout_key, layout_url in layouts.items(): + layout_url = url_or_none(layout_url) + if not layout_url: + continue + thumbnail = { + 'url': layout_url, + 'format_id': layout_key, + } + mobj = re.search(r'(?P<width>\d+)x(?P<height>\d+)', layout_key) + if mobj: + thumbnail.update({ + 'width': int(mobj.group('width')), + 'height': int(mobj.group('height')), + }) + thumbnails.append(thumbnail) + + chapter_marks = t.get('streamAnchorTag') or [] + chapter_marks.append({'anchorOffset': int_or_none(t.get('duration'))}) + chapters = [{ + 'start_time': chap.get('anchorOffset'), + 'end_time': next_chap.get('anchorOffset'), + 'title': chap.get('anchorLabel'), + } for chap, next_chap in zip(chapter_marks, chapter_marks[1:])] + + return merge_dicts(info, { + 'title': title, + 'description': content.get('leadParagraph') or content.get('teasertext'), + 'duration': int_or_none(t.get('duration')), + 'timestamp': unified_timestamp(content.get('editorialDate')), + 'thumbnails': thumbnails, + 'chapters': chapters or None, + 'episode': title, + **traverse_obj(content, ('programmeItem', 0, 'http://zdf.de/rels/target', { + 'series_id': ('http://zdf.de/rels/cmdm/series', 
'seriesUuid', {str}), + 'series': ('http://zdf.de/rels/cmdm/series', 'seriesTitle', {str}), + 'season': ('http://zdf.de/rels/cmdm/season', 'seasonTitle', {str}), + 'season_number': ('http://zdf.de/rels/cmdm/season', 'seasonNumber', {int_or_none}), + 'season_id': ('http://zdf.de/rels/cmdm/season', 'seasonUuid', {str}), + 'episode_number': ('episodeNumber', {int_or_none}), + 'episode_id': ('contentId', {str}), + })), + }) + + def _extract_regular(self, url, player, video_id, query=None): + player_url = player['content'] + + content = self._call_api( + update_url_query(player_url, query), + video_id, 'content', player['apiToken'], url) + + return self._extract_entry(player_url, player, content, video_id) + + def _extract_mobile(self, video_id): + video = self._download_v2_doc(video_id) + + formats = [] + formitaeten = try_get(video, lambda x: x['document']['formitaeten'], list) + document = formitaeten and video['document'] + if formitaeten: + title = document['titel'] + content_id = document['basename'] + + format_urls = set() + for f in formitaeten or []: + self._extract_format(content_id, formats, format_urls, f) + + thumbnails = [] + teaser_bild = document.get('teaserBild') + if isinstance(teaser_bild, dict): + for thumbnail_key, thumbnail in teaser_bild.items(): + thumbnail_url = try_get( + thumbnail, lambda x: x['url'], str) + if thumbnail_url: + thumbnails.append({ + 'url': thumbnail_url, + 'id': thumbnail_key, + 'width': int_or_none(thumbnail.get('width')), + 'height': int_or_none(thumbnail.get('height')), + }) + + return { + 'id': content_id, + 'title': title, + 'description': document.get('beschreibung'), + 'duration': int_or_none(document.get('length')), + 'timestamp': unified_timestamp(document.get('date')) or unified_timestamp( + try_get(video, lambda x: x['meta']['editorialDate'], str)), + 'thumbnails': thumbnails, + 'subtitles': self._extract_subtitles(document), + 'formats': formats, + } + class ZDFIE(ZDFBaseIE): _VALID_URL = r'https?://www\.zdf\.de/(?:[^/]+/)*(?P<id>[^/?#&]+)\.html' @@ -306,121 +416,6 @@ class ZDFIE(ZDFBaseIE): }, }] - def _extract_entry(self, url, player, content, video_id): - title = content.get('title') or content['teaserHeadline'] - - t = content['mainVideoContent']['http://zdf.de/rels/target'] - ptmd_path = traverse_obj(t, ( - (('streams', 'default'), None), - ('http://zdf.de/rels/streams/ptmd', 'http://zdf.de/rels/streams/ptmd-template'), - ), get_all=False) - if not ptmd_path: - raise ExtractorError('Could not extract ptmd_path') - - info = self._extract_ptmd( - urljoin(url, ptmd_path.replace('{playerId}', 'android_native_5')), video_id, player['apiToken'], url) - - thumbnails = [] - layouts = try_get( - content, lambda x: x['teaserImageRef']['layouts'], dict) - if layouts: - for layout_key, layout_url in layouts.items(): - layout_url = url_or_none(layout_url) - if not layout_url: - continue - thumbnail = { - 'url': layout_url, - 'format_id': layout_key, - } - mobj = re.search(r'(?P<width>\d+)x(?P<height>\d+)', layout_key) - if mobj: - thumbnail.update({ - 'width': int(mobj.group('width')), - 'height': int(mobj.group('height')), - }) - thumbnails.append(thumbnail) - - chapter_marks = t.get('streamAnchorTag') or [] - chapter_marks.append({'anchorOffset': int_or_none(t.get('duration'))}) - chapters = [{ - 'start_time': chap.get('anchorOffset'), - 'end_time': next_chap.get('anchorOffset'), - 'title': chap.get('anchorLabel'), - } for chap, next_chap in zip(chapter_marks, chapter_marks[1:])] - - return merge_dicts(info, { - 'title': title, - 'description': 
content.get('leadParagraph') or content.get('teasertext'), - 'duration': int_or_none(t.get('duration')), - 'timestamp': unified_timestamp(content.get('editorialDate')), - 'thumbnails': thumbnails, - 'chapters': chapters or None, - 'episode': title, - **traverse_obj(content, ('programmeItem', 0, 'http://zdf.de/rels/target', { - 'series_id': ('http://zdf.de/rels/cmdm/series', 'seriesUuid', {str}), - 'series': ('http://zdf.de/rels/cmdm/series', 'seriesTitle', {str}), - 'season': ('http://zdf.de/rels/cmdm/season', 'seasonTitle', {str}), - 'season_number': ('http://zdf.de/rels/cmdm/season', 'seasonNumber', {int_or_none}), - 'season_id': ('http://zdf.de/rels/cmdm/season', 'seasonUuid', {str}), - 'episode_number': ('episodeNumber', {int_or_none}), - 'episode_id': ('contentId', {str}), - })), - }) - - def _extract_regular(self, url, player, video_id): - player_url = player['content'] - - try: - content = self._call_api( - update_url_query(player_url, {'profile': 'player-3'}), - video_id, 'content', player['apiToken'], url) - except ExtractorError as e: - self.report_warning(f'{video_id}: {e.orig_msg}; retrying with v2 profile') - content = self._call_api( - player_url, video_id, 'content', player['apiToken'], url) - - return self._extract_entry(player_url, player, content, video_id) - - def _extract_mobile(self, video_id): - video = self._download_v2_doc(video_id) - - formats = [] - formitaeten = try_get(video, lambda x: x['document']['formitaeten'], list) - document = formitaeten and video['document'] - if formitaeten: - title = document['titel'] - content_id = document['basename'] - - format_urls = set() - for f in formitaeten or []: - self._extract_format(content_id, formats, format_urls, f) - - thumbnails = [] - teaser_bild = document.get('teaserBild') - if isinstance(teaser_bild, dict): - for thumbnail_key, thumbnail in teaser_bild.items(): - thumbnail_url = try_get( - thumbnail, lambda x: x['url'], str) - if thumbnail_url: - thumbnails.append({ - 'url': thumbnail_url, - 'id': thumbnail_key, - 'width': int_or_none(thumbnail.get('width')), - 'height': int_or_none(thumbnail.get('height')), - }) - - return { - 'id': content_id, - 'title': title, - 'description': document.get('beschreibung'), - 'duration': int_or_none(document.get('length')), - 'timestamp': unified_timestamp(document.get('date')) or unified_timestamp( - try_get(video, lambda x: x['meta']['editorialDate'], str)), - 'thumbnails': thumbnails, - 'subtitles': self._extract_subtitles(document), - 'formats': formats, - } - def _real_extract(self, url): video_id = self._match_id(url) @@ -428,7 +423,7 @@ def _real_extract(self, url): if webpage: player = self._extract_player(webpage, url, fatal=False) if player: - return self._extract_regular(url, player, video_id) + return self._extract_regular(url, player, video_id, query={'profile': 'player-3'}) return self._extract_mobile(video_id) @@ -474,7 +469,8 @@ def _extract_entry(self, entry): 'title': ('titel', {str}), 'description': ('beschreibung', {str}), 'duration': ('length', {float_or_none}), - # TODO: seasonNumber and episodeNumber can be extracted but need to also be in ZDFIE + 'season_number': ('seasonNumber', {int_or_none}), + 'episode_number': ('episodeNumber', {int_or_none}), })) def _entries(self, data, document_id): diff --git a/yt_dlp/globals.py b/yt_dlp/globals.py new file mode 100644 index 000000000..e1c189d5a --- /dev/null +++ b/yt_dlp/globals.py @@ -0,0 +1,30 @@ +from collections import defaultdict + +# Please Note: Due to necessary changes and the complex nature involved in 
the plugin/globals system, +# no backwards compatibility is guaranteed for the plugin system API. +# However, we will still try our best. + + +class Indirect: + def __init__(self, initial, /): + self.value = initial + + def __repr__(self, /): + return f'{type(self).__name__}({self.value!r})' + + +postprocessors = Indirect({}) +extractors = Indirect({}) + +# Plugins +all_plugins_loaded = Indirect(False) +plugin_specs = Indirect({}) +plugin_dirs = Indirect(['default']) + +plugin_ies = Indirect({}) +plugin_pps = Indirect({}) +plugin_ies_overrides = Indirect(defaultdict(list)) + +# Misc +IN_CLI = Indirect(False) +LAZY_EXTRACTORS = Indirect(False) # `False`=force, `None`=disabled, `True`=enabled diff --git a/yt_dlp/options.py b/yt_dlp/options.py index 06b65e0ea..91c2635a7 100644 --- a/yt_dlp/options.py +++ b/yt_dlp/options.py @@ -398,7 +398,7 @@ def _alias_callback(option, opt_str, value, parser, opts, nargs): '(Alias: --no-config)')) general.add_option( '--no-config-locations', - action='store_const', dest='config_locations', const=[], + action='store_const', dest='config_locations', const=None, help=( 'Do not load any custom configuration files (default). When given inside a ' 'configuration file, ignore all previous --config-locations defined in the current file')) @@ -410,12 +410,21 @@ def _alias_callback(option, opt_str, value, parser, opts, nargs): '("-" for stdin). Can be used multiple times and inside other configuration files')) general.add_option( '--plugin-dirs', - dest='plugin_dirs', metavar='PATH', action='append', + metavar='PATH', + dest='plugin_dirs', + action='callback', + callback=_list_from_options_callback, + type='str', + callback_kwargs={'delim': None}, + default=['default'], help=( 'Path to an additional directory to search for plugins. ' 'This option can be used multiple times to add multiple directories. ' - 'Note that this currently only works for extractor plugins; ' - 'postprocessor plugins can only be loaded from the default plugin directories')) + 'Use "default" to search the default plugin directories (default)')) + general.add_option( + '--no-plugin-dirs', + dest='plugin_dirs', action='store_const', const=[], + help='Clear plugin directories to search, including defaults and those provided by previous --plugin-dirs') general.add_option( '--flat-playlist', action='store_const', dest='extract_flat', const='in_playlist', default=False, diff --git a/yt_dlp/plugins.py b/yt_dlp/plugins.py index 94335a9a3..941709b21 100644 --- a/yt_dlp/plugins.py +++ b/yt_dlp/plugins.py @@ -1,4 +1,5 @@ import contextlib +import dataclasses import functools import importlib import importlib.abc @@ -14,17 +15,48 @@ from pathlib import Path from zipfile import ZipFile +from .globals import ( + Indirect, + plugin_dirs, + all_plugins_loaded, + plugin_specs, +) + from .utils import ( - Config, get_executable_path, get_system_config_dirs, get_user_config_dirs, + merge_dicts, orderedSet, write_string, ) PACKAGE_NAME = 'yt_dlp_plugins' COMPAT_PACKAGE_NAME = 'ytdlp_plugins' +_BASE_PACKAGE_PATH = Path(__file__).parent + + +# Please Note: Due to necessary changes and the complex nature involved, +# no backwards compatibility is guaranteed for the plugin system API. +# However, we will still try our best. 
+ +__all__ = [ + 'COMPAT_PACKAGE_NAME', + 'PACKAGE_NAME', + 'PluginSpec', + 'directories', + 'load_all_plugins', + 'load_plugins', + 'register_plugin_spec', +] + + +@dataclasses.dataclass +class PluginSpec: + module_name: str + suffix: str + destination: Indirect + plugin_destination: Indirect class PluginLoader(importlib.abc.Loader): @@ -44,7 +76,42 @@ def dirs_in_zip(archive): pass except Exception as e: write_string(f'WARNING: Could not read zip file {archive}: {e}\n') - return set() + return () + + +def default_plugin_paths(): + def _get_package_paths(*root_paths, containing_folder): + for config_dir in orderedSet(map(Path, root_paths), lazy=True): + # We need to filter the base path added when running __main__.py directly + if config_dir == _BASE_PACKAGE_PATH: + continue + with contextlib.suppress(OSError): + yield from (config_dir / containing_folder).iterdir() + + # Load from yt-dlp config folders + yield from _get_package_paths( + *get_user_config_dirs('yt-dlp'), + *get_system_config_dirs('yt-dlp'), + containing_folder='plugins', + ) + + # Load from yt-dlp-plugins folders + yield from _get_package_paths( + get_executable_path(), + *get_user_config_dirs(''), + *get_system_config_dirs(''), + containing_folder='yt-dlp-plugins', + ) + + # Load from PYTHONPATH directories + yield from (path for path in map(Path, sys.path) if path != _BASE_PACKAGE_PATH) + + +def candidate_plugin_paths(candidate): + candidate_path = Path(candidate) + if not candidate_path.is_dir(): + raise ValueError(f'Invalid plugin directory: {candidate_path}') + yield from candidate_path.iterdir() class PluginFinder(importlib.abc.MetaPathFinder): @@ -56,40 +123,16 @@ class PluginFinder(importlib.abc.MetaPathFinder): def __init__(self, *packages): self._zip_content_cache = {} - self.packages = set(itertools.chain.from_iterable( - itertools.accumulate(name.split('.'), lambda a, b: '.'.join((a, b))) - for name in packages)) + self.packages = set( + itertools.chain.from_iterable( + itertools.accumulate(name.split('.'), lambda a, b: '.'.join((a, b))) + for name in packages)) def search_locations(self, fullname): - candidate_locations = [] - - def _get_package_paths(*root_paths, containing_folder='plugins'): - for config_dir in orderedSet(map(Path, root_paths), lazy=True): - with contextlib.suppress(OSError): - yield from (config_dir / containing_folder).iterdir() - - # Load from yt-dlp config folders - candidate_locations.extend(_get_package_paths( - *get_user_config_dirs('yt-dlp'), - *get_system_config_dirs('yt-dlp'), - containing_folder='plugins')) - - # Load from yt-dlp-plugins folders - candidate_locations.extend(_get_package_paths( - get_executable_path(), - *get_user_config_dirs(''), - *get_system_config_dirs(''), - containing_folder='yt-dlp-plugins')) - - candidate_locations.extend(map(Path, sys.path)) # PYTHONPATH - with contextlib.suppress(ValueError): # Added when running __main__.py directly - candidate_locations.remove(Path(__file__).parent) - - # TODO(coletdjnz): remove when plugin globals system is implemented - if Config._plugin_dirs: - candidate_locations.extend(_get_package_paths( - *Config._plugin_dirs, - containing_folder='')) + candidate_locations = itertools.chain.from_iterable( + default_plugin_paths() if candidate == 'default' else candidate_plugin_paths(candidate) + for candidate in plugin_dirs.value + ) parts = Path(*fullname.split('.')) for path in orderedSet(candidate_locations, lazy=True): @@ -109,7 +152,8 @@ def find_spec(self, fullname, path=None, target=None): search_locations = list(map(str, 
self.search_locations(fullname))) if not search_locations: - return None + # Prevent using built-in meta finders for searching plugins. + raise ModuleNotFoundError(fullname) spec = importlib.machinery.ModuleSpec(fullname, PluginLoader(), is_package=True) spec.submodule_search_locations = search_locations @@ -123,8 +167,10 @@ def invalidate_caches(self): def directories(): - spec = importlib.util.find_spec(PACKAGE_NAME) - return spec.submodule_search_locations if spec else [] + with contextlib.suppress(ModuleNotFoundError): + if spec := importlib.util.find_spec(PACKAGE_NAME): + return list(spec.submodule_search_locations) + return [] def iter_modules(subpackage): @@ -134,19 +180,23 @@ def iter_modules(subpackage): yield from pkgutil.iter_modules(path=pkg.__path__, prefix=f'{fullname}.') -def load_module(module, module_name, suffix): +def get_regular_classes(module, module_name, suffix): + # Find standard public plugin classes (not overrides) return inspect.getmembers(module, lambda obj: ( inspect.isclass(obj) and obj.__name__.endswith(suffix) and obj.__module__.startswith(module_name) and not obj.__name__.startswith('_') - and obj.__name__ in getattr(module, '__all__', [obj.__name__]))) + and obj.__name__ in getattr(module, '__all__', [obj.__name__]) + and getattr(obj, 'PLUGIN_NAME', None) is None + )) -def load_plugins(name, suffix): - classes = {} - if os.environ.get('YTDLP_NO_PLUGINS'): - return classes +def load_plugins(plugin_spec: PluginSpec): + name, suffix = plugin_spec.module_name, plugin_spec.suffix + regular_classes = {} + if os.environ.get('YTDLP_NO_PLUGINS') or not plugin_dirs.value: + return regular_classes for finder, module_name, _ in iter_modules(name): if any(x.startswith('_') for x in module_name.split('.')): @@ -163,24 +213,42 @@ def load_plugins(name, suffix): sys.modules[module_name] = module spec.loader.exec_module(module) except Exception: - write_string(f'Error while importing module {module_name!r}\n{traceback.format_exc(limit=-1)}') + write_string( + f'Error while importing module {module_name!r}\n{traceback.format_exc(limit=-1)}', + ) continue - classes.update(load_module(module, module_name, suffix)) + regular_classes.update(get_regular_classes(module, module_name, suffix)) # Compat: old plugin system using __init__.py # Note: plugins imported this way do not show up in directories() # nor are considered part of the yt_dlp_plugins namespace package - with contextlib.suppress(FileNotFoundError): - spec = importlib.util.spec_from_file_location( - name, Path(get_executable_path(), COMPAT_PACKAGE_NAME, name, '__init__.py')) - plugins = importlib.util.module_from_spec(spec) - sys.modules[spec.name] = plugins - spec.loader.exec_module(plugins) - classes.update(load_module(plugins, spec.name, suffix)) + if 'default' in plugin_dirs.value: + with contextlib.suppress(FileNotFoundError): + spec = importlib.util.spec_from_file_location( + name, + Path(get_executable_path(), COMPAT_PACKAGE_NAME, name, '__init__.py'), + ) + plugins = importlib.util.module_from_spec(spec) + sys.modules[spec.name] = plugins + spec.loader.exec_module(plugins) + regular_classes.update(get_regular_classes(plugins, spec.name, suffix)) - return classes + # Add the classes into the global plugin lookup for that type + plugin_spec.plugin_destination.value = regular_classes + # We want to prepend to the main lookup for that type + plugin_spec.destination.value = merge_dicts(regular_classes, plugin_spec.destination.value) + + return regular_classes -sys.meta_path.insert(0, 
diff --git a/yt_dlp/postprocessor/__init__.py b/yt_dlp/postprocessor/__init__.py
index 7b1620544..20e8b14b2 100644
--- a/yt_dlp/postprocessor/__init__.py
+++ b/yt_dlp/postprocessor/__init__.py
@@ -33,15 +33,38 @@
 from .sponskrub import SponSkrubPP
 from .sponsorblock import SponsorBlockPP
 from .xattrpp import XAttrMetadataPP
-from ..plugins import load_plugins
+from ..globals import plugin_pps, postprocessors
+from ..plugins import PACKAGE_NAME, register_plugin_spec, PluginSpec
+from ..utils import deprecation_warning
 
-_PLUGIN_CLASSES = load_plugins('postprocessor', 'PP')
+
+def __getattr__(name):
+    lookup = plugin_pps.value
+    if name in lookup:
+        deprecation_warning(
+            f'Importing a plugin Post-Processor from {__name__} is deprecated. '
+            f'Please import {PACKAGE_NAME}.postprocessor.{name} instead.')
+        return lookup[name]
+
+    raise AttributeError(f'module {__name__!r} has no attribute {name!r}')
 
 
 def get_postprocessor(key):
-    return globals()[key + 'PP']
+    return postprocessors.value[key + 'PP']
 
 
-globals().update(_PLUGIN_CLASSES)
-__all__ = [name for name in globals() if name.endswith('PP')]
-__all__.extend(('FFmpegPostProcessor', 'PostProcessor'))
+register_plugin_spec(PluginSpec(
+    module_name='postprocessor',
+    suffix='PP',
+    destination=postprocessors,
+    plugin_destination=plugin_pps,
+))
+
+_default_pps = {
+    name: value
+    for name, value in globals().items()
+    if name.endswith('PP') or name in ('FFmpegPostProcessor', 'PostProcessor')
+}
+postprocessors.value.update(_default_pps)
+
+__all__ = list(_default_pps.values())
diff --git a/yt_dlp/postprocessor/common.py b/yt_dlp/postprocessor/common.py
index be2bb33f6..f0a71c1ff 100644
--- a/yt_dlp/postprocessor/common.py
+++ b/yt_dlp/postprocessor/common.py
@@ -10,6 +10,7 @@
     _configuration_args,
     deprecation_warning,
 )
+from ..utils._utils import _ProgressState
 
 
 class PostProcessorMetaClass(type):
@@ -189,7 +190,7 @@ def report_progress(self, s):
         self._downloader.to_console_title(self._downloader.evaluate_outtmpl(
             progress_template.get('postprocess-title') or 'yt-dlp %(progress._default_template)s',
-            progress_dict))
+            progress_dict), _ProgressState.from_dict(s), s.get('_percent'))
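# The module-level `__getattr__` added to postprocessor/__init__.py above relies
# on PEP 562, under which a module's `__getattr__` is called only after a normal
# attribute lookup fails. A self-contained sketch of the same deprecation-shim
# pattern, meant to live in a module of its own (module and attribute names are
# illustrative, not yt-dlp's):

import warnings

_RELOCATED = {'OldExamplePP': object}  # attribute name -> relocated object

def __getattr__(name):
    # Only reached when the module does not define `name` itself
    if name in _RELOCATED:
        warnings.warn(
            f'{__name__}.{name} is deprecated; import it from its new location instead',
            DeprecationWarning, stacklevel=2)
        return _RELOCATED[name]
    raise AttributeError(f'module {__name__!r} has no attribute {name!r}')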
diff --git a/yt_dlp/utils/_utils.py b/yt_dlp/utils/_utils.py
index a71a381e5..4093c238c 100644
--- a/yt_dlp/utils/_utils.py
+++ b/yt_dlp/utils/_utils.py
@@ -8,6 +8,7 @@
 import datetime as dt
 import email.header
 import email.utils
+import enum
 import errno
 import functools
 import hashlib
@@ -51,6 +52,7 @@
     compat_HTMLParseError,
 )
 from ..dependencies import xattr
+from ..globals import IN_CLI
 
 __name__ = __name__.rsplit('.', 1)[0]  # noqa: A001: Pretend to be the parent module
 
@@ -1486,8 +1488,7 @@ def write_string(s, out=None, encoding=None):
 
 # TODO: Use global logger
 def deprecation_warning(msg, *, printer=None, stacklevel=0, **kwargs):
-    from .. import _IN_CLI
-    if _IN_CLI:
+    if IN_CLI.value:
         if msg in deprecation_warning._cache:
             return
         deprecation_warning._cache.add(msg)
@@ -4890,10 +4891,6 @@ class Config:
     filename = None
     __initialized = False
 
-    # Internal only, do not use! Hack to enable --plugin-dirs
-    # TODO(coletdjnz): remove when plugin globals system is implemented
-    _plugin_dirs = None
-
     def __init__(self, parser, label=None):
         self.parser, self.label = parser, label
         self._loaded_paths, self.configs = set(), []
@@ -5631,6 +5628,24 @@ def filesize_from_tbr(tbr, duration):
     return int(duration * tbr * (1000 / 8))
 
 
+def _request_dump_filename(url, video_id, data=None, trim_length=None):
+    if data is not None:
+        data = hashlib.md5(data).hexdigest()
+    basen = join_nonempty(video_id, data, url, delim='_')
+    trim_length = trim_length or 240
+    if len(basen) > trim_length:
+        h = '___' + hashlib.md5(basen.encode()).hexdigest()
+        basen = basen[:trim_length - len(h)] + h
+    filename = sanitize_filename(f'{basen}.dump', restricted=True)
+    # Working around MAX_PATH limitation on Windows (see
+    # http://msdn.microsoft.com/en-us/library/windows/desktop/aa365247(v=vs.85).aspx)
+    if os.name == 'nt':
+        absfilepath = os.path.abspath(filename)
+        if len(absfilepath) > 259:
+            filename = fR'\\?\{absfilepath}'
+    return filename
+
+
 # XXX: Temporary
 class _YDLLogger:
     def __init__(self, ydl=None):
@@ -5659,3 +5674,32 @@ def stdout(self, message):
     def stderr(self, message):
         if self._ydl:
             self._ydl.to_stderr(message)
+
+
+class _ProgressState(enum.Enum):
+    """
+    Represents a state for a progress bar.
+
+    See: https://conemu.github.io/en/AnsiEscapeCodes.html#ConEmu_specific_OSC
+    """
+
+    HIDDEN = 0
+    INDETERMINATE = 3
+    VISIBLE = 1
+    WARNING = 4
+    ERROR = 2
+
+    @classmethod
+    def from_dict(cls, s, /):
+        if s['status'] == 'finished':
+            return cls.INDETERMINATE
+
+        # Not currently used
+        if s['status'] == 'error':
+            return cls.ERROR
+
+        return cls.INDETERMINATE if s.get('_percent') is None else cls.VISIBLE
+
+    def get_ansi_escape(self, /, percent=None):
+        percent = 0 if percent is None else int(percent)
+        return f'\033]9;4;{self.value};{percent}\007'
diff --git a/yt_dlp/version.py b/yt_dlp/version.py
index 4f7a2ec90..7346ca49c 100644
--- a/yt_dlp/version.py
+++ b/yt_dlp/version.py
@@ -1,8 +1,8 @@
 # Autogenerated by devscripts/update-version.py
 
-__version__ = '2025.01.26'
+__version__ = '2025.02.19'
 
-RELEASE_GIT_HEAD = '3b4531934465580be22937fecbb6e1a3a9e2334f'
+RELEASE_GIT_HEAD = '4985a4041770eaa0016271809a1fd950dc809a55'
 
 VARIANT = None
 
@@ -12,4 +12,4 @@
 ORIGIN = 'yt-dlp/yt-dlp'
 
-_pkg_version = '2025.01.26'
+_pkg_version = '2025.02.19'
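# For reference, `_ProgressState.get_ansi_escape` above emits the ConEmu-style
# "OSC 9;4" taskbar-progress sequence described at the URL in its docstring:
# ESC ] 9 ; 4 ; <state> ; <percent> BEL. A quick standalone check of the format
# it produces (re-implemented here, not imported from yt-dlp):

def progress_escape(state: int, percent: int = 0) -> str:
    # state: 0=hidden, 1=normal/visible, 2=error, 3=indeterminate, 4=warning
    return f'\033]9;4;{state};{percent}\007'

assert progress_escape(1, 50) == '\x1b]9;4;1;50\x07'  # visible at 50%
assert progress_escape(0) == '\x1b]9;4;0;0\x07'       # hidden / reset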