Mirror of https://github.com/yt-dlp/yt-dlp.git

Merge remote-tracking branch 'upstream/master' into misc/globals-and-plugins-revive

coletdjnz committed 5a2120c0e9 on 2025-02-23 08:59:20 +13:00
GPG key ID: 91984263BB39894A (no known key found for this signature in database)
6 changed files with 343 additions and 105 deletions

test/helper.py

@@ -101,87 +101,109 @@ def getwebpagetestcases():
 md5 = lambda s: hashlib.md5(s.encode()).hexdigest()


-def expect_value(self, got, expected, field):
-    if isinstance(expected, str) and expected.startswith('re:'):
-        match_str = expected[len('re:'):]
-        match_rex = re.compile(match_str)
-
-        self.assertTrue(
-            isinstance(got, str),
-            f'Expected a {str.__name__} object, but got {type(got).__name__} for field {field}')
-        self.assertTrue(
-            match_rex.match(got),
-            f'field {field} (value: {got!r}) should match {match_str!r}')
-    elif isinstance(expected, str) and expected.startswith('startswith:'):
-        start_str = expected[len('startswith:'):]
-        self.assertTrue(
-            isinstance(got, str),
-            f'Expected a {str.__name__} object, but got {type(got).__name__} for field {field}')
-        self.assertTrue(
-            got.startswith(start_str),
-            f'field {field} (value: {got!r}) should start with {start_str!r}')
-    elif isinstance(expected, str) and expected.startswith('contains:'):
-        contains_str = expected[len('contains:'):]
-        self.assertTrue(
-            isinstance(got, str),
-            f'Expected a {str.__name__} object, but got {type(got).__name__} for field {field}')
-        self.assertTrue(
-            contains_str in got,
-            f'field {field} (value: {got!r}) should contain {contains_str!r}')
-    elif isinstance(expected, type):
-        self.assertTrue(
-            isinstance(got, expected),
-            f'Expected type {expected!r} for field {field}, but got value {got!r} of type {type(got)!r}')
-    elif isinstance(expected, dict) and isinstance(got, dict):
-        expect_dict(self, got, expected)
-    elif isinstance(expected, list) and isinstance(got, list):
-        self.assertEqual(
-            len(expected), len(got),
-            f'Expect a list of length {len(expected)}, but got a list of length {len(got)} for field {field}')
-        for index, (item_got, item_expected) in enumerate(zip(got, expected)):
-            type_got = type(item_got)
-            type_expected = type(item_expected)
-            self.assertEqual(
-                type_expected, type_got,
-                f'Type mismatch for list item at index {index} for field {field}, '
-                f'expected {type_expected!r}, got {type_got!r}')
-            expect_value(self, item_got, item_expected, field)
-    else:
-        if isinstance(expected, str) and expected.startswith('md5:'):
-            self.assertTrue(
-                isinstance(got, str),
-                f'Expected field {field} to be a unicode object, but got value {got!r} of type {type(got)!r}')
-            got = 'md5:' + md5(got)
-        elif isinstance(expected, str) and re.match(r'^(?:min|max)?count:\d+', expected):
-            self.assertTrue(
-                isinstance(got, (list, dict)),
-                f'Expected field {field} to be a list or a dict, but it is of type {type(got).__name__}')
-            op, _, expected_num = expected.partition(':')
-            expected_num = int(expected_num)
-            if op == 'mincount':
-                assert_func = assertGreaterEqual
-                msg_tmpl = 'Expected %d items in field %s, but only got %d'
-            elif op == 'maxcount':
-                assert_func = assertLessEqual
-                msg_tmpl = 'Expected maximum %d items in field %s, but got %d'
-            elif op == 'count':
-                assert_func = assertEqual
-                msg_tmpl = 'Expected exactly %d items in field %s, but got %d'
-            else:
-                assert False
-            assert_func(
-                self, len(got), expected_num,
-                msg_tmpl % (expected_num, field, len(got)))
-            return
-        self.assertEqual(
-            expected, got,
-            f'Invalid value for field {field}, expected {expected!r}, got {got!r}')
+def _iter_differences(got, expected, field):
+    if isinstance(expected, str):
+        op, _, val = expected.partition(':')
+        if op in ('mincount', 'maxcount', 'count'):
+            if not isinstance(got, (list, dict)):
+                yield field, f'expected either {list.__name__} or {dict.__name__}, got {type(got).__name__}'
+                return
+
+            expected_num = int(val)
+            got_num = len(got)
+            if op == 'mincount':
+                if got_num < expected_num:
+                    yield field, f'expected at least {val} items, got {got_num}'
+                return
+
+            if op == 'maxcount':
+                if got_num > expected_num:
+                    yield field, f'expected at most {val} items, got {got_num}'
+                return
+
+            assert op == 'count'
+            if got_num != expected_num:
+                yield field, f'expected exactly {val} items, got {got_num}'
+            return
+
+        if not isinstance(got, str):
+            yield field, f'expected {str.__name__}, got {type(got).__name__}'
+            return
+
+        if op == 're':
+            if not re.match(val, got):
+                yield field, f'should match {val!r}, got {got!r}'
+            return
+
+        if op == 'startswith':
+            if not got.startswith(val):
+                yield field, f'should start with {val!r}, got {got!r}'
+            return
+
+        if op == 'contains':
+            if val not in got:
+                yield field, f'should contain {val!r}, got {got!r}'
+            return
+
+        if op == 'md5':
+            hash_val = md5(got)
+            if hash_val != val:
+                yield field, f'expected hash {val}, got {hash_val}'
+            return
+
+        if got != expected:
+            yield field, f'expected {expected!r}, got {got!r}'
+        return
+
+    if isinstance(expected, dict) and isinstance(got, dict):
+        for key, expected_val in expected.items():
+            if key not in got:
+                yield field, f'missing key: {key!r}'
+                continue
+
+            field_name = key if field is None else f'{field}.{key}'
+            yield from _iter_differences(got[key], expected_val, field_name)
+        return
+
+    if isinstance(expected, type):
+        if not isinstance(got, expected):
+            yield field, f'expected {expected.__name__}, got {type(got).__name__}'
+        return
+
+    if isinstance(expected, list) and isinstance(got, list):
+        # TODO: clever diffing algorithm lmao
+        if len(expected) != len(got):
+            yield field, f'expected length of {len(expected)}, got {len(got)}'
+            return
+
+        for index, (got_val, expected_val) in enumerate(zip(got, expected)):
+            field_name = str(index) if field is None else f'{field}.{index}'
+            yield from _iter_differences(got_val, expected_val, field_name)
+        return
+
+    if got != expected:
+        yield field, f'expected {expected!r}, got {got!r}'
+
+
+def _expect_value(message, got, expected, field):
+    mismatches = list(_iter_differences(got, expected, field))
+    if not mismatches:
+        return
+
+    fields = [field for field, _ in mismatches if field is not None]
+    return ''.join((
+        message, f' ({", ".join(fields)})' if fields else '',
+        *(f'\n\t{field}: {message}' for field, message in mismatches)))
+
+
+def expect_value(self, got, expected, field):
+    if message := _expect_value('values differ', got, expected, field):
+        self.fail(message)


 def expect_dict(self, got_dict, expected_dict):
-    for info_field, expected in expected_dict.items():
-        got = got_dict.get(info_field)
-        expect_value(self, got, expected, info_field)
+    if message := _expect_value('dictionaries differ', got_dict, expected_dict, None):
+        self.fail(message)


 def sanitize_got_info_dict(got_dict):
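A quick sketch of what the new helper reports, with made-up input values (the 'id' field passes its regex check; the other two fields mismatch):

mismatches = list(_iter_differences(
    {'id': 'abc', 'formats': [1, 2]},
    {'id': 're:^[a-z]+$', 'formats': 'mincount:3', 'title': str},
    None))
# -> [('formats', 'expected at least 3 items, got 2'), (None, "missing key: 'title'")]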

yt_dlp/downloader/__init__.py

@@ -35,6 +35,7 @@ def get_suitable_downloader(info_dict, params={}, default=NO_DEFAULT, protocol=None):
 from .rtsp import RtspFD
 from .websocket import WebSocketFragmentFD
 from .youtube_live_chat import YoutubeLiveChatFD
+from .bunnycdn import BunnyCdnFD

 PROTOCOL_MAP = {
     'rtmp': RtmpFD,
@@ -55,6 +56,7 @@ def get_suitable_downloader(info_dict, params={}, default=NO_DEFAULT, protocol=None):
     'websocket_frag': WebSocketFragmentFD,
     'youtube_live_chat': YoutubeLiveChatFD,
     'youtube_live_chat_replay': YoutubeLiveChatFD,
+    'bunnycdn': BunnyCdnFD,
 }
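For context, PROTOCOL_MAP is keyed off a format dict's 'protocol' field, so a format only needs to carry the new key to be routed to BunnyCdnFD. A hypothetical entry (URL made up) might look like:

fmt = {
    'url': 'https://example.b-cdn.net/playlist.m3u8',  # hypothetical URL
    'protocol': 'bunnycdn',  # matched against PROTOCOL_MAP by get_suitable_downloader()
}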

yt_dlp/downloader/bunnycdn.py

@@ -0,0 +1,50 @@
import hashlib
import random
import threading

from .common import FileDownloader
from . import HlsFD
from ..networking import Request
from ..networking.exceptions import network_exceptions


class BunnyCdnFD(FileDownloader):
    """
    Downloads from BunnyCDN with required pings
    Note, this is not a part of public API, and will be removed without notice.
    DO NOT USE
    """

    def real_download(self, filename, info_dict):
        self.to_screen(f'[{self.FD_NAME}] Downloading from BunnyCDN')

        fd = HlsFD(self.ydl, self.params)

        stop_event = threading.Event()
        ping_thread = threading.Thread(target=self.ping_thread, args=(stop_event,), kwargs=info_dict['_bunnycdn_ping_data'])
        ping_thread.start()

        try:
            return fd.real_download(filename, info_dict)
        finally:
            stop_event.set()

    def ping_thread(self, stop_event, url, headers, secret, context_id):
        # Site sends ping every 4 seconds, but this throttles the download. Pinging every 2 seconds seems to work.
        ping_interval = 2
        # Hard coded resolution as it doesn't seem to matter
        res = 1080
        paused = 'false'
        current_time = 0

        while not stop_event.wait(ping_interval):
            current_time += ping_interval

            time = current_time + round(random.random(), 6)
            md5_hash = hashlib.md5(f'{secret}_{context_id}_{time}_{paused}_{res}'.encode()).hexdigest()
            ping_url = f'{url}?hash={md5_hash}&time={time}&paused={paused}&resolution={res}'

            try:
                self.ydl.urlopen(Request(ping_url, headers=headers)).read()
            except network_exceptions as e:
                self.to_screen(f'[{self.FD_NAME}] Ping failed: {e}')
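For reference, a single ping URL is built the same way as in ping_thread above; a standalone sketch with made-up secret, context id, and host values:

import hashlib

secret, context_id = 'deadbeef', '42'  # hypothetical values taken from the embed's src URL
time, paused, res = 2.123456, 'false', 1080
md5_hash = hashlib.md5(f'{secret}_{context_id}_{time}_{paused}_{res}'.encode()).hexdigest()
ping_url = f'https://example.invalid/ping?hash={md5_hash}&time={time}&paused={paused}&resolution={res}'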

yt_dlp/extractor/_extractors.py

@@ -312,6 +312,7 @@
 )
 from .bundesliga import BundesligaIE
 from .bundestag import BundestagIE
+from .bunnycdn import BunnyCdnIE
 from .businessinsider import BusinessInsiderIE
 from .buzzfeed import BuzzFeedIE
 from .byutv import BYUtvIE

yt_dlp/extractor/bunnycdn.py

@@ -0,0 +1,178 @@
import json

from .common import InfoExtractor
from ..networking import HEADRequest
from ..utils import (
    ExtractorError,
    extract_attributes,
    int_or_none,
    parse_qs,
    smuggle_url,
    unsmuggle_url,
    url_or_none,
    urlhandle_detect_ext,
)
from ..utils.traversal import find_element, traverse_obj


class BunnyCdnIE(InfoExtractor):
    _VALID_URL = r'https?://(?:iframe\.mediadelivery\.net|video\.bunnycdn\.com)/(?:embed|play)/(?P<library_id>\d+)/(?P<id>[\da-f-]+)'
    _EMBED_REGEX = [rf'<iframe[^>]+src=[\'"](?P<url>{_VALID_URL}[^\'"]*)[\'"]']
    _TESTS = [{
        'url': 'https://iframe.mediadelivery.net/embed/113933/e73edec1-e381-4c8b-ae73-717a140e0924',
        'info_dict': {
            'id': 'e73edec1-e381-4c8b-ae73-717a140e0924',
            'ext': 'mp4',
            'title': 'mistress morgana (3).mp4',
            'description': '',
            'timestamp': 1693251673,
            'thumbnail': r're:^https?://.*\.b-cdn\.net/e73edec1-e381-4c8b-ae73-717a140e0924/thumbnail\.jpg',
            'duration': 7.0,
            'upload_date': '20230828',
        },
        'params': {'skip_download': True},
    }, {
        'url': 'https://iframe.mediadelivery.net/play/136145/32e34c4b-0d72-437c-9abb-05e67657da34',
        'info_dict': {
            'id': '32e34c4b-0d72-437c-9abb-05e67657da34',
            'ext': 'mp4',
            'timestamp': 1691145748,
            'thumbnail': r're:^https?://.*\.b-cdn\.net/32e34c4b-0d72-437c-9abb-05e67657da34/thumbnail_9172dc16\.jpg',
            'duration': 106.0,
            'description': 'md5:981a3e899a5c78352b21ed8b2f1efd81',
            'upload_date': '20230804',
            'title': 'Sanela ist Teil der #arbeitsmarktkraft',
        },
        'params': {'skip_download': True},
    }, {
        # Stream requires activation and pings
        'url': 'https://iframe.mediadelivery.net/embed/200867/2e8545ec-509d-4571-b855-4cf0235ccd75',
        'info_dict': {
            'id': '2e8545ec-509d-4571-b855-4cf0235ccd75',
            'ext': 'mp4',
            'timestamp': 1708497752,
            'title': 'netflix part 1',
            'duration': 3959.0,
            'description': '',
            'upload_date': '20240221',
            'thumbnail': r're:^https?://.*\.b-cdn\.net/2e8545ec-509d-4571-b855-4cf0235ccd75/thumbnail\.jpg',
        },
        'params': {'skip_download': True},
    }]
    _WEBPAGE_TESTS = [{
        # Stream requires Referer
        'url': 'https://conword.io/',
        'info_dict': {
            'id': '3a5d863e-9cd6-447e-b6ef-e289af50b349',
            'ext': 'mp4',
            'title': 'Conword bei der Stadt Köln und Stadt Dortmund',
            'description': '',
            'upload_date': '20231031',
            'duration': 31.0,
            'thumbnail': 'https://video.watchuh.com/3a5d863e-9cd6-447e-b6ef-e289af50b349/thumbnail.jpg',
            'timestamp': 1698783879,
        },
        'params': {'skip_download': True},
    }, {
        # URL requires token and expires
        'url': 'https://www.stockphotos.com/video/moscow-subway-the-train-is-arriving-at-the-park-kultury-station-10017830',
        'info_dict': {
            'id': '0b02fa20-4e8c-4140-8f87-f64d820a3386',
            'ext': 'mp4',
            'thumbnail': r're:^https?://.*\.b-cdn\.net/0b02fa20-4e8c-4140-8f87-f64d820a3386/thumbnail\.jpg',
            'title': 'Moscow subway. The train is arriving at the Park Kultury station.',
            'upload_date': '20240531',
            'duration': 18.0,
            'timestamp': 1717152269,
            'description': '',
        },
        'params': {'skip_download': True},
    }]

    @classmethod
    def _extract_embed_urls(cls, url, webpage):
        for embed_url in super()._extract_embed_urls(url, webpage):
            yield smuggle_url(embed_url, {'Referer': url})

    def _real_extract(self, url):
        url, smuggled_data = unsmuggle_url(url, {})
        video_id, library_id = self._match_valid_url(url).group('id', 'library_id')
        webpage = self._download_webpage(
            f'https://iframe.mediadelivery.net/embed/{library_id}/{video_id}', video_id,
            headers=traverse_obj(smuggled_data, {'Referer': 'Referer'}),
            query=traverse_obj(parse_qs(url), {'token': 'token', 'expires': 'expires'}))

        if (html_title := self._html_extract_title(webpage, default=None)) == '403':
            raise ExtractorError(
                'This video is inaccessible. Setting a Referer header '
                'might be required to access the video', expected=True)
        elif html_title == '404':
            raise ExtractorError('This video does not exist', expected=True)

        headers = {'Referer': url}

        info = traverse_obj(self._parse_html5_media_entries(url, webpage, video_id, _headers=headers), 0) or {}
        formats = info.get('formats') or []
        subtitles = info.get('subtitles') or {}

        original_url = self._search_regex(
            r'(?:var|const|let)\s+originalUrl\s*=\s*["\']([^"\']+)["\']', webpage, 'original url', default=None)
        if url_or_none(original_url):
            urlh = self._request_webpage(
                HEADRequest(original_url), video_id=video_id, note='Checking original',
                headers=headers, fatal=False, expected_status=(403, 404))
            if urlh and urlh.status == 200:
                formats.append({
                    'url': original_url,
                    'format_id': 'source',
                    'quality': 1,
                    'http_headers': headers,
                    'ext': urlhandle_detect_ext(urlh, default='mp4'),
                    'filesize': int_or_none(urlh.get_header('Content-Length')),
                })

        # MediaCage Streams require activation and pings
        src_url = self._search_regex(
            r'\.setAttribute\([\'"]src[\'"],\s*[\'"]([^\'"]+)[\'"]\)', webpage, 'src url', default=None)
        activation_url = self._search_regex(
            r'loadUrl\([\'"]([^\'"]+/activate)[\'"]', webpage, 'activation url', default=None)
        ping_url = self._search_regex(
            r'loadUrl\([\'"]([^\'"]+/ping)[\'"]', webpage, 'ping url', default=None)
        secret = traverse_obj(parse_qs(src_url), ('secret', 0))
        context_id = traverse_obj(parse_qs(src_url), ('contextId', 0))
        ping_data = {}
        if src_url and activation_url and ping_url and secret and context_id:
            self._download_webpage(
                activation_url, video_id, headers=headers, note='Downloading activation data')

            fmts, subs = self._extract_m3u8_formats_and_subtitles(
                src_url, video_id, 'mp4', headers=headers, m3u8_id='hls', fatal=False)
            for fmt in fmts:
                fmt.update({
                    'protocol': 'bunnycdn',
                    'http_headers': headers,
                })
            formats.extend(fmts)
            self._merge_subtitles(subs, target=subtitles)

            ping_data = {
                '_bunnycdn_ping_data': {
                    'url': ping_url,
                    'headers': headers,
                    'secret': secret,
                    'context_id': context_id,
                },
            }

        return {
            'id': video_id,
            'formats': formats,
            'subtitles': subtitles,
            **traverse_obj(webpage, ({find_element(id='main-video', html=True)}, {extract_attributes}, {
                'title': ('data-plyr-config', {json.loads}, 'title', {str}),
                'thumbnail': ('data-poster', {url_or_none}),
            })),
            **ping_data,
            **self._search_json_ld(webpage, video_id, fatal=False),
        }
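As an aside, the query=traverse_obj(parse_qs(url), ...) call in _real_extract whitelists the token and expires parameters; a rough illustration with a made-up URL:

from yt_dlp.utils import parse_qs
from yt_dlp.utils.traversal import traverse_obj

url = 'https://iframe.mediadelivery.net/embed/1/abc?token=t0k3n&expires=1700000000&autoplay=1'  # hypothetical
print(traverse_obj(parse_qs(url), {'token': 'token', 'expires': 'expires'}))
# {'token': ['t0k3n'], 'expires': ['1700000000']} -- other params are dropped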

yt_dlp/extractor/sovietscloset.py

@@ -1,5 +1,6 @@
+from .bunnycdn import BunnyCdnIE
 from .common import InfoExtractor
-from ..utils import try_get, unified_timestamp
+from ..utils import make_archive_id, try_get, unified_timestamp


 class SovietsClosetBaseIE(InfoExtractor):
@@ -43,7 +44,7 @@ class SovietsClosetIE(SovietsClosetBaseIE):
             'url': 'https://sovietscloset.com/video/1337',
             'md5': 'bd012b04b261725510ca5383074cdd55',
             'info_dict': {
-                'id': '1337',
+                'id': '2f0cfbf4-3588-43a9-a7d6-7c9ea3755e67',
                 'ext': 'mp4',
                 'title': 'The Witcher #13',
                 'thumbnail': r're:^https?://.*\.b-cdn\.net/2f0cfbf4-3588-43a9-a7d6-7c9ea3755e67/thumbnail\.jpg$',
@@ -55,20 +56,23 @@ class SovietsClosetIE(SovietsClosetBaseIE):
                 'upload_date': '20170413',
                 'uploader_id': 'SovietWomble',
                 'uploader_url': 'https://www.twitch.tv/SovietWomble',
-                'duration': 7007,
+                'duration': 7008,
                 'was_live': True,
                 'availability': 'public',
                 'series': 'The Witcher',
                 'season': 'Misc',
                 'episode_number': 13,
                 'episode': 'Episode 13',
                 'creators': ['SovietWomble'],
+                'description': '',
+                '_old_archive_ids': ['sovietscloset 1337'],
             },
         },
         {
             'url': 'https://sovietscloset.com/video/1105',
+            'md5': '89fa928f183893cb65a0b7be846d8a90',
             'info_dict': {
-                'id': '1105',
+                'id': 'c0e5e76f-3a93-40b4-bf01-12343c2eec5d',
                 'ext': 'mp4',
                 'title': 'Arma 3 - Zeus Games #5',
                 'uploader': 'SovietWomble',
@@ -80,39 +84,20 @@ class SovietsClosetIE(SovietsClosetBaseIE):
                 'upload_date': '20160420',
                 'uploader_id': 'SovietWomble',
                 'uploader_url': 'https://www.twitch.tv/SovietWomble',
-                'duration': 8804,
+                'duration': 8805,
                 'was_live': True,
                 'availability': 'public',
                 'series': 'Arma 3',
                 'season': 'Zeus Games',
                 'episode_number': 5,
                 'episode': 'Episode 5',
                 'creators': ['SovietWomble'],
+                'description': '',
+                '_old_archive_ids': ['sovietscloset 1105'],
             },
         },
     ]

-    def _extract_bunnycdn_iframe(self, video_id, bunnycdn_id):
-        iframe = self._download_webpage(
-            f'https://iframe.mediadelivery.net/embed/5105/{bunnycdn_id}',
-            video_id, note='Downloading BunnyCDN iframe', headers=self.MEDIADELIVERY_REFERER)
-
-        m3u8_url = self._search_regex(r'(https?://.*?\.m3u8)', iframe, 'm3u8 url')
-        thumbnail_url = self._search_regex(r'(https?://.*?thumbnail\.jpg)', iframe, 'thumbnail url')
-
-        m3u8_formats = self._extract_m3u8_formats(m3u8_url, video_id, headers=self.MEDIADELIVERY_REFERER)
-
-        if not m3u8_formats:
-            duration = None
-        else:
-            duration = self._extract_m3u8_vod_duration(
-                m3u8_formats[0]['url'], video_id, headers=self.MEDIADELIVERY_REFERER)
-
-        return {
-            'formats': m3u8_formats,
-            'thumbnail': thumbnail_url,
-            'duration': duration,
-        }
-
     def _real_extract(self, url):
         video_id = self._match_id(url)
         webpage = self._download_webpage(url, video_id)
@@ -122,13 +107,13 @@ def _real_extract(self, url):
         stream = self.parse_nuxt_jsonp(f'{static_assets_base}/video/{video_id}/payload.js', video_id, 'video')['stream']

-        return {
+        return self.url_result(
+            f'https://iframe.mediadelivery.net/embed/5105/{stream["bunnyId"]}', ie=BunnyCdnIE, url_transparent=True,
             **self.video_meta(
                 video_id=video_id, game_name=stream['game']['name'],
                 category_name=try_get(stream, lambda x: x['subcategory']['name'], str),
                 episode_number=stream.get('number'), stream_date=stream.get('date')),
-            **self._extract_bunnycdn_iframe(video_id, stream['bunnyId']),
-        }
+            _old_archive_ids=[make_archive_id(self, video_id)])


 class SovietsClosetPlaylistIE(SovietsClosetBaseIE):
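For context on the _old_archive_ids entries in the tests above: make_archive_id derives the legacy archive key from the extractor name and the old numeric video id, e.g.:

from yt_dlp.utils import make_archive_id

make_archive_id('SovietsCloset', '1337')  # -> 'sovietscloset 1337'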