From 258ae2324485a3c148b9e93c787c889ee8018bd8 Mon Sep 17 00:00:00 2001 From: c-basalt <117849907+c-basalt@users.noreply.github.com> Date: Mon, 22 Jul 2024 18:48:39 -0400 Subject: [PATCH 01/48] draft --- yt_dlp/extractor/_extractors.py | 5 + yt_dlp/extractor/rplay.py | 244 ++++++++++++++++++++++++++++++++ 2 files changed, 249 insertions(+) create mode 100644 yt_dlp/extractor/rplay.py diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py index d2140bc52..ea9d0e7bd 100644 --- a/yt_dlp/extractor/_extractors.py +++ b/yt_dlp/extractor/_extractors.py @@ -1726,6 +1726,11 @@ RozhlasIE, RozhlasVltavaIE, ) +from .rplay import ( + RPlayVideoIE, + RPlayUserIE, + RPlayLiveIE, +) from .rte import ( RteIE, RteRadioIE, diff --git a/yt_dlp/extractor/rplay.py b/yt_dlp/extractor/rplay.py new file mode 100644 index 000000000..e307e15e1 --- /dev/null +++ b/yt_dlp/extractor/rplay.py @@ -0,0 +1,244 @@ +import time +import re +import random +from .common import ( + InfoExtractor, +) +from ..utils import ( + ExtractorError, + encode_data_uri, + float_or_none, + traverse_obj, + parse_iso8601, + url_or_none, +) + +from playwright.sync_api import sync_playwright + + +class RPlayBaseIE(InfoExtractor): + _TOKEN_CACHE = {} + _user_id = None + _login_type = None + _jwt_token = None + + @property + def user_id(self): + return self._user_id + + @property + def login_type(self): + return self._login_type + + @property + def jwt_token(self): + return self._jwt_token + + def _perform_login(self, username, password): + _ = { + 'alg': 'HS256', + 'typ': 'JWT', + } + raise NotImplementedError + + def _login_by_token(self, jwt_token, video_id): + user_info = self._download_json( + 'https://api.rplay.live/account/login', video_id, note='performing login', errnote='Failed to login', + data=f'{{"token":"{jwt_token}","lang":"en","loginType":null,"checkAdmin":null}}'.encode(), + headers={'Content-Type': 'application/json', 'Authorization': 'null'}, fatal=False) + if user_info: + 
self._user_id = traverse_obj(user_info, 'oid') + self._login_type = traverse_obj(user_info, 'accountType') + self._jwt_token = jwt_token + + def _get_butter_files(self): + cache = self.cache.load('rplay', 'butter-code') or {} + if cache.get('date', 0) > time.time() - 86400: + return cache['js'], cache['wasm'] + butter_js = self._download_webpage('https://pb.rplay.live/kr/public/smooth_like_butter.js', 'butter', + 'getting butter-sign js') + urlh = self._request_webpage('https://pb.rplay.live/kr/public/smooth_like_butter_bg.wasm', 'butter', + 'getting butter-sign wasm') + butter_wasm_array = list(urlh.read()) + self.cache.store('rplay', 'butter-code', {'js': butter_js, 'wasm': butter_wasm_array, 'date': time.time()}) + return butter_js, butter_wasm_array + + def _playwrite_eval(self, jscode, goto=None): + with sync_playwright() as p: + browser = p.chromium.launch() + page = browser.new_page() + if goto: + page.goto(goto) + value = page.evaluate(''' + const proxy = new Proxy(window.navigator, {get(target, prop, receiver) { + if (prop == "webdriver") return false; + throw new Error(prop); + return target[prop]; + }}); + Object.defineProperty(window, "navigator", {get: ()=> proxy}); + window.navigator.webdriver; + ''') + value = page.evaluate(jscode) + browser.close() + return value + + def _get_butter_token(self): + butter_js, butter_wasm_array = self._get_butter_files() + butter_js = butter_js.replace('export{initSync};export default __wbg_init;', '') + butter_js = butter_js.replace('export class', 'class') + butter_js = butter_js.replace('new URL("smooth_like_butter_bg.wasm",import.meta.url)', '""') + + butter_js += ''';__new_init = async () => { + const t = __wbg_get_imports(); + __wbg_init_memory(t); + const {module, instance} = await WebAssembly.instantiate(Uint8Array.from(%s), t); + __wbg_finalize_init(instance, module); + };''' % butter_wasm_array + butter_js += '__new_init().then(() => (new ButterFactory()).generate_butter())' + return 
self._playwrite_eval(butter_js, goto='https://rplay.live/') + + def get_butter_token(self): + cache = self.cache.load('rplay', 'butter-token') or {} + timestamp = str(int(time.time() / 360)) + if cache.get(timestamp): + return cache[timestamp] + token = self._get_butter_token() + self.cache.store('rplay', 'butter-token', {timestamp: token}) + return token + + +class RPlayVideoIE(RPlayBaseIE): + _VALID_URL = r'https://rplay.live/play/(?P[\d\w]+)' + _TESTS = [{ + 'url': 'https://rplay.live/play/669203d25223214e67579dc3/', + 'info_dict': { + 'id': '669203d25223214e67579dc3', + 'ext': 'mp4', + 'title': '〖ASMR/SR3D〗密着♥じっくり奥ゾリゾリ耳舐め♡〖杏都める〗Earlicking,Mouthsound,舔耳,힐링,귀 핥기', + 'description': 'md5:d2fb2f74a623be439cf454df5ff3344a', + 'release_timestamp': 1720846360, + 'release_date': '20240713', + 'duration': 5349.0, + 'thumbnail': r're:https://[\w\d]+.cloudfront.net/.*', + 'uploader': '杏都める', + 'uploader_id': '667adc9e9aa7f739a2158ff3', + 'tags': ["杏都める", "めいどるーちぇ", "無料", "耳舐め", "ASMR"], + } + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + + if self._configuration_arg('jwt_token') and not self.user_id: + self._login_by_token(self._configuration_arg('jwt_token', casesense=True)[0], video_id) + + headers = {'Origin': 'https://rplay.live', 'Referer': 'https://rplay.live/'} + content = self._download_json('https://api.rplay.live/content', video_id, query={ + 'contentOid': video_id, + 'status': 'published', + 'withComments': True, + 'requestCanView': True, + **({ + 'requestorOid': self.user_id, + 'loginType': self.login_type, + } if self.user_id else {}), + }, headers={**headers, 'Authorization': self.jwt_token or 'null'}) + if content.get('drm'): + raise ExtractorError('This video is DRM-protected') + content.pop('daily_views', None) + content.get('creatorInfo', {}).pop('subscriptionTiers', None) + + metainfo = traverse_obj(content, { + 'title': ('title', {str}), + 'description': ('introText', {str}), + 'release_timestamp': ('publishedAt', 
{parse_iso8601}), + 'duration': ('length', {float_or_none}), + 'uploader': ('nickname', {str}), + 'uploader_id': ('creatorOid', {str}), + 'tags': ('hashtags', lambda _, v: v[0] != '_'), + }) + + m3u8_url = traverse_obj(content, ('canView', 'url')) + if not m3u8_url: + raise ExtractorError('You do not have access to this video. ' + 'Passing JWT token using --extractor-args RPlayVideo:jwt_token=xxx.xxxxx.xxx to login') + + thumbnail_key = traverse_obj(content, ('streamables', lambda _, v: v['type'].startswith('image/'), 's3key', any)) + if thumbnail_key: + metainfo['thumbnail'] = url_or_none(self._download_webpage( + 'https://api.rplay.live/upload/privateasset', video_id, 'getting cover url', query={ + 'key': thumbnail_key, + 'contentOid': video_id, + 'creatorOid': metainfo.get('uploader_id'), + **({ + 'requestorOid': self.user_id, + 'loginType': self.login_type + } if self.user_id else {}), + }, fatal=False)) + + formats = self._extract_m3u8_formats(m3u8_url, video_id, headers={**headers, 'Butter': self.get_butter_token()}) + for fmt in formats: + m3u8_doc = self._download_webpage(fmt['url'], video_id, 'getting m3u8 contents', + headers={**headers, 'Butter': self.get_butter_token()}) + fmt['url'] = encode_data_uri(m3u8_doc.encode(), 'application/x-mpegurl') + match = re.search(r'^#EXT-X-KEY.*?URI="([^"]+)"', m3u8_doc, flags=re.M) + if match: + urlh = self._request_webpage(match[1], video_id, 'getting hls key', headers={ + **headers, + 'rplay-private-content-requestor': self.user_id or 'not-logged-in', + 'age': random.randint(100, 10000), + }) + fmt['hls_aes'] = {'key': urlh.read().hex()} + + return { + 'id': video_id, + 'formats': formats, + **metainfo, + 'http_headers': {'Origin': 'https://rplay.live', 'Referer': 'https://rplay.live/'}, + } + + +class RPlayUserIE(RPlayBaseIE): + _VALID_URL = r'https://rplay.live/(?Pc|creatorhome)/(?P[\d\w]+)/?(?:[#?]|$)' + _TESTS = [{ + 'url': 'https://rplay.live/creatorhome/667adc9e9aa7f739a2158ff3?page=contents', + 'info_dict': { 
+ 'id': '667adc9e9aa7f739a2158ff3', + 'title': '杏都める', + }, + 'playlist_mincount': 33, + }, { + 'url': 'https://rplay.live/c/furachi?page=contents', + 'info_dict': { + 'id': '65e07e60850f4527aab74757', + 'title': '逢瀬ふらち OuseFurachi', + }, + 'playlist_mincount': 77, + }] + + def _real_extract(self, url): + user_id, short = self._match_valid_url(url).group('id', 'short') + key = 'customUrl' if short == 'c' else 'userOid' + + user_info = self._download_json( + f'https://api.rplay.live/account/getuser?{key}={user_id}&filter[]=nickname&filter[]=published', user_id) + entries = traverse_obj(user_info, ('published', ..., { + lambda x: self.url_result(f'https://rplay.live/play/{x}/', ie=RPlayVideoIE, video_id=x)})) + + return self.playlist_result(entries, user_info.get('_id', user_id), user_info.get('nickname')) + + +class RPlayLiveIE(RPlayBaseIE): + _VALID_URL = r'https://rplay.live/c/(?P[\d\w]+)/live' + + def _real_extract(self, url): + user_id = self._match_id(url) + + user_id = self._download_json(f'https://api.rplay.live/account/getuser?customUrl={user_id}', user_id)['_id'] + live_info = self._download_json('https://api.rplay.live/live/play', user_id, query={ + 'creatorOid': user_id, + }) + if live_info['streamState'] == 'youtube': + return self.url_result(f'https://www.youtube.com/watch?v={live_info["liveStreamId"]}') + else: + raise ExtractorError(f'Unknow streamState: {live_info["streamState"]}') From 048b6b403ec28231bb8a9985e4097c7eab4c7ce0 Mon Sep 17 00:00:00 2001 From: c-basalt <117849907+c-basalt@users.noreply.github.com> Date: Mon, 22 Jul 2024 19:17:44 -0400 Subject: [PATCH 02/48] update --- yt_dlp/extractor/_extractors.py | 4 +-- yt_dlp/extractor/rplay.py | 56 ++++++++++++++++----------------- 2 files changed, 30 insertions(+), 30 deletions(-) diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py index ea9d0e7bd..8d499a974 100644 --- a/yt_dlp/extractor/_extractors.py +++ b/yt_dlp/extractor/_extractors.py @@ -1727,9 +1727,9 @@ 
RozhlasVltavaIE, ) from .rplay import ( - RPlayVideoIE, - RPlayUserIE, RPlayLiveIE, + RPlayUserIE, + RPlayVideoIE, ) from .rte import ( RteIE, diff --git a/yt_dlp/extractor/rplay.py b/yt_dlp/extractor/rplay.py index e307e15e1..d8afaa3ba 100644 --- a/yt_dlp/extractor/rplay.py +++ b/yt_dlp/extractor/rplay.py @@ -1,20 +1,20 @@ -import time -import re import random -from .common import ( - InfoExtractor, -) -from ..utils import ( - ExtractorError, - encode_data_uri, - float_or_none, - traverse_obj, - parse_iso8601, - url_or_none, -) +import re +import time from playwright.sync_api import sync_playwright +from .common import InfoExtractor +from ..utils import ( + ExtractorError, + UserNotLive, + encode_data_uri, + float_or_none, + parse_iso8601, + traverse_obj, + url_or_none, +) + class RPlayBaseIE(InfoExtractor): _TOKEN_CACHE = {} @@ -69,15 +69,12 @@ def _playwrite_eval(self, jscode, goto=None): page = browser.new_page() if goto: page.goto(goto) - value = page.evaluate(''' + page.evaluate(''' const proxy = new Proxy(window.navigator, {get(target, prop, receiver) { if (prop == "webdriver") return false; - throw new Error(prop); return target[prop]; }}); - Object.defineProperty(window, "navigator", {get: ()=> proxy}); - window.navigator.webdriver; - ''') + Object.defineProperty(window, "navigator", {get: ()=> proxy});''') value = page.evaluate(jscode) browser.close() return value @@ -93,7 +90,7 @@ def _get_butter_token(self): __wbg_init_memory(t); const {module, instance} = await WebAssembly.instantiate(Uint8Array.from(%s), t); __wbg_finalize_init(instance, module); - };''' % butter_wasm_array + };''' % butter_wasm_array # noqa: UP031 butter_js += '__new_init().then(() => (new ButterFactory()).generate_butter())' return self._playwrite_eval(butter_js, goto='https://rplay.live/') @@ -122,8 +119,8 @@ class RPlayVideoIE(RPlayBaseIE): 'thumbnail': r're:https://[\w\d]+.cloudfront.net/.*', 'uploader': '杏都める', 'uploader_id': '667adc9e9aa7f739a2158ff3', - 'tags': ["杏都める", 
"めいどるーちぇ", "無料", "耳舐め", "ASMR"], - } + 'tags': ['杏都める', 'めいどるーちぇ', '無料', '耳舐め', 'ASMR'], + }, }] def _real_extract(self, url): @@ -172,7 +169,7 @@ def _real_extract(self, url): 'creatorOid': metainfo.get('uploader_id'), **({ 'requestorOid': self.user_id, - 'loginType': self.login_type + 'loginType': self.login_type, } if self.user_id else {}), }, fatal=False)) @@ -206,7 +203,7 @@ class RPlayUserIE(RPlayBaseIE): 'id': '667adc9e9aa7f739a2158ff3', 'title': '杏都める', }, - 'playlist_mincount': 33, + 'playlist_mincount': 34, }, { 'url': 'https://rplay.live/c/furachi?page=contents', 'info_dict': { @@ -235,10 +232,13 @@ def _real_extract(self, url): user_id = self._match_id(url) user_id = self._download_json(f'https://api.rplay.live/account/getuser?customUrl={user_id}', user_id)['_id'] - live_info = self._download_json('https://api.rplay.live/live/play', user_id, query={ - 'creatorOid': user_id, - }) - if live_info['streamState'] == 'youtube': + live_info = self._download_json('https://api.rplay.live/live/play', user_id, + query={'creatorOid': user_id}) + + stream_state = live_info['streamState'] + if stream_state == 'youtube': return self.url_result(f'https://www.youtube.com/watch?v={live_info["liveStreamId"]}') + elif stream_state == 'offline': + raise UserNotLive else: - raise ExtractorError(f'Unknow streamState: {live_info["streamState"]}') + raise ExtractorError(f'Unknow streamState: {stream_state}') From 358e9425bd48a2ffafdee0cd6b4a81583e03296a Mon Sep 17 00:00:00 2001 From: c-basalt <117849907+c-basalt@users.noreply.github.com> Date: Mon, 22 Jul 2024 20:22:17 -0400 Subject: [PATCH 03/48] improve eval wrapper --- yt_dlp/extractor/rplay.py | 66 +++++++++++++++++++++++++++------------ 1 file changed, 46 insertions(+), 20 deletions(-) diff --git a/yt_dlp/extractor/rplay.py b/yt_dlp/extractor/rplay.py index d8afaa3ba..f494022ca 100644 --- a/yt_dlp/extractor/rplay.py +++ b/yt_dlp/extractor/rplay.py @@ -1,8 +1,9 @@ +import asyncio import random import re import time -from 
playwright.sync_api import sync_playwright +from playwright.async_api import async_playwright from .common import InfoExtractor from ..utils import ( @@ -63,43 +64,68 @@ def _get_butter_files(self): self.cache.store('rplay', 'butter-code', {'js': butter_js, 'wasm': butter_wasm_array, 'date': time.time()}) return butter_js, butter_wasm_array - def _playwrite_eval(self, jscode, goto=None): - with sync_playwright() as p: - browser = p.chromium.launch() - page = browser.new_page() - if goto: - page.goto(goto) - page.evaluate(''' - const proxy = new Proxy(window.navigator, {get(target, prop, receiver) { - if (prop == "webdriver") return false; - return target[prop]; - }}); - Object.defineProperty(window, "navigator", {get: ()=> proxy});''') - value = page.evaluate(jscode) - browser.close() - return value + def _playwright_eval(self, jscode, goto=None, wait_until='commit', stop_loading=True): + async def __aeval(): + async with async_playwright() as p: + browser = await p.chromium.launch(chromium_sandbox=True) + page = await browser.new_page() + if goto: + try: + start = time.time() + await page.goto(goto, wait_until=wait_until) + self.write_debug(f'{wait_until} loaded in {time.time() - start} s') + if stop_loading: + await page.evaluate('window.stop();') + except Exception as e: + self.report_warning(f'Failed to navigate to {goto}: {e}') + await browser.close() + return + try: + start = time.time() + value = await asyncio.wait_for(page.evaluate(jscode), timeout=10) + self.write_debug(f'JS execution finished in {time.time() - start} s') + except asyncio.TimeoutError: + self.report_warning('PlayWright JS evaluation timed out') + value = None + finally: + await browser.close() + return value - def _get_butter_token(self): + try: + return asyncio.run(__aeval()) + except asyncio.InvalidStateError: + pass + + def _calc_butter_token(self): butter_js, butter_wasm_array = self._get_butter_files() butter_js = butter_js.replace('export{initSync};export default __wbg_init;', '') 
butter_js = butter_js.replace('export class', 'class') butter_js = butter_js.replace('new URL("smooth_like_butter_bg.wasm",import.meta.url)', '""') - butter_js += ''';__new_init = async () => { + butter_js += ''';const proxy = new Proxy(window.navigator, {get(target, prop, receiver) { + if (prop == "webdriver") return false; + return target[prop]; + }}); + Object.defineProperty(window, "navigator", {get: ()=> proxy});''' + + butter_js += '''__new_init = async () => { const t = __wbg_get_imports(); __wbg_init_memory(t); const {module, instance} = await WebAssembly.instantiate(Uint8Array.from(%s), t); __wbg_finalize_init(instance, module); };''' % butter_wasm_array # noqa: UP031 + butter_js += '__new_init().then(() => (new ButterFactory()).generate_butter())' - return self._playwrite_eval(butter_js, goto='https://rplay.live/') + + # The generator checks `navigator` and `location` to generate correct token + return self._playwright_eval(butter_js, goto='https://rplay.live/') def get_butter_token(self): cache = self.cache.load('rplay', 'butter-token') or {} timestamp = str(int(time.time() / 360)) if cache.get(timestamp): return cache[timestamp] - token = self._get_butter_token() + token = self._calc_butter_token() self.cache.store('rplay', 'butter-token', {timestamp: token}) return token From dc967e8c994f4b04a303fb2143c25b4cdb1f8363 Mon Sep 17 00:00:00 2001 From: c-basalt <117849907+c-basalt@users.noreply.github.com> Date: Mon, 22 Jul 2024 20:23:08 -0400 Subject: [PATCH 04/48] use md5 --- yt_dlp/extractor/rplay.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/yt_dlp/extractor/rplay.py b/yt_dlp/extractor/rplay.py index f494022ca..e720ad61c 100644 --- a/yt_dlp/extractor/rplay.py +++ b/yt_dlp/extractor/rplay.py @@ -137,7 +137,7 @@ class RPlayVideoIE(RPlayBaseIE): 'info_dict': { 'id': '669203d25223214e67579dc3', 'ext': 'mp4', - 'title': '〖ASMR/SR3D〗密着♥じっくり奥ゾリゾリ耳舐め♡〖杏都める〗Earlicking,Mouthsound,舔耳,힐링,귀 핥기', + 'title': 
'md5:6ab0a76410b40b1f5fb48a2ad7571264', 'description': 'md5:d2fb2f74a623be439cf454df5ff3344a', 'release_timestamp': 1720846360, 'release_date': '20240713', From 2c44cf7a9fc0cfa6c61547034a9f3a205aa08a38 Mon Sep 17 00:00:00 2001 From: c-basalt <117849907+c-basalt@users.noreply.github.com> Date: Mon, 22 Jul 2024 21:32:14 -0400 Subject: [PATCH 05/48] age limit --- yt_dlp/extractor/rplay.py | 26 ++++++++++++++++++++++++-- 1 file changed, 24 insertions(+), 2 deletions(-) diff --git a/yt_dlp/extractor/rplay.py b/yt_dlp/extractor/rplay.py index e720ad61c..8f691a3b2 100644 --- a/yt_dlp/extractor/rplay.py +++ b/yt_dlp/extractor/rplay.py @@ -73,7 +73,7 @@ async def __aeval(): try: start = time.time() await page.goto(goto, wait_until=wait_until) - self.write_debug(f'{wait_until} loaded in {time.time() - start} s') + self.write_debug(f'{wait_until} loaded in {time.time() - start:.3f}s') if stop_loading: await page.evaluate('window.stop();') except Exception as e: @@ -83,7 +83,7 @@ async def __aeval(): try: start = time.time() value = await asyncio.wait_for(page.evaluate(jscode), timeout=10) - self.write_debug(f'JS execution finished in {time.time() - start} s') + self.write_debug(f'JS execution finished in {time.time() - start:.3f}s') except asyncio.TimeoutError: self.report_warning('PlayWright JS evaluation timed out') value = None @@ -139,6 +139,8 @@ class RPlayVideoIE(RPlayBaseIE): 'ext': 'mp4', 'title': 'md5:6ab0a76410b40b1f5fb48a2ad7571264', 'description': 'md5:d2fb2f74a623be439cf454df5ff3344a', + 'timestamp': 1720845266, + 'upload_date': '20240713', 'release_timestamp': 1720846360, 'release_date': '20240713', 'duration': 5349.0, @@ -147,6 +149,24 @@ class RPlayVideoIE(RPlayBaseIE): 'uploader_id': '667adc9e9aa7f739a2158ff3', 'tags': ['杏都める', 'めいどるーちぇ', '無料', '耳舐め', 'ASMR'], }, + }, { + 'url': 'https://rplay.live/play/660bee4fd3c1d09d69db6870/', + 'info_dict': { + 'id': '660bee4fd3c1d09d69db6870', + 'ext': 'mp4', + 'title': 'md5:7de162a0f1c2266ec428234620a124fc', + 
'description': 'md5:c6d12cc8110b748d5588d5f00787cd35', + 'timestamp': 1712057935, + 'upload_date': '20240402', + 'release_timestamp': 1712061900, + 'release_date': '20240402', + 'duration': 6791.0, + 'thumbnail': r're:https://[\w\d]+.cloudfront.net/.*', + 'uploader': '狐月れんげ', + 'uploader_id': '65eeb4b237043dc0b5654f86', + 'tags': 'count:10', + 'age_limit': 18, + }, }] def _real_extract(self, url): @@ -175,10 +195,12 @@ def _real_extract(self, url): 'title': ('title', {str}), 'description': ('introText', {str}), 'release_timestamp': ('publishedAt', {parse_iso8601}), + 'timestamp': ('createdAt', {parse_iso8601}), 'duration': ('length', {float_or_none}), 'uploader': ('nickname', {str}), 'uploader_id': ('creatorOid', {str}), 'tags': ('hashtags', lambda _, v: v[0] != '_'), + 'age_limit': (('hideContent', 'isAdultContent'), {lambda x: 18 if x else None}, any), }) m3u8_url = traverse_obj(content, ('canView', 'url')) From cddca4e4fcf2117f00fddb1a28cc016991955bb6 Mon Sep 17 00:00:00 2001 From: c-basalt <117849907+c-basalt@users.noreply.github.com> Date: Tue, 23 Jul 2024 11:02:25 -0400 Subject: [PATCH 06/48] native live stream --- yt_dlp/extractor/rplay.py | 69 ++++++++++++++++++++++++++++++++++++--- 1 file changed, 64 insertions(+), 5 deletions(-) diff --git a/yt_dlp/extractor/rplay.py b/yt_dlp/extractor/rplay.py index 8f691a3b2..b7151f150 100644 --- a/yt_dlp/extractor/rplay.py +++ b/yt_dlp/extractor/rplay.py @@ -274,18 +274,77 @@ def _real_extract(self, url): class RPlayLiveIE(RPlayBaseIE): - _VALID_URL = r'https://rplay.live/c/(?P[\d\w]+)/live' + _VALID_URL = [ + r'https://rplay.live/(?Pc)/(?P[\d\w]+)/live', + r'https://rplay.live/(?Plive)/(?P[\d\w]+)', + ] + _TESTS = [{ + 'url': 'https://rplay.live/c/chachamaru/live', + 'info_dict': { + 'id': '667e4cd99aa7f739a2c91852', + 'ext': 'mp4', + 'title': r're:【ASMR】ん~っやば//スキスキ耐久.*', + 'description': 'md5:7f88ac0a7a3d5d0b926a0baecd1d40e1', + 'timestamp': 1721739947, + 'upload_date': '20240723', + 'live_status': 'is_live', + 
'thumbnail': 'https://pb.rplay.live/liveChannelThumbnails/667e4cd99aa7f739a2c91852', + 'uploader': '愛犬茶々丸', + 'uploader_id': '667e4cd99aa7f739a2c91852', + 'tags': 'count:9', + }, + 'skip': 'live', + }, { + 'url': 'https://rplay.live/live/667adc9e9aa7f739a2158ff3', + 'only_matching': True, + }] def _real_extract(self, url): - user_id = self._match_id(url) + user_id, short = self._match_valid_url(url).group('id', 'short') - user_id = self._download_json(f'https://api.rplay.live/account/getuser?customUrl={user_id}', user_id)['_id'] - live_info = self._download_json('https://api.rplay.live/live/play', user_id, - query={'creatorOid': user_id}) + if short == 'c': + user_info = self._download_json(f'https://api.rplay.live/account/getuser?customUrl={user_id}', user_id) + user_id = user_info['_id'] + else: + user_info = self._download_json(f'https://api.rplay.live/account/getuser?userOid={user_id}', user_id) + + live_info = self._download_json('https://api.rplay.live/live/play', user_id, query={'creatorOid': user_id}) stream_state = live_info['streamState'] if stream_state == 'youtube': return self.url_result(f'https://www.youtube.com/watch?v={live_info["liveStreamId"]}') + elif stream_state == 'live': + if self._configuration_arg('jwt_token') and not self.user_id: + self._login_by_token(self._configuration_arg('jwt_token', casesense=True)[0], user_id) + if not live_info.get('allowAnonymous') and not self.user_id: + self.raise_login_required() + key2 = self._download_webpage( + 'https://api.rplay.live/live/key2', user_id, 'getting live key', + headers={'Authorization': self.jwt_token}, + query={ + 'requestorOid': self.user_id, + 'loginType': self.login_type, + }) + formats = self._extract_m3u8_formats('https://api.rplay.live/live/stream/playlist.m3u8', user_id, query={ + 'creatorOid': user_id, + 'key2': key2, + }) + return { + 'id': user_id, + 'formats': formats, + 'is_live': True, + 'http_headers': {'Referer': 'https://rplay.live'}, + 'thumbnail': 
f'https://pb.rplay.live/liveChannelThumbnails/{user_id}', + 'uploader': traverse_obj(user_info, ('nickname', {str})), + 'uploader_id': user_id, + **traverse_obj(live_info, { + 'title': ('title', {str}), + 'description': ('description', {str}), + 'timestamp': ('streamStartTime', {parse_iso8601}), + 'tags': ('hashtags', ..., {str}), + 'age_limit': ('isAdultContent', {lambda x: 18 if x else None}), + }), + } elif stream_state == 'offline': raise UserNotLive else: From ef048b6b0128d4539a6d09a7ebdde2f83ed9b30c Mon Sep 17 00:00:00 2001 From: c-basalt <117849907+c-basalt@users.noreply.github.com> Date: Fri, 26 Jul 2024 03:09:55 -0400 Subject: [PATCH 07/48] login and misc fixes --- yt_dlp/extractor/rplay.py | 85 +++++++++++++++++++++++++++------------ 1 file changed, 59 insertions(+), 26 deletions(-) diff --git a/yt_dlp/extractor/rplay.py b/yt_dlp/extractor/rplay.py index b7151f150..253447811 100644 --- a/yt_dlp/extractor/rplay.py +++ b/yt_dlp/extractor/rplay.py @@ -1,4 +1,9 @@ import asyncio +import base64 +import datetime as dt +import hashlib +import hmac +import json import random import re import time @@ -18,6 +23,7 @@ class RPlayBaseIE(InfoExtractor): + _NETRC_MACHINE = 'rplaylive' _TOKEN_CACHE = {} _user_id = None _login_type = None @@ -35,22 +41,40 @@ def login_type(self): def jwt_token(self): return self._jwt_token - def _perform_login(self, username, password): - _ = { - 'alg': 'HS256', - 'typ': 'JWT', - } - raise NotImplementedError + def _jwt_encode_hs256(self, payload: dict, key: str): + # ..utils.jwt_encode_hs256() uses slightly different details that would fails + # and we need to re-implement it with minor changes + b64encode = lambda x: base64.urlsafe_b64encode( + json.dumps(x, separators=(',', ':')).encode()).strip(b'=') - def _login_by_token(self, jwt_token, video_id): + header_b64 = b64encode({'alg': 'HS256', 'typ': 'JWT'}) + payload_b64 = b64encode(payload) + h = hmac.new(key.encode(), header_b64 + b'.' 
+ payload_b64, hashlib.sha256) + signature_b64 = base64.urlsafe_b64encode(h.digest()).strip(b'=') + return header_b64 + b'.' + payload_b64 + b'.' + signature_b64 + + def _perform_login(self, username, password): + payload = { + 'eml': username, + 'dat': dt.datetime.now(dt.timezone.utc).isoformat(timespec='milliseconds').replace('+00:00', 'Z'), + 'iat': int(time.time()), + } + key = hashlib.sha256(password.encode()).hexdigest() + + self._login_by_token(self._jwt_encode_hs256(payload, key).decode()) + + def _login_by_token(self, jwt_token): user_info = self._download_json( - 'https://api.rplay.live/account/login', video_id, note='performing login', errnote='Failed to login', + 'https://api.rplay.live/account/login', 'login', note='performing login', errnote='Failed to login', data=f'{{"token":"{jwt_token}","lang":"en","loginType":null,"checkAdmin":null}}'.encode(), headers={'Content-Type': 'application/json', 'Authorization': 'null'}, fatal=False) + if user_info: self._user_id = traverse_obj(user_info, 'oid') self._login_type = traverse_obj(user_info, 'accountType') - self._jwt_token = jwt_token + self._jwt_token = jwt_token if self._user_id else None + if not self._user_id: + self.report_warning('Failed to login, possibly due to wrong password or website change') def _get_butter_files(self): cache = self.cache.load('rplay', 'butter-code') or {} @@ -172,11 +196,8 @@ class RPlayVideoIE(RPlayBaseIE): def _real_extract(self, url): video_id = self._match_id(url) - if self._configuration_arg('jwt_token') and not self.user_id: - self._login_by_token(self._configuration_arg('jwt_token', casesense=True)[0], video_id) - headers = {'Origin': 'https://rplay.live', 'Referer': 'https://rplay.live/'} - content = self._download_json('https://api.rplay.live/content', video_id, query={ + video_info = self._download_json('https://api.rplay.live/content', video_id, query={ 'contentOid': video_id, 'status': 'published', 'withComments': True, @@ -186,12 +207,10 @@ def _real_extract(self, 
url): 'loginType': self.login_type, } if self.user_id else {}), }, headers={**headers, 'Authorization': self.jwt_token or 'null'}) - if content.get('drm'): + if video_info.get('drm'): raise ExtractorError('This video is DRM-protected') - content.pop('daily_views', None) - content.get('creatorInfo', {}).pop('subscriptionTiers', None) - metainfo = traverse_obj(content, { + metainfo = traverse_obj(video_info, { 'title': ('title', {str}), 'description': ('introText', {str}), 'release_timestamp': ('publishedAt', {parse_iso8601}), @@ -203,12 +222,16 @@ def _real_extract(self, url): 'age_limit': (('hideContent', 'isAdultContent'), {lambda x: 18 if x else None}, any), }) - m3u8_url = traverse_obj(content, ('canView', 'url')) + m3u8_url = traverse_obj(video_info, ('canView', 'url')) if not m3u8_url: - raise ExtractorError('You do not have access to this video. ' - 'Passing JWT token using --extractor-args RPlayVideo:jwt_token=xxx.xxxxx.xxx to login') + msg = 'You do not have access to this video' + if traverse_obj(video_info, ('viewableTiers', 'free')): + msg += '. This video requires a free subscription' + if not self.user_id: + msg += f'. 
{self._login_hint(method="password")}' + raise ExtractorError(msg) - thumbnail_key = traverse_obj(content, ('streamables', lambda _, v: v['type'].startswith('image/'), 's3key', any)) + thumbnail_key = traverse_obj(video_info, ('streamables', lambda _, v: v['type'].startswith('image/'), 's3key', any)) if thumbnail_key: metainfo['thumbnail'] = url_or_none(self._download_webpage( 'https://api.rplay.live/upload/privateasset', video_id, 'getting cover url', query={ @@ -231,7 +254,7 @@ def _real_extract(self, url): urlh = self._request_webpage(match[1], video_id, 'getting hls key', headers={ **headers, 'rplay-private-content-requestor': self.user_id or 'not-logged-in', - 'age': random.randint(100, 10000), + 'age': random.randint(1, 4999), }) fmt['hls_aes'] = {'key': urlh.read().hex()} @@ -261,14 +284,26 @@ class RPlayUserIE(RPlayBaseIE): 'playlist_mincount': 77, }] + def _perform_login(self, username, password): + # This playlist extractor does not require login + return + def _real_extract(self, url): user_id, short = self._match_valid_url(url).group('id', 'short') key = 'customUrl' if short == 'c' else 'userOid' user_info = self._download_json( f'https://api.rplay.live/account/getuser?{key}={user_id}&filter[]=nickname&filter[]=published', user_id) + replays = self._download_json( + 'https://api.rplay.live/live/replays?=667e4cd99aa7f739a2c91852', user_id, query={ + 'creatorOid': user_info.get('_id')}) + entries = traverse_obj(user_info, ('published', ..., { lambda x: self.url_result(f'https://rplay.live/play/{x}/', ie=RPlayVideoIE, video_id=x)})) + for entry_id in traverse_obj(replays, (..., '_id', {str})): + if entry_id in user_info.get('published', []): + continue + entries.append(self.url_result(f'https://rplay.live/play/{entry_id}/', ie=RPlayVideoIE, video_id=entry_id)) return self.playlist_result(entries, user_info.get('_id', user_id), user_info.get('nickname')) @@ -314,10 +349,8 @@ def _real_extract(self, url): if stream_state == 'youtube': return 
self.url_result(f'https://www.youtube.com/watch?v={live_info["liveStreamId"]}') elif stream_state == 'live': - if self._configuration_arg('jwt_token') and not self.user_id: - self._login_by_token(self._configuration_arg('jwt_token', casesense=True)[0], user_id) - if not live_info.get('allowAnonymous') and not self.user_id: - self.raise_login_required() + if not self.user_id: + self.raise_login_required(method='password') key2 = self._download_webpage( 'https://api.rplay.live/live/key2', user_id, 'getting live key', headers={'Authorization': self.jwt_token}, From dd294fc10ec138965309cc8e588dbdce329fa1f0 Mon Sep 17 00:00:00 2001 From: c-basalt <117849907+c-basalt@users.noreply.github.com> Date: Fri, 2 Aug 2024 12:33:39 -0400 Subject: [PATCH 08/48] playlist and misc --- yt_dlp/extractor/rplay.py | 142 ++++++++++++++++++++------------------ 1 file changed, 76 insertions(+), 66 deletions(-) diff --git a/yt_dlp/extractor/rplay.py b/yt_dlp/extractor/rplay.py index 253447811..ab1d5c35f 100644 --- a/yt_dlp/extractor/rplay.py +++ b/yt_dlp/extractor/rplay.py @@ -17,6 +17,7 @@ encode_data_uri, float_or_none, parse_iso8601, + parse_qs, traverse_obj, url_or_none, ) @@ -41,6 +42,20 @@ def login_type(self): def jwt_token(self): return self._jwt_token + @property + def requestor_query(self): + return { + 'requestorOid': self.user_id, + 'loginType': self.login_type, + } if self.user_id else {} + + @property + def jwt_header(self): + return { + 'Referer': 'https://rplay.live/', + 'Authorization': self.jwt_token or 'null' + } + def _jwt_encode_hs256(self, payload: dict, key: str): # ..utils.jwt_encode_hs256() uses slightly different details that would fails # and we need to re-implement it with minor changes @@ -66,7 +81,7 @@ def _perform_login(self, username, password): def _login_by_token(self, jwt_token): user_info = self._download_json( 'https://api.rplay.live/account/login', 'login', note='performing login', errnote='Failed to login', - 
data=f'{{"token":"{jwt_token}","lang":"en","loginType":null,"checkAdmin":null}}'.encode(), + data=f'{{"token":"{jwt_token}","loginType":null,"checkAdmin":null}}'.encode(), headers={'Content-Type': 'application/json', 'Authorization': 'null'}, fatal=False) if user_info: @@ -80,57 +95,51 @@ def _get_butter_files(self): cache = self.cache.load('rplay', 'butter-code') or {} if cache.get('date', 0) > time.time() - 86400: return cache['js'], cache['wasm'] - butter_js = self._download_webpage('https://pb.rplay.live/kr/public/smooth_like_butter.js', 'butter', - 'getting butter-sign js') - urlh = self._request_webpage('https://pb.rplay.live/kr/public/smooth_like_butter_bg.wasm', 'butter', - 'getting butter-sign wasm') + butter_js = self._download_webpage( + 'https://pb.rplay.live/kr/public/smooth_like_butter.js', 'butter', 'getting butter-sign js') + urlh = self._request_webpage( + 'https://pb.rplay.live/kr/public/smooth_like_butter_bg.wasm', 'butter', 'getting butter-sign wasm') butter_wasm_array = list(urlh.read()) self.cache.store('rplay', 'butter-code', {'js': butter_js, 'wasm': butter_wasm_array, 'date': time.time()}) return butter_js, butter_wasm_array - def _playwright_eval(self, jscode, goto=None, wait_until='commit', stop_loading=True): + def _playwright_eval(self, jscode, location='about:blank', body=''): async def __aeval(): async with async_playwright() as p: browser = await p.chromium.launch(chromium_sandbox=True) page = await browser.new_page() - if goto: - try: - start = time.time() - await page.goto(goto, wait_until=wait_until) - self.write_debug(f'{wait_until} loaded in {time.time() - start:.3f}s') - if stop_loading: - await page.evaluate('window.stop();') - except Exception as e: - self.report_warning(f'Failed to navigate to {goto}: {e}') - await browser.close() - return + # use page.route to skip network request while allowing changing window.location + await page.route('**', lambda route: route.fulfill(status=200, body=body)) + # mock navigator to mimic 
regular browser + await page.add_init_script('''const proxy = new Proxy(window.navigator, {get(target, prop, receiver) { + if (prop === "webdriver") return false; + if (prop === "appVersion" || prop === "userAgent") return target[prop].replace(/Headless/g, ''); + return target[prop]; + }}); + Object.defineProperty(window, "navigator", {get: ()=> proxy});''') + + def _page_eval_js(exp, timeout=10): + return asyncio.wait_for(page.evaluate(exp), timeout=timeout) try: + await page.goto(location) # always navigate once to trigger init script start = time.time() - value = await asyncio.wait_for(page.evaluate(jscode), timeout=10) + value = await _page_eval_js(jscode) self.write_debug(f'JS execution finished in {time.time() - start:.3f}s') + return value except asyncio.TimeoutError: self.report_warning('PlayWright JS evaluation timed out') - value = None finally: await browser.close() - return value try: return asyncio.run(__aeval()) except asyncio.InvalidStateError: - pass + self.report_warning('PlayWright failed to evaluate JS') def _calc_butter_token(self): butter_js, butter_wasm_array = self._get_butter_files() - butter_js = butter_js.replace('export{initSync};export default __wbg_init;', '') - butter_js = butter_js.replace('export class', 'class') - butter_js = butter_js.replace('new URL("smooth_like_butter_bg.wasm",import.meta.url)', '""') - - butter_js += ''';const proxy = new Proxy(window.navigator, {get(target, prop, receiver) { - if (prop == "webdriver") return false; - return target[prop]; - }}); - Object.defineProperty(window, "navigator", {get: ()=> proxy});''' + butter_js = re.sub(r'export(?:\s+default)?([\s{])', r'\1', butter_js) + butter_js = butter_js.replace('import.meta', '{}') butter_js += '''__new_init = async () => { const t = __wbg_get_imports(); @@ -141,8 +150,8 @@ def _calc_butter_token(self): butter_js += '__new_init().then(() => (new ButterFactory()).generate_butter())' - # The generator checks `navigator` and `location` to generate correct token 
- return self._playwright_eval(butter_js, goto='https://rplay.live/') + # The script checks `navigator.webdriver` and `location.origin` to generate correct token + return self._playwright_eval(butter_js, location='https://rplay.live') def get_butter_token(self): cache = self.cache.load('rplay', 'butter-token') or {} @@ -173,6 +182,7 @@ class RPlayVideoIE(RPlayBaseIE): 'uploader_id': '667adc9e9aa7f739a2158ff3', 'tags': ['杏都める', 'めいどるーちぇ', '無料', '耳舐め', 'ASMR'], }, + 'params': {'cachedir': False}, }, { 'url': 'https://rplay.live/play/660bee4fd3c1d09d69db6870/', 'info_dict': { @@ -196,17 +206,26 @@ class RPlayVideoIE(RPlayBaseIE): def _real_extract(self, url): video_id = self._match_id(url) - headers = {'Origin': 'https://rplay.live', 'Referer': 'https://rplay.live/'} + playlist_id = traverse_obj(parse_qs(url), ('playlist', ..., any)) + if playlist_id and self._yes_playlist(playlist_id, video_id): + playlist_info = self._download_json( + 'https://api.rplay.live/content/playlist', playlist_id, + query={'playlistOid': playlist_id, **self.requestor_query}, + headers=self.jwt_header, fatal=False) + if playlist_info: + entries = traverse_obj(playlist_info, ('contentData', ..., '_id', { + lambda x: self.url_result(f'https://rplay.live/play/{x}/', ie=RPlayVideoIE, video_id=x)})) + return self.playlist_result(entries, playlist_id, playlist_info.get('name')) + else: + self.report_warning('Failed to get playlist, downloading video only') + video_info = self._download_json('https://api.rplay.live/content', video_id, query={ 'contentOid': video_id, 'status': 'published', 'withComments': True, 'requestCanView': True, - **({ - 'requestorOid': self.user_id, - 'loginType': self.login_type, - } if self.user_id else {}), - }, headers={**headers, 'Authorization': self.jwt_token or 'null'}) + **self.requestor_query, + }, headers=self.jwt_header) if video_info.get('drm'): raise ExtractorError('This video is DRM-protected') @@ -222,37 +241,36 @@ def _real_extract(self, url): 'age_limit': 
(('hideContent', 'isAdultContent'), {lambda x: 18 if x else None}, any), }) - m3u8_url = traverse_obj(video_info, ('canView', 'url')) + m3u8_url = traverse_obj(video_info, ('canView', 'url', {url_or_none})) if not m3u8_url: msg = 'You do not have access to this video' if traverse_obj(video_info, ('viewableTiers', 'free')): - msg += '. This video requires a free subscription' + msg = 'This video requires a free subscription to access' if not self.user_id: msg += f'. {self._login_hint(method="password")}' - raise ExtractorError(msg) + raise ExtractorError(msg, expected=True) - thumbnail_key = traverse_obj(video_info, ('streamables', lambda _, v: v['type'].startswith('image/'), 's3key', any)) + thumbnail_key = traverse_obj(video_info, ( + 'streamables', lambda _, v: v['type'].startswith('image/'), 's3key', any)) if thumbnail_key: metainfo['thumbnail'] = url_or_none(self._download_webpage( 'https://api.rplay.live/upload/privateasset', video_id, 'getting cover url', query={ 'key': thumbnail_key, 'contentOid': video_id, 'creatorOid': metainfo.get('uploader_id'), - **({ - 'requestorOid': self.user_id, - 'loginType': self.login_type, - } if self.user_id else {}), + **self.requestor_query, }, fatal=False)) - formats = self._extract_m3u8_formats(m3u8_url, video_id, headers={**headers, 'Butter': self.get_butter_token()}) + formats = self._extract_m3u8_formats(m3u8_url, video_id, headers={ + 'Referer': 'https://rplay.live/', 'Butter': self.get_butter_token()}) for fmt in formats: - m3u8_doc = self._download_webpage(fmt['url'], video_id, 'getting m3u8 contents', - headers={**headers, 'Butter': self.get_butter_token()}) + m3u8_doc = self._download_webpage(fmt['url'], video_id, 'getting m3u8 contents', headers={ + 'Referer': 'https://rplay.live/', 'Butter': self.get_butter_token()}) fmt['url'] = encode_data_uri(m3u8_doc.encode(), 'application/x-mpegurl') match = re.search(r'^#EXT-X-KEY.*?URI="([^"]+)"', m3u8_doc, flags=re.M) if match: urlh = self._request_webpage(match[1], 
video_id, 'getting hls key', headers={ - **headers, + 'Referer': 'https://rplay.live/', 'rplay-private-content-requestor': self.user_id or 'not-logged-in', 'age': random.randint(1, 4999), }) @@ -262,11 +280,11 @@ def _real_extract(self, url): 'id': video_id, 'formats': formats, **metainfo, - 'http_headers': {'Origin': 'https://rplay.live', 'Referer': 'https://rplay.live/'}, + 'http_headers': {'Referer': 'https://rplay.live/'}, } -class RPlayUserIE(RPlayBaseIE): +class RPlayUserIE(InfoExtractor): _VALID_URL = r'https://rplay.live/(?Pc|creatorhome)/(?P[\d\w]+)/?(?:[#?]|$)' _TESTS = [{ 'url': 'https://rplay.live/creatorhome/667adc9e9aa7f739a2158ff3?page=contents', @@ -284,10 +302,6 @@ class RPlayUserIE(RPlayBaseIE): 'playlist_mincount': 77, }] - def _perform_login(self, username, password): - # This playlist extractor does not require login - return - def _real_extract(self, url): user_id, short = self._match_valid_url(url).group('id', 'short') key = 'customUrl' if short == 'c' else 'userOid' @@ -349,19 +363,15 @@ def _real_extract(self, url): if stream_state == 'youtube': return self.url_result(f'https://www.youtube.com/watch?v={live_info["liveStreamId"]}') elif stream_state == 'live': - if not self.user_id: + if not self.user_id and not live_info.get('allowAnonymous'): self.raise_login_required(method='password') key2 = self._download_webpage( 'https://api.rplay.live/live/key2', user_id, 'getting live key', - headers={'Authorization': self.jwt_token}, - query={ - 'requestorOid': self.user_id, - 'loginType': self.login_type, - }) - formats = self._extract_m3u8_formats('https://api.rplay.live/live/stream/playlist.m3u8', user_id, query={ - 'creatorOid': user_id, - 'key2': key2, - }) + headers=self.jwt_header, query=self.requestor_query) if self.user_id else '' + formats = self._extract_m3u8_formats( + 'https://api.rplay.live/live/stream/playlist.m3u8', user_id, + query={'creatorOid': user_id, 'key2': key2}) + return { 'id': user_id, 'formats': formats, From 
b0ee898da9817043a22fb0e041660130e25cbeb4 Mon Sep 17 00:00:00 2001 From: c-basalt <117849907+c-basalt@users.noreply.github.com> Date: Fri, 2 Aug 2024 16:53:30 -0400 Subject: [PATCH 09/48] deno --- yt_dlp/extractor/openload.py | 90 ++++++++++++++++++++++++++++++++++++ yt_dlp/extractor/rplay.py | 50 +++----------------- 2 files changed, 97 insertions(+), 43 deletions(-) diff --git a/yt_dlp/extractor/openload.py b/yt_dlp/extractor/openload.py index 2d56252b1..e14bd872b 100644 --- a/yt_dlp/extractor/openload.py +++ b/yt_dlp/extractor/openload.py @@ -6,6 +6,7 @@ import tempfile import urllib.parse +from .common import InfoExtractor from ..utils import ( ExtractorError, Popen, @@ -46,6 +47,95 @@ def cookie_jar_to_list(cookie_jar): return [cookie_to_dict(cookie) for cookie in cookie_jar] +class DenoWrapper: + """Deno wrapper class + + This class is experimental. + """ + + INSTALL_HINT = 'Please install deno following https://docs.deno.com/runtime/manual/getting_started/installation/ or download its binary from https://github.com/denoland/deno/releases' + _BASE_JS = ''' + delete window.Deno; + global = window; + const navProxy = new Proxy(window.navigator, { get: (target, prop, receiver) => ({ + appCodeName: 'Mozilla', + appName: 'Netscape', + appVersion: '5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.6533.17 Safari/537.36', + language: 'en', + languages: ['en'], + userAgent: 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.6533.17 Safari/537.36', + webdriver: false, + }[prop])}); + Object.defineProperty(window, "navigator", {get: () => navProxy}) + ''' + + @staticmethod + def _version(): + return get_exe_version('deno', version_re=r'([0-9.]+)') + + def __init__(self, extractor: InfoExtractor, required_version=None, timeout=10000): + self.extractor = extractor + self.timeout = timeout + + self.exe = check_executable('deno', ['-V']) + if not self.exe: + raise ExtractorError(f'Deno not 
found, {self.INSTALL_HINT}', expected=True) + if required_version: + if is_outdated_version(self._version(), required_version): + self.extractor.report_warning( + f'Deno is outdated, update it to version {required_version} or newer if you encounter any errors.') + + @contextlib.contextmanager + def _create_temp_js(self, jscode): + js_file = tempfile.NamedTemporaryFile('wt', encoding='utf-8', suffix='.js', delete=False) + try: + js_file.write(jscode) + js_file.close() + yield js_file + finally: + with contextlib.suppress(OSError): + os.remove(js_file.name) + + @staticmethod + def _location_js(location: str): + parsed = urllib.parse.urlparse(location) + return f''' + window.location = {{ + href: "{location}", + origin: "{parsed.scheme}://{parsed.netloc}", + host: "{parsed.netloc}", + hostname: "{parsed.netloc.split(':')[0]}", + hash: "{parsed.fragment}", + protocol: "{parsed.scheme}:", + }}; + ''' + + def execute(self, jscode, video_id=None, *, note='Executing JS', allow_net=None, location=None): + """Execute JS and return stdout""" + if location: + jscode = self._location_js(location) + jscode + + with self._create_temp_js(self._BASE_JS + jscode) as js_file: + self.extractor.to_screen(f'{format_field(video_id, None, "%s: ")}{note}') + + cmd = [self.exe, 'run', js_file.name] + if allow_net: + cmd.append('--allow-net' if isinstance(allow_net, bool) else f'--allow-net={allow_net}') + + self.extractor.write_debug(f'Deno command line: {shell_quote(cmd)}') + try: + stdout, stderr, returncode = Popen.run(cmd, timeout=self.timeout / 1000, text=True, + stdout=subprocess.PIPE, stderr=subprocess.PIPE) + except Exception as e: + raise ExtractorError(f'{note} failed: Unable to run Deno binary', cause=e) + if returncode: + raise ExtractorError(f'{note} failed with returncode {returncode}:\n{stderr}') + elif stderr: + self.extractor.report_warning(f'JS console error msg:\n{stderr.strip()}', video_id=video_id) + + return stdout.strip() + + class PhantomJSwrapper: """PhantomJS 
wrapper class diff --git a/yt_dlp/extractor/rplay.py b/yt_dlp/extractor/rplay.py index ab1d5c35f..fba261bc8 100644 --- a/yt_dlp/extractor/rplay.py +++ b/yt_dlp/extractor/rplay.py @@ -1,4 +1,3 @@ -import asyncio import base64 import datetime as dt import hashlib @@ -8,9 +7,8 @@ import re import time -from playwright.async_api import async_playwright - from .common import InfoExtractor +from .openload import DenoWrapper from ..utils import ( ExtractorError, UserNotLive, @@ -53,11 +51,11 @@ def requestor_query(self): def jwt_header(self): return { 'Referer': 'https://rplay.live/', - 'Authorization': self.jwt_token or 'null' + 'Authorization': self.jwt_token or 'null', } def _jwt_encode_hs256(self, payload: dict, key: str): - # ..utils.jwt_encode_hs256() uses slightly different details that would fails + # yt_dlp.utils.jwt_encode_hs256() uses slightly different details that would fails # and we need to re-implement it with minor changes b64encode = lambda x: base64.urlsafe_b64encode( json.dumps(x, separators=(',', ':')).encode()).strip(b'=') @@ -75,7 +73,6 @@ def _perform_login(self, username, password): 'iat': int(time.time()), } key = hashlib.sha256(password.encode()).hexdigest() - self._login_by_token(self._jwt_encode_hs256(payload, key).decode()) def _login_by_token(self, jwt_token): @@ -103,55 +100,22 @@ def _get_butter_files(self): self.cache.store('rplay', 'butter-code', {'js': butter_js, 'wasm': butter_wasm_array, 'date': time.time()}) return butter_js, butter_wasm_array - def _playwright_eval(self, jscode, location='about:blank', body=''): - async def __aeval(): - async with async_playwright() as p: - browser = await p.chromium.launch(chromium_sandbox=True) - page = await browser.new_page() - # use page.route to skip network request while allowing changing window.location - await page.route('**', lambda route: route.fulfill(status=200, body=body)) - # mock navigator to mimic regular browser - await page.add_init_script('''const proxy = new 
Proxy(window.navigator, {get(target, prop, receiver) { - if (prop === "webdriver") return false; - if (prop === "appVersion" || prop === "userAgent") return target[prop].replace(/Headless/g, ''); - return target[prop]; - }}); - Object.defineProperty(window, "navigator", {get: ()=> proxy});''') - - def _page_eval_js(exp, timeout=10): - return asyncio.wait_for(page.evaluate(exp), timeout=timeout) - try: - await page.goto(location) # always navigate once to trigger init script - start = time.time() - value = await _page_eval_js(jscode) - self.write_debug(f'JS execution finished in {time.time() - start:.3f}s') - return value - except asyncio.TimeoutError: - self.report_warning('PlayWright JS evaluation timed out') - finally: - await browser.close() - - try: - return asyncio.run(__aeval()) - except asyncio.InvalidStateError: - self.report_warning('PlayWright failed to evaluate JS') - def _calc_butter_token(self): butter_js, butter_wasm_array = self._get_butter_files() butter_js = re.sub(r'export(?:\s+default)?([\s{])', r'\1', butter_js) butter_js = butter_js.replace('import.meta', '{}') - butter_js += '''__new_init = async () => { + butter_js += '''const __new_init = async () => { const t = __wbg_get_imports(); __wbg_init_memory(t); const {module, instance} = await WebAssembly.instantiate(Uint8Array.from(%s), t); __wbg_finalize_init(instance, module); };''' % butter_wasm_array # noqa: UP031 - butter_js += '__new_init().then(() => (new ButterFactory()).generate_butter())' + butter_js += '__new_init().then(() => console.log((new ButterFactory()).generate_butter()));' - # The script checks `navigator.webdriver` and `location.origin` to generate correct token - return self._playwright_eval(butter_js, location='https://rplay.live') + jsi = DenoWrapper(self) + return jsi.execute(butter_js, location='https://rplay.live/') def get_butter_token(self): cache = self.cache.load('rplay', 'butter-token') or {} From 33bacc192b6d36115a4f6ad379aadb850772e255 Mon Sep 17 00:00:00 2001 
From: c-basalt <117849907+c-basalt@users.noreply.github.com> Date: Fri, 2 Aug 2024 16:57:43 -0400 Subject: [PATCH 10/48] reduce js --- yt_dlp/extractor/openload.py | 41 ++++-------------------------------- yt_dlp/extractor/rplay.py | 8 ++++++- 2 files changed, 11 insertions(+), 38 deletions(-) diff --git a/yt_dlp/extractor/openload.py b/yt_dlp/extractor/openload.py index e14bd872b..e1826b984 100644 --- a/yt_dlp/extractor/openload.py +++ b/yt_dlp/extractor/openload.py @@ -48,26 +48,7 @@ def cookie_jar_to_list(cookie_jar): class DenoWrapper: - """Deno wrapper class - - This class is experimental. - """ - INSTALL_HINT = 'Please install deno following https://docs.deno.com/runtime/manual/getting_started/installation/ or download its binary from https://github.com/denoland/deno/releases' - _BASE_JS = ''' - delete window.Deno; - global = window; - const navProxy = new Proxy(window.navigator, { get: (target, prop, receiver) => ({ - appCodeName: 'Mozilla', - appName: 'Netscape', - appVersion: '5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.6533.17 Safari/537.36', - language: 'en', - languages: ['en'], - userAgent: 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.6533.17 Safari/537.36', - webdriver: false, - }[prop])}); - Object.defineProperty(window, "navigator", {get: () => navProxy}) - ''' @staticmethod def _version(): @@ -96,26 +77,12 @@ def _create_temp_js(self, jscode): with contextlib.suppress(OSError): os.remove(js_file.name) - @staticmethod - def _location_js(location: str): - parsed = urllib.parse.urlparse(location) - return f''' - window.location = {{ - href: "{location}", - origin: "{parsed.scheme}://{parsed.netloc}", - host: "{parsed.netloc}", - hostname: "{parsed.netloc.split(':')[0]}", - hash: "{parsed.fragment}", - protocol: "{parsed.scheme}:", - }}; - ''' + def deno_execute(self, jscode, video_id=None, *, note='Executing JS in Deno', allow_net=None): + """Execute JS 
directly in Deno environment and return stdout""" - def execute(self, jscode, video_id=None, *, note='Executing JS', allow_net=None, location=None): - """Execute JS and return stdout""" - if location: - jscode = self._location_js(location) + jscode + base_js = 'delete window.Deno; global = window' - with self._create_temp_js(self._BASE_JS + jscode) as js_file: + with self._create_temp_js(base_js + jscode) as js_file: self.extractor.to_screen(f'{format_field(video_id, None, "%s: ")}{note}') cmd = [self.exe, 'run', js_file.name] diff --git a/yt_dlp/extractor/rplay.py b/yt_dlp/extractor/rplay.py index fba261bc8..18dbef92f 100644 --- a/yt_dlp/extractor/rplay.py +++ b/yt_dlp/extractor/rplay.py @@ -112,10 +112,16 @@ def _calc_butter_token(self): __wbg_finalize_init(instance, module); };''' % butter_wasm_array # noqa: UP031 + butter_js += '''const navProxy = new Proxy(window.navigator, { get: (target, prop, receiver) => { + if (prop === 'webdriver') return false; + return target[prop];}); + Object.defineProperty(window, "navigator", {get: () => navProxy}); + window.location = {origin: "https://rplay.live"};''' + butter_js += '__new_init().then(() => console.log((new ButterFactory()).generate_butter()));' jsi = DenoWrapper(self) - return jsi.execute(butter_js, location='https://rplay.live/') + return jsi.deno_execute(butter_js) def get_butter_token(self): cache = self.cache.load('rplay', 'butter-token') or {} From 1bab6aa3a387b7cfcd0b8eb345292f0569ac3e48 Mon Sep 17 00:00:00 2001 From: c-basalt <117849907+c-basalt@users.noreply.github.com> Date: Fri, 2 Aug 2024 17:01:20 -0400 Subject: [PATCH 11/48] fix --- yt_dlp/extractor/openload.py | 2 +- yt_dlp/extractor/rplay.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/yt_dlp/extractor/openload.py b/yt_dlp/extractor/openload.py index e1826b984..bcb5f6866 100644 --- a/yt_dlp/extractor/openload.py +++ b/yt_dlp/extractor/openload.py @@ -80,7 +80,7 @@ def _create_temp_js(self, jscode): def deno_execute(self, 
jscode, video_id=None, *, note='Executing JS in Deno', allow_net=None): """Execute JS directly in Deno environment and return stdout""" - base_js = 'delete window.Deno; global = window' + base_js = 'delete window.Deno; global = window;' with self._create_temp_js(base_js + jscode) as js_file: self.extractor.to_screen(f'{format_field(video_id, None, "%s: ")}{note}') diff --git a/yt_dlp/extractor/rplay.py b/yt_dlp/extractor/rplay.py index 18dbef92f..9887a25b6 100644 --- a/yt_dlp/extractor/rplay.py +++ b/yt_dlp/extractor/rplay.py @@ -114,7 +114,7 @@ def _calc_butter_token(self): butter_js += '''const navProxy = new Proxy(window.navigator, { get: (target, prop, receiver) => { if (prop === 'webdriver') return false; - return target[prop];}); + return target[prop];}}); Object.defineProperty(window, "navigator", {get: () => navProxy}); window.location = {origin: "https://rplay.live"};''' From 532c6b2eb59272338f2e16bb53903b2345514a4a Mon Sep 17 00:00:00 2001 From: c-basalt <117849907+c-basalt@users.noreply.github.com> Date: Fri, 2 Aug 2024 17:21:48 -0400 Subject: [PATCH 12/48] jitless --- yt_dlp/extractor/openload.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/yt_dlp/extractor/openload.py b/yt_dlp/extractor/openload.py index bcb5f6866..b465de2db 100644 --- a/yt_dlp/extractor/openload.py +++ b/yt_dlp/extractor/openload.py @@ -77,7 +77,7 @@ def _create_temp_js(self, jscode): with contextlib.suppress(OSError): os.remove(js_file.name) - def deno_execute(self, jscode, video_id=None, *, note='Executing JS in Deno', allow_net=None): + def deno_execute(self, jscode, video_id=None, *, note='Executing JS in Deno', allow_net=None, jit_less=True): """Execute JS directly in Deno environment and return stdout""" base_js = 'delete window.Deno; global = window;' @@ -88,6 +88,8 @@ def deno_execute(self, jscode, video_id=None, *, note='Executing JS in Deno', al cmd = [self.exe, 'run', js_file.name] if allow_net: cmd.append('--allow-net' if isinstance(allow_net, bool) 
else f'--allow-net={allow_net}') + if jit_less: + cmd.append('--v8-flags=--jitless') self.extractor.write_debug(f'Deno command line: {shell_quote(cmd)}') try: From e13c4573d95f6a86df8704a0b676cb0df84d527c Mon Sep 17 00:00:00 2001 From: c-basalt <117849907+c-basalt@users.noreply.github.com> Date: Fri, 2 Aug 2024 19:35:28 -0400 Subject: [PATCH 13/48] update --- yt_dlp/extractor/douyutv.py | 13 +++++++--- yt_dlp/extractor/openload.py | 49 +++++++++++++++++++++--------------- yt_dlp/extractor/rplay.py | 2 +- 3 files changed, 39 insertions(+), 25 deletions(-) diff --git a/yt_dlp/extractor/douyutv.py b/yt_dlp/extractor/douyutv.py index e36eac919..8932f914e 100644 --- a/yt_dlp/extractor/douyutv.py +++ b/yt_dlp/extractor/douyutv.py @@ -4,7 +4,7 @@ import uuid from .common import InfoExtractor -from .openload import PhantomJSwrapper +from .openload import DenoWrapper, PhantomJSwrapper from ..utils import ( ExtractorError, UserNotLive, @@ -43,9 +43,14 @@ def _calc_sign(self, sign_func, video_id, a): b = uuid.uuid4().hex c = round(time.time()) js_script = f'{self._get_cryptojs_md5(video_id)};{sign_func};console.log(ub98484234("{a}","{b}","{c}"))' - phantom = PhantomJSwrapper(self) - result = phantom.execute(js_script, video_id, - note='Executing JS signing script').strip() + if DenoWrapper.is_available: + jsi = DenoWrapper(self) + elif PhantomJSwrapper.is_available: + jsi = PhantomJSwrapper(self) + else: + raise ExtractorError('You need to install either Deno or PhantomJS. ' + f'{DenoWrapper.INSTALL_HINT}. 
{PhantomJSwrapper.INSTALL_HINT}', expected=True) + result = jsi.execute(js_script, video_id, note='Executing JS signing script').strip() return {i: v[0] for i, v in urllib.parse.parse_qs(result).items()} def _search_js_sign_func(self, webpage, fatal=True): diff --git a/yt_dlp/extractor/openload.py b/yt_dlp/extractor/openload.py index b465de2db..525be0e6d 100644 --- a/yt_dlp/extractor/openload.py +++ b/yt_dlp/extractor/openload.py @@ -10,7 +10,7 @@ from ..utils import ( ExtractorError, Popen, - check_executable, + classproperty, format_field, get_exe_version, is_outdated_version, @@ -47,22 +47,32 @@ def cookie_jar_to_list(cookie_jar): return [cookie_to_dict(cookie) for cookie in cookie_jar] -class DenoWrapper: - INSTALL_HINT = 'Please install deno following https://docs.deno.com/runtime/manual/getting_started/installation/ or download its binary from https://github.com/denoland/deno/releases' +class ExternalJSI: + @classproperty(cache=True) + def version(cls): + return get_exe_version(cls.EXE_NAME, args=getattr(cls, 'V_ARGS', ['--version']), version_re=r'([0-9.]+)') - @staticmethod - def _version(): - return get_exe_version('deno', version_re=r'([0-9.]+)') + @classproperty + def exe(cls): + return cls.EXE_NAME if cls.version else None + + @classproperty + def is_available(cls): + return bool(cls.exe) + + +class DenoWrapper(ExternalJSI): + EXE_NAME = 'deno' + INSTALL_HINT = 'Please install deno following https://docs.deno.com/runtime/manual/getting_started/installation/ or download its binary from https://github.com/denoland/deno/releases' def __init__(self, extractor: InfoExtractor, required_version=None, timeout=10000): self.extractor = extractor self.timeout = timeout - self.exe = check_executable('deno', ['-V']) if not self.exe: raise ExtractorError(f'Deno not found, {self.INSTALL_HINT}', expected=True) if required_version: - if is_outdated_version(self._version(), required_version): + if is_outdated_version(self.version, required_version): 
self.extractor.report_warning( f'Deno is outdated, update it to version {required_version} or newer if you encounter any errors.') @@ -77,10 +87,11 @@ def _create_temp_js(self, jscode): with contextlib.suppress(OSError): os.remove(js_file.name) - def deno_execute(self, jscode, video_id=None, *, note='Executing JS in Deno', allow_net=None, jit_less=True): - """Execute JS directly in Deno environment and return stdout""" + def execute(self, jscode, video_id=None, *, note='Executing JS in Deno', + allow_net=None, jit_less=True, base_js=None): + """Execute JS directly in Deno runtime and return stdout""" - base_js = 'delete window.Deno; global = window;' + base_js = base_js if base_js is not None else 'delete window.Deno; global = window;' with self._create_temp_js(base_js + jscode) as js_file: self.extractor.to_screen(f'{format_field(video_id, None, "%s: ")}{note}') @@ -105,13 +116,13 @@ def deno_execute(self, jscode, video_id=None, *, note='Executing JS in Deno', al return stdout.strip() -class PhantomJSwrapper: +class PhantomJSwrapper(ExternalJSI): """PhantomJS wrapper class This class is experimental. 
""" - - INSTALL_HINT = 'Please download it from https://phantomjs.org/download.html' + EXE_NAME = 'phantomjs' + INSTALL_HINT = 'Please download PhantomJS from https://phantomjs.org/download.html' _BASE_JS = R''' phantom.onError = function(msg, trace) {{ @@ -162,22 +173,20 @@ class PhantomJSwrapper: _TMP_FILE_NAMES = ['script', 'html', 'cookies'] - @staticmethod - def _version(): - return get_exe_version('phantomjs', version_re=r'([0-9.]+)') + @classmethod + def _version(cls): + return cls.version def __init__(self, extractor, required_version=None, timeout=10000): self._TMP_FILES = {} - self.exe = check_executable('phantomjs', ['-v']) if not self.exe: raise ExtractorError(f'PhantomJS not found, {self.INSTALL_HINT}', expected=True) self.extractor = extractor if required_version: - version = self._version() - if is_outdated_version(version, required_version): + if is_outdated_version(self.version, required_version): self.extractor._downloader.report_warning( 'Your copy of PhantomJS is outdated, update it to version ' f'{required_version} or newer if you encounter any errors.') diff --git a/yt_dlp/extractor/rplay.py b/yt_dlp/extractor/rplay.py index 9887a25b6..0b9a00e68 100644 --- a/yt_dlp/extractor/rplay.py +++ b/yt_dlp/extractor/rplay.py @@ -121,7 +121,7 @@ def _calc_butter_token(self): butter_js += '__new_init().then(() => console.log((new ButterFactory()).generate_butter()));' jsi = DenoWrapper(self) - return jsi.deno_execute(butter_js) + return jsi.execute(butter_js) def get_butter_token(self): cache = self.cache.load('rplay', 'butter-token') or {} From 6327703ea81ce94e652541c5f2b2648c308bec88 Mon Sep 17 00:00:00 2001 From: c-basalt <117849907+c-basalt@users.noreply.github.com> Date: Thu, 8 Aug 2024 11:07:27 -0400 Subject: [PATCH 14/48] creating external --- test/test_jsinterp.py | 2 +- yt_dlp/YoutubeDL.py | 2 +- yt_dlp/extractor/douyutv.py | 2 +- yt_dlp/extractor/iqiyi.py | 2 +- yt_dlp/extractor/pornhub.py | 2 +- yt_dlp/extractor/rplay.py | 6 +- 
yt_dlp/extractor/youtube.py | 2 +- yt_dlp/jsinterp/__init__.py | 9 ++ .../openload.py => jsinterp/external.py} | 119 ++++++++++++++---- yt_dlp/{ => jsinterp}/jsinterp.py | 2 +- 10 files changed, 114 insertions(+), 34 deletions(-) create mode 100644 yt_dlp/jsinterp/__init__.py rename yt_dlp/{extractor/openload.py => jsinterp/external.py} (73%) rename yt_dlp/{ => jsinterp}/jsinterp.py (99%) diff --git a/test/test_jsinterp.py b/test/test_jsinterp.py index df92c8315..34b145959 100644 --- a/test/test_jsinterp.py +++ b/test/test_jsinterp.py @@ -9,7 +9,7 @@ import math -from yt_dlp.jsinterp import JS_Undefined, JSInterpreter +from yt_dlp.jsinterp.jsinterp import JS_Undefined, JSInterpreter class NaN: diff --git a/yt_dlp/YoutubeDL.py b/yt_dlp/YoutubeDL.py index 9691a1ea7..6ddf3f51b 100644 --- a/yt_dlp/YoutubeDL.py +++ b/yt_dlp/YoutubeDL.py @@ -32,7 +32,7 @@ from .downloader.rtmp import rtmpdump_version from .extractor import gen_extractor_classes, get_info_extractor from .extractor.common import UnsupportedURLIE -from .extractor.openload import PhantomJSwrapper +from .jsinterp import PhantomJSwrapper from .minicurses import format_text from .networking import HEADRequest, Request, RequestDirector from .networking.common import _REQUEST_HANDLERS, _RH_PREFERENCES diff --git a/yt_dlp/extractor/douyutv.py b/yt_dlp/extractor/douyutv.py index 8932f914e..6417c63a5 100644 --- a/yt_dlp/extractor/douyutv.py +++ b/yt_dlp/extractor/douyutv.py @@ -4,7 +4,7 @@ import uuid from .common import InfoExtractor -from .openload import DenoWrapper, PhantomJSwrapper +from ..jsinterp import DenoWrapper, PhantomJSwrapper from ..utils import ( ExtractorError, UserNotLive, diff --git a/yt_dlp/extractor/iqiyi.py b/yt_dlp/extractor/iqiyi.py index 735b44637..18ee3d2de 100644 --- a/yt_dlp/extractor/iqiyi.py +++ b/yt_dlp/extractor/iqiyi.py @@ -5,7 +5,7 @@ import urllib.parse from .common import InfoExtractor -from .openload import PhantomJSwrapper +from ..jsinterp import PhantomJSwrapper from ..utils 
import ( ExtractorError, clean_html, diff --git a/yt_dlp/extractor/pornhub.py b/yt_dlp/extractor/pornhub.py index 679dc6323..6e67265a0 100644 --- a/yt_dlp/extractor/pornhub.py +++ b/yt_dlp/extractor/pornhub.py @@ -5,7 +5,7 @@ import re from .common import InfoExtractor -from .openload import PhantomJSwrapper +from ..jsinterp import PhantomJSwrapper from ..networking import Request from ..networking.exceptions import HTTPError from ..utils import ( diff --git a/yt_dlp/extractor/rplay.py b/yt_dlp/extractor/rplay.py index 0b9a00e68..98bba2364 100644 --- a/yt_dlp/extractor/rplay.py +++ b/yt_dlp/extractor/rplay.py @@ -8,7 +8,7 @@ import time from .common import InfoExtractor -from .openload import DenoWrapper +from ..jsinterp import DenoWrapper from ..utils import ( ExtractorError, UserNotLive, @@ -121,7 +121,7 @@ def _calc_butter_token(self): butter_js += '__new_init().then(() => console.log((new ButterFactory()).generate_butter()));' jsi = DenoWrapper(self) - return jsi.execute(butter_js) + return jsi.execute(butter_js, jit_less=False) def get_butter_token(self): cache = self.cache.load('rplay', 'butter-token') or {} @@ -229,7 +229,7 @@ def _real_extract(self, url): 'contentOid': video_id, 'creatorOid': metainfo.get('uploader_id'), **self.requestor_query, - }, fatal=False)) + }, errnote='Failed to get thumbnail url', fatal=False)) formats = self._extract_m3u8_formats(m3u8_url, video_id, headers={ 'Referer': 'https://rplay.live/', 'Butter': self.get_butter_token()}) diff --git a/yt_dlp/extractor/youtube.py b/yt_dlp/extractor/youtube.py index 53aca3816..257e9767d 100644 --- a/yt_dlp/extractor/youtube.py +++ b/yt_dlp/extractor/youtube.py @@ -20,7 +20,7 @@ import urllib.parse from .common import InfoExtractor, SearchInfoExtractor -from .openload import PhantomJSwrapper +from ..jsinterp import PhantomJSwrapper from ..jsinterp import JSInterpreter from ..networking.exceptions import HTTPError, network_exceptions from ..utils import ( diff --git a/yt_dlp/jsinterp/__init__.py 
b/yt_dlp/jsinterp/__init__.py new file mode 100644 index 000000000..944d73fca --- /dev/null +++ b/yt_dlp/jsinterp/__init__.py @@ -0,0 +1,9 @@ +from .jsinterp import JSInterpreter +from .external import PhantomJSwrapper, DenoWrapper + + +__all__ = [ + JSInterpreter, + PhantomJSwrapper, + DenoWrapper, +] diff --git a/yt_dlp/extractor/openload.py b/yt_dlp/jsinterp/external.py similarity index 73% rename from yt_dlp/extractor/openload.py rename to yt_dlp/jsinterp/external.py index 525be0e6d..1eb21701c 100644 --- a/yt_dlp/extractor/openload.py +++ b/yt_dlp/jsinterp/external.py @@ -6,7 +6,7 @@ import tempfile import urllib.parse -from .common import InfoExtractor +from ..extractor.common import InfoExtractor from ..utils import ( ExtractorError, Popen, @@ -47,14 +47,29 @@ def cookie_jar_to_list(cookie_jar): return [cookie_to_dict(cookie) for cookie in cookie_jar] +@contextlib.contextmanager +def _temp_file(content, *, mode='wt', encoding='utf-8', suffix=None, close=True): + if 'r' in mode: + encoding = None + temp_file_handle = tempfile.NamedTemporaryFile(mode, encoding=encoding, suffix=suffix, delete=False) + try: + temp_file_handle.write(content) + if close: + temp_file_handle.close() + yield temp_file_handle + finally: + with contextlib.suppress(OSError): + os.remove(temp_file_handle.name) + + class ExternalJSI: @classproperty(cache=True) def version(cls): - return get_exe_version(cls.EXE_NAME, args=getattr(cls, 'V_ARGS', ['--version']), version_re=r'([0-9.]+)') + return get_exe_version(cls._EXE_NAME, args=getattr(cls, 'V_ARGS', ['--version']), version_re=r'([0-9.]+)') @classproperty def exe(cls): - return cls.EXE_NAME if cls.version else None + return cls._EXE_NAME if cls.version else None @classproperty def is_available(cls): @@ -62,7 +77,7 @@ def is_available(cls): class DenoWrapper(ExternalJSI): - EXE_NAME = 'deno' + _EXE_NAME = 'deno' INSTALL_HINT = 'Please install deno following https://docs.deno.com/runtime/manual/getting_started/installation/ or download its 
binary from https://github.com/denoland/deno/releases' def __init__(self, extractor: InfoExtractor, required_version=None, timeout=10000): @@ -76,31 +91,19 @@ def __init__(self, extractor: InfoExtractor, required_version=None, timeout=1000 self.extractor.report_warning( f'Deno is outdated, update it to version {required_version} or newer if you encounter any errors.') - @contextlib.contextmanager - def _create_temp_js(self, jscode): - js_file = tempfile.NamedTemporaryFile('wt', encoding='utf-8', suffix='.js', delete=False) - try: - js_file.write(jscode) - js_file.close() - yield js_file - finally: - with contextlib.suppress(OSError): - os.remove(js_file.name) - - def execute(self, jscode, video_id=None, *, note='Executing JS in Deno', - allow_net=None, jit_less=True, base_js=None): + def execute(self, jscode, video_id=None, *, note='Executing JS in Deno', flags=[], jit_less=True, base_js=None): """Execute JS directly in Deno runtime and return stdout""" base_js = base_js if base_js is not None else 'delete window.Deno; global = window;' - with self._create_temp_js(base_js + jscode) as js_file: - self.extractor.to_screen(f'{format_field(video_id, None, "%s: ")}{note}') + with _temp_file(base_js + jscode, suffix='.js') as js_file: + if note: + self.extractor.to_screen(f'{format_field(video_id, None, "%s: ")}{note}') - cmd = [self.exe, 'run', js_file.name] - if allow_net: - cmd.append('--allow-net' if isinstance(allow_net, bool) else f'--allow-net={allow_net}') + cmd = [self.exe, 'run'] + flags if jit_less: cmd.append('--v8-flags=--jitless') + cmd.append(js_file.name) self.extractor.write_debug(f'Deno command line: {shell_quote(cmd)}') try: @@ -116,12 +119,80 @@ def execute(self, jscode, video_id=None, *, note='Executing JS in Deno', return stdout.strip() +class PuppeteerWrapper: + version = '16.2.0' + _HEADLESS = False + + @classproperty + def is_available(cls): + return DenoWrapper.is_available + + @classproperty + def INSTALL_HINT(cls): + msg = 'Run "deno run -A 
https://deno.land/x/puppeteer@16.2.0/install.ts" to install puppeteer' + if not DenoWrapper.is_available: + msg = f'{DenoWrapper.INSTALL_HINT}. Then {msg}' + return msg + + def __init__(self, extractor: InfoExtractor, required_version=None, timeout=10000): + self.deno = DenoWrapper(extractor, timeout=(timeout + 30000)) + self.timeout = timeout + self.extractor = extractor + + if required_version: + self.extractor.report_warning(f'required_version is not supported on {self.__class__.__name__}') + + def _deno_execute(self, jscode, note=None): + return self.deno.execute(f''' + import puppeteer from "https://deno.land/x/puppeteer@16.2.0/mod.ts"; + const browser = await puppeteer.launch({{ + headless: {json.dumps(bool(self._HEADLESS))}, args: ["--disable-web-security"]}}); + try {{ + {jscode} + }} finally {{ + await browser.close(); + }}''', note=note, flags=['--allow-all'], jit_less=False, base_js='') + + def evaluate(self, jscode, video_id=None, note='Executing JS in Puppeteer', url='about:blank'): + self.extractor.to_screen(f'{format_field(video_id, None, "%s: ")}{note}') + return json.loads(self._deno_execute(f''' + const page = await browser.newPage(); + window.setTimeout(async () => {{ + console.error('Puppeteer execution timed out'); + await browser.close(); + Deno.exit(1); + }}, {int(self.timeout)}); + page.resourceTimeout = {int(self.timeout)}; + + await page.setRequestInterception(true); + page.on("request", request => request.abort()); + + const url = {json.dumps(str(url))}; + await page.evaluate(`window.history.replaceState('', '', ${{JSON.stringify(url)}})`); + + console.log(JSON.stringify(await page.evaluate({json.dumps(str(jscode))}))); + await browser.close(); + Deno.exit(0); + ''')) + + def execute(self, jscode, **args): + return self.evaluate(''' + (() => {{ + const results = []; + const origConsole = console; + const console = new Proxy(console, { get: (target, prop, receiver) => { + if (prop === 'log') return (...data) => data.forEach(i => 
results.push(i)); + return target[prop]}}) + }})(); + ''') + + class PhantomJSwrapper(ExternalJSI): """PhantomJS wrapper class This class is experimental. """ - EXE_NAME = 'phantomjs' + _EXE_NAME = 'phantomjs' INSTALL_HINT = 'Please download PhantomJS from https://phantomjs.org/download.html' _BASE_JS = R''' @@ -288,7 +359,7 @@ def get(self, url, html=None, video_id=None, note=None, note2='Executing JS on w return html, stdout - def execute(self, jscode, video_id=None, *, note='Executing JS'): + def execute(self, jscode, video_id=None, *, note='Executing JS in PhantomJS'): """Execute JS and return stdout""" if 'phantom.exit();' not in jscode: jscode += ';\nphantom.exit();' diff --git a/yt_dlp/jsinterp.py b/yt_dlp/jsinterp/jsinterp.py similarity index 99% rename from yt_dlp/jsinterp.py rename to yt_dlp/jsinterp/jsinterp.py index 851d4dc7b..8c2b5a725 100644 --- a/yt_dlp/jsinterp.py +++ b/yt_dlp/jsinterp/jsinterp.py @@ -6,7 +6,7 @@ import operator import re -from .utils import ( +from ..utils import ( NO_DEFAULT, ExtractorError, function_with_repr, From a2d56eb13c1577a8982fd4309dc8c570ac54419d Mon Sep 17 00:00:00 2001 From: c-basalt <117849907+c-basalt@users.noreply.github.com> Date: Thu, 8 Aug 2024 12:20:19 -0400 Subject: [PATCH 15/48] remove extractor --- yt_dlp/extractor/_extractors.py | 5 - yt_dlp/extractor/rplay.py | 364 -------------------------------- 2 files changed, 369 deletions(-) delete mode 100644 yt_dlp/extractor/rplay.py diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py index 8d499a974..d2140bc52 100644 --- a/yt_dlp/extractor/_extractors.py +++ b/yt_dlp/extractor/_extractors.py @@ -1726,11 +1726,6 @@ RozhlasIE, RozhlasVltavaIE, ) -from .rplay import ( - RPlayLiveIE, - RPlayUserIE, - RPlayVideoIE, -) from .rte import ( RteIE, RteRadioIE, diff --git a/yt_dlp/extractor/rplay.py b/yt_dlp/extractor/rplay.py deleted file mode 100644 index 98bba2364..000000000 --- a/yt_dlp/extractor/rplay.py +++ /dev/null @@ -1,364 +0,0 @@ -import 
base64 -import datetime as dt -import hashlib -import hmac -import json -import random -import re -import time - -from .common import InfoExtractor -from ..jsinterp import DenoWrapper -from ..utils import ( - ExtractorError, - UserNotLive, - encode_data_uri, - float_or_none, - parse_iso8601, - parse_qs, - traverse_obj, - url_or_none, -) - - -class RPlayBaseIE(InfoExtractor): - _NETRC_MACHINE = 'rplaylive' - _TOKEN_CACHE = {} - _user_id = None - _login_type = None - _jwt_token = None - - @property - def user_id(self): - return self._user_id - - @property - def login_type(self): - return self._login_type - - @property - def jwt_token(self): - return self._jwt_token - - @property - def requestor_query(self): - return { - 'requestorOid': self.user_id, - 'loginType': self.login_type, - } if self.user_id else {} - - @property - def jwt_header(self): - return { - 'Referer': 'https://rplay.live/', - 'Authorization': self.jwt_token or 'null', - } - - def _jwt_encode_hs256(self, payload: dict, key: str): - # yt_dlp.utils.jwt_encode_hs256() uses slightly different details that would fails - # and we need to re-implement it with minor changes - b64encode = lambda x: base64.urlsafe_b64encode( - json.dumps(x, separators=(',', ':')).encode()).strip(b'=') - - header_b64 = b64encode({'alg': 'HS256', 'typ': 'JWT'}) - payload_b64 = b64encode(payload) - h = hmac.new(key.encode(), header_b64 + b'.' + payload_b64, hashlib.sha256) - signature_b64 = base64.urlsafe_b64encode(h.digest()).strip(b'=') - return header_b64 + b'.' + payload_b64 + b'.' 
+ signature_b64 - - def _perform_login(self, username, password): - payload = { - 'eml': username, - 'dat': dt.datetime.now(dt.timezone.utc).isoformat(timespec='milliseconds').replace('+00:00', 'Z'), - 'iat': int(time.time()), - } - key = hashlib.sha256(password.encode()).hexdigest() - self._login_by_token(self._jwt_encode_hs256(payload, key).decode()) - - def _login_by_token(self, jwt_token): - user_info = self._download_json( - 'https://api.rplay.live/account/login', 'login', note='performing login', errnote='Failed to login', - data=f'{{"token":"{jwt_token}","loginType":null,"checkAdmin":null}}'.encode(), - headers={'Content-Type': 'application/json', 'Authorization': 'null'}, fatal=False) - - if user_info: - self._user_id = traverse_obj(user_info, 'oid') - self._login_type = traverse_obj(user_info, 'accountType') - self._jwt_token = jwt_token if self._user_id else None - if not self._user_id: - self.report_warning('Failed to login, possibly due to wrong password or website change') - - def _get_butter_files(self): - cache = self.cache.load('rplay', 'butter-code') or {} - if cache.get('date', 0) > time.time() - 86400: - return cache['js'], cache['wasm'] - butter_js = self._download_webpage( - 'https://pb.rplay.live/kr/public/smooth_like_butter.js', 'butter', 'getting butter-sign js') - urlh = self._request_webpage( - 'https://pb.rplay.live/kr/public/smooth_like_butter_bg.wasm', 'butter', 'getting butter-sign wasm') - butter_wasm_array = list(urlh.read()) - self.cache.store('rplay', 'butter-code', {'js': butter_js, 'wasm': butter_wasm_array, 'date': time.time()}) - return butter_js, butter_wasm_array - - def _calc_butter_token(self): - butter_js, butter_wasm_array = self._get_butter_files() - butter_js = re.sub(r'export(?:\s+default)?([\s{])', r'\1', butter_js) - butter_js = butter_js.replace('import.meta', '{}') - - butter_js += '''const __new_init = async () => { - const t = __wbg_get_imports(); - __wbg_init_memory(t); - const {module, instance} = await 
WebAssembly.instantiate(Uint8Array.from(%s), t); - __wbg_finalize_init(instance, module); - };''' % butter_wasm_array # noqa: UP031 - - butter_js += '''const navProxy = new Proxy(window.navigator, { get: (target, prop, receiver) => { - if (prop === 'webdriver') return false; - return target[prop];}}); - Object.defineProperty(window, "navigator", {get: () => navProxy}); - window.location = {origin: "https://rplay.live"};''' - - butter_js += '__new_init().then(() => console.log((new ButterFactory()).generate_butter()));' - - jsi = DenoWrapper(self) - return jsi.execute(butter_js, jit_less=False) - - def get_butter_token(self): - cache = self.cache.load('rplay', 'butter-token') or {} - timestamp = str(int(time.time() / 360)) - if cache.get(timestamp): - return cache[timestamp] - token = self._calc_butter_token() - self.cache.store('rplay', 'butter-token', {timestamp: token}) - return token - - -class RPlayVideoIE(RPlayBaseIE): - _VALID_URL = r'https://rplay.live/play/(?P[\d\w]+)' - _TESTS = [{ - 'url': 'https://rplay.live/play/669203d25223214e67579dc3/', - 'info_dict': { - 'id': '669203d25223214e67579dc3', - 'ext': 'mp4', - 'title': 'md5:6ab0a76410b40b1f5fb48a2ad7571264', - 'description': 'md5:d2fb2f74a623be439cf454df5ff3344a', - 'timestamp': 1720845266, - 'upload_date': '20240713', - 'release_timestamp': 1720846360, - 'release_date': '20240713', - 'duration': 5349.0, - 'thumbnail': r're:https://[\w\d]+.cloudfront.net/.*', - 'uploader': '杏都める', - 'uploader_id': '667adc9e9aa7f739a2158ff3', - 'tags': ['杏都める', 'めいどるーちぇ', '無料', '耳舐め', 'ASMR'], - }, - 'params': {'cachedir': False}, - }, { - 'url': 'https://rplay.live/play/660bee4fd3c1d09d69db6870/', - 'info_dict': { - 'id': '660bee4fd3c1d09d69db6870', - 'ext': 'mp4', - 'title': 'md5:7de162a0f1c2266ec428234620a124fc', - 'description': 'md5:c6d12cc8110b748d5588d5f00787cd35', - 'timestamp': 1712057935, - 'upload_date': '20240402', - 'release_timestamp': 1712061900, - 'release_date': '20240402', - 'duration': 6791.0, - 
'thumbnail': r're:https://[\w\d]+.cloudfront.net/.*', - 'uploader': '狐月れんげ', - 'uploader_id': '65eeb4b237043dc0b5654f86', - 'tags': 'count:10', - 'age_limit': 18, - }, - }] - - def _real_extract(self, url): - video_id = self._match_id(url) - - playlist_id = traverse_obj(parse_qs(url), ('playlist', ..., any)) - if playlist_id and self._yes_playlist(playlist_id, video_id): - playlist_info = self._download_json( - 'https://api.rplay.live/content/playlist', playlist_id, - query={'playlistOid': playlist_id, **self.requestor_query}, - headers=self.jwt_header, fatal=False) - if playlist_info: - entries = traverse_obj(playlist_info, ('contentData', ..., '_id', { - lambda x: self.url_result(f'https://rplay.live/play/{x}/', ie=RPlayVideoIE, video_id=x)})) - return self.playlist_result(entries, playlist_id, playlist_info.get('name')) - else: - self.report_warning('Failed to get playlist, downloading video only') - - video_info = self._download_json('https://api.rplay.live/content', video_id, query={ - 'contentOid': video_id, - 'status': 'published', - 'withComments': True, - 'requestCanView': True, - **self.requestor_query, - }, headers=self.jwt_header) - if video_info.get('drm'): - raise ExtractorError('This video is DRM-protected') - - metainfo = traverse_obj(video_info, { - 'title': ('title', {str}), - 'description': ('introText', {str}), - 'release_timestamp': ('publishedAt', {parse_iso8601}), - 'timestamp': ('createdAt', {parse_iso8601}), - 'duration': ('length', {float_or_none}), - 'uploader': ('nickname', {str}), - 'uploader_id': ('creatorOid', {str}), - 'tags': ('hashtags', lambda _, v: v[0] != '_'), - 'age_limit': (('hideContent', 'isAdultContent'), {lambda x: 18 if x else None}, any), - }) - - m3u8_url = traverse_obj(video_info, ('canView', 'url', {url_or_none})) - if not m3u8_url: - msg = 'You do not have access to this video' - if traverse_obj(video_info, ('viewableTiers', 'free')): - msg = 'This video requires a free subscription to access' - if not self.user_id: 
- msg += f'. {self._login_hint(method="password")}' - raise ExtractorError(msg, expected=True) - - thumbnail_key = traverse_obj(video_info, ( - 'streamables', lambda _, v: v['type'].startswith('image/'), 's3key', any)) - if thumbnail_key: - metainfo['thumbnail'] = url_or_none(self._download_webpage( - 'https://api.rplay.live/upload/privateasset', video_id, 'getting cover url', query={ - 'key': thumbnail_key, - 'contentOid': video_id, - 'creatorOid': metainfo.get('uploader_id'), - **self.requestor_query, - }, errnote='Failed to get thumbnail url', fatal=False)) - - formats = self._extract_m3u8_formats(m3u8_url, video_id, headers={ - 'Referer': 'https://rplay.live/', 'Butter': self.get_butter_token()}) - for fmt in formats: - m3u8_doc = self._download_webpage(fmt['url'], video_id, 'getting m3u8 contents', headers={ - 'Referer': 'https://rplay.live/', 'Butter': self.get_butter_token()}) - fmt['url'] = encode_data_uri(m3u8_doc.encode(), 'application/x-mpegurl') - match = re.search(r'^#EXT-X-KEY.*?URI="([^"]+)"', m3u8_doc, flags=re.M) - if match: - urlh = self._request_webpage(match[1], video_id, 'getting hls key', headers={ - 'Referer': 'https://rplay.live/', - 'rplay-private-content-requestor': self.user_id or 'not-logged-in', - 'age': random.randint(1, 4999), - }) - fmt['hls_aes'] = {'key': urlh.read().hex()} - - return { - 'id': video_id, - 'formats': formats, - **metainfo, - 'http_headers': {'Referer': 'https://rplay.live/'}, - } - - -class RPlayUserIE(InfoExtractor): - _VALID_URL = r'https://rplay.live/(?Pc|creatorhome)/(?P[\d\w]+)/?(?:[#?]|$)' - _TESTS = [{ - 'url': 'https://rplay.live/creatorhome/667adc9e9aa7f739a2158ff3?page=contents', - 'info_dict': { - 'id': '667adc9e9aa7f739a2158ff3', - 'title': '杏都める', - }, - 'playlist_mincount': 34, - }, { - 'url': 'https://rplay.live/c/furachi?page=contents', - 'info_dict': { - 'id': '65e07e60850f4527aab74757', - 'title': '逢瀬ふらち OuseFurachi', - }, - 'playlist_mincount': 77, - }] - - def _real_extract(self, url): - 
user_id, short = self._match_valid_url(url).group('id', 'short') - key = 'customUrl' if short == 'c' else 'userOid' - - user_info = self._download_json( - f'https://api.rplay.live/account/getuser?{key}={user_id}&filter[]=nickname&filter[]=published', user_id) - replays = self._download_json( - 'https://api.rplay.live/live/replays?=667e4cd99aa7f739a2c91852', user_id, query={ - 'creatorOid': user_info.get('_id')}) - - entries = traverse_obj(user_info, ('published', ..., { - lambda x: self.url_result(f'https://rplay.live/play/{x}/', ie=RPlayVideoIE, video_id=x)})) - for entry_id in traverse_obj(replays, (..., '_id', {str})): - if entry_id in user_info.get('published', []): - continue - entries.append(self.url_result(f'https://rplay.live/play/{entry_id}/', ie=RPlayVideoIE, video_id=entry_id)) - - return self.playlist_result(entries, user_info.get('_id', user_id), user_info.get('nickname')) - - -class RPlayLiveIE(RPlayBaseIE): - _VALID_URL = [ - r'https://rplay.live/(?Pc)/(?P[\d\w]+)/live', - r'https://rplay.live/(?Plive)/(?P[\d\w]+)', - ] - _TESTS = [{ - 'url': 'https://rplay.live/c/chachamaru/live', - 'info_dict': { - 'id': '667e4cd99aa7f739a2c91852', - 'ext': 'mp4', - 'title': r're:【ASMR】ん~っやば//スキスキ耐久.*', - 'description': 'md5:7f88ac0a7a3d5d0b926a0baecd1d40e1', - 'timestamp': 1721739947, - 'upload_date': '20240723', - 'live_status': 'is_live', - 'thumbnail': 'https://pb.rplay.live/liveChannelThumbnails/667e4cd99aa7f739a2c91852', - 'uploader': '愛犬茶々丸', - 'uploader_id': '667e4cd99aa7f739a2c91852', - 'tags': 'count:9', - }, - 'skip': 'live', - }, { - 'url': 'https://rplay.live/live/667adc9e9aa7f739a2158ff3', - 'only_matching': True, - }] - - def _real_extract(self, url): - user_id, short = self._match_valid_url(url).group('id', 'short') - - if short == 'c': - user_info = self._download_json(f'https://api.rplay.live/account/getuser?customUrl={user_id}', user_id) - user_id = user_info['_id'] - else: - user_info = 
self._download_json(f'https://api.rplay.live/account/getuser?userOid={user_id}', user_id) - - live_info = self._download_json('https://api.rplay.live/live/play', user_id, query={'creatorOid': user_id}) - - stream_state = live_info['streamState'] - if stream_state == 'youtube': - return self.url_result(f'https://www.youtube.com/watch?v={live_info["liveStreamId"]}') - elif stream_state == 'live': - if not self.user_id and not live_info.get('allowAnonymous'): - self.raise_login_required(method='password') - key2 = self._download_webpage( - 'https://api.rplay.live/live/key2', user_id, 'getting live key', - headers=self.jwt_header, query=self.requestor_query) if self.user_id else '' - formats = self._extract_m3u8_formats( - 'https://api.rplay.live/live/stream/playlist.m3u8', user_id, - query={'creatorOid': user_id, 'key2': key2}) - - return { - 'id': user_id, - 'formats': formats, - 'is_live': True, - 'http_headers': {'Referer': 'https://rplay.live'}, - 'thumbnail': f'https://pb.rplay.live/liveChannelThumbnails/{user_id}', - 'uploader': traverse_obj(user_info, ('nickname', {str})), - 'uploader_id': user_id, - **traverse_obj(live_info, { - 'title': ('title', {str}), - 'description': ('description', {str}), - 'timestamp': ('streamStartTime', {parse_iso8601}), - 'tags': ('hashtags', ..., {str}), - 'age_limit': ('isAdultContent', {lambda x: 18 if x else None}), - }), - } - elif stream_state == 'offline': - raise UserNotLive - else: - raise ExtractorError(f'Unknow streamState: {stream_state}') From 63c5218ab14535a7b59c82613fc69819e34d4580 Mon Sep 17 00:00:00 2001 From: c-basalt <117849907+c-basalt@users.noreply.github.com> Date: Sun, 11 Aug 2024 02:48:20 -0400 Subject: [PATCH 16/48] better check --- yt_dlp/jsinterp/external.py | 81 ++++++++++++++++++++++++------------- 1 file changed, 52 insertions(+), 29 deletions(-) diff --git a/yt_dlp/jsinterp/external.py b/yt_dlp/jsinterp/external.py index 1eb21701c..274c1aad6 100644 --- a/yt_dlp/jsinterp/external.py +++ 
b/yt_dlp/jsinterp/external.py @@ -67,6 +67,10 @@ class ExternalJSI: def version(cls): return get_exe_version(cls._EXE_NAME, args=getattr(cls, 'V_ARGS', ['--version']), version_re=r'([0-9.]+)') + @classproperty + def full_version(cls): + return cls.version + @classproperty def exe(cls): return cls._EXE_NAME if cls.version else None @@ -78,7 +82,7 @@ def is_available(cls): class DenoWrapper(ExternalJSI): _EXE_NAME = 'deno' - INSTALL_HINT = 'Please install deno following https://docs.deno.com/runtime/manual/getting_started/installation/ or download its binary from https://github.com/denoland/deno/releases' + INSTALL_HINT = 'Please install Deno from https://docs.deno.com/runtime/manual/getting_started/installation/ or download binary from https://github.com/denoland/deno/releases' def __init__(self, extractor: InfoExtractor, required_version=None, timeout=10000): self.extractor = extractor @@ -91,49 +95,68 @@ def __init__(self, extractor: InfoExtractor, required_version=None, timeout=1000 self.extractor.report_warning( f'Deno is outdated, update it to version {required_version} or newer if you encounter any errors.') + @classmethod + def _execute(cls, jscode, extractor=None, video_id=None, note='', flags=[], timeout=10000): + with _temp_file(jscode, suffix='.js') as js_file: + if note and extractor: + extractor.to_screen(f'{format_field(video_id, None, "%s: ")}{note}') + cmd = [cls.exe, 'run', *flags, js_file.name] + try: + stdout, stderr, returncode = Popen.run( + cmd, timeout=timeout / 1000, text=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE) + except Exception as e: + raise ExtractorError('Unable to run Deno binary', cause=e) + if returncode: + raise ExtractorError(f'Failed with returncode {returncode}:\n{stderr}') + elif stderr and extractor: + extractor.report_warning(f'JS console error msg:\n{stderr.strip()}', video_id=video_id) + return stdout.strip() + def execute(self, jscode, video_id=None, *, note='Executing JS in Deno', flags=[], jit_less=True, 
base_js=None): """Execute JS directly in Deno runtime and return stdout""" base_js = base_js if base_js is not None else 'delete window.Deno; global = window;' - with _temp_file(base_js + jscode, suffix='.js') as js_file: - if note: - self.extractor.to_screen(f'{format_field(video_id, None, "%s: ")}{note}') + if jit_less: + flags = [*flags, '--v8-flags=--jitless'] - cmd = [self.exe, 'run'] + flags - if jit_less: - cmd.append('--v8-flags=--jitless') - cmd.append(js_file.name) - - self.extractor.write_debug(f'Deno command line: {shell_quote(cmd)}') - try: - stdout, stderr, returncode = Popen.run(cmd, timeout=self.timeout / 1000, text=True, - stdout=subprocess.PIPE, stderr=subprocess.PIPE) - except Exception as e: - raise ExtractorError(f'{note} failed: Unable to run Deno binary', cause=e) - if returncode: - raise ExtractorError(f'{note} failed with returncode {returncode}:\n{stderr}') - elif stderr: - self.extractor.report_warning(f'JS console error msg:\n{stderr.strip()}', video_id=video_id) - - return stdout.strip() + return self._execute(base_js + jscode, extractor=self.extractor, video_id=video_id, note=note, + flags=flags, timeout=self.timeout) class PuppeteerWrapper: - version = '16.2.0' + _PACKAGE_VERSION = '16.2.0' _HEADLESS = False - @classproperty - def is_available(cls): - return DenoWrapper.is_available - @classproperty def INSTALL_HINT(cls): - msg = 'Run "deno run -A https://deno.land/x/puppeteer@16.2.0/install.ts" to install puppeteer' + msg = f'Run "deno run -A https://deno.land/x/puppeteer@{cls._PACKAGE_VERSION}/install.ts" to install puppeteer' if not DenoWrapper.is_available: msg = f'{DenoWrapper.INSTALL_HINT}. 
Then {msg}' return msg + @classproperty(cache=True) + def full_version(cls): + if not DenoWrapper.is_available: + return + try: + browser_version = DenoWrapper._execute(f''' + import puppeteer from "https://deno.land/x/puppeteer@16.2.0/mod.ts"; + const browser = await puppeteer.launch({{headless: {json.dumps(bool(cls._HEADLESS))}}}); + try {{ + //await (new ) + console.log(await browser.version()) + }} finally {{ + await browser.close(); + }}''', flags=['--allow-all']) + return f'puppeteer={cls._PACKAGE_VERSION} browser={browser_version}' + except ExtractorError: + return None + + @classproperty + def version(cls): + return cls._PACKAGE_VERSION if cls.full_version else None + def __init__(self, extractor: InfoExtractor, required_version=None, timeout=10000): self.deno = DenoWrapper(extractor, timeout=(timeout + 30000)) self.timeout = timeout @@ -144,14 +167,14 @@ def __init__(self, extractor: InfoExtractor, required_version=None, timeout=1000 def _deno_execute(self, jscode, note=None): return self.deno.execute(f''' - import puppeteer from "https://deno.land/x/puppeteer@16.2.0/mod.ts"; + import puppeteer from "https://deno.land/x/puppeteer@{self._PACKAGE_VERSION}/mod.ts"; const browser = await puppeteer.launch({{ headless: {json.dumps(bool(self._HEADLESS))}, args: ["--disable-web-security"]}}); try {{ {jscode} }} finally {{ await browser.close(); - }}''', note=note, flags=['--allow-all'], jit_less=False, base_js='') + }}''', note=note, flags=['--allow-all'], base_js='') def evaluate(self, jscode, video_id=None, note='Executing JS in Puppeteer', url='about:blank'): self.extractor.to_screen(f'{format_field(video_id, None, "%s: ")}{note}') From 37a3bc2a4aa88763ea546b3a7256395ae947be8e Mon Sep 17 00:00:00 2001 From: c-basalt <117849907+c-basalt@users.noreply.github.com> Date: Sun, 11 Aug 2024 08:03:00 -0400 Subject: [PATCH 17/48] update --- yt_dlp/jsinterp/__init__.py | 3 ++- yt_dlp/jsinterp/external.py | 34 ++++++++++++++++------------------ 2 files changed, 18 
insertions(+), 19 deletions(-) diff --git a/yt_dlp/jsinterp/__init__.py b/yt_dlp/jsinterp/__init__.py index 944d73fca..7d44b4698 100644 --- a/yt_dlp/jsinterp/__init__.py +++ b/yt_dlp/jsinterp/__init__.py @@ -1,9 +1,10 @@ from .jsinterp import JSInterpreter -from .external import PhantomJSwrapper, DenoWrapper +from .external import PhantomJSwrapper, DenoWrapper, PuppeteerWrapper __all__ = [ JSInterpreter, PhantomJSwrapper, DenoWrapper, + PuppeteerWrapper, ] diff --git a/yt_dlp/jsinterp/external.py b/yt_dlp/jsinterp/external.py index 274c1aad6..c0a2b62d2 100644 --- a/yt_dlp/jsinterp/external.py +++ b/yt_dlp/jsinterp/external.py @@ -112,18 +112,21 @@ def _execute(cls, jscode, extractor=None, video_id=None, note='', flags=[], time extractor.report_warning(f'JS console error msg:\n{stderr.strip()}', video_id=video_id) return stdout.strip() - def execute(self, jscode, video_id=None, *, note='Executing JS in Deno', flags=[], jit_less=True, base_js=None): + def execute(self, jscode, video_id=None, *, note='Executing JS in Deno', flags=[], base_js=None): """Execute JS directly in Deno runtime and return stdout""" base_js = base_js if base_js is not None else 'delete window.Deno; global = window;' - if jit_less: - flags = [*flags, '--v8-flags=--jitless'] - return self._execute(base_js + jscode, extractor=self.extractor, video_id=video_id, note=note, flags=flags, timeout=self.timeout) +class DenoJITlessJSI(DenoWrapper): + def execute(self, jscode, video_id=None, *, note='Executing JS in Deno', flags=[], base_js=None): + return super().execute(jscode, video_id, note=note, base_js=base_js, + flags=[*flags, '--v8-flags=--jitless,--noexpose-wasm']) + + class PuppeteerWrapper: _PACKAGE_VERSION = '16.2.0' _HEADLESS = False @@ -176,9 +179,9 @@ def _deno_execute(self, jscode, note=None): await browser.close(); }}''', note=note, flags=['--allow-all'], base_js='') - def evaluate(self, jscode, video_id=None, note='Executing JS in Puppeteer', url='about:blank'): + def execute(self, 
jscode, video_id=None, note='Executing JS in Puppeteer', url='about:blank'): self.extractor.to_screen(f'{format_field(video_id, None, "%s: ")}{note}') - return json.loads(self._deno_execute(f''' + return self._deno_execute(f''' const page = await browser.newPage(); window.setTimeout(async () => {{ console.error('Puppeteer execution timed out'); @@ -187,26 +190,21 @@ def evaluate(self, jscode, video_id=None, note='Executing JS in Puppeteer', url= }}, {int(self.timeout)}); page.resourceTimeout = {int(self.timeout)}; + // drop network requests await page.setRequestInterception(true); page.on("request", request => request.abort()); + // capture console output + page.on("console", msg => {{ + msg.type() === 'log' && console.log(msg.text()); + msg.type() === 'error' && console.error(msg.text()); + }}); const url = {json.dumps(str(url))}; await page.evaluate(`window.history.replaceState('', '', ${{JSON.stringify(url)}})`); - console.log(JSON.stringify(await page.evaluate({json.dumps(str(jscode))}))); + await page.evaluate({json.dumps(str(jscode))}); await browser.close(); Deno.exit(0); - ''')) - - def execute(self, jscode, **args): - return self.evaluate(''' - (() => {{ - const results = []; - const origConsole = console; - const console = new Proxy(console, { get: (target, prop, receiver) => { - if (prop === 'log') return (...data) => data.forEach(i => results.push(i)); - return target[prop]}}) - }})(); ''') From 8e39820adcfcd820cecef595606613eaab13dfa2 Mon Sep 17 00:00:00 2001 From: c-basalt <117849907+c-basalt@users.noreply.github.com> Date: Sun, 11 Aug 2024 10:43:15 -0400 Subject: [PATCH 18/48] update --- yt_dlp/jsinterp/external.py | 22 +++++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-) diff --git a/yt_dlp/jsinterp/external.py b/yt_dlp/jsinterp/external.py index c0a2b62d2..ea67b6309 100644 --- a/yt_dlp/jsinterp/external.py +++ b/yt_dlp/jsinterp/external.py @@ -49,7 +49,7 @@ def cookie_jar_to_list(cookie_jar): @contextlib.contextmanager def 
_temp_file(content, *, mode='wt', encoding='utf-8', suffix=None, close=True): - if 'r' in mode: + if 'b' in mode: encoding = None temp_file_handle = tempfile.NamedTemporaryFile(mode, encoding=encoding, suffix=suffix, delete=False) try: @@ -62,6 +62,26 @@ def _temp_file(content, *, mode='wt', encoding='utf-8', suffix=None, close=True) os.remove(temp_file_handle.name) +@contextlib.contextmanager +def _tempfile_context(): + handles = [] + + def _creater(content, *, mode='wt', encoding='utf-8', suffix=None, close=True): + encoding = None if 'b' in mode else encoding + handle = tempfile.NamedTemporaryFile(mode, encoding=encoding, suffix=suffix, delete=False) + handles.append(handle) + handle.write(content) + if close: + handle.close() + return handle + try: + yield _creater + finally: + for handle in handles: + with contextlib.suppress(OSError): + os.remove(handle.name) + + class ExternalJSI: @classproperty(cache=True) def version(cls): From 567fb56a1ce7180c45af1912f7448422fe5835e8 Mon Sep 17 00:00:00 2001 From: c-basalt <117849907+c-basalt@users.noreply.github.com> Date: Sun, 11 Aug 2024 16:23:58 -0400 Subject: [PATCH 19/48] temp file wrapper --- yt_dlp/jsinterp/external.py | 96 ++++++++++++++++++++----------------- 1 file changed, 51 insertions(+), 45 deletions(-) diff --git a/yt_dlp/jsinterp/external.py b/yt_dlp/jsinterp/external.py index ea67b6309..3751941c8 100644 --- a/yt_dlp/jsinterp/external.py +++ b/yt_dlp/jsinterp/external.py @@ -47,39 +47,45 @@ def cookie_jar_to_list(cookie_jar): return [cookie_to_dict(cookie) for cookie in cookie_jar] -@contextlib.contextmanager -def _temp_file(content, *, mode='wt', encoding='utf-8', suffix=None, close=True): - if 'b' in mode: - encoding = None - temp_file_handle = tempfile.NamedTemporaryFile(mode, encoding=encoding, suffix=suffix, delete=False) - try: - temp_file_handle.write(content) - if close: - temp_file_handle.close() - yield temp_file_handle - finally: +class TempFileWrapper: + def __init__(self, content=None, 
text=True, encoding='utf-8', suffix=None): + self.encoding = None if not text else encoding + self.text = text + self._file = tempfile.NamedTemporaryFile('wb', suffix=suffix, delete=False) + self._file.close() + if content: + self.write(content) + + @property + def name(self): + return self._file.name + + @contextlib.contextmanager + def opened_file(self, mode, *, seek=None, seek_whence=0): + mode = mode if (self.text or 'b' in mode) else mode + 'b' + with open(self._file.name, mode, encoding=self.encoding) as f: + if seek is not None: + self._file.seek(seek, seek_whence) + yield f + + def write(self, s, seek=None, seek_whence=0): + with self.opened_file('w', seek=seek, seek_whence=seek_whence) as f: + f.write(s) + + def append_write(self, s, seek=None, seek_whence=0): + with self.opened_file('a', seek=seek, seek_whence=seek_whence) as f: + f.write(s) + + def read(self, n=-1, seek=None, seek_whence=0): + with self.opened_file('r', seek=seek, seek_whence=seek_whence) as f: + return f.read(n) + + def cleanup(self): with contextlib.suppress(OSError): - os.remove(temp_file_handle.name) + os.remove(self._file.name) - -@contextlib.contextmanager -def _tempfile_context(): - handles = [] - - def _creater(content, *, mode='wt', encoding='utf-8', suffix=None, close=True): - encoding = None if 'b' in mode else encoding - handle = tempfile.NamedTemporaryFile(mode, encoding=encoding, suffix=suffix, delete=False) - handles.append(handle) - handle.write(content) - if close: - handle.close() - return handle - try: - yield _creater - finally: - for handle in handles: - with contextlib.suppress(OSError): - os.remove(handle.name) + def __del__(self): + self.cleanup() class ExternalJSI: @@ -117,20 +123,20 @@ def __init__(self, extractor: InfoExtractor, required_version=None, timeout=1000 @classmethod def _execute(cls, jscode, extractor=None, video_id=None, note='', flags=[], timeout=10000): - with _temp_file(jscode, suffix='.js') as js_file: - if note and extractor: - 
extractor.to_screen(f'{format_field(video_id, None, "%s: ")}{note}') - cmd = [cls.exe, 'run', *flags, js_file.name] - try: - stdout, stderr, returncode = Popen.run( - cmd, timeout=timeout / 1000, text=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE) - except Exception as e: - raise ExtractorError('Unable to run Deno binary', cause=e) - if returncode: - raise ExtractorError(f'Failed with returncode {returncode}:\n{stderr}') - elif stderr and extractor: - extractor.report_warning(f'JS console error msg:\n{stderr.strip()}', video_id=video_id) - return stdout.strip() + js_file = TempFileWrapper(jscode) + if note and extractor: + extractor.to_screen(f'{format_field(video_id, None, "%s: ")}{note}') + cmd = [cls.exe, 'run', *flags, js_file.name] + try: + stdout, stderr, returncode = Popen.run( + cmd, timeout=timeout / 1000, text=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE) + except Exception as e: + raise ExtractorError('Unable to run Deno binary', cause=e) + if returncode: + raise ExtractorError(f'Failed with returncode {returncode}:\n{stderr}') + elif stderr and extractor: + extractor.report_warning(f'JS console error msg:\n{stderr.strip()}', video_id=video_id) + return stdout.strip() def execute(self, jscode, video_id=None, *, note='Executing JS in Deno', flags=[], base_js=None): """Execute JS directly in Deno runtime and return stdout""" From b58d59ec78e446eb8690b74a520632d8a4720fdd Mon Sep 17 00:00:00 2001 From: c-basalt <117849907+c-basalt@users.noreply.github.com> Date: Sun, 11 Aug 2024 16:35:33 -0400 Subject: [PATCH 20/48] rename --- test/test_jsinterp.py | 2 +- test/test_youtube_signature.py | 4 ++-- yt_dlp/extractor/youtube.py | 16 ++++++++-------- yt_dlp/jsinterp/__init__.py | 4 ++-- yt_dlp/jsinterp/{jsinterp.py => native.py} | 0 5 files changed, 13 insertions(+), 13 deletions(-) rename yt_dlp/jsinterp/{jsinterp.py => native.py} (100%) diff --git a/test/test_jsinterp.py b/test/test_jsinterp.py index 34b145959..d2084c5c5 100644 --- 
a/test/test_jsinterp.py +++ b/test/test_jsinterp.py @@ -9,7 +9,7 @@ import math -from yt_dlp.jsinterp.jsinterp import JS_Undefined, JSInterpreter +from yt_dlp.jsinterp.native import JS_Undefined, JSInterpreter class NaN: diff --git a/test/test_youtube_signature.py b/test/test_youtube_signature.py index a14bef511..84d002a0b 100644 --- a/test/test_youtube_signature.py +++ b/test/test_youtube_signature.py @@ -15,7 +15,7 @@ from test.helper import FakeYDL, is_download_test from yt_dlp.extractor import YoutubeIE -from yt_dlp.jsinterp import JSInterpreter +from yt_dlp.jsinterp import NativeJSI _SIG_TESTS = [ ( @@ -243,7 +243,7 @@ def signature(jscode, sig_input): def n_sig(jscode, sig_input): funcname = YoutubeIE(FakeYDL())._extract_n_function_name(jscode) - return JSInterpreter(jscode).call_function(funcname, sig_input) + return NativeJSI(jscode).call_function(funcname, sig_input) make_sig_test = t_factory( diff --git a/yt_dlp/extractor/youtube.py b/yt_dlp/extractor/youtube.py index 257e9767d..ac0513834 100644 --- a/yt_dlp/extractor/youtube.py +++ b/yt_dlp/extractor/youtube.py @@ -21,7 +21,7 @@ from .common import InfoExtractor, SearchInfoExtractor from ..jsinterp import PhantomJSwrapper -from ..jsinterp import JSInterpreter +from ..jsinterp import NativeJSI from ..networking.exceptions import HTTPError, network_exceptions from ..utils import ( NO_DEFAULT, @@ -3067,7 +3067,7 @@ def _parse_sig_js(self, jscode): r'\bc\s*&&\s*[a-zA-Z0-9]+\.set\([^,]+\s*,\s*\([^)]*\)\s*\(\s*(?P[a-zA-Z0-9$]+)\('), jscode, 'Initial JS player signature function name', group='sig') - jsi = JSInterpreter(jscode) + jsi = NativeJSI(jscode) initial_function = jsi.extract_function(funcname) return lambda s: initial_function([s]) @@ -3111,7 +3111,7 @@ def _decrypt_nsig(self, s, video_id, player_url): try: extract_nsig = self._cached(self._extract_n_function_from_code, 'nsig func', player_url) ret = extract_nsig(jsi, func_code)(s) - except JSInterpreter.Exception as e: + except NativeJSI.Exception as 
e: try: jsi = PhantomJSwrapper(self, timeout=5000) except ExtractorError: @@ -3145,7 +3145,7 @@ def _extract_n_function_code(self, video_id, player_url): player_id = self._extract_player_info(player_url) func_code = self.cache.load('youtube-nsig', player_id, min_ver='2024.07.09') jscode = func_code or self._load_player(video_id, player_url) - jsi = JSInterpreter(jscode) + jsi = NativeJSI(jscode) if func_code: return jsi, player_id, func_code @@ -3163,13 +3163,13 @@ def _extract_n_function_from_code(self, jsi, func_code): def extract_nsig(s): try: ret = func([s]) - except JSInterpreter.Exception: + except NativeJSI.Exception: raise except Exception as e: - raise JSInterpreter.Exception(traceback.format_exc(), cause=e) + raise NativeJSI.Exception(traceback.format_exc(), cause=e) if ret.startswith('enhanced_except_'): - raise JSInterpreter.Exception('Signature function returned an exception') + raise NativeJSI.Exception('Signature function returned an exception') return ret return extract_nsig @@ -3902,7 +3902,7 @@ def build_fragments(f): }) except ExtractorError as e: phantomjs_hint = '' - if isinstance(e, JSInterpreter.Exception): + if isinstance(e, NativeJSI.Exception): phantomjs_hint = (f' Install {self._downloader._format_err("PhantomJS", self._downloader.Styles.EMPHASIS)} ' f'to workaround the issue. 
{PhantomJSwrapper.INSTALL_HINT}\n') if player_url: diff --git a/yt_dlp/jsinterp/__init__.py b/yt_dlp/jsinterp/__init__.py index 7d44b4698..282385f0c 100644 --- a/yt_dlp/jsinterp/__init__.py +++ b/yt_dlp/jsinterp/__init__.py @@ -1,9 +1,9 @@ -from .jsinterp import JSInterpreter +from .native import JSInterpreter as NativeJSI from .external import PhantomJSwrapper, DenoWrapper, PuppeteerWrapper __all__ = [ - JSInterpreter, + NativeJSI, PhantomJSwrapper, DenoWrapper, PuppeteerWrapper, diff --git a/yt_dlp/jsinterp/jsinterp.py b/yt_dlp/jsinterp/native.py similarity index 100% rename from yt_dlp/jsinterp/jsinterp.py rename to yt_dlp/jsinterp/native.py From f1959b8edfabdb08288ce4a3e61bc73d51ff49ac Mon Sep 17 00:00:00 2001 From: c-basalt <117849907+c-basalt@users.noreply.github.com> Date: Sun, 11 Aug 2024 20:05:46 -0400 Subject: [PATCH 21/48] ruff --- pyproject.toml | 2 +- yt_dlp/extractor/youtube.py | 3 +-- yt_dlp/jsinterp/external.py | 7 ++++--- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index d5480e1c6..3af462856 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -263,7 +263,7 @@ select = [ "A002", # builtin-argument-shadowing "C408", # unnecessary-collection-call ] -"yt_dlp/jsinterp.py" = [ +"yt_dlp/jsinterp/native.py" = [ "UP031", # printf-string-formatting ] diff --git a/yt_dlp/extractor/youtube.py b/yt_dlp/extractor/youtube.py index ac0513834..e3928b304 100644 --- a/yt_dlp/extractor/youtube.py +++ b/yt_dlp/extractor/youtube.py @@ -20,8 +20,7 @@ import urllib.parse from .common import InfoExtractor, SearchInfoExtractor -from ..jsinterp import PhantomJSwrapper -from ..jsinterp import NativeJSI +from ..jsinterp import NativeJSI, PhantomJSwrapper from ..networking.exceptions import HTTPError, network_exceptions from ..utils import ( NO_DEFAULT, diff --git a/yt_dlp/jsinterp/external.py b/yt_dlp/jsinterp/external.py index 3751941c8..b3cc88d6d 100644 --- a/yt_dlp/jsinterp/external.py +++ b/yt_dlp/jsinterp/external.py @@ 
-48,6 +48,7 @@ def cookie_jar_to_list(cookie_jar): class TempFileWrapper: + """Wrapper for NamedTemporaryFile, auto closes file after io and deletes file upon wrapper object gc""" def __init__(self, content=None, text=True, encoding='utf-8', suffix=None): self.encoding = None if not text else encoding self.text = text @@ -70,11 +71,11 @@ def opened_file(self, mode, *, seek=None, seek_whence=0): def write(self, s, seek=None, seek_whence=0): with self.opened_file('w', seek=seek, seek_whence=seek_whence) as f: - f.write(s) + return f.write(s) def append_write(self, s, seek=None, seek_whence=0): with self.opened_file('a', seek=seek, seek_whence=seek_whence) as f: - f.write(s) + return f.write(s) def read(self, n=-1, seek=None, seek_whence=0): with self.opened_file('r', seek=seek, seek_whence=seek_whence) as f: @@ -123,7 +124,7 @@ def __init__(self, extractor: InfoExtractor, required_version=None, timeout=1000 @classmethod def _execute(cls, jscode, extractor=None, video_id=None, note='', flags=[], timeout=10000): - js_file = TempFileWrapper(jscode) + js_file = TempFileWrapper(jscode, suffix='.js') if note and extractor: extractor.to_screen(f'{format_field(video_id, None, "%s: ")}{note}') cmd = [cls.exe, 'run', *flags, js_file.name] From 92c8d399879cd3a397fef931a25896e016021154 Mon Sep 17 00:00:00 2001 From: c-basalt <117849907+c-basalt@users.noreply.github.com> Date: Sun, 11 Aug 2024 23:22:24 -0400 Subject: [PATCH 22/48] pep8 --- yt_dlp/jsinterp/external.py | 1 + 1 file changed, 1 insertion(+) diff --git a/yt_dlp/jsinterp/external.py b/yt_dlp/jsinterp/external.py index b3cc88d6d..bd022bc56 100644 --- a/yt_dlp/jsinterp/external.py +++ b/yt_dlp/jsinterp/external.py @@ -49,6 +49,7 @@ def cookie_jar_to_list(cookie_jar): class TempFileWrapper: """Wrapper for NamedTemporaryFile, auto closes file after io and deletes file upon wrapper object gc""" + def __init__(self, content=None, text=True, encoding='utf-8', suffix=None): self.encoding = None if not text else encoding 
self.text = text From 062a9785f1f06b158580f4f6e476aad467d0b5e0 Mon Sep 17 00:00:00 2001 From: c-basalt <117849907+c-basalt@users.noreply.github.com> Date: Sun, 25 Aug 2024 09:55:24 -0400 Subject: [PATCH 23/48] director --- yt_dlp/extractor/douyutv.py | 8 +- yt_dlp/jsinterp/__init__.py | 10 +- yt_dlp/jsinterp/common.py | 203 ++++++++++++++++++++++++++++++++++++ yt_dlp/jsinterp/external.py | 113 +++++++++++--------- 4 files changed, 279 insertions(+), 55 deletions(-) create mode 100644 yt_dlp/jsinterp/common.py diff --git a/yt_dlp/extractor/douyutv.py b/yt_dlp/extractor/douyutv.py index 6417c63a5..c6f816bc6 100644 --- a/yt_dlp/extractor/douyutv.py +++ b/yt_dlp/extractor/douyutv.py @@ -4,7 +4,7 @@ import uuid from .common import InfoExtractor -from ..jsinterp import DenoWrapper, PhantomJSwrapper +from ..jsinterp import DenoJSI, PhantomJSwrapper from ..utils import ( ExtractorError, UserNotLive, @@ -43,13 +43,13 @@ def _calc_sign(self, sign_func, video_id, a): b = uuid.uuid4().hex c = round(time.time()) js_script = f'{self._get_cryptojs_md5(video_id)};{sign_func};console.log(ub98484234("{a}","{b}","{c}"))' - if DenoWrapper.is_available: - jsi = DenoWrapper(self) + if DenoJSI.is_available: + jsi = DenoJSI(self) elif PhantomJSwrapper.is_available: jsi = PhantomJSwrapper(self) else: raise ExtractorError('You need to install either Deno or PhantomJS. ' - f'{DenoWrapper.INSTALL_HINT}. {PhantomJSwrapper.INSTALL_HINT}', expected=True) + f'{DenoJSI.INSTALL_HINT}. 
{PhantomJSwrapper.INSTALL_HINT}', expected=True) result = jsi.execute(js_script, video_id, note='Executing JS signing script').strip() return {i: v[0] for i, v in urllib.parse.parse_qs(result).items()} diff --git a/yt_dlp/jsinterp/__init__.py b/yt_dlp/jsinterp/__init__.py index 282385f0c..3daaba894 100644 --- a/yt_dlp/jsinterp/__init__.py +++ b/yt_dlp/jsinterp/__init__.py @@ -1,10 +1,14 @@ from .native import JSInterpreter as NativeJSI -from .external import PhantomJSwrapper, DenoWrapper, PuppeteerWrapper +from .external import PhantomJSwrapper, DenoJSI, PuppeteerJSI +from .common import _JSI_PREFERENCES, _JSI_HANDLERS, JSIDirector __all__ = [ NativeJSI, PhantomJSwrapper, - DenoWrapper, - PuppeteerWrapper, + DenoJSI, + PuppeteerJSI, + _JSI_HANDLERS, + _JSI_PREFERENCES, + JSIDirector, ] diff --git a/yt_dlp/jsinterp/common.py b/yt_dlp/jsinterp/common.py new file mode 100644 index 000000000..934dd5122 --- /dev/null +++ b/yt_dlp/jsinterp/common.py @@ -0,0 +1,203 @@ +from __future__ import annotations + +import abc +import typing +# import dataclasses + +from ..utils import classproperty + + +DEFAULT_TIMEOUT = 10000 +_JSI_HANDLERS: dict[str, type[JSI]] = {} +_JSI_PREFERENCES: set[JSIPreference] = set() +_ALL_FEATURES = { + 'js', + 'wasm', + 'dom', +} + + +def get_jsi_keys(jsi_or_keys: typing.Iterable[str | type[JSI] | JSI]) -> list[str]: + return [jok if isinstance(jok, str) else jok.JSI_KEY for jok in jsi_or_keys] + + +def order_to_pref(jsi_order: typing.Iterable[str | type[JSI] | JSI], multiplier: int) -> JSIPreference: + jsi_order = reversed(get_jsi_keys(jsi_order)) + pref_score = {jsi_cls: (i + 1) * multiplier for i, jsi_cls in enumerate(jsi_order)} + + def _pref(jsi: JSI, *args): + return pref_score.get(jsi.JSI_KEY, 0) + return _pref + + +def join_jsi_name(jsi_list: typing.Iterable[str | type[JSI] | JSI], sep=', '): + return sep.join(get_jsi_keys(jok if isinstance(jok, str) else jok.JSI_NAME for jok in jsi_list)) + + +class JSIExec(typing.Protocol): + 
@abc.abstractmethod + def execute(self, jscode: str) -> str: + """Execute JS code and return console.log contents, using `html` requires `dom` feature""" + + +class JSIDirector(JSIExec): + """JSIDirector class + + Helper class to forward JS interpretation need to a JSI that supports it. + + @param downloader: downloader instance. + @param features: list of features that JSI must support. + @param only_include: list of JSI to choose from. + @param exclude: list of JSI to avoid using. + @param jsi_params: extra parameters to pass to `JSI.__init__()`. + @param preferred_order: list of JSI to use. First in list is tested first. + @param fallback_jsi: list of JSI that may fail and should act non-fatal and fallback to other JSI. Pass `"all"` to always fallback + @param timeout: timeout in miliseconds for JS interpretation + """ + def __init__( + self, + downloader: YoutubeDL, + features: typing.Iterable[str] = [], + only_include: typing.Iterable[str | type[JSI]] = [], + exclude: typing.Iterable[str | type[JSI]] = [], + jsi_params: dict[str, dict] = {}, + preferred_order: typing.Iterable[str | type[JSI]] = [], + fallback_jsi: typing.Iterable[str | type[JSI]] | typing.Literal['all'] = [], + timeout: float | None = None, + verbose=False, + ): + self._downloader = downloader + self._verbose = verbose + + jsi_keys = set(get_jsi_keys(only_include or _JSI_HANDLERS)) - set(get_jsi_keys(exclude)) + handler_classes = [_JSI_HANDLERS[key] for key in jsi_keys + if _JSI_HANDLERS[key]._SUPPORTED_FEATURES.issuperset(features)] + if not handler_classes: + raise Exception(f'No JSI can be selected for features: {features}, ' + f'included: {get_jsi_keys(only_include) or "all"}, excluded: {get_jsi_keys(exclude)}') + + self._handler_dict = {cls.JSI_KEY: cls(downloader, timeout, **jsi_params.get(cls.JSI_KEY, {})) + for cls in handler_classes} + self.preferences: set[JSIPreference] = {order_to_pref(preferred_order, 100)} | _JSI_PREFERENCES + self._fallback_jsi = get_jsi_keys(handler_classes) if 
fallback_jsi == 'all' else get_jsi_keys(fallback_jsi) + + def add_handler(self, handler: JSI): + """Add a handler. If a handler of the same JSI_KEY exists, it will overwrite it""" + assert isinstance(handler, JSI), 'handler must be a JSI instance' + self._handler_dict[handler.JSI_KEY] = handler + + @property + def write_debug(self): + return self._downloader.write_debug + + def _get_handlers(self, method: str, *args, **kwargs) -> list[JSI]: + handlers = [h for h in self._handler_dict.values() if getattr(h, method, None)] + self.write_debug(f'JSIDirector has handlers for `{method}`: {handlers}') + if not handlers: + raise Exception(f'No JSI supports method `{method}`, ' + f'included handlers: {[handler.JSI_KEY for handler in self._handler_dict.values()]}') + + preferences = { + handler: sum(pref_func(handler, method, args, kwargs) for pref_func in self.preferences) + for handler in handlers + } + self._downloader.write_debug('JSI preferences for this request: {}'.format(', '.join( + f'{jsi.JSI_NAME}={pref}' for jsi, pref in preferences.items()))) + + return sorted(self._handler_dict.values(), key=preferences.get, reverse=True) + + # def _send(self, request: JSIRequest): + # unavailable_handlers = [] + # exec_errors = [] + # for handler in self._get_handlers(request): + # if not handler.is_available: + # unavailable_handlers.append(handler) + # continue + # try: + # return handler.handle(request) + # except Exception as e: + # exec_errors.append(e) + # if not request.fallback: + # raise + # raise EvaluationError + + def _get_handler_method(method_name: str): + def handler(self: JSIDirector, *args, **kwargs): + unavailable: list[JSI] = [] + exceptions: list[tuple[JSI, Exception]] = [] + for handler in self._get_handlers(method_name, *args, **kwargs): + if not handler.is_available: + self.write_debug(f'{handler.JSI_NAME} is not available') + unavailable.append(handler) + continue + try: + self.write_debug(f'Dispatching `{method_name}` task to {handler.JSI_NAME}') + 
return getattr(handler, method_name)(*args, **kwargs) + except Exception as e: + if handler.JSI_KEY not in self._fallback_jsi: + raise + else: + exceptions.append((handler, e)) + if not exceptions: + raise Exception(f'No available JSI installed, please install one of: {join_jsi_name(unavailable)}') + raise Exception(f'Failed to perform {method_name}, total {len(exceptions)} errors. Following JSI have been skipped and you can try installing one of them: {join_jsi_name(unavailable)}') + return handler + + execute = _get_handler_method('execute') + evaluate = _get_handler_method('evaluate') + + +class JSI(abc.ABC): + _SUPPORTED_FEATURES: set[str] = set() + _BASE_PREFERENCE: int = 0 + + def __init__(self, downloader: YoutubeDL, timeout: float | int | None = None): + self._downloader = downloader + self.timeout = float(timeout or DEFAULT_TIMEOUT) + + @property + @abc.abstractmethod + def is_available(self) -> bool: + raise NotImplementedError + + @classproperty + def JSI_NAME(cls) -> str: + return cls.__name__[:-3] + + @classproperty + def JSI_KEY(cls) -> str: + assert cls.__name__.endswith('JSI'), 'JSI class names must end with "JSI"' + return cls.__name__[:-3] + + +def register_jsi(handler_cls: TYPE_JSI) -> TYPE_JSI: + """Register a JS interpreter class""" + assert issubclass(handler_cls, JSI), f'{handler_cls} must be a subclass of JSI' + assert handler_cls.JSI_KEY not in _JSI_HANDLERS, f'JSI {handler_cls.JSI_KEY} already registered' + assert handler_cls._SUPPORTED_FEATURES.issubset(_ALL_FEATURES), f'{handler_cls._SUPPORTED_FEATURES - _ALL_FEATURES} is not declared in `_All_FEATURES`' + _JSI_HANDLERS[handler_cls.JSI_KEY] = handler_cls + return handler_cls + + +def register_jsi_preference(*handlers: type[JSI]): + assert all(issubclass(handler, JSI) for handler in handlers), f'{handlers} must all be a subclass of JSI' + + def outer(pref_func: JSIPreference) -> JSIPreference: + def inner(handler: JSI, *args): + if not handlers or isinstance(handler, handlers): + return 
pref_func(handler, *args) + return 0 + _JSI_PREFERENCES.add(inner) + return inner + return outer + + +@register_jsi_preference() +def _base_preference(handler: JSI, *args): + return getattr(handler, '_BASE_PREFERENCE', 0) + + +if typing.TYPE_CHECKING: + from ..YoutubeDL import YoutubeDL + JSIPreference = typing.Callable[[JSI, str, list, dict], int] + TYPE_JSI = typing.TypeVar('TYPE_JSI') diff --git a/yt_dlp/jsinterp/external.py b/yt_dlp/jsinterp/external.py index bd022bc56..b02b5e25c 100644 --- a/yt_dlp/jsinterp/external.py +++ b/yt_dlp/jsinterp/external.py @@ -1,3 +1,6 @@ +from __future__ import annotations + +import abc import collections import contextlib import json @@ -5,8 +8,9 @@ import subprocess import tempfile import urllib.parse +import typing + -from ..extractor.common import InfoExtractor from ..utils import ( ExtractorError, Popen, @@ -16,6 +20,7 @@ is_outdated_version, shell_quote, ) +from .common import JSI, register_jsi def cookie_to_dict(cookie): @@ -50,13 +55,15 @@ def cookie_jar_to_list(cookie_jar): class TempFileWrapper: """Wrapper for NamedTemporaryFile, auto closes file after io and deletes file upon wrapper object gc""" - def __init__(self, content=None, text=True, encoding='utf-8', suffix=None): + def __init__(self, content: str | bytes | None = None, text: bool = True, + encoding='utf-8', suffix: str | None = None): self.encoding = None if not text else encoding self.text = text - self._file = tempfile.NamedTemporaryFile('wb', suffix=suffix, delete=False) - self._file.close() + self._file = tempfile.NamedTemporaryFile('w' if text else 'wb', encoding=self.encoding, + suffix=suffix, delete=False) if content: - self.write(content) + self._file.write(content) + self._file.close() @property def name(self): @@ -90,7 +97,9 @@ def __del__(self): self.cleanup() -class ExternalJSI: +class ExternalJSI(JSI, abc.ABC): + _EXE_NAME: str = None + @classproperty(cache=True) def version(cls): return get_exe_version(cls._EXE_NAME, args=getattr(cls, 'V_ARGS', 
['--version']), version_re=r'([0-9.]+)') @@ -104,30 +113,25 @@ def exe(cls): return cls._EXE_NAME if cls.version else None @classproperty - def is_available(cls): - return bool(cls.exe) + def is_available(self): + return bool(self.exe) -class DenoWrapper(ExternalJSI): +@register_jsi +class DenoJSI(ExternalJSI): + """JS interpreter class using Deno binary""" _EXE_NAME = 'deno' INSTALL_HINT = 'Please install Deno from https://docs.deno.com/runtime/manual/getting_started/installation/ or download binary from https://github.com/denoland/deno/releases' + _SUPPORTED_FEATURES = {'js', 'wasm'} - def __init__(self, extractor: InfoExtractor, required_version=None, timeout=10000): - self.extractor = extractor - self.timeout = timeout - - if not self.exe: - raise ExtractorError(f'Deno not found, {self.INSTALL_HINT}', expected=True) - if required_version: - if is_outdated_version(self.version, required_version): - self.extractor.report_warning( - f'Deno is outdated, update it to version {required_version} or newer if you encounter any errors.') + def __init__(self, downloader: YoutubeDL, timeout: float | int | None = None, required_version=None): + super().__init__(downloader, timeout) @classmethod - def _execute(cls, jscode, extractor=None, video_id=None, note='', flags=[], timeout=10000): + def _execute(cls, jscode, downloader: YoutubeDL | None = None, video_id=None, note='', flags=[], timeout=10000): js_file = TempFileWrapper(jscode, suffix='.js') - if note and extractor: - extractor.to_screen(f'{format_field(video_id, None, "%s: ")}{note}') + if note and downloader: + downloader.to_screen(f'{format_field(video_id, None, "%s: ")}{note}') cmd = [cls.exe, 'run', *flags, js_file.name] try: stdout, stderr, returncode = Popen.run( @@ -136,46 +140,57 @@ def _execute(cls, jscode, extractor=None, video_id=None, note='', flags=[], time raise ExtractorError('Unable to run Deno binary', cause=e) if returncode: raise ExtractorError(f'Failed with returncode {returncode}:\n{stderr}') - 
elif stderr and extractor: - extractor.report_warning(f'JS console error msg:\n{stderr.strip()}', video_id=video_id) + elif stderr and downloader: + downloader.report_warning(f'JS console error msg:\n{stderr.strip()}', video_id=video_id) return stdout.strip() - def execute(self, jscode, video_id=None, *, note='Executing JS in Deno', flags=[], base_js=None): + def execute(self, jscode, video_id=None, note='Executing JS in Deno', flags=[], base_js=None): """Execute JS directly in Deno runtime and return stdout""" - base_js = base_js if base_js is not None else 'delete window.Deno; global = window;' + base_js = 'delete window.Deno; global = window;\n' if base_js is None else base_js - return self._execute(base_js + jscode, extractor=self.extractor, video_id=video_id, note=note, + return self._execute(base_js + jscode, downloader=self._downloader, video_id=video_id, note=note, flags=flags, timeout=self.timeout) -class DenoJITlessJSI(DenoWrapper): - def execute(self, jscode, video_id=None, *, note='Executing JS in Deno', flags=[], base_js=None): - return super().execute(jscode, video_id, note=note, base_js=base_js, - flags=[*flags, '--v8-flags=--jitless,--noexpose-wasm']) +@register_jsi +class DenoJITlessJSI(DenoJSI): + _EXE_NAME = DenoJSI._EXE_NAME + INSTALL_HINT = DenoJSI.INSTALL_HINT + _SUPPORTED_FEATURES = {'js'} + + @classproperty + def version(cls): + return DenoJSI.version + + def execute(self, jscode, video_id=None, note='Executing JS in Deno', flags=[], base_js=None): + # JIT-less mode does not support Wasm + return super().execute(jscode, video_id, note=note, + flags=[*flags, '--v8-flags=--jitless,--noexpose-wasm'], base_js=base_js) -class PuppeteerWrapper: +@register_jsi +class PuppeteerJSI(ExternalJSI): _PACKAGE_VERSION = '16.2.0' _HEADLESS = False + _EXE_NAME = DenoJSI._EXE_NAME @classproperty def INSTALL_HINT(cls): msg = f'Run "deno run -A https://deno.land/x/puppeteer@{cls._PACKAGE_VERSION}/install.ts" to install puppeteer' - if not 
DenoWrapper.is_available: - msg = f'{DenoWrapper.INSTALL_HINT}. Then {msg}' + if not DenoJSI.is_available: + msg = f'{DenoJSI.INSTALL_HINT}. Then {msg}' return msg @classproperty(cache=True) def full_version(cls): - if not DenoWrapper.is_available: + if not DenoJSI.is_available: return try: - browser_version = DenoWrapper._execute(f''' - import puppeteer from "https://deno.land/x/puppeteer@16.2.0/mod.ts"; + browser_version = DenoJSI._execute(f''' + import puppeteer from "https://deno.land/x/puppeteer@{cls._PACKAGE_VERSION}/mod.ts"; const browser = await puppeteer.launch({{headless: {json.dumps(bool(cls._HEADLESS))}}}); try {{ - //await (new ) console.log(await browser.version()) }} finally {{ await browser.close(); @@ -186,15 +201,11 @@ def full_version(cls): @classproperty def version(cls): - return cls._PACKAGE_VERSION if cls.full_version else None + return DenoJSI.version if cls.full_version else None - def __init__(self, extractor: InfoExtractor, required_version=None, timeout=10000): - self.deno = DenoWrapper(extractor, timeout=(timeout + 30000)) - self.timeout = timeout - self.extractor = extractor - - if required_version: - self.extractor.report_warning(f'required_version is not supported on {self.__class__.__name__}') + def __init__(self, downloader: YoutubeDL, timeout: float | int | None = None): + super().__init__(downloader, timeout) + self.deno = DenoJSI(downloader, timeout=(self.timeout + 30000)) def _deno_execute(self, jscode, note=None): return self.deno.execute(f''' @@ -208,7 +219,7 @@ def _deno_execute(self, jscode, note=None): }}''', note=note, flags=['--allow-all'], base_js='') def execute(self, jscode, video_id=None, note='Executing JS in Puppeteer', url='about:blank'): - self.extractor.to_screen(f'{format_field(video_id, None, "%s: ")}{note}') + self._downloader.to_screen(f'{format_field(video_id, None, "%s: ")}{note}') return self._deno_execute(f''' const page = await browser.newPage(); window.setTimeout(async () => {{ @@ -297,7 +308,7 @@ 
class PhantomJSwrapper(ExternalJSI): def _version(cls): return cls.version - def __init__(self, extractor, required_version=None, timeout=10000): + def __init__(self, extractor: InfoExtractor, required_version=None, timeout=10000): self._TMP_FILES = {} if not self.exe: @@ -429,3 +440,9 @@ def execute(self, jscode, video_id=None, *, note='Executing JS in PhantomJS'): raise ExtractorError(f'{note} failed with returncode {returncode}:\n{stderr.strip()}') return stdout + + +if typing.TYPE_CHECKING: + from ..YoutubeDL import YoutubeDL + # from .common import JSIRequest, JSIResponse + from ..extractor.common import InfoExtractor From 302ee5d0d4e33ce80f865fc5f6ba0d9a41a574ce Mon Sep 17 00:00:00 2001 From: c-basalt <117849907+c-basalt@users.noreply.github.com> Date: Thu, 19 Sep 2024 19:24:50 -0400 Subject: [PATCH 24/48] test behavior --- yt_dlp/jsinterp/common.py | 39 ++++++++++++++++++++++++++++++++++----- 1 file changed, 34 insertions(+), 5 deletions(-) diff --git a/yt_dlp/jsinterp/common.py b/yt_dlp/jsinterp/common.py index 934dd5122..0938cb98b 100644 --- a/yt_dlp/jsinterp/common.py +++ b/yt_dlp/jsinterp/common.py @@ -2,7 +2,6 @@ import abc import typing -# import dataclasses from ..utils import classproperty @@ -90,6 +89,10 @@ def add_handler(self, handler: JSI): def write_debug(self): return self._downloader.write_debug + @property + def report_warning(self): + return self._downloader.report_warning + def _get_handlers(self, method: str, *args, **kwargs) -> list[JSI]: handlers = [h for h in self._handler_dict.values() if getattr(h, method, None)] self.write_debug(f'JSIDirector has handlers for `{method}`: {handlers}') @@ -125,22 +128,48 @@ def _get_handler_method(method_name: str): def handler(self: JSIDirector, *args, **kwargs): unavailable: list[JSI] = [] exceptions: list[tuple[JSI, Exception]] = [] + is_test = self._downloader.params.get('test', False) + results: list[tuple[JSI, typing.Any]] = [] + for handler in self._get_handlers(method_name, *args, **kwargs): if 
not handler.is_available: + if is_test: + raise Exception(f'{handler.JSI_NAME} is not available for testing, ' + f'add "{handler.JSI_KEY}" in `exclude` if it should not be used') self.write_debug(f'{handler.JSI_NAME} is not available') unavailable.append(handler) continue try: self.write_debug(f'Dispatching `{method_name}` task to {handler.JSI_NAME}') - return getattr(handler, method_name)(*args, **kwargs) + result = getattr(handler, method_name)(*args, **kwargs) + if is_test: + results.append((handler, result)) + else: + return result except Exception as e: if handler.JSI_KEY not in self._fallback_jsi: raise else: exceptions.append((handler, e)) - if not exceptions: - raise Exception(f'No available JSI installed, please install one of: {join_jsi_name(unavailable)}') - raise Exception(f'Failed to perform {method_name}, total {len(exceptions)} errors. Following JSI have been skipped and you can try installing one of them: {join_jsi_name(unavailable)}') + self.write_debug(f'{handler.JSI_NAME} encountered error, fallback to next handler: {e}') + + if not is_test or not results: + if not exceptions: + msg = f'No available JSI installed, please install one of: {join_jsi_name(unavailable)}' + else: + msg = f'Failed to perform {method_name}, total {len(exceptions)} errors' + if unavailable: + msg = f'{msg}. 
You can try installing one of unavailable JSI: {join_jsi_name(unavailable)}' + raise Exception(msg) + + if is_test: + ref_handler, ref_result = results[0] + for handler, result in results[1:]: + if result != ref_result: + self.report_warning( + f'Different JSI results produced from {ref_handler.JSI_NAME} and {handler.JSI_NAME}') + return ref_result + return handler execute = _get_handler_method('execute') From e0bf89ff2e888afc1ecc8b65c33d15aa2ed23602 Mon Sep 17 00:00:00 2001 From: c-basalt <117849907+c-basalt@users.noreply.github.com> Date: Mon, 30 Sep 2024 02:27:38 -0400 Subject: [PATCH 25/48] features check --- yt_dlp/jsinterp/common.py | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/yt_dlp/jsinterp/common.py b/yt_dlp/jsinterp/common.py index 0938cb98b..6651ee648 100644 --- a/yt_dlp/jsinterp/common.py +++ b/yt_dlp/jsinterp/common.py @@ -67,6 +67,7 @@ def __init__( ): self._downloader = downloader self._verbose = verbose + self._features = features jsi_keys = set(get_jsi_keys(only_include or _JSI_HANDLERS)) - set(get_jsi_keys(exclude)) handler_classes = [_JSI_HANDLERS[key] for key in jsi_keys @@ -124,8 +125,15 @@ def _get_handlers(self, method: str, *args, **kwargs) -> list[JSI]: # raise # raise EvaluationError - def _get_handler_method(method_name: str): + def _get_handler_method(method_name: str, feature_params: dict): + assert all(feature in _ALL_FEATURES for feature in feature_params.values()) + def handler(self: JSIDirector, *args, **kwargs): + for name, feature in feature_params.items(): + if name in kwargs and feature not in self._features: + raise Exception(f'feature {feature} is required for using {name} params ' + f'but not provided when instantiating {self.__class__.__name__}') + unavailable: list[JSI] = [] exceptions: list[tuple[JSI, Exception]] = [] is_test = self._downloader.params.get('test', False) @@ -172,8 +180,8 @@ def handler(self: JSIDirector, *args, **kwargs): return handler - execute = 
_get_handler_method('execute') - evaluate = _get_handler_method('evaluate') + execute = _get_handler_method('execute', {'html': 'dom'}) + evaluate = _get_handler_method('evaluate', {'html': 'dom'}) class JSI(abc.ABC): From 5d32d72a837e113d01fecbfa1a6162c829ca456a Mon Sep 17 00:00:00 2001 From: c-basalt <117849907+c-basalt@users.noreply.github.com> Date: Sat, 28 Dec 2024 03:52:00 -0500 Subject: [PATCH 26/48] update --- yt_dlp/jsinterp/__init__.py | 4 +- yt_dlp/jsinterp/common.py | 239 +++++++++++++++++++----------------- yt_dlp/jsinterp/external.py | 4 +- 3 files changed, 127 insertions(+), 120 deletions(-) diff --git a/yt_dlp/jsinterp/__init__.py b/yt_dlp/jsinterp/__init__.py index 3daaba894..94b471329 100644 --- a/yt_dlp/jsinterp/__init__.py +++ b/yt_dlp/jsinterp/__init__.py @@ -1,6 +1,6 @@ from .native import JSInterpreter as NativeJSI from .external import PhantomJSwrapper, DenoJSI, PuppeteerJSI -from .common import _JSI_PREFERENCES, _JSI_HANDLERS, JSIDirector +from .common import _JSI_PREFERENCES, _JSI_HANDLERS, JSInterp __all__ = [ @@ -10,5 +10,5 @@ PuppeteerJSI, _JSI_HANDLERS, _JSI_PREFERENCES, - JSIDirector, + JSInterp, ] diff --git a/yt_dlp/jsinterp/common.py b/yt_dlp/jsinterp/common.py index 6651ee648..b898a2a21 100644 --- a/yt_dlp/jsinterp/common.py +++ b/yt_dlp/jsinterp/common.py @@ -2,8 +2,10 @@ import abc import typing +import functools -from ..utils import classproperty +from ..utils import classproperty, variadic, ExtractorError +from ..extractor.common import InfoExtractor DEFAULT_TIMEOUT = 10000 @@ -33,29 +35,36 @@ def join_jsi_name(jsi_list: typing.Iterable[str | type[JSI] | JSI], sep=', '): return sep.join(get_jsi_keys(jok if isinstance(jok, str) else jok.JSI_NAME for jok in jsi_list)) -class JSIExec(typing.Protocol): - @abc.abstractmethod - def execute(self, jscode: str) -> str: - """Execute JS code and return console.log contents, using `html` requires `dom` feature""" +def require_features(param_features: dict[str, str | 
typing.Iterable[str]]): + assert all(_ALL_FEATURES.issuperset(variadic(kw_feature)) for kw_feature in param_features.values()) + + def outer(func): + @functools.wraps(func) + def inner(self: JSInterp, *args, **kwargs): + for kw_name, kw_feature in param_features.items(): + if kw_name in kwargs and not self._features.issuperset(variadic(kw_feature)): + raise ExtractorError(f'feature {kw_feature} is required for `{kw_name}` param but not declared') + return func(self, *args, **kwargs) + return inner + return outer -class JSIDirector(JSIExec): - """JSIDirector class +class JSInterp: + """ + Helper class to forward JS interp request to a concrete JSI that supports it. - Helper class to forward JS interpretation need to a JSI that supports it. - - @param downloader: downloader instance. + @param dl_or_ie: `YoutubeDL` or `InfoExtractor` instance. @param features: list of features that JSI must support. - @param only_include: list of JSI to choose from. - @param exclude: list of JSI to avoid using. - @param jsi_params: extra parameters to pass to `JSI.__init__()`. + @param only_include: limit JSI to choose from. + @param exclude: JSI to avoid using. + @param jsi_params: extra kwargs to pass to `JSI.__init__()` for each JSI, using jsi key as dict key. @param preferred_order: list of JSI to use. First in list is tested first. @param fallback_jsi: list of JSI that may fail and should act non-fatal and fallback to other JSI. 
Pass `"all"` to always fallback - @param timeout: timeout in miliseconds for JS interpretation + @param timeout: explicit timeout parameter in miliseconds for all chosen JSI """ def __init__( self, - downloader: YoutubeDL, + dl_or_ie: YoutubeDL | InfoExtractor, features: typing.Iterable[str] = [], only_include: typing.Iterable[str | type[JSI]] = [], exclude: typing.Iterable[str | type[JSI]] = [], @@ -63,140 +72,135 @@ def __init__( preferred_order: typing.Iterable[str | type[JSI]] = [], fallback_jsi: typing.Iterable[str | type[JSI]] | typing.Literal['all'] = [], timeout: float | None = None, - verbose=False, ): - self._downloader = downloader - self._verbose = verbose - self._features = features + self._downloader: YoutubeDL = dl_or_ie._downloader if isinstance(dl_or_ie, InfoExtractor) else dl_or_ie + self._features = set(features) - jsi_keys = set(get_jsi_keys(only_include or _JSI_HANDLERS)) - set(get_jsi_keys(exclude)) + if unsupported_features := self._features - _ALL_FEATURES: + raise ExtractorError(f'Unsupported features: {unsupported_features}, allowed features: {_ALL_FEATURES}') + + jsi_keys = [key for key in get_jsi_keys(only_include or _JSI_HANDLERS) if key not in get_jsi_keys(exclude)] + self.write_debug(f'Allowed JSI keys: {jsi_keys}') handler_classes = [_JSI_HANDLERS[key] for key in jsi_keys - if _JSI_HANDLERS[key]._SUPPORTED_FEATURES.issuperset(features)] - if not handler_classes: - raise Exception(f'No JSI can be selected for features: {features}, ' - f'included: {get_jsi_keys(only_include) or "all"}, excluded: {get_jsi_keys(exclude)}') + if _JSI_HANDLERS[key]._SUPPORT_FEATURES.issuperset(self._features)] + self.write_debug(f'Selected JSI classes for given features: {get_jsi_keys(handler_classes)}, ' + f'included: {get_jsi_keys(only_include) or "all"}, excluded: {get_jsi_keys(exclude)}') - self._handler_dict = {cls.JSI_KEY: cls(downloader, timeout, **jsi_params.get(cls.JSI_KEY, {})) + self._handler_dict = {cls.JSI_KEY: cls(self._downloader, timeout, 
**jsi_params.get(cls.JSI_KEY, {})) for cls in handler_classes} self.preferences: set[JSIPreference] = {order_to_pref(preferred_order, 100)} | _JSI_PREFERENCES self._fallback_jsi = get_jsi_keys(handler_classes) if fallback_jsi == 'all' else get_jsi_keys(fallback_jsi) + self._is_test = self._downloader.params.get('test', False) def add_handler(self, handler: JSI): """Add a handler. If a handler of the same JSI_KEY exists, it will overwrite it""" assert isinstance(handler, JSI), 'handler must be a JSI instance' + if not handler._SUPPORT_FEATURES.issuperset(self._features): + raise ExtractorError(f'{handler.JSI_NAME} does not support all required features: {self._features}') self._handler_dict[handler.JSI_KEY] = handler - @property - def write_debug(self): - return self._downloader.write_debug + def write_debug(self, message, only_once=False): + return self._downloader.write_debug(f'[JSIDirector] {message}', only_once=only_once) - @property - def report_warning(self): - return self._downloader.report_warning + def report_warning(self, message, only_once=False): + return self._downloader.report_warning(f'[JSIDirector] {message}', only_once=only_once) - def _get_handlers(self, method: str, *args, **kwargs) -> list[JSI]: - handlers = [h for h in self._handler_dict.values() if getattr(h, method, None)] - self.write_debug(f'JSIDirector has handlers for `{method}`: {handlers}') + def _get_handlers(self, method_name: str, *args, **kwargs) -> list[JSI]: + handlers = [h for h in self._handler_dict.values() if callable(getattr(h, method_name, None))] + self.write_debug(f'Choosing handlers for method `{method_name}`: {get_jsi_keys(handlers)}') if not handlers: - raise Exception(f'No JSI supports method `{method}`, ' - f'included handlers: {[handler.JSI_KEY for handler in self._handler_dict.values()]}') + raise ExtractorError(f'No JSI supports method `{method_name}`, ' + f'included handlers: {get_jsi_keys(self._handler_dict.values())}') preferences = { - handler: 
sum(pref_func(handler, method, args, kwargs) for pref_func in self.preferences) + handler.JSI_KEY: sum(pref_func(handler, method_name, args, kwargs) for pref_func in self.preferences) for handler in handlers } - self._downloader.write_debug('JSI preferences for this request: {}'.format(', '.join( - f'{jsi.JSI_NAME}={pref}' for jsi, pref in preferences.items()))) + self.write_debug('JSI preferences for `{}` request: {}'.format( + method_name, ', '.join(f'{key}={pref}' for key, pref in preferences.items()))) - return sorted(self._handler_dict.values(), key=preferences.get, reverse=True) + return sorted(handlers, key=lambda h: preferences[h.JSI_KEY], reverse=True) - # def _send(self, request: JSIRequest): - # unavailable_handlers = [] - # exec_errors = [] - # for handler in self._get_handlers(request): - # if not handler.is_available: - # unavailable_handlers.append(handler) - # continue - # try: - # return handler.handle(request) - # except Exception as e: - # exec_errors.append(e) - # if not request.fallback: - # raise - # raise EvaluationError + def _dispatch_request(self, method_name: str, *args, **kwargs): + handlers = self._get_handlers(method_name, *args, **kwargs) - def _get_handler_method(method_name: str, feature_params: dict): - assert all(feature in _ALL_FEATURES for feature in feature_params.values()) + unavailable: list[JSI] = [] + exceptions: list[tuple[JSI, Exception]] = [] + test_results: list[tuple[JSI, typing.Any]] = [] - def handler(self: JSIDirector, *args, **kwargs): - for name, feature in feature_params.items(): - if name in kwargs and feature not in self._features: - raise Exception(f'feature {feature} is required for using {name} params ' - f'but not provided when instantiating {self.__class__.__name__}') - - unavailable: list[JSI] = [] - exceptions: list[tuple[JSI, Exception]] = [] - is_test = self._downloader.params.get('test', False) - results: list[tuple[JSI, typing.Any]] = [] - - for handler in self._get_handlers(method_name, *args, 
**kwargs): - if not handler.is_available: - if is_test: - raise Exception(f'{handler.JSI_NAME} is not available for testing, ' - f'add "{handler.JSI_KEY}" in `exclude` if it should not be used') - self.write_debug(f'{handler.JSI_NAME} is not available') - unavailable.append(handler) - continue - try: - self.write_debug(f'Dispatching `{method_name}` task to {handler.JSI_NAME}') - result = getattr(handler, method_name)(*args, **kwargs) - if is_test: - results.append((handler, result)) - else: - return result - except Exception as e: - if handler.JSI_KEY not in self._fallback_jsi: - raise - else: - exceptions.append((handler, e)) - self.write_debug(f'{handler.JSI_NAME} encountered error, fallback to next handler: {e}') - - if not is_test or not results: - if not exceptions: - msg = f'No available JSI installed, please install one of: {join_jsi_name(unavailable)}' + for handler in handlers: + if not handler.is_available(): + if self._is_test: + raise Exception(f'{handler.JSI_NAME} is not available for testing, ' + f'add "{handler.JSI_KEY}" in `exclude` if it should not be used') + self.write_debug(f'{handler.JSI_NAME} is not available') + unavailable.append(handler) + continue + try: + self.write_debug(f'Dispatching `{method_name}` task to {handler.JSI_NAME}') + result = getattr(handler, method_name)(*args, **kwargs) + if self._is_test: + test_results.append((handler, result)) else: - msg = f'Failed to perform {method_name}, total {len(exceptions)} errors' - if unavailable: - msg = f'{msg}. 
You can try installing one of unavailable JSI: {join_jsi_name(unavailable)}' - raise Exception(msg) + return result + except Exception as e: + if handler.JSI_KEY not in self._fallback_jsi: + raise + else: + exceptions.append((handler, e)) + self.write_debug(f'{handler.JSI_NAME} encountered error, fallback to next handler: {e}') - if is_test: - ref_handler, ref_result = results[0] - for handler, result in results[1:]: - if result != ref_result: - self.report_warning( - f'Different JSI results produced from {ref_handler.JSI_NAME} and {handler.JSI_NAME}') - return ref_result + if self._is_test and test_results: + ref_handler, ref_result = test_results[0] + for handler, result in test_results[1:]: + if result != ref_result: + self.report_warning( + f'Different JSI results produced from {ref_handler.JSI_NAME} and {handler.JSI_NAME}') + return ref_result - return handler + if not exceptions: + msg = f'No available JSI installed, please install one of: {join_jsi_name(unavailable)}' + else: + msg = f'Failed to perform {method_name}, total {len(exceptions)} errors' + if unavailable: + msg = f'{msg}. 
You can try installing one of unavailable JSI: {join_jsi_name(unavailable)}' + raise ExtractorError(msg) - execute = _get_handler_method('execute', {'html': 'dom'}) - evaluate = _get_handler_method('evaluate', {'html': 'dom'}) + @require_features({'html': 'dom'}) + def execute(self, jscode: str, url: str | None = None, html: str | None = None) -> str: + """ + Execute JS code and return stdout from console.log + `html` requires `dom` feature + """ + return self._dispatch_request('execute', jscode, url=url, html=html) + + @require_features({'html': 'dom'}) + def evaluate(self, jscode: str, url: str | None = None, html: str | None = None) -> typing.Any: + """ + Evaluate JS code and return result + `html` requires `dom` feature + """ + return self._dispatch_request('evaluate', jscode, url=url, html=html) class JSI(abc.ABC): - _SUPPORTED_FEATURES: set[str] = set() + _SUPPORT_FEATURES: set[str] = set() _BASE_PREFERENCE: int = 0 def __init__(self, downloader: YoutubeDL, timeout: float | int | None = None): self._downloader = downloader self.timeout = float(timeout or DEFAULT_TIMEOUT) - @property @abc.abstractmethod def is_available(self) -> bool: raise NotImplementedError + def write_debug(self, message, only_once=False): + return self._downloader.write_debug(f'[{self.JSI_KEY}] {message}', only_once=only_once) + + def report_warning(self, message, only_once=False): + return self._downloader.report_warning(f'[{self.JSI_KEY}] {message}', only_once=only_once) + @classproperty def JSI_NAME(cls) -> str: return cls.__name__[:-3] @@ -207,13 +211,13 @@ def JSI_KEY(cls) -> str: return cls.__name__[:-3] -def register_jsi(handler_cls: TYPE_JSI) -> TYPE_JSI: +def register_jsi(jsi_cls: JsiClass) -> JsiClass: """Register a JS interpreter class""" - assert issubclass(handler_cls, JSI), f'{handler_cls} must be a subclass of JSI' - assert handler_cls.JSI_KEY not in _JSI_HANDLERS, f'JSI {handler_cls.JSI_KEY} already registered' - assert 
handler_cls._SUPPORTED_FEATURES.issubset(_ALL_FEATURES), f'{handler_cls._SUPPORTED_FEATURES - _ALL_FEATURES} is not declared in `_All_FEATURES`' - _JSI_HANDLERS[handler_cls.JSI_KEY] = handler_cls - return handler_cls + assert issubclass(jsi_cls, JSI), f'{jsi_cls} must be a subclass of JSI' + assert jsi_cls.JSI_KEY not in _JSI_HANDLERS, f'JSI {jsi_cls.JSI_KEY} already registered' + assert jsi_cls._SUPPORT_FEATURES.issubset(_ALL_FEATURES), f'{jsi_cls._SUPPORT_FEATURES - _ALL_FEATURES} not declared in `_All_FEATURES`' + _JSI_HANDLERS[jsi_cls.JSI_KEY] = jsi_cls + return jsi_cls def register_jsi_preference(*handlers: type[JSI]): @@ -236,5 +240,8 @@ def _base_preference(handler: JSI, *args): if typing.TYPE_CHECKING: from ..YoutubeDL import YoutubeDL - JSIPreference = typing.Callable[[JSI, str, list, dict], int] - TYPE_JSI = typing.TypeVar('TYPE_JSI') + JsiClass = typing.TypeVar('JsiClass', bound=typing.Type[JSI]) + + class JSIPreference(typing.Protocol): + def __call__(self, handler: JSI, method_name: str, *args, **kwargs) -> int: + ... 
diff --git a/yt_dlp/jsinterp/external.py b/yt_dlp/jsinterp/external.py index b02b5e25c..36c668938 100644 --- a/yt_dlp/jsinterp/external.py +++ b/yt_dlp/jsinterp/external.py @@ -113,8 +113,8 @@ def exe(cls): return cls._EXE_NAME if cls.version else None @classproperty - def is_available(self): - return bool(self.exe) + def is_available(cls): + return bool(cls.exe) @register_jsi From 65e238c45dc6c6f9a5d070fa4a37038f508d7d89 Mon Sep 17 00:00:00 2001 From: c-basalt <117849907+c-basalt@users.noreply.github.com> Date: Sat, 28 Dec 2024 03:56:12 -0500 Subject: [PATCH 27/48] update --- test/test_youtube_signature.py | 4 ++-- yt_dlp/extractor/youtube.py | 16 ++++++++-------- yt_dlp/jsinterp/__init__.py | 4 ++-- yt_dlp/jsinterp/common.py | 2 +- 4 files changed, 13 insertions(+), 13 deletions(-) diff --git a/test/test_youtube_signature.py b/test/test_youtube_signature.py index 44d06cc55..13436f088 100644 --- a/test/test_youtube_signature.py +++ b/test/test_youtube_signature.py @@ -15,7 +15,7 @@ from test.helper import FakeYDL, is_download_test from yt_dlp.extractor import YoutubeIE -from yt_dlp.jsinterp import NativeJSI +from yt_dlp.jsinterp import JSInterpreter _SIG_TESTS = [ ( @@ -274,7 +274,7 @@ def signature(jscode, sig_input): def n_sig(jscode, sig_input): ie = YoutubeIE(FakeYDL()) funcname = ie._extract_n_function_name(jscode) - jsi = NativeJSI(jscode) + jsi = JSInterpreter(jscode) func = jsi.extract_function_from_code(*ie._fixup_n_function_code(*jsi.extract_function_code(funcname))) return func([sig_input]) diff --git a/yt_dlp/extractor/youtube.py b/yt_dlp/extractor/youtube.py index 553bbcde7..fa73af7bb 100644 --- a/yt_dlp/extractor/youtube.py +++ b/yt_dlp/extractor/youtube.py @@ -20,7 +20,7 @@ import urllib.parse from .common import InfoExtractor, SearchInfoExtractor -from ..jsinterp import NativeJSI, PhantomJSwrapper +from ..jsinterp import JSInterpreter, PhantomJSwrapper from ..networking.exceptions import HTTPError, network_exceptions from ..utils import ( 
NO_DEFAULT, @@ -3169,7 +3169,7 @@ def _parse_sig_js(self, jscode): r'\bc\s*&&\s*[a-zA-Z0-9]+\.set\([^,]+\s*,\s*\([^)]*\)\s*\(\s*(?P[a-zA-Z0-9$]+)\('), jscode, 'Initial JS player signature function name', group='sig') - jsi = NativeJSI(jscode) + jsi = JSInterpreter(jscode) initial_function = jsi.extract_function(funcname) return lambda s: initial_function([s]) @@ -3213,7 +3213,7 @@ def _decrypt_nsig(self, s, video_id, player_url): try: extract_nsig = self._cached(self._extract_n_function_from_code, 'nsig func', player_url) ret = extract_nsig(jsi, func_code)(s) - except NativeJSI.Exception as e: + except JSInterpreter.Exception as e: try: jsi = PhantomJSwrapper(self, timeout=5000) except ExtractorError: @@ -3283,7 +3283,7 @@ def _extract_n_function_code(self, video_id, player_url): player_id = self._extract_player_info(player_url) func_code = self.cache.load('youtube-nsig', player_id, min_ver='2024.07.09') jscode = func_code or self._load_player(video_id, player_url) - jsi = NativeJSI(jscode) + jsi = JSInterpreter(jscode) if func_code: return jsi, player_id, func_code @@ -3302,13 +3302,13 @@ def _extract_n_function_from_code(self, jsi, func_code): def extract_nsig(s): try: ret = func([s]) - except NativeJSI.Exception: + except JSInterpreter.Exception: raise except Exception as e: - raise NativeJSI.Exception(traceback.format_exc(), cause=e) + raise JSInterpreter.Exception(traceback.format_exc(), cause=e) if ret.startswith('enhanced_except_') or ret.endswith(s): - raise NativeJSI.Exception('Signature function returned an exception') + raise JSInterpreter.Exception('Signature function returned an exception') return ret return extract_nsig @@ -4156,7 +4156,7 @@ def build_fragments(f): }) except ExtractorError as e: phantomjs_hint = '' - if isinstance(e, NativeJSI.Exception): + if isinstance(e, JSInterpreter.Exception): phantomjs_hint = (f' Install {self._downloader._format_err("PhantomJS", self._downloader.Styles.EMPHASIS)} ' f'to workaround the issue. 
{PhantomJSwrapper.INSTALL_HINT}\n') if player_url: diff --git a/yt_dlp/jsinterp/__init__.py b/yt_dlp/jsinterp/__init__.py index 94b471329..2477f0774 100644 --- a/yt_dlp/jsinterp/__init__.py +++ b/yt_dlp/jsinterp/__init__.py @@ -1,10 +1,10 @@ -from .native import JSInterpreter as NativeJSI +from .native import JSInterpreter from .external import PhantomJSwrapper, DenoJSI, PuppeteerJSI from .common import _JSI_PREFERENCES, _JSI_HANDLERS, JSInterp __all__ = [ - NativeJSI, + JSInterpreter, PhantomJSwrapper, DenoJSI, PuppeteerJSI, diff --git a/yt_dlp/jsinterp/common.py b/yt_dlp/jsinterp/common.py index b898a2a21..d6594fd42 100644 --- a/yt_dlp/jsinterp/common.py +++ b/yt_dlp/jsinterp/common.py @@ -240,7 +240,7 @@ def _base_preference(handler: JSI, *args): if typing.TYPE_CHECKING: from ..YoutubeDL import YoutubeDL - JsiClass = typing.TypeVar('JsiClass', bound=typing.Type[JSI]) + JsiClass = typing.TypeVar('JsiClass', bound=type[JSI]) class JSIPreference(typing.Protocol): def __call__(self, handler: JSI, method_name: str, *args, **kwargs) -> int: From fdd98ba6e0f430f89d41293a3c779b4f3d5a796c Mon Sep 17 00:00:00 2001 From: c-basalt <117849907+c-basalt@users.noreply.github.com> Date: Sun, 29 Dec 2024 02:56:58 -0500 Subject: [PATCH 28/48] update --- yt_dlp/extractor/douyutv.py | 13 ++--- yt_dlp/jsinterp/common.py | 57 +++++++++++++------- yt_dlp/jsinterp/external.py | 102 +++++++++++++++++++++--------------- 3 files changed, 101 insertions(+), 71 deletions(-) diff --git a/yt_dlp/extractor/douyutv.py b/yt_dlp/extractor/douyutv.py index c6f816bc6..b413f33d2 100644 --- a/yt_dlp/extractor/douyutv.py +++ b/yt_dlp/extractor/douyutv.py @@ -4,7 +4,7 @@ import uuid from .common import InfoExtractor -from ..jsinterp import DenoJSI, PhantomJSwrapper +from ..jsinterp import PhantomJSwrapper from ..utils import ( ExtractorError, UserNotLive, @@ -43,14 +43,9 @@ def _calc_sign(self, sign_func, video_id, a): b = uuid.uuid4().hex c = round(time.time()) js_script = 
f'{self._get_cryptojs_md5(video_id)};{sign_func};console.log(ub98484234("{a}","{b}","{c}"))' - if DenoJSI.is_available: - jsi = DenoJSI(self) - elif PhantomJSwrapper.is_available: - jsi = PhantomJSwrapper(self) - else: - raise ExtractorError('You need to install either Deno or PhantomJS. ' - f'{DenoJSI.INSTALL_HINT}. {PhantomJSwrapper.INSTALL_HINT}', expected=True) - result = jsi.execute(js_script, video_id, note='Executing JS signing script').strip() + phantom = PhantomJSwrapper(self) + result = phantom.execute(js_script, video_id, + note='Executing JS signing script').strip() return {i: v[0] for i, v in urllib.parse.parse_qs(result).items()} def _search_js_sign_func(self, webpage, fatal=True): diff --git a/yt_dlp/jsinterp/common.py b/yt_dlp/jsinterp/common.py index d6594fd42..8f49fb2a3 100644 --- a/yt_dlp/jsinterp/common.py +++ b/yt_dlp/jsinterp/common.py @@ -4,16 +4,16 @@ import typing import functools -from ..utils import classproperty, variadic, ExtractorError +from ..utils import classproperty, format_field, variadic, ExtractorError from ..extractor.common import InfoExtractor -DEFAULT_TIMEOUT = 10000 _JSI_HANDLERS: dict[str, type[JSI]] = {} _JSI_PREFERENCES: set[JSIPreference] = set() _ALL_FEATURES = { 'js', 'wasm', + 'location', 'dom', } @@ -60,8 +60,9 @@ class JSInterp: @param jsi_params: extra kwargs to pass to `JSI.__init__()` for each JSI, using jsi key as dict key. @param preferred_order: list of JSI to use. First in list is tested first. @param fallback_jsi: list of JSI that may fail and should act non-fatal and fallback to other JSI. 
Pass `"all"` to always fallback - @param timeout: explicit timeout parameter in miliseconds for all chosen JSI + @param timeout: timeout parameter for all chosen JSI """ + def __init__( self, dl_or_ie: YoutubeDL | InfoExtractor, @@ -71,7 +72,7 @@ def __init__( jsi_params: dict[str, dict] = {}, preferred_order: typing.Iterable[str | type[JSI]] = [], fallback_jsi: typing.Iterable[str | type[JSI]] | typing.Literal['all'] = [], - timeout: float | None = None, + timeout: float | int = 10, ): self._downloader: YoutubeDL = dl_or_ie._downloader if isinstance(dl_or_ie, InfoExtractor) else dl_or_ie self._features = set(features) @@ -86,7 +87,7 @@ def __init__( self.write_debug(f'Selected JSI classes for given features: {get_jsi_keys(handler_classes)}, ' f'included: {get_jsi_keys(only_include) or "all"}, excluded: {get_jsi_keys(exclude)}') - self._handler_dict = {cls.JSI_KEY: cls(self._downloader, timeout, **jsi_params.get(cls.JSI_KEY, {})) + self._handler_dict = {cls.JSI_KEY: cls(self._downloader, timeout=timeout, **jsi_params.get(cls.JSI_KEY, {})) for cls in handler_classes} self.preferences: set[JSIPreference] = {order_to_pref(preferred_order, 100)} | _JSI_PREFERENCES self._fallback_jsi = get_jsi_keys(handler_classes) if fallback_jsi == 'all' else get_jsi_keys(fallback_jsi) @@ -166,40 +167,56 @@ def _dispatch_request(self, method_name: str, *args, **kwargs): msg = f'{msg}. 
You can try installing one of unavailable JSI: {join_jsi_name(unavailable)}' raise ExtractorError(msg) - @require_features({'html': 'dom'}) - def execute(self, jscode: str, url: str | None = None, html: str | None = None) -> str: + @require_features({'url': 'location', 'html': 'dom'}) + def execute(self, jscode: str, video_id: str | None, **kwargs) -> str: """ Execute JS code and return stdout from console.log - `html` requires `dom` feature - """ - return self._dispatch_request('execute', jscode, url=url, html=html) - @require_features({'html': 'dom'}) - def evaluate(self, jscode: str, url: str | None = None, html: str | None = None) -> typing.Any: + @param {str} jscode: JS code to execute + @param video_id: video id + @param note: note + @param {str} url: url to set location to, requires `location` feature + @param {str} html: html to load as document, requires `dom` feature + """ + return self._dispatch_request('execute', jscode, video_id, **kwargs) + + @require_features({'url': 'location', 'html': 'dom'}) + def evaluate(self, jscode: str, video_id: str | None, **kwargs) -> typing.Any: """ Evaluate JS code and return result - `html` requires `dom` feature + + @param {str} jscode: JS code to execute + @param video_id: video id + @param note: note + @param {str} url: url to set location to, requires `location` feature + @param {str} html: html to load as document, requires `dom` feature """ - return self._dispatch_request('evaluate', jscode, url=url, html=html) + return self._dispatch_request('evaluate', jscode, video_id, **kwargs) class JSI(abc.ABC): _SUPPORT_FEATURES: set[str] = set() _BASE_PREFERENCE: int = 0 - def __init__(self, downloader: YoutubeDL, timeout: float | int | None = None): + def __init__(self, downloader: YoutubeDL, timeout: float | int): self._downloader = downloader - self.timeout = float(timeout or DEFAULT_TIMEOUT) + self.timeout = timeout @abc.abstractmethod def is_available(self) -> bool: raise NotImplementedError - def write_debug(self, 
message, only_once=False): - return self._downloader.write_debug(f'[{self.JSI_KEY}] {message}', only_once=only_once) + def write_debug(self, message, *args, **kwargs): + self._downloader.write_debug(f'[{self.JSI_KEY}] {message}', *args, **kwargs) - def report_warning(self, message, only_once=False): - return self._downloader.report_warning(f'[{self.JSI_KEY}] {message}', only_once=only_once) + def report_warning(self, message, *args, **kwargs): + self._downloader.report_warning(f'[{self.JSI_KEY}] {message}', *args, **kwargs) + + def to_screen(self, msg, *args, **kwargs): + self._downloader.to_screen(f'[{self.JSI_KEY}] {msg}', *args, **kwargs) + + def report_note(self, video_id, note): + self.to_screen(f'{format_field(video_id, None, "%s: ")}{note}') @classproperty def JSI_NAME(cls) -> str: diff --git a/yt_dlp/jsinterp/external.py b/yt_dlp/jsinterp/external.py index 36c668938..f0ddf74ea 100644 --- a/yt_dlp/jsinterp/external.py +++ b/yt_dlp/jsinterp/external.py @@ -98,21 +98,17 @@ def __del__(self): class ExternalJSI(JSI, abc.ABC): - _EXE_NAME: str = None + _EXE_NAME: str @classproperty(cache=True) - def version(cls): + def exe_version(cls): return get_exe_version(cls._EXE_NAME, args=getattr(cls, 'V_ARGS', ['--version']), version_re=r'([0-9.]+)') - @classproperty - def full_version(cls): - return cls.version - @classproperty def exe(cls): - return cls._EXE_NAME if cls.version else None + return cls._EXE_NAME if cls.exe_version else None - @classproperty + @classmethod def is_available(cls): return bool(cls.exe) @@ -121,55 +117,78 @@ def is_available(cls): class DenoJSI(ExternalJSI): """JS interpreter class using Deno binary""" _EXE_NAME = 'deno' - INSTALL_HINT = 'Please install Deno from https://docs.deno.com/runtime/manual/getting_started/installation/ or download binary from https://github.com/denoland/deno/releases' - _SUPPORTED_FEATURES = {'js', 'wasm'} + _SUPPORTED_FEATURES = {'js', 'wasm', 'location'} + _DENO_FLAGS = ['--cached-only', '--no-prompt', 
'--no-check'] + _INIT_SCRIPT = 'localStorage.clear(); delete window.Deno; global = window;\n' - def __init__(self, downloader: YoutubeDL, timeout: float | int | None = None, required_version=None): + def __init__(self, downloader: YoutubeDL, timeout=None, flags=[], replace_flags=False, init_script=None): super().__init__(downloader, timeout) + self._flags = flags if replace_flags else [*self._DENO_FLAGS, *flags] + self._init_script = self._INIT_SCRIPT if init_script is None else init_script - @classmethod - def _execute(cls, jscode, downloader: YoutubeDL | None = None, video_id=None, note='', flags=[], timeout=10000): - js_file = TempFileWrapper(jscode, suffix='.js') - if note and downloader: - downloader.to_screen(f'{format_field(video_id, None, "%s: ")}{note}') - cmd = [cls.exe, 'run', *flags, js_file.name] + def _run_deno(self, cmd, video_id=None): + self.write_debug(f'Deno command line: {shell_quote(cmd)}') try: stdout, stderr, returncode = Popen.run( - cmd, timeout=timeout / 1000, text=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE) + cmd, timeout=self.timeout, text=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE) except Exception as e: raise ExtractorError('Unable to run Deno binary', cause=e) if returncode: raise ExtractorError(f'Failed with returncode {returncode}:\n{stderr}') - elif stderr and downloader: - downloader.report_warning(f'JS console error msg:\n{stderr.strip()}', video_id=video_id) + elif stderr: + self.report_warning(f'JS console error msg:\n{stderr.strip()}', video_id=video_id) return stdout.strip() - def execute(self, jscode, video_id=None, note='Executing JS in Deno', flags=[], base_js=None): - """Execute JS directly in Deno runtime and return stdout""" - - base_js = 'delete window.Deno; global = window;\n' if base_js is None else base_js - - return self._execute(base_js + jscode, downloader=self._downloader, video_id=video_id, note=note, - flags=flags, timeout=self.timeout) + def execute(self, jscode, video_id=None, 
note='Executing JS in Deno', url=None): + self.report_note(video_id, note) + js_file = TempFileWrapper(f'{self._init_script};\n{jscode}', suffix='.js') + location_args = ['--location', url] if url else [] + cmd = [self.exe, 'run', *self._flags, *location_args, js_file.name] + return self._run_deno(cmd, video_id=video_id) @register_jsi class DenoJITlessJSI(DenoJSI): _EXE_NAME = DenoJSI._EXE_NAME - INSTALL_HINT = DenoJSI.INSTALL_HINT - _SUPPORTED_FEATURES = {'js'} + _SUPPORTED_FEATURES = {'js', 'location'} + _DENO_FLAGS = ['--cached-only', '--no-prompt', '--no-check', '--v8-flags=--jitless,--noexpose-wasm'] @classproperty - def version(cls): - return DenoJSI.version - - def execute(self, jscode, video_id=None, note='Executing JS in Deno', flags=[], base_js=None): - # JIT-less mode does not support Wasm - return super().execute(jscode, video_id, note=note, - flags=[*flags, '--v8-flags=--jitless,--noexpose-wasm'], base_js=base_js) + def exe_version(cls): + return DenoJSI.exe_version + + +class DenoJSDomJSI(DenoJSI): + _SUPPORTED_FEATURES = {'js', 'wasm', 'dom'} + _DENO_FLAGS = ['--cached-only', '--no-prompt', '--no-check'] + _JSDOM_IMPORT = False + + def _ensure_jsdom(self): + if self._JSDOM_IMPORT: + return + js_file = TempFileWrapper('import { JSDOM } from "https://cdn.esm.sh/jsdom"', suffix='.js') + cmd = [self.exe, 'run', js_file.name] + self._run_deno(cmd) + self._JSDOM_IMPORT = True + + def execute(self, jscode, video_id=None, note='Executing JS in Deno', url=None, html=None): + self.report_note(video_id, note) + if html: + self._ensure_jsdom() + init_script = '''%s; + import { JSDOM } from "https://cdn.esm.sh/jsdom"; + const dom = new JSDOM(%s); + Object.keys(dom.window).forEach((key) => {try {window[key] = dom.window[key]} catch (e) {}}); + ''' % (self._init_script, json.dumps(html)) + else: + init_script = self._init_script + js_file = TempFileWrapper(f'{init_script};\n{jscode}', suffix='.js') + + location_args = ['--location', url] if url else [] + cmd = 
[self.exe, 'run', *self._flags, *location_args, js_file.name] + return self._run_deno(cmd, video_id=video_id) -@register_jsi class PuppeteerJSI(ExternalJSI): _PACKAGE_VERSION = '16.2.0' _HEADLESS = False @@ -200,8 +219,8 @@ def full_version(cls): return None @classproperty - def version(cls): - return DenoJSI.version if cls.full_version else None + def exe_version(cls): + return DenoJSI.exe_version if cls.full_version else None def __init__(self, downloader: YoutubeDL, timeout: float | int | None = None): super().__init__(downloader, timeout) @@ -306,7 +325,7 @@ class PhantomJSwrapper(ExternalJSI): @classmethod def _version(cls): - return cls.version + return cls.exe_version def __init__(self, extractor: InfoExtractor, required_version=None, timeout=10000): self._TMP_FILES = {} @@ -317,7 +336,7 @@ def __init__(self, extractor: InfoExtractor, required_version=None, timeout=1000 self.extractor = extractor if required_version: - if is_outdated_version(self.version, required_version): + if is_outdated_version(self.exe_version, required_version): self.extractor._downloader.report_warning( 'Your copy of PhantomJS is outdated, update it to version ' f'{required_version} or newer if you encounter any errors.') @@ -444,5 +463,4 @@ def execute(self, jscode, video_id=None, *, note='Executing JS in PhantomJS'): if typing.TYPE_CHECKING: from ..YoutubeDL import YoutubeDL - # from .common import JSIRequest, JSIResponse from ..extractor.common import InfoExtractor From 03a99eefaee13e408e5228e483f0a4e17337b282 Mon Sep 17 00:00:00 2001 From: c-basalt <117849907+c-basalt@users.noreply.github.com> Date: Sun, 29 Dec 2024 20:27:00 -0500 Subject: [PATCH 29/48] jsdom --- yt_dlp/jsinterp/common.py | 19 ++----- yt_dlp/jsinterp/external.py | 102 +++++++++++++++++++++++++++++------- 2 files changed, 86 insertions(+), 35 deletions(-) diff --git a/yt_dlp/jsinterp/common.py b/yt_dlp/jsinterp/common.py index 8f49fb2a3..4a6ba65d2 100644 --- a/yt_dlp/jsinterp/common.py +++ 
b/yt_dlp/jsinterp/common.py @@ -15,6 +15,7 @@ 'wasm', 'location', 'dom', + 'cookies', } @@ -167,7 +168,7 @@ def _dispatch_request(self, method_name: str, *args, **kwargs): msg = f'{msg}. You can try installing one of unavailable JSI: {join_jsi_name(unavailable)}' raise ExtractorError(msg) - @require_features({'url': 'location', 'html': 'dom'}) + @require_features({'location': 'location', 'html': 'dom', 'cookiejar': 'cookies'}) def execute(self, jscode: str, video_id: str | None, **kwargs) -> str: """ Execute JS code and return stdout from console.log @@ -175,24 +176,12 @@ def execute(self, jscode: str, video_id: str | None, **kwargs) -> str: @param {str} jscode: JS code to execute @param video_id: video id @param note: note - @param {str} url: url to set location to, requires `location` feature + @param {str} location: url to configure window.location, requires `location` feature @param {str} html: html to load as document, requires `dom` feature + @param {YoutubeDLCookieJar} cookiejar: cookiejar to set cookies, requires url and `cookies` feature """ return self._dispatch_request('execute', jscode, video_id, **kwargs) - @require_features({'url': 'location', 'html': 'dom'}) - def evaluate(self, jscode: str, video_id: str | None, **kwargs) -> typing.Any: - """ - Evaluate JS code and return result - - @param {str} jscode: JS code to execute - @param video_id: video id - @param note: note - @param {str} url: url to set location to, requires `location` feature - @param {str} html: html to load as document, requires `dom` feature - """ - return self._dispatch_request('evaluate', jscode, video_id, **kwargs) - class JSI(abc.ABC): _SUPPORT_FEATURES: set[str] = set() diff --git a/yt_dlp/jsinterp/external.py b/yt_dlp/jsinterp/external.py index f0ddf74ea..be09f3131 100644 --- a/yt_dlp/jsinterp/external.py +++ b/yt_dlp/jsinterp/external.py @@ -9,6 +9,7 @@ import tempfile import urllib.parse import typing +import http.cookiejar from ..utils import ( @@ -19,6 +20,8 @@ 
get_exe_version, is_outdated_version, shell_quote, + int_or_none, + unified_timestamp, ) from .common import JSI, register_jsi @@ -126,7 +129,7 @@ def __init__(self, downloader: YoutubeDL, timeout=None, flags=[], replace_flags= self._flags = flags if replace_flags else [*self._DENO_FLAGS, *flags] self._init_script = self._INIT_SCRIPT if init_script is None else init_script - def _run_deno(self, cmd, video_id=None): + def _run_deno(self, cmd): self.write_debug(f'Deno command line: {shell_quote(cmd)}') try: stdout, stderr, returncode = Popen.run( @@ -136,15 +139,15 @@ def _run_deno(self, cmd, video_id=None): if returncode: raise ExtractorError(f'Failed with returncode {returncode}:\n{stderr}') elif stderr: - self.report_warning(f'JS console error msg:\n{stderr.strip()}', video_id=video_id) + self.report_warning(f'JS console error msg:\n{stderr.strip()}') return stdout.strip() - def execute(self, jscode, video_id=None, note='Executing JS in Deno', url=None): + def execute(self, jscode, video_id=None, note='Executing JS in Deno', location=None): self.report_note(video_id, note) js_file = TempFileWrapper(f'{self._init_script};\n{jscode}', suffix='.js') - location_args = ['--location', url] if url else [] + location_args = ['--location', location] if location else [] cmd = [self.exe, 'run', *self._flags, *location_args, js_file.name] - return self._run_deno(cmd, video_id=video_id) + return self._run_deno(cmd) @register_jsi @@ -159,34 +162,92 @@ def exe_version(cls): class DenoJSDomJSI(DenoJSI): - _SUPPORTED_FEATURES = {'js', 'wasm', 'dom'} + _SUPPORTED_FEATURES = {'js', 'wasm', 'location', 'dom', 'cookies'} _DENO_FLAGS = ['--cached-only', '--no-prompt', '--no-check'] _JSDOM_IMPORT = False + @staticmethod + def serialize_cookie(cookiejar: YoutubeDLCookieJar | None, url: str): + """serialize netscape-compatible fields from cookiejar for tough-cookie loading""" + # JSDOM use tough-cookie as its CookieJar https://github.com/jsdom/jsdom/blob/main/lib/api.js + # tough-cookie 
use Cookie.fromJSON and Cookie.toJSON for cookie serialization + # https://github.com/salesforce/tough-cookie/blob/master/lib/cookie/cookie.ts + if not cookiejar: + return json.dumps({'cookies': []}) + cookies: list[http.cookiejar.Cookie] = [cookie for cookie in cookiejar.get_cookies_for_url(url)] + return json.dumps({'cookies': [{ + 'key': cookie.name, + 'value': cookie.value, + # leading dot must be removed, otherwise will fail to match + 'domain': cookie.domain.lstrip('.') or urllib.parse.urlparse(url).hostname, + 'expires': int_or_none(cookie.expires, invscale=1000), + 'hostOnly': not cookie.domain_initial_dot, + 'secure': bool(cookie.secure), + 'path': cookie.path, + } for cookie in cookies if cookie.value]}) + + @staticmethod + def apply_cookies(cookiejar: YoutubeDLCookieJar | None, cookies: list[dict]): + """apply cookies from serialized tough-cookie""" + # see serialize_cookie + if not cookiejar: + return + for cookie_dict in cookies: + if not all(cookie_dict.get(k) for k in ('key', 'value', 'domain')): + continue + if cookie_dict.get('hostOnly'): + cookie_dict['domain'] = cookie_dict['domain'].lstrip('.') + else: + cookie_dict['domain'] = '.' 
+ cookie_dict['domain'].lstrip('.') + + cookiejar.set_cookie(http.cookiejar.Cookie( + 0, cookie_dict['key'], cookie_dict['value'], + None, False, + cookie_dict['domain'], True, not cookie_dict.get('hostOnly'), + cookie_dict.get('path', '/'), True, + bool(cookie_dict.get('secure')), + unified_timestamp(cookie_dict.get('expires')), + False, None, None, {})) + def _ensure_jsdom(self): if self._JSDOM_IMPORT: return - js_file = TempFileWrapper('import { JSDOM } from "https://cdn.esm.sh/jsdom"', suffix='.js') + js_file = TempFileWrapper('import jsdom from "https://cdn.esm.sh/jsdom"', suffix='.js') cmd = [self.exe, 'run', js_file.name] self._run_deno(cmd) self._JSDOM_IMPORT = True - def execute(self, jscode, video_id=None, note='Executing JS in Deno', url=None, html=None): + def execute(self, jscode, video_id=None, note='Executing JS in Deno', location='', html='', cookiejar=None): self.report_note(video_id, note) - if html: - self._ensure_jsdom() - init_script = '''%s; - import { JSDOM } from "https://cdn.esm.sh/jsdom"; - const dom = new JSDOM(%s); - Object.keys(dom.window).forEach((key) => {try {window[key] = dom.window[key]} catch (e) {}}); - ''' % (self._init_script, json.dumps(html)) - else: - init_script = self._init_script - js_file = TempFileWrapper(f'{init_script};\n{jscode}', suffix='.js') + self._ensure_jsdom() + script = f'''{self._init_script}; + import jsdom from "https://cdn.esm.sh/jsdom"; + const callback = (() => {{ + const jar = jsdom.CookieJar.deserializeSync({json.dumps(self.serialize_cookie(cookiejar, location))}); + const dom = new jsdom.JSDOM({json.dumps(str(html))}, {{ + {'url: %s,' % json.dumps(str(location)) if location else ''} + cookieJar: jar, + }}); + Object.keys(dom.window).forEach((key) => {{try {{window[key] = dom.window[key]}} catch (e) {{}}}}); + delete window.jsdom; + const stdout = []; + const origLog = console.log; + console.log = (...msg) => stdout.push(msg.map(m => m.toString()).join(' ')); + return () => {{ 
origLog(JSON.stringify({{ + stdout: stdout.join('\\n'), cookies: jar.serializeSync().cookies}})); }} + }})(); + await (async () => {{ + {jscode} + }})().finally(callback); + ''' - location_args = ['--location', url] if url else [] + js_file = TempFileWrapper(script, suffix='.js') + + location_args = ['--location', location] if location else [] cmd = [self.exe, 'run', *self._flags, *location_args, js_file.name] - return self._run_deno(cmd, video_id=video_id) + data = json.loads(self._run_deno(cmd)) + self.apply_cookies(cookiejar, data['cookies']) + return data['stdout'] class PuppeteerJSI(ExternalJSI): @@ -464,3 +525,4 @@ def execute(self, jscode, video_id=None, *, note='Executing JS in PhantomJS'): if typing.TYPE_CHECKING: from ..YoutubeDL import YoutubeDL from ..extractor.common import InfoExtractor + from ..cookies import YoutubeDLCookieJar From 6d622d5481126f88f72f747c7a0a300e75b1ccf2 Mon Sep 17 00:00:00 2001 From: c-basalt <117849907+c-basalt@users.noreply.github.com> Date: Sun, 29 Dec 2024 20:34:05 -0500 Subject: [PATCH 30/48] extract phantomjs --- yt_dlp/jsinterp/common.py | 9 +- yt_dlp/jsinterp/external.py | 239 +++++++++++++++++++----------------- 2 files changed, 133 insertions(+), 115 deletions(-) diff --git a/yt_dlp/jsinterp/common.py b/yt_dlp/jsinterp/common.py index 4a6ba65d2..aae8ea69d 100644 --- a/yt_dlp/jsinterp/common.py +++ b/yt_dlp/jsinterp/common.py @@ -62,6 +62,7 @@ class JSInterp: @param preferred_order: list of JSI to use. First in list is tested first. @param fallback_jsi: list of JSI that may fail and should act non-fatal and fallback to other JSI. 
Pass `"all"` to always fallback @param timeout: timeout parameter for all chosen JSI + @param user_agent: override user-agent to use for supported JSI """ def __init__( @@ -74,6 +75,7 @@ def __init__( preferred_order: typing.Iterable[str | type[JSI]] = [], fallback_jsi: typing.Iterable[str | type[JSI]] | typing.Literal['all'] = [], timeout: float | int = 10, + user_agent: str | None = None, ): self._downloader: YoutubeDL = dl_or_ie._downloader if isinstance(dl_or_ie, InfoExtractor) else dl_or_ie self._features = set(features) @@ -88,8 +90,8 @@ def __init__( self.write_debug(f'Selected JSI classes for given features: {get_jsi_keys(handler_classes)}, ' f'included: {get_jsi_keys(only_include) or "all"}, excluded: {get_jsi_keys(exclude)}') - self._handler_dict = {cls.JSI_KEY: cls(self._downloader, timeout=timeout, **jsi_params.get(cls.JSI_KEY, {})) - for cls in handler_classes} + self._handler_dict = {cls.JSI_KEY: cls(self._downloader, timeout=timeout, user_agent=user_agent, + **jsi_params.get(cls.JSI_KEY, {})) for cls in handler_classes} self.preferences: set[JSIPreference] = {order_to_pref(preferred_order, 100)} | _JSI_PREFERENCES self._fallback_jsi = get_jsi_keys(handler_classes) if fallback_jsi == 'all' else get_jsi_keys(fallback_jsi) self._is_test = self._downloader.params.get('test', False) @@ -187,9 +189,10 @@ class JSI(abc.ABC): _SUPPORT_FEATURES: set[str] = set() _BASE_PREFERENCE: int = 0 - def __init__(self, downloader: YoutubeDL, timeout: float | int): + def __init__(self, downloader: YoutubeDL, timeout: float | int, user_agent=None): self._downloader = downloader self.timeout = timeout + self.user_agent: str = user_agent or self._downloader.params['http_headers']['User-Agent'] @abc.abstractmethod def is_available(self) -> bool: diff --git a/yt_dlp/jsinterp/external.py b/yt_dlp/jsinterp/external.py index be09f3131..a944261f4 100644 --- a/yt_dlp/jsinterp/external.py +++ b/yt_dlp/jsinterp/external.py @@ -1,7 +1,6 @@ from __future__ import annotations import 
abc -import collections import contextlib import json import os @@ -99,6 +98,12 @@ def cleanup(self): def __del__(self): self.cleanup() + def __enter__(self): + return self + + def __exit__(self, exc_type, exc_value, traceback): + self.cleanup() + class ExternalJSI(JSI, abc.ABC): _EXE_NAME: str @@ -119,8 +124,9 @@ def is_available(cls): @register_jsi class DenoJSI(ExternalJSI): """JS interpreter class using Deno binary""" - _EXE_NAME = 'deno' _SUPPORTED_FEATURES = {'js', 'wasm', 'location'} + _BASE_PREFERENCE = 5 + _EXE_NAME = 'deno' _DENO_FLAGS = ['--cached-only', '--no-prompt', '--no-check'] _INIT_SCRIPT = 'localStorage.clear(); delete window.Deno; global = window;\n' @@ -144,16 +150,17 @@ def _run_deno(self, cmd): def execute(self, jscode, video_id=None, note='Executing JS in Deno', location=None): self.report_note(video_id, note) - js_file = TempFileWrapper(f'{self._init_script};\n{jscode}', suffix='.js') location_args = ['--location', location] if location else [] - cmd = [self.exe, 'run', *self._flags, *location_args, js_file.name] - return self._run_deno(cmd) + with TempFileWrapper(f'{self._init_script};\n{jscode}', suffix='.js') as js_file: + cmd = [self.exe, 'run', *self._flags, *location_args, js_file.name] + return self._run_deno(cmd) @register_jsi class DenoJITlessJSI(DenoJSI): - _EXE_NAME = DenoJSI._EXE_NAME _SUPPORTED_FEATURES = {'js', 'location'} + _BASE_PREFERENCE = 6 + _EXE_NAME = DenoJSI._EXE_NAME _DENO_FLAGS = ['--cached-only', '--no-prompt', '--no-check', '--v8-flags=--jitless,--noexpose-wasm'] @classproperty @@ -163,8 +170,9 @@ def exe_version(cls): class DenoJSDomJSI(DenoJSI): _SUPPORTED_FEATURES = {'js', 'wasm', 'location', 'dom', 'cookies'} + _BASE_PREFERENCE = 4 _DENO_FLAGS = ['--cached-only', '--no-prompt', '--no-check'] - _JSDOM_IMPORT = False + _JSDOM_IMPORT_CHECKED = False @staticmethod def serialize_cookie(cookiejar: YoutubeDLCookieJar | None, url: str): @@ -210,12 +218,12 @@ def apply_cookies(cookiejar: YoutubeDLCookieJar | None, 
cookies: list[dict]): False, None, None, {})) def _ensure_jsdom(self): - if self._JSDOM_IMPORT: + if self._JSDOM_IMPORT_CHECKED: return - js_file = TempFileWrapper('import jsdom from "https://cdn.esm.sh/jsdom"', suffix='.js') - cmd = [self.exe, 'run', js_file.name] - self._run_deno(cmd) - self._JSDOM_IMPORT = True + with TempFileWrapper('import jsdom from "https://cdn.esm.sh/jsdom"', suffix='.js') as js_file: + cmd = [self.exe, 'run', js_file.name] + self._run_deno(cmd) + self._JSDOM_IMPORT_CHECKED = True def execute(self, jscode, video_id=None, note='Executing JS in Deno', location='', html='', cookiejar=None): self.report_note(video_id, note) @@ -241,11 +249,10 @@ def execute(self, jscode, video_id=None, note='Executing JS in Deno', location=' }})().finally(callback); ''' - js_file = TempFileWrapper(script, suffix='.js') - location_args = ['--location', location] if location else [] - cmd = [self.exe, 'run', *self._flags, *location_args, js_file.name] - data = json.loads(self._run_deno(cmd)) + with TempFileWrapper(script, suffix='.js') as js_file: + cmd = [self.exe, 'run', *self._flags, *location_args, js_file.name] + data = json.loads(self._run_deno(cmd)) self.apply_cookies(cookiejar, data['cookies']) return data['stdout'] @@ -327,13 +334,11 @@ def execute(self, jscode, video_id=None, note='Executing JS in Puppeteer', url=' ''') -class PhantomJSwrapper(ExternalJSI): - """PhantomJS wrapper class - - This class is experimental. 
- """ +@register_jsi +class PhantomJSJSI(ExternalJSI): _EXE_NAME = 'phantomjs' - INSTALL_HINT = 'Please download PhantomJS from https://phantomjs.org/download.html' + _SUPPORTED_FEATURES = {'js', 'location', 'cookies'} + _BASE_PREFERENCE = 3 _BASE_JS = R''' phantom.onError = function(msg, trace) {{ @@ -355,11 +360,11 @@ class PhantomJSwrapper(ExternalJSI): var fs = require('fs'); var read = {{ mode: 'r', charset: 'utf-8' }}; var write = {{ mode: 'w', charset: 'utf-8' }}; - JSON.parse(fs.read("{cookies}", read)).forEach(function(x) {{ + JSON.parse(fs.read({cookies_fn}, read)).forEach(function(x) {{ phantom.addCookie(x); }}); page.settings.resourceTimeout = {timeout}; - page.settings.userAgent = "{ua}"; + page.settings.userAgent = {ua}; page.onLoadStarted = function() {{ page.evaluate(function() {{ delete window._phantom; @@ -367,13 +372,13 @@ class PhantomJSwrapper(ExternalJSI): }}); }}; var saveAndExit = function() {{ - fs.write("{html}", page.content, write); - fs.write("{cookies}", JSON.stringify(phantom.cookies), write); + fs.write({html_fn}, page.content, write); + fs.write({cookies_fn}, JSON.stringify(phantom.cookies), write); phantom.exit(); }}; page.onLoadFinished = function(status) {{ if(page.url === "") {{ - page.setContent(fs.read("{html}", read), "{url}"); + page.setContent(fs.read({html_fn}, read), {url}); }} else {{ {jscode} @@ -382,62 +387,105 @@ class PhantomJSwrapper(ExternalJSI): page.open(""); ''' - _TMP_FILE_NAMES = ['script', 'html', 'cookies'] - - @classmethod - def _version(cls): - return cls.exe_version - - def __init__(self, extractor: InfoExtractor, required_version=None, timeout=10000): - self._TMP_FILES = {} - - if not self.exe: - raise ExtractorError(f'PhantomJS not found, {self.INSTALL_HINT}', expected=True) - - self.extractor = extractor - - if required_version: - if is_outdated_version(self.exe_version, required_version): - self.extractor._downloader.report_warning( - 'Your copy of PhantomJS is outdated, update it to version ' - 
f'{required_version} or newer if you encounter any errors.') - - for name in self._TMP_FILE_NAMES: - tmp = tempfile.NamedTemporaryFile(delete=False) - tmp.close() - self._TMP_FILES[name] = tmp - - self.options = collections.ChainMap({ - 'timeout': timeout, - }, { - x: self._TMP_FILES[x].name.replace('\\', '\\\\').replace('"', '\\"') - for x in self._TMP_FILE_NAMES - }) - - def __del__(self): - for name in self._TMP_FILE_NAMES: - with contextlib.suppress(OSError, KeyError): - os.remove(self._TMP_FILES[name].name) - - def _save_cookies(self, url): - cookies = cookie_jar_to_list(self.extractor.cookiejar) + def _save_cookies(self, url, cookiejar): + cookies = cookie_jar_to_list(cookiejar) if cookiejar else [] for cookie in cookies: if 'path' not in cookie: cookie['path'] = '/' if 'domain' not in cookie: cookie['domain'] = urllib.parse.urlparse(url).netloc - with open(self._TMP_FILES['cookies'].name, 'wb') as f: - f.write(json.dumps(cookies).encode()) + return json.dumps(cookies) - def _load_cookies(self): - with open(self._TMP_FILES['cookies'].name, 'rb') as f: - cookies = json.loads(f.read().decode('utf-8')) + def _load_cookies(self, cookies_json: str, cookiejar): + if not cookiejar: + return + cookies = json.loads(cookies_json) for cookie in cookies: - if cookie['httponly'] is True: - cookie['rest'] = {'httpOnly': None} - if 'expiry' in cookie: - cookie['expire_time'] = cookie['expiry'] - self.extractor._set_cookie(**cookie) + cookiejar.set_cookie(http.cookiejar.Cookie( + 0, cookie['name'], cookie['value'], cookie.get('port'), cookie.get('port') is not None, + cookie['domain'], True, cookie['domain'].startswith('.'), + cookie.get('path', '/'), True, + cookie.get('secure', False), cookie.get('expiry'), + cookie.get('discard', False), None, None, + {'httpOnly': None} if cookie.get('httponly') is True else {} + )) + + def _execute(self, jscode: str, video_id=None, *, note='Executing JS in PhantomJS'): + """Execute JS and return stdout""" + if 'phantom.exit();' not in 
jscode: + jscode += ';\nphantom.exit();' + jscode = self._BASE_JS + jscode + + self.report_note(video_id, note) + with TempFileWrapper(jscode, suffix='.js') as js_file: + cmd = [self.exe, '--ssl-protocol=any', js_file.name] + self.write_debug(f'PhantomJS command line: {shell_quote(cmd)}') + try: + stdout, stderr, returncode = Popen.run( + cmd, timeout=self.timeout, text=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE) + except Exception as e: + raise ExtractorError(f'{note} failed: Unable to run PhantomJS binary', cause=e) + if returncode: + raise ExtractorError(f'{note} failed with returncode {returncode}:\n{stderr.strip()}') + return stdout + + def _execute_html(self, jscode: str, url: str, html: str, cookiejar, video_id=None, note='Executing JS on webpage'): + if 'saveAndExit();' not in jscode: + raise ExtractorError('`saveAndExit();` not found in `jscode`') + + html_file = TempFileWrapper(html, suffix='.html') + cookie_file = TempFileWrapper(self._save_cookies(url, cookiejar), suffix='.json') + + jscode = self._TEMPLATE.format(**{ + 'url': json.dumps(str(url)), + 'ua': json.dumps(str(self.user_agent)), + 'jscode': jscode, + 'html_fn': json.dumps(html_file.name), + 'cookies_fn': json.dumps(cookie_file.name), + 'timeout': int(self.timeout * 1000), + }) + + stdout = self._execute(jscode, video_id, note=note) + self._load_cookies(cookie_file.read(), cookiejar) + new_html = html_file.read() + + return new_html, stdout + + def execute(self, jscode, video_id=None, + note='Executing JS in PhantomJS', location=None, html='', cookiejar=None): + if location: + return self._execute_html(jscode, location, html, cookiejar, video_id=video_id, note=note)[1] + if html: + self.report_warning('`location` is required to use `html`') + if cookiejar: + self.report_warning('`location` and `html` are required to use `cookiejar`') + return self._execute(jscode, video_id, note=note) + + +class PhantomJSwrapper: + """PhantomJS wrapper class + + This class is experimental. 
+ """ + INSTALL_HINT = 'Please download PhantomJS from https://phantomjs.org/download.html' + + @classmethod + def _version(cls): + return PhantomJSJSI.exe_version + + def __init__(self, extractor: InfoExtractor, required_version=None, timeout=10000): + self._jsi = PhantomJSJSI(extractor._downloader, timeout=timeout / 1000) + + if not self._jsi.is_available(): + raise ExtractorError(f'PhantomJS not found, {self.INSTALL_HINT}', expected=True) + + self.extractor = extractor + + if required_version: + if is_outdated_version(self._jsi.exe_version, required_version): + self._jsi.report_warning( + 'Your copy of PhantomJS is outdated, update it to version ' + f'{required_version} or newer if you encounter any errors.') def get(self, url, html=None, video_id=None, note=None, note2='Executing JS on webpage', headers={}, jscode='saveAndExit();'): """ @@ -479,47 +527,14 @@ def get(self, url, html=None, video_id=None, note=None, note2='Executing JS on w raise ExtractorError('`saveAndExit();` not found in `jscode`') if not html: html = self.extractor._download_webpage(url, video_id, note=note, headers=headers) - with open(self._TMP_FILES['html'].name, 'wb') as f: - f.write(html.encode()) - self._save_cookies(url) + self._jsi.user_agent = headers.get('User-Agent') or self.extractor.get_param('http_headers')['User-Agent'] - user_agent = headers.get('User-Agent') or self.extractor.get_param('http_headers')['User-Agent'] - jscode = self._TEMPLATE.format_map(self.options.new_child({ - 'url': url, - 'ua': user_agent.replace('"', '\\"'), - 'jscode': jscode, - })) - - stdout = self.execute(jscode, video_id, note=note2) - - with open(self._TMP_FILES['html'].name, 'rb') as f: - html = f.read().decode('utf-8') - self._load_cookies() - - return html, stdout + return self._jsi._execute_html(jscode, url, html, self.extractor.cookiejar, video_id=video_id, note=note2) def execute(self, jscode, video_id=None, *, note='Executing JS in PhantomJS'): """Execute JS and return stdout""" - if 
'phantom.exit();' not in jscode: - jscode += ';\nphantom.exit();' - jscode = self._BASE_JS + jscode - - with open(self._TMP_FILES['script'].name, 'w', encoding='utf-8') as f: - f.write(jscode) - self.extractor.to_screen(f'{format_field(video_id, None, "%s: ")}{note}') - - cmd = [self.exe, '--ssl-protocol=any', self._TMP_FILES['script'].name] - self.extractor.write_debug(f'PhantomJS command line: {shell_quote(cmd)}') - try: - stdout, stderr, returncode = Popen.run(cmd, timeout=self.options['timeout'] / 1000, - text=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE) - except Exception as e: - raise ExtractorError(f'{note} failed: Unable to run PhantomJS binary', cause=e) - if returncode: - raise ExtractorError(f'{note} failed with returncode {returncode}:\n{stderr.strip()}') - - return stdout + return self._jsi.execute(jscode, video_id=video_id, note=note) if typing.TYPE_CHECKING: From 8c6d01f757688c75ae3b7a96ed20b7f9abaf25c9 Mon Sep 17 00:00:00 2001 From: c-basalt <117849907+c-basalt@users.noreply.github.com> Date: Mon, 30 Dec 2024 05:41:09 -0500 Subject: [PATCH 31/48] split and fixes for phantom --- yt_dlp/jsinterp/_helper.py | 71 +++++++++++++++ yt_dlp/jsinterp/common.py | 24 ++++- yt_dlp/jsinterp/external.py | 170 +++++++++++------------------------- 3 files changed, 147 insertions(+), 118 deletions(-) create mode 100644 yt_dlp/jsinterp/_helper.py diff --git a/yt_dlp/jsinterp/_helper.py b/yt_dlp/jsinterp/_helper.py new file mode 100644 index 000000000..22525018d --- /dev/null +++ b/yt_dlp/jsinterp/_helper.py @@ -0,0 +1,71 @@ +from __future__ import annotations +import contextlib +import os +import random +import string +import tempfile + + +class TempFileWrapper: + """ + Wrapper for NamedTemporaryFile, auto closes file after io and deletes file upon wrapper object gc + + @param {str | bytes | None} content: content to write to file upon creation + @param {bool} text: whether to open file in text mode + @param {str} encoding: encoding to use for text mode + 
@param {str | None} suffix: suffix for filename of temporary file + """ + + def __init__(self, content: str | bytes | None = None, text: bool = True, + encoding='utf-8', suffix: str | None = None): + self.encoding = None if not text else encoding + self.text = text + self._file = tempfile.NamedTemporaryFile('w' if text else 'wb', encoding=self.encoding, + suffix=suffix, delete=False) + if content: + self._file.write(content) + self._file.close() + + @property + def name(self): + return self._file.name + + @contextlib.contextmanager + def opened_file(self, mode, *, seek=None, seek_whence=0): + mode = mode if (self.text or 'b' in mode) else mode + 'b' + with open(self._file.name, mode, encoding=self.encoding) as f: + if seek is not None: + self._file.seek(seek, seek_whence) + yield f + + def write(self, s, seek=None, seek_whence=0): + """re-open file in write mode and write, optionally seek to position first""" + with self.opened_file('w', seek=seek, seek_whence=seek_whence) as f: + return f.write(s) + + def append_write(self, s, seek=None, seek_whence=0): + """re-open file in append mode and write, optionally seek to position first""" + with self.opened_file('a', seek=seek, seek_whence=seek_whence) as f: + return f.write(s) + + def read(self, n=-1, seek=None, seek_whence=0): + """re-open file and read, optionally seek to position first""" + with self.opened_file('r', seek=seek, seek_whence=seek_whence) as f: + return f.read(n) + + def cleanup(self): + with contextlib.suppress(OSError): + os.remove(self._file.name) + + def __del__(self): + self.cleanup() + + def __enter__(self): + return self + + def __exit__(self, exc_type, exc_value, traceback): + self.cleanup() + + +def random_string(length: int = 10) -> str: + return ''.join(random.choices(string.ascii_letters, k=length)) diff --git a/yt_dlp/jsinterp/common.py b/yt_dlp/jsinterp/common.py index aae8ea69d..42c3e7b02 100644 --- a/yt_dlp/jsinterp/common.py +++ b/yt_dlp/jsinterp/common.py @@ -4,8 +4,14 @@ import 
typing import functools -from ..utils import classproperty, format_field, variadic, ExtractorError from ..extractor.common import InfoExtractor +from ..utils import ( + classproperty, + format_field, + get_exe_version, + variadic, + ExtractorError, +) _JSI_HANDLERS: dict[str, type[JSI]] = {} @@ -220,6 +226,22 @@ def JSI_KEY(cls) -> str: return cls.__name__[:-3] +class ExternalJSI(JSI, abc.ABC): + _EXE_NAME: str + + @classproperty(cache=True) + def exe_version(cls): + return get_exe_version(cls._EXE_NAME, args=getattr(cls, 'V_ARGS', ['--version']), version_re=r'([0-9.]+)') + + @classproperty + def exe(cls): + return cls._EXE_NAME if cls.exe_version else None + + @classmethod + def is_available(cls): + return bool(cls.exe) + + def register_jsi(jsi_cls: JsiClass) -> JsiClass: """Register a JS interpreter class""" assert issubclass(jsi_cls, JSI), f'{jsi_cls} must be a subclass of JSI' diff --git a/yt_dlp/jsinterp/external.py b/yt_dlp/jsinterp/external.py index a944261f4..fd286b977 100644 --- a/yt_dlp/jsinterp/external.py +++ b/yt_dlp/jsinterp/external.py @@ -1,14 +1,11 @@ from __future__ import annotations -import abc import contextlib -import json -import os -import subprocess -import tempfile -import urllib.parse -import typing import http.cookiejar +import json +import subprocess +import typing +import urllib.parse from ..utils import ( @@ -16,109 +13,13 @@ Popen, classproperty, format_field, - get_exe_version, + int_or_none, is_outdated_version, shell_quote, - int_or_none, unified_timestamp, ) -from .common import JSI, register_jsi - - -def cookie_to_dict(cookie): - cookie_dict = { - 'name': cookie.name, - 'value': cookie.value, - } - if cookie.port_specified: - cookie_dict['port'] = cookie.port - if cookie.domain_specified: - cookie_dict['domain'] = cookie.domain - if cookie.path_specified: - cookie_dict['path'] = cookie.path - if cookie.expires is not None: - cookie_dict['expires'] = cookie.expires - if cookie.secure is not None: - cookie_dict['secure'] = 
cookie.secure - if cookie.discard is not None: - cookie_dict['discard'] = cookie.discard - with contextlib.suppress(TypeError): - if (cookie.has_nonstandard_attr('httpOnly') - or cookie.has_nonstandard_attr('httponly') - or cookie.has_nonstandard_attr('HttpOnly')): - cookie_dict['httponly'] = True - return cookie_dict - - -def cookie_jar_to_list(cookie_jar): - return [cookie_to_dict(cookie) for cookie in cookie_jar] - - -class TempFileWrapper: - """Wrapper for NamedTemporaryFile, auto closes file after io and deletes file upon wrapper object gc""" - - def __init__(self, content: str | bytes | None = None, text: bool = True, - encoding='utf-8', suffix: str | None = None): - self.encoding = None if not text else encoding - self.text = text - self._file = tempfile.NamedTemporaryFile('w' if text else 'wb', encoding=self.encoding, - suffix=suffix, delete=False) - if content: - self._file.write(content) - self._file.close() - - @property - def name(self): - return self._file.name - - @contextlib.contextmanager - def opened_file(self, mode, *, seek=None, seek_whence=0): - mode = mode if (self.text or 'b' in mode) else mode + 'b' - with open(self._file.name, mode, encoding=self.encoding) as f: - if seek is not None: - self._file.seek(seek, seek_whence) - yield f - - def write(self, s, seek=None, seek_whence=0): - with self.opened_file('w', seek=seek, seek_whence=seek_whence) as f: - return f.write(s) - - def append_write(self, s, seek=None, seek_whence=0): - with self.opened_file('a', seek=seek, seek_whence=seek_whence) as f: - return f.write(s) - - def read(self, n=-1, seek=None, seek_whence=0): - with self.opened_file('r', seek=seek, seek_whence=seek_whence) as f: - return f.read(n) - - def cleanup(self): - with contextlib.suppress(OSError): - os.remove(self._file.name) - - def __del__(self): - self.cleanup() - - def __enter__(self): - return self - - def __exit__(self, exc_type, exc_value, traceback): - self.cleanup() - - -class ExternalJSI(JSI, abc.ABC): - _EXE_NAME: 
str - - @classproperty(cache=True) - def exe_version(cls): - return get_exe_version(cls._EXE_NAME, args=getattr(cls, 'V_ARGS', ['--version']), version_re=r'([0-9.]+)') - - @classproperty - def exe(cls): - return cls._EXE_NAME if cls.exe_version else None - - @classmethod - def is_available(cls): - return bool(cls.exe) +from ._helper import TempFileWrapper, random_string +from .common import ExternalJSI, register_jsi @register_jsi @@ -360,9 +261,6 @@ class PhantomJSJSI(ExternalJSI): var fs = require('fs'); var read = {{ mode: 'r', charset: 'utf-8' }}; var write = {{ mode: 'w', charset: 'utf-8' }}; - JSON.parse(fs.read({cookies_fn}, read)).forEach(function(x) {{ - phantom.addCookie(x); - }}); page.settings.resourceTimeout = {timeout}; page.settings.userAgent = {ua}; page.onLoadStarted = function() {{ @@ -381,20 +279,41 @@ class PhantomJSJSI(ExternalJSI): page.setContent(fs.read({html_fn}, read), {url}); }} else {{ + JSON.parse(fs.read({cookies_fn}, read)).forEach(function(x) {{ + phantom.addCookie(x); + }}); {jscode} }} }}; page.open(""); ''' - def _save_cookies(self, url, cookiejar): - cookies = cookie_jar_to_list(cookiejar) if cookiejar else [] - for cookie in cookies: - if 'path' not in cookie: - cookie['path'] = '/' - if 'domain' not in cookie: - cookie['domain'] = urllib.parse.urlparse(url).netloc - return json.dumps(cookies) + def _save_cookies(self, url, cookiejar: YoutubeDLCookieJar | None): + def _cookie_to_dict(cookie: http.cookiejar.Cookie): + cookie_dict = { + 'name': cookie.name, + 'value': cookie.value, + 'port': cookie.port, + 'domain': cookie.domain, + 'path': cookie.path or '/', + 'expires': int_or_none(cookie.expires, invscale=1000), + 'secure': cookie.secure, + 'discard': cookie.discard, + } + if not cookie_dict['domain']: + cookie_dict['domain'] = urllib.parse.urlparse(url).hostname + cookie_dict['port'] = urllib.parse.urlparse(url).port + for key in [key for key, value in cookie_dict.items() if value is None]: + cookie_dict.pop(key) + with 
contextlib.suppress(TypeError): + if (cookie.has_nonstandard_attr('httpOnly') + or cookie.has_nonstandard_attr('httponly') + or cookie.has_nonstandard_attr('HttpOnly')): + cookie_dict['httponly'] = True + return cookie_dict + + cookies = cookiejar.get_cookies_for_url(url) if cookiejar else [] + return json.dumps([_cookie_to_dict(cookie) for cookie in cookies]) def _load_cookies(self, cookies_json: str, cookiejar): if not cookiejar: @@ -454,6 +373,23 @@ def _execute_html(self, jscode: str, url: str, html: str, cookiejar, video_id=No def execute(self, jscode, video_id=None, note='Executing JS in PhantomJS', location=None, html='', cookiejar=None): if location: + jscode = '''console.log(page.evaluate(function() { + var %(std_var)s = []; + console.log = function() { + var values = ''; + for (var i = 0; i < arguments.length; i++) { + values += arguments[i] + ' '; + } + %(std_var)s.push(values); + } + %(jscode)s; + return %(std_var)s.join('\\n'); + + })); + saveAndExit();''' % { + 'std_var': f'__stdout__values_{random_string()}', + 'jscode': jscode, + } return self._execute_html(jscode, location, html, cookiejar, video_id=video_id, note=note)[1] if html: self.report_warning('`location` is required to use `html`') From 2a4ab042b84fe483b7d7c2016174529eb627677b Mon Sep 17 00:00:00 2001 From: c-basalt <117849907+c-basalt@users.noreply.github.com> Date: Mon, 30 Dec 2024 05:52:47 -0500 Subject: [PATCH 32/48] split --- yt_dlp/jsinterp/__init__.py | 4 +- yt_dlp/jsinterp/_deno.py | 238 ++++++++++++++++++ .../jsinterp/{external.py => _phantomjs.py} | 217 ---------------- yt_dlp/jsinterp/common.py | 8 +- 4 files changed, 245 insertions(+), 222 deletions(-) create mode 100644 yt_dlp/jsinterp/_deno.py rename yt_dlp/jsinterp/{external.py => _phantomjs.py} (51%) diff --git a/yt_dlp/jsinterp/__init__.py b/yt_dlp/jsinterp/__init__.py index 2477f0774..69ab64efe 100644 --- a/yt_dlp/jsinterp/__init__.py +++ b/yt_dlp/jsinterp/__init__.py @@ -1,5 +1,6 @@ from .native import JSInterpreter 
-from .external import PhantomJSwrapper, DenoJSI, PuppeteerJSI +from ._phantomjs import PhantomJSwrapper +from ._deno import DenoJSI from .common import _JSI_PREFERENCES, _JSI_HANDLERS, JSInterp @@ -7,7 +8,6 @@ JSInterpreter, PhantomJSwrapper, DenoJSI, - PuppeteerJSI, _JSI_HANDLERS, _JSI_PREFERENCES, JSInterp, diff --git a/yt_dlp/jsinterp/_deno.py b/yt_dlp/jsinterp/_deno.py new file mode 100644 index 000000000..84f19ed0a --- /dev/null +++ b/yt_dlp/jsinterp/_deno.py @@ -0,0 +1,238 @@ +from __future__ import annotations + +import http.cookiejar +import json +import subprocess +import typing +import urllib.parse + + +from ..utils import ( + ExtractorError, + Popen, + classproperty, + int_or_none, + shell_quote, + unified_timestamp, +) +from ._helper import TempFileWrapper, random_string +from .common import ExternalJSI, register_jsi + + +@register_jsi +class DenoJSI(ExternalJSI): + """JS interpreter class using Deno binary""" + _SUPPORTED_FEATURES = {'js', 'wasm', 'location'} + _BASE_PREFERENCE = 5 + _EXE_NAME = 'deno' + _DENO_FLAGS = ['--cached-only', '--no-prompt', '--no-check'] + _INIT_SCRIPT = 'localStorage.clear(); delete window.Deno; global = window;\n' + + def __init__(self, downloader: YoutubeDL, timeout=None, features: set[str] = {}, flags=[], replace_flags=False, init_script=None): + super().__init__(downloader, timeout, features) + self._flags = flags if replace_flags else [*self._DENO_FLAGS, *flags] + self._init_script = self._INIT_SCRIPT if init_script is None else init_script + + def _run_deno(self, cmd): + self.write_debug(f'Deno command line: {shell_quote(cmd)}') + try: + stdout, stderr, returncode = Popen.run( + cmd, timeout=self.timeout, text=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE) + except Exception as e: + raise ExtractorError('Unable to run Deno binary', cause=e) + if returncode: + raise ExtractorError(f'Failed with returncode {returncode}:\n{stderr}') + elif stderr: + self.report_warning(f'JS console error msg:\n{stderr.strip()}') + 
return stdout.strip() + + def execute(self, jscode, video_id=None, note='Executing JS in Deno', location=None): + self.report_note(video_id, note) + location_args = ['--location', location] if location else [] + with TempFileWrapper(f'{self._init_script};\n{jscode}', suffix='.js') as js_file: + cmd = [self.exe, 'run', *self._flags, *location_args, js_file.name] + return self._run_deno(cmd) + + +@register_jsi +class DenoJITlessJSI(DenoJSI): + _SUPPORTED_FEATURES = {'js', 'location'} + _BASE_PREFERENCE = 6 + _EXE_NAME = DenoJSI._EXE_NAME + _DENO_FLAGS = ['--cached-only', '--no-prompt', '--no-check', '--v8-flags=--jitless,--noexpose-wasm'] + + @classproperty + def exe_version(cls): + return DenoJSI.exe_version + + +class DenoJSDomJSI(DenoJSI): + _SUPPORTED_FEATURES = {'js', 'wasm', 'location', 'dom', 'cookies'} + _BASE_PREFERENCE = 4 + _DENO_FLAGS = ['--cached-only', '--no-prompt', '--no-check'] + _JSDOM_IMPORT_CHECKED = False + + @staticmethod + def serialize_cookie(cookiejar: YoutubeDLCookieJar | None, url: str): + """serialize netscape-compatible fields from cookiejar for tough-cookie loading""" + # JSDOM use tough-cookie as its CookieJar https://github.com/jsdom/jsdom/blob/main/lib/api.js + # tough-cookie use Cookie.fromJSON and Cookie.toJSON for cookie serialization + # https://github.com/salesforce/tough-cookie/blob/master/lib/cookie/cookie.ts + if not cookiejar: + return json.dumps({'cookies': []}) + cookies: list[http.cookiejar.Cookie] = [cookie for cookie in cookiejar.get_cookies_for_url(url)] + return json.dumps({'cookies': [{ + 'key': cookie.name, + 'value': cookie.value, + # leading dot must be removed, otherwise will fail to match + 'domain': cookie.domain.lstrip('.') or urllib.parse.urlparse(url).hostname, + 'expires': int_or_none(cookie.expires, invscale=1000), + 'hostOnly': not cookie.domain_initial_dot, + 'secure': bool(cookie.secure), + 'path': cookie.path, + } for cookie in cookies if cookie.value]}) + + @staticmethod + def apply_cookies(cookiejar: 
YoutubeDLCookieJar | None, cookies: list[dict]): + """apply cookies from serialized tough-cookie""" + # see serialize_cookie + if not cookiejar: + return + for cookie_dict in cookies: + if not all(cookie_dict.get(k) for k in ('key', 'value', 'domain')): + continue + if cookie_dict.get('hostOnly'): + cookie_dict['domain'] = cookie_dict['domain'].lstrip('.') + else: + cookie_dict['domain'] = '.' + cookie_dict['domain'].lstrip('.') + + cookiejar.set_cookie(http.cookiejar.Cookie( + 0, cookie_dict['key'], cookie_dict['value'], + None, False, + cookie_dict['domain'], True, not cookie_dict.get('hostOnly'), + cookie_dict.get('path', '/'), True, + bool(cookie_dict.get('secure')), + unified_timestamp(cookie_dict.get('expires')), + False, None, None, {})) + + def _ensure_jsdom(self): + if self._JSDOM_IMPORT_CHECKED: + return + with TempFileWrapper('import jsdom from "https://cdn.esm.sh/jsdom"', suffix='.js') as js_file: + cmd = [self.exe, 'run', js_file.name] + self._run_deno(cmd) + self._JSDOM_IMPORT_CHECKED = True + + def execute(self, jscode, video_id=None, note='Executing JS in Deno', location='', html='', cookiejar=None): + self.report_note(video_id, note) + self._ensure_jsdom() + callback_varname = f'__callback_{random_string()}' + script = f'''{self._init_script}; + import jsdom from "https://cdn.esm.sh/jsdom"; + const {callback_varname} = (() => {{ + const jar = jsdom.CookieJar.deserializeSync({json.dumps(self.serialize_cookie(cookiejar, location))}); + const dom = new jsdom.JSDOM({json.dumps(str(html))}, {{ + {'url: %s,' % json.dumps(str(location)) if location else ''} + cookieJar: jar, + }}); + Object.keys(dom.window).forEach((key) => {{try {{window[key] = dom.window[key]}} catch (e) {{}}}}); + delete window.jsdom; + const stdout = []; + const origLog = console.log; + console.log = (...msg) => stdout.push(msg.map(m => m.toString()).join(' ')); + return () => {{ origLog(JSON.stringify({{ + stdout: stdout.join('\\n'), cookies: jar.serializeSync().cookies}})); }} + 
}})(); + await (async () => {{ + {jscode} + }})().finally({callback_varname}); + ''' + + location_args = ['--location', location] if location else [] + with TempFileWrapper(script, suffix='.js') as js_file: + cmd = [self.exe, 'run', *self._flags, *location_args, js_file.name] + data = json.loads(self._run_deno(cmd)) + self.apply_cookies(cookiejar, data['cookies']) + return data['stdout'] + + +class PuppeteerJSI(ExternalJSI): + _PACKAGE_VERSION = '16.2.0' + _HEADLESS = False + _EXE_NAME = DenoJSI._EXE_NAME + + @classproperty + def INSTALL_HINT(cls): + msg = f'Run "deno run -A https://deno.land/x/puppeteer@{cls._PACKAGE_VERSION}/install.ts" to install puppeteer' + if not DenoJSI.is_available: + msg = f'{DenoJSI.INSTALL_HINT}. Then {msg}' + return msg + + @classproperty(cache=True) + def full_version(cls): + if not DenoJSI.is_available: + return + try: + browser_version = DenoJSI._execute(f''' + import puppeteer from "https://deno.land/x/puppeteer@{cls._PACKAGE_VERSION}/mod.ts"; + const browser = await puppeteer.launch({{headless: {json.dumps(bool(cls._HEADLESS))}}}); + try {{ + console.log(await browser.version()) + }} finally {{ + await browser.close(); + }}''', flags=['--allow-all']) + return f'puppeteer={cls._PACKAGE_VERSION} browser={browser_version}' + except ExtractorError: + return None + + @classproperty + def exe_version(cls): + return DenoJSI.exe_version if cls.full_version else None + + def __init__(self, downloader: YoutubeDL, timeout: float | int | None = None): + super().__init__(downloader, timeout) + self.deno = DenoJSI(downloader, timeout=(self.timeout + 30000)) + + def _deno_execute(self, jscode, note=None): + return self.deno.execute(f''' + import puppeteer from "https://deno.land/x/puppeteer@{self._PACKAGE_VERSION}/mod.ts"; + const browser = await puppeteer.launch({{ + headless: {json.dumps(bool(self._HEADLESS))}, args: ["--disable-web-security"]}}); + try {{ + {jscode} + }} finally {{ + await browser.close(); + }}''', note=note, 
flags=['--allow-all'], base_js='') + + def execute(self, jscode, video_id=None, note='Executing JS in Puppeteer', url='about:blank'): + self.report_note(video_id, note) + return self._deno_execute(f''' + const page = await browser.newPage(); + window.setTimeout(async () => {{ + console.error('Puppeteer execution timed out'); + await browser.close(); + Deno.exit(1); + }}, {int(self.timeout)}); + page.resourceTimeout = {int(self.timeout)}; + + // drop network requests + await page.setRequestInterception(true); + page.on("request", request => request.abort()); + // capture console output + page.on("console", msg => {{ + msg.type() === 'log' && console.log(msg.text()); + msg.type() === 'error' && console.error(msg.text()); + }}); + + const url = {json.dumps(str(url))}; + await page.evaluate(`window.history.replaceState('', '', ${{JSON.stringify(url)}})`); + + await page.evaluate({json.dumps(str(jscode))}); + await browser.close(); + Deno.exit(0); + ''') + + +if typing.TYPE_CHECKING: + from ..YoutubeDL import YoutubeDL + from ..cookies import YoutubeDLCookieJar diff --git a/yt_dlp/jsinterp/external.py b/yt_dlp/jsinterp/_phantomjs.py similarity index 51% rename from yt_dlp/jsinterp/external.py rename to yt_dlp/jsinterp/_phantomjs.py index fd286b977..42aad4d3b 100644 --- a/yt_dlp/jsinterp/external.py +++ b/yt_dlp/jsinterp/_phantomjs.py @@ -11,230 +11,14 @@ from ..utils import ( ExtractorError, Popen, - classproperty, - format_field, int_or_none, is_outdated_version, shell_quote, - unified_timestamp, ) from ._helper import TempFileWrapper, random_string from .common import ExternalJSI, register_jsi -@register_jsi -class DenoJSI(ExternalJSI): - """JS interpreter class using Deno binary""" - _SUPPORTED_FEATURES = {'js', 'wasm', 'location'} - _BASE_PREFERENCE = 5 - _EXE_NAME = 'deno' - _DENO_FLAGS = ['--cached-only', '--no-prompt', '--no-check'] - _INIT_SCRIPT = 'localStorage.clear(); delete window.Deno; global = window;\n' - - def __init__(self, downloader: YoutubeDL, 
timeout=None, flags=[], replace_flags=False, init_script=None): - super().__init__(downloader, timeout) - self._flags = flags if replace_flags else [*self._DENO_FLAGS, *flags] - self._init_script = self._INIT_SCRIPT if init_script is None else init_script - - def _run_deno(self, cmd): - self.write_debug(f'Deno command line: {shell_quote(cmd)}') - try: - stdout, stderr, returncode = Popen.run( - cmd, timeout=self.timeout, text=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE) - except Exception as e: - raise ExtractorError('Unable to run Deno binary', cause=e) - if returncode: - raise ExtractorError(f'Failed with returncode {returncode}:\n{stderr}') - elif stderr: - self.report_warning(f'JS console error msg:\n{stderr.strip()}') - return stdout.strip() - - def execute(self, jscode, video_id=None, note='Executing JS in Deno', location=None): - self.report_note(video_id, note) - location_args = ['--location', location] if location else [] - with TempFileWrapper(f'{self._init_script};\n{jscode}', suffix='.js') as js_file: - cmd = [self.exe, 'run', *self._flags, *location_args, js_file.name] - return self._run_deno(cmd) - - -@register_jsi -class DenoJITlessJSI(DenoJSI): - _SUPPORTED_FEATURES = {'js', 'location'} - _BASE_PREFERENCE = 6 - _EXE_NAME = DenoJSI._EXE_NAME - _DENO_FLAGS = ['--cached-only', '--no-prompt', '--no-check', '--v8-flags=--jitless,--noexpose-wasm'] - - @classproperty - def exe_version(cls): - return DenoJSI.exe_version - - -class DenoJSDomJSI(DenoJSI): - _SUPPORTED_FEATURES = {'js', 'wasm', 'location', 'dom', 'cookies'} - _BASE_PREFERENCE = 4 - _DENO_FLAGS = ['--cached-only', '--no-prompt', '--no-check'] - _JSDOM_IMPORT_CHECKED = False - - @staticmethod - def serialize_cookie(cookiejar: YoutubeDLCookieJar | None, url: str): - """serialize netscape-compatible fields from cookiejar for tough-cookie loading""" - # JSDOM use tough-cookie as its CookieJar https://github.com/jsdom/jsdom/blob/main/lib/api.js - # tough-cookie use Cookie.fromJSON and 
Cookie.toJSON for cookie serialization - # https://github.com/salesforce/tough-cookie/blob/master/lib/cookie/cookie.ts - if not cookiejar: - return json.dumps({'cookies': []}) - cookies: list[http.cookiejar.Cookie] = [cookie for cookie in cookiejar.get_cookies_for_url(url)] - return json.dumps({'cookies': [{ - 'key': cookie.name, - 'value': cookie.value, - # leading dot must be removed, otherwise will fail to match - 'domain': cookie.domain.lstrip('.') or urllib.parse.urlparse(url).hostname, - 'expires': int_or_none(cookie.expires, invscale=1000), - 'hostOnly': not cookie.domain_initial_dot, - 'secure': bool(cookie.secure), - 'path': cookie.path, - } for cookie in cookies if cookie.value]}) - - @staticmethod - def apply_cookies(cookiejar: YoutubeDLCookieJar | None, cookies: list[dict]): - """apply cookies from serialized tough-cookie""" - # see serialize_cookie - if not cookiejar: - return - for cookie_dict in cookies: - if not all(cookie_dict.get(k) for k in ('key', 'value', 'domain')): - continue - if cookie_dict.get('hostOnly'): - cookie_dict['domain'] = cookie_dict['domain'].lstrip('.') - else: - cookie_dict['domain'] = '.' 
+ cookie_dict['domain'].lstrip('.') - - cookiejar.set_cookie(http.cookiejar.Cookie( - 0, cookie_dict['key'], cookie_dict['value'], - None, False, - cookie_dict['domain'], True, not cookie_dict.get('hostOnly'), - cookie_dict.get('path', '/'), True, - bool(cookie_dict.get('secure')), - unified_timestamp(cookie_dict.get('expires')), - False, None, None, {})) - - def _ensure_jsdom(self): - if self._JSDOM_IMPORT_CHECKED: - return - with TempFileWrapper('import jsdom from "https://cdn.esm.sh/jsdom"', suffix='.js') as js_file: - cmd = [self.exe, 'run', js_file.name] - self._run_deno(cmd) - self._JSDOM_IMPORT_CHECKED = True - - def execute(self, jscode, video_id=None, note='Executing JS in Deno', location='', html='', cookiejar=None): - self.report_note(video_id, note) - self._ensure_jsdom() - script = f'''{self._init_script}; - import jsdom from "https://cdn.esm.sh/jsdom"; - const callback = (() => {{ - const jar = jsdom.CookieJar.deserializeSync({json.dumps(self.serialize_cookie(cookiejar, location))}); - const dom = new jsdom.JSDOM({json.dumps(str(html))}, {{ - {'url: %s,' % json.dumps(str(location)) if location else ''} - cookieJar: jar, - }}); - Object.keys(dom.window).forEach((key) => {{try {{window[key] = dom.window[key]}} catch (e) {{}}}}); - delete window.jsdom; - const stdout = []; - const origLog = console.log; - console.log = (...msg) => stdout.push(msg.map(m => m.toString()).join(' ')); - return () => {{ origLog(JSON.stringify({{ - stdout: stdout.join('\\n'), cookies: jar.serializeSync().cookies}})); }} - }})(); - await (async () => {{ - {jscode} - }})().finally(callback); - ''' - - location_args = ['--location', location] if location else [] - with TempFileWrapper(script, suffix='.js') as js_file: - cmd = [self.exe, 'run', *self._flags, *location_args, js_file.name] - data = json.loads(self._run_deno(cmd)) - self.apply_cookies(cookiejar, data['cookies']) - return data['stdout'] - - -class PuppeteerJSI(ExternalJSI): - _PACKAGE_VERSION = '16.2.0' - _HEADLESS = 
False - _EXE_NAME = DenoJSI._EXE_NAME - - @classproperty - def INSTALL_HINT(cls): - msg = f'Run "deno run -A https://deno.land/x/puppeteer@{cls._PACKAGE_VERSION}/install.ts" to install puppeteer' - if not DenoJSI.is_available: - msg = f'{DenoJSI.INSTALL_HINT}. Then {msg}' - return msg - - @classproperty(cache=True) - def full_version(cls): - if not DenoJSI.is_available: - return - try: - browser_version = DenoJSI._execute(f''' - import puppeteer from "https://deno.land/x/puppeteer@{cls._PACKAGE_VERSION}/mod.ts"; - const browser = await puppeteer.launch({{headless: {json.dumps(bool(cls._HEADLESS))}}}); - try {{ - console.log(await browser.version()) - }} finally {{ - await browser.close(); - }}''', flags=['--allow-all']) - return f'puppeteer={cls._PACKAGE_VERSION} browser={browser_version}' - except ExtractorError: - return None - - @classproperty - def exe_version(cls): - return DenoJSI.exe_version if cls.full_version else None - - def __init__(self, downloader: YoutubeDL, timeout: float | int | None = None): - super().__init__(downloader, timeout) - self.deno = DenoJSI(downloader, timeout=(self.timeout + 30000)) - - def _deno_execute(self, jscode, note=None): - return self.deno.execute(f''' - import puppeteer from "https://deno.land/x/puppeteer@{self._PACKAGE_VERSION}/mod.ts"; - const browser = await puppeteer.launch({{ - headless: {json.dumps(bool(self._HEADLESS))}, args: ["--disable-web-security"]}}); - try {{ - {jscode} - }} finally {{ - await browser.close(); - }}''', note=note, flags=['--allow-all'], base_js='') - - def execute(self, jscode, video_id=None, note='Executing JS in Puppeteer', url='about:blank'): - self._downloader.to_screen(f'{format_field(video_id, None, "%s: ")}{note}') - return self._deno_execute(f''' - const page = await browser.newPage(); - window.setTimeout(async () => {{ - console.error('Puppeteer execution timed out'); - await browser.close(); - Deno.exit(1); - }}, {int(self.timeout)}); - page.resourceTimeout = {int(self.timeout)}; - - 
// drop network requests - await page.setRequestInterception(true); - page.on("request", request => request.abort()); - // capture console output - page.on("console", msg => {{ - msg.type() === 'log' && console.log(msg.text()); - msg.type() === 'error' && console.error(msg.text()); - }}); - - const url = {json.dumps(str(url))}; - await page.evaluate(`window.history.replaceState('', '', ${{JSON.stringify(url)}})`); - - await page.evaluate({json.dumps(str(jscode))}); - await browser.close(); - Deno.exit(0); - ''') - - @register_jsi class PhantomJSJSI(ExternalJSI): _EXE_NAME = 'phantomjs' @@ -474,6 +258,5 @@ def execute(self, jscode, video_id=None, *, note='Executing JS in PhantomJS'): if typing.TYPE_CHECKING: - from ..YoutubeDL import YoutubeDL from ..extractor.common import InfoExtractor from ..cookies import YoutubeDLCookieJar diff --git a/yt_dlp/jsinterp/common.py b/yt_dlp/jsinterp/common.py index 42c3e7b02..e1a22d041 100644 --- a/yt_dlp/jsinterp/common.py +++ b/yt_dlp/jsinterp/common.py @@ -96,8 +96,9 @@ def __init__( self.write_debug(f'Selected JSI classes for given features: {get_jsi_keys(handler_classes)}, ' f'included: {get_jsi_keys(only_include) or "all"}, excluded: {get_jsi_keys(exclude)}') - self._handler_dict = {cls.JSI_KEY: cls(self._downloader, timeout=timeout, user_agent=user_agent, - **jsi_params.get(cls.JSI_KEY, {})) for cls in handler_classes} + self._handler_dict = { + cls.JSI_KEY: cls(self._downloader, timeout=timeout, features=self._features, user_agent=user_agent, + **jsi_params.get(cls.JSI_KEY, {})) for cls in handler_classes} self.preferences: set[JSIPreference] = {order_to_pref(preferred_order, 100)} | _JSI_PREFERENCES self._fallback_jsi = get_jsi_keys(handler_classes) if fallback_jsi == 'all' else get_jsi_keys(fallback_jsi) self._is_test = self._downloader.params.get('test', False) @@ -195,9 +196,10 @@ class JSI(abc.ABC): _SUPPORT_FEATURES: set[str] = set() _BASE_PREFERENCE: int = 0 - def __init__(self, downloader: YoutubeDL, timeout: float 
| int, user_agent=None): + def __init__(self, downloader: YoutubeDL, timeout: float | int, features: set[str], user_agent=None): self._downloader = downloader self.timeout = timeout + self.features = features self.user_agent: str = user_agent or self._downloader.params['http_headers']['User-Agent'] @abc.abstractmethod From 96c6cdeec4a7d1ac732d8fca15cdbf3f8ca96d18 Mon Sep 17 00:00:00 2001 From: c-basalt <117849907+c-basalt@users.noreply.github.com> Date: Mon, 30 Dec 2024 15:53:04 -0500 Subject: [PATCH 33/48] fix --- yt_dlp/jsinterp/_deno.py | 10 ++++++---- yt_dlp/jsinterp/_phantomjs.py | 11 +++++------ 2 files changed, 11 insertions(+), 10 deletions(-) diff --git a/yt_dlp/jsinterp/_deno.py b/yt_dlp/jsinterp/_deno.py index 84f19ed0a..999bc0c22 100644 --- a/yt_dlp/jsinterp/_deno.py +++ b/yt_dlp/jsinterp/_deno.py @@ -71,6 +71,7 @@ class DenoJSDomJSI(DenoJSI): _BASE_PREFERENCE = 4 _DENO_FLAGS = ['--cached-only', '--no-prompt', '--no-check'] _JSDOM_IMPORT_CHECKED = False + _JSDOM_URL = 'https://cdn.esm.sh/jsdom' @staticmethod def serialize_cookie(cookiejar: YoutubeDLCookieJar | None, url: str): @@ -80,11 +81,11 @@ def serialize_cookie(cookiejar: YoutubeDLCookieJar | None, url: str): # https://github.com/salesforce/tough-cookie/blob/master/lib/cookie/cookie.ts if not cookiejar: return json.dumps({'cookies': []}) - cookies: list[http.cookiejar.Cookie] = [cookie for cookie in cookiejar.get_cookies_for_url(url)] + cookies: list[http.cookiejar.Cookie] = list(cookiejar.get_cookies_for_url(url)) return json.dumps({'cookies': [{ 'key': cookie.name, 'value': cookie.value, - # leading dot must be removed, otherwise will fail to match + # leading dot of domain must be removed, otherwise will fail to match 'domain': cookie.domain.lstrip('.') or urllib.parse.urlparse(url).hostname, 'expires': int_or_none(cookie.expires, invscale=1000), 'hostOnly': not cookie.domain_initial_dot, @@ -118,7 +119,7 @@ def apply_cookies(cookiejar: YoutubeDLCookieJar | None, cookies: list[dict]): def 
_ensure_jsdom(self): if self._JSDOM_IMPORT_CHECKED: return - with TempFileWrapper('import jsdom from "https://cdn.esm.sh/jsdom"', suffix='.js') as js_file: + with TempFileWrapper(f'import jsdom from "{self._JSDOM_URL}"', suffix='.js') as js_file: cmd = [self.exe, 'run', js_file.name] self._run_deno(cmd) self._JSDOM_IMPORT_CHECKED = True @@ -128,12 +129,13 @@ def execute(self, jscode, video_id=None, note='Executing JS in Deno', location=' self._ensure_jsdom() callback_varname = f'__callback_{random_string()}' script = f'''{self._init_script}; - import jsdom from "https://cdn.esm.sh/jsdom"; + import jsdom from "{self._JSDOM_URL}"; const {callback_varname} = (() => {{ const jar = jsdom.CookieJar.deserializeSync({json.dumps(self.serialize_cookie(cookiejar, location))}); const dom = new jsdom.JSDOM({json.dumps(str(html))}, {{ {'url: %s,' % json.dumps(str(location)) if location else ''} cookieJar: jar, + runScripts: 'dangerously', }}); Object.keys(dom.window).forEach((key) => {{try {{window[key] = dom.window[key]}} catch (e) {{}}}}); delete window.jsdom; diff --git a/yt_dlp/jsinterp/_phantomjs.py b/yt_dlp/jsinterp/_phantomjs.py index 42aad4d3b..37b42a212 100644 --- a/yt_dlp/jsinterp/_phantomjs.py +++ b/yt_dlp/jsinterp/_phantomjs.py @@ -11,6 +11,7 @@ from ..utils import ( ExtractorError, Popen, + filter_dict, int_or_none, is_outdated_version, shell_quote, @@ -87,19 +88,17 @@ def _cookie_to_dict(cookie: http.cookiejar.Cookie): if not cookie_dict['domain']: cookie_dict['domain'] = urllib.parse.urlparse(url).hostname cookie_dict['port'] = urllib.parse.urlparse(url).port - for key in [key for key, value in cookie_dict.items() if value is None]: - cookie_dict.pop(key) with contextlib.suppress(TypeError): if (cookie.has_nonstandard_attr('httpOnly') or cookie.has_nonstandard_attr('httponly') or cookie.has_nonstandard_attr('HttpOnly')): cookie_dict['httponly'] = True - return cookie_dict + return filter_dict(cookie_dict) cookies = cookiejar.get_cookies_for_url(url) if cookiejar 
else [] return json.dumps([_cookie_to_dict(cookie) for cookie in cookies]) - def _load_cookies(self, cookies_json: str, cookiejar): + def _load_cookies(self, cookies_json: str, cookiejar: YoutubeDLCookieJar | None): if not cookiejar: return cookies = json.loads(cookies_json) @@ -110,7 +109,7 @@ def _load_cookies(self, cookies_json: str, cookiejar): cookie.get('path', '/'), True, cookie.get('secure', False), cookie.get('expiry'), cookie.get('discard', False), None, None, - {'httpOnly': None} if cookie.get('httponly') is True else {} + {'httpOnly': None} if cookie.get('httponly') is True else {}, )) def _execute(self, jscode: str, video_id=None, *, note='Executing JS in PhantomJS'): @@ -139,7 +138,7 @@ def _execute_html(self, jscode: str, url: str, html: str, cookiejar, video_id=No html_file = TempFileWrapper(html, suffix='.html') cookie_file = TempFileWrapper(self._save_cookies(url, cookiejar), suffix='.json') - jscode = self._TEMPLATE.format(**{ + jscode = self._TEMPLATE.format_map({ 'url': json.dumps(str(url)), 'ua': json.dumps(str(self.user_agent)), 'jscode': jscode, From fb474064ee9930f3a906477de135469ed68dccf7 Mon Sep 17 00:00:00 2001 From: c-basalt <117849907+c-basalt@users.noreply.github.com> Date: Mon, 30 Dec 2024 17:09:47 -0500 Subject: [PATCH 34/48] test --- test/test_jsinterp_external.py | 53 ++++++++++++++++ yt_dlp/jsinterp/_deno.py | 108 ++++++++------------------------- yt_dlp/jsinterp/_phantomjs.py | 4 +- yt_dlp/jsinterp/common.py | 2 + 4 files changed, 83 insertions(+), 84 deletions(-) create mode 100644 test/test_jsinterp_external.py diff --git a/test/test_jsinterp_external.py b/test/test_jsinterp_external.py new file mode 100644 index 000000000..b0a046db9 --- /dev/null +++ b/test/test_jsinterp_external.py @@ -0,0 +1,53 @@ +#!/usr/bin/env python3 + +# Allow direct execution +import os +import sys +import unittest + +sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + + +from test.helper import ( + FakeYDL, +) +from 
yt_dlp.jsinterp.common import ExternalJSI +from yt_dlp.jsinterp._deno import DenoJSI, DenoJITlessJSI, DenoJSDomJSI +from yt_dlp.jsinterp._phantomjs import PhantomJSJSI + + +class Base: + class TestExternalJSI(unittest.TestCase): + _JSI_CLASS: type[ExternalJSI] = None + + def setUp(self): + self.ydl = FakeYDL() + self.jsi = self._JSI_CLASS(self.ydl, 19, {}) + if not self.jsi_available(): + self.skipTest('Not available') + + def jsi_available(self): + return self._JSI_CLASS and self._JSI_CLASS.exe_version + + def test_execute(self): + self.assertEqual(self.jsi.execute('console.log("Hello, world!");'), 'Hello, world!') + + +class TestDeno(Base.TestExternalJSI): + _JSI_CLASS = DenoJSI + + +class TestDenoJITless(Base.TestExternalJSI): + _JSI_CLASS = DenoJITlessJSI + + +class TestDenoDom(Base.TestExternalJSI): + _JSI_CLASS = DenoJSDomJSI + + +class TestPhantomJS(Base.TestExternalJSI): + _JSI_CLASS = PhantomJSJSI + + +if __name__ == '__main__': + unittest.main() diff --git a/yt_dlp/jsinterp/_deno.py b/yt_dlp/jsinterp/_deno.py index 999bc0c22..d7f4cfa78 100644 --- a/yt_dlp/jsinterp/_deno.py +++ b/yt_dlp/jsinterp/_deno.py @@ -2,6 +2,7 @@ import http.cookiejar import json +import re import subprocess import typing import urllib.parse @@ -28,11 +29,23 @@ class DenoJSI(ExternalJSI): _DENO_FLAGS = ['--cached-only', '--no-prompt', '--no-check'] _INIT_SCRIPT = 'localStorage.clear(); delete window.Deno; global = window;\n' - def __init__(self, downloader: YoutubeDL, timeout=None, features: set[str] = {}, flags=[], replace_flags=False, init_script=None): - super().__init__(downloader, timeout, features) + def __init__(self, *args, flags=[], replace_flags=False, init_script=None, **kwargs): + super().__init__(*args, **kwargs) self._flags = flags if replace_flags else [*self._DENO_FLAGS, *flags] self._init_script = self._INIT_SCRIPT if init_script is None else init_script + @property + def _override_navigator_js(self): + return '\n'.join([ + 'Object.defineProperty(navigator, "%s", { 
value: %s, configurable: true });' % (k, json.dumps(v)) + for k, v in { + 'userAgent': self.user_agent, + 'language': 'en-US', + 'languages': ['en-US'], + 'webdriver': False, + }.items() + ]) + def _run_deno(self, cmd): self.write_debug(f'Deno command line: {shell_quote(cmd)}') try: @@ -49,7 +62,7 @@ def _run_deno(self, cmd): def execute(self, jscode, video_id=None, note='Executing JS in Deno', location=None): self.report_note(video_id, note) location_args = ['--location', location] if location else [] - with TempFileWrapper(f'{self._init_script};\n{jscode}', suffix='.js') as js_file: + with TempFileWrapper(f'{self._init_script};\n{self._override_navigator_js}\n{jscode}', suffix='.js') as js_file: cmd = [self.exe, 'run', *self._flags, *location_args, js_file.name] return self._run_deno(cmd) @@ -128,14 +141,20 @@ def execute(self, jscode, video_id=None, note='Executing JS in Deno', location=' self.report_note(video_id, note) self._ensure_jsdom() callback_varname = f'__callback_{random_string()}' + + inline_scripts = '\n'.join([ + 'try { %s } catch (e) {}' % script + for script in re.findall(r']*>(.+?)', html, re.DOTALL) + ]) + script = f'''{self._init_script}; + {self._override_navigator_js}; import jsdom from "{self._JSDOM_URL}"; const {callback_varname} = (() => {{ const jar = jsdom.CookieJar.deserializeSync({json.dumps(self.serialize_cookie(cookiejar, location))}); const dom = new jsdom.JSDOM({json.dumps(str(html))}, {{ {'url: %s,' % json.dumps(str(location)) if location else ''} cookieJar: jar, - runScripts: 'dangerously', }}); Object.keys(dom.window).forEach((key) => {{try {{window[key] = dom.window[key]}} catch (e) {{}}}}); delete window.jsdom; @@ -145,6 +164,9 @@ def execute(self, jscode, video_id=None, note='Executing JS in Deno', location=' return () => {{ origLog(JSON.stringify({{ stdout: stdout.join('\\n'), cookies: jar.serializeSync().cookies}})); }} }})(); + await (async () => {{ + {inline_scripts} + }})(); await (async () => {{ {jscode} 
}})().finally({callback_varname}); @@ -158,83 +180,5 @@ def execute(self, jscode, video_id=None, note='Executing JS in Deno', location=' return data['stdout'] -class PuppeteerJSI(ExternalJSI): - _PACKAGE_VERSION = '16.2.0' - _HEADLESS = False - _EXE_NAME = DenoJSI._EXE_NAME - - @classproperty - def INSTALL_HINT(cls): - msg = f'Run "deno run -A https://deno.land/x/puppeteer@{cls._PACKAGE_VERSION}/install.ts" to install puppeteer' - if not DenoJSI.is_available: - msg = f'{DenoJSI.INSTALL_HINT}. Then {msg}' - return msg - - @classproperty(cache=True) - def full_version(cls): - if not DenoJSI.is_available: - return - try: - browser_version = DenoJSI._execute(f''' - import puppeteer from "https://deno.land/x/puppeteer@{cls._PACKAGE_VERSION}/mod.ts"; - const browser = await puppeteer.launch({{headless: {json.dumps(bool(cls._HEADLESS))}}}); - try {{ - console.log(await browser.version()) - }} finally {{ - await browser.close(); - }}''', flags=['--allow-all']) - return f'puppeteer={cls._PACKAGE_VERSION} browser={browser_version}' - except ExtractorError: - return None - - @classproperty - def exe_version(cls): - return DenoJSI.exe_version if cls.full_version else None - - def __init__(self, downloader: YoutubeDL, timeout: float | int | None = None): - super().__init__(downloader, timeout) - self.deno = DenoJSI(downloader, timeout=(self.timeout + 30000)) - - def _deno_execute(self, jscode, note=None): - return self.deno.execute(f''' - import puppeteer from "https://deno.land/x/puppeteer@{self._PACKAGE_VERSION}/mod.ts"; - const browser = await puppeteer.launch({{ - headless: {json.dumps(bool(self._HEADLESS))}, args: ["--disable-web-security"]}}); - try {{ - {jscode} - }} finally {{ - await browser.close(); - }}''', note=note, flags=['--allow-all'], base_js='') - - def execute(self, jscode, video_id=None, note='Executing JS in Puppeteer', url='about:blank'): - self.report_note(video_id, note) - return self._deno_execute(f''' - const page = await browser.newPage(); - 
window.setTimeout(async () => {{ - console.error('Puppeteer execution timed out'); - await browser.close(); - Deno.exit(1); - }}, {int(self.timeout)}); - page.resourceTimeout = {int(self.timeout)}; - - // drop network requests - await page.setRequestInterception(true); - page.on("request", request => request.abort()); - // capture console output - page.on("console", msg => {{ - msg.type() === 'log' && console.log(msg.text()); - msg.type() === 'error' && console.error(msg.text()); - }}); - - const url = {json.dumps(str(url))}; - await page.evaluate(`window.history.replaceState('', '', ${{JSON.stringify(url)}})`); - - await page.evaluate({json.dumps(str(jscode))}); - await browser.close(); - Deno.exit(0); - ''') - - if typing.TYPE_CHECKING: - from ..YoutubeDL import YoutubeDL from ..cookies import YoutubeDLCookieJar diff --git a/yt_dlp/jsinterp/_phantomjs.py b/yt_dlp/jsinterp/_phantomjs.py index 37b42a212..2f4c6d105 100644 --- a/yt_dlp/jsinterp/_phantomjs.py +++ b/yt_dlp/jsinterp/_phantomjs.py @@ -173,12 +173,12 @@ def execute(self, jscode, video_id=None, 'std_var': f'__stdout__values_{random_string()}', 'jscode': jscode, } - return self._execute_html(jscode, location, html, cookiejar, video_id=video_id, note=note)[1] + return self._execute_html(jscode, location, html, cookiejar, video_id=video_id, note=note)[1].strip() if html: self.report_warning('`location` is required to use `html`') if cookiejar: self.report_warning('`location` and `html` are required to use `cookiejar`') - return self._execute(jscode, video_id, note=note) + return self._execute(jscode, video_id, note=note).strip() class PhantomJSwrapper: diff --git a/yt_dlp/jsinterp/common.py b/yt_dlp/jsinterp/common.py index e1a22d041..8dfe195bf 100644 --- a/yt_dlp/jsinterp/common.py +++ b/yt_dlp/jsinterp/common.py @@ -197,6 +197,8 @@ class JSI(abc.ABC): _BASE_PREFERENCE: int = 0 def __init__(self, downloader: YoutubeDL, timeout: float | int, features: set[str], user_agent=None): + if not 
self._SUPPORT_FEATURES.issuperset(features): + raise ExtractorError(f'{self.JSI_NAME} does not support all required features: {features}') self._downloader = downloader self.timeout = timeout self.features = features From b87a0582c52fe8ab19e65fbef044c07c20521a76 Mon Sep 17 00:00:00 2001 From: c-basalt <117849907+c-basalt@users.noreply.github.com> Date: Mon, 30 Dec 2024 17:54:45 -0500 Subject: [PATCH 35/48] test script tag --- test/test_jsinterp_external.py | 42 ++++++++++++++++++++++++++++++++++ yt_dlp/jsinterp/_deno.py | 23 +++++++++++++------ yt_dlp/jsinterp/_phantomjs.py | 2 +- yt_dlp/jsinterp/common.py | 10 ++++---- 4 files changed, 64 insertions(+), 13 deletions(-) diff --git a/test/test_jsinterp_external.py b/test/test_jsinterp_external.py index b0a046db9..7f66c032a 100644 --- a/test/test_jsinterp_external.py +++ b/test/test_jsinterp_external.py @@ -32,6 +32,48 @@ def jsi_available(self): def test_execute(self): self.assertEqual(self.jsi.execute('console.log("Hello, world!");'), 'Hello, world!') + def test_execute_dom_parse(self): + if 'dom' not in self.jsi._SUPPORTED_FEATURES: + self.skipTest('DOM not supported') + self.assertEqual(self.jsi.execute( + 'console.log(document.getElementById("test-div").innerHTML);', + location='https://example.com', + html='
Hello, world!
'), + 'Hello, world!') + + def test_execute_dom_script(self): + if 'dom' not in self.jsi._SUPPORTED_FEATURES: + self.skipTest('DOM not supported') + self.assertEqual(self.jsi.execute( + 'console.log(document.getElementById("test-div").innerHTML);', + location='https://example.com', + html=''' +
+ + + '''), + 'Hello, world!') + + def test_execute_dom_script_with_error(self): + if 'dom' not in self.jsi._SUPPORTED_FEATURES: + self.skipTest('DOM not supported') + if self.jsi.JSI_KEY == 'PhantomJS': + self.skipTest('PhantomJS does not catch errors') + self.assertEqual(self.jsi.execute( + 'console.log(document.getElementById("test-div").innerHTML);', + location='https://example.com', + html=''' +
+ + + '''), + 'Hello, world!') + class TestDeno(Base.TestExternalJSI): _JSI_CLASS = DenoJSI diff --git a/yt_dlp/jsinterp/_deno.py b/yt_dlp/jsinterp/_deno.py index d7f4cfa78..a87c104f7 100644 --- a/yt_dlp/jsinterp/_deno.py +++ b/yt_dlp/jsinterp/_deno.py @@ -137,6 +137,12 @@ def _ensure_jsdom(self): self._run_deno(cmd) self._JSDOM_IMPORT_CHECKED = True + def _parse_script_tags(self, html: str): + for match_start in re.finditer(r']*>', html, re.DOTALL): + end = html.find('', match_start.end()) + if end > match_start.end(): + yield html[match_start.end():end] + def execute(self, jscode, video_id=None, note='Executing JS in Deno', location='', html='', cookiejar=None): self.report_note(video_id, note) self._ensure_jsdom() @@ -144,13 +150,13 @@ def execute(self, jscode, video_id=None, note='Executing JS in Deno', location=' inline_scripts = '\n'.join([ 'try { %s } catch (e) {}' % script - for script in re.findall(r']*>(.+?)', html, re.DOTALL) + for script in self._parse_script_tags(html) ]) script = f'''{self._init_script}; {self._override_navigator_js}; import jsdom from "{self._JSDOM_URL}"; - const {callback_varname} = (() => {{ + let {callback_varname} = (() => {{ const jar = jsdom.CookieJar.deserializeSync({json.dumps(self.serialize_cookie(cookiejar, location))}); const dom = new jsdom.JSDOM({json.dumps(str(html))}, {{ {'url: %s,' % json.dumps(str(location)) if location else ''} @@ -158,15 +164,18 @@ def execute(self, jscode, video_id=None, note='Executing JS in Deno', location=' }}); Object.keys(dom.window).forEach((key) => {{try {{window[key] = dom.window[key]}} catch (e) {{}}}}); delete window.jsdom; - const stdout = []; - const origLog = console.log; - console.log = (...msg) => stdout.push(msg.map(m => m.toString()).join(' ')); - return () => {{ origLog(JSON.stringify({{ - stdout: stdout.join('\\n'), cookies: jar.serializeSync().cookies}})); }} + return () => {{ + const stdout = []; + const origLog = console.log; + console.log = (...msg) => stdout.push(msg.map(m 
=> m.toString()).join(' ')); + return () => {{ origLog(JSON.stringify({{ + stdout: stdout.join('\\n'), cookies: jar.serializeSync().cookies}})); }} + }} }})(); await (async () => {{ {inline_scripts} }})(); + {callback_varname} = {callback_varname}(); await (async () => {{ {jscode} }})().finally({callback_varname}); diff --git a/yt_dlp/jsinterp/_phantomjs.py b/yt_dlp/jsinterp/_phantomjs.py index 2f4c6d105..9f03cd7d5 100644 --- a/yt_dlp/jsinterp/_phantomjs.py +++ b/yt_dlp/jsinterp/_phantomjs.py @@ -23,7 +23,7 @@ @register_jsi class PhantomJSJSI(ExternalJSI): _EXE_NAME = 'phantomjs' - _SUPPORTED_FEATURES = {'js', 'location', 'cookies'} + _SUPPORTED_FEATURES = {'js', 'location', 'cookies', 'dom'} _BASE_PREFERENCE = 3 _BASE_JS = R''' diff --git a/yt_dlp/jsinterp/common.py b/yt_dlp/jsinterp/common.py index 8dfe195bf..9169ec14f 100644 --- a/yt_dlp/jsinterp/common.py +++ b/yt_dlp/jsinterp/common.py @@ -92,7 +92,7 @@ def __init__( jsi_keys = [key for key in get_jsi_keys(only_include or _JSI_HANDLERS) if key not in get_jsi_keys(exclude)] self.write_debug(f'Allowed JSI keys: {jsi_keys}') handler_classes = [_JSI_HANDLERS[key] for key in jsi_keys - if _JSI_HANDLERS[key]._SUPPORT_FEATURES.issuperset(self._features)] + if _JSI_HANDLERS[key]._SUPPORTED_FEATURES.issuperset(self._features)] self.write_debug(f'Selected JSI classes for given features: {get_jsi_keys(handler_classes)}, ' f'included: {get_jsi_keys(only_include) or "all"}, excluded: {get_jsi_keys(exclude)}') @@ -106,7 +106,7 @@ def __init__( def add_handler(self, handler: JSI): """Add a handler. 
If a handler of the same JSI_KEY exists, it will overwrite it""" assert isinstance(handler, JSI), 'handler must be a JSI instance' - if not handler._SUPPORT_FEATURES.issuperset(self._features): + if not handler._SUPPORTED_FEATURES.issuperset(self._features): raise ExtractorError(f'{handler.JSI_NAME} does not support all required features: {self._features}') self._handler_dict[handler.JSI_KEY] = handler @@ -193,11 +193,11 @@ def execute(self, jscode: str, video_id: str | None, **kwargs) -> str: class JSI(abc.ABC): - _SUPPORT_FEATURES: set[str] = set() + _SUPPORTED_FEATURES: set[str] = set() _BASE_PREFERENCE: int = 0 def __init__(self, downloader: YoutubeDL, timeout: float | int, features: set[str], user_agent=None): - if not self._SUPPORT_FEATURES.issuperset(features): + if not self._SUPPORTED_FEATURES.issuperset(features): raise ExtractorError(f'{self.JSI_NAME} does not support all required features: {features}') self._downloader = downloader self.timeout = timeout @@ -250,7 +250,7 @@ def register_jsi(jsi_cls: JsiClass) -> JsiClass: """Register a JS interpreter class""" assert issubclass(jsi_cls, JSI), f'{jsi_cls} must be a subclass of JSI' assert jsi_cls.JSI_KEY not in _JSI_HANDLERS, f'JSI {jsi_cls.JSI_KEY} already registered' - assert jsi_cls._SUPPORT_FEATURES.issubset(_ALL_FEATURES), f'{jsi_cls._SUPPORT_FEATURES - _ALL_FEATURES} not declared in `_All_FEATURES`' + assert jsi_cls._SUPPORTED_FEATURES.issubset(_ALL_FEATURES), f'{jsi_cls._SUPPORTED_FEATURES - _ALL_FEATURES} not declared in `_All_FEATURES`' _JSI_HANDLERS[jsi_cls.JSI_KEY] = jsi_cls return jsi_cls From f0c1da252827ae885149dabf22cf4bda193d7051 Mon Sep 17 00:00:00 2001 From: c-basalt <117849907+c-basalt@users.noreply.github.com> Date: Tue, 31 Dec 2024 04:34:27 -0500 Subject: [PATCH 36/48] test cookie --- test/test_jsinterp_external.py | 95 ++++++++++++++++++++++++++++++++-- yt_dlp/jsinterp/_deno.py | 47 +++++++---------- yt_dlp/jsinterp/_helper.py | 30 +++++++++++ yt_dlp/jsinterp/_phantomjs.py | 12 +++-- 
4 files changed, 148 insertions(+), 36 deletions(-) diff --git a/test/test_jsinterp_external.py b/test/test_jsinterp_external.py index 7f66c032a..9cc4c970f 100644 --- a/test/test_jsinterp_external.py +++ b/test/test_jsinterp_external.py @@ -1,9 +1,14 @@ #!/usr/bin/env python3 -# Allow direct execution +from __future__ import annotations import os +import dataclasses +import datetime +import time import sys import unittest +import http.cookiejar + sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) @@ -11,14 +16,43 @@ from test.helper import ( FakeYDL, ) +from yt_dlp.cookies import YoutubeDLCookieJar from yt_dlp.jsinterp.common import ExternalJSI from yt_dlp.jsinterp._deno import DenoJSI, DenoJITlessJSI, DenoJSDomJSI from yt_dlp.jsinterp._phantomjs import PhantomJSJSI +@dataclasses.dataclass +class NetscapeFields: + name: str + value: str + domain: str + path: str + secure: bool + expires: int | None + + def to_cookie(self): + return http.cookiejar.Cookie( + 0, self.name, self.value, + None, False, + self.domain, True, self.domain.startswith('.'), + self.path, True, + self.secure, self.expires, False, + None, None, {}, + ) + + def expire_str(self): + return datetime.datetime.fromtimestamp( + self.expires, datetime.timezone.utc).strftime('%a, %d %b %Y %H:%M:%S GMT') + + def __eq__(self, other: NetscapeFields | http.cookiejar.Cookie): + return all(getattr(self, attr) == getattr(other, attr) for attr in ['name', 'value', 'domain', 'path', 'secure', 'expires']) + + class Base: class TestExternalJSI(unittest.TestCase): _JSI_CLASS: type[ExternalJSI] = None + maxDiff = 2000 def setUp(self): self.ydl = FakeYDL() @@ -52,6 +86,7 @@ def test_execute_dom_script(self): '''), 'Hello, world!') @@ -59,8 +94,6 @@ def test_execute_dom_script(self): def test_execute_dom_script_with_error(self): if 'dom' not in self.jsi._SUPPORTED_FEATURES: self.skipTest('DOM not supported') - if self.jsi.JSI_KEY == 'PhantomJS': - self.skipTest('PhantomJS does not catch 
errors') self.assertEqual(self.jsi.execute( 'console.log(document.getElementById("test-div").innerHTML);', location='https://example.com', @@ -69,11 +102,67 @@ def test_execute_dom_script_with_error(self): '''), 'Hello, world!') + def assert_cookiejar_equal(self, cookiejar: http.cookiejar.CookieJar, ref_cookiejar: http.cookiejar.CookieJar): + for cookie in cookiejar: + ref_cookie = next((c for c in ref_cookiejar if c.name == cookie.name and c.domain == cookie.domain), None) + self.assertEqual(repr(cookie), repr(ref_cookie)) + + def assert_cookie_str_equal(self, cookie_str, ref_cookie_str): + print([cookie_str, ref_cookie_str]) + self.assertEqual(set(cookie_str.split('; ')), set(ref_cookie_str.split('; '))) + + def test_execute_cookiejar(self): + if 'cookies' not in self.jsi._SUPPORTED_FEATURES: + self.skipTest('Cookies not supported') + cookiejar = YoutubeDLCookieJar() + ref_cookiejar = YoutubeDLCookieJar() + for test_cookie in [ + NetscapeFields('test1', 'test1', '.example.com', '/', False, int(time.time()) + 1000), + NetscapeFields('test2', 'test2', '.example.com', '/', True, int(time.time()) + 1000), + NetscapeFields('test3', 'test3', '.example.com', '/123', False, int(time.time()) + 1000), + NetscapeFields('test4', 'test4', '.example.com', '/456', False, int(time.time()) + 1000), + NetscapeFields('test5', 'test5', '.example.com', '/123', True, int(time.time()) + 1000), + NetscapeFields('test6', 'test6', '.example.com', '/456', True, int(time.time()) + 1000), + NetscapeFields('test1', 'other1', '.other.com', '/', False, int(time.time()) + 1000), + NetscapeFields('test2', 'other2', '.other.com', '/', False, int(time.time()) + 1000), + NetscapeFields('test7', 'other7', '.other.com', '/', False, int(time.time()) + 1000), + ]: + cookiejar.set_cookie(test_cookie.to_cookie()) + ref_cookiejar.set_cookie(test_cookie.to_cookie()) + + # test identity without modification from js + self.assert_cookie_str_equal(self.jsi.execute( + 'console.log(document.cookie);', + 
location='http://example.com/123/456', + html='
Hello, world!
', + cookiejar=cookiejar), + 'test1=test1; test3=test3') + self.assert_cookiejar_equal(cookiejar, ref_cookiejar) + + # test modification of existing cookie from js + new_cookie_1 = NetscapeFields('test1', 'new1', '.example.com', '/', True, int(time.time()) + 900) + new_cookie_2 = NetscapeFields('test2', 'new2', '.example.com', '/', True, int(time.time()) + 900) + ref_cookiejar.set_cookie(new_cookie_1.to_cookie()) + ref_cookiejar.set_cookie(new_cookie_2.to_cookie()) + self.assert_cookie_str_equal(self.jsi.execute( + f'''document.cookie = "test1=new1; secure; expires={new_cookie_1.expire_str()}; domain=.example.com; path=/"; + console.log(document.cookie);''', + location='https://example.com/123/456', + html=f'''
Hello, world!
+ + ''', + cookiejar=cookiejar), + 'test1=new1; test2=new2; test3=test3; test5=test5') + self.assert_cookiejar_equal(cookiejar, ref_cookiejar) + class TestDeno(Base.TestExternalJSI): _JSI_CLASS = DenoJSI diff --git a/yt_dlp/jsinterp/_deno.py b/yt_dlp/jsinterp/_deno.py index a87c104f7..9a3083c19 100644 --- a/yt_dlp/jsinterp/_deno.py +++ b/yt_dlp/jsinterp/_deno.py @@ -2,7 +2,6 @@ import http.cookiejar import json -import re import subprocess import typing import urllib.parse @@ -16,7 +15,7 @@ shell_quote, unified_timestamp, ) -from ._helper import TempFileWrapper, random_string +from ._helper import TempFileWrapper, random_string, override_navigator_js, extract_script_tags from .common import ExternalJSI, register_jsi @@ -36,15 +35,7 @@ def __init__(self, *args, flags=[], replace_flags=False, init_script=None, **kwa @property def _override_navigator_js(self): - return '\n'.join([ - 'Object.defineProperty(navigator, "%s", { value: %s, configurable: true });' % (k, json.dumps(v)) - for k, v in { - 'userAgent': self.user_agent, - 'language': 'en-US', - 'languages': ['en-US'], - 'webdriver': False, - }.items() - ]) + return override_navigator_js(self.user_agent) def _run_deno(self, cmd): self.write_debug(f'Deno command line: {shell_quote(cmd)}') @@ -137,21 +128,13 @@ def _ensure_jsdom(self): self._run_deno(cmd) self._JSDOM_IMPORT_CHECKED = True - def _parse_script_tags(self, html: str): - for match_start in re.finditer(r']*>', html, re.DOTALL): - end = html.find('', match_start.end()) - if end > match_start.end(): - yield html[match_start.end():end] - def execute(self, jscode, video_id=None, note='Executing JS in Deno', location='', html='', cookiejar=None): self.report_note(video_id, note) self._ensure_jsdom() callback_varname = f'__callback_{random_string()}' - inline_scripts = '\n'.join([ - 'try { %s } catch (e) {}' % script - for script in self._parse_script_tags(html) - ]) + html, inline_scripts = extract_script_tags(html) + wrapper_scripts = '\n'.join(['try { %s } 
catch (e) {}' % script for script in inline_scripts]) script = f'''{self._init_script}; {self._override_navigator_js}; @@ -164,27 +147,33 @@ def execute(self, jscode, video_id=None, note='Executing JS in Deno', location=' }}); Object.keys(dom.window).forEach((key) => {{try {{window[key] = dom.window[key]}} catch (e) {{}}}}); delete window.jsdom; + const origLog = console.log; + console.log = () => {{}}; + console.info = () => {{}}; return () => {{ const stdout = []; - const origLog = console.log; console.log = (...msg) => stdout.push(msg.map(m => m.toString()).join(' ')); return () => {{ origLog(JSON.stringify({{ stdout: stdout.join('\\n'), cookies: jar.serializeSync().cookies}})); }} }} }})(); - await (async () => {{ - {inline_scripts} - }})(); - {callback_varname} = {callback_varname}(); - await (async () => {{ + {wrapper_scripts} + {callback_varname} = {callback_varname}(); // begin to capture console.log + try {{ {jscode} - }})().finally({callback_varname}); + }} finally {{ + {callback_varname}(); + }} ''' location_args = ['--location', location] if location else [] with TempFileWrapper(script, suffix='.js') as js_file: cmd = [self.exe, 'run', *self._flags, *location_args, js_file.name] - data = json.loads(self._run_deno(cmd)) + result = self._run_deno(cmd) + try: + data = json.loads(result) + except json.JSONDecodeError as e: + raise ExtractorError(f'Failed to parse JSON output from Deno: {result}', cause=e) self.apply_cookies(cookiejar, data['cookies']) return data['stdout'] diff --git a/yt_dlp/jsinterp/_helper.py b/yt_dlp/jsinterp/_helper.py index 22525018d..dcf27deb4 100644 --- a/yt_dlp/jsinterp/_helper.py +++ b/yt_dlp/jsinterp/_helper.py @@ -1,7 +1,9 @@ from __future__ import annotations import contextlib +import json import os import random +import re import string import tempfile @@ -69,3 +71,31 @@ def __exit__(self, exc_type, exc_value, traceback): def random_string(length: int = 10) -> str: return ''.join(random.choices(string.ascii_letters, k=length)) 
+ + +def override_navigator_js(user_agent: str) -> str: + return '\n'.join([ + 'Object.defineProperty(navigator, "%s", { value: %s, configurable: true });' % (k, json.dumps(v)) + for k, v in { + 'userAgent': user_agent, + 'language': 'en-US', + 'languages': ['en-US'], + 'webdriver': False, + }.items() + ]) + + +def extract_script_tags(html: str) -> tuple[str, list[str]]: + script_indicies = [] + inline_scripts = [] + + for match_start in re.finditer(r']*>', html, re.DOTALL): + end = html.find('', match_start.end()) + if end > match_start.end(): + script_indicies.append((match_start.start(), end + len(''))) + inline_scripts.append(html[match_start.end():end]) + + for start, end in script_indicies: + html = html[:start] + html[end:] + + return html, inline_scripts diff --git a/yt_dlp/jsinterp/_phantomjs.py b/yt_dlp/jsinterp/_phantomjs.py index 9f03cd7d5..945ee5c9b 100644 --- a/yt_dlp/jsinterp/_phantomjs.py +++ b/yt_dlp/jsinterp/_phantomjs.py @@ -16,7 +16,7 @@ is_outdated_version, shell_quote, ) -from ._helper import TempFileWrapper, random_string +from ._helper import TempFileWrapper, random_string, extract_script_tags from .common import ExternalJSI, register_jsi @@ -135,19 +135,23 @@ def _execute_html(self, jscode: str, url: str, html: str, cookiejar, video_id=No if 'saveAndExit();' not in jscode: raise ExtractorError('`saveAndExit();` not found in `jscode`') + html, inline_scripts = extract_script_tags(html) + wrapped_scripts = '\n'.join([ + 'page.evaluate(function() { try { %s } catch (e) {} });' % inline for inline in inline_scripts]) + html_file = TempFileWrapper(html, suffix='.html') cookie_file = TempFileWrapper(self._save_cookies(url, cookiejar), suffix='.json') - jscode = self._TEMPLATE.format_map({ + script = self._TEMPLATE.format_map({ 'url': json.dumps(str(url)), 'ua': json.dumps(str(self.user_agent)), - 'jscode': jscode, + 'jscode': f'{wrapped_scripts}\n{jscode}', 'html_fn': json.dumps(html_file.name), 'cookies_fn': json.dumps(cookie_file.name), 
'timeout': int(self.timeout * 1000), }) - stdout = self._execute(jscode, video_id, note=note) + stdout = self._execute(script, video_id, note=note) self._load_cookies(cookie_file.read(), cookiejar) new_html = html_file.read() From b086b8635de05d469d434bf4c22a8ed598121270 Mon Sep 17 00:00:00 2001 From: c-basalt <117849907+c-basalt@users.noreply.github.com> Date: Tue, 31 Dec 2024 06:25:12 -0500 Subject: [PATCH 37/48] update interface --- test/test_jsinterp_external.py | 53 ++++++++++++--------------- yt_dlp/jsinterp/__init__.py | 6 ++-- yt_dlp/jsinterp/_deno.py | 18 ++++++---- yt_dlp/jsinterp/_phantomjs.py | 19 +++++----- yt_dlp/jsinterp/common.py | 65 +++++++++++++++++++++++----------- 5 files changed, 91 insertions(+), 70 deletions(-) diff --git a/test/test_jsinterp_external.py b/test/test_jsinterp_external.py index 9cc4c970f..081f4257a 100644 --- a/test/test_jsinterp_external.py +++ b/test/test_jsinterp_external.py @@ -56,47 +56,44 @@ class TestExternalJSI(unittest.TestCase): def setUp(self): self.ydl = FakeYDL() - self.jsi = self._JSI_CLASS(self.ydl, 19, {}) - if not self.jsi_available(): + self.url = '' + if not self._JSI_CLASS.exe_version: + print(f'{self._JSI_CLASS.__name__} is not installed, skipping') self.skipTest('Not available') - def jsi_available(self): - return self._JSI_CLASS and self._JSI_CLASS.exe_version + @property + def jsi(self): + return self._JSI_CLASS(self.ydl, self.url, 10, {}) def test_execute(self): self.assertEqual(self.jsi.execute('console.log("Hello, world!");'), 'Hello, world!') def test_execute_dom_parse(self): if 'dom' not in self.jsi._SUPPORTED_FEATURES: + print(f'{self._JSI_CLASS.__name__} does not support DOM, skipping') self.skipTest('DOM not supported') self.assertEqual(self.jsi.execute( 'console.log(document.getElementById("test-div").innerHTML);', - location='https://example.com', html='
Hello, world!
'), 'Hello, world!') def test_execute_dom_script(self): if 'dom' not in self.jsi._SUPPORTED_FEATURES: + print(f'{self._JSI_CLASS.__name__} does not support DOM, skipping') self.skipTest('DOM not supported') self.assertEqual(self.jsi.execute( 'console.log(document.getElementById("test-div").innerHTML);', - location='https://example.com', html='''
- - '''), 'Hello, world!') - def test_execute_dom_script_with_error(self): - if 'dom' not in self.jsi._SUPPORTED_FEATURES: - self.skipTest('DOM not supported') self.assertEqual(self.jsi.execute( 'console.log(document.getElementById("test-div").innerHTML);', - location='https://example.com', html='''
@@ -108,20 +105,20 @@ def test_execute_dom_script_with_error(self): '''), 'Hello, world!') - def assert_cookiejar_equal(self, cookiejar: http.cookiejar.CookieJar, ref_cookiejar: http.cookiejar.CookieJar): - for cookie in cookiejar: - ref_cookie = next((c for c in ref_cookiejar if c.name == cookie.name and c.domain == cookie.domain), None) - self.assertEqual(repr(cookie), repr(ref_cookie)) - - def assert_cookie_str_equal(self, cookie_str, ref_cookie_str): - print([cookie_str, ref_cookie_str]) - self.assertEqual(set(cookie_str.split('; ')), set(ref_cookie_str.split('; '))) - def test_execute_cookiejar(self): if 'cookies' not in self.jsi._SUPPORTED_FEATURES: + print(f'{self._JSI_CLASS.__name__} does not support cookies, skipping') self.skipTest('Cookies not supported') cookiejar = YoutubeDLCookieJar() ref_cookiejar = YoutubeDLCookieJar() + + def _assert_expected_execute(cookie_str, ref_cookie_str): + self.assertEqual(set(cookie_str.split('; ')), set(ref_cookie_str.split('; '))) + for cookie in cookiejar: + ref_cookie = next((c for c in ref_cookiejar if c.name == cookie.name + and c.domain == cookie.domain), None) + self.assertEqual(repr(cookie), repr(ref_cookie)) + for test_cookie in [ NetscapeFields('test1', 'test1', '.example.com', '/', False, int(time.time()) + 1000), NetscapeFields('test2', 'test2', '.example.com', '/', True, int(time.time()) + 1000), @@ -137,23 +134,20 @@ def test_execute_cookiejar(self): ref_cookiejar.set_cookie(test_cookie.to_cookie()) # test identity without modification from js - self.assert_cookie_str_equal(self.jsi.execute( - 'console.log(document.cookie);', - location='http://example.com/123/456', - html='
Hello, world!
', - cookiejar=cookiejar), + self.url = 'http://example.com/123/456' + _assert_expected_execute(self.jsi.execute( + 'console.log(document.cookie);', cookiejar=cookiejar), 'test1=test1; test3=test3') - self.assert_cookiejar_equal(cookiejar, ref_cookiejar) # test modification of existing cookie from js new_cookie_1 = NetscapeFields('test1', 'new1', '.example.com', '/', True, int(time.time()) + 900) new_cookie_2 = NetscapeFields('test2', 'new2', '.example.com', '/', True, int(time.time()) + 900) ref_cookiejar.set_cookie(new_cookie_1.to_cookie()) ref_cookiejar.set_cookie(new_cookie_2.to_cookie()) - self.assert_cookie_str_equal(self.jsi.execute( + self.url = 'https://example.com/123/456' + _assert_expected_execute(self.jsi.execute( f'''document.cookie = "test1=new1; secure; expires={new_cookie_1.expire_str()}; domain=.example.com; path=/"; console.log(document.cookie);''', - location='https://example.com/123/456', html=f'''
Hello, world!
+ '''), 'Hello, world!') + def test_dom_location(self): + if not self._JSI_CLASS._SUPPORTED_FEATURES.issuperset({'dom', 'location'}): + print(f'{self._JSI_CLASS.__name__} does not support both DOM and location, skipping') + self.skipTest('DOM or location not supported') + + self.url_param = 'https://example.com/123/456' self.assertEqual(self.jsi.execute( 'console.log(document.getElementById("test-div").innerHTML);', - html=''' -
- - - '''), - 'Hello, world!') + html=''' +
Hello, world!
'''), + 'example.com') def test_execute_cookiejar(self): if 'cookies' not in self.jsi._SUPPORTED_FEATURES: @@ -134,7 +154,7 @@ def _assert_expected_execute(cookie_str, ref_cookie_str): ref_cookiejar.set_cookie(test_cookie.to_cookie()) # test identity without modification from js - self.url = 'http://example.com/123/456' + self.url_param = 'http://example.com/123/456' _assert_expected_execute(self.jsi.execute( 'console.log(document.cookie);', cookiejar=cookiejar), 'test1=test1; test3=test3') @@ -144,7 +164,7 @@ def _assert_expected_execute(cookie_str, ref_cookie_str): new_cookie_2 = NetscapeFields('test2', 'new2', '.example.com', '/', True, int(time.time()) + 900) ref_cookiejar.set_cookie(new_cookie_1.to_cookie()) ref_cookiejar.set_cookie(new_cookie_2.to_cookie()) - self.url = 'https://example.com/123/456' + self.url_param = 'https://example.com/123/456' _assert_expected_execute(self.jsi.execute( f'''document.cookie = "test1=new1; secure; expires={new_cookie_1.expire_str()}; domain=.example.com; path=/"; console.log(document.cookie);''', diff --git a/yt_dlp/jsinterp/_deno.py b/yt_dlp/jsinterp/_deno.py index f002881af..8206f5b23 100644 --- a/yt_dlp/jsinterp/_deno.py +++ b/yt_dlp/jsinterp/_deno.py @@ -142,7 +142,6 @@ def execute(self, jscode, video_id=None, note='Executing JS in Deno with jsdom', callback_varname = f'__callback_{random_string()}' script = f'''{self._init_script}; - {self._override_navigator_js}; import jsdom from "{self._JSDOM_URL}"; let {callback_varname} = (() => {{ const jar = jsdom.CookieJar.deserializeSync({json.dumps(self.serialize_cookie(cookiejar, self._url))}); @@ -151,9 +150,12 @@ def execute(self, jscode, video_id=None, note='Executing JS in Deno with jsdom', cookieJar: jar, pretendToBeVisual: true, }}); - Object.keys(dom.window).filter(key => !['atob', 'btoa'].includes(key)).forEach((key) => {{ - try {{window[key] = dom.window[key]}} catch (e) {{}} + Object.keys(dom.window).filter(key => !['atob', 'btoa', 'crypto', 
'location'].includes(key)) + .filter(key => !(window.location? [] : ['sessionStorage', 'localStorage']).includes(key)) + .forEach((key) => {{ + try {{window[key] = dom.window[key]}} catch (e) {{ console.error(e) }} }}); + {self._override_navigator_js}; window.screen = {{ availWidth: 1920, @@ -168,8 +170,8 @@ def execute(self, jscode, video_id=None, note='Executing JS in Deno with jsdom', width: 1920, }} Object.defineProperty(document.body, 'clientWidth', {{value: 1903}}); - Object.defineProperty(document.body, 'clientHeight', {{value: 1035}}); - document.domain = location.hostname; + Object.defineProperty(document.body, 'clientHeight', {{value: 2000}}); + document.domain = location?.hostname; delete window.jsdom; const origLog = console.log; diff --git a/yt_dlp/jsinterp/_phantomjs.py b/yt_dlp/jsinterp/_phantomjs.py index e26d70c13..4c96ebeb0 100644 --- a/yt_dlp/jsinterp/_phantomjs.py +++ b/yt_dlp/jsinterp/_phantomjs.py @@ -164,26 +164,24 @@ def _execute_html(self, jscode: str, url: str, html: str, cookiejar, video_id=No return new_html, stdout def execute(self, jscode, video_id=None, note='Executing JS in PhantomJS', html='', cookiejar=None): - if self._url or html or cookiejar: - jscode = '''console.log(page.evaluate(function() { - var %(std_var)s = []; - console.log = function() { - var values = ''; - for (var i = 0; i < arguments.length; i++) { - values += arguments[i] + ' '; - } - %(std_var)s.push(values); + jscode = '''console.log(page.evaluate(function() { + var %(std_var)s = []; + console.log = function() { + var values = ''; + for (var i = 0; i < arguments.length; i++) { + values += arguments[i] + ' '; } - %(jscode)s; - return %(std_var)s.join('\\n'); - - })); - saveAndExit();''' % { - 'std_var': f'__stdout__values_{random_string()}', - 'jscode': jscode, + %(std_var)s.push(values); } - return self._execute_html(jscode, self._url, html, cookiejar, video_id=video_id, note=note)[1].strip() - return self._execute(jscode, video_id, note=note).strip() + 
%(jscode)s; + return %(std_var)s.join('\\n'); + + })); + saveAndExit();''' % { + 'std_var': f'__stdout__values_{random_string()}', + 'jscode': jscode, + } + return self._execute_html(jscode, self._url, html, cookiejar, video_id=video_id, note=note)[1].strip() class PhantomJSwrapper: diff --git a/yt_dlp/jsinterp/common.py b/yt_dlp/jsinterp/common.py index c2ef18ea9..1a3bf00d1 100644 --- a/yt_dlp/jsinterp/common.py +++ b/yt_dlp/jsinterp/common.py @@ -117,7 +117,7 @@ def __init__( self.write_debug(f'Allowed JSI keys: {jsi_keys}') handler_classes = [_JSI_HANDLERS[key] for key in jsi_keys if _JSI_HANDLERS[key]._SUPPORTED_FEATURES.issuperset(self._features)] - self.write_debug(f'Selected JSI classes for given features: {get_jsi_keys(handler_classes)}, ' + self.write_debug(f'Select JSI for features={self._features}: {get_jsi_keys(handler_classes)}, ' f'included: {get_jsi_keys(only_include) or "all"}, excluded: {get_jsi_keys(exclude)}') self._handler_dict = { From e0697299b694d9c29bc5b092d26c789e1e1951d7 Mon Sep 17 00:00:00 2001 From: c-basalt <117849907+c-basalt@users.noreply.github.com> Date: Thu, 2 Jan 2025 00:17:10 -0500 Subject: [PATCH 40/48] test wasm --- test/test_jsi_external.py | 64 +++-- test/testdata/jsi_external/hello_wasm.js | 234 ++++++++++++++++++ test/testdata/jsi_external/hello_wasm_bg.wasm | Bin 0 -> 16666 bytes yt_dlp/jsinterp/_helper.py | 23 ++ 4 files changed, 303 insertions(+), 18 deletions(-) create mode 100644 test/testdata/jsi_external/hello_wasm.js create mode 100644 test/testdata/jsi_external/hello_wasm_bg.wasm diff --git a/test/test_jsi_external.py b/test/test_jsi_external.py index 1d52e3fb3..450b7ca21 100644 --- a/test/test_jsi_external.py +++ b/test/test_jsi_external.py @@ -16,10 +16,14 @@ from test.helper import ( FakeYDL, ) +from yt_dlp.utils import ( + variadic, +) from yt_dlp.cookies import YoutubeDLCookieJar -from yt_dlp.jsinterp.common import ExternalJSI +from yt_dlp.jsinterp.common import ExternalJSI, _ALL_FEATURES from 
yt_dlp.jsinterp._deno import DenoJSI, DenoJITlessJSI, DenoJSDomJSI from yt_dlp.jsinterp._phantomjs import PhantomJSJSI +from yt_dlp.jsinterp._helper import prepare_wasm_jsmodule @dataclasses.dataclass @@ -49,9 +53,26 @@ def __eq__(self, other: NetscapeFields | http.cookiejar.Cookie): return all(getattr(self, attr) == getattr(other, attr) for attr in ['name', 'value', 'domain', 'path', 'secure', 'expires']) +covered_features = set() + + +def requires_feature(features): + covered_features.update(variadic(features)) + + def outer(func): + def wrapper(self, *args, **kwargs): + if not self.jsi._SUPPORTED_FEATURES.issuperset(variadic(features)): + print(f'{self._JSI_CLASS.__name__} does not support {features!r}, skipping') + self.skipTest(f'{"&".join(variadic(features))} not supported') + return func(self, *args, **kwargs) + return wrapper + return outer + + class Base: class TestExternalJSI(unittest.TestCase): _JSI_CLASS: type[ExternalJSI] = None + _TESTDATA_DIR = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'testdata', 'jsi_external') maxDiff = 2000 def setUp(self): @@ -77,28 +98,21 @@ def test_user_agent(self): jsi = self._JSI_CLASS(self.ydl, self.url_param, 10, {}, user_agent='test/ua') self.assertEqual(jsi.execute('console.log(navigator.userAgent);'), 'test/ua') + @requires_feature('location') def test_location(self): - if 'location' not in self._JSI_CLASS._SUPPORTED_FEATURES: - print(f'{self._JSI_CLASS.__name__} does not support location, skipping') - self.skipTest('Location not supported') self.url_param = 'https://example.com/123/456' self.assertEqual(self.jsi.execute('console.log(JSON.stringify([location.href, location.hostname]));'), '["https://example.com/123/456","example.com"]') + @requires_feature('dom') def test_execute_dom_parse(self): - if 'dom' not in self.jsi._SUPPORTED_FEATURES: - print(f'{self._JSI_CLASS.__name__} does not support DOM, skipping') - self.skipTest('DOM not supported') self.assertEqual(self.jsi.execute( 
'console.log(document.getElementById("test-div").innerHTML);', html='
Hello, world!
'), 'Hello, world!') + @requires_feature('dom') def test_execute_dom_script(self): - if 'dom' not in self.jsi._SUPPORTED_FEATURES: - print(f'{self._JSI_CLASS.__name__} does not support DOM, skipping') - self.skipTest('DOM not supported') - self.assertEqual(self.jsi.execute( 'console.log(document.getElementById("test-div").innerHTML);', html='''Hello, world! @@ -112,11 +126,8 @@ def test_execute_dom_script(self): '''), 'Hello, world!') + @requires_feature(['dom', 'location']) def test_dom_location(self): - if not self._JSI_CLASS._SUPPORTED_FEATURES.issuperset({'dom', 'location'}): - print(f'{self._JSI_CLASS.__name__} does not support both DOM and location, skipping') - self.skipTest('DOM or location not supported') - self.url_param = 'https://example.com/123/456' self.assertEqual(self.jsi.execute( 'console.log(document.getElementById("test-div").innerHTML);', @@ -125,10 +136,8 @@ def test_dom_location(self):
Hello, world!
'''), 'example.com') + @requires_feature('cookies') def test_execute_cookiejar(self): - if 'cookies' not in self.jsi._SUPPORTED_FEATURES: - print(f'{self._JSI_CLASS.__name__} does not support cookies, skipping') - self.skipTest('Cookies not supported') cookiejar = YoutubeDLCookieJar() ref_cookiejar = YoutubeDLCookieJar() @@ -176,6 +185,22 @@ def _assert_expected_execute(cookie_str, ref_cookie_str): cookiejar=cookiejar), 'test1=new1; test2=new2; test3=test3; test5=test5') + @requires_feature('wasm') + def test_wasm(self): + with open(os.path.join(self._TESTDATA_DIR, 'hello_wasm.js')) as f: + js_mod = f.read() + with open(os.path.join(self._TESTDATA_DIR, 'hello_wasm_bg.wasm'), 'rb') as f: + wasm = f.read() + + js_base = prepare_wasm_jsmodule(js_mod, wasm) + + js_code = js_base + '''; + console.log(add(1, 2)); + greet('world'); + ''' + + self.assertEqual(self.jsi.execute(js_code), '3\nHello, world!') + class TestDeno(Base.TestExternalJSI): _JSI_CLASS = DenoJSI @@ -193,5 +218,8 @@ class TestPhantomJS(Base.TestExternalJSI): _JSI_CLASS = PhantomJSJSI +expect_covered_features = set(_ALL_FEATURES) - {'js'} +assert covered_features.issuperset(expect_covered_features), f'Missing tests for features: {expect_covered_features - covered_features}' + if __name__ == '__main__': unittest.main() diff --git a/test/testdata/jsi_external/hello_wasm.js b/test/testdata/jsi_external/hello_wasm.js new file mode 100644 index 000000000..1a3a31c46 --- /dev/null +++ b/test/testdata/jsi_external/hello_wasm.js @@ -0,0 +1,234 @@ +// wasm-pack build --target web +/* lib.rs +use wasm_bindgen::prelude::*; + +#[wasm_bindgen] +extern "C" { + pub fn eval(s: &str); +} + +#[wasm_bindgen] +pub fn greet(name: &str) { + eval(&format!("console.log('Hello, {}!')", name)); +} + +#[wasm_bindgen] +pub fn add(left: i32, right: i32) -> i32 { + left + right +} +*/ + +let wasm; + +const cachedTextDecoder = (typeof TextDecoder !== 'undefined' ? 
new TextDecoder('utf-8', { ignoreBOM: true, fatal: true }) : { decode: () => { throw Error('TextDecoder not available') } } ); + +if (typeof TextDecoder !== 'undefined') { cachedTextDecoder.decode(); }; + +let cachedUint8ArrayMemory0 = null; + +function getUint8ArrayMemory0() { + if (cachedUint8ArrayMemory0 === null || cachedUint8ArrayMemory0.byteLength === 0) { + cachedUint8ArrayMemory0 = new Uint8Array(wasm.memory.buffer); + } + return cachedUint8ArrayMemory0; +} + +function getStringFromWasm0(ptr, len) { + ptr = ptr >>> 0; + return cachedTextDecoder.decode(getUint8ArrayMemory0().subarray(ptr, ptr + len)); +} + +let WASM_VECTOR_LEN = 0; + +const cachedTextEncoder = (typeof TextEncoder !== 'undefined' ? new TextEncoder('utf-8') : { encode: () => { throw Error('TextEncoder not available') } } ); + +const encodeString = (typeof cachedTextEncoder.encodeInto === 'function' + ? function (arg, view) { + return cachedTextEncoder.encodeInto(arg, view); +} + : function (arg, view) { + const buf = cachedTextEncoder.encode(arg); + view.set(buf); + return { + read: arg.length, + written: buf.length + }; +}); + +function passStringToWasm0(arg, malloc, realloc) { + + if (realloc === undefined) { + const buf = cachedTextEncoder.encode(arg); + const ptr = malloc(buf.length, 1) >>> 0; + getUint8ArrayMemory0().subarray(ptr, ptr + buf.length).set(buf); + WASM_VECTOR_LEN = buf.length; + return ptr; + } + + let len = arg.length; + let ptr = malloc(len, 1) >>> 0; + + const mem = getUint8ArrayMemory0(); + + let offset = 0; + + for (; offset < len; offset++) { + const code = arg.charCodeAt(offset); + if (code > 0x7F) break; + mem[ptr + offset] = code; + } + + if (offset !== len) { + if (offset !== 0) { + arg = arg.slice(offset); + } + ptr = realloc(ptr, len, len = offset + arg.length * 3, 1) >>> 0; + const view = getUint8ArrayMemory0().subarray(ptr + offset, ptr + len); + const ret = encodeString(arg, view); + + offset += ret.written; + ptr = realloc(ptr, len, offset, 1) >>> 0; + } + + 
WASM_VECTOR_LEN = offset; + return ptr; +} +/** + * @param {string} name + */ +export function greet(name) { + const ptr0 = passStringToWasm0(name, wasm.__wbindgen_malloc, wasm.__wbindgen_realloc); + const len0 = WASM_VECTOR_LEN; + wasm.greet(ptr0, len0); +} + +/** + * @param {number} left + * @param {number} right + * @returns {number} + */ +export function add(left, right) { + const ret = wasm.add(left, right); + return ret; +} + +async function __wbg_load(module, imports) { + if (typeof Response === 'function' && module instanceof Response) { + if (typeof WebAssembly.instantiateStreaming === 'function') { + try { + return await WebAssembly.instantiateStreaming(module, imports); + + } catch (e) { + if (module.headers.get('Content-Type') != 'application/wasm') { + console.warn("`WebAssembly.instantiateStreaming` failed because your server does not serve Wasm with `application/wasm` MIME type. Falling back to `WebAssembly.instantiate` which is slower. Original error:\n", e); + + } else { + throw e; + } + } + } + + const bytes = await module.arrayBuffer(); + return await WebAssembly.instantiate(bytes, imports); + + } else { + const instance = await WebAssembly.instantiate(module, imports); + + if (instance instanceof WebAssembly.Instance) { + return { instance, module }; + + } else { + return instance; + } + } +} + +function __wbg_get_imports() { + const imports = {}; + imports.wbg = {}; + imports.wbg.__wbg_eval_d1c6d8ede79fdfce = function(arg0, arg1) { + eval(getStringFromWasm0(arg0, arg1)); + }; + imports.wbg.__wbindgen_init_externref_table = function() { + const table = wasm.__wbindgen_export_0; + const offset = table.grow(4); + table.set(0, undefined); + table.set(offset + 0, undefined); + table.set(offset + 1, null); + table.set(offset + 2, true); + table.set(offset + 3, false); + ; + }; + + return imports; +} + +function __wbg_init_memory(imports, memory) { + +} + +function __wbg_finalize_init(instance, module) { + wasm = instance.exports; + 
__wbg_init.__wbindgen_wasm_module = module; + cachedUint8ArrayMemory0 = null; + + + wasm.__wbindgen_start(); + return wasm; +} + +function initSync(module) { + if (wasm !== undefined) return wasm; + + + if (typeof module !== 'undefined') { + if (Object.getPrototypeOf(module) === Object.prototype) { + ({module} = module) + } else { + console.warn('using deprecated parameters for `initSync()`; pass a single object instead') + } + } + + const imports = __wbg_get_imports(); + + __wbg_init_memory(imports); + + if (!(module instanceof WebAssembly.Module)) { + module = new WebAssembly.Module(module); + } + + const instance = new WebAssembly.Instance(module, imports); + + return __wbg_finalize_init(instance, module); +} + +async function __wbg_init(module_or_path) { + if (wasm !== undefined) return wasm; + + + if (typeof module_or_path !== 'undefined') { + if (Object.getPrototypeOf(module_or_path) === Object.prototype) { + ({module_or_path} = module_or_path) + } else { + console.warn('using deprecated parameters for the initialization function; pass a single object instead') + } + } + + if (typeof module_or_path === 'undefined') { + module_or_path = new URL('hello_wasm_bg.wasm', import.meta.url); + } + const imports = __wbg_get_imports(); + + if (typeof module_or_path === 'string' || (typeof Request === 'function' && module_or_path instanceof Request) || (typeof URL === 'function' && module_or_path instanceof URL)) { + module_or_path = fetch(module_or_path); + } + + __wbg_init_memory(imports); + + const { instance, module } = await __wbg_load(await module_or_path, imports); + + return __wbg_finalize_init(instance, module); +} + +export { initSync }; +export default __wbg_init; diff --git a/test/testdata/jsi_external/hello_wasm_bg.wasm b/test/testdata/jsi_external/hello_wasm_bg.wasm new file mode 100644 index 0000000000000000000000000000000000000000..d8f32c44c4980c8eaca07c6528b2dffa85d56fee GIT binary patch literal 16666 
zcmb`PU5s7Vb>H{d=YGxHJK~i*u_=-+&%I_MsuHRB{!rSKTPP(@6-GrL(+6tAxuke! zION>98d0_28QXHK8hX%!(yD+TT);r}Ah#}Hq;BCL459>nZ~_Go8WnH=BQX#MaZ?9y z03~qC{{Cy9bM76AV(PY(IQ#7Hwb%Drdk@|E?zO}@mn=>^>-YDQef@OLru+7tFB}})a?N_bUc1)2c75=YchxI{UT^58)AfxF zH~+CQV0t%iTptWC&LqkGW3AWLx3;cdawi@bYa8@p?fc7lnd?6ZsU~m`~KYM zzs!ZdPM)cp3y!bfzx|nKrs~J|YWBNHxVh7*hHrfNmOJfTcjEk1C7kTk8Cc!yR2cJ< z+qdf%D(A9lsAWlL>|N=)&ApK9(3K1~!!Pwa^=4KNei+{P(k<85BFTB})Nw7}RKe5- z8IkO5vT8k?eCqhHlcqqIC8vFb{kyfSx;Eu9icP*N&+wged{*mLrW`9&sJT|9sqxbc z(8wf9!zcGbWv3t7anq-ctK&HPr9jq6vt-IOTS+s#ouAEzww*bAkP7yuUaRY#1nMMAg9<~YJ8mcPQ%O!RH8;9Gb2=AG z#dB#&pC9dh=8lVaS=y;(9`w)*%ke{dditnpbzdwD4r+nbDC;m62w{g4x9aB zXou;n&Ka`g$>S`@l#H`iMRQri+TCgj4)?$Wg4Kv){+z}gjXv5z%!DP4xq7KC9 z_?BzBrau>0bJ$G-?6NrkDmUDDKGHlXX`YN|&eH$KG#AGgG_S%)tPVtQWyuF(8&^=h zTM;KA2XWs{622YTq2v92$-UK(Feu#{b}KuQFL5*v8!3lj_q`8QKktuMrK7dny}*sQL%Y=r!L(9Cg7ckoi2O|;~$dI4h&Co(nMI8 zTJR<)iArv*TR+~JP;=W9roN^-oX2MqD3t127fBNrqY$$?x=nnXNUWQT5fw_c(RCLX zL&N4XJ5l@#srS+axzzh8m!=z{a3&+EXIP_)5JahmW2DVcWQaFv6KNbtJun=W`eY>a zNwM%@sRs-JGA8x#*qu`E$E7~ScEC(g2>_;);h5BGu_*N(Q7--)!$|Tp%vcTIih|U2 z;>~LK4+oV#_WR|XE{EDX=Q=KHgo=zH8#SObgtC4_V*g20hxE2uQE@{Qh}f4YC|^i6 zHE~RztxhexBfjAq4PoCL4o=hP?+9-Fph;R)tgxna1ar7mbRf@fU|l*cl?d5j_(}WJ z`%g|8^E8a~GO6$8vyA_;g*kZ{|HJ z%VFP9<=c6W%5vDds=Sl;s4R!QugZ7x9+l;=@2m1&-lMV{_Cr;^m-nbFhy7TUALKnM z%V8g=@}s;*WjX9X{QHxN3#P`G|s+7G8u zFX6kIdT?+c*_9FYS-QqFe_SSiv^`f67+~!&JV4Q6?_ApmnQP(1h5oWD)k9qrjq*I`Wj}1oK}YtGN8?5w4Pe~q$U)QS$YC-LIdW{*P}mT)IE4%a zbdSc$Sko92gBQKfdgNC%3VL}#o1H2wn%f_Eic0QuK8cTBF$+gb6cWV?vVpLB(#AwG zo2g>6u13jWQFp|Qe3HV+t@g93@KOyCFMNR){HW?Z@S=F^kb?_gcjCo_O6J-SuOP)7 zTreoSRGVh>X~oTwOxYoelR{TcFOdZg$q3fMDU6PAfr^gv=X7F(JRBhgwswnU%yqXf zZC}h*fM?-N?)1u~RAeXmp{9~T<-`}J{IUO6n&B+w!3&-t{EvuWiXy!@eH?Pum?3kL3pTD)|V1U>>6aoWUNAeY7LgOCNQP zT#5UYDR+z(k7$Qy(T}R6s@%vcm41^gE1S|0wUCg_v2Z2Tdayjp9LuU1VJ*8?MGQ8P zr!KWZXZWrCf1;Q~sHKNFdq|oshizNWtn_7Nro&`^xZ(mbtV{IVK8ey*=Ngg@#MVNl4Q2)=Ced84?*?{yj+0wrD-r5On(eYllm4SH`S&m8Yg2q%+T 
zVc5~JYjjj(+0v{Teq7(_ccz$F5$NU{C}URN<9xNfLHOL~Uuo=QH9~!vko3cS*3&kUby|apVrwqmi_R#gUS{$9=fwr&n7`# z6J{1(eu`1{Lr5XuEBR)X5%6p>X&5~Qo@Di5{0I>l(BgywWztZL2*{RGuUBekS4C=wDGv`!jHa_3{dBYq3Onq&&mF|nw?990_@T016m)*{AW zD-40;-FT0~k3lU3GS)K&HF^hS0@T$AH7O!N4Yx+wD;JF5M%ouz2GA;53(&F_8mN;3 z9a3#-z@2d!+UPz{#=j}%rOy-|aP8%5=_{2+H)Nn%+R7;@fm&5D-ElgYbXo<8$q1{n zDueEKBzXwBlOs)OErE;X!fn6bJFrBv;9x3h5nn2}BcnYIFJX)0{4+rN*+SYsu66g; z=KrcD)&0x@k%XeM8AVdTNuTr~#~1En_~3V>G?jUgS-|IIixelRaChYDVJ6|Dosd3- z9yZ@aCY~c%NA&WvQM8PpfnGpkp>|_D6QkiUKU-?i!u^s_@#S`Sov3tqmfk!|w!yfS zMi;wddzbJX0*OsE`A Q=BFb>cFqDA>g)K5Fv?)k@voQ3#_sY9=wj`xbRRsAhbX+ zD20>L?=UNdN+|3iqry@NiIoe=-$2(G3-WE=OX+*%sUPhd&r%$iyfzGA-ja5q>sBtDY7Mhf`YwH zGYbGtQBccb<8&-q#VwiGyWO2MW>v_o#I4#aC(1pL^rKiN+~yAk1YqmxcO7TISL_z0 z3a?`gGO>;fGT*17^yU;1fGwP1sD$2t|K;U(KoCWjm`YkN#~>FtQhqK}vBKrv|7f&z z1HJSi2tei-1W4rw0(2YB1I`2jZ#QO!A+|ozKjWNXjml9SL`X{jROhR}e{zt}{H9F= zD(&C`ZR)WRsEeVLeT!C*Mp5Zh(1#sDJ?JGL91CrvN3+|K#Zq@&rxw?t$k&<3*Ez~M z6Y|_)a}v;+3}LzwfWv4-D@C-*;mP{lMl>cyF!;x?k{Rysv$RB%1~weS7Yx85AVwf= zNyCi1>%U18oCpa?u3DKWEq6%2YB716rb47&qoC7zcl`=)N>m_akzWu$6#7-Y(XNaX ztc_*cpFz2@q?Bv1u8v>|0Zip#f2mxx<00ib0;imY!Y?DCq}`GMxQDhNJ|?Iq8bSNL zjqU-KnCuwS7Jw)5RDyw`1(Ho$Fg9w7kBCeaa0`E9Fy46})3765i{^=eIMWWR!@I0i zpsCsXGu{ahwCL_!x5(hFec?E16w>b|r39MXren)glBeRHq1A7DX>g80klsoBIcCYl z$|B(sOPQ4;_(%#;!T9N6=7uZQZR&V5Om&f8~=3GKSjBWvt{hxnik5 zConQES&A3U4AsT6^Z}M5K~;GZn5evsi=rXY*+w^4x)w%tSJ0wKcXZE%7?LK?Ra?9+ z8^6#Z02UrHd}-(+81HhSMLv>iZ1gT%q6;m{>&T7^D9|Bp)ouG77g}1ZxX{wdBxy5m z3ELItqcg8k^!MTH!8N+w9u%I=5P3B-RRpbh>rqk$JJTUbLwB0jpclxjV@V%686xgw*;``naWx7Gv*Fqeny;1ozXRa%O|OP4<6k3 zQw{{es(NGtBBjGH6*R~N zsEB_&N(Cv;r_=xayH1}97TnnAq!`Wew`_FqP3S0LEw)3{#T~}dcs!fR$Vl$fHcSjJ z3N@5W(QPmeOxot|(?*~f?50@5pubiz(QM0uG_H_Rx5*@mlE_H-+aPq*M?d%D$BD*OaAEnaWo?x8O9EJpf? 
zGgI@E3r%C=kRFR>{Ro!ABMfNP}S2ZIZ5_QuYoj zG-4KudY0PPX?jld!&|%`GUIPhg>##@zwai#5;2v)OQ{)q-6fUS5zUaxK!;#}UND(Bt!)q+%gMtq z;v3-7)&<0-GXHK0V6?bNg94a0VO|sfVt9BQVtB`pji7uqvW2fPWScC5RYUb*9P5Dz zfU{bOu4@tFiXr+8z)P?T9Pk- z{E&PR4Bg527We%tAMF%KuVT1f{}L{keh4o4KdkKS_5p9meOl@_3ExA1B7UmqGsyIX z_w}V8%#8Ce#({(a%b2&vYL=v{3N}E*=e<9{W`_F|FLD3%i1G_b zlE`tyW&0IfznFqXXPRP8;|HTdf+&6p|Al~>r>#Osshr@__CuH<1_QPE3IY<L^QSfUyq9P zmF0BBT2`PW=^}W&Ga20y$PH!l|L9RR0iljz9be=|V7faK<7Vb2^sGQ(1etv0fIT)& zjEJk-P?lDKQ(hjm()alzXOvx-s7KXAQ1xi*Y9L6-ij3D1ILQ{eXc2DmG=u2L?v9EJ zH@!;}{>6?-HUY@zskGO2sCgMw*iJV)HUQmb$qvi)2=D~oF z=N2jI)N1(37jAKv*5*bp0dG*kNt=fMYBYdBUlH=L4B;mXR0;>q66SqQQ^4|wX^03XU2lEl}9-YoPg$npH`fgFdZDc<8-TP z?O5yk?e0f+Jg7vu0eR2^>LK|3D5pm>k!t&-GA6nZDm9;4AJU5ixx`1%rhl7|W-oq& zLxvI%e=*4)a08-Xt8fm&&BS`q4&oUkKT)4j>>MT^A_-K;Yi^+)+2h6fbMZgZ)5T>N zxrn6W(FT^G;Qmj&xIrQtnn7eUeU>x1(MYT%miquw8G-eq(Z%4werMc7`vGLo{1+5? zR1(;adkjA5{5il++b5&Ep^km~Q+lWdVlvl!5E4cqBLSAVXGEB*_)f>0KQDF_vyzuG zDIAQ1k!Em%>^46!8l?@&_J0gfpj_mRsI-g&oVY_L{EjIs2^FZ7wnAclKwFd;7RQhN zgIZGCFK!ZqVTO(8^zgaR(3Y{|%{M``as*^4pwC17F1e4j<5P*v^I;<1VVHegng(i2tHUF=wp)$@WwL(9{RBWP~&=$kBh$x zVq3v03iIp5U6Y=D)r;Jz?&4{x&%5r2gDZe z7c!I?K#r&20HT(!YM75`v2LE;7Qt(+Dee&BLPwsS@~xKAN7$CbYYBSRk}`>tTX+-2 zMOFcq2Oq^!_A5(rk#=}NW|>h*?H!AV>-lTHJ)r|=crmzyzDUIz^hJ~=IPC5*Pbb4J zF`aB$tTm}ae8hVBn><7CAw*2Xr)wNM$Mn-vJOE3{B1=oKoC`09n}!)n!oo_>#ZXLV zx^}$HuM@N@dV2`Ns;Ow^(Tya;i;OfHJIZZQ-gHgZurMQ+D^DMX30Q&oJk$$+j?Wa{ z{2a`F472bSAL0F?c8|WhpB&ta$S50Sg5!HHLz-pwF=0red3Vj`$N4lHD4mC4&UNXu zHS1iGUp3-{Vy&%ieU*0Fd{8zyiE>0wyKalNaX!0@dE&qSm#_cE?|<>Vzy4DqbxHLH zpZ_<%`RjlAd%ydi_X!E4v)mK!|J$#9;XnS#AO7*nam26x(I3D0JOBF2uQfH`V+^>{ zK-~etspdb&{+WzP4UF(Uj`ds#M7&E4fOXys@NYIp;amSXkc z^*?aR;Cj1Rz~8I|UgB^YfYsdunKob>FNASh%F5rq%~k&o+&1BNG7PVO@s{g9=G-qj zJ&f|3l0tg7bnB_7pod4KtZ9jn+rRxsk3F3MkJ6(490>YTC+W-WM#@Ssoj_4Y!)0!< zssG5uwTusw(uIen`#XRTZBs&&a(y3WNS~5a*e}k`<>h3SZt^_k5wqV?At#UoVlIBC zWs+_9b*l3nY%n;L5l{5Rje?4!@XTb^Y(pI7Q6|uPpk9`poke2cWLTw(sUU^iO?B?O zOhI31t~TbuW# zE^&I`((!1_D5`oggNOo>z7QV7ZI@eKiRT|BP?(D4H} 
z_Vwwx01t8h!=ur1yTfZ4VluZT<^bRtJ~nQx+^?7#NN}lJT14qI3AJ@u2%|1)Nq7Sj zAFAzBs1pqg2Vc4+nL~6^eih0>^1Ai(n-i24t2Ue8uVDo2*Mt8my>xwh_xe`v?AG-w zryu%skKg!xH0wO{uyd~MrujKdnNn8x+sex-WsSde{ysK%VR!i0M(@V%V;fuXH-^v5 zoSi$n^w{p;QrUENu)Dsy+Zzn8Uf<3xuV3BjZ9Jac*uDBMdf6}i!_3M@vTIkjFJ9m4 z4W93pD=6M#;|a=-n*I+))7Ni~rkC^oJ@coSAAbotv)^7`H!!YW&a&rT8uoT` zfSmgwd;T0_|0kv9><%~V6gG(?oJiXAICm+DFpcA+LA(B_RJ&l-pQ1j=cay)uq1L{S zzgkOUkL2GN3ikki&kqLI2b3SU&)4p^{d0|>a`nN zy=%SgVZ_v+H+*5R-P_34x3ivL%U*oGx19~2?`1D+Z}bLRFKO!TkY5hJmTg?!$hNN! zM{r!eHUtLee$_|peb&d*Kb|=!81(05`e=4DGc&U@b2IZZ3p0x|OEb$eD>JLJGqba^ zbF=fa3$u%}OS8+fE3>O}Gjp?Zb93`^3v-KeOLNO}D|4&!GxM|abMy1_3-gQfOY_U~ zEAy)hGYhi|a|`ne3k!=2OAE^jD+{ZOGmEo}bBptf3yX`3ON+~kD~qd3GfT5eb4&9} z3rmYjOH0d3D@&`(Gt0BfbIbF~3(JekOUuj4E6b}ZGb^(zb1U;J3oDB&ODoGOD=Vw3 zfVj%`t1P|Bv{jniXC`h8u5Y|>sW;g5jji?VD=(~H=_QqahOoF6U6*=@!^WBCUP`MH zpi7^eJ-af0b|yQ$I`iE8;`-9ue8w3U&dko9nOl0e_Tu^$!)gfD{MotIi~M^9XYzkY z-~kES>S}g+X5rHM(&hPu^@rWs5q=^6N^f}aa&H~V^>!_09}my3yNxWb)bzTf?jTGX^j8{x>yx5xW2Y literal 0 HcmV?d00001 diff --git a/yt_dlp/jsinterp/_helper.py b/yt_dlp/jsinterp/_helper.py index 9a86c73ad..389204f9e 100644 --- a/yt_dlp/jsinterp/_helper.py +++ b/yt_dlp/jsinterp/_helper.py @@ -109,3 +109,26 @@ def extract_script_tags(html: str) -> tuple[str, list[str]]: html = html[:start] + html[end:] return html, inline_scripts + + +def prepare_wasm_jsmodule(js_mod: str, wasm: bytes) -> str: + """ + Prepare wasm init for js wrapper module generated by rust wasm-pack + removes export and import.meta and inlines wasm binary as Uint8Array + See test/test_data/jsi_external/hello_wasm.js for example + + @param {str} js_mod: js wrapper module generated by rust wasm-pack + @param {bytes} wasm: wasm binary + """ + + js_mod = re.sub(r'export(?:\s+default)?([\s{])', r'\1', js_mod) + js_mod = js_mod.replace('import.meta', '{}') + + return js_mod + '''; + await (async () => { + const t = __wbg_get_imports(); + __wbg_init_memory(t); + const {module, instance} = await 
WebAssembly.instantiate(Uint8Array.from(%s), t); + __wbg_finalize_init(instance, module); + })(); + ''' % list(wasm) From 87218ed23905375258f87c6acb6c448d460cf651 Mon Sep 17 00:00:00 2001 From: c-basalt <117849907+c-basalt@users.noreply.github.com> Date: Sat, 11 Jan 2025 13:34:38 -0500 Subject: [PATCH 41/48] update --- test/test_jsi_external.py | 2 +- yt_dlp/jsinterp/_deno.py | 6 +++--- yt_dlp/jsinterp/_phantomjs.py | 2 +- yt_dlp/jsinterp/common.py | 36 ++++++++++++++++------------------- 4 files changed, 21 insertions(+), 25 deletions(-) diff --git a/test/test_jsi_external.py b/test/test_jsi_external.py index 450b7ca21..2fa800278 100644 --- a/test/test_jsi_external.py +++ b/test/test_jsi_external.py @@ -218,7 +218,7 @@ class TestPhantomJS(Base.TestExternalJSI): _JSI_CLASS = PhantomJSJSI -expect_covered_features = set(_ALL_FEATURES) - {'js'} +expect_covered_features = set(_ALL_FEATURES) assert covered_features.issuperset(expect_covered_features), f'Missing tests for features: {expect_covered_features - covered_features}' if __name__ == '__main__': diff --git a/yt_dlp/jsinterp/_deno.py b/yt_dlp/jsinterp/_deno.py index 8206f5b23..80b223228 100644 --- a/yt_dlp/jsinterp/_deno.py +++ b/yt_dlp/jsinterp/_deno.py @@ -22,7 +22,7 @@ @register_jsi class DenoJSI(ExternalJSI): """JS interpreter class using Deno binary""" - _SUPPORTED_FEATURES = {'js', 'wasm', 'location'} + _SUPPORTED_FEATURES = {'wasm', 'location'} _BASE_PREFERENCE = 5 _EXE_NAME = 'deno' _DENO_FLAGS = ['--cached-only', '--no-prompt', '--no-check'] @@ -60,7 +60,7 @@ def execute(self, jscode, video_id=None, note='Executing JS in Deno'): @register_jsi class DenoJITlessJSI(DenoJSI): - _SUPPORTED_FEATURES = {'js', 'location'} + _SUPPORTED_FEATURES = {'location'} _BASE_PREFERENCE = 6 _EXE_NAME = DenoJSI._EXE_NAME _DENO_FLAGS = ['--cached-only', '--no-prompt', '--no-check', '--v8-flags=--jitless,--noexpose-wasm'] @@ -72,7 +72,7 @@ def exe_version(cls): @register_jsi class DenoJSDomJSI(DenoJSI): - _SUPPORTED_FEATURES = 
{'js', 'wasm', 'location', 'dom', 'cookies'} + _SUPPORTED_FEATURES = {'wasm', 'location', 'dom', 'cookies'} _BASE_PREFERENCE = 4 _DENO_FLAGS = ['--cached-only', '--no-prompt', '--no-check'] _JSDOM_IMPORT_CHECKED = False diff --git a/yt_dlp/jsinterp/_phantomjs.py b/yt_dlp/jsinterp/_phantomjs.py index 4c96ebeb0..e48ded44d 100644 --- a/yt_dlp/jsinterp/_phantomjs.py +++ b/yt_dlp/jsinterp/_phantomjs.py @@ -23,7 +23,7 @@ @register_jsi class PhantomJSJSI(ExternalJSI): _EXE_NAME = 'phantomjs' - _SUPPORTED_FEATURES = {'js', 'location', 'cookies', 'dom'} + _SUPPORTED_FEATURES = {'location', 'cookies', 'dom'} _BASE_PREFERENCE = 3 _BASE_JS = R''' diff --git a/yt_dlp/jsinterp/common.py b/yt_dlp/jsinterp/common.py index 1a3bf00d1..de641aeed 100644 --- a/yt_dlp/jsinterp/common.py +++ b/yt_dlp/jsinterp/common.py @@ -20,7 +20,6 @@ _JSI_HANDLERS: dict[str, type[JSI]] = {} _JSI_PREFERENCES: set[JSIPreference] = set() _ALL_FEATURES = { - 'js', 'wasm', 'location', 'dom', @@ -41,10 +40,6 @@ def _pref(jsi: JSI, *args): return _pref -def join_jsi_name(jsi_list: typing.Iterable[str | type[JSI] | JSI], sep=', '): - return sep.join(get_jsi_keys(jok if isinstance(jok, str) else jok.JSI_NAME for jok in jsi_list)) - - def require_features(param_features: dict[str, str | typing.Iterable[str]]): assert all(_ALL_FEATURES.issuperset(variadic(kw_feature)) for kw_feature in param_features.values()) @@ -73,15 +68,14 @@ def _real_extract(self, url): ``` Features: - - `js`: supports js syntax - - `wasm`: supports WebAssembly interface - - `location`: supports setting window.location - - `dom`: supports DOM interface + - `wasm`: supports window.WebAssembly + - `location`: supports mocking window.location + - `dom`: supports DOM interface (not necessarily rendering) - `cookies`: supports document.cookie read & write @param dl_or_ie: `YoutubeDL` or `InfoExtractor` instance. 
@param url: setting url context, used by JSI that supports `location` feature - @param features: list of features that are necessary for JS interpretation. + @param features: only JSI that supports all of these features will be selected @param only_include: limit JSI to choose from. @param exclude: JSI to avoid using. @param jsi_params: extra kwargs to pass to `JSI.__init__()` for each JSI, using jsi key as dict key. @@ -94,7 +88,7 @@ def _real_extract(self, url): def __init__( self, dl_or_ie: YoutubeDL | InfoExtractor, - url: str, + url: str = '', features: typing.Iterable[str] = [], only_include: typing.Iterable[str | type[JSI]] = [], exclude: typing.Iterable[str | type[JSI]] = [], @@ -119,11 +113,13 @@ def __init__( if _JSI_HANDLERS[key]._SUPPORTED_FEATURES.issuperset(self._features)] self.write_debug(f'Select JSI for features={self._features}: {get_jsi_keys(handler_classes)}, ' f'included: {get_jsi_keys(only_include) or "all"}, excluded: {get_jsi_keys(exclude)}') + if not handler_classes: + raise ExtractorError(f'No JSI supports features={self._features}') - self._handler_dict = { - cls.JSI_KEY: cls(self._downloader, url=self._url, timeout=timeout, - features=self._features, user_agent=user_agent, - **jsi_params.get(cls.JSI_KEY, {})) for cls in handler_classes} + self._handler_dict = {cls.JSI_KEY: cls( + self._downloader, url=self._url, timeout=timeout, features=self._features, + user_agent=user_agent, **jsi_params.get(cls.JSI_KEY, {}) + ) for cls in handler_classes} self.preferences: set[JSIPreference] = {order_to_pref(preferred_order, 100)} | _JSI_PREFERENCES self._fallback_jsi = get_jsi_keys(handler_classes) if fallback_jsi == 'all' else get_jsi_keys(fallback_jsi) self._is_test = self._downloader.params.get('test', False) @@ -153,7 +149,7 @@ def _get_handlers(self, method_name: str, *args, **kwargs) -> list[JSI]: def _dispatch_request(self, method_name: str, *args, **kwargs): handlers = self._get_handlers(method_name, *args, **kwargs) - unavailable: 
list[JSI] = [] + unavailable: list[str] = [] exceptions: list[tuple[JSI, Exception]] = [] test_results: list[tuple[JSI, typing.Any]] = [] @@ -162,8 +158,8 @@ def _dispatch_request(self, method_name: str, *args, **kwargs): if self._is_test: raise Exception(f'{handler.JSI_NAME} is not available for testing, ' f'add "{handler.JSI_KEY}" in `exclude` if it should not be used') - self.write_debug(f'{handler.JSI_NAME} is not available') - unavailable.append(handler) + self.write_debug(f'{handler.JSI_KEY} is not available') + unavailable.append(handler.JSI_NAME) continue try: self.write_debug(f'Dispatching `{method_name}` task to {handler.JSI_NAME}') @@ -188,11 +184,11 @@ def _dispatch_request(self, method_name: str, *args, **kwargs): return ref_result if not exceptions: - msg = f'No available JSI installed, please install one of: {join_jsi_name(unavailable)}' + msg = f'No available JSI installed, please install one of: {", ".join(unavailable)}' else: msg = f'Failed to perform {method_name}, total {len(exceptions)} errors' if unavailable: - msg = f'{msg}. You can try installing one of unavailable JSI: {join_jsi_name(unavailable)}' + msg = f'{msg}. 
You can try installing one of unavailable JSI: {", ".join(unavailable)}' raise ExtractorError(msg) @require_features({'location': 'location', 'html': 'dom', 'cookiejar': 'cookies'}) From ecc1c379e6877013b7aec0e8c0d9292a8dec2f7b Mon Sep 17 00:00:00 2001 From: c-basalt <117849907+c-basalt@users.noreply.github.com> Date: Wed, 19 Feb 2025 15:32:56 -0500 Subject: [PATCH 42/48] jsi pref --- yt_dlp/jsinterp/_deno.py | 1 - yt_dlp/options.py | 7 +++++++ 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/yt_dlp/jsinterp/_deno.py b/yt_dlp/jsinterp/_deno.py index 80b223228..87bdce9a1 100644 --- a/yt_dlp/jsinterp/_deno.py +++ b/yt_dlp/jsinterp/_deno.py @@ -167,7 +167,6 @@ def execute(self, jscode, video_id=None, note='Executing JS in Deno with jsdom', onchange: null, orientation: {{angle: 0, type: 'landscape-primary', onchange: null}}, pixelDepth: 24, - width: 1920, }} Object.defineProperty(document.body, 'clientWidth', {{value: 1903}}); Object.defineProperty(document.body, 'clientHeight', {{value: 2000}}); diff --git a/yt_dlp/options.py b/yt_dlp/options.py index 06b65e0ea..a673c024f 100644 --- a/yt_dlp/options.py +++ b/yt_dlp/options.py @@ -1138,6 +1138,13 @@ def _alias_callback(option, opt_str, value, parser, opts, nargs): '--sleep-subtitles', metavar='SECONDS', dest='sleep_interval_subtitles', default=0, type=int, help='Number of seconds to sleep before each subtitle download') + workarounds.add_option( + '--jsi-preference', + metavar='JSI', dest='jsi_preference', default=[], type='str', action='callback', + callback=_list_from_options_callback, + help=( + 'Preferred JS interpreters to use during extraction. 
Can be given as comma-separated values.') + ) verbosity = optparse.OptionGroup(parser, 'Verbosity and Simulation Options') verbosity.add_option( From 8e78e77d22b27f889ee85446975d6d3805833c77 Mon Sep 17 00:00:00 2001 From: c-basalt <117849907+c-basalt@users.noreply.github.com> Date: Wed, 19 Feb 2025 15:34:55 -0500 Subject: [PATCH 43/48] Update README.md --- README.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/README.md b/README.md index 0ac27c462..8b2a644ea 100644 --- a/README.md +++ b/README.md @@ -790,6 +790,9 @@ ## Workarounds: be used along with --min-sleep-interval --sleep-subtitles SECONDS Number of seconds to sleep before each subtitle download + --jsi-preference Preferred JS interpreters to use during + extraction. Can be given as comma-separated + values ## Video Format Options: -f, --format FORMAT Video format code, see "FORMAT SELECTION" From af4f71c44a9eddb7518366e6f8a25781d400a949 Mon Sep 17 00:00:00 2001 From: c-basalt <117849907+c-basalt@users.noreply.github.com> Date: Wed, 19 Feb 2025 15:50:14 -0500 Subject: [PATCH 44/48] jsi pref --- yt_dlp/__init__.py | 1 + yt_dlp/jsinterp/common.py | 10 +++++++++- yt_dlp/options.py | 4 +--- 3 files changed, 11 insertions(+), 4 deletions(-) diff --git a/yt_dlp/__init__.py b/yt_dlp/__init__.py index c76fe2748..e3c34dbf9 100644 --- a/yt_dlp/__init__.py +++ b/yt_dlp/__init__.py @@ -941,6 +941,7 @@ def parse_options(argv=None): 'sleep_interval': opts.sleep_interval, 'max_sleep_interval': opts.max_sleep_interval, 'sleep_interval_subtitles': opts.sleep_interval_subtitles, + 'jsi_preference': opts.jsi_preference, 'external_downloader': opts.external_downloader, 'download_ranges': opts.download_ranges, 'force_keyframes_at_cuts': opts.force_keyframes_at_cuts, diff --git a/yt_dlp/jsinterp/common.py b/yt_dlp/jsinterp/common.py index e1b77fb13..552f12ee5 100644 --- a/yt_dlp/jsinterp/common.py +++ b/yt_dlp/jsinterp/common.py @@ -107,6 +107,11 @@ def __init__( if unsupported_features := self._features - _ALL_FEATURES: 
raise ExtractorError(f'Unsupported features: {unsupported_features}, allowed features: {_ALL_FEATURES}') + user_prefs = self._downloader.params.get('jsi_preference', []) + for invalid_key in [jsi_key for jsi_key in user_prefs if jsi_key not in _JSI_HANDLERS]: + self.report_warning(f'`{invalid_key}` is not a valid JSI, ignoring preference setting') + user_prefs.remove(invalid_key) + jsi_keys = [key for key in get_jsi_keys(only_include or _JSI_HANDLERS) if key not in get_jsi_keys(exclude)] self.write_debug(f'Allowed JSI keys: {jsi_keys}') handler_classes = [_JSI_HANDLERS[key] for key in jsi_keys @@ -120,7 +125,10 @@ def __init__( self._downloader, url=self._url, timeout=timeout, features=self._features, user_agent=user_agent, **jsi_params.get(cls.JSI_KEY, {}), ) for cls in handler_classes} - self.preferences: set[JSIPreference] = {order_to_pref(preferred_order, 100)} | _JSI_PREFERENCES + + self.preferences: set[JSIPreference] = { + order_to_pref(user_prefs, 10000), order_to_pref(preferred_order, 100)} | _JSI_PREFERENCES + self._fallback_jsi = get_jsi_keys(handler_classes) if fallback_jsi == 'all' else get_jsi_keys(fallback_jsi) self._is_test = self._downloader.params.get('test', False) diff --git a/yt_dlp/options.py b/yt_dlp/options.py index a673c024f..a0d72a664 100644 --- a/yt_dlp/options.py +++ b/yt_dlp/options.py @@ -1142,9 +1142,7 @@ def _alias_callback(option, opt_str, value, parser, opts, nargs): '--jsi-preference', metavar='JSI', dest='jsi_preference', default=[], type='str', action='callback', callback=_list_from_options_callback, - help=( - 'Preferred JS interpreters to use during extraction. Can be given as comma-separated values.') - ) + help='Preferred JS interpreters to use during extraction. 
Can be given as comma-separated values.') verbosity = optparse.OptionGroup(parser, 'Verbosity and Simulation Options') verbosity.add_option( From 076ca745aab9154d65068fcd106be0fea68d41b4 Mon Sep 17 00:00:00 2001 From: c-basalt <117849907+c-basalt@users.noreply.github.com> Date: Wed, 19 Feb 2025 16:50:05 -0500 Subject: [PATCH 45/48] matrix download test --- test/test_download.py | 21 +++++++++++++++++++++ yt_dlp/extractor/iqiyi.py | 22 ++++++++++++++++++++++ yt_dlp/jsinterp/common.py | 37 +++++++++++++++++-------------------- 3 files changed, 60 insertions(+), 20 deletions(-) diff --git a/test/test_download.py b/test/test_download.py index 3f36869d9..5c6d4f99d 100755 --- a/test/test_download.py +++ b/test/test_download.py @@ -25,6 +25,7 @@ import yt_dlp.YoutubeDL # isort: split from yt_dlp.extractor import get_info_extractor +from yt_dlp.jsinterp.common import filter_jsi_feature, filter_jsi_include from yt_dlp.networking.exceptions import HTTPError, TransportError from yt_dlp.utils import ( DownloadError, @@ -82,6 +83,26 @@ def __str__(self): # Dynamically generate tests def generator(test_case, tname): + def generate_sub_case(jsi_key): + sub_case = {k: v for k, v in test_case.items() if not k.startswith('jsi_matrix')} + sub_case['params'] = {**test_case.get('params', {}), 'jsi_preference': [jsi_key]} + return generator(sub_case, f'{tname}_{jsi_key}') + + # setting `jsi_matrix` to True, `jsi_matrix_features` to list, or + # setting `jsi_matrix_only_include` or `jsi_matrix_exclude` to non-empty + # to trigger matrix behavior + if isinstance(test_case.get('jsi_matrix_features'), list) or any(test_case.get(key) for key in [ + 'jsi_matrix', 'jsi_matrix_only_include', 'jsi_matrix_exclude', + ]): + jsi_keys = filter_jsi_feature(test_case.get('jsi_matrix_features', []), filter_jsi_include( + test_case.get('jsi_matrix_only_include', None), test_case.get('jsi_matrix_exclude', None))) + + def run_sub_cases(self): + for i, jsi_key in enumerate(jsi_keys): + print(f'Running case 
{tname} using JSI: {jsi_key} ({i + 1}/{len(jsi_keys)})') + generate_sub_case(jsi_key)(self) + return run_sub_cases + def test_template(self): if self.COMPLETED_TESTS.get(tname): return diff --git a/yt_dlp/extractor/iqiyi.py b/yt_dlp/extractor/iqiyi.py index 6e1e18833..813984769 100644 --- a/yt_dlp/extractor/iqiyi.py +++ b/yt_dlp/extractor/iqiyi.py @@ -398,6 +398,27 @@ class IqIE(InfoExtractor): IE_DESC = 'International version of iQiyi' _VALID_URL = r'https?://(?:www\.)?iq\.com/play/(?:[\w%-]*-)?(?P<id>\w+)' _TESTS = [{ + 'url': 'https://www.iq.com/play/sangmin-dinneaw-episode-1-xmk7546rfw', + 'md5': '63fcb4b7d4863472fe0a9be75d9e9d60', + 'info_dict': { + 'ext': 'mp4', + 'id': 'xmk7546rfw', + 'title': '尚岷与丁尼奥 第1集', + 'description': 'md5:e8fe4a8da25f4b8c86bc5506b1c3faaa', + 'duration': 3092, + 'timestamp': 1735520401, + 'upload_date': '20241230', + 'episode_number': 1, + 'episode': 'Episode 1', + 'series': 'Sangmin Dinneaw', + 'age_limit': 18, + 'average_rating': float, + 'categories': [], + 'cast': ['Sangmin Choi', 'Ratana Aiamsaart'], + }, + 'expected_warnings': ['format is restricted'], + 'jsi_matrix_features': ['dom'], + }, { 'url': 'https://www.iq.com/play/one-piece-episode-1000-1ma1i6ferf4', 'md5': '2d7caf6eeca8a32b407094b33b757d39', 'info_dict': { @@ -418,6 +439,7 @@ class IqIE(InfoExtractor): 'format': '500', }, 'expected_warnings': ['format is restricted'], + 'skip': 'geo-restricted', }, { # VIP-restricted video 'url': 'https://www.iq.com/play/mermaid-in-the-fog-2021-gbdpx13bs4', diff --git a/yt_dlp/jsinterp/common.py b/yt_dlp/jsinterp/common.py index 552f12ee5..f95b9ab63 100644 --- a/yt_dlp/jsinterp/common.py +++ b/yt_dlp/jsinterp/common.py @@ -31,6 +31,17 @@ def get_jsi_keys(jsi_or_keys: typing.Iterable[str | type[JSI] | JSI]) -> list[st return [jok if isinstance(jok, str) else jok.JSI_KEY for jok in jsi_or_keys] +def filter_jsi_include(only_include: typing.Iterable[str] | None, exclude: typing.Iterable[str] | None): + keys = get_jsi_keys(only_include) if
only_include else _JSI_HANDLERS.keys() + return [key for key in keys if key not in (exclude or [])] + + +def filter_jsi_feature(features: typing.Iterable[str], keys=None): + keys = keys if keys is not None else _JSI_HANDLERS.keys() + return [key for key in keys if key in _JSI_HANDLERS + and _JSI_HANDLERS[key]._SUPPORTED_FEATURES.issuperset(features)] + + def order_to_pref(jsi_order: typing.Iterable[str | type[JSI] | JSI], multiplier: int) -> JSIPreference: jsi_order = reversed(get_jsi_keys(jsi_order)) pref_score = {jsi_cls: (i + 1) * multiplier for i, jsi_cls in enumerate(jsi_order)} @@ -112,10 +123,9 @@ def __init__( self.report_warning(f'`{invalid_key}` is not a valid JSI, ignoring preference setting') user_prefs.remove(invalid_key) - jsi_keys = [key for key in get_jsi_keys(only_include or _JSI_HANDLERS) if key not in get_jsi_keys(exclude)] + jsi_keys = filter_jsi_include(only_include, exclude) self.write_debug(f'Allowed JSI keys: {jsi_keys}') - handler_classes = [_JSI_HANDLERS[key] for key in jsi_keys - if _JSI_HANDLERS[key]._SUPPORTED_FEATURES.issuperset(self._features)] + handler_classes = [_JSI_HANDLERS[key] for key in filter_jsi_feature(self._features, jsi_keys)] self.write_debug(f'Select JSI for features={self._features}: {get_jsi_keys(handler_classes)}, ' f'included: {get_jsi_keys(only_include) or "all"}, excluded: {get_jsi_keys(exclude)}') if not handler_classes: @@ -159,38 +169,25 @@ def _dispatch_request(self, method_name: str, *args, **kwargs): unavailable: list[str] = [] exceptions: list[tuple[JSI, Exception]] = [] - test_results: list[tuple[JSI, typing.Any]] = [] for handler in handlers: if not handler.is_available(): if self._is_test: - raise Exception(f'{handler.JSI_NAME} is not available for testing, ' - f'add "{handler.JSI_KEY}" in `exclude` if it should not be used') + raise ExtractorError(f'{handler.JSI_NAME} is not available for testing, ' + f'add "{handler.JSI_KEY}" in `exclude` if it should not be used') self.write_debug(f'{handler.JSI_KEY} 
is not available') unavailable.append(handler.JSI_NAME) continue try: self.write_debug(f'Dispatching `{method_name}` task to {handler.JSI_NAME}') - result = getattr(handler, method_name)(*args, **kwargs) - if self._is_test: - test_results.append((handler, result)) - else: - return result - except Exception as e: + return getattr(handler, method_name)(*args, **kwargs) + except ExtractorError as e: if handler.JSI_KEY not in self._fallback_jsi: raise else: exceptions.append((handler, e)) self.write_debug(f'{handler.JSI_NAME} encountered error, fallback to next handler: {e}') - if self._is_test and test_results: - ref_handler, ref_result = test_results[0] - for handler, result in test_results[1:]: - if result != ref_result: - self.report_warning( - f'Different JSI results produced from {ref_handler.JSI_NAME} and {handler.JSI_NAME}') - return ref_result - if not exceptions: msg = f'No available JSI installed, please install one of: {", ".join(unavailable)}' else: From 98ae72ff50cddb514f17cfd02dba8ce4618735c7 Mon Sep 17 00:00:00 2001 From: c-basalt <117849907+c-basalt@users.noreply.github.com> Date: Tue, 25 Feb 2025 23:39:47 -0500 Subject: [PATCH 46/48] remove jitless --- README.md | 4 ++-- test/test_jsi_external.py | 6 +----- yt_dlp/YoutubeDL.py | 2 ++ yt_dlp/jsinterp/__init__.py | 1 - yt_dlp/jsinterp/_deno.py | 13 ------------- 5 files changed, 5 insertions(+), 21 deletions(-) diff --git a/README.md b/README.md index 8b2a644ea..169efad58 100644 --- a/README.md +++ b/README.md @@ -212,7 +212,7 @@ ### Metadata ### Misc * [**pycryptodomex**](https://github.com/Legrandin/pycryptodome)\* - For decrypting AES-128 HLS streams and various other data. Licensed under [BSD-2-Clause](https://github.com/Legrandin/pycryptodome/blob/master/LICENSE.rst) -* [**phantomjs**](https://github.com/ariya/phantomjs) - Used in extractors where javascript needs to be run. 
Licensed under [BSD-3-Clause](https://github.com/ariya/phantomjs/blob/master/LICENSE.BSD) +* [**phantomjs**](https://github.com/ariya/phantomjs), [**deno**](https://github.com/denoland/deno/) - Used in extractors where javascript needs to be run. Licensed under [BSD-3-Clause](https://github.com/ariya/phantomjs/blob/master/LICENSE.BSD) and [MIT](https://github.com/denoland/deno/blob/main/LICENSE.md) respectively * [**secretstorage**](https://github.com/mitya57/secretstorage)\* - For `--cookies-from-browser` to access the **Gnome** keyring while decrypting cookies of **Chromium**-based browsers on **Linux**. Licensed under [BSD-3-Clause](https://github.com/mitya57/secretstorage/blob/master/LICENSE) * Any external downloader that you want to use with `--downloader` @@ -790,7 +790,7 @@ ## Workarounds: be used along with --min-sleep-interval --sleep-subtitles SECONDS Number of seconds to sleep before each subtitle download - --jsi-preference Preferred JS interpreters to use during + --jsi-preference JSI Preferred JS interpreters to use during extraction.
Can be given as comma-separated values diff --git a/test/test_jsi_external.py b/test/test_jsi_external.py index 2fa800278..e4b8f9c17 100644 --- a/test/test_jsi_external.py +++ b/test/test_jsi_external.py @@ -21,7 +21,7 @@ ) from yt_dlp.cookies import YoutubeDLCookieJar from yt_dlp.jsinterp.common import ExternalJSI, _ALL_FEATURES -from yt_dlp.jsinterp._deno import DenoJSI, DenoJITlessJSI, DenoJSDomJSI +from yt_dlp.jsinterp._deno import DenoJSI, DenoJSDomJSI from yt_dlp.jsinterp._phantomjs import PhantomJSJSI from yt_dlp.jsinterp._helper import prepare_wasm_jsmodule @@ -206,10 +206,6 @@ class TestDeno(Base.TestExternalJSI): _JSI_CLASS = DenoJSI -class TestDenoJITless(Base.TestExternalJSI): - _JSI_CLASS = DenoJITlessJSI - - class TestDenoDom(Base.TestExternalJSI): _JSI_CLASS = DenoJSDomJSI diff --git a/yt_dlp/YoutubeDL.py b/yt_dlp/YoutubeDL.py index f8dae3a53..90aa14e70 100644 --- a/yt_dlp/YoutubeDL.py +++ b/yt_dlp/YoutubeDL.py @@ -437,6 +437,8 @@ class YoutubeDL: Actual sleep time will be a random float from range [sleep_interval; max_sleep_interval]. sleep_interval_subtitles: Number of seconds to sleep before each subtitle download + jsi_preference: Preferred JS interpreters to use during extraction. Can be + given as comma-separated values. listformats: Print an overview of available video formats and exit. list_thumbnails: Print a table of all thumbnails and exit. match_filter: A function that gets called for every video with the signature diff --git a/yt_dlp/jsinterp/__init__.py b/yt_dlp/jsinterp/__init__.py index de1abe497..fd90b92ad 100644 --- a/yt_dlp/jsinterp/__init__.py +++ b/yt_dlp/jsinterp/__init__.py @@ -2,7 +2,6 @@ from .native import JSInterpreter from .common import _JSI_PREFERENCES, _JSI_HANDLERS, JSIWrapper from ._phantomjs import PhantomJSwrapper -from . 
import _deno __all__ = [ diff --git a/yt_dlp/jsinterp/_deno.py b/yt_dlp/jsinterp/_deno.py index 87bdce9a1..63d9ff9c3 100644 --- a/yt_dlp/jsinterp/_deno.py +++ b/yt_dlp/jsinterp/_deno.py @@ -10,7 +10,6 @@ from ..utils import ( ExtractorError, Popen, - classproperty, int_or_none, shell_quote, unified_timestamp, @@ -58,18 +57,6 @@ def execute(self, jscode, video_id=None, note='Executing JS in Deno'): return self._run_deno(cmd) -@register_jsi -class DenoJITlessJSI(DenoJSI): - _SUPPORTED_FEATURES = {'location'} - _BASE_PREFERENCE = 6 - _EXE_NAME = DenoJSI._EXE_NAME - _DENO_FLAGS = ['--cached-only', '--no-prompt', '--no-check', '--v8-flags=--jitless,--noexpose-wasm'] - - @classproperty - def exe_version(cls): - return DenoJSI.exe_version - - @register_jsi class DenoJSDomJSI(DenoJSI): _SUPPORTED_FEATURES = {'wasm', 'location', 'dom', 'cookies'} From 5d36f8789bc3a819618a436bdea8718ee973d3f9 Mon Sep 17 00:00:00 2001 From: c-basalt <117849907+c-basalt@users.noreply.github.com> Date: Sat, 1 Mar 2025 17:00:05 -0500 Subject: [PATCH 47/48] fix --- test/test_download.py | 23 +++++++++++++---------- test/test_jsi_external.py | 2 ++ yt_dlp/jsinterp/__init__.py | 1 + yt_dlp/jsinterp/_helper.py | 3 ++- yt_dlp/jsinterp/common.py | 24 +++++++++++++++--------- 5 files changed, 33 insertions(+), 20 deletions(-) diff --git a/test/test_download.py b/test/test_download.py index 5c6d4f99d..773172663 100755 --- a/test/test_download.py +++ b/test/test_download.py @@ -25,13 +25,14 @@ import yt_dlp.YoutubeDL # isort: split from yt_dlp.extractor import get_info_extractor -from yt_dlp.jsinterp.common import filter_jsi_feature, filter_jsi_include +from yt_dlp.jsinterp.common import filter_jsi_keys from yt_dlp.networking.exceptions import HTTPError, TransportError from yt_dlp.utils import ( DownloadError, ExtractorError, UnavailableVideoError, YoutubeDLError, + filter_dict, format_bytes, join_nonempty, ) @@ -83,24 +84,26 @@ def __str__(self): # Dynamically generate tests def generator(test_case, 
tname): - def generate_sub_case(jsi_key): - sub_case = {k: v for k, v in test_case.items() if not k.startswith('jsi_matrix')} - sub_case['params'] = {**test_case.get('params', {}), 'jsi_preference': [jsi_key]} - return generator(sub_case, f'{tname}_{jsi_key}') # setting `jsi_matrix` to True, `jsi_matrix_features` to list, or - # setting `jsi_matrix_only_include` or `jsi_matrix_exclude` to non-empty - # to trigger matrix behavior + # setting `jsi_matrix_only_include` or `jsi_matrix_exclude` to non-empty list + # to trigger matrix behavior for JSI if isinstance(test_case.get('jsi_matrix_features'), list) or any(test_case.get(key) for key in [ 'jsi_matrix', 'jsi_matrix_only_include', 'jsi_matrix_exclude', ]): - jsi_keys = filter_jsi_feature(test_case.get('jsi_matrix_features', []), filter_jsi_include( - test_case.get('jsi_matrix_only_include', None), test_case.get('jsi_matrix_exclude', None))) + jsi_keys = filter_jsi_keys( + test_case.get('jsi_matrix_features'), test_case.get('jsi_matrix_only_include'), + test_case.get('jsi_matrix_exclude')) + + def generate_jsi_sub_case(jsi_key): + sub_case = filter_dict(test_case, lambda k, _: not k.startswith('jsi_matrix')) + sub_case['params'] = {**test_case.get('params', {}), 'jsi_preference': [jsi_key]} + return generator(sub_case, f'{tname}_{jsi_key}') def run_sub_cases(self): for i, jsi_key in enumerate(jsi_keys): print(f'Running case {tname} using JSI: {jsi_key} ({i + 1}/{len(jsi_keys)})') - generate_sub_case(jsi_key)(self) + generate_jsi_sub_case(jsi_key)(self) return run_sub_cases def test_template(self): diff --git a/test/test_jsi_external.py b/test/test_jsi_external.py index e4b8f9c17..02098a6a3 100644 --- a/test/test_jsi_external.py +++ b/test/test_jsi_external.py @@ -20,6 +20,8 @@ variadic, ) from yt_dlp.cookies import YoutubeDLCookieJar +from yt_dlp.jsinterp import _JSI_HANDLERS +assert set(_JSI_HANDLERS) == {'Deno', 'DenoJSDom', 'PhantomJS'} from yt_dlp.jsinterp.common import ExternalJSI, _ALL_FEATURES from 
yt_dlp.jsinterp._deno import DenoJSI, DenoJSDomJSI from yt_dlp.jsinterp._phantomjs import PhantomJSJSI diff --git a/yt_dlp/jsinterp/__init__.py b/yt_dlp/jsinterp/__init__.py index fd90b92ad..8133cfeef 100644 --- a/yt_dlp/jsinterp/__init__.py +++ b/yt_dlp/jsinterp/__init__.py @@ -2,6 +2,7 @@ from .native import JSInterpreter from .common import _JSI_PREFERENCES, _JSI_HANDLERS, JSIWrapper from ._phantomjs import PhantomJSwrapper +from . import _deno # ensure jsi registration __all__ = [ diff --git a/yt_dlp/jsinterp/_helper.py b/yt_dlp/jsinterp/_helper.py index 389204f9e..ff2e1b87e 100644 --- a/yt_dlp/jsinterp/_helper.py +++ b/yt_dlp/jsinterp/_helper.py @@ -74,6 +74,7 @@ def random_string(length: int = 10) -> str: def override_navigator_js(user_agent: str) -> str: + """Generate js snippet to override navigator properties based on user_agent string""" return '\n'.join([ 'Object.defineProperty(navigator, "%s", { value: %s, configurable: true });' % (k, json.dumps(v)) for k, v in { @@ -113,7 +114,7 @@ def extract_script_tags(html: str) -> tuple[str, list[str]]: def prepare_wasm_jsmodule(js_mod: str, wasm: bytes) -> str: """ - Prepare wasm init for js wrapper module generated by rust wasm-pack + Sanitize js wrapper module generated by rust wasm-pack for wasm init removes export and import.meta and inlines wasm binary as Uint8Array See test/test_data/jsi_external/hello_wasm.js for example diff --git a/yt_dlp/jsinterp/common.py b/yt_dlp/jsinterp/common.py index f95b9ab63..248fbe556 100644 --- a/yt_dlp/jsinterp/common.py +++ b/yt_dlp/jsinterp/common.py @@ -31,6 +31,18 @@ def get_jsi_keys(jsi_or_keys: typing.Iterable[str | type[JSI] | JSI]) -> list[st return [jok if isinstance(jok, str) else jok.JSI_KEY for jok in jsi_or_keys] +def filter_jsi_keys(features=None, only_include=None, exclude=None): + keys = list(_JSI_HANDLERS) + if features: + keys = [key for key in keys if key in _JSI_HANDLERS + and _JSI_HANDLERS[key]._SUPPORTED_FEATURES.issuperset(features)] + if only_include: 
+ keys = [key for key in keys if key in get_jsi_keys(only_include)] + if exclude: + keys = [key for key in keys if key not in get_jsi_keys(exclude)] + return keys + + def filter_jsi_include(only_include: typing.Iterable[str] | None, exclude: typing.Iterable[str] | None): keys = get_jsi_keys(only_include) if only_include else _JSI_HANDLERS.keys() return [key for key in keys if key not in (exclude or [])] @@ -123,9 +135,7 @@ def __init__( self.report_warning(f'`{invalid_key}` is not a valid JSI, ignoring preference setting') user_prefs.remove(invalid_key) - jsi_keys = filter_jsi_include(only_include, exclude) - self.write_debug(f'Allowed JSI keys: {jsi_keys}') - handler_classes = [_JSI_HANDLERS[key] for key in filter_jsi_feature(self._features, jsi_keys)] + handler_classes = [_JSI_HANDLERS[key] for key in filter_jsi_keys(self._features, only_include, exclude)] self.write_debug(f'Select JSI for features={self._features}: {get_jsi_keys(handler_classes)}, ' f'included: {get_jsi_keys(only_include) or "all"}, excluded: {get_jsi_keys(exclude)}') if not handler_classes: @@ -208,12 +218,8 @@ def execute(self, jscode: str, video_id: str | None, note: str | None = None, @param html: html to load as document, requires `dom` feature @param cookiejar: cookiejar to read and set cookies, requires `cookies` feature, pass `InfoExtractor.cookiejar` if you want to read and write cookies """ - kwargs = filter_dict({ - 'note': note, - 'html': html, - 'cookiejar': cookiejar, - }) - return self._dispatch_request('execute', jscode, video_id, **kwargs) + return self._dispatch_request('execute', jscode, video_id, **filter_dict({ + 'note': note, 'html': html, 'cookiejar': cookiejar})) class JSI(abc.ABC): From e4b970a8307f0c4299cf8573b5fd2bd38a6b06d7 Mon Sep 17 00:00:00 2001 From: c-basalt <117849907+c-basalt@users.noreply.github.com> Date: Sat, 1 Mar 2025 18:02:22 -0500 Subject: [PATCH 48/48] flags --- yt_dlp/jsinterp/_helper.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/yt_dlp/jsinterp/_helper.py b/yt_dlp/jsinterp/_helper.py index ff2e1b87e..58d98f44f 100644 --- a/yt_dlp/jsinterp/_helper.py +++ b/yt_dlp/jsinterp/_helper.py @@ -100,7 +100,7 @@ def extract_script_tags(html: str) -> tuple[str, list[str]]: script_indicies = [] inline_scripts = [] - for match_start in re.finditer(r'<script[^>]*>', html, re.DOTALL): + for match_start in re.finditer(r'<script[^>]*>', html, re.DOTALL | re.IGNORECASE): end = html.find('</script>', match_start.end()) if end > match_start.end(): script_indicies.append((match_start.start(), end + len('</script>')))