From b87a0582c52fe8ab19e65fbef044c07c20521a76 Mon Sep 17 00:00:00 2001 From: c-basalt <117849907+c-basalt@users.noreply.github.com> Date: Mon, 30 Dec 2024 17:54:45 -0500 Subject: [PATCH] test script tag --- test/test_jsinterp_external.py | 42 ++++++++++++++++++++++++++++++++++ yt_dlp/jsinterp/_deno.py | 23 +++++++++++++------ yt_dlp/jsinterp/_phantomjs.py | 2 +- yt_dlp/jsinterp/common.py | 10 ++++---- 4 files changed, 64 insertions(+), 13 deletions(-) diff --git a/test/test_jsinterp_external.py b/test/test_jsinterp_external.py index b0a046db9..7f66c032a 100644 --- a/test/test_jsinterp_external.py +++ b/test/test_jsinterp_external.py @@ -32,6 +32,48 @@ def jsi_available(self): def test_execute(self): self.assertEqual(self.jsi.execute('console.log("Hello, world!");'), 'Hello, world!') + def test_execute_dom_parse(self): + if 'dom' not in self.jsi._SUPPORTED_FEATURES: + self.skipTest('DOM not supported') + self.assertEqual(self.jsi.execute( + 'console.log(document.getElementById("test-div").innerHTML);', + location='https://example.com', + html='
Hello, world!
'), + 'Hello, world!') + + def test_execute_dom_script(self): + if 'dom' not in self.jsi._SUPPORTED_FEATURES: + self.skipTest('DOM not supported') + self.assertEqual(self.jsi.execute( + 'console.log(document.getElementById("test-div").innerHTML);', + location='https://example.com', + html=''' +
+ + + '''), + 'Hello, world!') + + def test_execute_dom_script_with_error(self): + if 'dom' not in self.jsi._SUPPORTED_FEATURES: + self.skipTest('DOM not supported') + if self.jsi.JSI_KEY == 'PhantomJS': + self.skipTest('PhantomJS does not catch errors') + self.assertEqual(self.jsi.execute( + 'console.log(document.getElementById("test-div").innerHTML);', + location='https://example.com', + html=''' +
+ + + '''), + 'Hello, world!') + class TestDeno(Base.TestExternalJSI): _JSI_CLASS = DenoJSI diff --git a/yt_dlp/jsinterp/_deno.py b/yt_dlp/jsinterp/_deno.py index d7f4cfa78..a87c104f7 100644 --- a/yt_dlp/jsinterp/_deno.py +++ b/yt_dlp/jsinterp/_deno.py @@ -137,6 +137,12 @@ def _ensure_jsdom(self): self._run_deno(cmd) self._JSDOM_IMPORT_CHECKED = True + def _parse_script_tags(self, html: str): + for match_start in re.finditer(r']*>', html, re.DOTALL): + end = html.find('', match_start.end()) + if end > match_start.end(): + yield html[match_start.end():end] + def execute(self, jscode, video_id=None, note='Executing JS in Deno', location='', html='', cookiejar=None): self.report_note(video_id, note) self._ensure_jsdom() @@ -144,13 +150,13 @@ def execute(self, jscode, video_id=None, note='Executing JS in Deno', location=' inline_scripts = '\n'.join([ 'try { %s } catch (e) {}' % script - for script in re.findall(r']*>(.+?)', html, re.DOTALL) + for script in self._parse_script_tags(html) ]) script = f'''{self._init_script}; {self._override_navigator_js}; import jsdom from "{self._JSDOM_URL}"; - const {callback_varname} = (() => {{ + let {callback_varname} = (() => {{ const jar = jsdom.CookieJar.deserializeSync({json.dumps(self.serialize_cookie(cookiejar, location))}); const dom = new jsdom.JSDOM({json.dumps(str(html))}, {{ {'url: %s,' % json.dumps(str(location)) if location else ''} @@ -158,15 +164,18 @@ def execute(self, jscode, video_id=None, note='Executing JS in Deno', location=' }}); Object.keys(dom.window).forEach((key) => {{try {{window[key] = dom.window[key]}} catch (e) {{}}}}); delete window.jsdom; - const stdout = []; - const origLog = console.log; - console.log = (...msg) => stdout.push(msg.map(m => m.toString()).join(' ')); - return () => {{ origLog(JSON.stringify({{ - stdout: stdout.join('\\n'), cookies: jar.serializeSync().cookies}})); }} + return () => {{ + const stdout = []; + const origLog = console.log; + console.log = (...msg) => stdout.push(msg.map(m => m.toString()).join(' ')); + return () => {{ origLog(JSON.stringify({{ + stdout: stdout.join('\\n'), cookies: jar.serializeSync().cookies}})); }} + }} }})(); await (async () => {{ {inline_scripts} }})(); + {callback_varname} = {callback_varname}(); await (async () => {{ {jscode} }})().finally({callback_varname}); diff --git a/yt_dlp/jsinterp/_phantomjs.py b/yt_dlp/jsinterp/_phantomjs.py index 2f4c6d105..9f03cd7d5 100644 --- a/yt_dlp/jsinterp/_phantomjs.py +++ b/yt_dlp/jsinterp/_phantomjs.py @@ -23,7 +23,7 @@ @register_jsi class PhantomJSJSI(ExternalJSI): _EXE_NAME = 'phantomjs' - _SUPPORTED_FEATURES = {'js', 'location', 'cookies'} + _SUPPORTED_FEATURES = {'js', 'location', 'cookies', 'dom'} _BASE_PREFERENCE = 3 _BASE_JS = R''' diff --git a/yt_dlp/jsinterp/common.py b/yt_dlp/jsinterp/common.py index 8dfe195bf..9169ec14f 100644 --- a/yt_dlp/jsinterp/common.py +++ b/yt_dlp/jsinterp/common.py @@ -92,7 +92,7 @@ def __init__( jsi_keys = [key for key in get_jsi_keys(only_include or _JSI_HANDLERS) if key not in get_jsi_keys(exclude)] self.write_debug(f'Allowed JSI keys: {jsi_keys}') handler_classes = [_JSI_HANDLERS[key] for key in jsi_keys - if _JSI_HANDLERS[key]._SUPPORT_FEATURES.issuperset(self._features)] + if _JSI_HANDLERS[key]._SUPPORTED_FEATURES.issuperset(self._features)] self.write_debug(f'Selected JSI classes for given features: {get_jsi_keys(handler_classes)}, ' f'included: {get_jsi_keys(only_include) or "all"}, excluded: {get_jsi_keys(exclude)}') @@ -106,7 +106,7 @@ def __init__( def add_handler(self, handler: JSI): """Add a handler. If a handler of the same JSI_KEY exists, it will overwrite it""" assert isinstance(handler, JSI), 'handler must be a JSI instance' - if not handler._SUPPORT_FEATURES.issuperset(self._features): + if not handler._SUPPORTED_FEATURES.issuperset(self._features): raise ExtractorError(f'{handler.JSI_NAME} does not support all required features: {self._features}') self._handler_dict[handler.JSI_KEY] = handler @@ -193,11 +193,11 @@ def execute(self, jscode: str, video_id: str | None, **kwargs) -> str: class JSI(abc.ABC): - _SUPPORT_FEATURES: set[str] = set() + _SUPPORTED_FEATURES: set[str] = set() _BASE_PREFERENCE: int = 0 def __init__(self, downloader: YoutubeDL, timeout: float | int, features: set[str], user_agent=None): - if not self._SUPPORT_FEATURES.issuperset(features): + if not self._SUPPORTED_FEATURES.issuperset(features): raise ExtractorError(f'{self.JSI_NAME} does not support all required features: {features}') self._downloader = downloader self.timeout = timeout @@ -250,7 +250,7 @@ def register_jsi(jsi_cls: JsiClass) -> JsiClass: """Register a JS interpreter class""" assert issubclass(jsi_cls, JSI), f'{jsi_cls} must be a subclass of JSI' assert jsi_cls.JSI_KEY not in _JSI_HANDLERS, f'JSI {jsi_cls.JSI_KEY} already registered' - assert jsi_cls._SUPPORT_FEATURES.issubset(_ALL_FEATURES), f'{jsi_cls._SUPPORT_FEATURES - _ALL_FEATURES} not declared in `_All_FEATURES`' + assert jsi_cls._SUPPORTED_FEATURES.issubset(_ALL_FEATURES), f'{jsi_cls._SUPPORTED_FEATURES - _ALL_FEATURES} not declared in `_All_FEATURES`' _JSI_HANDLERS[jsi_cls.JSI_KEY] = jsi_cls return jsi_cls