From b87a0582c52fe8ab19e65fbef044c07c20521a76 Mon Sep 17 00:00:00 2001
From: c-basalt <117849907+c-basalt@users.noreply.github.com>
Date: Mon, 30 Dec 2024 17:54:45 -0500
Subject: [PATCH] test script tag
---
test/test_jsinterp_external.py | 42 ++++++++++++++++++++++++++++++++++
yt_dlp/jsinterp/_deno.py | 23 +++++++++++++------
yt_dlp/jsinterp/_phantomjs.py | 2 +-
yt_dlp/jsinterp/common.py | 10 ++++----
4 files changed, 64 insertions(+), 13 deletions(-)
diff --git a/test/test_jsinterp_external.py b/test/test_jsinterp_external.py
index b0a046db9..7f66c032a 100644
--- a/test/test_jsinterp_external.py
+++ b/test/test_jsinterp_external.py
@@ -32,6 +32,48 @@ def jsi_available(self):
def test_execute(self):
self.assertEqual(self.jsi.execute('console.log("Hello, world!");'), 'Hello, world!')
+    def test_execute_dom_parse(self):
+        """Markup passed through `html` must be reachable via `document` in the executed script."""
+        if 'dom' not in self.jsi._SUPPORTED_FEATURES:
+            self.skipTest('DOM not supported')
+        # NOTE(review): the tags of this literal were stripped from the copy of the
+        # patch under review; the wrapper elements are reconstructed - confirm
+        # against the original commit. Only `#test-div` matters to the assertion.
+        self.assertEqual(self.jsi.execute(
+            'console.log(document.getElementById("test-div").innerHTML);',
+            location='https://example.com',
+            html='<html><body><div id="test-div">Hello, world!</div></body></html>'),
+            'Hello, world!')
+
+    def test_execute_dom_script(self):
+        """An inline `<script>` in `html` must have run before the caller's script reads the DOM."""
+        if 'dom' not in self.jsi._SUPPORTED_FEATURES:
+            self.skipTest('DOM not supported')
+        # NOTE(review): the markup between the triple quotes was lost from the copy of
+        # the patch under review. This is a minimal reconstruction of the intent
+        # (an empty div that an inline script fills in) - confirm against the
+        # original commit before applying.
+        self.assertEqual(self.jsi.execute(
+            'console.log(document.getElementById("test-div").innerHTML);',
+            location='https://example.com',
+            html='''<html><body>
+                <div id="test-div"></div>
+                <script type="text/javascript">
+                    document.getElementById("test-div").innerHTML = "Hello, world!";
+                </script>
+            </body></html>'''),
+            'Hello, world!')
+
+    def test_execute_dom_script_with_error(self):
+        """A throwing inline script must not stop the other scripts or the caller's script."""
+        if 'dom' not in self.jsi._SUPPORTED_FEATURES:
+            self.skipTest('DOM not supported')
+        if self.jsi.JSI_KEY == 'PhantomJS':
+            self.skipTest('PhantomJS does not catch errors')
+        # NOTE(review): the markup between the triple quotes was lost from the copy of
+        # the patch under review. This is a minimal reconstruction of the intent
+        # (one script that throws, one that fills the div) - confirm against the
+        # original commit before applying.
+        self.assertEqual(self.jsi.execute(
+            'console.log(document.getElementById("test-div").innerHTML);',
+            location='https://example.com',
+            html='''<html><body>
+                <div id="test-div"></div>
+                <script type="text/javascript">
+                    undefinedFunction();  // throws ReferenceError on purpose
+                </script>
+                <script type="text/javascript">
+                    document.getElementById("test-div").innerHTML = "Hello, world!";
+                </script>
+            </body></html>'''),
+            'Hello, world!')
+
class TestDeno(Base.TestExternalJSI):
_JSI_CLASS = DenoJSI
diff --git a/yt_dlp/jsinterp/_deno.py b/yt_dlp/jsinterp/_deno.py
index d7f4cfa78..a87c104f7 100644
--- a/yt_dlp/jsinterp/_deno.py
+++ b/yt_dlp/jsinterp/_deno.py
@@ -137,6 +137,12 @@ def _ensure_jsdom(self):
self._run_deno(cmd)
self._JSDOM_IMPORT_CHECKED = True
+    def _parse_script_tags(self, html: str):
+        """Yield the text between each `<script ...>` opening tag and the next `</script>`.
+
+        Opening tags with no later `</script>` and elements with an empty body
+        are skipped.
+        """
+        # NOTE(review): the regex and the `end = ...` line were fused together by tag
+        # stripping in the copy of the patch under review; both are reconstructed
+        # from the surviving `match_start.end())` / `end` usage - confirm the exact
+        # pattern and flags against the original commit.
+        for match_start in re.finditer(r'<script[^>]*>', html):
+            end = html.find('</script>', match_start.end())
+            # `str.find` returns -1 when nothing closes the tag; equality means an empty body
+            if end > match_start.end():
+                yield html[match_start.end():end]
+
def execute(self, jscode, video_id=None, note='Executing JS in Deno', location='', html='', cookiejar=None):
self.report_note(video_id, note)
self._ensure_jsdom()
@@ -144,13 +150,13 @@ def execute(self, jscode, video_id=None, note='Executing JS in Deno', location='
inline_scripts = '\n'.join([
'try { %s } catch (e) {}' % script
- for script in re.findall(r'', html, re.DOTALL)
+ for script in self._parse_script_tags(html)
])
script = f'''{self._init_script};
{self._override_navigator_js};
import jsdom from "{self._JSDOM_URL}";
- const {callback_varname} = (() => {{
+ let {callback_varname} = (() => {{
const jar = jsdom.CookieJar.deserializeSync({json.dumps(self.serialize_cookie(cookiejar, location))});
const dom = new jsdom.JSDOM({json.dumps(str(html))}, {{
{'url: %s,' % json.dumps(str(location)) if location else ''}
@@ -158,15 +164,18 @@ def execute(self, jscode, video_id=None, note='Executing JS in Deno', location='
}});
Object.keys(dom.window).forEach((key) => {{try {{window[key] = dom.window[key]}} catch (e) {{}}}});
delete window.jsdom;
- const stdout = [];
- const origLog = console.log;
- console.log = (...msg) => stdout.push(msg.map(m => m.toString()).join(' '));
- return () => {{ origLog(JSON.stringify({{
- stdout: stdout.join('\\n'), cookies: jar.serializeSync().cookies}})); }}
+ return () => {{
+ const stdout = [];
+ const origLog = console.log;
+ console.log = (...msg) => stdout.push(msg.map(m => m.toString()).join(' '));
+ return () => {{ origLog(JSON.stringify({{
+ stdout: stdout.join('\\n'), cookies: jar.serializeSync().cookies}})); }}
+ }}
}})();
await (async () => {{
{inline_scripts}
}})();
+ {callback_varname} = {callback_varname}();
await (async () => {{
{jscode}
}})().finally({callback_varname});
diff --git a/yt_dlp/jsinterp/_phantomjs.py b/yt_dlp/jsinterp/_phantomjs.py
index 2f4c6d105..9f03cd7d5 100644
--- a/yt_dlp/jsinterp/_phantomjs.py
+++ b/yt_dlp/jsinterp/_phantomjs.py
@@ -23,7 +23,7 @@
@register_jsi
class PhantomJSJSI(ExternalJSI):
_EXE_NAME = 'phantomjs'
- _SUPPORTED_FEATURES = {'js', 'location', 'cookies'}
+ _SUPPORTED_FEATURES = {'js', 'location', 'cookies', 'dom'}
_BASE_PREFERENCE = 3
_BASE_JS = R'''
diff --git a/yt_dlp/jsinterp/common.py b/yt_dlp/jsinterp/common.py
index 8dfe195bf..9169ec14f 100644
--- a/yt_dlp/jsinterp/common.py
+++ b/yt_dlp/jsinterp/common.py
@@ -92,7 +92,7 @@ def __init__(
jsi_keys = [key for key in get_jsi_keys(only_include or _JSI_HANDLERS) if key not in get_jsi_keys(exclude)]
self.write_debug(f'Allowed JSI keys: {jsi_keys}')
handler_classes = [_JSI_HANDLERS[key] for key in jsi_keys
- if _JSI_HANDLERS[key]._SUPPORT_FEATURES.issuperset(self._features)]
+ if _JSI_HANDLERS[key]._SUPPORTED_FEATURES.issuperset(self._features)]
self.write_debug(f'Selected JSI classes for given features: {get_jsi_keys(handler_classes)}, '
f'included: {get_jsi_keys(only_include) or "all"}, excluded: {get_jsi_keys(exclude)}')
@@ -106,7 +106,7 @@ def __init__(
def add_handler(self, handler: JSI):
"""Add a handler. If a handler of the same JSI_KEY exists, it will overwrite it"""
assert isinstance(handler, JSI), 'handler must be a JSI instance'
- if not handler._SUPPORT_FEATURES.issuperset(self._features):
+ if not handler._SUPPORTED_FEATURES.issuperset(self._features):
raise ExtractorError(f'{handler.JSI_NAME} does not support all required features: {self._features}')
self._handler_dict[handler.JSI_KEY] = handler
@@ -193,11 +193,11 @@ def execute(self, jscode: str, video_id: str | None, **kwargs) -> str:
class JSI(abc.ABC):
- _SUPPORT_FEATURES: set[str] = set()
+ _SUPPORTED_FEATURES: set[str] = set()
_BASE_PREFERENCE: int = 0
def __init__(self, downloader: YoutubeDL, timeout: float | int, features: set[str], user_agent=None):
- if not self._SUPPORT_FEATURES.issuperset(features):
+ if not self._SUPPORTED_FEATURES.issuperset(features):
raise ExtractorError(f'{self.JSI_NAME} does not support all required features: {features}')
self._downloader = downloader
self.timeout = timeout
@@ -250,7 +250,7 @@ def register_jsi(jsi_cls: JsiClass) -> JsiClass:
"""Register a JS interpreter class"""
assert issubclass(jsi_cls, JSI), f'{jsi_cls} must be a subclass of JSI'
assert jsi_cls.JSI_KEY not in _JSI_HANDLERS, f'JSI {jsi_cls.JSI_KEY} already registered'
- assert jsi_cls._SUPPORT_FEATURES.issubset(_ALL_FEATURES), f'{jsi_cls._SUPPORT_FEATURES - _ALL_FEATURES} not declared in `_All_FEATURES`'
+ assert jsi_cls._SUPPORTED_FEATURES.issubset(_ALL_FEATURES), f'{jsi_cls._SUPPORTED_FEATURES - _ALL_FEATURES} not declared in `_All_FEATURES`'
_JSI_HANDLERS[jsi_cls.JSI_KEY] = jsi_cls
return jsi_cls