2022-08-18 11:04:47 -05:00
import collections
2022-04-17 15:58:28 -05:00
import contextlib
2017-09-23 12:08:27 -05:00
import json
import os
import subprocess
import tempfile
2024-06-11 18:09:58 -05:00
import urllib . parse
2017-01-25 10:27:22 -06:00
2024-08-02 15:53:30 -05:00
from . common import InfoExtractor
2016-03-20 03:49:44 -05:00
from . . utils import (
2022-04-11 17:32:57 -05:00
ExtractorError ,
Popen ,
2017-09-23 12:08:27 -05:00
check_executable ,
2022-08-18 11:04:47 -05:00
format_field ,
2017-09-23 12:08:27 -05:00
get_exe_version ,
is_outdated_version ,
2022-08-18 11:04:47 -05:00
shell_quote ,
2016-03-20 03:49:44 -05:00
)
2016-03-20 02:54:58 -05:00
2017-09-23 12:08:27 -05:00
def cookie_to_dict(cookie):
    """Flatten a cookie object into a plain dict.

    ``name`` and ``value`` are always copied.  ``port``, ``domain`` and
    ``path`` are copied only when the matching ``*_specified`` flag on the
    cookie is truthy; ``expires``, ``secure`` and ``discard`` only when they
    are not None.  ``httponly`` is set to True when the cookie reports a
    non-standard attribute spelled ``httpOnly``, ``httponly`` or ``HttpOnly``.
    """
    result = {'name': cookie.name, 'value': cookie.value}

    # Attributes guarded by an explicit "<attr>_specified" flag
    for key in ('port', 'domain', 'path'):
        if getattr(cookie, f'{key}_specified'):
            result[key] = getattr(cookie, key)

    # Attributes that are simply skipped when unset
    for key in ('expires', 'secure', 'discard'):
        attr = getattr(cookie, key)
        if attr is not None:
            result[key] = attr

    # A TypeError raised while probing the non-standard attributes is ignored
    with contextlib.suppress(TypeError):
        if any(map(cookie.has_nonstandard_attr, ('httpOnly', 'httponly', 'HttpOnly'))):
            result['httponly'] = True

    return result
def cookie_jar_to_list(cookie_jar):
    """Return a list holding `cookie_to_dict(c)` for every cookie yielded by cookie_jar."""
    return [cookie_to_dict(entry) for entry in cookie_jar]
2024-08-02 15:53:30 -05:00
class DenoWrapper:
    """Run JavaScript snippets through the ``deno`` command-line runtime.

    The executable is looked up once in the constructor.  Each call to
    `deno_execute` writes the script to a temporary ``.js`` file, runs it in
    a subprocess and returns what the script wrote to stdout.
    """

    INSTALL_HINT = 'Please install deno following https://docs.deno.com/runtime/manual/getting_started/installation/ or download its binary from https://github.com/denoland/deno/releases'

    @staticmethod
    def _version():
        """Return what `get_exe_version` reports for the ``deno`` executable."""
        return get_exe_version('deno', version_re=r'([0-9.]+)')

    def __init__(self, extractor: 'InfoExtractor', required_version=None, timeout=10000):
        """
        @param extractor         object used for screen, debug and warning output
        @param required_version  if given, a warning is reported when
                                 `is_outdated_version` flags the installed Deno
        @param timeout           limit for one script run; divided by 1000
                                 before being handed to `Popen.run`

        Raises ExtractorError when `check_executable` cannot find ``deno``.
        """
        self.extractor = extractor
        self.timeout = timeout

        self.exe = check_executable('deno', ['-V'])
        if not self.exe:
            raise ExtractorError(f'Deno not found, {self.INSTALL_HINT}', expected=True)
        if required_version:
            if is_outdated_version(self._version(), required_version):
                self.extractor.report_warning(
                    f'Deno is outdated, update it to version {required_version} or newer if you encounter any errors.')

    @contextlib.contextmanager
    def _create_temp_js(self, jscode):
        """Write jscode to a temporary ``.js`` file and yield the (closed) file object.

        The file is removed when the context exits; a failure to remove it is ignored.
        """
        js_file = tempfile.NamedTemporaryFile('wt', encoding='utf-8', suffix='.js', delete=False)
        try:
            # Close the handle even if the write fails, so the removal below
            # is not blocked by a still-open file
            with js_file:
                js_file.write(jscode)
            yield js_file
        finally:
            with contextlib.suppress(OSError):
                os.remove(js_file.name)

    def _build_command(self, script_path, allow_net=None):
        """Assemble the ``deno run`` argument list for script_path.

        Runtime flags are placed BEFORE the script path: everything after the
        script is passed to the script itself as an argument instead of being
        interpreted by ``deno run``.
        """
        cmd = [self.exe, 'run']
        if allow_net:
            cmd.append('--allow-net' if isinstance(allow_net, bool) else f'--allow-net={allow_net}')
        cmd.append(script_path)
        return cmd

    def deno_execute(self, jscode, video_id=None, *, note='Executing JS in Deno', allow_net=None):
        """Execute JS directly in Deno environment and return stdout

        @param jscode     JavaScript source; a short prelude is prepended to it
        @param video_id   optional id used as prefix of the status messages
        @param note       status message; also used in raised error messages
        @param allow_net  falsy: no network flag; a bool True: ``--allow-net``;
                          any other truthy value: ``--allow-net=<value>``

        Returns the stripped stdout of the process.  Anything written to
        stderr by a successful run is reported as a warning.
        Raises ExtractorError if the binary cannot be run or exits non-zero.
        """
        base_js = 'delete window.Deno; global = window;'

        with self._create_temp_js(base_js + jscode) as js_file:
            self.extractor.to_screen(f'{format_field(video_id, None, "%s: ")}{note}')

            cmd = self._build_command(js_file.name, allow_net)
            self.extractor.write_debug(f'Deno command line: {shell_quote(cmd)}')
            try:
                stdout, stderr, returncode = Popen.run(
                    cmd, timeout=self.timeout / 1000, text=True,
                    stdout=subprocess.PIPE, stderr=subprocess.PIPE)
            except Exception as e:
                raise ExtractorError(f'{note} failed: Unable to run Deno binary', cause=e)

        if returncode:
            raise ExtractorError(f'{note} failed with returncode {returncode}:\n{stderr}')
        elif stderr:
            self.extractor.report_warning(f'JS console error msg:\n{stderr.strip()}', video_id=video_id)
        return stdout.strip()
2022-04-11 10:10:28 -05:00
class PhantomJSwrapper:
    """PhantomJS wrapper class

    This class is experimental.

    Three temporary files (script, html, cookies) are created per instance
    and removed again in `__del__`.
    """

    INSTALL_HINT = 'Please download it from https://phantomjs.org/download.html'

    # Prepended verbatim to every script in `execute` -- it is never passed
    # through str.format, so the doubled braces reach PhantomJS unchanged,
    # where they parse as nested JS blocks.
    _BASE_JS = R'''
        phantom.onError = function(msg, trace) {{
            var msgStack = ['PHANTOM ERROR: ' + msg];
            if(trace && trace.length) {{
                msgStack.push('TRACE:');
                trace.forEach(function(t) {{
                    msgStack.push(' -> ' + (t.file || t.sourceURL) + ': ' + t.line
                        + (t.function ? ' (in function ' + t.function + ')' : ''));
                }});
            }}
            console.error(msgStack.join('\n'));
            phantom.exit(1);
        }};
    '''

    # Rendered with str.format_map in `get`: single-brace fields are
    # substituted, doubled braces become literal JS braces.
    _TEMPLATE = R'''
        var page = require('webpage').create();
        var fs = require('fs');
        var read = {{ mode: 'r', charset: 'utf-8' }};
        var write = {{ mode: 'w', charset: 'utf-8' }};
        JSON.parse(fs.read("{cookies}", read)).forEach(function(x) {{
            phantom.addCookie(x);
        }});
        page.settings.resourceTimeout = {timeout};
        page.settings.userAgent = "{ua}";
        page.onLoadStarted = function() {{
            page.evaluate(function() {{
                delete window._phantom;
                delete window.callPhantom;
            }});
        }};
        var saveAndExit = function() {{
            fs.write("{html}", page.content, write);
            fs.write("{cookies}", JSON.stringify(phantom.cookies), write);
            phantom.exit();
        }};
        page.onLoadFinished = function(status) {{
            if(page.url === "") {{
                page.setContent(fs.read("{html}", read), "{url}");
            }}
            else {{
                {jscode}
            }}
        }};
        page.open("");
    '''

    _TMP_FILE_NAMES = ['script', 'html', 'cookies']

    @staticmethod
    def _version():
        """Return what `get_exe_version` reports for the ``phantomjs`` executable."""
        return get_exe_version('phantomjs', version_re=r'([0-9.]+)')

    @staticmethod
    def _escape_js_string(value):
        """Escape backslashes and double quotes so value can sit inside a
        double-quoted JS string literal of `_TEMPLATE`."""
        return value.replace('\\', '\\\\').replace('"', '\\"')

    def __init__(self, extractor, required_version=None, timeout=10000):
        """
        @param extractor         object used for downloads, cookies and output
        @param required_version  if given, a warning is reported when
                                 `is_outdated_version` flags the installed PhantomJS
        @param timeout           stored as the ``timeout`` template option and,
                                 divided by 1000, used as subprocess timeout

        Raises ExtractorError when `check_executable` cannot find ``phantomjs``.
        """
        # Assigned first so that __del__ works even if the checks below raise
        self._TMP_FILES = {}

        self.exe = check_executable('phantomjs', ['-v'])
        if not self.exe:
            raise ExtractorError(f'PhantomJS not found, {self.INSTALL_HINT}', expected=True)

        self.extractor = extractor

        if required_version:
            version = self._version()
            if is_outdated_version(version, required_version):
                self.extractor._downloader.report_warning(
                    'Your copy of PhantomJS is outdated, update it to version '
                    f'{required_version} or newer if you encounter any errors.')

        for name in self._TMP_FILE_NAMES:
            tmp = tempfile.NamedTemporaryFile(delete=False)
            tmp.close()
            self._TMP_FILES[name] = tmp

        # Template options: the timeout plus the JS-escaped temp file paths
        self.options = collections.ChainMap({
            'timeout': timeout,
        }, {
            x: self._escape_js_string(self._TMP_FILES[x].name)
            for x in self._TMP_FILE_NAMES
        })

    def __del__(self):
        # getattr: the attribute is missing when __init__ never ran
        tmp_files = getattr(self, '_TMP_FILES', {})
        for name in self._TMP_FILE_NAMES:
            with contextlib.suppress(OSError, KeyError):
                os.remove(tmp_files[name].name)

    def _save_cookies(self, url):
        """Dump the extractor's cookie jar as JSON into the cookies temp file.

        Cookies lacking a path get '/', cookies lacking a domain get the
        network location of url.
        """
        cookies = cookie_jar_to_list(self.extractor.cookiejar)
        for cookie in cookies:
            if 'path' not in cookie:
                cookie['path'] = '/'
            if 'domain' not in cookie:
                cookie['domain'] = urllib.parse.urlparse(url).netloc
        with open(self._TMP_FILES['cookies'].name, 'wb') as f:
            f.write(json.dumps(cookies).encode())

    def _load_cookies(self):
        """Read the cookies temp file and hand every cookie to the extractor's `_set_cookie`."""
        with open(self._TMP_FILES['cookies'].name, 'rb') as f:
            cookies = json.loads(f.read().decode('utf-8'))
        for cookie in cookies:
            # .get: a cookie entry without an 'httponly' key must not abort the import
            if cookie.get('httponly') is True:
                cookie['rest'] = {'httpOnly': None}
            if 'expiry' in cookie:
                cookie['expire_time'] = cookie['expiry']
            self.extractor._set_cookie(**cookie)

    def get(self, url, html=None, video_id=None, note=None, note2='Executing JS on webpage', headers=None, jscode='saveAndExit();'):
        """
        Downloads webpage (if needed) and executes JS

        Params:
            url: website url
            html: optional, html code of website
            video_id: video id
            note: optional, displayed when downloading webpage
            note2: optional, displayed when executing JS
            headers: custom http headers
            jscode: code to be executed when page is loaded

        Returns tuple with:
            * downloaded website (after JS execution)
            * anything you print with `console.log` (but not inside `page.execute`!)

        Raises ExtractorError when `jscode` does not contain `saveAndExit();`.

        In most cases you don't need to add any `jscode`.
        It is executed in `page.onLoadFinished`.
        `saveAndExit();` is mandatory, use it instead of `phantom.exit()`
        It is possible to wait for some element on the webpage, e.g.
            var check = function() {
              var elementFound = page.evaluate(function() {
                return document.querySelector('#b.done') !== null;
              });
              if(elementFound)
                saveAndExit();
              else
                window.setTimeout(check, 500);
            }

            page.evaluate(function(){
              document.querySelector('#a').click();
            });
            check();
        """
        if 'saveAndExit();' not in jscode:
            raise ExtractorError('`saveAndExit();` not found in `jscode`')
        # None instead of a shared mutable default; callees still get a dict
        headers = {} if headers is None else headers
        if not html:
            html = self.extractor._download_webpage(url, video_id, note=note, headers=headers)
        with open(self._TMP_FILES['html'].name, 'wb') as f:
            f.write(html.encode())

        self._save_cookies(url)

        user_agent = headers.get('User-Agent') or self.extractor.get_param('http_headers')['User-Agent']
        # url and ua land inside double-quoted JS literals, so both are escaped
        jscode = self._TEMPLATE.format_map(self.options.new_child({
            'url': self._escape_js_string(url),
            'ua': self._escape_js_string(user_agent),
            'jscode': jscode,
        }))

        stdout = self.execute(jscode, video_id, note=note2)

        # The script's saveAndExit() wrote the page content back to the html file
        with open(self._TMP_FILES['html'].name, 'rb') as f:
            html = f.read().decode('utf-8')
        self._load_cookies()

        return html, stdout

    def execute(self, jscode, video_id=None, *, note='Executing JS'):
        """Execute JS and return stdout

        `phantom.exit();` is appended when jscode does not contain it.
        Raises ExtractorError if the binary cannot be run or exits non-zero.
        """
        if 'phantom.exit();' not in jscode:
            jscode += ';\nphantom.exit();'
        jscode = self._BASE_JS + jscode

        with open(self._TMP_FILES['script'].name, 'w', encoding='utf-8') as f:
            f.write(jscode)
        self.extractor.to_screen(f'{format_field(video_id, None, "%s: ")}{note}')

        cmd = [self.exe, '--ssl-protocol=any', self._TMP_FILES['script'].name]
        self.extractor.write_debug(f'PhantomJS command line: {shell_quote(cmd)}')
        try:
            stdout, stderr, returncode = Popen.run(cmd, timeout=self.options['timeout'] / 1000,
                                                   text=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        except Exception as e:
            raise ExtractorError(f'{note} failed: Unable to run PhantomJS binary', cause=e)
        if returncode:
            raise ExtractorError(f'{note} failed with returncode {returncode}:\n{stderr.strip()}')

        return stdout