2022-08-18 11:04:47 -05:00
import collections
2022-04-17 15:58:28 -05:00
import contextlib
2017-09-23 12:08:27 -05:00
import json
import os
import subprocess
import tempfile
2024-06-11 18:09:58 -05:00
import urllib . parse
2017-01-25 10:27:22 -06:00
2024-08-08 10:07:27 -05:00
from . . extractor . common import InfoExtractor
2016-03-20 03:49:44 -05:00
from . . utils import (
2022-04-11 17:32:57 -05:00
ExtractorError ,
Popen ,
2024-08-02 18:35:28 -05:00
classproperty ,
2022-08-18 11:04:47 -05:00
format_field ,
2017-09-23 12:08:27 -05:00
get_exe_version ,
is_outdated_version ,
2022-08-18 11:04:47 -05:00
shell_quote ,
2016-03-20 03:49:44 -05:00
)
2016-03-20 02:54:58 -05:00
2017-09-23 12:08:27 -05:00
def cookie_to_dict(cookie):
    """Convert a cookie object into a plain dict.

    ``name`` and ``value`` are always present. ``port``, ``domain`` and
    ``path`` are copied only when the matching ``*_specified`` flag is truthy;
    ``expires``, ``secure`` and ``discard`` only when they are not ``None``.
    ``httponly`` is set to ``True`` when the cookie carries a non-standard
    ``httpOnly`` attribute in any of the three spellings checked below.
    """
    result = {
        'name': cookie.name,
        'value': cookie.value,
    }

    # Fields guarded by an explicit "<field>_specified" flag
    for field in ('port', 'domain', 'path'):
        if getattr(cookie, f'{field}_specified'):
            result[field] = getattr(cookie, field)

    # Fields that are copied whenever they hold a value at all
    for field in ('expires', 'secure', 'discard'):
        if getattr(cookie, field) is not None:
            result[field] = getattr(cookie, field)

    # has_nonstandard_attr() may raise TypeError; treat that as "not set"
    with contextlib.suppress(TypeError):
        if any(cookie.has_nonstandard_attr(spelling)
               for spelling in ('httpOnly', 'httponly', 'HttpOnly')):
            result['httponly'] = True

    return result
def cookie_jar_to_list(cookie_jar):
    """Return a list holding the ``cookie_to_dict`` form of every cookie in *cookie_jar*."""
    return [cookie_to_dict(entry) for entry in cookie_jar]
2024-08-08 10:07:27 -05:00
@contextlib.contextmanager
def _temp_file ( content , * , mode = ' wt ' , encoding = ' utf-8 ' , suffix = None , close = True ) :
if ' r ' in mode :
encoding = None
temp_file_handle = tempfile . NamedTemporaryFile ( mode , encoding = encoding , suffix = suffix , delete = False )
try :
temp_file_handle . write ( content )
if close :
temp_file_handle . close ( )
yield temp_file_handle
finally :
with contextlib . suppress ( OSError ) :
os . remove ( temp_file_handle . name )
2024-08-02 18:35:28 -05:00
class ExternalJSI:
    """Common helpers for JS interpreters that are run as an external executable.

    Subclasses provide ``_EXE_NAME``; an optional ``V_ARGS`` attribute replaces
    the default ``['--version']`` arguments used for the version probe.
    """

    @classproperty(cache=True)
    def version(cls):
        """Result of ``get_exe_version`` for the executable (cached)."""
        probe_args = getattr(cls, 'V_ARGS', ['--version'])
        return get_exe_version(cls._EXE_NAME, args=probe_args, version_re=r'([0-9.]+)')

    @classproperty
    def full_version(cls):
        """Same value as ``version``; subclasses may report something more detailed."""
        return cls.version

    @classproperty
    def exe(cls):
        """Executable name, or ``None`` when no version could be determined."""
        if cls.version:
            return cls._EXE_NAME
        return None

    @classproperty
    def is_available(cls):
        """Whether ``exe`` is truthy."""
        return bool(cls.exe)
2024-08-02 15:53:30 -05:00
2024-08-02 18:35:28 -05:00
class DenoWrapper(ExternalJSI):
    """Run JS code with the external ``deno`` executable."""
    _EXE_NAME = 'deno'
    INSTALL_HINT = 'Please install Deno from https://docs.deno.com/runtime/manual/getting_started/installation/ or download binary from https://github.com/denoland/deno/releases'

    def __init__(self, extractor: InfoExtractor, required_version=None, timeout=10000):
        """
        @param extractor         extractor used for screen output and warnings
        @param required_version  optional; a warning is reported when the
                                 installed version is older than this
        @param timeout           stored and later passed to `_execute`
        Raises ExtractorError (expected) when Deno is not available.
        """
        self.extractor = extractor
        self.timeout = timeout

        if not self.exe:
            raise ExtractorError(f'Deno not found, {self.INSTALL_HINT}', expected=True)
        if required_version and is_outdated_version(self.version, required_version):
            self.extractor.report_warning(
                f'Deno is outdated, update it to version {required_version} or newer if you encounter any errors.')

    @classmethod
    def _execute(cls, jscode, extractor=None, video_id=None, note='', flags=(), timeout=10000):
        """Write *jscode* to a temporary ``.js`` file, run it with ``deno run`` and return stripped stdout.

        @param extractor  optional; when given it receives the note, the debug
                          command line and any stderr output as a warning
        @param flags      extra command line arguments placed after ``run``
        @param timeout    divided by 1000 before being handed to ``Popen.run``
                          (i.e. presumably milliseconds)
        Raises ExtractorError if the binary cannot be run or exits non-zero.
        """
        with _temp_file(jscode, suffix='.js') as js_file:
            if note and extractor:
                extractor.to_screen(f'{format_field(video_id, None, "%s: ")}{note}')
            cmd = [cls.exe, 'run', *flags, js_file.name]
            if extractor:
                # Same diagnostic as PhantomJSwrapper.execute
                extractor.write_debug(f'Deno command line: {shell_quote(cmd)}')
            try:
                stdout, stderr, returncode = Popen.run(
                    cmd, timeout=timeout / 1000, text=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
            except Exception as e:
                raise ExtractorError('Unable to run Deno binary', cause=e)
            if returncode:
                raise ExtractorError(f'Failed with returncode {returncode}:\n{stderr.strip()}')
            elif stderr and extractor:
                extractor.report_warning(f'JS console error msg:\n{stderr.strip()}', video_id=video_id)
            return stdout.strip()

    def execute(self, jscode, video_id=None, *, note='Executing JS in Deno', flags=(), base_js=None):
        """Execute JS directly in Deno runtime and return stdout"""
        # Default prelude is prepended verbatim; pass base_js='' to run jscode unmodified
        if base_js is None:
            base_js = 'delete window.Deno; global = window;'

        return self._execute(base_js + jscode, extractor=self.extractor, video_id=video_id, note=note,
                             flags=flags, timeout=self.timeout)
2024-08-02 15:53:30 -05:00
2024-08-11 07:03:00 -05:00
class DenoJITlessJSI(DenoWrapper):
    """DenoWrapper variant that always appends ``--v8-flags=--jitless,--noexpose-wasm``."""

    def execute(self, jscode, video_id=None, *, note='Executing JS in Deno', flags=(), base_js=None):
        """Execute JS like `DenoWrapper.execute`, with the extra V8 flags added after *flags*."""
        return super().execute(jscode, video_id, note=note, base_js=base_js,
                               flags=[*flags, '--v8-flags=--jitless,--noexpose-wasm'])
2024-08-08 10:07:27 -05:00
class PuppeteerWrapper:
    """Run JS inside a browser page driven by Puppeteer, which is itself executed through Deno."""
    _PACKAGE_VERSION = '16.2.0'
    _HEADLESS = False

    @classproperty
    def INSTALL_HINT(cls):
        """Installation instructions; prefixed with the Deno hint when Deno itself is unavailable."""
        msg = f'Run "deno run -A https://deno.land/x/puppeteer@{cls._PACKAGE_VERSION}/install.ts" to install puppeteer'
        if not DenoWrapper.is_available:
            msg = f'{DenoWrapper.INSTALL_HINT}. Then {msg}'
        return msg

    @classproperty(cache=True)
    def full_version(cls):
        """``'puppeteer=<package version> browser=<browser version>'``, or ``None``.

        ``None`` is returned when Deno is unavailable or the probe script fails.
        """
        if not DenoWrapper.is_available:
            return
        try:
            # Probe the same package version that is used for execution
            browser_version = DenoWrapper._execute(f'''
                import puppeteer from "https://deno.land/x/puppeteer@{cls._PACKAGE_VERSION}/mod.ts";
                const browser = await puppeteer.launch({{headless: {json.dumps(bool(cls._HEADLESS))}}});
                try {{
                    // await (new )
                    console.log(await browser.version())
                }} finally {{
                    await browser.close();
                }}''', flags=['--allow-all'])
            return f'puppeteer={cls._PACKAGE_VERSION} browser={browser_version}'
        except ExtractorError:
            return None

    @classproperty
    def version(cls):
        """The package version when `full_version` could be determined, else ``None``."""
        return cls._PACKAGE_VERSION if cls.full_version else None

    def __init__(self, extractor: InfoExtractor, required_version=None, timeout=10000):
        """
        @param extractor         extractor used for screen output and warnings
        @param required_version  not supported; only triggers a warning
        @param timeout           used for the in-script timer (`execute`);
                                 the Deno process gets 30000 more on top
        """
        self.deno = DenoWrapper(extractor, timeout=(timeout + 30000))
        self.timeout = timeout
        self.extractor = extractor

        if required_version:
            self.extractor.report_warning(f'required_version is not supported on {self.__class__.__name__}')

    def _deno_execute(self, jscode, note=None):
        """Run *jscode* in Deno with a launched ``browser`` in scope; the browser is closed afterwards."""
        return self.deno.execute(f'''
            import puppeteer from "https://deno.land/x/puppeteer@{self._PACKAGE_VERSION}/mod.ts";
            const browser = await puppeteer.launch({{
                headless: {json.dumps(bool(self._HEADLESS))}, args: ["--disable-web-security"]}});
            try {{
                {jscode}
            }} finally {{
                await browser.close();
            }}''', note=note, flags=['--allow-all'], base_js='')

    def execute(self, jscode, video_id=None, note='Executing JS in Puppeteer', url='about:blank'):
        """Evaluate *jscode* in a new page and return what `_deno_execute` returns.

        The generated script aborts every network request of the page, forwards
        the page's ``log``/``error`` console messages to the Deno console and
        rewrites the page URL to *url* via ``history.replaceState``.
        """
        self.extractor.to_screen(f'{format_field(video_id, None, "%s: ")}{note}')
        return self._deno_execute(f'''
            const page = await browser.newPage();
            window.setTimeout(async () => {{
                console.error('Puppeteer execution timed out');
                await browser.close();
                Deno.exit(1);
            }}, {int(self.timeout)});
            page.resourceTimeout = {int(self.timeout)};

            // drop network requests
            await page.setRequestInterception(true);
            page.on("request", request => request.abort());
            // capture console output
            page.on("console", msg => {{
                msg.type() === 'log' && console.log(msg.text());
                msg.type() === 'error' && console.error(msg.text());
            }});

            const url = {json.dumps(str(url))};
            await page.evaluate(`window.history.replaceState('', '', ${{JSON.stringify(url)}})`);

            await page.evaluate({json.dumps(str(jscode))});
            await browser.close();
            Deno.exit(0);
        ''')
2024-08-02 18:35:28 -05:00
class PhantomJSwrapper(ExternalJSI):
    """PhantomJS wrapper class

    This class is experimental.
    """
    _EXE_NAME = 'phantomjs'
    INSTALL_HINT = 'Please download PhantomJS from https://phantomjs.org/download.html'

    # NOTE: _BASE_JS is prepended verbatim in execute() (it is never passed
    # through str.format), so the doubled braces reach PhantomJS as written;
    # `{{ ... }}` is still valid JS (a block nested inside the body).
    _BASE_JS = R'''
        phantom.onError = function(msg, trace) {{
          var msgStack = ['PHANTOM ERROR: ' + msg];
          if(trace && trace.length) {{
            msgStack.push('TRACE:');
            trace.forEach(function(t) {{
              msgStack.push(' -> ' + (t.file || t.sourceURL) + ': ' + t.line
                + (t.function ? ' (in function ' + t.function +')' : ''));
            }});
          }}
          console.error(msgStack.join('\n'));
          phantom.exit(1);
        }};
    '''

    # Filled in by get() via str.format_map: {cookies}, {html}, {timeout},
    # {ua}, {url} and {jscode}; literal braces are therefore doubled.
    _TEMPLATE = R'''
        var page = require('webpage').create();
        var fs = require('fs');
        var read = {{ mode: 'r', charset: 'utf-8' }};
        var write = {{ mode: 'w', charset: 'utf-8' }};
        JSON.parse(fs.read("{cookies}", read)).forEach(function(x) {{
          phantom.addCookie(x);
        }});
        page.settings.resourceTimeout = {timeout};
        page.settings.userAgent = "{ua}";
        page.onLoadStarted = function() {{
          page.evaluate(function() {{
            delete window._phantom;
            delete window.callPhantom;
          }});
        }};
        var saveAndExit = function() {{
          fs.write("{html}", page.content, write);
          fs.write("{cookies}", JSON.stringify(phantom.cookies), write);
          phantom.exit();
        }};
        page.onLoadFinished = function(status) {{
          if(page.url === "") {{
            page.setContent(fs.read("{html}", read), "{url}");
          }}
          else {{
            {jscode}
          }}
        }};
        page.open("");
    '''

    _TMP_FILE_NAMES = ['script', 'html', 'cookies']

    @classmethod
    def _version(cls):
        """Method-style accessor returning the `version` class property."""
        return cls.version

    @staticmethod
    def _escape_js_string(value):
        """Escape backslashes and double quotes for use inside a double-quoted JS string literal."""
        return value.replace('\\', '\\\\').replace('"', '\\"')

    def __init__(self, extractor, required_version=None, timeout=10000):
        """
        @param extractor         extractor used for downloads, cookies and output
        @param required_version  optional; a warning is reported when the
                                 installed version is older than this
        @param timeout           used as ``resourceTimeout`` in the template and,
                                 divided by 1000, as the process timeout
        Raises ExtractorError (expected) when PhantomJS is not available.
        """
        # Set first so that __del__ works even if the checks below raise
        self._TMP_FILES = {}

        if not self.exe:
            raise ExtractorError(f'PhantomJS not found, {self.INSTALL_HINT}', expected=True)

        self.extractor = extractor

        if required_version and is_outdated_version(self.version, required_version):
            self.extractor._downloader.report_warning(
                'Your copy of PhantomJS is outdated, update it to version '
                f'{required_version} or newer if you encounter any errors.')

        # Only the paths are needed; the files are reopened by name later on
        for name in self._TMP_FILE_NAMES:
            tmp = tempfile.NamedTemporaryFile(delete=False)
            tmp.close()
            self._TMP_FILES[name] = tmp

        self.options = collections.ChainMap({
            'timeout': timeout,
        }, {
            x: self._escape_js_string(self._TMP_FILES[x].name)
            for x in self._TMP_FILE_NAMES
        })

    def __del__(self):
        # Best-effort removal of the temporary files created in __init__
        for name in self._TMP_FILE_NAMES:
            with contextlib.suppress(OSError, KeyError):
                os.remove(self._TMP_FILES[name].name)

    def _save_cookies(self, url):
        """Dump the extractor's cookie jar as JSON into the cookies temp file.

        Cookies without a path get ``'/'``; cookies without a domain get the
        network location of *url*.
        """
        cookies = cookie_jar_to_list(self.extractor.cookiejar)
        for cookie in cookies:
            if 'path' not in cookie:
                cookie['path'] = '/'
            if 'domain' not in cookie:
                cookie['domain'] = urllib.parse.urlparse(url).netloc
        with open(self._TMP_FILES['cookies'].name, 'wb') as f:
            f.write(json.dumps(cookies).encode())

    def _load_cookies(self):
        """Read the cookies temp file back and hand every cookie to the extractor."""
        with open(self._TMP_FILES['cookies'].name, 'rb') as f:
            cookies = json.loads(f.read().decode('utf-8'))
        for cookie in cookies:
            # Tolerate entries that carry no 'httponly' key at all
            if cookie.get('httponly') is True:
                cookie['rest'] = {'httpOnly': None}
            if 'expiry' in cookie:
                cookie['expire_time'] = cookie['expiry']
            self.extractor._set_cookie(**cookie)

    def get(self, url, html=None, video_id=None, note=None, note2='Executing JS on webpage', headers=None, jscode='saveAndExit();'):
        """
        Downloads webpage (if needed) and executes JS

        Params:
            url: website url
            html: optional, html code of website
            video_id: video id
            note: optional, displayed when downloading webpage
            note2: optional, displayed when executing JS
            headers: custom http headers
            jscode: code to be executed when page is loaded

        Returns tuple with:
            * downloaded website (after JS execution)
            * anything you print with `console.log` (but not inside `page.evaluate`!)

        Raises ExtractorError when `jscode` does not contain `saveAndExit();`.

        In most cases you don't need to add any `jscode`.
        It is executed in `page.onLoadFinished`.
        `saveAndExit();` is mandatory, use it instead of `phantom.exit()`
        It is possible to wait for some element on the webpage, e.g.
            var check = function() {
              var elementFound = page.evaluate(function() {
                return document.querySelector('#b.done') !== null;
              });
              if(elementFound)
                saveAndExit();
              else
                window.setTimeout(check, 500);
            }

            page.evaluate(function(){
              document.querySelector('#a').click();
            });
            check();
        """
        if 'saveAndExit();' not in jscode:
            raise ExtractorError('`saveAndExit();` not found in `jscode`')
        if headers is None:
            headers = {}
        if not html:
            html = self.extractor._download_webpage(url, video_id, note=note, headers=headers)
        with open(self._TMP_FILES['html'].name, 'wb') as f:
            f.write(html.encode())

        self._save_cookies(url)

        user_agent = headers.get('User-Agent') or self.extractor.get_param('http_headers')['User-Agent']
        # Both values end up inside double-quoted JS string literals
        jscode = self._TEMPLATE.format_map(self.options.new_child({
            'url': self._escape_js_string(url),
            'ua': self._escape_js_string(user_agent),
            'jscode': jscode,
        }))

        stdout = self.execute(jscode, video_id, note=note2)

        # saveAndExit() in the template rewrote the html and cookies files
        with open(self._TMP_FILES['html'].name, 'rb') as f:
            html = f.read().decode('utf-8')

        self._load_cookies()

        return html, stdout

    def execute(self, jscode, video_id=None, *, note='Executing JS in PhantomJS'):
        """Execute JS and return stdout"""
        # Make sure the script terminates the PhantomJS process
        if 'phantom.exit();' not in jscode:
            jscode += ';\nphantom.exit();'
        jscode = self._BASE_JS + jscode

        with open(self._TMP_FILES['script'].name, 'w', encoding='utf-8') as f:
            f.write(jscode)
        self.extractor.to_screen(f'{format_field(video_id, None, "%s: ")}{note}')

        cmd = [self.exe, '--ssl-protocol=any', self._TMP_FILES['script'].name]
        self.extractor.write_debug(f'PhantomJS command line: {shell_quote(cmd)}')
        try:
            stdout, stderr, returncode = Popen.run(cmd, timeout=self.options['timeout'] / 1000,
                                                   text=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        except Exception as e:
            raise ExtractorError(f'{note} failed: Unable to run PhantomJS binary', cause=e)
        if returncode:
            raise ExtractorError(f'{note} failed with returncode {returncode}:\n{stderr.strip()}')

        return stdout