mirror of https://github.com/yt-dlp/yt-dlp.git synced 2025-03-09 12:50:23 -05:00

fix my-drive extraction

grqx_wsl 2024-09-30 06:35:33 +13:00
parent 8d827d2460
commit 83e0860835


@@ -306,7 +306,7 @@ def add_source_format(urlh):
 
 class GoogleDriveFolderIE(InfoExtractor):
     IE_NAME = 'GoogleDrive:Folder'
-    _VALID_URL = r'https?://(?:docs|drive)\.google\.com/drive/(?:folders/(?P<id>[\w-]{28,})|my-drive)'
+    _VALID_URL = r'https?://(?:docs|drive)\.google\.com/drive/(?:folders/(?P<id>[\w-]{19,})|my-drive)'
     _TESTS = [{
         'url': 'https://drive.google.com/drive/folders/1dQ4sx0-__Nvg65rxTSgQrl7VyW_FZ9QI',
         'info_dict': {
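
Note: the only change in this hunk is loosening the minimum folder-id length from 28 to 19 characters. A quick standalone check of the new pattern (the my-drive URL below is an assumed example; the folder URL is taken from the extractor's _TESTS):

import re

# The updated pattern from this commit ({19,} loosens the minimum folder-id length)
VALID_URL = r'https?://(?:docs|drive)\.google\.com/drive/(?:folders/(?P<id>[\w-]{19,})|my-drive)'

for url in (
    'https://drive.google.com/drive/folders/1dQ4sx0-__Nvg65rxTSgQrl7VyW_FZ9QI',  # from _TESTS
    'https://drive.google.com/drive/my-drive',  # assumed my-drive example; has no id group
):
    mobj = re.match(VALID_URL, url)
    print(url, '->', mobj and mobj.group('id'))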
@@ -329,32 +329,36 @@ def _extract_json_meta(self, webpage, video_id, dsval=None, hashval=None, name=N
         """
         Uses regex to search for json metadata with 'ds' value(0-5) or 'hash' value(1-6)
         from the webpage.
-        Folder info: ds=0, hash=1; Folder items: ds=4(public folder)/5(private folder), hash=6.
+        Folder info: ds=0(public folder), hash=1/5;
+        Folder items: ds=4(logged out)/5(logged in), hash=6.
+        public, logged in: info:ds0hash1; items:ds5hash6
+        public, logged out: info:ds0hash1; items:ds4hash6
+        my-drive, logged in: info:ds0hash1/4; items:ds5hash6
+        private, logged in: info:ds0hash1; items:ds5hash6
         For example, if the webpage contains the line below, the empty data array
         can be got by passing dsval=3 or hashval=2 to this method.
         AF_initDataCallback({key: 'ds:3', hash: '2', data:[], sideChannel: {}});
         """
         _ARRAY_RE = r'\[(?s:.+)\]'
         _META_END_RE = r', sideChannel: \{\}\}\);'  # greedy match to deal with the 2nd test case
-        if dsval:
+        if dsval is not None:
             if not name:
                 name = f'webpage JSON metadata ds:{dsval}'
             return self._search_json(
                 rf'''key\s*?:\s*?(['"])ds:\s*?{dsval}\1,[^\[]*?data:''', webpage, name, video_id,
                 end_pattern=_META_END_RE, contains_pattern=_ARRAY_RE, **kwargs)
-        elif hashval:
+        elif hashval is not None:
             if not name:
                 name = f'webpage JSON metadata hash:{hashval}'
             return self._search_json(
                 rf'''hash\s*?:\s*?(['"]){hashval}\1,[^\[]*?data:''', webpage, name, video_id,
                 end_pattern=_META_END_RE, contains_pattern=_ARRAY_RE, **kwargs)
-        return None
 
     def _real_extract(self, url):
         def item_url_getter(item, video_id):
             if not isinstance(item, list):
                 return None
-            available_IEs = [GoogleDriveFolderIE, GoogleDriveIE]  # subfolder or item
+            available_IEs = (GoogleDriveFolderIE, GoogleDriveIE)  # subfolder or item
             if 'application/vnd.google-apps.shortcut' in item:  # extract real link
                 entry_url = traverse_obj(
                     item,
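
Note: self._search_json() is a yt-dlp-internal helper, so here is a rough standalone sketch of what the ds:/hash: patterns above pull out of the docstring's example line (an assumed simplification using plain re plus json.loads, with _ARRAY_RE relaxed to a lazy (?s:.*?) so the empty data array also matches):

import json
import re

# The docstring's example line: an AF_initDataCallback call with an empty data array
webpage = "AF_initDataCallback({key: 'ds:3', hash: '2', data:[], sideChannel: {}});"

META_END_RE = r', sideChannel: \{\}\}\);'


def extract_json_meta(webpage, dsval=None, hashval=None):
    # Simplified stand-in for the extractor's self._search_json() call
    if dsval is not None:
        start = rf'''key\s*?:\s*?(['"])ds:\s*?{dsval}\1,[^\[]*?data:'''
    elif hashval is not None:
        start = rf'''hash\s*?:\s*?(['"]){hashval}\1,[^\[]*?data:'''
    else:
        return None
    # Lazy (?s:.*?) instead of the extractor's greedy _ARRAY_RE so that the
    # empty array in this example also matches
    mobj = re.search(start + rf'\s*(\[(?s:.*?)\])\s*{META_END_RE}', webpage)
    return json.loads(mobj.group(2)) if mobj else None


print(extract_json_meta(webpage, dsval=3))    # -> []
print(extract_json_meta(webpage, hashval=2))  # -> []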
@@ -377,20 +381,17 @@ def item_url_getter(item, video_id):
             if e.cause.status == 404:
                 self.raise_no_formats(e.cause.msg)
             elif e.cause.status == 403:
-                self.raise_login_required('Access Denied!')
+                # logged in with an account without access
+                self.raise_login_required('Access Denied')
             raise
 
         if urllib.parse.urlparse(urlh.url).netloc == 'accounts.google.com':
-            self.raise_login_required('Access Denied!')
+            # not logged in when visiting a private folder
+            self.raise_login_required('Access Denied')
 
-        json_folder_info = (
-            self._extract_json_meta(webpage, folder_id, dsval=0, name='folder info', default=None)
-            or self._extract_json_meta(webpage, folder_id, hashval=1, name='folder info - fallback')
-        )
-        json_items = self._extract_json_meta(webpage, folder_id, hashval=6, name='folder items')
-        title = json_folder_info[1][2]
-        items = json_items[-1]
-        if not isinstance(items, list):  # empty folder
+        title = self._extract_json_meta(webpage, folder_id, dsval=0, name='folder info')[1][2]
+        items = self._extract_json_meta(webpage, folder_id, hashval=6, name='folder items')[-1]
+        if items is False:  # empty folder
             return self.playlist_result([], folder_id, title)
         return self.playlist_result(
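
Note: the accounts.google.com netloc check above is how the extractor detects being bounced to a sign-in page; a minimal standalone illustration (the second URL is a made-up sign-in example):

import urllib.parse

# Hypothetical final URLs after redirects; accounts.google.com is the
# "bounced to a login page" case handled above
for final_url in (
    'https://drive.google.com/drive/folders/1dQ4sx0-__Nvg65rxTSgQrl7VyW_FZ9QI',
    'https://accounts.google.com/signin',  # made-up sign-in URL
):
    needs_login = urllib.parse.urlparse(final_url).netloc == 'accounts.google.com'
    print(final_url, '->', 'login required' if needs_login else 'ok')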