From f40c4b4ee28f5ea0e9e155b99f76c6bbbbbce34a Mon Sep 17 00:00:00 2001 From: grqx <173253225+grqx@users.noreply.github.com> Date: Mon, 24 Jun 2024 11:24:32 +0000 Subject: [PATCH 01/16] stop using codespaces modified: yt_dlp/extractor/_extractors.py modified: yt_dlp/extractor/bilibili.py --- yt_dlp/extractor/_extractors.py | 1 + yt_dlp/extractor/bilibili.py | 19 +++++++++++++++++++ 2 files changed, 20 insertions(+) diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py index c411efb5a..117843523 100644 --- a/yt_dlp/extractor/_extractors.py +++ b/yt_dlp/extractor/_extractors.py @@ -255,6 +255,7 @@ BiliBiliPlayerIE, BilibiliPlaylistIE, BiliBiliSearchIE, + BiliBiliSearchPageIE, BilibiliSeriesListIE, BilibiliSpaceAudioIE, BilibiliSpaceVideoIE, diff --git a/yt_dlp/extractor/bilibili.py b/yt_dlp/extractor/bilibili.py index 411b48c28..81d370be8 100644 --- a/yt_dlp/extractor/bilibili.py +++ b/yt_dlp/extractor/bilibili.py @@ -2233,3 +2233,22 @@ def _real_extract(self, url): 'Referer': url, }, } + + +class BiliBiliSearchPageIE(BilibiliBaseIE): + _VALID_URL = r'https?://search\.bilibili\.com/(?:all|bangumi)/?\?keyword=(?P[^/?#&]+)' + _TESTS = [{ + 'url': r'https://search.bilibili.com/all?keyword=yt+-+dlp+%E4%B8%8B%E8%BD%BD%E5%99%A8', + 'info_dict': { + 'id': 'None', + 'title': 'None', + 'ext': 'None', + }, + }] + + def _real_extract(self, url): + # video_id = self._match_id(url) + # webpage = self._download_webpage(url, video_id=video_id) + # title = self._search_regex(r'(?s)]*>([^<]+)-哔哩哔哩_bilibili', webpage, 'uploader', fatal=False) + return self.url_result(r'https://www.bilibili.com/video/BV1yt4y1Q7SS/', + ) From af0eb72a8bf7799bffd1db36a33290d47ef00d73 Mon Sep 17 00:00:00 2001 From: grqx_wsl <173253225+grqx@users.noreply.github.com> Date: Fri, 28 Jun 2024 00:32:37 +1200 Subject: [PATCH 02/16] uses json api, pass hatch test --- yt_dlp/extractor/bilibili.py | 78 +++++++++++++++++++++++++++++++----- 1 file changed, 69 insertions(+), 9 deletions(-) diff --git a/yt_dlp/extractor/bilibili.py b/yt_dlp/extractor/bilibili.py index 81d370be8..2dbd3a4c7 100644 --- a/yt_dlp/extractor/bilibili.py +++ b/yt_dlp/extractor/bilibili.py @@ -2236,19 +2236,79 @@ def _real_extract(self, url): class BiliBiliSearchPageIE(BilibiliBaseIE): - _VALID_URL = r'https?://search\.bilibili\.com/(?:all|bangumi)/?\?keyword=(?P[^/?#&]+)' + _VALID_URL = r'https?://search\.bilibili\.com/(?Pall|video|bangumi|pgc|live|upuser)/?\?keyword=(?P[^/?#&]+)' _TESTS = [{ 'url': r'https://search.bilibili.com/all?keyword=yt+-+dlp+%E4%B8%8B%E8%BD%BD%E5%99%A8', + 'playlist_count': 20, 'info_dict': { - 'id': 'None', - 'title': 'None', - 'ext': 'None', + 'id': 'yt - dlp 下载器', + 'title': 'yt - dlp 下载器', }, }] def _real_extract(self, url): - # video_id = self._match_id(url) - # webpage = self._download_webpage(url, video_id=video_id) - # title = self._search_regex(r'(?s)]*>([^<]+)-哔哩哔哩_bilibili', webpage, 'uploader', fatal=False) - return self.url_result(r'https://www.bilibili.com/video/BV1yt4y1Q7SS/', - ) + entries = [] + if not self._get_cookies('https://api.bilibili.com').get('buvid3'): + self._set_cookie('.bilibili.com', 'buvid3', f'{uuid.uuid4()}infoc') + search_type, raw_playlist_id = self._match_valid_url(url).group('type', 'id') + playlist_id = urllib.parse.unquote_plus(raw_playlist_id) + search_type_mapping = { + 'video': 'video', + 'bangumi': 'media_bangumi', + 'pgc': 'media_ft', + 'live': 'live_room', + 'upuser': 'bili_user', + } + live_room_prefix = 'https://live.bilibili.com/' + bili_user_prefix = 'https://space.bilibili.com/' + if search_type == 'all': + try: + search_all_result = self._download_json( + r'https://api.bilibili.com/x/web-interface/search/all/v2', + video_id=playlist_id, query={ + 'keyword': playlist_id, + }) + except ExtractorError as e: + if isinstance(e.cause, HTTPError) and e.cause.status == 412: + raise ExtractorError('Request is blocked by server (-412).', expected=True) + status_code = search_all_result['code'] + if status_code == -400: + raise ExtractorError('Invalid request (-400).', expected=True) + + result_list = search_all_result['data']['result'] + for result_type_dict in result_list: + for result_data in result_type_dict['data']: + if result_data['type'] == 'video': + entries.append(self.url_result(result_data['arcurl'])) + elif result_data['type'] == 'live_room': + entries.append(self.url_result(live_room_prefix + str(result_data['roomid']))) + elif result_data['type'] in ['media_ft', 'media_bangumi']: + entries.append(self.url_result(result_data['url'])) + elif result_data['type'] == 'bili_user': + entries.append(self.url_result(bili_user_prefix + str(result_data['mid']))) + else: + try: + search_type_result = self._download_json( + r'https://api.bilibili.com/x/web-interface/search/type', + video_id=playlist_id, query={ + 'keyword': playlist_id, + 'search_type': search_type_mapping[search_type], + }) + except ExtractorError as e: + if isinstance(e.cause, HTTPError) and e.cause.status == 412: + raise ExtractorError('Request is blocked by server (-412).', expected=True) + status_code = search_type_result['code'] + if status_code == -400: + raise ExtractorError('Invalid request (-400).', expected=True) + result_list = search_type_result['data']['result'] + for result_data in result_list: + if result_data['type'] == 'video': + entries.append(self.url_result(result_data['arcurl'])) + elif result_data['type'] == 'live_room': + entries.append(self.url_result(live_room_prefix + str(result_data['roomid']))) + elif result_data['type'] in ['media_ft', 'media_bangumi']: + entries.append(self.url_result(result_data['url'])) + elif result_data['type'] == 'bili_user': + entries.append(self.url_result(bili_user_prefix + str(result_data['mid']))) + + return self.playlist_result(entries, playlist_id=playlist_id, playlist_title=playlist_id) From fc79d7325cc28d277e523968e29df959240fffed Mon Sep 17 00:00:00 2001 From: grqx_wsl <173253225+grqx@users.noreply.github.com> Date: Fri, 28 Jun 2024 11:45:54 +1200 Subject: [PATCH 03/16] add test, add geo verification proxy support --- yt_dlp/extractor/bilibili.py | 25 ++++++++++++++++++++----- 1 file changed, 20 insertions(+), 5 deletions(-) diff --git a/yt_dlp/extractor/bilibili.py b/yt_dlp/extractor/bilibili.py index 2dbd3a4c7..36aab7c1b 100644 --- a/yt_dlp/extractor/bilibili.py +++ b/yt_dlp/extractor/bilibili.py @@ -2244,9 +2244,17 @@ class BiliBiliSearchPageIE(BilibiliBaseIE): 'id': 'yt - dlp 下载器', 'title': 'yt - dlp 下载器', }, + }, { + 'url': r'https://search.bilibili.com/bangumi?keyword=%E5%AD%A4%E7%8B%AC%E6%91%87%E6%BB%9A&from_source=webtop_search&spm_id_from=333.1007&search_source=5', + 'playlist_mincount': 1, + 'info_dict': { + 'id': '孤独摇滚', + 'title': '孤独摇滚', + }, }] def _real_extract(self, url): + headers = self.geo_verification_headers() entries = [] if not self._get_cookies('https://api.bilibili.com').get('buvid3'): self._set_cookie('.bilibili.com', 'buvid3', f'{uuid.uuid4()}infoc') @@ -2267,15 +2275,18 @@ def _real_extract(self, url): r'https://api.bilibili.com/x/web-interface/search/all/v2', video_id=playlist_id, query={ 'keyword': playlist_id, - }) + }, headers=headers) except ExtractorError as e: if isinstance(e.cause, HTTPError) and e.cause.status == 412: raise ExtractorError('Request is blocked by server (-412).', expected=True) status_code = search_all_result['code'] if status_code == -400: raise ExtractorError('Invalid request (-400).', expected=True) - - result_list = search_all_result['data']['result'] + result_list = search_all_result['data'].get('result') + if result_list is None: + self.write_debug(f'Response: {search_all_result}') + raise ExtractorError(f'Result not found in the response ({status_code}).', + expected=True) for result_type_dict in result_list: for result_data in result_type_dict['data']: if result_data['type'] == 'video': @@ -2293,14 +2304,18 @@ def _real_extract(self, url): video_id=playlist_id, query={ 'keyword': playlist_id, 'search_type': search_type_mapping[search_type], - }) + }, headers=headers) except ExtractorError as e: if isinstance(e.cause, HTTPError) and e.cause.status == 412: raise ExtractorError('Request is blocked by server (-412).', expected=True) status_code = search_type_result['code'] if status_code == -400: raise ExtractorError('Invalid request (-400).', expected=True) - result_list = search_type_result['data']['result'] + result_list = search_type_result['data'].get('result') + if result_list is None: + self.write_debug(f'Response: {search_type_result}') + raise ExtractorError(f'Result not found in the response ({status_code}).', + expected=True) for result_data in result_list: if result_data['type'] == 'video': entries.append(self.url_result(result_data['arcurl'])) From 41bb0c6b3778ddc21a68461eb6b139cd5243228c Mon Sep 17 00:00:00 2001 From: grqx_wsl <173253225+grqx@users.noreply.github.com> Date: Fri, 28 Jun 2024 12:10:00 +1200 Subject: [PATCH 04/16] Add SearchInfoExtractor: BiliBiliSearchAllIE --- yt_dlp/extractor/_extractors.py | 1 + yt_dlp/extractor/bilibili.py | 76 +++++++++++++++++++++++++++++++++ 2 files changed, 77 insertions(+) diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py index 117843523..bb31fbb88 100644 --- a/yt_dlp/extractor/_extractors.py +++ b/yt_dlp/extractor/_extractors.py @@ -254,6 +254,7 @@ BiliBiliIE, BiliBiliPlayerIE, BilibiliPlaylistIE, + BiliBiliSearchAllIE, BiliBiliSearchIE, BiliBiliSearchPageIE, BilibiliSeriesListIE, diff --git a/yt_dlp/extractor/bilibili.py b/yt_dlp/extractor/bilibili.py index 36aab7c1b..62d62edb4 100644 --- a/yt_dlp/extractor/bilibili.py +++ b/yt_dlp/extractor/bilibili.py @@ -1489,6 +1489,82 @@ def _real_extract(self, url): return self.playlist_result(self._entries(category, subcategory, query), query, query) +class BiliBiliSearchAllIE(SearchInfoExtractor): + IE_DESC = 'Bilibili all search' + _MAX_RESULTS = 100000 + _SEARCH_KEY = 'bilisearchall' + _TESTS = [{ + 'url': 'bilisearchall3:靡烟 出道一年,我怎么还在等你单推的女人睡觉后开播啊', + 'playlist_count': 3, + 'info_dict': { + 'id': '靡烟 出道一年,我怎么还在等你单推的女人睡觉后开播啊', + 'title': '靡烟 出道一年,我怎么还在等你单推的女人睡觉后开播啊', + }, + 'playlist': [{ + 'info_dict': { + 'id': 'BV1n44y1Q7sc', + 'ext': 'mp4', + 'title': '“出道一年,我怎么还在等你单推的女人睡觉后开播啊?”【一分钟了解靡烟miya】', + 'timestamp': 1669889987, + 'upload_date': '20221201', + 'description': 'md5:43343c0973defff527b5a4b403b4abf9', + 'tags': list, + 'uploader': '靡烟miya', + 'duration': 123.156, + 'uploader_id': '1958703906', + 'comment_count': int, + 'view_count': int, + 'like_count': int, + 'thumbnail': r're:^https?://.*\.(jpg|jpeg|png)$', + '_old_archive_ids': ['bilibili 988222410_part1'], + }, + }], + }, { + 'url': 'bilisearchall:LOL', + 'playlist_count': 1, + 'info_dict': { + 'id': 'LOL', + 'title': 'LOL', + }, + }] + + def _search_results(self, query): + headers = self.geo_verification_headers() + live_room_prefix = 'https://live.bilibili.com/' + bili_user_prefix = 'https://space.bilibili.com/' + if not self._get_cookies('https://api.bilibili.com').get('buvid3'): + self._set_cookie('.bilibili.com', 'buvid3', f'{uuid.uuid4()}infoc') + for page_num in itertools.count(1): + try: + search_all_result = self._download_json( + r'https://api.bilibili.com/x/web-interface/search/all/v2', + video_id=query, query={ + 'keyword': query, + 'page': page_num, + }, headers=headers) + except ExtractorError as e: + if isinstance(e.cause, HTTPError) and e.cause.status == 412: + raise ExtractorError('Request is blocked by server (-412).', expected=True) + status_code = search_all_result['code'] + if status_code == -400: + raise ExtractorError('Invalid request (-400).', expected=True) + result_list = search_all_result['data'].get('result') + if result_list is None: + self.write_debug(f'Response: {search_all_result}') + raise ExtractorError(f'Result not found in the response ({status_code}).', + expected=True) + for result_type_dict in result_list: + for result_data in result_type_dict['data']: + if result_data['type'] == 'video': + yield self.url_result(result_data['arcurl']) + elif result_data['type'] == 'live_room': + yield self.url_result(live_room_prefix + str(result_data['roomid'])) + elif result_data['type'] in ['media_ft', 'media_bangumi']: + yield self.url_result(result_data['url']) + elif result_data['type'] == 'bili_user': + yield self.url_result(bili_user_prefix + str(result_data['mid'])) + + class BiliBiliSearchIE(SearchInfoExtractor): IE_DESC = 'Bilibili video search' _MAX_RESULTS = 100000 From 72fac58401b04d115e70efb81d59590c2efec4be Mon Sep 17 00:00:00 2001 From: grqx_wsl <173253225+grqx@users.noreply.github.com> Date: Sat, 29 Jun 2024 20:05:53 +1200 Subject: [PATCH 05/16] Supported more params --- supportedsites.md | 2 ++ yt_dlp/extractor/bilibili.py | 54 +++++++++++++++++++++++++++--------- 2 files changed, 43 insertions(+), 13 deletions(-) diff --git a/supportedsites.md b/supportedsites.md index 387395613..e40e5be03 100644 --- a/supportedsites.md +++ b/supportedsites.md @@ -174,7 +174,9 @@ # Supported sites - **BilibiliFavoritesList** - **BiliBiliPlayer** - **BilibiliPlaylist** + - **BiliBiliSearchAll**: Bilibili all search; "bilisearchall:" prefix - **BiliBiliSearch**: Bilibili video search; "bilisearch:" prefix + - **BiliBiliSearchPage** - **BilibiliSeriesList** - **BilibiliSpaceAudio** - **BilibiliSpaceVideo** diff --git a/yt_dlp/extractor/bilibili.py b/yt_dlp/extractor/bilibili.py index 62d62edb4..fcebc4511 100644 --- a/yt_dlp/extractor/bilibili.py +++ b/yt_dlp/extractor/bilibili.py @@ -2312,7 +2312,7 @@ def _real_extract(self, url): class BiliBiliSearchPageIE(BilibiliBaseIE): - _VALID_URL = r'https?://search\.bilibili\.com/(?Pall|video|bangumi|pgc|live|upuser)/?\?keyword=(?P[^/?#&]+)' + _VALID_URL = r'https?://search\.bilibili\.com/(?Pall|video|bangumi|pgc|live|upuser).*' _TESTS = [{ 'url': r'https://search.bilibili.com/all?keyword=yt+-+dlp+%E4%B8%8B%E8%BD%BD%E5%99%A8', 'playlist_count': 20, @@ -2321,20 +2321,33 @@ class BiliBiliSearchPageIE(BilibiliBaseIE): 'title': 'yt - dlp 下载器', }, }, { - 'url': r'https://search.bilibili.com/bangumi?keyword=%E5%AD%A4%E7%8B%AC%E6%91%87%E6%BB%9A&from_source=webtop_search&spm_id_from=333.1007&search_source=5', + 'url': r'https://search.bilibili.com/bangumi/?keyword=%E5%AD%A4%E7%8B%AC%E6%91%87%E6%BB%9A&from_source=webtop_search&spm_id_from=333.1007&search_source=5', 'playlist_mincount': 1, 'info_dict': { 'id': '孤独摇滚', 'title': '孤独摇滚', }, + 'skip': 'geo-restricted', + }, { + 'url': r'https://search.bilibili.com/video?keyword=%E8%AE%A9%E5%AD%90%E5%BC%B9%E9%A3%9E&from_source=webtop_search&spm_id_from=333.1007&search_source=5&order=dm&duration=4&tids=181&page=3&o=72', + 'playlist_mincount': 5, + 'info_dict': { + 'id': '让子弹飞', + 'title': '让子弹飞', + }, }] def _real_extract(self, url): headers = self.geo_verification_headers() entries = [] + params = parse_qs(url) + query = {} if not self._get_cookies('https://api.bilibili.com').get('buvid3'): self._set_cookie('.bilibili.com', 'buvid3', f'{uuid.uuid4()}infoc') - search_type, raw_playlist_id = self._match_valid_url(url).group('type', 'id') + search_type = self._match_valid_url(url).group('type') + raw_playlist_id = traverse_obj(params, ('keyword', 0)) + if raw_playlist_id is None: + raise ExtractorError('Please specify the keyword to search for!', expected=True) playlist_id = urllib.parse.unquote_plus(raw_playlist_id) search_type_mapping = { 'video': 'video', @@ -2343,24 +2356,38 @@ def _real_extract(self, url): 'live': 'live_room', 'upuser': 'bili_user', } + valid_params = [ + 'keyword', + 'page', + 'order', + 'duration', + 'tids', + 'search_type', + 'order_sort', + 'user_type', + ] + for valid_param in valid_params: + param_value = traverse_obj(params, (valid_param, 0)) + if param_value is not None: + query[valid_param] = param_value live_room_prefix = 'https://live.bilibili.com/' bili_user_prefix = 'https://space.bilibili.com/' + if search_type == 'live' and traverse_obj(params, ('search_type', 0)) == 'live_user': + raise ExtractorError('Live users are not downloadable!', expected=True) if search_type == 'all': try: search_all_result = self._download_json( r'https://api.bilibili.com/x/web-interface/search/all/v2', - video_id=playlist_id, query={ - 'keyword': playlist_id, - }, headers=headers) + video_id=playlist_id, query=query, headers=headers) except ExtractorError as e: if isinstance(e.cause, HTTPError) and e.cause.status == 412: raise ExtractorError('Request is blocked by server (-412).', expected=True) + raise status_code = search_all_result['code'] if status_code == -400: raise ExtractorError('Invalid request (-400).', expected=True) result_list = search_all_result['data'].get('result') if result_list is None: - self.write_debug(f'Response: {search_all_result}') raise ExtractorError(f'Result not found in the response ({status_code}).', expected=True) for result_type_dict in result_list: @@ -2378,20 +2405,21 @@ def _real_extract(self, url): search_type_result = self._download_json( r'https://api.bilibili.com/x/web-interface/search/type', video_id=playlist_id, query={ - 'keyword': playlist_id, 'search_type': search_type_mapping[search_type], + **query, # search_type in type is overridden when specified in url params }, headers=headers) except ExtractorError as e: if isinstance(e.cause, HTTPError) and e.cause.status == 412: - raise ExtractorError('Request is blocked by server (-412).', expected=True) + raise ExtractorError('Request is blocked by server (-412).') + raise status_code = search_type_result['code'] if status_code == -400: - raise ExtractorError('Invalid request (-400).', expected=True) + raise ExtractorError('Invalid request (-400).') result_list = search_type_result['data'].get('result') if result_list is None: - self.write_debug(f'Response: {search_type_result}') - raise ExtractorError(f'Result not found in the response ({status_code}).', - expected=True) + raise ExtractorError( + f'Result not found in the response ({status_code}). ' + 'You might want to try a VPN or a proxy server (with --proxy)', expected=True) for result_data in result_list: if result_data['type'] == 'video': entries.append(self.url_result(result_data['arcurl'])) From ddca2384236c3e563fdb39543bd84b020b740d1e Mon Sep 17 00:00:00 2001 From: grqx_wsl <173253225+grqx@users.noreply.github.com> Date: Sun, 30 Jun 2024 20:53:08 +1200 Subject: [PATCH 06/16] modified: yt_dlp/extractor/bilibili.py --- yt_dlp/extractor/bilibili.py | 78 +++++++++++++++++++++++------------- 1 file changed, 50 insertions(+), 28 deletions(-) diff --git a/yt_dlp/extractor/bilibili.py b/yt_dlp/extractor/bilibili.py index fcebc4511..717932448 100644 --- a/yt_dlp/extractor/bilibili.py +++ b/yt_dlp/extractor/bilibili.py @@ -1530,38 +1530,44 @@ class BiliBiliSearchAllIE(SearchInfoExtractor): def _search_results(self, query): headers = self.geo_verification_headers() + page_size = 50 live_room_prefix = 'https://live.bilibili.com/' bili_user_prefix = 'https://space.bilibili.com/' if not self._get_cookies('https://api.bilibili.com').get('buvid3'): self._set_cookie('.bilibili.com', 'buvid3', f'{uuid.uuid4()}infoc') for page_num in itertools.count(1): + query_params = { + 'keyword': query, + 'page': page_num, + 'dynamic_offset': (page_num - 1) * page_size, + 'platform': 'pc', + } + api_url = r'https://api.bilibili.com/x/web-interface/search/all/v2' try: search_all_result = self._download_json( - r'https://api.bilibili.com/x/web-interface/search/all/v2', - video_id=query, query={ - 'keyword': query, - 'page': page_num, - }, headers=headers) + api_url, video_id=query, query=query_params, headers=headers) except ExtractorError as e: if isinstance(e.cause, HTTPError) and e.cause.status == 412: raise ExtractorError('Request is blocked by server (-412).', expected=True) + raise status_code = search_all_result['code'] if status_code == -400: raise ExtractorError('Invalid request (-400).', expected=True) result_list = search_all_result['data'].get('result') - if result_list is None: + if not result_list: self.write_debug(f'Response: {search_all_result}') raise ExtractorError(f'Result not found in the response ({status_code}).', expected=True) for result_type_dict in result_list: for result_data in result_type_dict['data']: - if result_data['type'] == 'video': + result_type = result_data.get('type') + if result_type == 'video': yield self.url_result(result_data['arcurl']) - elif result_data['type'] == 'live_room': + elif result_type == 'live_room': yield self.url_result(live_room_prefix + str(result_data['roomid'])) - elif result_data['type'] in ['media_ft', 'media_bangumi']: + elif result_type in ['media_ft', 'media_bangumi']: yield self.url_result(result_data['url']) - elif result_data['type'] == 'bili_user': + elif result_type == 'bili_user': yield self.url_result(bili_user_prefix + str(result_data['mid'])) @@ -2315,7 +2321,7 @@ class BiliBiliSearchPageIE(BilibiliBaseIE): _VALID_URL = r'https?://search\.bilibili\.com/(?Pall|video|bangumi|pgc|live|upuser).*' _TESTS = [{ 'url': r'https://search.bilibili.com/all?keyword=yt+-+dlp+%E4%B8%8B%E8%BD%BD%E5%99%A8', - 'playlist_count': 20, + 'playlist_count': 36, 'info_dict': { 'id': 'yt - dlp 下载器', 'title': 'yt - dlp 下载器', @@ -2330,7 +2336,7 @@ class BiliBiliSearchPageIE(BilibiliBaseIE): 'skip': 'geo-restricted', }, { 'url': r'https://search.bilibili.com/video?keyword=%E8%AE%A9%E5%AD%90%E5%BC%B9%E9%A3%9E&from_source=webtop_search&spm_id_from=333.1007&search_source=5&order=dm&duration=4&tids=181&page=3&o=72', - 'playlist_mincount': 5, + 'playlist_count': 4, 'info_dict': { 'id': '让子弹飞', 'title': '让子弹飞', @@ -2338,15 +2344,20 @@ class BiliBiliSearchPageIE(BilibiliBaseIE): }] def _real_extract(self, url): + live_room_prefix = 'https://live.bilibili.com/' + bili_user_prefix = 'https://space.bilibili.com/' headers = self.geo_verification_headers() entries = [] params = parse_qs(url) - query = {} + query = { + 'platform': 'pc', + 'page_size': 36, + } if not self._get_cookies('https://api.bilibili.com').get('buvid3'): self._set_cookie('.bilibili.com', 'buvid3', f'{uuid.uuid4()}infoc') search_type = self._match_valid_url(url).group('type') raw_playlist_id = traverse_obj(params, ('keyword', 0)) - if raw_playlist_id is None: + if not raw_playlist_id: raise ExtractorError('Please specify the keyword to search for!', expected=True) playlist_id = urllib.parse.unquote_plus(raw_playlist_id) search_type_mapping = { @@ -2355,6 +2366,7 @@ def _real_extract(self, url): 'pgc': 'media_ft', 'live': 'live_room', 'upuser': 'bili_user', + 'all': 'video', # 'all' search calls video search after page 1 } valid_params = [ 'keyword', @@ -2362,7 +2374,7 @@ def _real_extract(self, url): 'order', 'duration', 'tids', - 'search_type', + 'search_type', # Only when searching for live_room or live_user 'order_sort', 'user_type', ] @@ -2370,11 +2382,17 @@ def _real_extract(self, url): param_value = traverse_obj(params, (valid_param, 0)) if param_value is not None: query[valid_param] = param_value - live_room_prefix = 'https://live.bilibili.com/' - bili_user_prefix = 'https://space.bilibili.com/' + page_num = int(query.get('page', 1)) + param_offset = int_or_none(traverse_obj(params, ('o', 0))) + if page_num == 1: + query['dynamic_offset'] = 0 + elif param_offset is not None: + query['dynamic_offset'] = param_offset + else: + query['dynamic_offset'] = query['page_size'] * (page_num - 1) if search_type == 'live' and traverse_obj(params, ('search_type', 0)) == 'live_user': raise ExtractorError('Live users are not downloadable!', expected=True) - if search_type == 'all': + if search_type == 'all' and page_num == 1: try: search_all_result = self._download_json( r'https://api.bilibili.com/x/web-interface/search/all/v2', @@ -2387,18 +2405,20 @@ def _real_extract(self, url): if status_code == -400: raise ExtractorError('Invalid request (-400).', expected=True) result_list = search_all_result['data'].get('result') - if result_list is None: + if not result_list: + self.write_debug(f'Response: {search_all_result}') raise ExtractorError(f'Result not found in the response ({status_code}).', expected=True) for result_type_dict in result_list: for result_data in result_type_dict['data']: - if result_data['type'] == 'video': + result_type = result_data.get('type') + if result_type == 'video': entries.append(self.url_result(result_data['arcurl'])) - elif result_data['type'] == 'live_room': + elif result_type == 'live_room': entries.append(self.url_result(live_room_prefix + str(result_data['roomid']))) - elif result_data['type'] in ['media_ft', 'media_bangumi']: + elif result_type in ['media_ft', 'media_bangumi']: entries.append(self.url_result(result_data['url'])) - elif result_data['type'] == 'bili_user': + elif result_type == 'bili_user': entries.append(self.url_result(bili_user_prefix + str(result_data['mid']))) else: try: @@ -2416,18 +2436,20 @@ def _real_extract(self, url): if status_code == -400: raise ExtractorError('Invalid request (-400).') result_list = search_type_result['data'].get('result') - if result_list is None: + if not result_list: + self.write_debug(f'Response: {search_type_result}') raise ExtractorError( f'Result not found in the response ({status_code}). ' 'You might want to try a VPN or a proxy server (with --proxy)', expected=True) for result_data in result_list: - if result_data['type'] == 'video': + result_type = result_data.get('type') + if result_type == 'video': entries.append(self.url_result(result_data['arcurl'])) - elif result_data['type'] == 'live_room': + elif result_type == 'live_room': entries.append(self.url_result(live_room_prefix + str(result_data['roomid']))) - elif result_data['type'] in ['media_ft', 'media_bangumi']: + elif result_type in ['media_ft', 'media_bangumi']: entries.append(self.url_result(result_data['url'])) - elif result_data['type'] == 'bili_user': + elif result_type == 'bili_user': entries.append(self.url_result(bili_user_prefix + str(result_data['mid']))) return self.playlist_result(entries, playlist_id=playlist_id, playlist_title=playlist_id) From d6842fcd7f4813d284f7d9df3e49440bed2b0c44 Mon Sep 17 00:00:00 2001 From: grqx_wsl <173253225+grqx@users.noreply.github.com> Date: Mon, 1 Jul 2024 10:33:22 +1200 Subject: [PATCH 07/16] change search key from bilisearchall to biliallsearch to disambiguate --- yt_dlp/extractor/bilibili.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/yt_dlp/extractor/bilibili.py b/yt_dlp/extractor/bilibili.py index 717932448..fb33591a5 100644 --- a/yt_dlp/extractor/bilibili.py +++ b/yt_dlp/extractor/bilibili.py @@ -1492,9 +1492,9 @@ def _real_extract(self, url): class BiliBiliSearchAllIE(SearchInfoExtractor): IE_DESC = 'Bilibili all search' _MAX_RESULTS = 100000 - _SEARCH_KEY = 'bilisearchall' + _SEARCH_KEY = 'biliallsearch' _TESTS = [{ - 'url': 'bilisearchall3:靡烟 出道一年,我怎么还在等你单推的女人睡觉后开播啊', + 'url': 'biliallsearch3:靡烟 出道一年,我怎么还在等你单推的女人睡觉后开播啊', 'playlist_count': 3, 'info_dict': { 'id': '靡烟 出道一年,我怎么还在等你单推的女人睡觉后开播啊', @@ -1520,7 +1520,7 @@ class BiliBiliSearchAllIE(SearchInfoExtractor): }, }], }, { - 'url': 'bilisearchall:LOL', + 'url': 'biliallsearch:LOL', 'playlist_count': 1, 'info_dict': { 'id': 'LOL', From ca780a228c86d769ba7ffbe3d0c7a7a6c95c5464 Mon Sep 17 00:00:00 2001 From: grqx_wsl <173253225+grqx@users.noreply.github.com> Date: Mon, 1 Jul 2024 16:32:10 +1200 Subject: [PATCH 08/16] Fix search key(prefix) in supportedsites.md --- supportedsites.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/supportedsites.md b/supportedsites.md index e40e5be03..d0c10d0cd 100644 --- a/supportedsites.md +++ b/supportedsites.md @@ -174,7 +174,7 @@ # Supported sites - **BilibiliFavoritesList** - **BiliBiliPlayer** - **BilibiliPlaylist** - - **BiliBiliSearchAll**: Bilibili all search; "bilisearchall:" prefix + - **BiliBiliSearchAll**: Bilibili all search; "biliallsearch:" prefix - **BiliBiliSearch**: Bilibili video search; "bilisearch:" prefix - **BiliBiliSearchPage** - **BilibiliSeriesList** From d079c1a67f155e7641e0b50d7f4fb8280c6d2dcf Mon Sep 17 00:00:00 2001 From: grqx_wsl <173253225+grqx@users.noreply.github.com> Date: Tue, 2 Jul 2024 18:25:34 +1200 Subject: [PATCH 09/16] WBI signature --- yt_dlp/extractor/bilibili.py | 26 ++++++++++++++++---------- 1 file changed, 16 insertions(+), 10 deletions(-) diff --git a/yt_dlp/extractor/bilibili.py b/yt_dlp/extractor/bilibili.py index f902c1b2e..083968cc8 100644 --- a/yt_dlp/extractor/bilibili.py +++ b/yt_dlp/extractor/bilibili.py @@ -1657,7 +1657,7 @@ def _real_extract(self, url): return self.playlist_result(self._entries(category, subcategory, query), query, query) -class BiliBiliSearchAllIE(SearchInfoExtractor): +class BiliBiliSearchAllIE(SearchInfoExtractor, BilibiliBaseIE): IE_DESC = 'Bilibili all search' _MAX_RESULTS = 100000 _SEARCH_KEY = 'biliallsearch' @@ -1698,6 +1698,7 @@ class BiliBiliSearchAllIE(SearchInfoExtractor): def _search_results(self, query): headers = self.geo_verification_headers() + headers['Referer'] = 'https://www.bilibili.com/' page_size = 50 live_room_prefix = 'https://live.bilibili.com/' bili_user_prefix = 'https://space.bilibili.com/' @@ -1710,10 +1711,12 @@ def _search_results(self, query): 'dynamic_offset': (page_num - 1) * page_size, 'platform': 'pc', } - api_url = r'https://api.bilibili.com/x/web-interface/search/all/v2' + api_url = r'https://api.bilibili.com/x/web-interface/wbi/search/all/v2' try: search_all_result = self._download_json( - api_url, video_id=query, query=query_params, headers=headers) + api_url, video_id=query, query=self._sign_wbi(query_params, query), + headers=headers, + ) except ExtractorError as e: if isinstance(e.cause, HTTPError) and e.cause.status == 412: raise ExtractorError('Request is blocked by server (-412).', expected=True) @@ -2517,6 +2520,7 @@ def _real_extract(self, url): live_room_prefix = 'https://live.bilibili.com/' bili_user_prefix = 'https://space.bilibili.com/' headers = self.geo_verification_headers() + headers['Referer'] = url entries = [] params = parse_qs(url) query = { @@ -2565,8 +2569,8 @@ def _real_extract(self, url): if search_type == 'all' and page_num == 1: try: search_all_result = self._download_json( - r'https://api.bilibili.com/x/web-interface/search/all/v2', - video_id=playlist_id, query=query, headers=headers) + r'https://api.bilibili.com/x/web-interface/wbi/search/all/v2', + video_id=playlist_id, query=self._sign_wbi(query, playlist_id), headers=headers) except ExtractorError as e: if isinstance(e.cause, HTTPError) and e.cause.status == 412: raise ExtractorError('Request is blocked by server (-412).', expected=True) @@ -2591,13 +2595,15 @@ def _real_extract(self, url): elif result_type == 'bili_user': entries.append(self.url_result(bili_user_prefix + str(result_data['mid']))) else: + query = { + 'search_type': search_type_mapping[search_type], + **query, # search_type in type is overridden when specified in url params + } try: search_type_result = self._download_json( - r'https://api.bilibili.com/x/web-interface/search/type', - video_id=playlist_id, query={ - 'search_type': search_type_mapping[search_type], - **query, # search_type in type is overridden when specified in url params - }, headers=headers) + r'https://api.bilibili.com/x/web-interface/wbi/search/type', + video_id=playlist_id, query=self._sign_wbi(query, playlist_id), headers=headers, + ) except ExtractorError as e: if isinstance(e.cause, HTTPError) and e.cause.status == 412: raise ExtractorError('Request is blocked by server (-412).') From cbbf6ad2cda34fc93396e791e1da733cd8694fd2 Mon Sep 17 00:00:00 2001 From: grqx_wsl <173253225+grqx@users.noreply.github.com> Date: Tue, 2 Jul 2024 19:47:54 +1200 Subject: [PATCH 10/16] ci run dl --- yt_dlp/extractor/bilibili.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/yt_dlp/extractor/bilibili.py b/yt_dlp/extractor/bilibili.py index 083968cc8..305f649b8 100644 --- a/yt_dlp/extractor/bilibili.py +++ b/yt_dlp/extractor/bilibili.py @@ -2509,7 +2509,7 @@ class BiliBiliSearchPageIE(BilibiliBaseIE): 'skip': 'geo-restricted', }, { 'url': r'https://search.bilibili.com/video?keyword=%E8%AE%A9%E5%AD%90%E5%BC%B9%E9%A3%9E&from_source=webtop_search&spm_id_from=333.1007&search_source=5&order=dm&duration=4&tids=181&page=3&o=72', - 'playlist_count': 4, + 'playlist_mincount': 4, 'info_dict': { 'id': '让子弹飞', 'title': '让子弹飞', From 4c16680c00fdc8029dcc20b4f14e495086b4c3b9 Mon Sep 17 00:00:00 2001 From: grqx_wsl <173253225+grqx@users.noreply.github.com> Date: Wed, 3 Jul 2024 22:25:53 +1200 Subject: [PATCH 11/16] (Empty commit) ci run dl all From 853a9224f569931e54c9e78429c4d80b54942a21 Mon Sep 17 00:00:00 2001 From: grqx_wsl <173253225+grqx@users.noreply.github.com> Date: Fri, 19 Jul 2024 22:38:21 +1200 Subject: [PATCH 12/16] apply patch from comment https://github.com/yt-dlp/yt-dlp/pull/10317#issuecomment-2202289727 --- yt_dlp/extractor/bilibili.py | 17 ++++++----------- 1 file changed, 6 insertions(+), 11 deletions(-) diff --git a/yt_dlp/extractor/bilibili.py b/yt_dlp/extractor/bilibili.py index 305f649b8..0a2d197ce 100644 --- a/yt_dlp/extractor/bilibili.py +++ b/yt_dlp/extractor/bilibili.py @@ -1742,7 +1742,7 @@ def _search_results(self, query): yield self.url_result(bili_user_prefix + str(result_data['mid'])) -class BiliBiliSearchIE(SearchInfoExtractor): +class BiliBiliSearchIE(SearchInfoExtractor, BilibiliBaseIE): IE_DESC = 'Bilibili video search' _MAX_RESULTS = 100000 _SEARCH_KEY = 'bilisearch' @@ -1777,21 +1777,16 @@ class BiliBiliSearchIE(SearchInfoExtractor): def _search_results(self, query): if not self._get_cookies('https://api.bilibili.com').get('buvid3'): self._set_cookie('.bilibili.com', 'buvid3', f'{uuid.uuid4()}infoc') + headers = self.geo_verification_headers() + headers['Referer'] = 'https://www.bilibili.com/' for page_num in itertools.count(1): videos = self._download_json( - 'https://api.bilibili.com/x/web-interface/search/type', query, - note=f'Extracting results from page {page_num}', query={ - 'Search_key': query, + 'https://api.bilibili.com/x/web-interface/wbi/search/type', query, + note=f'Extracting results from page {page_num}', query=self._sign_wbi({ 'keyword': query, 'page': page_num, - 'context': '', - 'duration': 0, - 'tids_2': '', - '__refresh__': 'true', 'search_type': 'video', - 'tids': 0, - 'highlight': 1, - })['data'].get('result') + }, query), headers=headers)['data'].get('result') if not videos: break for video in videos: From 50271dbd80febc77b1cd35a12b1f714b7b671cbb Mon Sep 17 00:00:00 2001 From: grqx_wsl <173253225+grqx@users.noreply.github.com> Date: Sun, 21 Jul 2024 12:27:07 +1200 Subject: [PATCH 13/16] add IE_DESC for BiliBiliSearchPageIE --- yt_dlp/extractor/bilibili.py | 1 + 1 file changed, 1 insertion(+) diff --git a/yt_dlp/extractor/bilibili.py b/yt_dlp/extractor/bilibili.py index 0a2d197ce..362f5fc38 100644 --- a/yt_dlp/extractor/bilibili.py +++ b/yt_dlp/extractor/bilibili.py @@ -2486,6 +2486,7 @@ def _real_extract(self, url): class BiliBiliSearchPageIE(BilibiliBaseIE): + IE_DESC = 'Bilibili Search Page URL Extractor' _VALID_URL = r'https?://search\.bilibili\.com/(?Pall|video|bangumi|pgc|live|upuser).*' _TESTS = [{ 'url': r'https://search.bilibili.com/all?keyword=yt+-+dlp+%E4%B8%8B%E8%BD%BD%E5%99%A8', From b6f26805a77abf6d42c5e0c1454d8692eec5d4e6 Mon Sep 17 00:00:00 2001 From: grqx_wsl <173253225+grqx@users.noreply.github.com> Date: Sun, 21 Jul 2024 20:29:52 +1200 Subject: [PATCH 14/16] revert all changes to supportedsites.md as it's automatically generated by `devscripts/make_supportedsites.py` --- supportedsites.md | 2 -- 1 file changed, 2 deletions(-) diff --git a/supportedsites.md b/supportedsites.md index db9697bcb..656366b4a 100644 --- a/supportedsites.md +++ b/supportedsites.md @@ -175,9 +175,7 @@ # Supported sites - **BilibiliFavoritesList** - **BiliBiliPlayer** - **BilibiliPlaylist** - - **BiliBiliSearchAll**: Bilibili all search; "biliallsearch:" prefix - **BiliBiliSearch**: Bilibili video search; "bilisearch:" prefix - - **BiliBiliSearchPage** - **BilibiliSeriesList** - **BilibiliSpaceAudio** - **BilibiliSpaceVideo** From 2da3e052ed8a5bcbc6594ed0dc9dfb5f16446359 Mon Sep 17 00:00:00 2001 From: grqx_wsl <173253225+grqx@users.noreply.github.com> Date: Sat, 17 Aug 2024 18:48:50 +1200 Subject: [PATCH 15/16] [BiliBiliSearchBaseIE, BiliBiliSearchAllIE, BiliBiliSearchPageIE] more compact - wrapped search result extractor into a method of a baseie(BiliBiliSearchBaseIE) - used `traverse_obj` for traversal through search result instead of `for` loops Resolves https://github.com/yt-dlp/yt-dlp/pull/10317#discussion_r1703943629 --- yt_dlp/extractor/bilibili.py | 62 ++++++++++++++---------------------- 1 file changed, 24 insertions(+), 38 deletions(-) diff --git a/yt_dlp/extractor/bilibili.py b/yt_dlp/extractor/bilibili.py index 72394f172..0e47e14a9 100644 --- a/yt_dlp/extractor/bilibili.py +++ b/yt_dlp/extractor/bilibili.py @@ -1661,7 +1661,23 @@ def _real_extract(self, url): return self.playlist_result(self._entries(category, subcategory, query), query, query) -class BiliBiliSearchAllIE(SearchInfoExtractor, BilibiliBaseIE): +class BiliBiliSearchBaseIE(BilibiliBaseIE): + def _extract_search_result(self, result_data): + live_room_prefix = 'https://live.bilibili.com/' + bili_user_prefix = 'https://space.bilibili.com/' + self.write_debug('debug_ entry ext reslt') + result_type = result_data.get('type') + if result_type == 'video': + return self.url_result(result_data['arcurl']) + elif result_type == 'live_room': + return self.url_result(live_room_prefix + str(result_data['roomid'])) + elif result_type in ['media_ft', 'media_bangumi']: + return self.url_result(result_data['url']) + elif result_type == 'bili_user': + return self.url_result(bili_user_prefix + str(result_data['mid'])) + + +class BiliBiliSearchAllIE(SearchInfoExtractor, BiliBiliSearchBaseIE): IE_DESC = 'Bilibili all search' _MAX_RESULTS = 100000 _SEARCH_KEY = 'biliallsearch' @@ -1704,8 +1720,6 @@ def _search_results(self, query): headers = self.geo_verification_headers() headers['Referer'] = 'https://www.bilibili.com/' page_size = 50 - live_room_prefix = 'https://live.bilibili.com/' - bili_user_prefix = 'https://space.bilibili.com/' if not self._get_cookies('https://api.bilibili.com').get('buvid3'): self._set_cookie('.bilibili.com', 'buvid3', f'{uuid.uuid4()}infoc') for page_num in itertools.count(1): @@ -1733,17 +1747,8 @@ def _search_results(self, query): self.write_debug(f'Response: {search_all_result}') raise ExtractorError(f'Result not found in the response ({status_code}).', expected=True) - for result_type_dict in result_list: - for result_data in result_type_dict['data']: - result_type = result_data.get('type') - if result_type == 'video': - yield self.url_result(result_data['arcurl']) - elif result_type == 'live_room': - yield self.url_result(live_room_prefix + str(result_data['roomid'])) - elif result_type in ['media_ft', 'media_bangumi']: - yield self.url_result(result_data['url']) - elif result_type == 'bili_user': - yield self.url_result(bili_user_prefix + str(result_data['mid'])) + for result_data in traverse_obj(result_list, (..., 'data', ...)): + yield self._extract_search_result(result_data) class BiliBiliSearchIE(SearchInfoExtractor, BilibiliBaseIE): @@ -2489,7 +2494,7 @@ def _real_extract(self, url): } -class BiliBiliSearchPageIE(BilibiliBaseIE): +class BiliBiliSearchPageIE(BiliBiliSearchBaseIE): IE_DESC = 'Bilibili Search Page URL Extractor' _VALID_URL = r'https?://search\.bilibili\.com/(?Pall|video|bangumi|pgc|live|upuser).*' _TESTS = [{ @@ -2517,8 +2522,6 @@ class BiliBiliSearchPageIE(BilibiliBaseIE): }] def _real_extract(self, url): - live_room_prefix = 'https://live.bilibili.com/' - bili_user_prefix = 'https://space.bilibili.com/' headers = self.geo_verification_headers() headers['Referer'] = url entries = [] @@ -2583,17 +2586,8 @@ def _real_extract(self, url): self.write_debug(f'Response: {search_all_result}') raise ExtractorError(f'Result not found in the response ({status_code}).', expected=True) - for result_type_dict in result_list: - for result_data in result_type_dict['data']: - result_type = result_data.get('type') - if result_type == 'video': - entries.append(self.url_result(result_data['arcurl'])) - elif result_type == 'live_room': - entries.append(self.url_result(live_room_prefix + str(result_data['roomid']))) - elif result_type in ['media_ft', 'media_bangumi']: - entries.append(self.url_result(result_data['url'])) - elif result_type == 'bili_user': - entries.append(self.url_result(bili_user_prefix + str(result_data['mid']))) + + entries = [self._extract_search_result(result_data) for result_data in traverse_obj(result_list, (..., 'data', ...))] else: query = { 'search_type': search_type_mapping[search_type], @@ -2617,15 +2611,7 @@ def _real_extract(self, url): raise ExtractorError( f'Result not found in the response ({status_code}). ' 'You might want to try a VPN or a proxy server (with --proxy)', expected=True) - for result_data in result_list: - result_type = result_data.get('type') - if result_type == 'video': - entries.append(self.url_result(result_data['arcurl'])) - elif result_type == 'live_room': - entries.append(self.url_result(live_room_prefix + str(result_data['roomid']))) - elif result_type in ['media_ft', 'media_bangumi']: - entries.append(self.url_result(result_data['url'])) - elif result_type == 'bili_user': - entries.append(self.url_result(bili_user_prefix + str(result_data['mid']))) + + entries = [self._extract_search_result(result_data) for result_data in result_list] return self.playlist_result(entries, playlist_id=playlist_id, playlist_title=playlist_id) From 69054b483f6e26a0d224210d8117d7d451b7e453 Mon Sep 17 00:00:00 2001 From: grqx_wsl <173253225+grqx@users.noreply.github.com> Date: Sat, 17 Aug 2024 20:51:55 +1200 Subject: [PATCH 16/16] [BiliBiliSearchBaseIE] removed an unnecessary write_debug sentence --- yt_dlp/extractor/bilibili.py | 1 - 1 file changed, 1 deletion(-) diff --git a/yt_dlp/extractor/bilibili.py b/yt_dlp/extractor/bilibili.py index 0e47e14a9..132d5c4de 100644 --- a/yt_dlp/extractor/bilibili.py +++ b/yt_dlp/extractor/bilibili.py @@ -1665,7 +1665,6 @@ class BiliBiliSearchBaseIE(BilibiliBaseIE): def _extract_search_result(self, result_data): live_room_prefix = 'https://live.bilibili.com/' bili_user_prefix = 'https://space.bilibili.com/' - self.write_debug('debug_ entry ext reslt') result_type = result_data.get('type') if result_type == 'video': return self.url_result(result_data['arcurl'])