From 4af718374e82e55460ae90fbf7282756556a31ec Mon Sep 17 00:00:00 2001 From: kclauhk <78251477+kclauhk@users.noreply.github.com> Date: Sat, 31 Aug 2024 18:35:39 +0800 Subject: [PATCH 1/5] [ie/vmware] Add extractor --- yt_dlp/extractor/_extractors.py | 4 ++ yt_dlp/extractor/brightcove.py | 3 +- yt_dlp/extractor/vmware.py | 100 ++++++++++++++++++++++++++++++++ 3 files changed, 106 insertions(+), 1 deletion(-) create mode 100644 yt_dlp/extractor/vmware.py diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py index a3610dc976..891ca84ccc 100644 --- a/yt_dlp/extractor/_extractors.py +++ b/yt_dlp/extractor/_extractors.py @@ -2365,6 +2365,10 @@ VKUserVideosIE, VKWallPostIE, ) +from .vmware import ( + VMwareExploreIE, + VMwareExploreSearchIE, +) from .vocaroo import VocarooIE from .vodpl import VODPlIE from .vodplatform import VODPlatformIE diff --git a/yt_dlp/extractor/brightcove.py b/yt_dlp/extractor/brightcove.py index 2526f25dac..e621046253 100644 --- a/yt_dlp/extractor/brightcove.py +++ b/yt_dlp/extractor/brightcove.py @@ -600,7 +600,8 @@ def build_format_id(kind): return { 'id': video_id, 'title': title, - 'description': clean_html(json_data.get('description')), + 'description': clean_html(join_nonempty('description', 'long_description', + from_dict=json_data, delim='
')), 'thumbnails': thumbnails, 'duration': duration, 'timestamp': parse_iso8601(json_data.get('published_at')), diff --git a/yt_dlp/extractor/vmware.py b/yt_dlp/extractor/vmware.py new file mode 100644 index 0000000000..f219a5c757 --- /dev/null +++ b/yt_dlp/extractor/vmware.py @@ -0,0 +1,100 @@ +import itertools + +from .common import InfoExtractor, SearchInfoExtractor + + +class VMwareExploreIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?vmware\.com/explore/video-library/video/(?P<id>\d+)' + _TESTS = [{ + 'url': 'https://www.vmware.com/explore/video-library/video/6360758183112', + 'info_dict': { + 'id': '6360758183112', + 'ext': 'mp4', + 'title': 'VCFB1440LV', + 'description': r're:^All About vSphere 8: What\'s New in the Technology', + 'thumbnail': 'https://cf-images.us-east-1.prod.boltdns.net/v1/jit/6164421911001/cde65c5a-51ff-4a0c-905f-ed71e25c0f2c/main/1920x1080/22m53s824ms/match/image.jpg', + 'tags': 'count:6', + 'timestamp': 1724585610, + 'upload_date': '20240825', + 'uploader_id': '6164421911001', + 'duration': 2747.648, + }, + }, { + 'url': 'https://www.vmware.com/explore/video-library/video/6360759173112', + 'info_dict': { + 'id': '6360759173112', + 'ext': 'mp4', + 'title': 'AODB1676LV', + 'description': r're:^Automation, Analytics and Intelligence: Our Quest for Operational Excellence', + 'thumbnail': 'https://cf-images.us-east-1.prod.boltdns.net/v1/jit/6164421911001/56cc0c8e-9d51-4c25-9d97-4b7364989c47/main/1920x1080/14m18s858ms/match/image.jpg', + 'tags': 'count:6', + 'timestamp': 1724585574, + 'upload_date': '20240825', + 'uploader_id': '6164421911001', + 'duration': 1717.717, + }, + }, { + 'url': 'https://www.vmware.com/explore/video-library/video/6360760732112', + 'info_dict': { + 'id': '6360760732112', + 'ext': 'mp4', + 'title': 'ANSB1976LV', + 'description': r're:^The Conman of the Digital Era — Ransomware', + 'thumbnail': 
'https://cf-images.us-east-1.prod.boltdns.net/v1/jit/6164421911001/4ec22e41-7812-49d9-9fc8-5dbcf1ef4b3c/main/1920x1080/22m36s555ms/match/image.jpg', + 'tags': 'count:6', + 'timestamp': 1724585612, + 'upload_date': '20240825', + 'uploader_id': '6164421911001', + 'duration': 2713.11, + }, + }] + BRIGHTCOVE_URL_TEMPLATE = 'https://players.brightcove.net/6164421911001/lUBT2rAMW_default/index.html?videoId=%s' + + def _real_extract(self, url): + video_id = self._match_id(url) + return self.url_result(self.BRIGHTCOVE_URL_TEMPLATE % video_id, url_transparent=True) + + +class VMwareExploreSearchIE(SearchInfoExtractor): + IE_NAME = 'VMwareExplore:search' + _SEARCH_KEY = 'vmwaresearch' + _TESTS = [{ + 'url': 'vmwaresearch10:*', + 'playlist_count': 10, + 'info_dict': { + 'id': '*', + 'title': '*', + }, + }, { + 'url': 'vmwaresearchall:ransomware', + 'playlist_count': 15, + 'info_dict': { + 'id': 'ransomware', + 'title': 'ransomware', + }, + }] + _URL_TEMPLATE = 'https://www.vmware.com/explore/video-library/video/%s' + + def _search_results(self, query): + def search_query(query, offset, limit, total_count): + # search api: + # https://www.vmware.com/api/nocache/tools/brightcove/search?q=%2B{query}%20%2Byear:2023:2024%20%20-vod_on_demand_publish:%22False%22%2Bcomplete:%22true%22%2Bstate:%22ACTIVE%22&limit=12&offset=0&sort=-updated_at&account=explore + return self._download_json( + 'https://www.vmware.com/api/nocache/tools/brightcove/search', query, + note=f'Downloading result {offset + 1}-{min(offset + limit, total_count or 99999999)}', query={ + 'q': f'+{query} -vod_on_demand_publish:"False"+complete:"true"+state:"ACTIVE"', + 'limit': limit, + 'offset': offset, + 'sort': 'updated_at', # chronological ascending order. 
For descending order: '-updated_at' + 'account': 'explore', + }) + + limit, total_count = 100, None # limit: maximum 100 + for i in itertools.count(): + search_results = search_query(query, i * limit, limit, total_count) + total_count = search_results.get('count', 0) + for video in search_results.get('videos', []): + if video_id := video.get('id'): + yield self.url_result(self._URL_TEMPLATE % video_id) + if (i + 1) * limit >= total_count: + break From e3309bc775474e191ad6426ebb69bc831ccadf65 Mon Sep 17 00:00:00 2001 From: kclauhk <78251477+kclauhk@users.noreply.github.com> Date: Sat, 7 Sep 2024 16:37:07 +0800 Subject: [PATCH 2/5] Update test data --- yt_dlp/extractor/vmware.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/yt_dlp/extractor/vmware.py b/yt_dlp/extractor/vmware.py index f219a5c757..1a14ab0709 100644 --- a/yt_dlp/extractor/vmware.py +++ b/yt_dlp/extractor/vmware.py @@ -12,7 +12,7 @@ class VMwareExploreIE(InfoExtractor): 'ext': 'mp4', 'title': 'VCFB1440LV', 'description': r're:^All About vSphere 8: What\'s New in the Technology', - 'thumbnail': 'https://cf-images.us-east-1.prod.boltdns.net/v1/jit/6164421911001/cde65c5a-51ff-4a0c-905f-ed71e25c0f2c/main/1920x1080/22m53s824ms/match/image.jpg', + 'thumbnail': 'https://cf-images.us-east-1.prod.boltdns.net/v1/static/6164421911001/cde65c5a-51ff-4a0c-905f-ed71e25c0f2c/80d7489b-7b65-47d9-b30c-8056b132892f/1920x1080/match/image.jpg', 'tags': 'count:6', 'timestamp': 1724585610, 'upload_date': '20240825', From d4768b66eaea65c361d333a03bebbcec030bffde Mon Sep 17 00:00:00 2001 From: kclauhk Date: Sun, 27 Oct 2024 03:07:03 +0800 Subject: [PATCH 3/5] Add support to VMware Video Library (not Explore) --- yt_dlp/extractor/_extractors.py | 3 +- yt_dlp/extractor/vmware.py | 89 ++++++++++++++++++++++++--------- 2 files changed, 68 insertions(+), 24 deletions(-) diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py index 891ca84ccc..008294becb 100644 --- 
a/yt_dlp/extractor/_extractors.py +++ b/yt_dlp/extractor/_extractors.py @@ -2367,7 +2367,8 @@ ) from .vmware import ( VMwareExploreIE, - VMwareExploreSearchIE, + VMwareIE, + VMwareSearchIE, ) from .vocaroo import VocarooIE from .vodpl import VODPlIE diff --git a/yt_dlp/extractor/vmware.py b/yt_dlp/extractor/vmware.py index 1a14ab0709..2385cee294 100644 --- a/yt_dlp/extractor/vmware.py +++ b/yt_dlp/extractor/vmware.py @@ -3,6 +3,44 @@ from .common import InfoExtractor, SearchInfoExtractor +class VMwareIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?vmware\.com/video/(?P<id>\d+)' + _TESTS = [{ + 'url': 'https://www.vmware.com/video/6362484671112', + 'info_dict': { + 'id': '6362484671112', + 'ext': 'mp4', + 'title': 'GCI Communications', + 'description': '', + 'thumbnail': r're:^https?://.*/image\.jpg', + 'tags': [], + 'timestamp': 1727345356, + 'upload_date': '20240926', + 'uploader_id': '6415665063001', + 'duration': 106.283, + }, + }, { + 'url': 'https://www.vmware.com/video/6350300466112', + 'info_dict': { + 'id': '6350300466112', + 'ext': 'mp4', + 'title': 'VMware Private AI', + 'description': r're:^Learn the significance of AI and Generative AI', + 'thumbnail': r're:^https?://.*/image\.jpg', + 'tags': 'count:8', + 'timestamp': 1712293111, + 'upload_date': '20240405', + 'uploader_id': '6415665063001', + 'duration': 3154.624, + }, + }] + BRIGHTCOVE_URL_TEMPLATE = 'https://players.brightcove.net/6415665063001/83iWkhhmz_default/index.html?videoId=%s' + + def _real_extract(self, url): + video_id = self._match_id(url) + return self.url_result(self.BRIGHTCOVE_URL_TEMPLATE % video_id, url_transparent=True) + + class VMwareExploreIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?vmware\.com/explore/video-library/video/(?P<id>\d+)' _TESTS = [{ @@ -12,7 +50,7 @@ class VMwareExploreIE(InfoExtractor): 'ext': 'mp4', 'title': 'VCFB1440LV', 'description': r're:^All About vSphere 8: What\'s New in the Technology', - 'thumbnail': 
'https://cf-images.us-east-1.prod.boltdns.net/v1/static/6164421911001/cde65c5a-51ff-4a0c-905f-ed71e25c0f2c/80d7489b-7b65-47d9-b30c-8056b132892f/1920x1080/match/image.jpg', + 'thumbnail': r're:^https?://.*/image\.jpg', 'tags': 'count:6', 'timestamp': 1724585610, 'upload_date': '20240825', @@ -26,7 +64,7 @@ class VMwareExploreIE(InfoExtractor): 'ext': 'mp4', 'title': 'AODB1676LV', 'description': r're:^Automation, Analytics and Intelligence: Our Quest for Operational Excellence', - 'thumbnail': 'https://cf-images.us-east-1.prod.boltdns.net/v1/jit/6164421911001/56cc0c8e-9d51-4c25-9d97-4b7364989c47/main/1920x1080/14m18s858ms/match/image.jpg', + 'thumbnail': r're:^https?://.*/image\.jpg', 'tags': 'count:6', 'timestamp': 1724585574, 'upload_date': '20240825', @@ -40,7 +78,7 @@ class VMwareExploreIE(InfoExtractor): 'ext': 'mp4', 'title': 'ANSB1976LV', 'description': r're:^The Conman of the Digital Era — Ransomware', - 'thumbnail': 'https://cf-images.us-east-1.prod.boltdns.net/v1/jit/6164421911001/4ec22e41-7812-49d9-9fc8-5dbcf1ef4b3c/main/1920x1080/22m36s555ms/match/image.jpg', + 'thumbnail': r're:^https?://.*/image\.jpg', 'tags': 'count:6', 'timestamp': 1724585612, 'upload_date': '20240825', @@ -55,46 +93,51 @@ def _real_extract(self, url): return self.url_result(self.BRIGHTCOVE_URL_TEMPLATE % video_id, url_transparent=True) -class VMwareExploreSearchIE(SearchInfoExtractor): - IE_NAME = 'VMwareExplore:search' +class VMwareSearchIE(SearchInfoExtractor): + IE_NAME = 'VMware:search' _SEARCH_KEY = 'vmwaresearch' _TESTS = [{ 'url': 'vmwaresearch10:*', - 'playlist_count': 10, 'info_dict': { 'id': '*', 'title': '*', }, + 'playlist_count': 10, }, { - 'url': 'vmwaresearchall:ransomware', - 'playlist_count': 15, + 'url': 'vmwaresearchall:uptime', 'info_dict': { - 'id': 'ransomware', - 'title': 'ransomware', + 'id': 'uptime', + 'title': 'uptime', }, + 'playlist_mincount': 5, }] - _URL_TEMPLATE = 'https://www.vmware.com/explore/video-library/video/%s' + _LIBRARY_MAP = { + 'explore': 
('VMware Explore Video Library', 'https://www.vmware.com/explore/video-library/video/%s'), + 'vmware': ('VMware Video Library', 'https://www.vmware.com/video/%s'), + } def _search_results(self, query): - def search_query(query, offset, limit, total_count): + def search_query(query, offset, limit, account): # search api: # https://www.vmware.com/api/nocache/tools/brightcove/search?q=%2B{query}%20%2Byear:2023:2024%20%20-vod_on_demand_publish:%22False%22%2Bcomplete:%22true%22%2Bstate:%22ACTIVE%22&limit=12&offset=0&sort=-updated_at&account=explore return self._download_json( 'https://www.vmware.com/api/nocache/tools/brightcove/search', query, - note=f'Downloading result {offset + 1}-{min(offset + limit, total_count or 99999999)}', query={ + note=f'Searching videos in {self._LIBRARY_MAP[account][0]}', query={ 'q': f'+{query} -vod_on_demand_publish:"False"+complete:"true"+state:"ACTIVE"', 'limit': limit, 'offset': offset, 'sort': 'updated_at', # chronological ascending order. For descending order: '-updated_at' - 'account': 'explore', + 'account': account, }) - limit, total_count = 100, None # limit: maximum 100 - for i in itertools.count(): - search_results = search_query(query, i * limit, limit, total_count) - total_count = search_results.get('count', 0) - for video in search_results.get('videos', []): - if video_id := video.get('id'): - yield self.url_result(self._URL_TEMPLATE % video_id) - if (i + 1) * limit >= total_count: - break + for account in ['explore', 'vmware']: + limit, total_count = 100, None # limit: maximum 100 + for i in itertools.count(): + search_results = search_query(query, i * limit, limit, account) + total_count = search_results.get('count', 0) + for video in search_results.get('videos', []): + if video_id := video.get('id'): + yield self.url_result(self._LIBRARY_MAP[account][1] % video_id) + if (i + 1) * limit >= total_count: + self.to_screen(f'{query}: {total_count} video(s) found') + break From 183ebedd70f99f9e718649f4be40d1d38af566bb Mon Sep 
17 00:00:00 2001 From: kclauhk <78251477+kclauhk@users.noreply.github.com> Date: Thu, 6 Feb 2025 23:42:34 +0800 Subject: [PATCH 4/5] Update search API URL --- yt_dlp/extractor/vmware.py | 64 +++++++++++++++++++++++++------------- 1 file changed, 42 insertions(+), 22 deletions(-) diff --git a/yt_dlp/extractor/vmware.py b/yt_dlp/extractor/vmware.py index 2385cee294..6593ce359d 100644 --- a/yt_dlp/extractor/vmware.py +++ b/yt_dlp/extractor/vmware.py @@ -1,6 +1,12 @@ import itertools from .common import InfoExtractor, SearchInfoExtractor +from ..utils import ( + float_or_none, + join_nonempty, + traverse_obj, + url_or_none, +) class VMwareIE(InfoExtractor): @@ -97,19 +103,19 @@ class VMwareSearchIE(SearchInfoExtractor): IE_NAME = 'VMware:search' _SEARCH_KEY = 'vmwaresearch' _TESTS = [{ - 'url': 'vmwaresearch10:*', + 'url': 'vmwaresearch5:firewall', 'info_dict': { - 'id': '*', - 'title': '*', + 'id': 'firewall', + 'title': 'firewall', }, - 'playlist_count': 10, + 'playlist_count': 5, }, { 'url': 'vmwaresearchall:uptime', 'info_dict': { 'id': 'uptime', 'title': 'uptime', }, - 'playlist_mincount': 5, + 'playlist_mincount': 2, }] _LIBRARY_MAP = { 'explore': ('VMware Explore Video Library', 'https://www.vmware.com/explore/video-library/video/%s'), @@ -117,27 +123,41 @@ class VMwareSearchIE(SearchInfoExtractor): } def _search_results(self, query): - def search_query(query, offset, limit, account): + def search_query(query, page_no, records_per_page, account): # search api: - # https://www.vmware.com/api/nocache/tools/brightcove/search?q=%2B{query}%20%2Byear:2023:2024%20%20-vod_on_demand_publish:%22False%22%2Bcomplete:%22true%22%2Bstate:%22ACTIVE%22&limit=12&offset=0&sort=-updated_at&account=explore + # 
https://api.swiftype.com/api/v1/public/engines/search.json?engine_key=J3yan3XpFywGvRxQMcEr&document_types[]=videos&&filters[videos][locale]=en-us&filters[videos][vod_on_demand_publish][]=!False&filters[videos][complete]=true&filters[videos][state]=ACTIVE&facets[videos][]=products&facets[videos][]=sessiontype&facets[videos][]=audience&facets[videos][]=track&facets[videos][]=level&filters[videos][year][]=!&filters[videos][account]=explore&q[]=ransomware&q[]=uptime&page=1&per_page=12&sort_field[videos]=updated_date&sort_direction[videos]=desc return self._download_json( - 'https://www.vmware.com/api/nocache/tools/brightcove/search', query, - note=f'Searching videos in {self._LIBRARY_MAP[account][0]}', query={ - 'q': f'+{query} -vod_on_demand_publish:"False"+complete:"true"+state:"ACTIVE"', - 'limit': limit, - 'offset': offset, - 'sort': 'updated_at', # chronological ascending order. For descending order: '-updated_at' - 'account': account, + 'https://api.swiftype.com/api/v1/public/engines/search.json', query, + note=f'Page {page_no}: Searching for videos in {self._LIBRARY_MAP[account][0]}', query={ + 'engine_key': 'J3yan3XpFywGvRxQMcEr', + 'document_types[]': 'videos', + 'filters[videos][state]': 'ACTIVE', + 'filters[videos][account]': account, + 'q[]': query, + 'page': page_no, + 'per_page': records_per_page, + 'sort_field[videos]': 'video_id', + 'sort_direction[videos]': 'asc', # 'desc' for descending order }) for account in ['explore', 'vmware']: - limit, total_count = 100, None # limit: maximum 100 - for i in itertools.count(): - search_results = search_query(query, i * limit, limit, account) - total_count = search_results.get('count', 0) - for video in search_results.get('videos', []): - if video_id := video.get('id'): - yield self.url_result(self._LIBRARY_MAP[account][1] % video_id) - if (i + 1) * limit >= total_count: + records_per_page, total_count = 100, None # records_per_page: maximum 100 + for i in itertools.count(start=1, step=1): + search_results = 
search_query(query, i, records_per_page, account) + total_count = traverse_obj( + search_results, ('info', 'videos', 'total_result_count', {int}), default=0) + for video in traverse_obj(search_results, ('records', 'videos', lambda _, v: v['external_id'])): + yield self.url_result(self._LIBRARY_MAP[account][1] % video['external_id'], + **traverse_obj(video, { + 'id': ('external_id', {str}), + 'title': ('name', {str}), + 'description': ({lambda v: join_nonempty('description', 'long_description', + from_dict=video, delim='\n')}), + 'thumbnail': (('images', 'thumbnail'), {url_or_none}), + 'tags': ('tags'), + 'uploader_id': ('account_id'), + 'duration': ('duration', {lambda v: float_or_none(v, 1000)}), + }, get_all=False)) + if i * records_per_page >= total_count: self.to_screen(f'{query}: {total_count} video(s) found') break From 117b8293c52722a2c97aba6ac3315fe2fb84bab4 Mon Sep 17 00:00:00 2001 From: kclauhk <78251477+kclauhk@users.noreply.github.com> Date: Thu, 6 Feb 2025 23:48:42 +0800 Subject: [PATCH 5/5] fix indent --- yt_dlp/extractor/vmware.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/yt_dlp/extractor/vmware.py b/yt_dlp/extractor/vmware.py index 6593ce359d..a42cbb0be5 100644 --- a/yt_dlp/extractor/vmware.py +++ b/yt_dlp/extractor/vmware.py @@ -147,7 +147,8 @@ def search_query(query, page_no, records_per_page, account): total_count = traverse_obj( search_results, ('info', 'videos', 'total_result_count', {int}), default=0) for video in traverse_obj(search_results, ('records', 'videos', lambda _, v: v['external_id'])): - yield self.url_result(self._LIBRARY_MAP[account][1] % video['external_id'], + yield self.url_result( + self._LIBRARY_MAP[account][1] % video['external_id'], **traverse_obj(video, { 'id': ('external_id', {str}), 'title': ('name', {str}),