From 1564c7192101af03c14b92f964c240bd3b576c89 Mon Sep 17 00:00:00 2001 From: kclauhk <78251477+kclauhk@users.noreply.github.com> Date: Wed, 26 Jun 2024 22:50:10 +0800 Subject: [PATCH 1/6] correction and update --- yt_dlp/extractor/giphy.py | 508 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 508 insertions(+) create mode 100644 yt_dlp/extractor/giphy.py diff --git a/yt_dlp/extractor/giphy.py b/yt_dlp/extractor/giphy.py new file mode 100644 index 000000000..1f2a82428 --- /dev/null +++ b/yt_dlp/extractor/giphy.py @@ -0,0 +1,508 @@ +import itertools + +from .common import InfoExtractor, SearchInfoExtractor +from ..utils import ( + determine_ext, + int_or_none, + merge_dicts, + str_or_none, + traverse_obj, + url_or_none, +) + + +class GiphyBaseIE(InfoExtractor): + _GIPHY_FE_WEB_API_KEY = 'Gc7131jiJuvI7IdN0HZ1D7nh0ow5BU6g' + _GIPHY_FE_STORIES_AND_GIPHY_TV_API_KEY = '3eFQvabDx69SMoOemSPiYfh9FY0nzO9x' + _GIPHY_MAX = 5000 + + def _extract_formats(self, media, is_still=False): + f = [] + for format_id in media: + if ('_still' in format_id) == is_still: + for key in [k for k, v in media[format_id].items() if str(v)[:4] == 'http']: + i = traverse_obj(media[format_id], { + 'width': ('width', {int_or_none}), + 'height': ('height', {int_or_none}), + 'url': (key, {url_or_none}), + }) + f.append({ + 'format_id': format_id, + **i, + }) + return f + + def _extract_info(self, gif_data, video_id): + formats, thumbnails, subtitles, uploader = [], [], {}, {} + # formats, thumbnails, subtitles + if data := gif_data.get('video'): + for lang in data.get('captions', {}): + for key in data['captions'][lang]: + subtitles.setdefault(lang, []).append({'url': data['captions'][lang][key]}) + for category in ['assets', 'previews']: + formats.extend(self._extract_formats(data.get(category, {}))) + if data.get('hls_manifest_url'): + hls_fmts, hls_subs = self._extract_m3u8_formats_and_subtitles( + data['hls_manifest_url'], video_id, 'mp4', m3u8_id='hls', fatal=False) + formats.extend(hls_fmts) + self._merge_subtitles(hls_subs, target=subtitles) + if data.get('dash_manifest_url'): + dash_fmts, dash_subs = self._extract_mpd_formats_and_subtitles( + data['dash_manifest_url'], video_id, mpd_id='dash', fatal=False) + formats.extend(dash_fmts) + self._merge_subtitles(dash_subs, target=subtitles) + if data := gif_data.get('images'): + if data.get('looping'): + data['looping']['height'] = traverse_obj(data, ('original_mp4', 'height'), {int}) + data['looping']['width'] = traverse_obj(data, ('original_mp4', 'width'), {int}) + sorted_data = dict(sorted(data.items(), reverse=True)) + formats.extend(self._extract_formats(sorted_data)) + thumbnails.extend(self._extract_formats(data, is_still=True)) + self._remove_duplicate_formats(formats) + for f in formats: + f.setdefault('http_headers', {})['Accept'] = 'video/*,image/*' + for l in subtitles: + for s in subtitles[l]: + s.setdefault('http_headers', {})['Accept'] = 'text/*' + for t in thumbnails: + t.setdefault('http_headers', {})['Accept'] = 'image/*' + # uploader + if data := gif_data.get('user'): + if isinstance(data, dict): + uploader = traverse_obj(data, { + 'uploader': (('display_name', 'name', 'attribution_display_name', 'username'), {str_or_none}, + {lambda x: x if x else gif_data.get('username')}), + 'uploader_id': ('username', {str_or_none}, {lambda x: x if x else gif_data.get('username')}), + 'uploader_url': (('profile_url', 'website_url'), {str_or_none}, + {lambda x: f'https://giphy.com{x}' if x[0] == '/' else url_or_none(x)}), + }, get_all=False) + # basic info + info = { + **traverse_obj(gif_data, { + 'id': ('id', {str}, {lambda x: x or video_id}), + 'title': ('title', {str_or_none}, {lambda x: x.strip() if x else ''}), + 'description': ((None, 'video'), ('alt_text', 'description'), {str_or_none}, + {lambda x: x.strip() if x and not x.startswith('Discover & share') else None}), + 'tags': ('tags', {list}), + 'age_limit': ('rating', {str_or_none}, {lambda x: 18 if x in ['r', 'nc-17'] else None}), + 'upload_date': (('import_datetime', 'create_datetime'), {str_or_none}, + {lambda x: x[:10].replace('-', '') if x else None}), + }, get_all=False), + } + return { + **info, + **{k: v for k, v in uploader.items() if v is not None}, + 'thumbnails': thumbnails, + 'formats': formats, + 'subtitles': subtitles, + } + + def _api_channel_feed(self, channel_id, max_offset=_GIPHY_MAX): + offset = 0 + query_url = f'https://giphy.com/api/v4/channels/{channel_id}/feed/?offset={offset}' + for _ in itertools.count(1): + search_results = self._download_json(query_url, channel_id, fatal=False, + note=f'Fetching feed {offset + 1}-{offset + 25}') + if not search_results.get('results'): + return + for video in search_results['results']: + yield { + **self._extract_info(video, video['id']), + 'webpage_url': video['url'], + } + query_url = url_or_none(search_results.get('next')) or '' + offset = int(self._search_regex(r'offset=(\d+)', query_url, 'offset', default=0)) + # offset cannot exceed 5000 + if not query_url or offset > min((max_offset or self._GIPHY_MAX) + 1, self._GIPHY_MAX): + return + + +class GiphyIE(GiphyBaseIE): + _VALID_URL = r'https?://giphy\.com/(?!(?:search)|(?:stories)/)(?:.+[/-])?(?P[^/]+)/?$' + _TESTS = [{ + 'url': 'https://giphy.com/gifs/l2JIcQ4UH5SoPtMJi', + 'info_dict': { + 'id': 'l2JIcQ4UH5SoPtMJi', + 'ext': 'mp4', + 'title': 'excited slow motion GIF by Cats the Musical', + 'tags': ['giphyupload', 'excited', 'cats', 'musical', 'flip', 'slow motion', 'somersault', 'cats musical'], + 'thumbnail': r're:^https?://.*', + 'upload_date': '20160125', + 'uploader': 'Cats the Musical', + 'uploader_id': 'catsmusical', + 'uploader_url': 'https://giphy.com/catsmusical/', + }, + }, { + 'url': 'http://giphy.com/gifs/l3vR8BKU0m8uX2mAg', + 'info_dict': { + 'id': 'l3vR8BKU0m8uX2mAg', + 'ext': 'mp4', + 'title': 'Giphy video #l3vR8BKU0m8uX2mAg', + 'tags': ['giphyupload'], + 'thumbnail': r're:^https?://.*', + 'upload_date': '20161022', + 'uploader': 'gus123', + 'uploader_id': 'gus123', + 'uploader_url': 'https://giphy.com/channel/gus123/', + }, + }, { + 'url': 'https://giphy.com/gifs/digitalpratik-digital-pratik-happy-fathers-day-dad-E1trcBzr59SGvmRDPY', + 'info_dict': { + 'id': 'E1trcBzr59SGvmRDPY', + 'ext': 'mp4', + 'title': 'Happy Fathers Day GIF by Digital Pratik', + 'tags': 'count:14', + 'thumbnail': r're:^https?://.*', + 'upload_date': '20210619', + 'uploader': 'Digital Pratik', + 'uploader_id': 'digitalpratik', + 'uploader_url': 'https://giphy.com/digitalpratik/', + }, + }, { + 'url': 'https://giphy.com/clips/southpark-south-park-episode-4-season-20-YyOPrvilA8FdiuSiQi', + 'info_dict': { + 'id': 'YyOPrvilA8FdiuSiQi', + 'ext': 'mp4', + 'title': 'You Can\'t Break Up With Me', + 'description': 'South Park, Season 20, Episode 4, Wieners Out', + 'tags': 'count:16', + 'thumbnail': r're:^https?://.*', + 'upload_date': '20220516', + 'uploader': 'South Park', + 'uploader_id': 'southpark', + 'uploader_url': 'https://giphy.com/southpark', + }, + }, { + 'url': 'https://giphy.com/embed/00xGP4zv8xENZ2tc3Y', + 'info_dict': { + 'id': '00xGP4zv8xENZ2tc3Y', + 'ext': 'mp4', + 'title': 'Love Is Blind Wow GIF by NETFLIX', + 'description': 'md5:89445e21c848eef12af249faef4fcf9f', + 'tags': 'count:24', + 'thumbnail': r're:^https?://.*', + 'upload_date': '20220214', + 'uploader': 'NETFLIX', + 'uploader_id': 'netflix', + 'uploader_url': 'https://giphy.com/netflix/', + }, + }, { + 'url': 'https://giphy.com/stickers/mario-PFxFYEZNUavG8', + 'info_dict': { + 'id': 'PFxFYEZNUavG8', + 'ext': 'mp4', + 'title': 'nintendo mario STICKER', + 'tags': ['transparent', 'gaming', 'nintendo', 'mario', 'giphynintendos'], + 'thumbnail': r're:^https?://.*', + 'upload_date': '20160908', + }, + }, { + 'url': 'https://giphy.com/catsmusical/', + 'playlist_count': 10, + 'info_dict': { + 'id': '8707', + 'title': 'Cats the Musical', + 'uploader_id': 'catsmusical', + 'uploader_url': 'https://giphy.com/channel/catsmusical', + }, + }, { + 'url': 'https://giphy.com/channel/catsmusical', + 'playlist_count': 10, + 'info_dict': { + 'id': '8707', + 'title': 'Cats the Musical', + 'uploader_id': 'catsmusical', + 'uploader_url': 'https://giphy.com/channel/catsmusical', + }, + }, { + 'url': 'https://giphy.com/southpark/reactions/lol', + 'playlist_count': 42, + 'info_dict': { + 'id': '1044', + 'title': 'LOL', + 'uploader_id': 'southpark', + 'uploader_url': 'https://giphy.com/channel/southpark', + }, + }, { + 'url': 'https://giphy.com/corgiyolk/cute-and-wholesome-corgi', + 'playlist_count': 14, + 'info_dict': { + 'id': '34458076', + 'title': 'Cute and Wholesome corgi', + 'uploader_id': 'corgiyolk', + 'uploader_url': 'https://giphy.com/channel/corgiyolk', + }, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url.replace('/embed/', '/gifs/'), video_id) + + # {"channelId": ...} + if channel_id := self._html_search_regex(r'\{"channelId":\s*([^\}]+)\}', webpage, 'channel_id', default=None): + uploader_id = self._html_search_meta('twitter:creator', webpage).replace('@', '').lower() + entries = [] + for i in self._api_channel_feed(channel_id, int_or_none(self.get_param('playlistend'))): + entries.append(i) + + return { + 'id': channel_id, + 'title': (self._html_search_meta('twitter:title', webpage) + or self._og_search_title(webpage) + ).replace(' GIFs on GIPHY - Be Animated', '').strip(), + 'uploader_id': uploader_id, + 'uploader_url': f'https://giphy.com/channel/{uploader_id}' if uploader_id != 'giphy' else None, + '_type': 'playlist', + 'entries': entries, + } + else: + title = (self._html_search_meta('twitter:title', webpage, default=None) + or self._og_search_title(webpage).replace(' - Find & Share on GIPHY', '').strip()) + description = (self._html_search_meta('twitter:description', webpage, default=None) + or self._og_search_description(webpage)) + description = description if not description.startswith('Discover & share') else None + + # \"gif\":{\"type\":\"gif\",...}, + if json_str := self._html_search_regex(r'\\"\w+\\":({\\"type\\":\\"(?!emoji).*?is_dynamic\\":\w+}),', + webpage, 'video_data', default=None): + gif_data = self._parse_json(json_str.encode('utf-8').decode('unicode_escape'), video_id) + # gif: {"id":...}, + elif json_str := self._html_search_regex(r'\s+\w+:\s*({".*?}),\n\s+', webpage, 'video_data', default='{}'): + gif_data = self._parse_json(json_str, video_id) + + info = self._extract_info(gif_data, video_id) + + if not info.get('formats'): + formats = [] + if url := self._og_search_video_url(webpage, default=None): + formats.append({ + 'format_id': determine_ext(url), + 'width': int_or_none(self._og_search_property('video:width', webpage)), + 'height': int_or_none(self._og_search_property('video:height', webpage)), + 'url': url, + }) + if url := self._og_search_thumbnail(webpage, default=None): + formats.append({ + 'format_id': determine_ext(url), + 'width': int_or_none(self._og_search_property('image:width', webpage)), + 'height': int_or_none(self._og_search_property('image:height', webpage)), + 'url': url, + }) + if url := self._html_search_meta('twitter:image', webpage, default=None): + thumbnails = [{ + 'width': int_or_none(self._html_search_meta('twitter:image:width', webpage, default=None)), + 'height': int_or_none(self._html_search_meta('twitter:image:height', webpage, default=None)), + 'url': url, + }] + info['formats'] = formats + if not info.get('thumbnails'): + info['thumbnails'] = thumbnails + + if not info.get('uploader'): + uploader = {} + if data := gif_data.get('user'): + if isinstance(data, str): + idx = data.replace('$', '') + if json_str := self._html_search_regex(rf'"{idx}:({{.*?}})\\n"]\)', + webpage, 'uploader_data', default=None): + data = self._parse_json(json_str.encode('utf-8').decode('unicode_escape'), video_id, fatal=False) + if isinstance(data, dict): + uploader = traverse_obj(data, { + 'uploader': (('display_name', 'name', 'attribution_display_name', 'username'), {str_or_none}, + {lambda x: x if x else gif_data.get('username')}), + 'uploader_id': ('username', {str_or_none}), + 'uploader_url': (('profile_url', 'website_url'), {url_or_none}, + {lambda x: f'https://giphy.com{x}' if x[0] == '/' else x}), + }, get_all=False) + if not uploader: + up_id = (gif_data.get('username') + or self._html_search_regex(r'
@(\w+)
', webpage, 'uploader_id', default=None) + or self._html_search_regex(r'"woff2"/>]+\.giphy\.com/(?:channel_assets|avatars)/(.+?)/', + webpage, 'uploader_id', default=None)) + up_name = (title[(title.rfind(' by ') + 4):] if title.rfind(' by ') > 0 else None + or self._html_search_regex(r'(?s)]*>([^<]+)', webpage, 'uploader', default=None) + or self._html_search_regex(r'twitter:creator"[^>]+="((?!@giphy").*?)"', webpage, 'uploader', default=None) + or up_id) + uploader = { + 'uploader': up_name, + 'uploader_id': up_id, + 'uploader_url': (f'https://giphy.com/channel/{up_id}/' if up_id else None), + } + info = merge_dicts(info, {**{k: v for k, v in uploader.items() if v is not None}}) + + return { + **merge_dicts(info, { + 'title': title, + 'description': description, + }), + } + + +class GiphyChannelIE(GiphyBaseIE, SearchInfoExtractor): + IE_NAME = 'giphy:channel' + IE_DESC = 'Giphy Channel' + _SEARCH_KEY = 'giphychannel' + _TESTS = [{ + 'url': 'giphychannel30:pbsnature', + 'playlist_count': 30, + 'info_dict': { + 'id': 'pbsnature', + 'title': 'pbsnature', + }, + }] + + def _search_results(self, query): + if webpage := self._download_webpage(f'https://giphy.com/channel/{query}', query): + if channel_id := self._html_search_regex(r'\{["\']channelId["\']:\s*([^\}]+)\}', + webpage, 'channelId', default=None): + return self._api_channel_feed(channel_id) + + +class GiphySearchIE(GiphyBaseIE, SearchInfoExtractor): + IE_NAME = 'giphy:search' + IE_DESC = 'Giphy Search' + _SEARCH_KEY = 'giphysearch' + _TESTS = [{ + 'url': 'giphysearch20:super mario', + 'playlist_count': 20, + 'info_dict': { + 'id': 'super mario', + 'title': 'super mario', + }, + }, { + 'url': 'giphysearch40:mickey&type=videos,stickers', + 'playlist_count': 40, + 'info_dict': { + 'id': 'mickey&type=videos,stickers', + 'title': 'mickey&type=videos,stickers', + }, + }] + + def _search_results(self, query): + def search_query(query, offset, limit, category): + # search api: + # https://api.giphy.com/v1/gifs/search?rating=pg-13&offset=40&limit=15&type=gifs&q={query}&excludeDynamicResults=undefined&api_key=Gc7131jiJuvI7IdN0HZ1D7nh0ow5BU6g&pingback_id=1904d6e524cee33d + return self._download_json( + f'https://api.giphy.com/v1/{category}/search', query, + note=f'Fetching {category} result {offset + 1}-{offset + limit}', query={ + 'rating': 'r', # MPA film rating + 'offset': offset, + 'limit': limit, + 'type': category, # known types: 'gifs', 'stickers', 'text', 'videos' + 'q': query, + 'excludeDynamicResults': 'undefined', + 'api_key': self._GIPHY_FE_WEB_API_KEY, + }) + + # type: comma delimited list + types = self._search_regex(r'&type=([^&]+)', query, 'type', default='gifs,stickers,videos') + types = [(f'{x}s' if x[-1] != 's' and any(x in t for t in ['gifs', 'stickers', 'videos']) else x) + for x in [x.strip() for x in types.lower().split(',')]] + query = query.split('&type=')[0] + + offset, limit = 0, 50 + types_done = [] + for _ in itertools.count(1): + for t in types: + if t not in types_done: + search_results = search_query(query, offset, limit, t) + if not search_results.get('data'): + self.to_screen(f'{query}: {offset} {t} found') + types_done.append(t) + else: + for video in search_results['data']: + yield { + **self._extract_info(video, video['id']), + 'webpage_url': video['url'], + } + if len(search_results['data']) < limit: + total = offset + len(search_results['data']) + self.to_screen(f'{query}: {total} {t} found') + types_done.append(t) + if len(types) > len(types_done): + offset += limit + if offset >= self._GIPHY_MAX: + return + else: + return + + +class GiphySearchURLIE(GiphyBaseIE): + _VALID_URL = r'https?://giphy\.com/search/(?P[^/]+)' + _TESTS = [{ + 'url': 'https://giphy.com/search/carcinoma', + 'playlist_count': 13, + 'info_dict': { + 'id': 'carcinoma', + 'title': 'carcinoma', + }, + }] + + def _real_extract(self, url): + if query := self._match_id(url): + query = query.replace('-', ' ') + return self.url_result(url=f'giphysearchall:{query}', url_transparent=True) + + +class GiphyStoriesIE(GiphyBaseIE): + _VALID_URL = r'https?://giphy\.com/stories/(?P[^/]+)' + _TESTS = [{ + 'url': 'https://giphy.com/stories/indian-garyvee-de56ee78-adb5', + 'playlist_count': 6, + 'info_dict': { + 'id': 'de56ee78-adb5-428e-afb6-765bb489c4fc', + 'title': 'Indian Garyvee', + 'description': 'md5:468a8e2c898225baac05e32dcafe313b', + 'tags': [], + 'upload_date': '20190710', + 'uploader': 'Digital Pratik', + 'uploader_id': 'digitalpratik', + 'uploader_url': 'https://giphy.com/digitalpratik', + }, + }, { + 'url': 'https://giphy.com/stories/a-2021-oscars-wrap-up-161cda9a-4517', + 'playlist_count': 10, + 'info_dict': { + 'id': '161cda9a-4517-473b-9fea-6731af6c0d49', + 'title': 'A 2021 Oscars Wrap-Up', + 'description': 'What were the best moments in GIFs? Glad you asked!', + 'tags': [], + 'upload_date': '20210426', + 'uploader': 'Entertainment GIFs', + 'uploader_id': 'entertainment', + 'uploader_url': 'https://giphy.com/entertainment', + }, + }] + + def _real_extract(self, url): + slug = self._match_id(url) + # https://x.giphy.com/v1/stories/slug/{slug}?api_key=3eFQvabDx69SMoOemSPiYfh9FY0nzO9x + story = self._download_json(f'https://x.giphy.com/v1/stories/slug/{slug}?api_key={self._GIPHY_FE_STORIES_AND_GIPHY_TV_API_KEY}', slug) + + if data := story.get('data'): + entries = [] + for video in data.get('gifs'): + entries.append({ + **self._extract_info(video['gif'], video['gif']['id']), + 'description': video['caption'], + 'webpage_url': video['gif']['url'], + }) + info = traverse_obj(data, { + 'id': ('story_id', {str_or_none}), + 'title': ('title', {str_or_none}), + 'description': ('description', {str_or_none}), + 'tags': ('tags', {list}), + 'thumbnails': ('cover_gif', 'gif', 'images', {dict}, {lambda x: self._extract_formats(x, is_still=True)}), + 'upload_date': (('create_datetime', 'publish_datetime'), {str_or_none}, + {lambda x: x[:10].replace('-', '') if x else None}), + 'uploader': ('user', ('display_name', 'username'), {str_or_none}), + 'uploader_id': ('user', 'username', {str_or_none}), + 'uploader_url': ('user', ('profile_url', 'website_url'), {str_or_none}, + {lambda x: f'https://giphy.com{x}' if x[0] == '/' else url_or_none(x)}), + }, get_all=False) + + return { + **info, + '_type': 'playlist', + 'entries': entries, + } From ec1d3c97d0d301c37fcfaa7267ee34e75c2d7cbf Mon Sep 17 00:00:00 2001 From: kclauhk <78251477+kclauhk@users.noreply.github.com> Date: Thu, 27 Jun 2024 19:38:26 +0800 Subject: [PATCH 2/6] Separate playlist-end and 5000 limit checking for accuracy and readability --- yt_dlp/extractor/giphy.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/yt_dlp/extractor/giphy.py b/yt_dlp/extractor/giphy.py index 1f2a82428..63bfad3e8 100644 --- a/yt_dlp/extractor/giphy.py +++ b/yt_dlp/extractor/giphy.py @@ -97,8 +97,8 @@ def _extract_info(self, gif_data, video_id): 'subtitles': subtitles, } - def _api_channel_feed(self, channel_id, max_offset=_GIPHY_MAX): - offset = 0 + def _api_channel_feed(self, channel_id): + count, offset = 0, 0 query_url = f'https://giphy.com/api/v4/channels/{channel_id}/feed/?offset={offset}' for _ in itertools.count(1): search_results = self._download_json(query_url, channel_id, fatal=False, @@ -110,10 +110,13 @@ def _api_channel_feed(self, channel_id, max_offset=_GIPHY_MAX): **self._extract_info(video, video['id']), 'webpage_url': video['url'], } + count += len(search_results.get('results')) + if count >= (int_or_none(self.get_param('playlistend')) or (self._GIPHY_MAX + 1)): + return query_url = url_or_none(search_results.get('next')) or '' offset = int(self._search_regex(r'offset=(\d+)', query_url, 'offset', default=0)) # offset cannot exceed 5000 - if not query_url or offset > min((max_offset or self._GIPHY_MAX) + 1, self._GIPHY_MAX): + if not query_url or offset > self._GIPHY_MAX: return @@ -242,7 +245,7 @@ def _real_extract(self, url): if channel_id := self._html_search_regex(r'\{"channelId":\s*([^\}]+)\}', webpage, 'channel_id', default=None): uploader_id = self._html_search_meta('twitter:creator', webpage).replace('@', '').lower() entries = [] - for i in self._api_channel_feed(channel_id, int_or_none(self.get_param('playlistend'))): + for i in self._api_channel_feed(channel_id): entries.append(i) return { From 49a3edd276da7c9a661e8c70944bcfb47aaf22ab Mon Sep 17 00:00:00 2001 From: kclauhk Date: Fri, 19 Jul 2024 02:45:36 +0800 Subject: [PATCH 3/6] Separated class for channel page --- yt_dlp/extractor/giphy.py | 258 ++++++++++++++++++++------------------ 1 file changed, 135 insertions(+), 123 deletions(-) diff --git a/yt_dlp/extractor/giphy.py b/yt_dlp/extractor/giphy.py index 63bfad3e8..bb13f2b4c 100644 --- a/yt_dlp/extractor/giphy.py +++ b/yt_dlp/extractor/giphy.py @@ -53,8 +53,8 @@ def _extract_info(self, gif_data, video_id): self._merge_subtitles(dash_subs, target=subtitles) if data := gif_data.get('images'): if data.get('looping'): - data['looping']['height'] = traverse_obj(data, ('original_mp4', 'height'), {int}) - data['looping']['width'] = traverse_obj(data, ('original_mp4', 'width'), {int}) + data['looping']['height'] = traverse_obj(data, ('original_mp4', 'height', {int_or_none})) + data['looping']['width'] = traverse_obj(data, ('original_mp4', 'width', {int_or_none})) sorted_data = dict(sorted(data.items(), reverse=True)) formats.extend(self._extract_formats(sorted_data)) thumbnails.extend(self._extract_formats(data, is_still=True)) @@ -70,22 +70,22 @@ def _extract_info(self, gif_data, video_id): if data := gif_data.get('user'): if isinstance(data, dict): uploader = traverse_obj(data, { - 'uploader': (('display_name', 'name', 'attribution_display_name', 'username'), {str_or_none}, - {lambda x: x if x else gif_data.get('username')}), - 'uploader_id': ('username', {str_or_none}, {lambda x: x if x else gif_data.get('username')}), - 'uploader_url': (('profile_url', 'website_url'), {str_or_none}, - {lambda x: f'https://giphy.com{x}' if x[0] == '/' else url_or_none(x)}), + 'uploader': (('display_name', 'name', 'attribution_display_name', 'username'), + {lambda x: x or gif_data.get('username')}), + 'uploader_id': ('username', {lambda x: x or gif_data.get('username')}), + 'uploader_url': (('profile_url', 'website_url'), + {lambda x: f'https://giphy.com{x}' if x and x[0] == '/' else url_or_none(x)}), }, get_all=False) # basic info info = { **traverse_obj(gif_data, { - 'id': ('id', {str}, {lambda x: x or video_id}), - 'title': ('title', {str_or_none}, {lambda x: x.strip() if x else ''}), - 'description': ((None, 'video'), ('alt_text', 'description'), {str_or_none}, + 'id': ('id', {lambda x: x or video_id}), + 'title': ('title', {lambda x: x.strip() if x else ''}), + 'description': ((None, 'video'), ('alt_text', 'description'), {lambda x: x.strip() if x and not x.startswith('Discover & share') else None}), 'tags': ('tags', {list}), - 'age_limit': ('rating', {str_or_none}, {lambda x: 18 if x in ['r', 'nc-17'] else None}), - 'upload_date': (('import_datetime', 'create_datetime'), {str_or_none}, + 'age_limit': ('rating', {lambda x: 18 if x in ['r', 'nc-17'] else None}), + 'upload_date': (('import_datetime', 'create_datetime'), {lambda x: x[:10].replace('-', '') if x else None}), }, get_all=False), } @@ -102,8 +102,8 @@ def _api_channel_feed(self, channel_id): query_url = f'https://giphy.com/api/v4/channels/{channel_id}/feed/?offset={offset}' for _ in itertools.count(1): search_results = self._download_json(query_url, channel_id, fatal=False, - note=f'Fetching feed {offset + 1}-{offset + 25}') - if not search_results.get('results'): + note=f'Downloading feed data {offset + 1}-{offset + 25}') + if not search_results or not search_results.get('results'): return for video in search_results['results']: yield { @@ -121,7 +121,7 @@ def _api_channel_feed(self, channel_id): class GiphyIE(GiphyBaseIE): - _VALID_URL = r'https?://giphy\.com/(?!(?:search)|(?:stories)/)(?:.+[/-])?(?P[^/]+)/?$' + _VALID_URL = r'https?://giphy\.com/(?:clips|gifs|stickers|embed)/(?:.+[/-])?(?P[^/]+)/?$' _TESTS = [{ 'url': 'https://giphy.com/gifs/l2JIcQ4UH5SoPtMJi', 'info_dict': { @@ -175,6 +175,16 @@ class GiphyIE(GiphyBaseIE): 'uploader_id': 'southpark', 'uploader_url': 'https://giphy.com/southpark', }, + }, { + 'url': 'https://giphy.com/stickers/mario-PFxFYEZNUavG8', + 'info_dict': { + 'id': 'PFxFYEZNUavG8', + 'ext': 'mp4', + 'title': 'nintendo mario STICKER', + 'tags': ['transparent', 'gaming', 'nintendo', 'mario', 'giphynintendos'], + 'thumbnail': r're:^https?://.*', + 'upload_date': '20160908', + }, }, { 'url': 'https://giphy.com/embed/00xGP4zv8xENZ2tc3Y', 'info_dict': { @@ -189,17 +199,98 @@ class GiphyIE(GiphyBaseIE): 'uploader_id': 'netflix', 'uploader_url': 'https://giphy.com/netflix/', }, - }, { - 'url': 'https://giphy.com/stickers/mario-PFxFYEZNUavG8', - 'info_dict': { - 'id': 'PFxFYEZNUavG8', - 'ext': 'mp4', - 'title': 'nintendo mario STICKER', - 'tags': ['transparent', 'gaming', 'nintendo', 'mario', 'giphynintendos'], - 'thumbnail': r're:^https?://.*', - 'upload_date': '20160908', - }, - }, { + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url.replace('/embed/', '/gifs/'), video_id) + + title = (self._html_search_meta('twitter:title', webpage, default=None) + or self._og_search_title(webpage).replace(' - Find & Share on GIPHY', '').strip()) + description = (self._html_search_meta('twitter:description', webpage, default=None) + or self._og_search_description(webpage)) + description = description if not description.startswith('Discover & share') else None + + gif_data = {} + # search for: \"gif\":{\"type\":\"...}, + if json_str := self._html_search_regex(r'\\"\w+\\":({\\"type\\":\\"(?!emoji).*?is_dynamic\\":\w+}),', + webpage, 'video_data', default=None): + gif_data = self._parse_json(json_str.encode('utf-8').decode('unicode_escape'), video_id) + # search for: gif: {"...}, + elif json_str := self._html_search_regex(r'\s+\w+:\s*({".*?}),\n\s+', webpage, 'video_data', default='{}'): + gif_data = self._parse_json(json_str, video_id) + + info = self._extract_info(gif_data, video_id) + + if not info.get('formats'): + formats = [] + if url := self._og_search_video_url(webpage, default=None): + formats.append({ + 'format_id': determine_ext(url), + 'width': int_or_none(self._og_search_property('video:width', webpage)), + 'height': int_or_none(self._og_search_property('video:height', webpage)), + 'url': url, + }) + if url := self._og_search_thumbnail(webpage, default=None): + formats.append({ + 'format_id': determine_ext(url), + 'width': int_or_none(self._og_search_property('image:width', webpage)), + 'height': int_or_none(self._og_search_property('image:height', webpage)), + 'url': url, + }) + if url := self._html_search_meta('twitter:image', webpage, default=None): + thumbnails = [{ + 'width': int_or_none(self._html_search_meta('twitter:image:width', webpage, default=None)), + 'height': int_or_none(self._html_search_meta('twitter:image:height', webpage, default=None)), + 'url': url, + }] + info['formats'] = formats + if not info.get('thumbnails'): + info['thumbnails'] = thumbnails + + if not info.get('uploader'): + uploader = {} + if data := gif_data.get('user'): + if isinstance(data, str): + idx = data.replace('$', '') + if json_str := self._html_search_regex(rf'"{idx}:({{.*?}})\\n"]\)', + webpage, 'uploader_data', default=None): + data = self._parse_json(json_str.encode('utf-8').decode('unicode_escape'), video_id, fatal=False) + if isinstance(data, dict): + uploader = traverse_obj(data, { + 'uploader': (('display_name', 'name', 'attribution_display_name', 'username'), + {lambda x: x or gif_data.get('username')}), + 'uploader_id': ('username', {str_or_none}), + 'uploader_url': (('profile_url', 'website_url'), + {lambda x: f'https://giphy.com{x}' if x and x[0] == '/' else url_or_none(x)}), + }, get_all=False) + if not uploader: + up_id = (gif_data.get('username') + or self._html_search_regex(r'
@(\w+)
', webpage, 'uploader_id', default=None) + or self._html_search_regex(r'"woff2"/>]+\.giphy\.com/(?:channel_assets|avatars)/(.+?)/', + webpage, 'uploader_id', default=None)) + up_name = (title[(title.rfind(' by ') + 4):] if title.rfind(' by ') > 0 else None + or self._html_search_regex(r'(?s)]*>([^<]+)', webpage, 'uploader', default=None) + or self._html_search_regex(r'twitter:creator"[^>]+="((?!@giphy").*?)"', webpage, 'uploader', default=None) + or up_id) + uploader = { + 'uploader': up_name, + 'uploader_id': up_id, + 'uploader_url': (f'https://giphy.com/channel/{up_id}/' if up_id else None), + } + info = merge_dicts(info, {**{k: v for k, v in uploader.items() if v is not None}}) + + return { + **merge_dicts(info, { + 'title': title, + 'description': description, + }), + } + + +class GiphyChannelPageIE(GiphyBaseIE): + _VALID_URL = r'https?://giphy\.com/(?!(?:clips|gifs|stickers|stories|search|embed)/)(?:.+/)?(?P[^/]+)/?$' + _TESTS = [{ 'url': 'https://giphy.com/catsmusical/', 'playlist_count': 10, 'info_dict': { @@ -239,10 +330,11 @@ class GiphyIE(GiphyBaseIE): def _real_extract(self, url): video_id = self._match_id(url) - webpage = self._download_webpage(url.replace('/embed/', '/gifs/'), video_id) + webpage = self._download_webpage(url, video_id) - # {"channelId": ...} - if channel_id := self._html_search_regex(r'\{"channelId":\s*([^\}]+)\}', webpage, 'channel_id', default=None): + # search for: {"channelId": ...} or {..., "channel_id": ..., + if channel_id := self._html_search_regex(r'\{[^\}\n]*"channel_?[iI]d":\s*"?([^",\}]+)[",\}]', + webpage, 'channel_id', default=None): uploader_id = self._html_search_meta('twitter:creator', webpage).replace('@', '').lower() entries = [] for i in self._api_channel_feed(channel_id): @@ -258,87 +350,6 @@ def _real_extract(self, url): '_type': 'playlist', 'entries': entries, } - else: - title = (self._html_search_meta('twitter:title', webpage, default=None) - or self._og_search_title(webpage).replace(' - Find & Share on GIPHY', '').strip()) - description = (self._html_search_meta('twitter:description', webpage, default=None) - or self._og_search_description(webpage)) - description = description if not description.startswith('Discover & share') else None - - # \"gif\":{\"type\":\"gif\",...}, - if json_str := self._html_search_regex(r'\\"\w+\\":({\\"type\\":\\"(?!emoji).*?is_dynamic\\":\w+}),', - webpage, 'video_data', default=None): - gif_data = self._parse_json(json_str.encode('utf-8').decode('unicode_escape'), video_id) - # gif: {"id":...}, - elif json_str := self._html_search_regex(r'\s+\w+:\s*({".*?}),\n\s+', webpage, 'video_data', default='{}'): - gif_data = self._parse_json(json_str, video_id) - - info = self._extract_info(gif_data, video_id) - - if not info.get('formats'): - formats = [] - if url := self._og_search_video_url(webpage, default=None): - formats.append({ - 'format_id': determine_ext(url), - 'width': int_or_none(self._og_search_property('video:width', webpage)), - 'height': int_or_none(self._og_search_property('video:height', webpage)), - 'url': url, - }) - if url := self._og_search_thumbnail(webpage, default=None): - formats.append({ - 'format_id': determine_ext(url), - 'width': int_or_none(self._og_search_property('image:width', webpage)), - 'height': int_or_none(self._og_search_property('image:height', webpage)), - 'url': url, - }) - if url := self._html_search_meta('twitter:image', webpage, default=None): - thumbnails = [{ - 'width': int_or_none(self._html_search_meta('twitter:image:width', webpage, default=None)), - 'height': int_or_none(self._html_search_meta('twitter:image:height', webpage, default=None)), - 'url': url, - }] - info['formats'] = formats - if not info.get('thumbnails'): - info['thumbnails'] = thumbnails - - if not info.get('uploader'): - uploader = {} - if data := gif_data.get('user'): - if isinstance(data, str): - idx = data.replace('$', '') - if json_str := self._html_search_regex(rf'"{idx}:({{.*?}})\\n"]\)', - webpage, 'uploader_data', default=None): - data = self._parse_json(json_str.encode('utf-8').decode('unicode_escape'), video_id, fatal=False) - if isinstance(data, dict): - uploader = traverse_obj(data, { - 'uploader': (('display_name', 'name', 'attribution_display_name', 'username'), {str_or_none}, - {lambda x: x if x else gif_data.get('username')}), - 'uploader_id': ('username', {str_or_none}), - 'uploader_url': (('profile_url', 'website_url'), {url_or_none}, - {lambda x: f'https://giphy.com{x}' if x[0] == '/' else x}), - }, get_all=False) - if not uploader: - up_id = (gif_data.get('username') - or self._html_search_regex(r'
@(\w+)
', webpage, 'uploader_id', default=None) - or self._html_search_regex(r'"woff2"/>]+\.giphy\.com/(?:channel_assets|avatars)/(.+?)/', - webpage, 'uploader_id', default=None)) - up_name = (title[(title.rfind(' by ') + 4):] if title.rfind(' by ') > 0 else None - or self._html_search_regex(r'(?s)]*>([^<]+)', webpage, 'uploader', default=None) - or self._html_search_regex(r'twitter:creator"[^>]+="((?!@giphy").*?)"', webpage, 'uploader', default=None) - or up_id) - uploader = { - 'uploader': up_name, - 'uploader_id': up_id, - 'uploader_url': (f'https://giphy.com/channel/{up_id}/' if up_id else None), - } - info = merge_dicts(info, {**{k: v for k, v in uploader.items() if v is not None}}) - - return { - **merge_dicts(info, { - 'title': title, - 'description': description, - }), - } class GiphyChannelIE(GiphyBaseIE, SearchInfoExtractor): @@ -356,8 +367,8 @@ class GiphyChannelIE(GiphyBaseIE, SearchInfoExtractor): def _search_results(self, query): if webpage := self._download_webpage(f'https://giphy.com/channel/{query}', query): - if channel_id := self._html_search_regex(r'\{["\']channelId["\']:\s*([^\}]+)\}', - webpage, 'channelId', default=None): + if channel_id := self._html_search_regex(r'\{[^\}\n]*"channel_?[iI]d":\s*"?([^",\}]+)[",\}]', + webpage, 'channel_id', default=None): return self._api_channel_feed(channel_id) @@ -373,11 +384,11 @@ class GiphySearchIE(GiphyBaseIE, SearchInfoExtractor): 'title': 'super mario', }, }, { - 'url': 'giphysearch40:mickey&type=videos,stickers', + 'url': 'giphysearch40:mickey&type=clips,stickers', 'playlist_count': 40, 'info_dict': { - 'id': 'mickey&type=videos,stickers', - 'title': 'mickey&type=videos,stickers', + 'id': 'mickey&type=clips,stickers', + 'title': 'mickey&type=clips,stickers', }, }] @@ -387,11 +398,11 @@ def search_query(query, offset, limit, category): # https://api.giphy.com/v1/gifs/search?rating=pg-13&offset=40&limit=15&type=gifs&q={query}&excludeDynamicResults=undefined&api_key=Gc7131jiJuvI7IdN0HZ1D7nh0ow5BU6g&pingback_id=1904d6e524cee33d return self._download_json( f'https://api.giphy.com/v1/{category}/search', query, - note=f'Fetching {category} result {offset + 1}-{offset + limit}', query={ + note=f'Downloading {category} result {offset + 1}-{offset + limit}', query={ 'rating': 'r', # MPA film rating 'offset': offset, 'limit': limit, - 'type': category, # known types: 'gifs', 'stickers', 'text', 'videos' + 'type': category, # known types: 'clips', 'gifs', 'stickers', 'text', 'videos' 'q': query, 'excludeDynamicResults': 'undefined', 'api_key': self._GIPHY_FE_WEB_API_KEY, @@ -399,7 +410,7 @@ def search_query(query, offset, limit, category): # type: comma delimited list types = self._search_regex(r'&type=([^&]+)', query, 'type', default='gifs,stickers,videos') - types = [(f'{x}s' if x[-1] != 's' and any(x in t for t in ['gifs', 'stickers', 'videos']) else x) + types = [(f'{x}s' if x[-1] != 's' and any(x in t for t in ['clips', 'gifs', 'stickers', 'videos']) else x) for x in [x.strip() for x in types.lower().split(',')]] query = query.split('&type=')[0] @@ -408,7 +419,8 @@ def search_query(query, offset, limit, category): for _ in itertools.count(1): for t in types: if t not in types_done: - search_results = search_query(query, offset, limit, t) + search_type = 'videos' if t == 'clips' else t # clips use 'videos' type + search_results = search_query(query, offset, limit, search_type) if not search_results.get('data'): self.to_screen(f'{query}: {offset} {t} found') types_done.append(t) @@ -491,17 +503,17 @@ def _real_extract(self, url): 'webpage_url': video['gif']['url'], }) info = traverse_obj(data, { - 'id': ('story_id', {str_or_none}), + 'id': ('story_id', {lambda x: x or slug}), 'title': ('title', {str_or_none}), 'description': ('description', {str_or_none}), 'tags': ('tags', {list}), 'thumbnails': ('cover_gif', 'gif', 'images', {dict}, {lambda x: self._extract_formats(x, is_still=True)}), - 'upload_date': (('create_datetime', 'publish_datetime'), {str_or_none}, + 'upload_date': (('create_datetime', 'publish_datetime'), {lambda x: x[:10].replace('-', '') if x else None}), 'uploader': ('user', ('display_name', 'username'), {str_or_none}), 'uploader_id': ('user', 'username', {str_or_none}), - 'uploader_url': ('user', ('profile_url', 'website_url'), {str_or_none}, - {lambda x: f'https://giphy.com{x}' if x[0] == '/' else url_or_none(x)}), + 'uploader_url': ('user', ('profile_url', 'website_url'), + {lambda x: f'https://giphy.com{x}' if x and x[0] == '/' else url_or_none(x)}), }, get_all=False) return { From 5ce0b127b2a176dd161e3367235376e3237133db Mon Sep 17 00:00:00 2001 From: kclauhk <78251477+kclauhk@users.noreply.github.com> Date: Fri, 9 Aug 2024 22:39:55 +0800 Subject: [PATCH 4/6] update pattern --- yt_dlp/extractor/_extractors.py | 7 +++++++ yt_dlp/extractor/giphy.py | 8 +++++--- 2 files changed, 12 insertions(+), 3 deletions(-) diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py index 9b73fcd75..16a4dabd8 100644 --- a/yt_dlp/extractor/_extractors.py +++ b/yt_dlp/extractor/_extractors.py @@ -738,6 +738,13 @@ GettrStreamingIE, ) from .giantbomb import GiantBombIE +from .giphy import ( + GiphyChannelIE, + GiphyIE, + GiphySearchIE, + GiphySearchURLIE, + GiphyStoriesIE, +) from .glide import GlideIE from .globalplayer import ( GlobalPlayerAudioEpisodeIE, diff --git a/yt_dlp/extractor/giphy.py b/yt_dlp/extractor/giphy.py index bb13f2b4c..0751a46f3 100644 --- a/yt_dlp/extractor/giphy.py +++ b/yt_dlp/extractor/giphy.py @@ -1,4 +1,5 @@ import itertools +import re from .common import InfoExtractor, SearchInfoExtractor from ..utils import ( @@ -215,7 +216,7 @@ def _real_extract(self, url): # search for: \"gif\":{\"type\":\"...}, if json_str := self._html_search_regex(r'\\"\w+\\":({\\"type\\":\\"(?!emoji).*?is_dynamic\\":\w+}),', webpage, 'video_data', default=None): - gif_data = self._parse_json(json_str.encode('utf-8').decode('unicode_escape'), video_id) + gif_data = self._parse_json(json_str.replace(r'\"', '"'), video_id) # search for: gif: {"...}, elif json_str := self._html_search_regex(r'\s+\w+:\s*({".*?}),\n\s+', webpage, 'video_data', default='{}'): gif_data = self._parse_json(json_str, video_id) @@ -253,9 +254,10 @@ def _real_extract(self, url): if data := gif_data.get('user'): if isinstance(data, str): idx = data.replace('$', '') - if json_str := self._html_search_regex(rf'"{idx}:({{.*?}})\\n"]\)', + if json_str := self._html_search_regex(rf'\\n{idx}:({{.*?}})\\n\w+:', webpage, 'uploader_data', default=None): - data = self._parse_json(json_str.encode('utf-8').decode('unicode_escape'), video_id, fatal=False) + json_str = re.sub(r'"\]\)self\.__next_f\.push\(\[\d+,"', '', json_str).replace(r'\"', '"') + data = self._parse_json(json_str, video_id, fatal=False) if isinstance(data, dict): uploader = traverse_obj(data, { 'uploader': (('display_name', 'name', 'attribution_display_name', 'username'), From e60917c379783e88b49ba056612b0270730e4669 Mon Sep 17 00:00:00 2001 From: kclauhk <78251477+kclauhk@users.noreply.github.com> Date: Thu, 15 Aug 2024 00:27:07 +0800 Subject: [PATCH 5/6] Update _extractors.py Co-Authored-By: bashonly <88596187+bashonly@users.noreply.github.com> --- yt_dlp/extractor/_extractors.py | 1 + 1 file changed, 1 insertion(+) diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py index 16a4dabd8..a5303452f 100644 --- a/yt_dlp/extractor/_extractors.py +++ b/yt_dlp/extractor/_extractors.py @@ -740,6 +740,7 @@ from .giantbomb import GiantBombIE from .giphy import ( GiphyChannelIE, + GiphyChannelPageIE, GiphyIE, GiphySearchIE, GiphySearchURLIE, From 477b64b15330e183bae34ebcc73647dabba10341 Mon Sep 17 00:00:00 2001 From: kclauhk <78251477+kclauhk@users.noreply.github.com> Date: Sat, 7 Sep 2024 17:20:05 +0800 Subject: [PATCH 6/6] fix entries' `webpage_url_basename` --- yt_dlp/extractor/giphy.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/yt_dlp/extractor/giphy.py b/yt_dlp/extractor/giphy.py index 0751a46f3..fd76ee959 100644 --- a/yt_dlp/extractor/giphy.py +++ b/yt_dlp/extractor/giphy.py @@ -8,6 +8,7 @@ merge_dicts, str_or_none, traverse_obj, + url_basename, url_or_none, ) @@ -110,6 +111,7 @@ def _api_channel_feed(self, channel_id): yield { **self._extract_info(video, video['id']), 'webpage_url': video['url'], + 'webpage_url_basename': url_basename(video['url']), } count += len(search_results.get('results')) if count >= (int_or_none(self.get_param('playlistend')) or (self._GIPHY_MAX + 1)): @@ -169,12 +171,12 @@ class GiphyIE(GiphyBaseIE): 'ext': 'mp4', 'title': 'You Can\'t Break Up With Me', 'description': 'South Park, Season 20, Episode 4, Wieners Out', - 'tags': 'count:16', + 'tags': 'count:17', 'thumbnail': r're:^https?://.*', 'upload_date': '20220516', 'uploader': 'South Park', 'uploader_id': 'southpark', - 'uploader_url': 'https://giphy.com/southpark', + 'uploader_url': 'https://giphy.com/southpark/', }, }, { 'url': 'https://giphy.com/stickers/mario-PFxFYEZNUavG8', @@ -431,6 +433,7 @@ def search_query(query, offset, limit, category): yield { **self._extract_info(video, video['id']), 'webpage_url': video['url'], + 'webpage_url_basename': url_basename(video['url']), } if len(search_results['data']) < limit: total = offset + len(search_results['data']) @@ -503,6 +506,7 @@ def _real_extract(self, url): **self._extract_info(video['gif'], video['gif']['id']), 'description': video['caption'], 'webpage_url': video['gif']['url'], + 'webpage_url_basename': url_basename(video['gif']['url']), }) info = traverse_obj(data, { 'id': ('story_id', {lambda x: x or slug}),