From 2d02bdceeeb8e391d114d32e010f431712a80326 Mon Sep 17 00:00:00 2001
From: Patrick Robertson
Date: Tue, 4 Feb 2025 14:07:08 +0100
Subject: [PATCH] abstract facebook _extract_metadata method out from inside
 '_extract_from_url'

---
 yt_dlp/extractor/facebook.py | 126 +++++++++++++++++------------------
 1 file changed, 63 insertions(+), 63 deletions(-)

diff --git a/yt_dlp/extractor/facebook.py b/yt_dlp/extractor/facebook.py
index 24ecb0350..f4ffa0333 100644
--- a/yt_dlp/extractor/facebook.py
+++ b/yt_dlp/extractor/facebook.py
@@ -477,72 +477,72 @@ def _perform_login(self, username, password):
         except network_exceptions as err:
             self.report_warning(f'unable to log in: {err}')
             return
+
+    def _extract_metadata(self, webpage, video_id):
+        post_data = [self._parse_json(j, video_id, fatal=False) for j in re.findall(
+            r'data-sjs>({.*?ScheduledServerJS.*?})</script>', webpage)]
+        post = traverse_obj(post_data, (
+            ..., 'require', ..., ..., ..., '__bbox', 'require', ..., ..., ..., '__bbox', 'result', 'data'), expected_type=dict) or []
+        media = traverse_obj(post, (..., 'attachments', ..., lambda k, v: (
+            k == 'media' and str(v['id']) == video_id and v['__typename'] == 'Video')), expected_type=dict)
+        title = get_first(media, ('title', 'text'))
+        description = get_first(media, ('creation_story', 'comet_sections', 'message', 'story', 'message', 'text'))
+        page_title = title or self._html_search_regex((
+            r'<h2\s+[^>]*class="uiHeaderTitle"[^>]*>(?P<content>[^<]*)</h2>',
+            r'(?s)<span class="fbPhotosPhotoCaption".*?id="fbPhotoPageCaption"><span class="hasCaption">(?P<content>.*?)</span>',
+            self._meta_regex('og:title'), self._meta_regex('twitter:title'), r'<title>(?P<content>.+?)</title>',
+        ), webpage, 'title', default=None, group='content')
+        description = description or self._html_search_meta(
+            ['description', 'og:description', 'twitter:description'],
+            webpage, 'description', default=None)
+        uploader_data = (
+            get_first(media, ('owner', {dict}))
+            or get_first(post, ('video', 'creation_story', 'attachments', ..., 'media', lambda k, v: k == 'owner' and v['name']))
+            or get_first(post, (..., 'video', lambda k, v: k == 'owner' and v['name']))
+            or get_first(post, ('node', 'actors', ..., {dict}))
+            or get_first(post, ('event', 'event_creator', {dict}))
+            or get_first(post, ('video', 'creation_story', 'short_form_video_context', 'video_owner', {dict})) or {})
+        uploader = uploader_data.get('name') or (
+            clean_html(get_element_by_id('fbPhotoPageAuthorName', webpage))
+            or self._search_regex(
+                (r'ownerName\s*:\s*"([^"]+)"', *self._og_regexes('title')), webpage, 'uploader', fatal=False))
+        timestamp = int_or_none(self._search_regex(
+            r'<abbr[^>]+data-utime=["\'](\d+)', webpage,
+            'timestamp', default=None))
+        thumbnail = self._html_search_meta(
+            ['og:image', 'twitter:image'], webpage, 'thumbnail', default=None)
+        # some webpages contain unretrievable thumbnail urls
+        # like https://lookaside.fbsbx.com/lookaside/crawler/media/?media_id=10155168902769113&get_thumbnail=1
+        # in https://www.facebook.com/yaroslav.korpan/videos/1417995061575415/
+        if thumbnail and not re.search(r'\.(?:jpg|png)', thumbnail):
+            thumbnail = None
+        info_dict = {
+            'description': description,
+            'uploader': uploader,
+            'uploader_id': uploader_data.get('id'),
+            'timestamp': timestamp,
+            'thumbnail': thumbnail,
+            'view_count': parse_count(self._search_regex(
+                (r'\bviewCount\s*:\s*["\']([\d,.]+)', r'video_view_count["\']\s*:\s*(\d+)'),
+                webpage, 'view count', default=None)),
+            'concurrent_view_count': get_first(post, (
+                ('video', (..., ..., 'attachments', ..., 'media')), 'liveViewerCount', {int_or_none})),
+            **traverse_obj(post, (lambda _, v: video_id in v['url'], 'feedback', {
+                'like_count': ('likers', 'count', {int}),
+                'comment_count': ('total_comment_count', {int}),
+                'repost_count': ('share_count_reduced', {parse_count}),
+            }), get_all=False),
+        }
+
+        info_json_ld = self._search_json_ld(webpage, video_id, default={})
+        info_json_ld['title'] = (re.sub(r'\s*\|\s*Facebook$', '', title or info_json_ld.get('title') or page_title or '')
+                                 or (description or '').replace('\n', ' ') or f'Facebook video #{video_id}')
+        return merge_dicts(info_json_ld, info_dict)
 
     def _extract_from_url(self, url, video_id):
         webpage = self._download_webpage(
             url.replace('://m.facebook.com/', '://www.facebook.com/'), video_id)
 
-        def extract_metadata(webpage):
-            post_data = [self._parse_json(j, video_id, fatal=False) for j in re.findall(
-                r'data-sjs>({.*?ScheduledServerJS.*?})</script>', webpage)]
-            post = traverse_obj(post_data, (
-                ..., 'require', ..., ..., ..., '__bbox', 'require', ..., ..., ..., '__bbox', 'result', 'data'), expected_type=dict) or []
-            media = traverse_obj(post, (..., 'attachments', ..., lambda k, v: (
-                k == 'media' and str(v['id']) == video_id and v['__typename'] == 'Video')), expected_type=dict)
-            title = get_first(media, ('title', 'text'))
-            description = get_first(media, ('creation_story', 'comet_sections', 'message', 'story', 'message', 'text'))
-            page_title = title or self._html_search_regex((
-                r'<h2\s+[^>]*class="uiHeaderTitle"[^>]*>(?P<content>[^<]*)</h2>',
-                r'(?s)<span class="fbPhotosPhotoCaption".*?id="fbPhotoPageCaption"><span class="hasCaption">(?P<content>.*?)</span>',
-                self._meta_regex('og:title'), self._meta_regex('twitter:title'), r'<title>(?P<content>.+?)</title>',
-            ), webpage, 'title', default=None, group='content')
-            description = description or self._html_search_meta(
-                ['description', 'og:description', 'twitter:description'],
-                webpage, 'description', default=None)
-            uploader_data = (
-                get_first(media, ('owner', {dict}))
-                or get_first(post, ('video', 'creation_story', 'attachments', ..., 'media', lambda k, v: k == 'owner' and v['name']))
-                or get_first(post, (..., 'video', lambda k, v: k == 'owner' and v['name']))
-                or get_first(post, ('node', 'actors', ..., {dict}))
-                or get_first(post, ('event', 'event_creator', {dict}))
-                or get_first(post, ('video', 'creation_story', 'short_form_video_context', 'video_owner', {dict})) or {})
-            uploader = uploader_data.get('name') or (
-                clean_html(get_element_by_id('fbPhotoPageAuthorName', webpage))
-                or self._search_regex(
-                    (r'ownerName\s*:\s*"([^"]+)"', *self._og_regexes('title')), webpage, 'uploader', fatal=False))
-            timestamp = int_or_none(self._search_regex(
-                r'<abbr[^>]+data-utime=["\'](\d+)', webpage,
-                'timestamp', default=None))
-            thumbnail = self._html_search_meta(
-                ['og:image', 'twitter:image'], webpage, 'thumbnail', default=None)
-            # some webpages contain unretrievable thumbnail urls
-            # like https://lookaside.fbsbx.com/lookaside/crawler/media/?media_id=10155168902769113&get_thumbnail=1
-            # in https://www.facebook.com/yaroslav.korpan/videos/1417995061575415/
-            if thumbnail and not re.search(r'\.(?:jpg|png)', thumbnail):
-                thumbnail = None
-            info_dict = {
-                'description': description,
-                'uploader': uploader,
-                'uploader_id': uploader_data.get('id'),
-                'timestamp': timestamp,
-                'thumbnail': thumbnail,
-                'view_count': parse_count(self._search_regex(
-                    (r'\bviewCount\s*:\s*["\']([\d,.]+)', r'video_view_count["\']\s*:\s*(\d+)'),
-                    webpage, 'view count', default=None)),
-                'concurrent_view_count': get_first(post, (
-                    ('video', (..., ..., 'attachments', ..., 'media')), 'liveViewerCount', {int_or_none})),
-                **traverse_obj(post, (lambda _, v: video_id in v['url'], 'feedback', {
-                    'like_count': ('likers', 'count', {int}),
-                    'comment_count': ('total_comment_count', {int}),
-                    'repost_count': ('share_count_reduced', {parse_count}),
-                }), get_all=False),
-            }
-
-            info_json_ld = self._search_json_ld(webpage, video_id, default={})
-            info_json_ld['title'] = (re.sub(r'\s*\|\s*Facebook$', '', title or info_json_ld.get('title') or page_title or '')
-                                     or (description or '').replace('\n', ' ') or f'Facebook video #{video_id}')
-            return merge_dicts(info_json_ld, info_dict)
-
         video_data = None
 
         def extract_video_data(instances):
@@ -753,7 +753,7 @@ def parse_attachment(attachment, key='media'):
             return self.playlist_result(entries, video_id)
 
         video_info = entries[0] if entries else {'id': video_id}
-        webpage_info = extract_metadata(webpage)
+        webpage_info = self._extract_metadata(webpage, video_id)
         # honor precise duration in video info
         if video_info.get('duration'):
             webpage_info['duration'] = video_info['duration']
@@ -885,7 +885,7 @@ def parse_attachment(attachment, key='media'):
             'subtitles': subtitles,
         }
         process_formats(info_dict)
-        info_dict.update(extract_metadata(webpage))
+        info_dict.update(self._extract_metadata(webpage, video_id))
 
         return info_dict
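
Note (not part of the patch): promoting the nested extract_metadata closure to a
_extract_metadata method means it no longer captures video_id from
_extract_from_url's scope, so any code path that has a webpage and a video ID can
reuse it. A minimal sketch of such a call site, assuming a hypothetical subclass;
the FacebookClipIE name and its URL pattern are illustrative only and not part of
yt-dlp:

    from yt_dlp.extractor.facebook import FacebookIE


    class FacebookClipIE(FacebookIE):  # hypothetical subclass for illustration
        _VALID_URL = r'https?://(?:www\.)?facebook\.com/clips/(?P<id>\d+)'  # made-up pattern

        def _real_extract(self, url):
            video_id = self._match_id(url)
            webpage = self._download_webpage(url, video_id)
            # Reuse the shared metadata logic; before this patch the equivalent
            # code existed only as a closure inside _extract_from_url
            return {
                'id': video_id,
                **self._extract_metadata(webpage, video_id),
            }

A real extractor would still need to resolve formats; the sketch only shows that
the metadata logic is now callable outside _extract_from_url.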