diff --git a/yt_dlp/extractor/dreisat.py b/yt_dlp/extractor/dreisat.py index 4b0a269b9..376ff672d 100644 --- a/yt_dlp/extractor/dreisat.py +++ b/yt_dlp/extractor/dreisat.py @@ -1,10 +1,24 @@ -from .zdf import ZDFIE +from .zdf import ZDFBaseIE -class DreiSatIE(ZDFIE): # XXX: Do not subclass from concrete IE +class DreiSatIE(ZDFBaseIE): IE_NAME = '3sat' _VALID_URL = r'https?://(?:www\.)?3sat\.de/(?:[^/]+/)*(?P[^/?#&]+)\.html' _TESTS = [{ + 'url': 'https://www.3sat.de/dokumentation/reise/traumziele-suedostasiens-die-philippinen-und-vietnam-102.html', + 'info_dict': { + 'id': '231124_traumziele_philippinen_und_vietnam_dokreise', + 'ext': 'mp4', + 'title': 'Traumziele Südostasiens (1/2): Die Philippinen und Vietnam', + 'description': 'md5:26329ce5197775b596773b939354079d', + 'duration': 2625.0, + 'thumbnail': 'https://www.3sat.de/assets/traumziele-suedostasiens-die-philippinen-und-vietnam-100~2400x1350?cb=1699870351148', + 'episode': 'Traumziele Südostasiens (1/2): Die Philippinen und Vietnam', + 'episode_id': 'POS_cc7ff51c-98cf-4d12-b99d-f7a551de1c95', + 'timestamp': 1738593000, + 'upload_date': '20250203', + }, + }, { # Same as https://www.zdf.de/dokumentation/ab-18/10-wochen-sommer-102.html 'url': 'https://www.3sat.de/film/ab-18/10-wochen-sommer-108.html', 'md5': '0aff3e7bc72c8813f5e0fae333316a1d', @@ -17,6 +31,7 @@ class DreiSatIE(ZDFIE): # XXX: Do not subclass from concrete IE 'timestamp': 1608604200, 'upload_date': '20201222', }, + 'skip': '410 Gone', }, { 'url': 'https://www.3sat.de/gesellschaft/schweizweit/waidmannsheil-100.html', 'info_dict': { @@ -30,6 +45,7 @@ class DreiSatIE(ZDFIE): # XXX: Do not subclass from concrete IE 'params': { 'skip_download': True, }, + 'skip': '404 Not Found', }, { # Same as https://www.zdf.de/filme/filme-sonstige/der-hauptmann-112.html 'url': 'https://www.3sat.de/film/spielfilm/der-hauptmann-100.html', @@ -39,3 +55,14 @@ class DreiSatIE(ZDFIE): # XXX: Do not subclass from concrete IE 'url': 'https://www.3sat.de/wissen/nano/nano-21-mai-2019-102.html', 'only_matching': True, }] + + def _real_extract(self, url): + video_id = self._match_id(url) + + webpage = self._download_webpage(url, video_id, fatal=False) + if webpage: + player = self._extract_player(webpage, url, fatal=False) + if player: + return self._extract_regular(url, player, video_id) + + return self._extract_mobile(video_id) diff --git a/yt_dlp/extractor/zdf.py b/yt_dlp/extractor/zdf.py index b64a88f6c..4bef1017c 100644 --- a/yt_dlp/extractor/zdf.py +++ b/yt_dlp/extractor/zdf.py @@ -137,6 +137,116 @@ def _extract_player(self, webpage, video_id, fatal=True): group='json'), video_id) + def _extract_entry(self, url, player, content, video_id): + title = content.get('title') or content['teaserHeadline'] + + t = content['mainVideoContent']['http://zdf.de/rels/target'] + ptmd_path = traverse_obj(t, ( + (('streams', 'default'), None), + ('http://zdf.de/rels/streams/ptmd', 'http://zdf.de/rels/streams/ptmd-template'), + ), get_all=False) + if not ptmd_path: + raise ExtractorError('Could not extract ptmd_path') + + info = self._extract_ptmd( + urljoin(url, ptmd_path.replace('{playerId}', 'android_native_5')), video_id, player['apiToken'], url) + + thumbnails = [] + layouts = try_get( + content, lambda x: x['teaserImageRef']['layouts'], dict) + if layouts: + for layout_key, layout_url in layouts.items(): + layout_url = url_or_none(layout_url) + if not layout_url: + continue + thumbnail = { + 'url': layout_url, + 'format_id': layout_key, + } + mobj = re.search(r'(?P\d+)x(?P\d+)', layout_key) + if mobj: + thumbnail.update({ + 'width': int(mobj.group('width')), + 'height': int(mobj.group('height')), + }) + thumbnails.append(thumbnail) + + chapter_marks = t.get('streamAnchorTag') or [] + chapter_marks.append({'anchorOffset': int_or_none(t.get('duration'))}) + chapters = [{ + 'start_time': chap.get('anchorOffset'), + 'end_time': next_chap.get('anchorOffset'), + 'title': chap.get('anchorLabel'), + } for chap, next_chap in zip(chapter_marks, chapter_marks[1:])] + + return merge_dicts(info, { + 'title': title, + 'description': content.get('leadParagraph') or content.get('teasertext'), + 'duration': int_or_none(t.get('duration')), + 'timestamp': unified_timestamp(content.get('editorialDate')), + 'thumbnails': thumbnails, + 'chapters': chapters or None, + 'episode': title, + **traverse_obj(content, ('programmeItem', 0, 'http://zdf.de/rels/target', { + 'series_id': ('http://zdf.de/rels/cmdm/series', 'seriesUuid', {str}), + 'series': ('http://zdf.de/rels/cmdm/series', 'seriesTitle', {str}), + 'season': ('http://zdf.de/rels/cmdm/season', 'seasonTitle', {str}), + 'season_number': ('http://zdf.de/rels/cmdm/season', 'seasonNumber', {int_or_none}), + 'season_id': ('http://zdf.de/rels/cmdm/season', 'seasonUuid', {str}), + 'episode_number': ('episodeNumber', {int_or_none}), + 'episode_id': ('contentId', {str}), + })), + }) + + def _extract_regular(self, url, player, video_id, query=None): + player_url = player['content'] + + content = self._call_api( + update_url_query(player_url, query), + video_id, 'content', player['apiToken'], url) + + return self._extract_entry(player_url, player, content, video_id) + + def _extract_mobile(self, video_id): + video = self._download_v2_doc(video_id) + + formats = [] + formitaeten = try_get(video, lambda x: x['document']['formitaeten'], list) + document = formitaeten and video['document'] + if formitaeten: + title = document['titel'] + content_id = document['basename'] + + format_urls = set() + for f in formitaeten or []: + self._extract_format(content_id, formats, format_urls, f) + + thumbnails = [] + teaser_bild = document.get('teaserBild') + if isinstance(teaser_bild, dict): + for thumbnail_key, thumbnail in teaser_bild.items(): + thumbnail_url = try_get( + thumbnail, lambda x: x['url'], str) + if thumbnail_url: + thumbnails.append({ + 'url': thumbnail_url, + 'id': thumbnail_key, + 'width': int_or_none(thumbnail.get('width')), + 'height': int_or_none(thumbnail.get('height')), + }) + + return { + 'id': content_id, + 'title': title, + 'description': document.get('beschreibung'), + 'duration': int_or_none(document.get('length')), + 'timestamp': unified_timestamp(document.get('date')) or unified_timestamp( + try_get(video, lambda x: x['meta']['editorialDate'], str)), + 'thumbnails': thumbnails, + 'subtitles': self._extract_subtitles(document), + 'formats': formats, + } + class ZDFIE(ZDFBaseIE): _VALID_URL = r'https?://www\.zdf\.de/(?:[^/]+/)*(?P[^/?#&]+)\.html' @@ -306,121 +416,6 @@ class ZDFIE(ZDFBaseIE): }, }] - def _extract_entry(self, url, player, content, video_id): - title = content.get('title') or content['teaserHeadline'] - - t = content['mainVideoContent']['http://zdf.de/rels/target'] - ptmd_path = traverse_obj(t, ( - (('streams', 'default'), None), - ('http://zdf.de/rels/streams/ptmd', 'http://zdf.de/rels/streams/ptmd-template'), - ), get_all=False) - if not ptmd_path: - raise ExtractorError('Could not extract ptmd_path') - - info = self._extract_ptmd( - urljoin(url, ptmd_path.replace('{playerId}', 'android_native_5')), video_id, player['apiToken'], url) - - thumbnails = [] - layouts = try_get( - content, lambda x: x['teaserImageRef']['layouts'], dict) - if layouts: - for layout_key, layout_url in layouts.items(): - layout_url = url_or_none(layout_url) - if not layout_url: - continue - thumbnail = { - 'url': layout_url, - 'format_id': layout_key, - } - mobj = re.search(r'(?P\d+)x(?P\d+)', layout_key) - if mobj: - thumbnail.update({ - 'width': int(mobj.group('width')), - 'height': int(mobj.group('height')), - }) - thumbnails.append(thumbnail) - - chapter_marks = t.get('streamAnchorTag') or [] - chapter_marks.append({'anchorOffset': int_or_none(t.get('duration'))}) - chapters = [{ - 'start_time': chap.get('anchorOffset'), - 'end_time': next_chap.get('anchorOffset'), - 'title': chap.get('anchorLabel'), - } for chap, next_chap in zip(chapter_marks, chapter_marks[1:])] - - return merge_dicts(info, { - 'title': title, - 'description': content.get('leadParagraph') or content.get('teasertext'), - 'duration': int_or_none(t.get('duration')), - 'timestamp': unified_timestamp(content.get('editorialDate')), - 'thumbnails': thumbnails, - 'chapters': chapters or None, - 'episode': title, - **traverse_obj(content, ('programmeItem', 0, 'http://zdf.de/rels/target', { - 'series_id': ('http://zdf.de/rels/cmdm/series', 'seriesUuid', {str}), - 'series': ('http://zdf.de/rels/cmdm/series', 'seriesTitle', {str}), - 'season': ('http://zdf.de/rels/cmdm/season', 'seasonTitle', {str}), - 'season_number': ('http://zdf.de/rels/cmdm/season', 'seasonNumber', {int_or_none}), - 'season_id': ('http://zdf.de/rels/cmdm/season', 'seasonUuid', {str}), - 'episode_number': ('episodeNumber', {int_or_none}), - 'episode_id': ('contentId', {str}), - })), - }) - - def _extract_regular(self, url, player, video_id): - player_url = player['content'] - - try: - content = self._call_api( - update_url_query(player_url, {'profile': 'player-3'}), - video_id, 'content', player['apiToken'], url) - except ExtractorError as e: - self.report_warning(f'{video_id}: {e.orig_msg}; retrying with v2 profile') - content = self._call_api( - player_url, video_id, 'content', player['apiToken'], url) - - return self._extract_entry(player_url, player, content, video_id) - - def _extract_mobile(self, video_id): - video = self._download_v2_doc(video_id) - - formats = [] - formitaeten = try_get(video, lambda x: x['document']['formitaeten'], list) - document = formitaeten and video['document'] - if formitaeten: - title = document['titel'] - content_id = document['basename'] - - format_urls = set() - for f in formitaeten or []: - self._extract_format(content_id, formats, format_urls, f) - - thumbnails = [] - teaser_bild = document.get('teaserBild') - if isinstance(teaser_bild, dict): - for thumbnail_key, thumbnail in teaser_bild.items(): - thumbnail_url = try_get( - thumbnail, lambda x: x['url'], str) - if thumbnail_url: - thumbnails.append({ - 'url': thumbnail_url, - 'id': thumbnail_key, - 'width': int_or_none(thumbnail.get('width')), - 'height': int_or_none(thumbnail.get('height')), - }) - - return { - 'id': content_id, - 'title': title, - 'description': document.get('beschreibung'), - 'duration': int_or_none(document.get('length')), - 'timestamp': unified_timestamp(document.get('date')) or unified_timestamp( - try_get(video, lambda x: x['meta']['editorialDate'], str)), - 'thumbnails': thumbnails, - 'subtitles': self._extract_subtitles(document), - 'formats': formats, - } - def _real_extract(self, url): video_id = self._match_id(url) @@ -428,7 +423,7 @@ def _real_extract(self, url): if webpage: player = self._extract_player(webpage, url, fatal=False) if player: - return self._extract_regular(url, player, video_id) + return self._extract_regular(url, player, video_id, query={'profile': 'player-3'}) return self._extract_mobile(video_id) @@ -474,7 +469,8 @@ def _extract_entry(self, entry): 'title': ('titel', {str}), 'description': ('beschreibung', {str}), 'duration': ('length', {float_or_none}), - # TODO: seasonNumber and episodeNumber can be extracted but need to also be in ZDFIE + 'season_number': ('seasonNumber', {int_or_none}), + 'episode_number': ('episodeNumber', {int_or_none}), })) def _entries(self, data, document_id):