[ie/digiteka] Try to fetch the video URL from the iframe content

2025-03-09 12:50:23 -05:00 · 2024-12-30 23:09:04 +01:00 · 2024-12-30 23:09:04 +01:00 · 1c2cc0dde8
commit 1c2cc0dde8
parent 0b6b7742c2
1 changed files with 25 additions and 38 deletions
--- a/yt_dlp/extractor/digiteka.py
+++ b/yt_dlp/extractor/digiteka.py
@@ -1,7 +1,6 @@
 from .common import InfoExtractor
 from ..utils import int_or_none
 class DigitekaIE(InfoExtractor):
    _VALID_URL = r'''(?x)
        https?://(?:www\.)?(?:digiteka\.net|ultimedia\.com)/
@@ -23,39 +22,11 @@ class DigitekaIE(InfoExtractor):
            )
            /id
        )/(?P<id>[\d+a-z]+)'''
-    _EMBED_REGEX = [r'<(?:iframe|script)[^>]+src=["\'](?P<url>(?:https?:)?//(?:www\.)?ultimedia\.com/deliver/(?:generic|musique)(?:/[^/]+)*/(?:src|article)/[\d+a-z]+)']
+    _EMBED_REGEX = [r'<(?:iframe|script)(?:(?!>)[\s\S])*(?:data-)?src=["\'](?P<url>(?:https?:)?//(?:www\.)?(?:digiteka\.net|ultimedia\.com)/deliver/(?P<embed_type>generic|musique)(?:/[^/]+)*/(?:src|article)/(?P<id>[\d+a-z]+))']
-    _TESTS = [{
+    _TESTS = [
-        # news
+        {'url': 'https://www.ultimedia.com/deliver/generic/iframe/mdtk/01747256/zone/60/src/x8smpxf'}, # direct url
-        'url': 'https://www.ultimedia.com/default/index/videogeneric/id/s8uk0r',
+        {'url': 'https://www.boursorama.com/bourse/actualites/le-retour-des-taux-negatifs-est-il-possible-169e3e0cf337df132285b41e124dc98e'} # from an embed
-        'md5': '276a0e49de58c7e85d32b057837952a2',
+    ]
        'info_dict': {
            'id': 's8uk0r',
            'ext': 'mp4',
            'title': 'Loi sur la fin de vie: le texte prévoit un renforcement des directives anticipées',
            'thumbnail': r're:^https?://.*\.jpg',
            'duration': 74,
            'upload_date': '20150317',
            'timestamp': 1426604939,
            'uploader_id': '3fszv',
        },
    }, {
        # music
        'url': 'https://www.ultimedia.com/default/index/videomusic/id/xvpfp8',
        'md5': '2ea3513813cf230605c7e2ffe7eca61c',
        'info_dict': {
            'id': 'xvpfp8',
            'ext': 'mp4',
            'title': 'Two - C\'est La Vie (clip)',
            'thumbnail': r're:^https?://.*\.jpg',
            'duration': 233,
            'upload_date': '20150224',
            'timestamp': 1424760500,
            'uploader_id': '3rfzk',
        },
    }, {
        'url': 'https://www.digiteka.net/deliver/generic/iframe/mdtk/01637594/src/lqm3kl/zone/1/showtitle/1/autoplay/yes',
        'only_matching': True,
    }]
    def _real_extract(self, url):
        mobj = self._match_valid_url(url)
@@ -68,18 +39,34 @@ def _real_extract(self, url):
            f'http://www.ultimedia.com/deliver/video?video={video_id}&topic={video_type}',
            video_id)
        yt_id = deliver_info.get('yt_id')
        if yt_id:
            return self.url_result(yt_id, 'Youtube')
        jwconf = deliver_info['jwconf']
        formats = []
        for source in jwconf['playlist'][0]['sources']:
-            formats.append({
+            if source['file'] is not False:
-                'url': source['file'],
+                formats.append({
-                'format_id': source.get('label'),
+                    'url': source['file'],
-            })
+                    'format_id': source.get('label'),
                })
        if len(formats) == 0:
            # The file URLs are no longer available directly from the JSON, but
            # they can be found in the iframe content.
            iframe_content = self._download_webpage(url, video_id)
            IFRAME_REGEX = '<meta property="og:video" content="(?P<url>.*)"/>'
            video_url = self._search_regex(IFRAME_REGEX, iframe_content, 'url')
            video_format = video_url.split('.')[-1]
        formats.append({
            'url': video_url,
            'ext': video_format,
        })
        title = deliver_info['title']
        thumbnail = jwconf.get('image')