1
0
Fork 0
mirror of https://github.com/yt-dlp/yt-dlp.git synced 2025-05-22 18:05:46 -05:00

[ie/wsj] Now needs impersonation + cover more urls

This commit is contained in:
Refael Ackermann 2025-02-21 13:11:59 -05:00
parent a3e0c7d3b2
commit 41923bc673

View file

@ -1,3 +1,5 @@
import re
from .common import InfoExtractor from .common import InfoExtractor
from ..utils import ( from ..utils import (
float_or_none, float_or_none,
@ -100,8 +102,13 @@ def _real_extract(self, url):
class WSJArticleIE(InfoExtractor): class WSJArticleIE(InfoExtractor):
_VALID_URL = r'(?i)https?://(?:www\.)?wsj\.com/articles/(?P<id>[^/?#&]+)' _VALID_URL = r'(?i)https?://(?:www\.)?wsj\.com/(?:articles|opinion)/(?P<id>[^/?#&]+)'
_TEST = { _WEBPAGE_RE = (
re.compile(r'(?:id=["\']video|video-|iframe\.html\?guid=|data-src=["\'])([a-fA-F0-9-]{36})'),
)
# WSJ now sits behind CloudFront's bot wall, so this works only if the cookie jar is already populated ("hydrated")
_TESTS = [{
'url': 'https://www.wsj.com/articles/dont-like-china-no-pandas-for-you-1490366939?', 'url': 'https://www.wsj.com/articles/dont-like-china-no-pandas-for-you-1490366939?',
'info_dict': { 'info_dict': {
'id': '4B13FA62-1D8C-45DB-8EA1-4105CB20B362', 'id': '4B13FA62-1D8C-45DB-8EA1-4105CB20B362',
@ -110,12 +117,19 @@ class WSJArticleIE(InfoExtractor):
'uploader_id': 'ralcaraz', 'uploader_id': 'ralcaraz',
'title': 'Bao Bao the Panda Leaves for China', 'title': 'Bao Bao the Panda Leaves for China',
}, },
} }, {
'url': 'https://www.wsj.com/opinion/hamas-hostages-caskets-bibas-family-israel-gaza-29da083b',
'info_dict': {
'id': 'CE68D629-8DB8-4CD3-B30A-92112C102054',
'ext': 'mp4',
'upload_date': '20241007',
'uploader_id': 'Tinnes, David',
'title': 'WSJ Opinion: "Get the Jew": The Crown Heights Riot Revisited',
},
}]
def _real_extract(self, url): def _real_extract(self, url):
article_id = self._match_id(url) article_id = self._match_id(url)
webpage = self._download_webpage(url, article_id) webpage = self._download_webpage(url, article_id, impersonate=True)
video_id = self._search_regex( video_id = self._search_regex(self._WEBPAGE_RE, webpage, 'video id')
r'(?:id=["\']video|video-|iframe\.html\?guid=|data-src=["\'])([a-fA-F0-9-]{36})',
webpage, 'video id')
return self.url_result(f'wsj:{video_id}', WSJIE.ie_key(), video_id) return self.url_result(f'wsj:{video_id}', WSJIE.ie_key(), video_id)