mirror of
https://github.com/yt-dlp/yt-dlp.git
synced 2025-03-09 12:50:23 -05:00
[ie/Voicy] Rework extractor
This commit is contained in:
parent
0bb3978862
commit
728334a40e
2 changed files with 354 additions and 122 deletions
|
@ -2389,6 +2389,8 @@
|
|||
from .voicy import (
|
||||
VoicyChannelIE,
|
||||
VoicyIE,
|
||||
VoicyLiveIE,
|
||||
VoicyTopicIE,
|
||||
)
|
||||
from .volejtv import VolejTVIE
|
||||
from .voxmedia import (
|
||||
|
|
|
@ -1,145 +1,375 @@
|
|||
import calendar
|
||||
import itertools
|
||||
import time
|
||||
import urllib.parse
|
||||
|
||||
from .common import InfoExtractor
|
||||
from .wrestleuniverse import WrestleUniverseBaseIE
|
||||
from ..utils import (
|
||||
ExtractorError,
|
||||
smuggle_url,
|
||||
float_or_none,
|
||||
merge_dicts,
|
||||
parse_iso8601,
|
||||
parse_qs,
|
||||
str_or_none,
|
||||
traverse_obj,
|
||||
unified_strdate,
|
||||
unsmuggle_url,
|
||||
update_url,
|
||||
url_or_none,
|
||||
)
|
||||
from ..utils.traversal import traverse_obj
|
||||
|
||||
|
||||
class VoicyBaseIE(InfoExtractor):
|
||||
def _extract_from_playlist_data(self, value):
    """Turn one playlist payload into a ``multi_video`` result.

    ``value`` is a mapping; ``VoiceData`` is the only key accessed with
    ``[]`` (so it must be present), everything else is read with ``.get``.
    Each element of ``VoiceData`` is converted by
    ``self._extract_single_article``.
    """
    playlist_id = str(value.get('PlaylistId'))
    published = unified_strdate(value.get('Published'), False)
    articles = list(map(self._extract_single_article, value['VoiceData']))

    result = {'_type': 'multi_video', 'entries': articles, 'id': playlist_id}
    result['title'] = str(value.get('PlaylistName'))
    # Speaker and channel identifiers go through str_or_none; the names are passed through untouched
    result['uploader'] = value.get('SpeakerName')
    result['uploader_id'] = str_or_none(value.get('SpeakerId'))
    result['channel'] = value.get('ChannelName')
    result['channel_id'] = str_or_none(value.get('ChannelId'))
    result['upload_date'] = published
    return result
|
||||
class VoicyBaseIE(WrestleUniverseBaseIE):
|
||||
_LOGIN_HEADERS = {
|
||||
'Content-Type': 'application/json',
|
||||
'X-Client-Version': 'Chrome/JsCore/10.13.2/FirebaseCore-web',
|
||||
'X-Firebase-Gmpid': '1:212371279501:web:318567ddcbb953adcc5cc4',
|
||||
}
|
||||
_LOGIN_HINT = (
|
||||
'Use --username refresh --password <refreshToken>, --username and --password, '
|
||||
'--netrc-cmd, or --netrc (voicy) to provide account credentials')
|
||||
_LOGIN_QUERY = {'key': 'AIzaSyC5Rg-sxiYu6ySD8V-f6Eljwll8gHvgUK4'}
|
||||
_NETRC_MACHINE = 'voicy'
|
||||
|
||||
def _extract_single_article(self, entry):
    """Convert one article payload into an info dict with its audio formats.

    ``entry`` is a mapping. ``VoiceHlsFile`` (HLS, AAC in m4a) and
    ``VoiceFile`` (plain mp3) each contribute one format when present;
    a missing or empty URL simply skips that format instead of raising
    ``KeyError`` and discarding the other, usable one.

    Returns a dict with ``id``, ``title``, ``description`` and ``formats``.
    ``id`` is ``None`` when ``ArticleId`` is absent (previously the literal
    string ``'None'`` leaked out of ``str(None)``).
    """
    formats = []

    hls_url = entry.get('VoiceHlsFile')
    if hls_url:
        formats.append({
            'url': hls_url,
            'format_id': 'hls',
            'ext': 'm4a',
            'acodec': 'aac',
            'vcodec': 'none',
            'protocol': 'm3u8_native',
        })

    mp3_url = entry.get('VoiceFile')
    if mp3_url:
        formats.append({
            'url': mp3_url,
            'format_id': 'mp3',
            'ext': 'mp3',
            'acodec': 'mp3',
            'vcodec': 'none',
        })

    article_id = entry.get('ArticleId')
    return {
        'id': None if article_id is None else str(article_id),
        'title': entry.get('ArticleTitle'),
        'description': entry.get('MediaName'),
        'formats': formats,
    }
|
||||
@WrestleUniverseBaseIE._TOKEN.getter
def _TOKEN(self):
    """Return the access token, refreshing it first when missing or expired.

    The cached ``_REAL_TOKEN`` is returned as-is while ``_TOKEN_EXPIRY`` is
    still in the future. Otherwise a refresh is attempted; without a
    ``_REFRESH_TOKEN`` a login-required error is reported first.
    """
    still_valid = self._REAL_TOKEN and self._TOKEN_EXPIRY > int(time.time())
    if still_valid:
        return self._REAL_TOKEN

    if not self._REFRESH_TOKEN:
        self.raise_login_required(
            f'No refreshToken provided. {self._LOGIN_HINT}', method=None)
    # NOTE(review): _refresh_token is defined outside this view; presumably it updates _REAL_TOKEN
    self._refresh_token()
    return self._REAL_TOKEN
|
||||
|
||||
def _call_api(self, url, video_id, **kwargs):
    """Download a JSON API response and unwrap its ``Value`` payload.

    ``kwargs`` are forwarded unchanged to ``self._download_json``.

    Returns ``response.get('Value')`` when ``Status`` equals 0.
    Raises ``ExtractorError`` for any other status, using the message found
    at ``Value -> Error -> Message`` when it is a string, else a generic
    message that embeds the status.
    """
    response = self._download_json(url, video_id, **kwargs)
    status = response.get('Status')
    if status != 0:
        message = traverse_obj(response, ('Value', 'Error', 'Message'), expected_type=str)
        if not message:
            # Formatting with '%d' raised TypeError when 'Status' was missing or
            # not a number, masking the ExtractorError we actually want to raise
            message = f'There was a error in the response: {status}'
        raise ExtractorError(message, expected=False)
    return response.get('Value')
|
||||
def _perform_login(self, username, password):
    """Log in, treating the username ``refresh`` (any case) as a token hand-over.

    With that username the password is stored as ``_REFRESH_TOKEN`` and the
    token is refreshed right away; any other username is delegated to the
    parent class's login.
    """
    if username.lower() != 'refresh':
        return super()._perform_login(username, password)

    self._REFRESH_TOKEN = password
    return self._refresh_token()
|
||||
|
||||
def _call_api(self, path, some_id, note='Downloading JSON metadata', headers=None, query=None, fatal=True):
    """Request ``path`` under the v1 player API and return the decoded JSON.

    A ``Bearer`` authorization header built from ``self._TOKEN`` is always
    sent; entries in ``headers`` are merged on top of it and win on key
    clashes. ``note``, ``query`` and ``fatal`` go straight to
    ``self._download_json``.
    """
    endpoint = f'https://vmedia-player-api.voicy.jp/v1/{path}'
    auth_header = {'Authorization': f'Bearer {self._TOKEN}'}
    request_headers = auth_header | (headers or {})
    return self._download_json(
        endpoint, some_id, note=note, headers=request_headers, query=query, fatal=fatal)
|
||||
|
||||
|
||||
class VoicyIE(VoicyBaseIE):
|
||||
_WORKING = False
|
||||
IE_NAME = 'voicy'
|
||||
_VALID_URL = r'https?://voicy\.jp/channel/(?P<channel_id>\d+)/(?P<id>\d+)'
|
||||
ARTICLE_LIST_API_URL = 'https://vmw.api.voicy.jp/articles_list?channel_id=%s&pid=%s'
|
||||
IE_DESC = 'Voicy'
|
||||
|
||||
_VALID_URL = [
|
||||
r'https?://(?:www\.)?voicy\.jp/channel/(?P<channel>\d+)/(?P<id>\d+)',
|
||||
r'http://r\.voicy\.jp/\w+',
|
||||
]
|
||||
_TESTS = [{
|
||||
'url': 'https://voicy.jp/channel/1253/122754',
|
||||
'url': 'https://voicy.jp/channel/3402/6361249',
|
||||
'info_dict': {
|
||||
'id': '122754',
|
||||
'title': '1/21(木)声日記:ついに原稿終わった!!',
|
||||
'uploader': 'ちょまど@ ITエンジニアなオタク',
|
||||
'uploader_id': '7339',
|
||||
'id': '8576738',
|
||||
'ext': 'm4a',
|
||||
'title': '2025.1.19「ブラジル沖の白石康次郎さん&息子の成人式を『1年』間違えたお母さん」',
|
||||
'categories': ['トーク', '声優・アナウンサー'],
|
||||
'channel': '安住紳一郎の日曜天国',
|
||||
'channel_id': '3402',
|
||||
'comment_count': int,
|
||||
'description': 'md5:f39bb238ff7661c3b7e8934f8578cf33',
|
||||
'display_id': '6361249',
|
||||
'duration': 1588.741,
|
||||
'like_count': int,
|
||||
'release_date': '20250119',
|
||||
'release_timestamp': 1737272164,
|
||||
'series': '2025.1.19「ブラジル沖の白石康次郎さん&息子の成人式を『1年』間違えたお母さん」',
|
||||
'series_id': '6361249',
|
||||
'thumbnail': r're:https://files\.voicy\.jp/img/speaker/.+$',
|
||||
'uploader': 'TBS RADIO',
|
||||
'uploader_id': '17328',
|
||||
'view_count': int,
|
||||
},
|
||||
'playlist_mincount': 9,
|
||||
}, {
|
||||
'url': 'https://voicy.jp/channel/3272/1141448',
|
||||
'info_dict': {
|
||||
'id': '2757390',
|
||||
'ext': 'mp3',
|
||||
'title': '5/3 お久しぶり雑談回',
|
||||
'categories': ['トーク', '声優・アナウンサー'],
|
||||
'channel': '松嵜麗のボイログ!',
|
||||
'channel_id': '3272',
|
||||
'comment_count': int,
|
||||
'description': 'md5:4dee911d23cf1eedeb49687881878119',
|
||||
'display_id': '1141448',
|
||||
'duration': 433.24,
|
||||
'like_count': int,
|
||||
'release_date': '20240502',
|
||||
'release_timestamp': 1714662728,
|
||||
'series': '最近のわたし',
|
||||
'series_id': '1141448',
|
||||
'tags': ['最近のマイブーム'],
|
||||
'thumbnail': r're:https://files\.voicy\.jp/img/speaker/.+$',
|
||||
'uploader': '声優・松嵜麗',
|
||||
'uploader_id': '16462',
|
||||
'view_count': int,
|
||||
},
|
||||
}, {
|
||||
'url': 'https://voicy.jp/channel/1417/6436213',
|
||||
'info_dict': {
|
||||
'id': '6436213',
|
||||
'title': '第100回 グリム兄弟「麦のほ」',
|
||||
},
|
||||
'playlist_count': 5,
|
||||
'skip': 'Only available for premium supporters',
|
||||
}, {
|
||||
'url': 'http://r.voicy.jp/7Qm2JbexmY6',
|
||||
'only_matching': True,
|
||||
}]
|
||||
|
||||
def _real_extract(self, url):
|
||||
mobj = self._match_valid_url(url)
|
||||
assert mobj
|
||||
voice_id = mobj.group('id')
|
||||
channel_id = mobj.group('channel_id')
|
||||
url, article_list = unsmuggle_url(url)
|
||||
if not article_list:
|
||||
article_list = self._call_api(self.ARTICLE_LIST_API_URL % (channel_id, voice_id), voice_id)
|
||||
return self._extract_from_playlist_data(article_list)
|
||||
if not url.startswith('https'):
|
||||
return self.url_result(update_url(url, scheme='https'))
|
||||
|
||||
|
||||
class VoicyChannelIE(VoicyBaseIE):
|
||||
_WORKING = False
|
||||
IE_NAME = 'voicy:channel'
|
||||
_VALID_URL = r'https?://voicy\.jp/channel/(?P<id>\d+)'
|
||||
PROGRAM_LIST_API_URL = 'https://vmw.api.voicy.jp/program_list/all?channel_id=%s&limit=20&public_type=3%s'
|
||||
_TESTS = [{
|
||||
'url': 'https://voicy.jp/channel/1253/',
|
||||
'info_dict': {
|
||||
'id': '7339',
|
||||
'title': 'ゆるふわ日常ラジオ #ちょまラジ',
|
||||
'uploader': 'ちょまど@ ITエンジニアなオタク',
|
||||
'uploader_id': '7339',
|
||||
},
|
||||
'playlist_mincount': 54,
|
||||
}]
|
||||
|
||||
@classmethod
def suitable(cls, url):
    """Accept ``url`` only when ``VoicyIE`` does not already claim it."""
    if VoicyIE.suitable(url):
        return False
    return super().suitable(url)
|
||||
|
||||
def _entries(self, channel_id):
    """Yield the raw playlist dicts of a channel, page by page.

    Each request URL is ``PROGRAM_LIST_API_URL`` filled with the channel ID
    and a cursor suffix; the cursor for the next page is built from the
    ``PlaylistId``, ``Published`` and ``PlayCount`` of the last item of the
    current page. Iteration stops at the first page without
    ``PlaylistData``.
    """
    pager = ''
    for count in itertools.count(1):
        article_list = self._call_api(
            self.PROGRAM_LIST_API_URL % (channel_id, pager), channel_id, note=f'Paging #{count}')
        # The API helper can return None (a response without a payload);
        # treat that as an empty page instead of failing on `.get`
        playlist_data = (article_list or {}).get('PlaylistData')
        if not playlist_data:
            break
        yield from playlist_data
        last = playlist_data[-1]
        # '%s' instead of '%d': a PlaylistId delivered as a string must not raise TypeError
        pager = '&pid=%s&p_date=%s&play_count=%s' % (last['PlaylistId'], last['Published'], last['PlayCount'])
|
||||
|
||||
def _real_extract(self, url):
|
||||
channel_id = self._match_id(url)
|
||||
articles = self._entries(channel_id)
|
||||
|
||||
first_article = next(articles, None)
|
||||
title = traverse_obj(first_article, ('ChannelName', ), expected_type=str)
|
||||
speaker_name = traverse_obj(first_article, ('SpeakerName', ), expected_type=str)
|
||||
if not title and speaker_name:
|
||||
title = f'Uploads from {speaker_name}'
|
||||
if not title:
|
||||
title = f'Uploads from channel ID {channel_id}'
|
||||
|
||||
articles = itertools.chain([first_article], articles) if first_article else articles
|
||||
|
||||
playlist = (
|
||||
self.url_result(smuggle_url('https://voicy.jp/channel/%s/%d' % (channel_id, value['PlaylistId']), value), VoicyIE.ie_key())
|
||||
for value in articles)
|
||||
return {
|
||||
'_type': 'playlist',
|
||||
'entries': playlist,
|
||||
'id': channel_id,
|
||||
'title': title,
|
||||
'channel': speaker_name,
|
||||
channel_id, audio_id = self._match_valid_url(url).groups()
|
||||
channel_info = {
|
||||
'channel_id': channel_id,
|
||||
**traverse_obj(self._call_api(f'channel/{channel_id}', channel_id), {
|
||||
'categories': ('category', ('name', ('subcategory', 'name'), {str})),
|
||||
'channel': ('name', {str}),
|
||||
'thumbnail': ('image', {url_or_none}),
|
||||
'uploader': ('personality', 'name', {str}),
|
||||
'uploader_id': ('personality', 'id', {str_or_none}),
|
||||
}),
|
||||
}
|
||||
|
||||
audio_info = self._call_api(f'channels/{channel_id}/stories/{audio_id}', audio_id)
|
||||
common_info = {
|
||||
'description': self._call_api(
|
||||
f'channels/{channel_id}/stories/{audio_id}/description', audio_id, fatal=False,
|
||||
).get('description'),
|
||||
'display_id': audio_id,
|
||||
'series': audio_info['name'],
|
||||
'series_id': audio_id,
|
||||
**traverse_obj(audio_info, {
|
||||
'comment_count': ('comment_count', {int}),
|
||||
'like_count': ('like_count', {int}),
|
||||
'release_timestamp': ('published', {parse_iso8601}),
|
||||
'tags': ('hashtags', ..., 'name', {str}),
|
||||
}),
|
||||
}
|
||||
|
||||
all_entries = traverse_obj(audio_info, ('chapters', ..., {
|
||||
'id': ('id', {str_or_none}),
|
||||
'title': ('name', {str}),
|
||||
'duration': ('voice', 'duration', {float_or_none(scale=1000)}),
|
||||
'manifest': ('voice', 'file', {url_or_none}),
|
||||
'view_count': ('play_count', {int}),
|
||||
}))
|
||||
|
||||
entries = []
|
||||
for entry in all_entries:
|
||||
if manifest := entry.pop('manifest', None):
|
||||
ext = 'm4a' if 'audio_hls_aac' in manifest else 'mp3'
|
||||
merged = merge_dicts(entry, channel_info, common_info, {
|
||||
'formats': self._extract_m3u8_formats(manifest, audio_id, ext),
|
||||
})
|
||||
if len(all_entries) == 1:
|
||||
return merged
|
||||
entries.append(merged)
|
||||
if not entries:
|
||||
self.raise_login_required(
|
||||
f'Premium(VIP) authentication required. {self._LOGIN_HINT}', method=None)
|
||||
|
||||
return self.playlist_result(entries, audio_id, audio_info['name'])
|
||||
|
||||
|
||||
# Extractor for Voicy live pages; only ended broadcasts with an archive are handled.
class VoicyLiveIE(VoicyBaseIE):
    IE_NAME = 'voicy:live'

    _VALID_URL = r'https?://(?:www\.)?voicy\.jp/channel/\d+/live/(?P<id>\d+)'
    _TESTS = [{
        'url': 'https://voicy.jp/channel/1417/live/4858078',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Resolve an ended livestream to its archive URL.

        Raises ``ExtractorError`` while the reported status is anything
        other than ``ended``. When the stream has ended but the response
        carries no valid ``archive -> share_url``, ``raise_no_formats`` is
        invoked and nothing is returned.
        """
        live_id = self._match_id(url)
        live_info = self._call_api(f'live/{live_id}', live_id, headers={'X-Platform': '3'})

        if live_info['status'] != 'ended':
            raise ExtractorError('WebRTC is not currently supported', expected=True)

        archive_url = traverse_obj(live_info, ('archive', 'share_url', {url_or_none}))
        if not archive_url:
            self.raise_no_formats(
                'This livestream has ended and no archive is available', expected=True)
            return None
        # The archive is an ordinary story page, so hand it over to VoicyIE
        return self.url_result(archive_url, VoicyIE)
|
||||
|
||||
|
||||
# Shared paging logic for the Voicy list-style extractors.
class VoicyPlaylistBaseIE(VoicyBaseIE):
    def _entries(self, path, some_id, query, keys, ie=VoicyIE):
        """Yield ``url_result`` entries for every ``share_url`` found under ``keys``.

        Pages are requested from ``path`` with ``page_size`` 100 and the
        current ``page_token`` (empty for the first page); ``query`` is
        merged on top of those two parameters. Paging continues while the
        response carries a string at ``pagination -> next_page_token``,
        sleeping one second between requests.
        """
        page_token = ''
        page_number = 1

        while True:
            params = {
                'page_size': '100',
                'page_token': page_token,
            } | (query or {})
            info = self._call_api(
                path, some_id, f'Downloading page {page_number}', query=params)

            for share_url in traverse_obj(info, (*keys, 'share_url', {url_or_none})):
                yield self.url_result(share_url, ie)

            page_token = traverse_obj(info, ('pagination', 'next_page_token', {str}))
            if not page_token:
                return
            self._sleep(1, some_id)
            page_number += 1
|
||||
|
||||
|
||||
# Extractor for Voicy channel listings (plain, /all, /premium and /backnumber/YYYYMM pages).
class VoicyChannelIE(VoicyPlaylistBaseIE):
    IE_NAME = 'voicy:channel'

    _VALID_URL = r'https?://(?:www\.)?voicy\.jp/channel/(?P<id>\d+)(?:/(?P<type>all|backnumber/\d+|premium))?(?:\?|$)'
    _TESTS = [{
        'url': 'https://voicy.jp/channel/3402',
        'info_dict': {
            'id': '3402',
            'title': '安住紳一郎の日曜天国',
        },
        'playlist_mincount': 107,
    }, {
        'url': 'https://voicy.jp/channel/1/premium',
        'info_dict': {
            'id': '1',
            'title': 'Voicy社長の頭の中',
        },
        'playlist_mincount': 145,
    }, {
        'url': 'https://voicy.jp/channel/2856/all',
        'info_dict': {
            'id': '2856',
            'title': 'そんなこんなで、茅原実里です',
        },
        'playlist_mincount': 62,
    }, {
        'url': 'https://voicy.jp/channel/3321/all?type=all&month=202412',
        'info_dict': {
            'id': '3321',
            'title': '海外安全チャンネル・りょーあん',
        },
        'playlist_count': 4,
    }, {
        'url': 'https://voicy.jp/channel/1417/backnumber/202501',
        'info_dict': {
            'id': '1417',
            'title': '繪ほんの中には 公式チャンネル',
        },
        'playlist_count': 5,
    }]

    def _real_extract(self, url):
        """Build the playlist of stories for a channel page.

        URL query parameters are forwarded to the ``stories`` endpoint
        (``type`` is renamed to ``filter_type``); ``channel_view_id`` and
        ``order`` are always set by the extractor. A ``premium`` page forces
        ``filter_type`` to ``premium``. Otherwise a ``month`` parameter or a
        ``backnumber/YYYYMM`` path is expanded into ``from``/``to`` bounds
        covering that calendar month at a +09:00 offset.
        """
        channel_id, page_kind = self._match_valid_url(url).groups()
        channel_info = self._call_api(f'channel/{channel_id}', channel_id)

        query = {}
        for key, values in parse_qs(url).items():
            if values:
                query['filter_type' if key == 'type' else key] = values[0]
        # Extractor-controlled parameters override anything taken from the URL
        query.update({'channel_view_id': channel_id, 'order': 'new'})

        if page_kind == 'premium':
            query['filter_type'] = 'premium'
        else:
            year_month = query.pop('month', None)
            if not year_month and page_kind not in ('all', None):
                # 'backnumber/YYYYMM' -> 'YYYYMM'
                year_month = page_kind.split('/')[-1]
            if year_month:
                year, month = int(year_month[:4]), int(year_month[4:])
                last_day = calendar.monthrange(year, month)[1]
                query['from'] = f'{year}-{month:02d}-01T00:00:00+09:00'
                query['to'] = f'{year}-{month:02d}-{last_day}T23:59:59+09:00'

        entries = self._entries('stories', channel_id, query, ('stories', ...))
        return self.playlist_result(entries, channel_id, channel_info['name'])
|
||||
|
||||
|
||||
# Extractor for Voicy topic-style listings: fixed topics, categories, hashtags, searches and follows.
class VoicyTopicIE(VoicyPlaylistBaseIE):
    IE_NAME = 'voicy:topic'

    _VALID_URL = r'https?://(?:www\.)?voicy\.jp/(?P<id>{})(?:/(?P<ctg_or_kwd>[\w%]+))?(?:/(?P<sub_ctg>[\w-]+))?'.format(
        '|'.join(('audiobook', 'category', 'follow(?:ing-paystory)?', 'hashtag', 'paystory', 'pickup', 'search', 'voicedrama')))
    _TESTS = [{
        'url': 'https://voicy.jp/audiobook',
        'info_dict': {
            'id': '111',
            'title': 'audiobook',
        },
        'playlist_mincount': 96,
    }, {
        'url': 'https://voicy.jp/category/talk/voiceactor-announcer',
        'info_dict': {
            'id': 'voiceactor-announcer',
            'title': '声優・アナウンサー',
        },
        'playlist_mincount': 70,
    }, {
        'url': 'https://voicy.jp/category/sports/all',
        'info_dict': {
            'id': 'sports',
            'title': 'スポーツ',
        },
        'playlist_mincount': 174,
    }, {
        'url': 'https://voicy.jp/hashtag/%E3%82%B9%E3%83%9E%E3%83%BC%E3%83%88%E5%AE%B6%E9%9B%BB',
        'info_dict': {
            'id': 'hashtag',
            'title': 'スマート家電',
        },
        'playlist_mincount': 14,
    }, {
        'url': 'https://voicy.jp/search/%E6%81%B5%E6%96%B9%E5%B7%BB%E3%81%8D',
        'info_dict': {
            'id': 'search',
            'title': '恵方巻き',
        },
        'playlist_mincount': 102,
    }, {
        'url': 'https://voicy.jp/follow',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Build a playlist for a topic, category, hashtag, search or follow page.

        The first path segment selects a ``(topic_id, is_story, ie)`` triple:
        ``is_story`` decides whether entries are stories (handled by
        ``VoicyIE``) or channels (handled by ``VoicyChannelIE``), and whether
        the ``story`` key is part of the traversal path.

        Raises ``ExtractorError`` (expected) when a category URL names no
        category, when the named category is not in the category listing,
        or when a hashtag/search URL carries no keyword.
        """
        topic = self._match_id(url)
        # NOTE(review): the numeric topic IDs are opaque API identifiers; their meaning is not visible here
        topic_id, is_story, ie = {
            'audiobook': ('111', False, VoicyChannelIE),
            'category': ('category', False, VoicyChannelIE),
            'follow': ('1', True, VoicyIE),
            'following-paystory': ('94', True, VoicyIE),
            'hashtag': ('hashtag', True, VoicyIE),
            'paystory': ('70', True, VoicyIE),
            'pickup': ('21', False, VoicyChannelIE),
            'search': ('search', True, VoicyIE),
            'voicedrama': ('93', False, VoicyChannelIE),
        }[topic]

        # `('story',) * is_story` is empty for channel listings and ('story',) for story listings
        keys = ('channels', ..., *('story',) * is_story)
        if topic == 'category':
            ctg, sub_ctg = self._match_valid_url(url).group('ctg_or_kwd', 'sub_ctg')
            # A URL without a sub-category segment addresses the top-level
            # category just like '/all'; previously `None != 'all'` sent it
            # down the sub-category branch with `category = None`
            has_sub = sub_ctg not in (None, 'all')
            category = topic_id = sub_ctg if has_sub else ctg
            if not category:
                raise ExtractorError('Invalid URL', expected=True)
            found = traverse_obj(self._call_api('channel/categories', None), (
                *((..., 'subcategories') if has_sub else ()),
                lambda _, v: v['view_id'] == category, ('id', 'name'), {str_or_none},
            ))
            # Exactly one (id, name) pair is expected; anything else used to
            # surface as a bare ValueError from tuple unpacking
            if len(found) != 2:
                raise ExtractorError(f'Unable to find category "{category}"', expected=True)
            category_id, topic = found
            path = f'channel/categories/{category_id}'
            query = {'exclude_story': 'true'}
        elif topic in ('hashtag', 'search'):
            keyword = self._match_valid_url(urllib.parse.unquote(url)).group('ctg_or_kwd')
            if not keyword:
                raise ExtractorError('Invalid URL', expected=True)
            path = 'search/channels/story'
            query = {
                'search_type': {
                    'hashtag': 'hashtag',
                    'search': 'words',
                }[topic],
                'words': keyword,
            }
            # The decoded keyword doubles as the playlist title
            topic = keyword
        elif topic_id == '1':
            path = 'user/me/channels/story/following'
            query = {'series_filter': '2'}
        else:
            path = f'topics/channels{"/story" * is_story}'
            query = {'topic_id': topic_id}
            # Topic responses wrap the channel list in one more level
            keys = (..., *keys)

        return self.playlist_result(self._entries(path, topic_id, query, keys, ie), topic_id, topic)
|
||||
|
|
Loading…
Reference in a new issue