]> jfr.im git - yt-dlp.git/blob - yt_dlp/extractor/voicy.py
[ie/orf:on] Improve extraction (#9677)
[yt-dlp.git] / yt_dlp / extractor / voicy.py
1 import itertools
2
3 from .common import InfoExtractor
4 from ..compat import compat_str
5 from ..utils import (
6 ExtractorError,
7 smuggle_url,
8 str_or_none,
9 traverse_obj,
10 unified_strdate,
11 unsmuggle_url,
12 )
13
14
class VoicyBaseIE(InfoExtractor):
    """Shared helpers for the Voicy extractors.

    Wraps the JSON API call and converts the dicts it returns into the
    info dicts handed back by the concrete extractors in this file.
    """

    def _extract_from_playlist_data(self, value):
        """Convert one playlist dict into a ``multi_video`` result.

        ``value`` must contain a ``VoiceData`` list (a missing key raises
        ``KeyError``); all other fields are read with ``.get()``.
        """
        upload_date = unified_strdate(value.get('Published'), False)
        items = [self._extract_single_article(voice_data) for voice_data in value['VoiceData']]
        return {
            '_type': 'multi_video',
            'entries': items,
            # str_or_none keeps a missing value as None; a plain str()
            # conversion would produce the literal text 'None'
            'id': str_or_none(value.get('PlaylistId')),
            'title': str_or_none(value.get('PlaylistName')),
            'uploader': value.get('SpeakerName'),
            'uploader_id': str_or_none(value.get('SpeakerId')),
            'channel': value.get('ChannelName'),
            'channel_id': str_or_none(value.get('ChannelId')),
            'upload_date': upload_date,
        }

    def _extract_single_article(self, entry):
        """Convert one ``VoiceData`` item into an info dict with its audio formats.

        A format is emitted only for the media URLs actually present in
        ``entry``, so an item lacking ``VoiceHlsFile`` or ``VoiceFile``
        yields fewer formats instead of raising ``KeyError``.
        """
        formats = []

        hls_url = entry.get('VoiceHlsFile')
        if hls_url:
            formats.append({
                'url': hls_url,
                'format_id': 'hls',
                'ext': 'm4a',
                'acodec': 'aac',
                'vcodec': 'none',
                'protocol': 'm3u8_native',
            })

        mp3_url = entry.get('VoiceFile')
        if mp3_url:
            formats.append({
                'url': mp3_url,
                'format_id': 'mp3',
                'ext': 'mp3',
                'acodec': 'mp3',
                'vcodec': 'none',
            })

        return {
            'id': str_or_none(entry.get('ArticleId')),
            'title': entry.get('ArticleTitle'),
            'description': entry.get('MediaName'),
            'formats': formats,
        }

    def _call_api(self, url, video_id, **kwargs):
        """Download JSON from ``url`` and return its ``Value`` member.

        Extra keyword arguments are forwarded to ``self._download_json``.

        Raises ``ExtractorError`` when the response's ``Status`` is not ``0``.
        The message is ``Value.Error.Message`` when that is a string,
        otherwise a generic message that includes the status.
        """
        response = self._download_json(url, video_id, **kwargs)
        status = response.get('Status')
        if status != 0:
            message = traverse_obj(response, ('Value', 'Error', 'Message'), expected_type=compat_str)
            if not message:
                # %s instead of %d: a missing Status is None, which %d
                # rejects with TypeError and would hide the real error
                message = 'There was a error in the response: %s' % status
            raise ExtractorError(message, expected=False)
        return response.get('Value')
62
63
class VoicyIE(VoicyBaseIE):
    """Extractor for a single Voicy playlist URL (``/channel/<channel_id>/<id>``)."""

    _WORKING = False
    IE_NAME = 'voicy'
    _VALID_URL = r'https?://voicy\.jp/channel/(?P<channel_id>\d+)/(?P<id>\d+)'
    # Filled with (channel_id, playlist id)
    ARTICLE_LIST_API_URL = 'https://vmw.api.voicy.jp/articles_list?channel_id=%s&pid=%s'
    _TESTS = [{
        'url': 'https://voicy.jp/channel/1253/122754',
        'info_dict': {
            'id': '122754',
            'title': '1/21(木)声日記:ついに原稿終わった!!',
            'uploader': 'ちょまど@ ITエンジニアなオタク',
            'uploader_id': '7339',
        },
        'playlist_mincount': 9,
    }]

    def _real_extract(self, url):
        """Return the ``multi_video`` result for the playlist at ``url``.

        Playlist data smuggled into ``url`` is used directly; the article
        list API is called only when no such data is attached.

        Raises ``ExtractorError`` if ``url`` does not match ``_VALID_URL``.
        """
        # Strip any smuggled payload first so the plain URL is what gets matched
        url, article_list = unsmuggle_url(url)
        mobj = self._match_valid_url(url)
        if not mobj:
            # Explicit raise rather than ``assert``, which is removed under ``python -O``
            raise ExtractorError('Invalid URL: %s' % url)
        voice_id, channel_id = mobj.group('id', 'channel_id')
        if not article_list:
            article_list = self._call_api(self.ARTICLE_LIST_API_URL % (channel_id, voice_id), voice_id)
        return self._extract_from_playlist_data(article_list)
89
90
class VoicyChannelIE(VoicyBaseIE):
    """Extractor for a Voicy channel URL; yields one ``VoicyIE`` entry per playlist."""

    _WORKING = False
    IE_NAME = 'voicy:channel'
    _VALID_URL = r'https?://voicy\.jp/channel/(?P<id>\d+)'
    # Filled with (channel_id, pager query suffix); the suffix is '' for the first page
    PROGRAM_LIST_API_URL = 'https://vmw.api.voicy.jp/program_list/all?channel_id=%s&limit=20&public_type=3%s'
    _TESTS = [{
        'url': 'https://voicy.jp/channel/1253/',
        'info_dict': {
            'id': '7339',
            'title': 'ゆるふわ日常ラジオ #ちょまラジ',
            'uploader': 'ちょまど@ ITエンジニアなオタク',
            'uploader_id': '7339',
        },
        'playlist_mincount': 54,
    }]

    @classmethod
    def suitable(cls, url):
        """Accept ``url`` only when ``VoicyIE`` does not claim it.

        The channel pattern is a prefix of the playlist pattern, so playlist
        URLs are explicitly left to ``VoicyIE``.
        """
        return not VoicyIE.suitable(url) and super().suitable(url)

    def _entries(self, channel_id):
        """Yield playlist dicts of the channel, requesting pages until one is empty."""
        pager = ''
        for count in itertools.count(1):
            article_list = self._call_api(
                self.PROGRAM_LIST_API_URL % (channel_id, pager), channel_id, note='Paging #%d' % count)
            # _call_api returns None when the response has no 'Value'
            playlist_data = (article_list or {}).get('PlaylistData')
            if not playlist_data:
                break
            yield from playlist_data
            # The next page is addressed by the last item of the current one
            last = playlist_data[-1]
            pager = '&pid=%d&p_date=%s&play_count=%s' % (last['PlaylistId'], last['Published'], last['PlayCount'])

    def _real_extract(self, url):
        """Return a lazy playlist of ``VoicyIE`` URL results for the channel.

        The title is the first playlist's ``ChannelName``, falling back to
        the speaker name and then to the channel ID. Each entry smuggles its
        playlist dict into the URL so ``VoicyIE`` can use it without calling
        the API again.
        """
        channel_id = self._match_id(url)
        articles = self._entries(channel_id)

        # Peek at the first playlist for channel-level metadata
        first_article = next(articles, None)
        title = traverse_obj(first_article, ('ChannelName', ), expected_type=compat_str)
        speaker_name = traverse_obj(first_article, ('SpeakerName', ), expected_type=compat_str)
        speaker_id = str_or_none((first_article or {}).get('SpeakerId'))
        if not title and speaker_name:
            title = 'Uploads from %s' % speaker_name
        if not title:
            title = 'Uploads from channel ID %s' % channel_id

        # Put the peeked item back in front of the remaining ones
        articles = itertools.chain([first_article], articles) if first_article else articles

        playlist = (
            self.url_result(
                smuggle_url('https://voicy.jp/channel/%s/%d' % (channel_id, value['PlaylistId']), value),
                VoicyIE.ie_key())
            for value in articles)
        return {
            '_type': 'playlist',
            'entries': playlist,
            'id': channel_id,
            'title': title,
            # Same SpeakerName/SpeakerId -> uploader/uploader_id mapping as the
            # per-playlist results; _TESTS above expects both fields
            'uploader': speaker_name,
            'uploader_id': speaker_id,
            # NOTE(review): 'channel' carries the speaker name here, while
            # VoicyBaseIE maps ChannelName to 'channel'; kept unchanged for
            # backward compatibility - confirm which is intended
            'channel': speaker_name,
            'channel_id': channel_id,
        }