]> jfr.im git - yt-dlp.git/blob - yt_dlp/extractor/prx.py
[ie/orf:on] Improve extraction (#9677)
[yt-dlp.git] / yt_dlp / extractor / prx.py
1 import itertools
2 from .common import InfoExtractor, SearchInfoExtractor
3 from ..utils import (
4 urljoin,
5 traverse_obj,
6 int_or_none,
7 mimetype2ext,
8 clean_html,
9 url_or_none,
10 unified_timestamp,
11 str_or_none,
12 )
13
14
# Shared helpers for the PRX extractors: CMS API access, parsing of the
# `_embedded`/`_links` sections of API responses and paginated entry generation.
class PRXBaseIE(InfoExtractor):
    # Template for the `_VALID_URL` of subclasses; `%s` receives the path pattern.
    # The dot in the host name is escaped so that it only matches a literal `.`
    PRX_BASE_URL_RE = r'https?://(?:(?:beta|listen)\.)?prx\.org/%s'

    def _call_api(self, item_id, path, query=None, fatal=True, note='Downloading CMS API JSON'):
        """
        Download the JSON document at `path`, resolved against the CMS API root
        @param item_id: Identifier passed to `_download_json` for this request
        @param path:    API path joined onto https://cms.prx.org/api/v1/
        """
        return self._download_json(
            urljoin('https://cms.prx.org/api/v1/', path), item_id, query=query, fatal=fatal, note=note)

    @staticmethod
    def _get_prx_embed_response(response, section):
        """Look up `_embedded` -> `prx:<section>` in an API response"""
        return traverse_obj(response, ('_embedded', f'prx:{section}'))

    @staticmethod
    def _extract_file_link(response):
        """Look up `_links` -> `enclosure` -> `href` and validate it with `url_or_none`"""
        return url_or_none(traverse_obj(
            response, ('_links', 'enclosure', 'href'), expected_type=str))

    @classmethod
    def _extract_image(cls, image_response):
        """
        Build a thumbnail dict from an embedded image object
        @returns: The thumbnail dict, or None if `image_response` is not a dict
                  or carries no usable enclosure URL
        """
        if not isinstance(image_response, dict):
            return None
        image_url = cls._extract_file_link(image_response)
        if not image_url:
            # A thumbnail entry without a URL is of no use to the caller
            return None
        return {
            'id': str_or_none(image_response.get('id')),
            'filesize': int_or_none(image_response.get('size')),
            'width': int_or_none(image_response.get('width')),
            'height': int_or_none(image_response.get('height')),
            'url': image_url,
        }

    @classmethod
    def _extract_base_info(cls, response):
        """
        Extract the fields common to stories, series and accounts
        @returns: Info dict, or None if `response` is not a dict or has no id
        """
        if not isinstance(response, dict):
            return None
        item_id = str_or_none(response.get('id'))
        if not item_id:
            return None
        thumbnail_dict = cls._extract_image(cls._get_prx_embed_response(response, 'image'))
        description = (
            clean_html(response.get('description'))
            or response.get('shortDescription'))
        return {
            'id': item_id,
            # Fall back to the id so that untitled items still get a title
            'title': response.get('title') or item_id,
            'thumbnails': [thumbnail_dict] if thumbnail_dict else None,
            'description': description,
            'release_timestamp': unified_timestamp(response.get('releasedAt')),
            'timestamp': unified_timestamp(response.get('createdAt')),
            'modified_timestamp': unified_timestamp(response.get('updatedAt')),
            'duration': int_or_none(response.get('duration')),
            'tags': response.get('tags'),
            'episode_number': int_or_none(response.get('episodeIdentifier')),
            'season_number': int_or_none(response.get('seasonIdentifier')),
        }

    @classmethod
    def _extract_series_info(cls, series_response):
        """
        Extract series metadata, including the channel fields of its embedded account
        @returns: Info dict, or None if no base info could be extracted
        """
        base_info = cls._extract_base_info(series_response)
        if not base_info:
            return None
        account_info = cls._extract_account_info(
            cls._get_prx_embed_response(series_response, 'account')) or {}
        return {
            **base_info,
            'channel_id': account_info.get('channel_id'),
            'channel_url': account_info.get('channel_url'),
            'channel': account_info.get('channel'),
            'series': base_info.get('title'),
            'series_id': base_info.get('id'),
        }

    @classmethod
    def _extract_account_info(cls, account_response):
        """
        Extract account metadata; the account `name` is used as title and channel
        @returns: Info dict, or None if no base info could be extracted
        """
        base_info = cls._extract_base_info(account_response)
        if not base_info:
            return None
        name = account_response.get('name')
        return {
            **base_info,
            # Keep the base title (which itself falls back to the id) when the
            # account has no name, instead of overwriting it with None
            'title': name or base_info['title'],
            'channel_id': base_info.get('id'),
            'channel_url': 'https://beta.prx.org/accounts/%s' % base_info.get('id'),
            'channel': name,
        }

    @classmethod
    def _extract_story_info(cls, story_response):
        """
        Extract story metadata together with the series and channel fields
        taken from its embedded series and account objects
        @returns: Info dict, or None if no base info could be extracted
        """
        base_info = cls._extract_base_info(story_response)
        if not base_info:
            return None
        series = cls._extract_series_info(
            cls._get_prx_embed_response(story_response, 'series')) or {}
        account = cls._extract_account_info(
            cls._get_prx_embed_response(story_response, 'account')) or {}
        return {
            **base_info,
            'series': series.get('series'),
            'series_id': series.get('series_id'),
            'channel_id': account.get('channel_id'),
            'channel_url': account.get('channel_url'),
            'channel': account.get('channel'),
        }

    def _entries(self, item_id, endpoint, entry_func, query=None):
        """
        Extract entries from paginated list API
        @param item_id:    Identifier used to label each page request
        @param endpoint:   API path of the list to page through
        @param entry_func: Function to generate entry from response item;
                           falsy results are skipped
        @param query:      Extra query parameters sent with every page request
        """
        total = 0
        for page in itertools.count(1):
            response = self._call_api(f'{item_id}: page {page}', endpoint, query={
                **(query or {}),
                'page': page,
                'per': 100,
            })
            items = self._get_prx_embed_response(response, 'items')
            if not response or not items:
                break

            yield from filter(None, map(entry_func, items))

            # Count the items actually received if the page does not report a count
            total += int_or_none(response.get('count'), default=len(items))
            available = int_or_none(response.get('total'))
            # Without a reported total there is no way to tell whether more
            # pages exist, so stop instead of failing on the missing key
            if available is None or total >= available:
                break

    def _story_playlist_entry(self, response):
        """Turn a story list item into a url-type entry handled by PRXStoryIE"""
        story = self._extract_story_info(response)
        if not story:
            return None
        story.update({
            '_type': 'url',
            'url': 'https://beta.prx.org/stories/%s' % story['id'],
            'ie_key': PRXStoryIE.ie_key(),
        })
        return story

    def _series_playlist_entry(self, response):
        """Turn a series list item into a url-type entry handled by PRXSeriesIE"""
        series = self._extract_series_info(response)
        if not series:
            return None
        series.update({
            '_type': 'url',
            'url': 'https://beta.prx.org/series/%s' % series['id'],
            'ie_key': PRXSeriesIE.ie_key(),
        })
        return series
159
160
# Extractor for single PRX stories. A story with exactly one audio piece is
# returned as a single item; otherwise a `multi_video` result is returned with
# one entry per audio piece.
class PRXStoryIE(PRXBaseIE):
    _VALID_URL = PRXBaseIE.PRX_BASE_URL_RE % r'stories/(?P<id>\d+)'

    _TESTS = [
        {
            # Story with season and episode details
            'url': 'https://beta.prx.org/stories/399200',
            'info_dict': {
                'id': '399200',
                'title': 'Fly Me To The Moon',
                'description': 'md5:43230168390b95d3322048d8a56bf2bb',
                'release_timestamp': 1640250000,
                'timestamp': 1640208972,
                'modified_timestamp': 1641318202,
                'duration': 1004,
                'tags': 'count:7',
                'episode_number': 8,
                'season_number': 5,
                'series': 'AirSpace',
                'series_id': '38057',
                'channel_id': '220986',
                'channel_url': 'https://beta.prx.org/accounts/220986',
                'channel': 'Air and Space Museum',
            },
            'playlist': [{
                'info_dict': {
                    'id': '399200_part1',
                    'title': 'Fly Me To The Moon',
                    'description': 'md5:43230168390b95d3322048d8a56bf2bb',
                    'release_timestamp': 1640250000,
                    'timestamp': 1640208972,
                    'modified_timestamp': 1641318202,
                    'duration': 530,
                    'tags': 'count:7',
                    'episode_number': 8,
                    'season_number': 5,
                    'series': 'AirSpace',
                    'series_id': '38057',
                    'channel_id': '220986',
                    'channel_url': 'https://beta.prx.org/accounts/220986',
                    'channel': 'Air and Space Museum',
                    'ext': 'mp3',
                    'upload_date': '20211222',
                    'episode': 'Episode 8',
                    'release_date': '20211223',
                    'season': 'Season 5',
                    'modified_date': '20220104',
                },
            }, {
                'info_dict': {
                    'id': '399200_part2',
                    'title': 'Fly Me To The Moon',
                    'description': 'md5:43230168390b95d3322048d8a56bf2bb',
                    'release_timestamp': 1640250000,
                    'timestamp': 1640208972,
                    'modified_timestamp': 1641318202,
                    'duration': 474,
                    'tags': 'count:7',
                    'episode_number': 8,
                    'season_number': 5,
                    'series': 'AirSpace',
                    'series_id': '38057',
                    'channel_id': '220986',
                    'channel_url': 'https://beta.prx.org/accounts/220986',
                    'channel': 'Air and Space Museum',
                    'ext': 'mp3',
                    'upload_date': '20211222',
                    'episode': 'Episode 8',
                    'release_date': '20211223',
                    'season': 'Season 5',
                    'modified_date': '20220104',
                },
            }],
        }, {
            # Story with only split audio
            'url': 'https://beta.prx.org/stories/326414',
            'info_dict': {
                'id': '326414',
                'title': 'Massachusetts v EPA',
                'description': 'md5:744fffba08f19f4deab69fa8d49d5816',
                'timestamp': 1592509124,
                'modified_timestamp': 1592510457,
                'duration': 3088,
                'tags': 'count:0',
                'series': 'Outside/In',
                'series_id': '36252',
                'channel_id': '206',
                'channel_url': 'https://beta.prx.org/accounts/206',
                'channel': 'New Hampshire Public Radio',
            },
            'playlist_count': 4,
        }, {
            # Story with single combined audio
            'url': 'https://beta.prx.org/stories/400404',
            'info_dict': {
                'id': '400404',
                'title': 'Cafe Chill (Episode 2022-01)',
                'thumbnails': 'count:1',
                'description': 'md5:9f1b5a3cbd64fb159d08c3baa31f1539',
                'timestamp': 1641233952,
                'modified_timestamp': 1641234248,
                'duration': 3540,
                'series': 'Café Chill',
                'series_id': '37762',
                'channel_id': '5767',
                'channel_url': 'https://beta.prx.org/accounts/5767',
                'channel': 'C89.5 - KNHC Seattle',
                'ext': 'mp3',
                'tags': 'count:0',
                'thumbnail': r're:https?://cms\.prx\.org/pub/\w+/0/web/story_image/767965/medium/Aurora_Over_Trees\.jpg',
                'upload_date': '20220103',
                'modified_date': '20220103',
            },
        }, {
            'url': 'https://listen.prx.org/stories/399200',
            'only_matching': True,
        },
    ]

    @staticmethod
    def _piece_sort_key(piece_response):
        """
        Sort key ordering audio pieces by their `position`

        Pieces whose position is missing or not an integer are placed after the
        positioned ones (keeping their relative order, since sorting is stable)
        rather than yielding a None key, which cannot be compared with ints.
        """
        position = int_or_none(piece_response.get('position'))
        return (position is None, position or 0)

    def _extract_audio_pieces(self, audio_response):
        """
        Build one audio-only format dict per item embedded in `audio_response`
        @returns: List of format dicts ordered by piece position; empty when
                  there are no embedded items
        """
        pieces = sorted(
            self._get_prx_embed_response(audio_response, 'items') or [],
            key=self._piece_sort_key)
        return [{
            'format_id': str_or_none(piece_response.get('id')),
            'format_note': str_or_none(piece_response.get('label')),
            'filesize': int_or_none(piece_response.get('size')),
            'duration': int_or_none(piece_response.get('duration')),
            'ext': mimetype2ext(piece_response.get('contentType')),
            # NOTE(review): scale=1000 divides the reported frequency by 1000;
            # the unit the API uses for `frequency` is not visible here - confirm
            'asr': int_or_none(piece_response.get('frequency'), scale=1000),
            'abr': int_or_none(piece_response.get('bitRate')),
            'url': self._extract_file_link(piece_response),
            'vcodec': 'none',
        } for piece_response in pieces]

    def _extract_story(self, story_response):
        """
        Convert a story API response into an info dict
        @returns: None if no story info could be extracted; a single info dict
                  carrying `formats` when there is exactly one audio piece;
                  otherwise a `multi_video` result with one entry per piece
        """
        info = self._extract_story_info(story_response)
        if not info:
            return None
        audio_pieces = self._extract_audio_pieces(
            self._get_prx_embed_response(story_response, 'audio'))
        if len(audio_pieces) == 1:
            return {
                'formats': audio_pieces,
                **info,
            }

        # Every piece becomes its own entry that inherits the story metadata
        entries = [{
            **info,
            'id': '%s_part%d' % (info['id'], part_number),
            'formats': [fmt],
        } for part_number, fmt in enumerate(audio_pieces, 1)]
        return {
            '_type': 'multi_video',
            'entries': entries,
            **info,
        }

    def _real_extract(self, url):
        story_id = self._match_id(url)
        response = self._call_api(story_id, f'stories/{story_id}')
        return self._extract_story(response)
324
325
# Extractor for PRX series: returns a playlist of the stories listed by the
# `series/<id>/stories` endpoint.
class PRXSeriesIE(PRXBaseIE):
    _VALID_URL = PRXBaseIE.PRX_BASE_URL_RE % r'series/(?P<id>\d+)'
    _TESTS = [
        {
            'url': 'https://beta.prx.org/series/36252',
            'info_dict': {
                'id': '36252',
                'title': 'Outside/In',
                'thumbnails': 'count:1',
                'description': 'md5:a6bedc5f810777bcb09ab30ff9059114',
                'timestamp': 1470684964,
                'modified_timestamp': 1582308830,
                'channel_id': '206',
                'channel_url': 'https://beta.prx.org/accounts/206',
                'channel': 'New Hampshire Public Radio',
                'series': 'Outside/In',
                'series_id': '36252',
            },
            'playlist_mincount': 39,
        }, {
            # Blank series
            'url': 'https://beta.prx.org/series/25038',
            'info_dict': {
                'id': '25038',
                'title': '25038',
                'timestamp': 1207612800,
                'modified_timestamp': 1207612800,
                'channel_id': '206',
                'channel_url': 'https://beta.prx.org/accounts/206',
                'channel': 'New Hampshire Public Radio',
                'series': '25038',
                'series_id': '25038',
            },
            'playlist_count': 0,
        },
    ]

    def _extract_series(self, series_response):
        """
        Convert a series API response into a playlist of its stories
        @returns: Playlist info dict whose entries are generated lazily, or
                  None if no series info could be extracted
        """
        info = self._extract_series_info(series_response)
        if not info:
            # `_extract_series_info` returns None for responses without an id;
            # bail out like `_extract_story` does instead of subscripting None
            return None
        series_id = info['id']
        return {
            '_type': 'playlist',
            'entries': self._entries(
                series_id, 'series/%s/stories' % series_id, self._story_playlist_entry),
            **info,
        }

    def _real_extract(self, url):
        series_id = self._match_id(url)
        response = self._call_api(series_id, f'series/{series_id}')
        return self._extract_series(response)
375
376
# Extractor for PRX accounts: returns a playlist containing the account's
# series followed by its stories.
class PRXAccountIE(PRXBaseIE):
    _VALID_URL = PRXBaseIE.PRX_BASE_URL_RE % r'accounts/(?P<id>\d+)'
    _TESTS = [{
        'url': 'https://beta.prx.org/accounts/206',
        'info_dict': {
            'id': '206',
            'title': 'New Hampshire Public Radio',
            'description': 'md5:277f2395301d0aca563c80c70a18ee0a',
            'channel_id': '206',
            'channel_url': 'https://beta.prx.org/accounts/206',
            'channel': 'New Hampshire Public Radio',
            'thumbnails': 'count:1',
        },
        'playlist_mincount': 380,
    }]

    def _extract_account(self, account_response):
        """
        Convert an account API response into a playlist of its series and stories
        @returns: Playlist info dict whose entries are generated lazily (all
                  series first, then all stories), or None if no account info
                  could be extracted
        """
        info = self._extract_account_info(account_response)
        if not info:
            # `_extract_account_info` returns None for responses without an id;
            # bail out like `_extract_story` does instead of subscripting None
            return None
        account_id = info['id']
        series = self._entries(
            account_id, f'accounts/{account_id}/series', self._series_playlist_entry)
        stories = self._entries(
            account_id, f'accounts/{account_id}/stories', self._story_playlist_entry)
        return {
            '_type': 'playlist',
            'entries': itertools.chain(series, stories),
            **info,
        }

    def _real_extract(self, url):
        account_id = self._match_id(url)
        response = self._call_api(account_id, f'accounts/{account_id}')
        return self._extract_account(response)
409
410
# Search extractor for PRX stories, registered under the `prxstories` search key.
class PRXStoriesSearchIE(PRXBaseIE, SearchInfoExtractor):
    _SEARCH_KEY = 'prxstories'
    IE_NAME = 'prxstories:search'
    IE_DESC = 'PRX Stories Search'

    def _search_results(self, query):
        """Yield the entries produced by paging through `stories/search` for `query`"""
        page_label = f'query {query}'
        search_params = {'q': query}
        matches = self._entries(
            page_label, 'stories/search', self._story_playlist_entry, query=search_params)
        for entry in matches:
            yield entry
419
420
# Search extractor for PRX series, registered under the `prxseries` search key.
class PRXSeriesSearchIE(PRXBaseIE, SearchInfoExtractor):
    _SEARCH_KEY = 'prxseries'
    IE_NAME = 'prxseries:search'
    IE_DESC = 'PRX Series Search'

    def _search_results(self, query):
        """Yield the entries produced by paging through `series/search` for `query`"""
        page_label = f'query {query}'
        search_params = {'q': query}
        matches = self._entries(
            page_label, 'series/search', self._series_playlist_entry, query=search_params)
        for entry in matches:
            yield entry