jfr.im git - yt-dlp.git/blob - yt_dlp/extractor/prx.py
[misc] Add `hatch`, `ruff`, `pre-commit` and improve dev docs (#7409)
[yt-dlp.git] / yt_dlp / extractor / prx.py
1 import itertools
2
3 from .common import InfoExtractor, SearchInfoExtractor
4 from ..utils import (
5 clean_html,
6 int_or_none,
7 mimetype2ext,
8 str_or_none,
9 traverse_obj,
10 unified_timestamp,
11 url_or_none,
12 urljoin,
13 )
14
15
class PRXBaseIE(InfoExtractor):
    """Common helpers shared by the PRX extractors.

    Wraps calls to the PRX CMS API (https://cms.prx.org/api/v1/) and turns
    its story, series and account responses into info dict fragments.
    """

    # "%s" is filled in by each subclass with its own path pattern.
    # The dot in "prx.org" is escaped so that only a literal dot matches.
    PRX_BASE_URL_RE = r'https?://(?:(?:beta|listen)\.)?prx\.org/%s'

    def _call_api(self, item_id, path, query=None, fatal=True, note='Downloading CMS API JSON'):
        """Download and return the JSON found at `path` below the CMS API root."""
        return self._download_json(
            urljoin('https://cms.prx.org/api/v1/', path), item_id, query=query, fatal=fatal, note=note)

    @staticmethod
    def _get_prx_embed_response(response, section):
        """Return the `prx:<section>` object embedded in `response`, if any."""
        return traverse_obj(response, ('_embedded', f'prx:{section}'))

    @staticmethod
    def _extract_file_link(response):
        """Return the enclosure link of `response` when it is a valid URL."""
        return url_or_none(traverse_obj(
            response, ('_links', 'enclosure', 'href'), expected_type=str))

    @classmethod
    def _extract_image(cls, image_response):
        """Build a thumbnail dict from an image response.

        Returns None when `image_response` is not a dict.
        """
        if not isinstance(image_response, dict):
            return
        return {
            'id': str_or_none(image_response.get('id')),
            # Coerced like the audio piece metadata so that non-numeric
            # values are dropped instead of being passed through as-is
            'filesize': int_or_none(image_response.get('size')),
            'width': int_or_none(image_response.get('width')),
            'height': int_or_none(image_response.get('height')),
            'url': cls._extract_file_link(image_response),
        }

    @classmethod
    def _extract_base_info(cls, response):
        """Extract the fields common to stories, series and accounts.

        Returns None when `response` is not a dict or carries no id.
        """
        if not isinstance(response, dict):
            return
        item_id = str_or_none(response.get('id'))
        if not item_id:
            return
        thumbnail_dict = cls._extract_image(cls._get_prx_embed_response(response, 'image'))
        description = (
            clean_html(response.get('description'))
            or response.get('shortDescription'))
        return {
            'id': item_id,
            # Fall back to the id when the item has no title
            'title': response.get('title') or item_id,
            'thumbnails': [thumbnail_dict] if thumbnail_dict else None,
            'description': description,
            'release_timestamp': unified_timestamp(response.get('releasedAt')),
            'timestamp': unified_timestamp(response.get('createdAt')),
            'modified_timestamp': unified_timestamp(response.get('updatedAt')),
            'duration': int_or_none(response.get('duration')),
            'tags': response.get('tags'),
            'episode_number': int_or_none(response.get('episodeIdentifier')),
            'season_number': int_or_none(response.get('seasonIdentifier')),
        }

    @classmethod
    def _extract_series_info(cls, series_response):
        """Extract series metadata, including the embedded account as channel.

        Returns None when no base info can be extracted.
        """
        base_info = cls._extract_base_info(series_response)
        if not base_info:
            return
        account_info = cls._extract_account_info(
            cls._get_prx_embed_response(series_response, 'account')) or {}
        return {
            **base_info,
            'channel_id': account_info.get('channel_id'),
            'channel_url': account_info.get('channel_url'),
            'channel': account_info.get('channel'),
            'series': base_info.get('title'),
            'series_id': base_info.get('id'),
        }

    @classmethod
    def _extract_account_info(cls, account_response):
        """Extract account metadata; the account doubles as the channel.

        Returns None when no base info can be extracted.
        """
        base_info = cls._extract_base_info(account_response)
        if not base_info:
            return
        name = account_response.get('name')
        return {
            **base_info,
            # Keep the base title (which falls back to the id) when the
            # account has no name, rather than overwriting it with None
            'title': name or base_info.get('title'),
            'channel_id': base_info.get('id'),
            'channel_url': 'https://beta.prx.org/accounts/%s' % base_info.get('id'),
            'channel': name,
        }

    @classmethod
    def _extract_story_info(cls, story_response):
        """Extract story metadata together with its embedded series and account.

        Returns None when no base info can be extracted.
        """
        base_info = cls._extract_base_info(story_response)
        if not base_info:
            return
        series = cls._extract_series_info(
            cls._get_prx_embed_response(story_response, 'series')) or {}
        account = cls._extract_account_info(
            cls._get_prx_embed_response(story_response, 'account')) or {}
        return {
            **base_info,
            'series': series.get('series'),
            'series_id': series.get('series_id'),
            'channel_id': account.get('channel_id'),
            'channel_url': account.get('channel_url'),
            'channel': account.get('channel'),
        }

    def _entries(self, item_id, endpoint, entry_func, query=None):
        """
        Extract entries from paginated list API
        @param item_id: Identifier used in the download notes
        @param endpoint: API path of the list to page through
        @param entry_func: Function to generate entry from response item
        @param query: Extra query parameters sent with every page request
        """
        page_size = 100
        total = 0
        for page in itertools.count(1):
            response = self._call_api(f'{item_id}: page {page}', endpoint, query={
                **(query or {}),
                'page': page,
                'per': page_size,
            })
            items = self._get_prx_embed_response(response, 'items')
            if not response or not items:
                break

            yield from filter(None, map(entry_func, items))

            # Pagination metadata may be missing or malformed; fall back to
            # what was actually received instead of failing with a KeyError
            page_count = int_or_none(response.get('count'))
            if page_count is None:
                page_count = len(items)
            total += page_count

            expected_total = int_or_none(response.get('total'))
            if expected_total is None:
                # Without a usable total, a short page is the only sign
                # that this was the last one
                if len(items) < page_size:
                    break
            elif total >= expected_total:
                break

    def _story_playlist_entry(self, response):
        """Turn a story list item into a url entry handled by PRXStoryIE."""
        story = self._extract_story_info(response)
        if not story:
            return
        story.update({
            '_type': 'url',
            'url': 'https://beta.prx.org/stories/%s' % story['id'],
            'ie_key': PRXStoryIE.ie_key(),
        })
        return story

    def _series_playlist_entry(self, response):
        """Turn a series list item into a url entry handled by PRXSeriesIE."""
        series = self._extract_series_info(response)
        if not series:
            return
        series.update({
            '_type': 'url',
            'url': 'https://beta.prx.org/series/%s' % series['id'],
            'ie_key': PRXSeriesIE.ie_key(),
        })
        return series
160
161
class PRXStoryIE(PRXBaseIE):
    """Extractor for single PRX stories (`/stories/<id>` URLs)."""

    _VALID_URL = PRXBaseIE.PRX_BASE_URL_RE % r'stories/(?P<id>\d+)'

    _TESTS = [
        {
            # Story with season and episode details
            'url': 'https://beta.prx.org/stories/399200',
            'info_dict': {
                'id': '399200',
                'title': 'Fly Me To The Moon',
                'description': 'md5:43230168390b95d3322048d8a56bf2bb',
                'release_timestamp': 1640250000,
                'timestamp': 1640208972,
                'modified_timestamp': 1641318202,
                'duration': 1004,
                'tags': 'count:7',
                'episode_number': 8,
                'season_number': 5,
                'series': 'AirSpace',
                'series_id': '38057',
                'channel_id': '220986',
                'channel_url': 'https://beta.prx.org/accounts/220986',
                'channel': 'Air and Space Museum',
            },
            'playlist': [{
                'info_dict': {
                    'id': '399200_part1',
                    'title': 'Fly Me To The Moon',
                    'description': 'md5:43230168390b95d3322048d8a56bf2bb',
                    'release_timestamp': 1640250000,
                    'timestamp': 1640208972,
                    'modified_timestamp': 1641318202,
                    'duration': 530,
                    'tags': 'count:7',
                    'episode_number': 8,
                    'season_number': 5,
                    'series': 'AirSpace',
                    'series_id': '38057',
                    'channel_id': '220986',
                    'channel_url': 'https://beta.prx.org/accounts/220986',
                    'channel': 'Air and Space Museum',
                    'ext': 'mp3',
                    'upload_date': '20211222',
                    'episode': 'Episode 8',
                    'release_date': '20211223',
                    'season': 'Season 5',
                    'modified_date': '20220104'
                }
            }, {
                'info_dict': {
                    'id': '399200_part2',
                    'title': 'Fly Me To The Moon',
                    'description': 'md5:43230168390b95d3322048d8a56bf2bb',
                    'release_timestamp': 1640250000,
                    'timestamp': 1640208972,
                    'modified_timestamp': 1641318202,
                    'duration': 474,
                    'tags': 'count:7',
                    'episode_number': 8,
                    'season_number': 5,
                    'series': 'AirSpace',
                    'series_id': '38057',
                    'channel_id': '220986',
                    'channel_url': 'https://beta.prx.org/accounts/220986',
                    'channel': 'Air and Space Museum',
                    'ext': 'mp3',
                    'upload_date': '20211222',
                    'episode': 'Episode 8',
                    'release_date': '20211223',
                    'season': 'Season 5',
                    'modified_date': '20220104'
                }
            }
            ]
        }, {
            # Story with only split audio
            'url': 'https://beta.prx.org/stories/326414',
            'info_dict': {
                'id': '326414',
                'title': 'Massachusetts v EPA',
                'description': 'md5:744fffba08f19f4deab69fa8d49d5816',
                'timestamp': 1592509124,
                'modified_timestamp': 1592510457,
                'duration': 3088,
                'tags': 'count:0',
                'series': 'Outside/In',
                'series_id': '36252',
                'channel_id': '206',
                'channel_url': 'https://beta.prx.org/accounts/206',
                'channel': 'New Hampshire Public Radio',
            },
            'playlist_count': 4
        }, {
            # Story with single combined audio
            'url': 'https://beta.prx.org/stories/400404',
            'info_dict': {
                'id': '400404',
                'title': 'Cafe Chill (Episode 2022-01)',
                'thumbnails': 'count:1',
                'description': 'md5:9f1b5a3cbd64fb159d08c3baa31f1539',
                'timestamp': 1641233952,
                'modified_timestamp': 1641234248,
                'duration': 3540,
                'series': 'Café Chill',
                'series_id': '37762',
                'channel_id': '5767',
                'channel_url': 'https://beta.prx.org/accounts/5767',
                'channel': 'C89.5 - KNHC Seattle',
                'ext': 'mp3',
                'tags': 'count:0',
                'thumbnail': r're:https?://cms\.prx\.org/pub/\w+/0/web/story_image/767965/medium/Aurora_Over_Trees\.jpg',
                'upload_date': '20220103',
                'modified_date': '20220103'
            }
        }, {
            'url': 'https://listen.prx.org/stories/399200',
            'only_matching': True
        }
    ]

    def _extract_audio_pieces(self, audio_response):
        """Return one format dict per audio piece, ordered by piece position.

        Pieces without a usable `position` are placed after the positioned
        ones and keep their relative order from the response.
        """
        def piece_order(piece_response):
            position = int_or_none(piece_response.get('position'))
            # None cannot be compared with ints (or with another None), so
            # sort on a (missing, position) pair instead of the raw value
            return (position is None, position or 0)

        return [{
            'format_id': str_or_none(piece_response.get('id')),
            'format_note': str_or_none(piece_response.get('label')),
            'filesize': int_or_none(piece_response.get('size')),
            'duration': int_or_none(piece_response.get('duration')),
            'ext': mimetype2ext(piece_response.get('contentType')),
            # NOTE(review): confirm the unit of 'frequency' and that
            # scale=1000 yields the expected value for 'asr'
            'asr': int_or_none(piece_response.get('frequency'), scale=1000),
            'abr': int_or_none(piece_response.get('bitRate')),
            'url': self._extract_file_link(piece_response),
            'vcodec': 'none',
        } for piece_response in sorted(
            self._get_prx_embed_response(audio_response, 'items') or [],
            key=piece_order)]

    def _extract_story(self, story_response):
        """Build the info dict for a story.

        A story with exactly one audio piece becomes a single result with
        that piece as its format; otherwise a `multi_video` result is
        returned with one entry per piece. Returns None when no story info
        can be extracted.
        """
        info = self._extract_story_info(story_response)
        if not info:
            return
        audio_pieces = self._extract_audio_pieces(
            self._get_prx_embed_response(story_response, 'audio'))
        if len(audio_pieces) == 1:
            return {
                'formats': audio_pieces,
                **info,
            }

        # Each piece gets its own entry, numbered from 1 in playback order
        entries = [{
            **info,
            'id': '%s_part%d' % (info['id'], (idx + 1)),
            'formats': [fmt],
        } for idx, fmt in enumerate(audio_pieces)]
        return {
            '_type': 'multi_video',
            'entries': entries,
            **info,
        }

    def _real_extract(self, url):
        story_id = self._match_id(url)
        response = self._call_api(story_id, f'stories/{story_id}')
        return self._extract_story(response)
325
326
class PRXSeriesIE(PRXBaseIE):
    """Extractor for PRX series pages; yields the stories of the series as a playlist."""

    _VALID_URL = PRXBaseIE.PRX_BASE_URL_RE % r'series/(?P<id>\d+)'
    _TESTS = [{
        'url': 'https://beta.prx.org/series/36252',
        'info_dict': {
            'id': '36252',
            'title': 'Outside/In',
            'thumbnails': 'count:1',
            'description': 'md5:a6bedc5f810777bcb09ab30ff9059114',
            'timestamp': 1470684964,
            'modified_timestamp': 1582308830,
            'channel_id': '206',
            'channel_url': 'https://beta.prx.org/accounts/206',
            'channel': 'New Hampshire Public Radio',
            'series': 'Outside/In',
            'series_id': '36252',
        },
        'playlist_mincount': 39,
    }, {
        # Blank series
        'url': 'https://beta.prx.org/series/25038',
        'info_dict': {
            'id': '25038',
            'title': '25038',
            'timestamp': 1207612800,
            'modified_timestamp': 1207612800,
            'channel_id': '206',
            'channel_url': 'https://beta.prx.org/accounts/206',
            'channel': 'New Hampshire Public Radio',
            'series': '25038',
            'series_id': '25038',
        },
        'playlist_count': 0,
    }]

    def _extract_series(self, series_response):
        """Return a playlist result whose entries are the stories of the series."""
        series_info = self._extract_series_info(series_response)
        series_id = series_info['id']
        # Entries are produced lazily, one API page at a time
        story_entries = self._entries(
            series_id, f'series/{series_id}/stories', self._story_playlist_entry)
        return {
            '_type': 'playlist',
            'entries': story_entries,
            **series_info,
        }

    def _real_extract(self, url):
        series_id = self._match_id(url)
        series_response = self._call_api(series_id, f'series/{series_id}')
        return self._extract_series(series_response)
376
377
class PRXAccountIE(PRXBaseIE):
    """Extractor for PRX account pages; yields the account's series, then its stories."""

    _VALID_URL = PRXBaseIE.PRX_BASE_URL_RE % r'accounts/(?P<id>\d+)'
    _TESTS = [{
        'url': 'https://beta.prx.org/accounts/206',
        'info_dict': {
            'id': '206',
            'title': 'New Hampshire Public Radio',
            'description': 'md5:277f2395301d0aca563c80c70a18ee0a',
            'channel_id': '206',
            'channel_url': 'https://beta.prx.org/accounts/206',
            'channel': 'New Hampshire Public Radio',
            'thumbnails': 'count:1',
        },
        'playlist_mincount': 380,
    }]

    def _extract_account(self, account_response):
        """Return a playlist result chaining the account's series and story entries."""
        account_info = self._extract_account_info(account_response)
        account_id = account_info['id']
        series_entries = self._entries(
            account_id, f'accounts/{account_id}/series', self._series_playlist_entry)
        story_entries = self._entries(
            account_id, f'accounts/{account_id}/stories', self._story_playlist_entry)
        return {
            '_type': 'playlist',
            # Series come first, followed by the stories
            'entries': itertools.chain(series_entries, story_entries),
            **account_info,
        }

    def _real_extract(self, url):
        account_id = self._match_id(url)
        account_response = self._call_api(account_id, f'accounts/{account_id}')
        return self._extract_account(account_response)
410
411
class PRXStoriesSearchIE(PRXBaseIE, SearchInfoExtractor):
    """Search extractor yielding PRX stories that match a query."""

    IE_DESC = 'PRX Stories Search'
    IE_NAME = 'prxstories:search'
    _SEARCH_KEY = 'prxstories'

    def _search_results(self, query):
        """Yield story entries from the paginated `stories/search` endpoint."""
        search_params = {'q': query}
        matching_stories = self._entries(
            f'query {query}', 'stories/search', self._story_playlist_entry, query=search_params)
        yield from matching_stories
420
421
class PRXSeriesSearchIE(PRXBaseIE, SearchInfoExtractor):
    """Search extractor yielding PRX series that match a query."""

    IE_DESC = 'PRX Series Search'
    IE_NAME = 'prxseries:search'
    _SEARCH_KEY = 'prxseries'

    def _search_results(self, query):
        """Yield series entries from the paginated `series/search` endpoint."""
        search_params = {'q': query}
        matching_series = self._entries(
            f'query {query}', 'series/search', self._series_playlist_entry, query=search_params)
        yield from matching_series