]>
jfr.im git - yt-dlp.git/blob - yt_dlp/extractor/prx.py
2 from .common
import InfoExtractor
, SearchInfoExtractor
15 class PRXBaseIE(InfoExtractor
):
# Template shared by the PRX extractors: scheme, an optional "beta." or
# "listen." subdomain, then the literal host "prx.org". The trailing %s is
# filled in with the page-specific path pattern when a subclass builds its
# _VALID_URL. The dot in "prx.org" is escaped so that it only matches a real
# dot rather than any character.
PRX_BASE_URL_RE = r'https?://(?:(?:beta|listen)\.)?prx\.org/%s'
18 def _call_api(self
, item_id
, path
, query
=None, fatal
=True, note
='Downloading CMS API JSON'):
19 return self
._download
_json
(
20 urljoin('https://cms.prx.org/api/v1/', path
), item_id
, query
=query
, fatal
=fatal
, note
=note
)
@staticmethod
def _get_prx_embed_response(response, section):
    """Look up the embedded ``prx:<section>`` object of an API response.

    The signature takes neither ``self`` nor ``cls`` while every caller
    invokes it as ``self._get_prx_embed_response(...)`` or
    ``cls._get_prx_embed_response(...)`` with two arguments, so it has to be
    a static method for ``response`` to receive the right value.

    @param response: API response object to search
    @param section: name appended to ``prx:`` to form the embedded key
    @returns: whatever ``traverse_obj`` finds under
              ``('_embedded', 'prx:<section>')``
              (presumably None when the path is absent - TODO confirm)
    """
    return traverse_obj(response, ('_embedded', f'prx:{section}'))
@staticmethod
def _extract_file_link(response):
    """Return the enclosure link of an API response object.

    Callers use ``cls._extract_file_link(x)`` / ``self._extract_file_link(x)``
    with a single argument and the signature has no ``self``/``cls``, so the
    function must be a static method.

    @param response: API response object holding a ``_links`` section
    @returns: the value at ``_links -> enclosure -> href`` (requested as a
              ``str`` from ``traverse_obj``) after passing through
              ``url_or_none``
    """
    href = traverse_obj(
        response, ('_links', 'enclosure', 'href'), expected_type=str)
    return url_or_none(href)
32 def _extract_image(cls
, image_response
):
33 if not isinstance(image_response
, dict):
36 'id': str_or_none(image_response
.get('id')),
37 'filesize': image_response
.get('size'),
38 'width': image_response
.get('width'),
39 'height': image_response
.get('height'),
40 'url': cls
._extract
_file
_link
(image_response
)
44 def _extract_base_info(cls
, response
):
45 if not isinstance(response
, dict):
47 item_id
= str_or_none(response
.get('id'))
50 thumbnail_dict
= cls
._extract
_image
(cls
._get
_prx
_embed
_response
(response
, 'image'))
52 clean_html(response
.get('description'))
53 or response
.get('shortDescription'))
56 'title': response
.get('title') or item_id
,
57 'thumbnails': [thumbnail_dict
] if thumbnail_dict
else None,
58 'description': description
,
59 'release_timestamp': unified_timestamp(response
.get('releasedAt')),
60 'timestamp': unified_timestamp(response
.get('createdAt')),
61 'modified_timestamp': unified_timestamp(response
.get('updatedAt')),
62 'duration': int_or_none(response
.get('duration')),
63 'tags': response
.get('tags'),
64 'episode_number': int_or_none(response
.get('episodeIdentifier')),
65 'season_number': int_or_none(response
.get('seasonIdentifier'))
69 def _extract_series_info(cls
, series_response
):
70 base_info
= cls
._extract
_base
_info
(series_response
)
73 account_info
= cls
._extract
_account
_info
(
74 cls
._get
_prx
_embed
_response
(series_response
, 'account')) or {}
77 'channel_id': account_info
.get('channel_id'),
78 'channel_url': account_info
.get('channel_url'),
79 'channel': account_info
.get('channel'),
80 'series': base_info
.get('title'),
81 'series_id': base_info
.get('id'),
85 def _extract_account_info(cls
, account_response
):
86 base_info
= cls
._extract
_base
_info
(account_response
)
89 name
= account_response
.get('name')
93 'channel_id': base_info
.get('id'),
94 'channel_url': 'https://beta.prx.org/accounts/%s' % base_info
.get('id'),
99 def _extract_story_info(cls
, story_response
):
100 base_info
= cls
._extract
_base
_info
(story_response
)
103 series
= cls
._extract
_series
_info
(
104 cls
._get
_prx
_embed
_response
(story_response
, 'series')) or {}
105 account
= cls
._extract
_account
_info
(
106 cls
._get
_prx
_embed
_response
(story_response
, 'account')) or {}
109 'series': series
.get('series'),
110 'series_id': series
.get('series_id'),
111 'channel_id': account
.get('channel_id'),
112 'channel_url': account
.get('channel_url'),
113 'channel': account
.get('channel')
116 def _entries(self
, item_id
, endpoint
, entry_func
, query
=None):
118 Extract entries from paginated list API
119 @param entry_func: Function to generate entry from response item
122 for page
in itertools
.count(1):
123 response
= self
._call
_api
(f
'{item_id}: page {page}', endpoint
, query
={
128 items
= self
._get
_prx
_embed
_response
(response
, 'items')
129 if not response
or not items
:
132 yield from filter(None, map(entry_func
, items
))
134 total
+= response
['count']
135 if total
>= response
['total']:
138 def _story_playlist_entry(self
, response
):
139 story
= self
._extract
_story
_info
(response
)
144 'url': 'https://beta.prx.org/stories/%s' % story
['id'],
145 'ie_key': PRXStoryIE
.ie_key()
149 def _series_playlist_entry(self
, response
):
150 series
= self
._extract
_series
_info
(response
)
155 'url': 'https://beta.prx.org/series/%s' % series
['id'],
156 'ie_key': PRXSeriesIE
.ie_key()
161 class PRXStoryIE(PRXBaseIE
):
162 _VALID_URL
= PRXBaseIE
.PRX_BASE_URL_RE
% r
'stories/(?P<id>\d+)'
166 # Story with season and episode details
167 'url': 'https://beta.prx.org/stories/399200',
170 'title': 'Fly Me To The Moon',
171 'description': 'md5:43230168390b95d3322048d8a56bf2bb',
172 'release_timestamp': 1640250000,
173 'timestamp': 1640208972,
174 'modified_timestamp': 1641318202,
179 'series': 'AirSpace',
180 'series_id': '38057',
181 'channel_id': '220986',
182 'channel_url': 'https://beta.prx.org/accounts/220986',
183 'channel': 'Air and Space Museum',
187 'id': '399200_part1',
188 'title': 'Fly Me To The Moon',
189 'description': 'md5:43230168390b95d3322048d8a56bf2bb',
190 'release_timestamp': 1640250000,
191 'timestamp': 1640208972,
192 'modified_timestamp': 1641318202,
197 'series': 'AirSpace',
198 'series_id': '38057',
199 'channel_id': '220986',
200 'channel_url': 'https://beta.prx.org/accounts/220986',
201 'channel': 'Air and Space Museum',
203 'upload_date': '20211222',
204 'episode': 'Episode 8',
205 'release_date': '20211223',
206 'season': 'Season 5',
207 'modified_date': '20220104'
211 'id': '399200_part2',
212 'title': 'Fly Me To The Moon',
213 'description': 'md5:43230168390b95d3322048d8a56bf2bb',
214 'release_timestamp': 1640250000,
215 'timestamp': 1640208972,
216 'modified_timestamp': 1641318202,
221 'series': 'AirSpace',
222 'series_id': '38057',
223 'channel_id': '220986',
224 'channel_url': 'https://beta.prx.org/accounts/220986',
225 'channel': 'Air and Space Museum',
227 'upload_date': '20211222',
228 'episode': 'Episode 8',
229 'release_date': '20211223',
230 'season': 'Season 5',
231 'modified_date': '20220104'
237 # Story with only split audio
238 'url': 'https://beta.prx.org/stories/326414',
241 'title': 'Massachusetts v EPA',
242 'description': 'md5:744fffba08f19f4deab69fa8d49d5816',
243 'timestamp': 1592509124,
244 'modified_timestamp': 1592510457,
247 'series': 'Outside/In',
248 'series_id': '36252',
250 'channel_url': 'https://beta.prx.org/accounts/206',
251 'channel': 'New Hampshire Public Radio',
255 # Story with single combined audio
256 'url': 'https://beta.prx.org/stories/400404',
259 'title': 'Cafe Chill (Episode 2022-01)',
260 'thumbnails': 'count:1',
261 'description': 'md5:9f1b5a3cbd64fb159d08c3baa31f1539',
262 'timestamp': 1641233952,
263 'modified_timestamp': 1641234248,
265 'series': 'Café Chill',
266 'series_id': '37762',
267 'channel_id': '5767',
268 'channel_url': 'https://beta.prx.org/accounts/5767',
269 'channel': 'C89.5 - KNHC Seattle',
272 'thumbnail': r
're:https?://cms\.prx\.org/pub/\w+/0/web/story_image/767965/medium/Aurora_Over_Trees\.jpg',
273 'upload_date': '20220103',
274 'modified_date': '20220103'
277 'url': 'https://listen.prx.org/stories/399200',
278 'only_matching': True
282 def _extract_audio_pieces(self
, audio_response
):
284 'format_id': str_or_none(piece_response
.get('id')),
285 'format_note': str_or_none(piece_response
.get('label')),
286 'filesize': int_or_none(piece_response
.get('size')),
287 'duration': int_or_none(piece_response
.get('duration')),
288 'ext': mimetype2ext(piece_response
.get('contentType')),
289 'asr': int_or_none(piece_response
.get('frequency'), scale
=1000),
290 'abr': int_or_none(piece_response
.get('bitRate')),
291 'url': self
._extract
_file
_link
(piece_response
),
293 } for piece_response
in sorted(
294 self
._get
_prx
_embed
_response
(audio_response
, 'items') or [],
295 key
=lambda p
: int_or_none(p
.get('position')))]
297 def _extract_story(self
, story_response
):
298 info
= self
._extract
_story
_info
(story_response
)
301 audio_pieces
= self
._extract
_audio
_pieces
(
302 self
._get
_prx
_embed
_response
(story_response
, 'audio'))
303 if len(audio_pieces
) == 1:
305 'formats': audio_pieces
,
311 'id': '%s_part%d' % (info
['id'], (idx
+ 1)),
313 } for idx
, fmt
in enumerate(audio_pieces
)]
315 '_type': 'multi_video',
320 def _real_extract(self
, url
):
321 story_id
= self
._match
_id
(url
)
322 response
= self
._call
_api
(story_id
, f
'stories/{story_id}')
323 return self
._extract
_story
(response
)
326 class PRXSeriesIE(PRXBaseIE
):
327 _VALID_URL
= PRXBaseIE
.PRX_BASE_URL_RE
% r
'series/(?P<id>\d+)'
330 'url': 'https://beta.prx.org/series/36252',
333 'title': 'Outside/In',
334 'thumbnails': 'count:1',
335 'description': 'md5:a6bedc5f810777bcb09ab30ff9059114',
336 'timestamp': 1470684964,
337 'modified_timestamp': 1582308830,
339 'channel_url': 'https://beta.prx.org/accounts/206',
340 'channel': 'New Hampshire Public Radio',
341 'series': 'Outside/In',
344 'playlist_mincount': 39
347 'url': 'https://beta.prx.org/series/25038',
351 'timestamp': 1207612800,
352 'modified_timestamp': 1207612800,
354 'channel_url': 'https://beta.prx.org/accounts/206',
355 'channel': 'New Hampshire Public Radio',
363 def _extract_series(self
, series_response
):
364 info
= self
._extract
_series
_info
(series_response
)
367 'entries': self
._entries
(info
['id'], 'series/%s/stories' % info
['id'], self
._story
_playlist
_entry
),
371 def _real_extract(self
, url
):
372 series_id
= self
._match
_id
(url
)
373 response
= self
._call
_api
(series_id
, f
'series/{series_id}')
374 return self
._extract
_series
(response
)
377 class PRXAccountIE(PRXBaseIE
):
378 _VALID_URL
= PRXBaseIE
.PRX_BASE_URL_RE
% r
'accounts/(?P<id>\d+)'
380 'url': 'https://beta.prx.org/accounts/206',
383 'title': 'New Hampshire Public Radio',
384 'description': 'md5:277f2395301d0aca563c80c70a18ee0a',
386 'channel_url': 'https://beta.prx.org/accounts/206',
387 'channel': 'New Hampshire Public Radio',
388 'thumbnails': 'count:1'
390 'playlist_mincount': 380
393 def _extract_account(self
, account_response
):
394 info
= self
._extract
_account
_info
(account_response
)
395 series
= self
._entries
(
396 info
['id'], f
'accounts/{info["id"]}/series', self
._series
_playlist
_entry
)
397 stories
= self
._entries
(
398 info
['id'], f
'accounts/{info["id"]}/stories', self
._story
_playlist
_entry
)
401 'entries': itertools
.chain(series
, stories
),
405 def _real_extract(self
, url
):
406 account_id
= self
._match
_id
(url
)
407 response
= self
._call
_api
(account_id
, f
'accounts/{account_id}')
408 return self
._extract
_account
(response
)
class PRXStoriesSearchIE(PRXBaseIE, SearchInfoExtractor):
    """Search extractor registered under the ``prxstories`` search key."""

    IE_NAME = 'prxstories:search'
    IE_DESC = 'PRX Stories Search'
    _SEARCH_KEY = 'prxstories'

    def _search_results(self, query):
        """Yield the entries of the ``stories/search`` listing for *query*.

        The listing is walked by ``self._entries``: the first argument is
        the label it receives as ``item_id``, each listed item is converted
        by ``self._story_playlist_entry``, and ``{'q': query}`` is passed as
        its ``query`` keyword.
        """
        search_params = {'q': query}
        label = f'query {query}'
        found = self._entries(
            label, 'stories/search', self._story_playlist_entry, query=search_params)
        yield from found
class PRXSeriesSearchIE(PRXBaseIE, SearchInfoExtractor):
    """Search extractor registered under the ``prxseries`` search key."""

    IE_NAME = 'prxseries:search'
    IE_DESC = 'PRX Series Search'
    _SEARCH_KEY = 'prxseries'

    def _search_results(self, query):
        """Yield the entries of the ``series/search`` listing for *query*.

        The listing is walked by ``self._entries``: the first argument is
        the label it receives as ``item_id``, each listed item is converted
        by ``self._series_playlist_entry``, and ``{'q': query}`` is passed
        as its ``query`` keyword.
        """
        search_params = {'q': query}
        label = f'query {query}'
        found = self._entries(
            label, 'series/search', self._series_playlist_entry, query=search_params)
        yield from found