]>
jfr.im git - yt-dlp.git/blob - yt_dlp/extractor/prx.py
2 from __future__
import unicode_literals
5 from .common
import InfoExtractor
, SearchInfoExtractor
18 class PRXBaseIE(InfoExtractor
):
# URL template shared by the PRX extractors: the optional "beta." or
# "listen." subdomain followed by the prx.org host.  The trailing %s is
# filled in with a path pattern via the % operator.  The dot in the host
# is escaped so that it only matches a literal ".", not any character.
PRX_BASE_URL_RE = r'https?://(?:(?:beta|listen)\.)?prx\.org/%s'
def _call_api(self, item_id, path, query=None, fatal=True, note='Downloading CMS API JSON'):
    """Download a JSON document from the PRX CMS API.

    *path* is joined onto ``https://cms.prx.org/api/v1/`` with ``urljoin``;
    *item_id*, *query*, *fatal* and *note* are forwarded unchanged to
    ``self._download_json``, whose result is returned.
    """
    api_url = urljoin('https://cms.prx.org/api/v1/', path)
    return self._download_json(api_url, item_id, query=query, fatal=fatal, note=note)
def _get_prx_embed_response(response, section):
    """Look up the embedded ``prx:<section>`` entry of an API response.

    Returns whatever ``traverse_obj`` yields for the path
    ``('_embedded', 'prx:<section>')`` inside *response*.
    """
    embed_path = ('_embedded', f'prx:{section}')
    return traverse_obj(response, embed_path)
def _extract_file_link(response):
    """Return the enclosure link of *response*.

    The value at ``_links -> enclosure -> href`` is looked up with
    ``traverse_obj`` (restricted to ``str``) and passed through
    ``url_or_none`` before being returned.
    """
    enclosure_href = traverse_obj(
        response, ('_links', 'enclosure', 'href'), expected_type=str)
    return url_or_none(enclosure_href)
35 def _extract_image(cls
, image_response
):
36 if not isinstance(image_response
, dict):
39 'id': str_or_none(image_response
.get('id')),
40 'filesize': image_response
.get('size'),
41 'width': image_response
.get('width'),
42 'height': image_response
.get('height'),
43 'url': cls
._extract
_file
_link
(image_response
)
47 def _extract_base_info(cls
, response
):
48 if not isinstance(response
, dict):
50 item_id
= str_or_none(response
.get('id'))
53 thumbnail_dict
= cls
._extract
_image
(cls
._get
_prx
_embed
_response
(response
, 'image'))
55 clean_html(response
.get('description'))
56 or response
.get('shortDescription'))
59 'title': response
.get('title') or item_id
,
60 'thumbnails': [thumbnail_dict
] if thumbnail_dict
else None,
61 'description': description
,
62 'release_timestamp': unified_timestamp(response
.get('releasedAt')),
63 'timestamp': unified_timestamp(response
.get('createdAt')),
64 'modified_timestamp': unified_timestamp(response
.get('updatedAt')),
65 'duration': int_or_none(response
.get('duration')),
66 'tags': response
.get('tags'),
67 'episode_number': int_or_none(response
.get('episodeIdentifier')),
68 'season_number': int_or_none(response
.get('seasonIdentifier'))
72 def _extract_series_info(cls
, series_response
):
73 base_info
= cls
._extract
_base
_info
(series_response
)
76 account_info
= cls
._extract
_account
_info
(
77 cls
._get
_prx
_embed
_response
(series_response
, 'account')) or {}
80 'channel_id': account_info
.get('channel_id'),
81 'channel_url': account_info
.get('channel_url'),
82 'channel': account_info
.get('channel'),
83 'series': base_info
.get('title'),
84 'series_id': base_info
.get('id'),
88 def _extract_account_info(cls
, account_response
):
89 base_info
= cls
._extract
_base
_info
(account_response
)
92 name
= account_response
.get('name')
96 'channel_id': base_info
.get('id'),
97 'channel_url': 'https://beta.prx.org/accounts/%s' % base_info
.get('id'),
102 def _extract_story_info(cls
, story_response
):
103 base_info
= cls
._extract
_base
_info
(story_response
)
106 series
= cls
._extract
_series
_info
(
107 cls
._get
_prx
_embed
_response
(story_response
, 'series')) or {}
108 account
= cls
._extract
_account
_info
(
109 cls
._get
_prx
_embed
_response
(story_response
, 'account')) or {}
112 'series': series
.get('series'),
113 'series_id': series
.get('series_id'),
114 'channel_id': account
.get('channel_id'),
115 'channel_url': account
.get('channel_url'),
116 'channel': account
.get('channel')
119 def _entries(self
, item_id
, endpoint
, entry_func
, query
=None):
121 Extract entries from paginated list API
122 @param entry_func: Function to generate entry from response item
125 for page
in itertools
.count(1):
126 response
= self
._call
_api
(f
'{item_id}: page {page}', endpoint
, query
={
131 items
= self
._get
_prx
_embed
_response
(response
, 'items')
132 if not response
or not items
:
135 yield from filter(None, map(entry_func
, items
))
137 total
+= response
['count']
138 if total
>= response
['total']:
141 def _story_playlist_entry(self
, response
):
142 story
= self
._extract
_story
_info
(response
)
147 'url': 'https://beta.prx.org/stories/%s' % story
['id'],
148 'ie_key': PRXStoryIE
.ie_key()
152 def _series_playlist_entry(self
, response
):
153 series
= self
._extract
_series
_info
(response
)
158 'url': 'https://beta.prx.org/series/%s' % series
['id'],
159 'ie_key': PRXSeriesIE
.ie_key()
164 class PRXStoryIE(PRXBaseIE
):
165 _VALID_URL
= PRXBaseIE
.PRX_BASE_URL_RE
% r
'stories/(?P<id>\d+)'
169 # Story with season and episode details
170 'url': 'https://beta.prx.org/stories/399200',
173 'title': 'Fly Me To The Moon',
174 'description': 'md5:43230168390b95d3322048d8a56bf2bb',
175 'release_timestamp': 1640250000,
176 'timestamp': 1640208972,
177 'modified_timestamp': 1641318202,
182 'series': 'AirSpace',
183 'series_id': '38057',
184 'channel_id': '220986',
185 'channel_url': 'https://beta.prx.org/accounts/220986',
186 'channel': 'Air and Space Museum',
190 'id': '399200_part1',
191 'title': 'Fly Me To The Moon',
192 'description': 'md5:43230168390b95d3322048d8a56bf2bb',
193 'release_timestamp': 1640250000,
194 'timestamp': 1640208972,
195 'modified_timestamp': 1641318202,
200 'series': 'AirSpace',
201 'series_id': '38057',
202 'channel_id': '220986',
203 'channel_url': 'https://beta.prx.org/accounts/220986',
204 'channel': 'Air and Space Museum',
206 'upload_date': '20211222',
207 'episode': 'Episode 8',
208 'release_date': '20211223',
209 'season': 'Season 5',
210 'modified_date': '20220104'
214 'id': '399200_part2',
215 'title': 'Fly Me To The Moon',
216 'description': 'md5:43230168390b95d3322048d8a56bf2bb',
217 'release_timestamp': 1640250000,
218 'timestamp': 1640208972,
219 'modified_timestamp': 1641318202,
224 'series': 'AirSpace',
225 'series_id': '38057',
226 'channel_id': '220986',
227 'channel_url': 'https://beta.prx.org/accounts/220986',
228 'channel': 'Air and Space Museum',
230 'upload_date': '20211222',
231 'episode': 'Episode 8',
232 'release_date': '20211223',
233 'season': 'Season 5',
234 'modified_date': '20220104'
240 # Story with only split audio
241 'url': 'https://beta.prx.org/stories/326414',
244 'title': 'Massachusetts v EPA',
245 'description': 'md5:744fffba08f19f4deab69fa8d49d5816',
246 'timestamp': 1592509124,
247 'modified_timestamp': 1592510457,
250 'series': 'Outside/In',
251 'series_id': '36252',
253 'channel_url': 'https://beta.prx.org/accounts/206',
254 'channel': 'New Hampshire Public Radio',
258 # Story with single combined audio
259 'url': 'https://beta.prx.org/stories/400404',
262 'title': 'Cafe Chill (Episode 2022-01)',
263 'thumbnails': 'count:1',
264 'description': 'md5:9f1b5a3cbd64fb159d08c3baa31f1539',
265 'timestamp': 1641233952,
266 'modified_timestamp': 1641234248,
268 'series': 'Café Chill',
269 'series_id': '37762',
270 'channel_id': '5767',
271 'channel_url': 'https://beta.prx.org/accounts/5767',
272 'channel': 'C89.5 - KNHC Seattle',
275 'thumbnail': r
're:https?://cms\.prx\.org/pub/\w+/0/web/story_image/767965/medium/Aurora_Over_Trees\.jpg',
276 'upload_date': '20220103',
277 'modified_date': '20220103'
280 'url': 'https://listen.prx.org/stories/399200',
281 'only_matching': True
285 def _extract_audio_pieces(self
, audio_response
):
287 'format_id': str_or_none(piece_response
.get('id')),
288 'format_note': str_or_none(piece_response
.get('label')),
289 'filesize': int_or_none(piece_response
.get('size')),
290 'duration': int_or_none(piece_response
.get('duration')),
291 'ext': mimetype2ext(piece_response
.get('contentType')),
292 'asr': int_or_none(piece_response
.get('frequency'), scale
=1000),
293 'abr': int_or_none(piece_response
.get('bitRate')),
294 'url': self
._extract
_file
_link
(piece_response
),
296 } for piece_response
in sorted(
297 self
._get
_prx
_embed
_response
(audio_response
, 'items') or [],
298 key
=lambda p
: int_or_none(p
.get('position')))]
300 def _extract_story(self
, story_response
):
301 info
= self
._extract
_story
_info
(story_response
)
304 audio_pieces
= self
._extract
_audio
_pieces
(
305 self
._get
_prx
_embed
_response
(story_response
, 'audio'))
306 if len(audio_pieces
) == 1:
308 'formats': audio_pieces
,
314 'id': '%s_part%d' % (info
['id'], (idx
+ 1)),
316 } for idx
, fmt
in enumerate(audio_pieces
)]
318 '_type': 'multi_video',
def _real_extract(self, url):
    """Extract the story referenced by *url*.

    The story id is taken from *url* with ``self._match_id``, the
    ``stories/<id>`` API resource is fetched through ``self._call_api``
    and the response is handed to ``self._extract_story``.
    """
    matched_id = self._match_id(url)
    api_path = f'stories/{matched_id}'
    return self._extract_story(self._call_api(matched_id, api_path))
329 class PRXSeriesIE(PRXBaseIE
):
330 _VALID_URL
= PRXBaseIE
.PRX_BASE_URL_RE
% r
'series/(?P<id>\d+)'
333 'url': 'https://beta.prx.org/series/36252',
336 'title': 'Outside/In',
337 'thumbnails': 'count:1',
338 'description': 'md5:a6bedc5f810777bcb09ab30ff9059114',
339 'timestamp': 1470684964,
340 'modified_timestamp': 1582308830,
342 'channel_url': 'https://beta.prx.org/accounts/206',
343 'channel': 'New Hampshire Public Radio',
344 'series': 'Outside/In',
347 'playlist_mincount': 39
350 'url': 'https://beta.prx.org/series/25038',
354 'timestamp': 1207612800,
355 'modified_timestamp': 1207612800,
357 'channel_url': 'https://beta.prx.org/accounts/206',
358 'channel': 'New Hampshire Public Radio',
366 def _extract_series(self
, series_response
):
367 info
= self
._extract
_series
_info
(series_response
)
370 'entries': self
._entries
(info
['id'], 'series/%s/stories' % info
['id'], self
._story
_playlist
_entry
),
def _real_extract(self, url):
    """Extract the series referenced by *url*.

    The series id is taken from *url* with ``self._match_id``, the
    ``series/<id>`` API resource is fetched through ``self._call_api``
    and the response is handed to ``self._extract_series``.
    """
    matched_id = self._match_id(url)
    api_path = f'series/{matched_id}'
    return self._extract_series(self._call_api(matched_id, api_path))
380 class PRXAccountIE(PRXBaseIE
):
381 _VALID_URL
= PRXBaseIE
.PRX_BASE_URL_RE
% r
'accounts/(?P<id>\d+)'
383 'url': 'https://beta.prx.org/accounts/206',
386 'title': 'New Hampshire Public Radio',
387 'description': 'md5:277f2395301d0aca563c80c70a18ee0a',
389 'channel_url': 'https://beta.prx.org/accounts/206',
390 'channel': 'New Hampshire Public Radio',
391 'thumbnails': 'count:1'
393 'playlist_mincount': 380
396 def _extract_account(self
, account_response
):
397 info
= self
._extract
_account
_info
(account_response
)
398 series
= self
._entries
(
399 info
['id'], f
'accounts/{info["id"]}/series', self
._series
_playlist
_entry
)
400 stories
= self
._entries
(
401 info
['id'], f
'accounts/{info["id"]}/stories', self
._story
_playlist
_entry
)
404 'entries': itertools
.chain(series
, stories
),
def _real_extract(self, url):
    """Extract the account referenced by *url*.

    The account id is taken from *url* with ``self._match_id``, the
    ``accounts/<id>`` API resource is fetched through ``self._call_api``
    and the response is handed to ``self._extract_account``.
    """
    matched_id = self._match_id(url)
    api_path = f'accounts/{matched_id}'
    return self._extract_account(self._call_api(matched_id, api_path))
class PRXStoriesSearchIE(PRXBaseIE, SearchInfoExtractor):
    # Search extractor using the 'prxstories' search key; results come from
    # the 'stories/search' endpoint.
    IE_DESC = 'PRX Stories Search'
    IE_NAME = 'prxstories:search'
    _SEARCH_KEY = 'prxstories'

    def _search_results(self, query):
        """Yield the entries produced by ``self._entries`` for *query*.

        The ``stories/search`` endpoint is queried with ``{'q': query}``
        and each item is converted by ``self._story_playlist_entry``.
        """
        progress_label = f'query {query}'
        search_params = {'q': query}
        yield from self._entries(
            progress_label, 'stories/search', self._story_playlist_entry,
            query=search_params)
class PRXSeriesSearchIE(PRXBaseIE, SearchInfoExtractor):
    # Search extractor using the 'prxseries' search key; results come from
    # the 'series/search' endpoint.
    IE_DESC = 'PRX Series Search'
    IE_NAME = 'prxseries:search'
    _SEARCH_KEY = 'prxseries'

    def _search_results(self, query):
        """Yield the entries produced by ``self._entries`` for *query*.

        The ``series/search`` endpoint is queried with ``{'q': query}``
        and each item is converted by ``self._series_playlist_entry``.
        """
        progress_label = f'query {query}'
        search_params = {'q': query}
        yield from self._entries(
            progress_label, 'series/search', self._series_playlist_entry,
            query=search_params)