]>
Commit | Line | Data |
---|---|---|
85fee221 | 1 | import itertools |
e897bd82 | 2 | |
85fee221 | 3 | from .common import InfoExtractor, SearchInfoExtractor |
4 | from ..utils import ( | |
e897bd82 | 5 | clean_html, |
85fee221 | 6 | int_or_none, |
7 | mimetype2ext, | |
85fee221 | 8 | str_or_none, |
e897bd82 SS |
9 | traverse_obj, |
10 | unified_timestamp, | |
11 | url_or_none, | |
12 | urljoin, | |
85fee221 | 13 | ) |
14 | ||
15 | ||
class PRXBaseIE(InfoExtractor):
    """Shared helpers for the PRX extractors: CMS API access, response parsing and pagination."""

    # URL template for the public site; subclasses substitute their path pattern via `%`.
    # The dot in the host name is escaped so it only matches a literal '.'.
    PRX_BASE_URL_RE = r'https?://(?:(?:beta|listen)\.)?prx\.org/%s'

    def _call_api(self, item_id, path, query=None, fatal=True, note='Downloading CMS API JSON'):
        """Download JSON from `path`, resolved against the CMS API root URL."""
        return self._download_json(
            urljoin('https://cms.prx.org/api/v1/', path), item_id, query=query, fatal=fatal, note=note)

    @staticmethod
    def _get_prx_embed_response(response, section):
        """Return the `prx:<section>` object nested under the response's `_embedded` key."""
        return traverse_obj(response, ('_embedded', f'prx:{section}'))

    @staticmethod
    def _extract_file_link(response):
        """Return the `_links.enclosure.href` value of `response` if it is a usable URL."""
        return url_or_none(traverse_obj(
            response, ('_links', 'enclosure', 'href'), expected_type=str))

    @classmethod
    def _extract_image(cls, image_response):
        """Build a thumbnail dict from an image object; returns None for non-dict input."""
        if not isinstance(image_response, dict):
            return
        return {
            'id': str_or_none(image_response.get('id')),
            # Coerce numeric fields the same way the other extracted numbers are coerced
            'filesize': int_or_none(image_response.get('size')),
            'width': int_or_none(image_response.get('width')),
            'height': int_or_none(image_response.get('height')),
            'url': cls._extract_file_link(image_response),
        }

    @classmethod
    def _extract_base_info(cls, response):
        """
        Extract the metadata fields common to stories, series and accounts.

        Returns None when `response` is not a dict or carries no id.
        """
        if not isinstance(response, dict):
            return
        item_id = str_or_none(response.get('id'))
        if not item_id:
            return
        thumbnail_dict = cls._extract_image(cls._get_prx_embed_response(response, 'image'))
        # Prefer the (HTML-cleaned) full description, fall back to the short one
        description = (
            clean_html(response.get('description'))
            or response.get('shortDescription'))
        return {
            'id': item_id,
            # The id doubles as the title when none is given
            'title': response.get('title') or item_id,
            'thumbnails': [thumbnail_dict] if thumbnail_dict else None,
            'description': description,
            'release_timestamp': unified_timestamp(response.get('releasedAt')),
            'timestamp': unified_timestamp(response.get('createdAt')),
            'modified_timestamp': unified_timestamp(response.get('updatedAt')),
            'duration': int_or_none(response.get('duration')),
            'tags': response.get('tags'),
            'episode_number': int_or_none(response.get('episodeIdentifier')),
            'season_number': int_or_none(response.get('seasonIdentifier')),
        }

    @classmethod
    def _extract_series_info(cls, series_response):
        """Extract series metadata plus channel fields from its embedded account; None if no base info."""
        base_info = cls._extract_base_info(series_response)
        if not base_info:
            return
        account_info = cls._extract_account_info(
            cls._get_prx_embed_response(series_response, 'account')) or {}
        return {
            **base_info,
            'channel_id': account_info.get('channel_id'),
            'channel_url': account_info.get('channel_url'),
            'channel': account_info.get('channel'),
            'series': base_info.get('title'),
            'series_id': base_info.get('id'),
        }

    @classmethod
    def _extract_account_info(cls, account_response):
        """Extract account metadata, exposing the account as the channel; None if no base info."""
        base_info = cls._extract_base_info(account_response)
        if not base_info:
            return
        name = account_response.get('name')
        return {
            **base_info,
            # Accounts use their `name` as title, overriding the base title
            'title': name,
            'channel_id': base_info.get('id'),
            'channel_url': 'https://beta.prx.org/accounts/{}'.format(base_info.get('id')),
            'channel': name,
        }

    @classmethod
    def _extract_story_info(cls, story_response):
        """Extract story metadata plus series/channel fields from its embedded objects; None if no base info."""
        base_info = cls._extract_base_info(story_response)
        if not base_info:
            return
        series = cls._extract_series_info(
            cls._get_prx_embed_response(story_response, 'series')) or {}
        account = cls._extract_account_info(
            cls._get_prx_embed_response(story_response, 'account')) or {}
        return {
            **base_info,
            'series': series.get('series'),
            'series_id': series.get('series_id'),
            'channel_id': account.get('channel_id'),
            'channel_url': account.get('channel_url'),
            'channel': account.get('channel'),
        }

    def _entries(self, item_id, endpoint, entry_func, query=None):
        """
        Extract entries from paginated list API
        @param item_id: Identifier used in the download progress note
        @param endpoint: API path of the list, relative to the CMS API root
        @param entry_func: Function to generate entry from response item
        @param query: Extra query parameters sent with every page request
        """
        fetched = 0
        for page in itertools.count(1):
            response = self._call_api(f'{item_id}: page {page}', endpoint, query={
                **(query or {}),
                'page': page,
                'per': 100,
            })
            items = self._get_prx_embed_response(response, 'items')
            if not response or not items:
                break

            # entry_func returns None for items it cannot handle; drop those
            yield from filter(None, map(entry_func, items))

            # Tolerate responses without the counters instead of raising KeyError:
            # a missing 'count' falls back to the number of items on this page, and
            # a missing 'total' means paging continues until an empty page arrives.
            count = int_or_none(response.get('count'))
            fetched += len(items) if count is None else count
            total = int_or_none(response.get('total'))
            if total is not None and fetched >= total:
                break

    def _story_playlist_entry(self, response):
        """Turn a story list item into a `url` entry handled by PRXStoryIE; None if unusable."""
        story = self._extract_story_info(response)
        if not story:
            return
        story.update({
            '_type': 'url',
            'url': 'https://beta.prx.org/stories/{}'.format(story['id']),
            'ie_key': PRXStoryIE.ie_key(),
        })
        return story

    def _series_playlist_entry(self, response):
        """Turn a series list item into a `url` entry handled by PRXSeriesIE; None if unusable."""
        series = self._extract_series_info(response)
        if not series:
            return
        series.update({
            '_type': 'url',
            'url': 'https://beta.prx.org/series/{}'.format(series['id']),
            'ie_key': PRXSeriesIE.ie_key(),
        })
        return series
160 | ||
161 | ||
class PRXStoryIE(PRXBaseIE):
    """Extractor for single PRX story pages (`stories/<id>`)."""

    _VALID_URL = PRXBaseIE.PRX_BASE_URL_RE % r'stories/(?P<id>\d+)'

    _TESTS = [
        {
            # Story with season and episode details
            'url': 'https://beta.prx.org/stories/399200',
            'info_dict': {
                'id': '399200',
                'title': 'Fly Me To The Moon',
                'description': 'md5:43230168390b95d3322048d8a56bf2bb',
                'release_timestamp': 1640250000,
                'timestamp': 1640208972,
                'modified_timestamp': 1641318202,
                'duration': 1004,
                'tags': 'count:7',
                'episode_number': 8,
                'season_number': 5,
                'series': 'AirSpace',
                'series_id': '38057',
                'channel_id': '220986',
                'channel_url': 'https://beta.prx.org/accounts/220986',
                'channel': 'Air and Space Museum',
            },
            'playlist': [{
                'info_dict': {
                    'id': '399200_part1',
                    'title': 'Fly Me To The Moon',
                    'description': 'md5:43230168390b95d3322048d8a56bf2bb',
                    'release_timestamp': 1640250000,
                    'timestamp': 1640208972,
                    'modified_timestamp': 1641318202,
                    'duration': 530,
                    'tags': 'count:7',
                    'episode_number': 8,
                    'season_number': 5,
                    'series': 'AirSpace',
                    'series_id': '38057',
                    'channel_id': '220986',
                    'channel_url': 'https://beta.prx.org/accounts/220986',
                    'channel': 'Air and Space Museum',
                    'ext': 'mp3',
                    'upload_date': '20211222',
                    'episode': 'Episode 8',
                    'release_date': '20211223',
                    'season': 'Season 5',
                    'modified_date': '20220104',
                },
            }, {
                'info_dict': {
                    'id': '399200_part2',
                    'title': 'Fly Me To The Moon',
                    'description': 'md5:43230168390b95d3322048d8a56bf2bb',
                    'release_timestamp': 1640250000,
                    'timestamp': 1640208972,
                    'modified_timestamp': 1641318202,
                    'duration': 474,
                    'tags': 'count:7',
                    'episode_number': 8,
                    'season_number': 5,
                    'series': 'AirSpace',
                    'series_id': '38057',
                    'channel_id': '220986',
                    'channel_url': 'https://beta.prx.org/accounts/220986',
                    'channel': 'Air and Space Museum',
                    'ext': 'mp3',
                    'upload_date': '20211222',
                    'episode': 'Episode 8',
                    'release_date': '20211223',
                    'season': 'Season 5',
                    'modified_date': '20220104',
                },
            }],
        }, {
            # Story with only split audio
            'url': 'https://beta.prx.org/stories/326414',
            'info_dict': {
                'id': '326414',
                'title': 'Massachusetts v EPA',
                'description': 'md5:744fffba08f19f4deab69fa8d49d5816',
                'timestamp': 1592509124,
                'modified_timestamp': 1592510457,
                'duration': 3088,
                'tags': 'count:0',
                'series': 'Outside/In',
                'series_id': '36252',
                'channel_id': '206',
                'channel_url': 'https://beta.prx.org/accounts/206',
                'channel': 'New Hampshire Public Radio',
            },
            'playlist_count': 4,
        }, {
            # Story with single combined audio
            'url': 'https://beta.prx.org/stories/400404',
            'info_dict': {
                'id': '400404',
                'title': 'Cafe Chill (Episode 2022-01)',
                'thumbnails': 'count:1',
                'description': 'md5:9f1b5a3cbd64fb159d08c3baa31f1539',
                'timestamp': 1641233952,
                'modified_timestamp': 1641234248,
                'duration': 3540,
                'series': 'Café Chill',
                'series_id': '37762',
                'channel_id': '5767',
                'channel_url': 'https://beta.prx.org/accounts/5767',
                'channel': 'C89.5 - KNHC Seattle',
                'ext': 'mp3',
                'tags': 'count:0',
                'thumbnail': r're:https?://cms\.prx\.org/pub/\w+/0/web/story_image/767965/medium/Aurora_Over_Trees\.jpg',
                'upload_date': '20220103',
                'modified_date': '20220103',
            },
        }, {
            'url': 'https://listen.prx.org/stories/399200',
            'only_matching': True,
        },
    ]

    def _extract_audio_pieces(self, audio_response):
        """
        Build one format dict per audio piece embedded in `audio_response`.

        Pieces are ordered by their `position` field. Returns an empty list
        when the response embeds no items.
        """
        def position_key(piece):
            position = int_or_none(piece.get('position'))
            # Pieces without a usable position sort last (keeping their relative
            # order) instead of raising TypeError when None is compared to an int
            return (position is None, position or 0)

        pieces = self._get_prx_embed_response(audio_response, 'items') or []
        return [{
            'format_id': str_or_none(piece_response.get('id')),
            'format_note': str_or_none(piece_response.get('label')),
            'filesize': int_or_none(piece_response.get('size')),
            'duration': int_or_none(piece_response.get('duration')),
            'ext': mimetype2ext(piece_response.get('contentType')),
            # NOTE(review): the unit of 'frequency' is not visible here - confirm scale=1000 is intended
            'asr': int_or_none(piece_response.get('frequency'), scale=1000),
            'abr': int_or_none(piece_response.get('bitRate')),
            'url': self._extract_file_link(piece_response),
            'vcodec': 'none',
        } for piece_response in sorted(pieces, key=position_key)]

    def _extract_story(self, story_response):
        """
        Build the result for a story response.

        Returns None when no story info can be extracted. A story with exactly
        one audio piece becomes a single entry carrying that piece as its only
        format; otherwise a `multi_video` result is returned with one entry per
        piece, each with the id `<story id>_part<n>`.
        """
        info = self._extract_story_info(story_response)
        if not info:
            return
        audio_pieces = self._extract_audio_pieces(
            self._get_prx_embed_response(story_response, 'audio'))
        if len(audio_pieces) == 1:
            return {
                'formats': audio_pieces,
                **info,
            }

        entries = [{
            **info,
            'id': '{}_part{}'.format(info['id'], (idx + 1)),
            'formats': [fmt],
        } for idx, fmt in enumerate(audio_pieces)]
        return {
            '_type': 'multi_video',
            'entries': entries,
            **info,
        }

    def _real_extract(self, url):
        """Fetch the story named by `url` from the API and extract it."""
        story_id = self._match_id(url)
        response = self._call_api(story_id, f'stories/{story_id}')
        return self._extract_story(response)
325 | ||
326 | ||
class PRXSeriesIE(PRXBaseIE):
    """Extractor for PRX series pages (`series/<id>`), returned as a playlist of stories."""

    _VALID_URL = PRXBaseIE.PRX_BASE_URL_RE % r'series/(?P<id>\d+)'
    _TESTS = [
        {
            'url': 'https://beta.prx.org/series/36252',
            'info_dict': {
                'id': '36252',
                'title': 'Outside/In',
                'thumbnails': 'count:1',
                'description': 'md5:a6bedc5f810777bcb09ab30ff9059114',
                'timestamp': 1470684964,
                'modified_timestamp': 1582308830,
                'channel_id': '206',
                'channel_url': 'https://beta.prx.org/accounts/206',
                'channel': 'New Hampshire Public Radio',
                'series': 'Outside/In',
                'series_id': '36252',
            },
            'playlist_mincount': 39,
        }, {
            # Blank series
            'url': 'https://beta.prx.org/series/25038',
            'info_dict': {
                'id': '25038',
                'title': '25038',
                'timestamp': 1207612800,
                'modified_timestamp': 1207612800,
                'channel_id': '206',
                'channel_url': 'https://beta.prx.org/accounts/206',
                'channel': 'New Hampshire Public Radio',
                'series': '25038',
                'series_id': '25038',
            },
            'playlist_count': 0,
        },
    ]

    def _extract_series(self, series_response):
        """
        Build a playlist result from a series response.

        The entries are generated from the paginated `series/<id>/stories`
        list. NOTE: `_extract_series_info` returns None for a response without
        an id, in which case the subscript below raises TypeError.
        """
        series_info = self._extract_series_info(series_response)
        series_id = series_info['id']
        story_entries = self._entries(
            series_id, 'series/{}/stories'.format(series_id), self._story_playlist_entry)
        return {
            '_type': 'playlist',
            'entries': story_entries,
            **series_info,
        }

    def _real_extract(self, url):
        """Fetch the series named by `url` from the API and extract it."""
        series_id = self._match_id(url)
        return self._extract_series(
            self._call_api(series_id, f'series/{series_id}'))
376 | ||
377 | ||
class PRXAccountIE(PRXBaseIE):
    """Extractor for PRX account pages (`accounts/<id>`), returned as a playlist of series and stories."""

    _VALID_URL = PRXBaseIE.PRX_BASE_URL_RE % r'accounts/(?P<id>\d+)'
    _TESTS = [{
        'url': 'https://beta.prx.org/accounts/206',
        'info_dict': {
            'id': '206',
            'title': 'New Hampshire Public Radio',
            'description': 'md5:277f2395301d0aca563c80c70a18ee0a',
            'channel_id': '206',
            'channel_url': 'https://beta.prx.org/accounts/206',
            'channel': 'New Hampshire Public Radio',
            'thumbnails': 'count:1',
        },
        'playlist_mincount': 380,
    }]

    def _extract_account(self, account_response):
        """
        Build a playlist result from an account response.

        The entries are the account's series followed by its stories, each
        generated from its own paginated list. NOTE: `_extract_account_info`
        returns None for a response without an id, in which case the subscript
        below raises TypeError.
        """
        info = self._extract_account_info(account_response)
        series_entries = self._entries(
            info['id'], f'accounts/{info["id"]}/series', self._series_playlist_entry)
        story_entries = self._entries(
            info['id'], f'accounts/{info["id"]}/stories', self._story_playlist_entry)
        playlist = {
            '_type': 'playlist',
            # Series come first, then the individual stories
            'entries': itertools.chain(series_entries, story_entries),
            **info,
        }
        return playlist

    def _real_extract(self, url):
        """Fetch the account named by `url` from the API and extract it."""
        account_id = self._match_id(url)
        return self._extract_account(
            self._call_api(account_id, f'accounts/{account_id}'))
410 | ||
411 | ||
class PRXStoriesSearchIE(PRXBaseIE, SearchInfoExtractor):
    """Search extractor that yields story entries from the `stories/search` API list."""

    IE_DESC = 'PRX Stories Search'
    IE_NAME = 'prxstories:search'
    _SEARCH_KEY = 'prxstories'

    def _search_results(self, query):
        """Yield a `url` entry for every story the API returns for `query`."""
        matches = self._entries(
            f'query {query}', 'stories/search', self._story_playlist_entry, query={'q': query})
        yield from matches
420 | ||
421 | ||
class PRXSeriesSearchIE(PRXBaseIE, SearchInfoExtractor):
    """Search extractor that yields series entries from the `series/search` API list."""

    IE_DESC = 'PRX Series Search'
    IE_NAME = 'prxseries:search'
    _SEARCH_KEY = 'prxseries'

    def _search_results(self, query):
        """Yield a `url` entry for every series the API returns for `query`."""
        matches = self._entries(
            f'query {query}', 'series/search', self._series_playlist_entry, query={'q': query})
        yield from matches