]> jfr.im git - yt-dlp.git/blob - yt_dlp/extractor/prx.py
[panopto] Add extractors (#2908)
[yt-dlp.git] / yt_dlp / extractor / prx.py
1 # coding: utf-8
2 from __future__ import unicode_literals
3
4 import itertools
5 from .common import InfoExtractor, SearchInfoExtractor
6 from ..utils import (
7 urljoin,
8 traverse_obj,
9 int_or_none,
10 mimetype2ext,
11 clean_html,
12 url_or_none,
13 unified_timestamp,
14 str_or_none,
15 )
16
17
class PRXBaseIE(InfoExtractor):
    """Common base for the PRX extractors: CMS API access plus metadata mapping helpers."""

    # URL template shared by the PRX extractors; subclasses fill %s with their path pattern.
    # Every dot of the host name is escaped so that only the literal "prx.org" matches.
    PRX_BASE_URL_RE = r'https?://(?:(?:beta|listen)\.)?prx\.org/%s'

    def _call_api(self, item_id, path, query=None, fatal=True, note='Downloading CMS API JSON'):
        """
        Download JSON from the CMS API
        @param item_id: Identifier handed to _download_json together with the request
        @param path: API path, joined onto https://cms.prx.org/api/v1/
        @param query: Optional query parameters, forwarded to _download_json
        @param fatal: Forwarded to _download_json
        @param note: Forwarded to _download_json
        """
        return self._download_json(
            urljoin('https://cms.prx.org/api/v1/', path), item_id, query=query, fatal=fatal, note=note)

    @staticmethod
    def _get_prx_embed_response(response, section):
        """Look up response['_embedded']['prx:<section>'] through traverse_obj"""
        return traverse_obj(response, ('_embedded', f'prx:{section}'))

    @staticmethod
    def _extract_file_link(response):
        """Return response['_links']['enclosure']['href'] after passing it through url_or_none"""
        return url_or_none(traverse_obj(
            response, ('_links', 'enclosure', 'href'), expected_type=str))

    @classmethod
    def _extract_image(cls, image_response):
        """
        Build a thumbnail dict from an image response
        Returns None when image_response is not a dict
        """
        if not isinstance(image_response, dict):
            return
        return {
            'id': str_or_none(image_response.get('id')),
            # Numeric fields are coerced with int_or_none, as is done for audio pieces
            'filesize': int_or_none(image_response.get('size')),
            'width': int_or_none(image_response.get('width')),
            'height': int_or_none(image_response.get('height')),
            'url': cls._extract_file_link(image_response)
        }

    @classmethod
    def _extract_base_info(cls, response):
        """
        Map the fields shared by story, series and account responses to info dict keys
        Returns None when response is not a dict or carries no id
        """
        if not isinstance(response, dict):
            return
        item_id = str_or_none(response.get('id'))
        if not item_id:
            return
        thumbnail_dict = cls._extract_image(cls._get_prx_embed_response(response, 'image'))
        # Prefer the cleaned full description; fall back to the short one
        description = (
            clean_html(response.get('description'))
            or response.get('shortDescription'))
        return {
            'id': item_id,
            # The id doubles as the title when the response has none
            'title': response.get('title') or item_id,
            'thumbnails': [thumbnail_dict] if thumbnail_dict else None,
            'description': description,
            'release_timestamp': unified_timestamp(response.get('releasedAt')),
            'timestamp': unified_timestamp(response.get('createdAt')),
            'modified_timestamp': unified_timestamp(response.get('updatedAt')),
            'duration': int_or_none(response.get('duration')),
            'tags': response.get('tags'),
            'episode_number': int_or_none(response.get('episodeIdentifier')),
            'season_number': int_or_none(response.get('seasonIdentifier'))
        }

    @classmethod
    def _extract_series_info(cls, series_response):
        """
        Build the info dict for a series: base info plus channel fields taken from
        the embedded account, with the series' own title and id as 'series'/'series_id'
        Returns None when no base info can be extracted
        """
        base_info = cls._extract_base_info(series_response)
        if not base_info:
            return
        account_info = cls._extract_account_info(
            cls._get_prx_embed_response(series_response, 'account')) or {}
        return {
            **base_info,
            'channel_id': account_info.get('channel_id'),
            'channel_url': account_info.get('channel_url'),
            'channel': account_info.get('channel'),
            'series': base_info.get('title'),
            'series_id': base_info.get('id'),
        }

    @classmethod
    def _extract_account_info(cls, account_response):
        """
        Build the info dict for an account: base info with the account name used
        as both 'title' and 'channel', and the account id as 'channel_id'
        Returns None when no base info can be extracted
        """
        base_info = cls._extract_base_info(account_response)
        if not base_info:
            return
        name = account_response.get('name')
        return {
            **base_info,
            'title': name,
            'channel_id': base_info.get('id'),
            'channel_url': 'https://beta.prx.org/accounts/%s' % base_info.get('id'),
            'channel': name,
        }

    @classmethod
    def _extract_story_info(cls, story_response):
        """
        Build the info dict for a story: base info plus series and channel fields
        taken from the embedded series and account responses
        Returns None when no base info can be extracted
        """
        base_info = cls._extract_base_info(story_response)
        if not base_info:
            return
        series = cls._extract_series_info(
            cls._get_prx_embed_response(story_response, 'series')) or {}
        account = cls._extract_account_info(
            cls._get_prx_embed_response(story_response, 'account')) or {}
        return {
            **base_info,
            'series': series.get('series'),
            'series_id': series.get('series_id'),
            'channel_id': account.get('channel_id'),
            'channel_url': account.get('channel_url'),
            'channel': account.get('channel')
        }

    def _entries(self, item_id, endpoint, entry_func, query=None):
        """
        Extract entries from paginated list API
        @param item_id: Identifier used to label each page request
        @param endpoint: API path of the list to page through
        @param entry_func: Function to generate entry from response item
        @param query: Optional extra query parameters sent with every page request
        """
        total = 0
        for page in itertools.count(1):
            response = self._call_api(f'{item_id}: page {page}', endpoint, query={
                **(query or {}),
                'page': page,
                'per': 100
            })
            items = self._get_prx_embed_response(response, 'items')
            if not response or not items:
                break

            # Items for which entry_func returns nothing are dropped
            yield from filter(None, map(entry_func, items))

            # Stop once the running sum of per-page counts reaches the reported total
            total += response['count']
            if total >= response['total']:
                break

    def _story_playlist_entry(self, response):
        """Turn a story response into a 'url' entry handled by PRXStoryIE (None if unusable)"""
        story = self._extract_story_info(response)
        if not story:
            return
        story.update({
            '_type': 'url',
            'url': 'https://beta.prx.org/stories/%s' % story['id'],
            'ie_key': PRXStoryIE.ie_key()
        })
        return story

    def _series_playlist_entry(self, response):
        """Turn a series response into a 'url' entry handled by PRXSeriesIE (None if unusable)"""
        series = self._extract_series_info(response)
        if not series:
            return
        series.update({
            '_type': 'url',
            'url': 'https://beta.prx.org/series/%s' % series['id'],
            'ie_key': PRXSeriesIE.ie_key()
        })
        return series
162
163
class PRXStoryIE(PRXBaseIE):
    """Extractor for a single PRX story, which may consist of one or several audio pieces."""

    _VALID_URL = PRXBaseIE.PRX_BASE_URL_RE % r'stories/(?P<id>\d+)'

    _TESTS = [
        {
            # Story with season and episode details
            'url': 'https://beta.prx.org/stories/399200',
            'info_dict': {
                'id': '399200',
                'title': 'Fly Me To The Moon',
                'description': 'md5:43230168390b95d3322048d8a56bf2bb',
                'release_timestamp': 1640250000,
                'timestamp': 1640208972,
                'modified_timestamp': 1641318202,
                'duration': 1004,
                'tags': 'count:7',
                'episode_number': 8,
                'season_number': 5,
                'series': 'AirSpace',
                'series_id': '38057',
                'channel_id': '220986',
                'channel_url': 'https://beta.prx.org/accounts/220986',
                'channel': 'Air and Space Museum',
            },
            'playlist': [{
                'info_dict': {
                    'id': '399200_part1',
                    'title': 'Fly Me To The Moon',
                    'description': 'md5:43230168390b95d3322048d8a56bf2bb',
                    'release_timestamp': 1640250000,
                    'timestamp': 1640208972,
                    'modified_timestamp': 1641318202,
                    'duration': 530,
                    'tags': 'count:7',
                    'episode_number': 8,
                    'season_number': 5,
                    'series': 'AirSpace',
                    'series_id': '38057',
                    'channel_id': '220986',
                    'channel_url': 'https://beta.prx.org/accounts/220986',
                    'channel': 'Air and Space Museum',
                    'ext': 'mp3',
                    'upload_date': '20211222',
                    'episode': 'Episode 8',
                    'release_date': '20211223',
                    'season': 'Season 5',
                    'modified_date': '20220104'
                }
            }, {
                'info_dict': {
                    'id': '399200_part2',
                    'title': 'Fly Me To The Moon',
                    'description': 'md5:43230168390b95d3322048d8a56bf2bb',
                    'release_timestamp': 1640250000,
                    'timestamp': 1640208972,
                    'modified_timestamp': 1641318202,
                    'duration': 474,
                    'tags': 'count:7',
                    'episode_number': 8,
                    'season_number': 5,
                    'series': 'AirSpace',
                    'series_id': '38057',
                    'channel_id': '220986',
                    'channel_url': 'https://beta.prx.org/accounts/220986',
                    'channel': 'Air and Space Museum',
                    'ext': 'mp3',
                    'upload_date': '20211222',
                    'episode': 'Episode 8',
                    'release_date': '20211223',
                    'season': 'Season 5',
                    'modified_date': '20220104'
                }
            }
            ]
        }, {
            # Story with only split audio
            'url': 'https://beta.prx.org/stories/326414',
            'info_dict': {
                'id': '326414',
                'title': 'Massachusetts v EPA',
                'description': 'md5:744fffba08f19f4deab69fa8d49d5816',
                'timestamp': 1592509124,
                'modified_timestamp': 1592510457,
                'duration': 3088,
                'tags': 'count:0',
                'series': 'Outside/In',
                'series_id': '36252',
                'channel_id': '206',
                'channel_url': 'https://beta.prx.org/accounts/206',
                'channel': 'New Hampshire Public Radio',
            },
            'playlist_count': 4
        }, {
            # Story with single combined audio
            'url': 'https://beta.prx.org/stories/400404',
            'info_dict': {
                'id': '400404',
                'title': 'Cafe Chill (Episode 2022-01)',
                'thumbnails': 'count:1',
                'description': 'md5:9f1b5a3cbd64fb159d08c3baa31f1539',
                'timestamp': 1641233952,
                'modified_timestamp': 1641234248,
                'duration': 3540,
                'series': 'Café Chill',
                'series_id': '37762',
                'channel_id': '5767',
                'channel_url': 'https://beta.prx.org/accounts/5767',
                'channel': 'C89.5 - KNHC Seattle',
                'ext': 'mp3',
                'tags': 'count:0',
                'thumbnail': r're:https?://cms\.prx\.org/pub/\w+/0/web/story_image/767965/medium/Aurora_Over_Trees\.jpg',
                'upload_date': '20220103',
                'modified_date': '20220103'
            }
        }, {
            'url': 'https://listen.prx.org/stories/399200',
            'only_matching': True
        }
    ]

    def _extract_audio_pieces(self, audio_response):
        """
        Build one format dict per embedded audio piece, ordered by the piece's 'position'
        @param audio_response: Audio response whose embedded 'items' are the pieces
        @returns List of format dicts; empty when there are no embedded items
        """
        def position_key(piece):
            # A bare int_or_none() key yields None for a piece without a usable
            # position, and sorting None against ints raises TypeError. The tuple
            # key keeps positioned pieces in numeric order and puts the rest last.
            position = int_or_none(piece.get('position'))
            return position is None, position or 0

        pieces = self._get_prx_embed_response(audio_response, 'items') or []
        return [{
            'format_id': str_or_none(piece_response.get('id')),
            'format_note': str_or_none(piece_response.get('label')),
            'filesize': int_or_none(piece_response.get('size')),
            'duration': int_or_none(piece_response.get('duration')),
            'ext': mimetype2ext(piece_response.get('contentType')),
            'asr': int_or_none(piece_response.get('frequency'), scale=1000),
            'abr': int_or_none(piece_response.get('bitRate')),
            'url': self._extract_file_link(piece_response),
            'vcodec': 'none'
        } for piece_response in sorted(pieces, key=position_key)]

    def _extract_story(self, story_response):
        """
        Build the result for a story response
        A story with exactly one audio piece becomes a single item carrying that
        piece as its only format; otherwise a 'multi_video' result is returned with
        one entry per piece, ids suffixed '_part<N>' counting from 1
        Returns None when no story info can be extracted
        """
        info = self._extract_story_info(story_response)
        if not info:
            return
        audio_pieces = self._extract_audio_pieces(
            self._get_prx_embed_response(story_response, 'audio'))
        if len(audio_pieces) == 1:
            return {
                'formats': audio_pieces,
                **info
            }

        entries = [{
            **info,
            'id': '%s_part%d' % (info['id'], (idx + 1)),
            'formats': [fmt],
        } for idx, fmt in enumerate(audio_pieces)]
        return {
            '_type': 'multi_video',
            'entries': entries,
            **info
        }

    def _real_extract(self, url):
        """Fetch the story named in the URL from the API and extract it"""
        story_id = self._match_id(url)
        response = self._call_api(story_id, f'stories/{story_id}')
        return self._extract_story(response)
327
328
class PRXSeriesIE(PRXBaseIE):
    """Extractor for a PRX series, returned as a playlist of its stories."""

    _VALID_URL = PRXBaseIE.PRX_BASE_URL_RE % r'series/(?P<id>\d+)'
    _TESTS = [
        {
            'url': 'https://beta.prx.org/series/36252',
            'info_dict': {
                'id': '36252',
                'title': 'Outside/In',
                'thumbnails': 'count:1',
                'description': 'md5:a6bedc5f810777bcb09ab30ff9059114',
                'timestamp': 1470684964,
                'modified_timestamp': 1582308830,
                'channel_id': '206',
                'channel_url': 'https://beta.prx.org/accounts/206',
                'channel': 'New Hampshire Public Radio',
                'series': 'Outside/In',
                'series_id': '36252'
            },
            'playlist_mincount': 39
        }, {
            # Blank series
            'url': 'https://beta.prx.org/series/25038',
            'info_dict': {
                'id': '25038',
                'title': '25038',
                'timestamp': 1207612800,
                'modified_timestamp': 1207612800,
                'channel_id': '206',
                'channel_url': 'https://beta.prx.org/accounts/206',
                'channel': 'New Hampshire Public Radio',
                'series': '25038',
                'series_id': '25038'
            },
            'playlist_count': 0
        }
    ]

    def _extract_series(self, series_response):
        """
        Build a playlist result for a series response
        Entries are generated lazily from the paginated series/<id>/stories endpoint
        Returns None when no series info can be extracted
        """
        info = self._extract_series_info(series_response)
        # _extract_series_info gives None for a non-dict response or one without
        # an id; bail out like PRXStoryIE._extract_story does rather than
        # failing on info['id'] below
        if not info:
            return
        return {
            '_type': 'playlist',
            'entries': self._entries(info['id'], 'series/%s/stories' % info['id'], self._story_playlist_entry),
            **info
        }

    def _real_extract(self, url):
        """Fetch the series named in the URL from the API and extract it"""
        series_id = self._match_id(url)
        response = self._call_api(series_id, f'series/{series_id}')
        return self._extract_series(response)
378
379
class PRXAccountIE(PRXBaseIE):
    """Extractor for a PRX account, returned as a playlist of its series followed by its stories."""

    _VALID_URL = PRXBaseIE.PRX_BASE_URL_RE % r'accounts/(?P<id>\d+)'
    _TESTS = [{
        'url': 'https://beta.prx.org/accounts/206',
        'info_dict': {
            'id': '206',
            'title': 'New Hampshire Public Radio',
            'description': 'md5:277f2395301d0aca563c80c70a18ee0a',
            'channel_id': '206',
            'channel_url': 'https://beta.prx.org/accounts/206',
            'channel': 'New Hampshire Public Radio',
            'thumbnails': 'count:1'
        },
        'playlist_mincount': 380
    }]

    def _extract_account(self, account_response):
        """
        Build a playlist result for an account response
        Entries are generated lazily: first everything from accounts/<id>/series,
        then everything from accounts/<id>/stories
        Returns None when no account info can be extracted
        """
        info = self._extract_account_info(account_response)
        # _extract_account_info gives None for a non-dict response or one without
        # an id; bail out like PRXStoryIE._extract_story does rather than
        # failing on info['id'] below
        if not info:
            return
        series = self._entries(
            info['id'], f'accounts/{info["id"]}/series', self._series_playlist_entry)
        stories = self._entries(
            info['id'], f'accounts/{info["id"]}/stories', self._story_playlist_entry)
        return {
            '_type': 'playlist',
            'entries': itertools.chain(series, stories),
            **info
        }

    def _real_extract(self, url):
        """Fetch the account named in the URL from the API and extract it"""
        account_id = self._match_id(url)
        response = self._call_api(account_id, f'accounts/{account_id}')
        return self._extract_account(response)
412
413
class PRXStoriesSearchIE(PRXBaseIE, SearchInfoExtractor):
    """Search extractor yielding the PRX stories that match a query."""

    IE_DESC = 'PRX Stories Search'
    IE_NAME = 'prxstories:search'
    _SEARCH_KEY = 'prxstories'

    def _search_results(self, query):
        """Yield a story URL entry for every item of the paginated stories/search endpoint"""
        request_label = f'query {query}'
        search_params = {'q': query}
        yield from self._entries(
            request_label, 'stories/search', self._story_playlist_entry, query=search_params)
422
423
class PRXSeriesSearchIE(PRXBaseIE, SearchInfoExtractor):
    """Search extractor yielding the PRX series that match a query."""

    IE_DESC = 'PRX Series Search'
    IE_NAME = 'prxseries:search'
    _SEARCH_KEY = 'prxseries'

    def _search_results(self, query):
        """Yield a series URL entry for every item of the paginated series/search endpoint"""
        request_label = f'query {query}'
        search_params = {'q': query}
        yield from self._entries(
            request_label, 'series/search', self._series_playlist_entry, query=search_params)