jfr.im git - yt-dlp.git/blame_incremental

... / ...

Commit	Line	Data
	1	# coding: utf-8
	2	from __future__ import unicode_literals
	3
	4	import itertools
	5	from .common import InfoExtractor, SearchInfoExtractor
	6	from ..utils import (
	7	urljoin,
	8	traverse_obj,
	9	int_or_none,
	10	mimetype2ext,
	11	clean_html,
	12	url_or_none,
	13	unified_timestamp,
	14	str_or_none,
	15	)
	16
	17
	18	class PRXBaseIE(InfoExtractor):
	19	PRX_BASE_URL_RE = r'https?://(?:(?:beta\|listen)\.)?prx.org/%s'
	20
	21	def _call_api(self, item_id, path, query=None, fatal=True, note='Downloading CMS API JSON'):
	22	return self._download_json(
	23	urljoin('https://cms.prx.org/api/v1/', path), item_id, query=query, fatal=fatal, note=note)
	24
	25	@staticmethod
	26	def _get_prx_embed_response(response, section):
	27	return traverse_obj(response, ('_embedded', f'prx:{section}'))
	28
	29	@staticmethod
	30	def _extract_file_link(response):
	31	return url_or_none(traverse_obj(
	32	response, ('_links', 'enclosure', 'href'), expected_type=str))
	33
	34	@classmethod
	35	def _extract_image(cls, image_response):
	36	if not isinstance(image_response, dict):
	37	return
	38	return {
	39	'id': str_or_none(image_response.get('id')),
	40	'filesize': image_response.get('size'),
	41	'width': image_response.get('width'),
	42	'height': image_response.get('height'),
	43	'url': cls._extract_file_link(image_response)
	44	}
	45
	46	@classmethod
	47	def _extract_base_info(cls, response):
	48	if not isinstance(response, dict):
	49	return
	50	item_id = str_or_none(response.get('id'))
	51	if not item_id:
	52	return
	53	thumbnail_dict = cls._extract_image(cls._get_prx_embed_response(response, 'image'))
	54	description = (
	55	clean_html(response.get('description'))
	56	or response.get('shortDescription'))
	57	return {
	58	'id': item_id,
	59	'title': response.get('title') or item_id,
	60	'thumbnails': [thumbnail_dict] if thumbnail_dict else None,
	61	'description': description,
	62	'release_timestamp': unified_timestamp(response.get('releasedAt')),
	63	'timestamp': unified_timestamp(response.get('createdAt')),
	64	'modified_timestamp': unified_timestamp(response.get('updatedAt')),
	65	'duration': int_or_none(response.get('duration')),
	66	'tags': response.get('tags'),
	67	'episode_number': int_or_none(response.get('episodeIdentifier')),
	68	'season_number': int_or_none(response.get('seasonIdentifier'))
	69	}
	70
	71	@classmethod
	72	def _extract_series_info(cls, series_response):
	73	base_info = cls._extract_base_info(series_response)
	74	if not base_info:
	75	return
	76	account_info = cls._extract_account_info(
	77	cls._get_prx_embed_response(series_response, 'account')) or {}
	78	return {
	79	**base_info,
	80	'channel_id': account_info.get('channel_id'),
	81	'channel_url': account_info.get('channel_url'),
	82	'channel': account_info.get('channel'),
	83	'series': base_info.get('title'),
	84	'series_id': base_info.get('id'),
	85	}
	86
	87	@classmethod
	88	def _extract_account_info(cls, account_response):
	89	base_info = cls._extract_base_info(account_response)
	90	if not base_info:
	91	return
	92	name = account_response.get('name')
	93	return {
	94	**base_info,
	95	'title': name,
	96	'channel_id': base_info.get('id'),
	97	'channel_url': 'https://beta.prx.org/accounts/%s' % base_info.get('id'),
	98	'channel': name,
	99	}
	100
	101	@classmethod
	102	def _extract_story_info(cls, story_response):
	103	base_info = cls._extract_base_info(story_response)
	104	if not base_info:
	105	return
	106	series = cls._extract_series_info(
	107	cls._get_prx_embed_response(story_response, 'series')) or {}
	108	account = cls._extract_account_info(
	109	cls._get_prx_embed_response(story_response, 'account')) or {}
	110	return {
	111	**base_info,
	112	'series': series.get('series'),
	113	'series_id': series.get('series_id'),
	114	'channel_id': account.get('channel_id'),
	115	'channel_url': account.get('channel_url'),
	116	'channel': account.get('channel')
	117	}
	118
	119	def _entries(self, item_id, endpoint, entry_func, query=None):
	120	"""
	121	Extract entries from paginated list API
	122	@param entry_func: Function to generate entry from response item
	123	"""
	124	total = 0
	125	for page in itertools.count(1):
	126	response = self._call_api(f'{item_id}: page {page}', endpoint, query={
	127	**(query or {}),
	128	'page': page,
	129	'per': 100
	130	})
	131	items = self._get_prx_embed_response(response, 'items')
	132	if not response or not items:
	133	break
	134
	135	yield from filter(None, map(entry_func, items))
	136
	137	total += response['count']
	138	if total >= response['total']:
	139	break
	140
	141	def _story_playlist_entry(self, response):
	142	story = self._extract_story_info(response)
	143	if not story:
	144	return
	145	story.update({
	146	'_type': 'url',
	147	'url': 'https://beta.prx.org/stories/%s' % story['id'],
	148	'ie_key': PRXStoryIE.ie_key()
	149	})
	150	return story
	151
	152	def _series_playlist_entry(self, response):
	153	series = self._extract_series_info(response)
	154	if not series:
	155	return
	156	series.update({
	157	'_type': 'url',
	158	'url': 'https://beta.prx.org/series/%s' % series['id'],
	159	'ie_key': PRXSeriesIE.ie_key()
	160	})
	161	return series
	162
	163
	164	class PRXStoryIE(PRXBaseIE):
	165	_VALID_URL = PRXBaseIE.PRX_BASE_URL_RE % r'stories/(?P<id>\d+)'
	166
	167	_TESTS = [
	168	{
	169	# Story with season and episode details
	170	'url': 'https://beta.prx.org/stories/399200',
	171	'info_dict': {
	172	'id': '399200',
	173	'title': 'Fly Me To The Moon',
	174	'description': 'md5:43230168390b95d3322048d8a56bf2bb',
	175	'release_timestamp': 1640250000,
	176	'timestamp': 1640208972,
	177	'modified_timestamp': 1641318202,
	178	'duration': 1004,
	179	'tags': 'count:7',
	180	'episode_number': 8,
	181	'season_number': 5,
	182	'series': 'AirSpace',
	183	'series_id': '38057',
	184	'channel_id': '220986',
	185	'channel_url': 'https://beta.prx.org/accounts/220986',
	186	'channel': 'Air and Space Museum',
	187	},
	188	'playlist': [{
	189	'info_dict': {
	190	'id': '399200_part1',
	191	'title': 'Fly Me To The Moon',
	192	'description': 'md5:43230168390b95d3322048d8a56bf2bb',
	193	'release_timestamp': 1640250000,
	194	'timestamp': 1640208972,
	195	'modified_timestamp': 1641318202,
	196	'duration': 530,
	197	'tags': 'count:7',
	198	'episode_number': 8,
	199	'season_number': 5,
	200	'series': 'AirSpace',
	201	'series_id': '38057',
	202	'channel_id': '220986',
	203	'channel_url': 'https://beta.prx.org/accounts/220986',
	204	'channel': 'Air and Space Museum',
	205	'ext': 'mp3',
	206	'upload_date': '20211222',
	207	'episode': 'Episode 8',
	208	'release_date': '20211223',
	209	'season': 'Season 5',
	210	'modified_date': '20220104'
	211	}
	212	}, {
	213	'info_dict': {
	214	'id': '399200_part2',
	215	'title': 'Fly Me To The Moon',
	216	'description': 'md5:43230168390b95d3322048d8a56bf2bb',
	217	'release_timestamp': 1640250000,
	218	'timestamp': 1640208972,
	219	'modified_timestamp': 1641318202,
	220	'duration': 474,
	221	'tags': 'count:7',
	222	'episode_number': 8,
	223	'season_number': 5,
	224	'series': 'AirSpace',
	225	'series_id': '38057',
	226	'channel_id': '220986',
	227	'channel_url': 'https://beta.prx.org/accounts/220986',
	228	'channel': 'Air and Space Museum',
	229	'ext': 'mp3',
	230	'upload_date': '20211222',
	231	'episode': 'Episode 8',
	232	'release_date': '20211223',
	233	'season': 'Season 5',
	234	'modified_date': '20220104'
	235	}
	236	}
	237
	238	]
	239	}, {
	240	# Story with only split audio
	241	'url': 'https://beta.prx.org/stories/326414',
	242	'info_dict': {
	243	'id': '326414',
	244	'title': 'Massachusetts v EPA',
	245	'description': 'md5:744fffba08f19f4deab69fa8d49d5816',
	246	'timestamp': 1592509124,
	247	'modified_timestamp': 1592510457,
	248	'duration': 3088,
	249	'tags': 'count:0',
	250	'series': 'Outside/In',
	251	'series_id': '36252',
	252	'channel_id': '206',
	253	'channel_url': 'https://beta.prx.org/accounts/206',
	254	'channel': 'New Hampshire Public Radio',
	255	},
	256	'playlist_count': 4
	257	}, {
	258	# Story with single combined audio
	259	'url': 'https://beta.prx.org/stories/400404',
	260	'info_dict': {
	261	'id': '400404',
	262	'title': 'Cafe Chill (Episode 2022-01)',
	263	'thumbnails': 'count:1',
	264	'description': 'md5:9f1b5a3cbd64fb159d08c3baa31f1539',
	265	'timestamp': 1641233952,
	266	'modified_timestamp': 1641234248,
	267	'duration': 3540,
	268	'series': 'Café Chill',
	269	'series_id': '37762',
	270	'channel_id': '5767',
	271	'channel_url': 'https://beta.prx.org/accounts/5767',
	272	'channel': 'C89.5 - KNHC Seattle',
	273	'ext': 'mp3',
	274	'tags': 'count:0',
	275	'thumbnail': r're:https?://cms\.prx\.org/pub/\w+/0/web/story_image/767965/medium/Aurora_Over_Trees\.jpg',
	276	'upload_date': '20220103',
	277	'modified_date': '20220103'
	278	}
	279	}, {
	280	'url': 'https://listen.prx.org/stories/399200',
	281	'only_matching': True
	282	}
	283	]
	284
	285	def _extract_audio_pieces(self, audio_response):
	286	return [{
	287	'format_id': str_or_none(piece_response.get('id')),
	288	'format_note': str_or_none(piece_response.get('label')),
	289	'filesize': int_or_none(piece_response.get('size')),
	290	'duration': int_or_none(piece_response.get('duration')),
	291	'ext': mimetype2ext(piece_response.get('contentType')),
	292	'asr': int_or_none(piece_response.get('frequency'), scale=1000),
	293	'abr': int_or_none(piece_response.get('bitRate')),
	294	'url': self._extract_file_link(piece_response),
	295	'vcodec': 'none'
	296	} for piece_response in sorted(
	297	self._get_prx_embed_response(audio_response, 'items') or [],
	298	key=lambda p: int_or_none(p.get('position')))]
	299
	300	def _extract_story(self, story_response):
	301	info = self._extract_story_info(story_response)
	302	if not info:
	303	return
	304	audio_pieces = self._extract_audio_pieces(
	305	self._get_prx_embed_response(story_response, 'audio'))
	306	if len(audio_pieces) == 1:
	307	return {
	308	'formats': audio_pieces,
	309	**info
	310	}
	311
	312	entries = [{
	313	**info,
	314	'id': '%s_part%d' % (info['id'], (idx + 1)),
	315	'formats': [fmt],
	316	} for idx, fmt in enumerate(audio_pieces)]
	317	return {
	318	'_type': 'multi_video',
	319	'entries': entries,
	320	**info
	321	}
	322
	323	def _real_extract(self, url):
	324	story_id = self._match_id(url)
	325	response = self._call_api(story_id, f'stories/{story_id}')
	326	return self._extract_story(response)
	327
	328
	329	class PRXSeriesIE(PRXBaseIE):
	330	_VALID_URL = PRXBaseIE.PRX_BASE_URL_RE % r'series/(?P<id>\d+)'
	331	_TESTS = [
	332	{
	333	'url': 'https://beta.prx.org/series/36252',
	334	'info_dict': {
	335	'id': '36252',
	336	'title': 'Outside/In',
	337	'thumbnails': 'count:1',
	338	'description': 'md5:a6bedc5f810777bcb09ab30ff9059114',
	339	'timestamp': 1470684964,
	340	'modified_timestamp': 1582308830,
	341	'channel_id': '206',
	342	'channel_url': 'https://beta.prx.org/accounts/206',
	343	'channel': 'New Hampshire Public Radio',
	344	'series': 'Outside/In',
	345	'series_id': '36252'
	346	},
	347	'playlist_mincount': 39
	348	}, {
	349	# Blank series
	350	'url': 'https://beta.prx.org/series/25038',
	351	'info_dict': {
	352	'id': '25038',
	353	'title': '25038',
	354	'timestamp': 1207612800,
	355	'modified_timestamp': 1207612800,
	356	'channel_id': '206',
	357	'channel_url': 'https://beta.prx.org/accounts/206',
	358	'channel': 'New Hampshire Public Radio',
	359	'series': '25038',
	360	'series_id': '25038'
	361	},
	362	'playlist_count': 0
	363	}
	364	]
	365
	366	def _extract_series(self, series_response):
	367	info = self._extract_series_info(series_response)
	368	return {
	369	'_type': 'playlist',
	370	'entries': self._entries(info['id'], 'series/%s/stories' % info['id'], self._story_playlist_entry),
	371	**info
	372	}
	373
	374	def _real_extract(self, url):
	375	series_id = self._match_id(url)
	376	response = self._call_api(series_id, f'series/{series_id}')
	377	return self._extract_series(response)
	378
	379
	380	class PRXAccountIE(PRXBaseIE):
	381	_VALID_URL = PRXBaseIE.PRX_BASE_URL_RE % r'accounts/(?P<id>\d+)'
	382	_TESTS = [{
	383	'url': 'https://beta.prx.org/accounts/206',
	384	'info_dict': {
	385	'id': '206',
	386	'title': 'New Hampshire Public Radio',
	387	'description': 'md5:277f2395301d0aca563c80c70a18ee0a',
	388	'channel_id': '206',
	389	'channel_url': 'https://beta.prx.org/accounts/206',
	390	'channel': 'New Hampshire Public Radio',
	391	'thumbnails': 'count:1'
	392	},
	393	'playlist_mincount': 380
	394	}]
	395
	396	def _extract_account(self, account_response):
	397	info = self._extract_account_info(account_response)
	398	series = self._entries(
	399	info['id'], f'accounts/{info["id"]}/series', self._series_playlist_entry)
	400	stories = self._entries(
	401	info['id'], f'accounts/{info["id"]}/stories', self._story_playlist_entry)
	402	return {
	403	'_type': 'playlist',
	404	'entries': itertools.chain(series, stories),
	405	**info
	406	}
	407
	408	def _real_extract(self, url):
	409	account_id = self._match_id(url)
	410	response = self._call_api(account_id, f'accounts/{account_id}')
	411	return self._extract_account(response)
	412
	413
	414	class PRXStoriesSearchIE(PRXBaseIE, SearchInfoExtractor):
	415	IE_DESC = 'PRX Stories Search'
	416	IE_NAME = 'prxstories:search'
	417	_SEARCH_KEY = 'prxstories'
	418
	419	def _search_results(self, query):
	420	yield from self._entries(
	421	f'query {query}', 'stories/search', self._story_playlist_entry, query={'q': query})
	422
	423
	424	class PRXSeriesSearchIE(PRXBaseIE, SearchInfoExtractor):
	425	IE_DESC = 'PRX Series Search'
	426	IE_NAME = 'prxseries:search'
	427	_SEARCH_KEY = 'prxseries'
	428
	429	def _search_results(self, query):
	430	yield from self._entries(
	431	f'query {query}', 'series/search', self._series_playlist_entry, query={'q': query})