jfr.im git - yt-dlp.git/blame_incremental - yt_dlp/extractor/applepodcasts.py

... / ...

Commit	Line	Data
	1	from .common import InfoExtractor
	2	from ..utils import (
	3	clean_html,
	4	clean_podcast_url,
	5	get_element_by_class,
	6	int_or_none,
	7	parse_iso8601,
	8	try_get,
	9	)
	10
	11
	class ApplePodcastsIE(InfoExtractor):
	    """Extract a single podcast episode from a podcasts.apple.com page.

	    The episode id is the numeric ``i`` query parameter of the URL. Episode
	    metadata is read from JSON embedded in the page: first from the element
	    with id ``shoebox-media-api-cache-amp-podcasts`` (page type introduced
	    2021-11), then, if that yields nothing, from the older
	    ``shoebox-ember-data-store`` element.
	    """

	    _VALID_URL = r'https?://podcasts\.apple\.com/(?:[^/]+/)?podcast(?:/[^/]+){1,2}.*?\bi=(?P<id>\d+)'
	    _TESTS = [{
	        'url': 'https://podcasts.apple.com/us/podcast/207-whitney-webb-returns/id1135137367?i=1000482637777',
	        'md5': '41dc31cd650143e530d9423b6b5a344f',
	        'info_dict': {
	            'id': '1000482637777',
	            'ext': 'mp3',
	            'title': '207 - Whitney Webb Returns',
	            'description': 'md5:75ef4316031df7b41ced4e7b987f79c6',
	            'upload_date': '20200705',
	            'timestamp': 1593932400,
	            'duration': 6454,
	            'series': 'The Tim Dillon Show',
	            # Raw string with plain `|`: the alternation must not be escaped,
	            # otherwise the pattern only matches a literal pipe character.
	            'thumbnail': r're:.+[.](png|jpe?g|webp)',
	        }
	    }, {
	        'url': 'https://podcasts.apple.com/podcast/207-whitney-webb-returns/id1135137367?i=1000482637777',
	        'only_matching': True,
	    }, {
	        'url': 'https://podcasts.apple.com/podcast/207-whitney-webb-returns?i=1000482637777',
	        'only_matching': True,
	    }, {
	        'url': 'https://podcasts.apple.com/podcast/id1135137367?i=1000482637777',
	        'only_matching': True,
	    }]

	    def _extract_amp_items(self, webpage, episode_id):
	        """Return the list stored under ``d`` in the AMP cache entry for the episode.

	        The AMP cache is a JSON object whose values are themselves JSON
	        strings; the entry whose key contains ``episode_id`` is selected and
	        decoded. Every step is best-effort: a missing element, a missing
	        entry, undecodable JSON or an unexpected type all yield ``[]`` so the
	        caller can fall back to the older page format.
	        """
	        # `[^>]*` skips any remaining tag attributes and `\s*` lets the JSON
	        # start directly after `>`.
	        amp_cache = self._parse_json(self._search_regex(
	            r'(?s)id="shoebox-media-api-cache-amp-podcasts"[^>]*>\s*({.+?})\s*<',
	            webpage, 'AMP data', default='{}'), episode_id, fatal=False) or {}
	        if not isinstance(amp_cache, dict):
	            return []

	        # Use a default with next() so that "no matching key" cannot surface
	        # as a StopIteration.
	        raw_entry = next(
	            (value for key, value in amp_cache.items() if episode_id in key),
	            None)
	        if not isinstance(raw_entry, str):
	            return []

	        amp_entry = self._parse_json(raw_entry, episode_id, fatal=False)
	        if not isinstance(amp_entry, dict):
	            return []

	        amp_items = amp_entry.get('d')
	        return amp_items if isinstance(amp_items, list) else []

	    def _real_extract(self, url):
	        """Download the episode page and build the info dict for the episode.

	        Raises ``KeyError`` if the episode attributes lack ``assetUrl``.
	        """
	        episode_id = self._match_id(url)
	        webpage = self._download_webpage(url, episode_id)
	        ember_data = {}

	        # new page type 2021-11
	        amp_data = self._extract_amp_items(webpage, episode_id)
	        episode_data = next(
	            (item for item in amp_data
	             if isinstance(item, dict)
	             and item.get('type') == 'podcast-episodes'
	             and item.get('id') == episode_id),
	            None)

	        if not episode_data:
	            # try pre 2021-11 page type: TODO: consider deleting if no longer used
	            ember_data = self._parse_json(self._search_regex(
	                r'(?s)id="shoebox-ember-data-store"[^>]*>\s*({.+?})\s*<',
	                webpage, 'ember data'), episode_id) or {}
	            # The store is either keyed by episode id or is the record itself.
	            ember_data = ember_data.get(episode_id) or ember_data
	            episode_data = try_get(ember_data, lambda x: x['data'], dict)

	        # NOTE(review): if neither page format produced a dict, episode_data
	        # is presumably None here and the subscript below fails - confirm
	        # whether a dedicated extractor error is preferred.
	        episode = episode_data['attributes']
	        description = episode.get('description') or {}

	        # The last included record of type 'media/podcast' provides the
	        # series name; the page header is the fallback.
	        series = None
	        for inc in (amp_data or ember_data.get('included') or []):
	            if isinstance(inc, dict) and inc.get('type') == 'media/podcast':
	                series = try_get(inc, lambda x: x['attributes']['name'])
	        series = series or clean_html(get_element_by_class('podcast-header__identity', webpage))

	        return {
	            'id': episode_id,
	            'title': episode.get('name'),
	            'url': clean_podcast_url(episode['assetUrl']),
	            'description': description.get('standard') or description.get('short'),
	            'timestamp': parse_iso8601(episode.get('releaseDateTime')),
	            # Source value is in milliseconds; scale down by 1000.
	            'duration': int_or_none(episode.get('durationInMilliseconds'), 1000),
	            'series': series,
	            'thumbnail': self._og_search_thumbnail(webpage),
	            'vcodec': 'none',
	        }