[yt-dlp.git] / yt_dlp / extractor / applepodcasts.py

from .common import InfoExtractor
from ..utils import (
    ExtractorError,
    clean_html,
    clean_podcast_url,
    get_element_by_class,
    int_or_none,
    parse_iso8601,
    try_get,
)


class ApplePodcastsIE(InfoExtractor):
    """Extract a single episode from a podcasts.apple.com page.

    The episode is identified by the numeric ``i=`` query parameter of the URL.
    Metadata is read from JSON embedded in the page: first from the
    ``shoebox-media-api-cache-amp-podcasts`` element (page type seen since
    2021-11) and, failing that, from the older ``shoebox-ember-data-store``
    element.
    """
    _VALID_URL = r'https?://podcasts\.apple\.com/(?:[^/]+/)?podcast(?:/[^/]+){1,2}.*?\bi=(?P<id>\d+)'
    _TESTS = [{
        'url': 'https://podcasts.apple.com/us/podcast/207-whitney-webb-returns/id1135137367?i=1000482637777',
        'md5': '41dc31cd650143e530d9423b6b5a344f',
        'info_dict': {
            'id': '1000482637777',
            'ext': 'mp3',
            'title': '207 - Whitney Webb Returns',
            'description': 'md5:75ef4316031df7b41ced4e7b987f79c6',
            'upload_date': '20200705',
            'timestamp': 1593932400,
            'duration': 6454,
            'series': 'The Tim Dillon Show',
            'thumbnail': 're:.+[.](png|jpe?g|webp)',
        }
    }, {
        'url': 'https://podcasts.apple.com/podcast/207-whitney-webb-returns/id1135137367?i=1000482637777',
        'only_matching': True,
    }, {
        'url': 'https://podcasts.apple.com/podcast/207-whitney-webb-returns?i=1000482637777',
        'only_matching': True,
    }, {
        'url': 'https://podcasts.apple.com/podcast/id1135137367?i=1000482637777',
        'only_matching': True,
    }]

    def _extract_amp_items(self, webpage, episode_id):
        """Return the dict entries stored under ``d`` in the AMP cache record for *episode_id*.

        Best effort: an empty list is returned when the AMP element is absent,
        when no cache key mentions the episode, or when any layer of JSON is
        malformed, so that the caller can fall back to the older page format.
        """
        amp_cache = self._parse_json(self._search_regex(
            r'(?s)id="shoebox-media-api-cache-amp-podcasts"[^>]*>\s*({.+?})\s*<',
            webpage, 'AMP data', default='{}'), episode_id, fatal=False)
        if not isinstance(amp_cache, dict):
            return []
        # The cache values are themselves JSON text; use the first one whose
        # key mentions this episode. The explicit default keeps a missing
        # match from surfacing as StopIteration.
        record = next(
            (value for key, value in amp_cache.items() if episode_id in key), None)
        if not isinstance(record, str):
            return []
        payload = self._parse_json(record, episode_id, fatal=False)
        items = try_get(payload, lambda x: x['d'], list) or []
        return [item for item in items if isinstance(item, dict)]

    def _real_extract(self, url):
        """Build the info dict for the episode addressed by *url*.

        Raises ExtractorError when neither embedded data store yields the
        episode attributes, or when the attributes carry no audio URL.
        """
        episode_id = self._match_id(url)
        webpage = self._download_webpage(url, episode_id)

        # new page type 2021-11
        amp_items = self._extract_amp_items(webpage, episode_id)
        episode_data = next((
            item for item in amp_items
            if item.get('type') == 'podcast-episodes' and item.get('id') == episode_id), None)

        ember_data = {}
        if not episode_data:
            # try pre 2021-11 page type: TODO: consider deleting if no longer used
            ember_data = self._parse_json(self._search_regex(
                r'(?s)id="shoebox-ember-data-store"[^>]*>\s*({.+?})\s*<',
                webpage, 'ember data'), episode_id) or {}
            # The store is either keyed by episode id or is the record itself
            ember_data = ember_data.get(episode_id) or ember_data
            episode_data = try_get(ember_data, lambda x: x['data'], dict)

        episode = try_get(episode_data, lambda x: x['attributes'], dict)
        if not episode:
            raise ExtractorError('Unable to extract episode data')
        asset_url = episode.get('assetUrl')
        if not asset_url:
            raise ExtractorError('Unable to extract episode audio URL')
        description = episode.get('description') or {}

        # Look for the show name among the sibling records; the first named
        # 'media/podcast' entry wins, and the page header is the fallback.
        related = amp_items or try_get(ember_data, lambda x: x['included'], list) or []
        series = None
        for inc in related:
            if not isinstance(inc, dict) or inc.get('type') != 'media/podcast':
                continue
            series = try_get(inc, lambda x: x['attributes']['name'])
            if series:
                break
        series = series or clean_html(get_element_by_class('podcast-header__identity', webpage))

        return {
            'id': episode_id,
            'title': episode.get('name'),
            'url': clean_podcast_url(asset_url),
            'description': description.get('standard') or description.get('short'),
            'timestamp': parse_iso8601(episode.get('releaseDateTime')),
            'duration': int_or_none(episode.get('durationInMilliseconds'), 1000),
            'series': series,
            'thumbnail': self._og_search_thumbnail(webpage),
            'vcodec': 'none',
        }
Commit	Line	Data
00dd0cd5	1	from .common import InfoExtractor
00dd0cd5	2	from ..utils import (
50e93e03	3	clean_html,
00dd0cd5	4	clean_podcast_url,
50e93e03	5	get_element_by_class,
00dd0cd5	6	int_or_none,
	7	parse_iso8601,
	8	try_get,
	9	)
	10
	11
	12	class ApplePodcastsIE(InfoExtractor):
	13	_VALID_URL = r'https?://podcasts\.apple\.com/(?:[^/]+/)?podcast(?:/[^/]+){1,2}.*?\bi=(?P<id>\d+)'
	14	_TESTS = [{
	15	'url': 'https://podcasts.apple.com/us/podcast/207-whitney-webb-returns/id1135137367?i=1000482637777',
50e93e03	16	'md5': '41dc31cd650143e530d9423b6b5a344f',
00dd0cd5	17	'info_dict': {
	18	'id': '1000482637777',
	19	'ext': 'mp3',
	20	'title': '207 - Whitney Webb Returns',
50e93e03	21	'description': 'md5:75ef4316031df7b41ced4e7b987f79c6',
00dd0cd5	22	'upload_date': '20200705',
50e93e03	23	'timestamp': 1593932400,
50e93e03	24	'duration': 6454,
00dd0cd5	25	'series': 'The Tim Dillon Show',
50e93e03	26	'thumbnail': 're:.+[.](png|jpe?g|webp)',
00dd0cd5	27	}
	28	}, {
	29	'url': 'https://podcasts.apple.com/podcast/207-whitney-webb-returns/id1135137367?i=1000482637777',
	30	'only_matching': True,
	31	}, {
	32	'url': 'https://podcasts.apple.com/podcast/207-whitney-webb-returns?i=1000482637777',
	33	'only_matching': True,
	34	}, {
	35	'url': 'https://podcasts.apple.com/podcast/id1135137367?i=1000482637777',
	36	'only_matching': True,
	37	}]
	38
	39	def _real_extract(self, url):
	40	episode_id = self._match_id(url)
	41	webpage = self._download_webpage(url, episode_id)
50e93e03	42	episode_data = {}
	43	ember_data = {}
	44	# new page type 2021-11
	45	amp_data = self._parse_json(self._search_regex(
	46	r'(?s)id="shoebox-media-api-cache-amp-podcasts"[^>]*>\s*({.+?})\s*<',
	47	webpage, 'AMP data', default='{}'), episode_id, fatal=False) or {}
	48	amp_data = try_get(amp_data,
	49	lambda a: self._parse_json(
	50	next(a[x] for x in iter(a) if episode_id in x),
	51	episode_id),
	52	dict) or {}
	53	amp_data = amp_data.get('d') or []
	54	episode_data = try_get(
	55	amp_data,
	56	lambda a: next(x for x in a
	57	if x['type'] == 'podcast-episodes' and x['id'] == episode_id),
	58	dict)
	59	if not episode_data:
	60	# try pre 2021-11 page type: TODO: consider deleting if no longer used
	61	ember_data = self._parse_json(self._search_regex(
	62	r'(?s)id="shoebox-ember-data-store"[^>]*>\s*({.+?})\s*<',
	63	webpage, 'ember data'), episode_id) or {}
	64	ember_data = ember_data.get(episode_id) or ember_data
	65	episode_data = try_get(ember_data, lambda x: x['data'], dict)
	66	episode = episode_data['attributes']
00dd0cd5	67	description = episode.get('description') or {}
	68
	69	series = None
50e93e03	70	for inc in (amp_data or ember_data.get('included') or []):
00dd0cd5	71	if inc.get('type') == 'media/podcast':
00dd0cd5	72	series = try_get(inc, lambda x: x['attributes']['name'])
50e93e03	73	series = series or clean_html(get_element_by_class('podcast-header__identity', webpage))
00dd0cd5	74
	75	return {
	76	'id': episode_id,
50e93e03	77	'title': episode.get('name'),
00dd0cd5	78	'url': clean_podcast_url(episode['assetUrl']),
	79	'description': description.get('standard') or description.get('short'),
	80	'timestamp': parse_iso8601(episode.get('releaseDateTime')),
	81	'duration': int_or_none(episode.get('durationInMilliseconds'), 1000),
	82	'series': series,
50e93e03	83	'thumbnail': self._og_search_thumbnail(webpage),
50e93e03	84	'vcodec': 'none',
00dd0cd5	85	}