[yt-dlp.git] / yt_dlp / extractor / googlepodcasts.py

import json

from .common import InfoExtractor
from ..utils import (
    clean_podcast_url,
    int_or_none,
    try_get,
    urlencode_postdata,
)


class GooglePodcastsBaseIE(InfoExtractor):
    """Shared helpers for the Google Podcasts extractors in this module."""

    # Common URL prefix; subclasses append their own path pattern to it.
    _VALID_URL_BASE = r'https?://podcasts\.google\.com/feed/'

    def _batch_execute(self, func_id, video_id, params):
        """Call the ``batchexecute`` endpoint and decode the nested result.

        Args:
            func_id: Identifier placed first in the request tuple.
            video_id: Forwarded unchanged to ``_download_json``.
            params: JSON-serialisable value; it is serialised on its own
                and embedded as a string inside the outer request.

        Returns:
            The value obtained by ``json.loads`` on element ``[0][2]`` of
            the downloaded response.
        """
        # The request is JSON inside JSON: params are dumped first, then the
        # whole envelope is dumped again into the 'f.req' form field.
        request_envelope = [[[func_id, json.dumps(params), None, '1']]]
        form_body = urlencode_postdata({
            'f.req': json.dumps(request_envelope),
        })

        def isolate_json_array(raw_response):
            # Greedy, dot-matches-newline: keeps everything from the first
            # '[' to the last ']' of the raw response text.
            # NOTE(review): presumably strips a non-JSON prefix - confirm.
            return self._search_regex(r'(?s)(\[.+\])', raw_response, 'data')

        response = self._download_json(
            'https://podcasts.google.com/_/PodcastsUi/data/batchexecute',
            video_id, data=form_body, transform_source=isolate_json_array)
        # The payload of interest is itself a JSON-encoded string.
        return json.loads(response[0][2])

    def _extract_episode(self, episode):
        """Build an info dict from one positional episode record.

        ``episode`` is indexed by position; index 14 is read through
        ``try_get`` while every other field is indexed directly.
        """
        episode_id = episode[4][3]
        title = episode[8]
        media_url = clean_podcast_url(episode[13])
        thumbnail = episode[2]
        description = episode[9]
        creator = try_get(episode, lambda record: record[14])
        timestamp = int_or_none(episode[11])
        duration = int_or_none(episode[12])
        series = episode[1]

        return {
            'id': episode_id,
            'title': title,
            'url': media_url,
            'thumbnail': thumbnail,
            'description': description,
            'creator': creator,
            'timestamp': timestamp,
            'duration': duration,
            'series': series,
        }


class GooglePodcastsIE(GooglePodcastsBaseIE):
    """Extractor for a single Google Podcasts episode URL."""

    IE_NAME = 'google:podcasts'
    _VALID_URL = GooglePodcastsBaseIE._VALID_URL_BASE + r'(?P<feed_url>[^/]+)/episode/(?P<id>[^/?&#]+)'
    _TEST = {
        'url': 'https://podcasts.google.com/feed/aHR0cHM6Ly9mZWVkcy5ucHIub3JnLzM0NDA5ODUzOS9wb2RjYXN0LnhtbA/episode/MzBlNWRlN2UtOWE4Yy00ODcwLTk2M2MtM2JlMmUyNmViOTRh',
        'md5': 'fa56b2ee8bd0703e27e42d4b104c4766',
        'info_dict': {
            'id': '30e5de7e-9a8c-4870-963c-3be2e26eb94a',
            'ext': 'mp3',
            'title': 'WWDTM New Year 2021',
            'description': 'We say goodbye to 2020 with Christine Baranksi, Doug Jones, Jonna Mendez, and Kellee Edwards.',
            'upload_date': '20210102',
            'timestamp': 1609606800,
            'duration': 2901,
            'series': "Wait Wait... Don't Tell Me!",
        },
    }

    def _real_extract(self, url):
        """Fetch one episode via the 'oNjqVe' call and return its info dict."""
        # Both URL components are forwarded exactly as they appear in the URL.
        b64_feed_url, b64_guid = self._match_valid_url(url).group('feed_url', 'id')
        response = self._batch_execute(
            'oNjqVe', b64_guid, [b64_feed_url, b64_guid])
        # The episode record is the second element of the decoded response.
        return self._extract_episode(response[1])


class GooglePodcastsFeedIE(GooglePodcastsBaseIE):
    """Extractor for a Google Podcasts feed URL, returned as a playlist."""

    IE_NAME = 'google:podcasts:feed'
    _VALID_URL = GooglePodcastsBaseIE._VALID_URL_BASE + r'(?P<id>[^/?&#]+)/?(?:[?#&]|$)'
    _TEST = {
        'url': 'https://podcasts.google.com/feed/aHR0cHM6Ly9mZWVkcy5ucHIub3JnLzM0NDA5ODUzOS9wb2RjYXN0LnhtbA',
        'info_dict': {
            'title': "Wait Wait... Don't Tell Me!",
            'description': "NPR's weekly current events quiz. Have a laugh and test your news knowledge while figuring out what's real and what we've made up.",
        },
        'playlist_mincount': 20,
    }

    def _real_extract(self, url):
        """Fetch a feed via the 'ncqJEe' call and return a playlist result.

        Episodes are read from ``data[1][0]`` and feed metadata from
        ``data[3]``; an empty list is substituted when either lookup
        yields a falsy value.
        """
        b64_feed_url = self._match_id(url)
        data = self._batch_execute('ncqJEe', b64_feed_url, [b64_feed_url])

        raw_episodes = try_get(data, lambda payload: payload[1][0]) or []
        entries = [self._extract_episode(item) for item in raw_episodes]

        feed = try_get(data, lambda payload: payload[3]) or []
        # Title sits at index 0 and description at index 2 of the feed record.
        playlist_title = try_get(feed, lambda meta: meta[0])
        playlist_description = try_get(feed, lambda meta: meta[2])
        return self.playlist_result(
            entries,
            playlist_title=playlist_title,
            playlist_description=playlist_description)
Commit	Line	Data
00dd0cd5	1	import json
00dd0cd5	2
	3	from .common import InfoExtractor
	4	from ..utils import (
	5	clean_podcast_url,
	6	int_or_none,
	7	try_get,
	8	urlencode_postdata,
	9	)
	10
	11
	12	class GooglePodcastsBaseIE(InfoExtractor):
	13	_VALID_URL_BASE = r'https?://podcasts\.google\.com/feed/'
	14
	15	def _batch_execute(self, func_id, video_id, params):
	16	return json.loads(self._download_json(
	17	'https://podcasts.google.com/_/PodcastsUi/data/batchexecute',
	18	video_id, data=urlencode_postdata({
	19	'f.req': json.dumps([[[func_id, json.dumps(params), None, '1']]]),
	20	}), transform_source=lambda x: self._search_regex(r'(?s)(\[.+\])', x, 'data'))[0][2])
	21
	22	def _extract_episode(self, episode):
	23	return {
	24	'id': episode[4][3],
	25	'title': episode[8],
	26	'url': clean_podcast_url(episode[13]),
	27	'thumbnail': episode[2],
	28	'description': episode[9],
	29	'creator': try_get(episode, lambda x: x[14]),
	30	'timestamp': int_or_none(episode[11]),
	31	'duration': int_or_none(episode[12]),
	32	'series': episode[1],
	33	}
	34
	35
	36	class GooglePodcastsIE(GooglePodcastsBaseIE):
	37	IE_NAME = 'google:podcasts'
	38	_VALID_URL = GooglePodcastsBaseIE._VALID_URL_BASE + r'(?P<feed_url>[^/]+)/episode/(?P<id>[^/?&#]+)'
	39	_TEST = {
	40	'url': 'https://podcasts.google.com/feed/aHR0cHM6Ly9mZWVkcy5ucHIub3JnLzM0NDA5ODUzOS9wb2RjYXN0LnhtbA/episode/MzBlNWRlN2UtOWE4Yy00ODcwLTk2M2MtM2JlMmUyNmViOTRh',
	41	'md5': 'fa56b2ee8bd0703e27e42d4b104c4766',
	42	'info_dict': {
	43	'id': '30e5de7e-9a8c-4870-963c-3be2e26eb94a',
	44	'ext': 'mp3',
	45	'title': 'WWDTM New Year 2021',
	46	'description': 'We say goodbye to 2020 with Christine Baranksi, Doug Jones, Jonna Mendez, and Kellee Edwards.',
	47	'upload_date': '20210102',
	48	'timestamp': 1609606800,
	49	'duration': 2901,
	50	'series': "Wait Wait... Don't Tell Me!",
	51	}
	52	}
	53
	54	def _real_extract(self, url):
5ad28e7f	55	b64_feed_url, b64_guid = self._match_valid_url(url).groups()
00dd0cd5	56	episode = self._batch_execute(
	57	'oNjqVe', b64_guid, [b64_feed_url, b64_guid])[1]
	58	return self._extract_episode(episode)
	59
	60
	61	class GooglePodcastsFeedIE(GooglePodcastsBaseIE):
	62	IE_NAME = 'google:podcasts:feed'
	63	_VALID_URL = GooglePodcastsBaseIE._VALID_URL_BASE + r'(?P<id>[^/?&#]+)/?(?:[?#&]|$)'
	64	_TEST = {
	65	'url': 'https://podcasts.google.com/feed/aHR0cHM6Ly9mZWVkcy5ucHIub3JnLzM0NDA5ODUzOS9wb2RjYXN0LnhtbA',
	66	'info_dict': {
	67	'title': "Wait Wait... Don't Tell Me!",
	68	'description': "NPR's weekly current events quiz. Have a laugh and test your news knowledge while figuring out what's real and what we've made up.",
	69	},
	70	'playlist_mincount': 20,
	71	}
	72
	73	def _real_extract(self, url):
	74	b64_feed_url = self._match_id(url)
	75	data = self._batch_execute('ncqJEe', b64_feed_url, [b64_feed_url])
	76
	77	entries = []
	78	for episode in (try_get(data, lambda x: x[1][0]) or []):
	79	entries.append(self._extract_episode(episode))
	80
	81	feed = try_get(data, lambda x: x[3]) or []
	82	return self.playlist_result(
	83	entries, playlist_title=try_get(feed, lambda x: x[0]),
	84	playlist_description=try_get(feed, lambda x: x[2]))