[yt-dlp.git] / yt_dlp / extractor / podbayfm.py

from .common import InfoExtractor
from ..utils import (
    OnDemandPagedList,
    clean_html,
    int_or_none,
    jwt_decode_hs256,
    url_or_none,
)
from ..utils.traversal import traverse_obj


def result_from_props(props):
    """Build an audio info dict from a Podbay episode object.

    Metadata is picked out of *props* via ``traverse_obj`` using the field
    map below; the constant ``ext``/``vcodec`` entries are then added on top
    (and therefore win over any same-named extracted key).
    """
    field_map = {
        'id': ('_id', {str}),
        'title': ('title', {str}),
        'url': ('mediaURL', {url_or_none}),
        'description': ('description', {clean_html}),
        # ``image`` is run through jwt_decode_hs256 and the ``url`` key of
        # the decoded result is used as the thumbnail
        'thumbnail': ('image', {jwt_decode_hs256}, 'url', {url_or_none}),
        'timestamp': ('timestamp', {int_or_none}),
        'duration': ('duration', {int_or_none}),
    }
    extracted = traverse_obj(props, field_map)
    return {**extracted, 'ext': 'mp3', 'vcodec': 'none'}


class PodbayFMIE(InfoExtractor):
    # Extractor for a single episode page: /p/<podcast slug>/e/<numeric id>
    _VALID_URL = r'https?://podbay\.fm/p/[^/?#]+/e/(?P<id>\d+)'
    _TESTS = [{
        'url': 'https://podbay.fm/p/behind-the-bastards/e/1647338400',
        'md5': '895ac8505de349515f5ee8a4a3195c93',
        'info_dict': {
            'id': '62306451f4a48e58d0c4d6a8',
            'title': 'Part One: Kissinger',
            'ext': 'mp3',
            'description': r're:^We begin our epic six part series on Henry Kissinger.+',
            'thumbnail': r're:^https?://.*\.jpg',
            'timestamp': 1647338400,
            'duration': 5001,
            'upload_date': '20220315',
        },
    }]

    def _real_extract(self, url):
        """Download the episode page and convert its Next.js episode props."""
        episode_id = self._match_id(url)
        nextjs_data = self._search_nextjs_data(
            self._download_webpage(url, episode_id), episode_id)
        # The episode object sits under props -> pageProps -> episode
        episode_props = nextjs_data['props']['pageProps']['episode']
        return result_from_props(episode_props)


class PodbayFMChannelIE(InfoExtractor):
    # Playlist extractor for a podcast page: /p/<podcast slug>
    # (the trailing part of the pattern keeps /p/<slug>/e/<id> episode URLs
    # from matching here)
    _VALID_URL = r'https?://podbay\.fm/p/(?P<id>[^/?#]+)/?(?:$|[?#])'
    _TESTS = [{
        'url': 'https://podbay.fm/p/behind-the-bastards',
        'info_dict': {
            'id': 'behind-the-bastards',
            'title': 'Behind the Bastards',
        },
        'playlist_mincount': 21,
    }]
    # Page size handed to OnDemandPagedList
    # NOTE(review): presumably equals the number of episodes the API returns per page - confirm
    _PAGE_SIZE = 10

    def _fetch_page(self, channel_id, pagenum):
        """Download one page of the podcast API and return its ``podcast`` object.

        ``pagenum`` is zero-based; the progress note shows it one-based.
        """
        # _download_json takes (url, video_id, note): the channel slug is the
        # video id and the progress message is the note, not the reverse.
        return self._download_json(
            f'https://podbay.fm/api/podcast?reverse=true&page={pagenum}&slug={channel_id}',
            channel_id, note=f'Downloading channel JSON page {pagenum + 1}')['podcast']

    @staticmethod
    def _results_from_page(channel_id, page):
        """Turn the ``episodes`` list of a podcast page into playlist entries.

        Each entry is a complete episode info dict attributed to PodbayFMIE,
        with ``webpage_url`` pointing at the episode page.
        """
        return [{
            **result_from_props(e),
            'extractor': PodbayFMIE.IE_NAME,
            'extractor_key': PodbayFMIE.ie_key(),
            # somehow they use timestamps as the episode identifier
            'webpage_url': f'https://podbay.fm/p/{channel_id}/e/{e["timestamp"]}',
        } for e in page['episodes']]

    def _real_extract(self, url):
        """Return a lazily paged playlist of all episodes of the podcast."""
        channel_id = self._match_id(url)

        # The first page is fetched eagerly because it supplies the playlist
        # title; it is then reused for page 0 instead of being downloaded again.
        first_page = self._fetch_page(channel_id, 0)
        entries = OnDemandPagedList(
            lambda pagenum: self._results_from_page(
                channel_id, self._fetch_page(channel_id, pagenum) if pagenum else first_page),
            self._PAGE_SIZE)

        return self.playlist_result(entries, channel_id, first_page.get('title'))
Commit	Line	Data
2c98d998	1	from .common import InfoExtractor
d4b52ce3	2	from ..utils import (
	3	OnDemandPagedList,
	4	clean_html,
	5	int_or_none,
	6	jwt_decode_hs256,
	7	url_or_none,
	8	)
	9	from ..utils.traversal import traverse_obj
2c98d998	10
2c98d998	11
d4b52ce3	12	def result_from_props(props):
2c98d998	13	return {
d4b52ce3	14	**traverse_obj(props, {
	15	'id': ('_id', {str}),
	16	'title': ('title', {str}),
	17	'url': ('mediaURL', {url_or_none}),
	18	'description': ('description', {clean_html}),
	19	'thumbnail': ('image', {jwt_decode_hs256}, 'url', {url_or_none}),
	20	'timestamp': ('timestamp', {int_or_none}),
	21	'duration': ('duration', {int_or_none}),
	22	}),
2c98d998	23	'ext': 'mp3',
d4b52ce3	24	'vcodec': 'none',
2c98d998	25	}
	26
	27
	28	class PodbayFMIE(InfoExtractor):
d4b52ce3	29	_VALID_URL = r'https?://podbay\.fm/p/[^/?#]+/e/(?P<id>\d+)'
2c98d998	30	_TESTS = [{
2c98d998	31	'url': 'https://podbay.fm/p/behind-the-bastards/e/1647338400',
d4b52ce3	32	'md5': '895ac8505de349515f5ee8a4a3195c93',
2c98d998	33	'info_dict': {
d4b52ce3	34	'id': '62306451f4a48e58d0c4d6a8',
2c98d998	35	'title': 'Part One: Kissinger',
2c98d998	36	'ext': 'mp3',
d4b52ce3	37	'description': r're:^We begin our epic six part series on Henry Kissinger.+',
2c98d998	38	'thumbnail': r're:^https?://.*\.jpg',
	39	'timestamp': 1647338400,
	40	'duration': 5001,
	41	'upload_date': '20220315',
	42	},
	43	}]
	44
	45	def _real_extract(self, url):
	46	episode_id = self._match_id(url)
	47	webpage = self._download_webpage(url, episode_id)
	48	data = self._search_nextjs_data(webpage, episode_id)
d4b52ce3	49	return result_from_props(data['props']['pageProps']['episode'])
2c98d998	50
	51
	52	class PodbayFMChannelIE(InfoExtractor):
d4b52ce3	53	_VALID_URL = r'https?://podbay\.fm/p/(?P<id>[^/?#]+)/?(?:$\|[?#])'
2c98d998	54	_TESTS = [{
	55	'url': 'https://podbay.fm/p/behind-the-bastards',
	56	'info_dict': {
	57	'id': 'behind-the-bastards',
	58	'title': 'Behind the Bastards',
	59	},
d4b52ce3	60	'playlist_mincount': 21,
2c98d998	61	}]
	62	_PAGE_SIZE = 10
	63
	64	def _fetch_page(self, channel_id, pagenum):
	65	return self._download_json(
	66	f'https://podbay.fm/api/podcast?reverse=true&page={pagenum}&slug={channel_id}',
d4b52ce3	67	f'Downloading channel JSON page {pagenum + 1}', channel_id)['podcast']
2c98d998	68
	69	@staticmethod
	70	def _results_from_page(channel_id, page):
	71	return [{
	72	**result_from_props(e),
	73	'extractor': PodbayFMIE.IE_NAME,
	74	'extractor_key': PodbayFMIE.ie_key(),
	75	# somehow they use timestamps as the episode identifier
	76	'webpage_url': f'https://podbay.fm/p/{channel_id}/e/{e["timestamp"]}',
	77	} for e in page['episodes']]
	78
	79	def _real_extract(self, url):
	80	channel_id = self._match_id(url)
	81
	82	first_page = self._fetch_page(channel_id, 0)
	83	entries = OnDemandPagedList(
	84	lambda pagenum: self._results_from_page(
	85	channel_id, self._fetch_page(channel_id, pagenum) if pagenum else first_page),
	86	self._PAGE_SIZE)
	87
	88	return self.playlist_result(entries, channel_id, first_page.get('title'))