[yt-dlp.git] / yt_dlp / extractor / podbayfm.py

from .common import InfoExtractor
from ..utils import OnDemandPagedList, int_or_none, jwt_decode_hs256, try_call


def result_from_props(props, episode_id=None):
    """Build an info dict for one episode from a Podbay ``props`` mapping.

    ``props`` is the episode object found either in the episode page's
    Next.js data or in an entry of the channel API's ``episodes`` list.

    The ``id`` is chosen in this order: ``props['podcast_id']``, then the
    caller-supplied ``episode_id``, then the stringified ``timestamp``.

    Raises ``KeyError`` if ``props`` has no ``mediaURL``; every other field
    is optional.
    """
    timestamp = props.get('timestamp')
    # Channel listings call this without an episode_id. The site addresses
    # episodes by their timestamp, so use it as a last-resort identifier
    # rather than emitting an entry whose id is None.
    fallback_id = str(timestamp) if timestamp is not None else None
    return {
        'id': props.get('podcast_id') or episode_id or fallback_id,
        'title': props.get('title'),
        'url': props['mediaURL'],
        'ext': 'mp3',
        # 'image' is decoded as a JWT whose payload carries the real URL;
        # try_call keeps this best-effort when the key is missing or the
        # token cannot be decoded.
        'thumbnail': try_call(lambda: jwt_decode_hs256(props['image'])['url']),
        'timestamp': timestamp,
        'duration': int_or_none(props.get('duration')),
    }


class PodbayFMIE(InfoExtractor):
    """Extractor for a single Podbay episode page (``/p/<podcast>/e/<id>``)."""

    # Both path segments exclude '?' and '#': a greedy ``[^/]*`` would swallow
    # a trailing query string or fragment into the captured id. ``+`` also
    # rejects URLs with an empty podcast slug or episode id.
    _VALID_URL = r'https?://podbay\.fm/p/[^/?#]+/e/(?P<id>[^/?#]+)/?(?:[?#].*)?$'
    _TESTS = [{
        'url': 'https://podbay.fm/p/behind-the-bastards/e/1647338400',
        'md5': '98b41285dcf7989d105a4ed0404054cf',
        'info_dict': {
            'id': '1647338400',
            'title': 'Part One: Kissinger',
            'ext': 'mp3',
            'thumbnail': r're:^https?://.*\.jpg',
            'timestamp': 1647338400,
            'duration': 5001,
            'upload_date': '20220315',
        },
    }]

    def _real_extract(self, url):
        """Download the episode page and convert its Next.js data to an info dict."""
        episode_id = self._match_id(url)
        webpage = self._download_webpage(url, episode_id)
        # The episode metadata is embedded in the page's Next.js data blob.
        data = self._search_nextjs_data(webpage, episode_id)
        return result_from_props(data['props']['pageProps']['episode'], episode_id)


class PodbayFMChannelIE(InfoExtractor):
    """Extractor for a Podbay podcast page (``/p/<podcast>``), as a playlist."""

    # The slug excludes '?' and '#': a greedy ``[^/]*`` would capture a
    # trailing query string or fragment as part of the id, which is then
    # interpolated into the API URL below. ``+`` also rejects an empty slug.
    _VALID_URL = r'https?://podbay\.fm/p/(?P<id>[^/?#]+)/?(?:[?#].*)?$'
    _TESTS = [{
        'url': 'https://podbay.fm/p/behind-the-bastards',
        'info_dict': {
            'id': 'behind-the-bastards',
            'title': 'Behind the Bastards',
        },
    }]
    # Page size handed to OnDemandPagedList; presumably matches the number of
    # episodes the API returns per page -- TODO confirm.
    _PAGE_SIZE = 10

    def _fetch_page(self, channel_id, pagenum):
        """Return the ``podcast`` object of the API response for page ``pagenum``."""
        return self._download_json(
            f'https://podbay.fm/api/podcast?reverse=true&page={pagenum}&slug={channel_id}',
            channel_id)['podcast']

    @staticmethod
    def _results_from_page(channel_id, page):
        """Convert the ``episodes`` of one API page into a list of info dicts."""
        return [{
            **result_from_props(e),
            # Attribute the entries to the single-episode extractor.
            'extractor': PodbayFMIE.IE_NAME,
            'extractor_key': PodbayFMIE.ie_key(),
            # somehow they use timestamps as the episode identifier
            'webpage_url': f'https://podbay.fm/p/{channel_id}/e/{e["timestamp"]}',
        } for e in page['episodes']]

    def _real_extract(self, url):
        """Return a lazily paged playlist of every episode of the podcast."""
        channel_id = self._match_id(url)

        # Page 0 is fetched eagerly because it also supplies the playlist
        # title; the paged list reuses it instead of downloading it again.
        first_page = self._fetch_page(channel_id, 0)
        entries = OnDemandPagedList(
            lambda pagenum: self._results_from_page(
                channel_id, self._fetch_page(channel_id, pagenum) if pagenum else first_page),
            self._PAGE_SIZE)

        return self.playlist_result(entries, channel_id, first_page.get('title'))
Commit	Line	Data
2c98d998	1	from .common import InfoExtractor
	2	from ..utils import OnDemandPagedList, int_or_none, jwt_decode_hs256, try_call
	3
	4
	5	def result_from_props(props, episode_id=None):
	6	return {
	7	'id': props.get('podcast_id') or episode_id,
	8	'title': props.get('title'),
	9	'url': props['mediaURL'],
	10	'ext': 'mp3',
	11	'thumbnail': try_call(lambda: jwt_decode_hs256(props['image'])['url']),
	12	'timestamp': props.get('timestamp'),
	13	'duration': int_or_none(props.get('duration')),
	14	}
	15
	16
	17	class PodbayFMIE(InfoExtractor):
	18	_VALID_URL = r'https?://podbay\.fm/p/[^/]/e/(?P<id>[^/])/?(?:[\?#].*)?$'
	19	_TESTS = [{
	20	'url': 'https://podbay.fm/p/behind-the-bastards/e/1647338400',
	21	'md5': '98b41285dcf7989d105a4ed0404054cf',
	22	'info_dict': {
	23	'id': '1647338400',
	24	'title': 'Part One: Kissinger',
	25	'ext': 'mp3',
	26	'thumbnail': r're:^https?://.*\.jpg',
	27	'timestamp': 1647338400,
	28	'duration': 5001,
	29	'upload_date': '20220315',
	30	},
	31	}]
	32
	33	def _real_extract(self, url):
	34	episode_id = self._match_id(url)
	35	webpage = self._download_webpage(url, episode_id)
	36	data = self._search_nextjs_data(webpage, episode_id)
	37	return result_from_props(data['props']['pageProps']['episode'], episode_id)
	38
	39
	40	class PodbayFMChannelIE(InfoExtractor):
	41	_VALID_URL = r'https?://podbay\.fm/p/(?P<id>[^/])/?(?:[\?#].)?$'
	42	_TESTS = [{
	43	'url': 'https://podbay.fm/p/behind-the-bastards',
	44	'info_dict': {
	45	'id': 'behind-the-bastards',
	46	'title': 'Behind the Bastards',
	47	},
	48	}]
	49	_PAGE_SIZE = 10
	50
	51	def _fetch_page(self, channel_id, pagenum):
	52	return self._download_json(
	53	f'https://podbay.fm/api/podcast?reverse=true&page={pagenum}&slug={channel_id}',
	54	channel_id)['podcast']
	55
	56	@staticmethod
	57	def _results_from_page(channel_id, page):
	58	return [{
	59	**result_from_props(e),
	60	'extractor': PodbayFMIE.IE_NAME,
	61	'extractor_key': PodbayFMIE.ie_key(),
	62	# somehow they use timestamps as the episode identifier
	63	'webpage_url': f'https://podbay.fm/p/{channel_id}/e/{e["timestamp"]}',
	64	} for e in page['episodes']]
65
66	def _real_extract(self, url):
67	channel_id = self._match_id(url)
68
69	first_page = self._fetch_page(channel_id, 0)
70	entries = OnDemandPagedList(
71	lambda pagenum: self._results_from_page(
72	channel_id, self._fetch_page(channel_id, pagenum) if pagenum else first_page),
73	self._PAGE_SIZE)
74
75	return self.playlist_result(entries, channel_id, first_page.get('title'))