]> jfr.im git - yt-dlp.git/blame - yt_dlp/extractor/podbayfm.py
[ie/orf:on] Improve extraction (#9677)
[yt-dlp.git] / yt_dlp / extractor / podbayfm.py
CommitLineData
2c98d998 1from .common import InfoExtractor
2from ..utils import OnDemandPagedList, int_or_none, jwt_decode_hs256, try_call
3
4
def result_from_props(props, episode_id=None):
    """Translate a Podbay episode ``props`` mapping into an info dict.

    ``props`` is read with ``.get()`` for the optional fields and with
    ``props['mediaURL']`` for the media URL, so a mapping without
    ``mediaURL`` raises ``KeyError``.

    ``episode_id`` is only used as the ``id`` when ``props`` has no truthy
    ``podcast_id``.
    """
    def _thumbnail_url():
        # The 'image' value is passed through jwt_decode_hs256 and the 'url'
        # entry of the decoded result is used (presumably the image field is
        # a JWT -- confirm against the site's data).
        return jwt_decode_hs256(props['image'])['url']

    media_id = props.get('podcast_id') or episode_id
    title = props.get('title')
    media_url = props['mediaURL']
    # try_call is used so a missing or undecodable image does not abort the
    # extraction of the episode itself.
    thumbnail = try_call(_thumbnail_url)
    timestamp = props.get('timestamp')
    duration = int_or_none(props.get('duration'))

    return {
        'id': media_id,
        'title': title,
        'url': media_url,
        'ext': 'mp3',
        'thumbnail': thumbnail,
        'timestamp': timestamp,
        'duration': duration,
    }
15
16
class PodbayFMIE(InfoExtractor):
    """Extractor for a single podbay.fm episode page (``/p/<slug>/e/<id>``)."""

    # The slug and the episode id must be non-empty and must stop at '?' or
    # '#'. A plain ``[^/]*`` capture is greedy and would swallow a trailing
    # query string or fragment into the id (e.g. '1647338400?foo=bar').
    _VALID_URL = r'https?://podbay\.fm/p/[^/?#]+/e/(?P<id>[^/?#]+)/?(?:[?#].*)?$'
    _TESTS = [{
        'url': 'https://podbay.fm/p/behind-the-bastards/e/1647338400',
        'md5': '98b41285dcf7989d105a4ed0404054cf',
        'info_dict': {
            'id': '1647338400',
            'title': 'Part One: Kissinger',
            'ext': 'mp3',
            'thumbnail': r're:^https?://.*\.jpg',
            'timestamp': 1647338400,
            'duration': 5001,
            'upload_date': '20220315',
        },
    }, {
        # A query string must not become part of the episode id
        'url': 'https://podbay.fm/p/behind-the-bastards/e/1647338400?foo=bar',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Download the episode page and build the result from its Next.js data.

        The episode id matched from the URL is handed to
        ``result_from_props`` as the fallback ``id``.
        """
        episode_id = self._match_id(url)
        webpage = self._download_webpage(url, episode_id)
        data = self._search_nextjs_data(webpage, episode_id)
        return result_from_props(data['props']['pageProps']['episode'], episode_id)
38
39
class PodbayFMChannelIE(InfoExtractor):
    """Extractor for a podbay.fm podcast page (``/p/<slug>``) as a playlist."""

    # The slug must be non-empty and must stop at '?' or '#'. The matched id
    # is interpolated into the API URL below, so a greedy ``[^/]*`` capture
    # that swallowed a query string or fragment would corrupt that request.
    _VALID_URL = r'https?://podbay\.fm/p/(?P<id>[^/?#]+)/?(?:[?#].*)?$'
    _TESTS = [{
        'url': 'https://podbay.fm/p/behind-the-bastards',
        'info_dict': {
            'id': 'behind-the-bastards',
            'title': 'Behind the Bastards',
        },
    }, {
        # A query string must not become part of the channel slug
        'url': 'https://podbay.fm/p/behind-the-bastards?foo=bar',
        'only_matching': True,
    }]
    # Page size handed to OnDemandPagedList (presumably the number of
    # episodes the API returns per page -- confirm against the API).
    _PAGE_SIZE = 10

    def _fetch_page(self, channel_id, pagenum):
        """Return the ``podcast`` object of one page of the podcast API."""
        return self._download_json(
            f'https://podbay.fm/api/podcast?reverse=true&page={pagenum}&slug={channel_id}',
            channel_id)['podcast']

    @staticmethod
    def _results_from_page(channel_id, page):
        """Convert the ``episodes`` of an API page into playlist entries.

        Each entry is attributed to ``PodbayFMIE`` and points at the
        corresponding episode page.
        """
        return [{
            # Fall back to the timestamp as the id, like PodbayFMIE does via
            # the episode URL, so an entry never ends up with ``id=None``.
            **result_from_props(e, str(e['timestamp'])),
            'extractor': PodbayFMIE.IE_NAME,
            'extractor_key': PodbayFMIE.ie_key(),
            # somehow they use timestamps as the episode identifier
            'webpage_url': f'https://podbay.fm/p/{channel_id}/e/{e["timestamp"]}',
        } for e in page['episodes']]

    def _real_extract(self, url):
        """Return a lazily paged playlist of all episodes of the podcast."""
        channel_id = self._match_id(url)

        # The first page is fetched eagerly because it also provides the
        # playlist title; it is reused for page 0 instead of being re-fetched.
        first_page = self._fetch_page(channel_id, 0)
        entries = OnDemandPagedList(
            lambda pagenum: self._results_from_page(
                channel_id, self._fetch_page(channel_id, pagenum) if pagenum else first_page),
            self._PAGE_SIZE)

        return self.playlist_result(entries, channel_id, first_page.get('title'))