yt_dlp/extractor/podbayfm.py

   1 from .common import InfoExtractor
   2 from ..utils import OnDemandPagedList, int_or_none, jwt_decode_hs256, try_call
   3
   4
   5 def result_from_props(props, episode_id=None):
   6     return {
   7         'id': props.get('podcast_id') or episode_id,
   8         'title': props.get('title'),
   9         'url': props['mediaURL'],
  10         'ext': 'mp3',
  11         'thumbnail': try_call(lambda: jwt_decode_hs256(props['image'])['url']),
  12         'timestamp': props.get('timestamp'),
  13         'duration': int_or_none(props.get('duration')),
  14     }
  15
  16
  17 class PodbayFMIE(InfoExtractor):
  18     _VALID_URL = r'https?://podbay\.fm/p/[^/]*/e/(?P<id>[^/]*)/?(?:[\?#].*)?$'
  19     _TESTS = [{
  20         'url': 'https://podbay.fm/p/behind-the-bastards/e/1647338400',
  21         'md5': '98b41285dcf7989d105a4ed0404054cf',
  22         'info_dict': {
  23             'id': '1647338400',
  24             'title': 'Part One: Kissinger',
  25             'ext': 'mp3',
  26             'thumbnail': r're:^https?://.*\.jpg',
  27             'timestamp': 1647338400,
  28             'duration': 5001,
  29             'upload_date': '20220315',
  30         },
  31     }]
  32
  33     def _real_extract(self, url):
  34         episode_id = self._match_id(url)
  35         webpage = self._download_webpage(url, episode_id)
  36         data = self._search_nextjs_data(webpage, episode_id)
  37         return result_from_props(data['props']['pageProps']['episode'], episode_id)
  38
  39
  40 class PodbayFMChannelIE(InfoExtractor):
  41     _VALID_URL = r'https?://podbay\.fm/p/(?P<id>[^/]*)/?(?:[\?#].*)?$'
  42     _TESTS = [{
  43         'url': 'https://podbay.fm/p/behind-the-bastards',
  44         'info_dict': {
  45             'id': 'behind-the-bastards',
  46             'title': 'Behind the Bastards',
  47         },
  48     }]
  49     _PAGE_SIZE = 10
  50
  51     def _fetch_page(self, channel_id, pagenum):
  52         return self._download_json(
  53             f'https://podbay.fm/api/podcast?reverse=true&page={pagenum}&slug={channel_id}',
  54             channel_id)['podcast']
  55
  56     @staticmethod
  57     def _results_from_page(channel_id, page):
  58         return [{
  59             **result_from_props(e),
  60             'extractor': PodbayFMIE.IE_NAME,
  61             'extractor_key': PodbayFMIE.ie_key(),
  62             # somehow they use timestamps as the episode identifier
  63             'webpage_url': f'https://podbay.fm/p/{channel_id}/e/{e["timestamp"]}',
  64         } for e in page['episodes']]
  65
  66     def _real_extract(self, url):
  67         channel_id = self._match_id(url)
  68
  69         first_page = self._fetch_page(channel_id, 0)
  70         entries = OnDemandPagedList(
  71             lambda pagenum: self._results_from_page(
  72                 channel_id, self._fetch_page(channel_id, pagenum) if pagenum else first_page),
  73             self._PAGE_SIZE)
  74
  75         return self.playlist_result(entries, channel_id, first_page.get('title'))