yt_dlp/extractor/callin.py

   1 # coding: utf-8
   2 from .common import InfoExtractor
   3 from ..utils import (
   4     traverse_obj,
   5     float_or_none,
   6     int_or_none
   7 )
   8
   9
  10 class CallinIE(InfoExtractor):
  11     _VALID_URL = r'https?://(?:www\.)?callin\.com/(episode)/(?P<id>[-a-zA-Z]+)'
  12     _TESTS = [{
  13         'url': 'https://www.callin.com/episode/the-title-ix-regime-and-the-long-march-through-EBfXYSrsjc',
  14         'info_dict': {
  15             'id': '218b979630a35ead12c6fd096f2996c56c37e4d0dc1f6dc0feada32dcf7b31cd',
  16             'title': 'The Title IX Regime and the Long March Through and Beyond the Institutions',
  17             'ext': 'ts',
  18             'display_id': 'the-title-ix-regime-and-the-long-march-through-EBfXYSrsjc',
  19             'thumbnail': 're:https://.+\\.png',
  20             'description': 'First episode',
  21             'uploader': 'Wesley Yang',
  22             'timestamp': 1639404128.65,
  23             'upload_date': '20211213',
  24             'uploader_id': 'wesyang',
  25             'uploader_url': 'http://wesleyyang.substack.com',
  26             'channel': 'Conversations in Year Zero',
  27             'channel_id': '436d1f82ddeb30cd2306ea9156044d8d2cfdc3f1f1552d245117a42173e78553',
  28             'channel_url': 'https://callin.com/show/conversations-in-year-zero-oJNllRFSfx',
  29             'duration': 9951.936,
  30             'view_count': int,
  31             'categories': ['News & Politics', 'History', 'Technology'],
  32             'cast': ['Wesley Yang', 'KC Johnson', 'Gabi Abramovich'],
  33             'series': 'Conversations in Year Zero',
  34             'series_id': '436d1f82ddeb30cd2306ea9156044d8d2cfdc3f1f1552d245117a42173e78553',
  35             'episode': 'The Title IX Regime and the Long March Through and Beyond the Institutions',
  36             'episode_number': 1,
  37             'episode_id': '218b979630a35ead12c6fd096f2996c56c37e4d0dc1f6dc0feada32dcf7b31cd'
  38         }
  39     }]
  40
  41     def try_get_user_name(self, d):
  42         names = [d.get(n) for n in ('first', 'last')]
  43         if None in names:
  44             return next((n for n in names if n), default=None)
  45         return ' '.join(names)
  46
  47     def _real_extract(self, url):
  48         display_id = self._match_id(url)
  49         webpage = self._download_webpage(url, display_id)
  50
  51         next_data = self._search_nextjs_data(webpage, display_id)
  52         episode = next_data['props']['pageProps']['episode']
  53
  54         id = episode['id']
  55         title = (episode.get('title')
  56                  or self._og_search_title(webpage, fatal=False)
  57                  or self._html_search_regex('<title>(.*?)</title>', webpage, 'title'))
  58         url = episode['m3u8']
  59         formats = self._extract_m3u8_formats(url, display_id, ext='ts')
  60         self._sort_formats(formats)
  61
  62         show = traverse_obj(episode, ('show', 'title'))
  63         show_id = traverse_obj(episode, ('show', 'id'))
  64
  65         show_json = None
  66         app_slug = (self._html_search_regex(
  67             '<script\\s+src=["\']/_next/static/([-_a-zA-Z0-9]+)/_',
  68             webpage, 'app slug', fatal=False) or next_data.get('buildId'))
  69         show_slug = traverse_obj(episode, ('show', 'linkObj', 'resourceUrl'))
  70         if app_slug and show_slug and '/' in show_slug:
  71             show_slug = show_slug.rsplit('/', 1)[1]
  72             show_json_url = f'https://www.callin.com/_next/data/{app_slug}/show/{show_slug}.json'
  73             show_json = self._download_json(show_json_url, display_id, fatal=False)
  74
  75         host = (traverse_obj(show_json, ('pageProps', 'show', 'hosts', 0))
  76                 or traverse_obj(episode, ('speakers', 0)))
  77
  78         host_nick = traverse_obj(host, ('linkObj', 'resourceUrl'))
  79         host_nick = host_nick.rsplit('/', 1)[1] if (host_nick and '/' in host_nick) else None
  80
  81         cast = list(filter(None, [
  82             self.try_get_user_name(u) for u in
  83             traverse_obj(episode, (('speakers', 'callerTags'), ...)) or []
  84         ]))
  85
  86         episode_list = traverse_obj(show_json, ('pageProps', 'show', 'episodes')) or []
  87         episode_number = next(
  88             (len(episode_list) - i for (i, e) in enumerate(episode_list) if e.get('id') == id),
  89             None)
  90
  91         return {
  92             'id': id,
  93             'display_id': display_id,
  94             'title': title,
  95             'formats': formats,
  96             'thumbnail': traverse_obj(episode, ('show', 'photo')),
  97             'description': episode.get('description'),
  98             'uploader': self.try_get_user_name(host) if host else None,
  99             'timestamp': episode.get('publishedAt'),
 100             'uploader_id': host_nick,
 101             'uploader_url': traverse_obj(show_json, ('pageProps', 'show', 'url')),
 102             'channel': show,
 103             'channel_id': show_id,
 104             'channel_url': traverse_obj(episode, ('show', 'linkObj', 'resourceUrl')),
 105             'duration': float_or_none(episode.get('runtime')),
 106             'view_count': int_or_none(episode.get('plays')),
 107             'categories': traverse_obj(episode, ('show', 'categorizations', ..., 'name')),
 108             'cast': cast if cast else None,
 109             'series': show,
 110             'series_id': show_id,
 111             'episode': title,
 112             'episode_number': episode_number,
 113             'episode_id': id
 114         }