]> jfr.im git - yt-dlp.git/blob - yt_dlp/extractor/callin.py
[extractor] Improve `_generic_title`
[yt-dlp.git] / yt_dlp / extractor / callin.py
1 from .common import InfoExtractor
2 from ..utils import (
3 traverse_obj,
4 float_or_none,
5 int_or_none
6 )
7
8
class CallinIE(InfoExtractor):
    """Extractor for single episode pages on callin.com.

    Metadata is read from the Next.js data blob embedded in the episode page;
    the media itself is an HLS (m3u8) stream referenced from that blob.
    """

    _VALID_URL = r'https?://(?:www\.)?callin\.com/(episode)/(?P<id>[-a-zA-Z]+)'
    _TESTS = [{
        'url': 'https://www.callin.com/episode/the-title-ix-regime-and-the-long-march-through-EBfXYSrsjc',
        'info_dict': {
            'id': '218b979630a35ead12c6fd096f2996c56c37e4d0dc1f6dc0feada32dcf7b31cd',
            'title': 'The Title IX Regime and the Long March Through and Beyond the Institutions',
            'ext': 'ts',
            'display_id': 'the-title-ix-regime-and-the-long-march-through-EBfXYSrsjc',
            'thumbnail': 're:https://.+\\.png',
            'description': 'First episode',
            'uploader': 'Wesley Yang',
            'timestamp': 1639404128.65,
            'upload_date': '20211213',
            'uploader_id': 'wesyang',
            'uploader_url': 'http://wesleyyang.substack.com',
            'channel': 'Conversations in Year Zero',
            'channel_id': '436d1f82ddeb30cd2306ea9156044d8d2cfdc3f1f1552d245117a42173e78553',
            'channel_url': 'https://callin.com/show/conversations-in-year-zero-oJNllRFSfx',
            'duration': 9951.936,
            'view_count': int,
            'categories': ['News & Politics', 'History', 'Technology'],
            'cast': ['Wesley Yang', 'KC Johnson', 'Gabi Abramovich'],
            'series': 'Conversations in Year Zero',
            'series_id': '436d1f82ddeb30cd2306ea9156044d8d2cfdc3f1f1552d245117a42173e78553',
            'episode': 'The Title IX Regime and the Long March Through and Beyond the Institutions',
            'episode_number': 1,
            'episode_id': '218b979630a35ead12c6fd096f2996c56c37e4d0dc1f6dc0feada32dcf7b31cd'
        }
    }]

    def try_get_user_name(self, d):
        """Build a display name from the 'first' and 'last' keys of *d*.

        Returns both parts joined by a single space, only the part that is
        present when the other one is missing or empty, or None when neither
        part is set.
        """
        # The earlier implementation used next(gen, default=None); the
        # built-in next() accepts no keyword arguments, so every user with a
        # missing name part raised TypeError instead of yielding a name.
        parts = [d.get(key) for key in ('first', 'last')]
        return ' '.join(filter(None, parts)) or None

    def _real_extract(self, url):
        """Return the info dict for the episode page at *url*.

        The episode id and the m3u8 URL are mandatory (a missing key raises
        KeyError); every other field is optional and is None when absent.
        """
        display_id = self._match_id(url)
        webpage = self._download_webpage(url, display_id)

        next_data = self._search_nextjs_data(webpage, display_id)
        episode = next_data['props']['pageProps']['episode']

        episode_id = episode['id']
        title = episode.get('title') or self._generic_title('', webpage)
        formats = self._extract_m3u8_formats(episode['m3u8'], display_id, ext='ts')
        self._sort_formats(formats)

        show = traverse_obj(episode, ('show', 'title'))
        show_id = traverse_obj(episode, ('show', 'id'))
        show_url = traverse_obj(episode, ('show', 'linkObj', 'resourceUrl'))

        # The show-level JSON is optional extra metadata (hosts, episode list,
        # show URL). Its address needs the build slug, taken from a script tag
        # in the page or, failing that, from the 'buildId' of the page data.
        show_json = None
        app_slug = (self._html_search_regex(
            '<script\\s+src=["\']/_next/static/([-_a-zA-Z0-9]+)/_',
            webpage, 'app slug', fatal=False) or next_data.get('buildId'))
        if app_slug and show_url and '/' in show_url:
            show_slug = show_url.rsplit('/', 1)[1]
            show_json = self._download_json(
                f'https://www.callin.com/_next/data/{app_slug}/show/{show_slug}.json',
                display_id, fatal=False)

        # Prefer the first host listed for the show; fall back to the first
        # speaker of the episode when the show data is unavailable.
        host = (traverse_obj(show_json, ('pageProps', 'show', 'hosts', 0))
                or traverse_obj(episode, ('speakers', 0)))

        # The uploader id is the last path component of the host's profile URL.
        host_url = traverse_obj(host, ('linkObj', 'resourceUrl'))
        host_nick = host_url.rsplit('/', 1)[1] if (host_url and '/' in host_url) else None

        people = traverse_obj(episode, (('speakers', 'callerTags'), ...)) or []
        cast = [name for name in map(self.try_get_user_name, people) if name]

        # The number is counted from the end of the list, so the last entry
        # is episode 1 (presumably the list is newest-first -- TODO confirm).
        episode_list = traverse_obj(show_json, ('pageProps', 'show', 'episodes')) or []
        episode_number = next(
            (len(episode_list) - index
             for index, entry in enumerate(episode_list)
             if entry.get('id') == episode_id),
            None)

        return {
            'id': episode_id,
            'display_id': display_id,
            'title': title,
            'formats': formats,
            'thumbnail': traverse_obj(episode, ('show', 'photo')),
            'description': episode.get('description'),
            'uploader': self.try_get_user_name(host) if host else None,
            # Coerced like 'duration' so a non-numeric value becomes None
            # instead of leaking into the info dict.
            'timestamp': float_or_none(episode.get('publishedAt')),
            'uploader_id': host_nick,
            'uploader_url': traverse_obj(show_json, ('pageProps', 'show', 'url')),
            'channel': show,
            'channel_id': show_id,
            'channel_url': show_url,
            'duration': float_or_none(episode.get('runtime')),
            'view_count': int_or_none(episode.get('plays')),
            'categories': traverse_obj(episode, ('show', 'categorizations', ..., 'name')),
            'cast': cast or None,
            'series': show,
            'series_id': show_id,
            'episode': title,
            'episode_number': episode_number,
            'episode_id': episode_id,
        }