]> jfr.im git - yt-dlp.git/blame - yt_dlp/extractor/nebula.py
[extractor/youtube] Ignore incomplete data for comment threads by default (#7475)
[yt-dlp.git] / yt_dlp / extractor / nebula.py
CommitLineData
359df0fc 1import itertools
bdc196a4 2import json
ac668111 3import urllib.error
bdc196a4 4
359df0fc 5from .common import InfoExtractor
3f756c8c 6from ..utils import ExtractorError, make_archive_id, parse_iso8601, remove_start
359df0fc 7
# Scheme + host prefix shared by every Nebula extractor's _VALID_URL: accepts
# http/https, an optional "www." or "beta." subdomain, and any of the three
# domains watchnebula.com, nebula.app and nebula.tv.
_BASE_URL_RE = r'https?://(?:www\.|beta\.)?(?:watchnebula\.com|nebula\.app|nebula\.tv)'
4cca2eb1 9
359df0fc
HH
10
class NebulaBaseIE(InfoExtractor):
    """Shared login, API-call and metadata-building logic for the Nebula extractors."""

    _NETRC_MACHINE = 'watchnebula'

    # Sent as "Authorization: Token <...>"; obtained from the login endpoint.
    _nebula_api_token = None
    # Sent as "Authorization: Bearer <...>"; obtained from the authorization endpoint.
    _nebula_bearer_token = None
    # True while _call_nebula_api is re-running the login flow, so that a
    # rejection during that flow cannot trigger yet another re-login.
    _nebula_reauth_in_progress = False

    def _perform_nebula_auth(self, username, password):
        """
        Log in with e-mail and password and return the API token (the 'key'
        field of the login response).

        Calls raise_login_required() when either credential is missing or when
        the response is empty / carries no 'key'.
        """
        if not username or not password:
            self.raise_login_required(method='password')

        data = json.dumps({'email': username, 'password': password}).encode('utf8')
        response = self._download_json(
            'https://api.watchnebula.com/api/v1/auth/login/',
            data=data, fatal=False, video_id=None,
            headers={
                'content-type': 'application/json',
                # Submitting the 'sessionid' cookie always causes a 403 on auth endpoint
                'cookie': ''
            },
            note='Logging in to Nebula with supplied credentials',
            errnote='Authentication failed or rejected')
        if not response or not response.get('key'):
            self.raise_login_required(method='password')

        return response['key']

    def _call_nebula_api(self, url, video_id=None, method='GET', auth_type='api', note=''):
        """
        Download JSON from `url` with an Authorization header.

        @param method     'GET' or 'POST' (POST sends an empty body)
        @param auth_type  'api' to send the API token, 'bearer' to send the bearer token
        @param note       progress message passed to _download_json

        If the request fails with HTTP 401 or 403, the login flow is re-run
        once with the configured credentials and the request is retried.
        Any other error is re-raised unchanged.
        """
        assert method in ('GET', 'POST')
        assert auth_type in ('api', 'bearer')

        def inner_call():
            # Built on every call so that a retry picks up the refreshed tokens
            authorization = f'Token {self._nebula_api_token}' if auth_type == 'api' else f'Bearer {self._nebula_bearer_token}'
            return self._download_json(
                url, video_id, note=note, headers={'Authorization': authorization},
                data=b'' if method == 'POST' else None)

        try:
            return inner_call()
        except ExtractorError as exc:
            # Only 401/403 are treated as "token missing or expired"
            if not (isinstance(exc.cause, urllib.error.HTTPError) and exc.cause.code in (401, 403)):
                raise
            if self._nebula_reauth_in_progress:
                # Rejected while already re-authenticating (the login flow itself
                # calls this method): logging in again cannot help, so stop here
                # instead of recursing.
                self.raise_login_required(method='password')

            self.to_screen(f'Reauthenticating to Nebula and retrying, because last {auth_type} call resulted in error {exc.cause.code}')
            self._nebula_reauth_in_progress = True
            try:
                # Pass the configured credentials explicitly; calling
                # _perform_login() without arguments would always end in
                # raise_login_required() because both default to None.
                # NOTE(review): relies on the inherited _get_login_info()
                # returning a (username, password) pair - confirm against InfoExtractor.
                self._perform_login(*self._get_login_info())
            finally:
                self._nebula_reauth_in_progress = False
            return inner_call()

    def _fetch_nebula_bearer_token(self):
        """
        Get a Bearer token for the Nebula API. This will be required to fetch video meta data.
        """
        response = self._call_nebula_api('https://api.watchnebula.com/api/v1/authorization/',
                                         method='POST',
                                         note='Authorizing to Nebula')
        return response['token']

    def _fetch_video_formats(self, slug):
        """Return the (formats, subtitles) pair extracted from the HLS manifest of video `slug`."""
        stream_info = self._call_nebula_api(f'https://content.api.nebula.app/video/{slug}/stream/',
                                            video_id=slug,
                                            auth_type='bearer',
                                            note='Fetching video stream info')
        manifest_url = stream_info['manifest']
        return self._extract_m3u8_formats_and_subtitles(manifest_url, slug, 'mp4')

    def _build_video_info(self, episode):
        """
        Turn one episode object from the content API into an info dict.

        Fetches the formats for the episode's slug. The channel slug/title are
        reused for the channel_*, uploader_*, series and creator fields.
        """
        fmts, subs = self._fetch_video_formats(episode['slug'])
        channel_slug = episode['channel_slug']
        channel_title = episode['channel_title']
        zype_id = episode.get('zype_id')
        return {
            'id': remove_start(episode['id'], 'video_episode:'),
            'display_id': episode['slug'],
            'formats': fmts,
            'subtitles': subs,
            'webpage_url': f'https://nebula.tv/{episode["slug"]}',
            'title': episode['title'],
            'description': episode['description'],
            'timestamp': parse_iso8601(episode['published_at']),
            'thumbnails': [{
                # 'id': tn.get('name'),  # this appears to be null
                'url': tn['original'],
                'height': key,
            } for key, tn in episode['assets']['thumbnail'].items()],
            'duration': episode['duration'],
            'channel': channel_title,
            'channel_id': channel_slug,
            'channel_url': f'https://nebula.tv/{channel_slug}',
            'uploader': channel_title,
            'uploader_id': channel_slug,
            'uploader_url': f'https://nebula.tv/{channel_slug}',
            'series': channel_title,
            'creator': channel_title,
            # Always attribute entries to NebulaIE, whichever subclass built them
            'extractor_key': NebulaIE.ie_key(),
            'extractor': NebulaIE.IE_NAME,
            # Only set when the episode still carries a 'zype_id'
            '_old_archive_ids': [make_archive_id(NebulaIE, zype_id)] if zype_id else None,
        }

    def _perform_login(self, username=None, password=None):
        """Obtain and store the API token, then the bearer token derived from it."""
        self._nebula_api_token = self._perform_nebula_auth(username, password)
        self._nebula_bearer_token = self._fetch_nebula_bearer_token()
359df0fc 111
359df0fc
HH
112
class NebulaIE(NebulaBaseIE):
    """Extractor for a single Nebula video page (`/videos/<slug>`)."""

    _VALID_URL = rf'{_BASE_URL_RE}/videos/(?P<id>[-\w]+)'
    _TESTS = [
        {
            'url': 'https://nebula.tv/videos/that-time-disney-remade-beauty-and-the-beast',
            'md5': '14944cfee8c7beeea106320c47560efc',
            'info_dict': {
                'id': '84ed544d-4afd-4723-8cd5-2b95261f0abf',
                'ext': 'mp4',
                'title': 'That Time Disney Remade Beauty and the Beast',
                'description': 'Note: this video was originally posted on YouTube with the sponsor read included. We weren’t able to remove it without reducing video quality, so it’s presented here in its original context.',
                'upload_date': '20180731',
                'timestamp': 1533009600,
                'channel': 'Lindsay Ellis',
                'channel_id': 'lindsayellis',
                'uploader': 'Lindsay Ellis',
                'uploader_id': 'lindsayellis',
                'uploader_url': 'https://nebula.tv/lindsayellis',
                'series': 'Lindsay Ellis',
                'display_id': 'that-time-disney-remade-beauty-and-the-beast',
                'channel_url': 'https://nebula.tv/lindsayellis',
                'creator': 'Lindsay Ellis',
                'duration': 2212,
                'thumbnail': r're:https://\w+\.cloudfront\.net/[\w-]+\.jpeg?.*',
            },
        },
        {
            'url': 'https://nebula.tv/videos/the-logistics-of-d-day-landing-craft-how-the-allies-got-ashore',
            'md5': 'd05739cf6c38c09322422f696b569c23',
            'info_dict': {
                'id': '7e623145-1b44-4ca3-aa0b-ed25a247ea34',
                'ext': 'mp4',
                'title': 'Landing Craft - How The Allies Got Ashore',
                'description': r're:^In this episode we explore the unsung heroes of D-Day, the landing craft.',
                'upload_date': '20200327',
                'timestamp': 1585348140,
                'channel': 'Real Engineering — The Logistics of D-Day',
                'channel_id': 'd-day',
                'uploader': 'Real Engineering — The Logistics of D-Day',
                'uploader_id': 'd-day',
                'series': 'Real Engineering — The Logistics of D-Day',
                'display_id': 'the-logistics-of-d-day-landing-craft-how-the-allies-got-ashore',
                'creator': 'Real Engineering — The Logistics of D-Day',
                'duration': 841,
                'channel_url': 'https://nebula.tv/d-day',
                'uploader_url': 'https://nebula.tv/d-day',
                'thumbnail': r're:https://\w+\.cloudfront\.net/[\w-]+\.jpeg?.*',
            },
        },
        {
            'url': 'https://nebula.tv/videos/money-episode-1-the-draw',
            'md5': 'ebe28a7ad822b9ee172387d860487868',
            'info_dict': {
                'id': 'b96c5714-9e2b-4ec3-b3f1-20f6e89cc553',
                'ext': 'mp4',
                'title': 'Episode 1: The Draw',
                'description': r'contains:There’s free money on offer… if the players can all work together.',
                'upload_date': '20200323',
                'timestamp': 1584980400,
                'channel': 'Tom Scott Presents: Money',
                'channel_id': 'tom-scott-presents-money',
                'uploader': 'Tom Scott Presents: Money',
                'uploader_id': 'tom-scott-presents-money',
                'uploader_url': 'https://nebula.tv/tom-scott-presents-money',
                'duration': 825,
                'channel_url': 'https://nebula.tv/tom-scott-presents-money',
                'series': 'Tom Scott Presents: Money',
                'display_id': 'money-episode-1-the-draw',
                'thumbnail': r're:https://\w+\.cloudfront\.net/[\w-]+\.jpeg?.*',
                'creator': 'Tom Scott Presents: Money',
            },
        },
        {
            'url': 'https://watchnebula.com/videos/money-episode-1-the-draw',
            'only_matching': True,
        },
        {
            'url': 'https://beta.nebula.tv/videos/money-episode-1-the-draw',
            'only_matching': True,
        },
    ]

    def _fetch_video_metadata(self, slug):
        """Return the content-API metadata object for the video identified by `slug`."""
        return self._call_nebula_api(f'https://content.api.nebula.app/video/{slug}/',
                                     video_id=slug,
                                     auth_type='bearer',
                                     note='Fetching video meta data')

    def _real_extract(self, url):
        """Resolve the slug from `url`, fetch its metadata and build the info dict."""
        slug = self._match_id(url)
        video = self._fetch_video_metadata(slug)
        return self._build_video_info(video)
bdc196a4 206
bdc196a4 207
f3b3fe16
HH
class NebulaSubscriptionsIE(NebulaBaseIE):
    """Playlist extractor for the logged-in user's library page (`/myshows`)."""

    IE_NAME = 'nebula:subscriptions'
    _VALID_URL = rf'{_BASE_URL_RE}/myshows'
    _TESTS = [{
        'url': 'https://nebula.tv/myshows',
        'playlist_mincount': 1,
        'info_dict': {
            'id': 'myshows',
        },
    }]

    def _generate_playlist_entries(self):
        """Lazily yield an info dict for every episode, following each page's 'next' link."""
        page_url = 'https://content.watchnebula.com/library/video/?page_size=100'
        for page_index in itertools.count(1):
            page = self._call_nebula_api(
                page_url, 'myshows', auth_type='bearer',
                note=f'Retrieving subscriptions page {page_index}')
            yield from map(self._build_video_info, page['results'])
            # A falsy 'next' marks the last page
            page_url = page['next']
            if not page_url:
                return

    def _real_extract(self, url):
        """Return the library as a playlist with the fixed id 'myshows'."""
        entries = self._generate_playlist_entries()
        return self.playlist_result(entries, 'myshows')
234
235
class NebulaChannelIE(NebulaBaseIE):
    """Playlist extractor for a Nebula channel page (`/<channel-slug>`)."""

    IE_NAME = 'nebula:channel'
    _VALID_URL = rf'{_BASE_URL_RE}/(?!myshows|videos/)(?P<id>[-\w]+)'
    _TESTS = [{
        'url': 'https://nebula.tv/tom-scott-presents-money',
        'info_dict': {
            'id': 'tom-scott-presents-money',
            'title': 'Tom Scott Presents: Money',
            'description': 'Tom Scott hosts a series all about trust, negotiation and money.',
        },
        'playlist_count': 5,
    }, {
        'url': 'https://nebula.tv/lindsayellis',
        'info_dict': {
            'id': 'lindsayellis',
            'title': 'Lindsay Ellis',
            'description': 'Enjoy these hottest of takes on Disney, Transformers, and Musicals.',
        },
        'playlist_mincount': 2,
    }]

    def _generate_playlist_entries(self, collection_id, channel):
        """
        Lazily yield an info dict for every episode of the channel.

        `channel` is the already-downloaded first page; further pages are
        fetched through the 'next' link of its 'episodes' object.
        """
        # The first page is supplied by the caller, so fetched pages are numbered from 2
        page_index = 2
        while True:
            listing = channel['episodes']
            yield from (self._build_video_info(item) for item in listing['results'])
            following_url = listing['next']
            if not following_url:
                return
            channel = self._call_nebula_api(
                following_url, collection_id, auth_type='bearer',
                note=f'Retrieving channel page {page_index}')
            page_index += 1

    def _real_extract(self, url):
        """Fetch the channel's first page and return it as a playlist with title and description."""
        collection_id = self._match_id(url)
        channel = self._call_nebula_api(
            f'https://content.watchnebula.com/video/channels/{collection_id}/',
            collection_id, auth_type='bearer', note='Retrieving channel')
        details = channel['details']

        return self.playlist_result(
            entries=self._generate_playlist_entries(collection_id, channel),
            playlist_id=collection_id,
            playlist_title=details['title'],
            playlist_description=details['description'],
        )