]> jfr.im git - yt-dlp.git/blame - yt_dlp/extractor/nebula.py
[ie/matchtv] Fix extractor (#10190)
[yt-dlp.git] / yt_dlp / extractor / nebula.py
CommitLineData
359df0fc 1import itertools
bdc196a4 2import json
bdc196a4 3
0de09c5b 4from .art19 import Art19IE
359df0fc 5from .common import InfoExtractor
3d2623a8 6from ..networking.exceptions import HTTPError
45d82be6 7from ..utils import (
8 ExtractorError,
9 int_or_none,
10 make_archive_id,
11 parse_iso8601,
12 smuggle_url,
13 try_call,
14 unsmuggle_url,
15 update_url_query,
16 url_or_none,
17 urljoin,
18)
19from ..utils.traversal import traverse_obj
359df0fc 20
# Scheme + host prefix shared by every Nebula extractor's _VALID_URL:
# accepts http/https, an optional `www.` or `beta.` subdomain, and any of
# the watchnebula.com / nebula.app / nebula.tv domains.
_BASE_URL_RE = r'https?://(?:www\.|beta\.)?(?:watchnebula\.com|nebula\.app|nebula\.tv)'
4cca2eb1 22
359df0fc
HH
23
class NebulaBaseIE(InfoExtractor):
    """Shared login, authorization, API and metadata helpers for the Nebula extractors."""

    _NETRC_MACHINE = 'watchnebula'
    # _api_token: key returned by the login endpoint (or read from the site cookie)
    # _token: value of the 'token' field returned by the authorization endpoint
    _token = _api_token = None

    def _perform_login(self, username, password):
        """Log in with e-mail/password and remember the returned API key.

        Raises ExtractorError if the server answers HTTP 400 (reported as
        invalid credentials) or if the response carries no string 'key'.
        """
        payload = json.dumps({'email': username, 'password': password}).encode()
        try:
            response = self._download_json(
                'https://nebula.tv/auth/login/', None,
                'Logging in to Nebula', 'Login failed',
                data=payload, headers={'content-type': 'application/json'})
        except ExtractorError as e:
            rejected = isinstance(e.cause, HTTPError) and e.cause.status == 400
            if not rejected:
                raise
            raise ExtractorError('Login failed: Invalid username or password', expected=True)

        self._api_token = traverse_obj(response, ('key', {str}))
        if not self._api_token:
            raise ExtractorError('Login failed: No token')

    def _call_api(self, *args, **kwargs):
        """Call _download_json with the bearer token attached, if one is set.

        On an HTTP 401 or 403 the authorization step is repeated and the
        request is retried exactly once; any other error propagates.
        """
        def attempt():
            if self._token:
                kwargs.setdefault('headers', {})['Authorization'] = f'Bearer {self._token}'
            return self._download_json(*args, **kwargs)

        try:
            return attempt()
        except ExtractorError as e:
            if not (isinstance(e.cause, HTTPError) and e.cause.status in (401, 403)):
                raise
            self.to_screen(
                f'Reauthorizing with Nebula and retrying, because last API call resulted in error {e.cause.status}')
            self._real_initialize()
            return attempt()

    def _real_initialize(self):
        """Obtain `_token` from the authorization endpoint.

        If no API key is stored yet, the 'nebula_auth.apiToken' cookie for
        nebula.tv is tried first; without a key the request is sent with
        no Authorization header.
        """
        if not self._api_token:
            self._api_token = try_call(
                lambda: self._get_cookies('https://nebula.tv')['nebula_auth.apiToken'].value)

        auth_headers = None
        if self._api_token:
            auth_headers = {'Authorization': f'Token {self._api_token}'}

        authorization = self._download_json(
            'https://users.api.nebula.app/api/v1/authorization/', None,
            headers=auth_headers, note='Authorizing to Nebula', data=b'')
        self._token = authorization['token']

    def _extract_formats(self, content_id, slug):
        """Return {'formats': ..., 'subtitles': ...} from the content's HLS manifest.

        HTTP 401 is reported as login-required. A first HTTP 403 triggers a
        reauthorization and one retry; everything else is re-raised.
        """
        # The path segment is the part of the content id before the first ':', pluralized
        content_kind = content_id.split(':')[0]
        for already_retried in (False, True):
            try:
                formats, subtitles = self._extract_m3u8_formats_and_subtitles(
                    f'https://content.api.nebula.app/{content_kind}s/{content_id}/manifest.m3u8',
                    slug, 'mp4', query={
                        'token': self._token,
                        'app_version': '23.10.0',
                        'platform': 'ios',
                    })
            except ExtractorError as e:
                status = e.cause.status if isinstance(e.cause, HTTPError) else None
                if status == 401:
                    self.raise_login_required()
                if status == 403 and not already_retried:
                    self.to_screen('Reauthorizing with Nebula and retrying, because fetching video resulted in error')
                    self._real_initialize()
                    continue
                raise
            else:
                return {'formats': formats, 'subtitles': subtitles}

    def _extract_video_metadata(self, episode):
        """Translate an API episode/lesson object into info-dict fields.

        `episode['id']` is required; the returned 'id' is the part after
        the first ':'. All other fields are optional lookups.
        """
        def to_channel_url(channel_slug):
            return urljoin('https://nebula.tv/', channel_slug)

        def to_old_archive_ids(zype_id):
            # Old code was wrongly setting extractor_key from NebulaSubscriptionsIE
            if not zype_id:
                return None
            return [make_archive_id(NebulaIE, zype_id), make_archive_id(NebulaSubscriptionsIE, zype_id)]

        # First of 'channel_slug' / 'class_slug' that yields a value wins
        channel_url = traverse_obj(
            episode, (('channel_slug', 'class_slug'), {to_channel_url}), get_all=False)
        video_id = episode['id'].partition(':')[2]
        fields = traverse_obj(episode, {
            'display_id': 'slug',
            'title': 'title',
            'description': 'description',
            'timestamp': ('published_at', {parse_iso8601}),
            'duration': ('duration', {int_or_none}),
            'channel_id': 'channel_slug',
            'uploader_id': 'channel_slug',
            'channel': 'channel_title',
            'uploader': 'channel_title',
            'series': 'channel_title',
            'creator': 'channel_title',
            'thumbnail': ('images', 'thumbnail', 'src', {url_or_none}),
            'episode_number': ('order', {int_or_none}),
            '_old_archive_ids': ('zype_id', {to_old_archive_ids}),
        })
        return {
            'id': video_id,
            **fields,
            'channel_url': channel_url,
            'uploader_url': channel_url,
        }
113
359df0fc
HH
114
class NebulaIE(NebulaBaseIE):
    """Extractor for single Nebula videos (`/videos/<slug>` URLs)."""

    IE_NAME = 'nebula:video'
    _VALID_URL = rf'{_BASE_URL_RE}/videos/(?P<id>[\w-]+)'
    _TESTS = [{
        'url': 'https://nebula.tv/videos/that-time-disney-remade-beauty-and-the-beast',
        'info_dict': {
            'id': '84ed544d-4afd-4723-8cd5-2b95261f0abf',
            'ext': 'mp4',
            'title': 'That Time Disney Remade Beauty and the Beast',
            'description': 'md5:2aae3c4cfc5ee09a1ecdff0909618cf4',
            'upload_date': '20180731',
            'timestamp': 1533009600,
            'channel': 'Lindsay Ellis',
            'channel_id': 'lindsayellis',
            'uploader': 'Lindsay Ellis',
            'uploader_id': 'lindsayellis',
            'uploader_url': r're:https://nebula\.(tv|app)/lindsayellis',
            'series': 'Lindsay Ellis',
            'display_id': 'that-time-disney-remade-beauty-and-the-beast',
            'channel_url': r're:https://nebula\.(tv|app)/lindsayellis',
            'creator': 'Lindsay Ellis',
            'duration': 2212,
            'thumbnail': r're:https://\w+\.cloudfront\.net/[\w-]+',
            '_old_archive_ids': ['nebula 5c271b40b13fd613090034fd', 'nebulasubscriptions 5c271b40b13fd613090034fd'],
        },
        'params': {'skip_download': 'm3u8'},
    }, {
        'url': 'https://nebula.tv/videos/the-logistics-of-d-day-landing-craft-how-the-allies-got-ashore',
        'md5': 'd05739cf6c38c09322422f696b569c23',
        'info_dict': {
            'id': '7e623145-1b44-4ca3-aa0b-ed25a247ea34',
            'ext': 'mp4',
            'title': 'Landing Craft - How The Allies Got Ashore',
            'description': r're:^In this episode we explore the unsung heroes of D-Day, the landing craft.',
            'upload_date': '20200327',
            'timestamp': 1585348140,
            'channel': 'Real Engineering — The Logistics of D-Day',
            'channel_id': 'd-day',
            'uploader': 'Real Engineering — The Logistics of D-Day',
            'uploader_id': 'd-day',
            'series': 'Real Engineering — The Logistics of D-Day',
            'display_id': 'the-logistics-of-d-day-landing-craft-how-the-allies-got-ashore',
            'creator': 'Real Engineering — The Logistics of D-Day',
            'duration': 841,
            'channel_url': 'https://nebula.tv/d-day',
            'uploader_url': 'https://nebula.tv/d-day',
            'thumbnail': r're:https://\w+\.cloudfront\.net/[\w-]+',
            '_old_archive_ids': ['nebula 5e7e78171aaf320001fbd6be', 'nebulasubscriptions 5e7e78171aaf320001fbd6be'],
        },
        'params': {'skip_download': 'm3u8'},
    }, {
        'url': 'https://nebula.tv/videos/money-episode-1-the-draw',
        'md5': 'ebe28a7ad822b9ee172387d860487868',
        'info_dict': {
            'id': 'b96c5714-9e2b-4ec3-b3f1-20f6e89cc553',
            'ext': 'mp4',
            'title': 'Episode 1: The Draw',
            'description': r'contains:There’s free money on offer… if the players can all work together.',
            'upload_date': '20200323',
            'timestamp': 1584980400,
            'channel': 'Tom Scott Presents: Money',
            'channel_id': 'tom-scott-presents-money',
            'uploader': 'Tom Scott Presents: Money',
            'uploader_id': 'tom-scott-presents-money',
            'uploader_url': 'https://nebula.tv/tom-scott-presents-money',
            'duration': 825,
            'channel_url': 'https://nebula.tv/tom-scott-presents-money',
            'series': 'Tom Scott Presents: Money',
            'display_id': 'money-episode-1-the-draw',
            'thumbnail': r're:https://\w+\.cloudfront\.net/[\w-]+',
            'creator': 'Tom Scott Presents: Money',
            '_old_archive_ids': ['nebula 5e779ebdd157bc0001d1c75a', 'nebulasubscriptions 5e779ebdd157bc0001d1c75a'],
        },
        'params': {'skip_download': 'm3u8'},
    }, {
        'url': 'https://watchnebula.com/videos/money-episode-1-the-draw',
        'only_matching': True,
    }, {
        'url': 'https://nebula.tv/videos/tldrnewseu-did-the-us-really-blow-up-the-nordstream-pipelines',
        'info_dict': {
            'id': 'e389af9d-1dab-44f2-8788-ee24deb7ff0d',
            'ext': 'mp4',
            'display_id': 'tldrnewseu-did-the-us-really-blow-up-the-nordstream-pipelines',
            'title': 'Did the US Really Blow Up the NordStream Pipelines?',
            'description': 'md5:b4e2a14e3ff08f546a3209c75261e789',
            'upload_date': '20230223',
            'timestamp': 1677144070,
            'channel': 'TLDR News EU',
            'channel_id': 'tldrnewseu',
            'uploader': 'TLDR News EU',
            'uploader_id': 'tldrnewseu',
            'uploader_url': r're:https://nebula\.(tv|app)/tldrnewseu',
            'duration': 524,
            'channel_url': r're:https://nebula\.(tv|app)/tldrnewseu',
            'series': 'TLDR News EU',
            'thumbnail': r're:https://\w+\.cloudfront\.net/[\w-]+',
            'creator': 'TLDR News EU',
            '_old_archive_ids': ['nebula 63f64c74366fcd00017c1513', 'nebulasubscriptions 63f64c74366fcd00017c1513'],
        },
        'params': {'skip_download': 'm3u8'},
    }, {
        'url': 'https://beta.nebula.tv/videos/money-episode-1-the-draw',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Extract one video.

        When the URL carries a smuggled content id, only the formats are
        fetched and a minimal info dict (empty title) is returned;
        otherwise the video metadata is requested from the content API first.
        """
        # The slug is matched on the URL as received, before unsmuggling
        slug = self._match_id(url)
        url, smuggled_data = unsmuggle_url(url, {})

        content_id = smuggled_data.get('id')
        if content_id:
            info = {
                'id': content_id,
                'display_id': slug,
                'title': '',
            }
            info.update(self._extract_formats(content_id, slug))
            return info

        metadata = self._call_api(
            f'https://content.api.nebula.app/content/videos/{slug}',
            slug, note='Fetching video metadata')
        info = self._extract_video_metadata(metadata)
        info.update(self._extract_formats(metadata['id'], slug))
        return info
238
239
class NebulaClassIE(NebulaBaseIE):
    """Extractor for `/<collection>/<episode>` URLs: class lessons and podcast episodes."""

    IE_NAME = 'nebula:media'
    _VALID_URL = rf'{_BASE_URL_RE}/(?!(?:myshows|library|videos)/)(?P<id>[\w-]+)/(?P<ep>[\w-]+)/?(?:$|[?#])'
    _TESTS = [{
        'url': 'https://nebula.tv/copyright-for-fun-and-profit/14',
        'info_dict': {
            'id': 'd7432cdc-c608-474d-942c-f74345daed7b',
            'ext': 'mp4',
            'display_id': '14',
            'channel_url': 'https://nebula.tv/copyright-for-fun-and-profit',
            'episode_number': 14,
            'thumbnail': 'https://dj423fildxgac.cloudfront.net/d533718d-9307-42d4-8fb0-e283285e99c9',
            'uploader_url': 'https://nebula.tv/copyright-for-fun-and-profit',
            'duration': 646,
            'episode': 'Episode 14',
            'title': 'Photos, Sculpture, and Video',
        },
        'params': {'skip_download': 'm3u8'},
    }, {
        'url': 'https://nebula.tv/extremitiespodcast/pyramiden-the-high-arctic-soviet-ghost-town',
        'info_dict': {
            'ext': 'mp3',
            'id': '018f65f0-0033-4021-8f87-2d132beb19aa',
            'description': 'md5:05d2b23ab780c955e2511a2b9127acff',
            'series_id': '335e8159-d663-491a-888f-1732285706ac',
            'modified_timestamp': 1599091504,
            'episode_id': '018f65f0-0033-4021-8f87-2d132beb19aa',
            'series': 'Extremities',
            'modified_date': '20200903',
            'upload_date': '20200902',
            'title': 'Pyramiden: The High-Arctic Soviet Ghost Town',
            'release_timestamp': 1571237958,
            'thumbnail': r're:^https?://content\.production\.cdn\.art19\.com.*\.jpeg$',
            'duration': 1546.05714,
            'timestamp': 1599085608,
            'release_date': '20191016',
        },
    }, {
        'url': 'https://nebula.tv/thelayover/the-layover-episode-1',
        'info_dict': {
            'ext': 'mp3',
            'id': '9d74a762-00bb-45a8-9e8d-9ed47c04a1d0',
            'episode_number': 1,
            'thumbnail': r're:^https?://content\.production\.cdn\.art19\.com.*\.jpeg$',
            'release_date': '20230304',
            'modified_date': '20230403',
            'series': 'The Layover',
            'episode_id': '9d74a762-00bb-45a8-9e8d-9ed47c04a1d0',
            'modified_timestamp': 1680554566,
            'duration': 3130.46401,
            'release_timestamp': 1677943800,
            'title': 'The Layover — Episode 1',
            'series_id': '874303a5-4900-4626-a4b6-2aacac34466a',
            'upload_date': '20230303',
            'episode': 'Episode 1',
            'timestamp': 1677883672,
            'description': 'md5:002cca89258e3bc7c268d5b8c24ba482',
        },
    }]

    def _real_extract(self, url):
        """Extract a lesson or podcast episode.

        A smuggled content id short-circuits to a formats-only result.
        Otherwise the API response's 'type' selects the handling:
        'lesson' goes through the regular video path, 'podcast_episode'
        is delegated to Art19IE when its URL matches, or returned as a
        direct-URL info dict. Any other type raises ExtractorError.
        """
        slug, episode = self._match_valid_url(url).group('id', 'ep')
        url, smuggled_data = unsmuggle_url(url, {})
        if smuggled_data.get('id'):
            return {
                'id': smuggled_data['id'],
                'display_id': slug,
                'title': '',
                **self._extract_formats(smuggled_data['id'], slug),
            }

        metadata = self._call_api(
            f'https://content.api.nebula.app/content/{slug}/{episode}/?include=lessons',
            slug, note='Fetching class/podcast metadata')
        content_type = metadata.get('type')

        if content_type == 'lesson':
            return {
                **self._extract_video_metadata(metadata),
                **self._extract_formats(metadata['id'], slug),
            }

        if content_type == 'podcast_episode':
            episode_url = metadata['episode_url']
            if not episode_url and metadata.get('premium'):
                self.raise_login_required()

            # Only consult Art19IE when there is a URL to test: the pattern
            # match would fail with a TypeError on an empty/None value
            if episode_url and Art19IE.suitable(episode_url):
                return self.url_result(episode_url, Art19IE)
            return traverse_obj(metadata, {
                'id': ('id', {str}),
                'url': ('episode_url', {url_or_none}),
                'title': ('title', {str}),
                'description': ('description', {str}),
                'timestamp': ('published_at', {parse_iso8601}),
                'duration': ('duration', {int_or_none}),
                'channel_id': ('channel_id', {str}),
                'channel': ('channel_title', {str}),
                'thumbnail': ('assets', 'regular', {url_or_none}),
            })

        raise ExtractorError(f'Unexpected content type {content_type!r}')
bdc196a4 340
bdc196a4 341
f3b3fe16
HH
class NebulaSubscriptionsIE(NebulaBaseIE):
    """Playlist extractor for the `/myshows` and `/library/latest-videos` pages."""

    IE_NAME = 'nebula:subscriptions'
    _VALID_URL = rf'{_BASE_URL_RE}/(?P<id>myshows|library/latest-videos)/?(?:$|[?#])'
    _TESTS = [{
        'url': 'https://nebula.tv/myshows',
        'playlist_mincount': 1,
        'info_dict': {
            'id': 'myshows',
        },
    }]

    def _generate_playlist_entries(self):
        """Yield a url_transparent NebulaIE entry for every followed-channel episode.

        Pages are requested newest-first and followed through the response's
        'next' link until it is absent or empty.
        """
        page_url = update_url_query('https://content.api.nebula.app/video_episodes/', {
            'following': 'true',
            'include': 'engagement',
            'ordering': '-published_at',
        })
        page_num = 0
        while page_url:
            page_num += 1
            page = self._call_api(
                page_url, 'myshows', note=f'Retrieving subscriptions page {page_num}')
            for episode in page['results']:
                metadata = self._extract_video_metadata(episode)
                # Smuggle the content id so NebulaIE can skip its own metadata request
                video_url = smuggle_url(
                    f'https://nebula.tv/videos/{metadata["display_id"]}',
                    {'id': episode['id']})
                yield self.url_result(video_url, NebulaIE, url_transparent=True, **metadata)
            page_url = page.get('next')

    def _real_extract(self, url):
        """Return the subscriptions playlist; its id is always 'myshows'."""
        entries = self._generate_playlist_entries()
        return self.playlist_result(entries, 'myshows')
373
374
class NebulaChannelIE(NebulaBaseIE):
    """Playlist extractor for a top-level collection page: video channel, class or podcast."""

    IE_NAME = 'nebula:channel'
    _VALID_URL = rf'{_BASE_URL_RE}/(?!myshows|library|videos)(?P<id>[\w-]+)/?(?:$|[?#])'
    _TESTS = [{
        'url': 'https://nebula.tv/tom-scott-presents-money',
        'info_dict': {
            'id': 'tom-scott-presents-money',
            'title': 'Tom Scott Presents: Money',
            'description': 'Tom Scott hosts a series all about trust, negotiation and money.',
        },
        'playlist_count': 5,
    }, {
        'url': 'https://nebula.tv/lindsayellis',
        'info_dict': {
            'id': 'lindsayellis',
            'title': 'Lindsay Ellis',
            'description': 'Enjoy these hottest of takes on Disney, Transformers, and Musicals.',
        },
        'playlist_mincount': 2,
    }, {
        'url': 'https://nebula.tv/johnnyharris',
        'info_dict': {
            'id': 'johnnyharris',
            'title': 'Johnny Harris',
            'description': 'I make videos about maps and many other things.',
        },
        'playlist_mincount': 90,
    }, {
        'url': 'https://nebula.tv/copyright-for-fun-and-profit',
        'info_dict': {
            'id': 'copyright-for-fun-and-profit',
            'title': 'Copyright for Fun and Profit',
            'description': 'md5:6690248223eed044a9f11cd5a24f9742',
        },
        'playlist_count': 23,
    }, {
        'url': 'https://nebula.tv/trussissuespodcast',
        'info_dict': {
            'id': 'trussissuespodcast',
            'title': 'The TLDR News Podcast',
            'description': 'md5:a08c4483bc0b705881d3e0199e721385',
        },
        'playlist_mincount': 80,
    }]

    def _generate_playlist_entries(self, collection_id, collection_slug):
        """Yield url_transparent NebulaIE entries for every episode of a video channel.

        Pages are requested newest-first and followed through the response's
        'next' link until it is absent or empty.
        """
        next_url = f'https://content.api.nebula.app/video_channels/{collection_id}/video_episodes/?ordering=-published_at'
        for page_num in itertools.count(1):
            episodes = self._call_api(next_url, collection_slug, note=f'Retrieving channel page {page_num}')
            for episode in episodes['results']:
                metadata = self._extract_video_metadata(episode)
                episode_url = episode.get('share_url') or f'https://nebula.tv/videos/{metadata["display_id"]}'
                # Smuggle the content id so NebulaIE can skip its own metadata request
                yield self.url_result(
                    smuggle_url(episode_url, {'id': episode['id']}),
                    NebulaIE, url_transparent=True, **metadata)
            next_url = episodes.get('next')
            if not next_url:
                break

    def _generate_class_entries(self, channel):
        """Yield url_transparent NebulaClassIE entries for each lesson in `channel['lessons']`."""
        for lesson in channel['lessons']:
            metadata = self._extract_video_metadata(lesson)
            # The fallback URL is built from the raw lesson object: the extracted
            # metadata dict has no 'class_slug' or 'slug' keys, so reading them
            # from it raised KeyError whenever 'share_url' was missing
            lesson_url = (
                lesson.get('share_url')
                or f'https://nebula.tv/{lesson["class_slug"]}/{lesson["slug"]}')
            yield self.url_result(
                smuggle_url(lesson_url, {'id': lesson['id']}),
                NebulaClassIE, url_transparent=True, **metadata)

    def _generate_podcast_entries(self, collection_id, collection_slug):
        """Yield NebulaClassIE entries for every podcast episode that has a valid 'share_url'."""
        next_url = f'https://content.api.nebula.app/podcast_channels/{collection_id}/podcast_episodes/?ordering=-published_at&premium=true'
        for page_num in itertools.count(1):
            episodes = self._call_api(next_url, collection_slug, note=f'Retrieving podcast page {page_num}')

            for episode in traverse_obj(episodes, ('results', lambda _, v: url_or_none(v['share_url']))):
                yield self.url_result(episode['share_url'], NebulaClassIE)
            next_url = episodes.get('next')
            if not next_url:
                break

    def _real_extract(self, url):
        """Fetch the collection and build a playlist using the generator matching its 'type'.

        'class' and 'podcast_channel' have dedicated generators; every
        other type is treated as a regular video channel.
        """
        collection_slug = self._match_id(url)
        channel = self._call_api(
            f'https://content.api.nebula.app/content/{collection_slug}/?include=lessons',
            collection_slug, note='Retrieving channel')

        collection_type = channel.get('type')
        if collection_type == 'class':
            entries = self._generate_class_entries(channel)
        elif collection_type == 'podcast_channel':
            entries = self._generate_podcast_entries(channel['id'], collection_slug)
        else:
            entries = self._generate_playlist_entries(channel['id'], collection_slug)

        return self.playlist_result(
            entries=entries,
            playlist_id=collection_slug,
            playlist_title=channel.get('title'),
            playlist_description=channel.get('description'))