yt_dlp/extractor/nebula.py

   1 import itertools
   2 import json
   3
   4 from .common import InfoExtractor
   5 from ..networking.exceptions import HTTPError
   6 from ..utils import (
   7     ExtractorError,
   8     int_or_none,
   9     make_archive_id,
  10     parse_iso8601,
  11     smuggle_url,
  12     try_call,
  13     unsmuggle_url,
  14     update_url_query,
  15     url_or_none,
  16     urljoin,
  17 )
  18 from ..utils.traversal import traverse_obj
  19
# Scheme + host prefix shared by every _VALID_URL in this module: matches watchnebula.com,
# nebula.app and nebula.tv, each optionally prefixed by "www." or "beta.".
_BASE_URL_RE = r'https?://(?:www\.|beta\.)?(?:watchnebula\.com|nebula\.app|nebula\.tv)'
  21
  22
  23 class NebulaBaseIE(InfoExtractor):
  24     _NETRC_MACHINE = 'watchnebula'
  25     _token = _api_token = None
  26
  27     def _perform_login(self, username, password):
  28         try:
  29             response = self._download_json(
  30                 'https://nebula.tv/auth/login/', None,
  31                 'Logging in to Nebula', 'Login failed',
  32                 data=json.dumps({'email': username, 'password': password}).encode(),
  33                 headers={'content-type': 'application/json'})
  34         except ExtractorError as e:
  35             if isinstance(e.cause, HTTPError) and e.cause.status == 400:
  36                 raise ExtractorError('Login failed: Invalid username or password', expected=True)
  37             raise
  38         self._api_token = traverse_obj(response, ('key', {str}))
  39         if not self._api_token:
  40             raise ExtractorError('Login failed: No token')
  41
  42     def _call_api(self, *args, **kwargs):
  43         if self._token:
  44             kwargs.setdefault('headers', {})['Authorization'] = f'Bearer {self._token}'
  45         try:
  46             return self._download_json(*args, **kwargs)
  47         except ExtractorError as e:
  48             if not isinstance(e.cause, HTTPError) or e.cause.status not in (401, 403):
  49                 raise
  50             self.to_screen(
  51                 f'Reauthorizing with Nebula and retrying, because last API call resulted in error {e.cause.status}')
  52             self._real_initialize()
  53             if self._token:
  54                 kwargs.setdefault('headers', {})['Authorization'] = f'Bearer {self._token}'
  55             return self._download_json(*args, **kwargs)
  56
  57     def _real_initialize(self):
  58         if not self._api_token:
  59             self._api_token = try_call(
  60                 lambda: self._get_cookies('https://nebula.tv')['nebula_auth.apiToken'].value)
  61         self._token = self._download_json(
  62             'https://users.api.nebula.app/api/v1/authorization/', None,
  63             headers={'Authorization': f'Token {self._api_token}'} if self._api_token else None,
  64             note='Authorizing to Nebula', data=b'')['token']
  65
  66     def _extract_formats(self, content_id, slug):
  67         for retry in (False, True):
  68             try:
  69                 fmts, subs = self._extract_m3u8_formats_and_subtitles(
  70                     f'https://content.api.nebula.app/{content_id.split(":")[0]}s/{content_id}/manifest.m3u8',
  71                     slug, 'mp4', query={
  72                         'token': self._token,
  73                         'app_version': '23.10.0',
  74                         'platform': 'ios',
  75                     })
  76                 return {'formats': fmts, 'subtitles': subs}
  77             except ExtractorError as e:
  78                 if isinstance(e.cause, HTTPError) and e.cause.status == 401:
  79                     self.raise_login_required()
  80                 if not retry and isinstance(e.cause, HTTPError) and e.cause.status == 403:
  81                     self.to_screen('Reauthorizing with Nebula and retrying, because fetching video resulted in error')
  82                     self._real_initialize()
  83                     continue
  84                 raise
  85
  86     def _extract_video_metadata(self, episode):
  87         channel_url = traverse_obj(
  88             episode, (('channel_slug', 'class_slug'), {lambda x: urljoin('https://nebula.tv/', x)}), get_all=False)
  89         return {
  90             'id': episode['id'].partition(':')[2],
  91             **traverse_obj(episode, {
  92                 'display_id': 'slug',
  93                 'title': 'title',
  94                 'description': 'description',
  95                 'timestamp': ('published_at', {parse_iso8601}),
  96                 'duration': ('duration', {int_or_none}),
  97                 'channel_id': 'channel_slug',
  98                 'uploader_id': 'channel_slug',
  99                 'channel': 'channel_title',
 100                 'uploader': 'channel_title',
 101                 'series': 'channel_title',
 102                 'creator': 'channel_title',
 103                 'thumbnail': ('images', 'thumbnail', 'src', {url_or_none}),
 104                 'episode_number': ('order', {int_or_none}),
 105                 # Old code was wrongly setting extractor_key from NebulaSubscriptionsIE
 106                 '_old_archive_ids': ('zype_id', {lambda x: [
 107                     make_archive_id(NebulaIE, x), make_archive_id(NebulaSubscriptionsIE, x)] if x else None}),
 108             }),
 109             'channel_url': channel_url,
 110             'uploader_url': channel_url,
 111         }
 112
 113
 114 class NebulaIE(NebulaBaseIE):
 115     _VALID_URL = rf'{_BASE_URL_RE}/videos/(?P<id>[-\w]+)'
 116     _TESTS = [{
 117         'url': 'https://nebula.tv/videos/that-time-disney-remade-beauty-and-the-beast',
 118         'info_dict': {
 119             'id': '84ed544d-4afd-4723-8cd5-2b95261f0abf',
 120             'ext': 'mp4',
 121             'title': 'That Time Disney Remade Beauty and the Beast',
 122             'description': 'md5:2aae3c4cfc5ee09a1ecdff0909618cf4',
 123             'upload_date': '20180731',
 124             'timestamp': 1533009600,
 125             'channel': 'Lindsay Ellis',
 126             'channel_id': 'lindsayellis',
 127             'uploader': 'Lindsay Ellis',
 128             'uploader_id': 'lindsayellis',
 129             'uploader_url': r're:https://nebula\.(tv|app)/lindsayellis',
 130             'series': 'Lindsay Ellis',
 131             'display_id': 'that-time-disney-remade-beauty-and-the-beast',
 132             'channel_url': r're:https://nebula\.(tv|app)/lindsayellis',
 133             'creator': 'Lindsay Ellis',
 134             'duration': 2212,
 135             'thumbnail': r're:https://\w+\.cloudfront\.net/[\w-]+',
 136             '_old_archive_ids': ['nebula 5c271b40b13fd613090034fd', 'nebulasubscriptions 5c271b40b13fd613090034fd'],
 137         },
 138         'params': {'skip_download': 'm3u8'},
 139     }, {
 140         'url': 'https://nebula.tv/videos/the-logistics-of-d-day-landing-craft-how-the-allies-got-ashore',
 141         'md5': 'd05739cf6c38c09322422f696b569c23',
 142         'info_dict': {
 143             'id': '7e623145-1b44-4ca3-aa0b-ed25a247ea34',
 144             'ext': 'mp4',
 145             'title': 'Landing Craft - How The Allies Got Ashore',
 146             'description': r're:^In this episode we explore the unsung heroes of D-Day, the landing craft.',
 147             'upload_date': '20200327',
 148             'timestamp': 1585348140,
 149             'channel': 'Real Engineering — The Logistics of D-Day',
 150             'channel_id': 'd-day',
 151             'uploader': 'Real Engineering — The Logistics of D-Day',
 152             'uploader_id': 'd-day',
 153             'series': 'Real Engineering — The Logistics of D-Day',
 154             'display_id': 'the-logistics-of-d-day-landing-craft-how-the-allies-got-ashore',
 155             'creator': 'Real Engineering — The Logistics of D-Day',
 156             'duration': 841,
 157             'channel_url': 'https://nebula.tv/d-day',
 158             'uploader_url': 'https://nebula.tv/d-day',
 159             'thumbnail': r're:https://\w+\.cloudfront\.net/[\w-]+',
 160             '_old_archive_ids': ['nebula 5e7e78171aaf320001fbd6be', 'nebulasubscriptions 5e7e78171aaf320001fbd6be'],
 161         },
 162         'params': {'skip_download': 'm3u8'},
 163     }, {
 164         'url': 'https://nebula.tv/videos/money-episode-1-the-draw',
 165         'md5': 'ebe28a7ad822b9ee172387d860487868',
 166         'info_dict': {
 167             'id': 'b96c5714-9e2b-4ec3-b3f1-20f6e89cc553',
 168             'ext': 'mp4',
 169             'title': 'Episode 1: The Draw',
 170             'description': r'contains:There’s free money on offer… if the players can all work together.',
 171             'upload_date': '20200323',
 172             'timestamp': 1584980400,
 173             'channel': 'Tom Scott Presents: Money',
 174             'channel_id': 'tom-scott-presents-money',
 175             'uploader': 'Tom Scott Presents: Money',
 176             'uploader_id': 'tom-scott-presents-money',
 177             'uploader_url': 'https://nebula.tv/tom-scott-presents-money',
 178             'duration': 825,
 179             'channel_url': 'https://nebula.tv/tom-scott-presents-money',
 180             'series': 'Tom Scott Presents: Money',
 181             'display_id': 'money-episode-1-the-draw',
 182             'thumbnail': r're:https://\w+\.cloudfront\.net/[\w-]+',
 183             'creator': 'Tom Scott Presents: Money',
 184             '_old_archive_ids': ['nebula 5e779ebdd157bc0001d1c75a', 'nebulasubscriptions 5e779ebdd157bc0001d1c75a'],
 185         },
 186         'params': {'skip_download': 'm3u8'},
 187     }, {
 188         'url': 'https://watchnebula.com/videos/money-episode-1-the-draw',
 189         'only_matching': True,
 190     }, {
 191         'url': 'https://nebula.tv/videos/tldrnewseu-did-the-us-really-blow-up-the-nordstream-pipelines',
 192         'info_dict': {
 193             'id': 'e389af9d-1dab-44f2-8788-ee24deb7ff0d',
 194             'ext': 'mp4',
 195             'display_id': 'tldrnewseu-did-the-us-really-blow-up-the-nordstream-pipelines',
 196             'title': 'Did the US Really Blow Up the NordStream Pipelines?',
 197             'description': 'md5:b4e2a14e3ff08f546a3209c75261e789',
 198             'upload_date': '20230223',
 199             'timestamp': 1677144070,
 200             'channel': 'TLDR News EU',
 201             'channel_id': 'tldrnewseu',
 202             'uploader': 'TLDR News EU',
 203             'uploader_id': 'tldrnewseu',
 204             'uploader_url': r're:https://nebula\.(tv|app)/tldrnewseu',
 205             'duration': 524,
 206             'channel_url': r're:https://nebula\.(tv|app)/tldrnewseu',
 207             'series': 'TLDR News EU',
 208             'thumbnail': r're:https://\w+\.cloudfront\.net/[\w-]+',
 209             'creator': 'TLDR News EU',
 210             '_old_archive_ids': ['nebula 63f64c74366fcd00017c1513', 'nebulasubscriptions 63f64c74366fcd00017c1513'],
 211         },
 212         'params': {'skip_download': 'm3u8'},
 213     }, {
 214         'url': 'https://beta.nebula.tv/videos/money-episode-1-the-draw',
 215         'only_matching': True,
 216     }]
 217
 218     def _real_extract(self, url):
 219         slug = self._match_id(url)
 220         url, smuggled_data = unsmuggle_url(url, {})
 221         if smuggled_data.get('id'):
 222             return {
 223                 'id': smuggled_data['id'],
 224                 'display_id': slug,
 225                 'title': '',
 226                 **self._extract_formats(smuggled_data['id'], slug),
 227             }
 228
 229         metadata = self._call_api(
 230             f'https://content.api.nebula.app/content/videos/{slug}',
 231             slug, note='Fetching video metadata')
 232         return {
 233             **self._extract_video_metadata(metadata),
 234             **self._extract_formats(metadata['id'], slug),
 235         }
 236
 237
 238 class NebulaClassIE(NebulaBaseIE):
 239     IE_NAME = 'nebula:class'
 240     _VALID_URL = rf'{_BASE_URL_RE}/(?P<id>[-\w]+)/(?P<ep>\d+)'
 241     _TESTS = [{
 242         'url': 'https://nebula.tv/copyright-for-fun-and-profit/14',
 243         'info_dict': {
 244             'id': 'd7432cdc-c608-474d-942c-f74345daed7b',
 245             'ext': 'mp4',
 246             'display_id': '14',
 247             'channel_url': 'https://nebula.tv/copyright-for-fun-and-profit',
 248             'episode_number': 14,
 249             'thumbnail': 'https://dj423fildxgac.cloudfront.net/d533718d-9307-42d4-8fb0-e283285e99c9',
 250             'uploader_url': 'https://nebula.tv/copyright-for-fun-and-profit',
 251             'duration': 646,
 252             'episode': 'Episode 14',
 253             'title': 'Photos, Sculpture, and Video',
 254         },
 255         'params': {'skip_download': 'm3u8'},
 256     }]
 257
 258     def _real_extract(self, url):
 259         slug, episode = self._match_valid_url(url).group('id', 'ep')
 260         url, smuggled_data = unsmuggle_url(url, {})
 261         if smuggled_data.get('id'):
 262             return {
 263                 'id': smuggled_data['id'],
 264                 'display_id': slug,
 265                 'title': '',
 266                 **self._extract_formats(smuggled_data['id'], slug),
 267             }
 268
 269         metadata = self._call_api(
 270             f'https://content.api.nebula.app/content/{slug}/{episode}/?include=lessons',
 271             slug, note='Fetching video metadata')
 272         return {
 273             **self._extract_video_metadata(metadata),
 274             **self._extract_formats(metadata['id'], slug),
 275         }
 276
 277
 278 class NebulaSubscriptionsIE(NebulaBaseIE):
 279     IE_NAME = 'nebula:subscriptions'
 280     _VALID_URL = rf'{_BASE_URL_RE}/(?P<id>myshows|library/latest-videos)'
 281     _TESTS = [{
 282         'url': 'https://nebula.tv/myshows',
 283         'playlist_mincount': 1,
 284         'info_dict': {
 285             'id': 'myshows',
 286         },
 287     }]
 288
 289     def _generate_playlist_entries(self):
 290         next_url = update_url_query('https://content.api.nebula.app/video_episodes/', {
 291             'following': 'true',
 292             'include': 'engagement',
 293             'ordering': '-published_at',
 294         })
 295         for page_num in itertools.count(1):
 296             channel = self._call_api(
 297                 next_url, 'myshows', note=f'Retrieving subscriptions page {page_num}')
 298             for episode in channel['results']:
 299                 metadata = self._extract_video_metadata(episode)
 300                 yield self.url_result(smuggle_url(
 301                     f'https://nebula.tv/videos/{metadata["display_id"]}',
 302                     {'id': episode['id']}), NebulaIE, url_transparent=True, **metadata)
 303             next_url = channel.get('next')
 304             if not next_url:
 305                 return
 306
 307     def _real_extract(self, url):
 308         return self.playlist_result(self._generate_playlist_entries(), 'myshows')
 309
 310
 311 class NebulaChannelIE(NebulaBaseIE):
 312     IE_NAME = 'nebula:channel'
 313     _VALID_URL = rf'{_BASE_URL_RE}/(?!myshows|library|videos/)(?P<id>[-\w]+)/?(?:$|[?#])'
 314     _TESTS = [{
 315         'url': 'https://nebula.tv/tom-scott-presents-money',
 316         'info_dict': {
 317             'id': 'tom-scott-presents-money',
 318             'title': 'Tom Scott Presents: Money',
 319             'description': 'Tom Scott hosts a series all about trust, negotiation and money.',
 320         },
 321         'playlist_count': 5,
 322     }, {
 323         'url': 'https://nebula.tv/lindsayellis',
 324         'info_dict': {
 325             'id': 'lindsayellis',
 326             'title': 'Lindsay Ellis',
 327             'description': 'Enjoy these hottest of takes on Disney, Transformers, and Musicals.',
 328         },
 329         'playlist_mincount': 2,
 330     }, {
 331         'url': 'https://nebula.tv/johnnyharris',
 332         'info_dict': {
 333             'id': 'johnnyharris',
 334             'title': 'Johnny Harris',
 335             'description': 'I make videos about maps and many other things.',
 336         },
 337         'playlist_mincount': 90,
 338     }, {
 339         'url': 'https://nebula.tv/copyright-for-fun-and-profit',
 340         'info_dict': {
 341             'id': 'copyright-for-fun-and-profit',
 342             'title': 'Copyright for Fun and Profit',
 343             'description': 'md5:6690248223eed044a9f11cd5a24f9742',
 344         },
 345         'playlist_count': 23,
 346     }]
 347
 348     def _generate_playlist_entries(self, collection_id, collection_slug):
 349         next_url = f'https://content.api.nebula.app/video_channels/{collection_id}/video_episodes/?ordering=-published_at'
 350         for page_num in itertools.count(1):
 351             episodes = self._call_api(next_url, collection_slug, note=f'Retrieving channel page {page_num}')
 352             for episode in episodes['results']:
 353                 metadata = self._extract_video_metadata(episode)
 354                 yield self.url_result(smuggle_url(
 355                     episode.get('share_url') or f'https://nebula.tv/videos/{metadata["display_id"]}',
 356                     {'id': episode['id']}), NebulaIE, url_transparent=True, **metadata)
 357             next_url = episodes.get('next')
 358             if not next_url:
 359                 break
 360
 361     def _generate_class_entries(self, channel):
 362         for lesson in channel['lessons']:
 363             metadata = self._extract_video_metadata(lesson)
 364             yield self.url_result(smuggle_url(
 365                 lesson.get('share_url') or f'https://nebula.tv/{metadata["class_slug"]}/{metadata["slug"]}',
 366                 {'id': lesson['id']}), NebulaClassIE, url_transparent=True, **metadata)
 367
 368     def _real_extract(self, url):
 369         collection_slug = self._match_id(url)
 370         channel = self._call_api(
 371             f'https://content.api.nebula.app/content/{collection_slug}/?include=lessons',
 372             collection_slug, note='Retrieving channel')
 373
 374         if channel.get('type') == 'class':
 375             entries = self._generate_class_entries(channel)
 376         else:
 377             entries = self._generate_playlist_entries(channel['id'], collection_slug)
 378
 379         return self.playlist_result(
 380             entries=entries,
 381             playlist_id=collection_slug,
 382             playlist_title=channel.get('title'),
 383             playlist_description=channel.get('description'))