yt_dlp/extractor/nebula.py

   1 # coding: utf-8
   2 from __future__ import unicode_literals
   3
   4 import json
   5 import time
   6
   7 from urllib.error import HTTPError
   8 from .common import InfoExtractor
   9 from ..compat import compat_str, compat_urllib_parse_unquote, compat_urllib_parse_quote
  10 from ..utils import (
  11     ExtractorError,
  12     parse_iso8601,
  13     try_get,
  14     urljoin,
  15 )
  16
  17
  18 class NebulaIE(InfoExtractor):
  19
  20     _VALID_URL = r'https?://(?:www\.)?(?:watchnebula\.com|nebula\.app)/videos/(?P<id>[-\w]+)'
  21     _TESTS = [
  22         {
  23             'url': 'https://nebula.app/videos/that-time-disney-remade-beauty-and-the-beast',
  24             'md5': 'fe79c4df8b3aa2fea98a93d027465c7e',
  25             'info_dict': {
  26                 'id': '5c271b40b13fd613090034fd',
  27                 'ext': 'mp4',
  28                 'title': 'That Time Disney Remade Beauty and the Beast',
  29                 'description': 'Note: this video was originally posted on YouTube with the sponsor read included. We weren’t able to remove it without reducing video quality, so it’s presented here in its original context.',
  30                 'upload_date': '20180731',
  31                 'timestamp': 1533009600,
  32                 'channel': 'Lindsay Ellis',
  33                 'uploader': 'Lindsay Ellis',
  34             },
  35             'params': {
  36                 'usenetrc': True,
  37             },
  38             'skip': 'All Nebula content requires authentication',
  39         },
  40         {
  41             'url': 'https://nebula.app/videos/the-logistics-of-d-day-landing-craft-how-the-allies-got-ashore',
  42             'md5': '6d4edd14ce65720fa63aba5c583fb328',
  43             'info_dict': {
  44                 'id': '5e7e78171aaf320001fbd6be',
  45                 'ext': 'mp4',
  46                 'title': 'Landing Craft - How The Allies Got Ashore',
  47                 'description': r're:^In this episode we explore the unsung heroes of D-Day, the landing craft.',
  48                 'upload_date': '20200327',
  49                 'timestamp': 1585348140,
  50                 'channel': 'The Logistics of D-Day',
  51                 'uploader': 'The Logistics of D-Day',
  52             },
  53             'params': {
  54                 'usenetrc': True,
  55             },
  56             'skip': 'All Nebula content requires authentication',
  57         },
  58         {
  59             'url': 'https://nebula.app/videos/money-episode-1-the-draw',
  60             'md5': '8c7d272910eea320f6f8e6d3084eecf5',
  61             'info_dict': {
  62                 'id': '5e779ebdd157bc0001d1c75a',
  63                 'ext': 'mp4',
  64                 'title': 'Episode 1: The Draw',
  65                 'description': r'contains:There’s free money on offer… if the players can all work together.',
  66                 'upload_date': '20200323',
  67                 'timestamp': 1584980400,
  68                 'channel': 'Tom Scott Presents: Money',
  69                 'uploader': 'Tom Scott Presents: Money',
  70             },
  71             'params': {
  72                 'usenetrc': True,
  73             },
  74             'skip': 'All Nebula content requires authentication',
  75         },
  76         {
  77             'url': 'https://watchnebula.com/videos/money-episode-1-the-draw',
  78             'only_matching': True,
  79         },
  80     ]
  81     _NETRC_MACHINE = 'watchnebula'
  82
  83     _nebula_token = None
  84
  85     def _retrieve_nebula_auth(self):
  86         """
  87         Log in to Nebula, and returns a Nebula API token
  88         """
  89
  90         username, password = self._get_login_info()
  91         if not (username and password):
  92             self.raise_login_required()
  93
  94         self.report_login()
  95         data = json.dumps({'email': username, 'password': password}).encode('utf8')
  96         response = self._download_json(
  97             'https://api.watchnebula.com/api/v1/auth/login/',
  98             data=data, fatal=False, video_id=None,
  99             headers={
 100                 'content-type': 'application/json',
 101                 # Submitting the 'sessionid' cookie always causes a 403 on auth endpoint
 102                 'cookie': ''
 103             },
 104             note='Authenticating to Nebula with supplied credentials',
 105             errnote='Authentication failed or rejected')
 106         if not response or not response.get('key'):
 107             self.raise_login_required()
 108
 109         # save nebula token as cookie
 110         self._set_cookie(
 111             'nebula.app', 'nebula-auth',
 112             compat_urllib_parse_quote(
 113                 json.dumps({
 114                     "apiToken": response["key"],
 115                     "isLoggingIn": False,
 116                     "isLoggingOut": False,
 117                 }, separators=(",", ":"))),
 118             expire_time=int(time.time()) + 86400 * 365,
 119         )
 120
 121         return response['key']
 122
 123     def _retrieve_zype_api_key(self, page_url, display_id):
 124         """
 125         Retrieves the Zype API key
 126         """
 127
 128         # Find the js that has the API key from the webpage and download it
 129         webpage = self._download_webpage(page_url, video_id=display_id)
 130         main_script_relpath = self._search_regex(
 131             r'<script[^>]*src="(?P<script_relpath>[^"]*main.[0-9a-f]*.chunk.js)"[^>]*>', webpage,
 132             group='script_relpath', name='script relative path', fatal=True)
 133         main_script_abspath = urljoin(page_url, main_script_relpath)
 134         main_script = self._download_webpage(main_script_abspath, video_id=display_id,
 135                                              note='Retrieving Zype API key')
 136
 137         api_key = self._search_regex(
 138             r'REACT_APP_ZYPE_API_KEY\s*:\s*"(?P<api_key>[\w-]*)"', main_script,
 139             group='api_key', name='API key', fatal=True)
 140
 141         return api_key
 142
 143     def _call_zype_api(self, path, params, video_id, api_key, note):
 144         """
 145         A helper for making calls to the Zype API.
 146         """
 147         query = {'api_key': api_key, 'per_page': 1}
 148         query.update(params)
 149         return self._download_json('https://api.zype.com' + path, video_id, query=query, note=note)
 150
 151     def _call_nebula_api(self, path, video_id, access_token, note):
 152         """
 153         A helper for making calls to the Nebula API.
 154         """
 155         return self._download_json('https://api.watchnebula.com/api/v1' + path, video_id, headers={
 156             'Authorization': 'Token {access_token}'.format(access_token=access_token)
 157         }, note=note)
 158
 159     def _fetch_zype_access_token(self, video_id):
 160         try:
 161             user_object = self._call_nebula_api('/auth/user/', video_id, self._nebula_token, note='Retrieving Zype access token')
 162         except ExtractorError as exc:
 163             # if 401, attempt credential auth and retry
 164             if exc.cause and isinstance(exc.cause, HTTPError) and exc.cause.code == 401:
 165                 self._nebula_token = self._retrieve_nebula_auth()
 166                 user_object = self._call_nebula_api('/auth/user/', video_id, self._nebula_token, note='Retrieving Zype access token')
 167             else:
 168                 raise
 169
 170         access_token = try_get(user_object, lambda x: x['zype_auth_info']['access_token'], compat_str)
 171         if not access_token:
 172             if try_get(user_object, lambda x: x['is_subscribed'], bool):
 173                 # TODO: Reimplement the same Zype token polling the Nebula frontend implements
 174                 # see https://github.com/ytdl-org/youtube-dl/pull/24805#issuecomment-749231532
 175                 raise ExtractorError(
 176                     'Unable to extract Zype access token from Nebula API authentication endpoint. '
 177                     'Open an arbitrary video in a browser with this account to generate a token',
 178                     expected=True)
 179             raise ExtractorError('Unable to extract Zype access token from Nebula API authentication endpoint')
 180         return access_token
 181
 182     def _extract_channel_title(self, video_meta):
 183         # TODO: Implement the API calls giving us the channel list,
 184         # so that we can do the title lookup and then figure out the channel URL
 185         categories = video_meta.get('categories', []) if video_meta else []
 186         # the channel name is the value of the first category
 187         for category in categories:
 188             if category.get('value'):
 189                 return category['value'][0]
 190
 191     def _real_initialize(self):
 192         # check cookie jar for valid token
 193         nebula_cookies = self._get_cookies('https://nebula.app')
 194         nebula_cookie = nebula_cookies.get('nebula-auth')
 195         if nebula_cookie:
 196             self.to_screen('Authenticating to Nebula with token from cookie jar')
 197             nebula_cookie_value = compat_urllib_parse_unquote(nebula_cookie.value)
 198             self._nebula_token = self._parse_json(nebula_cookie_value, None).get('apiToken')
 199
 200         # try to authenticate using credentials if no valid token has been found
 201         if not self._nebula_token:
 202             self._nebula_token = self._retrieve_nebula_auth()
 203
 204     def _real_extract(self, url):
 205         display_id = self._match_id(url)
 206         api_key = self._retrieve_zype_api_key(url, display_id)
 207
 208         response = self._call_zype_api('/videos', {'friendly_title': display_id},
 209                                        display_id, api_key, note='Retrieving metadata from Zype')
 210         if len(response.get('response') or []) != 1:
 211             raise ExtractorError('Unable to find video on Zype API')
 212         video_meta = response['response'][0]
 213
 214         video_id = video_meta['_id']
 215         zype_access_token = self._fetch_zype_access_token(display_id)
 216
 217         channel_title = self._extract_channel_title(video_meta)
 218
 219         return {
 220             'id': video_id,
 221             'display_id': display_id,
 222             '_type': 'url_transparent',
 223             'ie_key': 'Zype',
 224             'url': 'https://player.zype.com/embed/%s.html?access_token=%s' % (video_id, zype_access_token),
 225             'title': video_meta.get('title'),
 226             'description': video_meta.get('description'),
 227             'timestamp': parse_iso8601(video_meta.get('published_at')),
 228             'thumbnails': [{
 229                 'id': tn.get('name'),  # this appears to be null
 230                 'url': tn['url'],
 231                 'width': tn.get('width'),
 232                 'height': tn.get('height'),
 233             } for tn in video_meta.get('thumbnails', [])],
 234             'duration': video_meta.get('duration'),
 235             'channel': channel_title,
 236             'uploader': channel_title,  # we chose uploader = channel name
 237             # TODO: uploader_url, channel_id, channel_url
 238         }