yt_dlp/extractor/vrv.py

   1 import base64
   2 import hashlib
   3 import hmac
   4 import json
   5 import random
   6 import string
   7 import time
   8 import urllib.parse
   9
  10 from .common import InfoExtractor
  11 from ..compat import compat_urllib_parse_urlencode
  12 from ..networking.exceptions import HTTPError
  13 from ..utils import (
  14     ExtractorError,
  15     float_or_none,
  16     int_or_none,
  17     join_nonempty,
  18     traverse_obj,
  19 )
  20
  21
  22 class VRVBaseIE(InfoExtractor):
  23     _API_DOMAIN = None
  24     _API_PARAMS = {}
  25     _CMS_SIGNING = {}
  26     _TOKEN = None
  27     _TOKEN_SECRET = ''
  28
  29     def _call_api(self, path, video_id, note, data=None):
  30         # https://tools.ietf.org/html/rfc5849#section-3
  31         base_url = self._API_DOMAIN + '/core/' + path
  32         query = [
  33             ('oauth_consumer_key', self._API_PARAMS['oAuthKey']),
  34             ('oauth_nonce', ''.join(random.choices(string.ascii_letters, k=32))),
  35             ('oauth_signature_method', 'HMAC-SHA1'),
  36             ('oauth_timestamp', int(time.time())),
  37         ]
  38         if self._TOKEN:
  39             query.append(('oauth_token', self._TOKEN))
  40         encoded_query = compat_urllib_parse_urlencode(query)
  41         headers = self.geo_verification_headers()
  42         if data:
  43             data = json.dumps(data).encode()
  44             headers['Content-Type'] = 'application/json'
  45         base_string = '&'.join([
  46             'POST' if data else 'GET',
  47             urllib.parse.quote(base_url, ''),
  48             urllib.parse.quote(encoded_query, '')])
  49         oauth_signature = base64.b64encode(hmac.new(
  50             (self._API_PARAMS['oAuthSecret'] + '&' + self._TOKEN_SECRET).encode('ascii'),
  51             base_string.encode(), hashlib.sha1).digest()).decode()
  52         encoded_query += '&oauth_signature=' + urllib.parse.quote(oauth_signature, '')
  53         try:
  54             return self._download_json(
  55                 '?'.join([base_url, encoded_query]), video_id,
  56                 note='Downloading %s JSON metadata' % note, headers=headers, data=data)
  57         except ExtractorError as e:
  58             if isinstance(e.cause, HTTPError) and e.cause.status == 401:
  59                 raise ExtractorError(json.loads(e.cause.response.read().decode())['message'], expected=True)
  60             raise
  61
  62     def _call_cms(self, path, video_id, note):
  63         if not self._CMS_SIGNING:
  64             index = self._call_api('index', video_id, 'CMS Signing')
  65             self._CMS_SIGNING = index.get('cms_signing') or {}
  66             if not self._CMS_SIGNING:
  67                 for signing_policy in index.get('signing_policies', []):
  68                     signing_path = signing_policy.get('path')
  69                     if signing_path and signing_path.startswith('/cms/'):
  70                         name, value = signing_policy.get('name'), signing_policy.get('value')
  71                         if name and value:
  72                             self._CMS_SIGNING[name] = value
  73         return self._download_json(
  74             self._API_DOMAIN + path, video_id, query=self._CMS_SIGNING,
  75             note='Downloading %s JSON metadata' % note, headers=self.geo_verification_headers())
  76
  77     def _get_cms_resource(self, resource_key, video_id):
  78         return self._call_api(
  79             'cms_resource', video_id, 'resource path', data={
  80                 'resource_key': resource_key,
  81             })['__links__']['cms_resource']['href']
  82
  83     def _extract_vrv_formats(self, url, video_id, stream_format, audio_lang, hardsub_lang):
  84         if not url or stream_format not in ('hls', 'dash', 'adaptive_hls'):
  85             return []
  86         format_id = join_nonempty(
  87             stream_format,
  88             audio_lang and 'audio-%s' % audio_lang,
  89             hardsub_lang and 'hardsub-%s' % hardsub_lang)
  90         if 'hls' in stream_format:
  91             adaptive_formats = self._extract_m3u8_formats(
  92                 url, video_id, 'mp4', m3u8_id=format_id,
  93                 note='Downloading %s information' % format_id,
  94                 fatal=False)
  95         elif stream_format == 'dash':
  96             adaptive_formats = self._extract_mpd_formats(
  97                 url, video_id, mpd_id=format_id,
  98                 note='Downloading %s information' % format_id,
  99                 fatal=False)
 100         if audio_lang:
 101             for f in adaptive_formats:
 102                 if f.get('acodec') != 'none':
 103                     f['language'] = audio_lang
 104         return adaptive_formats
 105
 106     def _set_api_params(self):
 107         webpage = self._download_webpage(
 108             'https://vrv.co/', None, headers=self.geo_verification_headers())
 109         self._API_PARAMS = self._parse_json(self._search_regex(
 110             [
 111                 r'window\.__APP_CONFIG__\s*=\s*({.+?})(?:</script>|;)',
 112                 r'window\.__APP_CONFIG__\s*=\s*({.+})'
 113             ], webpage, 'app config'), None)['cxApiParams']
 114         self._API_DOMAIN = self._API_PARAMS.get('apiDomain', 'https://api.vrv.co')
 115
 116
 117 class VRVIE(VRVBaseIE):
 118     IE_NAME = 'vrv'
 119     _VALID_URL = r'https?://(?:www\.)?vrv\.co/watch/(?P<id>[A-Z0-9]+)'
 120     _TESTS = [{
 121         'url': 'https://vrv.co/watch/GR9PNZ396/Hidden-America-with-Jonah-Ray:BOSTON-WHERE-THE-PAST-IS-THE-PRESENT',
 122         'info_dict': {
 123             'id': 'GR9PNZ396',
 124             'ext': 'mp4',
 125             'title': 'BOSTON: WHERE THE PAST IS THE PRESENT',
 126             'description': 'md5:4ec8844ac262ca2df9e67c0983c6b83f',
 127             'uploader_id': 'seeso',
 128         },
 129         'params': {
 130             # m3u8 download
 131             'skip_download': True,
 132         },
 133     }, {
 134         # movie listing
 135         'url': 'https://vrv.co/watch/G6NQXZ1J6/Lily-CAT',
 136         'info_dict': {
 137             'id': 'G6NQXZ1J6',
 138             'title': 'Lily C.A.T',
 139             'description': 'md5:988b031e7809a6aeb60968be4af7db07',
 140         },
 141         'playlist_count': 2,
 142     }]
 143     _NETRC_MACHINE = 'vrv'
 144
 145     def _perform_login(self, username, password):
 146         token_credentials = self._call_api(
 147             'authenticate/by:credentials', None, 'Token Credentials', data={
 148                 'email': username,
 149                 'password': password,
 150             })
 151         self._TOKEN = token_credentials['oauth_token']
 152         self._TOKEN_SECRET = token_credentials['oauth_token_secret']
 153
 154     def _initialize_pre_login(self):
 155         return self._set_api_params()
 156
 157     def _real_extract(self, url):
 158         video_id = self._match_id(url)
 159
 160         object_data = self._call_cms(self._get_cms_resource(
 161             'cms:/objects/' + video_id, video_id), video_id, 'object')['items'][0]
 162         resource_path = object_data['__links__']['resource']['href']
 163         video_data = self._call_cms(resource_path, video_id, 'video')
 164         title = video_data['title']
 165         description = video_data.get('description')
 166
 167         if video_data.get('__class__') == 'movie_listing':
 168             items = self._call_cms(
 169                 video_data['__links__']['movie_listing/movies']['href'],
 170                 video_id, 'movie listing').get('items') or []
 171             if len(items) != 1:
 172                 entries = []
 173                 for item in items:
 174                     item_id = item.get('id')
 175                     if not item_id:
 176                         continue
 177                     entries.append(self.url_result(
 178                         'https://vrv.co/watch/' + item_id,
 179                         self.ie_key(), item_id, item.get('title')))
 180                 return self.playlist_result(entries, video_id, title, description)
 181             video_data = items[0]
 182
 183         streams_path = video_data['__links__'].get('streams', {}).get('href')
 184         if not streams_path:
 185             self.raise_login_required()
 186         streams_json = self._call_cms(streams_path, video_id, 'streams')
 187
 188         audio_locale = streams_json.get('audio_locale')
 189         formats = []
 190         for stream_type, streams in streams_json.get('streams', {}).items():
 191             if stream_type in ('adaptive_hls', 'adaptive_dash'):
 192                 for stream in streams.values():
 193                     formats.extend(self._extract_vrv_formats(
 194                         stream.get('url'), video_id, stream_type.split('_')[1],
 195                         audio_locale, stream.get('hardsub_locale')))
 196
 197         subtitles = {}
 198         for k in ('captions', 'subtitles'):
 199             for subtitle in streams_json.get(k, {}).values():
 200                 subtitle_url = subtitle.get('url')
 201                 if not subtitle_url:
 202                     continue
 203                 subtitles.setdefault(subtitle.get('locale', 'en-US'), []).append({
 204                     'url': subtitle_url,
 205                     'ext': subtitle.get('format', 'ass'),
 206                 })
 207
 208         thumbnails = []
 209         for thumbnail in traverse_obj(video_data, ('images', 'thumbnail', ..., ...)) or []:
 210             thumbnail_url = thumbnail.get('source')
 211             if not thumbnail_url:
 212                 continue
 213             thumbnails.append({
 214                 'url': thumbnail_url,
 215                 'width': int_or_none(thumbnail.get('width')),
 216                 'height': int_or_none(thumbnail.get('height')),
 217             })
 218
 219         return {
 220             'id': video_id,
 221             'title': title,
 222             'formats': formats,
 223             'subtitles': subtitles,
 224             'thumbnails': thumbnails,
 225             'description': description,
 226             'duration': float_or_none(video_data.get('duration_ms'), 1000),
 227             'uploader_id': video_data.get('channel_id'),
 228             'series': video_data.get('series_title'),
 229             'season': video_data.get('season_title'),
 230             'season_number': int_or_none(video_data.get('season_number')),
 231             'season_id': video_data.get('season_id'),
 232             'episode': title,
 233             'episode_number': int_or_none(video_data.get('episode_number')),
 234             'episode_id': video_data.get('production_episode_id'),
 235         }
 236
 237
 238 class VRVSeriesIE(VRVBaseIE):
 239     IE_NAME = 'vrv:series'
 240     _VALID_URL = r'https?://(?:www\.)?vrv\.co/series/(?P<id>[A-Z0-9]+)'
 241     _TEST = {
 242         'url': 'https://vrv.co/series/G68VXG3G6/The-Perfect-Insider',
 243         'info_dict': {
 244             'id': 'G68VXG3G6',
 245         },
 246         'playlist_mincount': 11,
 247     }
 248
 249     def _initialize_pre_login(self):
 250         return self._set_api_params()
 251
 252     def _real_extract(self, url):
 253         series_id = self._match_id(url)
 254
 255         seasons_path = self._get_cms_resource(
 256             'cms:/seasons?series_id=' + series_id, series_id)
 257         seasons_data = self._call_cms(seasons_path, series_id, 'seasons')
 258
 259         entries = []
 260         for season in seasons_data.get('items', []):
 261             episodes_path = season['__links__']['season/episodes']['href']
 262             episodes = self._call_cms(episodes_path, series_id, 'episodes')
 263             for episode in episodes.get('items', []):
 264                 episode_id = episode['id']
 265                 entries.append(self.url_result(
 266                     'https://vrv.co/watch/' + episode_id,
 267                     'VRV', episode_id, episode.get('title')))
 268
 269         return self.playlist_result(entries, series_id)