yt_dlp/extractor/vrv.py

   1 import base64
   2 import hashlib
   3 import hmac
   4 import json
   5 import random
   6 import string
   7 import time
   8 import urllib.parse
   9
  10 from .common import InfoExtractor
  11 from ..compat import compat_HTTPError, compat_urllib_parse_urlencode
  12 from ..utils import (
  13     ExtractorError,
  14     float_or_none,
  15     int_or_none,
  16     join_nonempty,
  17     traverse_obj,
  18 )
  19
  20
  21 class VRVBaseIE(InfoExtractor):
  22     _API_DOMAIN = None
  23     _API_PARAMS = {}
  24     _CMS_SIGNING = {}
  25     _TOKEN = None
  26     _TOKEN_SECRET = ''
  27
  28     def _call_api(self, path, video_id, note, data=None):
  29         # https://tools.ietf.org/html/rfc5849#section-3
  30         base_url = self._API_DOMAIN + '/core/' + path
  31         query = [
  32             ('oauth_consumer_key', self._API_PARAMS['oAuthKey']),
  33             ('oauth_nonce', ''.join([random.choice(string.ascii_letters) for _ in range(32)])),
  34             ('oauth_signature_method', 'HMAC-SHA1'),
  35             ('oauth_timestamp', int(time.time())),
  36         ]
  37         if self._TOKEN:
  38             query.append(('oauth_token', self._TOKEN))
  39         encoded_query = compat_urllib_parse_urlencode(query)
  40         headers = self.geo_verification_headers()
  41         if data:
  42             data = json.dumps(data).encode()
  43             headers['Content-Type'] = 'application/json'
  44         base_string = '&'.join([
  45             'POST' if data else 'GET',
  46             urllib.parse.quote(base_url, ''),
  47             urllib.parse.quote(encoded_query, '')])
  48         oauth_signature = base64.b64encode(hmac.new(
  49             (self._API_PARAMS['oAuthSecret'] + '&' + self._TOKEN_SECRET).encode('ascii'),
  50             base_string.encode(), hashlib.sha1).digest()).decode()
  51         encoded_query += '&oauth_signature=' + urllib.parse.quote(oauth_signature, '')
  52         try:
  53             return self._download_json(
  54                 '?'.join([base_url, encoded_query]), video_id,
  55                 note='Downloading %s JSON metadata' % note, headers=headers, data=data)
  56         except ExtractorError as e:
  57             if isinstance(e.cause, compat_HTTPError) and e.cause.code == 401:
  58                 raise ExtractorError(json.loads(e.cause.read().decode())['message'], expected=True)
  59             raise
  60
  61     def _call_cms(self, path, video_id, note):
  62         if not self._CMS_SIGNING:
  63             index = self._call_api('index', video_id, 'CMS Signing')
  64             self._CMS_SIGNING = index.get('cms_signing') or {}
  65             if not self._CMS_SIGNING:
  66                 for signing_policy in index.get('signing_policies', []):
  67                     signing_path = signing_policy.get('path')
  68                     if signing_path and signing_path.startswith('/cms/'):
  69                         name, value = signing_policy.get('name'), signing_policy.get('value')
  70                         if name and value:
  71                             self._CMS_SIGNING[name] = value
  72         return self._download_json(
  73             self._API_DOMAIN + path, video_id, query=self._CMS_SIGNING,
  74             note='Downloading %s JSON metadata' % note, headers=self.geo_verification_headers())
  75
  76     def _get_cms_resource(self, resource_key, video_id):
  77         return self._call_api(
  78             'cms_resource', video_id, 'resource path', data={
  79                 'resource_key': resource_key,
  80             })['__links__']['cms_resource']['href']
  81
  82     def _extract_vrv_formats(self, url, video_id, stream_format, audio_lang, hardsub_lang):
  83         if not url or stream_format not in ('hls', 'dash', 'adaptive_hls'):
  84             return []
  85         format_id = join_nonempty(
  86             stream_format,
  87             audio_lang and 'audio-%s' % audio_lang,
  88             hardsub_lang and 'hardsub-%s' % hardsub_lang)
  89         if 'hls' in stream_format:
  90             adaptive_formats = self._extract_m3u8_formats(
  91                 url, video_id, 'mp4', m3u8_id=format_id,
  92                 note='Downloading %s information' % format_id,
  93                 fatal=False)
  94         elif stream_format == 'dash':
  95             adaptive_formats = self._extract_mpd_formats(
  96                 url, video_id, mpd_id=format_id,
  97                 note='Downloading %s information' % format_id,
  98                 fatal=False)
  99         if audio_lang:
 100             for f in adaptive_formats:
 101                 if f.get('acodec') != 'none':
 102                     f['language'] = audio_lang
 103         return adaptive_formats
 104
 105     def _set_api_params(self):
 106         webpage = self._download_webpage(
 107             'https://vrv.co/', None, headers=self.geo_verification_headers())
 108         self._API_PARAMS = self._parse_json(self._search_regex(
 109             [
 110                 r'window\.__APP_CONFIG__\s*=\s*({.+?})(?:</script>|;)',
 111                 r'window\.__APP_CONFIG__\s*=\s*({.+})'
 112             ], webpage, 'app config'), None)['cxApiParams']
 113         self._API_DOMAIN = self._API_PARAMS.get('apiDomain', 'https://api.vrv.co')
 114
 115
 116 class VRVIE(VRVBaseIE):
 117     IE_NAME = 'vrv'
 118     _VALID_URL = r'https?://(?:www\.)?vrv\.co/watch/(?P<id>[A-Z0-9]+)'
 119     _TESTS = [{
 120         'url': 'https://vrv.co/watch/GR9PNZ396/Hidden-America-with-Jonah-Ray:BOSTON-WHERE-THE-PAST-IS-THE-PRESENT',
 121         'info_dict': {
 122             'id': 'GR9PNZ396',
 123             'ext': 'mp4',
 124             'title': 'BOSTON: WHERE THE PAST IS THE PRESENT',
 125             'description': 'md5:4ec8844ac262ca2df9e67c0983c6b83f',
 126             'uploader_id': 'seeso',
 127         },
 128         'params': {
 129             # m3u8 download
 130             'skip_download': True,
 131         },
 132     }, {
 133         # movie listing
 134         'url': 'https://vrv.co/watch/G6NQXZ1J6/Lily-CAT',
 135         'info_dict': {
 136             'id': 'G6NQXZ1J6',
 137             'title': 'Lily C.A.T',
 138             'description': 'md5:988b031e7809a6aeb60968be4af7db07',
 139         },
 140         'playlist_count': 2,
 141     }]
 142     _NETRC_MACHINE = 'vrv'
 143
 144     def _perform_login(self, username, password):
 145         token_credentials = self._call_api(
 146             'authenticate/by:credentials', None, 'Token Credentials', data={
 147                 'email': username,
 148                 'password': password,
 149             })
 150         self._TOKEN = token_credentials['oauth_token']
 151         self._TOKEN_SECRET = token_credentials['oauth_token_secret']
 152
 153     def _initialize_pre_login(self):
 154         return self._set_api_params()
 155
 156     def _real_extract(self, url):
 157         video_id = self._match_id(url)
 158
 159         object_data = self._call_cms(self._get_cms_resource(
 160             'cms:/objects/' + video_id, video_id), video_id, 'object')['items'][0]
 161         resource_path = object_data['__links__']['resource']['href']
 162         video_data = self._call_cms(resource_path, video_id, 'video')
 163         title = video_data['title']
 164         description = video_data.get('description')
 165
 166         if video_data.get('__class__') == 'movie_listing':
 167             items = self._call_cms(
 168                 video_data['__links__']['movie_listing/movies']['href'],
 169                 video_id, 'movie listing').get('items') or []
 170             if len(items) != 1:
 171                 entries = []
 172                 for item in items:
 173                     item_id = item.get('id')
 174                     if not item_id:
 175                         continue
 176                     entries.append(self.url_result(
 177                         'https://vrv.co/watch/' + item_id,
 178                         self.ie_key(), item_id, item.get('title')))
 179                 return self.playlist_result(entries, video_id, title, description)
 180             video_data = items[0]
 181
 182         streams_path = video_data['__links__'].get('streams', {}).get('href')
 183         if not streams_path:
 184             self.raise_login_required()
 185         streams_json = self._call_cms(streams_path, video_id, 'streams')
 186
 187         audio_locale = streams_json.get('audio_locale')
 188         formats = []
 189         for stream_type, streams in streams_json.get('streams', {}).items():
 190             if stream_type in ('adaptive_hls', 'adaptive_dash'):
 191                 for stream in streams.values():
 192                     formats.extend(self._extract_vrv_formats(
 193                         stream.get('url'), video_id, stream_type.split('_')[1],
 194                         audio_locale, stream.get('hardsub_locale')))
 195         self._sort_formats(formats)
 196
 197         subtitles = {}
 198         for k in ('captions', 'subtitles'):
 199             for subtitle in streams_json.get(k, {}).values():
 200                 subtitle_url = subtitle.get('url')
 201                 if not subtitle_url:
 202                     continue
 203                 subtitles.setdefault(subtitle.get('locale', 'en-US'), []).append({
 204                     'url': subtitle_url,
 205                     'ext': subtitle.get('format', 'ass'),
 206                 })
 207
 208         thumbnails = []
 209         for thumbnail in traverse_obj(video_data, ('images', 'thumbnail', ..., ...)) or []:
 210             thumbnail_url = thumbnail.get('source')
 211             if not thumbnail_url:
 212                 continue
 213             thumbnails.append({
 214                 'url': thumbnail_url,
 215                 'width': int_or_none(thumbnail.get('width')),
 216                 'height': int_or_none(thumbnail.get('height')),
 217             })
 218
 219         return {
 220             'id': video_id,
 221             'title': title,
 222             'formats': formats,
 223             'subtitles': subtitles,
 224             'thumbnails': thumbnails,
 225             'description': description,
 226             'duration': float_or_none(video_data.get('duration_ms'), 1000),
 227             'uploader_id': video_data.get('channel_id'),
 228             'series': video_data.get('series_title'),
 229             'season': video_data.get('season_title'),
 230             'season_number': int_or_none(video_data.get('season_number')),
 231             'season_id': video_data.get('season_id'),
 232             'episode': title,
 233             'episode_number': int_or_none(video_data.get('episode_number')),
 234             'episode_id': video_data.get('production_episode_id'),
 235         }
 236
 237
 238 class VRVSeriesIE(VRVBaseIE):
 239     IE_NAME = 'vrv:series'
 240     _VALID_URL = r'https?://(?:www\.)?vrv\.co/series/(?P<id>[A-Z0-9]+)'
 241     _TEST = {
 242         'url': 'https://vrv.co/series/G68VXG3G6/The-Perfect-Insider',
 243         'info_dict': {
 244             'id': 'G68VXG3G6',
 245         },
 246         'playlist_mincount': 11,
 247     }
 248
 249     def _initialize_pre_login(self):
 250         return self._set_api_params()
 251
 252     def _real_extract(self, url):
 253         series_id = self._match_id(url)
 254
 255         seasons_path = self._get_cms_resource(
 256             'cms:/seasons?series_id=' + series_id, series_id)
 257         seasons_data = self._call_cms(seasons_path, series_id, 'seasons')
 258
 259         entries = []
 260         for season in seasons_data.get('items', []):
 261             episodes_path = season['__links__']['season/episodes']['href']
 262             episodes = self._call_cms(episodes_path, series_id, 'episodes')
 263             for episode in episodes.get('items', []):
 264                 episode_id = episode['id']
 265                 entries.append(self.url_result(
 266                     'https://vrv.co/watch/' + episode_id,
 267                     'VRV', episode_id, episode.get('title')))
 268
 269         return self.playlist_result(entries, series_id)