yt_dlp/extractor/vevo.py

   1 from __future__ import unicode_literals
   2
   3 import re
   4 import json
   5
   6 from .common import InfoExtractor
   7 from ..compat import (
   8     compat_str,
   9     compat_HTTPError,
  10 )
  11 from ..utils import (
  12     ExtractorError,
  13     int_or_none,
  14     parse_iso8601,
  15     parse_qs,
  16 )
  17
  18
  19 class VevoBaseIE(InfoExtractor):
  20     def _extract_json(self, webpage, video_id):
  21         return self._parse_json(
  22             self._search_regex(
  23                 r'window\.__INITIAL_STORE__\s*=\s*({.+?});\s*</script>',
  24                 webpage, 'initial store'),
  25             video_id)
  26
  27
  28 class VevoIE(VevoBaseIE):
  29     '''
  30     Accepts urls from vevo.com or in the format 'vevo:{id}'
  31     (currently used by MTVIE and MySpaceIE)
  32     '''
  33     _VALID_URL = r'''(?x)
  34         (?:https?://(?:www\.)?vevo\.com/watch/(?!playlist|genre)(?:[^/]+/(?:[^/]+/)?)?|
  35            https?://cache\.vevo\.com/m/html/embed\.html\?video=|
  36            https?://videoplayer\.vevo\.com/embed/embedded\?videoId=|
  37            https?://embed\.vevo\.com/.*?[?&]isrc=|
  38            vevo:)
  39         (?P<id>[^&?#]+)'''
  40
  41     _TESTS = []
  42     _VERSIONS = {
  43         0: 'youtube',  # only in AuthenticateVideo videoVersions
  44         1: 'level3',
  45         2: 'akamai',
  46         3: 'level3',
  47         4: 'amazon',
  48     }
  49
  50     def _initialize_api(self, video_id):
  51         webpage = self._download_webpage(
  52             'https://accounts.vevo.com/token', None,
  53             note='Retrieving oauth token',
  54             errnote='Unable to retrieve oauth token',
  55             data=json.dumps({
  56                 'client_id': 'SPupX1tvqFEopQ1YS6SS',
  57                 'grant_type': 'urn:vevo:params:oauth:grant-type:anonymous',
  58             }).encode('utf-8'),
  59             headers={
  60                 'Content-Type': 'application/json',
  61             })
  62
  63         if re.search(r'(?i)THIS PAGE IS CURRENTLY UNAVAILABLE IN YOUR REGION', webpage):
  64             self.raise_geo_restricted(
  65                 '%s said: This page is currently unavailable in your region' % self.IE_NAME)
  66
  67         auth_info = self._parse_json(webpage, video_id)
  68         self._api_url_template = self.http_scheme() + '//apiv2.vevo.com/%s?token=' + auth_info['legacy_token']
  69
  70     def _call_api(self, path, *args, **kwargs):
  71         try:
  72             data = self._download_json(self._api_url_template % path, *args, **kwargs)
  73         except ExtractorError as e:
  74             if isinstance(e.cause, compat_HTTPError):
  75                 errors = self._parse_json(e.cause.read().decode(), None)['errors']
  76                 error_message = ', '.join([error['message'] for error in errors])
  77                 raise ExtractorError('%s said: %s' % (self.IE_NAME, error_message), expected=True)
  78             raise
  79         return data
  80
  81     def _real_extract(self, url):
  82         video_id = self._match_id(url)
  83
  84         self._initialize_api(video_id)
  85
  86         video_info = self._call_api(
  87             'video/%s' % video_id, video_id, 'Downloading api video info',
  88             'Failed to download video info')
  89
  90         video_versions = self._call_api(
  91             'video/%s/streams' % video_id, video_id,
  92             'Downloading video versions info',
  93             'Failed to download video versions info',
  94             fatal=False)
  95
  96         # Some videos are only available via webpage (e.g.
  97         # https://github.com/ytdl-org/youtube-dl/issues/9366)
  98         if not video_versions:
  99             webpage = self._download_webpage(url, video_id)
 100             json_data = self._extract_json(webpage, video_id)
 101             if 'streams' in json_data.get('default', {}):
 102                 video_versions = json_data['default']['streams'][video_id][0]
 103             else:
 104                 video_versions = [
 105                     value
 106                     for key, value in json_data['apollo']['data'].items()
 107                     if key.startswith('%s.streams' % video_id)]
 108
 109         uploader = None
 110         artist = None
 111         featured_artist = None
 112         artists = video_info.get('artists')
 113         for curr_artist in artists:
 114             if curr_artist.get('role') == 'Featured':
 115                 featured_artist = curr_artist['name']
 116             else:
 117                 artist = uploader = curr_artist['name']
 118
 119         formats = []
 120         for video_version in video_versions:
 121             version = self._VERSIONS.get(video_version.get('version'), 'generic')
 122             version_url = video_version.get('url')
 123             if not version_url:
 124                 continue
 125
 126             if '.ism' in version_url:
 127                 continue
 128             elif '.mpd' in version_url:
 129                 formats.extend(self._extract_mpd_formats(
 130                     version_url, video_id, mpd_id='dash-%s' % version,
 131                     note='Downloading %s MPD information' % version,
 132                     errnote='Failed to download %s MPD information' % version,
 133                     fatal=False))
 134             elif '.m3u8' in version_url:
 135                 formats.extend(self._extract_m3u8_formats(
 136                     version_url, video_id, 'mp4', 'm3u8_native',
 137                     m3u8_id='hls-%s' % version,
 138                     note='Downloading %s m3u8 information' % version,
 139                     errnote='Failed to download %s m3u8 information' % version,
 140                     fatal=False))
 141             else:
 142                 m = re.search(r'''(?xi)
 143                     _(?P<width>[0-9]+)x(?P<height>[0-9]+)
 144                     _(?P<vcodec>[a-z0-9]+)
 145                     _(?P<vbr>[0-9]+)
 146                     _(?P<acodec>[a-z0-9]+)
 147                     _(?P<abr>[0-9]+)
 148                     \.(?P<ext>[a-z0-9]+)''', version_url)
 149                 if not m:
 150                     continue
 151
 152                 formats.append({
 153                     'url': version_url,
 154                     'format_id': 'http-%s-%s' % (version, video_version['quality']),
 155                     'vcodec': m.group('vcodec'),
 156                     'acodec': m.group('acodec'),
 157                     'vbr': int(m.group('vbr')),
 158                     'abr': int(m.group('abr')),
 159                     'ext': m.group('ext'),
 160                     'width': int(m.group('width')),
 161                     'height': int(m.group('height')),
 162                 })
 163         self._sort_formats(formats)
 164
 165         track = video_info['title']
 166         if featured_artist:
 167             artist = '%s ft. %s' % (artist, featured_artist)
 168         title = '%s - %s' % (artist, track) if artist else track
 169
 170         genres = video_info.get('genres')
 171         genre = (
 172             genres[0] if genres and isinstance(genres, list)
 173             and isinstance(genres[0], compat_str) else None)
 174
 175         is_explicit = video_info.get('isExplicit')
 176         if is_explicit is True:
 177             age_limit = 18
 178         elif is_explicit is False:
 179             age_limit = 0
 180         else:
 181             age_limit = None
 182
 183         return {
 184             'id': video_id,
 185             'title': title,
 186             'formats': formats,
 187             'thumbnail': video_info.get('imageUrl') or video_info.get('thumbnailUrl'),
 188             'timestamp': parse_iso8601(video_info.get('releaseDate')),
 189             'uploader': uploader,
 190             'duration': int_or_none(video_info.get('duration')),
 191             'view_count': int_or_none(video_info.get('views', {}).get('total')),
 192             'age_limit': age_limit,
 193             'track': track,
 194             'artist': uploader,
 195             'genre': genre,
 196         }
 197
 198
 199 class VevoPlaylistIE(VevoBaseIE):
 200     _VALID_URL = r'https?://(?:www\.)?vevo\.com/watch/(?P<kind>playlist|genre)/(?P<id>[^/?#&]+)'
 201
 202     _TESTS = [{
 203         'url': 'http://www.vevo.com/watch/genre/rock',
 204         'info_dict': {
 205             'id': 'rock',
 206             'title': 'Rock',
 207         },
 208         'playlist_count': 20,
 209     }, {
 210         'url': 'http://www.vevo.com/watch/genre/rock?index=0',
 211         'only_matching': True,
 212     }]
 213
 214     def _real_extract(self, url):
 215         mobj = self._match_valid_url(url)
 216         playlist_id = mobj.group('id')
 217         playlist_kind = mobj.group('kind')
 218
 219         webpage = self._download_webpage(url, playlist_id)
 220
 221         qs = parse_qs(url)
 222         index = qs.get('index', [None])[0]
 223
 224         if index:
 225             video_id = self._search_regex(
 226                 r'<meta[^>]+content=(["\'])vevo://video/(?P<id>.+?)\1[^>]*>',
 227                 webpage, 'video id', default=None, group='id')
 228             if video_id:
 229                 return self.url_result('vevo:%s' % video_id, VevoIE.ie_key())
 230
 231         playlists = self._extract_json(webpage, playlist_id)['default']['%ss' % playlist_kind]
 232
 233         playlist = (list(playlists.values())[0]
 234                     if playlist_kind == 'playlist' else playlists[playlist_id])
 235
 236         entries = [
 237             self.url_result('vevo:%s' % src, VevoIE.ie_key())
 238             for src in playlist['isrcs']]
 239
 240         return self.playlist_result(
 241             entries, playlist.get('playlistId') or playlist_id,
 242             playlist.get('name'), playlist.get('description'))