yt_dlp/extractor/vevo.py

   1 import re
   2 import json
   3
   4 from .common import InfoExtractor
   5 from ..compat import compat_str
   6 from ..networking.exceptions import HTTPError
   7 from ..utils import (
   8     ExtractorError,
   9     int_or_none,
  10     parse_iso8601,
  11     parse_qs,
  12 )
  13
  14
  15 class VevoBaseIE(InfoExtractor):
  16     def _extract_json(self, webpage, video_id):
  17         return self._parse_json(
  18             self._search_regex(
  19                 r'window\.__INITIAL_STORE__\s*=\s*({.+?});\s*</script>',
  20                 webpage, 'initial store'),
  21             video_id)
  22
  23
  24 class VevoIE(VevoBaseIE):
  25     '''
  26     Accepts urls from vevo.com or in the format 'vevo:{id}'
  27     (currently used by MTVIE and MySpaceIE)
  28     '''
  29     _VALID_URL = r'''(?x)
  30         (?:https?://(?:www\.)?vevo\.com/watch/(?!playlist|genre)(?:[^/]+/(?:[^/]+/)?)?|
  31            https?://cache\.vevo\.com/m/html/embed\.html\?video=|
  32            https?://videoplayer\.vevo\.com/embed/embedded\?videoId=|
  33            https?://embed\.vevo\.com/.*?[?&]isrc=|
  34            https?://tv\.vevo\.com/watch/artist/(?:[^/]+)/|
  35            vevo:)
  36         (?P<id>[^&?#]+)'''
  37     _EMBED_REGEX = [r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//(?:cache\.)?vevo\.com/.+?)\1']
  38
  39     _TESTS = [{
  40         'url': 'http://www.vevo.com/watch/hurts/somebody-to-die-for/GB1101300280',
  41         'md5': '95ee28ee45e70130e3ab02b0f579ae23',
  42         'info_dict': {
  43             'id': 'GB1101300280',
  44             'ext': 'mp4',
  45             'title': 'Hurts - Somebody to Die For',
  46             'timestamp': 1372057200,
  47             'upload_date': '20130624',
  48             'uploader': 'Hurts',
  49             'track': 'Somebody to Die For',
  50             'artist': 'Hurts',
  51             'genre': 'Pop',
  52         },
  53         'expected_warnings': ['Unable to download SMIL file', 'Unable to download info'],
  54     }, {
  55         'note': 'v3 SMIL format',
  56         'url': 'http://www.vevo.com/watch/cassadee-pope/i-wish-i-could-break-your-heart/USUV71302923',
  57         'md5': 'f6ab09b034f8c22969020b042e5ac7fc',
  58         'info_dict': {
  59             'id': 'USUV71302923',
  60             'ext': 'mp4',
  61             'title': 'Cassadee Pope - I Wish I Could Break Your Heart',
  62             'timestamp': 1392796919,
  63             'upload_date': '20140219',
  64             'uploader': 'Cassadee Pope',
  65             'track': 'I Wish I Could Break Your Heart',
  66             'artist': 'Cassadee Pope',
  67             'genre': 'Country',
  68         },
  69         'expected_warnings': ['Unable to download SMIL file', 'Unable to download info'],
  70     }, {
  71         'note': 'Age-limited video',
  72         'url': 'https://www.vevo.com/watch/justin-timberlake/tunnel-vision-explicit/USRV81300282',
  73         'info_dict': {
  74             'id': 'USRV81300282',
  75             'ext': 'mp4',
  76             'title': 'Justin Timberlake - Tunnel Vision (Explicit)',
  77             'age_limit': 18,
  78             'timestamp': 1372888800,
  79             'upload_date': '20130703',
  80             'uploader': 'Justin Timberlake',
  81             'track': 'Tunnel Vision (Explicit)',
  82             'artist': 'Justin Timberlake',
  83             'genre': 'Pop',
  84         },
  85         'expected_warnings': ['Unable to download SMIL file', 'Unable to download info'],
  86     }, {
  87         'note': 'No video_info',
  88         'url': 'http://www.vevo.com/watch/k-camp-1/Till-I-Die/USUV71503000',
  89         'md5': '8b83cc492d72fc9cf74a02acee7dc1b0',
  90         'info_dict': {
  91             'id': 'USUV71503000',
  92             'ext': 'mp4',
  93             'title': 'K Camp ft. T.I. - Till I Die',
  94             'age_limit': 18,
  95             'timestamp': 1449468000,
  96             'upload_date': '20151207',
  97             'uploader': 'K Camp',
  98             'track': 'Till I Die',
  99             'artist': 'K Camp',
 100             'genre': 'Hip-Hop',
 101         },
 102         'expected_warnings': ['Unable to download SMIL file', 'Unable to download info'],
 103     }, {
 104         'note': 'Featured test',
 105         'url': 'https://www.vevo.com/watch/lemaitre/Wait/USUV71402190',
 106         'md5': 'd28675e5e8805035d949dc5cf161071d',
 107         'info_dict': {
 108             'id': 'USUV71402190',
 109             'ext': 'mp4',
 110             'title': 'Lemaitre ft. LoLo - Wait',
 111             'age_limit': 0,
 112             'timestamp': 1413432000,
 113             'upload_date': '20141016',
 114             'uploader': 'Lemaitre',
 115             'track': 'Wait',
 116             'artist': 'Lemaitre',
 117             'genre': 'Electronic',
 118         },
 119         'expected_warnings': ['Unable to download SMIL file', 'Unable to download info'],
 120     }, {
 121         'note': 'Only available via webpage',
 122         'url': 'http://www.vevo.com/watch/GBUV71600656',
 123         'md5': '67e79210613865b66a47c33baa5e37fe',
 124         'info_dict': {
 125             'id': 'GBUV71600656',
 126             'ext': 'mp4',
 127             'title': 'ABC - Viva Love',
 128             'age_limit': 0,
 129             'timestamp': 1461830400,
 130             'upload_date': '20160428',
 131             'uploader': 'ABC',
 132             'track': 'Viva Love',
 133             'artist': 'ABC',
 134             'genre': 'Pop',
 135         },
 136         'expected_warnings': ['Failed to download video versions info'],
 137     }, {
 138         # no genres available
 139         'url': 'http://www.vevo.com/watch/INS171400764',
 140         'only_matching': True,
 141     }, {
 142         # Another case available only via the webpage; using streams/streamsV3 formats
 143         # Geo-restricted to Netherlands/Germany
 144         'url': 'http://www.vevo.com/watch/boostee/pop-corn-clip-officiel/FR1A91600909',
 145         'only_matching': True,
 146     }, {
 147         'url': 'https://embed.vevo.com/?isrc=USH5V1923499&partnerId=4d61b777-8023-4191-9ede-497ed6c24647&partnerAdCode=',
 148         'only_matching': True,
 149     }, {
 150         'url': 'https://tv.vevo.com/watch/artist/janet-jackson/US0450100550',
 151         'only_matching': True,
 152     }]
 153     _VERSIONS = {
 154         0: 'youtube',  # only in AuthenticateVideo videoVersions
 155         1: 'level3',
 156         2: 'akamai',
 157         3: 'level3',
 158         4: 'amazon',
 159     }
 160
 161     def _initialize_api(self, video_id):
 162         webpage = self._download_webpage(
 163             'https://accounts.vevo.com/token', None,
 164             note='Retrieving oauth token',
 165             errnote='Unable to retrieve oauth token',
 166             data=json.dumps({
 167                 'client_id': 'SPupX1tvqFEopQ1YS6SS',
 168                 'grant_type': 'urn:vevo:params:oauth:grant-type:anonymous',
 169             }).encode('utf-8'),
 170             headers={
 171                 'Content-Type': 'application/json',
 172             })
 173
 174         if re.search(r'(?i)THIS PAGE IS CURRENTLY UNAVAILABLE IN YOUR REGION', webpage):
 175             self.raise_geo_restricted(
 176                 '%s said: This page is currently unavailable in your region' % self.IE_NAME)
 177
 178         auth_info = self._parse_json(webpage, video_id)
 179         self._api_url_template = self.http_scheme() + '//apiv2.vevo.com/%s?token=' + auth_info['legacy_token']
 180
 181     def _call_api(self, path, *args, **kwargs):
 182         try:
 183             data = self._download_json(self._api_url_template % path, *args, **kwargs)
 184         except ExtractorError as e:
 185             if isinstance(e.cause, HTTPError):
 186                 errors = self._parse_json(e.cause.response.read().decode(), None)['errors']
 187                 error_message = ', '.join([error['message'] for error in errors])
 188                 raise ExtractorError('%s said: %s' % (self.IE_NAME, error_message), expected=True)
 189             raise
 190         return data
 191
 192     def _real_extract(self, url):
 193         video_id = self._match_id(url)
 194
 195         self._initialize_api(video_id)
 196
 197         video_info = self._call_api(
 198             'video/%s' % video_id, video_id, 'Downloading api video info',
 199             'Failed to download video info')
 200
 201         video_versions = self._call_api(
 202             'video/%s/streams' % video_id, video_id,
 203             'Downloading video versions info',
 204             'Failed to download video versions info',
 205             fatal=False)
 206
 207         # Some videos are only available via webpage (e.g.
 208         # https://github.com/ytdl-org/youtube-dl/issues/9366)
 209         if not video_versions:
 210             webpage = self._download_webpage(url, video_id)
 211             json_data = self._extract_json(webpage, video_id)
 212             if 'streams' in json_data.get('default', {}):
 213                 video_versions = json_data['default']['streams'][video_id][0]
 214             else:
 215                 video_versions = [
 216                     value
 217                     for key, value in json_data['apollo']['data'].items()
 218                     if key.startswith('%s.streams' % video_id)]
 219
 220         uploader = None
 221         artist = None
 222         featured_artist = None
 223         artists = video_info.get('artists')
 224         for curr_artist in artists:
 225             if curr_artist.get('role') == 'Featured':
 226                 featured_artist = curr_artist['name']
 227             else:
 228                 artist = uploader = curr_artist['name']
 229
 230         formats = []
 231         for video_version in video_versions:
 232             version = self._VERSIONS.get(video_version.get('version'), 'generic')
 233             version_url = video_version.get('url')
 234             if not version_url:
 235                 continue
 236
 237             if '.ism' in version_url:
 238                 continue
 239             elif '.mpd' in version_url:
 240                 formats.extend(self._extract_mpd_formats(
 241                     version_url, video_id, mpd_id='dash-%s' % version,
 242                     note='Downloading %s MPD information' % version,
 243                     errnote='Failed to download %s MPD information' % version,
 244                     fatal=False))
 245             elif '.m3u8' in version_url:
 246                 formats.extend(self._extract_m3u8_formats(
 247                     version_url, video_id, 'mp4', 'm3u8_native',
 248                     m3u8_id='hls-%s' % version,
 249                     note='Downloading %s m3u8 information' % version,
 250                     errnote='Failed to download %s m3u8 information' % version,
 251                     fatal=False))
 252             else:
 253                 m = re.search(r'''(?xi)
 254                     _(?P<quality>[a-z0-9]+)
 255                     _(?P<width>[0-9]+)x(?P<height>[0-9]+)
 256                     _(?P<vcodec>[a-z0-9]+)
 257                     _(?P<vbr>[0-9]+)
 258                     _(?P<acodec>[a-z0-9]+)
 259                     _(?P<abr>[0-9]+)
 260                     \.(?P<ext>[a-z0-9]+)''', version_url)
 261                 if not m:
 262                     continue
 263
 264                 formats.append({
 265                     'url': version_url,
 266                     'format_id': f'http-{version}-{video_version.get("quality") or m.group("quality")}',
 267                     'vcodec': m.group('vcodec'),
 268                     'acodec': m.group('acodec'),
 269                     'vbr': int(m.group('vbr')),
 270                     'abr': int(m.group('abr')),
 271                     'ext': m.group('ext'),
 272                     'width': int(m.group('width')),
 273                     'height': int(m.group('height')),
 274                 })
 275
 276         track = video_info['title']
 277         if featured_artist:
 278             artist = '%s ft. %s' % (artist, featured_artist)
 279         title = '%s - %s' % (artist, track) if artist else track
 280
 281         genres = video_info.get('genres')
 282         genre = (
 283             genres[0] if genres and isinstance(genres, list)
 284             and isinstance(genres[0], compat_str) else None)
 285
 286         is_explicit = video_info.get('isExplicit')
 287         if is_explicit is True:
 288             age_limit = 18
 289         elif is_explicit is False:
 290             age_limit = 0
 291         else:
 292             age_limit = None
 293
 294         return {
 295             'id': video_id,
 296             'title': title,
 297             'formats': formats,
 298             'thumbnail': video_info.get('imageUrl') or video_info.get('thumbnailUrl'),
 299             'timestamp': parse_iso8601(video_info.get('releaseDate')),
 300             'uploader': uploader,
 301             'duration': int_or_none(video_info.get('duration')),
 302             'view_count': int_or_none(video_info.get('views', {}).get('total')),
 303             'age_limit': age_limit,
 304             'track': track,
 305             'artist': uploader,
 306             'genre': genre,
 307         }
 308
 309
 310 class VevoPlaylistIE(VevoBaseIE):
 311     _VALID_URL = r'https?://(?:www\.)?vevo\.com/watch/(?P<kind>playlist|genre)/(?P<id>[^/?#&]+)'
 312
 313     _TESTS = [{
 314         'url': 'http://www.vevo.com/watch/genre/rock',
 315         'info_dict': {
 316             'id': 'rock',
 317             'title': 'Rock',
 318         },
 319         'playlist_count': 20,
 320     }, {
 321         'url': 'http://www.vevo.com/watch/genre/rock?index=0',
 322         'only_matching': True,
 323     }]
 324
 325     def _real_extract(self, url):
 326         mobj = self._match_valid_url(url)
 327         playlist_id = mobj.group('id')
 328         playlist_kind = mobj.group('kind')
 329
 330         webpage = self._download_webpage(url, playlist_id)
 331
 332         qs = parse_qs(url)
 333         index = qs.get('index', [None])[0]
 334
 335         if index:
 336             video_id = self._search_regex(
 337                 r'<meta[^>]+content=(["\'])vevo://video/(?P<id>.+?)\1[^>]*>',
 338                 webpage, 'video id', default=None, group='id')
 339             if video_id:
 340                 return self.url_result('vevo:%s' % video_id, VevoIE.ie_key())
 341
 342         playlists = self._extract_json(webpage, playlist_id)['default']['%ss' % playlist_kind]
 343
 344         playlist = (list(playlists.values())[0]
 345                     if playlist_kind == 'playlist' else playlists[playlist_id])
 346
 347         entries = [
 348             self.url_result('vevo:%s' % src, VevoIE.ie_key())
 349             for src in playlist['isrcs']]
 350
 351         return self.playlist_result(
 352             entries, playlist.get('playlistId') or playlist_id,
 353             playlist.get('name'), playlist.get('description'))