get_element_by_attribute,
get_element_by_id,
int_or_none,
+ js_to_json,
mimetype2ext,
orderedSet,
parse_codecs,
+ parse_count,
parse_duration,
remove_quotes,
remove_start,
_LOGIN_REQUIRED = False
_PLAYLIST_ID_RE = r'(?:PL|LL|EC|UU|FL|RD|UL|TL|PU|OLAK5uy_)[0-9A-Za-z-_]{10,}'
+ _INITIAL_DATA_RE = r'(?:window\["ytInitialData"\]|ytInitialData)\W?=\W?({.*?});'
+ _YTCFG_DATA_RE = r"ytcfg.set\(({.*?})\)"
_YOUTUBE_CLIENT_HEADERS = {
'x-youtube-client-name': '1',
if username is None:
if self._LOGIN_REQUIRED and self._downloader.params.get('cookiefile') is None:
raise ExtractorError('No login info available, needed for using %s.' % self.IE_NAME, expected=True)
+ if self._downloader.params.get('cookiefile') and False: # TODO remove 'and False' later - too many people using outdated cookies and open issues, remind them.
+ self.to_screen('[Cookies] Reminder - Make sure to always use up to date cookies!')
return True
login_page = self._download_webpage(
def _download_webpage_handle(self, *args, **kwargs):
query = kwargs.get('query', {}).copy()
- query['disable_polymer'] = 'true'
kwargs['query'] = query
return super(YoutubeBaseInfoExtractor, self)._download_webpage_handle(
*args, **compat_kwargs(kwargs))
+ def _get_yt_initial_data(self, video_id, webpage):
+ config = self._search_regex(
+ (r'window\["ytInitialData"\]\s*=\s*(.*?)(?<=});',
+ r'var\s+ytInitialData\s*=\s*(.*?)(?<=});'),
+ webpage, 'ytInitialData', default=None)
+ if config:
+ return self._parse_json(
+ uppercase_escape(config), video_id, fatal=False)
+
def _real_initialize(self):
if self._downloader is None:
return
class YoutubeEntryListBaseInfoExtractor(YoutubeBaseInfoExtractor):
- # Extract entries from page with "Load more" button
- def _entries(self, page, playlist_id):
- more_widget_html = content_html = page
- for page_num in itertools.count(1):
- for entry in self._process_page(content_html):
+
+ def _find_entries_in_json(self, extracted):
+ entries = []
+ c = {}
+
+ def _real_find(obj):
+ if obj is None or isinstance(obj, str):
+ return
+
+ if type(obj) is list:
+ for elem in obj:
+ _real_find(elem)
+
+ if type(obj) is dict:
+ if self._is_entry(obj):
+ entries.append(obj)
+ return
+
+ if 'continuationCommand' in obj:
+ c['continuation'] = obj
+ return
+
+ for _, o in obj.items():
+ _real_find(o)
+
+ _real_find(extracted)
+
+ return entries, try_get(c, lambda x: x["continuation"])
+
+ def _entries(self, page, playlist_id, max_pages=None):
+ seen = []
+
+ yt_conf = {}
+ for m in re.finditer(self._YTCFG_DATA_RE, page):
+ parsed = self._parse_json(m.group(1), playlist_id,
+ transform_source=js_to_json, fatal=False)
+ if parsed:
+ yt_conf.update(parsed)
+
+ data_json = self._parse_json(self._search_regex(self._INITIAL_DATA_RE, page, 'ytInitialData'), None)
+
+ for page_num in range(1, max_pages + 1) if max_pages is not None else itertools.count(1):
+ entries, continuation = self._find_entries_in_json(data_json)
+ processed = self._process_entries(entries, seen)
+
+ if not processed:
+ break
+ for entry in processed:
yield entry
- mobj = re.search(r'data-uix-load-more-href="/?(?P<more>[^"]+)"', more_widget_html)
- if not mobj:
+ if not continuation or not yt_conf:
+ break
+ continuation_token = try_get(continuation, lambda x: x['continuationCommand']['token'])
+ continuation_url = try_get(continuation, lambda x: x['commandMetadata']['webCommandMetadata']['apiUrl'])
+ if not continuation_token or not continuation_url:
break
count = 0
try:
# Downloading page may result in intermittent 5xx HTTP error
# that is usually worked around with a retry
- more = self._download_json(
- 'https://www.youtube.com/%s' % mobj.group('more'), playlist_id,
- 'Downloading page #%s%s'
- % (page_num, ' (retry #%d)' % count if count else ''),
+ data_json = self._download_json(
+ 'https://www.youtube.com%s' % continuation_url,
+ playlist_id,
+ 'Downloading continuation page #%s%s' % (page_num, ' (retry #%d)' % count if count else ''),
+
transform_source=uppercase_escape,
- headers=self._YOUTUBE_CLIENT_HEADERS)
+ query={
+ 'key': try_get(yt_conf, lambda x: x['INNERTUBE_API_KEY'])
+ },
+ data=bytes(json.dumps({
+ 'context': try_get(yt_conf, lambda x: x['INNERTUBE_CONTEXT']),
+ 'continuation': continuation_token
+ }), encoding='utf-8'),
+ headers={
+ 'Content-Type': 'application/json'
+ }
+ )
break
except ExtractorError as e:
if isinstance(e.cause, compat_HTTPError) and e.cause.code in (500, 503):
continue
raise
- content_html = more['content_html']
- if not content_html.strip():
- # Some webpages show a "Load more" button but they don't
- # have more videos
- break
- more_widget_html = more['load_more_widget_html']
+ def _extract_title(self, renderer):
+ title = try_get(renderer, lambda x: x['title']['runs'][0]['text'], compat_str)
+ if title:
+ return title
+ return try_get(renderer, lambda x: x['title']['simpleText'], compat_str)
class YoutubePlaylistBaseInfoExtractor(YoutubeEntryListBaseInfoExtractor):
- def _process_page(self, content):
- for video_id, video_title in self.extract_videos_from_page(content):
- yield self.url_result(video_id, 'Youtube', video_id, video_title)
+ def _is_entry(self, obj):
+ return 'videoId' in obj
- def extract_videos_from_page_impl(self, video_re, page, ids_in_page, titles_in_page):
- for mobj in re.finditer(video_re, page):
- # The link with index 0 is not the first video of the playlist (not sure if still actual)
- if 'index' in mobj.groupdict() and mobj.group('id') == '0':
+ def _process_entries(self, entries, seen):
+ ids_in_page = []
+ titles_in_page = []
+ for renderer in entries:
+ video_id = try_get(renderer, lambda x: x['videoId'])
+ video_title = self._extract_title(renderer)
+
+ if video_id is None or video_title is None:
+ # we do not have a videoRenderer or title extraction broke
continue
- video_id = mobj.group('id')
- video_title = unescapeHTML(
- mobj.group('title')) if 'title' in mobj.groupdict() else None
- if video_title:
- video_title = video_title.strip()
- if video_title == '► Play all':
- video_title = None
+
+ video_title = video_title.strip()
+
try:
idx = ids_in_page.index(video_id)
if video_title and not titles_in_page[idx]:
ids_in_page.append(video_id)
titles_in_page.append(video_title)
- def extract_videos_from_page(self, page):
- ids_in_page = []
- titles_in_page = []
- self.extract_videos_from_page_impl(
- self._VIDEO_RE, page, ids_in_page, titles_in_page)
- return zip(ids_in_page, titles_in_page)
+ for video_id, video_title in zip(ids_in_page, titles_in_page):
+ yield self.url_result(video_id, 'Youtube', video_id, video_title)
class YoutubePlaylistsBaseInfoExtractor(YoutubeEntryListBaseInfoExtractor):
- def _process_page(self, content):
- for playlist_id in orderedSet(re.findall(
- r'<h3[^>]+class="[^"]*yt-lockup-title[^"]*"[^>]*><a[^>]+href="/?playlist\?list=([0-9A-Za-z-_]{10,})"',
- content)):
+ def _is_entry(self, obj):
+ return 'playlistId' in obj
+
+ def _process_entries(self, entries, seen):
+ for playlist_id in orderedSet(try_get(r, lambda x: x['playlistId']) for r in entries):
+
yield self.url_result(
'https://www.youtube.com/playlist?list=%s' % playlist_id, 'YoutubePlaylist')
'396': {'acodec': 'none', 'vcodec': 'av01.0.05M.08'},
'397': {'acodec': 'none', 'vcodec': 'av01.0.05M.08'},
}
- _SUBTITLE_FORMATS = ('srv1', 'srv2', 'srv3', 'ttml', 'vtt', 'json3')
+ _SUBTITLE_FORMATS = ('json3', 'srv1', 'srv2', 'srv3', 'ttml', 'vtt')
_GEO_BYPASS = False
'end_time': 9,
}
},
- {
- 'url': 'https://www.youtube.com/watch?v=UxxajLWwzqY',
- 'note': 'Test generic use_cipher_signature video (#897)',
- 'info_dict': {
- 'id': 'UxxajLWwzqY',
- 'ext': 'mp4',
- 'upload_date': '20120506',
- 'title': 'Icona Pop - I Love It (feat. Charli XCX) [OFFICIAL VIDEO]',
- 'alt_title': 'I Love It (feat. Charli XCX)',
- 'description': 'md5:19a2f98d9032b9311e686ed039564f63',
- 'tags': ['Icona Pop i love it', 'sweden', 'pop music', 'big beat records', 'big beat', 'charli',
- 'xcx', 'charli xcx', 'girls', 'hbo', 'i love it', "i don't care", 'icona', 'pop',
- 'iconic ep', 'iconic', 'love', 'it'],
- 'duration': 180,
- 'uploader': 'Icona Pop',
- 'uploader_id': 'IconaPop',
- 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/IconaPop',
- 'creator': 'Icona Pop',
- 'track': 'I Love It (feat. Charli XCX)',
- 'artist': 'Icona Pop',
- }
- },
- {
- 'url': 'https://www.youtube.com/watch?v=07FYdnEawAQ',
- 'note': 'Test VEVO video with age protection (#956)',
- 'info_dict': {
- 'id': '07FYdnEawAQ',
- 'ext': 'mp4',
- 'upload_date': '20130703',
- 'title': 'Justin Timberlake - Tunnel Vision (Official Music Video) (Explicit)',
- 'alt_title': 'Tunnel Vision',
- 'description': 'md5:07dab3356cde4199048e4c7cd93471e1',
- 'duration': 419,
- 'uploader': 'justintimberlakeVEVO',
- 'uploader_id': 'justintimberlakeVEVO',
- 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/justintimberlakeVEVO',
- 'creator': 'Justin Timberlake',
- 'track': 'Tunnel Vision',
- 'artist': 'Justin Timberlake',
- 'age_limit': 18,
- }
- },
{
'url': '//www.YouTube.com/watch?v=yZIXLfi8CZQ',
'note': 'Embed-only video (#1746)',
},
'skip': 'format 141 not served anymore',
},
- # DASH manifest with encrypted signature
- {
- 'url': 'https://www.youtube.com/watch?v=IB3lcPjvWLA',
- 'info_dict': {
- 'id': 'IB3lcPjvWLA',
- 'ext': 'm4a',
- 'title': 'Afrojack, Spree Wilson - The Spark (Official Music Video) ft. Spree Wilson',
- 'description': 'md5:8f5e2b82460520b619ccac1f509d43bf',
- 'duration': 244,
- 'uploader': 'AfrojackVEVO',
- 'uploader_id': 'AfrojackVEVO',
- 'upload_date': '20131011',
- },
- 'params': {
- 'youtube_include_dash_manifest': True,
- 'format': '141/bestaudio[ext=m4a]',
- },
- },
- # JS player signature function name containing $
- {
- 'url': 'https://www.youtube.com/watch?v=nfWlot6h_JM',
- 'info_dict': {
- 'id': 'nfWlot6h_JM',
- 'ext': 'm4a',
- 'title': 'Taylor Swift - Shake It Off',
- 'description': 'md5:307195cd21ff7fa352270fe884570ef0',
- 'duration': 242,
- 'uploader': 'TaylorSwiftVEVO',
- 'uploader_id': 'TaylorSwiftVEVO',
- 'upload_date': '20140818',
- },
- 'params': {
- 'youtube_include_dash_manifest': True,
- 'format': '141/bestaudio[ext=m4a]',
- },
- },
# Controversy video
{
'url': 'https://www.youtube.com/watch?v=T4XJQO3qol8',
'description': 'SUBSCRIBE: http://www.youtube.com/saturninefilms\n\nEven Obama has taken a stand against freedom on this issue: http://www.huffingtonpost.com/2010/09/09/obama-gma-interview-quran_n_710282.html',
}
},
- # Normal age-gate video (No vevo, embed allowed)
+ # Normal age-gate video (embed allowed)
{
'url': 'https://youtube.com/watch?v=HtVdAasjOgU',
'info_dict': {
'age_limit': 18,
},
},
- # Age-gate video with encrypted signature
- {
- 'url': 'https://www.youtube.com/watch?v=6kLq3WMV1nU',
- 'info_dict': {
- 'id': '6kLq3WMV1nU',
- 'ext': 'mp4',
- 'title': 'Dedication To My Ex (Miss That) (Lyric Video)',
- 'description': 'md5:33765bb339e1b47e7e72b5490139bb41',
- 'duration': 246,
- 'uploader': 'LloydVEVO',
- 'uploader_id': 'LloydVEVO',
- 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/LloydVEVO',
- 'upload_date': '20110629',
- 'age_limit': 18,
- },
- },
- # video_info is None (https://github.com/ytdl-org/youtube-dl/issues/4421)
- # YouTube Red ad is not captured for creator
- {
- 'url': '__2ABJjxzNo',
- 'info_dict': {
- 'id': '__2ABJjxzNo',
- 'ext': 'mp4',
- 'duration': 266,
- 'upload_date': '20100430',
- 'uploader_id': 'deadmau5',
- 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/deadmau5',
- 'creator': 'Dada Life, deadmau5',
- 'description': 'md5:12c56784b8032162bb936a5f76d55360',
- 'uploader': 'deadmau5',
- 'title': 'Deadmau5 - Some Chords (HD)',
- 'alt_title': 'This Machine Kills Some Chords',
- },
- 'expected_warnings': [
- 'DASH manifest missing',
- ]
- },
# Olympics (https://github.com/ytdl-org/youtube-dl/issues/4431)
{
'url': 'lqQg6PlCWgI',
'params': {
'skip_download': True,
},
- }
+ },
+ {
+ # empty description results in an empty string
+ 'url': 'https://www.youtube.com/watch?v=x41yOUIvK2k',
+ 'info_dict': {
+ 'id': 'x41yOUIvK2k',
+ 'ext': 'mp4',
+ 'title': 'IMG 3456',
+ 'description': '',
+ 'upload_date': '20170613',
+ 'uploader_id': 'ElevageOrVert',
+ 'uploader': 'ElevageOrVert',
+ },
+ 'params': {
+ 'skip_download': True,
+ },
+ },
]
def __init__(self, *args, **kwargs):
# https://github.com/ytdl-org/youtube-dl/pull/7599)
r';ytplayer\.config\s*=\s*({.+?});ytplayer',
r';ytplayer\.config\s*=\s*({.+?});',
+ r'ytInitialPlayerResponse\s*=\s*({.+?});var meta'
)
config = self._search_regex(
patterns, webpage, 'ytplayer.config', default=None)
return self._parse_json(
uppercase_escape(config), video_id, fatal=False)
- def _get_yt_initial_data(self, video_id, webpage):
- config = self._search_regex(
- (r'window\["ytInitialData"\]\s*=\s*(.*?)(?<=});',
- r'var\s+ytInitialData\s*=\s*(.*?)(?<=});'),
- webpage, 'ytInitialData', default=None)
- if config:
- return self._parse_json(
- uppercase_escape(config), video_id, fatal=False)
+ def _get_music_metadata_from_yt_initial(self, yt_initial):
+ music_metadata = []
+ key_map = {
+ 'Album': 'album',
+ 'Artist': 'artist',
+ 'Song': 'track'
+ }
+ contents = try_get(yt_initial, lambda x: x['contents']['twoColumnWatchNextResults']['results']['results']['contents'])
+ if type(contents) is list:
+ for content in contents:
+ music_track = {}
+ if type(content) is not dict:
+ continue
+ videoSecondaryInfoRenderer = try_get(content, lambda x: x['videoSecondaryInfoRenderer'])
+ if type(videoSecondaryInfoRenderer) is not dict:
+ continue
+ rows = try_get(videoSecondaryInfoRenderer, lambda x: x['metadataRowContainer']['metadataRowContainerRenderer']['rows'])
+ if type(rows) is not list:
+ continue
+ for row in rows:
+ metadataRowRenderer = try_get(row, lambda x: x['metadataRowRenderer'])
+ if type(metadataRowRenderer) is not dict:
+ continue
+ key = try_get(metadataRowRenderer, lambda x: x['title']['simpleText'])
+ value = try_get(metadataRowRenderer, lambda x: x['contents'][0]['simpleText']) or \
+ try_get(metadataRowRenderer, lambda x: x['contents'][0]['runs'][0]['text'])
+ if type(key) is not str or type(value) is not str:
+ continue
+ if key in key_map:
+ if key_map[key] in music_track:
+ # we've started on a new track
+ music_metadata.append(music_track)
+ music_track = {}
+ music_track[key_map[key]] = value
+ if len(music_track.keys()):
+ music_metadata.append(music_track)
+ return music_metadata
def _get_automatic_captions(self, video_id, webpage):
"""We need the webpage for getting the captions url, pass it as an
self._downloader.report_warning(err_msg)
return {}
try:
- args = player_config['args']
- caption_url = args.get('ttsurl')
- if caption_url:
+ if "args" in player_config and "ttsurl" in player_config["args"]:
+ args = player_config['args']
+ caption_url = args['ttsurl']
timestamp = args['timestamp']
+
# We get the available subtitles
list_params = compat_urllib_parse_urlencode({
'type': 'list',
return captions
# New captions format as of 22.06.2017
- player_response = args.get('player_response')
- if player_response and isinstance(player_response, compat_str):
- player_response = self._parse_json(
- player_response, video_id, fatal=False)
- if player_response:
- renderer = player_response['captions']['playerCaptionsTracklistRenderer']
- caption_tracks = renderer['captionTracks']
- for caption_track in caption_tracks:
- if 'kind' not in caption_track:
- # not an automatic transcription
- continue
- base_url = caption_track['baseUrl']
- sub_lang_list = []
- for lang in renderer['translationLanguages']:
- lang_code = lang.get('languageCode')
- if lang_code:
- sub_lang_list.append(lang_code)
- return make_captions(base_url, sub_lang_list)
-
- self._downloader.report_warning("Couldn't find automatic captions for %s" % video_id)
- return {}
- # Some videos don't provide ttsurl but rather caption_tracks and
- # caption_translation_languages (e.g. 20LmZk1hakA)
- # Does not used anymore as of 22.06.2017
- caption_tracks = args['caption_tracks']
- caption_translation_languages = args['caption_translation_languages']
- caption_url = compat_parse_qs(caption_tracks.split(',')[0])['u'][0]
- sub_lang_list = []
- for lang in caption_translation_languages.split(','):
- lang_qs = compat_parse_qs(compat_urllib_parse_unquote_plus(lang))
- sub_lang = lang_qs.get('lc', [None])[0]
- if sub_lang:
- sub_lang_list.append(sub_lang)
- return make_captions(caption_url, sub_lang_list)
+ if "args" in player_config:
+ player_response = player_config["args"].get('player_response')
+ else:
+ # New player system (ytInitialPlayerResponse) as of October 2020
+ player_response = player_config
+
+ if player_response:
+ if isinstance(player_response, compat_str):
+ player_response = self._parse_json(
+ player_response, video_id, fatal=False)
+
+ renderer = player_response['captions']['playerCaptionsTracklistRenderer']
+ caption_tracks = renderer['captionTracks']
+ for caption_track in caption_tracks:
+ if 'kind' not in caption_track:
+ # not an automatic transcription
+ continue
+ base_url = caption_track['baseUrl']
+ sub_lang_list = []
+ for lang in renderer['translationLanguages']:
+ lang_code = lang.get('languageCode')
+ if lang_code:
+ sub_lang_list.append(lang_code)
+ return make_captions(base_url, sub_lang_list)
+
+ self._downloader.report_warning("Couldn't find automatic captions for %s" % video_id)
+ return {}
+
+ if "args" in player_config:
+ args = player_config["args"]
+
+ # Some videos don't provide ttsurl but rather caption_tracks and
+ # caption_translation_languages (e.g. 20LmZk1hakA)
+ # Does not used anymore as of 22.06.2017
+ caption_tracks = args['caption_tracks']
+ caption_translation_languages = args['caption_translation_languages']
+ caption_url = compat_parse_qs(caption_tracks.split(',')[0])['u'][0]
+ sub_lang_list = []
+ for lang in caption_translation_languages.split(','):
+ lang_qs = compat_parse_qs(compat_urllib_parse_unquote_plus(lang))
+ sub_lang = lang_qs.get('lc', [None])[0]
+ if sub_lang:
+ sub_lang_list.append(sub_lang)
+ return make_captions(caption_url, sub_lang_list)
# An extractor error can be raise by the download process if there are
# no automatic captions but there are subtitles
except (KeyError, IndexError, ExtractorError):
add_dash_mpd_pr(pl_response)
return pl_response
+ def extract_embedded_config(embed_webpage, video_id):
+ embedded_config = self._search_regex(
+ r'setConfig\(({.*})\);',
+ embed_webpage, 'ytInitialData', default=None)
+ if embedded_config:
+ return embedded_config
+
player_response = {}
# Get video info
video_info = {}
embed_webpage = None
- if self._html_search_meta('og:restrictions:age', video_webpage, default=None) == "18+":
+ if (self._og_search_property('restrictions:age', video_webpage, default=None) == '18+'
+ or re.search(r'player-age-gate-content">', video_webpage) is not None):
+ cookie_keys = self._get_cookies('https://www.youtube.com').keys()
age_gate = True
# We simulate the access to the video from www.youtube.com/v/{video_id}
# this can be viewed without login into Youtube
url = proto + '://www.youtube.com/embed/%s' % video_id
embed_webpage = self._download_webpage(url, video_id, 'Downloading embed webpage')
- data = compat_urllib_parse_urlencode({
- 'video_id': video_id,
- 'eurl': 'https://youtube.googleapis.com/v/' + video_id,
- 'sts': self._search_regex(
- r'"sts"\s*:\s*(\d+)', embed_webpage, 'sts', default=''),
- })
- video_info_url = proto + '://www.youtube.com/get_video_info?' + data
- try:
- video_info_webpage = self._download_webpage(
- video_info_url, video_id,
- note='Refetching age-gated info webpage',
- errnote='unable to download video info webpage')
- except ExtractorError:
- video_info_webpage = None
- if video_info_webpage:
- video_info = compat_parse_qs(video_info_webpage)
- pl_response = video_info.get('player_response', [None])[0]
- player_response = extract_player_response(pl_response, video_id)
- add_dash_mpd(video_info)
- view_count = extract_view_count(video_info)
+ ext = extract_embedded_config(embed_webpage, video_id)
+ # playabilityStatus = re.search(r'{\\\"status\\\":\\\"(?P<playabilityStatus>[^\"]+)\\\"', ext)
+ playable_in_embed = re.search(r'{\\\"playableInEmbed\\\":(?P<playableinEmbed>[^\,]+)', ext)
+ if not playable_in_embed:
+ self.to_screen('Could not determine whether playabale in embed for video %s' % video_id)
+ playable_in_embed = ''
+ else:
+ playable_in_embed = playable_in_embed.group('playableinEmbed')
+ # check if video is only playable on youtube in other words not playable in embed - if so it requires auth (cookies)
+ # if re.search(r'player-unavailable">', embed_webpage) is not None:
+ if playable_in_embed == 'false':
+ '''
+ # TODO apply this patch when Support for Python 2.6(!) and above drops
+ if ({'VISITOR_INFO1_LIVE', 'HSID', 'SSID', 'SID'} <= cookie_keys
+ or {'VISITOR_INFO1_LIVE', '__Secure-3PSID', 'LOGIN_INFO'} <= cookie_keys):
+ '''
+ if (set(('VISITOR_INFO1_LIVE', 'HSID', 'SSID', 'SID')) <= set(cookie_keys)
+ or set(('VISITOR_INFO1_LIVE', '__Secure-3PSID', 'LOGIN_INFO')) <= set(cookie_keys)):
+ age_gate = False
+ # Try looking directly into the video webpage
+ ytplayer_config = self._get_ytplayer_config(video_id, video_webpage)
+ if ytplayer_config:
+ args = ytplayer_config.get("args")
+ if args is not None:
+ if args.get('url_encoded_fmt_stream_map') or args.get('hlsvp'):
+ # Convert to the same format returned by compat_parse_qs
+ video_info = dict((k, [v]) for k, v in args.items())
+ add_dash_mpd(video_info)
+ # Rental video is not rented but preview is available (e.g.
+ # https://www.youtube.com/watch?v=yYr8q0y5Jfg,
+ # https://github.com/ytdl-org/youtube-dl/issues/10532)
+ if not video_info and args.get('ypc_vid'):
+ return self.url_result(
+ args['ypc_vid'], YoutubeIE.ie_key(), video_id=args['ypc_vid'])
+ if args.get('livestream') == '1' or args.get('live_playback') == 1:
+ is_live = True
+ if not player_response:
+ player_response = extract_player_response(args.get('player_response'), video_id)
+ elif not player_response:
+ player_response = ytplayer_config
+ if not video_info or self._downloader.params.get('youtube_include_dash_manifest', True):
+ add_dash_mpd_pr(player_response)
+ else:
+ raise ExtractorError('Video is age restricted and only playable on Youtube. Requires cookies!', expected=True)
+ else:
+ data = compat_urllib_parse_urlencode({
+ 'video_id': video_id,
+ 'eurl': 'https://youtube.googleapis.com/v/' + video_id,
+ 'sts': self._search_regex(
+ r'"sts"\s*:\s*(\d+)', embed_webpage, 'sts', default=''),
+ })
+ video_info_url = proto + '://www.youtube.com/get_video_info?' + data
+ try:
+ video_info_webpage = self._download_webpage(
+ video_info_url, video_id,
+ note='Refetching age-gated info webpage',
+ errnote='unable to download video info webpage')
+ except ExtractorError:
+ video_info_webpage = None
+ if video_info_webpage:
+ video_info = compat_parse_qs(video_info_webpage)
+ pl_response = video_info.get('player_response', [None])[0]
+ player_response = extract_player_response(pl_response, video_id)
+ add_dash_mpd(video_info)
+ view_count = extract_view_count(video_info)
else:
age_gate = False
# Try looking directly into the video webpage
ytplayer_config = self._get_ytplayer_config(video_id, video_webpage)
- if ytplayer_config:
- args = ytplayer_config['args']
+ args = ytplayer_config.get("args")
+ if args is not None:
if args.get('url_encoded_fmt_stream_map') or args.get('hlsvp'):
# Convert to the same format returned by compat_parse_qs
video_info = dict((k, [v]) for k, v in args.items())
is_live = True
if not player_response:
player_response = extract_player_response(args.get('player_response'), video_id)
+ elif not player_response:
+ player_response = ytplayer_config
if not video_info or self._downloader.params.get('youtube_include_dash_manifest', True):
add_dash_mpd_pr(player_response)
''', replace_url, video_description)
video_description = clean_html(video_description)
else:
- video_description = video_details.get('shortDescription') or self._html_search_meta('description', video_webpage)
+ video_description = video_details.get('shortDescription')
+ if video_description is None:
+ video_description = self._html_search_meta('description', video_webpage)
if not smuggled_data.get('force_singlefeed', False):
if not self._downloader.params.get('noplaylist'):
if cipher:
if 's' in url_data or self._downloader.params.get('youtube_include_dash_manifest', True):
- ASSETS_RE = r'"assets":.+?"js":\s*("[^"]+")'
+ ASSETS_RE = r'(?:"assets":.+?"js":\s*("[^"]+"))|(?:"jsUrl":\s*("[^"]+"))'
jsplayer_url_json = self._search_regex(
ASSETS_RE,
embed_webpage if age_gate else video_webpage,
a_format['player_url'] = player_url
# Accept-Encoding header causes failures in live streams on Youtube and Youtube Gaming
a_format.setdefault('http_headers', {})['Youtubedl-no-compression'] = 'True'
- formats.append(a_format)
+ if self._downloader.params.get('youtube_include_hls_manifest', True):
+ formats.append(a_format)
else:
error_message = extract_unavailable_message()
if not error_message:
if release_year:
release_year = int(release_year)
+ yt_initial = self._get_yt_initial_data(video_id, video_webpage)
+ if yt_initial:
+ music_metadata = self._get_music_metadata_from_yt_initial(yt_initial)
+ if len(music_metadata):
+ album = music_metadata[0].get('album')
+ artist = music_metadata[0].get('artist')
+ track = music_metadata[0].get('track')
+
m_episode = re.search(
r'<div[^>]+id="watch7-headline"[^>]*>\s*<span[^>]*>.*?>(?P<series>[^<]+)</a></b>\s*S(?P<season>\d+)\s*•\s*E(?P<episode>\d+)</span>',
video_webpage)
def _extract_count(count_name):
return str_to_int(self._search_regex(
- r'-%s-button[^>]+><span[^>]+class="yt-uix-button-content"[^>]*>([\d,]+)</span>'
+ r'"accessibilityData":\{"label":"([\d,\w]+) %ss"\}'
% re.escape(count_name),
video_webpage, count_name, default=None))
video_duration = parse_duration(self._html_search_meta(
'duration', video_webpage, 'video duration'))
+ # Get Subscriber Count of channel
+ subscriber_count = parse_count(self._search_regex(
+ r'"text":"([\d\.]+\w?) subscribers"',
+ video_webpage,
+ 'subscriber count',
+ default=None
+ ))
+
# annotations
video_annotations = None
if self._downloader.params.get('writeannotations', False):
'album': album,
'release_date': release_date,
'release_year': release_year,
+ 'subscriber_count': subscriber_count,
}
_VIDEO_RE_TPL = r'href="\s*/watch\?v=%s(?:&(?:[^"]*?index=(?P<index>\d+))?(?:[^>]+>(?P<title>[^<]+))?)?'
_VIDEO_RE = _VIDEO_RE_TPL % r'(?P<id>[0-9A-Za-z_-]{11})'
IE_NAME = 'youtube:playlist'
+ _YTM_PLAYLIST_PREFIX = 'RDCLAK5uy_'
+ _YTM_CHANNEL_INFO = {
+ 'uploader': 'Youtube Music',
+ 'uploader_id': 'music', # or "UC-9-kyTW8ZkZNDHQJ6FgpwQ"
+ 'uploader_url': 'https://www.youtube.com/music'
+ }
_TESTS = [{
'url': 'https://www.youtube.com/playlist?list=PL4lCao7KL_QFVb7Iudeipvc2BCavECqzc',
'info_dict': {
return zip(ids_in_page, titles_in_page)
+ def _extract_mix_ids_from_yt_initial(self, yt_initial):
+ ids = []
+ playlist_contents = try_get(yt_initial, lambda x: x['contents']['twoColumnWatchNextResults']['playlist']['playlist']['contents'], list)
+ if playlist_contents:
+ for item in playlist_contents:
+ videoId = try_get(item, lambda x: x['playlistPanelVideoRenderer']['videoId'], compat_str)
+ if videoId:
+ ids.append(videoId)
+ return ids
+
def _extract_mix(self, playlist_id):
# The mixes are generated from a single video
# the id of the playlist is just 'RD' + video_id
ids = []
+ yt_initial = None
last_id = playlist_id[-11:]
for n in itertools.count(1):
url = 'https://www.youtube.com/watch?v=%s&list=%s' % (last_id, playlist_id)
r'''(?xs)data-video-username=".*?".*?
href="/watch\?v=([0-9A-Za-z_-]{11})&[^"]*?list=%s''' % re.escape(playlist_id),
webpage))
+
+ # if no ids in html of page, try using embedded json
+ if (len(new_ids) == 0):
+ yt_initial = self._get_yt_initial_data(playlist_id, webpage)
+ if yt_initial:
+ new_ids = self._extract_mix_ids_from_yt_initial(yt_initial)
+
# Fetch new pages until all the videos are repeated, it seems that
# there are always 51 unique videos.
new_ids = [_id for _id in new_ids if _id not in ids]
or search_title('title'))
title = clean_html(title_span)
+ if not title:
+ title = try_get(yt_initial, lambda x: x['contents']['twoColumnWatchNextResults']['playlist']['playlist']['title'], compat_str)
+
return self.playlist_result(url_results, playlist_id, title)
def _extract_playlist(self, playlist_id):
'uploader_id': uploader_id,
'uploader_url': uploader_url,
})
+ if playlist_id.startswith(self._YTM_PLAYLIST_PREFIX):
+ playlist.update(self._YTM_CHANNEL_INFO)
return has_videos, playlist
return video
if playlist_id.startswith(('RD', 'UL', 'PU')):
- # Mixes require a custom extraction process
- return self._extract_mix(playlist_id)
+ if not playlist_id.startswith(self._YTM_PLAYLIST_PREFIX):
+ # Mixes require a custom extraction process,
+ # Youtube Music playlists act like normal playlists (with randomized order)
+ return self._extract_mix(playlist_id)
has_videos, playlist = self._extract_playlist(playlist_id)
if has_videos or not video_id:
class YoutubeUserIE(YoutubeChannelIE):
IE_DESC = 'YouTube.com user videos (URL or "ytuser" keyword)'
- _VALID_URL = r'(?:(?:https?://(?:\w+\.)?youtube\.com/(?:(?P<user>user|c)/)?(?!(?:attribution_link|watch|results|shared)(?:$|[^a-z_A-Z0-9-])))|ytuser:)(?!feed/)(?P<id>[A-Za-z0-9_-]+)'
+ _VALID_URL = r'(?:(?:https?://(?:\w+\.)?youtube\.com/(?:(?P<user>user|c)/)?(?!(?:attribution_link|watch|results|shared)(?:$|[^a-z_A-Z0-9%-])))|ytuser:)(?!feed/)(?P<id>[A-Za-z0-9_%-]+)'
_TEMPLATE_URL = 'https://www.youtube.com/%s/%s/videos'
IE_NAME = 'youtube:user'
}, {
'url': 'https://www.youtube.com/c/gametrailers',
'only_matching': True,
+ }, {
+ 'url': 'https://www.youtube.com/c/Pawe%C5%82Zadro%C5%BCniak',
+ 'only_matching': True,
}, {
'url': 'https://www.youtube.com/gametrailers',
'only_matching': True,
}]
-class YoutubeSearchBaseInfoExtractor(YoutubePlaylistBaseInfoExtractor):
- _VIDEO_RE = r'href="\s*/watch\?v=(?P<id>[0-9A-Za-z_-]{11})(?:[^"]*"[^>]+\btitle="(?P<title>[^"]+))?'
-
-
-class YoutubeSearchIE(SearchInfoExtractor, YoutubeSearchBaseInfoExtractor):
+class YoutubeSearchIE(SearchInfoExtractor, YoutubePlaylistBaseInfoExtractor):
IE_DESC = 'YouTube.com searches'
# there doesn't appear to be a real limit, for example if you search for
# 'python' you get more than 8.000.000 results
_MAX_RESULTS = float('inf')
IE_NAME = 'youtube:search'
_SEARCH_KEY = 'ytsearch'
- _EXTRA_QUERY_ARGS = {}
+ _SEARCH_PARAMS = None
_TESTS = []
- def _get_n_results(self, query, n):
- """Get a specified number of results for a query"""
-
- videos = []
- limit = n
-
- url_query = {
- 'search_query': query.encode('utf-8'),
+ def _entries(self, query, n):
+ data = {
+ 'context': {
+ 'client': {
+ 'clientName': 'WEB',
+ 'clientVersion': '2.20201021.03.00',
+ }
+ },
+ 'query': query,
}
- url_query.update(self._EXTRA_QUERY_ARGS)
- result_url = 'https://www.youtube.com/results?' + compat_urllib_parse_urlencode(url_query)
-
- for pagenum in itertools.count(1):
- data = self._download_json(
- result_url, video_id='query "%s"' % query,
- note='Downloading page %s' % pagenum,
- errnote='Unable to download API page',
- query={'spf': 'navigate'})
- html_content = data[1]['body']['content']
-
- if 'class="search-message' in html_content:
- raise ExtractorError(
- '[youtube] No video results', expected=True)
-
- new_videos = list(self._process_page(html_content))
- videos += new_videos
- if not new_videos or len(videos) > limit:
+ if self._SEARCH_PARAMS:
+ data['params'] = self._SEARCH_PARAMS
+ total = 0
+ for page_num in itertools.count(1):
+ search = self._download_json(
+ 'https://www.youtube.com/youtubei/v1/search?key=AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8',
+ video_id='query "%s"' % query,
+ note='Downloading page %s' % page_num,
+ errnote='Unable to download API page', fatal=False,
+ data=json.dumps(data).encode('utf8'),
+ headers={'content-type': 'application/json'})
+ if not search:
+ break
+ slr_contents = try_get(
+ search,
+ (lambda x: x['contents']['twoColumnSearchResultsRenderer']['primaryContents']['sectionListRenderer']['contents'],
+ lambda x: x['onResponseReceivedCommands'][0]['appendContinuationItemsAction']['continuationItems']),
+ list)
+ if not slr_contents:
break
- next_link = self._html_search_regex(
- r'href="(/results\?[^"]*\bsp=[^"]+)"[^>]*>\s*<span[^>]+class="[^"]*\byt-uix-button-content\b[^"]*"[^>]*>Next',
- html_content, 'next link', default=None)
- if next_link is None:
+ isr_contents = try_get(
+ slr_contents,
+ lambda x: x[0]['itemSectionRenderer']['contents'],
+ list)
+ if not isr_contents:
+ break
+ for content in isr_contents:
+ if not isinstance(content, dict):
+ continue
+ video = content.get('videoRenderer')
+ if not isinstance(video, dict):
+ continue
+ video_id = video.get('videoId')
+ if not video_id:
+ continue
+ title = try_get(video, lambda x: x['title']['runs'][0]['text'], compat_str)
+ description = try_get(video, lambda x: x['descriptionSnippet']['runs'][0]['text'], compat_str)
+ duration = parse_duration(try_get(video, lambda x: x['lengthText']['simpleText'], compat_str))
+ view_count_text = try_get(video, lambda x: x['viewCountText']['simpleText'], compat_str) or ''
+ view_count = int_or_none(self._search_regex(
+ r'^(\d+)', re.sub(r'\s', '', view_count_text),
+ 'view count', default=None))
+ uploader = try_get(video, lambda x: x['ownerText']['runs'][0]['text'], compat_str)
+ total += 1
+ yield {
+ '_type': 'url_transparent',
+ 'ie_key': YoutubeIE.ie_key(),
+ 'id': video_id,
+ 'url': video_id,
+ 'title': title,
+ 'description': description,
+ 'duration': duration,
+ 'view_count': view_count,
+ 'uploader': uploader,
+ }
+ if total == n:
+ return
+ token = try_get(
+ slr_contents,
+ lambda x: x[1]['continuationItemRenderer']['continuationEndpoint']['continuationCommand']['token'],
+ compat_str)
+ if not token:
break
- result_url = compat_urlparse.urljoin('https://www.youtube.com/', next_link)
+ data['continuation'] = token
- if len(videos) > n:
- videos = videos[:n]
- return self.playlist_result(videos, query)
+ def _get_n_results(self, query, n):
+ """Get a specified number of results for a query"""
+ return self.playlist_result(self._entries(query, n), query)
class YoutubeSearchDateIE(YoutubeSearchIE):
IE_NAME = YoutubeSearchIE.IE_NAME + ':date'
_SEARCH_KEY = 'ytsearchdate'
IE_DESC = 'YouTube.com searches, newest videos first'
- _EXTRA_QUERY_ARGS = {'search_sort': 'video_date_uploaded'}
+ _SEARCH_PARAMS = 'CAI%3D'
-class YoutubeSearchURLIE(YoutubeSearchBaseInfoExtractor):
+class YoutubeSearchURLIE(YoutubePlaylistBaseInfoExtractor):
IE_DESC = 'YouTube.com search URLs'
IE_NAME = 'youtube:search_url'
_VALID_URL = r'https?://(?:www\.)?youtube\.com/results\?(.*?&)?(?:search_query|q)=(?P<query>[^&]+)(?:[&]|$)'
'only_matching': True,
}]
+ def _process_json_dict(self, obj, videos, c):
+ if "videoId" in obj:
+ videos.append(obj)
+ return
+
+ if "nextContinuationData" in obj:
+ c["continuation"] = obj["nextContinuationData"]
+ return
+
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
query = compat_urllib_parse_unquote_plus(mobj.group('query'))
webpage = self._download_webpage(url, query)
- return self.playlist_result(self._process_page(webpage), playlist_title=query)
+ return self.playlist_result(self._entries(webpage, query, max_pages=0), playlist_title=query)
class YoutubeShowIE(YoutubePlaylistsBaseInfoExtractor):
'https://www.youtube.com/show/%s/playlists' % playlist_id)
-class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor):
+class YoutubeFeedsInfoExtractor(YoutubePlaylistBaseInfoExtractor):
"""
Base class for feed extractors
Subclasses must define the _FEED_NAME and _PLAYLIST_TITLE properties.
def _real_initialize(self):
self._login()
- def _entries(self, page):
- # The extraction process is the same as for playlists, but the regex
- # for the video ids doesn't contain an index
- ids = []
- more_widget_html = content_html = page
- for page_num in itertools.count(1):
- matches = re.findall(r'href="\s*/watch\?v=([0-9A-Za-z_-]{11})', content_html)
-
- # 'recommended' feed has infinite 'load more' and each new portion spins
- # the same videos in (sometimes) slightly different order, so we'll check
- # for unicity and break when portion has no new videos
- new_ids = list(filter(lambda video_id: video_id not in ids, orderedSet(matches)))
- if not new_ids:
- break
+ def _process_entries(self, entries, seen):
+ new_info = []
+ for v in entries:
+ v_id = try_get(v, lambda x: x['videoId'])
+ if not v_id:
+ continue
- ids.extend(new_ids)
+ have_video = False
+ for old in seen:
+ if old['videoId'] == v_id:
+ have_video = True
+ break
- for entry in self._ids_to_results(new_ids):
- yield entry
+ if not have_video:
+ new_info.append(v)
- mobj = re.search(r'data-uix-load-more-href="/?(?P<more>[^"]+)"', more_widget_html)
- if not mobj:
- break
+ if not new_info:
+ return
- more = self._download_json(
- 'https://www.youtube.com/%s' % mobj.group('more'), self._PLAYLIST_TITLE,
- 'Downloading page #%s' % page_num,
- transform_source=uppercase_escape,
- headers=self._YOUTUBE_CLIENT_HEADERS)
- content_html = more['content_html']
- more_widget_html = more['load_more_widget_html']
+ seen.extend(new_info)
+ for video in new_info:
+ yield self.url_result(try_get(video, lambda x: x['videoId']), YoutubeIE.ie_key(), video_title=self._extract_title(video))
def _real_extract(self, url):
page = self._download_webpage(
'https://www.youtube.com/feed/%s' % self._FEED_NAME,
self._PLAYLIST_TITLE)
- return self.playlist_result(
- self._entries(page), playlist_title=self._PLAYLIST_TITLE)
+ return self.playlist_result(self._entries(page, self._PLAYLIST_TITLE),
+ playlist_title=self._PLAYLIST_TITLE)
class YoutubeWatchLaterIE(YoutubePlaylistIE):