)
+def parse_qs(url):
+ return compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query)
+
+
class YoutubeBaseInfoExtractor(InfoExtractor):
"""Provide base functions for Youtube extractors"""
_LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
_RESERVED_NAMES = (
r'channel|c|user|playlist|watch|w|v|embed|e|watch_popup|'
- r'movies|results|shared|hashtag|trending|feed|feeds|'
+ r'movies|results|shared|hashtag|trending|feed|feeds|oembed|'
r'storefront|oops|index|account|reporthistory|t/terms|about|upload|signin|logout')
_NETRC_MACHINE = 'youtube'
_PLAYLIST_ID_RE = r'(?:(?:PL|LL|EC|UU|FL|RD|UL|TL|PU|OLAK5uy_)[0-9A-Za-z-_]{10,}|RDMM|WL|LL|LM)'
- def _ids_to_results(self, ids):
- return [
- self.url_result(vid_id, 'Youtube', video_id=vid_id)
- for vid_id in ids]
-
def _login(self):
"""
Attempt to log in to YouTube.
return data.get('DELEGATED_SESSION_ID')
def _extract_ytcfg(self, video_id, webpage):
+ if not webpage:
+ return {}
return self._parse_json(
self._search_regex(
r'ytcfg\.set\s*\(\s*({.+?})\s*\)\s*;', webpage, 'ytcfg',
r'(?:(?:www|dev)\.)?invidio\.us',
# Invidious instances taken from https://github.com/iv-org/documentation/blob/master/Invidious-Instances.md
r'(?:www\.)?invidious\.pussthecat\.org',
- r'(?:www\.)?invidious\.048596\.xyz',
r'(?:www\.)?invidious\.zee\.li',
- r'(?:www\.)?vid\.puffyan\.us',
r'(?:(?:www|au)\.)?ytprivate\.com',
r'(?:www\.)?invidious\.namazso\.eu',
r'(?:www\.)?invidious\.ethibox\.fr',
- r'(?:www\.)?inv\.skyn3t\.in',
- r'(?:www\.)?invidious\.himiko\.cloud',
r'(?:www\.)?w6ijuptxiku4xpnnaetxvnkc5vqcdu7mgns2u77qefoixi63vbvnpnqd\.onion',
r'(?:www\.)?kbjggqkzv65ivcqj6bumvp337z6264huv5kpkwuv6gu5yjiskvan7fad\.onion',
r'(?:www\.)?invidious\.3o7z6yfxhbw7n3za4rss6l434kmv55cgw2vuziwuigpwegswvwzqipyd\.onion',
r'(?:(?:www|no)\.)?invidiou\.sh',
r'(?:(?:www|fi)\.)?invidious\.snopyta\.org',
r'(?:www\.)?invidious\.kabi\.tk',
- r'(?:www\.)?invidious\.13ad\.de',
r'(?:www\.)?invidious\.mastodon\.host',
r'(?:www\.)?invidious\.zapashcanon\.fr',
r'(?:www\.)?invidious\.kavin\.rocks',
+ r'(?:www\.)?invidious\.tinfoil-hat\.net',
+ r'(?:www\.)?invidious\.himiko\.cloud',
+ r'(?:www\.)?invidious\.reallyancient\.tech',
r'(?:www\.)?invidious\.tube',
r'(?:www\.)?invidiou\.site',
r'(?:www\.)?invidious\.site',
r'(?:www\.)?invidious\.xyz',
r'(?:www\.)?invidious\.nixnet\.xyz',
+ r'(?:www\.)?invidious\.048596\.xyz',
r'(?:www\.)?invidious\.drycat\.fr',
+ r'(?:www\.)?inv\.skyn3t\.in',
r'(?:www\.)?tube\.poal\.co',
r'(?:www\.)?tube\.connect\.cafe',
r'(?:www\.)?vid\.wxzm\.sx',
r'(?:www\.)?vid\.mint\.lgbt',
+ r'(?:www\.)?vid\.puffyan\.us',
r'(?:www\.)?yewtu\.be',
r'(?:www\.)?yt\.elukerio\.org',
r'(?:www\.)?yt\.lelux\.fi',
r'(?:www\.)?invidious\.ggc-project\.de',
r'(?:www\.)?yt\.maisputain\.ovh',
+ r'(?:www\.)?ytprivate\.com',
+ r'(?:www\.)?invidious\.13ad\.de',
r'(?:www\.)?invidious\.toot\.koeln',
r'(?:www\.)?invidious\.fdn\.fr',
r'(?:www\.)?watch\.nettohikari\.com',
|(?:www\.)?cleanvideosearch\.com/media/action/yt/watch\?videoId=
)
)? # all until now is optional -> you can pass the naked ID
- (?P<id>[0-9A-Za-z_-]{11}) # here is it! the YouTube video ID
- (?!.*?\blist=
- (?:
- %(playlist_id)s| # combined list/video URLs are handled by the playlist IE
- WL # WL are handled by the watch later IE
- )
- )
+ (?P<id>[0-9A-Za-z_-]{11}) # here is it! the YouTube video ID
(?(1).+)? # if we found the ID, everything can follow
$""" % {
- 'playlist_id': YoutubeBaseInfoExtractor._PLAYLIST_ID_RE,
'invidious': '|'.join(_INVIDIOUS_SITES),
}
_PLAYER_INFO_RE = (
},
'skip': 'This video does not exist.',
},
+ {
+ # Video with incomplete 'yt:stretch=16:'
+ 'url': 'https://www.youtube.com/watch?v=FRhJzUSJbGI',
+ 'only_matching': True,
+ },
{
# Video licensed under Creative Commons
'url': 'https://www.youtube.com/watch?v=M4gD1WSo5mA',
},
]
+ @classmethod
+ def suitable(cls, url):
+ # Hack for lazy extractors until more generic solution is implemented
+ # (see #28780)
+ from .youtube import parse_qs
+ qs = parse_qs(url)
+ if qs.get('list', [None])[0]:
+ return False
+ return super(YoutubeIE, cls).suitable(url)
+
def __init__(self, *args, **kwargs):
super(YoutubeIE, self).__init__(*args, **kwargs)
self._code_cache = {}
if not formats:
if not self._downloader.params.get('allow_unplayable_formats') and streaming_data.get('licenseInfos'):
- raise ExtractorError(
+ self.raise_no_formats(
'This video is DRM protected.', expected=True)
pemr = try_get(
playability_status,
if not countries:
regions_allowed = search_meta('regionsAllowed')
countries = regions_allowed.split(',') if regions_allowed else None
- self.raise_geo_restricted(
- subreason, countries)
+ self.raise_geo_restricted(subreason, countries, metadata_available=True)
reason += '\n' + subreason
if reason:
- raise ExtractorError(reason, expected=True)
+ self.raise_no_formats(reason, expected=True)
self._sort_formats(formats)
for m in re.finditer(self._meta_regex('og:video:tag'), webpage)]
for keyword in keywords:
if keyword.startswith('yt:stretch='):
- stretch_ratio = map(
- lambda x: int_or_none(x, default=0),
- keyword.split('=')[1].split(':'))
- w, h = (list(stretch_ratio) + [0])[:2]
- if w > 0 and h > 0:
- ratio = w / h
- for f in formats:
- if f.get('vcodec') != 'none':
- f['stretched_ratio'] = ratio
+ mobj = re.search(r'(\d+)\s*:\s*(\d+)', keyword)
+ if mobj:
+ # NB: float is intentional for forcing float division
+ w, h = (float(v) for v in mobj.groups())
+ if w > 0 and h > 0:
+ ratio = w / h
+ for f in formats:
+ if f.get('vcodec') != 'none':
+ f['stretched_ratio'] = ratio
+ break
thumbnails = []
for container in (video_details, microformat):
'uploader_id': 'UCqj7Cz7revf5maW9g5pgNcg',
'uploader': 'Игорь Клейнер',
},
+ }, {
+ # playlists, series
+ 'url': 'https://www.youtube.com/c/3blue1brown/playlists?view=50&sort=dd&shelf_id=3',
+ 'playlist_mincount': 5,
+ 'info_dict': {
+ 'id': 'UCYO_jab_esuFRV4b17AJtAw',
+ 'title': '3Blue1Brown - Playlists',
+ 'description': 'md5:e1384e8a133307dd10edee76e875d62f',
+ },
}, {
# playlists, singlepage
'url': 'https://www.youtube.com/user/ThirstForScience/playlists',
'expected_warnings': [
'YouTube said: INFO - Unavailable videos are hidden',
]
+ }, {
+ 'note': 'Playlist with unavailable videos in a later page',
+ 'url': 'https://www.youtube.com/playlist?list=UU8l9frL61Yl5KFOl87nIm2w',
+ 'info_dict': {
+ 'title': 'Uploads from BlankTV',
+ 'id': 'UU8l9frL61Yl5KFOl87nIm2w',
+ 'uploader': 'BlankTV',
+ 'uploader_id': 'UC8l9frL61Yl5KFOl87nIm2w',
+ },
+ 'playlist_mincount': 20000,
}, {
# https://github.com/ytdl-org/youtube-dl/issues/21844
'url': 'https://www.youtube.com/playlist?list=PLzH6n4zXuckpfMu_4Ff8E7Z1behQks5ba',
'title': '#cctv9',
},
'playlist_mincount': 350,
+ }, {
+ 'url': 'https://www.youtube.com/watch?list=PLW4dVinRY435CBE_JD3t-0SRXKfnZHS1P&feature=youtu.be&v=M9cJMXmQ_ZU',
+ 'only_matching': True,
}]
@classmethod
@staticmethod
def _extract_basic_item_renderer(item):
# Modified from _extract_grid_item_renderer
- known_renderers = (
- 'playlistRenderer', 'videoRenderer', 'channelRenderer',
- 'gridPlaylistRenderer', 'gridVideoRenderer', 'gridChannelRenderer'
+ known_basic_renderers = (
+ 'playlistRenderer', 'videoRenderer', 'channelRenderer', 'showRenderer'
)
for key, renderer in item.items():
- if key not in known_renderers:
+ if not isinstance(renderer, dict):
continue
- return renderer
+ elif key in known_basic_renderers:
+ return renderer
+ elif key.startswith('grid') and key.endswith('Renderer'):
+ return renderer
def _grid_entries(self, grid_renderer):
for item in grid_renderer['items']:
if not isinstance(renderer, dict):
continue
title = try_get(
- renderer, lambda x: x['title']['runs'][0]['text'], compat_str)
+ renderer, (lambda x: x['title']['runs'][0]['text'],
+ lambda x: x['title']['simpleText']), compat_str)
# playlist
playlist_id = renderer.get('playlistId')
if playlist_id:
'https://www.youtube.com/playlist?list=%s' % playlist_id,
ie=YoutubeTabIE.ie_key(), video_id=playlist_id,
video_title=title)
+ continue
# video
video_id = renderer.get('videoId')
if video_id:
yield self._extract_video(renderer)
+ continue
# channel
channel_id = renderer.get('channelId')
if channel_id:
yield self.url_result(
'https://www.youtube.com/channel/%s' % channel_id,
ie=YoutubeTabIE.ie_key(), video_title=title)
+ continue
+ # generic endpoint URL support
+ ep_url = urljoin('https://www.youtube.com/', try_get(
+ renderer, lambda x: x['navigationEndpoint']['commandMetadata']['webCommandMetadata']['url'],
+ compat_str))
+ if ep_url:
+ for ie in (YoutubeTabIE, YoutubePlaylistIE, YoutubeIE):
+ if ie.suitable(ep_url):
+ yield self.url_result(
+ ep_url, ie=ie.ie_key(), video_id=ie._match_id(ep_url), video_title=title)
+ break
def _shelf_entries_from_content(self, shelf_renderer):
content = shelf_renderer.get('content')
return
# video attachment
video_renderer = try_get(
- post_renderer, lambda x: x['backstageAttachment']['videoRenderer'], dict)
- video_id = None
- if video_renderer:
- entry = self._video_entry(video_renderer)
+ post_renderer, lambda x: x['backstageAttachment']['videoRenderer'], dict) or {}
+ video_id = video_renderer.get('videoId')
+ if video_id:
+ entry = self._extract_video(video_renderer)
if entry:
yield entry
+ # playlist attachment
+ playlist_id = try_get(
+ post_renderer, lambda x: x['backstageAttachment']['playlistRenderer']['playlistId'], compat_str)
+ if playlist_id:
+ yield self.url_result(
+ 'https://www.youtube.com/playlist?list=%s' % playlist_id,
+ ie=YoutubeTabIE.ie_key(), video_id=playlist_id)
# inline video links
runs = try_get(post_renderer, lambda x: x['contentText']['runs'], list) or []
for run in runs:
ep_video_id = YoutubeIE._match_id(ep_url)
if video_id == ep_video_id:
continue
- yield self.url_result(ep_url, ie=YoutubeIE.ie_key(), video_id=video_id)
+ yield self.url_result(ep_url, ie=YoutubeIE.ie_key(), video_id=ep_video_id)
def _post_thread_continuation_entries(self, post_thread_continuation):
contents = post_thread_continuation.get('contents')
Get playlist with unavailable videos if the 'show unavailable videos' button exists.
"""
sidebar_renderer = try_get(
- data, lambda x: x['sidebar']['playlistSidebarRenderer']['items'], list) or []
+ data, lambda x: x['sidebar']['playlistSidebarRenderer']['items'], list)
+ if not sidebar_renderer:
+ return
+ browse_id = params = None
for item in sidebar_renderer:
if not isinstance(item, dict):
continue
nav_item_renderer, lambda x: x['navigationEndpoint']['browseEndpoint'], dict) or {}
browse_id = browse_endpoint.get('browseId')
params = browse_endpoint.get('params')
- if not browse_id or not params:
- return
- ytcfg = self._extract_ytcfg(item_id, webpage)
- headers = self._generate_api_headers(
- ytcfg, account_syncid=self._extract_account_syncid(ytcfg),
- identity_token=self._extract_identity_token(webpage, item_id=item_id),
- visitor_data=try_get(
- self._extract_context(ytcfg), lambda x: x['client']['visitorData'], compat_str))
- query = {
- 'params': params,
- 'browseId': browse_id
- }
- return self._extract_response(
- item_id=item_id, headers=headers, query=query,
- check_get_keys='contents', fatal=False,
- note='Downloading API JSON with unavailable videos')
+ break
+
+ ytcfg = self._extract_ytcfg(item_id, webpage)
+ headers = self._generate_api_headers(
+ ytcfg, account_syncid=self._extract_account_syncid(ytcfg),
+ identity_token=self._extract_identity_token(webpage, item_id=item_id),
+ visitor_data=try_get(
+ self._extract_context(ytcfg), lambda x: x['client']['visitorData'], compat_str))
+ query = {
+ 'params': params or 'wgYCCAA=',
+ 'browseId': browse_id or 'VL%s' % item_id
+ }
+ return self._extract_response(
+ item_id=item_id, headers=headers, query=query,
+ check_get_keys='contents', fatal=False,
+ note='Downloading API JSON with unavailable videos')
def _extract_response(self, item_id, query, note='Downloading API JSON', headers=None,
ytcfg=None, check_get_keys=None, ep='browse', fatal=True):
url = '%s/videos%s' % (mobj.get('pre'), mobj.get('post') or '')
# Handle both video/playlist URLs
- qs = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query)
+ qs = parse_qs(url)
video_id = qs.get('v', [None])[0]
playlist_id = qs.get('list', [None])[0]
@classmethod
def suitable(cls, url):
- return False if YoutubeTabIE.suitable(url) else super(
- YoutubePlaylistIE, cls).suitable(url)
+ if YoutubeTabIE.suitable(url):
+ return False
+ # Hack for lazy extractors until more generic solution is implemented
+ # (see #28780)
+ from .youtube import parse_qs
+ qs = parse_qs(url)
+ if qs.get('v', [None])[0]:
+ return False
+ return super(YoutubePlaylistIE, cls).suitable(url)
def _real_extract(self, url):
playlist_id = self._match_id(url)
- qs = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query)
+ qs = parse_qs(url)
if not qs:
qs = {'list': playlist_id}
return self.url_result(