ExtractorError,
float_or_none,
format_field,
+ get_first,
int_or_none,
is_html,
join_nonempty,
+ js_to_json,
mimetype2ext,
network_exceptions,
NO_DEFAULT,
)
-def get_first(obj, keys, **kwargs):
- return traverse_obj(obj, (..., *variadic(keys)), **kwargs, get_all=False)
-
-
# any clients starting with _ cannot be explicitly requested by the user
INNERTUBE_CLIENTS = {
'web': {
}
},
'INNERTUBE_CONTEXT_CLIENT_NAME': 2
- }
+ },
+ # This client can access age restricted videos (unless the uploader has disabled the 'allow embedding' option)
+ # See: https://github.com/zerodytrash/YouTube-Internal-Clients
+ 'tv_embedded': {
+ 'INNERTUBE_API_KEY': 'AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8',
+ 'INNERTUBE_CONTEXT': {
+ 'client': {
+ 'clientName': 'TVHTML5_SIMPLY_EMBEDDED_PLAYER',
+ 'clientVersion': '2.0',
+ },
+ },
+ 'INNERTUBE_CONTEXT_CLIENT_NAME': 85
+ },
}
+def _split_innertube_client(client_name):  # Returns a 3-tuple: (client name, base client, variant)
+ variant, *base = client_name.rsplit('.', 1)  # "<variant>.<base>" form: split on the last '.'
+ if base:
+ return variant, base[0], variant  # dotted form: the part before the '.' is both the name and the variant
+ base, *variant = client_name.split('_', 1)  # "<base>_<variant>" form: split on the first '_'
+ return client_name, base, variant[0] if variant else None  # variant is None when the name has no '_'
+
+
def build_innertube_clients():
- third_party = {
- 'embedUrl': 'https://google.com', # Can be any valid URL
+ THIRD_PARTY = {
+ 'embedUrl': 'https://www.youtube.com/', # Can be any valid URL
}
- base_clients = ('android', 'web', 'ios', 'mweb')
- priority = qualities(base_clients[::-1])
+ BASE_CLIENTS = ('android', 'web', 'tv', 'ios', 'mweb')
+ priority = qualities(BASE_CLIENTS[::-1])
for client, ytcfg in tuple(INNERTUBE_CLIENTS.items()):
ytcfg.setdefault('INNERTUBE_API_KEY', 'AIzaSyDCU8hByM-4DrUqRUYnGn-3llEO78bcxq8')
ytcfg.setdefault('INNERTUBE_HOST', 'www.youtube.com')
ytcfg.setdefault('REQUIRE_JS_PLAYER', True)
ytcfg['INNERTUBE_CONTEXT']['client'].setdefault('hl', 'en')
- ytcfg['priority'] = 10 * priority(client.split('_', 1)[0])
-
- if client in base_clients:
- INNERTUBE_CLIENTS[f'{client}_agegate'] = agegate_ytcfg = copy.deepcopy(ytcfg)
- agegate_ytcfg['INNERTUBE_CONTEXT']['client']['clientScreen'] = 'EMBED'
- agegate_ytcfg['INNERTUBE_CONTEXT']['thirdParty'] = third_party
- agegate_ytcfg['priority'] -= 1
- elif client.endswith('_embedded'):
- ytcfg['INNERTUBE_CONTEXT']['thirdParty'] = third_party
+
+ _, base_client, variant = _split_innertube_client(client)
+ ytcfg['priority'] = 10 * priority(base_client)
+
+ if not variant:
+ INNERTUBE_CLIENTS[f'{client}_embedscreen'] = embedscreen = copy.deepcopy(ytcfg)
+ embedscreen['INNERTUBE_CONTEXT']['client']['clientScreen'] = 'EMBED'
+ embedscreen['INNERTUBE_CONTEXT']['thirdParty'] = THIRD_PARTY
+ embedscreen['priority'] -= 3
+ elif variant == 'embedded':
+ ytcfg['INNERTUBE_CONTEXT']['thirdParty'] = THIRD_PARTY
ytcfg['priority'] -= 2
else:
ytcfg['priority'] -= 3
_RESERVED_NAMES = (
r'channel|c|user|playlist|watch|w|v|embed|e|watch_popup|clip|'
- r'shorts|movies|results|shared|hashtag|trending|feed|feeds|'
+ r'shorts|movies|results|search|shared|hashtag|trending|explore|feed|feeds|'
r'browse|oembed|get_video_info|iframe_api|s/player|'
r'storefront|oops|index|account|reporthistory|t/terms|about|upload|signin|logout')
_PLAYLIST_ID_RE = r'(?:(?:PL|LL|EC|UU|FL|RD|UL|TL|PU|OLAK5uy_)[0-9A-Za-z-_]{10,}|RDMM|WL|LL|LM)'
- _NETRC_MACHINE = 'youtube'
+ # _NETRC_MACHINE = 'youtube'
# If True it will raise an error if no login info is provided
_LOGIN_REQUIRED = False
r'(?:www\.)?hpniueoejy4opn7bc4ftgazyqjoeqwlvh2uiku2xqku6zpoa4bf5ruid\.onion',
)
- def _login(self):
- """
- Attempt to log in to YouTube.
- If _LOGIN_REQUIRED is set and no authentication was provided, an error is raised.
- """
-
- if (self._LOGIN_REQUIRED
- and self.get_param('cookiefile') is None
- and self.get_param('cookiesfrombrowser') is None):
- self.raise_login_required(
- 'Login details are needed to download this content', method='cookies')
- username, password = self._get_login_info()
- if username:
- self.report_warning(f'Cannot login to YouTube using username and password. {self._LOGIN_HINTS["cookies"]}')
-
def _initialize_consent(self):
cookies = self._get_cookies('https://www.youtube.com/')
if cookies.get('__Secure-3PSID'):
pref = dict(compat_urlparse.parse_qsl(pref_cookie.value))
except ValueError:
self.report_warning('Failed to parse user PREF cookie' + bug_reports_message())
- pref.update({'hl': 'en'})
+ pref.update({'hl': 'en', 'tz': 'UTC'})
self._set_cookie('.youtube.com', name='PREF', value=compat_urllib_parse_urlencode(pref))
def _real_initialize(self):
self._initialize_pref()
self._initialize_consent()
- self._login()
+ if (self._LOGIN_REQUIRED
+ and self.get_param('cookiefile') is None
+ and self.get_param('cookiesfrombrowser') is None):
+ self.raise_login_required('Login details are needed to download this content', method='cookies')
_YT_INITIAL_DATA_RE = r'(?:window\s*\[\s*["\']ytInitialData["\']\s*\]|ytInitialData)\s*=\s*({.+?})\s*;'
_YT_INITIAL_PLAYER_RESPONSE_RE = r'ytInitialPlayerResponse\s*=\s*({.+?})\s*;'
def _extract_context(self, ytcfg=None, default_client='web'):
context = get_first(
(ytcfg, self._get_default_ytcfg(default_client)), 'INNERTUBE_CONTEXT', expected_type=dict)
- # Enforce language for extraction
- traverse_obj(context, 'client', expected_type=dict, default={})['hl'] = 'en'
+ # Enforce language and tz for extraction
+ client_context = traverse_obj(context, 'client', expected_type=dict, default={})
+ client_context.update({'hl': 'en', 'timeZone': 'UTC', 'utcOffsetMinutes': 0})
return context
_SAPISID = None
'https://%s/youtubei/v1/%s' % (api_hostname or self._get_innertube_host(default_client), ep),
video_id=video_id, fatal=fatal, note=note, errnote=errnote,
data=json.dumps(data).encode('utf8'), headers=real_headers,
- query={'key': api_key or self._extract_api_key()})
+ query={'key': api_key or self._extract_api_key(), 'prettyPrint': 'false'})
def extract_yt_initial_data(self, item_id, webpage, fatal=True):
data = self._search_regex(
Appears to be used to track session state
"""
return get_first(
- args, (('VISITOR_DATA', ('INNERTUBE_CONTEXT', 'client', 'visitorData'), ('responseContext', 'visitorData'))),
+ args, [('VISITOR_DATA', ('INNERTUBE_CONTEXT', 'client', 'visitorData'), ('responseContext', 'visitorData'))],
expected_type=str)
@property
timestamp = (
unified_timestamp(text) or unified_timestamp(
self._search_regex(
- (r'(?:.+|^)(?:live|premieres|ed|ing)(?:\s*on)?\s*(.+\d)', r'\w+[\s,\.-]*\w+[\s,\.-]+20\d{2}'), text.lower(), 'time text', default=None)))
+ (r'([a-z]+\s*\d{1,2},?\s*20\d{2})', r'(?:.+|^)(?:live|premieres|ed|ing)(?:\s*(?:on|for))?\s*(.+\d)'),
+ text.lower(), 'time text', default=None)))
if text and timestamp is None:
- self.report_warning('Cannot parse localized time text' + bug_reports_message(), only_once=True)
+ self.report_warning(f"Cannot parse localized time text '{text}'" + bug_reports_message(), only_once=True)
return timestamp, text
def _extract_response(self, item_id, query, note='Downloading API JSON', headers=None,
note='%s%s' % (note, ' (retry #%d)' % count if count else ''))
except ExtractorError as e:
if isinstance(e.cause, network_exceptions):
- if isinstance(e.cause, compat_HTTPError) and not is_html(e.cause.read(512)):
- e.cause.seek(0)
- yt_error = try_get(
- self._parse_json(e.cause.read().decode(), item_id, fatal=False),
- lambda x: x['error']['message'], compat_str)
- if yt_error:
- self._report_alerts([('ERROR', yt_error)], fatal=False)
+ if isinstance(e.cause, compat_HTTPError):
+ first_bytes = e.cause.read(512)
+ if not is_html(first_bytes):
+ yt_error = try_get(
+ self._parse_json(
+ self._webpage_read_content(e.cause, None, item_id, prefix=first_bytes) or '{}', item_id, fatal=False),
+ lambda x: x['error']['message'], compat_str)
+ if yt_error:
+ self._report_alerts([('ERROR', yt_error)], fatal=False)
# Downloading page may result in intermittent 5xx HTTP error
# Sometimes a 404 is also received. See: https://github.com/ytdl-org/youtube-dl/issues/28289
# We also want to catch all other network exceptions since errors in later pages can be troublesome
description = self._get_text(renderer, 'descriptionSnippet')
duration = parse_duration(self._get_text(
renderer, 'lengthText', ('thumbnailOverlays', ..., 'thumbnailOverlayTimeStatusRenderer', 'text')))
+ if duration is None:
+ duration = parse_duration(self._search_regex(
+ r'(?i)(ago)(?!.*\1)\s+(?P<duration>[a-z0-9 ,]+?)(?:\s+[\d,]+\s+views)?(?:\s+-\s+play\s+short)?$',
+ traverse_obj(renderer, ('title', 'accessibility', 'accessibilityData', 'label'), default='', expected_type=str),
+ video_id, default=None, group='duration'))
+
view_count = self._get_count(renderer, 'viewCountText')
uploader = self._get_text(renderer, 'ownerText', 'shortBylineText')
renderer, ('thumbnailOverlays', ..., 'thumbnailOverlayTimeStatusRenderer', 'style'), get_all=False, expected_type=str)
badges = self._extract_badges(renderer)
thumbnails = self._extract_thumbnails(renderer, 'thumbnail')
+ navigation_url = urljoin('https://www.youtube.com/', traverse_obj(
+ renderer, ('navigationEndpoint', 'commandMetadata', 'webCommandMetadata', 'url'), expected_type=str))
+ url = f'https://www.youtube.com/watch?v={video_id}'
+ if overlay_style == 'SHORTS' or (navigation_url and '/shorts/' in navigation_url):
+ url = f'https://www.youtube.com/shorts/{video_id}'
return {
'_type': 'url',
'ie_key': YoutubeIE.ie_key(),
'id': video_id,
- 'url': f'https://www.youtube.com/watch?v={video_id}',
+ 'url': url,
'title': title,
'description': description,
'duration': duration,
'uploader': uploader,
'channel_id': channel_id,
'thumbnails': thumbnails,
- # 'upload_date': strftime_or_none(timestamp, '%Y%m%d'),
+ 'upload_date': strftime_or_none(timestamp, '%Y%m%d') if self._configuration_arg('approximate_date', ie_key='youtubetab') else None,
'live_status': ('is_upcoming' if scheduled_timestamp is not None
else 'was_live' if 'streamed' in time_text.lower()
else 'is_live' if overlay_style is not None and overlay_style == 'LIVE' or 'live now' in badges
'age_limit': 0,
'start_time': 1,
'end_time': 9,
+ 'channel_follower_count': int
}
},
{
'thumbnail': 'https://i.ytimg.com/vi/BaW_jenozKc/maxresdefault.jpg',
'live_status': 'not_live',
'age_limit': 0,
+ 'channel_follower_count': int
},
'params': {
'skip_download': True,
'categories': ['Music'],
'age_limit': 0,
'alt_title': 'The Spark',
+ 'channel_follower_count': int
},
'params': {
'youtube_include_dash_manifest': True,
'channel_id': 'UCzybXLxv08IApdjdN0mJhEg',
'playable_in_embed': True,
'view_count': int,
+ 'channel_follower_count': int
},
},
{
'like_count': int,
'duration': 177,
'playable_in_embed': True,
+ 'channel_follower_count': int
},
},
{
'id': 'Tq92D6wQ1mg',
'title': '[MMD] Adios - EVERGLOW [+Motion DL]',
'ext': 'mp4',
- 'upload_date': '20191227',
+ 'upload_date': '20191228',
'uploader_id': 'UC1yoRdFoFJaCY-AGfD9W0wQ',
'uploader': 'Projekt Melody',
'description': 'md5:17eccca93a786d51bc67646756894066',
'categories': ['Entertainment'],
'duration': 106,
'channel_url': 'https://www.youtube.com/channel/UC1yoRdFoFJaCY-AGfD9W0wQ',
+ 'channel_follower_count': int
},
},
{
'channel_url': 'https://www.youtube.com/channel/UCdR3RSDPqub28LjZx0v9-aA',
'live_status': 'not_live',
'artist': 'OOMPH!',
+ 'channel_follower_count': int
},
},
{
'channel_url': 'https://www.youtube.com/channel/UCYEK6xds6eo-3tr4xRdflmQ',
'categories': ['Music'],
'album': 'Some Chords',
+ 'channel_follower_count': int
},
'expected_warnings': [
'DASH manifest missing',
'live_status': 'was_live',
'view_count': int,
'channel_url': 'https://www.youtube.com/channel/UCTl3QQTvqHFjurroKxexy2Q',
+ 'channel_follower_count': int
},
'params': {
'skip_download': 'requires avconv',
'like_count': int,
'live_status': 'not_live',
'availability': 'unlisted',
+ 'channel_follower_count': int
},
},
# url_encoded_fmt_stream_map is empty string
'playable_in_embed': True,
'like_count': int,
'age_limit': 0,
+ 'channel_follower_count': int
},
'params': {
'skip_download': True,
'title': 'md5:e41008789470fc2533a3252216f1c1d1',
'description': 'md5:a677553cf0840649b731a3024aeff4cc',
'duration': 721,
- 'upload_date': '20150127',
+ 'upload_date': '20150128',
'uploader_id': 'BerkmanCenter',
'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/BerkmanCenter',
'uploader': 'The Berkman Klein Center for Internet & Society',
'thumbnail': 'https://i.ytimg.com/vi_webp/M4gD1WSo5mA/maxresdefault.webp',
'live_status': 'not_live',
'playable_in_embed': True,
+ 'channel_follower_count': int
},
'params': {
'skip_download': True,
'title': 'Democratic Socialism and Foreign Policy | Bernie Sanders',
'description': 'md5:13a2503d7b5904ef4b223aa101628f39',
'duration': 4060,
- 'upload_date': '20151119',
+ 'upload_date': '20151120',
'uploader': 'Bernie Sanders',
'uploader_id': 'UCH1dpzjCEiGAt8CXkryhkZg',
'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UCH1dpzjCEiGAt8CXkryhkZg',
'view_count': int,
'live_status': 'not_live',
'channel_url': 'https://www.youtube.com/channel/UCH1dpzjCEiGAt8CXkryhkZg',
+ 'channel_follower_count': int
},
'params': {
'skip_download': True,
'like_count': int,
'playable_in_embed': True,
'live_status': 'not_live',
+ 'channel_follower_count': int
},
'params': {
'skip_download': True,
'channel_id': 'UC-pWHpBjdGG69N9mM2auIAA',
'tags': 'count:11',
'live_status': 'not_live',
+ 'channel_follower_count': int
},
'params': {
'skip_download': True,
'playable_in_embed': True,
'live_status': 'not_live',
'channel': 'ElevageOrVert',
+ 'channel_follower_count': int
},
'params': {
'skip_download': True,
'view_count': int,
'duration': 522,
'channel': 'kudvenkat',
+ 'channel_follower_count': int
},
'params': {
'skip_download': True,
'thumbnail': 'https://i.ytimg.com/vi/OtqTfy26tG0/maxresdefault.jpg',
'categories': ['Music'],
'playable_in_embed': True,
+ 'channel_follower_count': int
},
'params': {
'skip_download': True,
'like_count': int,
'live_status': 'not_live',
'playable_in_embed': True,
+ 'channel_follower_count': int
}
},
{
'channel_url': 'https://www.youtube.com/channel/UC3o_t8PzBmXf5S9b7GLx1Mw',
'live_status': 'not_live',
'playable_in_embed': True,
+ 'channel_follower_count': int
},
'params': {
'skip_download': True,
'like_count': int,
'live_status': 'not_live',
'playable_in_embed': True,
+ 'channel_follower_count': int
},
'params': {
'format': '17', # 3gp format available on android
'duration': 248,
'categories': ['Education'],
'age_limit': 0,
+ 'channel_follower_count': int
}, 'params': {'format': 'mhtml', 'skip_download': True}
- }
+ }, {
+ # Ensure video upload_date is in UTC timezone (video was uploaded 1641170939)
+ 'url': 'https://www.youtube.com/watch?v=2NUZ8W2llS4',
+ 'info_dict': {
+ 'id': '2NUZ8W2llS4',
+ 'ext': 'mp4',
+ 'title': 'The NP that test your phone performance 🙂',
+ 'description': 'md5:144494b24d4f9dfacb97c1bbef5de84d',
+ 'uploader': 'Leon Nguyen',
+ 'uploader_id': 'VNSXIII',
+ 'uploader_url': 'http://www.youtube.com/user/VNSXIII',
+ 'channel_id': 'UCRqNBSOHgilHfAczlUmlWHA',
+ 'channel_url': 'https://www.youtube.com/channel/UCRqNBSOHgilHfAczlUmlWHA',
+ 'duration': 21,
+ 'view_count': int,
+ 'age_limit': 0,
+ 'categories': ['Gaming'],
+ 'tags': 'count:23',
+ 'playable_in_embed': True,
+ 'live_status': 'not_live',
+ 'upload_date': '20220103',
+ 'like_count': int,
+ 'availability': 'public',
+ 'channel': 'Leon Nguyen',
+ 'thumbnail': 'https://i.ytimg.com/vi_webp/2NUZ8W2llS4/maxresdefault.webp',
+ 'channel_follower_count': int
+ }
+ }, {
+ # date text is premiered video, ensure upload date in UTC (published 1641172509)
+ 'url': 'https://www.youtube.com/watch?v=mzZzzBU6lrM',
+ 'info_dict': {
+ 'id': 'mzZzzBU6lrM',
+ 'ext': 'mp4',
+ 'title': 'I Met GeorgeNotFound In Real Life...',
+ 'description': 'md5:cca98a355c7184e750f711f3a1b22c84',
+ 'uploader': 'Quackity',
+ 'uploader_id': 'QuackityHQ',
+ 'uploader_url': 'http://www.youtube.com/user/QuackityHQ',
+ 'channel_id': 'UC_8NknAFiyhOUaZqHR3lq3Q',
+ 'channel_url': 'https://www.youtube.com/channel/UC_8NknAFiyhOUaZqHR3lq3Q',
+ 'duration': 955,
+ 'view_count': int,
+ 'age_limit': 0,
+ 'categories': ['Entertainment'],
+ 'tags': 'count:26',
+ 'playable_in_embed': True,
+ 'live_status': 'not_live',
+ 'release_timestamp': 1641172509,
+ 'release_date': '20220103',
+ 'upload_date': '20220103',
+ 'like_count': int,
+ 'availability': 'public',
+ 'channel': 'Quackity',
+ 'thumbnail': 'https://i.ytimg.com/vi/mzZzzBU6lrM/maxresdefault.jpg',
+ 'channel_follower_count': int
+ }
+ },
+ { # continuous livestream. Microformat upload date should be preferred.
+ # Upload date was 2021-06-19 (not UTC), while stream start is 2021-11-27
+ 'url': 'https://www.youtube.com/watch?v=kgx4WGK0oNU',
+ 'info_dict': {
+ 'id': 'kgx4WGK0oNU',
+ 'title': r're:jazz\/lofi hip hop radio🌱chill beats to relax\/study to \[LIVE 24\/7\] \d{4}-\d{2}-\d{2} \d{2}:\d{2}',
+ 'ext': 'mp4',
+ 'channel_id': 'UC84whx2xxsiA1gXHXXqKGOA',
+ 'availability': 'public',
+ 'age_limit': 0,
+ 'release_timestamp': 1637975704,
+ 'upload_date': '20210619',
+ 'channel_url': 'https://www.youtube.com/channel/UC84whx2xxsiA1gXHXXqKGOA',
+ 'live_status': 'is_live',
+ 'thumbnail': 'https://i.ytimg.com/vi/kgx4WGK0oNU/maxresdefault.jpg',
+ 'uploader': '阿鲍Abao',
+ 'uploader_url': 'http://www.youtube.com/channel/UC84whx2xxsiA1gXHXXqKGOA',
+ 'channel': 'Abao in Tokyo',
+ 'channel_follower_count': int,
+ 'release_date': '20211127',
+ 'tags': 'count:39',
+ 'categories': ['People & Blogs'],
+ 'like_count': int,
+ 'uploader_id': 'UC84whx2xxsiA1gXHXXqKGOA',
+ 'view_count': int,
+ 'playable_in_embed': True,
+ 'description': 'md5:2ef1d002cad520f65825346e2084e49d',
+ },
+ 'params': {'skip_download': True}
+ },
]
@classmethod
return f['manifest_url'], f['manifest_stream_number'], is_live
for f in formats:
+ f['is_live'] = True
f['protocol'] = 'http_dash_segments_generator'
f['fragments'] = functools.partial(
self._live_dash_fragments, f['format_id'], live_start_time, mpd_feed)
known_idx, no_fragment_score, last_segment_url = begin_index, 0, None
fragments, fragment_base_url = None, None
- def _extract_sequence_from_mpd(refresh_sequence):
+ def _extract_sequence_from_mpd(refresh_sequence, immediate):
nonlocal mpd_url, stream_number, is_live, no_fragment_score, fragments, fragment_base_url
# Obtain from MPD's maximum seq value
old_mpd_url = mpd_url
last_error = ctx.pop('last_error', None)
- expire_fast = last_error and isinstance(last_error, compat_HTTPError) and last_error.code == 403
+ expire_fast = immediate or last_error and isinstance(last_error, compat_HTTPError) and last_error.code == 403
mpd_url, stream_number, is_live = (mpd_feed(format_id, 5 if expire_fast else 18000)
or (mpd_url, stream_number, False))
if not refresh_sequence:
except ExtractorError:
fmts = None
if not fmts:
- no_fragment_score += 1
+ no_fragment_score += 2
return False, last_seq
fmt_info = next(x for x in fmts if x['manifest_stream_number'] == stream_number)
fragments = fmt_info['fragments']
urlh = None
last_seq = try_get(urlh, lambda x: int_or_none(x.headers['X-Head-Seqnum']))
if last_seq is None:
- no_fragment_score += 1
+ no_fragment_score += 2
last_segment_url = None
continue
else:
- should_continue, last_seq = _extract_sequence_from_mpd(True)
+ should_continue, last_seq = _extract_sequence_from_mpd(True, no_fragment_score > 15)
+ no_fragment_score += 2
if not should_continue:
continue
try:
for idx in range(known_idx, last_seq):
# do not update sequence here, or some part of it will get skipped
- should_continue, _ = _extract_sequence_from_mpd(False)
+ should_continue, _ = _extract_sequence_from_mpd(False, False)
if not should_continue:
known_idx = idx - 1
raise ExtractorError('breaking out of outer loop')
get_all=False, expected_type=compat_str)
if not player_url:
return
- if player_url.startswith('//'):
- player_url = 'https:' + player_url
- elif not re.match(r'https?://', player_url):
- player_url = compat_urlparse.urljoin(
- 'https://www.youtube.com', player_url)
- return player_url
+ return urljoin('https://www.youtube.com', player_url)
def _download_player_url(self, video_id, fatal=False):
res = self._download_webpage(
"""Turn the encrypted n field into a working signature"""
if player_url is None:
raise ExtractorError('Cannot decrypt nsig without player_url')
- if player_url.startswith('//'):
- player_url = 'https:' + player_url
- elif not re.match(r'https?://', player_url):
- player_url = compat_urlparse.urljoin(
- 'https://www.youtube.com', player_url)
+ player_url = urljoin('https://www.youtube.com', player_url)
sig_id = ('nsig_value', s)
if sig_id in self._player_cache:
raise ExtractorError(traceback.format_exc(), cause=e, video_id=video_id)
def _extract_n_function_name(self, jscode):
- return self._search_regex(
- (r'\.get\("n"\)\)&&\(b=(?P<nfunc>[a-zA-Z0-9$]{3})\([a-zA-Z0-9]\)',),
- jscode, 'Initial JS player n function name', group='nfunc')
+ nfunc, idx = self._search_regex(  # idx captures the optional "[<digits>]" subscript after the name
+ r'\.get\("n"\)\)&&\(b=(?P<nfunc>[a-zA-Z0-9$]+)(?:\[(?P<idx>\d+)\])?\([a-zA-Z0-9]\)',
+ jscode, 'Initial JS player n function name', group=('nfunc', 'idx'))
+ if not idx:
+ return nfunc  # no subscript: the matched identifier is returned as-is
+ return json.loads(js_to_json(self._search_regex(  # subscripted: find "var <nfunc> = [...];" and return its element at index idx
+ rf'var {re.escape(nfunc)}\s*=\s*(\[.+?\]);', jscode,
+ f'Initial JS player n function list ({nfunc}.{idx})')))[int(idx)]
def _extract_n_function(self, video_id, player_url):
player_id = self._extract_player_info(player_url)
webpage, self._YT_INITIAL_PLAYER_RESPONSE_RE,
video_id, 'initial player response')
- original_clients = clients
+ all_clients = set(clients)
clients = clients[::-1]
prs = []
- def append_client(client_name):
- if client_name in INNERTUBE_CLIENTS and client_name not in original_clients:
- clients.append(client_name)
+ def append_client(*client_names):
+ """ Append the first client name that exists but not already used """
+ for client_name in client_names:
+ actual_client = _split_innertube_client(client_name)[0]
+ if actual_client in INNERTUBE_CLIENTS:
+ if actual_client not in all_clients:
+ clients.append(client_name)
+ all_clients.add(actual_client)
+ return
# Android player_response does not have microFormats which are needed for
# extraction of some data. So we return the initial_pr with formats
tried_iframe_fallback = False
player_url = None
while clients:
- client = clients.pop()
+ client, base_client, variant = _split_innertube_client(clients.pop())
player_ytcfg = master_ytcfg if client == 'web' else {}
if 'configs' not in self._configuration_arg('player_skip'):
player_ytcfg = self._extract_player_ytcfg(client, video_id) or player_ytcfg
prs.append(pr)
# creator clients can bypass AGE_VERIFICATION_REQUIRED if logged in
- if client.endswith('_agegate') and self._is_unplayable(pr) and self.is_authenticated:
- append_client(client.replace('_agegate', '_creator'))
+ if variant == 'embedded' and self._is_unplayable(pr) and self.is_authenticated:
+ append_client(f'{base_client}_creator')
elif self._is_agegated(pr):
- append_client(f'{client}_agegate')
+ if variant == 'tv_embedded':
+ append_client(f'{base_client}_embedded')
+ elif not variant:
+ append_client(f'tv_embedded.{base_client}', f'{base_client}_embedded')
if last_error:
if not len(prs):
self.report_warning(last_error)
return prs, player_url
- def _extract_formats(self, streaming_data, video_id, player_url, is_live):
+ def _extract_formats(self, streaming_data, video_id, player_url, is_live, duration):
itags, stream_ids = {}, []
itag_qualities, res_qualities = {}, {}
q = qualities([
streaming_formats = traverse_obj(streaming_data, (..., ('formats', 'adaptiveFormats'), ...), default=[])
for fmt in streaming_formats:
- if fmt.get('targetDurationSec') or fmt.get('drmFamilies'):
+ if fmt.get('targetDurationSec'):
continue
itag = str_or_none(fmt.get('itag'))
itags[itag] = 'https'
stream_ids.append(stream_id)
- tbr = float_or_none(
- fmt.get('averageBitrate') or fmt.get('bitrate'), 1000)
+ tbr = float_or_none(fmt.get('averageBitrate') or fmt.get('bitrate'), 1000)
+ language_preference = (
+ 10 if audio_track.get('audioIsDefault') and 10
+ else -10 if 'descriptive' in (audio_track.get('displayName') or '').lower() and -10
+ else -1)
+ # Some formats may have much smaller duration than others (possibly damaged during encoding)
+ # Eg: 2-nOtRESiUc Ref: https://github.com/yt-dlp/yt-dlp/issues/2823
+ # Make sure to avoid false positives with small duration differences.
+ # Eg: __2ABJjxzNo, ySuUZEjARPY
+ is_damaged = try_get(fmt, lambda x: float(x['approxDurationMs']) / duration < 500)
+ if is_damaged:
+ self.report_warning(f'{video_id}: Some formats are possibly damaged. They will be deprioritized', only_once=True)
dct = {
'asr': int_or_none(fmt.get('audioSampleRate')),
'filesize': int_or_none(fmt.get('contentLength')),
'format_id': itag,
'format_note': join_nonempty(
'%s%s' % (audio_track.get('displayName') or '',
- ' (default)' if audio_track.get('audioIsDefault') else ''),
+ ' (default)' if language_preference > 0 else ''),
fmt.get('qualityLabel') or quality.replace('audio_quality_', ''),
- throttled and 'THROTTLED', delim=', '),
+ throttled and 'THROTTLED', is_damaged and 'DAMAGED', delim=', '),
'source_preference': -10 if throttled else -1,
'fps': int_or_none(fmt.get('fps')) or None,
'height': height,
'quality': q(quality),
+ 'has_drm': bool(fmt.get('drmFamilies')),
'tbr': tbr,
'url': fmt_url,
'width': int_or_none(fmt.get('width')),
- 'language': audio_track.get('id', '').split('.')[0],
- 'language_preference': 1 if audio_track.get('audioIsDefault') else -1,
+ 'language': join_nonempty(audio_track.get('id', '').split('.')[0],
+ 'desc' if language_preference < -1 else ''),
+ 'language_preference': language_preference,
+ # Strictly de-prioritize damaged and 3gp formats
+ 'preference': -10 if is_damaged else -2 if itag == '17' else None,
}
mime_mobj = re.match(
r'((?:[^/]+)/(?:[^;]+))(?:;\s*codecs="([^"]+)")?', fmt.get('mimeType') or '')
'width': width,
'height': height,
'fragments': [{
- 'path': url.replace('$M', str(j)),
+ 'url': url.replace('$M', str(j)),
'duration': min(fragment_duration, duration - (j * fragment_duration)),
} for j in range(math.ceil(fragment_count))],
}
return webpage, master_ytcfg, player_responses, player_url
- def _list_formats(self, video_id, microformats, video_details, player_responses, player_url):
+ def _list_formats(self, video_id, microformats, video_details, player_responses, player_url, duration=None):
live_broadcast_details = traverse_obj(microformats, (..., 'liveBroadcastDetails'))
is_live = get_first(video_details, 'isLive')
if is_live is None:
is_live = get_first(live_broadcast_details, 'isLiveNow')
streaming_data = traverse_obj(player_responses, (..., 'streamingData'), default=[])
- formats = list(self._extract_formats(streaming_data, video_id, player_url, is_live))
+ formats = list(self._extract_formats(streaming_data, video_id, player_url, is_live, duration))
return live_broadcast_details, is_live, streaming_data, formats
return self.playlist_result(
entries, video_id, video_title, video_description)
- live_broadcast_details, is_live, streaming_data, formats = self._list_formats(video_id, microformats, video_details, player_responses, player_url)
+ duration = int_or_none(
+ get_first(video_details, 'lengthSeconds')
+ or get_first(microformats, 'lengthSeconds')
+ or parse_duration(search_meta('duration'))) or None
+
+ live_broadcast_details, is_live, streaming_data, formats = self._list_formats(
+ video_id, microformats, video_details, player_responses, player_url, duration)
if not formats:
if not self.get_param('allow_unplayable_formats') and traverse_obj(streaming_data, (..., 'licenseInfos')):
get_first(video_details, 'channelId')
or get_first(microformats, 'externalChannelId')
or search_meta('channelId'))
- duration = int_or_none(
- get_first(video_details, 'lengthSeconds')
- or get_first(microformats, 'lengthSeconds')
- or parse_duration(search_meta('duration'))) or None
owner_profile_url = get_first(microformats, 'ownerProfileUrl')
live_content = get_first(video_details, 'isLiveContent')
# URL checking if user don't care about getting the best possible thumbnail
'thumbnail': traverse_obj(original_thumbnails, (-1, 'url')),
'description': video_description,
- 'upload_date': unified_strdate(
- get_first(microformats, 'uploadDate')
- or search_meta('uploadDate')),
'uploader': get_first(video_details, 'author'),
'uploader_id': self._search_regex(r'/(?:channel|user)/([^/?&#]+)', owner_profile_url, 'uploader id') if owner_profile_url else None,
'uploader_url': owner_profile_url,
'channel_id': channel_id,
- 'channel_url': f'https://www.youtube.com/channel/{channel_id}' if channel_id else None,
+ 'channel_url': format_field(channel_id, template='https://www.youtube.com/channel/%s'),
'duration': duration,
'view_count': int_or_none(
get_first((video_details, microformats), (..., 'viewCount'))
})
lang_subs.append({
'ext': fmt,
- 'url': update_url_query(base_url, query),
+ 'url': urljoin('https://www.youtube.com', update_url_query(base_url, query)),
'name': sub_name,
})
subtitles, automatic_captions = {}, {}
for lang_code, caption_track in captions.items():
base_url = caption_track.get('baseUrl')
+ orig_lang = parse_qs(base_url).get('lang', [None])[-1]
if not base_url:
continue
lang_name = self._get_text(caption_track, 'name', max_runs=1)
for trans_code, trans_name in translation_languages.items():
if not trans_code:
continue
+ orig_trans_code = trans_code
if caption_track.get('kind') != 'asr':
+ if 'translated_subs' in self._configuration_arg('skip'):
+ continue
trans_code += f'-{lang_code}'
trans_name += format_field(lang_name, template=' from %s')
- process_language(
- automatic_captions, base_url, trans_code, trans_name, {'tlang': trans_code})
+ # Add an "-orig" label to the original language so that it can be distinguished.
+ # The subs are returned without "-orig" as well for compatibility
+ if lang_code == f'a-{orig_trans_code}':
+ process_language(
+ automatic_captions, base_url, f'{trans_code}-orig', f'{trans_name} (Original)', {})
+ # Setting tlang=lang returns damaged subtitles.
+ process_language(automatic_captions, base_url, trans_code, trans_name,
+ {} if orig_lang == orig_trans_code else {'tlang': trans_code})
info['automatic_captions'] = automatic_captions
info['subtitles'] = subtitles
or self._extract_chapters_from_engagement_panel(initial_data, duration)
or None)
- contents = try_get(
- initial_data,
- lambda x: x['contents']['twoColumnWatchNextResults']['results']['results']['contents'],
- list) or []
- for content in contents:
- vpir = content.get('videoPrimaryInfoRenderer')
- if vpir:
- stl = vpir.get('superTitleLink')
- if stl:
- stl = self._get_text(stl)
- if try_get(
- vpir,
- lambda x: x['superTitleIcon']['iconType']) == 'LOCATION_PIN':
- info['location'] = stl
- else:
- mobj = re.search(r'(.+?)\s*S(\d+)\s*•\s*E(\d+)', stl)
- if mobj:
- info.update({
- 'series': mobj.group(1),
- 'season_number': int(mobj.group(2)),
- 'episode_number': int(mobj.group(3)),
- })
- for tlb in (try_get(
- vpir,
- lambda x: x['videoActions']['menuRenderer']['topLevelButtons'],
- list) or []):
- tbr = tlb.get('toggleButtonRenderer') or {}
- for getter, regex in [(
- lambda x: x['defaultText']['accessibility']['accessibilityData'],
- r'(?P<count>[\d,]+)\s*(?P<type>(?:dis)?like)'), ([
- lambda x: x['accessibility'],
- lambda x: x['accessibilityData']['accessibilityData'],
- ], r'(?P<type>(?:dis)?like) this video along with (?P<count>[\d,]+) other people')]:
- label = (try_get(tbr, getter, dict) or {}).get('label')
- if label:
- mobj = re.match(regex, label)
- if mobj:
- info[mobj.group('type') + '_count'] = str_to_int(mobj.group('count'))
- break
- sbr_tooltip = try_get(
- vpir, lambda x: x['sentimentBar']['sentimentBarRenderer']['tooltip'])
- if sbr_tooltip:
- like_count, dislike_count = sbr_tooltip.split(' / ')
+ contents = traverse_obj(
+ initial_data, ('contents', 'twoColumnWatchNextResults', 'results', 'results', 'contents'),
+ expected_type=list, default=[])
+
+ vpir = get_first(contents, 'videoPrimaryInfoRenderer')
+ if vpir:
+ stl = vpir.get('superTitleLink')
+ if stl:
+ stl = self._get_text(stl)
+ if try_get(
+ vpir,
+ lambda x: x['superTitleIcon']['iconType']) == 'LOCATION_PIN':
+ info['location'] = stl
+ else:
+ mobj = re.search(r'(.+?)\s*S(\d+)\s*•\s*E(\d+)', stl)
+ if mobj:
info.update({
- 'like_count': str_to_int(like_count),
- 'dislike_count': str_to_int(dislike_count),
+ 'series': mobj.group(1),
+ 'season_number': int(mobj.group(2)),
+ 'episode_number': int(mobj.group(3)),
})
- vsir = content.get('videoSecondaryInfoRenderer')
- if vsir:
- info['channel'] = self._get_text(vsir, ('owner', 'videoOwnerRenderer', 'title'))
- rows = try_get(
- vsir,
- lambda x: x['metadataRowContainer']['metadataRowContainerRenderer']['rows'],
- list) or []
- multiple_songs = False
- for row in rows:
- if try_get(row, lambda x: x['metadataRowRenderer']['hasDividerLine']) is True:
- multiple_songs = True
+ for tlb in (try_get(
+ vpir,
+ lambda x: x['videoActions']['menuRenderer']['topLevelButtons'],
+ list) or []):
+ tbr = tlb.get('toggleButtonRenderer') or {}
+ for getter, regex in [(
+ lambda x: x['defaultText']['accessibility']['accessibilityData'],
+ r'(?P<count>[\d,]+)\s*(?P<type>(?:dis)?like)'), ([
+ lambda x: x['accessibility'],
+ lambda x: x['accessibilityData']['accessibilityData'],
+ ], r'(?P<type>(?:dis)?like) this video along with (?P<count>[\d,]+) other people')]:
+ label = (try_get(tbr, getter, dict) or {}).get('label')
+ if label:
+ mobj = re.match(regex, label)
+ if mobj:
+ info[mobj.group('type') + '_count'] = str_to_int(mobj.group('count'))
break
- for row in rows:
- mrr = row.get('metadataRowRenderer') or {}
- mrr_title = mrr.get('title')
- if not mrr_title:
- continue
- mrr_title = self._get_text(mrr, 'title')
- mrr_contents_text = self._get_text(mrr, ('contents', 0))
- if mrr_title == 'License':
- info['license'] = mrr_contents_text
- elif not multiple_songs:
- if mrr_title == 'Album':
- info['album'] = mrr_contents_text
- elif mrr_title == 'Artist':
- info['artist'] = mrr_contents_text
- elif mrr_title == 'Song':
- info['track'] = mrr_contents_text
+ sbr_tooltip = try_get(
+ vpir, lambda x: x['sentimentBar']['sentimentBarRenderer']['tooltip'])
+ if sbr_tooltip:
+ like_count, dislike_count = sbr_tooltip.split(' / ')
+ info.update({
+ 'like_count': str_to_int(like_count),
+ 'dislike_count': str_to_int(dislike_count),
+ })
+ vsir = get_first(contents, 'videoSecondaryInfoRenderer')
+ if vsir:
+ vor = traverse_obj(vsir, ('owner', 'videoOwnerRenderer'))
+ info.update({
+ 'channel': self._get_text(vor, 'title'),
+ 'channel_follower_count': self._get_count(vor, 'subscriberCountText')})
+
+ rows = try_get(
+ vsir,
+ lambda x: x['metadataRowContainer']['metadataRowContainerRenderer']['rows'],
+ list) or []
+ multiple_songs = False
+ for row in rows:
+ if try_get(row, lambda x: x['metadataRowRenderer']['hasDividerLine']) is True:
+ multiple_songs = True
+ break
+ for row in rows:
+ mrr = row.get('metadataRowRenderer') or {}
+ mrr_title = mrr.get('title')
+ if not mrr_title:
+ continue
+ mrr_title = self._get_text(mrr, 'title')
+ mrr_contents_text = self._get_text(mrr, ('contents', 0))
+ if mrr_title == 'License':
+ info['license'] = mrr_contents_text
+ elif not multiple_songs:
+ if mrr_title == 'Album':
+ info['album'] = mrr_contents_text
+ elif mrr_title == 'Artist':
+ info['artist'] = mrr_contents_text
+ elif mrr_title == 'Song':
+ info['track'] = mrr_contents_text
fallbacks = {
'channel': 'uploader',
'channel_id': 'uploader_id',
'channel_url': 'uploader_url',
}
+
+ # The upload date for scheduled, live and past live streams / premieres in microformats
+ # may be different from the stream date. Although not in UTC, we will prefer it in this case.
+ # See: https://github.com/yt-dlp/yt-dlp/pull/2223#issuecomment-1008485139
+ upload_date = (
+ unified_strdate(get_first(microformats, 'uploadDate'))
+ or unified_strdate(search_meta('uploadDate')))
+ if not upload_date or (not info.get('is_live') and not info.get('was_live') and info.get('live_status') != 'is_upcoming'):
+ upload_date = strftime_or_none(self._extract_time_text(vpir, 'dateText')[0], '%Y%m%d')
+ info['upload_date'] = upload_date
+
for to, frm in fallbacks.items():
if not info.get(to):
info[to] = info.get(frm)
class YoutubeTabBaseInfoExtractor(YoutubeBaseInfoExtractor):
+ @staticmethod
+ def passthrough_smuggled_data(func):
+ def _smuggle(entries, smuggled_data):
+ for entry in entries:
+ # TODO: Convert URL to music.youtube instead.
+ # Do we need to passthrough any other smuggled_data?
+ entry['url'] = smuggle_url(entry['url'], smuggled_data)
+ yield entry
+
+ @functools.wraps(func)
+ def wrapper(self, url):
+ url, smuggled_data = unsmuggle_url(url, {})
+ if self.is_music_url(url):
+ smuggled_data['is_music_url'] = True
+ info_dict = func(self, url, smuggled_data)
+ if smuggled_data and info_dict.get('entries'):
+ info_dict['entries'] = _smuggle(info_dict['entries'], smuggled_data)
+ return info_dict
+ return wrapper
+
def _extract_channel_id(self, webpage):
channel_id = self._html_search_meta(
'channelId', webpage, 'channel id', default=None)
def _extract_basic_item_renderer(item):
# Modified from _extract_grid_item_renderer
known_basic_renderers = (
- 'playlistRenderer', 'videoRenderer', 'channelRenderer', 'showRenderer'
+ 'playlistRenderer', 'videoRenderer', 'channelRenderer', 'showRenderer', 'reelItemRenderer'
)
for key, renderer in item.items():
if not isinstance(renderer, dict):
ep_url, ie=ie.ie_key(), video_id=ie._match_id(ep_url), video_title=title)
break
+ def _music_reponsive_list_entry(self, renderer):
+ video_id = traverse_obj(renderer, ('playlistItemData', 'videoId'))
+ if video_id:
+ return self.url_result(f'https://music.youtube.com/watch?v={video_id}',
+ ie=YoutubeIE.ie_key(), video_id=video_id)
+ playlist_id = traverse_obj(renderer, ('navigationEndpoint', 'watchEndpoint', 'playlistId'))
+ if playlist_id:
+ video_id = traverse_obj(renderer, ('navigationEndpoint', 'watchEndpoint', 'videoId'))
+ if video_id:
+ return self.url_result(f'https://music.youtube.com/watch?v={video_id}&list={playlist_id}',
+ ie=YoutubeTabIE.ie_key(), video_id=playlist_id)
+ return self.url_result(f'https://music.youtube.com/playlist?list={playlist_id}',
+ ie=YoutubeTabIE.ie_key(), video_id=playlist_id)
+ browse_id = traverse_obj(renderer, ('navigationEndpoint', 'browseEndpoint', 'browseId'))
+ if browse_id:
+ return self.url_result(f'https://music.youtube.com/browse/{browse_id}',
+ ie=YoutubeTabIE.ie_key(), video_id=browse_id)
+
def _shelf_entries_from_content(self, shelf_renderer):
content = shelf_renderer.get('content')
if not isinstance(content, dict):
if video_id:
return self._extract_video(video_renderer)
+ def _hashtag_tile_entry(self, hashtag_tile_renderer):
+ url = urljoin('https://youtube.com', traverse_obj(
+ hashtag_tile_renderer, ('onTapCommand', 'commandMetadata', 'webCommandMetadata', 'url')))
+ if url:
+ return self.url_result(
+ url, ie=YoutubeTabIE.ie_key(), title=self._get_text(hashtag_tile_renderer, 'hashtag'))
+
def _post_thread_entries(self, post_thread_renderer):
post_renderer = try_get(
post_thread_renderer, lambda x: x['post']['backstagePostRenderer'], dict)
if entry:
yield entry
'''
+
def _extract_entries(self, parent_renderer, continuation_list):
# continuation_list is modified in-place with continuation_list = [continuation_token]
continuation_list[:] = [None]
for content in contents:
if not isinstance(content, dict):
continue
- is_renderer = try_get(content, lambda x: x['itemSectionRenderer'], dict)
+ is_renderer = traverse_obj(
+ content, 'itemSectionRenderer', 'musicShelfRenderer', 'musicShelfContinuation',
+ expected_type=dict)
if not is_renderer:
renderer = content.get('richItemRenderer')
if renderer:
known_renderers = {
'playlistVideoListRenderer': self._playlist_entries,
'gridRenderer': self._grid_entries,
- 'shelfRenderer': lambda x: self._shelf_entries(x),
+ 'reelShelfRenderer': self._grid_entries,
+ 'shelfRenderer': self._shelf_entries,
+ 'musicResponsiveListItemRenderer': lambda x: [self._music_reponsive_list_entry(x)],
'backstagePostThreadRenderer': self._post_thread_entries,
'videoRenderer': lambda x: [self._video_entry(x)],
'playlistRenderer': lambda x: self._grid_entries({'items': [{'playlistRenderer': x}]}),
'channelRenderer': lambda x: self._grid_entries({'items': [{'channelRenderer': x}]}),
+ 'hashtagTileRenderer': lambda x: [self._hashtag_tile_entry(x)]
}
for key, renderer in isr_content.items():
if key not in known_renderers:
continue
known_renderers = {
+ 'videoRenderer': (self._grid_entries, 'items'), # for membership tab
'gridPlaylistRenderer': (self._grid_entries, 'items'),
'gridVideoRenderer': (self._grid_entries, 'items'),
'gridChannelRenderer': (self._grid_entries, 'items'),
break
@staticmethod
- def _extract_selected_tab(tabs):
+ def _extract_selected_tab(tabs, fatal=True):
+ """Return the renderer of the first tab whose 'selected' flag is True.
+
+ Each tab's renderer is looked up with dict_get under 'tabRenderer' /
+ 'expandableTabRenderer'. If no tab is selected, raise ExtractorError
+ when fatal is true; otherwise fall through and return None.
+ """
for tab in tabs:
renderer = dict_get(tab, ('tabRenderer', 'expandableTabRenderer')) or {}
if renderer.get('selected') is True:
return renderer
+ # for/else: only reached when the loop finished without returning a renderer
else:
- raise ExtractorError('Unable to find selected tab')
+ if fatal:
+ raise ExtractorError('Unable to find selected tab')
@classmethod
def _extract_uploader(cls, data):
playlist_id = channel_id
tags = renderer.get('keywords', '').split()
- thumbnails = (
- self._extract_thumbnails(renderer, 'avatar')
- or self._extract_thumbnails(
- primary_sidebar_renderer, ('thumbnailRenderer', 'playlistVideoThumbnailRenderer', 'thumbnail')))
+ # We can get the uncropped banner/avatar by replacing the crop params with '=s0'
+ # See: https://github.com/yt-dlp/yt-dlp/issues/2237#issuecomment-1013694714
+ def _get_uncropped(url):
+ return url_or_none((url or '').split('=')[0] + '=s0')
+
+ avatar_thumbnails = self._extract_thumbnails(renderer, 'avatar')
+ if avatar_thumbnails:
+ uncropped_avatar = _get_uncropped(avatar_thumbnails[0]['url'])
+ if uncropped_avatar:
+ avatar_thumbnails.append({
+ 'url': uncropped_avatar,
+ 'id': 'avatar_uncropped',
+ 'preference': 1
+ })
+
+ channel_banners = self._extract_thumbnails(
+ data, ('header', ..., ['banner', 'mobileBanner', 'tvBanner']))
+ for banner in channel_banners:
+ banner['preference'] = -10
+
+ if channel_banners:
+ uncropped_banner = _get_uncropped(channel_banners[0]['url'])
+ if uncropped_banner:
+ channel_banners.append({
+ 'url': uncropped_banner,
+ 'id': 'banner_uncropped',
+ 'preference': -5
+ })
+
+ primary_thumbnails = self._extract_thumbnails(
+ primary_sidebar_renderer, ('thumbnailRenderer', ('playlistVideoThumbnailRenderer', 'playlistCustomThumbnailRenderer'), 'thumbnail'))
if playlist_id is None:
playlist_id = item_id
'uploader': channel_name,
'uploader_id': channel_id,
'uploader_url': channel_url,
- 'thumbnails': thumbnails,
+ 'thumbnails': primary_thumbnails + avatar_thumbnails + channel_banners,
'tags': tags,
'view_count': self._get_count(playlist_stats, 1),
'availability': self._extract_availability(data),
'modified_date': strftime_or_none(last_updated_unix, '%Y%m%d'),
- 'playlist_count': self._get_count(playlist_stats, 0)
+ 'playlist_count': self._get_count(playlist_stats, 0),
+ 'channel_follower_count': self._get_count(data, ('header', ..., 'subscriberCountText')),
}
if not channel_id:
metadata.update(self._extract_uploader(data))
self.report_warning(error_to_compat_str(e))
break
- if dict_get(data, ('contents', 'currentVideoEndpoint')):
+ if dict_get(data, ('contents', 'currentVideoEndpoint', 'onResponseReceivedActions')):
break
last_error = 'Incomplete yt initial data received'
if 'webpage' not in self._configuration_arg('skip'):
webpage, data = self._extract_webpage(url, item_id, fatal=webpage_fatal)
ytcfg = ytcfg or self.extract_ytcfg(item_id, webpage)
+ # Reject webpage data if redirected to home page without explicitly requesting
+ selected_tab = self._extract_selected_tab(traverse_obj(
+ data, ('contents', 'twoColumnBrowseResultsRenderer', 'tabs'), expected_type=list, default=[]), fatal=False) or {}
+ if (url != 'https://www.youtube.com/feed/recommended'
+ and selected_tab.get('tabIdentifier') == 'FEwhat_to_watch' # Home page
+ and 'no-youtube-channel-redirect' not in self.get_param('compat_opts', [])):
+ msg = 'The channel/playlist does not exist and the URL redirected to youtube.com home page'
+ if fatal:
+ raise ExtractorError(msg, expected=True)
+ self.report_warning(msg, only_once=True)
if not data:
if not ytcfg and self.is_authenticated:
msg = 'Playlists that require authentication may not extract correctly without a successful webpage download.'
return self._extract_response(
item_id=item_id, query=params, ep=ep, headers=headers,
ytcfg=ytcfg, fatal=fatal, default_client=default_client,
- check_get_keys=('contents', 'currentVideoEndpoint'))
+ check_get_keys=('contents', 'currentVideoEndpoint', 'onResponseReceivedActions'))
err_note = 'Failed to resolve url (does the playlist exist?)'
if fatal:
raise ExtractorError(err_note, expected=True)
self.report_warning(err_note, item_id)
- @staticmethod
- def _smuggle_data(entries, data):
- for entry in entries:
- if data:
- entry['url'] = smuggle_url(entry['url'], data)
- yield entry
-
_SEARCH_PARAMS = None
- def _search_results(self, query, params=NO_DEFAULT):
+ def _search_results(self, query, params=NO_DEFAULT, default_client='web'):
+ """Yield search result entries for `query`, requesting page after page.
+
+ params: value sent as the 'params' field of the request. NO_DEFAULT
+ selects self._SEARCH_PARAMS; a falsy value sends no 'params' field.
+ default_client: forwarded to _extract_response.
+ Stops once _extract_entries leaves no continuation in continuation_list.
+ """
data = {'query': query}
if params is NO_DEFAULT:
params = self._SEARCH_PARAMS
if params:
data['params'] = params
+
+ # Candidate paths to the result list, handed to traverse_obj as alternatives
+ # (first page, continuation page, and the music-search equivalents)
+ content_keys = (
+ ('contents', 'twoColumnSearchResultsRenderer', 'primaryContents', 'sectionListRenderer', 'contents'),
+ ('onResponseReceivedCommands', 0, 'appendContinuationItemsAction', 'continuationItems'),
+ # ytmusic search
+ ('contents', 'tabbedSearchResultsRenderer', 'tabs', 0, 'tabRenderer', 'content', 'sectionListRenderer', 'contents'),
+ ('continuationContents', ),
+ )
+ # De-duplicated top-level key of every path above; passed to _extract_response
+ check_get_keys = tuple(set(keys[0] for keys in content_keys))
+
continuation_list = [None]
for page_num in itertools.count(1):
+ # Merge the continuation left by the previous page (if any) into the request
data.update(continuation_list[0] or {})
search = self._extract_response(
item_id='query "%s" page %s' % (query, page_num), ep='search', query=data,
- check_get_keys=('contents', 'onResponseReceivedCommands'))
- slr_contents = try_get(
- search,
- (lambda x: x['contents']['twoColumnSearchResultsRenderer']['primaryContents']['sectionListRenderer']['contents'],
- lambda x: x['onResponseReceivedCommands'][0]['appendContinuationItemsAction']['continuationItems']),
- list)
- yield from self._extract_entries({'contents': slr_contents}, continuation_list)
+ default_client=default_client, check_get_keys=check_get_keys)
+ slr_contents = traverse_obj(search, *content_keys)
+ yield from self._extract_entries({'contents': list(variadic(slr_contents))}, continuation_list)
+ # No continuation recorded for this page: this was the last one
if not continuation_list[0]:
break
'tags': ['"критическое', 'мышление"', '"наука', 'просто"', 'математика', '"анализ', 'данных"'],
'channel_url': 'https://www.youtube.com/channel/UCqj7Cz7revf5maW9g5pgNcg',
'uploader_url': 'https://www.youtube.com/channel/UCqj7Cz7revf5maW9g5pgNcg',
+ 'channel_follower_count': int
},
}, {
'note': 'playlists, multipage, different order',
'channel_id': 'UCqj7Cz7revf5maW9g5pgNcg',
'channel': 'Igor Kleiner',
'channel_url': 'https://www.youtube.com/channel/UCqj7Cz7revf5maW9g5pgNcg',
+ 'channel_follower_count': int
},
}, {
'note': 'playlists, series',
'channel': '3Blue1Brown',
'channel_id': 'UCYO_jab_esuFRV4b17AJtAw',
'tags': ['Mathematics'],
+ 'channel_follower_count': int
},
}, {
'note': 'playlists, singlepage',
'channel_id': 'UCAEtajcuhQ6an9WEzY9LEMQ',
'tags': 'count:13',
'channel': 'ThirstForScience',
+ 'channel_follower_count': int
}
}, {
'url': 'https://www.youtube.com/c/ChristophLaimer/playlists',
'uploader_url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w',
'channel_url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w',
'channel_id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
+ 'channel_follower_count': int
},
'playlist_mincount': 2,
}, {
'channel_id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
'uploader_url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w',
'channel': 'lex will',
+ 'channel_follower_count': int
},
'playlist_mincount': 975,
}, {
'channel': 'lex will',
'tags': ['bible', 'history', 'prophesy'],
'channel_url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w',
+ 'channel_follower_count': int
},
'playlist_mincount': 199,
}, {
'channel_url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w',
'channel_id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
'tags': ['bible', 'history', 'prophesy'],
+ 'channel_follower_count': int
},
'playlist_mincount': 17,
}, {
'channel_url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w',
'channel_id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
'tags': ['bible', 'history', 'prophesy'],
+ 'channel_follower_count': int
},
'playlist_mincount': 18,
}, {
'channel_url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w',
'channel_id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
'tags': ['bible', 'history', 'prophesy'],
+ 'channel_follower_count': int
},
'playlist_mincount': 12,
}, {
'tags': ['Mathematics'],
'channel': '3Blue1Brown',
'channel_id': 'UCYO_jab_esuFRV4b17AJtAw',
+ 'channel_follower_count': int
},
}, {
'url': 'https://invidio.us/channel/UCmlqkdCBesrv2Lak1mF_MxA',
}, {
'url': 'https://www.youtube.com/channel/UCoMdktPbSTixAyNGwb-UYkQ/live',
'info_dict': {
- 'id': 'zpsbVPFwsqk', # This will keep changing
+ 'id': 'GgL890LIznQ', # This will keep changing
'ext': 'mp4',
'title': str,
'uploader': 'Sky News',
'categories': ['News & Politics'],
'tags': list,
'like_count': int,
- 'release_timestamp': 1640164857,
+ 'release_timestamp': 1642502819,
'channel': 'Sky News',
'channel_id': 'UCoMdktPbSTixAyNGwb-UYkQ',
'age_limit': 0,
'view_count': int,
- 'thumbnail': 'https://i.ytimg.com/vi/zpsbVPFwsqk/maxresdefault_live.jpg',
+ 'thumbnail': 'https://i.ytimg.com/vi/GgL890LIznQ/maxresdefault_live.jpg',
'playable_in_embed': True,
- 'release_date': '20211222',
+ 'release_date': '20220118',
'availability': 'public',
'live_status': 'is_live',
'channel_url': 'https://www.youtube.com/channel/UCoMdktPbSTixAyNGwb-UYkQ',
+ 'channel_follower_count': int
},
'params': {
'skip_download': True,
'info_dict': {
'id': 'recommended',
'title': 'recommended',
+ 'tags': [],
},
'playlist_mincount': 50,
'params': {
'tags': [],
'channel_url': 'https://www.youtube.com/channel/UCu6mSoMNzHQiBIOCkHUa2Aw',
'uploader_url': 'https://www.youtube.com/channel/UCu6mSoMNzHQiBIOCkHUa2Aw',
+ 'channel_follower_count': int
},
'playlist_mincount': 650,
'params': {
'skip_download': True,
'extractor_args': {'youtubetab': {'skip': ['webpage']}}
},
+ }, {
+ 'note': 'non-standard redirect to regional channel',
+ 'url': 'https://www.youtube.com/channel/UCwVVpHQ2Cs9iGJfpdFngePQ',
+ 'only_matching': True
}]
@classmethod
return False if YoutubeIE.suitable(url) else super(
YoutubeTabIE, cls).suitable(url)
- def _real_extract(self, url):
- url, smuggled_data = unsmuggle_url(url, {})
- if self.is_music_url(url):
- smuggled_data['is_music_url'] = True
- info_dict = self.__real_extract(url, smuggled_data)
- if info_dict.get('entries'):
- info_dict['entries'] = self._smuggle_data(info_dict['entries'], smuggled_data)
- return info_dict
-
_URL_RE = re.compile(rf'(?P<pre>{_VALID_URL})(?(not_channel)|(?P<tab>/\w+))?(?P<post>.*)$')
- def __real_extract(self, url, smuggled_data):
+ @YoutubeTabBaseInfoExtractor.passthrough_smuggled_data
+ def _real_extract(self, url, smuggled_data):
item_id = self._match_id(url)
url = compat_urlparse.urlunparse(
compat_urlparse.urlparse(url)._replace(netloc='www.youtube.com'))
data, ytcfg = self._extract_data(url, item_id)
+ # YouTube may provide a non-standard redirect to the regional channel
+ # See: https://github.com/yt-dlp/yt-dlp/issues/2694
+ redirect_url = traverse_obj(
+ data, ('onResponseReceivedActions', ..., 'navigateAction', 'endpoint', 'commandMetadata', 'webCommandMetadata', 'url'), get_all=False)
+ if redirect_url and 'no-youtube-channel-redirect' not in compat_opts:
+ redirect_url = ''.join((
+ urljoin('https://www.youtube.com', redirect_url), mobj['tab'], mobj['post']))
+ self.to_screen(f'This playlist is likely not available in your region. Following redirect to regional playlist {redirect_url}')
+ return self.url_result(redirect_url, ie=YoutubeTabIE.ie_key())
+
tabs = traverse_obj(data, ('contents', 'twoColumnBrowseResultsRenderer', 'tabs'), expected_type=list)
if tabs:
selected_tab = self._extract_selected_tab(tabs)
IE_NAME = 'youtube:search'
_SEARCH_KEY = 'ytsearch'
_SEARCH_PARAMS = 'EgIQAQ%3D%3D' # Videos only
- _TESTS = []
+ _TESTS = [{
+ 'url': 'ytsearch5:youtube-dl test video',
+ 'playlist_count': 5,
+ 'info_dict': {
+ 'id': 'youtube-dl test video',
+ 'title': 'youtube-dl test video',
+ }
+ }]
class YoutubeSearchDateIE(YoutubeTabBaseInfoExtractor, SearchInfoExtractor):
_SEARCH_KEY = 'ytsearchdate'
IE_DESC = 'YouTube search, newest videos first'
_SEARCH_PARAMS = 'CAISAhAB' # Videos only, sorted by date
+ _TESTS = [{
+ 'url': 'ytsearchdate5:youtube-dl test video',
+ 'playlist_count': 5,
+ 'info_dict': {
+ 'id': 'youtube-dl test video',
+ 'title': 'youtube-dl test video',
+ }
+ }]
class YoutubeSearchURLIE(YoutubeTabBaseInfoExtractor):
IE_DESC = 'YouTube search URLs with sorting and filter support'
IE_NAME = YoutubeSearchIE.IE_NAME + '_url'
- _VALID_URL = r'https?://(?:www\.)?youtube\.com/results\?(.*?&)?(?:search_query|q)=(?:[^&]+)(?:[&]|$)'
+ _VALID_URL = r'https?://(?:www\.)?youtube\.com/(?:results|search)\?([^#]+&)?(?:search_query|q)=(?:[^&]+)(?:[&#]|$)'
_TESTS = [{
'url': 'https://www.youtube.com/results?baz=bar&search_query=youtube-dl+test+video&filters=video&lclk=video',
'playlist_mincount': 5,
'id': 'python',
'title': 'python',
}
-
+ }, {
+ 'url': 'https://www.youtube.com/results?search_query=%23cats',
+ 'playlist_mincount': 1,
+ 'info_dict': {
+ 'id': '#cats',
+ 'title': '#cats',
+ 'entries': [{
+ 'url': r're:https://(www\.)?youtube\.com/hashtag/cats',
+ 'title': '#cats',
+ }],
+ },
}, {
'url': 'https://www.youtube.com/results?q=test&sp=EgQIBBgB',
'only_matching': True,
return self.playlist_result(self._search_results(query, qs.get('sp', (None,))[0]), query, query)
-class YoutubeFeedsInfoExtractor(YoutubeTabIE):
+class YoutubeMusicSearchURLIE(YoutubeTabBaseInfoExtractor):
+ IE_DESC = 'YouTube music search URLs with selectable sections (Eg: #songs)'
+ IE_NAME = 'youtube:music:search_url'
+ _VALID_URL = r'https?://music\.youtube\.com/search\?([^#]+&)?(?:search_query|q)=(?:[^&]+)(?:[&#]|$)'
+ _TESTS = [{
+ 'url': 'https://music.youtube.com/search?q=royalty+free+music',
+ 'playlist_count': 16,
+ 'info_dict': {
+ 'id': 'royalty free music',
+ 'title': 'royalty free music',
+ }
+ }, {
+ 'url': 'https://music.youtube.com/search?q=royalty+free+music&sp=EgWKAQIIAWoKEAoQAxAEEAkQBQ%3D%3D',
+ 'playlist_mincount': 30,
+ 'info_dict': {
+ 'id': 'royalty free music - songs',
+ 'title': 'royalty free music - songs',
+ },
+ 'params': {'extract_flat': 'in_playlist'}
+ }, {
+ 'url': 'https://music.youtube.com/search?q=royalty+free+music#community+playlists',
+ 'playlist_mincount': 30,
+ 'info_dict': {
+ 'id': 'royalty free music - community playlists',
+ 'title': 'royalty free music - community playlists',
+ },
+ 'params': {'extract_flat': 'in_playlist'}
+ }]
+
+ _SECTIONS = {
+ 'albums': 'EgWKAQIYAWoKEAoQAxAEEAkQBQ==',
+ 'artists': 'EgWKAQIgAWoKEAoQAxAEEAkQBQ==',
+ 'community playlists': 'EgeKAQQoAEABagoQChADEAQQCRAF',
+ 'featured playlists': 'EgeKAQQoADgBagwQAxAJEAQQDhAKEAU==',
+ 'songs': 'EgWKAQIIAWoKEAoQAxAEEAkQBQ==',
+ 'videos': 'EgWKAQIQAWoKEAoQAxAEEAkQBQ==',
+ }
+
+ def _real_extract(self, url):
+ qs = parse_qs(url)
+ query = (qs.get('search_query') or qs.get('q'))[0]
+ params = qs.get('sp', (None,))[0]
+ if params:
+ section = next((k for k, v in self._SECTIONS.items() if v == params), params)
+ else:
+ section = compat_urllib_parse_unquote_plus((url.split('#') + [''])[1]).lower()
+ params = self._SECTIONS.get(section)
+ if not params:
+ section = None
+ title = join_nonempty(query, section, delim=' - ')
+ return self.playlist_result(self._search_results(query, params, default_client='web_music'), title, title)
+
+
+class YoutubeFeedsInfoExtractor(InfoExtractor):
"""
Base class for feed extractors
Subclasses must define the _FEED_NAME property.
def _real_extract(self, url):
return self.url_result(
- 'https://www.youtube.com/feed/%s' % self._FEED_NAME,
- ie=YoutubeTabIE.ie_key())
+ f'https://www.youtube.com/feed/{self._FEED_NAME}', ie=YoutubeTabIE.ie_key())
class YoutubeWatchLaterIE(InfoExtractor):