X-Git-Url: https://jfr.im/git/yt-dlp.git/blobdiff_plain/471d0367c76e1413bb35e0be45765a277e469ee2..fbb73833067ba742459729809679a62f34b3e41e:/yt_dlp/extractor/youtube.py diff --git a/yt_dlp/extractor/youtube.py b/yt_dlp/extractor/youtube.py index ccb41cb2e..9dde34fb0 100644 --- a/yt_dlp/extractor/youtube.py +++ b/yt_dlp/extractor/youtube.py @@ -1,7 +1,9 @@ import base64 import calendar +import collections import copy import datetime +import enum import hashlib import itertools import json @@ -13,29 +15,24 @@ import threading import time import traceback +import urllib.error +import urllib.parse from .common import InfoExtractor, SearchInfoExtractor -from ..compat import functools # isort: split -from ..compat import ( - compat_chr, - compat_HTTPError, - compat_parse_qs, - compat_str, - compat_urllib_parse_unquote_plus, - compat_urllib_parse_urlencode, - compat_urllib_parse_urlparse, - compat_urlparse, -) +from .openload import PhantomJSwrapper +from ..compat import functools from ..jsinterp import JSInterpreter from ..utils import ( NO_DEFAULT, ExtractorError, + LazyList, + UserNotLive, bug_reports_message, classproperty, clean_html, datetime_from_str, dict_get, - error_to_compat_str, + filter_dict, float_or_none, format_field, get_first, @@ -52,7 +49,6 @@ parse_iso8601, parse_qs, qualities, - remove_end, remove_start, smuggle_url, str_or_none, @@ -70,14 +66,14 @@ variadic, ) -# any clients starting with _ cannot be explicity requested by the user +# any clients starting with _ cannot be explicitly requested by the user INNERTUBE_CLIENTS = { 'web': { 'INNERTUBE_API_KEY': 'AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8', 'INNERTUBE_CONTEXT': { 'client': { 'clientName': 'WEB', - 'clientVersion': '2.20211221.00.00', + 'clientVersion': '2.20220801.00.00', } }, 'INNERTUBE_CONTEXT_CLIENT_NAME': 1 @@ -87,7 +83,7 @@ 'INNERTUBE_CONTEXT': { 'client': { 'clientName': 'WEB_EMBEDDED_PLAYER', - 'clientVersion': '1.20211215.00.01', + 'clientVersion': '1.20220731.00.00', }, }, 
'INNERTUBE_CONTEXT_CLIENT_NAME': 56 @@ -98,7 +94,7 @@ 'INNERTUBE_CONTEXT': { 'client': { 'clientName': 'WEB_REMIX', - 'clientVersion': '1.20211213.00.00', + 'clientVersion': '1.20220727.01.00', } }, 'INNERTUBE_CONTEXT_CLIENT_NAME': 67, @@ -108,7 +104,7 @@ 'INNERTUBE_CONTEXT': { 'client': { 'clientName': 'WEB_CREATOR', - 'clientVersion': '1.20211220.02.00', + 'clientVersion': '1.20220726.00.00', } }, 'INNERTUBE_CONTEXT_CLIENT_NAME': 62, @@ -118,7 +114,9 @@ 'INNERTUBE_CONTEXT': { 'client': { 'clientName': 'ANDROID', - 'clientVersion': '16.49', + 'clientVersion': '17.31.35', + 'androidSdkVersion': 30, + 'userAgent': 'com.google.android.youtube/17.31.35 (Linux; U; Android 11) gzip' } }, 'INNERTUBE_CONTEXT_CLIENT_NAME': 3, @@ -129,7 +127,9 @@ 'INNERTUBE_CONTEXT': { 'client': { 'clientName': 'ANDROID_EMBEDDED_PLAYER', - 'clientVersion': '16.49', + 'clientVersion': '17.31.35', + 'androidSdkVersion': 30, + 'userAgent': 'com.google.android.youtube/17.31.35 (Linux; U; Android 11) gzip' }, }, 'INNERTUBE_CONTEXT_CLIENT_NAME': 55, @@ -140,7 +140,9 @@ 'INNERTUBE_CONTEXT': { 'client': { 'clientName': 'ANDROID_MUSIC', - 'clientVersion': '4.57', + 'clientVersion': '5.16.51', + 'androidSdkVersion': 30, + 'userAgent': 'com.google.android.apps.youtube.music/5.16.51 (Linux; U; Android 11) gzip' } }, 'INNERTUBE_CONTEXT_CLIENT_NAME': 21, @@ -151,7 +153,9 @@ 'INNERTUBE_CONTEXT': { 'client': { 'clientName': 'ANDROID_CREATOR', - 'clientVersion': '21.47', + 'clientVersion': '22.30.100', + 'androidSdkVersion': 30, + 'userAgent': 'com.google.android.apps.youtube.creator/22.30.100 (Linux; U; Android 11) gzip' }, }, 'INNERTUBE_CONTEXT_CLIENT_NAME': 14, @@ -164,8 +168,9 @@ 'INNERTUBE_CONTEXT': { 'client': { 'clientName': 'IOS', - 'clientVersion': '16.46', + 'clientVersion': '17.33.2', 'deviceModel': 'iPhone14,3', + 'userAgent': 'com.google.ios.youtube/17.33.2 (iPhone14,3; U; CPU iOS 15_6 like Mac OS X)' } }, 'INNERTUBE_CONTEXT_CLIENT_NAME': 5, @@ -175,8 +180,9 @@ 'INNERTUBE_CONTEXT': { 'client': 
{ 'clientName': 'IOS_MESSAGES_EXTENSION', - 'clientVersion': '16.46', + 'clientVersion': '17.33.2', 'deviceModel': 'iPhone14,3', + 'userAgent': 'com.google.ios.youtube/17.33.2 (iPhone14,3; U; CPU iOS 15_6 like Mac OS X)' }, }, 'INNERTUBE_CONTEXT_CLIENT_NAME': 66, @@ -187,7 +193,9 @@ 'INNERTUBE_CONTEXT': { 'client': { 'clientName': 'IOS_MUSIC', - 'clientVersion': '4.57', + 'clientVersion': '5.21', + 'deviceModel': 'iPhone14,3', + 'userAgent': 'com.google.ios.youtubemusic/5.21 (iPhone14,3; U; CPU iOS 15_6 like Mac OS X)' }, }, 'INNERTUBE_CONTEXT_CLIENT_NAME': 26, @@ -197,7 +205,9 @@ 'INNERTUBE_CONTEXT': { 'client': { 'clientName': 'IOS_CREATOR', - 'clientVersion': '21.47', + 'clientVersion': '22.33.101', + 'deviceModel': 'iPhone14,3', + 'userAgent': 'com.google.ios.ytcreator/22.33.101 (iPhone14,3; U; CPU iOS 15_6 like Mac OS X)' }, }, 'INNERTUBE_CONTEXT_CLIENT_NAME': 15, @@ -210,7 +220,7 @@ 'INNERTUBE_CONTEXT': { 'client': { 'clientName': 'MWEB', - 'clientVersion': '2.20211221.01.00', + 'clientVersion': '2.20220801.00.00', } }, 'INNERTUBE_CONTEXT_CLIENT_NAME': 2 @@ -269,14 +279,23 @@ def build_innertube_clients(): build_innertube_clients() +class BadgeType(enum.Enum): + AVAILABILITY_UNLISTED = enum.auto() + AVAILABILITY_PRIVATE = enum.auto() + AVAILABILITY_PUBLIC = enum.auto() + AVAILABILITY_PREMIUM = enum.auto() + AVAILABILITY_SUBSCRIPTION = enum.auto() + LIVE_NOW = enum.auto() + + class YoutubeBaseInfoExtractor(InfoExtractor): """Provide base functions for Youtube extractors""" _RESERVED_NAMES = ( r'channel|c|user|playlist|watch|w|v|embed|e|watch_popup|clip|' r'shorts|movies|results|search|shared|hashtag|trending|explore|feed|feeds|' - r'browse|oembed|get_video_info|iframe_api|s/player|' - r'storefront|oops|index|account|reporthistory|t/terms|about|upload|signin|logout') + r'browse|oembed|get_video_info|iframe_api|s/player|source|' + r'storefront|oops|index|account|t/terms|about|upload|signin|logout') _PLAYLIST_ID_RE = 
r'(?:(?:PL|LL|EC|UU|FL|RD|UL|TL|PU|OLAK5uy_)[0-9A-Za-z-_]{10,}|RDMM|WL|LL|LM)' @@ -351,16 +370,60 @@ class YoutubeBaseInfoExtractor(InfoExtractor): r'(?:www\.)?hpniueoejy4opn7bc4ftgazyqjoeqwlvh2uiku2xqku6zpoa4bf5ruid\.onion', # piped instances from https://github.com/TeamPiped/Piped/wiki/Instances r'(?:www\.)?piped\.kavin\.rocks', - r'(?:www\.)?piped\.silkky\.cloud', r'(?:www\.)?piped\.tokhmi\.xyz', - r'(?:www\.)?piped\.moomoo\.me', - r'(?:www\.)?il\.ax', - r'(?:www\.)?piped\.syncpundit\.com', + r'(?:www\.)?piped\.syncpundit\.io', r'(?:www\.)?piped\.mha\.fi', + r'(?:www\.)?watch\.whatever\.social', + r'(?:www\.)?piped\.garudalinux\.org', + r'(?:www\.)?piped\.rivo\.lol', + r'(?:www\.)?piped-libre\.kavin\.rocks', + r'(?:www\.)?yt\.jae\.fi', r'(?:www\.)?piped\.mint\.lgbt', - r'(?:www\.)?piped\.privacy\.com\.de', + r'(?:www\.)?il\.ax', + r'(?:www\.)?piped\.esmailelbob\.xyz', + r'(?:www\.)?piped\.projectsegfau\.lt', + r'(?:www\.)?piped\.privacydev\.net', + r'(?:www\.)?piped\.palveluntarjoaja\.eu', + r'(?:www\.)?piped\.smnz\.de', + r'(?:www\.)?piped\.adminforge\.de', + r'(?:www\.)?watch\.whatevertinfoil\.de', + r'(?:www\.)?piped\.qdi\.fi', + r'(?:www\.)?piped\.video', + r'(?:www\.)?piped\.aeong\.one', ) + # extracted from account/account_menu ep + # XXX: These are the supported YouTube UI and API languages, + # which is slightly different from languages supported for translation in YouTube studio + _SUPPORTED_LANG_CODES = [ + 'af', 'az', 'id', 'ms', 'bs', 'ca', 'cs', 'da', 'de', 'et', 'en-IN', 'en-GB', 'en', 'es', + 'es-419', 'es-US', 'eu', 'fil', 'fr', 'fr-CA', 'gl', 'hr', 'zu', 'is', 'it', 'sw', 'lv', + 'lt', 'hu', 'nl', 'no', 'uz', 'pl', 'pt-PT', 'pt', 'ro', 'sq', 'sk', 'sl', 'sr-Latn', 'fi', + 'sv', 'vi', 'tr', 'be', 'bg', 'ky', 'kk', 'mk', 'mn', 'ru', 'sr', 'uk', 'el', 'hy', 'iw', + 'ur', 'ar', 'fa', 'ne', 'mr', 'hi', 'as', 'bn', 'pa', 'gu', 'or', 'ta', 'te', 'kn', 'ml', + 'si', 'th', 'lo', 'my', 'ka', 'am', 'km', 'zh-CN', 'zh-TW', 'zh-HK', 'ja', 'ko' + ] + + 
_IGNORED_WARNINGS = {'Unavailable videos will be hidden during playback'} + + @functools.cached_property + def _preferred_lang(self): + """ + Returns a language code supported by YouTube for the user preferred language. + Returns None if no preferred language set. + """ + preferred_lang = self._configuration_arg('lang', ie_key='Youtube', casesense=True, default=[''])[0] + if not preferred_lang: + return + if preferred_lang not in self._SUPPORTED_LANG_CODES: + raise ExtractorError( + f'Unsupported language code: {preferred_lang}. Supported language codes (case-sensitive): {join_nonempty(*self._SUPPORTED_LANG_CODES, delim=", ")}.', + expected=True) + elif preferred_lang != 'en': + self.report_warning( + f'Preferring "{preferred_lang}" translated fields. Note that some metadata extraction may fail or be incorrect.') + return preferred_lang + def _initialize_consent(self): cookies = self._get_cookies('https://www.youtube.com/') if cookies.get('__Secure-3PSID'): @@ -382,11 +445,11 @@ def _initialize_pref(self): pref = {} if pref_cookie: try: - pref = dict(compat_urlparse.parse_qsl(pref_cookie.value)) + pref = dict(urllib.parse.parse_qsl(pref_cookie.value)) except ValueError: self.report_warning('Failed to parse user PREF cookie' + bug_reports_message()) - pref.update({'hl': 'en', 'tz': 'UTC'}) - self._set_cookie('.youtube.com', name='PREF', value=compat_urllib_parse_urlencode(pref)) + pref.update({'hl': self._preferred_lang or 'en', 'tz': 'UTC'}) + self._set_cookie('.youtube.com', name='PREF', value=urllib.parse.urlencode(pref)) def _real_initialize(self): self._initialize_pref() @@ -414,26 +477,26 @@ def _ytcfg_get_safe(self, ytcfg, getter, expected_type=None, default_client='web def _extract_client_name(self, ytcfg, default_client='web'): return self._ytcfg_get_safe( ytcfg, (lambda x: x['INNERTUBE_CLIENT_NAME'], - lambda x: x['INNERTUBE_CONTEXT']['client']['clientName']), compat_str, default_client) + lambda x: x['INNERTUBE_CONTEXT']['client']['clientName']), str, 
default_client) def _extract_client_version(self, ytcfg, default_client='web'): return self._ytcfg_get_safe( ytcfg, (lambda x: x['INNERTUBE_CLIENT_VERSION'], - lambda x: x['INNERTUBE_CONTEXT']['client']['clientVersion']), compat_str, default_client) + lambda x: x['INNERTUBE_CONTEXT']['client']['clientVersion']), str, default_client) def _select_api_hostname(self, req_api_hostname, default_client=None): return (self._configuration_arg('innertube_host', [''], ie_key=YoutubeIE.ie_key())[0] or req_api_hostname or self._get_innertube_host(default_client or 'web')) def _extract_api_key(self, ytcfg=None, default_client='web'): - return self._ytcfg_get_safe(ytcfg, lambda x: x['INNERTUBE_API_KEY'], compat_str, default_client) + return self._ytcfg_get_safe(ytcfg, lambda x: x['INNERTUBE_API_KEY'], str, default_client) def _extract_context(self, ytcfg=None, default_client='web'): context = get_first( (ytcfg, self._get_default_ytcfg(default_client)), 'INNERTUBE_CONTEXT', expected_type=dict) # Enforce language and tz for extraction client_context = traverse_obj(context, 'client', expected_type=dict, default={}) - client_context.update({'hl': 'en', 'timeZone': 'UTC', 'utcOffsetMinutes': 0}) + client_context.update({'hl': self._preferred_lang or 'en', 'timeZone': 'UTC', 'utcOffsetMinutes': 0}) return context _SAPISID = None @@ -498,7 +561,7 @@ def _extract_session_index(*data): # Deprecated? 
def _extract_identity_token(self, ytcfg=None, webpage=None): if ytcfg: - token = try_get(ytcfg, lambda x: x['ID_TOKEN'], compat_str) + token = try_get(ytcfg, lambda x: x['ID_TOKEN'], str) if token: return token if webpage: @@ -514,12 +577,12 @@ def _extract_account_syncid(*args): """ for data in args: # ytcfg includes channel_syncid if on secondary channel - delegated_sid = try_get(data, lambda x: x['DELEGATED_SESSION_ID'], compat_str) + delegated_sid = try_get(data, lambda x: x['DELEGATED_SESSION_ID'], str) if delegated_sid: return delegated_sid sync_ids = (try_get( data, (lambda x: x['responseContext']['mainAppWebResponseContext']['datasyncId'], - lambda x: x['DATASYNC_ID']), compat_str) or '').split('||') + lambda x: x['DATASYNC_ID']), str) or '').split('||') if len(sync_ids) >= 2 and sync_ids[1]: # datasyncid is of the form "channel_syncid||user_syncid" for secondary channel # and just "user_syncid||" for primary channel. We only want the channel_syncid @@ -553,13 +616,14 @@ def generate_api_headers( origin = 'https://' + (self._select_api_hostname(api_hostname, default_client)) headers = { - 'X-YouTube-Client-Name': compat_str( + 'X-YouTube-Client-Name': str( self._ytcfg_get_safe(ytcfg, lambda x: x['INNERTUBE_CONTEXT_CLIENT_NAME'], default_client=default_client)), 'X-YouTube-Client-Version': self._extract_client_version(ytcfg, default_client), 'Origin': origin, 'X-Youtube-Identity-Token': identity_token or self._extract_identity_token(ytcfg), 'X-Goog-PageId': account_syncid or self._extract_account_syncid(ytcfg), - 'X-Goog-Visitor-Id': visitor_data or self._extract_visitor_data(ytcfg) + 'X-Goog-Visitor-Id': visitor_data or self._extract_visitor_data(ytcfg), + 'User-Agent': self._ytcfg_get_safe(ytcfg, lambda x: x['INNERTUBE_CONTEXT']['client']['userAgent'], default_client=default_client) } if session_index is None: session_index = self._extract_session_index(ytcfg) @@ -570,7 +634,7 @@ def generate_api_headers( if auth is not None: headers['Authorization'] = 
auth headers['X-Origin'] = origin - return {h: v for h, v in headers.items() if v is not None} + return filter_dict(headers) def _download_ytcfg(self, client, video_id): url = { @@ -613,7 +677,7 @@ def _extract_next_continuation_data(cls, renderer): def _extract_continuation_ep_data(cls, continuation_ep: dict): if isinstance(continuation_ep, dict): continuation = try_get( - continuation_ep, lambda x: x['continuationCommand']['token'], compat_str) + continuation_ep, lambda x: x['continuationCommand']['token'], str) if not continuation: return ctp = continuation_ep.get('clickTrackingParams') @@ -625,20 +689,10 @@ def _extract_continuation(cls, renderer): if next_continuation: return next_continuation - contents = [] - for key in ('contents', 'items'): - contents.extend(try_get(renderer, lambda x: x[key], list) or []) - - for content in contents: - if not isinstance(content, dict): - continue - continuation_ep = try_get( - content, (lambda x: x['continuationItemRenderer']['continuationEndpoint'], - lambda x: x['continuationItemRenderer']['button']['buttonRenderer']['command']), - dict) - continuation = cls._extract_continuation_ep_data(continuation_ep) - if continuation: - return continuation + return traverse_obj(renderer, ( + ('contents', 'items', 'rows'), ..., 'continuationItemRenderer', + ('continuationEndpoint', ('button', 'buttonRenderer', 'command')) + ), get_all=False, expected_type=cls._extract_continuation_ep_data) @classmethod def _extract_alerts(cls, data): @@ -654,12 +708,11 @@ def _extract_alerts(cls, data): yield alert_type, message def _report_alerts(self, alerts, expected=True, fatal=True, only_once=False): - errors = [] - warnings = [] + errors, warnings = [], [] for alert_type, alert_message in alerts: if alert_type.lower() == 'error' and fatal: errors.append([alert_type, alert_message]) - else: + elif alert_message not in self._IGNORED_WARNINGS: warnings.append([alert_type, alert_message]) for alert_type, alert_message in (warnings + errors[:-1]): 
@@ -671,13 +724,49 @@ def _extract_and_report_alerts(self, data, *args, **kwargs): return self._report_alerts(self._extract_alerts(data), *args, **kwargs) def _extract_badges(self, renderer: dict): - badges = set() - for badge in try_get(renderer, lambda x: x['badges'], list) or []: - label = try_get(badge, lambda x: x['metadataBadgeRenderer']['label'], compat_str) - if label: - badges.add(label.lower()) + privacy_icon_map = { + 'PRIVACY_UNLISTED': BadgeType.AVAILABILITY_UNLISTED, + 'PRIVACY_PRIVATE': BadgeType.AVAILABILITY_PRIVATE, + 'PRIVACY_PUBLIC': BadgeType.AVAILABILITY_PUBLIC + } + + badge_style_map = { + 'BADGE_STYLE_TYPE_MEMBERS_ONLY': BadgeType.AVAILABILITY_SUBSCRIPTION, + 'BADGE_STYLE_TYPE_PREMIUM': BadgeType.AVAILABILITY_PREMIUM, + 'BADGE_STYLE_TYPE_LIVE_NOW': BadgeType.LIVE_NOW + } + + label_map = { + 'unlisted': BadgeType.AVAILABILITY_UNLISTED, + 'private': BadgeType.AVAILABILITY_PRIVATE, + 'members only': BadgeType.AVAILABILITY_SUBSCRIPTION, + 'live': BadgeType.LIVE_NOW, + 'premium': BadgeType.AVAILABILITY_PREMIUM + } + + badges = [] + for badge in traverse_obj(renderer, ('badges', ..., 'metadataBadgeRenderer'), default=[]): + badge_type = ( + privacy_icon_map.get(traverse_obj(badge, ('icon', 'iconType'), expected_type=str)) + or badge_style_map.get(traverse_obj(badge, 'style')) + ) + if badge_type: + badges.append({'type': badge_type}) + continue + + # fallback, won't work in some languages + label = traverse_obj(badge, 'label', expected_type=str, default='') + for match, label_badge_type in label_map.items(): + if match in label.lower(): + badges.append({'type': badge_type}) + continue + return badges + @staticmethod + def _has_badge(badges, badge_type): + return bool(traverse_obj(badges, lambda _, v: v['type'] == badge_type)) + @staticmethod def _get_text(data, *path_list, max_runs=None): for path in path_list or [None]: @@ -688,7 +777,7 @@ def _get_text(data, *path_list, max_runs=None): if not any(key is ... 
or isinstance(key, (list, tuple)) for key in variadic(path)): obj = [obj] for item in obj: - text = try_get(item, lambda x: x['simpleText'], compat_str) + text = try_get(item, lambda x: x['simpleText'], str) if text: return text runs = try_get(item, lambda x: x['runs'], list) or [] @@ -748,9 +837,9 @@ def extract_relative_time(relative_time_text): except ValueError: return None - def _extract_time_text(self, renderer, *path_list): - """@returns (timestamp, time_text)""" - text = self._get_text(renderer, *path_list) or '' + def _parse_time_text(self, text): + if not text: + return dt = self.extract_relative_time(text) timestamp = None if isinstance(dt, datetime.datetime): @@ -763,81 +852,62 @@ def _extract_time_text(self, renderer, *path_list): (r'([a-z]+\s*\d{1,2},?\s*20\d{2})', r'(?:.+|^)(?:live|premieres|ed|ing)(?:\s*(?:on|for))?\s*(.+\d)'), text.lower(), 'time text', default=None))) - if text and timestamp is None: - self.report_warning(f"Cannot parse localized time text '{text}'" + bug_reports_message(), only_once=True) - return timestamp, text + if text and timestamp is None and self._preferred_lang in (None, 'en'): + self.report_warning( + f'Cannot parse localized time text "{text}"', only_once=True) + return timestamp def _extract_response(self, item_id, query, note='Downloading API JSON', headers=None, ytcfg=None, check_get_keys=None, ep='browse', fatal=True, api_hostname=None, default_client='web'): - response = None - last_error = None - count = -1 - retries = self.get_param('extractor_retries', 3) - if check_get_keys is None: - check_get_keys = [] - while count < retries: - count += 1 - if last_error: - self.report_warning('%s. Retrying ...' 
% remove_end(last_error, '.')) + for retry in self.RetryManager(): try: response = self._call_api( ep=ep, fatal=True, headers=headers, - video_id=item_id, query=query, + video_id=item_id, query=query, note=note, context=self._extract_context(ytcfg, default_client), api_key=self._extract_api_key(ytcfg, default_client), - api_hostname=api_hostname, default_client=default_client, - note='%s%s' % (note, ' (retry #%d)' % count if count else '')) + api_hostname=api_hostname, default_client=default_client) except ExtractorError as e: - if isinstance(e.cause, network_exceptions): - if isinstance(e.cause, compat_HTTPError): - first_bytes = e.cause.read(512) - if not is_html(first_bytes): - yt_error = try_get( - self._parse_json( - self._webpage_read_content(e.cause, None, item_id, prefix=first_bytes) or '{}', item_id, fatal=False), - lambda x: x['error']['message'], compat_str) - if yt_error: - self._report_alerts([('ERROR', yt_error)], fatal=False) - # Downloading page may result in intermittent 5xx HTTP error - # Sometimes a 404 is also recieved. 
See: https://github.com/ytdl-org/youtube-dl/issues/28289 - # We also want to catch all other network exceptions since errors in later pages can be troublesome - # See https://github.com/yt-dlp/yt-dlp/issues/507#issuecomment-880188210 - if not isinstance(e.cause, compat_HTTPError) or e.cause.code not in (403, 429): - last_error = error_to_compat_str(e.cause or e.msg) - if count < retries: - continue - if fatal: - raise - else: - self.report_warning(error_to_compat_str(e)) - return + if not isinstance(e.cause, network_exceptions): + return self._error_or_warning(e, fatal=fatal) + elif not isinstance(e.cause, urllib.error.HTTPError): + retry.error = e + continue - else: - try: - self._extract_and_report_alerts(response, only_once=True) - except ExtractorError as e: - # YouTube servers may return errors we want to retry on in a 200 OK response - # See: https://github.com/yt-dlp/yt-dlp/issues/839 - if 'unknown error' in e.msg.lower(): - last_error = e.msg - continue - if fatal: - raise - self.report_warning(error_to_compat_str(e)) - return - if not check_get_keys or dict_get(response, check_get_keys): - break - # Youtube sometimes sends incomplete data - # See: https://github.com/ytdl-org/youtube-dl/issues/28194 - last_error = 'Incomplete data received' - if count >= retries: - if fatal: - raise ExtractorError(last_error) - else: - self.report_warning(last_error) - return - return response + first_bytes = e.cause.read(512) + if not is_html(first_bytes): + yt_error = try_get( + self._parse_json( + self._webpage_read_content(e.cause, None, item_id, prefix=first_bytes) or '{}', item_id, fatal=False), + lambda x: x['error']['message'], str) + if yt_error: + self._report_alerts([('ERROR', yt_error)], fatal=False) + # Downloading page may result in intermittent 5xx HTTP error + # Sometimes a 404 is also recieved. 
See: https://github.com/ytdl-org/youtube-dl/issues/28289 + # We also want to catch all other network exceptions since errors in later pages can be troublesome + # See https://github.com/yt-dlp/yt-dlp/issues/507#issuecomment-880188210 + if e.cause.code not in (403, 429): + retry.error = e + continue + return self._error_or_warning(e, fatal=fatal) + + try: + self._extract_and_report_alerts(response, only_once=True) + except ExtractorError as e: + # YouTube servers may return errors we want to retry on in a 200 OK response + # See: https://github.com/yt-dlp/yt-dlp/issues/839 + if 'unknown error' in e.msg.lower(): + retry.error = e + continue + return self._error_or_warning(e, fatal=fatal) + # Youtube sometimes sends incomplete data + # See: https://github.com/ytdl-org/youtube-dl/issues/28194 + if not traverse_obj(response, *variadic(check_get_keys)): + retry.error = ExtractorError('Incomplete data received', expected=True) + continue + + return response @staticmethod def is_music_url(url): @@ -845,29 +915,36 @@ def is_music_url(url): def _extract_video(self, renderer): video_id = renderer.get('videoId') - title = self._get_text(renderer, 'title') + + reel_header_renderer = traverse_obj(renderer, ( + 'navigationEndpoint', 'reelWatchEndpoint', 'overlay', 'reelPlayerOverlayRenderer', + 'reelPlayerHeaderSupportedRenderers', 'reelPlayerHeaderRenderer')) + + title = self._get_text(renderer, 'title', 'headline') or self._get_text(reel_header_renderer, 'reelTitleText') description = self._get_text(renderer, 'descriptionSnippet') - duration = parse_duration(self._get_text( - renderer, 'lengthText', ('thumbnailOverlays', ..., 'thumbnailOverlayTimeStatusRenderer', 'text'))) + + duration = int_or_none(renderer.get('lengthSeconds')) + if duration is None: + duration = parse_duration(self._get_text( + renderer, 'lengthText', ('thumbnailOverlays', ..., 'thumbnailOverlayTimeStatusRenderer', 'text'))) if duration is None: + # XXX: should write a parser to be more general to support 
more cases (e.g. shorts in shorts tab) duration = parse_duration(self._search_regex( r'(?i)(ago)(?!.*\1)\s+(?P[a-z0-9 ,]+?)(?:\s+[\d,]+\s+views)?(?:\s+-\s+play\s+short)?$', traverse_obj(renderer, ('title', 'accessibility', 'accessibilityData', 'label'), default='', expected_type=str), video_id, default=None, group='duration')) - view_count = self._get_count(renderer, 'viewCountText') - - uploader = self._get_text(renderer, 'ownerText', 'shortBylineText') channel_id = traverse_obj( renderer, ('shortBylineText', 'runs', ..., 'navigationEndpoint', 'browseEndpoint', 'browseId'), expected_type=str, get_all=False) - timestamp, time_text = self._extract_time_text(renderer, 'publishedTimeText') - scheduled_timestamp = str_to_int(traverse_obj(renderer, ('upcomingEventData', 'startTime'), get_all=False)) + if not channel_id: + channel_id = traverse_obj(reel_header_renderer, ('channelNavigationEndpoint', 'browseEndpoint', 'browseId')) + overlay_style = traverse_obj( renderer, ('thumbnailOverlays', ..., 'thumbnailOverlayTimeStatusRenderer', 'style'), get_all=False, expected_type=str) badges = self._extract_badges(renderer) - thumbnails = self._extract_thumbnails(renderer, 'thumbnail') + navigation_url = urljoin('https://www.youtube.com/', traverse_obj( renderer, ('navigationEndpoint', 'commandMetadata', 'webCommandMetadata', 'url'), expected_type=str)) or '' @@ -875,6 +952,22 @@ def _extract_video(self, renderer): if overlay_style == 'SHORTS' or '/shorts/' in navigation_url: url = f'https://www.youtube.com/shorts/{video_id}' + time_text = (self._get_text(renderer, 'publishedTimeText', 'videoInfo') + or self._get_text(reel_header_renderer, 'timestampText') or '') + scheduled_timestamp = str_to_int(traverse_obj(renderer, ('upcomingEventData', 'startTime'), get_all=False)) + + live_status = ( + 'is_upcoming' if scheduled_timestamp is not None + else 'was_live' if 'streamed' in time_text.lower() + else 'is_live' if overlay_style == 'LIVE' or self._has_badge(badges, 
BadgeType.LIVE_NOW) + else None) + + # videoInfo is a string like '50K views • 10 years ago'. + view_count_text = self._get_text(renderer, 'viewCountText', 'shortViewCountText', 'videoInfo') or '' + view_count = (0 if 'no views' in view_count_text.lower() + else self._get_count({'simpleText': view_count_text})) + view_count_field = 'concurrent_view_count' if live_status in ('is_live', 'is_upcoming') else 'view_count' + return { '_type': 'url', 'ie_key': YoutubeIE.ie_key(), @@ -883,19 +976,24 @@ def _extract_video(self, renderer): 'title': title, 'description': description, 'duration': duration, - 'view_count': view_count, - 'uploader': uploader, 'channel_id': channel_id, - 'thumbnails': thumbnails, - 'upload_date': (strftime_or_none(timestamp, '%Y%m%d') - if self._configuration_arg('approximate_date', ie_key='youtubetab') - else None), - 'live_status': ('is_upcoming' if scheduled_timestamp is not None - else 'was_live' if 'streamed' in time_text.lower() - else 'is_live' if overlay_style is not None and overlay_style == 'LIVE' or 'live now' in badges - else None), + 'channel': (self._get_text(renderer, 'ownerText', 'shortBylineText') + or self._get_text(reel_header_renderer, 'channelTitleText')), + 'channel_url': f'https://www.youtube.com/channel/{channel_id}' if channel_id else None, + 'thumbnails': self._extract_thumbnails(renderer, 'thumbnail'), + 'timestamp': (self._parse_time_text(time_text) + if self._configuration_arg('approximate_date', ie_key=YoutubeTabIE) + else None), 'release_timestamp': scheduled_timestamp, - 'availability': self._availability(needs_premium='premium' in badges, needs_subscription='members only' in badges) + 'availability': + 'public' if self._has_badge(badges, BadgeType.AVAILABILITY_PUBLIC) + else self._availability( + is_private=self._has_badge(badges, BadgeType.AVAILABILITY_PRIVATE) or None, + needs_premium=self._has_badge(badges, BadgeType.AVAILABILITY_PREMIUM) or None, + needs_subscription=self._has_badge(badges, 
BadgeType.AVAILABILITY_SUBSCRIPTION) or None, + is_unlisted=self._has_badge(badges, BadgeType.AVAILABILITY_UNLISTED) or None), + view_count_field: view_count, + 'live_status': live_status } @@ -936,6 +1034,27 @@ class YoutubeIE(YoutubeBaseInfoExtractor): (?:\#|$)""" % { 'invidious': '|'.join(YoutubeBaseInfoExtractor._INVIDIOUS_SITES), } + _EMBED_REGEX = [ + r'''(?x) + (?: + <(?:[0-9A-Za-z-]+?)?iframe[^>]+?src=| + data-video-url=| + ]+?src=| + embedSWF\(?:\s*| + ]+data=| + new\s+SWFObject\( + ) + (["\']) + (?P(?:https?:)?//(?:www\.)?youtube(?:-nocookie)?\.com/ + (?:embed|v|p)/[0-9A-Za-z_-]{11}.*?) + \1''', + # https://wordpress.org/plugins/lazy-load-for-videos/ + r'''(?xs) + ]*\bhref="(?Phttps://www\.youtube\.com/watch\?v=[0-9A-Za-z_-]{11})" + \s[^>]*\bclass="[^"]*\blazy-load-youtube''', + ] + _RETURN_TYPE = 'video' # XXX: How to handle multifeed? + _PLAYER_INFO_RE = ( r'/s/player/(?P[a-zA-Z0-9_-]{8,})/player', r'/(?P[a-zA-Z0-9_-]{8,})/player(?:_ias\.vflset(?:/[a-zA-Z]{2,3}_[a-zA-Z]{2,3})?|-plasma-ias-(?:phone|tablet)-[a-z]{2}_[A-Z]{2}\.vflset)/base\.js$', @@ -1082,6 +1201,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'age_limit': 0, 'start_time': 1, 'end_time': 9, + 'comment_count': int, 'channel_follower_count': int } }, @@ -1126,6 +1246,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'thumbnail': 'https://i.ytimg.com/vi/BaW_jenozKc/maxresdefault.jpg', 'live_status': 'not_live', 'age_limit': 0, + 'comment_count': int, 'channel_follower_count': int }, 'params': { @@ -1268,6 +1389,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'categories': ['Entertainment'], 'duration': 106, 'channel_url': 'https://www.youtube.com/channel/UC1yoRdFoFJaCY-AGfD9W0wQ', + 'comment_count': int, 'channel_follower_count': int }, }, @@ -1355,7 +1477,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'upload_date': '20150827', 'uploader_id': 'olympic', 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/olympic', - 'description': 'HO09 - Women - GER-AUS - Hockey - 31 July 2012 - London 
2012 Olympic Games', + 'description': 'md5:04bbbf3ccceb6795947572ca36f45904', 'uploader': 'Olympics', 'title': 'Hockey - Women - GER-AUS - London 2012 Olympic Games', 'like_count': int, @@ -1404,6 +1526,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'like_count': int, 'live_status': 'not_live', 'availability': 'unlisted', + 'comment_count': int, 'channel_follower_count': int }, }, @@ -1461,66 +1584,99 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'skip': 'This live event has ended.', }, { - # Multifeed videos (multiple cameras), URL is for Main Camera - 'url': 'https://www.youtube.com/watch?v=jvGDaLqkpTg', + # Multifeed videos (multiple cameras), URL can be of any Camera + 'url': 'https://www.youtube.com/watch?v=zaPI8MvL8pg', 'info_dict': { - 'id': 'jvGDaLqkpTg', - 'title': 'Tom Clancy Free Weekend Rainbow Whatever', - 'description': 'md5:e03b909557865076822aa169218d6a5d', + 'id': 'zaPI8MvL8pg', + 'title': 'Terraria 1.2 Live Stream | Let\'s Play - Part 04', + 'description': 'md5:563ccbc698b39298481ca3c571169519', }, 'playlist': [{ 'info_dict': { - 'id': 'jvGDaLqkpTg', - 'ext': 'mp4', - 'title': 'Tom Clancy Free Weekend Rainbow Whatever (Main Camera)', - 'description': 'md5:e03b909557865076822aa169218d6a5d', - 'duration': 10643, - 'upload_date': '20161111', - 'uploader': 'Team PGP', - 'uploader_id': 'UChORY56LMMETTuGjXaJXvLg', - 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UChORY56LMMETTuGjXaJXvLg', - }, - }, { - 'info_dict': { - 'id': '3AKt1R1aDnw', + 'id': 'j5yGuxZ8lLU', 'ext': 'mp4', - 'title': 'Tom Clancy Free Weekend Rainbow Whatever (Camera 2)', - 'description': 'md5:e03b909557865076822aa169218d6a5d', - 'duration': 10991, - 'upload_date': '20161111', - 'uploader': 'Team PGP', - 'uploader_id': 'UChORY56LMMETTuGjXaJXvLg', - 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UChORY56LMMETTuGjXaJXvLg', + 'title': 'Terraria 1.2 Live Stream | Let\'s Play - Part 04 (Chris)', + 'uploader': 'WiiLikeToPlay', + 'description': 
'md5:563ccbc698b39298481ca3c571169519', + 'uploader_url': 'http://www.youtube.com/user/WiiRikeToPray', + 'duration': 10120, + 'channel_follower_count': int, + 'channel_url': 'https://www.youtube.com/channel/UCN2XePorRokPB9TEgRZpddg', + 'availability': 'public', + 'playable_in_embed': True, + 'upload_date': '20131105', + 'uploader_id': 'WiiRikeToPray', + 'categories': ['Gaming'], + 'live_status': 'was_live', + 'tags': 'count:24', + 'release_timestamp': 1383701910, + 'thumbnail': 'https://i.ytimg.com/vi/j5yGuxZ8lLU/maxresdefault.jpg', + 'comment_count': int, + 'age_limit': 0, + 'like_count': int, + 'channel_id': 'UCN2XePorRokPB9TEgRZpddg', + 'channel': 'WiiLikeToPlay', + 'view_count': int, + 'release_date': '20131106', }, }, { 'info_dict': { - 'id': 'RtAMM00gpVc', + 'id': 'zaPI8MvL8pg', 'ext': 'mp4', - 'title': 'Tom Clancy Free Weekend Rainbow Whatever (Camera 3)', - 'description': 'md5:e03b909557865076822aa169218d6a5d', - 'duration': 10995, - 'upload_date': '20161111', - 'uploader': 'Team PGP', - 'uploader_id': 'UChORY56LMMETTuGjXaJXvLg', - 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UChORY56LMMETTuGjXaJXvLg', + 'title': 'Terraria 1.2 Live Stream | Let\'s Play - Part 04 (Tyson)', + 'uploader_id': 'WiiRikeToPray', + 'availability': 'public', + 'channel_url': 'https://www.youtube.com/channel/UCN2XePorRokPB9TEgRZpddg', + 'channel': 'WiiLikeToPlay', + 'uploader_url': 'http://www.youtube.com/user/WiiRikeToPray', + 'channel_follower_count': int, + 'description': 'md5:563ccbc698b39298481ca3c571169519', + 'duration': 10108, + 'age_limit': 0, + 'like_count': int, + 'tags': 'count:24', + 'channel_id': 'UCN2XePorRokPB9TEgRZpddg', + 'uploader': 'WiiLikeToPlay', + 'release_timestamp': 1383701915, + 'comment_count': int, + 'upload_date': '20131105', + 'thumbnail': 'https://i.ytimg.com/vi/zaPI8MvL8pg/maxresdefault.jpg', + 'release_date': '20131106', + 'playable_in_embed': True, + 'live_status': 'was_live', + 'categories': ['Gaming'], + 'view_count': int, }, }, { 
'info_dict': { - 'id': '6N2fdlP3C5U', + 'id': 'R7r3vfO7Hao', 'ext': 'mp4', - 'title': 'Tom Clancy Free Weekend Rainbow Whatever (Camera 4)', - 'description': 'md5:e03b909557865076822aa169218d6a5d', - 'duration': 10990, - 'upload_date': '20161111', - 'uploader': 'Team PGP', - 'uploader_id': 'UChORY56LMMETTuGjXaJXvLg', - 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UChORY56LMMETTuGjXaJXvLg', + 'title': 'Terraria 1.2 Live Stream | Let\'s Play - Part 04 (Spencer)', + 'thumbnail': 'https://i.ytimg.com/vi/R7r3vfO7Hao/maxresdefault.jpg', + 'channel_id': 'UCN2XePorRokPB9TEgRZpddg', + 'like_count': int, + 'availability': 'public', + 'playable_in_embed': True, + 'upload_date': '20131105', + 'description': 'md5:563ccbc698b39298481ca3c571169519', + 'uploader_id': 'WiiRikeToPray', + 'uploader_url': 'http://www.youtube.com/user/WiiRikeToPray', + 'channel_follower_count': int, + 'tags': 'count:24', + 'release_date': '20131106', + 'uploader': 'WiiLikeToPlay', + 'comment_count': int, + 'channel_url': 'https://www.youtube.com/channel/UCN2XePorRokPB9TEgRZpddg', + 'channel': 'WiiLikeToPlay', + 'categories': ['Gaming'], + 'release_timestamp': 1383701914, + 'live_status': 'was_live', + 'age_limit': 0, + 'duration': 10128, + 'view_count': int, }, }], - 'params': { - 'skip_download': True, - }, - 'skip': 'Not multifeed anymore', + 'params': {'skip_download': True}, }, { # Multifeed video with comma in title (see https://github.com/ytdl-org/youtube-dl/issues/8536) @@ -1632,7 +1788,9 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'thumbnail': 'https://i.ytimg.com/vi_webp/M4gD1WSo5mA/maxresdefault.webp', 'live_status': 'not_live', 'playable_in_embed': True, - 'channel_follower_count': int + 'comment_count': int, + 'channel_follower_count': int, + 'chapters': list, }, 'params': { 'skip_download': True, @@ -1664,7 +1822,9 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'view_count': int, 'live_status': 'not_live', 'channel_url': 
'https://www.youtube.com/channel/UCH1dpzjCEiGAt8CXkryhkZg', - 'channel_follower_count': int + 'comment_count': int, + 'channel_follower_count': int, + 'chapters': list, }, 'params': { 'skip_download': True, @@ -1928,7 +2088,9 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'view_count': int, 'duration': 522, 'channel': 'kudvenkat', - 'channel_follower_count': int + 'comment_count': int, + 'channel_follower_count': int, + 'chapters': list, }, 'params': { 'skip_download': True, @@ -2078,7 +2240,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'like_count': int, 'live_status': 'not_live', 'playable_in_embed': True, - 'channel_follower_count': int + 'channel_follower_count': int, + 'chapters': list, }, 'params': { 'format': '17', # 3gp format available on android @@ -2122,7 +2285,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'duration': 248, 'categories': ['Education'], 'age_limit': 0, - 'channel_follower_count': int + 'channel_follower_count': int, + 'chapters': list, }, 'params': {'format': 'mhtml', 'skip_download': True} }, { # Ensure video upload_date is in UTC timezone (video was uploaded 1641170939) @@ -2149,8 +2313,38 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'availability': 'public', 'channel': 'Leon Nguyen', 'thumbnail': 'https://i.ytimg.com/vi_webp/2NUZ8W2llS4/maxresdefault.webp', + 'comment_count': int, 'channel_follower_count': int } + }, { + # Same video as above, but with --compat-opt no-youtube-prefer-utc-upload-date + 'url': 'https://www.youtube.com/watch?v=2NUZ8W2llS4', + 'info_dict': { + 'id': '2NUZ8W2llS4', + 'ext': 'mp4', + 'title': 'The NP that test your phone performance 🙂', + 'description': 'md5:144494b24d4f9dfacb97c1bbef5de84d', + 'uploader': 'Leon Nguyen', + 'uploader_id': 'VNSXIII', + 'uploader_url': 'http://www.youtube.com/user/VNSXIII', + 'channel_id': 'UCRqNBSOHgilHfAczlUmlWHA', + 'channel_url': 'https://www.youtube.com/channel/UCRqNBSOHgilHfAczlUmlWHA', + 'duration': 21, + 'view_count': int, + 'age_limit': 0, + 'categories': ['Gaming'], 
+ 'tags': 'count:23', + 'playable_in_embed': True, + 'live_status': 'not_live', + 'upload_date': '20220102', + 'like_count': int, + 'availability': 'public', + 'channel': 'Leon Nguyen', + 'thumbnail': 'https://i.ytimg.com/vi_webp/2NUZ8W2llS4/maxresdefault.webp', + 'comment_count': int, + 'channel_follower_count': int + }, + 'params': {'compat_opts': ['no-youtube-prefer-utc-upload-date']} }, { # date text is premiered video, ensure upload date in UTC (published 1641172509) 'url': 'https://www.youtube.com/watch?v=mzZzzBU6lrM', @@ -2208,11 +2402,11 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'view_count': int, 'playable_in_embed': True, 'description': 'md5:2ef1d002cad520f65825346e2084e49d', + 'concurrent_view_count': int, }, 'params': {'skip_download': True} }, { # Story. Requires specific player params to work. - # Note: stories get removed after some period of time 'url': 'https://www.youtube.com/watch?v=vv8qTUWmulI', 'info_dict': { 'id': 'vv8qTUWmulI', @@ -2235,7 +2429,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'thumbnail': 'https://i.ytimg.com/vi_webp/vv8qTUWmulI/maxresdefault.webp', 'uploader_url': 'http://www.youtube.com/user/BlastfromthePast', 'channel_url': 'https://www.youtube.com/channel/UCzIZ8HrzDgc-pNQDUG6avBA', - } + }, + 'skip': 'stories get removed after some period of time', }, { 'url': 'https://www.youtube.com/watch?v=tjjjtzRLHvA', 'info_dict': { @@ -2262,9 +2457,132 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'tags': [], 'uploader_url': 'http://www.youtube.com/user/nao20010128nao', } + }, { + # Prefer primary title+description language metadata by default + # Do not prefer translated description if primary is empty + 'url': 'https://www.youtube.com/watch?v=el3E4MbxRqQ', + 'info_dict': { + 'id': 'el3E4MbxRqQ', + 'ext': 'mp4', + 'title': 'dlp test video 2 - primary sv no desc', + 'description': '', + 'channel': 'cole-dlp-test-acc', + 'tags': [], + 'view_count': int, + 'channel_url': 'https://www.youtube.com/channel/UCiu-3thuViMebBjw_5nWYrA', + 
'like_count': int, + 'playable_in_embed': True, + 'availability': 'unlisted', + 'thumbnail': 'https://i.ytimg.com/vi_webp/el3E4MbxRqQ/maxresdefault.webp', + 'age_limit': 0, + 'duration': 5, + 'uploader_id': 'UCiu-3thuViMebBjw_5nWYrA', + 'uploader_url': 'http://www.youtube.com/channel/UCiu-3thuViMebBjw_5nWYrA', + 'live_status': 'not_live', + 'upload_date': '20220908', + 'categories': ['People & Blogs'], + 'uploader': 'cole-dlp-test-acc', + 'channel_id': 'UCiu-3thuViMebBjw_5nWYrA', + }, + 'params': {'skip_download': True} + }, { + # Extractor argument: prefer translated title+description + 'url': 'https://www.youtube.com/watch?v=gHKT4uU8Zng', + 'info_dict': { + 'id': 'gHKT4uU8Zng', + 'ext': 'mp4', + 'channel': 'cole-dlp-test-acc', + 'tags': [], + 'duration': 5, + 'live_status': 'not_live', + 'channel_id': 'UCiu-3thuViMebBjw_5nWYrA', + 'upload_date': '20220728', + 'uploader_id': 'UCiu-3thuViMebBjw_5nWYrA', + 'view_count': int, + 'categories': ['People & Blogs'], + 'thumbnail': 'https://i.ytimg.com/vi_webp/gHKT4uU8Zng/maxresdefault.webp', + 'title': 'dlp test video title translated (fr)', + 'availability': 'public', + 'uploader': 'cole-dlp-test-acc', + 'age_limit': 0, + 'description': 'dlp test video description translated (fr)', + 'playable_in_embed': True, + 'channel_url': 'https://www.youtube.com/channel/UCiu-3thuViMebBjw_5nWYrA', + 'uploader_url': 'http://www.youtube.com/channel/UCiu-3thuViMebBjw_5nWYrA', + }, + 'params': {'skip_download': True, 'extractor_args': {'youtube': {'lang': ['fr']}}}, + 'expected_warnings': [r'Preferring "fr" translated fields'], + }, { + 'note': '6 channel audio', + 'url': 'https://www.youtube.com/watch?v=zgdo7-RRjgo', + 'only_matching': True, + }, { + 'note': 'Multiple HLS formats with same itag', + 'url': 'https://www.youtube.com/watch?v=kX3nB4PpJko', + 'info_dict': { + 'id': 'kX3nB4PpJko', + 'ext': 'mp4', + 'categories': ['Entertainment'], + 'description': 'md5:e8031ff6e426cdb6a77670c9b81f6fa6', + 'uploader_url': 
'http://www.youtube.com/user/MrBeast6000', + 'live_status': 'not_live', + 'duration': 937, + 'channel_follower_count': int, + 'thumbnail': 'https://i.ytimg.com/vi_webp/kX3nB4PpJko/maxresdefault.webp', + 'title': 'Last To Take Hand Off Jet, Keeps It!', + 'channel': 'MrBeast', + 'playable_in_embed': True, + 'view_count': int, + 'upload_date': '20221112', + 'uploader': 'MrBeast', + 'uploader_id': 'MrBeast6000', + 'channel_url': 'https://www.youtube.com/channel/UCX6OQ3DkcsbYNE6H8uQQuVA', + 'age_limit': 0, + 'availability': 'public', + 'channel_id': 'UCX6OQ3DkcsbYNE6H8uQQuVA', + 'like_count': int, + 'tags': [], + }, + 'params': {'extractor_args': {'youtube': {'player_client': ['ios']}}, 'format': '233-1'}, } ] + _WEBPAGE_TESTS = [ + # YouTube embed + { + 'url': 'http://www.improbable.com/2017/04/03/untrained-modern-youths-and-ancient-masters-in-selfie-portraits/', + 'md5': '873c81d308b979f0e23ee7e620b312a3', + 'info_dict': { + 'id': 'msN87y-iEx0', + 'ext': 'mp4', + 'title': 'Feynman: Mirrors FUN TO IMAGINE 6', + 'upload_date': '20080526', + 'description': 'md5:873c81d308b979f0e23ee7e620b312a3', + 'uploader': 'Christopher Sykes', + 'uploader_id': 'ChristopherJSykes', + 'age_limit': 0, + 'tags': ['feynman', 'mirror', 'science', 'physics', 'imagination', 'fun', 'cool', 'puzzle'], + 'channel_id': 'UCCeo--lls1vna5YJABWAcVA', + 'playable_in_embed': True, + 'thumbnail': 'https://i.ytimg.com/vi/msN87y-iEx0/hqdefault.jpg', + 'like_count': int, + 'comment_count': int, + 'channel': 'Christopher Sykes', + 'live_status': 'not_live', + 'channel_url': 'https://www.youtube.com/channel/UCCeo--lls1vna5YJABWAcVA', + 'availability': 'public', + 'duration': 195, + 'view_count': int, + 'categories': ['Science & Technology'], + 'channel_follower_count': int, + 'uploader_url': 'http://www.youtube.com/user/ChristopherJSykes', + }, + 'params': { + 'skip_download': True, + } + }, + ] + @classmethod def suitable(cls, url): from ..utils import parse_qs @@ -2279,10 +2597,8 @@ def __init__(self, 
*args, **kwargs): self._code_cache = {} self._player_cache = {} - def _prepare_live_from_start_formats(self, formats, video_id, live_start_time, url, webpage_url, smuggled_data): + def _prepare_live_from_start_formats(self, formats, video_id, live_start_time, url, webpage_url, smuggled_data, is_live): lock = threading.Lock() - - is_live = True start_time = time.time() formats = [f for f in formats if f.get('is_from_start')] @@ -2297,7 +2613,8 @@ def refetch_manifest(format_id, delay): microformats = traverse_obj( prs, (..., 'microformat', 'playerMicroformatRenderer'), expected_type=dict, default=[]) - _, is_live, _, formats = self._list_formats(video_id, microformats, video_details, prs, player_url) + _, live_status, _, formats, _ = self._list_formats(video_id, microformats, video_details, prs, player_url) + is_live = live_status == 'is_live' start_time = time.time() def mpd_feed(format_id, delay): @@ -2318,12 +2635,17 @@ def mpd_feed(format_id, delay): return f['manifest_url'], f['manifest_stream_number'], is_live for f in formats: - f['is_live'] = True - f['protocol'] = 'http_dash_segments_generator' - f['fragments'] = functools.partial( - self._live_dash_fragments, f['format_id'], live_start_time, mpd_feed) + f['is_live'] = is_live + gen = functools.partial(self._live_dash_fragments, video_id, f['format_id'], + live_start_time, mpd_feed, not is_live and f.copy()) + if is_live: + f['fragments'] = gen + f['protocol'] = 'http_dash_segments_generator' + else: + f['fragments'] = LazyList(gen({})) + del f['is_from_start'] - def _live_dash_fragments(self, format_id, live_start_time, mpd_feed, ctx): + def _live_dash_fragments(self, video_id, format_id, live_start_time, mpd_feed, manifestless_orig_fmt, ctx): FETCH_SPAN, MAX_DURATION = 5, 432000 mpd_url, stream_number, is_live = None, None, True @@ -2346,7 +2668,7 @@ def _extract_sequence_from_mpd(refresh_sequence, immediate): # Obtain from MPD's maximum seq value old_mpd_url = mpd_url last_error = ctx.pop('last_error', 
None) - expire_fast = immediate or last_error and isinstance(last_error, compat_HTTPError) and last_error.code == 403 + expire_fast = immediate or last_error and isinstance(last_error, urllib.error.HTTPError) and last_error.code == 403 mpd_url, stream_number, is_live = (mpd_feed(format_id, 5 if expire_fast else 18000) or (mpd_url, stream_number, False)) if not refresh_sequence: @@ -2354,15 +2676,18 @@ def _extract_sequence_from_mpd(refresh_sequence, immediate): return False, last_seq elif old_mpd_url == mpd_url: return True, last_seq - try: - fmts, _ = self._extract_mpd_formats_and_subtitles( - mpd_url, None, note=False, errnote=False, fatal=False) - except ExtractorError: - fmts = None - if not fmts: - no_fragment_score += 2 - return False, last_seq - fmt_info = next(x for x in fmts if x['manifest_stream_number'] == stream_number) + if manifestless_orig_fmt: + fmt_info = manifestless_orig_fmt + else: + try: + fmts, _ = self._extract_mpd_formats_and_subtitles( + mpd_url, None, note=False, errnote=False, fatal=False) + except ExtractorError: + fmts = None + if not fmts: + no_fragment_score += 2 + return False, last_seq + fmt_info = next(x for x in fmts if x['manifest_stream_number'] == stream_number) fragments = fmt_info['fragments'] fragment_base_url = fmt_info['fragment_base_url'] assert fragment_base_url @@ -2370,6 +2695,7 @@ def _extract_sequence_from_mpd(refresh_sequence, immediate): _last_seq = int(re.search(r'(?:/|^)sq/(\d+)', fragments[-1]['path']).group(1)) return True, _last_seq + self.write_debug(f'[{video_id}] Generating fragments for format {format_id}') while is_live: fetch_time = time.time() if no_fragment_score > 30: @@ -2423,12 +2749,17 @@ def _extract_sequence_from_mpd(refresh_sequence, immediate): except ExtractorError: continue + if manifestless_orig_fmt: + # Stop at the first iteration if running for post-live manifestless; + # fragment count no longer increase since it starts + break + time.sleep(max(0, FETCH_SPAN + fetch_time - time.time())) 
def _extract_player_url(self, *ytcfgs, webpage=None): player_url = traverse_obj( ytcfgs, (..., 'PLAYER_JS_URL'), (..., 'WEB_PLAYER_CONTEXT_CONFIGS', ..., 'jsUrl'), - get_all=False, expected_type=compat_str) + get_all=False, expected_type=str) if not player_url: return return urljoin('https://www.youtube.com', player_url) @@ -2445,7 +2776,7 @@ def _download_player_url(self, video_id, fatal=False): def _signature_cache_id(self, example_sig): """ Return a string representation of a signature """ - return '.'.join(compat_str(len(part)) for part in example_sig.split('.')) + return '.'.join(str(len(part)) for part in example_sig.split('.')) @classmethod def _extract_player_info(cls, player_url): @@ -2475,20 +2806,18 @@ def _extract_signature_function(self, video_id, player_url, example_sig): func_id = f'js_{player_id}_{self._signature_cache_id(example_sig)}' assert os.path.basename(func_id) == func_id - cache_spec = self._downloader.cache.load('youtube-sigfuncs', func_id) - if cache_spec is not None: - return lambda s: ''.join(s[i] for i in cache_spec) + self.write_debug(f'Extracting signature function {func_id}') + cache_spec, code = self.cache.load('youtube-sigfuncs', func_id), None - code = self._load_player(video_id, player_url) + if not cache_spec: + code = self._load_player(video_id, player_url) if code: res = self._parse_sig_js(code) + test_string = ''.join(map(chr, range(len(example_sig)))) + cache_spec = [ord(c) for c in res(test_string)] + self.cache.store('youtube-sigfuncs', func_id, cache_spec) - test_string = ''.join(map(compat_chr, range(len(example_sig)))) - cache_res = res(test_string) - cache_spec = [ord(c) for c in cache_res] - - self._downloader.cache.store('youtube-sigfuncs', func_id, cache_spec) - return res + return lambda s: ''.join(s[i] for i in cache_spec) def _print_sig_code(self, func, example_sig): if not self.get_param('youtube_print_sig_code'): @@ -2522,12 +2851,12 @@ def _genslice(start, end, step): else: yield _genslice(start, i, step) - 
test_string = ''.join(map(compat_chr, range(len(example_sig)))) + test_string = ''.join(map(chr, range(len(example_sig)))) cache_res = func(test_string) cache_spec = [ord(c) for c in cache_res] expr_code = ' + '.join(gen_sig_code(cache_spec)) signature_id_tuple = '(%s)' % ( - ', '.join(compat_str(len(p)) for p in example_sig.split('.'))) + ', '.join(str(len(p)) for p in example_sig.split('.'))) code = ('if tuple(len(p) for p in s.split(\'.\')) == %s:\n' ' return %s\n') % (signature_id_tuple, expr_code) self.to_screen('Extracted signature function:\n' + code) @@ -2556,18 +2885,29 @@ def _parse_sig_js(self, jscode): initial_function = jsi.extract_function(funcname) return lambda s: initial_function([s]) + def _cached(self, func, *cache_id): + def inner(*args, **kwargs): + if cache_id not in self._player_cache: + try: + self._player_cache[cache_id] = func(*args, **kwargs) + except ExtractorError as e: + self._player_cache[cache_id] = e + except Exception as e: + self._player_cache[cache_id] = ExtractorError(traceback.format_exc(), cause=e) + + ret = self._player_cache[cache_id] + if isinstance(ret, Exception): + raise ret + return ret + return inner + def _decrypt_signature(self, s, video_id, player_url): """Turn the encrypted s field into a working signature""" - try: - player_id = (player_url, self._signature_cache_id(s)) - if player_id not in self._player_cache: - func = self._extract_signature_function(video_id, player_url, s) - self._player_cache[player_id] = func - func = self._player_cache[player_id] - self._print_sig_code(func, s) - return func(s) - except Exception as e: - raise ExtractorError(traceback.format_exc(), cause=e, video_id=video_id) + extract_sig = self._cached( + self._extract_signature_function, 'sig', player_url, self._signature_cache_id(s)) + func = extract_sig(video_id, player_url, s) + self._print_sig_code(func, s) + return func(s) def _decrypt_nsig(self, s, video_id, player_url): """Turn the encrypted n field into a working signature""" @@ 
-2575,48 +2915,87 @@ def _decrypt_nsig(self, s, video_id, player_url): raise ExtractorError('Cannot decrypt nsig without player_url') player_url = urljoin('https://www.youtube.com', player_url) - sig_id = ('nsig_value', s) - if sig_id in self._player_cache: - return self._player_cache[sig_id] + try: + jsi, player_id, func_code = self._extract_n_function_code(video_id, player_url) + except ExtractorError as e: + raise ExtractorError('Unable to extract nsig function code', cause=e) + if self.get_param('youtube_print_sig_code'): + self.to_screen(f'Extracted nsig function from {player_id}:\n{func_code[1]}\n') try: - player_id = ('nsig', player_url) - if player_id not in self._player_cache: - self._player_cache[player_id] = self._extract_n_function(video_id, player_url) - func = self._player_cache[player_id] - self._player_cache[sig_id] = func(s) - self.write_debug(f'Decrypted nsig {s} => {self._player_cache[sig_id]}') - return self._player_cache[sig_id] - except Exception as e: - raise ExtractorError(traceback.format_exc(), cause=e, video_id=video_id) + extract_nsig = self._cached(self._extract_n_function_from_code, 'nsig func', player_url) + ret = extract_nsig(jsi, func_code)(s) + except JSInterpreter.Exception as e: + try: + jsi = PhantomJSwrapper(self, timeout=5000) + except ExtractorError: + raise e + self.report_warning( + f'Native nsig extraction failed: Trying with PhantomJS\n' + f' n = {s} ; player = {player_url}', video_id) + self.write_debug(e, only_once=True) + + args, func_body = func_code + ret = jsi.execute( + f'console.log(function({", ".join(args)}) {{ {func_body} }}({s!r}));', + video_id=video_id, note='Executing signature code').strip() + + self.write_debug(f'Decrypted nsig {s} => {ret}') + return ret def _extract_n_function_name(self, jscode): - nfunc, idx = self._search_regex( + funcname, idx = self._search_regex( r'\.get\("n"\)\)&&\(b=(?P[a-zA-Z0-9$]+)(?:\[(?P\d+)\])?\([a-zA-Z0-9]\)', jscode, 'Initial JS player n function name', group=('nfunc', 
'idx')) if not idx: - return nfunc + return funcname + return json.loads(js_to_json(self._search_regex( - rf'var {re.escape(nfunc)}\s*=\s*(\[.+?\]);', jscode, - f'Initial JS player n function list ({nfunc}.{idx})')))[int(idx)] + rf'var {re.escape(funcname)}\s*=\s*(\[.+?\]);', jscode, + f'Initial JS player n function list ({funcname}.{idx})')))[int(idx)] - def _extract_n_function(self, video_id, player_url): + def _extract_n_function_code(self, video_id, player_url): player_id = self._extract_player_info(player_url) - func_code = self._downloader.cache.load('youtube-nsig', player_id) + func_code = self.cache.load('youtube-nsig', player_id, min_ver='2022.09.1') + jscode = func_code or self._load_player(video_id, player_url) + jsi = JSInterpreter(jscode) + + if func_code: + return jsi, player_id, func_code + + func_name = self._extract_n_function_name(jscode) + # For redundancy + func_code = self._search_regex( + r'''(?xs)%s\s*=\s*function\s*\((?P[\w$]+)\)\s* + # NB: The end of the regex is intentionally kept strict + {(?P.+?}\s*return\ [\w$]+.join\(""\))};''' % func_name, + jscode, 'nsig function', group=('var', 'code'), default=None) if func_code: - jsi = JSInterpreter(func_code) + func_code = ([func_code[0]], func_code[1]) else: - jscode = self._load_player(video_id, player_url) - funcname = self._extract_n_function_name(jscode) - jsi = JSInterpreter(jscode) - func_code = jsi.extract_function_code(funcname) - self._downloader.cache.store('youtube-nsig', player_id, func_code) + self.write_debug('Extracting nsig function with jsinterp') + func_code = jsi.extract_function_code(func_name) - if self.get_param('youtube_print_sig_code'): - self.to_screen(f'Extracted nsig function from {player_id}:\n{func_code[1]}\n') + self.cache.store('youtube-nsig', player_id, func_code) + return jsi, player_id, func_code + + def _extract_n_function_from_code(self, jsi, func_code): + func = jsi.extract_function_from_code(*func_code) + + def extract_nsig(s): + try: + ret = func([s]) + 
except JSInterpreter.Exception: + raise + except Exception as e: + raise JSInterpreter.Exception(traceback.format_exc(), cause=e) + + if ret.startswith('enhanced_except_'): + raise JSInterpreter.Exception('Signature function returned an exception') + return ret - return lambda s: jsi.extract_function_from_code(*func_code)([s]) + return extract_nsig def _extract_signature_timestamp(self, video_id, player_url, ytcfg=None, fatal=False): """ @@ -2643,74 +3022,76 @@ def _extract_signature_timestamp(self, video_id, player_url, ytcfg=None, fatal=F return sts def _mark_watched(self, video_id, player_responses): - playback_url = get_first( - player_responses, ('playbackTracking', 'videostatsPlaybackUrl', 'baseUrl'), - expected_type=url_or_none) - if not playback_url: - self.report_warning('Unable to mark watched') - return - parsed_playback_url = compat_urlparse.urlparse(playback_url) - qs = compat_urlparse.parse_qs(parsed_playback_url.query) + for is_full, key in enumerate(('videostatsPlaybackUrl', 'videostatsWatchtimeUrl')): + label = 'fully ' if is_full else '' + url = get_first(player_responses, ('playbackTracking', key, 'baseUrl'), + expected_type=url_or_none) + if not url: + self.report_warning(f'Unable to mark {label}watched') + return + parsed_url = urllib.parse.urlparse(url) + qs = urllib.parse.parse_qs(parsed_url.query) + + # cpn generation algorithm is reverse engineered from base.js. + # In fact it works even with dummy cpn. + CPN_ALPHABET = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789-_' + cpn = ''.join(CPN_ALPHABET[random.randint(0, 256) & 63] for _ in range(0, 16)) + + # # more consistent results setting it to right before the end + video_length = [str(float((qs.get('len') or ['1.5'])[0]) - 1)] + + qs.update({ + 'ver': ['2'], + 'cpn': [cpn], + 'cmt': video_length, + 'el': 'detailpage', # otherwise defaults to "shorts" + }) - # cpn generation algorithm is reverse engineered from base.js. - # In fact it works even with dummy cpn. 
- CPN_ALPHABET = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789-_' - cpn = ''.join(CPN_ALPHABET[random.randint(0, 256) & 63] for _ in range(0, 16)) + if is_full: + # these seem to mark watchtime "history" in the real world + # they're required, so send in a single value + qs.update({ + 'st': 0, + 'et': video_length, + }) - qs.update({ - 'ver': ['2'], - 'cpn': [cpn], - }) - playback_url = compat_urlparse.urlunparse( - parsed_playback_url._replace(query=compat_urllib_parse_urlencode(qs, True))) + url = urllib.parse.urlunparse( + parsed_url._replace(query=urllib.parse.urlencode(qs, True))) - self._download_webpage( - playback_url, video_id, 'Marking watched', - 'Unable to mark watched', fatal=False) + self._download_webpage( + url, video_id, f'Marking {label}watched', + 'Unable to mark watched', fatal=False) - @staticmethod - def _extract_urls(webpage): - # Embedded YouTube player - entries = [ - unescapeHTML(mobj.group('url')) - for mobj in re.finditer(r'''(?x) - (?: - ]+?src=| - data-video-url=| - ]+?src=| - embedSWF\(?:\s*| - ]+data=| - new\s+SWFObject\( - ) - (["\']) - (?P(?:https?:)?//(?:www\.)?youtube(?:-nocookie)?\.com/ - (?:embed|v|p)/[0-9A-Za-z_-]{11}.*?) 
- \1''', webpage)] + @classmethod + def _extract_from_webpage(cls, url, webpage): + # Invidious Instances + # https://github.com/yt-dlp/yt-dlp/issues/195 + # https://github.com/iv-org/invidious/pull/1730 + mobj = re.search( + r']+ - class=(?P[\'"])[^\'"]*\byvii_single_video_player\b[^\'"]*(?P=q1)[^>]+ - data-video_id=(?P[\'"])([^\'"]+)(?P=q2)''', webpage) - entries.extend(m[-1] for m in matches) - - return entries - - @staticmethod - def _extract_url(webpage): - urls = YoutubeIE._extract_urls(webpage) - return urls[0] if urls else None + for m in re.findall(r'''(?x)]+ + class=(?P[\'"])[^\'"]*\byvii_single_video_player\b[^\'"]*(?P=q1)[^>]+ + data-video_id=(?P[\'"])([^\'"]+)(?P=q2)''', webpage): + yield cls.url_result(m[-1], cls, m[-1]) @classmethod def extract_id(cls, url): - mobj = re.match(cls._VALID_URL, url, re.VERBOSE) - if mobj is None: - raise ExtractorError('Invalid URL: %s' % url) - return mobj.group('id') + video_id = cls.get_temp_id(url) + if not video_id: + raise ExtractorError(f'Invalid URL: {url}') + return video_id def _extract_chapters_from_json(self, data, duration): chapter_list = traverse_obj( @@ -2741,9 +3122,14 @@ def _extract_chapters_from_engagement_panel(self, data, duration): for contents in content_list)), []) def _extract_chapters_from_description(self, description, duration): + duration_re = r'(?:\d+:)?\d{1,2}:\d{2}' + sep_re = r'(?m)^\s*(%s)\b\W*\s(%s)\s*$' return self._extract_chapters( - re.findall(r'(?m)^((?:\d+:)?\d{1,2}:\d{2})\b\W*\s(.+?)\s*$', description or ''), + re.findall(sep_re % (duration_re, r'.+?'), description or ''), chapter_time=lambda x: parse_duration(x[0]), chapter_title=lambda x: x[1], + duration=duration, strict=False) or self._extract_chapters( + re.findall(sep_re % (r'.+?', duration_re), description or ''), + chapter_time=lambda x: parse_duration(x[1]), chapter_title=lambda x: x[0], duration=duration, strict=False) def _extract_chapters(self, chapter_list, chapter_time, chapter_title, duration, strict=True): @@ 
-2756,17 +3142,16 @@ def _extract_chapters(self, chapter_list, chapter_time, chapter_title, duration, if not strict: chapter_list.sort(key=lambda c: c['start_time'] or 0) - chapters = [{'start_time': 0, 'title': ''}] + chapters = [{'start_time': 0}] for idx, chapter in enumerate(chapter_list): - if chapter['start_time'] is None or not chapter['title']: + if chapter['start_time'] is None: self.report_warning(f'Incomplete chapter {idx}') elif chapters[-1]['start_time'] <= chapter['start_time'] <= duration: - chapters[-1]['end_time'] = chapter['start_time'] chapters.append(chapter) - else: - self.report_warning(f'Invalid start time for chapter "{chapter["title"]}"') - chapters[-1]['end_time'] = duration - return chapters if len(chapters) > 1 and chapters[1]['start_time'] else chapters[1:] + elif chapter not in chapters: + self.report_warning( + f'Invalid start time ({chapter["start_time"]} < {chapters[-1]["start_time"]}) for chapter "{chapter["title"]}"') + return chapters[1:] def _extract_comment(self, comment_renderer, parent=None): comment_id = comment_renderer.get('commentId') @@ -2775,16 +3160,18 @@ def _extract_comment(self, comment_renderer, parent=None): text = self._get_text(comment_renderer, 'contentText') - # note: timestamp is an estimate calculated from the current time and time_text - timestamp, time_text = self._extract_time_text(comment_renderer, 'publishedTimeText') + # Timestamp is an estimate calculated from the current time and time_text + time_text = self._get_text(comment_renderer, 'publishedTimeText') or '' + timestamp = self._parse_time_text(time_text) + author = self._get_text(comment_renderer, 'authorText') author_id = try_get(comment_renderer, - lambda x: x['authorEndpoint']['browseEndpoint']['browseId'], compat_str) + lambda x: x['authorEndpoint']['browseEndpoint']['browseId'], str) votes = parse_count(try_get(comment_renderer, (lambda x: x['voteCount']['simpleText'], - lambda x: x['likeCount']), compat_str)) or 0 + lambda x: 
x['likeCount']), str)) or 0 author_thumbnail = try_get(comment_renderer, - lambda x: x['authorThumbnail']['thumbnails'][-1]['url'], compat_str) + lambda x: x['authorThumbnail']['thumbnails'][-1]['url'], str) author_is_uploader = try_get(comment_renderer, lambda x: x['authorIsChannelOwner'], bool) is_favorited = 'creatorHeart' in (try_get( @@ -2879,8 +3266,8 @@ def extract_thread(contents): # YouTube comments have a max depth of 2 max_depth = int_or_none(get_single_config_arg('max_comment_depth')) if max_depth: - self._downloader.deprecation_warning( - '[youtube] max_comment_depth extractor argument is deprecated. Set max replies in the max-comments extractor argument instead.') + self._downloader.deprecated_feature('[youtube] max_comment_depth extractor argument is deprecated. ' + 'Set max replies in the max-comments extractor argument instead') if max_depth == 1 and parent: return @@ -2915,15 +3302,25 @@ def extract_thread(contents): note_prefix = '%sDownloading comment%s API JSON page %d %s' % ( ' ' if parent else '', ' replies' if parent else '', page_num, comment_prog_str) - - response = self._extract_response( - item_id=None, query=continuation, - ep='next', ytcfg=ytcfg, headers=headers, note=note_prefix, - check_get_keys='onResponseReceivedEndpoints' if not is_forced_continuation else None) - is_forced_continuation = False - continuation_contents = traverse_obj( - response, 'onResponseReceivedEndpoints', expected_type=list, default=[]) - + try: + response = self._extract_response( + item_id=None, query=continuation, + ep='next', ytcfg=ytcfg, headers=headers, note=note_prefix, + check_get_keys='onResponseReceivedEndpoints' if not is_forced_continuation else None) + except ExtractorError as e: + # Ignore incomplete data error for replies if retries didn't work. + # This is to allow any other parent comments and comment threads to be downloaded. 
+ # See: https://github.com/yt-dlp/yt-dlp/issues/4669 + if 'incomplete data' in str(e).lower() and parent and self.get_param('ignoreerrors') is True: + self.report_warning( + 'Received incomplete data for a comment reply thread and retrying did not help. ' + 'Ignoring to let other comments be downloaded.') + else: + raise + is_forced_continuation = False + continuation_contents = traverse_obj( + response, 'onResponseReceivedEndpoints', expected_type=list, default=[]) + continuation = None for continuation_section in continuation_contents: continuation_items = traverse_obj( @@ -2948,6 +3345,7 @@ def extract_thread(contents): message = self._get_text(root_continuation_data, ('contents', ..., 'messageRenderer', 'text'), max_runs=1) if message and not parent and tracker['running_total'] == 0: self.report_warning(f'Youtube said: {message}', video_id=video_id, only_once=True) + raise self.CommentsDisabled @staticmethod def _generate_comment_continuation(video_id): @@ -3002,7 +3400,9 @@ def _is_agegated(player_response): def _is_unplayable(player_response): return traverse_obj(player_response, ('playabilityStatus', 'status')) == 'UNPLAYABLE' - def _extract_player_response(self, client, video_id, master_ytcfg, player_ytcfg, player_url, initial_pr): + _STORY_PLAYER_PARAMS = '8AEB' + + def _extract_player_response(self, client, video_id, master_ytcfg, player_ytcfg, player_url, initial_pr, smuggled_data): session_index = self._extract_session_index(player_ytcfg, master_ytcfg) syncid = self._extract_account_syncid(player_ytcfg, master_ytcfg, initial_pr) @@ -3012,8 +3412,10 @@ def _extract_player_response(self, client, video_id, master_ytcfg, player_ytcfg, yt_query = { 'videoId': video_id, - 'params': '8AEB' # enable stories } + if smuggled_data.get('is_story') or _split_innertube_client(client)[0] == 'android': + yt_query['params'] = self._STORY_PLAYER_PARAMS + yt_query.update(self._generate_player_context(sts)) return self._extract_response( item_id=video_id, ep='player', 
query=yt_query, @@ -3046,7 +3448,7 @@ def _get_requested_clients(self, url, smuggled_data): return orderedSet(requested_clients) - def _extract_player_responses(self, clients, video_id, webpage, master_ytcfg): + def _extract_player_responses(self, clients, video_id, webpage, master_ytcfg, smuggled_data): initial_pr = None if webpage: initial_pr = self._search_json( @@ -3096,7 +3498,7 @@ def append_client(*client_names): try: pr = initial_pr if client == 'web' and initial_pr else self._extract_player_response( - client, video_id, player_ytcfg or master_ytcfg, player_ytcfg, player_url if require_js_player else None, initial_pr) + client, video_id, player_ytcfg or master_ytcfg, player_ytcfg, player_url if require_js_player else None, initial_pr, smuggled_data) except ExtractorError as e: if last_error: self.report_warning(last_error) @@ -3104,7 +3506,14 @@ def append_client(*client_names): continue if pr: - prs.append(pr) + # YouTube may return a different video player response than expected. 
+ # See: https://github.com/TeamNewPipe/NewPipe/issues/8713 + pr_video_id = traverse_obj(pr, ('videoDetails', 'videoId')) + if pr_video_id and pr_video_id != video_id: + self.report_warning( + f'Skipping player response from {client} client (got player response for video "{pr_video_id}" instead of "{video_id}")' + bug_reports_message()) + else: + prs.append(pr) # creator clients can bypass AGE_VERIFICATION_REQUIRED if logged in if variant == 'embedded' and self._is_unplayable(pr) and self.is_authenticated: @@ -3121,9 +3530,14 @@ def append_client(*client_names): self.report_warning(last_error) return prs, player_url - def _extract_formats(self, streaming_data, video_id, player_url, is_live, duration): - itags, stream_ids = {}, [] - itag_qualities, res_qualities = {}, {} + def _needs_live_processing(self, live_status, duration): + if (live_status == 'is_live' and self.get_param('live_from_start') + or live_status == 'post_live' and (duration or 0) > 4 * 3600): + return live_status + + def _extract_formats_and_subtitles(self, streaming_data, video_id, player_url, live_status, duration): + itags, stream_ids = collections.defaultdict(set), [] + itag_qualities, res_qualities = {}, {0: None} q = qualities([ # Normally tiny is the smallest video-only formats. 
But # audio-only formats with unknown quality may get tagged as tiny @@ -3164,7 +3578,7 @@ def _extract_formats(self, streaming_data, video_id, player_url, is_live, durati fmt_url = fmt.get('url') if not fmt_url: - sc = compat_parse_qs(fmt.get('signatureCipher')) + sc = urllib.parse.parse_qs(fmt.get('signatureCipher')) fmt_url = url_or_none(try_get(sc, lambda x: x['url'][0])) encrypted_sig = try_get(sc, lambda x: x['s'][0]) if not all((sc, fmt_url, player_url, encrypted_sig)): @@ -3175,7 +3589,8 @@ def _extract_formats(self, streaming_data, video_id, player_url, is_live, durati self._decrypt_signature(encrypted_sig, video_id, player_url) ) except ExtractorError as e: - self.report_warning('Signature extraction failed: Some formats may be missing', only_once=True) + self.report_warning('Signature extraction failed: Some formats may be missing', + video_id=video_id, only_once=True) self.write_debug(e, only_once=True) continue @@ -3183,28 +3598,35 @@ def _extract_formats(self, streaming_data, video_id, player_url, is_live, durati throttled = False if query.get('n'): try: + decrypt_nsig = self._cached(self._decrypt_nsig, 'nsig', query['n'][0]) fmt_url = update_url_query(fmt_url, { - 'n': self._decrypt_nsig(query['n'][0], video_id, player_url)}) + 'n': decrypt_nsig(query['n'][0], video_id, player_url) + }) except ExtractorError as e: - self.report_warning( - 'nsig extraction failed: You may experience throttling for some formats\n' - f'n = {query["n"][0]} ; player = {player_url}', only_once=True) - self.write_debug(e, only_once=True) + phantomjs_hint = '' + if isinstance(e, JSInterpreter.Exception): + phantomjs_hint = (f' Install {self._downloader._format_err("PhantomJS", self._downloader.Styles.EMPHASIS)} ' + f'to workaround the issue. 
{PhantomJSwrapper.INSTALL_HINT}\n') + if player_url: + self.report_warning( + f'nsig extraction failed: You may experience throttling for some formats\n{phantomjs_hint}' + f' n = {query["n"][0]} ; player = {player_url}', video_id=video_id, only_once=True) + self.write_debug(e, only_once=True) + else: + self.report_warning( + 'Cannot decrypt nsig without player_url: You may experience throttling for some formats', + video_id=video_id, only_once=True) throttled = True - if itag: - itags[itag] = 'https' - stream_ids.append(stream_id) - tbr = float_or_none(fmt.get('averageBitrate') or fmt.get('bitrate'), 1000) language_preference = ( 10 if audio_track.get('audioIsDefault') and 10 else -10 if 'descriptive' in (audio_track.get('displayName') or '').lower() and -10 else -1) # Some formats may have much smaller duration than others (possibly damaged during encoding) - # Eg: 2-nOtRESiUc Ref: https://github.com/yt-dlp/yt-dlp/issues/2823 + # E.g. 2-nOtRESiUc Ref: https://github.com/yt-dlp/yt-dlp/issues/2823 # Make sure to avoid false positives with small duration differences. - # Eg: __2ABJjxzNo, ySuUZEjARPY + # E.g. __2ABJjxzNo, ySuUZEjARPY is_damaged = try_get(fmt, lambda x: float(x['approxDurationMs']) / duration < 500) if is_damaged: self.report_warning( @@ -3217,10 +3639,13 @@ def _extract_formats(self, streaming_data, video_id, player_url, is_live, durati '%s%s' % (audio_track.get('displayName') or '', ' (default)' if language_preference > 0 else ''), fmt.get('qualityLabel') or quality.replace('audio_quality_', ''), + try_get(fmt, lambda x: x['projectionType'].replace('RECTANGULAR', '').lower()), + try_get(fmt, lambda x: x['spatialAudioType'].replace('SPATIAL_AUDIO_TYPE_', '').lower()), throttled and 'THROTTLED', is_damaged and 'DAMAGED', delim=', '), # Format 22 is likely to be damaged. 
See https://github.com/yt-dlp/yt-dlp/issues/3372 'source_preference': -10 if throttled else -5 if itag == '22' else -1, 'fps': int_or_none(fmt.get('fps')) or None, + 'audio_channels': fmt.get('audioChannels'), 'height': height, 'quality': q(quality), 'has_drm': bool(fmt.get('drmFamilies')), @@ -3251,51 +3676,70 @@ def _extract_formats(self, streaming_data, video_id, player_url, is_live, durati } if dct.get('ext'): dct['container'] = dct['ext'] + '_dash' + + if itag: + itags[itag].add(('https', dct.get('language'))) + stream_ids.append(stream_id) yield dct - live_from_start = is_live and self.get_param('live_from_start') - skip_manifests = self._configuration_arg('skip') - if not self.get_param('youtube_include_hls_manifest', True): - skip_manifests.append('hls') + needs_live_processing = self._needs_live_processing(live_status, duration) + skip_bad_formats = not self._configuration_arg('include_incomplete_formats') + + skip_manifests = set(self._configuration_arg('skip')) + if (not self.get_param('youtube_include_hls_manifest', True) + or needs_live_processing == 'is_live' # These will be filtered out by YoutubeDL anyway + or needs_live_processing and skip_bad_formats): + skip_manifests.add('hls') + if not self.get_param('youtube_include_dash_manifest', True): - skip_manifests.append('dash') - get_dash = 'dash' not in skip_manifests and ( - not is_live or live_from_start or self._configuration_arg('include_live_dash')) - get_hls = not live_from_start and 'hls' not in skip_manifests + skip_manifests.add('dash') + if self._configuration_arg('include_live_dash'): + self._downloader.deprecated_feature('[youtube] include_live_dash extractor argument is deprecated. 
' + 'Use include_incomplete_formats extractor argument instead') + elif skip_bad_formats and live_status == 'is_live' and needs_live_processing != 'is_live': + skip_manifests.add('dash') def process_manifest_format(f, proto, itag): - if itag in itags: - if itags[itag] == proto or f'{itag}-{proto}' in itags: - return False - itag = f'{itag}-{proto}' - if itag: + key = (proto, f.get('language')) + if key in itags[itag]: + return False + itags[itag].add(key) + + if any(p != proto for p, _ in itags[itag]): + f['format_id'] = f'{itag}-{proto}' + elif itag: f['format_id'] = itag - itags[itag] = proto - f['quality'] = next(( - q(qdict[val]) - for val, qdict in ((f.get('format_id', '').split('-')[0], itag_qualities), (f.get('height'), res_qualities)) - if val in qdict), -1) + f['quality'] = q(itag_qualities.get(try_get(f, lambda f: f['format_id'].split('-')[0]), -1)) + if f['quality'] == -1 and f.get('height'): + f['quality'] = q(res_qualities[min(res_qualities, key=lambda x: abs(x - f['height']))]) return True + subtitles = {} for sd in streaming_data: - hls_manifest_url = get_hls and sd.get('hlsManifestUrl') + hls_manifest_url = 'hls' not in skip_manifests and sd.get('hlsManifestUrl') if hls_manifest_url: - for f in self._extract_m3u8_formats(hls_manifest_url, video_id, 'mp4', fatal=False): + fmts, subs = self._extract_m3u8_formats_and_subtitles( + hls_manifest_url, video_id, 'mp4', fatal=False, live=live_status == 'is_live') + subtitles = self._merge_subtitles(subs, subtitles) + for f in fmts: if process_manifest_format(f, 'hls', self._search_regex( r'/itag/(\d+)', f['url'], 'itag', default=None)): yield f - dash_manifest_url = get_dash and sd.get('dashManifestUrl') + dash_manifest_url = 'dash' not in skip_manifests and sd.get('dashManifestUrl') if dash_manifest_url: - for f in self._extract_mpd_formats(dash_manifest_url, video_id, fatal=False): + formats, subs = self._extract_mpd_formats_and_subtitles(dash_manifest_url, video_id, fatal=False) + subtitles = 
self._merge_subtitles(subs, subtitles) # Prioritize HLS subs over DASH + for f in formats: if process_manifest_format(f, 'dash', f['format_id']): f['filesize'] = int_or_none(self._search_regex( r'/clen/(\d+)', f.get('fragment_base_url') or f['url'], 'file size', default=None)) - if live_from_start: + if needs_live_processing: f['is_from_start'] = True yield f + yield subtitles def _extract_storyboard(self, player_responses, duration): spec = get_first( @@ -3326,6 +3770,9 @@ def _extract_storyboard(self, player_responses, duration): 'url': url, 'width': width, 'height': height, + 'fps': frame_count / duration, + 'rows': rows, + 'columns': cols, 'fragments': [{ 'url': url.replace('$M', str(j)), 'duration': min(fragment_duration, duration - (j * fragment_duration)), @@ -3335,14 +3782,17 @@ def _extract_storyboard(self, player_responses, duration): def _download_player_responses(self, url, smuggled_data, video_id, webpage_url): webpage = None if 'webpage' not in self._configuration_arg('player_skip'): + query = {'bpctr': '9999999999', 'has_verified': '1'} + if smuggled_data.get('is_story'): + query['pp'] = self._STORY_PLAYER_PARAMS webpage = self._download_webpage( - webpage_url + '&bpctr=9999999999&has_verified=1&pp=8AEB', video_id, fatal=False) + webpage_url, video_id, fatal=False, query=query) master_ytcfg = self.extract_ytcfg(video_id, webpage) or self._get_default_ytcfg() player_responses, player_url = self._extract_player_responses( self._get_requested_clients(url, smuggled_data), - video_id, webpage, master_ytcfg) + video_id, webpage, master_ytcfg, smuggled_data) return webpage, master_ytcfg, player_responses, player_url @@ -3351,11 +3801,19 @@ def _list_formats(self, video_id, microformats, video_details, player_responses, is_live = get_first(video_details, 'isLive') if is_live is None: is_live = get_first(live_broadcast_details, 'isLiveNow') - + live_content = get_first(video_details, 'isLiveContent') + is_upcoming = get_first(video_details, 'isUpcoming') + 
post_live = get_first(video_details, 'isPostLiveDvr') + live_status = ('post_live' if post_live + else 'is_live' if is_live + else 'is_upcoming' if is_upcoming + else 'was_live' if live_content + else 'not_live' if False in (is_live, live_content) + else None) streaming_data = traverse_obj(player_responses, (..., 'streamingData'), default=[]) - formats = list(self._extract_formats(streaming_data, video_id, player_url, is_live, duration)) + *formats, subtitles = self._extract_formats_and_subtitles(streaming_data, video_id, player_url, live_status, duration) - return live_broadcast_details, is_live, streaming_data, formats + return live_broadcast_details, live_status, streaming_data, formats, subtitles def _real_extract(self, url): url, smuggled_data = unsmuggle_url(url, {}) @@ -3385,11 +3843,19 @@ def _real_extract(self, url): microformats = traverse_obj( player_responses, (..., 'microformat', 'playerMicroformatRenderer'), expected_type=dict, default=[]) - video_title = ( - get_first(video_details, 'title') - or self._get_text(microformats, (..., 'title')) - or search_meta(['og:title', 'twitter:title', 'title'])) - video_description = get_first(video_details, 'shortDescription') + + translated_title = self._get_text(microformats, (..., 'title')) + video_title = (self._preferred_lang and translated_title + or get_first(video_details, 'title') # primary + or translated_title + or search_meta(['og:title', 'twitter:title', 'title'])) + translated_description = self._get_text(microformats, (..., 'description')) + original_description = get_first(video_details, 'shortDescription') + video_description = ( + self._preferred_lang and translated_description + # If original description is blank, it will be an empty string. + # Do not prefer translated description in this case. 
+ or original_description if original_description is not None else translated_description) multifeed_metadata_list = get_first( player_responses, @@ -3405,12 +3871,12 @@ def _real_extract(self, url): # Unquote should take place before split on comma (,) since textual # fields may contain comma as well (see # https://github.com/ytdl-org/youtube-dl/issues/8536) - feed_data = compat_parse_qs( - compat_urllib_parse_unquote_plus(feed)) + feed_data = urllib.parse.parse_qs( + urllib.parse.unquote_plus(feed)) def feed_entry(name): return try_get( - feed_data, lambda x: x[name][0], compat_str) + feed_data, lambda x: x[name][0], str) feed_id = feed_entry('id') if not feed_id: @@ -3434,20 +3900,14 @@ def feed_entry(name): return self.playlist_result( entries, video_id, video_title, video_description) - duration = int_or_none( - get_first(video_details, 'lengthSeconds') - or get_first(microformats, 'lengthSeconds') - or parse_duration(search_meta('duration'))) or None - - if get_first(video_details, 'isPostLiveDvr'): - self.write_debug('Video is in Post-Live Manifestless mode') - if duration or 0 > 4 * 3600: - self.report_warning( - 'The livestream has not finished processing. Only 4 hours of the video can be currently downloaded. 
' - 'This is a known issue and patches are welcome') + duration = (int_or_none(get_first(video_details, 'lengthSeconds')) + or int_or_none(get_first(microformats, 'lengthSeconds')) + or parse_duration(search_meta('duration')) or None) - live_broadcast_details, is_live, streaming_data, formats = self._list_formats( - video_id, microformats, video_details, player_responses, player_url, duration) + live_broadcast_details, live_status, streaming_data, formats, automatic_captions = \ + self._list_formats(video_id, microformats, video_details, player_responses, player_url, duration) + if live_status == 'post_live': + self.write_debug(f'{video_id}: Video is in Post-Live Manifestless mode') if not formats: if not self.get_param('allow_unplayable_formats') and traverse_obj(streaming_data, (..., 'licenseInfos')): @@ -3497,7 +3957,7 @@ def feed_entry(name): # See: https://github.com/yt-dlp/yt-dlp/issues/340 # List of possible thumbnails - Ref: thumbnail_names = [ - # While the *1,*2,*3 thumbnails are just below their correspnding "*default" variants + # While the *1,*2,*3 thumbnails are just below their corresponding "*default" variants # in resolution, these are not the custom thumbnail. 
So de-prioritize them 'maxresdefault', 'hq720', 'sddefault', 'hqdefault', '0', 'mqdefault', 'default', 'sd1', 'sd2', 'sd3', 'hq1', 'hq2', 'hq3', 'mq1', 'mq2', 'mq3', '1', '2', '3' @@ -3506,7 +3966,7 @@ def feed_entry(name): thumbnails.extend({ 'url': 'https://i.ytimg.com/vi{webp}/{video_id}/{name}{live}.{ext}'.format( video_id=video_id, name=name, ext=ext, - webp='_webp' if ext == 'webp' else '', live='_live' if is_live else ''), + webp='_webp' if ext == 'webp' else '', live='_live' if live_status == 'is_live' else ''), } for name in thumbnail_names for ext in ('webp', 'jpg')) for thumb in thumbnails: i = next((i for i, t in enumerate(thumbnail_names) if f'/{video_id}/{t}' in thumb['url']), n_thumbnail_names) @@ -3521,26 +3981,29 @@ def feed_entry(name): or search_meta('channelId')) owner_profile_url = get_first(microformats, 'ownerProfileUrl') - live_content = get_first(video_details, 'isLiveContent') - is_upcoming = get_first(video_details, 'isUpcoming') - if is_live is None: - if is_upcoming or live_content is False: - is_live = False - if is_upcoming is None and (live_content or is_live): - is_upcoming = False live_start_time = parse_iso8601(get_first(live_broadcast_details, 'startTimestamp')) live_end_time = parse_iso8601(get_first(live_broadcast_details, 'endTimestamp')) if not duration and live_end_time and live_start_time: duration = live_end_time - live_start_time - if is_live and self.get_param('live_from_start'): - self._prepare_live_from_start_formats(formats, video_id, live_start_time, url, webpage_url, smuggled_data) + needs_live_processing = self._needs_live_processing(live_status, duration) - formats.extend(self._extract_storyboard(player_responses, duration)) + def is_bad_format(fmt): + if needs_live_processing and not fmt.get('is_from_start'): + return True + elif (live_status == 'is_live' and needs_live_processing != 'is_live' + and fmt.get('protocol') == 'http_dash_segments'): + return True - # Source is given priority since formats that 
throttle are given lower source_preference - # When throttling issue is fully fixed, remove this - self._sort_formats(formats, ('quality', 'res', 'fps', 'hdr:12', 'source', 'codec:vp9.2', 'lang', 'proto')) + for fmt in filter(is_bad_format, formats): + fmt['preference'] = (fmt.get('preference') or -1) - 10 + fmt['format_note'] = join_nonempty(fmt.get('format_note'), '(Last 4 hours)', delim=' ') + + if needs_live_processing: + self._prepare_live_from_start_formats( + formats, video_id, live_start_time, url, webpage_url, smuggled_data, live_status == 'is_live') + + formats.extend(self._extract_storyboard(player_responses, duration)) info = { 'id': video_id, @@ -3569,14 +4032,13 @@ def feed_entry(name): 'categories': [category] if category else None, 'tags': keywords, 'playable_in_embed': get_first(playability_statuses, 'playableInEmbed'), - 'is_live': is_live, - 'was_live': (False if is_live or is_upcoming or live_content is False - else None if is_live is None or is_upcoming is None - else live_content), - 'live_status': 'is_upcoming' if is_upcoming else None, # rest will be set by YoutubeDL + 'live_status': live_status, 'release_timestamp': live_start_time, + '_format_sort_fields': ( # source_preference is lower for throttled/potentially damaged formats + 'quality', 'res', 'fps', 'hdr:12', 'source', 'vcodec:vp9.2', 'channels', 'acodec', 'lang', 'proto') } + subtitles = {} pctr = traverse_obj(player_responses, (..., 'captions', 'playerCaptionsTracklistRenderer'), expected_type=dict) if pctr: def get_lang_code(track): @@ -3603,7 +4065,9 @@ def process_language(container, base_url, lang_code, sub_name, query): 'name': sub_name, }) - subtitles, automatic_captions = {}, {} + # NB: Constructing the full subtitle dictionary is slow + get_translated_subs = 'translated_subs' not in self._configuration_arg('skip') and ( + self.get_param('writeautomaticsub', False) or self.get_param('listsubtitles')) for lang_code, caption_track in captions.items(): base_url = 
caption_track.get('baseUrl') orig_lang = parse_qs(base_url).get('lang', [None])[-1] @@ -3621,8 +4085,8 @@ def process_language(container, base_url, lang_code, sub_name, query): if not trans_code: continue orig_trans_code = trans_code - if caption_track.get('kind') != 'asr': - if 'translated_subs' in self._configuration_arg('skip'): + if caption_track.get('kind') != 'asr' and trans_code != 'und': + if not get_translated_subs: continue trans_code += f'-{lang_code}' trans_name += format_field(lang_name, None, ' from %s') @@ -3634,12 +4098,13 @@ def process_language(container, base_url, lang_code, sub_name, query): # Setting tlang=lang returns damaged subtitles. process_language(automatic_captions, base_url, trans_code, trans_name, {} if orig_lang == orig_trans_code else {'tlang': trans_code}) - info['automatic_captions'] = automatic_captions - info['subtitles'] = subtitles - parsed_url = compat_urllib_parse_urlparse(url) + info['automatic_captions'] = automatic_captions + info['subtitles'] = subtitles + + parsed_url = urllib.parse.urlparse(url) for component in [parsed_url.fragment, parsed_url.query]: - query = compat_parse_qs(component) + query = urllib.parse.parse_qs(component) for k, v in query.items(): for d_k, s_ks in [('start', ('start', 't')), ('end', ('end',))]: d_k += '_time' @@ -3702,7 +4167,8 @@ def process_language(container, base_url, lang_code, sub_name, query): 'url': f'https://www.youtube.com/watch?v={video_id}&bpctr=9999999999&has_verified=1', 'video_id': video_id, 'ext': 'json', - 'protocol': 'youtube_live_chat' if is_live or is_upcoming else 'youtube_live_chat_replay', + 'protocol': ('youtube_live_chat' if live_status in ('is_live', 'is_upcoming') + else 'youtube_live_chat_replay'), }] if initial_data: @@ -3737,19 +4203,24 @@ def process_language(container, base_url, lang_code, sub_name, query): vpir, lambda x: x['videoActions']['menuRenderer']['topLevelButtons'], list) or []): - tbr = tlb.get('toggleButtonRenderer') or {} - for getter, regex in [( 
- lambda x: x['defaultText']['accessibility']['accessibilityData'], - r'(?P[\d,]+)\s*(?P(?:dis)?like)'), ([ - lambda x: x['accessibility'], - lambda x: x['accessibilityData']['accessibilityData'], - ], r'(?P(?:dis)?like) this video along with (?P[\d,]+) other people')]: - label = (try_get(tbr, getter, dict) or {}).get('label') - if label: - mobj = re.match(regex, label) - if mobj: - info[mobj.group('type') + '_count'] = str_to_int(mobj.group('count')) - break + tbrs = variadic( + traverse_obj( + tlb, 'toggleButtonRenderer', + ('segmentedLikeDislikeButtonRenderer', ..., 'toggleButtonRenderer'), + default=[])) + for tbr in tbrs: + for getter, regex in [( + lambda x: x['defaultText']['accessibility']['accessibilityData'], + r'(?P[\d,]+)\s*(?P(?:dis)?like)'), ([ + lambda x: x['accessibility'], + lambda x: x['accessibilityData']['accessibilityData'], + ], r'(?P(?:dis)?like) this video along with (?P[\d,]+) other people')]: + label = (try_get(tbr, getter, dict) or {}).get('label') + if label: + mobj = re.match(regex, label) + if mobj: + info[mobj.group('type') + '_count'] = str_to_int(mobj.group('count')) + break sbr_tooltip = try_get( vpir, lambda x: x['sentimentBar']['sentimentBarRenderer']['tooltip']) if sbr_tooltip: @@ -3758,6 +4229,15 @@ def process_language(container, base_url, lang_code, sub_name, query): 'like_count': str_to_int(like_count), 'dislike_count': str_to_int(dislike_count), }) + vcr = traverse_obj(vpir, ('viewCount', 'videoViewCountRenderer')) + if vcr: + vc = self._get_count(vcr, 'viewCount') + # Upcoming premieres with waiting count are treated as live here + if vcr.get('isLive'): + info['concurrent_view_count'] = vc + elif info.get('view_count') is None: + info['view_count'] = vc + vsir = get_first(contents, 'videoSecondaryInfoRenderer') if vsir: vor = traverse_obj(vsir, ('owner', 'videoOwnerRenderer')) @@ -3803,8 +4283,12 @@ def process_language(container, base_url, lang_code, sub_name, query): upload_date = ( 
unified_strdate(get_first(microformats, 'uploadDate')) or unified_strdate(search_meta('uploadDate'))) - if not upload_date or (not info.get('is_live') and not info.get('was_live') and info.get('live_status') != 'is_upcoming'): - upload_date = strftime_or_none(self._extract_time_text(vpir, 'dateText')[0], '%Y%m%d') or upload_date + if not upload_date or ( + live_status in ('not_live', None) + and 'no-youtube-prefer-utc-upload-date' not in self.get_param('compat_opts', []) + ): + upload_date = strftime_or_none( + self._parse_time_text(self._get_text(vpir, 'dateText')), '%Y%m%d') or upload_date info['upload_date'] = upload_date for to, frm in fallbacks.items(): @@ -3816,33 +4300,25 @@ def process_language(container, base_url, lang_code, sub_name, query): if v: info[d_k] = v - is_private = get_first(video_details, 'isPrivate', expected_type=bool) - is_unlisted = get_first(microformats, 'isUnlisted', expected_type=bool) - is_membersonly = None - is_premium = None - if initial_data and is_private is not None: - is_membersonly = False - is_premium = False - contents = try_get(initial_data, lambda x: x['contents']['twoColumnWatchNextResults']['results']['results']['contents'], list) or [] - badge_labels = set() - for content in contents: - if not isinstance(content, dict): - continue - badge_labels.update(self._extract_badges(content.get('videoPrimaryInfoRenderer'))) - for badge_label in badge_labels: - if badge_label.lower() == 'members only': - is_membersonly = True - elif badge_label.lower() == 'premium': - is_premium = True - elif badge_label.lower() == 'unlisted': - is_unlisted = True - - info['availability'] = self._availability( - is_private=is_private, - needs_premium=is_premium, - needs_subscription=is_membersonly, - needs_auth=info['age_limit'] >= 18, - is_unlisted=None if is_private is None else is_unlisted) + badges = self._extract_badges(traverse_obj(contents, (..., 'videoPrimaryInfoRenderer'), get_all=False)) + + is_private = (self._has_badge(badges, 
BadgeType.AVAILABILITY_PRIVATE) + or get_first(video_details, 'isPrivate', expected_type=bool)) + + info['availability'] = ( + 'public' if self._has_badge(badges, BadgeType.AVAILABILITY_PUBLIC) + else self._availability( + is_private=is_private, + needs_premium=( + self._has_badge(badges, BadgeType.AVAILABILITY_PREMIUM) + or False if initial_data and is_private is not None else None), + needs_subscription=( + self._has_badge(badges, BadgeType.AVAILABILITY_SUBSCRIPTION) + or False if initial_data and is_private is not None else None), + needs_auth=info['age_limit'] >= 18, + is_unlisted=None if is_private is None else ( + self._has_badge(badges, BadgeType.AVAILABILITY_UNLISTED) + or get_first(microformats, 'isUnlisted', expected_type=bool)))) info['__post_extractor'] = self.extract_comments(master_ytcfg, video_id, contents, webpage) @@ -3852,15 +4328,19 @@ def process_language(container, base_url, lang_code, sub_name, query): class YoutubeTabBaseInfoExtractor(YoutubeBaseInfoExtractor): - @staticmethod def passthrough_smuggled_data(func): - def _smuggle(entries, smuggled_data): - for entry in entries: - # TODO: Convert URL to music.youtube instead. - # Do we need to passthrough any other smuggled_data? 
- entry['url'] = smuggle_url(entry['url'], smuggled_data) - yield entry + def _smuggle(info, smuggled_data): + if info.get('_type') not in ('url', 'url_transparent'): + return info + if smuggled_data.get('is_music_url'): + parsed_url = urllib.parse.urlparse(info['url']) + if parsed_url.netloc in ('www.youtube.com', 'music.youtube.com'): + smuggled_data.pop('is_music_url') + info['url'] = urllib.parse.urlunparse(parsed_url._replace(netloc='music.youtube.com')) + if smuggled_data: + info['url'] = smuggle_url(info['url'], smuggled_data) + return info @functools.wraps(func) def wrapper(self, url): @@ -3868,8 +4348,10 @@ def wrapper(self, url): if self.is_music_url(url): smuggled_data['is_music_url'] = True info_dict = func(self, url, smuggled_data) - if smuggled_data and info_dict.get('entries'): - info_dict['entries'] = _smuggle(info_dict['entries'], smuggled_data) + if smuggled_data: + _smuggle(info_dict, smuggled_data) + if info_dict.get('entries'): + info_dict['entries'] = (_smuggle(i, smuggled_data.copy()) for i in info_dict['entries']) return info_dict return wrapper @@ -3900,6 +4382,25 @@ def _extract_basic_item_renderer(item): elif key.startswith('grid') and key.endswith('Renderer'): return renderer + def _extract_channel_renderer(self, renderer): + channel_id = renderer['channelId'] + title = self._get_text(renderer, 'title') + channel_url = f'https://www.youtube.com/channel/{channel_id}' + return { + '_type': 'url', + 'url': channel_url, + 'id': channel_id, + 'ie_key': YoutubeTabIE.ie_key(), + 'channel': title, + 'channel_id': channel_id, + 'channel_url': channel_url, + 'title': title, + 'channel_follower_count': self._get_count(renderer, 'subscriberCountText'), + 'thumbnails': self._extract_thumbnails(renderer, 'thumbnail'), + 'playlist_count': self._get_count(renderer, 'videoCountText'), + 'description': self._get_text(renderer, 'descriptionSnippet'), + } + def _grid_entries(self, grid_renderer): for item in grid_renderer['items']: if not isinstance(item, 
dict): @@ -3925,14 +4426,12 @@ def _grid_entries(self, grid_renderer): # channel channel_id = renderer.get('channelId') if channel_id: - yield self.url_result( - 'https://www.youtube.com/channel/%s' % channel_id, - ie=YoutubeTabIE.ie_key(), video_title=title) + yield self._extract_channel_renderer(renderer) continue # generic endpoint URL support ep_url = urljoin('https://www.youtube.com/', try_get( renderer, lambda x: x['navigationEndpoint']['commandMetadata']['webCommandMetadata']['url'], - compat_str)) + str)) if ep_url: for ie in (YoutubeTabIE, YoutubePlaylistIE, YoutubeIE): if ie.suitable(ep_url): @@ -3976,7 +4475,7 @@ def _shelf_entries_from_content(self, shelf_renderer): def _shelf_entries(self, shelf_renderer, skip_channels=False): ep = try_get( shelf_renderer, lambda x: x['endpoint']['commandMetadata']['webCommandMetadata']['url'], - compat_str) + str) shelf_url = urljoin('https://www.youtube.com', ep) if shelf_url: # Skipping links to another channels, note that checking for @@ -4002,8 +4501,8 @@ def _playlist_entries(self, video_list_renderer): yield self._extract_video(renderer) def _rich_entries(self, rich_grid_renderer): - renderer = try_get( - rich_grid_renderer, lambda x: x['content']['videoRenderer'], dict) or {} + renderer = traverse_obj( + rich_grid_renderer, ('content', ('videoRenderer', 'reelItemRenderer')), get_all=False) or {} video_id = renderer.get('videoId') if not video_id: return @@ -4036,7 +4535,7 @@ def _post_thread_entries(self, post_thread_renderer): yield entry # playlist attachment playlist_id = try_get( - post_renderer, lambda x: x['backstageAttachment']['playlistRenderer']['playlistId'], compat_str) + post_renderer, lambda x: x['backstageAttachment']['playlistRenderer']['playlistId'], str) if playlist_id: yield self.url_result( 'https://www.youtube.com/playlist?list=%s' % playlist_id, @@ -4047,7 +4546,7 @@ def _post_thread_entries(self, post_thread_renderer): if not isinstance(run, dict): continue ep_url = try_get( - run, lambda 
x: x['navigationEndpoint']['urlEndpoint']['url'], compat_str) + run, lambda x: x['navigationEndpoint']['urlEndpoint']['url'], str) if not ep_url: continue if not YoutubeIE.suitable(ep_url): @@ -4080,6 +4579,13 @@ def _rich_grid_entries(self, contents): yield entry ''' + def _report_history_entries(self, renderer): + for url in traverse_obj(renderer, ( + 'rows', ..., 'reportHistoryTableRowRenderer', 'cells', ..., + 'reportHistoryTableCellRenderer', 'cell', 'reportHistoryTableTextCellRenderer', 'text', 'runs', ..., + 'navigationEndpoint', 'commandMetadata', 'webCommandMetadata', 'url')): + yield self.url_result(urljoin('https://www.youtube.com', url), YoutubeIE) + def _extract_entries(self, parent_renderer, continuation_list): # continuation_list is modified in-place with continuation_list = [continuation_token] continuation_list[:] = [None] @@ -4091,12 +4597,16 @@ def _extract_entries(self, parent_renderer, continuation_list): content, 'itemSectionRenderer', 'musicShelfRenderer', 'musicShelfContinuation', expected_type=dict) if not is_renderer: - renderer = content.get('richItemRenderer') - if renderer: - for entry in self._rich_entries(renderer): + if content.get('richItemRenderer'): + for entry in self._rich_entries(content['richItemRenderer']): yield entry continuation_list[0] = self._extract_continuation(parent_renderer) + elif content.get('reportHistorySectionRenderer'): # https://www.youtube.com/reporthistory + table = traverse_obj(content, ('reportHistorySectionRenderer', 'table', 'tableRenderer')) + yield from self._report_history_entries(table) + continuation_list[0] = self._extract_continuation(table) continue + isr_contents = try_get(is_renderer, lambda x: x['contents'], list) or [] for isr_content in isr_contents: if not isinstance(isr_content, dict): @@ -4157,26 +4667,6 @@ def _entries(self, tab, item_id, ytcfg, account_syncid, visitor_data): # See: https://github.com/ytdl-org/youtube-dl/issues/28702 visitor_data = self._extract_visitor_data(response) 
or visitor_data - known_continuation_renderers = { - 'playlistVideoListContinuation': self._playlist_entries, - 'gridContinuation': self._grid_entries, - 'itemSectionContinuation': self._post_thread_continuation_entries, - 'sectionListContinuation': extract_entries, # for feeds - } - continuation_contents = try_get( - response, lambda x: x['continuationContents'], dict) or {} - continuation_renderer = None - for key, value in continuation_contents.items(): - if key not in known_continuation_renderers: - continue - continuation_renderer = value - continuation_list = [None] - yield from known_continuation_renderers[key](continuation_renderer) - continuation = continuation_list[0] or self._extract_continuation(continuation_renderer) - break - if continuation_renderer: - continue - known_renderers = { 'videoRenderer': (self._grid_entries, 'items'), # for membership tab 'gridPlaylistRenderer': (self._grid_entries, 'items'), @@ -4185,79 +4675,81 @@ def _entries(self, tab, item_id, ytcfg, account_syncid, visitor_data): 'playlistVideoRenderer': (self._playlist_entries, 'contents'), 'itemSectionRenderer': (extract_entries, 'contents'), # for feeds 'richItemRenderer': (extract_entries, 'contents'), # for hashtag - 'backstagePostThreadRenderer': (self._post_thread_continuation_entries, 'contents') + 'backstagePostThreadRenderer': (self._post_thread_continuation_entries, 'contents'), + 'reportHistoryTableRowRenderer': (self._report_history_entries, 'rows'), + 'playlistVideoListContinuation': (self._playlist_entries, None), + 'gridContinuation': (self._grid_entries, None), + 'itemSectionContinuation': (self._post_thread_continuation_entries, None), + 'sectionListContinuation': (extract_entries, None), # for feeds } - on_response_received = dict_get(response, ('onResponseReceivedActions', 'onResponseReceivedEndpoints')) - continuation_items = try_get( - on_response_received, lambda x: x[0]['appendContinuationItemsAction']['continuationItems'], list) - continuation_item = 
try_get(continuation_items, lambda x: x[0], dict) or {} + + continuation_items = traverse_obj(response, ( + ('onResponseReceivedActions', 'onResponseReceivedEndpoints'), ..., + 'appendContinuationItemsAction', 'continuationItems' + ), 'continuationContents', get_all=False) + continuation_item = traverse_obj(continuation_items, 0, None, expected_type=dict, default={}) + video_items_renderer = None - for key, value in continuation_item.items(): + for key in continuation_item.keys(): if key not in known_renderers: continue - video_items_renderer = {known_renderers[key][1]: continuation_items} + func, parent_key = known_renderers[key] + video_items_renderer = {parent_key: continuation_items} if parent_key else continuation_items continuation_list = [None] - yield from known_renderers[key][0](video_items_renderer) + yield from func(video_items_renderer) continuation = continuation_list[0] or self._extract_continuation(video_items_renderer) + + if not video_items_renderer: break - if video_items_renderer: - continue - break @staticmethod def _extract_selected_tab(tabs, fatal=True): - for tab in tabs: - renderer = dict_get(tab, ('tabRenderer', 'expandableTabRenderer')) or {} - if renderer.get('selected') is True: - return renderer - else: - if fatal: - raise ExtractorError('Unable to find selected tab') - - def _extract_uploader(self, data): - uploader = {} - renderer = self._extract_sidebar_info_renderer(data, 'playlistSidebarSecondaryInfoRenderer') or {} - owner = try_get( - renderer, lambda x: x['videoOwner']['videoOwnerRenderer']['title']['runs'][0], dict) - if owner: - owner_text = owner.get('text') - uploader['uploader'] = self._search_regex( - r'^by (.+) and \d+ others?$', owner_text, 'uploader', default=owner_text) - uploader['uploader_id'] = try_get( - owner, lambda x: x['navigationEndpoint']['browseEndpoint']['browseId'], compat_str) - uploader['uploader_url'] = urljoin( - 'https://www.youtube.com/', - try_get(owner, lambda x: 
x['navigationEndpoint']['browseEndpoint']['canonicalBaseUrl'], compat_str)) - return {k: v for k, v in uploader.items() if v is not None} + for tab_renderer in tabs: + if tab_renderer.get('selected'): + return tab_renderer + if fatal: + raise ExtractorError('Unable to find selected tab') + + @staticmethod + def _extract_tab_renderers(response): + return traverse_obj( + response, ('contents', 'twoColumnBrowseResultsRenderer', 'tabs', ..., ('tabRenderer', 'expandableTabRenderer')), expected_type=dict) def _extract_from_tabs(self, item_id, ytcfg, data, tabs): - playlist_id = title = description = channel_url = channel_name = channel_id = None - tags = [] + metadata = self._extract_metadata_from_tabs(item_id, data) selected_tab = self._extract_selected_tab(tabs) - primary_sidebar_renderer = self._extract_sidebar_info_renderer(data, 'playlistSidebarPrimaryInfoRenderer') - renderer = try_get( - data, lambda x: x['metadata']['channelMetadataRenderer'], dict) - if renderer: - channel_name = renderer.get('title') - channel_url = renderer.get('channelUrl') - channel_id = renderer.get('externalId') - else: - renderer = try_get( - data, lambda x: x['metadata']['playlistMetadataRenderer'], dict) + metadata['title'] += format_field(selected_tab, 'title', ' - %s') + metadata['title'] += format_field(selected_tab, 'expandedText', ' - %s') - if renderer: - title = renderer.get('title') - description = renderer.get('description', '') - playlist_id = channel_id - tags = renderer.get('keywords', '').split() + return self.playlist_result( + self._entries( + selected_tab, metadata['id'], ytcfg, + self._extract_account_syncid(ytcfg, data), + self._extract_visitor_data(data, ytcfg)), + **metadata) + + def _extract_metadata_from_tabs(self, item_id, data): + info = {'id': item_id} + + metadata_renderer = traverse_obj(data, ('metadata', 'channelMetadataRenderer'), expected_type=dict) + if metadata_renderer: + info.update({ + 'uploader': metadata_renderer.get('title'), + 'uploader_id': 
metadata_renderer.get('externalId'), + 'uploader_url': metadata_renderer.get('channelUrl'), + }) + if info['uploader_id']: + info['id'] = info['uploader_id'] + else: + metadata_renderer = traverse_obj(data, ('metadata', 'playlistMetadataRenderer'), expected_type=dict) # We can get the uncropped banner/avatar by replacing the crop params with '=s0' # See: https://github.com/yt-dlp/yt-dlp/issues/2237#issuecomment-1013694714 def _get_uncropped(url): return url_or_none((url or '').split('=')[0] + '=s0') - avatar_thumbnails = self._extract_thumbnails(renderer, 'avatar') + avatar_thumbnails = self._extract_thumbnails(metadata_renderer, 'avatar') if avatar_thumbnails: uncropped_avatar = _get_uncropped(avatar_thumbnails[0]['url']) if uncropped_avatar: @@ -4268,7 +4760,7 @@ def _get_uncropped(url): }) channel_banners = self._extract_thumbnails( - data, ('header', ..., ['banner', 'mobileBanner', 'tvBanner'])) + data, ('header', ..., ('banner', 'mobileBanner', 'tvBanner'))) for banner in channel_banners: banner['preference'] = -10 @@ -4281,46 +4773,64 @@ def _get_uncropped(url): 'preference': -5 }) + # Deprecated - remove primary_sidebar_renderer when layout discontinued + primary_sidebar_renderer = self._extract_sidebar_info_renderer(data, 'playlistSidebarPrimaryInfoRenderer') + playlist_header_renderer = traverse_obj(data, ('header', 'playlistHeaderRenderer'), expected_type=dict) + primary_thumbnails = self._extract_thumbnails( primary_sidebar_renderer, ('thumbnailRenderer', ('playlistVideoThumbnailRenderer', 'playlistCustomThumbnailRenderer'), 'thumbnail')) + playlist_thumbnails = self._extract_thumbnails( + playlist_header_renderer, ('playlistHeaderBanner', 'heroPlaylistThumbnailRenderer', 'thumbnail')) - if playlist_id is None: - playlist_id = item_id - - playlist_stats = traverse_obj(primary_sidebar_renderer, 'stats') - last_updated_unix, _ = self._extract_time_text(playlist_stats, 2) - if title is None: - title = self._get_text(data, ('header', 'hashtagHeaderRenderer', 
'hashtag')) or playlist_id - title += format_field(selected_tab, 'title', ' - %s') - title += format_field(selected_tab, 'expandedText', ' - %s') - - metadata = { - 'playlist_id': playlist_id, - 'playlist_title': title, - 'playlist_description': description, - 'uploader': channel_name, - 'uploader_id': channel_id, - 'uploader_url': channel_url, - 'thumbnails': primary_thumbnails + avatar_thumbnails + channel_banners, - 'tags': tags, - 'view_count': self._get_count(playlist_stats, 1), + info.update({ + 'title': (traverse_obj(metadata_renderer, 'title') + or self._get_text(data, ('header', 'hashtagHeaderRenderer', 'hashtag')) + or info['id']), 'availability': self._extract_availability(data), - 'modified_date': strftime_or_none(last_updated_unix, '%Y%m%d'), - 'playlist_count': self._get_count(playlist_stats, 0), 'channel_follower_count': self._get_count(data, ('header', ..., 'subscriberCountText')), - } - if not channel_id: - metadata.update(self._extract_uploader(data)) - metadata.update({ - 'channel': metadata['uploader'], - 'channel_id': metadata['uploader_id'], - 'channel_url': metadata['uploader_url']}) - return self.playlist_result( - self._entries( - selected_tab, playlist_id, ytcfg, - self._extract_account_syncid(ytcfg, data), - self._extract_visitor_data(data, ytcfg)), - **metadata) + 'description': try_get(metadata_renderer, lambda x: x.get('description', '')), + 'tags': try_get(metadata_renderer or {}, lambda x: x.get('keywords', '').split()), + 'thumbnails': (primary_thumbnails or playlist_thumbnails) + avatar_thumbnails + channel_banners, + }) + + # Playlist stats is a text runs array containing [video count, view count, last updated]. + # last updated or (view count and last updated) may be missing. 
+ playlist_stats = get_first( + (primary_sidebar_renderer, playlist_header_renderer), (('stats', 'briefStats', 'numVideosText'), )) + + last_updated_unix = self._parse_time_text( + self._get_text(playlist_stats, 2) # deprecated, remove when old layout discontinued + or self._get_text(playlist_header_renderer, ('byline', 1, 'playlistBylineRenderer', 'text'))) + info['modified_date'] = strftime_or_none(last_updated_unix, '%Y%m%d') + + info['view_count'] = self._get_count(playlist_stats, 1) + if info['view_count'] is None: # 0 is allowed + info['view_count'] = self._get_count(playlist_header_renderer, 'viewCountText') + + info['playlist_count'] = self._get_count(playlist_stats, 0) + if info['playlist_count'] is None: # 0 is allowed + info['playlist_count'] = self._get_count(playlist_header_renderer, ('byline', 0, 'playlistBylineRenderer', 'text')) + + if not info.get('uploader_id'): + owner = traverse_obj(playlist_header_renderer, 'ownerText') + if not owner: # Deprecated + owner = traverse_obj( + self._extract_sidebar_info_renderer(data, 'playlistSidebarSecondaryInfoRenderer'), + ('videoOwner', 'videoOwnerRenderer', 'title')) + owner_text = self._get_text(owner) + browse_ep = traverse_obj(owner, ('runs', 0, 'navigationEndpoint', 'browseEndpoint')) or {} + info.update({ + 'uploader': self._search_regex(r'^by (.+) and \d+ others?$', owner_text, 'uploader', default=owner_text), + 'uploader_id': browse_ep.get('browseId'), + 'uploader_url': urljoin('https://www.youtube.com', browse_ep.get('canonicalBaseUrl')) + }) + + info.update({ + 'channel': info['uploader'], + 'channel_id': info['uploader_id'], + 'channel_url': info['uploader_url'] + }) + return info def _extract_inline_playlist(self, playlist, playlist_id, data, ytcfg): first_id = last_id = response = None @@ -4355,13 +4865,13 @@ def _extract_inline_playlist(self, playlist, playlist_id, data, ytcfg): def _extract_from_playlist(self, item_id, url, data, playlist, ytcfg): title = playlist.get('title') or try_get( - 
data, lambda x: x['titleText']['simpleText'], compat_str) + data, lambda x: x['titleText']['simpleText'], str) playlist_id = playlist.get('playlistId') or item_id # Delegating everything except mix playlists to regular tab-based playlist URL playlist_url = urljoin(url, try_get( playlist, lambda x: x['endpoint']['commandMetadata']['webCommandMetadata']['url'], - compat_str)) + str)) # Some playlists are unviewable but YouTube still provides a link to the (broken) playlist page [1] # [1] MLCT, RLTDwFCb4jeqaKWnciAYM-ZVHg @@ -4382,31 +4892,40 @@ def _extract_availability(self, data): Note: Unless YouTube tells us explicitly, we do not assume it is public @param data: response """ - is_private = is_unlisted = None - renderer = self._extract_sidebar_info_renderer(data, 'playlistSidebarPrimaryInfoRenderer') or {} - badge_labels = self._extract_badges(renderer) + sidebar_renderer = self._extract_sidebar_info_renderer(data, 'playlistSidebarPrimaryInfoRenderer') or {} + playlist_header_renderer = traverse_obj(data, ('header', 'playlistHeaderRenderer')) or {} + player_header_privacy = playlist_header_renderer.get('privacy') + + badges = self._extract_badges(sidebar_renderer) # Personal playlists, when authenticated, have a dropdown visibility selector instead of a badge - privacy_dropdown_entries = try_get( - renderer, lambda x: x['privacyForm']['dropdownFormFieldRenderer']['dropdown']['dropdownRenderer']['entries'], list) or [] - for renderer_dict in privacy_dropdown_entries: - is_selected = try_get( - renderer_dict, lambda x: x['privacyDropdownItemRenderer']['isSelected'], bool) or False - if not is_selected: - continue - label = self._get_text(renderer_dict, ('privacyDropdownItemRenderer', 'label')) - if label: - badge_labels.add(label.lower()) - break + privacy_setting_icon = get_first( + (playlist_header_renderer, sidebar_renderer), + ('privacyForm', 'dropdownFormFieldRenderer', 'dropdown', 'dropdownRenderer', 'entries', + lambda _, v: 
v['privacyDropdownItemRenderer']['isSelected'], 'privacyDropdownItemRenderer', 'icon', 'iconType'), + expected_type=str) - for badge_label in badge_labels: - if badge_label == 'unlisted': - is_unlisted = True - elif badge_label == 'private': - is_private = True - elif badge_label == 'public': - is_unlisted = is_private = False - return self._availability(is_private, False, False, False, is_unlisted) + microformats_is_unlisted = traverse_obj( + data, ('microformat', 'microformatDataRenderer', 'unlisted'), expected_type=bool) + + return ( + 'public' if ( + self._has_badge(badges, BadgeType.AVAILABILITY_PUBLIC) + or player_header_privacy == 'PUBLIC' + or privacy_setting_icon == 'PRIVACY_PUBLIC') + else self._availability( + is_private=( + self._has_badge(badges, BadgeType.AVAILABILITY_PRIVATE) + or player_header_privacy == 'PRIVATE' if player_header_privacy is not None + else privacy_setting_icon == 'PRIVACY_PRIVATE' if privacy_setting_icon is not None else None), + is_unlisted=( + self._has_badge(badges, BadgeType.AVAILABILITY_UNLISTED) + or player_header_privacy == 'UNLISTED' if player_header_privacy is not None + else privacy_setting_icon == 'PRIVACY_UNLISTED' if privacy_setting_icon is not None + else microformats_is_unlisted if microformats_is_unlisted is not None else None), + needs_subscription=self._has_badge(badges, BadgeType.AVAILABILITY_SUBSCRIPTION) or None, + needs_premium=self._has_badge(badges, BadgeType.AVAILABILITY_PREMIUM) or None, + needs_auth=False)) @staticmethod def _extract_sidebar_info_renderer(data, info_renderer, expected_type=dict): @@ -4419,87 +4938,53 @@ def _extract_sidebar_info_renderer(data, info_renderer, expected_type=dict): def _reload_with_unavailable_videos(self, item_id, data, ytcfg): """ - Get playlist with unavailable videos if the 'show unavailable videos' button exists. + Reload playlists with unavailable videos (e.g. private videos, region blocked, etc.) 
""" - browse_id = params = None - renderer = self._extract_sidebar_info_renderer(data, 'playlistSidebarPrimaryInfoRenderer') - if not renderer: + is_playlist = bool(traverse_obj( + data, ('metadata', 'playlistMetadataRenderer'), ('header', 'playlistHeaderRenderer'))) + if not is_playlist: return - menu_renderer = try_get( - renderer, lambda x: x['menu']['menuRenderer']['items'], list) or [] - for menu_item in menu_renderer: - if not isinstance(menu_item, dict): - continue - nav_item_renderer = menu_item.get('menuNavigationItemRenderer') - text = try_get( - nav_item_renderer, lambda x: x['text']['simpleText'], compat_str) - if not text or text.lower() != 'show unavailable videos': - continue - browse_endpoint = try_get( - nav_item_renderer, lambda x: x['navigationEndpoint']['browseEndpoint'], dict) or {} - browse_id = browse_endpoint.get('browseId') - params = browse_endpoint.get('params') - break - headers = self.generate_api_headers( ytcfg=ytcfg, account_syncid=self._extract_account_syncid(ytcfg, data), visitor_data=self._extract_visitor_data(data, ytcfg)) query = { - 'params': params or 'wgYCCAA=', - 'browseId': browse_id or 'VL%s' % item_id + 'params': 'wgYCCAA=', + 'browseId': f'VL{item_id}' } return self._extract_response( item_id=item_id, headers=headers, query=query, check_get_keys='contents', fatal=False, ytcfg=ytcfg, - note='Downloading API JSON with unavailable videos') + note='Redownloading playlist API JSON with unavailable videos') @functools.cached_property def skip_webpage(self): return 'webpage' in self._configuration_arg('skip', ie_key=YoutubeTabIE.ie_key()) def _extract_webpage(self, url, item_id, fatal=True): - retries = self.get_param('extractor_retries', 3) - count = -1 - webpage = data = last_error = None - while count < retries: - count += 1 - # Sometimes youtube returns a webpage with incomplete ytInitialData - # See: https://github.com/yt-dlp/yt-dlp/issues/116 - if last_error: - self.report_warning('%s. Retrying ...' 
% last_error) + webpage, data = None, None + for retry in self.RetryManager(fatal=fatal): try: - webpage = self._download_webpage( - url, item_id, - note='Downloading webpage%s' % (' (retry #%d)' % count if count else '',)) + webpage = self._download_webpage(url, item_id, note='Downloading webpage') data = self.extract_yt_initial_data(item_id, webpage or '', fatal=fatal) or {} except ExtractorError as e: if isinstance(e.cause, network_exceptions): - if not isinstance(e.cause, compat_HTTPError) or e.cause.code not in (403, 429): - last_error = error_to_compat_str(e.cause or e.msg) - if count < retries: - continue - if fatal: - raise - self.report_warning(error_to_compat_str(e)) + if not isinstance(e.cause, urllib.error.HTTPError) or e.cause.code not in (403, 429): + retry.error = e + continue + self._error_or_warning(e, fatal=fatal) break - else: - try: - self._extract_and_report_alerts(data) - except ExtractorError as e: - if fatal: - raise - self.report_warning(error_to_compat_str(e)) - break - if dict_get(data, ('contents', 'currentVideoEndpoint', 'onResponseReceivedActions')): - break + try: + self._extract_and_report_alerts(data) + except ExtractorError as e: + self._error_or_warning(e, fatal=fatal) + break - last_error = 'Incomplete yt initial data received' - if count >= retries: - if fatal: - raise ExtractorError(last_error) - self.report_warning(last_error) - break + # Sometimes youtube returns a webpage with incomplete ytInitialData + # See: https://github.com/yt-dlp/yt-dlp/issues/116 + if not traverse_obj(data, 'contents', 'currentVideoEndpoint', 'onResponseReceivedActions'): + retry.error = ExtractorError('Incomplete yt initial data received') + continue return webpage, data @@ -4521,8 +5006,7 @@ def _extract_data(self, url, item_id, ytcfg=None, fatal=True, webpage_fatal=Fals webpage, data = self._extract_webpage(url, item_id, fatal=webpage_fatal) ytcfg = ytcfg or self.extract_ytcfg(item_id, webpage) # Reject webpage data if redirected to home page 
without explicitly requesting - selected_tab = self._extract_selected_tab(traverse_obj( - data, ('contents', 'twoColumnBrowseResultsRenderer', 'tabs'), expected_type=list, default=[]), fatal=False) or {} + selected_tab = self._extract_selected_tab(self._extract_tab_renderers(data), fatal=False) or {} if (url != 'https://www.youtube.com/feed/recommended' and selected_tab.get('tabIdentifier') == 'FEwhat_to_watch' # Home page and 'no-youtube-channel-redirect' not in self.get_param('compat_opts', [])): @@ -4700,6 +5184,7 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor): 'channel_id': 'UCmlqkdCBesrv2Lak1mF_MxA', 'uploader_url': 'https://www.youtube.com/channel/UCmlqkdCBesrv2Lak1mF_MxA', 'channel_url': 'https://www.youtube.com/channel/UCmlqkdCBesrv2Lak1mF_MxA', + 'availability': 'public', }, 'playlist_count': 1, }, { @@ -4717,6 +5202,7 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor): 'channel_id': 'UCmlqkdCBesrv2Lak1mF_MxA', 'channel_url': 'https://www.youtube.com/channel/UCmlqkdCBesrv2Lak1mF_MxA', 'uploader_url': 'https://www.youtube.com/channel/UCmlqkdCBesrv2Lak1mF_MxA', + 'availability': 'public', }, 'playlist_count': 0, }, { @@ -4863,6 +5349,7 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor): 'channel_id': 'UCEPzS1rYsrkqzSLNp76nrcg', 'channel_url': 'https://www.youtube.com/c/ChRiStIaAn008', 'channel': 'Christiaan008', + 'availability': 'public', }, 'playlist_count': 96, }, { @@ -4881,6 +5368,7 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor): 'view_count': int, 'description': '', 'channel_id': 'UCBABnxM4Ar9ten8Mdjj1j0Q', + 'availability': 'public', }, 'playlist_mincount': 1123, 'expected_warnings': [r'[Uu]navailable videos (are|will be) hidden'], @@ -4904,6 +5392,7 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor): 'channel': 'Interstellar Movie', 'description': '', 'modified_date': r're:\d{8}', + 'availability': 'public', }, 'playlist_mincount': 21, }, { @@ -4922,6 +5411,7 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor): 'channel_url': 
'https://www.youtube.com/channel/UCTYLiWFZy8xtPwxFwX9rV7Q', 'channel_id': 'UCTYLiWFZy8xtPwxFwX9rV7Q', 'modified_date': r're:\d{8}', + 'availability': 'public', }, 'playlist_mincount': 200, 'expected_warnings': [r'[Uu]navailable videos (are|will be) hidden'], @@ -4941,6 +5431,7 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor): 'uploader_url': 'https://www.youtube.com/c/blanktv', 'modified_date': r're:\d{8}', 'description': '', + 'availability': 'public', }, 'playlist_mincount': 1000, 'expected_warnings': [r'[Uu]navailable videos (are|will be) hidden'], @@ -4959,6 +5450,8 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor): 'channel_id': 'UC9-y-6csu5WGm29I7JiwpnA', 'channel_url': 'https://www.youtube.com/user/Computerphile', 'channel': 'Computerphile', + 'availability': 'public', + 'modified_date': '20190712', }, 'playlist_mincount': 11, }, { @@ -4996,7 +5489,7 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor): }, { 'url': 'https://www.youtube.com/channel/UCoMdktPbSTixAyNGwb-UYkQ/live', 'info_dict': { - 'id': 'GgL890LIznQ', # This will keep changing + 'id': 'Wq15eF5vCbI', # This will keep changing 'ext': 'mp4', 'title': str, 'uploader': 'Sky News', @@ -5007,18 +5500,19 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor): 'categories': ['News & Politics'], 'tags': list, 'like_count': int, - 'release_timestamp': 1642502819, + 'release_timestamp': int, 'channel': 'Sky News', 'channel_id': 'UCoMdktPbSTixAyNGwb-UYkQ', 'age_limit': 0, 'view_count': int, - 'thumbnail': 'https://i.ytimg.com/vi/GgL890LIznQ/maxresdefault_live.jpg', + 'thumbnail': r're:https?://i\.ytimg\.com/vi/[^/]+/maxresdefault(?:_live)?\.jpg', 'playable_in_embed': True, - 'release_date': '20220118', + 'release_date': r're:\d+', 'availability': 'public', 'live_status': 'is_live', 'channel_url': 'https://www.youtube.com/channel/UCoMdktPbSTixAyNGwb-UYkQ', - 'channel_follower_count': int + 'channel_follower_count': int, + 'concurrent_view_count': int, }, 'params': { 'skip_download': True, @@ -5096,7 +5590,7 @@ 
class YoutubeTabIE(YoutubeTabBaseInfoExtractor): 'title': '#cctv9', 'tags': [], }, - 'playlist_mincount': 350, + 'playlist_mincount': 300, # not consistent but should be over 300 }, { 'url': 'https://www.youtube.com/watch?list=PLW4dVinRY435CBE_JD3t-0SRXKfnZHS1P&feature=youtu.be&v=M9cJMXmQ_ZU', 'only_matching': True, @@ -5116,7 +5610,7 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor): 'uploader': 'NoCopyrightSounds', 'description': 'Providing you with copyright free / safe music for gaming, live streaming, studying and more!', 'uploader_id': 'UC_aEa8K-EOJ3D6gOs7HcyNg', - 'title': 'NCS Releases', + 'title': 'NCS : All Releases 💿', 'uploader_url': 'https://www.youtube.com/c/NoCopyrightSounds', 'channel_url': 'https://www.youtube.com/c/NoCopyrightSounds', 'modified_date': r're:\d{8}', @@ -5124,6 +5618,7 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor): 'channel_id': 'UC_aEa8K-EOJ3D6gOs7HcyNg', 'tags': [], 'channel': 'NoCopyrightSounds', + 'availability': 'public', }, 'playlist_mincount': 166, 'expected_warnings': [r'[Uu]navailable videos (are|will be) hidden'], @@ -5144,23 +5639,18 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor): 'modified_date': r're:\d{8}', 'uploader_url': 'https://www.youtube.com/channel/UC9ALqqC4aIeG5iDs7i90Bfw', 'description': '', + 'availability': 'public', }, - 'expected_warnings': [ - 'The URL does not have a videos tab', - r'[Uu]navailable videos (are|will be) hidden', - ], 'playlist_mincount': 101, }, { - 'note': 'Topic without a UU playlist', + # Destination channel with only a hidden self tab (tab id is UCtFRv9O2AHqOZjjynzrv-xg) + # Treat as a general feed 'url': 'https://www.youtube.com/channel/UCtFRv9O2AHqOZjjynzrv-xg', 'info_dict': { 'id': 'UCtFRv9O2AHqOZjjynzrv-xg', 'title': 'UCtFRv9O2AHqOZjjynzrv-xg', 'tags': [], }, - 'expected_warnings': [ - 'the playlist redirect gave error', - ], 'playlist_mincount': 9, }, { 'note': 'Youtube music Album', @@ -5185,7 +5675,7 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor): 'title': 'yt-dlp 
unlisted playlist test', 'availability': 'unlisted', 'tags': [], - 'modified_date': '20211208', + 'modified_date': '20220418', 'channel': 'colethedj', 'view_count': int, 'description': '', @@ -5228,6 +5718,7 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor): 'skip_download': True, 'extractor_args': {'youtubetab': {'skip': ['webpage']}} }, + 'skip': 'Query for sorting no longer works', }, { 'note': 'API Fallback: Topic, should redirect to playlist?list=UU...', 'url': 'https://music.youtube.com/browse/UC9ALqqC4aIeG5iDs7i90Bfw', @@ -5244,11 +5735,8 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor): 'channel': 'Royalty Free Music - Topic', 'view_count': int, 'uploader_url': 'https://www.youtube.com/channel/UC9ALqqC4aIeG5iDs7i90Bfw', + 'availability': 'public', }, - 'expected_warnings': [ - 'does not have a videos tab', - r'[Uu]navailable videos (are|will be) hidden', - ], 'playlist_mincount': 101, 'params': { 'skip_download': True, @@ -5273,132 +5761,444 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor): 'channel': 'pukkandan', 'description': 'Test for collaborative playlist', 'title': 'yt-dlp test - collaborative playlist', + 'view_count': int, 'uploader_url': 'https://www.youtube.com/channel/UCKcqXmCcyqnhgpA5P0oHH_Q', }, 'playlist_mincount': 2 + }, { + 'note': 'translated tab name', + 'url': 'https://www.youtube.com/channel/UCiu-3thuViMebBjw_5nWYrA/playlists', + 'info_dict': { + 'id': 'UCiu-3thuViMebBjw_5nWYrA', + 'tags': [], + 'uploader_id': 'UCiu-3thuViMebBjw_5nWYrA', + 'channel_url': 'https://www.youtube.com/channel/UCiu-3thuViMebBjw_5nWYrA', + 'description': 'test description', + 'title': 'cole-dlp-test-acc - 再生リスト', + 'uploader_url': 'https://www.youtube.com/channel/UCiu-3thuViMebBjw_5nWYrA', + 'uploader': 'cole-dlp-test-acc', + 'channel_id': 'UCiu-3thuViMebBjw_5nWYrA', + 'channel': 'cole-dlp-test-acc', + }, + 'playlist_mincount': 1, + 'params': {'extractor_args': {'youtube': {'lang': ['ja']}}}, + 'expected_warnings': ['Preferring "ja"'], + }, { + # XXX: this 
should really check flat playlist entries, but the test suite doesn't support that + 'note': 'preferred lang set with playlist with translated video titles', + 'url': 'https://www.youtube.com/playlist?list=PLt5yu3-wZAlQAaPZ5Z-rJoTdbT-45Q7c0', + 'info_dict': { + 'id': 'PLt5yu3-wZAlQAaPZ5Z-rJoTdbT-45Q7c0', + 'tags': [], + 'view_count': int, + 'channel_url': 'https://www.youtube.com/channel/UCiu-3thuViMebBjw_5nWYrA', + 'uploader': 'cole-dlp-test-acc', + 'uploader_id': 'UCiu-3thuViMebBjw_5nWYrA', + 'channel': 'cole-dlp-test-acc', + 'channel_id': 'UCiu-3thuViMebBjw_5nWYrA', + 'description': 'test', + 'uploader_url': 'https://www.youtube.com/channel/UCiu-3thuViMebBjw_5nWYrA', + 'title': 'dlp test playlist', + 'availability': 'public', + }, + 'playlist_mincount': 1, + 'params': {'extractor_args': {'youtube': {'lang': ['ja']}}}, + 'expected_warnings': ['Preferring "ja"'], + }, { + # shorts audio pivot for 2GtVksBMYFM. + 'url': 'https://www.youtube.com/feed/sfv_audio_pivot?bp=8gUrCikSJwoLMkd0VmtzQk1ZRk0SCzJHdFZrc0JNWUZNGgsyR3RWa3NCTVlGTQ==', + 'info_dict': { + 'id': 'sfv_audio_pivot', + 'title': 'sfv_audio_pivot', + 'tags': [], + }, + 'playlist_mincount': 50, + + }, { + # Channel with a real live tab (not to be mistaken with streams tab) + # Do not treat like it should redirect to live stream + 'url': 'https://www.youtube.com/channel/UCEH7P7kyJIkS_gJf93VYbmg/live', + 'info_dict': { + 'id': 'UCEH7P7kyJIkS_gJf93VYbmg', + 'title': 'UCEH7P7kyJIkS_gJf93VYbmg - Live', + 'tags': [], + }, + 'playlist_mincount': 20, + }, { + # Tab name is not the same as tab id + 'url': 'https://www.youtube.com/channel/UCQvWX73GQygcwXOTSf_VDVg/letsplay', + 'info_dict': { + 'id': 'UCQvWX73GQygcwXOTSf_VDVg', + 'title': 'UCQvWX73GQygcwXOTSf_VDVg - Let\'s play', + 'tags': [], + }, + 'playlist_mincount': 8, + }, { + # Home tab id is literally home. 
Not to get mistaken with featured + 'url': 'https://www.youtube.com/channel/UCQvWX73GQygcwXOTSf_VDVg/home', + 'info_dict': { + 'id': 'UCQvWX73GQygcwXOTSf_VDVg', + 'title': 'UCQvWX73GQygcwXOTSf_VDVg - Home', + 'tags': [], + }, + 'playlist_mincount': 8, + }, { + # Should get three playlists for videos, shorts and streams tabs + 'url': 'https://www.youtube.com/channel/UCK9V2B22uJYu3N7eR_BT9QA', + 'info_dict': { + 'id': 'UCK9V2B22uJYu3N7eR_BT9QA', + 'title': 'Polka Ch. 尾丸ポルカ', + 'channel_follower_count': int, + 'channel_id': 'UCK9V2B22uJYu3N7eR_BT9QA', + 'channel_url': 'https://www.youtube.com/channel/UCK9V2B22uJYu3N7eR_BT9QA', + 'uploader': 'Polka Ch. 尾丸ポルカ', + 'description': 'md5:3b8df1ac5af337aa206e37ee3d181ec9', + 'channel': 'Polka Ch. 尾丸ポルカ', + 'tags': 'count:35', + 'uploader_url': 'https://www.youtube.com/channel/UCK9V2B22uJYu3N7eR_BT9QA', + 'uploader_id': 'UCK9V2B22uJYu3N7eR_BT9QA', + }, + 'playlist_count': 3, + }, { + # Shorts tab with channel with handle + 'url': 'https://www.youtube.com/@NotJustBikes/shorts', + 'info_dict': { + 'id': 'UC0intLFzLaudFG-xAvUEO-A', + 'title': 'Not Just Bikes - Shorts', + 'tags': 'count:12', + 'uploader': 'Not Just Bikes', + 'channel_url': 'https://www.youtube.com/channel/UC0intLFzLaudFG-xAvUEO-A', + 'description': 'md5:7513148b1f02b924783157d84c4ea555', + 'channel_follower_count': int, + 'uploader_id': 'UC0intLFzLaudFG-xAvUEO-A', + 'channel_id': 'UC0intLFzLaudFG-xAvUEO-A', + 'uploader_url': 'https://www.youtube.com/channel/UC0intLFzLaudFG-xAvUEO-A', + 'channel': 'Not Just Bikes', + }, + 'playlist_mincount': 10, + }, { + # Streams tab + 'url': 'https://www.youtube.com/channel/UC3eYAvjCVwNHgkaGbXX3sig/streams', + 'info_dict': { + 'id': 'UC3eYAvjCVwNHgkaGbXX3sig', + 'title': '中村悠一 - Live', + 'tags': 'count:7', + 'channel_id': 'UC3eYAvjCVwNHgkaGbXX3sig', + 'channel_url': 'https://www.youtube.com/channel/UC3eYAvjCVwNHgkaGbXX3sig', + 'uploader_id': 'UC3eYAvjCVwNHgkaGbXX3sig', + 'channel': '中村悠一', + 'uploader_url': 
'https://www.youtube.com/channel/UC3eYAvjCVwNHgkaGbXX3sig', + 'channel_follower_count': int, + 'uploader': '中村悠一', + 'description': 'md5:e744f6c93dafa7a03c0c6deecb157300', + }, + 'playlist_mincount': 60, + }, { + # Channel with no uploads and hence no videos, streams, shorts tabs or uploads playlist. This should fail. + # See test_youtube_lists + 'url': 'https://www.youtube.com/channel/UC2yXPzFejc422buOIzn_0CA', + 'only_matching': True, + }, { + # No uploads and no UCID given. Should fail with no uploads error + # See test_youtube_lists + 'url': 'https://www.youtube.com/news', + 'only_matching': True + }, { + # No videos tab but has a shorts tab + 'url': 'https://www.youtube.com/c/TKFShorts', + 'info_dict': { + 'id': 'UCgJ5_1F6yJhYLnyMszUdmUg', + 'title': 'Shorts Break - Shorts', + 'tags': 'count:32', + 'channel_id': 'UCgJ5_1F6yJhYLnyMszUdmUg', + 'channel': 'Shorts Break', + 'description': 'md5:a6c234cf3d50d878ef8721e34457cd11', + 'uploader': 'Shorts Break', + 'channel_follower_count': int, + 'uploader_id': 'UCgJ5_1F6yJhYLnyMszUdmUg', + 'uploader_url': 'https://www.youtube.com/channel/UCgJ5_1F6yJhYLnyMszUdmUg', + 'channel_url': 'https://www.youtube.com/channel/UCgJ5_1F6yJhYLnyMszUdmUg', + }, + 'playlist_mincount': 30, + }, { + # Trending Now Tab. tab id is empty + 'url': 'https://www.youtube.com/feed/trending', + 'info_dict': { + 'id': 'trending', + 'title': 'trending - Now', + 'tags': [], + }, + 'playlist_mincount': 30, + }, { + # Trending Gaming Tab. 
tab id is empty + 'url': 'https://www.youtube.com/feed/trending?bp=4gIcGhpnYW1pbmdfY29ycHVzX21vc3RfcG9wdWxhcg%3D%3D', + 'info_dict': { + 'id': 'trending', + 'title': 'trending - Gaming', + 'tags': [], + }, + 'playlist_mincount': 30, + }, { + # Shorts url result in shorts tab + 'url': 'https://www.youtube.com/channel/UCiu-3thuViMebBjw_5nWYrA/shorts', + 'info_dict': { + 'id': 'UCiu-3thuViMebBjw_5nWYrA', + 'title': 'cole-dlp-test-acc - Shorts', + 'uploader_id': 'UCiu-3thuViMebBjw_5nWYrA', + 'channel': 'cole-dlp-test-acc', + 'description': 'test description', + 'channel_id': 'UCiu-3thuViMebBjw_5nWYrA', + 'channel_url': 'https://www.youtube.com/channel/UCiu-3thuViMebBjw_5nWYrA', + 'tags': [], + 'uploader': 'cole-dlp-test-acc', + 'uploader_url': 'https://www.youtube.com/channel/UCiu-3thuViMebBjw_5nWYrA', + + }, + 'playlist': [{ + 'info_dict': { + '_type': 'url', + 'ie_key': 'Youtube', + 'url': 'https://www.youtube.com/shorts/sSM9J5YH_60', + 'id': 'sSM9J5YH_60', + 'channel_id': 'UCiu-3thuViMebBjw_5nWYrA', + 'title': 'SHORT short', + 'channel': 'cole-dlp-test-acc', + 'channel_url': 'https://www.youtube.com/channel/UCiu-3thuViMebBjw_5nWYrA', + 'view_count': int, + 'thumbnails': list, + } + }], + 'params': {'extract_flat': True}, + }, { + # Live video status should be extracted + 'url': 'https://www.youtube.com/channel/UCQvWX73GQygcwXOTSf_VDVg/live', + 'info_dict': { + 'id': 'UCQvWX73GQygcwXOTSf_VDVg', + 'title': 'UCQvWX73GQygcwXOTSf_VDVg - Live', # TODO, should be Minecraft - Live or Minecraft - Topic - Live + 'tags': [] + }, + 'playlist': [{ + 'info_dict': { + '_type': 'url', + 'ie_key': 'Youtube', + 'url': 'startswith:https://www.youtube.com/watch?v=', + 'id': str, + 'title': str, + 'live_status': 'is_live', + 'channel_id': str, + 'channel_url': str, + 'concurrent_view_count': int, + 'channel': str, + } + }], + 'params': {'extract_flat': True, 'playlist_items': '1'}, + 'playlist_mincount': 1 + }, { + # Channel renderer metadata. 
Contains number of videos on the channel + 'url': 'https://www.youtube.com/channel/UCiu-3thuViMebBjw_5nWYrA/channels', + 'info_dict': { + 'id': 'UCiu-3thuViMebBjw_5nWYrA', + 'title': 'cole-dlp-test-acc - Channels', + 'uploader_id': 'UCiu-3thuViMebBjw_5nWYrA', + 'channel': 'cole-dlp-test-acc', + 'description': 'test description', + 'channel_id': 'UCiu-3thuViMebBjw_5nWYrA', + 'channel_url': 'https://www.youtube.com/channel/UCiu-3thuViMebBjw_5nWYrA', + 'tags': [], + 'uploader': 'cole-dlp-test-acc', + 'uploader_url': 'https://www.youtube.com/channel/UCiu-3thuViMebBjw_5nWYrA', + + }, + 'playlist': [{ + 'info_dict': { + '_type': 'url', + 'ie_key': 'YoutubeTab', + 'url': 'https://www.youtube.com/channel/UC-lHJZR3Gqxm24_Vd_AJ5Yw', + 'id': 'UC-lHJZR3Gqxm24_Vd_AJ5Yw', + 'channel_id': 'UC-lHJZR3Gqxm24_Vd_AJ5Yw', + 'title': 'PewDiePie', + 'channel': 'PewDiePie', + 'channel_url': 'https://www.youtube.com/channel/UC-lHJZR3Gqxm24_Vd_AJ5Yw', + 'thumbnails': list, + 'channel_follower_count': int, + 'playlist_count': int + } + }], + 'params': {'extract_flat': True}, }] @classmethod def suitable(cls, url): return False if YoutubeIE.suitable(url) else super().suitable(url) - _URL_RE = re.compile(rf'(?P
<pre>{_VALID_URL})(?(not_channel)|(?P<tab>/\w+))?(?P<post>.*)$')
+    _URL_RE = re.compile(rf'(?P<pre>{_VALID_URL})(?(not_channel)|(?P<tab>/[^?#/]+))?(?P<post>.*)$')  # pre: the _VALID_URL part; tab: "/<name>" segment, tried only when not_channel did not match; post: the rest
+
+    def _get_url_mobj(self, url):  # Match url against _URL_RE and return its named groups as a dict
+        mobj = self._URL_RE.match(url).groupdict()  # NOTE(review): match() result is used unchecked - assumes url always matches _URL_RE
+        mobj.update((k, '') for k, v in mobj.items() if v is None)  # groups that did not participate become '' instead of None
+        return mobj
+
+    def _extract_tab_id_and_name(self, tab, base_url='https://www.youtube.com'):  # Return a (tab_id, tab_name) tuple for one tab renderer dict
+        tab_name = (tab.get('title') or '').lower()  # lower-cased title; '' when the tab has no title
+        tab_url = urljoin(base_url, traverse_obj(
+            tab, ('endpoint', 'commandMetadata', 'webCommandMetadata', 'url')))
+
+        tab_id = (tab_url and self._get_url_mobj(tab_url)['tab'][1:]  # prefer the "/<tab>" segment of the tab URL, minus its leading "/"
+                  or traverse_obj(tab, 'tabIdentifier', expected_type=str))  # otherwise use the renderer's own tabIdentifier
+        if tab_id:
+            return {
+                'TAB_ID_SPONSORSHIPS': 'membership',  # the only id that is renamed; every other id passes through unchanged
+            }.get(tab_id, tab_id), tab_name
+
+        # Fallback to tab name if we cannot get the tab id.
+        # XXX: should we strip non-ascii letters? e.g. in case of 'let's play' tab example on special gaming channel
+        # Note that in the case of translated tab name this may result in an empty string, which we don't want.
+        if tab_name:
+            self.write_debug(f'Falling back to selected tab name: {tab_name}')
+        return {
+            'home': 'featured',
+            'live': 'streams',
+        }.get(tab_name, tab_name), tab_name  # the id slot holds the (possibly remapped) tab name here
+
+    def _has_tab(self, tabs, tab_id):  # True if any tab renderer in tabs resolves to tab_id
+        return any(self._extract_tab_id_and_name(tab)[0] == tab_id for tab in tabs)  # compares ids only; base_url is left at its default
 
     @YoutubeTabBaseInfoExtractor.passthrough_smuggled_data
     def _real_extract(self, url, smuggled_data):
         item_id = self._match_id(url)
-        url = compat_urlparse.urlunparse(
-            compat_urlparse.urlparse(url)._replace(netloc='www.youtube.com'))
+        url = urllib.parse.urlunparse(
+            urllib.parse.urlparse(url)._replace(netloc='www.youtube.com'))
         compat_opts = self.get_param('compat_opts', [])
 
-        def get_mobj(url):
-            mobj = self._URL_RE.match(url).groupdict()
-            mobj.update((k, '') for k, v in mobj.items() if v is None)
-            return mobj
-
-        mobj, redirect_warning = get_mobj(url), None
-        # Youtube returns incomplete data if tabname is not lower case
-        pre, tab, post, is_channel = mobj['pre'], mobj['tab'].lower(), mobj['post'], not mobj['not_channel']
-        if is_channel:
-            if smuggled_data.get('is_music_url'):
-                if item_id[:2] == 'VL':  # Youtube music VL channels have an equivalent playlist
-                    item_id = item_id[2:]
-                    pre, tab, post, is_channel = f'https://www.youtube.com/playlist?list={item_id}', '', '', False
-                elif item_id[:2] == 'MP':  # Resolve albums (/[channel/browse]/MP...) to their equivalent playlist
-                    mdata = self._extract_tab_endpoint(
-                        f'https://music.youtube.com/channel/{item_id}', item_id, default_client='web_music')
-                    murl = traverse_obj(mdata, ('microformat', 'microformatDataRenderer', 'urlCanonical'),
-                                        get_all=False, expected_type=compat_str)
-                    if not murl:
-                        raise ExtractorError('Failed to resolve album to playlist')
-                    return self.url_result(murl, ie=YoutubeTabIE.ie_key())
-                elif mobj['channel_type'] == 'browse':  # Youtube music /browse/ should be changed to /channel/
-                    pre = f'https://www.youtube.com/channel/{item_id}'
-
-        original_tab_name = tab
+        mobj = self._get_url_mobj(url)
+        pre, tab, post, is_channel = mobj['pre'], mobj['tab'], mobj['post'], not mobj['not_channel']
+        if is_channel and smuggled_data.get('is_music_url'):
+            if item_id[:2] == 'VL':  # Youtube music VL channels have an equivalent playlist
+                return self.url_result(
+                    f'https://music.youtube.com/playlist?list={item_id[2:]}', YoutubeTabIE, item_id[2:])
+            elif item_id[:2] == 'MP':  # Resolve albums (/[channel/browse]/MP...) to their equivalent playlist
+                mdata = self._extract_tab_endpoint(
+                    f'https://music.youtube.com/channel/{item_id}', item_id, default_client='web_music')
+                murl = traverse_obj(mdata, ('microformat', 'microformatDataRenderer', 'urlCanonical'),
+                                    get_all=False, expected_type=str)
+                if not murl:
+                    raise ExtractorError('Failed to resolve album to playlist')
+                return self.url_result(murl, YoutubeTabIE)
+            elif mobj['channel_type'] == 'browse':  # Youtube music /browse/ should be changed to /channel/
+                return self.url_result(
+                    f'https://music.youtube.com/channel/{item_id}{tab}{post}', YoutubeTabIE, item_id)
+
+        original_tab_id, display_id = tab[1:], f'{item_id}{tab}'
         if is_channel and not tab and 'no-youtube-channel-redirect' not in compat_opts:
-            # Home URLs should redirect to /videos/
-            redirect_warning = ('A channel/user page was given. All the channel\'s videos will be downloaded. '
-                                'To download only the videos in the home page, add a "/featured" to the URL')
-            tab = '/videos'
-
-        url = ''.join((pre, tab, post))
-        mobj = get_mobj(url)
+            url = f'{pre}/videos{post}'
 
         # Handle both video/playlist URLs
         qs = parse_qs(url)
-        video_id, playlist_id = (qs.get(key, [None])[0] for key in ('v', 'list'))
-
+        video_id, playlist_id = [traverse_obj(qs, (key, 0)) for key in ('v', 'list')]
         if not video_id and mobj['not_channel'].startswith('watch'):
             if not playlist_id:
                 # If there is neither video or playlist ids, youtube redirects to home page, which is undesirable
-                raise ExtractorError('Unable to recognize tab page')
+                raise ExtractorError('A video URL was given without video ID', expected=True)
             # Common mistake: https://www.youtube.com/watch?list=playlist_id
             self.report_warning(f'A video URL was given without video ID. Trying to download playlist {playlist_id}')
-            url = f'https://www.youtube.com/playlist?list={playlist_id}'
-            mobj = get_mobj(url)
+            return self.url_result(
+                f'https://www.youtube.com/playlist?list={playlist_id}', YoutubeTabIE, playlist_id)
 
-        if video_id and playlist_id:
-            if self.get_param('noplaylist'):
-                self.to_screen(f'Downloading just video {video_id} because of --no-playlist')
-                return self.url_result(f'https://www.youtube.com/watch?v={video_id}',
-                                       ie=YoutubeIE.ie_key(), video_id=video_id)
-            self.to_screen(f'Downloading playlist {playlist_id}; add --no-playlist to just download video {video_id}')
+        if not self._yes_playlist(playlist_id, video_id):
+            return self.url_result(
+                f'https://www.youtube.com/watch?v={video_id}', YoutubeIE, video_id)
 
-        data, ytcfg = self._extract_data(url, item_id)
+        data, ytcfg = self._extract_data(url, display_id)
 
         # YouTube may provide a non-standard redirect to the regional channel
         # See: https://github.com/yt-dlp/yt-dlp/issues/2694
+        # https://support.google.com/youtube/answer/2976814#zippy=,conditional-redirects
         redirect_url = traverse_obj(
             data, ('onResponseReceivedActions', ..., 'navigateAction', 'endpoint', 'commandMetadata', 'webCommandMetadata', 'url'), get_all=False)
         if redirect_url and 'no-youtube-channel-redirect' not in compat_opts:
-            redirect_url = ''.join((
-                urljoin('https://www.youtube.com', redirect_url), mobj['tab'], mobj['post']))
-            self.to_screen(f'This playlist is likely not available in your region. Following redirect to regional playlist {redirect_url}')
-            return self.url_result(redirect_url, ie=YoutubeTabIE.ie_key())
+            redirect_url = ''.join((urljoin('https://www.youtube.com', redirect_url), tab, post))
+            self.to_screen(f'This playlist is likely not available in your region. Following conditional redirect to {redirect_url}')
+            return self.url_result(redirect_url, YoutubeTabIE)
 
-        tabs = traverse_obj(data, ('contents', 'twoColumnBrowseResultsRenderer', 'tabs'), expected_type=list)
-        if tabs:
+        tabs, extra_tabs = self._extract_tab_renderers(data), []
+        if is_channel and tabs and 'no-youtube-channel-redirect' not in compat_opts:
             selected_tab = self._extract_selected_tab(tabs)
-            selected_tab_name = selected_tab.get('title', '').lower()
-            if selected_tab_name == 'home':
-                selected_tab_name = 'featured'
-            requested_tab_name = mobj['tab'][1:]
-            if 'no-youtube-channel-redirect' not in compat_opts:
-                if requested_tab_name == 'live':
-                    # Live tab should have redirected to the video
-                    raise ExtractorError('The channel is not currently live', expected=True)
-                if requested_tab_name not in ('', selected_tab_name):
-                    redirect_warning = f'The channel does not have a {requested_tab_name} tab'
-                    if not original_tab_name:
-                        if item_id[:2] == 'UC':
-                            # Topic channels don't have /videos. Use the equivalent playlist instead
-                            pl_id = f'UU{item_id[2:]}'
-                            pl_url = f'https://www.youtube.com/playlist?list={pl_id}'
-                            try:
-                                data, ytcfg = self._extract_data(pl_url, pl_id, ytcfg=ytcfg, fatal=True, webpage_fatal=True)
-                            except ExtractorError:
-                                redirect_warning += ' and the playlist redirect gave error'
-                            else:
-                                item_id, url, selected_tab_name = pl_id, pl_url, requested_tab_name
-                                redirect_warning += f'. Redirecting to playlist {pl_id} instead'
-                        if selected_tab_name and selected_tab_name != requested_tab_name:
-                            redirect_warning += f'. {selected_tab_name} tab is being downloaded instead'
+            selected_tab_id, selected_tab_name = self._extract_tab_id_and_name(selected_tab, url)  # NB: Name may be translated
+            self.write_debug(f'Selected tab: {selected_tab_id!r} ({selected_tab_name}), Requested tab: {original_tab_id!r}')
+
+            if not original_tab_id and selected_tab_name:
+                self.to_screen('Downloading all uploads of the channel. '
+                               'To download only the videos in a specific tab, pass the tab\'s URL')
+                if self._has_tab(tabs, 'streams'):
+                    extra_tabs.append(''.join((pre, '/streams', post)))
+                if self._has_tab(tabs, 'shorts'):
+                    extra_tabs.append(''.join((pre, '/shorts', post)))
+                # XXX: Members-only tab should also be extracted
+
+                if not extra_tabs and selected_tab_id != 'videos':
+                    # Channel does not have streams, shorts or videos tabs
+                    if item_id[:2] != 'UC':
+                        raise ExtractorError('This channel has no uploads', expected=True)
+
+                    # Topic channels don't have /videos. Use the equivalent playlist instead
+                    pl_id = f'UU{item_id[2:]}'
+                    pl_url = f'https://www.youtube.com/playlist?list={pl_id}'
+                    try:
+                        data, ytcfg = self._extract_data(pl_url, pl_id, ytcfg=ytcfg, fatal=True, webpage_fatal=True)
+                    except ExtractorError:
+                        raise ExtractorError('This channel has no uploads', expected=True)
                     else:
-                        raise ExtractorError(redirect_warning, expected=True)
+                        item_id, url = pl_id, pl_url
+                        self.to_screen(
+                            f'The channel does not have a videos, shorts, or live tab. Redirecting to playlist {pl_id} instead')
 
-        if redirect_warning:
-            self.to_screen(redirect_warning)
-        self.write_debug(f'Final URL: {url}')
+                elif extra_tabs and selected_tab_id != 'videos':
+                    # When there are shorts/live tabs but not videos tab
+                    url, data = f'{pre}{post}', None
+
+            elif (original_tab_id or 'videos') != selected_tab_id:
+                if original_tab_id == 'live':
+                    # Live tab should have redirected to the video
+                    # Except in the case the channel has an actual live tab
+                    # Example: https://www.youtube.com/channel/UCEH7P7kyJIkS_gJf93VYbmg/live
+                    raise UserNotLive(video_id=item_id)
+                elif selected_tab_name:
+                    raise ExtractorError(f'This channel does not have a {original_tab_id} tab', expected=True)
+
+                # For channels such as https://www.youtube.com/channel/UCtFRv9O2AHqOZjjynzrv-xg
+                url = f'{pre}{post}'
 
         # YouTube sometimes provides a button to reload playlist with unavailable videos.
         if 'no-youtube-unavailable-videos' not in compat_opts:
-            data = self._reload_with_unavailable_videos(item_id, data, ytcfg) or data
+            data = self._reload_with_unavailable_videos(display_id, data, ytcfg) or data
         self._extract_and_report_alerts(data, only_once=True)
-        tabs = traverse_obj(data, ('contents', 'twoColumnBrowseResultsRenderer', 'tabs'), expected_type=list)
-        if tabs:
-            return self._extract_from_tabs(item_id, ytcfg, data, tabs)
 
+        tabs, entries = self._extract_tab_renderers(data), []
+        if tabs:
+            entries = [self._extract_from_tabs(item_id, ytcfg, data, tabs)]
+            entries[0].update({
+                'extractor_key': YoutubeTabIE.ie_key(),
+                'extractor': YoutubeTabIE.IE_NAME,
+                'webpage_url': url,
+            })
+        if self.get_param('playlist_items') == '0':
+            entries.extend(self.url_result(u, YoutubeTabIE) for u in extra_tabs)
+        else:  # Users expect to get all `video_id`s even with `--flat-playlist`. So don't return `url_result`
+            entries.extend(map(self._real_extract, extra_tabs))
+
+        if len(entries) == 1:
+            return entries[0]
+        elif entries:
+            metadata = self._extract_metadata_from_tabs(item_id, data)
+            uploads_url = 'the Uploads (UU) playlist URL'
+            if try_get(metadata, lambda x: x['channel_id'].startswith('UC')):
+                uploads_url = f'https://www.youtube.com/playlist?list=UU{metadata["channel_id"][2:]}'
+            self.to_screen(
+                'Downloading as multiple playlists, separated by tabs. '
+                f'To download as a single playlist instead, pass {uploads_url}')
+            return self.playlist_result(entries, item_id, **metadata)
+
+        # Inline playlist
         playlist = traverse_obj(
             data, ('contents', 'twoColumnWatchNextResults', 'playlist', 'playlist'), expected_type=dict)
         if playlist:
@@ -5407,10 +6207,9 @@ def get_mobj(url):
         video_id = traverse_obj(
             data, ('currentVideoEndpoint', 'watchEndpoint', 'videoId'), expected_type=str) or video_id
         if video_id:
-            if mobj['tab'] != '/live':  # live tab is expected to redirect to video
+            if tab != '/live':  # live tab is expected to redirect to video
                 self.report_warning(f'Unable to recognize playlist. Downloading just video {video_id}')
-            return self.url_result(f'https://www.youtube.com/watch?v={video_id}',
-                                   ie=YoutubeIE.ie_key(), video_id=video_id)
+            return self.url_result(f'https://www.youtube.com/watch?v={video_id}', YoutubeIE, video_id)
 
         raise ExtractorError('Unable to recognize tab page')
 
@@ -5443,12 +6242,13 @@ class YoutubePlaylistIE(InfoExtractor):
             'uploader_id': 'UCKSpbfbl5kRQpTdL7kMc-1Q',
             'description': 'md5:8fa6f52abb47a9552002fa3ddfc57fc2',
             'view_count': int,
-            'uploader_url': 'https://www.youtube.com/user/Wickydoo',
+            'uploader_url': 'https://www.youtube.com/c/WickmanVT',
             'modified_date': r're:\d{8}',
             'channel_id': 'UCKSpbfbl5kRQpTdL7kMc-1Q',
             'channel': 'Wickman',
             'tags': [],
-            'channel_url': 'https://www.youtube.com/user/Wickydoo',
+            'channel_url': 'https://www.youtube.com/c/WickmanVT',
+            'availability': 'public',
         },
         'playlist_mincount': 29,
     }, {
@@ -5476,11 +6276,12 @@ class YoutubePlaylistIE(InfoExtractor):
             'channel': 'milan',
             'channel_id': 'UCEI1-PVPcYXjB73Hfelbmaw',
             'uploader_url': 'https://www.youtube.com/channel/UCEI1-PVPcYXjB73Hfelbmaw',
+            'availability': 'public',
         },
-        'expected_warnings': [r'[Uu]navailable videos (are|will be) hidden'],
+        'expected_warnings': [r'[Uu]navailable videos? (is|are|will be) hidden'],
     }, {
         'url': 'http://www.youtube.com/embed/_xDOZElKyNU?list=PLsyOSbh5bs16vubvKePAQ1x3PhKavfBIl',
-        'playlist_mincount': 654,
+        'playlist_mincount': 455,
         'info_dict': {
             'title': '2018 Chinese New Singles (11/6 updated)',
             'id': 'PLsyOSbh5bs16vubvKePAQ1x3PhKavfBIl',
@@ -5494,6 +6295,7 @@ class YoutubePlaylistIE(InfoExtractor):
             'uploader_url': 'https://www.youtube.com/c/愛低音的國王',
             'channel_id': 'UC21nz3_MesPLqtDqwdvnoxA',
             'modified_date': r're:\d{8}',
+            'availability': 'public',
         },
         'expected_warnings': [r'[Uu]navailable videos (are|will be) hidden'],
     }, {
@@ -5553,6 +6355,8 @@ class YoutubeYtBeIE(InfoExtractor):
             'channel_url': 'https://www.youtube.com/channel/UCEfMCQ9bs3tjvjy1s451zaw',
             'availability': 'public',
             'duration': 59,
+            'comment_count': int,
+            'channel_follower_count': int
         },
         'params': {
             'noplaylist': True,
@@ -5601,9 +6405,7 @@ class YoutubeYtUserIE(InfoExtractor):
 
     def _real_extract(self, url):
         user_id = self._match_id(url)
-        return self.url_result(
-            'https://www.youtube.com/user/%s/videos' % user_id,
-            ie=YoutubeTabIE.ie_key(), video_id=user_id)
+        return self.url_result(f'https://www.youtube.com/user/{user_id}', YoutubeTabIE, user_id)
 
 
 class YoutubeFavouritesIE(YoutubeBaseInfoExtractor):
@@ -5680,9 +6482,9 @@ def _extract_notification_renderer(self, notification):
         title = self._search_regex(
             rf'{re.escape(channel or "")}[^:]+: (.+)', notification_title,
             'video title', default=None)
-        upload_date = (strftime_or_none(self._extract_time_text(notification, 'sentTimeText')[0], '%Y%m%d')
-                       if self._configuration_arg('approximate_date', ie_key=YoutubeTabIE.ie_key())
-                       else None)
+        timestamp = (self._parse_time_text(self._get_text(notification, 'sentTimeText'))
+                     if self._configuration_arg('approximate_date', ie_key=YoutubeTabIE)
+                     else None)
         return {
             '_type': 'url',
             'url': url,
@@ -5692,7 +6494,7 @@ def _extract_notification_renderer(self, notification):
             'channel_id': channel_id,
             'channel': channel,
             'thumbnails': self._extract_thumbnails(notification, 'videoThumbnail'),
-            'upload_date': upload_date,
+            'timestamp': timestamp,
         }
 
     def _notification_menu_entries(self, ytcfg):
@@ -5770,11 +6572,36 @@ class YoutubeSearchURLIE(YoutubeTabBaseInfoExtractor):
         'info_dict': {
             'id': '#cats',
             'title': '#cats',
-            'entries': [{
-                'url': r're:https://(www\.)?youtube\.com/hashtag/cats',
-                'title': '#cats',
-            }],
+            # The test suite does not have support for nested playlists
+            # 'entries': [{
+            #     'url': r're:https://(www\.)?youtube\.com/hashtag/cats',
+            #     'title': '#cats',
+            # }],
         },
+    }, {
+        # Channel results
+        'url': 'https://www.youtube.com/results?search_query=kurzgesagt&sp=EgIQAg%253D%253D',
+        'info_dict': {
+            'id': 'kurzgesagt',
+            'title': 'kurzgesagt',
+        },
+        'playlist': [{
+            'info_dict': {
+                '_type': 'url',
+                'id': 'UCsXVk37bltHxD1rDPwtNM8Q',
+                'url': 'https://www.youtube.com/channel/UCsXVk37bltHxD1rDPwtNM8Q',
+                'ie_key': 'YoutubeTab',
+                'channel': 'Kurzgesagt – In a Nutshell',
+                'description': 'md5:4ae48dfa9505ffc307dad26342d06bfc',
+                'title': 'Kurzgesagt – In a Nutshell',
+                'channel_id': 'UCsXVk37bltHxD1rDPwtNM8Q',
+                'playlist_count': int,  # XXX: should have a way of saying > 1
+                'channel_url': 'https://www.youtube.com/channel/UCsXVk37bltHxD1rDPwtNM8Q',
+                'thumbnails': list
+            }
+        }],
+        'params': {'extract_flat': True, 'playlist_items': '1'},
+        'playlist_mincount': 1,
     }, {
         'url': 'https://www.youtube.com/results?q=test&sp=EgQIBBgB',
         'only_matching': True,
@@ -5787,7 +6614,7 @@ def _real_extract(self, url):
 
 
 class YoutubeMusicSearchURLIE(YoutubeTabBaseInfoExtractor):
-    IE_DESC = 'YouTube music search URLs with selectable sections (Eg: #songs)'
+    IE_DESC = 'YouTube music search URLs with selectable sections, e.g. #songs'
     IE_NAME = 'youtube:music:search_url'
     _VALID_URL = r'https?://music\.youtube\.com/search\?([^#]+&)?(?:search_query|q)=(?:[^&]+)(?:[&#]|$)'
     _TESTS = [{
@@ -5831,7 +6658,7 @@ def _real_extract(self, url):
         if params:
             section = next((k for k, v in self._SECTIONS.items() if v == params), params)
         else:
-            section = compat_urllib_parse_unquote_plus((url.split('#') + [''])[1]).lower()
+            section = urllib.parse.unquote_plus((url.split('#') + [''])[1]).lower()
             params = self._SECTIONS.get(section)
             if not params:
                 section = None
@@ -5925,10 +6752,34 @@ class YoutubeStoriesIE(InfoExtractor):
     def _real_extract(self, url):
         playlist_id = f'RLTD{self._match_id(url)}'
         return self.url_result(
-            f'https://www.youtube.com/playlist?list={playlist_id}&playnext=1',
+            smuggle_url(f'https://www.youtube.com/playlist?list={playlist_id}&playnext=1', {'is_story': True}),
             ie=YoutubeTabIE, video_id=playlist_id)
 
 
+class YoutubeShortsAudioPivotIE(InfoExtractor):
+    IE_DESC = 'YouTube Shorts audio pivot (Shorts using audio of a given video)'
+    IE_NAME = 'youtube:shorts:pivot:audio'
+    _VALID_URL = r'https?://(?:www\.)?youtube\.com/source/(?P<id>[\w-]{11})/shorts'
+    _TESTS = [{
+        'url': 'https://www.youtube.com/source/Lyj-MZSAA9o/shorts',
+        'only_matching': True,
+    }]
+
+    @staticmethod
+    def _generate_audio_pivot_params(video_id):
+        """
+        Generates sfv_audio_pivot browse params for this video id
+        """
+        pb_params = b'\xf2\x05+\n)\x12\'\n\x0b%b\x12\x0b%b\x1a\x0b%b' % ((video_id.encode(),) * 3)
+        return urllib.parse.quote(base64.b64encode(pb_params).decode())
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+        return self.url_result(
+            f'https://www.youtube.com/feed/sfv_audio_pivot?bp={self._generate_audio_pivot_params(video_id)}',
+            ie=YoutubeTabIE)
+
+
 class YoutubeTruncatedURLIE(InfoExtractor):
     IE_NAME = 'youtube:truncated_url'
     IE_DESC = False  # Do not list
@@ -5990,6 +6841,25 @@ class YoutubeClipIE(YoutubeTabBaseInfoExtractor):
             'section_start': 29.0,
             'section_end': 39.7,
             'duration': 10.7,
+            'age_limit': 0,
+            'availability': 'public',
+            'categories': ['Gaming'],
+            'channel': 'Scott The Woz',
+            'channel_id': 'UC4rqhyiTs7XyuODcECvuiiQ',
+            'channel_url': 'https://www.youtube.com/channel/UC4rqhyiTs7XyuODcECvuiiQ',
+            'description': 'md5:7a4517a17ea9b4bd98996399d8bb36e7',
+            'like_count': int,
+            'playable_in_embed': True,
+            'tags': 'count:17',
+            'thumbnail': 'https://i.ytimg.com/vi_webp/ScPX26pdQik/maxresdefault.webp',
+            'title': 'Mobile Games on Console - Scott The Woz',
+            'upload_date': '20210920',
+            'uploader': 'Scott The Woz',
+            'uploader_id': 'scottthewoz',
+            'uploader_url': 'http://www.youtube.com/user/scottthewoz',
+            'view_count': int,
+            'live_status': 'not_live',
+            'channel_follower_count': int
         }
     }]