format_field,
int_or_none,
intlist_to_bytes,
+ is_html,
mimetype2ext,
+ network_exceptions,
orderedSet,
parse_codecs,
parse_count,
parse_duration,
parse_iso8601,
+ parse_qs,
qualities,
remove_start,
smuggle_url,
unsmuggle_url,
update_url_query,
url_or_none,
- urlencode_postdata,
urljoin,
variadic,
)
-def parse_qs(url):
- return compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query)
+# any clients starting with _ cannot be explicitly requested by the user
+INNERTUBE_CLIENTS = {
+    # Each entry provides the per-client InnerTube API configuration:
+    #   INNERTUBE_API_KEY             - API key sent with innertube requests
+    #   INNERTUBE_CONTEXT             - the request 'context' payload (clientName/clientVersion)
+    #   INNERTUBE_CONTEXT_CLIENT_NAME - numeric client id
+    #   INNERTUBE_HOST (optional)     - API hostname
+    # Missing keys are filled with defaults by build_innertube_clients() below.
+    'web': {
+        'INNERTUBE_API_KEY': 'AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8',
+        'INNERTUBE_CONTEXT': {
+            'client': {
+                'clientName': 'WEB',
+                'clientVersion': '2.20210622.10.00',
+            }
+        },
+        'INNERTUBE_CONTEXT_CLIENT_NAME': 1
+    },
+    'web_embedded': {
+        'INNERTUBE_API_KEY': 'AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8',
+        'INNERTUBE_CONTEXT': {
+            'client': {
+                'clientName': 'WEB_EMBEDDED_PLAYER',
+                'clientVersion': '1.20210620.0.1',
+            },
+        },
+        'INNERTUBE_CONTEXT_CLIENT_NAME': 56
+    },
+    'web_music': {
+        'INNERTUBE_API_KEY': 'AIzaSyC9XL3ZjWddXya6X74dJoCTL-WEYFDNX30',
+        'INNERTUBE_HOST': 'music.youtube.com',
+        'INNERTUBE_CONTEXT': {
+            'client': {
+                'clientName': 'WEB_REMIX',
+                'clientVersion': '1.20210621.00.00',
+            }
+        },
+        'INNERTUBE_CONTEXT_CLIENT_NAME': 67,
+    },
+    'web_creator': {
+        'INNERTUBE_API_KEY': 'AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8',
+        'INNERTUBE_CONTEXT': {
+            'client': {
+                'clientName': 'WEB_CREATOR',
+                'clientVersion': '1.20210621.00.00',
+            }
+        },
+        'INNERTUBE_CONTEXT_CLIENT_NAME': 62,
+    },
+    'android': {
+        'INNERTUBE_API_KEY': 'AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8',
+        'INNERTUBE_CONTEXT': {
+            'client': {
+                'clientName': 'ANDROID',
+                'clientVersion': '16.20',
+            }
+        },
+        'INNERTUBE_CONTEXT_CLIENT_NAME': 3,
+    },
+    'android_embedded': {
+        'INNERTUBE_API_KEY': 'AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8',
+        'INNERTUBE_CONTEXT': {
+            'client': {
+                'clientName': 'ANDROID_EMBEDDED_PLAYER',
+                'clientVersion': '16.20',
+            },
+        },
+        'INNERTUBE_CONTEXT_CLIENT_NAME': 55
+    },
+    'android_music': {
+        'INNERTUBE_API_KEY': 'AIzaSyC9XL3ZjWddXya6X74dJoCTL-WEYFDNX30',
+        'INNERTUBE_HOST': 'music.youtube.com',
+        'INNERTUBE_CONTEXT': {
+            'client': {
+                'clientName': 'ANDROID_MUSIC',
+                'clientVersion': '4.32',
+            }
+        },
+        'INNERTUBE_CONTEXT_CLIENT_NAME': 21,
+    },
+    'android_creator': {
+        # No API key here; the shared default is applied in build_innertube_clients()
+        'INNERTUBE_CONTEXT': {
+            'client': {
+                'clientName': 'ANDROID_CREATOR',
+                'clientVersion': '21.24.100',
+            },
+        },
+        'INNERTUBE_CONTEXT_CLIENT_NAME': 14
+    },
+    # ios has HLS live streams
+    # See: https://github.com/TeamNewPipe/NewPipeExtractor/issues/680
+    'ios': {
+        'INNERTUBE_API_KEY': 'AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8',
+        'INNERTUBE_CONTEXT': {
+            'client': {
+                'clientName': 'IOS',
+                'clientVersion': '16.20',
+            }
+        },
+        'INNERTUBE_CONTEXT_CLIENT_NAME': 5
+    },
+    'ios_embedded': {
+        'INNERTUBE_API_KEY': 'AIzaSyDCU8hByM-4DrUqRUYnGn-3llEO78bcxq8',
+        'INNERTUBE_CONTEXT': {
+            'client': {
+                'clientName': 'IOS_MESSAGES_EXTENSION',
+                'clientVersion': '16.20',
+            },
+        },
+        'INNERTUBE_CONTEXT_CLIENT_NAME': 66
+    },
+    'ios_music': {
+        'INNERTUBE_API_KEY': 'AIzaSyDK3iBpDP9nHVTk2qL73FLJICfOC3c51Og',
+        'INNERTUBE_HOST': 'music.youtube.com',
+        'INNERTUBE_CONTEXT': {
+            'client': {
+                'clientName': 'IOS_MUSIC',
+                'clientVersion': '4.32',
+            },
+        },
+        'INNERTUBE_CONTEXT_CLIENT_NAME': 26
+    },
+    'ios_creator': {
+        # No API key here; the shared default is applied in build_innertube_clients()
+        'INNERTUBE_CONTEXT': {
+            'client': {
+                'clientName': 'IOS_CREATOR',
+                'clientVersion': '21.24.100',
+            },
+        },
+        'INNERTUBE_CONTEXT_CLIENT_NAME': 15
+    },
+    # mweb has 'ultralow' formats
+    # See: https://github.com/yt-dlp/yt-dlp/pull/557
+    'mweb': {
+        'INNERTUBE_API_KEY': 'AIzaSyDCU8hByM-4DrUqRUYnGn-3llEO78bcxq8',
+        'INNERTUBE_CONTEXT': {
+            'client': {
+                'clientName': 'MWEB',
+                'clientVersion': '2.20210721.07.00',
+            }
+        },
+        'INNERTUBE_CONTEXT_CLIENT_NAME': 2
+    },
+}
+
+
+def build_innertube_clients():
+    """Fill in defaults for INNERTUBE_CLIENTS and derive the ``*_agegate`` variants.
+
+    Mutates INNERTUBE_CLIENTS in place and is invoked once at module load.
+    Applies a default API key, host and 'hl' to every client, and assigns a
+    numeric 'priority' derived from the base client name (later entries in
+    ``base_clients`` rank lower), demoting agegate (-1), embedded (-2) and
+    all other variants (-3) relative to their base client.
+    """
+    third_party = {
+        'embedUrl': 'https://google.com',  # Can be any valid URL
+    }
+    base_clients = ('android', 'web', 'ios', 'mweb')
+    # qualities() ranks items by position; reversed so 'android' scores highest
+    priority = qualities(base_clients[::-1])
+
+    # tuple() snapshot: the loop inserts new '<client>_agegate' keys while iterating
+    for client, ytcfg in tuple(INNERTUBE_CLIENTS.items()):
+        ytcfg.setdefault('INNERTUBE_API_KEY', 'AIzaSyDCU8hByM-4DrUqRUYnGn-3llEO78bcxq8')
+        ytcfg.setdefault('INNERTUBE_HOST', 'www.youtube.com')
+        ytcfg['INNERTUBE_CONTEXT']['client'].setdefault('hl', 'en')
+        # Priority keyed on the base client name ('android_music' -> 'android')
+        ytcfg['priority'] = 10 * priority(client.split('_', 1)[0])
+
+        if client in base_clients:
+            # Each base client gets an agegate twin using the embedded clientScreen
+            INNERTUBE_CLIENTS[f'{client}_agegate'] = agegate_ytcfg = copy.deepcopy(ytcfg)
+            agegate_ytcfg['INNERTUBE_CONTEXT']['client']['clientScreen'] = 'EMBED'
+            agegate_ytcfg['INNERTUBE_CONTEXT']['thirdParty'] = third_party
+            agegate_ytcfg['priority'] -= 1
+        elif client.endswith('_embedded'):
+            ytcfg['INNERTUBE_CONTEXT']['thirdParty'] = third_party
+            ytcfg['priority'] -= 2
+        else:
+            ytcfg['priority'] -= 3
+
+
+build_innertube_clients()
class YoutubeBaseInfoExtractor(InfoExtractor):
"""Provide base functions for Youtube extractors"""
- _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
- _TWOFACTOR_URL = 'https://accounts.google.com/signin/challenge'
-
- _LOOKUP_URL = 'https://accounts.google.com/_/signin/sl/lookup'
- _CHALLENGE_URL = 'https://accounts.google.com/_/signin/sl/challenge'
- _TFA_URL = 'https://accounts.google.com/_/signin/challenge?hl=en&TL={0}'
_RESERVED_NAMES = (
- r'channel|c|user|browse|playlist|watch|w|v|embed|e|watch_popup|shorts|'
- r'movies|results|shared|hashtag|trending|feed|feeds|oembed|get_video_info|'
+ r'channel|c|user|playlist|watch|w|v|embed|e|watch_popup|'
+ r'shorts|movies|results|shared|hashtag|trending|feed|feeds|'
+ r'browse|oembed|get_video_info|iframe_api|s/player|'
r'storefront|oops|index|account|reporthistory|t/terms|about|upload|signin|logout')
+ _PLAYLIST_ID_RE = r'(?:(?:PL|LL|EC|UU|FL|RD|UL|TL|PU|OLAK5uy_)[0-9A-Za-z-_]{10,}|RDMM|WL|LL|LM)'
+
_NETRC_MACHINE = 'youtube'
+
# If True it will raise an error if no login info is provided
_LOGIN_REQUIRED = False
- _PLAYLIST_ID_RE = r'(?:(?:PL|LL|EC|UU|FL|RD|UL|TL|PU|OLAK5uy_)[0-9A-Za-z-_]{10,}|RDMM|WL|LL|LM)'
+ r''' # Unused since login is broken
+ _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
+ _TWOFACTOR_URL = 'https://accounts.google.com/signin/challenge'
+
+ _LOOKUP_URL = 'https://accounts.google.com/_/signin/sl/lookup'
+ _CHALLENGE_URL = 'https://accounts.google.com/_/signin/sl/challenge'
+ _TFA_URL = 'https://accounts.google.com/_/signin/challenge?hl=en&TL={0}'
+ '''
def _login(self):
"""
_YT_INITIAL_PLAYER_RESPONSE_RE = r'ytInitialPlayerResponse\s*=\s*({.+?})\s*;'
_YT_INITIAL_BOUNDARY_RE = r'(?:var\s+meta|</script|\n)'
- _YT_DEFAULT_YTCFGS = {
- 'WEB': {
- 'INNERTUBE_API_VERSION': 'v1',
- 'INNERTUBE_CLIENT_NAME': 'WEB',
- 'INNERTUBE_CLIENT_VERSION': '2.20210622.10.00',
- 'INNERTUBE_API_KEY': 'AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8',
- 'INNERTUBE_CONTEXT': {
- 'client': {
- 'clientName': 'WEB',
- 'clientVersion': '2.20210622.10.00',
- 'hl': 'en',
- }
- },
- 'INNERTUBE_CONTEXT_CLIENT_NAME': 1
- },
- 'WEB_REMIX': {
- 'INNERTUBE_API_VERSION': 'v1',
- 'INNERTUBE_CLIENT_NAME': 'WEB_REMIX',
- 'INNERTUBE_CLIENT_VERSION': '1.20210621.00.00',
- 'INNERTUBE_API_KEY': 'AIzaSyC9XL3ZjWddXya6X74dJoCTL-WEYFDNX30',
- 'INNERTUBE_CONTEXT': {
- 'client': {
- 'clientName': 'WEB_REMIX',
- 'clientVersion': '1.20210621.00.00',
- 'hl': 'en',
- }
- },
- 'INNERTUBE_CONTEXT_CLIENT_NAME': 67
- },
- 'WEB_EMBEDDED_PLAYER': {
- 'INNERTUBE_API_VERSION': 'v1',
- 'INNERTUBE_CLIENT_NAME': 'WEB_EMBEDDED_PLAYER',
- 'INNERTUBE_CLIENT_VERSION': '1.20210620.0.1',
- 'INNERTUBE_API_KEY': 'AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8',
- 'INNERTUBE_CONTEXT': {
- 'client': {
- 'clientName': 'WEB_EMBEDDED_PLAYER',
- 'clientVersion': '1.20210620.0.1',
- 'hl': 'en',
- }
- },
- 'INNERTUBE_CONTEXT_CLIENT_NAME': 56
- },
- 'ANDROID': {
- 'INNERTUBE_API_VERSION': 'v1',
- 'INNERTUBE_CLIENT_NAME': 'ANDROID',
- 'INNERTUBE_CLIENT_VERSION': '16.20',
- 'INNERTUBE_API_KEY': 'AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8',
- 'INNERTUBE_CONTEXT': {
- 'client': {
- 'clientName': 'ANDROID',
- 'clientVersion': '16.20',
- 'hl': 'en',
- }
- },
- 'INNERTUBE_CONTEXT_CLIENT_NAME': 3
- },
- 'ANDROID_EMBEDDED_PLAYER': {
- 'INNERTUBE_API_VERSION': 'v1',
- 'INNERTUBE_CLIENT_NAME': 'ANDROID_EMBEDDED_PLAYER',
- 'INNERTUBE_CLIENT_VERSION': '16.20',
- 'INNERTUBE_API_KEY': 'AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8',
- 'INNERTUBE_CONTEXT': {
- 'client': {
- 'clientName': 'ANDROID_EMBEDDED_PLAYER',
- 'clientVersion': '16.20',
- 'hl': 'en',
- }
- },
- 'INNERTUBE_CONTEXT_CLIENT_NAME': 55
- },
- 'ANDROID_MUSIC': {
- 'INNERTUBE_API_VERSION': 'v1',
- 'INNERTUBE_CLIENT_NAME': 'ANDROID_MUSIC',
- 'INNERTUBE_CLIENT_VERSION': '4.32',
- 'INNERTUBE_API_KEY': 'AIzaSyC9XL3ZjWddXya6X74dJoCTL-WEYFDNX30',
- 'INNERTUBE_CONTEXT': {
- 'client': {
- 'clientName': 'ANDROID_MUSIC',
- 'clientVersion': '4.32',
- 'hl': 'en',
- }
- },
- 'INNERTUBE_CONTEXT_CLIENT_NAME': 21
- },
- 'IOS': {
- 'INNERTUBE_API_VERSION': 'v1',
- 'INNERTUBE_CLIENT_NAME': 'IOS',
- 'INNERTUBE_CLIENT_VERSION': '16.20',
- 'INNERTUBE_API_KEY': 'AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8',
- 'INNERTUBE_CONTEXT': {
- 'client': {
- 'clientName': 'IOS',
- 'clientVersion': '16.20',
- 'hl': 'en',
- }
- },
- 'INNERTUBE_CONTEXT_CLIENT_NAME': 5
-
- },
- 'IOS_MUSIC': {
- 'INNERTUBE_API_VERSION': 'v1',
- 'INNERTUBE_CLIENT_NAME': 'IOS_MUSIC',
- 'INNERTUBE_CLIENT_VERSION': '4.32',
- 'INNERTUBE_API_KEY': 'AIzaSyDK3iBpDP9nHVTk2qL73FLJICfOC3c51Og',
- 'INNERTUBE_CONTEXT': {
- 'client': {
- 'clientName': 'IOS_MUSIC',
- 'clientVersion': '4.32',
- 'hl': 'en',
- }
- },
- 'INNERTUBE_CONTEXT_CLIENT_NAME': 26
- },
- 'IOS_MESSAGES_EXTENSION': {
- 'INNERTUBE_API_VERSION': 'v1',
- 'INNERTUBE_CLIENT_NAME': 'IOS_MESSAGES_EXTENSION',
- 'INNERTUBE_CLIENT_VERSION': '16.20',
- 'INNERTUBE_API_KEY': 'AIzaSyDCU8hByM-4DrUqRUYnGn-3llEO78bcxq8',
- 'INNERTUBE_CONTEXT': {
- 'client': {
- 'clientName': 'IOS_MESSAGES_EXTENSION',
- 'clientVersion': '16.20',
- 'hl': 'en',
- }
- },
- 'INNERTUBE_CONTEXT_CLIENT_NAME': 66
- }
- }
-
- _YT_DEFAULT_INNERTUBE_HOSTS = {
- 'DIRECT': 'youtubei.googleapis.com',
- 'WEB': 'www.youtube.com',
- 'WEB_REMIX': 'music.youtube.com',
- 'ANDROID_MUSIC': 'music.youtube.com'
- }
-
- # clients starting with _ cannot be explicity requested by the user
- _YT_CLIENTS = {
- 'web': 'WEB',
- 'web_music': 'WEB_REMIX',
- '_web_embedded': 'WEB_EMBEDDED_PLAYER',
- '_web_agegate': 'TVHTML5',
- 'android': 'ANDROID',
- 'android_music': 'ANDROID_MUSIC',
- '_android_embedded': 'ANDROID_EMBEDDED_PLAYER',
- '_android_agegate': 'ANDROID',
- 'ios': 'IOS',
- 'ios_music': 'IOS_MUSIC',
- '_ios_embedded': 'IOS_MESSAGES_EXTENSION',
- '_ios_agegate': 'IOS'
- }
+    def _get_default_ytcfg(self, client='web'):
+        # Deep copy so callers may freely mutate the returned config.
+        # NOTE(review): unlike the old version, unknown clients now raise KeyError
+        # instead of falling back to the web client.
+        return copy.deepcopy(INNERTUBE_CLIENTS[client])
- def _get_default_ytcfg(self, client='WEB'):
- if client in self._YT_DEFAULT_YTCFGS:
- return copy.deepcopy(self._YT_DEFAULT_YTCFGS[client])
- self.write_debug(f'INNERTUBE default client {client} does not exist - falling back to WEB client.')
- return copy.deepcopy(self._YT_DEFAULT_YTCFGS['WEB'])
+    def _get_innertube_host(self, client='web'):
+        # INNERTUBE_HOST is guaranteed present: build_innertube_clients() defaults it
+        return INNERTUBE_CLIENTS[client]['INNERTUBE_HOST']
- def _get_innertube_host(self, client='WEB'):
- return dict_get(self._YT_DEFAULT_INNERTUBE_HOSTS, (client, 'WEB'))
-
- def _ytcfg_get_safe(self, ytcfg, getter, expected_type=None, default_client='WEB'):
+ def _ytcfg_get_safe(self, ytcfg, getter, expected_type=None, default_client='web'):
# try_get but with fallback to default ytcfg client values when present
_func = lambda y: try_get(y, getter, expected_type)
return _func(ytcfg) or _func(self._get_default_ytcfg(default_client))
- def _extract_client_name(self, ytcfg, default_client='WEB'):
- return self._ytcfg_get_safe(ytcfg, lambda x: x['INNERTUBE_CLIENT_NAME'], compat_str, default_client)
+    def _extract_client_name(self, ytcfg, default_client='web'):
+        # Prefer the flat INNERTUBE_CLIENT_NAME key, falling back to the nested
+        # context value; _ytcfg_get_safe falls back to the default client's config
+        return self._ytcfg_get_safe(
+            ytcfg, (lambda x: x['INNERTUBE_CLIENT_NAME'],
+                    lambda x: x['INNERTUBE_CONTEXT']['client']['clientName']), compat_str, default_client)
@staticmethod
def _extract_session_index(*data):
if session_index is not None:
return session_index
- def _extract_client_version(self, ytcfg, default_client='WEB'):
- return self._ytcfg_get_safe(ytcfg, lambda x: x['INNERTUBE_CLIENT_VERSION'], compat_str, default_client)
+    def _extract_client_version(self, ytcfg, default_client='web'):
+        # Mirrors _extract_client_name: flat key first, then nested context value
+        return self._ytcfg_get_safe(
+            ytcfg, (lambda x: x['INNERTUBE_CLIENT_VERSION'],
+                    lambda x: x['INNERTUBE_CONTEXT']['client']['clientVersion']), compat_str, default_client)
- def _extract_api_key(self, ytcfg=None, default_client='WEB'):
+ def _extract_api_key(self, ytcfg=None, default_client='web'):
return self._ytcfg_get_safe(ytcfg, lambda x: x['INNERTUBE_API_KEY'], compat_str, default_client)
- def _extract_context(self, ytcfg=None, default_client='WEB'):
+ def _extract_context(self, ytcfg=None, default_client='web'):
_get_context = lambda y: try_get(y, lambda x: x['INNERTUBE_CONTEXT'], dict)
context = _get_context(ytcfg)
if context:
context['client']['visitorData'] = visitor_data
return context
+ _SAPISID = None
+
def _generate_sapisidhash_header(self, origin='https://www.youtube.com'):
- # Sometimes SAPISID cookie isn't present but __Secure-3PAPISID is.
- # See: https://github.com/yt-dlp/yt-dlp/issues/393
- yt_cookies = self._get_cookies('https://www.youtube.com')
- sapisid_cookie = dict_get(
- yt_cookies, ('__Secure-3PAPISID', 'SAPISID'))
- if sapisid_cookie is None:
- return
time_now = round(time.time())
- # SAPISID cookie is required if not already present
- if not yt_cookies.get('SAPISID'):
- self._set_cookie(
- '.youtube.com', 'SAPISID', sapisid_cookie.value, secure=True, expire_time=time_now + 3600)
+ if self._SAPISID is None:
+ yt_cookies = self._get_cookies('https://www.youtube.com')
+ # Sometimes SAPISID cookie isn't present but __Secure-3PAPISID is.
+ # See: https://github.com/yt-dlp/yt-dlp/issues/393
+ sapisid_cookie = dict_get(
+ yt_cookies, ('__Secure-3PAPISID', 'SAPISID'))
+ if sapisid_cookie and sapisid_cookie.value:
+ self._SAPISID = sapisid_cookie.value
+ self.write_debug('Extracted SAPISID cookie')
+ # SAPISID cookie is required if not already present
+ if not yt_cookies.get('SAPISID'):
+ self.write_debug('Copying __Secure-3PAPISID cookie to SAPISID cookie')
+ self._set_cookie(
+ '.youtube.com', 'SAPISID', self._SAPISID, secure=True, expire_time=time_now + 3600)
+ else:
+ self._SAPISID = False
+ if not self._SAPISID:
+ return None
# SAPISIDHASH algorithm from https://stackoverflow.com/a/32065323
sapisidhash = hashlib.sha1(
- f'{time_now} {sapisid_cookie.value} {origin}'.encode('utf-8')).hexdigest()
+ f'{time_now} {self._SAPISID} {origin}'.encode('utf-8')).hexdigest()
return f'SAPISIDHASH {time_now}_{sapisidhash}'
def _call_api(self, ep, query, video_id, fatal=True, headers=None,
note='Downloading API JSON', errnote='Unable to download API page',
- context=None, api_key=None, api_hostname=None, default_client='WEB'):
+ context=None, api_key=None, api_hostname=None, default_client='web'):
data = {'context': context} if context else {'context': self._extract_context(default_client=default_client)}
data.update(query)
def generate_api_headers(
self, ytcfg=None, identity_token=None, account_syncid=None,
- visitor_data=None, api_hostname=None, default_client='WEB', session_index=None):
+ visitor_data=None, api_hostname=None, default_client='web', session_index=None):
origin = 'https://' + (api_hostname if api_hostname else self._get_innertube_host(default_client))
headers = {
'X-YouTube-Client-Name': compat_str(
alert_type = alert.get('type')
if not alert_type:
continue
- message = cls._get_text(alert.get('text'))
+ message = cls._get_text(alert, 'text')
if message:
yield alert_type, message
- def _report_alerts(self, alerts, expected=True):
+ def _report_alerts(self, alerts, expected=True, fatal=True):
errors = []
warnings = []
for alert_type, alert_message in alerts:
- if alert_type.lower() == 'error':
+ if alert_type.lower() == 'error' and fatal:
errors.append([alert_type, alert_message])
else:
warnings.append([alert_type, alert_message])
return badges
@staticmethod
- def _get_text(data, getter=None, max_runs=None):
- for get in variadic(getter):
- d = try_get(data, get) if get is not None else data
- text = try_get(d, lambda x: x['simpleText'], compat_str)
- if text:
- return text
- runs = try_get(d, lambda x: x['runs'], list) or []
- if not runs and isinstance(d, list):
- runs = d
-
- def get_runs(runs):
- for run in runs[:min(len(runs), max_runs or len(runs))]:
- yield try_get(run, lambda x: x['text'], compat_str) or ''
-
- text = ''.join(get_runs(runs))
- if text:
- return text
+    def _get_text(data, *path_list, max_runs=None):
+        """Extract a text string from YouTube's 'simpleText'/'runs' structures.
+
+        Each path in *path_list is traversed with traverse_obj and the first
+        candidate yielding text wins; with no paths, ``data`` itself is used.
+        ``max_runs`` caps how many 'runs' entries are joined. Returns None
+        (implicitly) when nothing matches.
+        """
+        for path in path_list or [None]:
+            if path is None:
+                obj = [data]
+            else:
+                obj = traverse_obj(data, path, default=[])
+                # Branching paths (Ellipsis or list/tuple keys) already return
+                # a list of candidates; plain paths are wrapped for uniformity
+                if not any(key is ... or isinstance(key, (list, tuple)) for key in variadic(path)):
+                    obj = [obj]
+            for item in obj:
+                text = try_get(item, lambda x: x['simpleText'], compat_str)
+                if text:
+                    return text
+                runs = try_get(item, lambda x: x['runs'], list) or []
+                # A bare list item is treated as the runs list itself
+                if not runs and isinstance(item, list):
+                    runs = item
+
+                runs = runs[:min(len(runs), max_runs or len(runs))]
+                text = ''.join(traverse_obj(runs, (..., 'text'), expected_type=str, default=[]))
+                if text:
+                    return text
def _extract_response(self, item_id, query, note='Downloading API JSON', headers=None,
ytcfg=None, check_get_keys=None, ep='browse', fatal=True, api_hostname=None,
- default_client='WEB'):
+ default_client='web'):
response = None
last_error = None
count = -1
api_hostname=api_hostname, default_client=default_client,
note='%s%s' % (note, ' (retry #%d)' % count if count else ''))
except ExtractorError as e:
- if isinstance(e.cause, compat_HTTPError) and e.cause.code in (500, 503, 404):
+ if isinstance(e.cause, network_exceptions):
+ if isinstance(e.cause, compat_HTTPError) and not is_html(e.cause.read(512)):
+ e.cause.seek(0)
+ yt_error = try_get(
+ self._parse_json(e.cause.read().decode(), item_id, fatal=False),
+ lambda x: x['error']['message'], compat_str)
+ if yt_error:
+ self._report_alerts([('ERROR', yt_error)], fatal=False)
# Downloading page may result in intermittent 5xx HTTP error
# Sometimes a 404 is also recieved. See: https://github.com/ytdl-org/youtube-dl/issues/28289
- last_error = 'HTTP Error %s' % e.cause.code
- if count < retries:
- continue
+ # We also want to catch all other network exceptions since errors in later pages can be troublesome
+ # See https://github.com/yt-dlp/yt-dlp/issues/507#issuecomment-880188210
+ if not isinstance(e.cause, compat_HTTPError) or e.cause.code not in (403, 429):
+ last_error = error_to_compat_str(e.cause or e)
+ if count < retries:
+ continue
if fatal:
raise
else:
def _extract_video(self, renderer):
video_id = renderer.get('videoId')
- title = self._get_text(renderer.get('title'))
- description = self._get_text(renderer.get('descriptionSnippet'))
- duration = parse_duration(self._get_text(renderer.get('lengthText')))
- view_count_text = self._get_text(renderer.get('viewCountText')) or ''
+ title = self._get_text(renderer, 'title')
+ description = self._get_text(renderer, 'descriptionSnippet')
+ duration = parse_duration(self._get_text(
+ renderer, 'lengthText', ('thumbnailOverlays', ..., 'thumbnailOverlayTimeStatusRenderer', 'text')))
+ view_count_text = self._get_text(renderer, 'viewCountText') or ''
view_count = str_to_int(self._search_regex(
r'^([\d,]+)', re.sub(r'\s', '', view_count_text),
'view count', default=None))
- uploader = self._get_text(renderer, (lambda x: x['ownerText'], lambda x: x['shortBylineText']))
+ uploader = self._get_text(renderer, 'ownerText', 'shortBylineText')
return {
'_type': 'url',
youtube\.googleapis\.com)/ # the various hostnames, with wildcard subdomains
(?:.*?\#/)? # handle anchor (#/) redirect urls
(?: # the various things that can precede the ID:
- (?:(?:v|embed|e)/(?!videoseries)) # v/ or embed/ or e/
+ (?:(?:v|embed|e|shorts)/(?!videoseries)) # v/ or embed/ or e/ or shorts/
|(?: # or the v= param in all its forms
(?:(?:watch|movie)(?:_popup)?(?:\.php)?/?)? # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
(?:\?|\#!?) # the params delimiter ? or # or #!
'_rtmp': {'protocol': 'rtmp'},
# av01 video only formats sometimes served with "unknown" codecs
- '394': {'acodec': 'none', 'vcodec': 'av01.0.05M.08'},
- '395': {'acodec': 'none', 'vcodec': 'av01.0.05M.08'},
- '396': {'acodec': 'none', 'vcodec': 'av01.0.05M.08'},
- '397': {'acodec': 'none', 'vcodec': 'av01.0.05M.08'},
+ '394': {'ext': 'mp4', 'height': 144, 'format_note': 'DASH video', 'vcodec': 'av01.0.00M.08'},
+ '395': {'ext': 'mp4', 'height': 240, 'format_note': 'DASH video', 'vcodec': 'av01.0.00M.08'},
+ '396': {'ext': 'mp4', 'height': 360, 'format_note': 'DASH video', 'vcodec': 'av01.0.01M.08'},
+ '397': {'ext': 'mp4', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'av01.0.04M.08'},
+ '398': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'av01.0.05M.08'},
+ '399': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'av01.0.08M.08'},
+ '400': {'ext': 'mp4', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'av01.0.12M.08'},
+ '401': {'ext': 'mp4', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'av01.0.12M.08'},
}
_SUBTITLE_FORMATS = ('json3', 'srv1', 'srv2', 'srv3', 'ttml', 'vtt')
- _AGE_GATE_REASONS = (
- 'Sign in to confirm your age',
- 'This video may be inappropriate for some users.',
- 'Sorry, this content is age-restricted.')
-
_GEO_BYPASS = False
IE_NAME = 'youtube'
'format': '141/bestaudio[ext=m4a]',
},
},
- # Normal age-gate video (embed allowed)
+ # Age-gate videos. See https://github.com/yt-dlp/yt-dlp/pull/575#issuecomment-888837000
{
+ 'note': 'Embed allowed age-gate video',
'url': 'https://youtube.com/watch?v=HtVdAasjOgU',
'info_dict': {
'id': 'HtVdAasjOgU',
'age_limit': 18,
},
},
+ {
+ 'note': 'Age-gate video with embed allowed in public site',
+ 'url': 'https://youtube.com/watch?v=HsUATh_Nc2U',
+ 'info_dict': {
+ 'id': 'HsUATh_Nc2U',
+ 'ext': 'mp4',
+ 'title': 'Godzilla 2 (Official Video)',
+ 'description': 'md5:bf77e03fcae5529475e500129b05668a',
+ 'upload_date': '20200408',
+ 'uploader_id': 'FlyingKitty900',
+ 'uploader': 'FlyingKitty',
+ 'age_limit': 18,
+ },
+ },
+ {
+ 'note': 'Age-gate video embedable only with clientScreen=EMBED',
+ 'url': 'https://youtube.com/watch?v=Tq92D6wQ1mg',
+ 'info_dict': {
+ 'id': 'Tq92D6wQ1mg',
+ 'title': '[MMD] Adios - EVERGLOW [+Motion DL]',
+ 'ext': 'mp4',
+ 'upload_date': '20191227',
+ 'uploader_id': 'UC1yoRdFoFJaCY-AGfD9W0wQ',
+ 'uploader': 'Projekt Melody',
+ 'description': 'md5:17eccca93a786d51bc67646756894066',
+ 'age_limit': 18,
+ },
+ },
+ {
+ 'note': 'Non-Agegated non-embeddable video',
+ 'url': 'https://youtube.com/watch?v=MeJVWBSsPAY',
+ 'info_dict': {
+ 'id': 'MeJVWBSsPAY',
+ 'ext': 'mp4',
+ 'title': 'OOMPH! - Such Mich Find Mich (Lyrics)',
+ 'uploader': 'Herr Lurik',
+ 'uploader_id': 'st3in234',
+ 'description': 'Fan Video. Music & Lyrics by OOMPH!.',
+ 'upload_date': '20130730',
+ },
+ },
+ {
+ 'note': 'Non-bypassable age-gated video',
+ 'url': 'https://youtube.com/watch?v=Cr381pDsSsA',
+ 'only_matching': True,
+ },
# video_info is None (https://github.com/ytdl-org/youtube-dl/issues/4421)
# YouTube Red ad is not captured for creator
{
'params': {
'skip_download': True,
},
+ 'skip': 'Not multifeed anymore',
},
{
# Multifeed video with comma in title (see https://github.com/ytdl-org/youtube-dl/issues/8536)
'params': {
'extractor_args': {'youtube': {'player_skip': ['configs']}},
},
- }
+ }, {
+ # shorts
+ 'url': 'https://www.youtube.com/shorts/BGQWPY4IigY',
+ 'only_matching': True,
+ },
]
@classmethod
def suitable(cls, url):
- # Hack for lazy extractors until more generic solution is implemented
- # (see #28780)
- from .youtube import parse_qs
+ from ..utils import parse_qs
+
qs = parse_qs(url)
if qs.get('list', [None])[0]:
return False
funcname = self._search_regex(
(r'\b[cs]\s*&&\s*[adf]\.set\([^,]+\s*,\s*encodeURIComponent\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\(',
r'\b[a-zA-Z0-9]+\s*&&\s*[a-zA-Z0-9]+\.set\([^,]+\s*,\s*encodeURIComponent\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\(',
- r'\bm=(?P<sig>[a-zA-Z0-9$]{2})\(decodeURIComponent\(h\.s\)\)',
- r'\bc&&\(c=(?P<sig>[a-zA-Z0-9$]{2})\(decodeURIComponent\(c\)\)',
- r'(?:\b|[^a-zA-Z0-9$])(?P<sig>[a-zA-Z0-9$]{2})\s*=\s*function\(\s*a\s*\)\s*{\s*a\s*=\s*a\.split\(\s*""\s*\);[a-zA-Z0-9$]{2}\.[a-zA-Z0-9$]{2}\(a,\d+\)',
- r'(?:\b|[^a-zA-Z0-9$])(?P<sig>[a-zA-Z0-9$]{2})\s*=\s*function\(\s*a\s*\)\s*{\s*a\s*=\s*a\.split\(\s*""\s*\)',
+ r'\bm=(?P<sig>[a-zA-Z0-9$]{2,})\(decodeURIComponent\(h\.s\)\)',
+ r'\bc&&\(c=(?P<sig>[a-zA-Z0-9$]{2,})\(decodeURIComponent\(c\)\)',
+ r'(?:\b|[^a-zA-Z0-9$])(?P<sig>[a-zA-Z0-9$]{2,})\s*=\s*function\(\s*a\s*\)\s*{\s*a\s*=\s*a\.split\(\s*""\s*\);[a-zA-Z0-9$]{2}\.[a-zA-Z0-9$]{2}\(a,\d+\)',
+ r'(?:\b|[^a-zA-Z0-9$])(?P<sig>[a-zA-Z0-9$]{2,})\s*=\s*function\(\s*a\s*\)\s*{\s*a\s*=\s*a\.split\(\s*""\s*\)',
r'(?P<sig>[a-zA-Z0-9$]+)\s*=\s*function\(\s*a\s*\)\s*{\s*a\s*=\s*a\.split\(\s*""\s*\)',
# Obsolete patterns
r'(["\'])signature\1\s*,\s*(?P<sig>[a-zA-Z0-9$]+)\(',
mobj = re.match(cls._VALID_URL, url, re.VERBOSE)
if mobj is None:
raise ExtractorError('Invalid URL: %s' % url)
- video_id = mobj.group(2)
- return video_id
+ return mobj.group('id')
def _extract_chapters_from_json(self, data, duration):
chapter_list = traverse_obj(
data,
('engagementPanels', ..., 'engagementPanelSectionListRenderer', 'content', 'macroMarkersListRenderer', 'contents'),
expected_type=list, default=[])
- chapter_time = lambda chapter: parse_duration(self._get_text(chapter.get('timeDescription')))
- chapter_title = lambda chapter: self._get_text(chapter.get('title'))
+ chapter_time = lambda chapter: parse_duration(self._get_text(chapter, 'timeDescription'))
+ chapter_title = lambda chapter: self._get_text(chapter, 'title')
return next((
filter(None, (
if not comment_id:
return
- text = self._get_text(comment_renderer.get('contentText'))
+ text = self._get_text(comment_renderer, 'contentText')
# note: timestamp is an estimate calculated from the current time and time_text
- time_text = self._get_text(comment_renderer.get('publishedTimeText')) or ''
+ time_text = self._get_text(comment_renderer, 'publishedTimeText') or ''
time_text_dt = self.parse_time_text(time_text)
if isinstance(time_text_dt, datetime.datetime):
timestamp = calendar.timegm(time_text_dt.timetuple())
- author = self._get_text(comment_renderer.get('authorText'))
+ author = self._get_text(comment_renderer, 'authorText')
author_id = try_get(comment_renderer,
lambda x: x['authorEndpoint']['browseEndpoint']['browseId'], compat_str)
for content in contents:
comments_header_renderer = try_get(content, lambda x: x['commentsHeaderRenderer'])
expected_comment_count = parse_count(self._get_text(
- comments_header_renderer, (lambda x: x['countText'], lambda x: x['commentsCount']), max_runs=1))
+ comments_header_renderer, 'countText', 'commentsCount', max_runs=1))
if expected_comment_count:
comment_counts[1] = expected_comment_count
known_entry_comment_renderers = ('itemSectionRenderer',)
estimated_total = 0
max_comments = int_or_none(self._configuration_arg('max_comments', [''])[0]) or float('inf')
-
+ # Force English regardless of account setting to prevent parsing issues
+ # See: https://github.com/yt-dlp/yt-dlp/issues/532
+ ytcfg = copy.deepcopy(ytcfg)
+ traverse_obj(
+ ytcfg, ('INNERTUBE_CONTEXT', 'client'), expected_type=dict, default={})['hl'] = 'en'
try:
for comment in _real_comment_extract(contents):
if len(comments) >= max_comments:
'playbackContext': {
'contentPlaybackContext': context
},
- 'contentCheckOk': True
+ 'contentCheckOk': True,
+ 'racyCheckOk': True
}
@staticmethod
- def _get_video_info_params(video_id, client='TVHTML5'):
- GVI_CLIENTS = {
- 'ANDROID': {
- 'c': 'ANDROID',
- 'cver': '16.20',
- },
- 'TVHTML5': {
- 'c': 'TVHTML5',
- 'cver': '6.20180913',
- },
- 'IOS': {
- 'c': 'IOS',
- 'cver': '16.20'
- }
- }
- query = {
- 'video_id': video_id,
- 'eurl': 'https://youtube.googleapis.com/v/' + video_id,
- 'html5': '1'
- }
- query.update(GVI_CLIENTS.get(client))
- return query
+    def _is_agegated(player_response):
+        """Return True if the player response's playabilityStatus indicates an age gate."""
+        if traverse_obj(player_response, ('playabilityStatus', 'desktopLegacyAgeGateReason')):
+            return True
+
+        # Both 'status' and 'reason' are checked; matching is by substring
+        reasons = traverse_obj(player_response, ('playabilityStatus', ('status', 'reason')), default=[])
+        AGE_GATE_REASONS = (
+            'confirm your age', 'age-restricted', 'inappropriate',  # reason
+            'age_verification_required', 'age_check_required',  # status
+        )
+        return any(expected in reason for expected in AGE_GATE_REASONS for reason in reasons)
+
+    @staticmethod
+    def _is_unplayable(player_response):
+        """Return True if the playabilityStatus status is exactly 'UNPLAYABLE'."""
+        return traverse_obj(player_response, ('playabilityStatus', 'status')) == 'UNPLAYABLE'
def _extract_player_response(self, client, video_id, master_ytcfg, player_ytcfg, identity_token, player_url, initial_pr):
sts = self._extract_signature_timestamp(video_id, player_url, master_ytcfg, fatal=False)
headers = self.generate_api_headers(
player_ytcfg, identity_token, syncid,
- default_client=self._YT_CLIENTS[client], session_index=session_index)
+ default_client=client, session_index=session_index)
yt_query = {'videoId': video_id}
yt_query.update(self._generate_player_context(sts))
return self._extract_response(
item_id=video_id, ep='player', query=yt_query,
- ytcfg=player_ytcfg, headers=headers, fatal=False,
- default_client=self._YT_CLIENTS[client],
+ ytcfg=player_ytcfg, headers=headers, fatal=True,
+ default_client=client,
note='Downloading %s player API JSON' % client.replace('_', ' ').strip()
) or None
- def _extract_age_gated_player_response(self, client, video_id, ytcfg, identity_token, player_url, initial_pr):
- gvi_client = self._YT_CLIENTS.get(f'_{client}_agegate')
- if not gvi_client:
- return
-
- pr = self._parse_json(traverse_obj(
- compat_parse_qs(self._download_webpage(
- self.http_scheme() + '//www.youtube.com/get_video_info', video_id,
- 'Refetching age-gated %s info webpage' % gvi_client.lower(),
- 'unable to download video info webpage', fatal=False,
- query=self._get_video_info_params(video_id, client=gvi_client))),
- ('player_response', 0), expected_type=str) or '{}', video_id)
- if pr:
- return pr
-
- self.report_warning('Falling back to embedded-only age-gate workaround')
- embed_webpage = None
- if client == 'web' and 'configs' not in self._configuration_arg('player_skip'):
- embed_webpage = self._download_webpage(
- 'https://www.youtube.com/embed/%s?html5=1' % video_id,
- video_id=video_id, note=f'Downloading age-gated {client} embed config')
-
- ytcfg_age = self.extract_ytcfg(video_id, embed_webpage) or {}
- # If we extracted the embed webpage, it'll tell us if we can view the video
- embedded_pr = self._parse_json(
- traverse_obj(ytcfg_age, ('PLAYER_VARS', 'embedded_player_response'), expected_type=str) or '{}',
- video_id=video_id)
- embedded_ps_reason = traverse_obj(embedded_pr, ('playabilityStatus', 'reason'), expected_type=str) or ''
- if embedded_ps_reason in self._AGE_GATE_REASONS:
- return
- return self._extract_player_response(
- f'_{client}_embedded', video_id,
- ytcfg_age or ytcfg, ytcfg_age if client == 'web' else {},
- identity_token, player_url, initial_pr)
-
def _get_requested_clients(self, url, smuggled_data):
- requested_clients = [client for client in self._configuration_arg('player_client')
- if client[:0] != '_' and client in self._YT_CLIENTS]
+ requested_clients = []
+ allowed_clients = sorted(
+ [client for client in INNERTUBE_CLIENTS.keys() if client[:1] != '_'],
+ key=lambda client: INNERTUBE_CLIENTS[client]['priority'], reverse=True)
+ for client in self._configuration_arg('player_client'):
+ if client in allowed_clients:
+ requested_clients.append(client)
+ elif client == 'all':
+ requested_clients.extend(allowed_clients)
+ else:
+ self.report_warning(f'Skipping unsupported client {client}')
if not requested_clients:
requested_clients = ['android', 'web']
if smuggled_data.get('is_music_url') or self.is_music_url(url):
requested_clients.extend(
- f'{client}_music' for client in requested_clients if not client.endswith('_music'))
+ f'{client}_music' for client in requested_clients if f'{client}_music' in INNERTUBE_CLIENTS)
return orderedSet(requested_clients)
+    def _extract_player_ytcfg(self, client, video_id):
+        """Fetch and parse the ytcfg for clients whose config lives on a separate page.
+
+        Only 'web_music' and 'web_embedded' need an extra page download;
+        all other clients return an empty dict.
+        """
+        url = {
+            'web_music': 'https://music.youtube.com',
+            'web_embedded': f'https://www.youtube.com/embed/{video_id}?html5=1'
+        }.get(client)
+        if not url:
+            return {}
+        webpage = self._download_webpage(url, video_id, fatal=False, note=f'Downloading {client} config')
+        return self.extract_ytcfg(video_id, webpage) or {}
+
def _extract_player_responses(self, clients, video_id, webpage, master_ytcfg, player_url, identity_token):
initial_pr = None
if webpage:
webpage, self._YT_INITIAL_PLAYER_RESPONSE_RE,
video_id, 'initial player response')
- age_gated = False
- for client in clients:
- player_ytcfg = master_ytcfg if client == 'web' else {}
- if age_gated:
- pr = None
- elif client == 'web' and initial_pr:
- pr = initial_pr
- else:
- if client == 'web_music' and 'configs' not in self._configuration_arg('player_skip'):
- ytm_webpage = self._download_webpage(
- 'https://music.youtube.com',
- video_id, fatal=False, note='Downloading remix client config')
- player_ytcfg = self.extract_ytcfg(video_id, ytm_webpage) or {}
- pr = self._extract_player_response(
- client, video_id, player_ytcfg or master_ytcfg, player_ytcfg, identity_token, player_url, initial_pr)
- if pr:
- yield pr
- if age_gated or traverse_obj(pr, ('playabilityStatus', 'reason')) in self._AGE_GATE_REASONS:
- age_gated = True
- pr = self._extract_age_gated_player_response(
- client, video_id, player_ytcfg or master_ytcfg, identity_token, player_url, initial_pr)
- if pr:
- yield pr
+ # Clients are consumed in requested order via pop() from the end, hence
+ # the reversal; append_client lets an already-processed client queue a
+ # follow-up client (the agegate/creator variants) at most once.
+ original_clients = clients
+ clients = clients[::-1]
+
+ def append_client(client_name):
+ if client_name in INNERTUBE_CLIENTS and client_name not in original_clients:
+ clients.append(client_name)
+
# Android player_response does not have microFormats which are needed for
# extraction of some data. So we return the initial_pr with formats
# stripped out even if not requested by the user
# See: https://github.com/yt-dlp/yt-dlp/issues/501
- if initial_pr and 'web' not in clients:
- initial_pr['streamingData'] = None
- yield initial_pr
+ # NOTE(review): unlike the removed code, the stripped copy is now always
+ # yielded and also sets yielded_pr, so an all-clients failure below is
+ # only warned about (not raised) whenever initial_pr exists -- confirm
+ # this behaviour change is intended
+ yielded_pr = False
+ if initial_pr:
+ pr = dict(initial_pr)
+ pr['streamingData'] = None
+ yielded_pr = True
+ yield pr
+
+ last_error = None
+ while clients:
+ client = clients.pop()
+ player_ytcfg = master_ytcfg if client == 'web' else {}
+ if 'configs' not in self._configuration_arg('player_skip'):
+ # some clients (web_music/web_embedded) need their own page download
+ player_ytcfg = self._extract_player_ytcfg(client, video_id) or player_ytcfg
+
+ try:
+ pr = initial_pr if client == 'web' and initial_pr else self._extract_player_response(
+ client, video_id, player_ytcfg or master_ytcfg, player_ytcfg, identity_token, player_url, initial_pr)
+ except ExtractorError as e:
+ # remember only the most recent error; earlier ones become warnings
+ if last_error:
+ self.report_warning(last_error)
+ last_error = e
+ continue
+
+ if pr:
+ yielded_pr = True
+ yield pr
+
+ # creator clients can bypass AGE_VERIFICATION_REQUIRED if logged in
+ if client.endswith('_agegate') and self._is_unplayable(pr) and self._generate_sapisidhash_header():
+ append_client(client.replace('_agegate', '_creator'))
+ elif self._is_agegated(pr):
+ append_client(f'{client}_agegate')
+
+ if last_error:
+ # fatal only when no player response at all could be yielded
+ if not yielded_pr:
+ raise last_error
+ self.report_warning(last_error)
def _extract_formats(self, streaming_data, video_id, player_url, is_live):
+ # Yields format dicts built from the player responses' streamingData
+ # ('formats'/'adaptiveFormats') plus the HLS/DASH manifests they reference.
itags, stream_ids = [], []
- itag_qualities = {}
+ # quality is remembered per-itag AND per-height so manifest-derived
+ # formats (whose itag may be absent) can still be ranked below
+ itag_qualities, res_qualities = {}, {}
q = qualities([
- # "tiny" is the smallest video-only format. But some audio-only formats
- # was also labeled "tiny". It is not clear if such formats still exist
- 'tiny', 'audio_quality_low', 'audio_quality_medium', 'audio_quality_high', # Audio only formats
+ # Normally tiny is the smallest video-only formats. But
+ # audio-only formats with unknown quality may get tagged as tiny
+ 'tiny',
+ 'audio_quality_ultralow', 'audio_quality_low', 'audio_quality_medium', 'audio_quality_high', # Audio only formats
'small', 'medium', 'large', 'hd720', 'hd1080', 'hd1440', 'hd2160', 'hd2880', 'highres'
])
streaming_formats = traverse_obj(streaming_data, (..., ('formats', 'adaptiveFormats'), ...), default=[])
continue
quality = fmt.get('quality')
+ height = int_or_none(fmt.get('height'))
if quality == 'tiny' or not quality:
quality = fmt.get('audioQuality', '').lower() or quality
- if itag and quality:
- itag_qualities[itag] = quality
+ # The 3gp format (17) in android client has a quality of "small",
+ # but is actually worse than other formats
+ if itag == '17':
+ quality = 'tiny'
+ if quality:
+ if itag:
+ itag_qualities[itag] = quality
+ if height:
+ res_qualities[height] = quality
# FORMAT_STREAM_TYPE_OTF(otf=1) requires downloading the init fragment
# (adding `&sq=0` to the URL) and parsing emsg box to determine the
# number of fragment that would subsequently requested with (`&sq=N`)
'filesize': int_or_none(fmt.get('contentLength')),
'format_id': itag,
'format_note': ', '.join(filter(None, (
- audio_track.get('displayName'), fmt.get('qualityLabel') or quality))),
+ '%s%s' % (audio_track.get('displayName') or '',
+ ' (default)' if audio_track.get('audioIsDefault') else ''),
+ fmt.get('qualityLabel') or quality.replace('audio_quality_', '')))),
'fps': int_or_none(fmt.get('fps')),
- 'height': int_or_none(fmt.get('height')),
+ 'height': height,
'quality': q(quality),
'tbr': tbr,
'url': fmt_url,
- 'width': fmt.get('width'),
+ 'width': int_or_none(fmt.get('width')),
'language': audio_track.get('id', '').split('.')[0],
+ # prefer the track marked as default by YouTube
+ 'language_preference': 1 if audio_track.get('audioIsDefault') else -1,
}
mime_mobj = re.match(
r'((?:[^/]+)/(?:[^;]+))(?:;\s*codecs="([^"]+)")?', fmt.get('mimeType') or '')
if mime_mobj:
dct['ext'] = mimetype2ext(mime_mobj.group(1))
dct.update(parse_codecs(mime_mobj.group(2)))
- # The 3gp format in android client has a quality of "small",
- # but is actually worse than all other formats
- if dct['ext'] == '3gp':
- dct['quality'] = q('tiny')
- dct['preference'] = -10
no_audio = dct.get('acodec') == 'none'
no_video = dct.get('vcodec') == 'none'
if no_audio:
yield dct
skip_manifests = self._configuration_arg('skip')
- get_dash = not is_live and 'dash' not in skip_manifests and self.get_param('youtube_include_dash_manifest', True)
+ # live DASH is now allowed when explicitly opted in via extractor-args
+ get_dash = (
+ (not is_live or self._configuration_arg('include_live_dash'))
+ and 'dash' not in skip_manifests and self.get_param('youtube_include_dash_manifest', True))
get_hls = 'hls' not in skip_manifests and self.get_param('youtube_include_hls_manifest', True)
+ # Estimate a manifest format's quality from its itag, else its height;
+ # -1 means unknown
+ def guess_quality(f):
+ for val, qdict in ((f.get('format_id'), itag_qualities), (f.get('height'), res_qualities)):
+ if val in qdict:
+ return q(qdict[val])
+ return -1
+
for sd in streaming_data:
hls_manifest_url = get_hls and sd.get('hlsManifestUrl')
if hls_manifest_url:
- for f in self._extract_m3u8_formats(
- hls_manifest_url, video_id, 'mp4', fatal=False):
+ for f in self._extract_m3u8_formats(hls_manifest_url, video_id, 'mp4', fatal=False):
itag = self._search_regex(
r'/itag/(\d+)', f['url'], 'itag', default=None)
if itag in itags:
if itag:
f['format_id'] = itag
itags.append(itag)
+ f['quality'] = guess_quality(f)
yield f
dash_manifest_url = get_dash and sd.get('dashManifestUrl')
if dash_manifest_url:
- for f in self._extract_mpd_formats(
- dash_manifest_url, video_id, fatal=False):
+ for f in self._extract_mpd_formats(dash_manifest_url, video_id, fatal=False):
itag = f['format_id']
if itag in itags:
continue
if itag:
itags.append(itag)
- if itag in itag_qualities:
- f['quality'] = q(itag_qualities[itag])
+ f['quality'] = guess_quality(f)
filesize = int_or_none(self._search_regex(
r'/clen/(\d+)', f.get('fragment_base_url')
or f['url'], 'file size', default=None))
if not formats:
if not self.get_param('allow_unplayable_formats') and traverse_obj(streaming_data, (..., 'licenseInfos')):
- self.raise_no_formats(
- 'This video is DRM protected.', expected=True)
+ self.report_drm(video_id)
pemr = get_first(
playability_statuses,
('errorScreen', 'playerErrorMessageRenderer'), expected_type=dict) or {}
self.raise_no_formats(reason, expected=True)
for f in formats:
- # TODO: detect if throttled
- if '&n=' in f['url']: # possibly throttled
+ if '&c=WEB&' in f['url'] and '&ratebypass=yes&' not in f['url']: # throttled
f['source_preference'] = -10
- # note = f.get('format_note')
- # f['format_note'] = f'{note} (throttled)' if note else '(throttled)'
+ # TODO: this method is not reliable
+ f['format_note'] = format_field(f, 'format_note', '%s ') + '(maybe throttled)'
- self._sort_formats(formats)
+ # Source is given priority since formats that throttle are given lower source_preference
+ # When throttling issue is fully fixed, remove this
+ self._sort_formats(formats, ('quality', 'res', 'fps', 'source', 'codec:vp9.2', 'lang'))
keywords = get_first(video_details, 'keywords', expected_type=list) or []
if not keywords and webpage:
# See: https://github.com/ytdl-org/youtube-dl/issues/29049, https://github.com/yt-dlp/yt-dlp/issues/340
# List of possible thumbnails - Ref: <https://stackoverflow.com/a/20542029>
hq_thumbnail_names = ['maxresdefault', 'hq720', 'sddefault', 'sd1', 'sd2', 'sd3']
+ # TODO: Test them also? - For some videos, even these don't exist
guaranteed_thumbnail_names = [
'hqdefault', 'hq1', 'hq2', 'hq3', '0',
'mqdefault', 'mq1', 'mq2', 'mq3',
'release_timestamp': live_starttime,
}
- pctr = get_first(player_responses, ('captions', 'playerCaptionsTracklistRenderer'), expected_type=dict)
+ # NOTE(review): pctr is now a list of renderers (one per player response)
+ # rather than the first one found -- the traversals below use (..., ...)
+ pctr = traverse_obj(player_responses, (..., 'captions', 'playerCaptionsTracklistRenderer'), expected_type=dict)
+ # Converted into dicts to remove duplicates
+ captions = {
+ sub.get('baseUrl'): sub
+ for sub in traverse_obj(pctr, (..., 'captionTracks', ...), default=[])}
+ translation_languages = {
+ lang.get('languageCode'): lang.get('languageName')
+ for lang in traverse_obj(pctr, (..., 'translationLanguages', ...), default=[])}
subtitles = {}
if pctr:
def process_language(container, base_url, lang_code, sub_name, query):
'name': sub_name,
})
- for caption_track in (pctr.get('captionTracks') or []):
- base_url = caption_track.get('baseUrl')
+ # keyed by baseUrl, so the same track seen via several clients is
+ # processed only once
+ for base_url, caption_track in captions.items():
if not base_url:
continue
if caption_track.get('kind') != 'asr':
continue
process_language(
subtitles, base_url, lang_code,
- try_get(caption_track, lambda x: x['name']['simpleText']),
+ traverse_obj(caption_track, ('name', 'simpleText'), ('name', 'runs', ..., 'text'), get_all=False),
{})
continue
automatic_captions = {}
- for translation_language in (pctr.get('translationLanguages') or []):
- translation_language_code = translation_language.get('languageCode')
- if not translation_language_code:
+ for trans_code, trans_name in translation_languages.items():
+ if not trans_code:
continue
process_language(
- automatic_captions, base_url, translation_language_code,
- self._get_text(translation_language.get('languageName'), max_runs=1),
- {'tlang': translation_language_code})
+ automatic_captions, base_url, trans_code,
+ self._get_text(trans_name, max_runs=1),
+ {'tlang': trans_code})
info['automatic_captions'] = automatic_captions
info['subtitles'] = subtitles
})
vsir = content.get('videoSecondaryInfoRenderer')
if vsir:
- info['channel'] = self._get_text(try_get(
- vsir,
- lambda x: x['owner']['videoOwnerRenderer']['title'],
- dict))
+ # _get_text now accepts a traversal path directly
+ info['channel'] = self._get_text(vsir, ('owner', 'videoOwnerRenderer', 'title'))
rows = try_get(
vsir,
lambda x: x['metadataRowContainer']['metadataRowContainerRenderer']['rows'],
mrr_title = mrr.get('title')
if not mrr_title:
continue
- mrr_title = self._get_text(mrr['title'])
- mrr_contents_text = self._get_text(mrr['contents'][0])
+ mrr_title = self._get_text(mrr, 'title')
+ mrr_contents_text = self._get_text(mrr, ('contents', 0))
if mrr_title == 'License':
info['license'] = mrr_contents_text
elif not multiple_songs:
needs_auth=info['age_limit'] >= 18,
is_unlisted=None if is_private is None else is_unlisted)
+ # NOTE(review): the whole annotation/xsrf extraction path is removed
+ # below; 'annotations' is no longer populated by this extractor
- # get xsrf for annotations or comments
- get_annotations = self.get_param('writeannotations', False)
- get_comments = self.get_param('getcomments', False)
- if get_annotations or get_comments:
- xsrf_token = None
- if master_ytcfg:
- xsrf_token = try_get(master_ytcfg, lambda x: x['XSRF_TOKEN'], compat_str)
- if not xsrf_token:
- xsrf_token = self._search_regex(
- r'([\'"])XSRF_TOKEN\1\s*:\s*([\'"])(?P<xsrf_token>(?:(?!\2).)+)\2',
- webpage, 'xsrf token', group='xsrf_token', fatal=False)
-
- # annotations
- if get_annotations:
- invideo_url = get_first(
- player_responses,
- ('annotations', 0, 'playerAnnotationsUrlsRenderer', 'invideoUrl'),
- expected_type=str)
- if xsrf_token and invideo_url:
- xsrf_field_name = None
- if master_ytcfg:
- xsrf_field_name = try_get(master_ytcfg, lambda x: x['XSRF_FIELD_NAME'], compat_str)
- if not xsrf_field_name:
- xsrf_field_name = self._search_regex(
- r'([\'"])XSRF_FIELD_NAME\1\s*:\s*([\'"])(?P<xsrf_field_name>\w+)\2',
- webpage, 'xsrf field name',
- group='xsrf_field_name', default='session_token')
- info['annotations'] = self._download_webpage(
- self._proto_relative_url(invideo_url),
- video_id, note='Downloading annotations',
- errnote='Unable to download video annotations', fatal=False,
- data=urlencode_postdata({xsrf_field_name: xsrf_token}))
-
- if get_comments:
+ if self.get_param('getcomments', False):
info['__post_extractor'] = lambda: self._extract_comments(master_ytcfg, video_id, contents, webpage)
self.mark_watched(video_id, player_responses)
}, {
'url': 'https://www.youtube.com/channel/UCoMdktPbSTixAyNGwb-UYkQ/live',
'info_dict': {
- 'id': 'FMtPN8yp5LU', # This will keep changing
+ 'id': '3yImotZU3tw', # This will keep changing
'ext': 'mp4',
'title': compat_str,
'uploader': 'Sky News',
renderer = self._extract_basic_item_renderer(item)
if not isinstance(renderer, dict):
continue
- title = self._get_text(renderer.get('title'))
+ title = self._get_text(renderer, 'title')
# playlist
playlist_id = renderer.get('playlistId')
# will not work
if skip_channels and '/channels?' in shelf_url:
return
- title = self._get_text(shelf_renderer, lambda x: x['title'])
+ title = self._get_text(shelf_renderer, 'title')
yield self.url_result(shelf_url, video_title=title)
# Shelf may not contain shelf URL, fallback to extraction from content
for entry in self._shelf_entries_from_content(shelf_renderer):
renderer_dict, lambda x: x['privacyDropdownItemRenderer']['isSelected'], bool) or False
if not is_selected:
continue
- label = self._get_text(
- try_get(renderer_dict, lambda x: x['privacyDropdownItemRenderer']['label'], dict) or [])
+ label = self._get_text(renderer_dict, ('privacyDropdownItemRenderer', 'label'))
if label:
badge_labels.add(label.lower())
break
}]
def _real_extract(self, url):
+ # NOTE(review): presumably equivalent to re.match(self._VALID_URL, url)
+ # using the extractor's cached compiled pattern -- verify helper
- mobj = re.match(self._VALID_URL, url)
+ mobj = self._match_valid_url(url)
video_id = mobj.group('id')
playlist_id = mobj.group('playlist_id')
return self.url_result(
return cls._VALID_URL
def _real_extract(self, url):
- qs = compat_parse_qs(compat_urllib_parse_urlparse(url).query)
+ # parse_qs(url) is the new utility that replaces the two-step
+ # compat_urlparse parse (see the import changes at the file head)
+ qs = parse_qs(url)
query = (qs.get('search_query') or qs.get('q'))[0]
self._SEARCH_PARAMS = qs.get('sp', ('',))[0]
return self._get_n_results(query, self._MAX_RESULTS)