]> jfr.im git - yt-dlp.git/blob - yt_dlp/extractor/youtube.py
[youtube] Improve extraction of livestream metadata
[yt-dlp.git] / yt_dlp / extractor / youtube.py
1 # coding: utf-8
2
3 from __future__ import unicode_literals
4
5 import base64
6 import calendar
7 import copy
8 import datetime
9 import hashlib
10 import itertools
11 import json
12 import os.path
13 import random
14 import re
15 import time
16 import traceback
17
18 from .common import InfoExtractor, SearchInfoExtractor
19 from ..compat import (
20 compat_chr,
21 compat_HTTPError,
22 compat_parse_qs,
23 compat_str,
24 compat_urllib_parse_unquote_plus,
25 compat_urllib_parse_urlencode,
26 compat_urllib_parse_urlparse,
27 compat_urlparse,
28 )
29 from ..jsinterp import JSInterpreter
30 from ..utils import (
31 bytes_to_intlist,
32 clean_html,
33 datetime_from_str,
34 dict_get,
35 error_to_compat_str,
36 ExtractorError,
37 float_or_none,
38 format_field,
39 int_or_none,
40 intlist_to_bytes,
41 mimetype2ext,
42 orderedSet,
43 parse_codecs,
44 parse_count,
45 parse_duration,
46 parse_iso8601,
47 qualities,
48 remove_start,
49 smuggle_url,
50 str_or_none,
51 str_to_int,
52 traverse_obj,
53 try_get,
54 unescapeHTML,
55 unified_strdate,
56 unsmuggle_url,
57 update_url_query,
58 url_or_none,
59 urlencode_postdata,
60 urljoin,
61 variadic,
62 )
63
64
def parse_qs(url):
    """Return the query string of *url* parsed into a dict of value lists."""
    parsed = compat_urlparse.urlparse(url)
    return compat_urlparse.parse_qs(parsed.query)
67
68
class YoutubeBaseInfoExtractor(InfoExtractor):
    """Provide base functions for Youtube extractors"""
    # Endpoints of the legacy Google account login flow (the flow itself is
    # broken - see _login - but the constants are kept for reference)
    _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
    _TWOFACTOR_URL = 'https://accounts.google.com/signin/challenge'

    _LOOKUP_URL = 'https://accounts.google.com/_/signin/sl/lookup'
    _CHALLENGE_URL = 'https://accounts.google.com/_/signin/sl/challenge'
    _TFA_URL = 'https://accounts.google.com/_/signin/challenge?hl=en&TL={0}'

    # Top-level youtube.com path components that can never be a channel or
    # user name (used by URL-matching regexes in subclasses)
    _RESERVED_NAMES = (
        r'channel|c|user|browse|playlist|watch|w|v|embed|e|watch_popup|shorts|'
        r'movies|results|shared|hashtag|trending|feed|feeds|oembed|get_video_info|'
        r'storefront|oops|index|account|reporthistory|t/terms|about|upload|signin|logout')

    _NETRC_MACHINE = 'youtube'
    # If True it will raise an error if no login info is provided
    _LOGIN_REQUIRED = False

    # Matches prefixed playlist IDs (PL…, UU…, OLAK5uy_…, …) as well as the
    # special lists RDMM/WL/LL/LM
    _PLAYLIST_ID_RE = r'(?:(?:PL|LL|EC|UU|FL|RD|UL|TL|PU|OLAK5uy_)[0-9A-Za-z-_]{10,}|RDMM|WL|LL|LM)'
88
89 def _login(self):
90 """
91 Attempt to log in to YouTube.
92 True is returned if successful or skipped.
93 False is returned if login failed.
94
95 If _LOGIN_REQUIRED is set and no authentication was provided, an error is raised.
96 """
97
98 def warn(message):
99 self.report_warning(message)
100
101 # username+password login is broken
102 if self._LOGIN_REQUIRED and self.get_param('cookiefile') is None:
103 self.raise_login_required(
104 'Login details are needed to download this content', method='cookies')
105 username, password = self._get_login_info()
106 if username:
107 warn('Logging in using username and password is broken. %s' % self._LOGIN_HINTS['cookies'])
108 return
109
110 # Everything below this is broken!
111 r'''
112 # No authentication to be performed
113 if username is None:
114 if self._LOGIN_REQUIRED and self.get_param('cookiefile') is None:
115 raise ExtractorError('No login info available, needed for using %s.' % self.IE_NAME, expected=True)
116 # if self.get_param('cookiefile'): # TODO remove 'and False' later - too many people using outdated cookies and open issues, remind them.
117 # self.to_screen('[Cookies] Reminder - Make sure to always use up to date cookies!')
118 return True
119
120 login_page = self._download_webpage(
121 self._LOGIN_URL, None,
122 note='Downloading login page',
123 errnote='unable to fetch login page', fatal=False)
124 if login_page is False:
125 return
126
127 login_form = self._hidden_inputs(login_page)
128
129 def req(url, f_req, note, errnote):
130 data = login_form.copy()
131 data.update({
132 'pstMsg': 1,
133 'checkConnection': 'youtube',
134 'checkedDomains': 'youtube',
135 'hl': 'en',
136 'deviceinfo': '[null,null,null,[],null,"US",null,null,[],"GlifWebSignIn",null,[null,null,[]]]',
137 'f.req': json.dumps(f_req),
138 'flowName': 'GlifWebSignIn',
139 'flowEntry': 'ServiceLogin',
140 # TODO: reverse actual botguard identifier generation algo
141 'bgRequest': '["identifier",""]',
142 })
143 return self._download_json(
144 url, None, note=note, errnote=errnote,
145 transform_source=lambda s: re.sub(r'^[^[]*', '', s),
146 fatal=False,
147 data=urlencode_postdata(data), headers={
148 'Content-Type': 'application/x-www-form-urlencoded;charset=utf-8',
149 'Google-Accounts-XSRF': 1,
150 })
151
152 lookup_req = [
153 username,
154 None, [], None, 'US', None, None, 2, False, True,
155 [
156 None, None,
157 [2, 1, None, 1,
158 'https://accounts.google.com/ServiceLogin?passive=true&continue=https%3A%2F%2Fwww.youtube.com%2Fsignin%3Fnext%3D%252F%26action_handle_signin%3Dtrue%26hl%3Den%26app%3Ddesktop%26feature%3Dsign_in_button&hl=en&service=youtube&uilel=3&requestPath=%2FServiceLogin&Page=PasswordSeparationSignIn',
159 None, [], 4],
160 1, [None, None, []], None, None, None, True
161 ],
162 username,
163 ]
164
165 lookup_results = req(
166 self._LOOKUP_URL, lookup_req,
167 'Looking up account info', 'Unable to look up account info')
168
169 if lookup_results is False:
170 return False
171
172 user_hash = try_get(lookup_results, lambda x: x[0][2], compat_str)
173 if not user_hash:
174 warn('Unable to extract user hash')
175 return False
176
177 challenge_req = [
178 user_hash,
179 None, 1, None, [1, None, None, None, [password, None, True]],
180 [
181 None, None, [2, 1, None, 1, 'https://accounts.google.com/ServiceLogin?passive=true&continue=https%3A%2F%2Fwww.youtube.com%2Fsignin%3Fnext%3D%252F%26action_handle_signin%3Dtrue%26hl%3Den%26app%3Ddesktop%26feature%3Dsign_in_button&hl=en&service=youtube&uilel=3&requestPath=%2FServiceLogin&Page=PasswordSeparationSignIn', None, [], 4],
182 1, [None, None, []], None, None, None, True
183 ]]
184
185 challenge_results = req(
186 self._CHALLENGE_URL, challenge_req,
187 'Logging in', 'Unable to log in')
188
189 if challenge_results is False:
190 return
191
192 login_res = try_get(challenge_results, lambda x: x[0][5], list)
193 if login_res:
194 login_msg = try_get(login_res, lambda x: x[5], compat_str)
195 warn(
196 'Unable to login: %s' % 'Invalid password'
197 if login_msg == 'INCORRECT_ANSWER_ENTERED' else login_msg)
198 return False
199
200 res = try_get(challenge_results, lambda x: x[0][-1], list)
201 if not res:
202 warn('Unable to extract result entry')
203 return False
204
205 login_challenge = try_get(res, lambda x: x[0][0], list)
206 if login_challenge:
207 challenge_str = try_get(login_challenge, lambda x: x[2], compat_str)
208 if challenge_str == 'TWO_STEP_VERIFICATION':
209 # SEND_SUCCESS - TFA code has been successfully sent to phone
210 # QUOTA_EXCEEDED - reached the limit of TFA codes
211 status = try_get(login_challenge, lambda x: x[5], compat_str)
212 if status == 'QUOTA_EXCEEDED':
213 warn('Exceeded the limit of TFA codes, try later')
214 return False
215
216 tl = try_get(challenge_results, lambda x: x[1][2], compat_str)
217 if not tl:
218 warn('Unable to extract TL')
219 return False
220
221 tfa_code = self._get_tfa_info('2-step verification code')
222
223 if not tfa_code:
224 warn(
225 'Two-factor authentication required. Provide it either interactively or with --twofactor <code>'
226 '(Note that only TOTP (Google Authenticator App) codes work at this time.)')
227 return False
228
229 tfa_code = remove_start(tfa_code, 'G-')
230
231 tfa_req = [
232 user_hash, None, 2, None,
233 [
234 9, None, None, None, None, None, None, None,
235 [None, tfa_code, True, 2]
236 ]]
237
238 tfa_results = req(
239 self._TFA_URL.format(tl), tfa_req,
240 'Submitting TFA code', 'Unable to submit TFA code')
241
242 if tfa_results is False:
243 return False
244
245 tfa_res = try_get(tfa_results, lambda x: x[0][5], list)
246 if tfa_res:
247 tfa_msg = try_get(tfa_res, lambda x: x[5], compat_str)
248 warn(
249 'Unable to finish TFA: %s' % 'Invalid TFA code'
250 if tfa_msg == 'INCORRECT_ANSWER_ENTERED' else tfa_msg)
251 return False
252
253 check_cookie_url = try_get(
254 tfa_results, lambda x: x[0][-1][2], compat_str)
255 else:
256 CHALLENGES = {
257 'LOGIN_CHALLENGE': "This device isn't recognized. For your security, Google wants to make sure it's really you.",
258 'USERNAME_RECOVERY': 'Please provide additional information to aid in the recovery process.',
259 'REAUTH': "There is something unusual about your activity. For your security, Google wants to make sure it's really you.",
260 }
261 challenge = CHALLENGES.get(
262 challenge_str,
263 '%s returned error %s.' % (self.IE_NAME, challenge_str))
264 warn('%s\nGo to https://accounts.google.com/, login and solve a challenge.' % challenge)
265 return False
266 else:
267 check_cookie_url = try_get(res, lambda x: x[2], compat_str)
268
269 if not check_cookie_url:
270 warn('Unable to extract CheckCookie URL')
271 return False
272
273 check_cookie_results = self._download_webpage(
274 check_cookie_url, None, 'Checking cookie', fatal=False)
275
276 if check_cookie_results is False:
277 return False
278
279 if 'https://myaccount.google.com/' not in check_cookie_results:
280 warn('Unable to log in')
281 return False
282
283 return True
284 '''
285
286 def _initialize_consent(self):
287 cookies = self._get_cookies('https://www.youtube.com/')
288 if cookies.get('__Secure-3PSID'):
289 return
290 consent_id = None
291 consent = cookies.get('CONSENT')
292 if consent:
293 if 'YES' in consent.value:
294 return
295 consent_id = self._search_regex(
296 r'PENDING\+(\d+)', consent.value, 'consent', default=None)
297 if not consent_id:
298 consent_id = random.randint(100, 999)
299 self._set_cookie('.youtube.com', 'CONSENT', 'YES+cb.20210328-17-p0.en+FX+%s' % consent_id)
300
301 def _real_initialize(self):
302 self._initialize_consent()
303 if self._downloader is None:
304 return
305 if not self._login():
306 return
307
    # Regexes locating the JSON blobs YouTube embeds in watch/browse pages
    _YT_INITIAL_DATA_RE = r'(?:window\s*\[\s*["\']ytInitialData["\']\s*\]|ytInitialData)\s*=\s*({.+?})\s*;'
    _YT_INITIAL_PLAYER_RESPONSE_RE = r'ytInitialPlayerResponse\s*=\s*({.+?})\s*;'
    _YT_INITIAL_BOUNDARY_RE = r'(?:var\s+meta|</script|\n)'

    # Built-in fallback ytcfg values per Innertube client, used when the page
    # does not supply its own ytcfg (see _get_default_ytcfg)
    _YT_DEFAULT_YTCFGS = {
        'WEB': {
            'INNERTUBE_API_VERSION': 'v1',
            'INNERTUBE_CLIENT_NAME': 'WEB',
            'INNERTUBE_CLIENT_VERSION': '2.20210622.10.00',
            'INNERTUBE_API_KEY': 'AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8',
            'INNERTUBE_CONTEXT': {
                'client': {
                    'clientName': 'WEB',
                    'clientVersion': '2.20210622.10.00',
                    'hl': 'en',
                }
            },
            'INNERTUBE_CONTEXT_CLIENT_NAME': 1
        },
        'WEB_REMIX': {
            'INNERTUBE_API_VERSION': 'v1',
            'INNERTUBE_CLIENT_NAME': 'WEB_REMIX',
            'INNERTUBE_CLIENT_VERSION': '1.20210621.00.00',
            'INNERTUBE_API_KEY': 'AIzaSyC9XL3ZjWddXya6X74dJoCTL-WEYFDNX30',
            'INNERTUBE_CONTEXT': {
                'client': {
                    'clientName': 'WEB_REMIX',
                    'clientVersion': '1.20210621.00.00',
                    'hl': 'en',
                }
            },
            'INNERTUBE_CONTEXT_CLIENT_NAME': 67
        },
        'WEB_EMBEDDED_PLAYER': {
            'INNERTUBE_API_VERSION': 'v1',
            'INNERTUBE_CLIENT_NAME': 'WEB_EMBEDDED_PLAYER',
            'INNERTUBE_CLIENT_VERSION': '1.20210620.0.1',
            'INNERTUBE_API_KEY': 'AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8',
            'INNERTUBE_CONTEXT': {
                'client': {
                    'clientName': 'WEB_EMBEDDED_PLAYER',
                    'clientVersion': '1.20210620.0.1',
                    'hl': 'en',
                }
            },
            'INNERTUBE_CONTEXT_CLIENT_NAME': 56
        },
        'ANDROID': {
            'INNERTUBE_API_VERSION': 'v1',
            'INNERTUBE_CLIENT_NAME': 'ANDROID',
            'INNERTUBE_CLIENT_VERSION': '16.20',
            'INNERTUBE_API_KEY': 'AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8',
            'INNERTUBE_CONTEXT': {
                'client': {
                    'clientName': 'ANDROID',
                    'clientVersion': '16.20',
                    'hl': 'en',
                }
            },
            'INNERTUBE_CONTEXT_CLIENT_NAME': 3
        },
        'ANDROID_EMBEDDED_PLAYER': {
            'INNERTUBE_API_VERSION': 'v1',
            'INNERTUBE_CLIENT_NAME': 'ANDROID_EMBEDDED_PLAYER',
            'INNERTUBE_CLIENT_VERSION': '16.20',
            'INNERTUBE_API_KEY': 'AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8',
            'INNERTUBE_CONTEXT': {
                'client': {
                    'clientName': 'ANDROID_EMBEDDED_PLAYER',
                    'clientVersion': '16.20',
                    'hl': 'en',
                }
            },
            'INNERTUBE_CONTEXT_CLIENT_NAME': 55
        },
        'ANDROID_MUSIC': {
            'INNERTUBE_API_VERSION': 'v1',
            'INNERTUBE_CLIENT_NAME': 'ANDROID_MUSIC',
            'INNERTUBE_CLIENT_VERSION': '4.32',
            'INNERTUBE_API_KEY': 'AIzaSyC9XL3ZjWddXya6X74dJoCTL-WEYFDNX30',
            'INNERTUBE_CONTEXT': {
                'client': {
                    'clientName': 'ANDROID_MUSIC',
                    'clientVersion': '4.32',
                    'hl': 'en',
                }
            },
            'INNERTUBE_CONTEXT_CLIENT_NAME': 21
        },
        'IOS': {
            'INNERTUBE_API_VERSION': 'v1',
            'INNERTUBE_CLIENT_NAME': 'IOS',
            'INNERTUBE_CLIENT_VERSION': '16.20',
            'INNERTUBE_API_KEY': 'AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8',
            'INNERTUBE_CONTEXT': {
                'client': {
                    'clientName': 'IOS',
                    'clientVersion': '16.20',
                    'hl': 'en',
                }
            },
            'INNERTUBE_CONTEXT_CLIENT_NAME': 5
        },
        'IOS_MUSIC': {
            'INNERTUBE_API_VERSION': 'v1',
            'INNERTUBE_CLIENT_NAME': 'IOS_MUSIC',
            'INNERTUBE_CLIENT_VERSION': '4.32',
            'INNERTUBE_API_KEY': 'AIzaSyDK3iBpDP9nHVTk2qL73FLJICfOC3c51Og',
            'INNERTUBE_CONTEXT': {
                'client': {
                    'clientName': 'IOS_MUSIC',
                    'clientVersion': '4.32',
                    'hl': 'en',
                }
            },
            'INNERTUBE_CONTEXT_CLIENT_NAME': 26
        },
        'IOS_MESSAGES_EXTENSION': {
            'INNERTUBE_API_VERSION': 'v1',
            'INNERTUBE_CLIENT_NAME': 'IOS_MESSAGES_EXTENSION',
            'INNERTUBE_CLIENT_VERSION': '16.20',
            'INNERTUBE_API_KEY': 'AIzaSyDCU8hByM-4DrUqRUYnGn-3llEO78bcxq8',
            'INNERTUBE_CONTEXT': {
                'client': {
                    'clientName': 'IOS_MESSAGES_EXTENSION',
                    'clientVersion': '16.20',
                    'hl': 'en',
                }
            },
            'INNERTUBE_CONTEXT_CLIENT_NAME': 66
        }
    }

    # API hostname per client; clients not listed fall back to the WEB host
    # (see _get_innertube_host)
    _YT_DEFAULT_INNERTUBE_HOSTS = {
        'DIRECT': 'youtubei.googleapis.com',
        'WEB': 'www.youtube.com',
        'WEB_REMIX': 'music.youtube.com',
        'ANDROID_MUSIC': 'music.youtube.com'
    }

    # clients starting with _ cannot be explicitly requested by the user
    _YT_CLIENTS = {
        'web': 'WEB',
        'web_music': 'WEB_REMIX',
        '_web_embedded': 'WEB_EMBEDDED_PLAYER',
        '_web_agegate': 'TVHTML5',
        'android': 'ANDROID',
        'android_music': 'ANDROID_MUSIC',
        '_android_embedded': 'ANDROID_EMBEDDED_PLAYER',
        '_android_agegate': 'ANDROID',
        'ios': 'IOS',
        'ios_music': 'IOS_MUSIC',
        '_ios_embedded': 'IOS_MESSAGES_EXTENSION',
        '_ios_agegate': 'IOS'
    }
464
465 def _get_default_ytcfg(self, client='WEB'):
466 if client in self._YT_DEFAULT_YTCFGS:
467 return copy.deepcopy(self._YT_DEFAULT_YTCFGS[client])
468 self.write_debug(f'INNERTUBE default client {client} does not exist - falling back to WEB client.')
469 return copy.deepcopy(self._YT_DEFAULT_YTCFGS['WEB'])
470
471 def _get_innertube_host(self, client='WEB'):
472 return dict_get(self._YT_DEFAULT_INNERTUBE_HOSTS, (client, 'WEB'))
473
474 def _ytcfg_get_safe(self, ytcfg, getter, expected_type=None, default_client='WEB'):
475 # try_get but with fallback to default ytcfg client values when present
476 _func = lambda y: try_get(y, getter, expected_type)
477 return _func(ytcfg) or _func(self._get_default_ytcfg(default_client))
478
479 def _extract_client_name(self, ytcfg, default_client='WEB'):
480 return self._ytcfg_get_safe(ytcfg, lambda x: x['INNERTUBE_CLIENT_NAME'], compat_str, default_client)
481
482 @staticmethod
483 def _extract_session_index(*data):
484 for ytcfg in data:
485 session_index = int_or_none(try_get(ytcfg, lambda x: x['SESSION_INDEX']))
486 if session_index is not None:
487 return session_index
488
489 def _extract_client_version(self, ytcfg, default_client='WEB'):
490 return self._ytcfg_get_safe(ytcfg, lambda x: x['INNERTUBE_CLIENT_VERSION'], compat_str, default_client)
491
492 def _extract_api_key(self, ytcfg=None, default_client='WEB'):
493 return self._ytcfg_get_safe(ytcfg, lambda x: x['INNERTUBE_API_KEY'], compat_str, default_client)
494
495 def _extract_context(self, ytcfg=None, default_client='WEB'):
496 _get_context = lambda y: try_get(y, lambda x: x['INNERTUBE_CONTEXT'], dict)
497 context = _get_context(ytcfg)
498 if context:
499 return context
500
501 context = _get_context(self._get_default_ytcfg(default_client))
502 if not ytcfg:
503 return context
504
505 # Recreate the client context (required)
506 context['client'].update({
507 'clientVersion': self._extract_client_version(ytcfg, default_client),
508 'clientName': self._extract_client_name(ytcfg, default_client),
509 })
510 visitor_data = try_get(ytcfg, lambda x: x['VISITOR_DATA'], compat_str)
511 if visitor_data:
512 context['client']['visitorData'] = visitor_data
513 return context
514
515 def _generate_sapisidhash_header(self, origin='https://www.youtube.com'):
516 # Sometimes SAPISID cookie isn't present but __Secure-3PAPISID is.
517 # See: https://github.com/yt-dlp/yt-dlp/issues/393
518 yt_cookies = self._get_cookies('https://www.youtube.com')
519 sapisid_cookie = dict_get(
520 yt_cookies, ('__Secure-3PAPISID', 'SAPISID'))
521 if sapisid_cookie is None:
522 return
523 time_now = round(time.time())
524 # SAPISID cookie is required if not already present
525 if not yt_cookies.get('SAPISID'):
526 self._set_cookie(
527 '.youtube.com', 'SAPISID', sapisid_cookie.value, secure=True, expire_time=time_now + 3600)
528 # SAPISIDHASH algorithm from https://stackoverflow.com/a/32065323
529 sapisidhash = hashlib.sha1(
530 f'{time_now} {sapisid_cookie.value} {origin}'.encode('utf-8')).hexdigest()
531 return f'SAPISIDHASH {time_now}_{sapisidhash}'
532
533 def _call_api(self, ep, query, video_id, fatal=True, headers=None,
534 note='Downloading API JSON', errnote='Unable to download API page',
535 context=None, api_key=None, api_hostname=None, default_client='WEB'):
536
537 data = {'context': context} if context else {'context': self._extract_context(default_client=default_client)}
538 data.update(query)
539 real_headers = self.generate_api_headers(default_client=default_client)
540 real_headers.update({'content-type': 'application/json'})
541 if headers:
542 real_headers.update(headers)
543 return self._download_json(
544 'https://%s/youtubei/v1/%s' % (api_hostname or self._get_innertube_host(default_client), ep),
545 video_id=video_id, fatal=fatal, note=note, errnote=errnote,
546 data=json.dumps(data).encode('utf8'), headers=real_headers,
547 query={'key': api_key or self._extract_api_key()})
548
549 def extract_yt_initial_data(self, video_id, webpage):
550 return self._parse_json(
551 self._search_regex(
552 (r'%s\s*%s' % (self._YT_INITIAL_DATA_RE, self._YT_INITIAL_BOUNDARY_RE),
553 self._YT_INITIAL_DATA_RE), webpage, 'yt initial data'),
554 video_id)
555
556 def _extract_identity_token(self, webpage, item_id):
557 if not webpage:
558 return None
559 ytcfg = self.extract_ytcfg(item_id, webpage)
560 if ytcfg:
561 token = try_get(ytcfg, lambda x: x['ID_TOKEN'], compat_str)
562 if token:
563 return token
564 return self._search_regex(
565 r'\bID_TOKEN["\']\s*:\s*["\'](.+?)["\']', webpage,
566 'identity token', default=None)
567
568 @staticmethod
569 def _extract_account_syncid(*args):
570 """
571 Extract syncId required to download private playlists of secondary channels
572 @params response and/or ytcfg
573 """
574 for data in args:
575 # ytcfg includes channel_syncid if on secondary channel
576 delegated_sid = try_get(data, lambda x: x['DELEGATED_SESSION_ID'], compat_str)
577 if delegated_sid:
578 return delegated_sid
579 sync_ids = (try_get(
580 data, (lambda x: x['responseContext']['mainAppWebResponseContext']['datasyncId'],
581 lambda x: x['DATASYNC_ID']), compat_str) or '').split("||")
582 if len(sync_ids) >= 2 and sync_ids[1]:
583 # datasyncid is of the form "channel_syncid||user_syncid" for secondary channel
584 # and just "user_syncid||" for primary channel. We only want the channel_syncid
585 return sync_ids[0]
586
587 def extract_ytcfg(self, video_id, webpage):
588 if not webpage:
589 return {}
590 return self._parse_json(
591 self._search_regex(
592 r'ytcfg\.set\s*\(\s*({.+?})\s*\)\s*;', webpage, 'ytcfg',
593 default='{}'), video_id, fatal=False) or {}
594
595 def generate_api_headers(
596 self, ytcfg=None, identity_token=None, account_syncid=None,
597 visitor_data=None, api_hostname=None, default_client='WEB', session_index=None):
598 origin = 'https://' + (api_hostname if api_hostname else self._get_innertube_host(default_client))
599 headers = {
600 'X-YouTube-Client-Name': compat_str(
601 self._ytcfg_get_safe(ytcfg, lambda x: x['INNERTUBE_CONTEXT_CLIENT_NAME'], default_client=default_client)),
602 'X-YouTube-Client-Version': self._extract_client_version(ytcfg, default_client),
603 'Origin': origin
604 }
605 if not visitor_data and ytcfg:
606 visitor_data = try_get(
607 self._extract_context(ytcfg, default_client), lambda x: x['client']['visitorData'], compat_str)
608 if identity_token:
609 headers['X-Youtube-Identity-Token'] = identity_token
610 if account_syncid:
611 headers['X-Goog-PageId'] = account_syncid
612 if session_index is None and ytcfg:
613 session_index = self._extract_session_index(ytcfg)
614 if account_syncid or session_index is not None:
615 headers['X-Goog-AuthUser'] = session_index if session_index is not None else 0
616 if visitor_data:
617 headers['X-Goog-Visitor-Id'] = visitor_data
618 auth = self._generate_sapisidhash_header(origin)
619 if auth is not None:
620 headers['Authorization'] = auth
621 headers['X-Origin'] = origin
622 return headers
623
624 @staticmethod
625 def _build_api_continuation_query(continuation, ctp=None):
626 query = {
627 'continuation': continuation
628 }
629 # TODO: Inconsistency with clickTrackingParams.
630 # Currently we have a fixed ctp contained within context (from ytcfg)
631 # and a ctp in root query for continuation.
632 if ctp:
633 query['clickTracking'] = {'clickTrackingParams': ctp}
634 return query
635
636 @classmethod
637 def _extract_next_continuation_data(cls, renderer):
638 next_continuation = try_get(
639 renderer, (lambda x: x['continuations'][0]['nextContinuationData'],
640 lambda x: x['continuation']['reloadContinuationData']), dict)
641 if not next_continuation:
642 return
643 continuation = next_continuation.get('continuation')
644 if not continuation:
645 return
646 ctp = next_continuation.get('clickTrackingParams')
647 return cls._build_api_continuation_query(continuation, ctp)
648
649 @classmethod
650 def _extract_continuation_ep_data(cls, continuation_ep: dict):
651 if isinstance(continuation_ep, dict):
652 continuation = try_get(
653 continuation_ep, lambda x: x['continuationCommand']['token'], compat_str)
654 if not continuation:
655 return
656 ctp = continuation_ep.get('clickTrackingParams')
657 return cls._build_api_continuation_query(continuation, ctp)
658
659 @classmethod
660 def _extract_continuation(cls, renderer):
661 next_continuation = cls._extract_next_continuation_data(renderer)
662 if next_continuation:
663 return next_continuation
664
665 contents = []
666 for key in ('contents', 'items'):
667 contents.extend(try_get(renderer, lambda x: x[key], list) or [])
668
669 for content in contents:
670 if not isinstance(content, dict):
671 continue
672 continuation_ep = try_get(
673 content, (lambda x: x['continuationItemRenderer']['continuationEndpoint'],
674 lambda x: x['continuationItemRenderer']['button']['buttonRenderer']['command']),
675 dict)
676 continuation = cls._extract_continuation_ep_data(continuation_ep)
677 if continuation:
678 return continuation
679
680 @classmethod
681 def _extract_alerts(cls, data):
682 for alert_dict in try_get(data, lambda x: x['alerts'], list) or []:
683 if not isinstance(alert_dict, dict):
684 continue
685 for alert in alert_dict.values():
686 alert_type = alert.get('type')
687 if not alert_type:
688 continue
689 message = cls._get_text(alert.get('text'))
690 if message:
691 yield alert_type, message
692
693 def _report_alerts(self, alerts, expected=True):
694 errors = []
695 warnings = []
696 for alert_type, alert_message in alerts:
697 if alert_type.lower() == 'error':
698 errors.append([alert_type, alert_message])
699 else:
700 warnings.append([alert_type, alert_message])
701
702 for alert_type, alert_message in (warnings + errors[:-1]):
703 self.report_warning('YouTube said: %s - %s' % (alert_type, alert_message))
704 if errors:
705 raise ExtractorError('YouTube said: %s' % errors[-1][1], expected=expected)
706
707 def _extract_and_report_alerts(self, data, *args, **kwargs):
708 return self._report_alerts(self._extract_alerts(data), *args, **kwargs)
709
710 def _extract_badges(self, renderer: dict):
711 badges = set()
712 for badge in try_get(renderer, lambda x: x['badges'], list) or []:
713 label = try_get(badge, lambda x: x['metadataBadgeRenderer']['label'], compat_str)
714 if label:
715 badges.add(label.lower())
716 return badges
717
    @staticmethod
    def _get_text(data, getter=None, max_runs=None):
        # Extract human-readable text from a YouTube "text object", which is
        # either {'simpleText': ...} or {'runs': [{'text': ...}, ...]}.
        # *getter* (one callable or an iterable of callables) optionally drills
        # into *data* first; *max_runs* caps how many runs are joined.
        # NOTE(review): assumes variadic(None) yields a one-element sequence
        # containing None, so the getter=None case falls through to data itself
        # - confirm against utils.variadic
        for get in variadic(getter):
            d = try_get(data, get) if get is not None else data
            text = try_get(d, lambda x: x['simpleText'], compat_str)
            if text:
                return text
            runs = try_get(d, lambda x: x['runs'], list) or []
            if not runs and isinstance(d, list):
                # d may itself already be a list of runs
                runs = d

            def get_runs(runs):
                # missing/non-string 'text' entries contribute an empty string
                for run in runs[:min(len(runs), max_runs or len(runs))]:
                    yield try_get(run, lambda x: x['text'], compat_str) or ''

            text = ''.join(get_runs(runs))
            if text:
                return text
736
    def _extract_response(self, item_id, query, note='Downloading API JSON', headers=None,
                          ytcfg=None, check_get_keys=None, ep='browse', fatal=True, api_hostname=None,
                          default_client='WEB'):
        # Call the Innertube API via _call_api, retrying (up to the
        # 'extractor_retries' param, default 3) on intermittent HTTP
        # 500/503/404 errors and on responses missing every key in
        # *check_get_keys*. Returns the parsed JSON response, or None when
        # fatal=False and the request ultimately failed.
        response = None
        last_error = None
        count = -1
        retries = self.get_param('extractor_retries', 3)
        if check_get_keys is None:
            check_get_keys = []
        while count < retries:
            count += 1
            if last_error:
                self.report_warning('%s. Retrying ...' % last_error)
            try:
                response = self._call_api(
                    ep=ep, fatal=True, headers=headers,
                    video_id=item_id, query=query,
                    context=self._extract_context(ytcfg, default_client),
                    api_key=self._extract_api_key(ytcfg, default_client),
                    api_hostname=api_hostname, default_client=default_client,
                    note='%s%s' % (note, ' (retry #%d)' % count if count else ''))
            except ExtractorError as e:
                if isinstance(e.cause, compat_HTTPError) and e.cause.code in (500, 503, 404):
                    # Downloading page may result in intermittent 5xx HTTP error
                    # Sometimes a 404 is also received. See: https://github.com/ytdl-org/youtube-dl/issues/28289
                    last_error = 'HTTP Error %s' % e.cause.code
                    if count < retries:
                        continue
                if fatal:
                    raise
                else:
                    self.report_warning(error_to_compat_str(e))
                    return
            else:
                # Youtube may send alerts if there was an issue with the continuation page
                try:
                    self._extract_and_report_alerts(response, expected=False)
                except ExtractorError as e:
                    if fatal:
                        raise
                    self.report_warning(error_to_compat_str(e))
                    return
                if not check_get_keys or dict_get(response, check_get_keys):
                    break
                # Youtube sometimes sends incomplete data
                # See: https://github.com/ytdl-org/youtube-dl/issues/28194
                last_error = 'Incomplete data received'
                if count >= retries:
                    if fatal:
                        raise ExtractorError(last_error)
                    else:
                        self.report_warning(last_error)
                        return
        return response
792
793 @staticmethod
794 def is_music_url(url):
795 return re.match(r'https?://music\.youtube\.com/', url) is not None
796
797 def _extract_video(self, renderer):
798 video_id = renderer.get('videoId')
799 title = self._get_text(renderer.get('title'))
800 description = self._get_text(renderer.get('descriptionSnippet'))
801 duration = parse_duration(self._get_text(renderer.get('lengthText')))
802 view_count_text = self._get_text(renderer.get('viewCountText')) or ''
803 view_count = str_to_int(self._search_regex(
804 r'^([\d,]+)', re.sub(r'\s', '', view_count_text),
805 'view count', default=None))
806
807 uploader = self._get_text(renderer, (lambda x: x['ownerText'], lambda x: x['shortBylineText']))
808
809 return {
810 '_type': 'url',
811 'ie_key': YoutubeIE.ie_key(),
812 'id': video_id,
813 'url': video_id,
814 'title': title,
815 'description': description,
816 'duration': duration,
817 'view_count': view_count,
818 'uploader': uploader,
819 }
820
821
822 class YoutubeIE(YoutubeBaseInfoExtractor):
823 IE_DESC = 'YouTube.com'
824 _INVIDIOUS_SITES = (
825 # invidious-redirect websites
826 r'(?:www\.)?redirect\.invidious\.io',
827 r'(?:(?:www|dev)\.)?invidio\.us',
828 # Invidious instances taken from https://github.com/iv-org/documentation/blob/master/Invidious-Instances.md
829 r'(?:www\.)?invidious\.pussthecat\.org',
830 r'(?:www\.)?invidious\.zee\.li',
831 r'(?:www\.)?invidious\.ethibox\.fr',
832 r'(?:www\.)?invidious\.3o7z6yfxhbw7n3za4rss6l434kmv55cgw2vuziwuigpwegswvwzqipyd\.onion',
833 # youtube-dl invidious instances list
834 r'(?:(?:www|no)\.)?invidiou\.sh',
835 r'(?:(?:www|fi)\.)?invidious\.snopyta\.org',
836 r'(?:www\.)?invidious\.kabi\.tk',
837 r'(?:www\.)?invidious\.mastodon\.host',
838 r'(?:www\.)?invidious\.zapashcanon\.fr',
839 r'(?:www\.)?(?:invidious(?:-us)?|piped)\.kavin\.rocks',
840 r'(?:www\.)?invidious\.tinfoil-hat\.net',
841 r'(?:www\.)?invidious\.himiko\.cloud',
842 r'(?:www\.)?invidious\.reallyancient\.tech',
843 r'(?:www\.)?invidious\.tube',
844 r'(?:www\.)?invidiou\.site',
845 r'(?:www\.)?invidious\.site',
846 r'(?:www\.)?invidious\.xyz',
847 r'(?:www\.)?invidious\.nixnet\.xyz',
848 r'(?:www\.)?invidious\.048596\.xyz',
849 r'(?:www\.)?invidious\.drycat\.fr',
850 r'(?:www\.)?inv\.skyn3t\.in',
851 r'(?:www\.)?tube\.poal\.co',
852 r'(?:www\.)?tube\.connect\.cafe',
853 r'(?:www\.)?vid\.wxzm\.sx',
854 r'(?:www\.)?vid\.mint\.lgbt',
855 r'(?:www\.)?vid\.puffyan\.us',
856 r'(?:www\.)?yewtu\.be',
857 r'(?:www\.)?yt\.elukerio\.org',
858 r'(?:www\.)?yt\.lelux\.fi',
859 r'(?:www\.)?invidious\.ggc-project\.de',
860 r'(?:www\.)?yt\.maisputain\.ovh',
861 r'(?:www\.)?ytprivate\.com',
862 r'(?:www\.)?invidious\.13ad\.de',
863 r'(?:www\.)?invidious\.toot\.koeln',
864 r'(?:www\.)?invidious\.fdn\.fr',
865 r'(?:www\.)?watch\.nettohikari\.com',
866 r'(?:www\.)?invidious\.namazso\.eu',
867 r'(?:www\.)?invidious\.silkky\.cloud',
868 r'(?:www\.)?invidious\.exonip\.de',
869 r'(?:www\.)?invidious\.riverside\.rocks',
870 r'(?:www\.)?invidious\.blamefran\.net',
871 r'(?:www\.)?invidious\.moomoo\.de',
872 r'(?:www\.)?ytb\.trom\.tf',
873 r'(?:www\.)?yt\.cyberhost\.uk',
874 r'(?:www\.)?kgg2m7yk5aybusll\.onion',
875 r'(?:www\.)?qklhadlycap4cnod\.onion',
876 r'(?:www\.)?axqzx4s6s54s32yentfqojs3x5i7faxza6xo3ehd4bzzsg2ii4fv2iid\.onion',
877 r'(?:www\.)?c7hqkpkpemu6e7emz5b4vyz7idjgdvgaaa3dyimmeojqbgpea3xqjoid\.onion',
878 r'(?:www\.)?fz253lmuao3strwbfbmx46yu7acac2jz27iwtorgmbqlkurlclmancad\.onion',
879 r'(?:www\.)?invidious\.l4qlywnpwqsluw65ts7md3khrivpirse744un3x7mlskqauz5pyuzgqd\.onion',
880 r'(?:www\.)?owxfohz4kjyv25fvlqilyxast7inivgiktls3th44jhk3ej3i7ya\.b32\.i2p',
881 r'(?:www\.)?4l2dgddgsrkf2ous66i6seeyi6etzfgrue332grh2n7madpwopotugyd\.onion',
882 r'(?:www\.)?w6ijuptxiku4xpnnaetxvnkc5vqcdu7mgns2u77qefoixi63vbvnpnqd\.onion',
883 r'(?:www\.)?kbjggqkzv65ivcqj6bumvp337z6264huv5kpkwuv6gu5yjiskvan7fad\.onion',
884 r'(?:www\.)?grwp24hodrefzvjjuccrkw3mjq4tzhaaq32amf33dzpmuxe7ilepcmad\.onion',
885 r'(?:www\.)?hpniueoejy4opn7bc4ftgazyqjoeqwlvh2uiku2xqku6zpoa4bf5ruid\.onion',
886 )
887 _VALID_URL = r"""(?x)^
888 (
889 (?:https?://|//) # http(s):// or protocol-independent URL
890 (?:(?:(?:(?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie|kids)?\.com|
891 (?:www\.)?deturl\.com/www\.youtube\.com|
892 (?:www\.)?pwnyoutube\.com|
893 (?:www\.)?hooktube\.com|
894 (?:www\.)?yourepeat\.com|
895 tube\.majestyc\.net|
896 %(invidious)s|
897 youtube\.googleapis\.com)/ # the various hostnames, with wildcard subdomains
898 (?:.*?\#/)? # handle anchor (#/) redirect urls
899 (?: # the various things that can precede the ID:
900 (?:(?:v|embed|e)/(?!videoseries)) # v/ or embed/ or e/
901 |(?: # or the v= param in all its forms
902 (?:(?:watch|movie)(?:_popup)?(?:\.php)?/?)? # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
903 (?:\?|\#!?) # the params delimiter ? or # or #!
904 (?:.*?[&;])?? # any other preceding param (like /?s=tuff&v=xxxx or ?s=tuff&amp;v=V36LpHqtcDY)
905 v=
906 )
907 ))
908 |(?:
909 youtu\.be| # just youtu.be/xxxx
910 vid\.plus| # or vid.plus/xxxx
911 zwearz\.com/watch| # or zwearz.com/watch/xxxx
912 %(invidious)s
913 )/
914 |(?:www\.)?cleanvideosearch\.com/media/action/yt/watch\?videoId=
915 )
916 )? # all until now is optional -> you can pass the naked ID
917 (?P<id>[0-9A-Za-z_-]{11}) # here is it! the YouTube video ID
918 (?(1).+)? # if we found the ID, everything can follow
919 (?:\#|$)""" % {
920 'invidious': '|'.join(_INVIDIOUS_SITES),
921 }
922 _PLAYER_INFO_RE = (
923 r'/s/player/(?P<id>[a-zA-Z0-9_-]{8,})/player',
924 r'/(?P<id>[a-zA-Z0-9_-]{8,})/player(?:_ias\.vflset(?:/[a-zA-Z]{2,3}_[a-zA-Z]{2,3})?|-plasma-ias-(?:phone|tablet)-[a-z]{2}_[A-Z]{2}\.vflset)/base\.js$',
925 r'\b(?P<id>vfl[a-zA-Z0-9_-]+)\b.*?\.js$',
926 )
927 _formats = {
928 '5': {'ext': 'flv', 'width': 400, 'height': 240, 'acodec': 'mp3', 'abr': 64, 'vcodec': 'h263'},
929 '6': {'ext': 'flv', 'width': 450, 'height': 270, 'acodec': 'mp3', 'abr': 64, 'vcodec': 'h263'},
930 '13': {'ext': '3gp', 'acodec': 'aac', 'vcodec': 'mp4v'},
931 '17': {'ext': '3gp', 'width': 176, 'height': 144, 'acodec': 'aac', 'abr': 24, 'vcodec': 'mp4v'},
932 '18': {'ext': 'mp4', 'width': 640, 'height': 360, 'acodec': 'aac', 'abr': 96, 'vcodec': 'h264'},
933 '22': {'ext': 'mp4', 'width': 1280, 'height': 720, 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264'},
934 '34': {'ext': 'flv', 'width': 640, 'height': 360, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'},
935 '35': {'ext': 'flv', 'width': 854, 'height': 480, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'},
936 # itag 36 videos are either 320x180 (BaW_jenozKc) or 320x240 (__2ABJjxzNo), abr varies as well
937 '36': {'ext': '3gp', 'width': 320, 'acodec': 'aac', 'vcodec': 'mp4v'},
938 '37': {'ext': 'mp4', 'width': 1920, 'height': 1080, 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264'},
939 '38': {'ext': 'mp4', 'width': 4096, 'height': 3072, 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264'},
940 '43': {'ext': 'webm', 'width': 640, 'height': 360, 'acodec': 'vorbis', 'abr': 128, 'vcodec': 'vp8'},
941 '44': {'ext': 'webm', 'width': 854, 'height': 480, 'acodec': 'vorbis', 'abr': 128, 'vcodec': 'vp8'},
942 '45': {'ext': 'webm', 'width': 1280, 'height': 720, 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8'},
943 '46': {'ext': 'webm', 'width': 1920, 'height': 1080, 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8'},
944 '59': {'ext': 'mp4', 'width': 854, 'height': 480, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'},
945 '78': {'ext': 'mp4', 'width': 854, 'height': 480, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'},
946
947
948 # 3D videos
949 '82': {'ext': 'mp4', 'height': 360, 'format_note': '3D', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -20},
950 '83': {'ext': 'mp4', 'height': 480, 'format_note': '3D', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -20},
951 '84': {'ext': 'mp4', 'height': 720, 'format_note': '3D', 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264', 'preference': -20},
952 '85': {'ext': 'mp4', 'height': 1080, 'format_note': '3D', 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264', 'preference': -20},
953 '100': {'ext': 'webm', 'height': 360, 'format_note': '3D', 'acodec': 'vorbis', 'abr': 128, 'vcodec': 'vp8', 'preference': -20},
954 '101': {'ext': 'webm', 'height': 480, 'format_note': '3D', 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8', 'preference': -20},
955 '102': {'ext': 'webm', 'height': 720, 'format_note': '3D', 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8', 'preference': -20},
956
957 # Apple HTTP Live Streaming
958 '91': {'ext': 'mp4', 'height': 144, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 48, 'vcodec': 'h264', 'preference': -10},
959 '92': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 48, 'vcodec': 'h264', 'preference': -10},
960 '93': {'ext': 'mp4', 'height': 360, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -10},
961 '94': {'ext': 'mp4', 'height': 480, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -10},
962 '95': {'ext': 'mp4', 'height': 720, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 256, 'vcodec': 'h264', 'preference': -10},
963 '96': {'ext': 'mp4', 'height': 1080, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 256, 'vcodec': 'h264', 'preference': -10},
964 '132': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 48, 'vcodec': 'h264', 'preference': -10},
965 '151': {'ext': 'mp4', 'height': 72, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 24, 'vcodec': 'h264', 'preference': -10},
966
967 # DASH mp4 video
968 '133': {'ext': 'mp4', 'height': 240, 'format_note': 'DASH video', 'vcodec': 'h264'},
969 '134': {'ext': 'mp4', 'height': 360, 'format_note': 'DASH video', 'vcodec': 'h264'},
970 '135': {'ext': 'mp4', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'h264'},
971 '136': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'h264'},
972 '137': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'h264'},
973 '138': {'ext': 'mp4', 'format_note': 'DASH video', 'vcodec': 'h264'}, # Height can vary (https://github.com/ytdl-org/youtube-dl/issues/4559)
974 '160': {'ext': 'mp4', 'height': 144, 'format_note': 'DASH video', 'vcodec': 'h264'},
975 '212': {'ext': 'mp4', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'h264'},
976 '264': {'ext': 'mp4', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'h264'},
977 '298': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'h264', 'fps': 60},
978 '299': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'h264', 'fps': 60},
979 '266': {'ext': 'mp4', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'h264'},
980
981 # Dash mp4 audio
982 '139': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 48, 'container': 'm4a_dash'},
983 '140': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 128, 'container': 'm4a_dash'},
984 '141': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 256, 'container': 'm4a_dash'},
985 '256': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'container': 'm4a_dash'},
986 '258': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'container': 'm4a_dash'},
987 '325': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'dtse', 'container': 'm4a_dash'},
988 '328': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'ec-3', 'container': 'm4a_dash'},
989
990 # Dash webm
991 '167': {'ext': 'webm', 'height': 360, 'width': 640, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
992 '168': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
993 '169': {'ext': 'webm', 'height': 720, 'width': 1280, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
994 '170': {'ext': 'webm', 'height': 1080, 'width': 1920, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
995 '218': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
996 '219': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
997 '278': {'ext': 'webm', 'height': 144, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp9'},
998 '242': {'ext': 'webm', 'height': 240, 'format_note': 'DASH video', 'vcodec': 'vp9'},
999 '243': {'ext': 'webm', 'height': 360, 'format_note': 'DASH video', 'vcodec': 'vp9'},
1000 '244': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'vp9'},
1001 '245': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'vp9'},
1002 '246': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'vp9'},
1003 '247': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'vp9'},
1004 '248': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'vp9'},
1005 '271': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'vp9'},
1006 # itag 272 videos are either 3840x2160 (e.g. RtoitU2A-3E) or 7680x4320 (sLprVF6d7Ug)
1007 '272': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'vp9'},
1008 '302': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60},
1009 '303': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60},
1010 '308': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60},
1011 '313': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'vp9'},
1012 '315': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60},
1013
1014 # Dash webm audio
1015 '171': {'ext': 'webm', 'acodec': 'vorbis', 'format_note': 'DASH audio', 'abr': 128},
1016 '172': {'ext': 'webm', 'acodec': 'vorbis', 'format_note': 'DASH audio', 'abr': 256},
1017
1018 # Dash webm audio with opus inside
1019 '249': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 50},
1020 '250': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 70},
1021 '251': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 160},
1022
1023 # RTMP (unnamed)
1024 '_rtmp': {'protocol': 'rtmp'},
1025
1026 # av01 video only formats sometimes served with "unknown" codecs
1027 '394': {'acodec': 'none', 'vcodec': 'av01.0.05M.08'},
1028 '395': {'acodec': 'none', 'vcodec': 'av01.0.05M.08'},
1029 '396': {'acodec': 'none', 'vcodec': 'av01.0.05M.08'},
1030 '397': {'acodec': 'none', 'vcodec': 'av01.0.05M.08'},
1031 }
1032 _SUBTITLE_FORMATS = ('json3', 'srv1', 'srv2', 'srv3', 'ttml', 'vtt')
1033
1034 _AGE_GATE_REASONS = (
1035 'Sign in to confirm your age',
1036 'This video may be inappropriate for some users.',
1037 'Sorry, this content is age-restricted.')
1038
1039 _GEO_BYPASS = False
1040
1041 IE_NAME = 'youtube'
1042 _TESTS = [
1043 {
1044 'url': 'https://www.youtube.com/watch?v=BaW_jenozKc&t=1s&end=9',
1045 'info_dict': {
1046 'id': 'BaW_jenozKc',
1047 'ext': 'mp4',
1048 'title': 'youtube-dl test video "\'/\\ä↭𝕐',
1049 'uploader': 'Philipp Hagemeister',
1050 'uploader_id': 'phihag',
1051 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/phihag',
1052 'channel_id': 'UCLqxVugv74EIW3VWh2NOa3Q',
1053 'channel_url': r're:https?://(?:www\.)?youtube\.com/channel/UCLqxVugv74EIW3VWh2NOa3Q',
1054 'upload_date': '20121002',
1055 'description': 'test chars: "\'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de .',
1056 'categories': ['Science & Technology'],
1057 'tags': ['youtube-dl'],
1058 'duration': 10,
1059 'view_count': int,
1060 'like_count': int,
1061 'dislike_count': int,
1062 'start_time': 1,
1063 'end_time': 9,
1064 }
1065 },
1066 {
1067 'url': '//www.YouTube.com/watch?v=yZIXLfi8CZQ',
1068 'note': 'Embed-only video (#1746)',
1069 'info_dict': {
1070 'id': 'yZIXLfi8CZQ',
1071 'ext': 'mp4',
1072 'upload_date': '20120608',
1073 'title': 'Principal Sexually Assaults A Teacher - Episode 117 - 8th June 2012',
1074 'description': 'md5:09b78bd971f1e3e289601dfba15ca4f7',
1075 'uploader': 'SET India',
1076 'uploader_id': 'setindia',
1077 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/setindia',
1078 'age_limit': 18,
1079 },
1080 'skip': 'Private video',
1081 },
1082 {
1083 'url': 'https://www.youtube.com/watch?v=BaW_jenozKc&v=yZIXLfi8CZQ',
1084 'note': 'Use the first video ID in the URL',
1085 'info_dict': {
1086 'id': 'BaW_jenozKc',
1087 'ext': 'mp4',
1088 'title': 'youtube-dl test video "\'/\\ä↭𝕐',
1089 'uploader': 'Philipp Hagemeister',
1090 'uploader_id': 'phihag',
1091 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/phihag',
1092 'upload_date': '20121002',
1093 'description': 'test chars: "\'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de .',
1094 'categories': ['Science & Technology'],
1095 'tags': ['youtube-dl'],
1096 'duration': 10,
1097 'view_count': int,
1098 'like_count': int,
1099 'dislike_count': int,
1100 },
1101 'params': {
1102 'skip_download': True,
1103 },
1104 },
1105 {
1106 'url': 'https://www.youtube.com/watch?v=a9LDPn-MO4I',
1107 'note': '256k DASH audio (format 141) via DASH manifest',
1108 'info_dict': {
1109 'id': 'a9LDPn-MO4I',
1110 'ext': 'm4a',
1111 'upload_date': '20121002',
1112 'uploader_id': '8KVIDEO',
1113 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/8KVIDEO',
1114 'description': '',
1115 'uploader': '8KVIDEO',
1116 'title': 'UHDTV TEST 8K VIDEO.mp4'
1117 },
1118 'params': {
1119 'youtube_include_dash_manifest': True,
1120 'format': '141',
1121 },
1122 'skip': 'format 141 not served anymore',
1123 },
1124 # DASH manifest with encrypted signature
1125 {
1126 'url': 'https://www.youtube.com/watch?v=IB3lcPjvWLA',
1127 'info_dict': {
1128 'id': 'IB3lcPjvWLA',
1129 'ext': 'm4a',
1130 'title': 'Afrojack, Spree Wilson - The Spark (Official Music Video) ft. Spree Wilson',
1131 'description': 'md5:8f5e2b82460520b619ccac1f509d43bf',
1132 'duration': 244,
1133 'uploader': 'AfrojackVEVO',
1134 'uploader_id': 'AfrojackVEVO',
1135 'upload_date': '20131011',
1136 'abr': 129.495,
1137 },
1138 'params': {
1139 'youtube_include_dash_manifest': True,
1140 'format': '141/bestaudio[ext=m4a]',
1141 },
1142 },
1143 # Normal age-gate video (embed allowed)
1144 {
1145 'url': 'https://youtube.com/watch?v=HtVdAasjOgU',
1146 'info_dict': {
1147 'id': 'HtVdAasjOgU',
1148 'ext': 'mp4',
1149 'title': 'The Witcher 3: Wild Hunt - The Sword Of Destiny Trailer',
1150 'description': r're:(?s).{100,}About the Game\n.*?The Witcher 3: Wild Hunt.{100,}',
1151 'duration': 142,
1152 'uploader': 'The Witcher',
1153 'uploader_id': 'WitcherGame',
1154 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/WitcherGame',
1155 'upload_date': '20140605',
1156 'age_limit': 18,
1157 },
1158 },
1159 # video_info is None (https://github.com/ytdl-org/youtube-dl/issues/4421)
1160 # YouTube Red ad is not captured for creator
1161 {
1162 'url': '__2ABJjxzNo',
1163 'info_dict': {
1164 'id': '__2ABJjxzNo',
1165 'ext': 'mp4',
1166 'duration': 266,
1167 'upload_date': '20100430',
1168 'uploader_id': 'deadmau5',
1169 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/deadmau5',
1170 'creator': 'deadmau5',
1171 'description': 'md5:6cbcd3a92ce1bc676fc4d6ab4ace2336',
1172 'uploader': 'deadmau5',
1173 'title': 'Deadmau5 - Some Chords (HD)',
1174 'alt_title': 'Some Chords',
1175 },
1176 'expected_warnings': [
1177 'DASH manifest missing',
1178 ]
1179 },
1180 # Olympics (https://github.com/ytdl-org/youtube-dl/issues/4431)
1181 {
1182 'url': 'lqQg6PlCWgI',
1183 'info_dict': {
1184 'id': 'lqQg6PlCWgI',
1185 'ext': 'mp4',
1186 'duration': 6085,
1187 'upload_date': '20150827',
1188 'uploader_id': 'olympic',
1189 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/olympic',
1190 'description': 'HO09 - Women - GER-AUS - Hockey - 31 July 2012 - London 2012 Olympic Games',
1191 'uploader': 'Olympics',
1192 'title': 'Hockey - Women - GER-AUS - London 2012 Olympic Games',
1193 },
1194 'params': {
1195 'skip_download': 'requires avconv',
1196 }
1197 },
1198 # Non-square pixels
1199 {
1200 'url': 'https://www.youtube.com/watch?v=_b-2C3KPAM0',
1201 'info_dict': {
1202 'id': '_b-2C3KPAM0',
1203 'ext': 'mp4',
1204 'stretched_ratio': 16 / 9.,
1205 'duration': 85,
1206 'upload_date': '20110310',
1207 'uploader_id': 'AllenMeow',
1208 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/AllenMeow',
1209 'description': 'made by Wacom from Korea | 字幕&加油添醋 by TY\'s Allen | 感謝heylisa00cavey1001同學熱情提供梗及翻譯',
1210 'uploader': '孫ᄋᄅ',
1211 'title': '[A-made] 變態妍字幕版 太妍 我就是這樣的人',
1212 },
1213 },
1214 # url_encoded_fmt_stream_map is empty string
1215 {
1216 'url': 'qEJwOuvDf7I',
1217 'info_dict': {
1218 'id': 'qEJwOuvDf7I',
1219 'ext': 'webm',
1220 'title': 'Обсуждение судебной практики по выборам 14 сентября 2014 года в Санкт-Петербурге',
1221 'description': '',
1222 'upload_date': '20150404',
1223 'uploader_id': 'spbelect',
1224 'uploader': 'Наблюдатели Петербурга',
1225 },
1226 'params': {
1227 'skip_download': 'requires avconv',
1228 },
1229 'skip': 'This live event has ended.',
1230 },
1231 # Extraction from multiple DASH manifests (https://github.com/ytdl-org/youtube-dl/pull/6097)
1232 {
1233 'url': 'https://www.youtube.com/watch?v=FIl7x6_3R5Y',
1234 'info_dict': {
1235 'id': 'FIl7x6_3R5Y',
1236 'ext': 'webm',
1237 'title': 'md5:7b81415841e02ecd4313668cde88737a',
1238 'description': 'md5:116377fd2963b81ec4ce64b542173306',
1239 'duration': 220,
1240 'upload_date': '20150625',
1241 'uploader_id': 'dorappi2000',
1242 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/dorappi2000',
1243 'uploader': 'dorappi2000',
1244 'formats': 'mincount:31',
1245 },
1246 'skip': 'not actual anymore',
1247 },
1248 # DASH manifest with segment_list
1249 {
1250 'url': 'https://www.youtube.com/embed/CsmdDsKjzN8',
1251 'md5': '8ce563a1d667b599d21064e982ab9e31',
1252 'info_dict': {
1253 'id': 'CsmdDsKjzN8',
1254 'ext': 'mp4',
1255 'upload_date': '20150501', # According to '<meta itemprop="datePublished"', but in other places it's 20150510
1256 'uploader': 'Airtek',
1257 'description': 'Retransmisión en directo de la XVIII media maratón de Zaragoza.',
1258 'uploader_id': 'UCzTzUmjXxxacNnL8I3m4LnQ',
1259 'title': 'Retransmisión XVIII Media maratón Zaragoza 2015',
1260 },
1261 'params': {
1262 'youtube_include_dash_manifest': True,
1263 'format': '135', # bestvideo
1264 },
1265 'skip': 'This live event has ended.',
1266 },
1267 {
1268 # Multifeed videos (multiple cameras), URL is for Main Camera
1269 'url': 'https://www.youtube.com/watch?v=jvGDaLqkpTg',
1270 'info_dict': {
1271 'id': 'jvGDaLqkpTg',
1272 'title': 'Tom Clancy Free Weekend Rainbow Whatever',
1273 'description': 'md5:e03b909557865076822aa169218d6a5d',
1274 },
1275 'playlist': [{
1276 'info_dict': {
1277 'id': 'jvGDaLqkpTg',
1278 'ext': 'mp4',
1279 'title': 'Tom Clancy Free Weekend Rainbow Whatever (Main Camera)',
1280 'description': 'md5:e03b909557865076822aa169218d6a5d',
1281 'duration': 10643,
1282 'upload_date': '20161111',
1283 'uploader': 'Team PGP',
1284 'uploader_id': 'UChORY56LMMETTuGjXaJXvLg',
1285 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UChORY56LMMETTuGjXaJXvLg',
1286 },
1287 }, {
1288 'info_dict': {
1289 'id': '3AKt1R1aDnw',
1290 'ext': 'mp4',
1291 'title': 'Tom Clancy Free Weekend Rainbow Whatever (Camera 2)',
1292 'description': 'md5:e03b909557865076822aa169218d6a5d',
1293 'duration': 10991,
1294 'upload_date': '20161111',
1295 'uploader': 'Team PGP',
1296 'uploader_id': 'UChORY56LMMETTuGjXaJXvLg',
1297 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UChORY56LMMETTuGjXaJXvLg',
1298 },
1299 }, {
1300 'info_dict': {
1301 'id': 'RtAMM00gpVc',
1302 'ext': 'mp4',
1303 'title': 'Tom Clancy Free Weekend Rainbow Whatever (Camera 3)',
1304 'description': 'md5:e03b909557865076822aa169218d6a5d',
1305 'duration': 10995,
1306 'upload_date': '20161111',
1307 'uploader': 'Team PGP',
1308 'uploader_id': 'UChORY56LMMETTuGjXaJXvLg',
1309 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UChORY56LMMETTuGjXaJXvLg',
1310 },
1311 }, {
1312 'info_dict': {
1313 'id': '6N2fdlP3C5U',
1314 'ext': 'mp4',
1315 'title': 'Tom Clancy Free Weekend Rainbow Whatever (Camera 4)',
1316 'description': 'md5:e03b909557865076822aa169218d6a5d',
1317 'duration': 10990,
1318 'upload_date': '20161111',
1319 'uploader': 'Team PGP',
1320 'uploader_id': 'UChORY56LMMETTuGjXaJXvLg',
1321 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UChORY56LMMETTuGjXaJXvLg',
1322 },
1323 }],
1324 'params': {
1325 'skip_download': True,
1326 },
1327 },
1328 {
1329 # Multifeed video with comma in title (see https://github.com/ytdl-org/youtube-dl/issues/8536)
1330 'url': 'https://www.youtube.com/watch?v=gVfLd0zydlo',
1331 'info_dict': {
1332 'id': 'gVfLd0zydlo',
1333 'title': 'DevConf.cz 2016 Day 2 Workshops 1 14:00 - 15:30',
1334 },
1335 'playlist_count': 2,
1336 'skip': 'Not multifeed anymore',
1337 },
1338 {
1339 'url': 'https://vid.plus/FlRa-iH7PGw',
1340 'only_matching': True,
1341 },
1342 {
1343 'url': 'https://zwearz.com/watch/9lWxNJF-ufM/electra-woman-dyna-girl-official-trailer-grace-helbig.html',
1344 'only_matching': True,
1345 },
1346 {
1347 # Title with JS-like syntax "};" (see https://github.com/ytdl-org/youtube-dl/issues/7468)
1348 # Also tests cut-off URL expansion in video description (see
1349 # https://github.com/ytdl-org/youtube-dl/issues/1892,
1350 # https://github.com/ytdl-org/youtube-dl/issues/8164)
1351 'url': 'https://www.youtube.com/watch?v=lsguqyKfVQg',
1352 'info_dict': {
1353 'id': 'lsguqyKfVQg',
1354 'ext': 'mp4',
1355 'title': '{dark walk}; Loki/AC/Dishonored; collab w/Elflover21',
1356 'alt_title': 'Dark Walk',
1357 'description': 'md5:8085699c11dc3f597ce0410b0dcbb34a',
1358 'duration': 133,
1359 'upload_date': '20151119',
1360 'uploader_id': 'IronSoulElf',
1361 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/IronSoulElf',
1362 'uploader': 'IronSoulElf',
1363 'creator': 'Todd Haberman;\nDaniel Law Heath and Aaron Kaplan',
1364 'track': 'Dark Walk',
1365 'artist': 'Todd Haberman;\nDaniel Law Heath and Aaron Kaplan',
1366 'album': 'Position Music - Production Music Vol. 143 - Dark Walk',
1367 },
1368 'params': {
1369 'skip_download': True,
1370 },
1371 },
1372 {
1373 # Tags with '};' (see https://github.com/ytdl-org/youtube-dl/issues/7468)
1374 'url': 'https://www.youtube.com/watch?v=Ms7iBXnlUO8',
1375 'only_matching': True,
1376 },
1377 {
1378 # Video with yt:stretch=17:0
1379 'url': 'https://www.youtube.com/watch?v=Q39EVAstoRM',
1380 'info_dict': {
1381 'id': 'Q39EVAstoRM',
1382 'ext': 'mp4',
1383 'title': 'Clash Of Clans#14 Dicas De Ataque Para CV 4',
1384 'description': 'md5:ee18a25c350637c8faff806845bddee9',
1385 'upload_date': '20151107',
1386 'uploader_id': 'UCCr7TALkRbo3EtFzETQF1LA',
1387 'uploader': 'CH GAMER DROID',
1388 },
1389 'params': {
1390 'skip_download': True,
1391 },
1392 'skip': 'This video does not exist.',
1393 },
1394 {
1395 # Video with incomplete 'yt:stretch=16:'
1396 'url': 'https://www.youtube.com/watch?v=FRhJzUSJbGI',
1397 'only_matching': True,
1398 },
1399 {
1400 # Video licensed under Creative Commons
1401 'url': 'https://www.youtube.com/watch?v=M4gD1WSo5mA',
1402 'info_dict': {
1403 'id': 'M4gD1WSo5mA',
1404 'ext': 'mp4',
1405 'title': 'md5:e41008789470fc2533a3252216f1c1d1',
1406 'description': 'md5:a677553cf0840649b731a3024aeff4cc',
1407 'duration': 721,
1408 'upload_date': '20150127',
1409 'uploader_id': 'BerkmanCenter',
1410 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/BerkmanCenter',
1411 'uploader': 'The Berkman Klein Center for Internet & Society',
1412 'license': 'Creative Commons Attribution license (reuse allowed)',
1413 },
1414 'params': {
1415 'skip_download': True,
1416 },
1417 },
1418 {
1419 # Channel-like uploader_url
1420 'url': 'https://www.youtube.com/watch?v=eQcmzGIKrzg',
1421 'info_dict': {
1422 'id': 'eQcmzGIKrzg',
1423 'ext': 'mp4',
1424 'title': 'Democratic Socialism and Foreign Policy | Bernie Sanders',
1425 'description': 'md5:13a2503d7b5904ef4b223aa101628f39',
1426 'duration': 4060,
1427 'upload_date': '20151119',
1428 'uploader': 'Bernie Sanders',
1429 'uploader_id': 'UCH1dpzjCEiGAt8CXkryhkZg',
1430 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UCH1dpzjCEiGAt8CXkryhkZg',
1431 'license': 'Creative Commons Attribution license (reuse allowed)',
1432 },
1433 'params': {
1434 'skip_download': True,
1435 },
1436 },
1437 {
1438 'url': 'https://www.youtube.com/watch?feature=player_embedded&amp;amp;v=V36LpHqtcDY',
1439 'only_matching': True,
1440 },
1441 {
1442 # YouTube Red paid video (https://github.com/ytdl-org/youtube-dl/issues/10059)
1443 'url': 'https://www.youtube.com/watch?v=i1Ko8UG-Tdo',
1444 'only_matching': True,
1445 },
1446 {
1447 # Rental video preview
1448 'url': 'https://www.youtube.com/watch?v=yYr8q0y5Jfg',
1449 'info_dict': {
1450 'id': 'uGpuVWrhIzE',
1451 'ext': 'mp4',
1452 'title': 'Piku - Trailer',
1453 'description': 'md5:c36bd60c3fd6f1954086c083c72092eb',
1454 'upload_date': '20150811',
1455 'uploader': 'FlixMatrix',
1456 'uploader_id': 'FlixMatrixKaravan',
1457 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/FlixMatrixKaravan',
1458 'license': 'Standard YouTube License',
1459 },
1460 'params': {
1461 'skip_download': True,
1462 },
1463 'skip': 'This video is not available.',
1464 },
1465 {
1466 # YouTube Red video with episode data
1467 'url': 'https://www.youtube.com/watch?v=iqKdEhx-dD4',
1468 'info_dict': {
1469 'id': 'iqKdEhx-dD4',
1470 'ext': 'mp4',
1471 'title': 'Isolation - Mind Field (Ep 1)',
1472 'description': 'md5:f540112edec5d09fc8cc752d3d4ba3cd',
1473 'duration': 2085,
1474 'upload_date': '20170118',
1475 'uploader': 'Vsauce',
1476 'uploader_id': 'Vsauce',
1477 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/Vsauce',
1478 'series': 'Mind Field',
1479 'season_number': 1,
1480 'episode_number': 1,
1481 },
1482 'params': {
1483 'skip_download': True,
1484 },
1485 'expected_warnings': [
1486 'Skipping DASH manifest',
1487 ],
1488 },
1489 {
1490 # The following content has been identified by the YouTube community
1491 # as inappropriate or offensive to some audiences.
1492 'url': 'https://www.youtube.com/watch?v=6SJNVb0GnPI',
1493 'info_dict': {
1494 'id': '6SJNVb0GnPI',
1495 'ext': 'mp4',
1496 'title': 'Race Differences in Intelligence',
1497 'description': 'md5:5d161533167390427a1f8ee89a1fc6f1',
1498 'duration': 965,
1499 'upload_date': '20140124',
1500 'uploader': 'New Century Foundation',
1501 'uploader_id': 'UCEJYpZGqgUob0zVVEaLhvVg',
1502 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UCEJYpZGqgUob0zVVEaLhvVg',
1503 },
1504 'params': {
1505 'skip_download': True,
1506 },
1507 'skip': 'This video has been removed for violating YouTube\'s policy on hate speech.',
1508 },
1509 {
1510 # itag 212
1511 'url': '1t24XAntNCY',
1512 'only_matching': True,
1513 },
1514 {
1515 # geo restricted to JP
1516 'url': 'sJL6WA-aGkQ',
1517 'only_matching': True,
1518 },
1519 {
1520 'url': 'https://invidio.us/watch?v=BaW_jenozKc',
1521 'only_matching': True,
1522 },
1523 {
1524 'url': 'https://redirect.invidious.io/watch?v=BaW_jenozKc',
1525 'only_matching': True,
1526 },
1527 {
1528 # from https://nitter.pussthecat.org/YouTube/status/1360363141947944964#m
1529 'url': 'https://redirect.invidious.io/Yh0AhrY9GjA',
1530 'only_matching': True,
1531 },
1532 {
1533 # DRM protected
1534 'url': 'https://www.youtube.com/watch?v=s7_qI6_mIXc',
1535 'only_matching': True,
1536 },
1537 {
1538 # Video with unsupported adaptive stream type formats
1539 'url': 'https://www.youtube.com/watch?v=Z4Vy8R84T1U',
1540 'info_dict': {
1541 'id': 'Z4Vy8R84T1U',
1542 'ext': 'mp4',
1543 'title': 'saman SMAN 53 Jakarta(Sancety) opening COFFEE4th at SMAN 53 Jakarta',
1544 'description': 'md5:d41d8cd98f00b204e9800998ecf8427e',
1545 'duration': 433,
1546 'upload_date': '20130923',
1547 'uploader': 'Amelia Putri Harwita',
1548 'uploader_id': 'UCpOxM49HJxmC1qCalXyB3_Q',
1549 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UCpOxM49HJxmC1qCalXyB3_Q',
1550 'formats': 'maxcount:10',
1551 },
1552 'params': {
1553 'skip_download': True,
1554 'youtube_include_dash_manifest': False,
1555 },
1556 'skip': 'not actual anymore',
1557 },
1558 {
1559 # Youtube Music Auto-generated description
1560 'url': 'https://music.youtube.com/watch?v=MgNrAu2pzNs',
1561 'info_dict': {
1562 'id': 'MgNrAu2pzNs',
1563 'ext': 'mp4',
1564 'title': 'Voyeur Girl',
1565 'description': 'md5:7ae382a65843d6df2685993e90a8628f',
1566 'upload_date': '20190312',
1567 'uploader': 'Stephen - Topic',
1568 'uploader_id': 'UC-pWHpBjdGG69N9mM2auIAA',
1569 'artist': 'Stephen',
1570 'track': 'Voyeur Girl',
1571 'album': 'it\'s too much love to know my dear',
1572 'release_date': '20190313',
1573 'release_year': 2019,
1574 },
1575 'params': {
1576 'skip_download': True,
1577 },
1578 },
1579 {
1580 'url': 'https://www.youtubekids.com/watch?v=3b8nCWDgZ6Q',
1581 'only_matching': True,
1582 },
1583 {
1584 # invalid -> valid video id redirection
1585 'url': 'DJztXj2GPfl',
1586 'info_dict': {
1587 'id': 'DJztXj2GPfk',
1588 'ext': 'mp4',
1589 'title': 'Panjabi MC - Mundian To Bach Ke (The Dictator Soundtrack)',
1590 'description': 'md5:bf577a41da97918e94fa9798d9228825',
1591 'upload_date': '20090125',
1592 'uploader': 'Prochorowka',
1593 'uploader_id': 'Prochorowka',
1594 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/Prochorowka',
1595 'artist': 'Panjabi MC',
1596 'track': 'Beware of the Boys (Mundian to Bach Ke) - Motivo Hi-Lectro Remix',
1597 'album': 'Beware of the Boys (Mundian To Bach Ke)',
1598 },
1599 'params': {
1600 'skip_download': True,
1601 },
1602 'skip': 'Video unavailable',
1603 },
1604 {
1605 # empty description results in an empty string
1606 'url': 'https://www.youtube.com/watch?v=x41yOUIvK2k',
1607 'info_dict': {
1608 'id': 'x41yOUIvK2k',
1609 'ext': 'mp4',
1610 'title': 'IMG 3456',
1611 'description': '',
1612 'upload_date': '20170613',
1613 'uploader_id': 'ElevageOrVert',
1614 'uploader': 'ElevageOrVert',
1615 },
1616 'params': {
1617 'skip_download': True,
1618 },
1619 },
1620 {
1621 # with '};' inside yt initial data (see [1])
1622 # see [2] for an example with '};' inside ytInitialPlayerResponse
1623 # 1. https://github.com/ytdl-org/youtube-dl/issues/27093
1624 # 2. https://github.com/ytdl-org/youtube-dl/issues/27216
1625 'url': 'https://www.youtube.com/watch?v=CHqg6qOn4no',
1626 'info_dict': {
1627 'id': 'CHqg6qOn4no',
1628 'ext': 'mp4',
1629 'title': 'Part 77 Sort a list of simple types in c#',
1630 'description': 'md5:b8746fa52e10cdbf47997903f13b20dc',
1631 'upload_date': '20130831',
1632 'uploader_id': 'kudvenkat',
1633 'uploader': 'kudvenkat',
1634 },
1635 'params': {
1636 'skip_download': True,
1637 },
1638 },
1639 {
1640 # another example of '};' in ytInitialData
1641 'url': 'https://www.youtube.com/watch?v=gVfgbahppCY',
1642 'only_matching': True,
1643 },
1644 {
1645 'url': 'https://www.youtube.com/watch_popup?v=63RmMXCd_bQ',
1646 'only_matching': True,
1647 },
1648 {
1649 # https://github.com/ytdl-org/youtube-dl/pull/28094
1650 'url': 'OtqTfy26tG0',
1651 'info_dict': {
1652 'id': 'OtqTfy26tG0',
1653 'ext': 'mp4',
1654 'title': 'Burn Out',
1655 'description': 'md5:8d07b84dcbcbfb34bc12a56d968b6131',
1656 'upload_date': '20141120',
1657 'uploader': 'The Cinematic Orchestra - Topic',
1658 'uploader_id': 'UCIzsJBIyo8hhpFm1NK0uLgw',
1659 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UCIzsJBIyo8hhpFm1NK0uLgw',
1660 'artist': 'The Cinematic Orchestra',
1661 'track': 'Burn Out',
1662 'album': 'Every Day',
1663 'release_data': None,
1664 'release_year': None,
1665 },
1666 'params': {
1667 'skip_download': True,
1668 },
1669 },
1670 {
1671 # controversial video, only works with bpctr when authenticated with cookies
1672 'url': 'https://www.youtube.com/watch?v=nGC3D_FkCmg',
1673 'only_matching': True,
1674 },
1675 {
1676 # controversial video, requires bpctr/contentCheckOk
1677 'url': 'https://www.youtube.com/watch?v=SZJvDhaSDnc',
1678 'info_dict': {
1679 'id': 'SZJvDhaSDnc',
1680 'ext': 'mp4',
1681 'title': 'San Diego teen commits suicide after bullying over embarrassing video',
1682 'channel_id': 'UC-SJ6nODDmufqBzPBwCvYvQ',
1683 'uploader': 'CBS This Morning',
1684 'uploader_id': 'CBSThisMorning',
1685 'upload_date': '20140716',
1686 'description': 'md5:acde3a73d3f133fc97e837a9f76b53b7'
1687 }
1688 },
1689 {
1690 # restricted location, https://github.com/ytdl-org/youtube-dl/issues/28685
1691 'url': 'cBvYw8_A0vQ',
1692 'info_dict': {
1693 'id': 'cBvYw8_A0vQ',
1694 'ext': 'mp4',
1695 'title': '4K Ueno Okachimachi Street Scenes 上野御徒町歩き',
1696 'description': 'md5:ea770e474b7cd6722b4c95b833c03630',
1697 'upload_date': '20201120',
1698 'uploader': 'Walk around Japan',
1699 'uploader_id': 'UC3o_t8PzBmXf5S9b7GLx1Mw',
1700 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UC3o_t8PzBmXf5S9b7GLx1Mw',
1701 },
1702 'params': {
1703 'skip_download': True,
1704 },
1705 }, {
1706 # Has multiple audio streams
1707 'url': 'WaOKSUlf4TM',
1708 'only_matching': True
1709 }, {
1710 # Requires Premium: has format 141 when requested using YTM url
1711 'url': 'https://music.youtube.com/watch?v=XclachpHxis',
1712 'only_matching': True
1713 }, {
1714 # multiple subtitles with same lang_code
1715 'url': 'https://www.youtube.com/watch?v=wsQiKKfKxug',
1716 'only_matching': True,
1717 }, {
1718 # Force use android client fallback
1719 'url': 'https://www.youtube.com/watch?v=YOelRv7fMxY',
1720 'info_dict': {
1721 'id': 'YOelRv7fMxY',
1722 'title': 'DIGGING A SECRET TUNNEL Part 1',
1723 'ext': '3gp',
1724 'upload_date': '20210624',
1725 'channel_id': 'UCp68_FLety0O-n9QU6phsgw',
1726 'uploader': 'colinfurze',
1727 'uploader_id': 'colinfurze',
1728 'channel_url': r're:https?://(?:www\.)?youtube\.com/channel/UCp68_FLety0O-n9QU6phsgw',
1729 'description': 'md5:b5096f56af7ccd7a555c84db81738b22'
1730 },
1731 'params': {
1732 'format': '17', # 3gp format available on android
1733 'extractor_args': {'youtube': {'player_client': ['android']}},
1734 },
1735 },
1736 {
1737 # Skip download of additional client configs (remix client config in this case)
1738 'url': 'https://music.youtube.com/watch?v=MgNrAu2pzNs',
1739 'only_matching': True,
1740 'params': {
1741 'extractor_args': {'youtube': {'player_skip': ['configs']}},
1742 },
1743 }
1744 ]
1745
1746 @classmethod
1747 def suitable(cls, url):
1748 # Hack for lazy extractors until more generic solution is implemented
1749 # (see #28780)
1750 from .youtube import parse_qs
1751 qs = parse_qs(url)
1752 if qs.get('list', [None])[0]:
1753 return False
1754 return super(YoutubeIE, cls).suitable(url)
1755
1756 def __init__(self, *args, **kwargs):
1757 super(YoutubeIE, self).__init__(*args, **kwargs)
1758 self._code_cache = {}
1759 self._player_cache = {}
1760
1761 def _extract_player_url(self, ytcfg=None, webpage=None):
1762 player_url = try_get(ytcfg, (lambda x: x['PLAYER_JS_URL']), str)
1763 if not player_url and webpage:
1764 player_url = self._search_regex(
1765 r'"(?:PLAYER_JS_URL|jsUrl)"\s*:\s*"([^"]+)"',
1766 webpage, 'player URL', fatal=False)
1767 if not player_url:
1768 return None
1769 if player_url.startswith('//'):
1770 player_url = 'https:' + player_url
1771 elif not re.match(r'https?://', player_url):
1772 player_url = compat_urlparse.urljoin(
1773 'https://www.youtube.com', player_url)
1774 return player_url
1775
1776 def _signature_cache_id(self, example_sig):
1777 """ Return a string representation of a signature """
1778 return '.'.join(compat_str(len(part)) for part in example_sig.split('.'))
1779
1780 @classmethod
1781 def _extract_player_info(cls, player_url):
1782 for player_re in cls._PLAYER_INFO_RE:
1783 id_m = re.search(player_re, player_url)
1784 if id_m:
1785 break
1786 else:
1787 raise ExtractorError('Cannot identify player %r' % player_url)
1788 return id_m.group('id')
1789
1790 def _load_player(self, video_id, player_url, fatal=True) -> bool:
1791 player_id = self._extract_player_info(player_url)
1792 if player_id not in self._code_cache:
1793 self._code_cache[player_id] = self._download_webpage(
1794 player_url, video_id, fatal=fatal,
1795 note='Downloading player ' + player_id,
1796 errnote='Download of %s failed' % player_url)
1797 return player_id in self._code_cache
1798
1799 def _extract_signature_function(self, video_id, player_url, example_sig):
1800 player_id = self._extract_player_info(player_url)
1801
1802 # Read from filesystem cache
1803 func_id = 'js_%s_%s' % (
1804 player_id, self._signature_cache_id(example_sig))
1805 assert os.path.basename(func_id) == func_id
1806
1807 cache_spec = self._downloader.cache.load('youtube-sigfuncs', func_id)
1808 if cache_spec is not None:
1809 return lambda s: ''.join(s[i] for i in cache_spec)
1810
1811 if self._load_player(video_id, player_url):
1812 code = self._code_cache[player_id]
1813 res = self._parse_sig_js(code)
1814
1815 test_string = ''.join(map(compat_chr, range(len(example_sig))))
1816 cache_res = res(test_string)
1817 cache_spec = [ord(c) for c in cache_res]
1818
1819 self._downloader.cache.store('youtube-sigfuncs', func_id, cache_spec)
1820 return res
1821
    def _print_sig_code(self, func, example_sig):
        """Print a Python snippet equivalent to the extracted JS signature
        function (used with the youtube_print_sig_code option)."""
        def gen_sig_code(idxs):
            def _genslice(start, end, step):
                # Render a run of indices with constant stride as a slice expression
                starts = '' if start == 0 else str(start)
                ends = (':%d' % (end + step)) if end + step >= 0 else ':'
                steps = '' if step == 1 else (':%d' % step)
                return 's[%s%s%s]' % (starts, ends, steps)

            step = None
            # Quelch pyflakes warnings - start will be set when step is set
            start = '(Never used)'
            # Walk consecutive index pairs, coalescing +1/-1 runs into slices
            for i, prev in zip(idxs[1:], idxs[:-1]):
                if step is not None:
                    if i - prev == step:
                        continue
                    yield _genslice(start, prev, step)
                    step = None
                    continue
                if i - prev in [-1, 1]:
                    step = i - prev
                    start = prev
                    continue
                else:
                    yield 's[%d]' % prev
            # Emit the trailing element or the still-open slice
            if step is None:
                yield 's[%d]' % i
            else:
                yield _genslice(start, i, step)

        # Run the function on a probe string to recover the index permutation
        test_string = ''.join(map(compat_chr, range(len(example_sig))))
        cache_res = func(test_string)
        cache_spec = [ord(c) for c in cache_res]
        expr_code = ' + '.join(gen_sig_code(cache_spec))
        signature_id_tuple = '(%s)' % (
            ', '.join(compat_str(len(p)) for p in example_sig.split('.')))
        code = ('if tuple(len(p) for p in s.split(\'.\')) == %s:\n'
                '    return %s\n') % (signature_id_tuple, expr_code)
        self.to_screen('Extracted signature function:\n' + code)
1860
    def _parse_sig_js(self, jscode):
        """Locate the signature-scrambling function in the player JS and
        return a Python callable that evaluates it via JSInterpreter."""
        # Patterns are tried in order; newer player layouts come first
        funcname = self._search_regex(
            (r'\b[cs]\s*&&\s*[adf]\.set\([^,]+\s*,\s*encodeURIComponent\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\(',
             r'\b[a-zA-Z0-9]+\s*&&\s*[a-zA-Z0-9]+\.set\([^,]+\s*,\s*encodeURIComponent\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\(',
             r'\bm=(?P<sig>[a-zA-Z0-9$]{2})\(decodeURIComponent\(h\.s\)\)',
             r'\bc&&\(c=(?P<sig>[a-zA-Z0-9$]{2})\(decodeURIComponent\(c\)\)',
             r'(?:\b|[^a-zA-Z0-9$])(?P<sig>[a-zA-Z0-9$]{2})\s*=\s*function\(\s*a\s*\)\s*{\s*a\s*=\s*a\.split\(\s*""\s*\);[a-zA-Z0-9$]{2}\.[a-zA-Z0-9$]{2}\(a,\d+\)',
             r'(?:\b|[^a-zA-Z0-9$])(?P<sig>[a-zA-Z0-9$]{2})\s*=\s*function\(\s*a\s*\)\s*{\s*a\s*=\s*a\.split\(\s*""\s*\)',
             r'(?P<sig>[a-zA-Z0-9$]+)\s*=\s*function\(\s*a\s*\)\s*{\s*a\s*=\s*a\.split\(\s*""\s*\)',
             # Obsolete patterns
             r'(["\'])signature\1\s*,\s*(?P<sig>[a-zA-Z0-9$]+)\(',
             r'\.sig\|\|(?P<sig>[a-zA-Z0-9$]+)\(',
             r'yt\.akamaized\.net/\)\s*\|\|\s*.*?\s*[cs]\s*&&\s*[adf]\.set\([^,]+\s*,\s*(?:encodeURIComponent\s*\()?\s*(?P<sig>[a-zA-Z0-9$]+)\(',
             r'\b[cs]\s*&&\s*[adf]\.set\([^,]+\s*,\s*(?P<sig>[a-zA-Z0-9$]+)\(',
             r'\b[a-zA-Z0-9]+\s*&&\s*[a-zA-Z0-9]+\.set\([^,]+\s*,\s*(?P<sig>[a-zA-Z0-9$]+)\(',
             r'\bc\s*&&\s*a\.set\([^,]+\s*,\s*\([^)]*\)\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\(',
             r'\bc\s*&&\s*[a-zA-Z0-9]+\.set\([^,]+\s*,\s*\([^)]*\)\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\(',
             r'\bc\s*&&\s*[a-zA-Z0-9]+\.set\([^,]+\s*,\s*\([^)]*\)\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\('),
            jscode, 'Initial JS player signature function name', group='sig')

        jsi = JSInterpreter(jscode)
        initial_function = jsi.extract_function(funcname)
        # The JS function takes the signature as its single argument
        return lambda s: initial_function([s])
1884
1885 def _decrypt_signature(self, s, video_id, player_url):
1886 """Turn the encrypted s field into a working signature"""
1887
1888 if player_url is None:
1889 raise ExtractorError('Cannot decrypt signature without player_url')
1890
1891 try:
1892 player_id = (player_url, self._signature_cache_id(s))
1893 if player_id not in self._player_cache:
1894 func = self._extract_signature_function(
1895 video_id, player_url, s
1896 )
1897 self._player_cache[player_id] = func
1898 func = self._player_cache[player_id]
1899 if self.get_param('youtube_print_sig_code'):
1900 self._print_sig_code(func, s)
1901 return func(s)
1902 except Exception as e:
1903 tb = traceback.format_exc()
1904 raise ExtractorError(
1905 'Signature extraction failed: ' + tb, cause=e)
1906
1907 def _extract_signature_timestamp(self, video_id, player_url, ytcfg=None, fatal=False):
1908 """
1909 Extract signatureTimestamp (sts)
1910 Required to tell API what sig/player version is in use.
1911 """
1912 sts = None
1913 if isinstance(ytcfg, dict):
1914 sts = int_or_none(ytcfg.get('STS'))
1915
1916 if not sts:
1917 # Attempt to extract from player
1918 if player_url is None:
1919 error_msg = 'Cannot extract signature timestamp without player_url.'
1920 if fatal:
1921 raise ExtractorError(error_msg)
1922 self.report_warning(error_msg)
1923 return
1924 if self._load_player(video_id, player_url, fatal=fatal):
1925 player_id = self._extract_player_info(player_url)
1926 code = self._code_cache[player_id]
1927 sts = int_or_none(self._search_regex(
1928 r'(?:signatureTimestamp|sts)\s*:\s*(?P<sts>[0-9]{5})', code,
1929 'JS player signature timestamp', group='sts', fatal=fatal))
1930 return sts
1931
1932 def _mark_watched(self, video_id, player_responses):
1933 playback_url = traverse_obj(
1934 player_responses, (..., 'playbackTracking', 'videostatsPlaybackUrl', 'baseUrl'),
1935 expected_type=url_or_none, get_all=False)
1936 if not playback_url:
1937 self.report_warning('Unable to mark watched')
1938 return
1939 parsed_playback_url = compat_urlparse.urlparse(playback_url)
1940 qs = compat_urlparse.parse_qs(parsed_playback_url.query)
1941
1942 # cpn generation algorithm is reverse engineered from base.js.
1943 # In fact it works even with dummy cpn.
1944 CPN_ALPHABET = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789-_'
1945 cpn = ''.join((CPN_ALPHABET[random.randint(0, 256) & 63] for _ in range(0, 16)))
1946
1947 qs.update({
1948 'ver': ['2'],
1949 'cpn': [cpn],
1950 })
1951 playback_url = compat_urlparse.urlunparse(
1952 parsed_playback_url._replace(query=compat_urllib_parse_urlencode(qs, True)))
1953
1954 self._download_webpage(
1955 playback_url, video_id, 'Marking watched',
1956 'Unable to mark watched', fatal=False)
1957
    @staticmethod
    def _extract_urls(webpage):
        """Return all embedded YouTube URLs (or bare video ids from the
        Wordpress plugin matcher) found in *webpage*."""
        # Embedded YouTube player
        entries = [
            unescapeHTML(mobj.group('url'))
            for mobj in re.finditer(r'''(?x)
            (?:
                <iframe[^>]+?src=|
                data-video-url=|
                <embed[^>]+?src=|
                embedSWF\(?:\s*|
                <object[^>]+data=|
                new\s+SWFObject\(
            )
            (["\'])
            (?P<url>(?:https?:)?//(?:www\.)?youtube(?:-nocookie)?\.com/
            (?:embed|v|p)/[0-9A-Za-z_-]{11}.*?)
            \1''', webpage)]

        # lazyYT YouTube embed
        entries.extend(list(map(
            unescapeHTML,
            re.findall(r'class="lazyYT" data-youtube-id="([^"]+)"', webpage))))

        # Wordpress "YouTube Video Importer" plugin
        # (yields bare video ids, not full URLs)
        matches = re.findall(r'''(?x)<div[^>]+
            class=(?P<q1>[\'"])[^\'"]*\byvii_single_video_player\b[^\'"]*(?P=q1)[^>]+
            data-video_id=(?P<q2>[\'"])([^\'"]+)(?P=q2)''', webpage)
        entries.extend(m[-1] for m in matches)

        return entries
1989
1990 @staticmethod
1991 def _extract_url(webpage):
1992 urls = YoutubeIE._extract_urls(webpage)
1993 return urls[0] if urls else None
1994
1995 @classmethod
1996 def extract_id(cls, url):
1997 mobj = re.match(cls._VALID_URL, url, re.VERBOSE)
1998 if mobj is None:
1999 raise ExtractorError('Invalid URL: %s' % url)
2000 video_id = mobj.group(2)
2001 return video_id
2002
2003 def _extract_chapters_from_json(self, data, duration):
2004 chapter_list = traverse_obj(
2005 data, (
2006 'playerOverlays', 'playerOverlayRenderer', 'decoratedPlayerBarRenderer',
2007 'decoratedPlayerBarRenderer', 'playerBar', 'chapteredPlayerBarRenderer', 'chapters'
2008 ), expected_type=list)
2009
2010 return self._extract_chapters(
2011 chapter_list,
2012 chapter_time=lambda chapter: float_or_none(
2013 traverse_obj(chapter, ('chapterRenderer', 'timeRangeStartMillis')), scale=1000),
2014 chapter_title=lambda chapter: traverse_obj(
2015 chapter, ('chapterRenderer', 'title', 'simpleText'), expected_type=str),
2016 duration=duration)
2017
2018 def _extract_chapters_from_engagement_panel(self, data, duration):
2019 content_list = traverse_obj(
2020 data,
2021 ('engagementPanels', ..., 'engagementPanelSectionListRenderer', 'content', 'macroMarkersListRenderer', 'contents'),
2022 expected_type=list, default=[])
2023 chapter_time = lambda chapter: parse_duration(self._get_text(chapter.get('timeDescription')))
2024 chapter_title = lambda chapter: self._get_text(chapter.get('title'))
2025
2026 return next((
2027 filter(None, (
2028 self._extract_chapters(
2029 traverse_obj(contents, (..., 'macroMarkersListItemRenderer')),
2030 chapter_time, chapter_title, duration)
2031 for contents in content_list
2032 ))), [])
2033
    def _extract_chapters(self, chapter_list, chapter_time, chapter_title, duration):
        """Build a list of {'start_time', 'end_time', 'title'} chapter dicts.

        chapter_time/chapter_title are callables extracting the start time
        (seconds) and title from one raw chapter object; *duration* caps the
        final chapter's end_time. Chapters with unparseable start times are
        skipped; a chapter starting earlier than its predecessor is dropped
        (and if it is the second one, the first is dropped as well).
        """
        chapters = []
        # last_chapter is aliased with the most recently appended dict so its
        # end_time can be patched when the next chapter begins
        last_chapter = {'start_time': 0}
        for idx, chapter in enumerate(chapter_list or []):
            title = chapter_title(chapter)
            start_time = chapter_time(chapter)
            if start_time is None:
                continue
            # Tentatively close the previous chapter at this chapter's start
            last_chapter['end_time'] = start_time
            if start_time < last_chapter['start_time']:
                if idx == 1:
                    chapters.pop()
                    self.report_warning('Invalid start time for chapter "%s"' % last_chapter['title'])
                else:
                    self.report_warning(f'Invalid start time for chapter "{title}"')
                continue
            last_chapter = {'start_time': start_time, 'title': title}
            chapters.append(last_chapter)
        # The final chapter runs to the end of the video
        last_chapter['end_time'] = duration
        return chapters
2054
2055 def _extract_yt_initial_variable(self, webpage, regex, video_id, name):
2056 return self._parse_json(self._search_regex(
2057 (r'%s\s*%s' % (regex, self._YT_INITIAL_BOUNDARY_RE),
2058 regex), webpage, name, default='{}'), video_id, fatal=False)
2059
2060 @staticmethod
2061 def parse_time_text(time_text):
2062 """
2063 Parse the comment time text
2064 time_text is in the format 'X units ago (edited)'
2065 """
2066 time_text_split = time_text.split(' ')
2067 if len(time_text_split) >= 3:
2068 try:
2069 return datetime_from_str('now-%s%s' % (time_text_split[0], time_text_split[1]), precision='auto')
2070 except ValueError:
2071 return None
2072
2073 def _extract_comment(self, comment_renderer, parent=None):
2074 comment_id = comment_renderer.get('commentId')
2075 if not comment_id:
2076 return
2077
2078 text = self._get_text(comment_renderer.get('contentText'))
2079
2080 # note: timestamp is an estimate calculated from the current time and time_text
2081 time_text = self._get_text(comment_renderer.get('publishedTimeText')) or ''
2082 time_text_dt = self.parse_time_text(time_text)
2083 if isinstance(time_text_dt, datetime.datetime):
2084 timestamp = calendar.timegm(time_text_dt.timetuple())
2085 author = self._get_text(comment_renderer.get('authorText'))
2086 author_id = try_get(comment_renderer,
2087 lambda x: x['authorEndpoint']['browseEndpoint']['browseId'], compat_str)
2088
2089 votes = parse_count(try_get(comment_renderer, (lambda x: x['voteCount']['simpleText'],
2090 lambda x: x['likeCount']), compat_str)) or 0
2091 author_thumbnail = try_get(comment_renderer,
2092 lambda x: x['authorThumbnail']['thumbnails'][-1]['url'], compat_str)
2093
2094 author_is_uploader = try_get(comment_renderer, lambda x: x['authorIsChannelOwner'], bool)
2095 is_favorited = 'creatorHeart' in (try_get(
2096 comment_renderer, lambda x: x['actionButtons']['commentActionButtonsRenderer'], dict) or {})
2097 return {
2098 'id': comment_id,
2099 'text': text,
2100 'timestamp': timestamp,
2101 'time_text': time_text,
2102 'like_count': votes,
2103 'is_favorited': is_favorited,
2104 'author': author,
2105 'author_id': author_id,
2106 'author_thumbnail': author_thumbnail,
2107 'author_is_uploader': author_is_uploader,
2108 'parent': parent or 'root'
2109 }
2110
    def _comment_entries(self, root_continuation_data, identity_token, account_syncid,
                         ytcfg, video_id, parent=None, comment_counts=None):
        """Generator yielding comment dicts (and, once, the estimated total
        comment count as a bare int) by following API continuations.

        *parent* is set when recursing into a reply thread; *comment_counts*
        is a shared mutable [downloaded, estimated_total, thread#] triple.
        """

        def extract_header(contents):
            # Read the comments header: total count and the continuation for
            # the configured sort order ('top' or 'new')
            _total_comments = 0
            _continuation = None
            for content in contents:
                comments_header_renderer = try_get(content, lambda x: x['commentsHeaderRenderer'])
                expected_comment_count = parse_count(self._get_text(
                    comments_header_renderer, (lambda x: x['countText'], lambda x: x['commentsCount']), max_runs=1))

                if expected_comment_count:
                    comment_counts[1] = expected_comment_count
                    self.to_screen('Downloading ~%d comments' % expected_comment_count)
                    _total_comments = comment_counts[1]
                sort_mode_str = self._configuration_arg('comment_sort', [''])[0]
                comment_sort_index = int(sort_mode_str != 'top')  # 1 = new, 0 = top

                sort_menu_item = try_get(
                    comments_header_renderer,
                    lambda x: x['sortMenu']['sortFilterSubMenuRenderer']['subMenuItems'][comment_sort_index], dict) or {}
                sort_continuation_ep = sort_menu_item.get('serviceEndpoint') or {}

                _continuation = self._extract_continuation_ep_data(sort_continuation_ep) or self._extract_continuation(sort_menu_item)
                if not _continuation:
                    continue

                sort_text = sort_menu_item.get('title')
                if isinstance(sort_text, compat_str):
                    sort_text = sort_text.lower()
                else:
                    sort_text = 'top comments' if comment_sort_index == 0 else 'newest first'
                self.to_screen('Sorting comments by %s' % sort_text)
                break
            return _total_comments, _continuation

        def extract_thread(contents):
            # Yield each comment in the thread, then recurse into its replies
            if not parent:
                comment_counts[2] = 0
            for content in contents:
                comment_thread_renderer = try_get(content, lambda x: x['commentThreadRenderer'])
                comment_renderer = try_get(
                    comment_thread_renderer, (lambda x: x['comment']['commentRenderer'], dict)) or try_get(
                    content, (lambda x: x['commentRenderer'], dict))

                if not comment_renderer:
                    continue
                comment = self._extract_comment(comment_renderer, parent)
                if not comment:
                    continue
                comment_counts[0] += 1
                yield comment
                # Attempt to get the replies
                comment_replies_renderer = try_get(
                    comment_thread_renderer, lambda x: x['replies']['commentRepliesRenderer'], dict)

                if comment_replies_renderer:
                    comment_counts[2] += 1
                    comment_entries_iter = self._comment_entries(
                        comment_replies_renderer, identity_token, account_syncid, ytcfg,
                        video_id, parent=comment.get('id'), comment_counts=comment_counts)

                    for reply_comment in comment_entries_iter:
                        yield reply_comment

        # YouTube comments have a max depth of 2
        max_depth = int_or_none(self._configuration_arg('max_comment_depth', [''])[0]) or float('inf')
        if max_depth == 1 and parent:
            return
        if not comment_counts:
            # comment so far, est. total comments, current comment thread #
            comment_counts = [0, 0, 0]

        continuation = self._extract_continuation(root_continuation_data)
        if continuation and len(continuation['continuation']) < 27:
            self.write_debug('Detected old API continuation token. Generating new API compatible token.')
            continuation_token = self._generate_comment_continuation(video_id)
            continuation = self._build_api_continuation_query(continuation_token, None)

        visitor_data = None
        is_first_continuation = parent is None

        # Page through the continuations until exhausted
        for page_num in itertools.count(0):
            if not continuation:
                break
            headers = self.generate_api_headers(ytcfg, identity_token, account_syncid, visitor_data)
            comment_prog_str = '(%d/%d)' % (comment_counts[0], comment_counts[1])
            if page_num == 0:
                if is_first_continuation:
                    note_prefix = 'Downloading comment section API JSON'
                else:
                    note_prefix = '    Downloading comment API JSON reply thread %d %s' % (
                        comment_counts[2], comment_prog_str)
            else:
                note_prefix = '%sDownloading comment%s API JSON page %d %s' % (
                    '       ' if parent else '', ' replies' if parent else '',
                    page_num, comment_prog_str)

            response = self._extract_response(
                item_id=None, query=continuation,
                ep='next', ytcfg=ytcfg, headers=headers, note=note_prefix,
                check_get_keys=('onResponseReceivedEndpoints', 'continuationContents'))
            if not response:
                break
            # Carry visitorData forward so subsequent pages stay in session
            visitor_data = try_get(
                response,
                lambda x: x['responseContext']['webResponseContextExtensionData']['ytConfigData']['visitorData'],
                compat_str) or visitor_data

            continuation_contents = dict_get(response, ('onResponseReceivedEndpoints', 'continuationContents'))

            continuation = None
            if isinstance(continuation_contents, list):
                for continuation_section in continuation_contents:
                    if not isinstance(continuation_section, dict):
                        continue
                    continuation_items = try_get(
                        continuation_section,
                        (lambda x: x['reloadContinuationItemsCommand']['continuationItems'],
                         lambda x: x['appendContinuationItemsAction']['continuationItems']),
                        list) or []
                    if is_first_continuation:
                        total_comments, continuation = extract_header(continuation_items)
                        if total_comments:
                            yield total_comments
                        is_first_continuation = False
                        if continuation:
                            break
                        continue
                    count = 0
                    for count, entry in enumerate(extract_thread(continuation_items)):
                        yield entry
                    continuation = self._extract_continuation({'contents': continuation_items})
                    if continuation:
                        # Sometimes YouTube provides a continuation without any comments
                        # In most cases we end up just downloading these with very little comments to come.
                        if count == 0:
                            if not parent:
                                self.report_warning('No comments received - assuming end of comments')
                            continuation = None
                    break

            # Deprecated response structure
            elif isinstance(continuation_contents, dict):
                known_continuation_renderers = ('itemSectionContinuation', 'commentRepliesContinuation')
                for key, continuation_renderer in continuation_contents.items():
                    if key not in known_continuation_renderers:
                        continue
                    if not isinstance(continuation_renderer, dict):
                        continue
                    if is_first_continuation:
                        header_continuation_items = [continuation_renderer.get('header') or {}]
                        total_comments, continuation = extract_header(header_continuation_items)
                        if total_comments:
                            yield total_comments
                        is_first_continuation = False
                        if continuation:
                            break

                    # Sometimes YouTube provides a continuation without any comments
                    # In most cases we end up just downloading these with very little comments to come.
                    count = 0
                    for count, entry in enumerate(extract_thread(continuation_renderer.get('contents') or {})):
                        yield entry
                    continuation = self._extract_continuation(continuation_renderer)
                    if count == 0:
                        if not parent:
                            self.report_warning('No comments received - assuming end of comments')
                        continuation = None
                    break
2281
2282 @staticmethod
2283 def _generate_comment_continuation(video_id):
2284 """
2285 Generates initial comment section continuation token from given video id
2286 """
2287 b64_vid_id = base64.b64encode(bytes(video_id.encode('utf-8')))
2288 parts = ('Eg0SCw==', b64_vid_id, 'GAYyJyIRIgs=', b64_vid_id, 'MAB4AjAAQhBjb21tZW50cy1zZWN0aW9u')
2289 new_continuation_intlist = list(itertools.chain.from_iterable(
2290 [bytes_to_intlist(base64.b64decode(part)) for part in parts]))
2291 return base64.b64encode(intlist_to_bytes(new_continuation_intlist)).decode('utf-8')
2292
2293 def _extract_comments(self, ytcfg, video_id, contents, webpage):
2294 """Entry for comment extraction"""
2295 def _real_comment_extract(contents):
2296 if isinstance(contents, list):
2297 for entry in contents:
2298 for key, renderer in entry.items():
2299 if key not in known_entry_comment_renderers:
2300 continue
2301 yield from self._comment_entries(
2302 renderer, video_id=video_id, ytcfg=ytcfg,
2303 identity_token=self._extract_identity_token(webpage, item_id=video_id),
2304 account_syncid=self._extract_account_syncid(ytcfg))
2305 break
2306 comments = []
2307 known_entry_comment_renderers = ('itemSectionRenderer',)
2308 estimated_total = 0
2309 max_comments = int_or_none(self._configuration_arg('max_comments', [''])[0]) or float('inf')
2310
2311 try:
2312 for comment in _real_comment_extract(contents):
2313 if len(comments) >= max_comments:
2314 break
2315 if isinstance(comment, int):
2316 estimated_total = comment
2317 continue
2318 comments.append(comment)
2319 except KeyboardInterrupt:
2320 self.to_screen('Interrupted by user')
2321 self.to_screen('Downloaded %d/%d comments' % (len(comments), estimated_total))
2322 return {
2323 'comments': comments,
2324 'comment_count': len(comments),
2325 }
2326
2327 @staticmethod
2328 def _generate_player_context(sts=None):
2329 context = {
2330 'html5Preference': 'HTML5_PREF_WANTS',
2331 }
2332 if sts is not None:
2333 context['signatureTimestamp'] = sts
2334 return {
2335 'playbackContext': {
2336 'contentPlaybackContext': context
2337 },
2338 'contentCheckOk': True
2339 }
2340
2341 @staticmethod
2342 def _get_video_info_params(video_id, client='TVHTML5'):
2343 GVI_CLIENTS = {
2344 'ANDROID': {
2345 'c': 'ANDROID',
2346 'cver': '16.20',
2347 },
2348 'TVHTML5': {
2349 'c': 'TVHTML5',
2350 'cver': '6.20180913',
2351 },
2352 'IOS': {
2353 'c': 'IOS',
2354 'cver': '16.20'
2355 }
2356 }
2357 query = {
2358 'video_id': video_id,
2359 'eurl': 'https://youtube.googleapis.com/v/' + video_id,
2360 'html5': '1'
2361 }
2362 query.update(GVI_CLIENTS.get(client))
2363 return query
2364
2365 def _extract_player_response(self, client, video_id, master_ytcfg, player_ytcfg, identity_token, player_url, initial_pr):
2366
2367 session_index = self._extract_session_index(player_ytcfg, master_ytcfg)
2368 syncid = self._extract_account_syncid(player_ytcfg, master_ytcfg, initial_pr)
2369 sts = self._extract_signature_timestamp(video_id, player_url, master_ytcfg, fatal=False)
2370 headers = self.generate_api_headers(
2371 player_ytcfg, identity_token, syncid,
2372 default_client=self._YT_CLIENTS[client], session_index=session_index)
2373
2374 yt_query = {'videoId': video_id}
2375 yt_query.update(self._generate_player_context(sts))
2376 return self._extract_response(
2377 item_id=video_id, ep='player', query=yt_query,
2378 ytcfg=player_ytcfg, headers=headers, fatal=False,
2379 default_client=self._YT_CLIENTS[client],
2380 note='Downloading %s player API JSON' % client.replace('_', ' ').strip()
2381 ) or None
2382
    def _extract_age_gated_player_response(self, client, video_id, ytcfg, identity_token, player_url, initial_pr):
        """Retry extraction of an age-gated video.

        First tries the get_video_info endpoint with the client's '_agegate'
        variant; if that yields nothing, falls back to the embedded player
        (unless the embed config says embedding is blocked too).
        """
        gvi_client = self._YT_CLIENTS.get(f'_{client}_agegate')
        if not gvi_client:
            return

        # Attempt 1: get_video_info with the age-gate client
        pr = self._parse_json(traverse_obj(
            compat_parse_qs(self._download_webpage(
                self.http_scheme() + '//www.youtube.com/get_video_info', video_id,
                'Refetching age-gated %s info webpage' % gvi_client.lower(),
                'unable to download video info webpage', fatal=False,
                query=self._get_video_info_params(video_id, client=gvi_client))),
            ('player_response', 0), expected_type=str) or '{}', video_id)
        if pr:
            return pr

        # Attempt 2: the embedded player
        self.report_warning('Falling back to embedded-only age-gate workaround')
        embed_webpage = None
        if client == 'web' and 'configs' not in self._configuration_arg('player_skip'):
            embed_webpage = self._download_webpage(
                'https://www.youtube.com/embed/%s?html5=1' % video_id,
                video_id=video_id, note=f'Downloading age-gated {client} embed config')

        ytcfg_age = self.extract_ytcfg(video_id, embed_webpage) or {}
        # If we extracted the embed webpage, it'll tell us if we can view the video
        embedded_pr = self._parse_json(
            traverse_obj(ytcfg_age, ('PLAYER_VARS', 'embedded_player_response'), expected_type=str) or '{}',
            video_id=video_id)
        embedded_ps_reason = traverse_obj(embedded_pr, ('playabilityStatus', 'reason'), expected_type=str) or ''
        if embedded_ps_reason in self._AGE_GATE_REASONS:
            # Embedded playback is age-blocked as well; give up
            return
        return self._extract_player_response(
            f'_{client}_embedded', video_id,
            ytcfg_age or ytcfg, ytcfg_age if client == 'web' else {},
            identity_token, player_url, initial_pr)
2417
2418 def _get_requested_clients(self, url, smuggled_data):
2419 requested_clients = [client for client in self._configuration_arg('player_client')
2420 if client[:0] != '_' and client in self._YT_CLIENTS]
2421 if not requested_clients:
2422 requested_clients = ['android', 'web']
2423
2424 if smuggled_data.get('is_music_url') or self.is_music_url(url):
2425 requested_clients.extend(
2426 f'{client}_music' for client in requested_clients if not client.endswith('_music'))
2427
2428 return orderedSet(requested_clients)
2429
    def _extract_player_responses(self, clients, video_id, webpage, master_ytcfg, player_url, identity_token):
        """Generator yielding one player-response dict per successful client.

        Reuses the webpage-embedded response for the 'web' client; once an
        age-gate is detected, all subsequent clients go straight to the
        age-gate workaround.
        """
        initial_pr = None
        if webpage:
            initial_pr = self._extract_yt_initial_variable(
                webpage, self._YT_INITIAL_PLAYER_RESPONSE_RE,
                video_id, 'initial player response')

        # Sticky across clients: after the first age-gated response, skip the
        # normal request for the remaining clients
        age_gated = False
        for client in clients:
            player_ytcfg = master_ytcfg if client == 'web' else {}
            if age_gated:
                pr = None
            elif client == 'web' and initial_pr:
                # The webpage already embeds the web player response
                pr = initial_pr
            else:
                if client == 'web_music' and 'configs' not in self._configuration_arg('player_skip'):
                    ytm_webpage = self._download_webpage(
                        'https://music.youtube.com',
                        video_id, fatal=False, note='Downloading remix client config')
                    player_ytcfg = self.extract_ytcfg(video_id, ytm_webpage) or {}
                pr = self._extract_player_response(
                    client, video_id, player_ytcfg or master_ytcfg, player_ytcfg, identity_token, player_url, initial_pr)
            if pr:
                yield pr
            if age_gated or traverse_obj(pr, ('playabilityStatus', 'reason')) in self._AGE_GATE_REASONS:
                age_gated = True
                pr = self._extract_age_gated_player_response(
                    client, video_id, player_ytcfg or master_ytcfg, identity_token, player_url, initial_pr)
                if pr:
                    yield pr
        # Android player_response does not have microFormats which are needed for
        # extraction of some data. So we return the initial_pr with formats
        # stripped out even if not requested by the user
        # See: https://github.com/yt-dlp/yt-dlp/issues/501
        if initial_pr and 'web' not in clients:
            initial_pr['streamingData'] = None
            yield initial_pr
2467
2468 def _extract_formats(self, streaming_data, video_id, player_url, is_live):
2469 itags, stream_ids = [], []
2470 itag_qualities = {}
2471 q = qualities([
2472 # "tiny" is the smallest video-only format. But some audio-only formats
2473 # was also labeled "tiny". It is not clear if such formats still exist
2474 'tiny', 'audio_quality_low', 'audio_quality_medium', 'audio_quality_high', # Audio only formats
2475 'small', 'medium', 'large', 'hd720', 'hd1080', 'hd1440', 'hd2160', 'hd2880', 'highres'
2476 ])
2477 streaming_formats = traverse_obj(streaming_data, (..., ('formats', 'adaptiveFormats'), ...), default=[])
2478
2479 for fmt in streaming_formats:
2480 if fmt.get('targetDurationSec') or fmt.get('drmFamilies'):
2481 continue
2482
2483 itag = str_or_none(fmt.get('itag'))
2484 audio_track = fmt.get('audioTrack') or {}
2485 stream_id = '%s.%s' % (itag or '', audio_track.get('id', ''))
2486 if stream_id in stream_ids:
2487 continue
2488
2489 quality = fmt.get('quality')
2490 if quality == 'tiny' or not quality:
2491 quality = fmt.get('audioQuality', '').lower() or quality
2492 if itag and quality:
2493 itag_qualities[itag] = quality
2494 # FORMAT_STREAM_TYPE_OTF(otf=1) requires downloading the init fragment
2495 # (adding `&sq=0` to the URL) and parsing emsg box to determine the
2496 # number of fragment that would subsequently requested with (`&sq=N`)
2497 if fmt.get('type') == 'FORMAT_STREAM_TYPE_OTF':
2498 continue
2499
2500 fmt_url = fmt.get('url')
2501 if not fmt_url:
2502 sc = compat_parse_qs(fmt.get('signatureCipher'))
2503 fmt_url = url_or_none(try_get(sc, lambda x: x['url'][0]))
2504 encrypted_sig = try_get(sc, lambda x: x['s'][0])
2505 if not (sc and fmt_url and encrypted_sig):
2506 continue
2507 if not player_url:
2508 continue
2509 signature = self._decrypt_signature(sc['s'][0], video_id, player_url)
2510 sp = try_get(sc, lambda x: x['sp'][0]) or 'signature'
2511 fmt_url += '&' + sp + '=' + signature
2512
2513 if itag:
2514 itags.append(itag)
2515 stream_ids.append(stream_id)
2516
2517 tbr = float_or_none(
2518 fmt.get('averageBitrate') or fmt.get('bitrate'), 1000)
2519 dct = {
2520 'asr': int_or_none(fmt.get('audioSampleRate')),
2521 'filesize': int_or_none(fmt.get('contentLength')),
2522 'format_id': itag,
2523 'format_note': ', '.join(filter(None, (
2524 audio_track.get('displayName'), fmt.get('qualityLabel') or quality))),
2525 'fps': int_or_none(fmt.get('fps')),
2526 'height': int_or_none(fmt.get('height')),
2527 'quality': q(quality),
2528 'tbr': tbr,
2529 'url': fmt_url,
2530 'width': fmt.get('width'),
2531 'language': audio_track.get('id', '').split('.')[0],
2532 }
2533 mime_mobj = re.match(
2534 r'((?:[^/]+)/(?:[^;]+))(?:;\s*codecs="([^"]+)")?', fmt.get('mimeType') or '')
2535 if mime_mobj:
2536 dct['ext'] = mimetype2ext(mime_mobj.group(1))
2537 dct.update(parse_codecs(mime_mobj.group(2)))
2538 # The 3gp format in android client has a quality of "small",
2539 # but is actually worse than all other formats
2540 if dct['ext'] == '3gp':
2541 dct['quality'] = q('tiny')
2542 dct['preference'] = -10
2543 no_audio = dct.get('acodec') == 'none'
2544 no_video = dct.get('vcodec') == 'none'
2545 if no_audio:
2546 dct['vbr'] = tbr
2547 if no_video:
2548 dct['abr'] = tbr
2549 if no_audio or no_video:
2550 dct['downloader_options'] = {
2551 # Youtube throttles chunks >~10M
2552 'http_chunk_size': 10485760,
2553 }
2554 if dct.get('ext'):
2555 dct['container'] = dct['ext'] + '_dash'
2556 yield dct
2557
2558 skip_manifests = self._configuration_arg('skip')
2559 get_dash = not is_live and 'dash' not in skip_manifests and self.get_param('youtube_include_dash_manifest', True)
2560 get_hls = 'hls' not in skip_manifests and self.get_param('youtube_include_hls_manifest', True)
2561
2562 for sd in streaming_data:
2563 hls_manifest_url = get_hls and sd.get('hlsManifestUrl')
2564 if hls_manifest_url:
2565 for f in self._extract_m3u8_formats(
2566 hls_manifest_url, video_id, 'mp4', fatal=False):
2567 itag = self._search_regex(
2568 r'/itag/(\d+)', f['url'], 'itag', default=None)
2569 if itag in itags:
2570 continue
2571 if itag:
2572 f['format_id'] = itag
2573 itags.append(itag)
2574 yield f
2575
2576 dash_manifest_url = get_dash and sd.get('dashManifestUrl')
2577 if dash_manifest_url:
2578 for f in self._extract_mpd_formats(
2579 dash_manifest_url, video_id, fatal=False):
2580 itag = f['format_id']
2581 if itag in itags:
2582 continue
2583 if itag:
2584 itags.append(itag)
2585 if itag in itag_qualities:
2586 f['quality'] = q(itag_qualities[itag])
2587 filesize = int_or_none(self._search_regex(
2588 r'/clen/(\d+)', f.get('fragment_base_url')
2589 or f['url'], 'file size', default=None))
2590 if filesize:
2591 f['filesize'] = filesize
2592 yield f
2593
2594 def _real_extract(self, url):
2595 url, smuggled_data = unsmuggle_url(url, {})
2596 video_id = self._match_id(url)
2597
2598 base_url = self.http_scheme() + '//www.youtube.com/'
2599 webpage_url = base_url + 'watch?v=' + video_id
2600 webpage = self._download_webpage(
2601 webpage_url + '&bpctr=9999999999&has_verified=1', video_id, fatal=False)
2602
2603 master_ytcfg = self.extract_ytcfg(video_id, webpage) or self._get_default_ytcfg()
2604 player_url = self._extract_player_url(master_ytcfg, webpage)
2605 identity_token = self._extract_identity_token(webpage, video_id)
2606
2607 player_responses = list(self._extract_player_responses(
2608 self._get_requested_clients(url, smuggled_data),
2609 video_id, webpage, master_ytcfg, player_url, identity_token))
2610
2611 get_first = lambda obj, keys, **kwargs: traverse_obj(obj, (..., *variadic(keys)), **kwargs, get_all=False)
2612
2613 playability_statuses = traverse_obj(
2614 player_responses, (..., 'playabilityStatus'), expected_type=dict, default=[])
2615
2616 trailer_video_id = get_first(
2617 playability_statuses,
2618 ('errorScreen', 'playerLegacyDesktopYpcTrailerRenderer', 'trailerVideoId'),
2619 expected_type=str)
2620 if trailer_video_id:
2621 return self.url_result(
2622 trailer_video_id, self.ie_key(), trailer_video_id)
2623
2624 search_meta = ((lambda x: self._html_search_meta(x, webpage, default=None))
2625 if webpage else (lambda x: None))
2626
2627 video_details = traverse_obj(
2628 player_responses, (..., 'videoDetails'), expected_type=dict, default=[])
2629 microformats = traverse_obj(
2630 player_responses, (..., 'microformat', 'playerMicroformatRenderer'),
2631 expected_type=dict, default=[])
2632 video_title = (
2633 get_first(video_details, 'title')
2634 or self._get_text(microformats, (..., 'title'))
2635 or search_meta(['og:title', 'twitter:title', 'title']))
2636 video_description = get_first(video_details, 'shortDescription')
2637
2638 if not smuggled_data.get('force_singlefeed', False):
2639 if not self.get_param('noplaylist'):
2640 multifeed_metadata_list = get_first(
2641 player_responses,
2642 ('multicamera', 'playerLegacyMulticameraRenderer', 'metadataList'),
2643 expected_type=str)
2644 if multifeed_metadata_list:
2645 entries = []
2646 feed_ids = []
2647 for feed in multifeed_metadata_list.split(','):
2648 # Unquote should take place before split on comma (,) since textual
2649 # fields may contain comma as well (see
2650 # https://github.com/ytdl-org/youtube-dl/issues/8536)
2651 feed_data = compat_parse_qs(
2652 compat_urllib_parse_unquote_plus(feed))
2653
2654 def feed_entry(name):
2655 return try_get(
2656 feed_data, lambda x: x[name][0], compat_str)
2657
2658 feed_id = feed_entry('id')
2659 if not feed_id:
2660 continue
2661 feed_title = feed_entry('title')
2662 title = video_title
2663 if feed_title:
2664 title += ' (%s)' % feed_title
2665 entries.append({
2666 '_type': 'url_transparent',
2667 'ie_key': 'Youtube',
2668 'url': smuggle_url(
2669 '%swatch?v=%s' % (base_url, feed_data['id'][0]),
2670 {'force_singlefeed': True}),
2671 'title': title,
2672 })
2673 feed_ids.append(feed_id)
2674 self.to_screen(
2675 'Downloading multifeed video (%s) - add --no-playlist to just download video %s'
2676 % (', '.join(feed_ids), video_id))
2677 return self.playlist_result(
2678 entries, video_id, video_title, video_description)
2679 else:
2680 self.to_screen('Downloading just video %s because of --no-playlist' % video_id)
2681
2682 live_broadcast_details = traverse_obj(microformats, (..., 'liveBroadcastDetails'))
2683 is_live = get_first(video_details, 'isLive')
2684 if is_live is None:
2685 is_live = get_first(live_broadcast_details, 'isLiveNow')
2686
2687 streaming_data = traverse_obj(player_responses, (..., 'streamingData'), default=[])
2688 formats = list(self._extract_formats(streaming_data, video_id, player_url, is_live))
2689
2690 if not formats:
2691 if not self.get_param('allow_unplayable_formats') and traverse_obj(streaming_data, (..., 'licenseInfos')):
2692 self.raise_no_formats(
2693 'This video is DRM protected.', expected=True)
2694 pemr = get_first(
2695 playability_statuses,
2696 ('errorScreen', 'playerErrorMessageRenderer'), expected_type=dict) or {}
2697 reason = self._get_text(pemr, 'reason') or get_first(playability_statuses, 'reason')
2698 subreason = clean_html(self._get_text(pemr, 'subreason') or '')
2699 if subreason:
2700 if subreason == 'The uploader has not made this video available in your country.':
2701 countries = get_first(microformats, 'availableCountries')
2702 if not countries:
2703 regions_allowed = search_meta('regionsAllowed')
2704 countries = regions_allowed.split(',') if regions_allowed else None
2705 self.raise_geo_restricted(subreason, countries, metadata_available=True)
2706 reason += f'. {subreason}'
2707 if reason:
2708 self.raise_no_formats(reason, expected=True)
2709
2710 for f in formats:
2711 # TODO: detect if throttled
2712 if '&n=' in f['url']: # possibly throttled
2713 f['source_preference'] = -10
2714 # note = f.get('format_note')
2715 # f['format_note'] = f'{note} (throttled)' if note else '(throttled)'
2716
2717 self._sort_formats(formats)
2718
2719 keywords = get_first(video_details, 'keywords', expected_type=list) or []
2720 if not keywords and webpage:
2721 keywords = [
2722 unescapeHTML(m.group('content'))
2723 for m in re.finditer(self._meta_regex('og:video:tag'), webpage)]
2724 for keyword in keywords:
2725 if keyword.startswith('yt:stretch='):
2726 mobj = re.search(r'(\d+)\s*:\s*(\d+)', keyword)
2727 if mobj:
2728 # NB: float is intentional for forcing float division
2729 w, h = (float(v) for v in mobj.groups())
2730 if w > 0 and h > 0:
2731 ratio = w / h
2732 for f in formats:
2733 if f.get('vcodec') != 'none':
2734 f['stretched_ratio'] = ratio
2735 break
2736
2737 thumbnails = []
2738 thumbnail_dicts = traverse_obj(
2739 (video_details, microformats), (..., ..., 'thumbnail', 'thumbnails', ...),
2740 expected_type=dict, default=[])
2741 for thumbnail in thumbnail_dicts:
2742 thumbnail_url = thumbnail.get('url')
2743 if not thumbnail_url:
2744 continue
2745 # Sometimes youtube gives a wrong thumbnail URL. See:
2746 # https://github.com/yt-dlp/yt-dlp/issues/233
2747 # https://github.com/ytdl-org/youtube-dl/issues/28023
2748 if 'maxresdefault' in thumbnail_url:
2749 thumbnail_url = thumbnail_url.split('?')[0]
2750 thumbnails.append({
2751 'url': thumbnail_url,
2752 'height': int_or_none(thumbnail.get('height')),
2753 'width': int_or_none(thumbnail.get('width')),
2754 })
2755 thumbnail_url = search_meta(['og:image', 'twitter:image'])
2756 if thumbnail_url:
2757 thumbnails.append({
2758 'url': thumbnail_url,
2759 })
2760 # The best resolution thumbnails sometimes does not appear in the webpage
2761 # See: https://github.com/ytdl-org/youtube-dl/issues/29049, https://github.com/yt-dlp/yt-dlp/issues/340
2762 # List of possible thumbnails - Ref: <https://stackoverflow.com/a/20542029>
2763 hq_thumbnail_names = ['maxresdefault', 'hq720', 'sddefault', 'sd1', 'sd2', 'sd3']
2764 guaranteed_thumbnail_names = [
2765 'hqdefault', 'hq1', 'hq2', 'hq3', '0',
2766 'mqdefault', 'mq1', 'mq2', 'mq3',
2767 'default', '1', '2', '3'
2768 ]
2769 thumbnail_names = hq_thumbnail_names + guaranteed_thumbnail_names
2770 n_thumbnail_names = len(thumbnail_names)
2771
2772 thumbnails.extend({
2773 'url': 'https://i.ytimg.com/vi{webp}/{video_id}/{name}{live}.{ext}'.format(
2774 video_id=video_id, name=name, ext=ext,
2775 webp='_webp' if ext == 'webp' else '', live='_live' if is_live else ''),
2776 '_test_url': name in hq_thumbnail_names,
2777 } for name in thumbnail_names for ext in ('webp', 'jpg'))
2778 for thumb in thumbnails:
2779 i = next((i for i, t in enumerate(thumbnail_names) if f'/{video_id}/{t}' in thumb['url']), n_thumbnail_names)
2780 thumb['preference'] = (0 if '.webp' in thumb['url'] else -1) - (2 * i)
2781 self._remove_duplicate_formats(thumbnails)
2782
2783 category = get_first(microformats, 'category') or search_meta('genre')
2784 channel_id = str_or_none(
2785 get_first(video_details, 'channelId')
2786 or get_first(microformats, 'externalChannelId')
2787 or search_meta('channelId'))
2788 duration = int_or_none(
2789 get_first(video_details, 'lengthSeconds')
2790 or get_first(microformats, 'lengthSeconds')
2791 or parse_duration(search_meta('duration'))) or None
2792 owner_profile_url = get_first(microformats, 'ownerProfileUrl')
2793
2794 live_content = get_first(video_details, 'isLiveContent')
2795 is_upcoming = get_first(video_details, 'isUpcoming')
2796 if is_live is None:
2797 if is_upcoming or live_content is False:
2798 is_live = False
2799 if is_upcoming is None and (live_content or is_live):
2800 is_upcoming = False
2801 live_starttime = parse_iso8601(get_first(live_broadcast_details, 'startTimestamp'))
2802 live_endtime = parse_iso8601(get_first(live_broadcast_details, 'endTimestamp'))
2803 if not duration and live_endtime and live_starttime:
2804 duration = live_endtime - live_starttime
2805
2806 info = {
2807 'id': video_id,
2808 'title': self._live_title(video_title) if is_live else video_title,
2809 'formats': formats,
2810 'thumbnails': thumbnails,
2811 'description': video_description,
2812 'upload_date': unified_strdate(
2813 get_first(microformats, 'uploadDate')
2814 or search_meta('uploadDate')),
2815 'uploader': get_first(video_details, 'author'),
2816 'uploader_id': self._search_regex(r'/(?:channel|user)/([^/?&#]+)', owner_profile_url, 'uploader id') if owner_profile_url else None,
2817 'uploader_url': owner_profile_url,
2818 'channel_id': channel_id,
2819 'channel_url': f'https://www.youtube.com/channel/{channel_id}' if channel_id else None,
2820 'duration': duration,
2821 'view_count': int_or_none(
2822 get_first((video_details, microformats), (..., 'viewCount'))
2823 or search_meta('interactionCount')),
2824 'average_rating': float_or_none(get_first(video_details, 'averageRating')),
2825 'age_limit': 18 if (
2826 get_first(microformats, 'isFamilySafe') is False
2827 or search_meta('isFamilyFriendly') == 'false'
2828 or search_meta('og:restrictions:age') == '18+') else 0,
2829 'webpage_url': webpage_url,
2830 'categories': [category] if category else None,
2831 'tags': keywords,
2832 'playable_in_embed': get_first(playability_statuses, 'playableInEmbed'),
2833 'is_live': is_live,
2834 'was_live': (False if is_live or is_upcoming or live_content is False
2835 else None if is_live is None or is_upcoming is None
2836 else live_content),
2837 'live_status': 'is_upcoming' if is_upcoming else None, # rest will be set by YoutubeDL
2838 'release_timestamp': live_starttime,
2839 }
2840
2841 pctr = get_first(player_responses, ('captions', 'playerCaptionsTracklistRenderer'), expected_type=dict)
2842 subtitles = {}
2843 if pctr:
2844 def process_language(container, base_url, lang_code, sub_name, query):
2845 lang_subs = container.setdefault(lang_code, [])
2846 for fmt in self._SUBTITLE_FORMATS:
2847 query.update({
2848 'fmt': fmt,
2849 })
2850 lang_subs.append({
2851 'ext': fmt,
2852 'url': update_url_query(base_url, query),
2853 'name': sub_name,
2854 })
2855
2856 for caption_track in (pctr.get('captionTracks') or []):
2857 base_url = caption_track.get('baseUrl')
2858 if not base_url:
2859 continue
2860 if caption_track.get('kind') != 'asr':
2861 lang_code = (
2862 remove_start(caption_track.get('vssId') or '', '.').replace('.', '-')
2863 or caption_track.get('languageCode'))
2864 if not lang_code:
2865 continue
2866 process_language(
2867 subtitles, base_url, lang_code,
2868 try_get(caption_track, lambda x: x['name']['simpleText']),
2869 {})
2870 continue
2871 automatic_captions = {}
2872 for translation_language in (pctr.get('translationLanguages') or []):
2873 translation_language_code = translation_language.get('languageCode')
2874 if not translation_language_code:
2875 continue
2876 process_language(
2877 automatic_captions, base_url, translation_language_code,
2878 self._get_text(translation_language.get('languageName'), max_runs=1),
2879 {'tlang': translation_language_code})
2880 info['automatic_captions'] = automatic_captions
2881 info['subtitles'] = subtitles
2882
2883 parsed_url = compat_urllib_parse_urlparse(url)
2884 for component in [parsed_url.fragment, parsed_url.query]:
2885 query = compat_parse_qs(component)
2886 for k, v in query.items():
2887 for d_k, s_ks in [('start', ('start', 't')), ('end', ('end',))]:
2888 d_k += '_time'
2889 if d_k not in info and k in s_ks:
2890 info[d_k] = parse_duration(query[k][0])
2891
2892 # Youtube Music Auto-generated description
2893 if video_description:
2894 mobj = re.search(r'(?s)(?P<track>[^·\n]+)·(?P<artist>[^\n]+)\n+(?P<album>[^\n]+)(?:.+?℗\s*(?P<release_year>\d{4})(?!\d))?(?:.+?Released on\s*:\s*(?P<release_date>\d{4}-\d{2}-\d{2}))?(.+?\nArtist\s*:\s*(?P<clean_artist>[^\n]+))?.+\nAuto-generated by YouTube\.\s*$', video_description)
2895 if mobj:
2896 release_year = mobj.group('release_year')
2897 release_date = mobj.group('release_date')
2898 if release_date:
2899 release_date = release_date.replace('-', '')
2900 if not release_year:
2901 release_year = release_date[:4]
2902 info.update({
2903 'album': mobj.group('album'.strip()),
2904 'artist': mobj.group('clean_artist') or ', '.join(a.strip() for a in mobj.group('artist').split('·')),
2905 'track': mobj.group('track').strip(),
2906 'release_date': release_date,
2907 'release_year': int_or_none(release_year),
2908 })
2909
2910 initial_data = None
2911 if webpage:
2912 initial_data = self._extract_yt_initial_variable(
2913 webpage, self._YT_INITIAL_DATA_RE, video_id,
2914 'yt initial data')
2915 if not initial_data:
2916 headers = self.generate_api_headers(
2917 master_ytcfg, identity_token, self._extract_account_syncid(master_ytcfg),
2918 session_index=self._extract_session_index(master_ytcfg))
2919
2920 initial_data = self._extract_response(
2921 item_id=video_id, ep='next', fatal=False,
2922 ytcfg=master_ytcfg, headers=headers, query={'videoId': video_id},
2923 note='Downloading initial data API JSON')
2924
2925 try:
2926 # This will error if there is no livechat
2927 initial_data['contents']['twoColumnWatchNextResults']['conversationBar']['liveChatRenderer']['continuations'][0]['reloadContinuationData']['continuation']
2928 info['subtitles']['live_chat'] = [{
2929 'url': 'https://www.youtube.com/watch?v=%s' % video_id, # url is needed to set cookies
2930 'video_id': video_id,
2931 'ext': 'json',
2932 'protocol': 'youtube_live_chat' if is_live or is_upcoming else 'youtube_live_chat_replay',
2933 }]
2934 except (KeyError, IndexError, TypeError):
2935 pass
2936
2937 if initial_data:
2938 info['chapters'] = (
2939 self._extract_chapters_from_json(initial_data, duration)
2940 or self._extract_chapters_from_engagement_panel(initial_data, duration)
2941 or None)
2942
2943 contents = try_get(
2944 initial_data,
2945 lambda x: x['contents']['twoColumnWatchNextResults']['results']['results']['contents'],
2946 list) or []
2947 for content in contents:
2948 vpir = content.get('videoPrimaryInfoRenderer')
2949 if vpir:
2950 stl = vpir.get('superTitleLink')
2951 if stl:
2952 stl = self._get_text(stl)
2953 if try_get(
2954 vpir,
2955 lambda x: x['superTitleIcon']['iconType']) == 'LOCATION_PIN':
2956 info['location'] = stl
2957 else:
2958 mobj = re.search(r'(.+?)\s*S(\d+)\s*•\s*E(\d+)', stl)
2959 if mobj:
2960 info.update({
2961 'series': mobj.group(1),
2962 'season_number': int(mobj.group(2)),
2963 'episode_number': int(mobj.group(3)),
2964 })
2965 for tlb in (try_get(
2966 vpir,
2967 lambda x: x['videoActions']['menuRenderer']['topLevelButtons'],
2968 list) or []):
2969 tbr = tlb.get('toggleButtonRenderer') or {}
2970 for getter, regex in [(
2971 lambda x: x['defaultText']['accessibility']['accessibilityData'],
2972 r'(?P<count>[\d,]+)\s*(?P<type>(?:dis)?like)'), ([
2973 lambda x: x['accessibility'],
2974 lambda x: x['accessibilityData']['accessibilityData'],
2975 ], r'(?P<type>(?:dis)?like) this video along with (?P<count>[\d,]+) other people')]:
2976 label = (try_get(tbr, getter, dict) or {}).get('label')
2977 if label:
2978 mobj = re.match(regex, label)
2979 if mobj:
2980 info[mobj.group('type') + '_count'] = str_to_int(mobj.group('count'))
2981 break
2982 sbr_tooltip = try_get(
2983 vpir, lambda x: x['sentimentBar']['sentimentBarRenderer']['tooltip'])
2984 if sbr_tooltip:
2985 like_count, dislike_count = sbr_tooltip.split(' / ')
2986 info.update({
2987 'like_count': str_to_int(like_count),
2988 'dislike_count': str_to_int(dislike_count),
2989 })
2990 vsir = content.get('videoSecondaryInfoRenderer')
2991 if vsir:
2992 info['channel'] = self._get_text(try_get(
2993 vsir,
2994 lambda x: x['owner']['videoOwnerRenderer']['title'],
2995 dict))
2996 rows = try_get(
2997 vsir,
2998 lambda x: x['metadataRowContainer']['metadataRowContainerRenderer']['rows'],
2999 list) or []
3000 multiple_songs = False
3001 for row in rows:
3002 if try_get(row, lambda x: x['metadataRowRenderer']['hasDividerLine']) is True:
3003 multiple_songs = True
3004 break
3005 for row in rows:
3006 mrr = row.get('metadataRowRenderer') or {}
3007 mrr_title = mrr.get('title')
3008 if not mrr_title:
3009 continue
3010 mrr_title = self._get_text(mrr['title'])
3011 mrr_contents_text = self._get_text(mrr['contents'][0])
3012 if mrr_title == 'License':
3013 info['license'] = mrr_contents_text
3014 elif not multiple_songs:
3015 if mrr_title == 'Album':
3016 info['album'] = mrr_contents_text
3017 elif mrr_title == 'Artist':
3018 info['artist'] = mrr_contents_text
3019 elif mrr_title == 'Song':
3020 info['track'] = mrr_contents_text
3021
3022 fallbacks = {
3023 'channel': 'uploader',
3024 'channel_id': 'uploader_id',
3025 'channel_url': 'uploader_url',
3026 }
3027 for to, frm in fallbacks.items():
3028 if not info.get(to):
3029 info[to] = info.get(frm)
3030
3031 for s_k, d_k in [('artist', 'creator'), ('track', 'alt_title')]:
3032 v = info.get(s_k)
3033 if v:
3034 info[d_k] = v
3035
3036 is_private = get_first(video_details, 'isPrivate', expected_type=bool)
3037 is_unlisted = get_first(microformats, 'isUnlisted', expected_type=bool)
3038 is_membersonly = None
3039 is_premium = None
3040 if initial_data and is_private is not None:
3041 is_membersonly = False
3042 is_premium = False
3043 contents = try_get(initial_data, lambda x: x['contents']['twoColumnWatchNextResults']['results']['results']['contents'], list) or []
3044 badge_labels = set()
3045 for content in contents:
3046 if not isinstance(content, dict):
3047 continue
3048 badge_labels.update(self._extract_badges(content.get('videoPrimaryInfoRenderer')))
3049 for badge_label in badge_labels:
3050 if badge_label.lower() == 'members only':
3051 is_membersonly = True
3052 elif badge_label.lower() == 'premium':
3053 is_premium = True
3054 elif badge_label.lower() == 'unlisted':
3055 is_unlisted = True
3056
3057 info['availability'] = self._availability(
3058 is_private=is_private,
3059 needs_premium=is_premium,
3060 needs_subscription=is_membersonly,
3061 needs_auth=info['age_limit'] >= 18,
3062 is_unlisted=None if is_private is None else is_unlisted)
3063
3064 # get xsrf for annotations or comments
3065 get_annotations = self.get_param('writeannotations', False)
3066 get_comments = self.get_param('getcomments', False)
3067 if get_annotations or get_comments:
3068 xsrf_token = None
3069 if master_ytcfg:
3070 xsrf_token = try_get(master_ytcfg, lambda x: x['XSRF_TOKEN'], compat_str)
3071 if not xsrf_token:
3072 xsrf_token = self._search_regex(
3073 r'([\'"])XSRF_TOKEN\1\s*:\s*([\'"])(?P<xsrf_token>(?:(?!\2).)+)\2',
3074 webpage, 'xsrf token', group='xsrf_token', fatal=False)
3075
3076 # annotations
3077 if get_annotations:
3078 invideo_url = get_first(
3079 player_responses,
3080 ('annotations', 0, 'playerAnnotationsUrlsRenderer', 'invideoUrl'),
3081 expected_type=str)
3082 if xsrf_token and invideo_url:
3083 xsrf_field_name = None
3084 if master_ytcfg:
3085 xsrf_field_name = try_get(master_ytcfg, lambda x: x['XSRF_FIELD_NAME'], compat_str)
3086 if not xsrf_field_name:
3087 xsrf_field_name = self._search_regex(
3088 r'([\'"])XSRF_FIELD_NAME\1\s*:\s*([\'"])(?P<xsrf_field_name>\w+)\2',
3089 webpage, 'xsrf field name',
3090 group='xsrf_field_name', default='session_token')
3091 info['annotations'] = self._download_webpage(
3092 self._proto_relative_url(invideo_url),
3093 video_id, note='Downloading annotations',
3094 errnote='Unable to download video annotations', fatal=False,
3095 data=urlencode_postdata({xsrf_field_name: xsrf_token}))
3096
3097 if get_comments:
3098 info['__post_extractor'] = lambda: self._extract_comments(master_ytcfg, video_id, contents, webpage)
3099
3100 self.mark_watched(video_id, player_responses)
3101
3102 return info
3103
3104
3105 class YoutubeTabIE(YoutubeBaseInfoExtractor):
3106 IE_DESC = 'YouTube.com tab'
3107 _VALID_URL = r'''(?x)
3108 https?://
3109 (?:\w+\.)?
3110 (?:
3111 youtube(?:kids)?\.com|
3112 invidio\.us
3113 )/
3114 (?:
3115 (?P<channel_type>channel|c|user|browse)/|
3116 (?P<not_channel>
3117 feed/|hashtag/|
3118 (?:playlist|watch)\?.*?\blist=
3119 )|
3120 (?!(?:%s)\b) # Direct URLs
3121 )
3122 (?P<id>[^/?\#&]+)
3123 ''' % YoutubeBaseInfoExtractor._RESERVED_NAMES
3124 IE_NAME = 'youtube:tab'
3125
3126 _TESTS = [{
3127 'note': 'playlists, multipage',
3128 'url': 'https://www.youtube.com/c/ИгорьКлейнер/playlists?view=1&flow=grid',
3129 'playlist_mincount': 94,
3130 'info_dict': {
3131 'id': 'UCqj7Cz7revf5maW9g5pgNcg',
3132 'title': 'Игорь Клейнер - Playlists',
3133 'description': 'md5:be97ee0f14ee314f1f002cf187166ee2',
3134 'uploader': 'Игорь Клейнер',
3135 'uploader_id': 'UCqj7Cz7revf5maW9g5pgNcg',
3136 },
3137 }, {
3138 'note': 'playlists, multipage, different order',
3139 'url': 'https://www.youtube.com/user/igorkle1/playlists?view=1&sort=dd',
3140 'playlist_mincount': 94,
3141 'info_dict': {
3142 'id': 'UCqj7Cz7revf5maW9g5pgNcg',
3143 'title': 'Игорь Клейнер - Playlists',
3144 'description': 'md5:be97ee0f14ee314f1f002cf187166ee2',
3145 'uploader_id': 'UCqj7Cz7revf5maW9g5pgNcg',
3146 'uploader': 'Игорь Клейнер',
3147 },
3148 }, {
3149 'note': 'playlists, series',
3150 'url': 'https://www.youtube.com/c/3blue1brown/playlists?view=50&sort=dd&shelf_id=3',
3151 'playlist_mincount': 5,
3152 'info_dict': {
3153 'id': 'UCYO_jab_esuFRV4b17AJtAw',
3154 'title': '3Blue1Brown - Playlists',
3155 'description': 'md5:e1384e8a133307dd10edee76e875d62f',
3156 'uploader_id': 'UCYO_jab_esuFRV4b17AJtAw',
3157 'uploader': '3Blue1Brown',
3158 },
3159 }, {
3160 'note': 'playlists, singlepage',
3161 'url': 'https://www.youtube.com/user/ThirstForScience/playlists',
3162 'playlist_mincount': 4,
3163 'info_dict': {
3164 'id': 'UCAEtajcuhQ6an9WEzY9LEMQ',
3165 'title': 'ThirstForScience - Playlists',
3166 'description': 'md5:609399d937ea957b0f53cbffb747a14c',
3167 'uploader': 'ThirstForScience',
3168 'uploader_id': 'UCAEtajcuhQ6an9WEzY9LEMQ',
3169 }
3170 }, {
3171 'url': 'https://www.youtube.com/c/ChristophLaimer/playlists',
3172 'only_matching': True,
3173 }, {
3174 'note': 'basic, single video playlist',
3175 'url': 'https://www.youtube.com/playlist?list=PL4lCao7KL_QFVb7Iudeipvc2BCavECqzc',
3176 'info_dict': {
3177 'uploader_id': 'UCmlqkdCBesrv2Lak1mF_MxA',
3178 'uploader': 'Sergey M.',
3179 'id': 'PL4lCao7KL_QFVb7Iudeipvc2BCavECqzc',
3180 'title': 'youtube-dl public playlist',
3181 },
3182 'playlist_count': 1,
3183 }, {
3184 'note': 'empty playlist',
3185 'url': 'https://www.youtube.com/playlist?list=PL4lCao7KL_QFodcLWhDpGCYnngnHtQ-Xf',
3186 'info_dict': {
3187 'uploader_id': 'UCmlqkdCBesrv2Lak1mF_MxA',
3188 'uploader': 'Sergey M.',
3189 'id': 'PL4lCao7KL_QFodcLWhDpGCYnngnHtQ-Xf',
3190 'title': 'youtube-dl empty playlist',
3191 },
3192 'playlist_count': 0,
3193 }, {
3194 'note': 'Home tab',
3195 'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w/featured',
3196 'info_dict': {
3197 'id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
3198 'title': 'lex will - Home',
3199 'description': 'md5:2163c5d0ff54ed5f598d6a7e6211e488',
3200 'uploader': 'lex will',
3201 'uploader_id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
3202 },
3203 'playlist_mincount': 2,
3204 }, {
3205 'note': 'Videos tab',
3206 'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w/videos',
3207 'info_dict': {
3208 'id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
3209 'title': 'lex will - Videos',
3210 'description': 'md5:2163c5d0ff54ed5f598d6a7e6211e488',
3211 'uploader': 'lex will',
3212 'uploader_id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
3213 },
3214 'playlist_mincount': 975,
3215 }, {
3216 'note': 'Videos tab, sorted by popular',
3217 'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w/videos?view=0&sort=p&flow=grid',
3218 'info_dict': {
3219 'id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
3220 'title': 'lex will - Videos',
3221 'description': 'md5:2163c5d0ff54ed5f598d6a7e6211e488',
3222 'uploader': 'lex will',
3223 'uploader_id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
3224 },
3225 'playlist_mincount': 199,
3226 }, {
3227 'note': 'Playlists tab',
3228 'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w/playlists',
3229 'info_dict': {
3230 'id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
3231 'title': 'lex will - Playlists',
3232 'description': 'md5:2163c5d0ff54ed5f598d6a7e6211e488',
3233 'uploader': 'lex will',
3234 'uploader_id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
3235 },
3236 'playlist_mincount': 17,
3237 }, {
3238 'note': 'Community tab',
3239 'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w/community',
3240 'info_dict': {
3241 'id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
3242 'title': 'lex will - Community',
3243 'description': 'md5:2163c5d0ff54ed5f598d6a7e6211e488',
3244 'uploader': 'lex will',
3245 'uploader_id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
3246 },
3247 'playlist_mincount': 18,
3248 }, {
3249 'note': 'Channels tab',
3250 'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w/channels',
3251 'info_dict': {
3252 'id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
3253 'title': 'lex will - Channels',
3254 'description': 'md5:2163c5d0ff54ed5f598d6a7e6211e488',
3255 'uploader': 'lex will',
3256 'uploader_id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
3257 },
3258 'playlist_mincount': 12,
3259 }, {
3260 'note': 'Search tab',
3261 'url': 'https://www.youtube.com/c/3blue1brown/search?query=linear%20algebra',
3262 'playlist_mincount': 40,
3263 'info_dict': {
3264 'id': 'UCYO_jab_esuFRV4b17AJtAw',
3265 'title': '3Blue1Brown - Search - linear algebra',
3266 'description': 'md5:e1384e8a133307dd10edee76e875d62f',
3267 'uploader': '3Blue1Brown',
3268 'uploader_id': 'UCYO_jab_esuFRV4b17AJtAw',
3269 },
3270 }, {
3271 'url': 'https://invidio.us/channel/UCmlqkdCBesrv2Lak1mF_MxA',
3272 'only_matching': True,
3273 }, {
3274 'url': 'https://www.youtubekids.com/channel/UCmlqkdCBesrv2Lak1mF_MxA',
3275 'only_matching': True,
3276 }, {
3277 'url': 'https://music.youtube.com/channel/UCmlqkdCBesrv2Lak1mF_MxA',
3278 'only_matching': True,
3279 }, {
3280 'note': 'Playlist with deleted videos (#651). As a bonus, the video #51 is also twice in this list.',
3281 'url': 'https://www.youtube.com/playlist?list=PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC',
3282 'info_dict': {
3283 'title': '29C3: Not my department',
3284 'id': 'PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC',
3285 'uploader': 'Christiaan008',
3286 'uploader_id': 'UCEPzS1rYsrkqzSLNp76nrcg',
3287 'description': 'md5:a14dc1a8ef8307a9807fe136a0660268',
3288 },
3289 'playlist_count': 96,
3290 }, {
3291 'note': 'Large playlist',
3292 'url': 'https://www.youtube.com/playlist?list=UUBABnxM4Ar9ten8Mdjj1j0Q',
3293 'info_dict': {
3294 'title': 'Uploads from Cauchemar',
3295 'id': 'UUBABnxM4Ar9ten8Mdjj1j0Q',
3296 'uploader': 'Cauchemar',
3297 'uploader_id': 'UCBABnxM4Ar9ten8Mdjj1j0Q',
3298 },
3299 'playlist_mincount': 1123,
3300 }, {
3301 'note': 'even larger playlist, 8832 videos',
3302 'url': 'http://www.youtube.com/user/NASAgovVideo/videos',
3303 'only_matching': True,
3304 }, {
3305 'note': 'Buggy playlist: the webpage has a "Load more" button but it doesn\'t have more videos',
3306 'url': 'https://www.youtube.com/playlist?list=UUXw-G3eDE9trcvY2sBMM_aA',
3307 'info_dict': {
3308 'title': 'Uploads from Interstellar Movie',
3309 'id': 'UUXw-G3eDE9trcvY2sBMM_aA',
3310 'uploader': 'Interstellar Movie',
3311 'uploader_id': 'UCXw-G3eDE9trcvY2sBMM_aA',
3312 },
3313 'playlist_mincount': 21,
3314 }, {
3315 'note': 'Playlist with "show unavailable videos" button',
3316 'url': 'https://www.youtube.com/playlist?list=UUTYLiWFZy8xtPwxFwX9rV7Q',
3317 'info_dict': {
3318 'title': 'Uploads from Phim Siêu Nhân Nhật Bản',
3319 'id': 'UUTYLiWFZy8xtPwxFwX9rV7Q',
3320 'uploader': 'Phim Siêu Nhân Nhật Bản',
3321 'uploader_id': 'UCTYLiWFZy8xtPwxFwX9rV7Q',
3322 },
3323 'playlist_mincount': 200,
3324 }, {
3325 'note': 'Playlist with unavailable videos in page 7',
3326 'url': 'https://www.youtube.com/playlist?list=UU8l9frL61Yl5KFOl87nIm2w',
3327 'info_dict': {
3328 'title': 'Uploads from BlankTV',
3329 'id': 'UU8l9frL61Yl5KFOl87nIm2w',
3330 'uploader': 'BlankTV',
3331 'uploader_id': 'UC8l9frL61Yl5KFOl87nIm2w',
3332 },
3333 'playlist_mincount': 1000,
3334 }, {
3335 'note': 'https://github.com/ytdl-org/youtube-dl/issues/21844',
3336 'url': 'https://www.youtube.com/playlist?list=PLzH6n4zXuckpfMu_4Ff8E7Z1behQks5ba',
3337 'info_dict': {
3338 'title': 'Data Analysis with Dr Mike Pound',
3339 'id': 'PLzH6n4zXuckpfMu_4Ff8E7Z1behQks5ba',
3340 'uploader_id': 'UC9-y-6csu5WGm29I7JiwpnA',
3341 'uploader': 'Computerphile',
3342 'description': 'md5:7f567c574d13d3f8c0954d9ffee4e487',
3343 },
3344 'playlist_mincount': 11,
3345 }, {
3346 'url': 'https://invidio.us/playlist?list=PL4lCao7KL_QFVb7Iudeipvc2BCavECqzc',
3347 'only_matching': True,
3348 }, {
3349 'note': 'Playlist URL that does not actually serve a playlist',
3350 'url': 'https://www.youtube.com/watch?v=FqZTN594JQw&list=PLMYEtVRpaqY00V9W81Cwmzp6N6vZqfUKD4',
3351 'info_dict': {
3352 'id': 'FqZTN594JQw',
3353 'ext': 'webm',
3354 'title': "Smiley's People 01 detective, Adventure Series, Action",
3355 'uploader': 'STREEM',
3356 'uploader_id': 'UCyPhqAZgwYWZfxElWVbVJng',
3357 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UCyPhqAZgwYWZfxElWVbVJng',
3358 'upload_date': '20150526',
3359 'license': 'Standard YouTube License',
3360 'description': 'md5:507cdcb5a49ac0da37a920ece610be80',
3361 'categories': ['People & Blogs'],
3362 'tags': list,
3363 'view_count': int,
3364 'like_count': int,
3365 'dislike_count': int,
3366 },
3367 'params': {
3368 'skip_download': True,
3369 },
3370 'skip': 'This video is not available.',
3371 'add_ie': [YoutubeIE.ie_key()],
3372 }, {
3373 'url': 'https://www.youtubekids.com/watch?v=Agk7R8I8o5U&list=PUZ6jURNr1WQZCNHF0ao-c0g',
3374 'only_matching': True,
3375 }, {
3376 'url': 'https://www.youtube.com/watch?v=MuAGGZNfUkU&list=RDMM',
3377 'only_matching': True,
3378 }, {
3379 'url': 'https://www.youtube.com/channel/UCoMdktPbSTixAyNGwb-UYkQ/live',
3380 'info_dict': {
3381 'id': 'FMtPN8yp5LU', # This will keep changing
3382 'ext': 'mp4',
3383 'title': compat_str,
3384 'uploader': 'Sky News',
3385 'uploader_id': 'skynews',
3386 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/skynews',
3387 'upload_date': r're:\d{8}',
3388 'description': compat_str,
3389 'categories': ['News & Politics'],
3390 'tags': list,
3391 'like_count': int,
3392 'dislike_count': int,
3393 },
3394 'params': {
3395 'skip_download': True,
3396 },
3397 'expected_warnings': ['Downloading just video ', 'Ignoring subtitle tracks found in '],
3398 }, {
3399 'url': 'https://www.youtube.com/user/TheYoungTurks/live',
3400 'info_dict': {
3401 'id': 'a48o2S1cPoo',
3402 'ext': 'mp4',
3403 'title': 'The Young Turks - Live Main Show',
3404 'uploader': 'The Young Turks',
3405 'uploader_id': 'TheYoungTurks',
3406 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/TheYoungTurks',
3407 'upload_date': '20150715',
3408 'license': 'Standard YouTube License',
3409 'description': 'md5:438179573adcdff3c97ebb1ee632b891',
3410 'categories': ['News & Politics'],
3411 'tags': ['Cenk Uygur (TV Program Creator)', 'The Young Turks (Award-Winning Work)', 'Talk Show (TV Genre)'],
3412 'like_count': int,
3413 'dislike_count': int,
3414 },
3415 'params': {
3416 'skip_download': True,
3417 },
3418 'only_matching': True,
3419 }, {
3420 'url': 'https://www.youtube.com/channel/UC1yBKRuGpC1tSM73A0ZjYjQ/live',
3421 'only_matching': True,
3422 }, {
3423 'url': 'https://www.youtube.com/c/CommanderVideoHq/live',
3424 'only_matching': True,
3425 }, {
3426 'note': 'A channel that is not live. Should raise error',
3427 'url': 'https://www.youtube.com/user/numberphile/live',
3428 'only_matching': True,
3429 }, {
3430 'url': 'https://www.youtube.com/feed/trending',
3431 'only_matching': True,
3432 }, {
3433 'url': 'https://www.youtube.com/feed/library',
3434 'only_matching': True,
3435 }, {
3436 'url': 'https://www.youtube.com/feed/history',
3437 'only_matching': True,
3438 }, {
3439 'url': 'https://www.youtube.com/feed/subscriptions',
3440 'only_matching': True,
3441 }, {
3442 'url': 'https://www.youtube.com/feed/watch_later',
3443 'only_matching': True,
3444 }, {
3445 'note': 'Recommended - redirects to home page',
3446 'url': 'https://www.youtube.com/feed/recommended',
3447 'only_matching': True,
3448 }, {
3449 'note': 'inline playlist with not always working continuations',
3450 'url': 'https://www.youtube.com/watch?v=UC6u0Tct-Fo&list=PL36D642111D65BE7C',
3451 'only_matching': True,
3452 }, {
3453 'url': 'https://www.youtube.com/course?list=ECUl4u3cNGP61MdtwGTqZA0MreSaDybji8',
3454 'only_matching': True,
3455 }, {
3456 'url': 'https://www.youtube.com/course',
3457 'only_matching': True,
3458 }, {
3459 'url': 'https://www.youtube.com/zsecurity',
3460 'only_matching': True,
3461 }, {
3462 'url': 'http://www.youtube.com/NASAgovVideo/videos',
3463 'only_matching': True,
3464 }, {
3465 'url': 'https://www.youtube.com/TheYoungTurks/live',
3466 'only_matching': True,
3467 }, {
3468 'url': 'https://www.youtube.com/hashtag/cctv9',
3469 'info_dict': {
3470 'id': 'cctv9',
3471 'title': '#cctv9',
3472 },
3473 'playlist_mincount': 350,
3474 }, {
3475 'url': 'https://www.youtube.com/watch?list=PLW4dVinRY435CBE_JD3t-0SRXKfnZHS1P&feature=youtu.be&v=M9cJMXmQ_ZU',
3476 'only_matching': True,
3477 }, {
3478 'note': 'Requires Premium: should request additional YTM-info webpage (and have format 141) for videos in playlist',
3479 'url': 'https://music.youtube.com/playlist?list=PLRBp0Fe2GpgmgoscNFLxNyBVSFVdYmFkq',
3480 'only_matching': True
3481 }, {
3482 'note': '/browse/ should redirect to /channel/',
3483 'url': 'https://music.youtube.com/browse/UC1a8OFewdjuLq6KlF8M_8Ng',
3484 'only_matching': True
3485 }, {
3486 'note': 'VLPL, should redirect to playlist?list=PL...',
3487 'url': 'https://music.youtube.com/browse/VLPLRBp0Fe2GpgmgoscNFLxNyBVSFVdYmFkq',
3488 'info_dict': {
3489 'id': 'PLRBp0Fe2GpgmgoscNFLxNyBVSFVdYmFkq',
3490 'uploader': 'NoCopyrightSounds',
3491 'description': 'Providing you with copyright free / safe music for gaming, live streaming, studying and more!',
3492 'uploader_id': 'UC_aEa8K-EOJ3D6gOs7HcyNg',
3493 'title': 'NCS Releases',
3494 },
3495 'playlist_mincount': 166,
3496 }, {
3497 'note': 'Topic, should redirect to playlist?list=UU...',
3498 'url': 'https://music.youtube.com/browse/UC9ALqqC4aIeG5iDs7i90Bfw',
3499 'info_dict': {
3500 'id': 'UU9ALqqC4aIeG5iDs7i90Bfw',
3501 'uploader_id': 'UC9ALqqC4aIeG5iDs7i90Bfw',
3502 'title': 'Uploads from Royalty Free Music - Topic',
3503 'uploader': 'Royalty Free Music - Topic',
3504 },
3505 'expected_warnings': [
3506 'A channel/user page was given',
3507 'The URL does not have a videos tab',
3508 ],
3509 'playlist_mincount': 101,
3510 }, {
3511 'note': 'Topic without a UU playlist',
3512 'url': 'https://www.youtube.com/channel/UCtFRv9O2AHqOZjjynzrv-xg',
3513 'info_dict': {
3514 'id': 'UCtFRv9O2AHqOZjjynzrv-xg',
3515 'title': 'UCtFRv9O2AHqOZjjynzrv-xg',
3516 },
3517 'expected_warnings': [
3518 'A channel/user page was given',
3519 'The URL does not have a videos tab',
3520 'Falling back to channel URL',
3521 ],
3522 'playlist_mincount': 9,
3523 }, {
3524 'note': 'Youtube music Album',
3525 'url': 'https://music.youtube.com/browse/MPREb_gTAcphH99wE',
3526 'info_dict': {
3527 'id': 'OLAK5uy_l1m0thk3g31NmIIz_vMIbWtyv7eZixlH0',
3528 'title': 'Album - Royalty Free Music Library V2 (50 Songs)',
3529 },
3530 'playlist_count': 50,
3531 }, {
3532 'note': 'unlisted single video playlist',
3533 'url': 'https://www.youtube.com/playlist?list=PLwL24UFy54GrB3s2KMMfjZscDi1x5Dajf',
3534 'info_dict': {
3535 'uploader_id': 'UC9zHu_mHU96r19o-wV5Qs1Q',
3536 'uploader': 'colethedj',
3537 'id': 'PLwL24UFy54GrB3s2KMMfjZscDi1x5Dajf',
3538 'title': 'yt-dlp unlisted playlist test',
3539 'availability': 'unlisted'
3540 },
3541 'playlist_count': 1,
3542 }]
3543
3544 @classmethod
3545 def suitable(cls, url):
3546 return False if YoutubeIE.suitable(url) else super(
3547 YoutubeTabIE, cls).suitable(url)
3548
3549 def _extract_channel_id(self, webpage):
3550 channel_id = self._html_search_meta(
3551 'channelId', webpage, 'channel id', default=None)
3552 if channel_id:
3553 return channel_id
3554 channel_url = self._html_search_meta(
3555 ('og:url', 'al:ios:url', 'al:android:url', 'al:web:url',
3556 'twitter:url', 'twitter:app:url:iphone', 'twitter:app:url:ipad',
3557 'twitter:app:url:googleplay'), webpage, 'channel url')
3558 return self._search_regex(
3559 r'https?://(?:www\.)?youtube\.com/channel/([^/?#&])+',
3560 channel_url, 'channel id')
3561
3562 @staticmethod
3563 def _extract_basic_item_renderer(item):
3564 # Modified from _extract_grid_item_renderer
3565 known_basic_renderers = (
3566 'playlistRenderer', 'videoRenderer', 'channelRenderer', 'showRenderer'
3567 )
3568 for key, renderer in item.items():
3569 if not isinstance(renderer, dict):
3570 continue
3571 elif key in known_basic_renderers:
3572 return renderer
3573 elif key.startswith('grid') and key.endswith('Renderer'):
3574 return renderer
3575
3576 def _grid_entries(self, grid_renderer):
3577 for item in grid_renderer['items']:
3578 if not isinstance(item, dict):
3579 continue
3580 renderer = self._extract_basic_item_renderer(item)
3581 if not isinstance(renderer, dict):
3582 continue
3583 title = self._get_text(renderer.get('title'))
3584
3585 # playlist
3586 playlist_id = renderer.get('playlistId')
3587 if playlist_id:
3588 yield self.url_result(
3589 'https://www.youtube.com/playlist?list=%s' % playlist_id,
3590 ie=YoutubeTabIE.ie_key(), video_id=playlist_id,
3591 video_title=title)
3592 continue
3593 # video
3594 video_id = renderer.get('videoId')
3595 if video_id:
3596 yield self._extract_video(renderer)
3597 continue
3598 # channel
3599 channel_id = renderer.get('channelId')
3600 if channel_id:
3601 yield self.url_result(
3602 'https://www.youtube.com/channel/%s' % channel_id,
3603 ie=YoutubeTabIE.ie_key(), video_title=title)
3604 continue
3605 # generic endpoint URL support
3606 ep_url = urljoin('https://www.youtube.com/', try_get(
3607 renderer, lambda x: x['navigationEndpoint']['commandMetadata']['webCommandMetadata']['url'],
3608 compat_str))
3609 if ep_url:
3610 for ie in (YoutubeTabIE, YoutubePlaylistIE, YoutubeIE):
3611 if ie.suitable(ep_url):
3612 yield self.url_result(
3613 ep_url, ie=ie.ie_key(), video_id=ie._match_id(ep_url), video_title=title)
3614 break
3615
3616 def _shelf_entries_from_content(self, shelf_renderer):
3617 content = shelf_renderer.get('content')
3618 if not isinstance(content, dict):
3619 return
3620 renderer = content.get('gridRenderer') or content.get('expandedShelfContentsRenderer')
3621 if renderer:
3622 # TODO: add support for nested playlists so each shelf is processed
3623 # as separate playlist
3624 # TODO: this includes only first N items
3625 for entry in self._grid_entries(renderer):
3626 yield entry
3627 renderer = content.get('horizontalListRenderer')
3628 if renderer:
3629 # TODO
3630 pass
3631
3632 def _shelf_entries(self, shelf_renderer, skip_channels=False):
3633 ep = try_get(
3634 shelf_renderer, lambda x: x['endpoint']['commandMetadata']['webCommandMetadata']['url'],
3635 compat_str)
3636 shelf_url = urljoin('https://www.youtube.com', ep)
3637 if shelf_url:
3638 # Skipping links to another channels, note that checking for
3639 # endpoint.commandMetadata.webCommandMetadata.webPageTypwebPageType == WEB_PAGE_TYPE_CHANNEL
3640 # will not work
3641 if skip_channels and '/channels?' in shelf_url:
3642 return
3643 title = self._get_text(shelf_renderer, lambda x: x['title'])
3644 yield self.url_result(shelf_url, video_title=title)
3645 # Shelf may not contain shelf URL, fallback to extraction from content
3646 for entry in self._shelf_entries_from_content(shelf_renderer):
3647 yield entry
3648
3649 def _playlist_entries(self, video_list_renderer):
3650 for content in video_list_renderer['contents']:
3651 if not isinstance(content, dict):
3652 continue
3653 renderer = content.get('playlistVideoRenderer') or content.get('playlistPanelVideoRenderer')
3654 if not isinstance(renderer, dict):
3655 continue
3656 video_id = renderer.get('videoId')
3657 if not video_id:
3658 continue
3659 yield self._extract_video(renderer)
3660
3661 def _rich_entries(self, rich_grid_renderer):
3662 renderer = try_get(
3663 rich_grid_renderer, lambda x: x['content']['videoRenderer'], dict) or {}
3664 video_id = renderer.get('videoId')
3665 if not video_id:
3666 return
3667 yield self._extract_video(renderer)
3668
3669 def _video_entry(self, video_renderer):
3670 video_id = video_renderer.get('videoId')
3671 if video_id:
3672 return self._extract_video(video_renderer)
3673
3674 def _post_thread_entries(self, post_thread_renderer):
3675 post_renderer = try_get(
3676 post_thread_renderer, lambda x: x['post']['backstagePostRenderer'], dict)
3677 if not post_renderer:
3678 return
3679 # video attachment
3680 video_renderer = try_get(
3681 post_renderer, lambda x: x['backstageAttachment']['videoRenderer'], dict) or {}
3682 video_id = video_renderer.get('videoId')
3683 if video_id:
3684 entry = self._extract_video(video_renderer)
3685 if entry:
3686 yield entry
3687 # playlist attachment
3688 playlist_id = try_get(
3689 post_renderer, lambda x: x['backstageAttachment']['playlistRenderer']['playlistId'], compat_str)
3690 if playlist_id:
3691 yield self.url_result(
3692 'https://www.youtube.com/playlist?list=%s' % playlist_id,
3693 ie=YoutubeTabIE.ie_key(), video_id=playlist_id)
3694 # inline video links
3695 runs = try_get(post_renderer, lambda x: x['contentText']['runs'], list) or []
3696 for run in runs:
3697 if not isinstance(run, dict):
3698 continue
3699 ep_url = try_get(
3700 run, lambda x: x['navigationEndpoint']['urlEndpoint']['url'], compat_str)
3701 if not ep_url:
3702 continue
3703 if not YoutubeIE.suitable(ep_url):
3704 continue
3705 ep_video_id = YoutubeIE._match_id(ep_url)
3706 if video_id == ep_video_id:
3707 continue
3708 yield self.url_result(ep_url, ie=YoutubeIE.ie_key(), video_id=ep_video_id)
3709
3710 def _post_thread_continuation_entries(self, post_thread_continuation):
3711 contents = post_thread_continuation.get('contents')
3712 if not isinstance(contents, list):
3713 return
3714 for content in contents:
3715 renderer = content.get('backstagePostThreadRenderer')
3716 if not isinstance(renderer, dict):
3717 continue
3718 for entry in self._post_thread_entries(renderer):
3719 yield entry
3720
3721 r''' # unused
3722 def _rich_grid_entries(self, contents):
3723 for content in contents:
3724 video_renderer = try_get(content, lambda x: x['richItemRenderer']['content']['videoRenderer'], dict)
3725 if video_renderer:
3726 entry = self._video_entry(video_renderer)
3727 if entry:
3728 yield entry
3729 '''
    def _entries(self, tab, item_id, identity_token, account_syncid, ytcfg):
        """Yield all entries of the selected tab, following API continuations.

        First extracts entries from the initial tab renderer, then keeps
        requesting continuation pages until no further continuation token is
        found.
        """

        def extract_entries(parent_renderer):  # this needs to be called again for continuation to work with feeds
            contents = try_get(parent_renderer, lambda x: x['contents'], list) or []
            for content in contents:
                if not isinstance(content, dict):
                    continue
                is_renderer = try_get(content, lambda x: x['itemSectionRenderer'], dict)
                if not is_renderer:
                    # hashtag/home-feed style grids use richItemRenderer directly
                    renderer = content.get('richItemRenderer')
                    if renderer:
                        for entry in self._rich_entries(renderer):
                            yield entry
                        continuation_list[0] = self._extract_continuation(parent_renderer)
                    continue
                isr_contents = try_get(is_renderer, lambda x: x['contents'], list) or []
                for isr_content in isr_contents:
                    if not isinstance(isr_content, dict):
                        continue

                    # maps renderer key -> generator producing its entries
                    known_renderers = {
                        'playlistVideoListRenderer': self._playlist_entries,
                        'gridRenderer': self._grid_entries,
                        'shelfRenderer': lambda x: self._shelf_entries(x, tab.get('title') != 'Channels'),
                        'backstagePostThreadRenderer': self._post_thread_entries,
                        'videoRenderer': lambda x: [self._video_entry(x)],
                    }
                    for key, renderer in isr_content.items():
                        if key not in known_renderers:
                            continue
                        for entry in known_renderers[key](renderer):
                            if entry:
                                yield entry
                        continuation_list[0] = self._extract_continuation(renderer)
                        break

                if not continuation_list[0]:
                    continuation_list[0] = self._extract_continuation(is_renderer)

            if not continuation_list[0]:
                continuation_list[0] = self._extract_continuation(parent_renderer)

        # single-element list so extract_entries can rebind the continuation
        continuation_list = [None]  # Python 2 does not support nonlocal
        tab_content = try_get(tab, lambda x: x['content'], dict)
        if not tab_content:
            return
        parent_renderer = (
            try_get(tab_content, lambda x: x['sectionListRenderer'], dict)
            or try_get(tab_content, lambda x: x['richGridRenderer'], dict) or {})
        for entry in extract_entries(parent_renderer):
            yield entry
        continuation = continuation_list[0]
        # visitor data from the previous response is echoed back in headers
        visitor_data = None

        for page_num in itertools.count(1):
            if not continuation:
                break
            headers = self.generate_api_headers(ytcfg, identity_token, account_syncid, visitor_data)
            response = self._extract_response(
                item_id='%s page %s' % (item_id, page_num),
                query=continuation, headers=headers, ytcfg=ytcfg,
                check_get_keys=('continuationContents', 'onResponseReceivedActions', 'onResponseReceivedEndpoints'))

            if not response:
                break
            visitor_data = try_get(
                response, lambda x: x['responseContext']['visitorData'], compat_str) or visitor_data

            # old-style continuation payload: continuationContents
            known_continuation_renderers = {
                'playlistVideoListContinuation': self._playlist_entries,
                'gridContinuation': self._grid_entries,
                'itemSectionContinuation': self._post_thread_continuation_entries,
                'sectionListContinuation': extract_entries,  # for feeds
            }
            continuation_contents = try_get(
                response, lambda x: x['continuationContents'], dict) or {}
            continuation_renderer = None
            for key, value in continuation_contents.items():
                if key not in known_continuation_renderers:
                    continue
                continuation_renderer = value
                continuation_list = [None]
                for entry in known_continuation_renderers[key](continuation_renderer):
                    yield entry
                continuation = continuation_list[0] or self._extract_continuation(continuation_renderer)
                break
            if continuation_renderer:
                continue

            # new-style continuation payload: onResponseReceived* with
            # appendContinuationItemsAction; the first item's key decides how
            # to wrap the item list so the matching extractor can consume it
            known_renderers = {
                'gridPlaylistRenderer': (self._grid_entries, 'items'),
                'gridVideoRenderer': (self._grid_entries, 'items'),
                'gridChannelRenderer': (self._grid_entries, 'items'),
                'playlistVideoRenderer': (self._playlist_entries, 'contents'),
                'itemSectionRenderer': (extract_entries, 'contents'),  # for feeds
                'richItemRenderer': (extract_entries, 'contents'),  # for hashtag
                'backstagePostThreadRenderer': (self._post_thread_continuation_entries, 'contents')
            }
            on_response_received = dict_get(response, ('onResponseReceivedActions', 'onResponseReceivedEndpoints'))
            continuation_items = try_get(
                on_response_received, lambda x: x[0]['appendContinuationItemsAction']['continuationItems'], list)
            continuation_item = try_get(continuation_items, lambda x: x[0], dict) or {}
            video_items_renderer = None
            for key, value in continuation_item.items():
                if key not in known_renderers:
                    continue
                video_items_renderer = {known_renderers[key][1]: continuation_items}
                continuation_list = [None]
                for entry in known_renderers[key][0](video_items_renderer):
                    yield entry
                continuation = continuation_list[0] or self._extract_continuation(video_items_renderer)
                break
            if video_items_renderer:
                continue
            break
3845
3846 @staticmethod
3847 def _extract_selected_tab(tabs):
3848 for tab in tabs:
3849 renderer = dict_get(tab, ('tabRenderer', 'expandableTabRenderer')) or {}
3850 if renderer.get('selected') is True:
3851 return renderer
3852 else:
3853 raise ExtractorError('Unable to find selected tab')
3854
3855 @classmethod
3856 def _extract_uploader(cls, data):
3857 uploader = {}
3858 renderer = cls._extract_sidebar_info_renderer(data, 'playlistSidebarSecondaryInfoRenderer') or {}
3859 owner = try_get(
3860 renderer, lambda x: x['videoOwner']['videoOwnerRenderer']['title']['runs'][0], dict)
3861 if owner:
3862 uploader['uploader'] = owner.get('text')
3863 uploader['uploader_id'] = try_get(
3864 owner, lambda x: x['navigationEndpoint']['browseEndpoint']['browseId'], compat_str)
3865 uploader['uploader_url'] = urljoin(
3866 'https://www.youtube.com/',
3867 try_get(owner, lambda x: x['navigationEndpoint']['browseEndpoint']['canonicalBaseUrl'], compat_str))
3868 return {k: v for k, v in uploader.items() if v is not None}
3869
    def _extract_from_tabs(self, item_id, webpage, data, tabs):
        """Build a playlist result for a tabbed page (channel/playlist/hashtag).

        Metadata comes from channelMetadataRenderer or, failing that,
        playlistMetadataRenderer; entries come from the selected tab via
        _entries.
        """
        playlist_id = title = description = channel_url = channel_name = channel_id = None
        # NOTE: both names alias one list here, but each is rebound (never
        # mutated) below, so the sharing is harmless
        thumbnails_list = tags = []

        selected_tab = self._extract_selected_tab(tabs)
        renderer = try_get(
            data, lambda x: x['metadata']['channelMetadataRenderer'], dict)
        if renderer:
            channel_name = renderer.get('title')
            channel_url = renderer.get('channelUrl')
            channel_id = renderer.get('externalId')
        else:
            # not a channel page; fall back to playlist metadata
            renderer = try_get(
                data, lambda x: x['metadata']['playlistMetadataRenderer'], dict)

        if renderer:
            title = renderer.get('title')
            description = renderer.get('description', '')
            # for channels the playlist id mirrors the channel id; stays None
            # for plain playlists and is filled from item_id below
            playlist_id = channel_id
            tags = renderer.get('keywords', '').split()
            # channel avatar, or the playlist thumbnail from the sidebar
            thumbnails_list = (
                try_get(renderer, lambda x: x['avatar']['thumbnails'], list)
                or try_get(
                    self._extract_sidebar_info_renderer(data, 'playlistSidebarPrimaryInfoRenderer'),
                    lambda x: x['thumbnailRenderer']['playlistVideoThumbnailRenderer']['thumbnail']['thumbnails'],
                    list)
                or [])

        thumbnails = []
        for t in thumbnails_list:
            if not isinstance(t, dict):
                continue
            thumbnail_url = url_or_none(t.get('url'))
            if not thumbnail_url:
                continue
            thumbnails.append({
                'url': thumbnail_url,
                'width': int_or_none(t.get('width')),
                'height': int_or_none(t.get('height')),
            })
        if playlist_id is None:
            playlist_id = item_id
        if title is None:
            # hashtag pages carry the title in the header, not in metadata
            title = (
                try_get(data, lambda x: x['header']['hashtagHeaderRenderer']['hashtag']['simpleText'])
                or playlist_id)
        # append the tab name so different tabs of one channel get distinct titles
        title += format_field(selected_tab, 'title', ' - %s')
        title += format_field(selected_tab, 'expandedText', ' - %s')
        metadata = {
            'playlist_id': playlist_id,
            'playlist_title': title,
            'playlist_description': description,
            'uploader': channel_name,
            'uploader_id': channel_id,
            'uploader_url': channel_url,
            'thumbnails': thumbnails,
            'tags': tags,
        }
        availability = self._extract_availability(data)
        if availability:
            metadata['availability'] = availability
        if not channel_id:
            # playlist pages: take owner info from the sidebar instead
            metadata.update(self._extract_uploader(data))
        metadata.update({
            'channel': metadata['uploader'],
            'channel_id': metadata['uploader_id'],
            'channel_url': metadata['uploader_url']})
        ytcfg = self.extract_ytcfg(item_id, webpage)
        return self.playlist_result(
            self._entries(
                selected_tab, playlist_id,
                self._extract_identity_token(webpage, item_id),
                self._extract_account_syncid(ytcfg, data), ytcfg),
            **metadata)
3944
    def _extract_mix_playlist(self, playlist, playlist_id, data, webpage):
        """Yield videos of an "infinite" Mix playlist, page by page.

        Each "next" API call returns a window of the mix; iteration stops when
        the window stops advancing or loops back to the first video seen.
        """
        first_id = last_id = None
        ytcfg = self.extract_ytcfg(playlist_id, webpage)
        headers = self.generate_api_headers(
            ytcfg, account_syncid=self._extract_account_syncid(ytcfg, data),
            identity_token=self._extract_identity_token(webpage, item_id=playlist_id))
        for page_num in itertools.count(1):
            videos = list(self._playlist_entries(playlist))
            if not videos:
                return
            # resume right after the last video yielded from the previous window
            start = next((i for i, v in enumerate(videos) if v['id'] == last_id), -1) + 1
            if start >= len(videos):
                return
            for video in videos[start:]:
                if video['id'] == first_id:
                    self.to_screen('First video %s found again; Assuming end of Mix' % first_id)
                    return
                yield video
            first_id = first_id or videos[0]['id']
            last_id = videos[-1]['id']
            # NOTE(review): watch_endpoint is assumed to always be present for
            # mixes; a missing endpoint would raise on .get below — confirm
            watch_endpoint = try_get(
                playlist, lambda x: x['contents'][-1]['playlistPanelVideoRenderer']['navigationEndpoint']['watchEndpoint'])
            # 'OAE%3D' is the default params value used when none is provided
            query = {
                'playlistId': playlist_id,
                'videoId': watch_endpoint.get('videoId') or last_id,
                'index': watch_endpoint.get('index') or len(videos),
                'params': watch_endpoint.get('params') or 'OAE%3D'
            }
            response = self._extract_response(
                item_id='%s page %d' % (playlist_id, page_num),
                query=query, ep='next', headers=headers, ytcfg=ytcfg,
                check_get_keys='contents'
            )
            playlist = try_get(
                response, lambda x: x['contents']['twoColumnWatchNextResults']['playlist']['playlist'], dict)
3980
3981 def _extract_from_playlist(self, item_id, url, data, playlist, webpage):
3982 title = playlist.get('title') or try_get(
3983 data, lambda x: x['titleText']['simpleText'], compat_str)
3984 playlist_id = playlist.get('playlistId') or item_id
3985
3986 # Delegating everything except mix playlists to regular tab-based playlist URL
3987 playlist_url = urljoin(url, try_get(
3988 playlist, lambda x: x['endpoint']['commandMetadata']['webCommandMetadata']['url'],
3989 compat_str))
3990 if playlist_url and playlist_url != url:
3991 return self.url_result(
3992 playlist_url, ie=YoutubeTabIE.ie_key(), video_id=playlist_id,
3993 video_title=title)
3994
3995 return self.playlist_result(
3996 self._extract_mix_playlist(playlist, playlist_id, data, webpage),
3997 playlist_id=playlist_id, playlist_title=title)
3998
3999 def _extract_availability(self, data):
4000 """
4001 Gets the availability of a given playlist/tab.
4002 Note: Unless YouTube tells us explicitly, we do not assume it is public
4003 @param data: response
4004 """
4005 is_private = is_unlisted = None
4006 renderer = self._extract_sidebar_info_renderer(data, 'playlistSidebarPrimaryInfoRenderer') or {}
4007 badge_labels = self._extract_badges(renderer)
4008
4009 # Personal playlists, when authenticated, have a dropdown visibility selector instead of a badge
4010 privacy_dropdown_entries = try_get(
4011 renderer, lambda x: x['privacyForm']['dropdownFormFieldRenderer']['dropdown']['dropdownRenderer']['entries'], list) or []
4012 for renderer_dict in privacy_dropdown_entries:
4013 is_selected = try_get(
4014 renderer_dict, lambda x: x['privacyDropdownItemRenderer']['isSelected'], bool) or False
4015 if not is_selected:
4016 continue
4017 label = self._get_text(
4018 try_get(renderer_dict, lambda x: x['privacyDropdownItemRenderer']['label'], dict) or [])
4019 if label:
4020 badge_labels.add(label.lower())
4021 break
4022
4023 for badge_label in badge_labels:
4024 if badge_label == 'unlisted':
4025 is_unlisted = True
4026 elif badge_label == 'private':
4027 is_private = True
4028 elif badge_label == 'public':
4029 is_unlisted = is_private = False
4030 return self._availability(is_private, False, False, False, is_unlisted)
4031
4032 @staticmethod
4033 def _extract_sidebar_info_renderer(data, info_renderer, expected_type=dict):
4034 sidebar_renderer = try_get(
4035 data, lambda x: x['sidebar']['playlistSidebarRenderer']['items'], list) or []
4036 for item in sidebar_renderer:
4037 renderer = try_get(item, lambda x: x[info_renderer], expected_type)
4038 if renderer:
4039 return renderer
4040
    def _reload_with_unavailable_videos(self, item_id, data, webpage):
        """
        Get playlist with unavailable videos if the 'show unavailable videos' button exists.
        """
        browse_id = params = None
        renderer = self._extract_sidebar_info_renderer(data, 'playlistSidebarPrimaryInfoRenderer')
        if not renderer:
            return
        menu_renderer = try_get(
            renderer, lambda x: x['menu']['menuRenderer']['items'], list) or []
        for menu_item in menu_renderer:
            if not isinstance(menu_item, dict):
                continue
            nav_item_renderer = menu_item.get('menuNavigationItemRenderer')
            text = try_get(
                nav_item_renderer, lambda x: x['text']['simpleText'], compat_str)
            # match the menu entry by its (English) label
            if not text or text.lower() != 'show unavailable videos':
                continue
            browse_endpoint = try_get(
                nav_item_renderer, lambda x: x['navigationEndpoint']['browseEndpoint'], dict) or {}
            browse_id = browse_endpoint.get('browseId')
            params = browse_endpoint.get('params')
            break

        ytcfg = self.extract_ytcfg(item_id, webpage)
        headers = self.generate_api_headers(
            ytcfg, account_syncid=self._extract_account_syncid(ytcfg, data),
            identity_token=self._extract_identity_token(webpage, item_id=item_id),
            visitor_data=try_get(
                self._extract_context(ytcfg), lambda x: x['client']['visitorData'], compat_str))
        # fall back to default params / 'VL' + playlist-id browse id when the
        # button did not provide them (the request is attempted regardless)
        query = {
            'params': params or 'wgYCCAA=',
            'browseId': browse_id or 'VL%s' % item_id
        }
        return self._extract_response(
            item_id=item_id, headers=headers, query=query,
            check_get_keys='contents', fatal=False, ytcfg=ytcfg,
            note='Downloading API JSON with unavailable videos')
4079
4080 def _extract_webpage(self, url, item_id):
4081 retries = self.get_param('extractor_retries', 3)
4082 count = -1
4083 last_error = 'Incomplete yt initial data recieved'
4084 while count < retries:
4085 count += 1
4086 # Sometimes youtube returns a webpage with incomplete ytInitialData
4087 # See: https://github.com/yt-dlp/yt-dlp/issues/116
4088 if count:
4089 self.report_warning('%s. Retrying ...' % last_error)
4090 webpage = self._download_webpage(
4091 url, item_id,
4092 'Downloading webpage%s' % (' (retry #%d)' % count if count else ''))
4093 data = self.extract_yt_initial_data(item_id, webpage)
4094 if data.get('contents') or data.get('currentVideoEndpoint'):
4095 break
4096 # Extract alerts here only when there is error
4097 self._extract_and_report_alerts(data)
4098 if count >= retries:
4099 raise ExtractorError(last_error)
4100 return webpage, data
4101
4102 @staticmethod
4103 def _smuggle_data(entries, data):
4104 for entry in entries:
4105 if data:
4106 entry['url'] = smuggle_url(entry['url'], data)
4107 yield entry
4108
4109 def _real_extract(self, url):
4110 url, smuggled_data = unsmuggle_url(url, {})
4111 if self.is_music_url(url):
4112 smuggled_data['is_music_url'] = True
4113 info_dict = self.__real_extract(url, smuggled_data)
4114 if info_dict.get('entries'):
4115 info_dict['entries'] = self._smuggle_data(info_dict['entries'], smuggled_data)
4116 return info_dict
4117
    # Pre-compiled pattern splitting a URL into the part before the tab
    # segment, the (optional) tab itself, and everything after it; reuses the
    # named groups of _VALID_URL
    _url_re = re.compile(r'(?P<pre>%s)(?(channel_type)(?P<tab>/\w+))?(?P<post>.*)$' % _VALID_URL)
4119
    def __real_extract(self, url, smuggled_data):
        """Resolve a channel/tab/playlist/watch URL to an info dict.

        Normalizes the many YouTube URL shapes (music-domain channels,
        channel home pages, watch URLs carrying a list parameter) and then
        dispatches to tab, playlist or single-video extraction.
        Raises ExtractorError when the page cannot be recognized.
        """
        item_id = self._match_id(url)
        # Force the canonical host; other subdomains can return different data
        url = compat_urlparse.urlunparse(
            compat_urlparse.urlparse(url)._replace(netloc='www.youtube.com'))
        compat_opts = self.get_param('compat_opts', [])

        def get_mobj(url):
            # groupdict of _url_re with None values normalized to ''
            mobj = self._url_re.match(url).groupdict()
            mobj.update((k, '') for k, v in mobj.items() if v is None)
            return mobj

        mobj = get_mobj(url)
        # Youtube returns incomplete data if tabname is not lower case
        pre, tab, post, is_channel = mobj['pre'], mobj['tab'].lower(), mobj['post'], not mobj['not_channel']

        if is_channel:
            if smuggled_data.get('is_music_url'):
                if item_id[:2] == 'VL':
                    # Youtube music VL channels have an equivalent playlist
                    item_id = item_id[2:]
                    pre, tab, post, is_channel = 'https://www.youtube.com/playlist?list=%s' % item_id, '', '', False
                elif item_id[:2] == 'MP':
                    # Youtube music albums (/channel/MP...) have a OLAK playlist that can be extracted from the webpage
                    item_id = self._search_regex(
                        r'\\x22audioPlaylistId\\x22:\\x22([0-9A-Za-z_-]+)\\x22',
                        self._download_webpage('https://music.youtube.com/channel/%s' % item_id, item_id),
                        'playlist id')
                    pre, tab, post, is_channel = 'https://www.youtube.com/playlist?list=%s' % item_id, '', '', False
                elif mobj['channel_type'] == 'browse':
                    # Youtube music /browse/ should be changed to /channel/
                    pre = 'https://www.youtube.com/channel/%s' % item_id
        if is_channel and not tab and 'no-youtube-channel-redirect' not in compat_opts:
            # Home URLs should redirect to /videos/
            self.report_warning(
                'A channel/user page was given. All the channel\'s videos will be downloaded. '
                'To download only the videos in the home page, add a "/featured" to the URL')
            tab = '/videos'

        # Re-assemble and re-parse: the pieces above may have been rewritten
        url = ''.join((pre, tab, post))
        mobj = get_mobj(url)

        # Handle both video/playlist URLs
        qs = parse_qs(url)
        video_id = qs.get('v', [None])[0]
        playlist_id = qs.get('list', [None])[0]

        if not video_id and mobj['not_channel'].startswith('watch'):
            if not playlist_id:
                # If there is neither video or playlist ids, youtube redirects to home page, which is undesirable
                raise ExtractorError('Unable to recognize tab page')
            # Common mistake: https://www.youtube.com/watch?list=playlist_id
            self.report_warning('A video URL was given without video ID. Trying to download playlist %s' % playlist_id)
            url = 'https://www.youtube.com/playlist?list=%s' % playlist_id
            mobj = get_mobj(url)

        if video_id and playlist_id:
            if self.get_param('noplaylist'):
                self.to_screen('Downloading just video %s because of --no-playlist' % video_id)
                return self.url_result(video_id, ie=YoutubeIE.ie_key(), video_id=video_id)
            self.to_screen('Downloading playlist %s; add --no-playlist to just download video %s' % (playlist_id, video_id))

        webpage, data = self._extract_webpage(url, item_id)

        tabs = try_get(
            data, lambda x: x['contents']['twoColumnBrowseResultsRenderer']['tabs'], list)
        if tabs:
            selected_tab = self._extract_selected_tab(tabs)
            tab_name = selected_tab.get('title', '')
            if 'no-youtube-channel-redirect' not in compat_opts:
                if mobj['tab'] == '/live':
                    # Live tab should have redirected to the video
                    raise ExtractorError('The channel is not currently live', expected=True)
                if mobj['tab'] == '/videos' and tab_name.lower() != mobj['tab'][1:]:
                    if not mobj['not_channel'] and item_id[:2] == 'UC':
                        # Topic channels don't have /videos. Use the equivalent playlist instead
                        self.report_warning('The URL does not have a %s tab. Trying to redirect to playlist UU%s instead' % (mobj['tab'][1:], item_id[2:]))
                        pl_id = 'UU%s' % item_id[2:]
                        pl_url = 'https://www.youtube.com/playlist?list=%s%s' % (pl_id, mobj['post'])
                        try:
                            pl_webpage, pl_data = self._extract_webpage(pl_url, pl_id)
                            for alert_type, alert_message in self._extract_alerts(pl_data):
                                if alert_type == 'error':
                                    raise ExtractorError('Youtube said: %s' % alert_message)
                            item_id, url, webpage, data = pl_id, pl_url, pl_webpage, pl_data
                        except ExtractorError:
                            self.report_warning('The playlist gave error. Falling back to channel URL')
                    else:
                        self.report_warning('The URL does not have a %s tab. %s is being downloaded instead' % (mobj['tab'][1:], tab_name))

        self.write_debug('Final URL: %s' % url)

        # YouTube sometimes provides a button to reload playlist with unavailable videos.
        if 'no-youtube-unavailable-videos' not in compat_opts:
            data = self._reload_with_unavailable_videos(item_id, data, webpage) or data
        self._extract_and_report_alerts(data)
        tabs = try_get(
            data, lambda x: x['contents']['twoColumnBrowseResultsRenderer']['tabs'], list)
        if tabs:
            return self._extract_from_tabs(item_id, webpage, data, tabs)

        playlist = try_get(
            data, lambda x: x['contents']['twoColumnWatchNextResults']['playlist']['playlist'], dict)
        if playlist:
            return self._extract_from_playlist(item_id, url, data, playlist, webpage)

        # Last resort: a single video endpoint (or the v= parameter from above)
        video_id = try_get(
            data, lambda x: x['currentVideoEndpoint']['watchEndpoint']['videoId'],
            compat_str) or video_id
        if video_id:
            if mobj['tab'] != '/live':  # live tab is expected to redirect to video
                self.report_warning('Unable to recognize playlist. Downloading just video %s' % video_id)
            return self.url_result(video_id, ie=YoutubeIE.ie_key(), video_id=video_id)

        raise ExtractorError('Unable to recognize tab page')
4234
4235
class YoutubePlaylistIE(InfoExtractor):
    IE_DESC = 'YouTube.com playlists'
    _VALID_URL = r'''(?x)(?:
                        (?:https?://)?
                        (?:\w+\.)?
                        (?:
                            (?:
                                youtube(?:kids)?\.com|
                                invidio\.us
                            )
                            /.*?\?.*?\blist=
                        )?
                        (?P<id>%(playlist_id)s)
                     )''' % {'playlist_id': YoutubeBaseInfoExtractor._PLAYLIST_ID_RE}
    IE_NAME = 'youtube:playlist'
    _TESTS = [{
        'note': 'issue #673',
        'url': 'PLBB231211A4F62143',
        'info_dict': {
            'title': '[OLD]Team Fortress 2 (Class-based LP)',
            'id': 'PLBB231211A4F62143',
            'uploader': 'Wickydoo',
            'uploader_id': 'UCKSpbfbl5kRQpTdL7kMc-1Q',
            'description': 'md5:8fa6f52abb47a9552002fa3ddfc57fc2',
        },
        'playlist_mincount': 29,
    }, {
        'url': 'PLtPgu7CB4gbY9oDN3drwC3cMbJggS7dKl',
        'info_dict': {
            'title': 'YDL_safe_search',
            'id': 'PLtPgu7CB4gbY9oDN3drwC3cMbJggS7dKl',
        },
        'playlist_count': 2,
        'skip': 'This playlist is private',
    }, {
        'note': 'embedded',
        'url': 'https://www.youtube.com/embed/videoseries?list=PL6IaIsEjSbf96XFRuNccS_RuEXwNdsoEu',
        'playlist_count': 4,
        'info_dict': {
            'title': 'JODA15',
            'id': 'PL6IaIsEjSbf96XFRuNccS_RuEXwNdsoEu',
            'uploader': 'milan',
            'uploader_id': 'UCEI1-PVPcYXjB73Hfelbmaw',
        }
    }, {
        'url': 'http://www.youtube.com/embed/_xDOZElKyNU?list=PLsyOSbh5bs16vubvKePAQ1x3PhKavfBIl',
        'playlist_mincount': 654,
        'info_dict': {
            'title': '2018 Chinese New Singles (11/6 updated)',
            'id': 'PLsyOSbh5bs16vubvKePAQ1x3PhKavfBIl',
            'uploader': 'LBK',
            'uploader_id': 'UC21nz3_MesPLqtDqwdvnoxA',
            'description': 'md5:da521864744d60a198e3a88af4db0d9d',
        }
    }, {
        'url': 'TLGGrESM50VT6acwMjAyMjAxNw',
        'only_matching': True,
    }, {
        # music album playlist
        'url': 'OLAK5uy_m4xAFdmMC5rX3Ji3g93pQe3hqLZw_9LhM',
        'only_matching': True,
    }]

    @classmethod
    def suitable(cls, url):
        """Yield to YoutubeTabIE and to plain watch URLs; accept the rest."""
        if YoutubeTabIE.suitable(url):
            return False
        # Hack for lazy extractors until a more generic solution is implemented
        # (see #28780)
        from .youtube import parse_qs
        has_video_id = bool(parse_qs(url).get('v', [None])[0])
        return False if has_video_id else super(YoutubePlaylistIE, cls).suitable(url)

    def _real_extract(self, url):
        """Normalize to a canonical /playlist URL and hand off to YoutubeTabIE."""
        playlist_id = self._match_id(url)
        query = parse_qs(url) or {'list': playlist_id}
        canonical_url = update_url_query('https://www.youtube.com/playlist', query)
        if YoutubeBaseInfoExtractor.is_music_url(url):
            # Preserve the music-domain hint for the tab extractor
            canonical_url = smuggle_url(canonical_url, {'is_music_url': True})
        return self.url_result(canonical_url, ie=YoutubeTabIE.ie_key(), video_id=playlist_id)
4320
4321
class YoutubeYtBeIE(InfoExtractor):
    IE_DESC = 'youtu.be'
    _VALID_URL = r'https?://youtu\.be/(?P<id>[0-9A-Za-z_-]{11})/*?.*?\blist=(?P<playlist_id>%(playlist_id)s)' % {'playlist_id': YoutubeBaseInfoExtractor._PLAYLIST_ID_RE}
    _TESTS = [{
        'url': 'https://youtu.be/yeWKywCrFtk?list=PL2qgrgXsNUG5ig9cat4ohreBjYLAPC0J5',
        'info_dict': {
            'id': 'yeWKywCrFtk',
            'ext': 'mp4',
            'title': 'Small Scale Baler and Braiding Rugs',
            'uploader': 'Backus-Page House Museum',
            'uploader_id': 'backuspagemuseum',
            'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/backuspagemuseum',
            'upload_date': '20161008',
            'description': 'md5:800c0c78d5eb128500bffd4f0b4f2e8a',
            'categories': ['Nonprofits & Activism'],
            'tags': list,
            'like_count': int,
            'dislike_count': int,
        },
        'params': {
            'noplaylist': True,
            'skip_download': True,
        },
    }, {
        'url': 'https://youtu.be/uWyaPkt-VOI?list=PL9D9FC436B881BA21',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Rewrite a youtu.be share link carrying list= into a full watch URL."""
        match = re.match(self._VALID_URL, url)
        video_id, playlist_id = match.group('id'), match.group('playlist_id')
        watch_url = update_url_query('https://www.youtube.com/watch', {
            'v': video_id,
            'list': playlist_id,
            'feature': 'youtu.be',
        })
        return self.url_result(watch_url, ie=YoutubeTabIE.ie_key(), video_id=playlist_id)
4360
4361
class YoutubeYtUserIE(InfoExtractor):
    IE_DESC = 'YouTube.com user videos, URL or "ytuser" keyword'
    _VALID_URL = r'ytuser:(?P<id>.+)'
    _TESTS = [{
        'url': 'ytuser:phihag',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Map "ytuser:NAME" onto the corresponding /user/ channel page."""
        user_id = self._match_id(url)
        channel_url = 'https://www.youtube.com/user/%s' % user_id
        return self.url_result(channel_url, ie=YoutubeTabIE.ie_key(), video_id=user_id)
4375
4376
class YoutubeFavouritesIE(YoutubeBaseInfoExtractor):
    IE_NAME = 'youtube:favorites'
    IE_DESC = 'YouTube.com liked videos, ":ytfav" for short (requires authentication)'
    _VALID_URL = r':ytfav(?:ou?rite)?s?'
    _LOGIN_REQUIRED = True
    _TESTS = [{
        'url': ':ytfav',
        'only_matching': True,
    }, {
        'url': ':ytfavorites',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Redirect to the authenticated user's liked-videos playlist (LL)."""
        liked_videos_url = 'https://www.youtube.com/playlist?list=LL'
        return self.url_result(liked_videos_url, ie=YoutubeTabIE.ie_key())
4394
4395
class YoutubeSearchIE(SearchInfoExtractor, YoutubeTabIE):
    IE_DESC = 'YouTube.com searches, "ytsearch" keyword'
    # there doesn't appear to be a real limit, for example if you search for
    # 'python' you get more than 8.000.000 results
    _MAX_RESULTS = float('inf')
    IE_NAME = 'youtube:search'
    _SEARCH_KEY = 'ytsearch'
    # Extra search parameters sent in the API payload; subclasses may override
    _SEARCH_PARAMS = None
    _TESTS = []

    def _entries(self, query, n):
        """Yield up to *n* video results for *query*, paging through the
        search API via continuation tokens until exhausted."""
        data = {'query': query}
        if self._SEARCH_PARAMS:
            data['params'] = self._SEARCH_PARAMS
        total = 0
        continuation = {}
        for page_num in itertools.count(1):
            # Merge the previous page's continuation token into the payload
            data.update(continuation)
            search = self._extract_response(
                item_id='query "%s" page %s' % (query, page_num), ep='search', query=data,
                check_get_keys=('contents', 'onResponseReceivedCommands')
            )
            if not search:
                break
            # First page and continuation pages have different response shapes
            slr_contents = try_get(
                search,
                (lambda x: x['contents']['twoColumnSearchResultsRenderer']['primaryContents']['sectionListRenderer']['contents'],
                 lambda x: x['onResponseReceivedCommands'][0]['appendContinuationItemsAction']['continuationItems']),
                list)
            if not slr_contents:
                break

            # Youtube sometimes adds promoted content to searches,
            # changing the index location of videos and token.
            # So we search through all entries till we find them.
            continuation = None
            for slr_content in slr_contents:
                if not continuation:
                    continuation = self._extract_continuation({'contents': [slr_content]})

                isr_contents = try_get(
                    slr_content,
                    lambda x: x['itemSectionRenderer']['contents'],
                    list)
                if not isr_contents:
                    continue
                for content in isr_contents:
                    if not isinstance(content, dict):
                        continue
                    video = content.get('videoRenderer')
                    if not isinstance(video, dict):
                        continue
                    video_id = video.get('videoId')
                    if not video_id:
                        continue

                    yield self._extract_video(video)
                    total += 1
                    if total == n:
                        return

            # No continuation token found anywhere on the page: we are done
            if not continuation:
                break

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""
        return self.playlist_result(self._entries(query, n), query, query)
4463
4464
class YoutubeSearchDateIE(YoutubeSearchIE):
    """Same as ytsearch, but results are ordered newest-first."""
    IE_NAME = YoutubeSearchIE.IE_NAME + ':date'
    _SEARCH_KEY = 'ytsearchdate'
    IE_DESC = 'YouTube.com searches, newest videos first, "ytsearchdate" keyword'
    # URL-encoded API search filter selecting upload-date ordering (see IE_DESC)
    _SEARCH_PARAMS = 'CAI%3D'
4470
4471
class YoutubeSearchURLIE(YoutubeSearchIE):
    IE_DESC = 'YouTube.com search URLs'
    IE_NAME = YoutubeSearchIE.IE_NAME + '_url'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/results\?(.*?&)?(?:search_query|q)=(?:[^&]+)(?:[&]|$)'
    # _MAX_RESULTS = 100
    _TESTS = [{
        'url': 'https://www.youtube.com/results?baz=bar&search_query=youtube-dl+test+video&filters=video&lclk=video',
        'playlist_mincount': 5,
        'info_dict': {
            'id': 'youtube-dl test video',
            'title': 'youtube-dl test video',
        }
    }, {
        'url': 'https://www.youtube.com/results?q=test&sp=EgQIBBgB',
        'only_matching': True,
    }]

    @classmethod
    def _make_valid_url(cls):
        # Expose the URL pattern to the SearchInfoExtractor machinery
        return cls._VALID_URL

    def _real_extract(self, url):
        """Run the search encoded in a /results URL, honouring the sp= filter."""
        params = compat_parse_qs(compat_urllib_parse_urlparse(url).query)
        search_terms = (params.get('search_query') or params.get('q'))[0]
        self._SEARCH_PARAMS = params.get('sp', ('',))[0]
        return self._get_n_results(search_terms, self._MAX_RESULTS)
4498
4499
class YoutubeFeedsInfoExtractor(YoutubeTabIE):
    """
    Base class for feed extractors
    Subclasses must define the _FEED_NAME property.
    """
    _LOGIN_REQUIRED = True
    _TESTS = []

    @property
    def IE_NAME(self):
        # Derive the extractor name from the subclass' feed name
        return 'youtube:%s' % self._FEED_NAME

    def _real_extract(self, url):
        # Every feed is handled by redirecting to its /feed/ page
        feed_url = 'https://www.youtube.com/feed/%s' % self._FEED_NAME
        return self.url_result(feed_url, ie=YoutubeTabIE.ie_key())
4516
4517
class YoutubeWatchLaterIE(InfoExtractor):
    IE_NAME = 'youtube:watchlater'
    IE_DESC = 'Youtube watch later list, ":ytwatchlater" for short (requires authentication)'
    _VALID_URL = r':ytwatchlater'
    _TESTS = [{
        'url': ':ytwatchlater',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Redirect to the authenticated user's Watch Later playlist (WL)."""
        watch_later_url = 'https://www.youtube.com/playlist?list=WL'
        return self.url_result(watch_later_url, ie=YoutubeTabIE.ie_key())
4530
4531
class YoutubeRecommendedIE(YoutubeFeedsInfoExtractor):
    """:ytrec / the youtube.com home page — recommended-videos feed."""
    IE_DESC = 'YouTube.com recommended videos, ":ytrec" for short (requires authentication)'
    # Also matches a bare https://youtube.com/ home-page URL
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/?(?:[?#]|$)|:ytrec(?:ommended)?'
    _FEED_NAME = 'recommended'
    # Recommendations work anonymously too, so override the base requirement
    _LOGIN_REQUIRED = False
    _TESTS = [{
        'url': ':ytrec',
        'only_matching': True,
    }, {
        'url': ':ytrecommended',
        'only_matching': True,
    }, {
        'url': 'https://youtube.com',
        'only_matching': True,
    }]
4547
4548
class YoutubeSubscriptionsIE(YoutubeFeedsInfoExtractor):
    """:ytsubs — the logged-in user's subscriptions feed."""
    IE_DESC = 'YouTube.com subscriptions feed, ":ytsubs" for short (requires authentication)'
    _VALID_URL = r':ytsub(?:scription)?s?'
    _FEED_NAME = 'subscriptions'
    _TESTS = [{
        'url': ':ytsubs',
        'only_matching': True,
    }, {
        'url': ':ytsubscriptions',
        'only_matching': True,
    }]
4560
4561
class YoutubeHistoryIE(YoutubeFeedsInfoExtractor):
    """:ythis — the logged-in user's watch-history feed."""
    IE_DESC = 'Youtube watch history, ":ythis" for short (requires authentication)'
    _VALID_URL = r':ythis(?:tory)?'
    _FEED_NAME = 'history'
    _TESTS = [{
        'url': ':ythistory',
        'only_matching': True,
    }]
4570
4571
class YoutubeTruncatedURLIE(InfoExtractor):
    IE_NAME = 'youtube:truncated_url'
    IE_DESC = False  # Do not list
    _VALID_URL = r'''(?x)
        (?:https?://)?
        (?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie)?\.com/
        (?:watch\?(?:
            feature=[a-z_]+|
            annotation_id=annotation_[^&]+|
            x-yt-cl=[0-9]+|
            hl=[^&]*|
            t=[0-9]+
        )?
        |
            attribution_link\?a=[^&]+
        )
        $
    '''

    _TESTS = [{
        'url': 'https://www.youtube.com/watch?annotation_id=annotation_3951667041',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/watch?',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/watch?x-yt-cl=84503534',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/watch?feature=foo',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/watch?hl=en-GB',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/watch?t=2372',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Always fail: the matched URL is a watch URL missing its video ID,
        usually because an unquoted & cut the command line short."""
        message = (
            'Did you forget to quote the URL? Remember that & is a meta '
            'character in most shells, so you want to put the URL in quotes, '
            'like youtube-dl '
            '"https://www.youtube.com/watch?feature=foo&v=BaW_jenozKc" '
            ' or simply youtube-dl BaW_jenozKc .')
        raise ExtractorError(message, expected=True)
4619
4620
class YoutubeTruncatedIDIE(InfoExtractor):
    IE_NAME = 'youtube:truncated_id'
    IE_DESC = False  # Do not list
    # A watch URL whose v= parameter is shorter than the 11 characters of a
    # full YouTube video ID
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/watch\?v=(?P<id>[0-9A-Za-z_-]{1,10})$'

    _TESTS = [{
        'url': 'https://www.youtube.com/watch?v=N_708QY7Ob',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Always fail with a hint that the video ID was cut short."""
        video_id = self._match_id(url)
        raise ExtractorError(
            'Incomplete YouTube ID {0}. URL {1} looks truncated.'.format(video_id, url),
            expected=True)