]> jfr.im git - yt-dlp.git/blob - yt_dlp/extractor/youtube.py
[utils] Add `parse_qs`
[yt-dlp.git] / yt_dlp / extractor / youtube.py
1 # coding: utf-8
2
3 from __future__ import unicode_literals
4
5 import base64
6 import calendar
7 import copy
8 import datetime
9 import hashlib
10 import itertools
11 import json
12 import os.path
13 import random
14 import re
15 import time
16 import traceback
17
18 from .common import InfoExtractor, SearchInfoExtractor
19 from ..compat import (
20 compat_chr,
21 compat_HTTPError,
22 compat_parse_qs,
23 compat_str,
24 compat_urllib_parse_unquote_plus,
25 compat_urllib_parse_urlencode,
26 compat_urllib_parse_urlparse,
27 compat_urlparse,
28 )
29 from ..jsinterp import JSInterpreter
30 from ..utils import (
31 bytes_to_intlist,
32 clean_html,
33 datetime_from_str,
34 dict_get,
35 error_to_compat_str,
36 ExtractorError,
37 float_or_none,
38 format_field,
39 int_or_none,
40 intlist_to_bytes,
41 is_html,
42 mimetype2ext,
43 network_exceptions,
44 orderedSet,
45 parse_codecs,
46 parse_count,
47 parse_duration,
48 parse_iso8601,
49 parse_qs,
50 qualities,
51 remove_start,
52 smuggle_url,
53 str_or_none,
54 str_to_int,
55 traverse_obj,
56 try_get,
57 unescapeHTML,
58 unified_strdate,
59 unsmuggle_url,
60 update_url_query,
61 url_or_none,
62 urlencode_postdata,
63 urljoin,
64 variadic,
65 )
66
67
 68 # any clients starting with _ cannot be explicitly requested by the user
INNERTUBE_CLIENTS = {
    # Per-client configuration for YouTube's InnerTube API.
    # Entries missing INNERTUBE_API_KEY/INNERTUBE_HOST get defaults filled in
    # by build_innertube_clients() below.
    'web': {
        'INNERTUBE_API_KEY': 'AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8',
        'INNERTUBE_CONTEXT': {
            'client': {
                'clientName': 'WEB',
                'clientVersion': '2.20210622.10.00',
            }
        },
        'INNERTUBE_CONTEXT_CLIENT_NAME': 1
    },
    'web_embedded': {
        'INNERTUBE_API_KEY': 'AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8',
        'INNERTUBE_CONTEXT': {
            'client': {
                'clientName': 'WEB_EMBEDDED_PLAYER',
                'clientVersion': '1.20210620.0.1',
            },
        },
        'INNERTUBE_CONTEXT_CLIENT_NAME': 56
    },
    'web_music': {
        'INNERTUBE_API_KEY': 'AIzaSyC9XL3ZjWddXya6X74dJoCTL-WEYFDNX30',
        'INNERTUBE_HOST': 'music.youtube.com',
        'INNERTUBE_CONTEXT': {
            'client': {
                'clientName': 'WEB_REMIX',
                'clientVersion': '1.20210621.00.00',
            }
        },
        'INNERTUBE_CONTEXT_CLIENT_NAME': 67,
    },
    'web_creator': {
        'INNERTUBE_API_KEY': 'AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8',
        'INNERTUBE_CONTEXT': {
            'client': {
                'clientName': 'WEB_CREATOR',
                'clientVersion': '1.20210621.00.00',
            }
        },
        'INNERTUBE_CONTEXT_CLIENT_NAME': 62,
    },
    'android': {
        'INNERTUBE_API_KEY': 'AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8',
        'INNERTUBE_CONTEXT': {
            'client': {
                'clientName': 'ANDROID',
                'clientVersion': '16.20',
            }
        },
        'INNERTUBE_CONTEXT_CLIENT_NAME': 3,
    },
    'android_embedded': {
        'INNERTUBE_API_KEY': 'AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8',
        'INNERTUBE_CONTEXT': {
            'client': {
                'clientName': 'ANDROID_EMBEDDED_PLAYER',
                'clientVersion': '16.20',
            },
        },
        'INNERTUBE_CONTEXT_CLIENT_NAME': 55
    },
    'android_music': {
        'INNERTUBE_API_KEY': 'AIzaSyC9XL3ZjWddXya6X74dJoCTL-WEYFDNX30',
        'INNERTUBE_HOST': 'music.youtube.com',
        'INNERTUBE_CONTEXT': {
            'client': {
                'clientName': 'ANDROID_MUSIC',
                'clientVersion': '4.32',
            }
        },
        'INNERTUBE_CONTEXT_CLIENT_NAME': 21,
    },
    # no INNERTUBE_API_KEY here - the default is applied by build_innertube_clients()
    'android_creator': {
        'INNERTUBE_CONTEXT': {
            'client': {
                'clientName': 'ANDROID_CREATOR',
                'clientVersion': '21.24.100',
            },
        },
        'INNERTUBE_CONTEXT_CLIENT_NAME': 14
    },
    # ios has HLS live streams
    # See: https://github.com/TeamNewPipe/NewPipeExtractor/issues/680
    'ios': {
        'INNERTUBE_API_KEY': 'AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8',
        'INNERTUBE_CONTEXT': {
            'client': {
                'clientName': 'IOS',
                'clientVersion': '16.20',
            }
        },
        'INNERTUBE_CONTEXT_CLIENT_NAME': 5
    },
    'ios_embedded': {
        'INNERTUBE_API_KEY': 'AIzaSyDCU8hByM-4DrUqRUYnGn-3llEO78bcxq8',
        'INNERTUBE_CONTEXT': {
            'client': {
                'clientName': 'IOS_MESSAGES_EXTENSION',
                'clientVersion': '16.20',
            },
        },
        'INNERTUBE_CONTEXT_CLIENT_NAME': 66
    },
    'ios_music': {
        'INNERTUBE_API_KEY': 'AIzaSyDK3iBpDP9nHVTk2qL73FLJICfOC3c51Og',
        'INNERTUBE_HOST': 'music.youtube.com',
        'INNERTUBE_CONTEXT': {
            'client': {
                'clientName': 'IOS_MUSIC',
                'clientVersion': '4.32',
            },
        },
        'INNERTUBE_CONTEXT_CLIENT_NAME': 26
    },
    # no INNERTUBE_API_KEY here - the default is applied by build_innertube_clients()
    'ios_creator': {
        'INNERTUBE_CONTEXT': {
            'client': {
                'clientName': 'IOS_CREATOR',
                'clientVersion': '21.24.100',
            },
        },
        'INNERTUBE_CONTEXT_CLIENT_NAME': 15
    },
    # mweb has 'ultralow' formats
    # See: https://github.com/yt-dlp/yt-dlp/pull/557
    'mweb': {
        'INNERTUBE_API_KEY': 'AIzaSyDCU8hByM-4DrUqRUYnGn-3llEO78bcxq8',
        'INNERTUBE_CONTEXT': {
            'client': {
                'clientName': 'MWEB',
                'clientVersion': '2.20210721.07.00',
            }
        },
        'INNERTUBE_CONTEXT_CLIENT_NAME': 2
    },
}
206
207
def build_innertube_clients():
    """Fill in per-client defaults and derive the *_agegate client variants."""
    embed_context = {
        'embedUrl': 'https://google.com',  # Can be any valid URL
    }
    main_clients = ('android', 'web', 'ios', 'mweb')
    base_priority = qualities(main_clients[::-1])

    # Snapshot the items first: *_agegate entries are inserted while iterating
    for name, cfg in tuple(INNERTUBE_CLIENTS.items()):
        cfg.setdefault('INNERTUBE_API_KEY', 'AIzaSyDCU8hByM-4DrUqRUYnGn-3llEO78bcxq8')
        cfg.setdefault('INNERTUBE_HOST', 'www.youtube.com')
        cfg['INNERTUBE_CONTEXT']['client'].setdefault('hl', 'en')
        # Priority is keyed off the base client family (part before the first '_')
        cfg['priority'] = 10 * base_priority(name.split('_', 1)[0])

        if name in main_clients:
            # Clone the base client into an EMBED-screen variant for age-gated videos
            agegate = copy.deepcopy(cfg)
            agegate['INNERTUBE_CONTEXT']['client']['clientScreen'] = 'EMBED'
            agegate['INNERTUBE_CONTEXT']['thirdParty'] = embed_context
            agegate['priority'] -= 1
            INNERTUBE_CLIENTS[f'{name}_agegate'] = agegate
        elif name.endswith('_embedded'):
            cfg['INNERTUBE_CONTEXT']['thirdParty'] = embed_context
            cfg['priority'] -= 2
        else:
            cfg['priority'] -= 3


build_innertube_clients()
234
235
236 class YoutubeBaseInfoExtractor(InfoExtractor):
237 """Provide base functions for Youtube extractors"""
238
239 _RESERVED_NAMES = (
240 r'channel|c|user|playlist|watch|w|v|embed|e|watch_popup|'
241 r'shorts|movies|results|shared|hashtag|trending|feed|feeds|'
242 r'browse|oembed|get_video_info|iframe_api|s/player|'
243 r'storefront|oops|index|account|reporthistory|t/terms|about|upload|signin|logout')
244
245 _PLAYLIST_ID_RE = r'(?:(?:PL|LL|EC|UU|FL|RD|UL|TL|PU|OLAK5uy_)[0-9A-Za-z-_]{10,}|RDMM|WL|LL|LM)'
246
247 _NETRC_MACHINE = 'youtube'
248
249 # If True it will raise an error if no login info is provided
250 _LOGIN_REQUIRED = False
251
252 r''' # Unused since login is broken
253 _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
254 _TWOFACTOR_URL = 'https://accounts.google.com/signin/challenge'
255
256 _LOOKUP_URL = 'https://accounts.google.com/_/signin/sl/lookup'
257 _CHALLENGE_URL = 'https://accounts.google.com/_/signin/sl/challenge'
258 _TFA_URL = 'https://accounts.google.com/_/signin/challenge?hl=en&TL={0}'
259 '''
260
261 def _login(self):
262 """
263 Attempt to log in to YouTube.
264 True is returned if successful or skipped.
265 False is returned if login failed.
266
267 If _LOGIN_REQUIRED is set and no authentication was provided, an error is raised.
268 """
269
270 def warn(message):
271 self.report_warning(message)
272
273 # username+password login is broken
274 if (self._LOGIN_REQUIRED
275 and self.get_param('cookiefile') is None
276 and self.get_param('cookiesfrombrowser') is None):
277 self.raise_login_required(
278 'Login details are needed to download this content', method='cookies')
279 username, password = self._get_login_info()
280 if username:
281 warn('Logging in using username and password is broken. %s' % self._LOGIN_HINTS['cookies'])
282 return
283
284 # Everything below this is broken!
285 r'''
286 # No authentication to be performed
287 if username is None:
288 if self._LOGIN_REQUIRED and self.get_param('cookiefile') is None:
289 raise ExtractorError('No login info available, needed for using %s.' % self.IE_NAME, expected=True)
290 # if self.get_param('cookiefile'): # TODO remove 'and False' later - too many people using outdated cookies and open issues, remind them.
291 # self.to_screen('[Cookies] Reminder - Make sure to always use up to date cookies!')
292 return True
293
294 login_page = self._download_webpage(
295 self._LOGIN_URL, None,
296 note='Downloading login page',
297 errnote='unable to fetch login page', fatal=False)
298 if login_page is False:
299 return
300
301 login_form = self._hidden_inputs(login_page)
302
303 def req(url, f_req, note, errnote):
304 data = login_form.copy()
305 data.update({
306 'pstMsg': 1,
307 'checkConnection': 'youtube',
308 'checkedDomains': 'youtube',
309 'hl': 'en',
310 'deviceinfo': '[null,null,null,[],null,"US",null,null,[],"GlifWebSignIn",null,[null,null,[]]]',
311 'f.req': json.dumps(f_req),
312 'flowName': 'GlifWebSignIn',
313 'flowEntry': 'ServiceLogin',
314 # TODO: reverse actual botguard identifier generation algo
315 'bgRequest': '["identifier",""]',
316 })
317 return self._download_json(
318 url, None, note=note, errnote=errnote,
319 transform_source=lambda s: re.sub(r'^[^[]*', '', s),
320 fatal=False,
321 data=urlencode_postdata(data), headers={
322 'Content-Type': 'application/x-www-form-urlencoded;charset=utf-8',
323 'Google-Accounts-XSRF': 1,
324 })
325
326 lookup_req = [
327 username,
328 None, [], None, 'US', None, None, 2, False, True,
329 [
330 None, None,
331 [2, 1, None, 1,
332 'https://accounts.google.com/ServiceLogin?passive=true&continue=https%3A%2F%2Fwww.youtube.com%2Fsignin%3Fnext%3D%252F%26action_handle_signin%3Dtrue%26hl%3Den%26app%3Ddesktop%26feature%3Dsign_in_button&hl=en&service=youtube&uilel=3&requestPath=%2FServiceLogin&Page=PasswordSeparationSignIn',
333 None, [], 4],
334 1, [None, None, []], None, None, None, True
335 ],
336 username,
337 ]
338
339 lookup_results = req(
340 self._LOOKUP_URL, lookup_req,
341 'Looking up account info', 'Unable to look up account info')
342
343 if lookup_results is False:
344 return False
345
346 user_hash = try_get(lookup_results, lambda x: x[0][2], compat_str)
347 if not user_hash:
348 warn('Unable to extract user hash')
349 return False
350
351 challenge_req = [
352 user_hash,
353 None, 1, None, [1, None, None, None, [password, None, True]],
354 [
355 None, None, [2, 1, None, 1, 'https://accounts.google.com/ServiceLogin?passive=true&continue=https%3A%2F%2Fwww.youtube.com%2Fsignin%3Fnext%3D%252F%26action_handle_signin%3Dtrue%26hl%3Den%26app%3Ddesktop%26feature%3Dsign_in_button&hl=en&service=youtube&uilel=3&requestPath=%2FServiceLogin&Page=PasswordSeparationSignIn', None, [], 4],
356 1, [None, None, []], None, None, None, True
357 ]]
358
359 challenge_results = req(
360 self._CHALLENGE_URL, challenge_req,
361 'Logging in', 'Unable to log in')
362
363 if challenge_results is False:
364 return
365
366 login_res = try_get(challenge_results, lambda x: x[0][5], list)
367 if login_res:
368 login_msg = try_get(login_res, lambda x: x[5], compat_str)
369 warn(
370 'Unable to login: %s' % 'Invalid password'
371 if login_msg == 'INCORRECT_ANSWER_ENTERED' else login_msg)
372 return False
373
374 res = try_get(challenge_results, lambda x: x[0][-1], list)
375 if not res:
376 warn('Unable to extract result entry')
377 return False
378
379 login_challenge = try_get(res, lambda x: x[0][0], list)
380 if login_challenge:
381 challenge_str = try_get(login_challenge, lambda x: x[2], compat_str)
382 if challenge_str == 'TWO_STEP_VERIFICATION':
383 # SEND_SUCCESS - TFA code has been successfully sent to phone
384 # QUOTA_EXCEEDED - reached the limit of TFA codes
385 status = try_get(login_challenge, lambda x: x[5], compat_str)
386 if status == 'QUOTA_EXCEEDED':
387 warn('Exceeded the limit of TFA codes, try later')
388 return False
389
390 tl = try_get(challenge_results, lambda x: x[1][2], compat_str)
391 if not tl:
392 warn('Unable to extract TL')
393 return False
394
395 tfa_code = self._get_tfa_info('2-step verification code')
396
397 if not tfa_code:
398 warn(
399 'Two-factor authentication required. Provide it either interactively or with --twofactor <code>'
400 '(Note that only TOTP (Google Authenticator App) codes work at this time.)')
401 return False
402
403 tfa_code = remove_start(tfa_code, 'G-')
404
405 tfa_req = [
406 user_hash, None, 2, None,
407 [
408 9, None, None, None, None, None, None, None,
409 [None, tfa_code, True, 2]
410 ]]
411
412 tfa_results = req(
413 self._TFA_URL.format(tl), tfa_req,
414 'Submitting TFA code', 'Unable to submit TFA code')
415
416 if tfa_results is False:
417 return False
418
419 tfa_res = try_get(tfa_results, lambda x: x[0][5], list)
420 if tfa_res:
421 tfa_msg = try_get(tfa_res, lambda x: x[5], compat_str)
422 warn(
423 'Unable to finish TFA: %s' % 'Invalid TFA code'
424 if tfa_msg == 'INCORRECT_ANSWER_ENTERED' else tfa_msg)
425 return False
426
427 check_cookie_url = try_get(
428 tfa_results, lambda x: x[0][-1][2], compat_str)
429 else:
430 CHALLENGES = {
431 'LOGIN_CHALLENGE': "This device isn't recognized. For your security, Google wants to make sure it's really you.",
432 'USERNAME_RECOVERY': 'Please provide additional information to aid in the recovery process.',
433 'REAUTH': "There is something unusual about your activity. For your security, Google wants to make sure it's really you.",
434 }
435 challenge = CHALLENGES.get(
436 challenge_str,
437 '%s returned error %s.' % (self.IE_NAME, challenge_str))
438 warn('%s\nGo to https://accounts.google.com/, login and solve a challenge.' % challenge)
439 return False
440 else:
441 check_cookie_url = try_get(res, lambda x: x[2], compat_str)
442
443 if not check_cookie_url:
444 warn('Unable to extract CheckCookie URL')
445 return False
446
447 check_cookie_results = self._download_webpage(
448 check_cookie_url, None, 'Checking cookie', fatal=False)
449
450 if check_cookie_results is False:
451 return False
452
453 if 'https://myaccount.google.com/' not in check_cookie_results:
454 warn('Unable to log in')
455 return False
456
457 return True
458 '''
459
460 def _initialize_consent(self):
461 cookies = self._get_cookies('https://www.youtube.com/')
462 if cookies.get('__Secure-3PSID'):
463 return
464 consent_id = None
465 consent = cookies.get('CONSENT')
466 if consent:
467 if 'YES' in consent.value:
468 return
469 consent_id = self._search_regex(
470 r'PENDING\+(\d+)', consent.value, 'consent', default=None)
471 if not consent_id:
472 consent_id = random.randint(100, 999)
473 self._set_cookie('.youtube.com', 'CONSENT', 'YES+cb.20210328-17-p0.en+FX+%s' % consent_id)
474
475 def _real_initialize(self):
476 self._initialize_consent()
477 if self._downloader is None:
478 return
479 if not self._login():
480 return
481
482 _YT_INITIAL_DATA_RE = r'(?:window\s*\[\s*["\']ytInitialData["\']\s*\]|ytInitialData)\s*=\s*({.+?})\s*;'
483 _YT_INITIAL_PLAYER_RESPONSE_RE = r'ytInitialPlayerResponse\s*=\s*({.+?})\s*;'
484 _YT_INITIAL_BOUNDARY_RE = r'(?:var\s+meta|</script|\n)'
485
    def _get_default_ytcfg(self, client='web'):
        # Deep copy so callers may mutate the returned config freely
        return copy.deepcopy(INNERTUBE_CLIENTS[client])

    def _get_innertube_host(self, client='web'):
        # Hostname used for innertube API requests (e.g. music.youtube.com)
        return INNERTUBE_CLIENTS[client]['INNERTUBE_HOST']

    def _ytcfg_get_safe(self, ytcfg, getter, expected_type=None, default_client='web'):
        # try_get but with fallback to default ytcfg client values when present
        _func = lambda y: try_get(y, getter, expected_type)
        return _func(ytcfg) or _func(self._get_default_ytcfg(default_client))

    def _extract_client_name(self, ytcfg, default_client='web'):
        # Client name string (e.g. 'WEB'), from either ytcfg key layout
        return self._ytcfg_get_safe(
            ytcfg, (lambda x: x['INNERTUBE_CLIENT_NAME'],
                    lambda x: x['INNERTUBE_CONTEXT']['client']['clientName']), compat_str, default_client)

    @staticmethod
    def _extract_session_index(*data):
        # First SESSION_INDEX found among the given ytcfgs, or None
        for ytcfg in data:
            session_index = int_or_none(try_get(ytcfg, lambda x: x['SESSION_INDEX']))
            if session_index is not None:
                return session_index

    def _extract_client_version(self, ytcfg, default_client='web'):
        # Client version string, from either ytcfg key layout
        return self._ytcfg_get_safe(
            ytcfg, (lambda x: x['INNERTUBE_CLIENT_VERSION'],
                    lambda x: x['INNERTUBE_CONTEXT']['client']['clientVersion']), compat_str, default_client)

    def _extract_api_key(self, ytcfg=None, default_client='web'):
        return self._ytcfg_get_safe(ytcfg, lambda x: x['INNERTUBE_API_KEY'], compat_str, default_client)
516
517 def _extract_context(self, ytcfg=None, default_client='web'):
518 _get_context = lambda y: try_get(y, lambda x: x['INNERTUBE_CONTEXT'], dict)
519 context = _get_context(ytcfg)
520 if context:
521 return context
522
523 context = _get_context(self._get_default_ytcfg(default_client))
524 if not ytcfg:
525 return context
526
527 # Recreate the client context (required)
528 context['client'].update({
529 'clientVersion': self._extract_client_version(ytcfg, default_client),
530 'clientName': self._extract_client_name(ytcfg, default_client),
531 })
532 visitor_data = try_get(ytcfg, lambda x: x['VISITOR_DATA'], compat_str)
533 if visitor_data:
534 context['client']['visitorData'] = visitor_data
535 return context
536
    # Cached SAPISID cookie value: None = not looked up yet, False = known absent
    _SAPISID = None

    def _generate_sapisidhash_header(self, origin='https://www.youtube.com'):
        """Build the SAPISIDHASH Authorization header value, or None if no SAPISID cookie."""
        time_now = round(time.time())
        if self._SAPISID is None:
            yt_cookies = self._get_cookies('https://www.youtube.com')
            # Sometimes SAPISID cookie isn't present but __Secure-3PAPISID is.
            # See: https://github.com/yt-dlp/yt-dlp/issues/393
            sapisid_cookie = dict_get(
                yt_cookies, ('__Secure-3PAPISID', 'SAPISID'))
            if sapisid_cookie and sapisid_cookie.value:
                self._SAPISID = sapisid_cookie.value
                self.write_debug('Extracted SAPISID cookie')
                # SAPISID cookie is required if not already present
                if not yt_cookies.get('SAPISID'):
                    self.write_debug('Copying __Secure-3PAPISID cookie to SAPISID cookie')
                    self._set_cookie(
                        '.youtube.com', 'SAPISID', self._SAPISID, secure=True, expire_time=time_now + 3600)
            else:
                self._SAPISID = False
        if not self._SAPISID:
            return None
        # SAPISIDHASH algorithm from https://stackoverflow.com/a/32065323
        sapisidhash = hashlib.sha1(
            f'{time_now} {self._SAPISID} {origin}'.encode('utf-8')).hexdigest()
        return f'SAPISIDHASH {time_now}_{sapisidhash}'
563
564 def _call_api(self, ep, query, video_id, fatal=True, headers=None,
565 note='Downloading API JSON', errnote='Unable to download API page',
566 context=None, api_key=None, api_hostname=None, default_client='web'):
567
568 data = {'context': context} if context else {'context': self._extract_context(default_client=default_client)}
569 data.update(query)
570 real_headers = self.generate_api_headers(default_client=default_client)
571 real_headers.update({'content-type': 'application/json'})
572 if headers:
573 real_headers.update(headers)
574 return self._download_json(
575 'https://%s/youtubei/v1/%s' % (api_hostname or self._get_innertube_host(default_client), ep),
576 video_id=video_id, fatal=fatal, note=note, errnote=errnote,
577 data=json.dumps(data).encode('utf8'), headers=real_headers,
578 query={'key': api_key or self._extract_api_key()})
579
    def extract_yt_initial_data(self, video_id, webpage):
        # Parse the ytInitialData JSON embedded in the watch page; first try
        # with the boundary regex to anchor the end, then without
        return self._parse_json(
            self._search_regex(
                (r'%s\s*%s' % (self._YT_INITIAL_DATA_RE, self._YT_INITIAL_BOUNDARY_RE),
                 self._YT_INITIAL_DATA_RE), webpage, 'yt initial data'),
            video_id)

    def _extract_identity_token(self, webpage, item_id):
        # ID_TOKEN from the page's ytcfg, falling back to a raw regex search;
        # None when the page is empty or the token cannot be found
        if not webpage:
            return None
        ytcfg = self.extract_ytcfg(item_id, webpage)
        if ytcfg:
            token = try_get(ytcfg, lambda x: x['ID_TOKEN'], compat_str)
            if token:
                return token
        return self._search_regex(
            r'\bID_TOKEN["\']\s*:\s*["\'](.+?)["\']', webpage,
            'identity token', default=None)
598
599 @staticmethod
600 def _extract_account_syncid(*args):
601 """
602 Extract syncId required to download private playlists of secondary channels
603 @params response and/or ytcfg
604 """
605 for data in args:
606 # ytcfg includes channel_syncid if on secondary channel
607 delegated_sid = try_get(data, lambda x: x['DELEGATED_SESSION_ID'], compat_str)
608 if delegated_sid:
609 return delegated_sid
610 sync_ids = (try_get(
611 data, (lambda x: x['responseContext']['mainAppWebResponseContext']['datasyncId'],
612 lambda x: x['DATASYNC_ID']), compat_str) or '').split("||")
613 if len(sync_ids) >= 2 and sync_ids[1]:
614 # datasyncid is of the form "channel_syncid||user_syncid" for secondary channel
615 # and just "user_syncid||" for primary channel. We only want the channel_syncid
616 return sync_ids[0]
617
    def extract_ytcfg(self, video_id, webpage):
        # Parse the ytcfg.set({...}) blob embedded in the page; {} on failure
        if not webpage:
            return {}
        return self._parse_json(
            self._search_regex(
                r'ytcfg\.set\s*\(\s*({.+?})\s*\)\s*;', webpage, 'ytcfg',
                default='{}'), video_id, fatal=False) or {}
625
    def generate_api_headers(
            self, ytcfg=None, identity_token=None, account_syncid=None,
            visitor_data=None, api_hostname=None, default_client='web', session_index=None):
        """Build the HTTP headers for an innertube API request."""
        origin = 'https://' + (api_hostname if api_hostname else self._get_innertube_host(default_client))
        headers = {
            'X-YouTube-Client-Name': compat_str(
                self._ytcfg_get_safe(ytcfg, lambda x: x['INNERTUBE_CONTEXT_CLIENT_NAME'], default_client=default_client)),
            'X-YouTube-Client-Version': self._extract_client_version(ytcfg, default_client),
            'Origin': origin
        }
        # Fall back to the visitorData carried inside the ytcfg context
        if not visitor_data and ytcfg:
            visitor_data = try_get(
                self._extract_context(ytcfg, default_client), lambda x: x['client']['visitorData'], compat_str)
        if identity_token:
            headers['X-Youtube-Identity-Token'] = identity_token
        if account_syncid:
            headers['X-Goog-PageId'] = account_syncid
        if session_index is None and ytcfg:
            session_index = self._extract_session_index(ytcfg)
        if account_syncid or session_index is not None:
            headers['X-Goog-AuthUser'] = session_index if session_index is not None else 0
        if visitor_data:
            headers['X-Goog-Visitor-Id'] = visitor_data
        # Authorization (SAPISIDHASH) is only added when the cookies allow it
        auth = self._generate_sapisidhash_header(origin)
        if auth is not None:
            headers['Authorization'] = auth
            headers['X-Origin'] = origin
        return headers
654
655 @staticmethod
656 def _build_api_continuation_query(continuation, ctp=None):
657 query = {
658 'continuation': continuation
659 }
660 # TODO: Inconsistency with clickTrackingParams.
661 # Currently we have a fixed ctp contained within context (from ytcfg)
662 # and a ctp in root query for continuation.
663 if ctp:
664 query['clickTracking'] = {'clickTrackingParams': ctp}
665 return query
666
    @classmethod
    def _extract_next_continuation_data(cls, renderer):
        # Old-style continuation: nextContinuationData / reloadContinuationData
        next_continuation = try_get(
            renderer, (lambda x: x['continuations'][0]['nextContinuationData'],
                       lambda x: x['continuation']['reloadContinuationData']), dict)
        if not next_continuation:
            return
        continuation = next_continuation.get('continuation')
        if not continuation:
            return
        ctp = next_continuation.get('clickTrackingParams')
        return cls._build_api_continuation_query(continuation, ctp)

    @classmethod
    def _extract_continuation_ep_data(cls, continuation_ep: dict):
        # New-style continuation endpoint carrying a continuationCommand token
        if isinstance(continuation_ep, dict):
            continuation = try_get(
                continuation_ep, lambda x: x['continuationCommand']['token'], compat_str)
            if not continuation:
                return
            ctp = continuation_ep.get('clickTrackingParams')
            return cls._build_api_continuation_query(continuation, ctp)
689
    @classmethod
    def _extract_continuation(cls, renderer):
        """Find a continuation query in a renderer, trying old then new style."""
        next_continuation = cls._extract_next_continuation_data(renderer)
        if next_continuation:
            return next_continuation

        # New style: scan the renderer's contents/items for a
        # continuationItemRenderer and extract its endpoint
        contents = []
        for key in ('contents', 'items'):
            contents.extend(try_get(renderer, lambda x: x[key], list) or [])

        for content in contents:
            if not isinstance(content, dict):
                continue
            continuation_ep = try_get(
                content, (lambda x: x['continuationItemRenderer']['continuationEndpoint'],
                          lambda x: x['continuationItemRenderer']['button']['buttonRenderer']['command']),
                dict)
            continuation = cls._extract_continuation_ep_data(continuation_ep)
            if continuation:
                return continuation
710
711 @classmethod
712 def _extract_alerts(cls, data):
713 for alert_dict in try_get(data, lambda x: x['alerts'], list) or []:
714 if not isinstance(alert_dict, dict):
715 continue
716 for alert in alert_dict.values():
717 alert_type = alert.get('type')
718 if not alert_type:
719 continue
720 message = cls._get_text(alert, 'text')
721 if message:
722 yield alert_type, message
723
724 def _report_alerts(self, alerts, expected=True, fatal=True):
725 errors = []
726 warnings = []
727 for alert_type, alert_message in alerts:
728 if alert_type.lower() == 'error' and fatal:
729 errors.append([alert_type, alert_message])
730 else:
731 warnings.append([alert_type, alert_message])
732
733 for alert_type, alert_message in (warnings + errors[:-1]):
734 self.report_warning('YouTube said: %s - %s' % (alert_type, alert_message))
735 if errors:
736 raise ExtractorError('YouTube said: %s' % errors[-1][1], expected=expected)
737
738 def _extract_and_report_alerts(self, data, *args, **kwargs):
739 return self._report_alerts(self._extract_alerts(data), *args, **kwargs)
740
741 def _extract_badges(self, renderer: dict):
742 badges = set()
743 for badge in try_get(renderer, lambda x: x['badges'], list) or []:
744 label = try_get(badge, lambda x: x['metadataBadgeRenderer']['label'], compat_str)
745 if label:
746 badges.add(label.lower())
747 return badges
748
    @staticmethod
    def _get_text(data, *path_list, max_runs=None):
        """Extract display text from a renderer: either 'simpleText' or the
        concatenation of up to max_runs 'runs' texts, tried for each path."""
        for path in path_list or [None]:
            if path is None:
                obj = [data]
            else:
                obj = traverse_obj(data, path, default=[])
                # A non-branching path yields a single node, not a list of nodes
                if not any(key is ... or isinstance(key, (list, tuple)) for key in variadic(path)):
                    obj = [obj]
            for item in obj:
                text = try_get(item, lambda x: x['simpleText'], compat_str)
                if text:
                    return text
                runs = try_get(item, lambda x: x['runs'], list) or []
                if not runs and isinstance(item, list):
                    # The item itself may already be a list of runs
                    runs = item

                runs = runs[:min(len(runs), max_runs or len(runs))]
                text = ''.join(traverse_obj(runs, (..., 'text'), expected_type=str, default=[]))
                if text:
                    return text
770
    def _extract_response(self, item_id, query, note='Downloading API JSON', headers=None,
                          ytcfg=None, check_get_keys=None, ep='browse', fatal=True, api_hostname=None,
                          default_client='web'):
        """Call the innertube API with retries; returns the parsed response or,
        when fatal is False, None after reporting the failure as a warning."""
        response = None
        last_error = None
        count = -1
        retries = self.get_param('extractor_retries', 3)
        if check_get_keys is None:
            check_get_keys = []
        while count < retries:
            count += 1
            if last_error:
                self.report_warning('%s. Retrying ...' % last_error)
            try:
                response = self._call_api(
                    ep=ep, fatal=True, headers=headers,
                    video_id=item_id, query=query,
                    context=self._extract_context(ytcfg, default_client),
                    api_key=self._extract_api_key(ytcfg, default_client),
                    api_hostname=api_hostname, default_client=default_client,
                    note='%s%s' % (note, ' (retry #%d)' % count if count else ''))
            except ExtractorError as e:
                if isinstance(e.cause, network_exceptions):
                    # Surface YouTube's own error message from a JSON error body
                    if isinstance(e.cause, compat_HTTPError) and not is_html(e.cause.read(512)):
                        e.cause.seek(0)
                        yt_error = try_get(
                            self._parse_json(e.cause.read().decode(), item_id, fatal=False),
                            lambda x: x['error']['message'], compat_str)
                        if yt_error:
                            self._report_alerts([('ERROR', yt_error)], fatal=False)
                    # Downloading page may result in intermittent 5xx HTTP error
                    # Sometimes a 404 is also received. See: https://github.com/ytdl-org/youtube-dl/issues/28289
                    # We also want to catch all other network exceptions since errors in later pages can be troublesome
                    # See https://github.com/yt-dlp/yt-dlp/issues/507#issuecomment-880188210
                    if not isinstance(e.cause, compat_HTTPError) or e.cause.code not in (403, 429):
                        last_error = error_to_compat_str(e.cause or e)
                        if count < retries:
                            continue
                if fatal:
                    raise
                else:
                    self.report_warning(error_to_compat_str(e))
                    return

            else:
                # Youtube may send alerts if there was an issue with the continuation page
                try:
                    self._extract_and_report_alerts(response, expected=False)
                except ExtractorError as e:
                    if fatal:
                        raise
                    self.report_warning(error_to_compat_str(e))
                    return
                if not check_get_keys or dict_get(response, check_get_keys):
                    break
                # Youtube sometimes sends incomplete data
                # See: https://github.com/ytdl-org/youtube-dl/issues/28194
                last_error = 'Incomplete data received'
                if count >= retries:
                    if fatal:
                        raise ExtractorError(last_error)
                    else:
                        self.report_warning(last_error)
                        return
        return response
836
837 @staticmethod
838 def is_music_url(url):
839 return re.match(r'https?://music\.youtube\.com/', url) is not None
840
841 def _extract_video(self, renderer):
842 video_id = renderer.get('videoId')
843 title = self._get_text(renderer, 'title')
844 description = self._get_text(renderer, 'descriptionSnippet')
845 duration = parse_duration(self._get_text(
846 renderer, 'lengthText', ('thumbnailOverlays', ..., 'thumbnailOverlayTimeStatusRenderer', 'text')))
847 view_count_text = self._get_text(renderer, 'viewCountText') or ''
848 view_count = str_to_int(self._search_regex(
849 r'^([\d,]+)', re.sub(r'\s', '', view_count_text),
850 'view count', default=None))
851
852 uploader = self._get_text(renderer, 'ownerText', 'shortBylineText')
853
854 return {
855 '_type': 'url',
856 'ie_key': YoutubeIE.ie_key(),
857 'id': video_id,
858 'url': video_id,
859 'title': title,
860 'description': description,
861 'duration': duration,
862 'view_count': view_count,
863 'uploader': uploader,
864 }
865
866
867 class YoutubeIE(YoutubeBaseInfoExtractor):
868 IE_DESC = 'YouTube.com'
869 _INVIDIOUS_SITES = (
870 # invidious-redirect websites
871 r'(?:www\.)?redirect\.invidious\.io',
872 r'(?:(?:www|dev)\.)?invidio\.us',
873 # Invidious instances taken from https://github.com/iv-org/documentation/blob/master/Invidious-Instances.md
874 r'(?:www\.)?invidious\.pussthecat\.org',
875 r'(?:www\.)?invidious\.zee\.li',
876 r'(?:www\.)?invidious\.ethibox\.fr',
877 r'(?:www\.)?invidious\.3o7z6yfxhbw7n3za4rss6l434kmv55cgw2vuziwuigpwegswvwzqipyd\.onion',
878 # youtube-dl invidious instances list
879 r'(?:(?:www|no)\.)?invidiou\.sh',
880 r'(?:(?:www|fi)\.)?invidious\.snopyta\.org',
881 r'(?:www\.)?invidious\.kabi\.tk',
882 r'(?:www\.)?invidious\.mastodon\.host',
883 r'(?:www\.)?invidious\.zapashcanon\.fr',
884 r'(?:www\.)?(?:invidious(?:-us)?|piped)\.kavin\.rocks',
885 r'(?:www\.)?invidious\.tinfoil-hat\.net',
886 r'(?:www\.)?invidious\.himiko\.cloud',
887 r'(?:www\.)?invidious\.reallyancient\.tech',
888 r'(?:www\.)?invidious\.tube',
889 r'(?:www\.)?invidiou\.site',
890 r'(?:www\.)?invidious\.site',
891 r'(?:www\.)?invidious\.xyz',
892 r'(?:www\.)?invidious\.nixnet\.xyz',
893 r'(?:www\.)?invidious\.048596\.xyz',
894 r'(?:www\.)?invidious\.drycat\.fr',
895 r'(?:www\.)?inv\.skyn3t\.in',
896 r'(?:www\.)?tube\.poal\.co',
897 r'(?:www\.)?tube\.connect\.cafe',
898 r'(?:www\.)?vid\.wxzm\.sx',
899 r'(?:www\.)?vid\.mint\.lgbt',
900 r'(?:www\.)?vid\.puffyan\.us',
901 r'(?:www\.)?yewtu\.be',
902 r'(?:www\.)?yt\.elukerio\.org',
903 r'(?:www\.)?yt\.lelux\.fi',
904 r'(?:www\.)?invidious\.ggc-project\.de',
905 r'(?:www\.)?yt\.maisputain\.ovh',
906 r'(?:www\.)?ytprivate\.com',
907 r'(?:www\.)?invidious\.13ad\.de',
908 r'(?:www\.)?invidious\.toot\.koeln',
909 r'(?:www\.)?invidious\.fdn\.fr',
910 r'(?:www\.)?watch\.nettohikari\.com',
911 r'(?:www\.)?invidious\.namazso\.eu',
912 r'(?:www\.)?invidious\.silkky\.cloud',
913 r'(?:www\.)?invidious\.exonip\.de',
914 r'(?:www\.)?invidious\.riverside\.rocks',
915 r'(?:www\.)?invidious\.blamefran\.net',
916 r'(?:www\.)?invidious\.moomoo\.de',
917 r'(?:www\.)?ytb\.trom\.tf',
918 r'(?:www\.)?yt\.cyberhost\.uk',
919 r'(?:www\.)?kgg2m7yk5aybusll\.onion',
920 r'(?:www\.)?qklhadlycap4cnod\.onion',
921 r'(?:www\.)?axqzx4s6s54s32yentfqojs3x5i7faxza6xo3ehd4bzzsg2ii4fv2iid\.onion',
922 r'(?:www\.)?c7hqkpkpemu6e7emz5b4vyz7idjgdvgaaa3dyimmeojqbgpea3xqjoid\.onion',
923 r'(?:www\.)?fz253lmuao3strwbfbmx46yu7acac2jz27iwtorgmbqlkurlclmancad\.onion',
924 r'(?:www\.)?invidious\.l4qlywnpwqsluw65ts7md3khrivpirse744un3x7mlskqauz5pyuzgqd\.onion',
925 r'(?:www\.)?owxfohz4kjyv25fvlqilyxast7inivgiktls3th44jhk3ej3i7ya\.b32\.i2p',
926 r'(?:www\.)?4l2dgddgsrkf2ous66i6seeyi6etzfgrue332grh2n7madpwopotugyd\.onion',
927 r'(?:www\.)?w6ijuptxiku4xpnnaetxvnkc5vqcdu7mgns2u77qefoixi63vbvnpnqd\.onion',
928 r'(?:www\.)?kbjggqkzv65ivcqj6bumvp337z6264huv5kpkwuv6gu5yjiskvan7fad\.onion',
929 r'(?:www\.)?grwp24hodrefzvjjuccrkw3mjq4tzhaaq32amf33dzpmuxe7ilepcmad\.onion',
930 r'(?:www\.)?hpniueoejy4opn7bc4ftgazyqjoeqwlvh2uiku2xqku6zpoa4bf5ruid\.onion',
931 )
932 _VALID_URL = r"""(?x)^
933 (
934 (?:https?://|//) # http(s):// or protocol-independent URL
935 (?:(?:(?:(?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie|kids)?\.com|
936 (?:www\.)?deturl\.com/www\.youtube\.com|
937 (?:www\.)?pwnyoutube\.com|
938 (?:www\.)?hooktube\.com|
939 (?:www\.)?yourepeat\.com|
940 tube\.majestyc\.net|
941 %(invidious)s|
942 youtube\.googleapis\.com)/ # the various hostnames, with wildcard subdomains
943 (?:.*?\#/)? # handle anchor (#/) redirect urls
944 (?: # the various things that can precede the ID:
945 (?:(?:v|embed|e|shorts)/(?!videoseries)) # v/ or embed/ or e/ or shorts/
946 |(?: # or the v= param in all its forms
947 (?:(?:watch|movie)(?:_popup)?(?:\.php)?/?)? # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
948 (?:\?|\#!?) # the params delimiter ? or # or #!
949 (?:.*?[&;])?? # any other preceding param (like /?s=tuff&v=xxxx or ?s=tuff&amp;v=V36LpHqtcDY)
950 v=
951 )
952 ))
953 |(?:
954 youtu\.be| # just youtu.be/xxxx
955 vid\.plus| # or vid.plus/xxxx
956 zwearz\.com/watch| # or zwearz.com/watch/xxxx
957 %(invidious)s
958 )/
959 |(?:www\.)?cleanvideosearch\.com/media/action/yt/watch\?videoId=
960 )
961 )? # all until now is optional -> you can pass the naked ID
962 (?P<id>[0-9A-Za-z_-]{11}) # here is it! the YouTube video ID
963 (?(1).+)? # if we found the ID, everything can follow
964 (?:\#|$)""" % {
965 'invidious': '|'.join(_INVIDIOUS_SITES),
966 }
967 _PLAYER_INFO_RE = (
968 r'/s/player/(?P<id>[a-zA-Z0-9_-]{8,})/player',
969 r'/(?P<id>[a-zA-Z0-9_-]{8,})/player(?:_ias\.vflset(?:/[a-zA-Z]{2,3}_[a-zA-Z]{2,3})?|-plasma-ias-(?:phone|tablet)-[a-z]{2}_[A-Z]{2}\.vflset)/base\.js$',
970 r'\b(?P<id>vfl[a-zA-Z0-9_-]+)\b.*?\.js$',
971 )
972 _formats = {
973 '5': {'ext': 'flv', 'width': 400, 'height': 240, 'acodec': 'mp3', 'abr': 64, 'vcodec': 'h263'},
974 '6': {'ext': 'flv', 'width': 450, 'height': 270, 'acodec': 'mp3', 'abr': 64, 'vcodec': 'h263'},
975 '13': {'ext': '3gp', 'acodec': 'aac', 'vcodec': 'mp4v'},
976 '17': {'ext': '3gp', 'width': 176, 'height': 144, 'acodec': 'aac', 'abr': 24, 'vcodec': 'mp4v'},
977 '18': {'ext': 'mp4', 'width': 640, 'height': 360, 'acodec': 'aac', 'abr': 96, 'vcodec': 'h264'},
978 '22': {'ext': 'mp4', 'width': 1280, 'height': 720, 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264'},
979 '34': {'ext': 'flv', 'width': 640, 'height': 360, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'},
980 '35': {'ext': 'flv', 'width': 854, 'height': 480, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'},
981 # itag 36 videos are either 320x180 (BaW_jenozKc) or 320x240 (__2ABJjxzNo), abr varies as well
982 '36': {'ext': '3gp', 'width': 320, 'acodec': 'aac', 'vcodec': 'mp4v'},
983 '37': {'ext': 'mp4', 'width': 1920, 'height': 1080, 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264'},
984 '38': {'ext': 'mp4', 'width': 4096, 'height': 3072, 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264'},
985 '43': {'ext': 'webm', 'width': 640, 'height': 360, 'acodec': 'vorbis', 'abr': 128, 'vcodec': 'vp8'},
986 '44': {'ext': 'webm', 'width': 854, 'height': 480, 'acodec': 'vorbis', 'abr': 128, 'vcodec': 'vp8'},
987 '45': {'ext': 'webm', 'width': 1280, 'height': 720, 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8'},
988 '46': {'ext': 'webm', 'width': 1920, 'height': 1080, 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8'},
989 '59': {'ext': 'mp4', 'width': 854, 'height': 480, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'},
990 '78': {'ext': 'mp4', 'width': 854, 'height': 480, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'},
991
992
993 # 3D videos
994 '82': {'ext': 'mp4', 'height': 360, 'format_note': '3D', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -20},
995 '83': {'ext': 'mp4', 'height': 480, 'format_note': '3D', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -20},
996 '84': {'ext': 'mp4', 'height': 720, 'format_note': '3D', 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264', 'preference': -20},
997 '85': {'ext': 'mp4', 'height': 1080, 'format_note': '3D', 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264', 'preference': -20},
998 '100': {'ext': 'webm', 'height': 360, 'format_note': '3D', 'acodec': 'vorbis', 'abr': 128, 'vcodec': 'vp8', 'preference': -20},
999 '101': {'ext': 'webm', 'height': 480, 'format_note': '3D', 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8', 'preference': -20},
1000 '102': {'ext': 'webm', 'height': 720, 'format_note': '3D', 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8', 'preference': -20},
1001
1002 # Apple HTTP Live Streaming
1003 '91': {'ext': 'mp4', 'height': 144, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 48, 'vcodec': 'h264', 'preference': -10},
1004 '92': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 48, 'vcodec': 'h264', 'preference': -10},
1005 '93': {'ext': 'mp4', 'height': 360, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -10},
1006 '94': {'ext': 'mp4', 'height': 480, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -10},
1007 '95': {'ext': 'mp4', 'height': 720, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 256, 'vcodec': 'h264', 'preference': -10},
1008 '96': {'ext': 'mp4', 'height': 1080, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 256, 'vcodec': 'h264', 'preference': -10},
1009 '132': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 48, 'vcodec': 'h264', 'preference': -10},
1010 '151': {'ext': 'mp4', 'height': 72, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 24, 'vcodec': 'h264', 'preference': -10},
1011
1012 # DASH mp4 video
1013 '133': {'ext': 'mp4', 'height': 240, 'format_note': 'DASH video', 'vcodec': 'h264'},
1014 '134': {'ext': 'mp4', 'height': 360, 'format_note': 'DASH video', 'vcodec': 'h264'},
1015 '135': {'ext': 'mp4', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'h264'},
1016 '136': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'h264'},
1017 '137': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'h264'},
1018 '138': {'ext': 'mp4', 'format_note': 'DASH video', 'vcodec': 'h264'}, # Height can vary (https://github.com/ytdl-org/youtube-dl/issues/4559)
1019 '160': {'ext': 'mp4', 'height': 144, 'format_note': 'DASH video', 'vcodec': 'h264'},
1020 '212': {'ext': 'mp4', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'h264'},
1021 '264': {'ext': 'mp4', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'h264'},
1022 '298': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'h264', 'fps': 60},
1023 '299': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'h264', 'fps': 60},
1024 '266': {'ext': 'mp4', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'h264'},
1025
1026 # Dash mp4 audio
1027 '139': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 48, 'container': 'm4a_dash'},
1028 '140': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 128, 'container': 'm4a_dash'},
1029 '141': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 256, 'container': 'm4a_dash'},
1030 '256': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'container': 'm4a_dash'},
1031 '258': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'container': 'm4a_dash'},
1032 '325': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'dtse', 'container': 'm4a_dash'},
1033 '328': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'ec-3', 'container': 'm4a_dash'},
1034
1035 # Dash webm
1036 '167': {'ext': 'webm', 'height': 360, 'width': 640, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
1037 '168': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
1038 '169': {'ext': 'webm', 'height': 720, 'width': 1280, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
1039 '170': {'ext': 'webm', 'height': 1080, 'width': 1920, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
1040 '218': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
1041 '219': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
1042 '278': {'ext': 'webm', 'height': 144, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp9'},
1043 '242': {'ext': 'webm', 'height': 240, 'format_note': 'DASH video', 'vcodec': 'vp9'},
1044 '243': {'ext': 'webm', 'height': 360, 'format_note': 'DASH video', 'vcodec': 'vp9'},
1045 '244': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'vp9'},
1046 '245': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'vp9'},
1047 '246': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'vp9'},
1048 '247': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'vp9'},
1049 '248': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'vp9'},
1050 '271': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'vp9'},
1051 # itag 272 videos are either 3840x2160 (e.g. RtoitU2A-3E) or 7680x4320 (sLprVF6d7Ug)
1052 '272': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'vp9'},
1053 '302': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60},
1054 '303': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60},
1055 '308': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60},
1056 '313': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'vp9'},
1057 '315': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60},
1058
1059 # Dash webm audio
1060 '171': {'ext': 'webm', 'acodec': 'vorbis', 'format_note': 'DASH audio', 'abr': 128},
1061 '172': {'ext': 'webm', 'acodec': 'vorbis', 'format_note': 'DASH audio', 'abr': 256},
1062
1063 # Dash webm audio with opus inside
1064 '249': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 50},
1065 '250': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 70},
1066 '251': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 160},
1067
1068 # RTMP (unnamed)
1069 '_rtmp': {'protocol': 'rtmp'},
1070
1071 # av01 video only formats sometimes served with "unknown" codecs
1072 '394': {'acodec': 'none', 'vcodec': 'av01.0.05M.08'},
1073 '395': {'acodec': 'none', 'vcodec': 'av01.0.05M.08'},
1074 '396': {'acodec': 'none', 'vcodec': 'av01.0.05M.08'},
1075 '397': {'acodec': 'none', 'vcodec': 'av01.0.05M.08'},
1076 }
1077 _SUBTITLE_FORMATS = ('json3', 'srv1', 'srv2', 'srv3', 'ttml', 'vtt')
1078
1079 _GEO_BYPASS = False
1080
1081 IE_NAME = 'youtube'
1082 _TESTS = [
1083 {
1084 'url': 'https://www.youtube.com/watch?v=BaW_jenozKc&t=1s&end=9',
1085 'info_dict': {
1086 'id': 'BaW_jenozKc',
1087 'ext': 'mp4',
1088 'title': 'youtube-dl test video "\'/\\ä↭𝕐',
1089 'uploader': 'Philipp Hagemeister',
1090 'uploader_id': 'phihag',
1091 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/phihag',
1092 'channel_id': 'UCLqxVugv74EIW3VWh2NOa3Q',
1093 'channel_url': r're:https?://(?:www\.)?youtube\.com/channel/UCLqxVugv74EIW3VWh2NOa3Q',
1094 'upload_date': '20121002',
1095 'description': 'test chars: "\'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de .',
1096 'categories': ['Science & Technology'],
1097 'tags': ['youtube-dl'],
1098 'duration': 10,
1099 'view_count': int,
1100 'like_count': int,
1101 'dislike_count': int,
1102 'start_time': 1,
1103 'end_time': 9,
1104 }
1105 },
1106 {
1107 'url': '//www.YouTube.com/watch?v=yZIXLfi8CZQ',
1108 'note': 'Embed-only video (#1746)',
1109 'info_dict': {
1110 'id': 'yZIXLfi8CZQ',
1111 'ext': 'mp4',
1112 'upload_date': '20120608',
1113 'title': 'Principal Sexually Assaults A Teacher - Episode 117 - 8th June 2012',
1114 'description': 'md5:09b78bd971f1e3e289601dfba15ca4f7',
1115 'uploader': 'SET India',
1116 'uploader_id': 'setindia',
1117 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/setindia',
1118 'age_limit': 18,
1119 },
1120 'skip': 'Private video',
1121 },
1122 {
1123 'url': 'https://www.youtube.com/watch?v=BaW_jenozKc&v=yZIXLfi8CZQ',
1124 'note': 'Use the first video ID in the URL',
1125 'info_dict': {
1126 'id': 'BaW_jenozKc',
1127 'ext': 'mp4',
1128 'title': 'youtube-dl test video "\'/\\ä↭𝕐',
1129 'uploader': 'Philipp Hagemeister',
1130 'uploader_id': 'phihag',
1131 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/phihag',
1132 'upload_date': '20121002',
1133 'description': 'test chars: "\'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de .',
1134 'categories': ['Science & Technology'],
1135 'tags': ['youtube-dl'],
1136 'duration': 10,
1137 'view_count': int,
1138 'like_count': int,
1139 'dislike_count': int,
1140 },
1141 'params': {
1142 'skip_download': True,
1143 },
1144 },
1145 {
1146 'url': 'https://www.youtube.com/watch?v=a9LDPn-MO4I',
1147 'note': '256k DASH audio (format 141) via DASH manifest',
1148 'info_dict': {
1149 'id': 'a9LDPn-MO4I',
1150 'ext': 'm4a',
1151 'upload_date': '20121002',
1152 'uploader_id': '8KVIDEO',
1153 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/8KVIDEO',
1154 'description': '',
1155 'uploader': '8KVIDEO',
1156 'title': 'UHDTV TEST 8K VIDEO.mp4'
1157 },
1158 'params': {
1159 'youtube_include_dash_manifest': True,
1160 'format': '141',
1161 },
1162 'skip': 'format 141 not served anymore',
1163 },
1164 # DASH manifest with encrypted signature
1165 {
1166 'url': 'https://www.youtube.com/watch?v=IB3lcPjvWLA',
1167 'info_dict': {
1168 'id': 'IB3lcPjvWLA',
1169 'ext': 'm4a',
1170 'title': 'Afrojack, Spree Wilson - The Spark (Official Music Video) ft. Spree Wilson',
1171 'description': 'md5:8f5e2b82460520b619ccac1f509d43bf',
1172 'duration': 244,
1173 'uploader': 'AfrojackVEVO',
1174 'uploader_id': 'AfrojackVEVO',
1175 'upload_date': '20131011',
1176 'abr': 129.495,
1177 },
1178 'params': {
1179 'youtube_include_dash_manifest': True,
1180 'format': '141/bestaudio[ext=m4a]',
1181 },
1182 },
1183 # Age-gate videos. See https://github.com/yt-dlp/yt-dlp/pull/575#issuecomment-888837000
1184 {
1185 'note': 'Embed allowed age-gate video',
1186 'url': 'https://youtube.com/watch?v=HtVdAasjOgU',
1187 'info_dict': {
1188 'id': 'HtVdAasjOgU',
1189 'ext': 'mp4',
1190 'title': 'The Witcher 3: Wild Hunt - The Sword Of Destiny Trailer',
1191 'description': r're:(?s).{100,}About the Game\n.*?The Witcher 3: Wild Hunt.{100,}',
1192 'duration': 142,
1193 'uploader': 'The Witcher',
1194 'uploader_id': 'WitcherGame',
1195 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/WitcherGame',
1196 'upload_date': '20140605',
1197 'age_limit': 18,
1198 },
1199 },
1200 {
1201 'note': 'Age-gate video with embed allowed in public site',
1202 'url': 'https://youtube.com/watch?v=HsUATh_Nc2U',
1203 'info_dict': {
1204 'id': 'HsUATh_Nc2U',
1205 'ext': 'mp4',
1206 'title': 'Godzilla 2 (Official Video)',
1207 'description': 'md5:bf77e03fcae5529475e500129b05668a',
1208 'upload_date': '20200408',
1209 'uploader_id': 'FlyingKitty900',
1210 'uploader': 'FlyingKitty',
1211 'age_limit': 18,
1212 },
1213 },
1214 {
1215 'note': 'Age-gate video embedable only with clientScreen=EMBED',
1216 'url': 'https://youtube.com/watch?v=Tq92D6wQ1mg',
1217 'info_dict': {
1218 'id': 'Tq92D6wQ1mg',
1219 'title': '[MMD] Adios - EVERGLOW [+Motion DL]',
1220 'ext': 'mp4',
1221 'upload_date': '20191227',
1222 'uploader_id': 'UC1yoRdFoFJaCY-AGfD9W0wQ',
1223 'uploader': 'Projekt Melody',
1224 'description': 'md5:17eccca93a786d51bc67646756894066',
1225 'age_limit': 18,
1226 },
1227 },
1228 {
1229 'note': 'Non-Agegated non-embeddable video',
1230 'url': 'https://youtube.com/watch?v=MeJVWBSsPAY',
1231 'info_dict': {
1232 'id': 'MeJVWBSsPAY',
1233 'ext': 'mp4',
1234 'title': 'OOMPH! - Such Mich Find Mich (Lyrics)',
1235 'uploader': 'Herr Lurik',
1236 'uploader_id': 'st3in234',
1237 'description': 'Fan Video. Music & Lyrics by OOMPH!.',
1238 'upload_date': '20130730',
1239 },
1240 },
1241 {
1242 'note': 'Non-bypassable age-gated video',
1243 'url': 'https://youtube.com/watch?v=Cr381pDsSsA',
1244 'only_matching': True,
1245 },
1246 # video_info is None (https://github.com/ytdl-org/youtube-dl/issues/4421)
1247 # YouTube Red ad is not captured for creator
1248 {
1249 'url': '__2ABJjxzNo',
1250 'info_dict': {
1251 'id': '__2ABJjxzNo',
1252 'ext': 'mp4',
1253 'duration': 266,
1254 'upload_date': '20100430',
1255 'uploader_id': 'deadmau5',
1256 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/deadmau5',
1257 'creator': 'deadmau5',
1258 'description': 'md5:6cbcd3a92ce1bc676fc4d6ab4ace2336',
1259 'uploader': 'deadmau5',
1260 'title': 'Deadmau5 - Some Chords (HD)',
1261 'alt_title': 'Some Chords',
1262 },
1263 'expected_warnings': [
1264 'DASH manifest missing',
1265 ]
1266 },
1267 # Olympics (https://github.com/ytdl-org/youtube-dl/issues/4431)
1268 {
1269 'url': 'lqQg6PlCWgI',
1270 'info_dict': {
1271 'id': 'lqQg6PlCWgI',
1272 'ext': 'mp4',
1273 'duration': 6085,
1274 'upload_date': '20150827',
1275 'uploader_id': 'olympic',
1276 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/olympic',
1277 'description': 'HO09 - Women - GER-AUS - Hockey - 31 July 2012 - London 2012 Olympic Games',
1278 'uploader': 'Olympics',
1279 'title': 'Hockey - Women - GER-AUS - London 2012 Olympic Games',
1280 },
1281 'params': {
1282 'skip_download': 'requires avconv',
1283 }
1284 },
1285 # Non-square pixels
1286 {
1287 'url': 'https://www.youtube.com/watch?v=_b-2C3KPAM0',
1288 'info_dict': {
1289 'id': '_b-2C3KPAM0',
1290 'ext': 'mp4',
1291 'stretched_ratio': 16 / 9.,
1292 'duration': 85,
1293 'upload_date': '20110310',
1294 'uploader_id': 'AllenMeow',
1295 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/AllenMeow',
1296 'description': 'made by Wacom from Korea | 字幕&加油添醋 by TY\'s Allen | 感謝heylisa00cavey1001同學熱情提供梗及翻譯',
1297 'uploader': '孫ᄋᄅ',
1298 'title': '[A-made] 變態妍字幕版 太妍 我就是這樣的人',
1299 },
1300 },
1301 # url_encoded_fmt_stream_map is empty string
1302 {
1303 'url': 'qEJwOuvDf7I',
1304 'info_dict': {
1305 'id': 'qEJwOuvDf7I',
1306 'ext': 'webm',
1307 'title': 'Обсуждение судебной практики по выборам 14 сентября 2014 года в Санкт-Петербурге',
1308 'description': '',
1309 'upload_date': '20150404',
1310 'uploader_id': 'spbelect',
1311 'uploader': 'Наблюдатели Петербурга',
1312 },
1313 'params': {
1314 'skip_download': 'requires avconv',
1315 },
1316 'skip': 'This live event has ended.',
1317 },
1318 # Extraction from multiple DASH manifests (https://github.com/ytdl-org/youtube-dl/pull/6097)
1319 {
1320 'url': 'https://www.youtube.com/watch?v=FIl7x6_3R5Y',
1321 'info_dict': {
1322 'id': 'FIl7x6_3R5Y',
1323 'ext': 'webm',
1324 'title': 'md5:7b81415841e02ecd4313668cde88737a',
1325 'description': 'md5:116377fd2963b81ec4ce64b542173306',
1326 'duration': 220,
1327 'upload_date': '20150625',
1328 'uploader_id': 'dorappi2000',
1329 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/dorappi2000',
1330 'uploader': 'dorappi2000',
1331 'formats': 'mincount:31',
1332 },
1333 'skip': 'not actual anymore',
1334 },
1335 # DASH manifest with segment_list
1336 {
1337 'url': 'https://www.youtube.com/embed/CsmdDsKjzN8',
1338 'md5': '8ce563a1d667b599d21064e982ab9e31',
1339 'info_dict': {
1340 'id': 'CsmdDsKjzN8',
1341 'ext': 'mp4',
1342 'upload_date': '20150501', # According to '<meta itemprop="datePublished"', but in other places it's 20150510
1343 'uploader': 'Airtek',
1344 'description': 'Retransmisión en directo de la XVIII media maratón de Zaragoza.',
1345 'uploader_id': 'UCzTzUmjXxxacNnL8I3m4LnQ',
1346 'title': 'Retransmisión XVIII Media maratón Zaragoza 2015',
1347 },
1348 'params': {
1349 'youtube_include_dash_manifest': True,
1350 'format': '135', # bestvideo
1351 },
1352 'skip': 'This live event has ended.',
1353 },
1354 {
1355 # Multifeed videos (multiple cameras), URL is for Main Camera
1356 'url': 'https://www.youtube.com/watch?v=jvGDaLqkpTg',
1357 'info_dict': {
1358 'id': 'jvGDaLqkpTg',
1359 'title': 'Tom Clancy Free Weekend Rainbow Whatever',
1360 'description': 'md5:e03b909557865076822aa169218d6a5d',
1361 },
1362 'playlist': [{
1363 'info_dict': {
1364 'id': 'jvGDaLqkpTg',
1365 'ext': 'mp4',
1366 'title': 'Tom Clancy Free Weekend Rainbow Whatever (Main Camera)',
1367 'description': 'md5:e03b909557865076822aa169218d6a5d',
1368 'duration': 10643,
1369 'upload_date': '20161111',
1370 'uploader': 'Team PGP',
1371 'uploader_id': 'UChORY56LMMETTuGjXaJXvLg',
1372 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UChORY56LMMETTuGjXaJXvLg',
1373 },
1374 }, {
1375 'info_dict': {
1376 'id': '3AKt1R1aDnw',
1377 'ext': 'mp4',
1378 'title': 'Tom Clancy Free Weekend Rainbow Whatever (Camera 2)',
1379 'description': 'md5:e03b909557865076822aa169218d6a5d',
1380 'duration': 10991,
1381 'upload_date': '20161111',
1382 'uploader': 'Team PGP',
1383 'uploader_id': 'UChORY56LMMETTuGjXaJXvLg',
1384 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UChORY56LMMETTuGjXaJXvLg',
1385 },
1386 }, {
1387 'info_dict': {
1388 'id': 'RtAMM00gpVc',
1389 'ext': 'mp4',
1390 'title': 'Tom Clancy Free Weekend Rainbow Whatever (Camera 3)',
1391 'description': 'md5:e03b909557865076822aa169218d6a5d',
1392 'duration': 10995,
1393 'upload_date': '20161111',
1394 'uploader': 'Team PGP',
1395 'uploader_id': 'UChORY56LMMETTuGjXaJXvLg',
1396 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UChORY56LMMETTuGjXaJXvLg',
1397 },
1398 }, {
1399 'info_dict': {
1400 'id': '6N2fdlP3C5U',
1401 'ext': 'mp4',
1402 'title': 'Tom Clancy Free Weekend Rainbow Whatever (Camera 4)',
1403 'description': 'md5:e03b909557865076822aa169218d6a5d',
1404 'duration': 10990,
1405 'upload_date': '20161111',
1406 'uploader': 'Team PGP',
1407 'uploader_id': 'UChORY56LMMETTuGjXaJXvLg',
1408 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UChORY56LMMETTuGjXaJXvLg',
1409 },
1410 }],
1411 'params': {
1412 'skip_download': True,
1413 },
1414 'skip': 'Not multifeed anymore',
1415 },
1416 {
1417 # Multifeed video with comma in title (see https://github.com/ytdl-org/youtube-dl/issues/8536)
1418 'url': 'https://www.youtube.com/watch?v=gVfLd0zydlo',
1419 'info_dict': {
1420 'id': 'gVfLd0zydlo',
1421 'title': 'DevConf.cz 2016 Day 2 Workshops 1 14:00 - 15:30',
1422 },
1423 'playlist_count': 2,
1424 'skip': 'Not multifeed anymore',
1425 },
1426 {
1427 'url': 'https://vid.plus/FlRa-iH7PGw',
1428 'only_matching': True,
1429 },
1430 {
1431 'url': 'https://zwearz.com/watch/9lWxNJF-ufM/electra-woman-dyna-girl-official-trailer-grace-helbig.html',
1432 'only_matching': True,
1433 },
1434 {
1435 # Title with JS-like syntax "};" (see https://github.com/ytdl-org/youtube-dl/issues/7468)
1436 # Also tests cut-off URL expansion in video description (see
1437 # https://github.com/ytdl-org/youtube-dl/issues/1892,
1438 # https://github.com/ytdl-org/youtube-dl/issues/8164)
1439 'url': 'https://www.youtube.com/watch?v=lsguqyKfVQg',
1440 'info_dict': {
1441 'id': 'lsguqyKfVQg',
1442 'ext': 'mp4',
1443 'title': '{dark walk}; Loki/AC/Dishonored; collab w/Elflover21',
1444 'alt_title': 'Dark Walk',
1445 'description': 'md5:8085699c11dc3f597ce0410b0dcbb34a',
1446 'duration': 133,
1447 'upload_date': '20151119',
1448 'uploader_id': 'IronSoulElf',
1449 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/IronSoulElf',
1450 'uploader': 'IronSoulElf',
1451 'creator': 'Todd Haberman;\nDaniel Law Heath and Aaron Kaplan',
1452 'track': 'Dark Walk',
1453 'artist': 'Todd Haberman;\nDaniel Law Heath and Aaron Kaplan',
1454 'album': 'Position Music - Production Music Vol. 143 - Dark Walk',
1455 },
1456 'params': {
1457 'skip_download': True,
1458 },
1459 },
1460 {
1461 # Tags with '};' (see https://github.com/ytdl-org/youtube-dl/issues/7468)
1462 'url': 'https://www.youtube.com/watch?v=Ms7iBXnlUO8',
1463 'only_matching': True,
1464 },
1465 {
1466 # Video with yt:stretch=17:0
1467 'url': 'https://www.youtube.com/watch?v=Q39EVAstoRM',
1468 'info_dict': {
1469 'id': 'Q39EVAstoRM',
1470 'ext': 'mp4',
1471 'title': 'Clash Of Clans#14 Dicas De Ataque Para CV 4',
1472 'description': 'md5:ee18a25c350637c8faff806845bddee9',
1473 'upload_date': '20151107',
1474 'uploader_id': 'UCCr7TALkRbo3EtFzETQF1LA',
1475 'uploader': 'CH GAMER DROID',
1476 },
1477 'params': {
1478 'skip_download': True,
1479 },
1480 'skip': 'This video does not exist.',
1481 },
1482 {
1483 # Video with incomplete 'yt:stretch=16:'
1484 'url': 'https://www.youtube.com/watch?v=FRhJzUSJbGI',
1485 'only_matching': True,
1486 },
1487 {
1488 # Video licensed under Creative Commons
1489 'url': 'https://www.youtube.com/watch?v=M4gD1WSo5mA',
1490 'info_dict': {
1491 'id': 'M4gD1WSo5mA',
1492 'ext': 'mp4',
1493 'title': 'md5:e41008789470fc2533a3252216f1c1d1',
1494 'description': 'md5:a677553cf0840649b731a3024aeff4cc',
1495 'duration': 721,
1496 'upload_date': '20150127',
1497 'uploader_id': 'BerkmanCenter',
1498 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/BerkmanCenter',
1499 'uploader': 'The Berkman Klein Center for Internet & Society',
1500 'license': 'Creative Commons Attribution license (reuse allowed)',
1501 },
1502 'params': {
1503 'skip_download': True,
1504 },
1505 },
1506 {
1507 # Channel-like uploader_url
1508 'url': 'https://www.youtube.com/watch?v=eQcmzGIKrzg',
1509 'info_dict': {
1510 'id': 'eQcmzGIKrzg',
1511 'ext': 'mp4',
1512 'title': 'Democratic Socialism and Foreign Policy | Bernie Sanders',
1513 'description': 'md5:13a2503d7b5904ef4b223aa101628f39',
1514 'duration': 4060,
1515 'upload_date': '20151119',
1516 'uploader': 'Bernie Sanders',
1517 'uploader_id': 'UCH1dpzjCEiGAt8CXkryhkZg',
1518 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UCH1dpzjCEiGAt8CXkryhkZg',
1519 'license': 'Creative Commons Attribution license (reuse allowed)',
1520 },
1521 'params': {
1522 'skip_download': True,
1523 },
1524 },
1525 {
1526 'url': 'https://www.youtube.com/watch?feature=player_embedded&amp;amp;v=V36LpHqtcDY',
1527 'only_matching': True,
1528 },
1529 {
1530 # YouTube Red paid video (https://github.com/ytdl-org/youtube-dl/issues/10059)
1531 'url': 'https://www.youtube.com/watch?v=i1Ko8UG-Tdo',
1532 'only_matching': True,
1533 },
1534 {
1535 # Rental video preview
1536 'url': 'https://www.youtube.com/watch?v=yYr8q0y5Jfg',
1537 'info_dict': {
1538 'id': 'uGpuVWrhIzE',
1539 'ext': 'mp4',
1540 'title': 'Piku - Trailer',
1541 'description': 'md5:c36bd60c3fd6f1954086c083c72092eb',
1542 'upload_date': '20150811',
1543 'uploader': 'FlixMatrix',
1544 'uploader_id': 'FlixMatrixKaravan',
1545 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/FlixMatrixKaravan',
1546 'license': 'Standard YouTube License',
1547 },
1548 'params': {
1549 'skip_download': True,
1550 },
1551 'skip': 'This video is not available.',
1552 },
1553 {
1554 # YouTube Red video with episode data
1555 'url': 'https://www.youtube.com/watch?v=iqKdEhx-dD4',
1556 'info_dict': {
1557 'id': 'iqKdEhx-dD4',
1558 'ext': 'mp4',
1559 'title': 'Isolation - Mind Field (Ep 1)',
1560 'description': 'md5:f540112edec5d09fc8cc752d3d4ba3cd',
1561 'duration': 2085,
1562 'upload_date': '20170118',
1563 'uploader': 'Vsauce',
1564 'uploader_id': 'Vsauce',
1565 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/Vsauce',
1566 'series': 'Mind Field',
1567 'season_number': 1,
1568 'episode_number': 1,
1569 },
1570 'params': {
1571 'skip_download': True,
1572 },
1573 'expected_warnings': [
1574 'Skipping DASH manifest',
1575 ],
1576 },
1577 {
1578 # The following content has been identified by the YouTube community
1579 # as inappropriate or offensive to some audiences.
1580 'url': 'https://www.youtube.com/watch?v=6SJNVb0GnPI',
1581 'info_dict': {
1582 'id': '6SJNVb0GnPI',
1583 'ext': 'mp4',
1584 'title': 'Race Differences in Intelligence',
1585 'description': 'md5:5d161533167390427a1f8ee89a1fc6f1',
1586 'duration': 965,
1587 'upload_date': '20140124',
1588 'uploader': 'New Century Foundation',
1589 'uploader_id': 'UCEJYpZGqgUob0zVVEaLhvVg',
1590 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UCEJYpZGqgUob0zVVEaLhvVg',
1591 },
1592 'params': {
1593 'skip_download': True,
1594 },
1595 'skip': 'This video has been removed for violating YouTube\'s policy on hate speech.',
1596 },
1597 {
1598 # itag 212
1599 'url': '1t24XAntNCY',
1600 'only_matching': True,
1601 },
1602 {
1603 # geo restricted to JP
1604 'url': 'sJL6WA-aGkQ',
1605 'only_matching': True,
1606 },
1607 {
1608 'url': 'https://invidio.us/watch?v=BaW_jenozKc',
1609 'only_matching': True,
1610 },
1611 {
1612 'url': 'https://redirect.invidious.io/watch?v=BaW_jenozKc',
1613 'only_matching': True,
1614 },
1615 {
1616 # from https://nitter.pussthecat.org/YouTube/status/1360363141947944964#m
1617 'url': 'https://redirect.invidious.io/Yh0AhrY9GjA',
1618 'only_matching': True,
1619 },
1620 {
1621 # DRM protected
1622 'url': 'https://www.youtube.com/watch?v=s7_qI6_mIXc',
1623 'only_matching': True,
1624 },
1625 {
1626 # Video with unsupported adaptive stream type formats
1627 'url': 'https://www.youtube.com/watch?v=Z4Vy8R84T1U',
1628 'info_dict': {
1629 'id': 'Z4Vy8R84T1U',
1630 'ext': 'mp4',
1631 'title': 'saman SMAN 53 Jakarta(Sancety) opening COFFEE4th at SMAN 53 Jakarta',
1632 'description': 'md5:d41d8cd98f00b204e9800998ecf8427e',
1633 'duration': 433,
1634 'upload_date': '20130923',
1635 'uploader': 'Amelia Putri Harwita',
1636 'uploader_id': 'UCpOxM49HJxmC1qCalXyB3_Q',
1637 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UCpOxM49HJxmC1qCalXyB3_Q',
1638 'formats': 'maxcount:10',
1639 },
1640 'params': {
1641 'skip_download': True,
1642 'youtube_include_dash_manifest': False,
1643 },
1644 'skip': 'not actual anymore',
1645 },
1646 {
1647 # Youtube Music Auto-generated description
1648 'url': 'https://music.youtube.com/watch?v=MgNrAu2pzNs',
1649 'info_dict': {
1650 'id': 'MgNrAu2pzNs',
1651 'ext': 'mp4',
1652 'title': 'Voyeur Girl',
1653 'description': 'md5:7ae382a65843d6df2685993e90a8628f',
1654 'upload_date': '20190312',
1655 'uploader': 'Stephen - Topic',
1656 'uploader_id': 'UC-pWHpBjdGG69N9mM2auIAA',
1657 'artist': 'Stephen',
1658 'track': 'Voyeur Girl',
1659 'album': 'it\'s too much love to know my dear',
1660 'release_date': '20190313',
1661 'release_year': 2019,
1662 },
1663 'params': {
1664 'skip_download': True,
1665 },
1666 },
1667 {
1668 'url': 'https://www.youtubekids.com/watch?v=3b8nCWDgZ6Q',
1669 'only_matching': True,
1670 },
1671 {
1672 # invalid -> valid video id redirection
1673 'url': 'DJztXj2GPfl',
1674 'info_dict': {
1675 'id': 'DJztXj2GPfk',
1676 'ext': 'mp4',
1677 'title': 'Panjabi MC - Mundian To Bach Ke (The Dictator Soundtrack)',
1678 'description': 'md5:bf577a41da97918e94fa9798d9228825',
1679 'upload_date': '20090125',
1680 'uploader': 'Prochorowka',
1681 'uploader_id': 'Prochorowka',
1682 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/Prochorowka',
1683 'artist': 'Panjabi MC',
1684 'track': 'Beware of the Boys (Mundian to Bach Ke) - Motivo Hi-Lectro Remix',
1685 'album': 'Beware of the Boys (Mundian To Bach Ke)',
1686 },
1687 'params': {
1688 'skip_download': True,
1689 },
1690 'skip': 'Video unavailable',
1691 },
1692 {
1693 # empty description results in an empty string
1694 'url': 'https://www.youtube.com/watch?v=x41yOUIvK2k',
1695 'info_dict': {
1696 'id': 'x41yOUIvK2k',
1697 'ext': 'mp4',
1698 'title': 'IMG 3456',
1699 'description': '',
1700 'upload_date': '20170613',
1701 'uploader_id': 'ElevageOrVert',
1702 'uploader': 'ElevageOrVert',
1703 },
1704 'params': {
1705 'skip_download': True,
1706 },
1707 },
1708 {
1709 # with '};' inside yt initial data (see [1])
1710 # see [2] for an example with '};' inside ytInitialPlayerResponse
1711 # 1. https://github.com/ytdl-org/youtube-dl/issues/27093
1712 # 2. https://github.com/ytdl-org/youtube-dl/issues/27216
1713 'url': 'https://www.youtube.com/watch?v=CHqg6qOn4no',
1714 'info_dict': {
1715 'id': 'CHqg6qOn4no',
1716 'ext': 'mp4',
1717 'title': 'Part 77 Sort a list of simple types in c#',
1718 'description': 'md5:b8746fa52e10cdbf47997903f13b20dc',
1719 'upload_date': '20130831',
1720 'uploader_id': 'kudvenkat',
1721 'uploader': 'kudvenkat',
1722 },
1723 'params': {
1724 'skip_download': True,
1725 },
1726 },
1727 {
1728 # another example of '};' in ytInitialData
1729 'url': 'https://www.youtube.com/watch?v=gVfgbahppCY',
1730 'only_matching': True,
1731 },
1732 {
1733 'url': 'https://www.youtube.com/watch_popup?v=63RmMXCd_bQ',
1734 'only_matching': True,
1735 },
1736 {
1737 # https://github.com/ytdl-org/youtube-dl/pull/28094
1738 'url': 'OtqTfy26tG0',
1739 'info_dict': {
1740 'id': 'OtqTfy26tG0',
1741 'ext': 'mp4',
1742 'title': 'Burn Out',
1743 'description': 'md5:8d07b84dcbcbfb34bc12a56d968b6131',
1744 'upload_date': '20141120',
1745 'uploader': 'The Cinematic Orchestra - Topic',
1746 'uploader_id': 'UCIzsJBIyo8hhpFm1NK0uLgw',
1747 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UCIzsJBIyo8hhpFm1NK0uLgw',
1748 'artist': 'The Cinematic Orchestra',
1749 'track': 'Burn Out',
1750 'album': 'Every Day',
1751 'release_data': None,
1752 'release_year': None,
1753 },
1754 'params': {
1755 'skip_download': True,
1756 },
1757 },
1758 {
1759 # controversial video, only works with bpctr when authenticated with cookies
1760 'url': 'https://www.youtube.com/watch?v=nGC3D_FkCmg',
1761 'only_matching': True,
1762 },
1763 {
1764 # controversial video, requires bpctr/contentCheckOk
1765 'url': 'https://www.youtube.com/watch?v=SZJvDhaSDnc',
1766 'info_dict': {
1767 'id': 'SZJvDhaSDnc',
1768 'ext': 'mp4',
1769 'title': 'San Diego teen commits suicide after bullying over embarrassing video',
1770 'channel_id': 'UC-SJ6nODDmufqBzPBwCvYvQ',
1771 'uploader': 'CBS This Morning',
1772 'uploader_id': 'CBSThisMorning',
1773 'upload_date': '20140716',
1774 'description': 'md5:acde3a73d3f133fc97e837a9f76b53b7'
1775 }
1776 },
1777 {
1778 # restricted location, https://github.com/ytdl-org/youtube-dl/issues/28685
1779 'url': 'cBvYw8_A0vQ',
1780 'info_dict': {
1781 'id': 'cBvYw8_A0vQ',
1782 'ext': 'mp4',
1783 'title': '4K Ueno Okachimachi Street Scenes 上野御徒町歩き',
1784 'description': 'md5:ea770e474b7cd6722b4c95b833c03630',
1785 'upload_date': '20201120',
1786 'uploader': 'Walk around Japan',
1787 'uploader_id': 'UC3o_t8PzBmXf5S9b7GLx1Mw',
1788 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UC3o_t8PzBmXf5S9b7GLx1Mw',
1789 },
1790 'params': {
1791 'skip_download': True,
1792 },
1793 }, {
1794 # Has multiple audio streams
1795 'url': 'WaOKSUlf4TM',
1796 'only_matching': True
1797 }, {
1798 # Requires Premium: has format 141 when requested using YTM url
1799 'url': 'https://music.youtube.com/watch?v=XclachpHxis',
1800 'only_matching': True
1801 }, {
1802 # multiple subtitles with same lang_code
1803 'url': 'https://www.youtube.com/watch?v=wsQiKKfKxug',
1804 'only_matching': True,
1805 }, {
1806 # Force use android client fallback
1807 'url': 'https://www.youtube.com/watch?v=YOelRv7fMxY',
1808 'info_dict': {
1809 'id': 'YOelRv7fMxY',
1810 'title': 'DIGGING A SECRET TUNNEL Part 1',
1811 'ext': '3gp',
1812 'upload_date': '20210624',
1813 'channel_id': 'UCp68_FLety0O-n9QU6phsgw',
1814 'uploader': 'colinfurze',
1815 'uploader_id': 'colinfurze',
1816 'channel_url': r're:https?://(?:www\.)?youtube\.com/channel/UCp68_FLety0O-n9QU6phsgw',
1817 'description': 'md5:b5096f56af7ccd7a555c84db81738b22'
1818 },
1819 'params': {
1820 'format': '17', # 3gp format available on android
1821 'extractor_args': {'youtube': {'player_client': ['android']}},
1822 },
1823 },
1824 {
1825 # Skip download of additional client configs (remix client config in this case)
1826 'url': 'https://music.youtube.com/watch?v=MgNrAu2pzNs',
1827 'only_matching': True,
1828 'params': {
1829 'extractor_args': {'youtube': {'player_skip': ['configs']}},
1830 },
1831 }, {
1832 # shorts
1833 'url': 'https://www.youtube.com/shorts/BGQWPY4IigY',
1834 'only_matching': True,
1835 },
1836 ]
1837
1838 @classmethod
1839 def suitable(cls, url):
1840 # Hack for lazy extractors until more generic solution is implemented
1841 # (see #28780)
1842 from ..utils import parse_qs
1843
1844 qs = parse_qs(url)
1845 if qs.get('list', [None])[0]:
1846 return False
1847 return super(YoutubeIE, cls).suitable(url)
1848
1849 def __init__(self, *args, **kwargs):
1850 super(YoutubeIE, self).__init__(*args, **kwargs)
1851 self._code_cache = {}
1852 self._player_cache = {}
1853
1854 def _extract_player_url(self, ytcfg=None, webpage=None):
1855 player_url = try_get(ytcfg, (lambda x: x['PLAYER_JS_URL']), str)
1856 if not player_url and webpage:
1857 player_url = self._search_regex(
1858 r'"(?:PLAYER_JS_URL|jsUrl)"\s*:\s*"([^"]+)"',
1859 webpage, 'player URL', fatal=False)
1860 if not player_url:
1861 return None
1862 if player_url.startswith('//'):
1863 player_url = 'https:' + player_url
1864 elif not re.match(r'https?://', player_url):
1865 player_url = compat_urlparse.urljoin(
1866 'https://www.youtube.com', player_url)
1867 return player_url
1868
1869 def _signature_cache_id(self, example_sig):
1870 """ Return a string representation of a signature """
1871 return '.'.join(compat_str(len(part)) for part in example_sig.split('.'))
1872
1873 @classmethod
1874 def _extract_player_info(cls, player_url):
1875 for player_re in cls._PLAYER_INFO_RE:
1876 id_m = re.search(player_re, player_url)
1877 if id_m:
1878 break
1879 else:
1880 raise ExtractorError('Cannot identify player %r' % player_url)
1881 return id_m.group('id')
1882
1883 def _load_player(self, video_id, player_url, fatal=True) -> bool:
1884 player_id = self._extract_player_info(player_url)
1885 if player_id not in self._code_cache:
1886 self._code_cache[player_id] = self._download_webpage(
1887 player_url, video_id, fatal=fatal,
1888 note='Downloading player ' + player_id,
1889 errnote='Download of %s failed' % player_url)
1890 return player_id in self._code_cache
1891
    def _extract_signature_function(self, video_id, player_url, example_sig):
        """Return a callable that descrambles an encrypted signature string.

        The character permutation is derived from the player JS and cached on
        disk, keyed by player id plus the signature's segment-length pattern.
        """
        player_id = self._extract_player_info(player_url)

        # Read from filesystem cache
        # func_id doubles as a cache file name component, hence the basename check
        func_id = 'js_%s_%s' % (
            player_id, self._signature_cache_id(example_sig))
        assert os.path.basename(func_id) == func_id

        cache_spec = self._downloader.cache.load('youtube-sigfuncs', func_id)
        if cache_spec is not None:
            # The cached spec is a list of source indices: applying it
            # reorders the characters of s into the decrypted signature
            return lambda s: ''.join(s[i] for i in cache_spec)

        if self._load_player(video_id, player_url):
            code = self._code_cache[player_id]
            res = self._parse_sig_js(code)

            # Run the JS function on a probe string of unique characters to
            # recover the index permutation it performs, then persist it
            test_string = ''.join(map(compat_chr, range(len(example_sig))))
            cache_res = res(test_string)
            cache_spec = [ord(c) for c in cache_res]

            self._downloader.cache.store('youtube-sigfuncs', func_id, cache_spec)
            return res
1914
    def _print_sig_code(self, func, example_sig):
        """Print equivalent Python source for the extracted signature function.

        Recovers the index permutation by running func on a probe string,
        then compresses runs of consecutive indices into slice expressions.
        """
        def gen_sig_code(idxs):
            def _genslice(start, end, step):
                # Render a slice expression, omitting defaults (start 0, step 1)
                starts = '' if start == 0 else str(start)
                ends = (':%d' % (end + step)) if end + step >= 0 else ':'
                steps = '' if step == 1 else (':%d' % step)
                return 's[%s%s%s]' % (starts, ends, steps)

            step = None
            # Quelch pyflakes warnings - start will be set when step is set
            start = '(Never used)'
            for i, prev in zip(idxs[1:], idxs[:-1]):
                if step is not None:
                    # Inside a run: extend it if the stride continues,
                    # otherwise flush the accumulated slice
                    if i - prev == step:
                        continue
                    yield _genslice(start, prev, step)
                    step = None
                    continue
                if i - prev in [-1, 1]:
                    # Start of a new ascending/descending run
                    step = i - prev
                    start = prev
                    continue
                else:
                    yield 's[%d]' % prev
            # Flush the final element or run
            if step is None:
                yield 's[%d]' % i
            else:
                yield _genslice(start, i, step)

        test_string = ''.join(map(compat_chr, range(len(example_sig))))
        cache_res = func(test_string)
        cache_spec = [ord(c) for c in cache_res]
        expr_code = ' + '.join(gen_sig_code(cache_spec))
        signature_id_tuple = '(%s)' % (
            ', '.join(compat_str(len(p)) for p in example_sig.split('.')))
        code = ('if tuple(len(p) for p in s.split(\'.\')) == %s:\n'
                ' return %s\n') % (signature_id_tuple, expr_code)
        self.to_screen('Extracted signature function:\n' + code)
1953
    def _parse_sig_js(self, jscode):
        """Locate and compile the signature-descrambling function in player JS."""
        # Each pattern matches a different historical call-site shape of the
        # signature function in base.js; the first matching pattern wins
        funcname = self._search_regex(
            (r'\b[cs]\s*&&\s*[adf]\.set\([^,]+\s*,\s*encodeURIComponent\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\(',
             r'\b[a-zA-Z0-9]+\s*&&\s*[a-zA-Z0-9]+\.set\([^,]+\s*,\s*encodeURIComponent\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\(',
             r'\bm=(?P<sig>[a-zA-Z0-9$]{2,})\(decodeURIComponent\(h\.s\)\)',
             r'\bc&&\(c=(?P<sig>[a-zA-Z0-9$]{2,})\(decodeURIComponent\(c\)\)',
             r'(?:\b|[^a-zA-Z0-9$])(?P<sig>[a-zA-Z0-9$]{2,})\s*=\s*function\(\s*a\s*\)\s*{\s*a\s*=\s*a\.split\(\s*""\s*\);[a-zA-Z0-9$]{2}\.[a-zA-Z0-9$]{2}\(a,\d+\)',
             r'(?:\b|[^a-zA-Z0-9$])(?P<sig>[a-zA-Z0-9$]{2,})\s*=\s*function\(\s*a\s*\)\s*{\s*a\s*=\s*a\.split\(\s*""\s*\)',
             r'(?P<sig>[a-zA-Z0-9$]+)\s*=\s*function\(\s*a\s*\)\s*{\s*a\s*=\s*a\.split\(\s*""\s*\)',
             # Obsolete patterns
             r'(["\'])signature\1\s*,\s*(?P<sig>[a-zA-Z0-9$]+)\(',
             r'\.sig\|\|(?P<sig>[a-zA-Z0-9$]+)\(',
             r'yt\.akamaized\.net/\)\s*\|\|\s*.*?\s*[cs]\s*&&\s*[adf]\.set\([^,]+\s*,\s*(?:encodeURIComponent\s*\()?\s*(?P<sig>[a-zA-Z0-9$]+)\(',
             r'\b[cs]\s*&&\s*[adf]\.set\([^,]+\s*,\s*(?P<sig>[a-zA-Z0-9$]+)\(',
             r'\b[a-zA-Z0-9]+\s*&&\s*[a-zA-Z0-9]+\.set\([^,]+\s*,\s*(?P<sig>[a-zA-Z0-9$]+)\(',
             r'\bc\s*&&\s*a\.set\([^,]+\s*,\s*\([^)]*\)\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\(',
             r'\bc\s*&&\s*[a-zA-Z0-9]+\.set\([^,]+\s*,\s*\([^)]*\)\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\(',
             r'\bc\s*&&\s*[a-zA-Z0-9]+\.set\([^,]+\s*,\s*\([^)]*\)\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\('),
            jscode, 'Initial JS player signature function name', group='sig')

        jsi = JSInterpreter(jscode)
        initial_function = jsi.extract_function(funcname)
        # JSInterpreter functions receive their arguments as a list
        return lambda s: initial_function([s])
1977
1978 def _decrypt_signature(self, s, video_id, player_url):
1979 """Turn the encrypted s field into a working signature"""
1980
1981 if player_url is None:
1982 raise ExtractorError('Cannot decrypt signature without player_url')
1983
1984 try:
1985 player_id = (player_url, self._signature_cache_id(s))
1986 if player_id not in self._player_cache:
1987 func = self._extract_signature_function(
1988 video_id, player_url, s
1989 )
1990 self._player_cache[player_id] = func
1991 func = self._player_cache[player_id]
1992 if self.get_param('youtube_print_sig_code'):
1993 self._print_sig_code(func, s)
1994 return func(s)
1995 except Exception as e:
1996 tb = traceback.format_exc()
1997 raise ExtractorError(
1998 'Signature extraction failed: ' + tb, cause=e)
1999
    def _extract_signature_timestamp(self, video_id, player_url, ytcfg=None, fatal=False):
        """
        Extract signatureTimestamp (sts)
        Required to tell API what sig/player version is in use.
        """
        # Prefer the cheap path: the ytcfg already carries STS on watch pages
        sts = None
        if isinstance(ytcfg, dict):
            sts = int_or_none(ytcfg.get('STS'))

        if not sts:
            # Attempt to extract from player
            if player_url is None:
                error_msg = 'Cannot extract signature timestamp without player_url.'
                if fatal:
                    raise ExtractorError(error_msg)
                self.report_warning(error_msg)
                return
            if self._load_player(video_id, player_url, fatal=fatal):
                player_id = self._extract_player_info(player_url)
                code = self._code_cache[player_id]
                # sts appears as a 5-digit integer constant in the player JS
                sts = int_or_none(self._search_regex(
                    r'(?:signatureTimestamp|sts)\s*:\s*(?P<sts>[0-9]{5})', code,
                    'JS player signature timestamp', group='sts', fatal=fatal))
        return sts
2024
    def _mark_watched(self, video_id, player_responses):
        """Record a playback for video_id by pinging the videostats URL."""
        # Take the first playback-tracking URL found in any player response
        playback_url = traverse_obj(
            player_responses, (..., 'playbackTracking', 'videostatsPlaybackUrl', 'baseUrl'),
            expected_type=url_or_none, get_all=False)
        if not playback_url:
            self.report_warning('Unable to mark watched')
            return
        parsed_playback_url = compat_urlparse.urlparse(playback_url)
        qs = compat_urlparse.parse_qs(parsed_playback_url.query)

        # cpn generation algorithm is reverse engineered from base.js.
        # In fact it works even with dummy cpn.
        CPN_ALPHABET = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789-_'
        cpn = ''.join((CPN_ALPHABET[random.randint(0, 256) & 63] for _ in range(0, 16)))

        # Add the playback nonce and version to the tracking query string
        qs.update({
            'ver': ['2'],
            'cpn': [cpn],
        })
        playback_url = compat_urlparse.urlunparse(
            parsed_playback_url._replace(query=compat_urllib_parse_urlencode(qs, True)))

        self._download_webpage(
            playback_url, video_id, 'Marking watched',
            'Unable to mark watched', fatal=False)
2050
    @staticmethod
    def _extract_urls(webpage):
        """Return all YouTube embed URLs/ids found in an arbitrary webpage."""
        # Embedded YouTube player
        entries = [
            unescapeHTML(mobj.group('url'))
            for mobj in re.finditer(r'''(?x)
            (?:
                <iframe[^>]+?src=|
                data-video-url=|
                <embed[^>]+?src=|
                embedSWF\(?:\s*|
                <object[^>]+data=|
                new\s+SWFObject\(
            )
            (["\'])
            (?P<url>(?:https?:)?//(?:www\.)?youtube(?:-nocookie)?\.com/
            (?:embed|v|p)/[0-9A-Za-z_-]{11}.*?)
            \1''', webpage)]

        # lazyYT YouTube embed
        entries.extend(list(map(
            unescapeHTML,
            re.findall(r'class="lazyYT" data-youtube-id="([^"]+)"', webpage))))

        # Wordpress "YouTube Video Importer" plugin
        # (findall returns tuples here; the video id is the last group)
        matches = re.findall(r'''(?x)<div[^>]+
            class=(?P<q1>[\'"])[^\'"]*\byvii_single_video_player\b[^\'"]*(?P=q1)[^>]+
            data-video_id=(?P<q2>[\'"])([^\'"]+)(?P=q2)''', webpage)
        entries.extend(m[-1] for m in matches)

        return entries
2082
2083 @staticmethod
2084 def _extract_url(webpage):
2085 urls = YoutubeIE._extract_urls(webpage)
2086 return urls[0] if urls else None
2087
2088 @classmethod
2089 def extract_id(cls, url):
2090 mobj = re.match(cls._VALID_URL, url, re.VERBOSE)
2091 if mobj is None:
2092 raise ExtractorError('Invalid URL: %s' % url)
2093 return mobj.group('id')
2094
2095 def _extract_chapters_from_json(self, data, duration):
2096 chapter_list = traverse_obj(
2097 data, (
2098 'playerOverlays', 'playerOverlayRenderer', 'decoratedPlayerBarRenderer',
2099 'decoratedPlayerBarRenderer', 'playerBar', 'chapteredPlayerBarRenderer', 'chapters'
2100 ), expected_type=list)
2101
2102 return self._extract_chapters(
2103 chapter_list,
2104 chapter_time=lambda chapter: float_or_none(
2105 traverse_obj(chapter, ('chapterRenderer', 'timeRangeStartMillis')), scale=1000),
2106 chapter_title=lambda chapter: traverse_obj(
2107 chapter, ('chapterRenderer', 'title', 'simpleText'), expected_type=str),
2108 duration=duration)
2109
    def _extract_chapters_from_engagement_panel(self, data, duration):
        """Extract chapters from engagement-panel macro markers (fallback source)."""
        content_list = traverse_obj(
            data,
            ('engagementPanels', ..., 'engagementPanelSectionListRenderer', 'content', 'macroMarkersListRenderer', 'contents'),
            expected_type=list, default=[])
        chapter_time = lambda chapter: parse_duration(self._get_text(chapter, 'timeDescription'))
        chapter_title = lambda chapter: self._get_text(chapter, 'title')

        # Return the first panel that yields a non-empty chapter list,
        # or [] when none does (filter(None, ...) drops empty results)
        return next((
            filter(None, (
                self._extract_chapters(
                    traverse_obj(contents, (..., 'macroMarkersListItemRenderer')),
                    chapter_time, chapter_title, duration)
                for contents in content_list
            ))), [])
2125
2126 def _extract_chapters(self, chapter_list, chapter_time, chapter_title, duration):
2127 chapters = []
2128 last_chapter = {'start_time': 0}
2129 for idx, chapter in enumerate(chapter_list or []):
2130 title = chapter_title(chapter)
2131 start_time = chapter_time(chapter)
2132 if start_time is None:
2133 continue
2134 last_chapter['end_time'] = start_time
2135 if start_time < last_chapter['start_time']:
2136 if idx == 1:
2137 chapters.pop()
2138 self.report_warning('Invalid start time for chapter "%s"' % last_chapter['title'])
2139 else:
2140 self.report_warning(f'Invalid start time for chapter "{title}"')
2141 continue
2142 last_chapter = {'start_time': start_time, 'title': title}
2143 chapters.append(last_chapter)
2144 last_chapter['end_time'] = duration
2145 return chapters
2146
2147 def _extract_yt_initial_variable(self, webpage, regex, video_id, name):
2148 return self._parse_json(self._search_regex(
2149 (r'%s\s*%s' % (regex, self._YT_INITIAL_BOUNDARY_RE),
2150 regex), webpage, name, default='{}'), video_id, fatal=False)
2151
2152 @staticmethod
2153 def parse_time_text(time_text):
2154 """
2155 Parse the comment time text
2156 time_text is in the format 'X units ago (edited)'
2157 """
2158 time_text_split = time_text.split(' ')
2159 if len(time_text_split) >= 3:
2160 try:
2161 return datetime_from_str('now-%s%s' % (time_text_split[0], time_text_split[1]), precision='auto')
2162 except ValueError:
2163 return None
2164
2165 def _extract_comment(self, comment_renderer, parent=None):
2166 comment_id = comment_renderer.get('commentId')
2167 if not comment_id:
2168 return
2169
2170 text = self._get_text(comment_renderer, 'contentText')
2171
2172 # note: timestamp is an estimate calculated from the current time and time_text
2173 time_text = self._get_text(comment_renderer, 'publishedTimeText') or ''
2174 time_text_dt = self.parse_time_text(time_text)
2175 if isinstance(time_text_dt, datetime.datetime):
2176 timestamp = calendar.timegm(time_text_dt.timetuple())
2177 author = self._get_text(comment_renderer, 'authorText')
2178 author_id = try_get(comment_renderer,
2179 lambda x: x['authorEndpoint']['browseEndpoint']['browseId'], compat_str)
2180
2181 votes = parse_count(try_get(comment_renderer, (lambda x: x['voteCount']['simpleText'],
2182 lambda x: x['likeCount']), compat_str)) or 0
2183 author_thumbnail = try_get(comment_renderer,
2184 lambda x: x['authorThumbnail']['thumbnails'][-1]['url'], compat_str)
2185
2186 author_is_uploader = try_get(comment_renderer, lambda x: x['authorIsChannelOwner'], bool)
2187 is_favorited = 'creatorHeart' in (try_get(
2188 comment_renderer, lambda x: x['actionButtons']['commentActionButtonsRenderer'], dict) or {})
2189 return {
2190 'id': comment_id,
2191 'text': text,
2192 'timestamp': timestamp,
2193 'time_text': time_text,
2194 'like_count': votes,
2195 'is_favorited': is_favorited,
2196 'author': author,
2197 'author_id': author_id,
2198 'author_thumbnail': author_thumbnail,
2199 'author_is_uploader': author_is_uploader,
2200 'parent': parent or 'root'
2201 }
2202
    def _comment_entries(self, root_continuation_data, identity_token, account_syncid,
                         ytcfg, video_id, parent=None, comment_counts=None):
        """Generator of comment info dicts for a video (or a reply thread).

        Yields an int (estimated total comments) before the first comment of
        the top-level thread, then comment dicts. Recurses once for replies
        (parent set). comment_counts is shared mutable state:
        [comments downloaded so far, estimated total, current reply thread #].
        """

        def extract_header(contents):
            # Parse the comments header: report/record the estimated total and
            # pick the continuation matching the requested sort order.
            # Returns (estimated total, continuation query or None).
            _total_comments = 0
            _continuation = None
            for content in contents:
                comments_header_renderer = try_get(content, lambda x: x['commentsHeaderRenderer'])
                expected_comment_count = parse_count(self._get_text(
                    comments_header_renderer, 'countText', 'commentsCount', max_runs=1))

                if expected_comment_count:
                    comment_counts[1] = expected_comment_count
                    self.to_screen('Downloading ~%d comments' % expected_comment_count)
                    _total_comments = comment_counts[1]
                sort_mode_str = self._configuration_arg('comment_sort', [''])[0]
                comment_sort_index = int(sort_mode_str != 'top')  # 1 = new, 0 = top

                sort_menu_item = try_get(
                    comments_header_renderer,
                    lambda x: x['sortMenu']['sortFilterSubMenuRenderer']['subMenuItems'][comment_sort_index], dict) or {}
                sort_continuation_ep = sort_menu_item.get('serviceEndpoint') or {}

                _continuation = self._extract_continuation_ep_data(sort_continuation_ep) or self._extract_continuation(sort_menu_item)
                if not _continuation:
                    continue

                sort_text = sort_menu_item.get('title')
                if isinstance(sort_text, compat_str):
                    sort_text = sort_text.lower()
                else:
                    sort_text = 'top comments' if comment_sort_index == 0 else 'newest first'
                self.to_screen('Sorting comments by %s' % sort_text)
                break
            return _total_comments, _continuation

        def extract_thread(contents):
            # Yield each comment on one page, recursing into its replies
            if not parent:
                comment_counts[2] = 0
            for content in contents:
                comment_thread_renderer = try_get(content, lambda x: x['commentThreadRenderer'])
                comment_renderer = try_get(
                    comment_thread_renderer, (lambda x: x['comment']['commentRenderer'], dict)) or try_get(
                    content, (lambda x: x['commentRenderer'], dict))

                if not comment_renderer:
                    continue
                comment = self._extract_comment(comment_renderer, parent)
                if not comment:
                    continue
                comment_counts[0] += 1
                yield comment
                # Attempt to get the replies
                comment_replies_renderer = try_get(
                    comment_thread_renderer, lambda x: x['replies']['commentRepliesRenderer'], dict)

                if comment_replies_renderer:
                    comment_counts[2] += 1
                    comment_entries_iter = self._comment_entries(
                        comment_replies_renderer, identity_token, account_syncid, ytcfg,
                        video_id, parent=comment.get('id'), comment_counts=comment_counts)

                    for reply_comment in comment_entries_iter:
                        yield reply_comment

        # YouTube comments have a max depth of 2
        max_depth = int_or_none(self._configuration_arg('max_comment_depth', [''])[0]) or float('inf')
        if max_depth == 1 and parent:
            return
        if not comment_counts:
            # comment so far, est. total comments, current comment thread #
            comment_counts = [0, 0, 0]

        continuation = self._extract_continuation(root_continuation_data)
        if continuation and len(continuation['continuation']) < 27:
            self.write_debug('Detected old API continuation token. Generating new API compatible token.')
            continuation_token = self._generate_comment_continuation(video_id)
            continuation = self._build_api_continuation_query(continuation_token, None)

        visitor_data = None
        is_first_continuation = parent is None

        # Page through the comment continuations until exhausted
        for page_num in itertools.count(0):
            if not continuation:
                break
            headers = self.generate_api_headers(ytcfg, identity_token, account_syncid, visitor_data)
            comment_prog_str = '(%d/%d)' % (comment_counts[0], comment_counts[1])
            if page_num == 0:
                if is_first_continuation:
                    note_prefix = 'Downloading comment section API JSON'
                else:
                    note_prefix = ' Downloading comment API JSON reply thread %d %s' % (
                        comment_counts[2], comment_prog_str)
            else:
                note_prefix = '%sDownloading comment%s API JSON page %d %s' % (
                    ' ' if parent else '', ' replies' if parent else '',
                    page_num, comment_prog_str)

            response = self._extract_response(
                item_id=None, query=continuation,
                ep='next', ytcfg=ytcfg, headers=headers, note=note_prefix,
                check_get_keys=('onResponseReceivedEndpoints', 'continuationContents'))
            if not response:
                break
            # Carry visitorData across pages so paging stays consistent
            visitor_data = try_get(
                response,
                lambda x: x['responseContext']['webResponseContextExtensionData']['ytConfigData']['visitorData'],
                compat_str) or visitor_data

            continuation_contents = dict_get(response, ('onResponseReceivedEndpoints', 'continuationContents'))

            continuation = None
            if isinstance(continuation_contents, list):
                for continuation_section in continuation_contents:
                    if not isinstance(continuation_section, dict):
                        continue
                    continuation_items = try_get(
                        continuation_section,
                        (lambda x: x['reloadContinuationItemsCommand']['continuationItems'],
                         lambda x: x['appendContinuationItemsAction']['continuationItems']),
                        list) or []
                    if is_first_continuation:
                        total_comments, continuation = extract_header(continuation_items)
                        if total_comments:
                            yield total_comments
                        is_first_continuation = False
                        if continuation:
                            break
                        continue
                    count = 0
                    for count, entry in enumerate(extract_thread(continuation_items)):
                        yield entry
                    continuation = self._extract_continuation({'contents': continuation_items})
                    if continuation:
                        # Sometimes YouTube provides a continuation without any comments
                        # In most cases we end up just downloading these with very little comments to come.
                        if count == 0:
                            if not parent:
                                self.report_warning('No comments received - assuming end of comments')
                            continuation = None
                        break

            # Deprecated response structure
            elif isinstance(continuation_contents, dict):
                known_continuation_renderers = ('itemSectionContinuation', 'commentRepliesContinuation')
                for key, continuation_renderer in continuation_contents.items():
                    if key not in known_continuation_renderers:
                        continue
                    if not isinstance(continuation_renderer, dict):
                        continue
                    if is_first_continuation:
                        header_continuation_items = [continuation_renderer.get('header') or {}]
                        total_comments, continuation = extract_header(header_continuation_items)
                        if total_comments:
                            yield total_comments
                        is_first_continuation = False
                        if continuation:
                            break

                    # Sometimes YouTube provides a continuation without any comments
                    # In most cases we end up just downloading these with very little comments to come.
                    count = 0
                    for count, entry in enumerate(extract_thread(continuation_renderer.get('contents') or {})):
                        yield entry
                    continuation = self._extract_continuation(continuation_renderer)
                    if count == 0:
                        if not parent:
                            self.report_warning('No comments received - assuming end of comments')
                        continuation = None
                    break
2373
2374 @staticmethod
2375 def _generate_comment_continuation(video_id):
2376 """
2377 Generates initial comment section continuation token from given video id
2378 """
2379 b64_vid_id = base64.b64encode(bytes(video_id.encode('utf-8')))
2380 parts = ('Eg0SCw==', b64_vid_id, 'GAYyJyIRIgs=', b64_vid_id, 'MAB4AjAAQhBjb21tZW50cy1zZWN0aW9u')
2381 new_continuation_intlist = list(itertools.chain.from_iterable(
2382 [bytes_to_intlist(base64.b64decode(part)) for part in parts]))
2383 return base64.b64encode(intlist_to_bytes(new_continuation_intlist)).decode('utf-8')
2384
    def _extract_comments(self, ytcfg, video_id, contents, webpage):
        """Entry for comment extraction"""
        def _real_comment_extract(contents):
            # Find the first known comment-section renderer per entry and
            # delegate the actual paging to _comment_entries
            if isinstance(contents, list):
                for entry in contents:
                    for key, renderer in entry.items():
                        if key not in known_entry_comment_renderers:
                            continue
                        yield from self._comment_entries(
                            renderer, video_id=video_id, ytcfg=ytcfg,
                            identity_token=self._extract_identity_token(webpage, item_id=video_id),
                            account_syncid=self._extract_account_syncid(ytcfg))
                        break
        comments = []
        known_entry_comment_renderers = ('itemSectionRenderer',)
        estimated_total = 0
        max_comments = int_or_none(self._configuration_arg('max_comments', [''])[0]) or float('inf')
        # Force English regardless of account setting to prevent parsing issues
        # See: https://github.com/yt-dlp/yt-dlp/issues/532
        ytcfg = copy.deepcopy(ytcfg)
        traverse_obj(
            ytcfg, ('INNERTUBE_CONTEXT', 'client'), expected_type=dict, default={})['hl'] = 'en'
        try:
            for comment in _real_comment_extract(contents):
                if len(comments) >= max_comments:
                    break
                # _comment_entries yields an int estimated total before comments
                if isinstance(comment, int):
                    estimated_total = comment
                    continue
                comments.append(comment)
        except KeyboardInterrupt:
            # Allow the user to abort and keep the comments fetched so far
            self.to_screen('Interrupted by user')
        self.to_screen('Downloaded %d/%d comments' % (len(comments), estimated_total))
        return {
            'comments': comments,
            'comment_count': len(comments),
        }
2422
2423 @staticmethod
2424 def _generate_player_context(sts=None):
2425 context = {
2426 'html5Preference': 'HTML5_PREF_WANTS',
2427 }
2428 if sts is not None:
2429 context['signatureTimestamp'] = sts
2430 return {
2431 'playbackContext': {
2432 'contentPlaybackContext': context
2433 },
2434 'contentCheckOk': True,
2435 'racyCheckOk': True
2436 }
2437
2438 @staticmethod
2439 def _is_agegated(player_response):
2440 if traverse_obj(player_response, ('playabilityStatus', 'desktopLegacyAgeGateReason')):
2441 return True
2442
2443 reasons = traverse_obj(player_response, ('playabilityStatus', ('status', 'reason')), default=[])
2444 AGE_GATE_REASONS = (
2445 'confirm your age', 'age-restricted', 'inappropriate', # reason
2446 'age_verification_required', 'age_check_required', # status
2447 )
2448 return any(expected in reason for expected in AGE_GATE_REASONS for reason in reasons)
2449
2450 @staticmethod
2451 def _is_unplayable(player_response):
2452 return traverse_obj(player_response, ('playabilityStatus', 'status')) == 'UNPLAYABLE'
2453
    def _extract_player_response(self, client, video_id, master_ytcfg, player_ytcfg, identity_token, player_url, initial_pr):
        """Call the innertube 'player' endpoint as the given client; return JSON or None."""

        session_index = self._extract_session_index(player_ytcfg, master_ytcfg)
        syncid = self._extract_account_syncid(player_ytcfg, master_ytcfg, initial_pr)
        # sts tells the API which player (and hence signature scheme) is in use
        sts = self._extract_signature_timestamp(video_id, player_url, master_ytcfg, fatal=False)
        headers = self.generate_api_headers(
            player_ytcfg, identity_token, syncid,
            default_client=client, session_index=session_index)

        yt_query = {'videoId': video_id}
        yt_query.update(self._generate_player_context(sts))
        return self._extract_response(
            item_id=video_id, ep='player', query=yt_query,
            ytcfg=player_ytcfg, headers=headers, fatal=True,
            default_client=client,
            note='Downloading %s player API JSON' % client.replace('_', ' ').strip()
        ) or None
2471
    def _get_requested_clients(self, url, smuggled_data):
        """Resolve the 'player_client' extractor-arg into a list of client names."""
        requested_clients = []
        # Clients prefixed with '_' are internal-only and cannot be requested
        allowed_clients = sorted(
            [client for client in INNERTUBE_CLIENTS.keys() if client[:1] != '_'],
            # NOTE(review): assumes every INNERTUBE_CLIENTS entry carries a
            # 'priority' key (not present in the visible literal) - presumably
            # injected during module setup; verify
            key=lambda client: INNERTUBE_CLIENTS[client]['priority'], reverse=True)
        for client in self._configuration_arg('player_client'):
            if client in allowed_clients:
                requested_clients.append(client)
            elif client == 'all':
                requested_clients.extend(allowed_clients)
            else:
                self.report_warning(f'Skipping unsupported client {client}')
        if not requested_clients:
            requested_clients = ['android', 'web']

        # For music URLs, additionally try the *_music variant of each client
        if smuggled_data.get('is_music_url') or self.is_music_url(url):
            requested_clients.extend(
                f'{client}_music' for client in requested_clients if f'{client}_music' in INNERTUBE_CLIENTS)

        return orderedSet(requested_clients)
2492
2493 def _extract_player_ytcfg(self, client, video_id):
2494 url = {
2495 'web_music': 'https://music.youtube.com',
2496 'web_embedded': f'https://www.youtube.com/embed/{video_id}?html5=1'
2497 }.get(client)
2498 if not url:
2499 return {}
2500 webpage = self._download_webpage(url, video_id, fatal=False, note=f'Downloading {client} config')
2501 return self.extract_ytcfg(video_id, webpage) or {}
2502
    def _extract_player_responses(self, clients, video_id, webpage, master_ytcfg, player_url, identity_token):
        """Lazily yield player API responses for each requested client.

        Fallback clients (agegate/creator variants) may be queued while
        iterating. Raises the last API error only if nothing was yielded.
        """
        initial_pr = None
        if webpage:
            initial_pr = self._extract_yt_initial_variable(
                webpage, self._YT_INITIAL_PLAYER_RESPONSE_RE,
                video_id, 'initial player response')

        original_clients = clients
        # Reversed copy: clients are consumed by popping off the end
        clients = clients[::-1]

        def append_client(client_name):
            # Queue a fallback client unless the user already requested it
            if client_name in INNERTUBE_CLIENTS and client_name not in original_clients:
                clients.append(client_name)

        # Android player_response does not have microFormats which are needed for
        # extraction of some data. So we return the initial_pr with formats
        # stripped out even if not requested by the user
        # See: https://github.com/yt-dlp/yt-dlp/issues/501
        yielded_pr = False
        if initial_pr:
            pr = dict(initial_pr)
            pr['streamingData'] = None
            yielded_pr = True
            yield pr

        last_error = None
        while clients:
            client = clients.pop()
            player_ytcfg = master_ytcfg if client == 'web' else {}
            if 'configs' not in self._configuration_arg('player_skip'):
                player_ytcfg = self._extract_player_ytcfg(client, video_id) or player_ytcfg

            try:
                pr = initial_pr if client == 'web' and initial_pr else self._extract_player_response(
                    client, video_id, player_ytcfg or master_ytcfg, player_ytcfg, identity_token, player_url, initial_pr)
            except ExtractorError as e:
                # Defer the error: a later client may still succeed
                if last_error:
                    self.report_warning(last_error)
                last_error = e
                continue

            if pr:
                yielded_pr = True
                yield pr

            # creator clients can bypass AGE_VERIFICATION_REQUIRED if logged in
            if client.endswith('_agegate') and self._is_unplayable(pr) and self._generate_sapisidhash_header():
                append_client(client.replace('_agegate', '_creator'))
            elif self._is_agegated(pr):
                append_client(f'{client}_agegate')

        if last_error:
            if not yielded_pr:
                raise last_error
            self.report_warning(last_error)
2558
2559 def _extract_formats(self, streaming_data, video_id, player_url, is_live):
2560 itags, stream_ids = [], []
2561 itag_qualities, res_qualities = {}, {}
2562 q = qualities([
2563 # Normally tiny is the smallest video-only formats. But
2564 # audio-only formats with unknown quality may get tagged as tiny
2565 'tiny',
2566 'audio_quality_ultralow', 'audio_quality_low', 'audio_quality_medium', 'audio_quality_high', # Audio only formats
2567 'small', 'medium', 'large', 'hd720', 'hd1080', 'hd1440', 'hd2160', 'hd2880', 'highres'
2568 ])
2569 streaming_formats = traverse_obj(streaming_data, (..., ('formats', 'adaptiveFormats'), ...), default=[])
2570
2571 for fmt in streaming_formats:
2572 if fmt.get('targetDurationSec') or fmt.get('drmFamilies'):
2573 continue
2574
2575 itag = str_or_none(fmt.get('itag'))
2576 audio_track = fmt.get('audioTrack') or {}
2577 stream_id = '%s.%s' % (itag or '', audio_track.get('id', ''))
2578 if stream_id in stream_ids:
2579 continue
2580
2581 quality = fmt.get('quality')
2582 height = int_or_none(fmt.get('height'))
2583 if quality == 'tiny' or not quality:
2584 quality = fmt.get('audioQuality', '').lower() or quality
2585 # The 3gp format (17) in android client has a quality of "small",
2586 # but is actually worse than other formats
2587 if itag == '17':
2588 quality = 'tiny'
2589 if quality:
2590 if itag:
2591 itag_qualities[itag] = quality
2592 if height:
2593 res_qualities[height] = quality
2594 # FORMAT_STREAM_TYPE_OTF(otf=1) requires downloading the init fragment
2595 # (adding `&sq=0` to the URL) and parsing emsg box to determine the
2596 # number of fragment that would subsequently requested with (`&sq=N`)
2597 if fmt.get('type') == 'FORMAT_STREAM_TYPE_OTF':
2598 continue
2599
2600 fmt_url = fmt.get('url')
2601 if not fmt_url:
2602 sc = compat_parse_qs(fmt.get('signatureCipher'))
2603 fmt_url = url_or_none(try_get(sc, lambda x: x['url'][0]))
2604 encrypted_sig = try_get(sc, lambda x: x['s'][0])
2605 if not (sc and fmt_url and encrypted_sig):
2606 continue
2607 if not player_url:
2608 continue
2609 signature = self._decrypt_signature(sc['s'][0], video_id, player_url)
2610 sp = try_get(sc, lambda x: x['sp'][0]) or 'signature'
2611 fmt_url += '&' + sp + '=' + signature
2612
2613 if itag:
2614 itags.append(itag)
2615 stream_ids.append(stream_id)
2616
2617 tbr = float_or_none(
2618 fmt.get('averageBitrate') or fmt.get('bitrate'), 1000)
2619 dct = {
2620 'asr': int_or_none(fmt.get('audioSampleRate')),
2621 'filesize': int_or_none(fmt.get('contentLength')),
2622 'format_id': itag,
2623 'format_note': ', '.join(filter(None, (
2624 audio_track.get('displayName'),
2625 fmt.get('qualityLabel') or quality.replace('audio_quality_', '')))),
2626 'fps': int_or_none(fmt.get('fps')),
2627 'height': height,
2628 'quality': q(quality),
2629 'tbr': tbr,
2630 'url': fmt_url,
2631 'width': int_or_none(fmt.get('width')),
2632 'language': audio_track.get('id', '').split('.')[0],
2633 }
2634 mime_mobj = re.match(
2635 r'((?:[^/]+)/(?:[^;]+))(?:;\s*codecs="([^"]+)")?', fmt.get('mimeType') or '')
2636 if mime_mobj:
2637 dct['ext'] = mimetype2ext(mime_mobj.group(1))
2638 dct.update(parse_codecs(mime_mobj.group(2)))
2639 no_audio = dct.get('acodec') == 'none'
2640 no_video = dct.get('vcodec') == 'none'
2641 if no_audio:
2642 dct['vbr'] = tbr
2643 if no_video:
2644 dct['abr'] = tbr
2645 if no_audio or no_video:
2646 dct['downloader_options'] = {
2647 # Youtube throttles chunks >~10M
2648 'http_chunk_size': 10485760,
2649 }
2650 if dct.get('ext'):
2651 dct['container'] = dct['ext'] + '_dash'
2652 yield dct
2653
2654 skip_manifests = self._configuration_arg('skip')
2655 get_dash = (
2656 (not is_live or self._configuration_arg('include_live_dash'))
2657 and 'dash' not in skip_manifests and self.get_param('youtube_include_dash_manifest', True))
2658 get_hls = 'hls' not in skip_manifests and self.get_param('youtube_include_hls_manifest', True)
2659
2660 def guess_quality(f):
2661 for val, qdict in ((f.get('format_id'), itag_qualities), (f.get('height'), res_qualities)):
2662 if val in qdict:
2663 return q(qdict[val])
2664 return -1
2665
2666 for sd in streaming_data:
2667 hls_manifest_url = get_hls and sd.get('hlsManifestUrl')
2668 if hls_manifest_url:
2669 for f in self._extract_m3u8_formats(hls_manifest_url, video_id, 'mp4', fatal=False):
2670 itag = self._search_regex(
2671 r'/itag/(\d+)', f['url'], 'itag', default=None)
2672 if itag in itags:
2673 continue
2674 if itag:
2675 f['format_id'] = itag
2676 itags.append(itag)
2677 f['quality'] = guess_quality(f)
2678 yield f
2679
2680 dash_manifest_url = get_dash and sd.get('dashManifestUrl')
2681 if dash_manifest_url:
2682 for f in self._extract_mpd_formats(dash_manifest_url, video_id, fatal=False):
2683 itag = f['format_id']
2684 if itag in itags:
2685 continue
2686 if itag:
2687 itags.append(itag)
2688 f['quality'] = guess_quality(f)
2689 filesize = int_or_none(self._search_regex(
2690 r'/clen/(\d+)', f.get('fragment_base_url')
2691 or f['url'], 'file size', default=None))
2692 if filesize:
2693 f['filesize'] = filesize
2694 yield f
2695
2696 def _real_extract(self, url):
2697 url, smuggled_data = unsmuggle_url(url, {})
2698 video_id = self._match_id(url)
2699
2700 base_url = self.http_scheme() + '//www.youtube.com/'
2701 webpage_url = base_url + 'watch?v=' + video_id
2702 webpage = self._download_webpage(
2703 webpage_url + '&bpctr=9999999999&has_verified=1', video_id, fatal=False)
2704
2705 master_ytcfg = self.extract_ytcfg(video_id, webpage) or self._get_default_ytcfg()
2706 player_url = self._extract_player_url(master_ytcfg, webpage)
2707 identity_token = self._extract_identity_token(webpage, video_id)
2708
2709 player_responses = list(self._extract_player_responses(
2710 self._get_requested_clients(url, smuggled_data),
2711 video_id, webpage, master_ytcfg, player_url, identity_token))
2712
2713 get_first = lambda obj, keys, **kwargs: traverse_obj(obj, (..., *variadic(keys)), **kwargs, get_all=False)
2714
2715 playability_statuses = traverse_obj(
2716 player_responses, (..., 'playabilityStatus'), expected_type=dict, default=[])
2717
2718 trailer_video_id = get_first(
2719 playability_statuses,
2720 ('errorScreen', 'playerLegacyDesktopYpcTrailerRenderer', 'trailerVideoId'),
2721 expected_type=str)
2722 if trailer_video_id:
2723 return self.url_result(
2724 trailer_video_id, self.ie_key(), trailer_video_id)
2725
2726 search_meta = ((lambda x: self._html_search_meta(x, webpage, default=None))
2727 if webpage else (lambda x: None))
2728
2729 video_details = traverse_obj(
2730 player_responses, (..., 'videoDetails'), expected_type=dict, default=[])
2731 microformats = traverse_obj(
2732 player_responses, (..., 'microformat', 'playerMicroformatRenderer'),
2733 expected_type=dict, default=[])
2734 video_title = (
2735 get_first(video_details, 'title')
2736 or self._get_text(microformats, (..., 'title'))
2737 or search_meta(['og:title', 'twitter:title', 'title']))
2738 video_description = get_first(video_details, 'shortDescription')
2739
2740 if not smuggled_data.get('force_singlefeed', False):
2741 if not self.get_param('noplaylist'):
2742 multifeed_metadata_list = get_first(
2743 player_responses,
2744 ('multicamera', 'playerLegacyMulticameraRenderer', 'metadataList'),
2745 expected_type=str)
2746 if multifeed_metadata_list:
2747 entries = []
2748 feed_ids = []
2749 for feed in multifeed_metadata_list.split(','):
2750 # Unquote should take place before split on comma (,) since textual
2751 # fields may contain comma as well (see
2752 # https://github.com/ytdl-org/youtube-dl/issues/8536)
2753 feed_data = compat_parse_qs(
2754 compat_urllib_parse_unquote_plus(feed))
2755
2756 def feed_entry(name):
2757 return try_get(
2758 feed_data, lambda x: x[name][0], compat_str)
2759
2760 feed_id = feed_entry('id')
2761 if not feed_id:
2762 continue
2763 feed_title = feed_entry('title')
2764 title = video_title
2765 if feed_title:
2766 title += ' (%s)' % feed_title
2767 entries.append({
2768 '_type': 'url_transparent',
2769 'ie_key': 'Youtube',
2770 'url': smuggle_url(
2771 '%swatch?v=%s' % (base_url, feed_data['id'][0]),
2772 {'force_singlefeed': True}),
2773 'title': title,
2774 })
2775 feed_ids.append(feed_id)
2776 self.to_screen(
2777 'Downloading multifeed video (%s) - add --no-playlist to just download video %s'
2778 % (', '.join(feed_ids), video_id))
2779 return self.playlist_result(
2780 entries, video_id, video_title, video_description)
2781 else:
2782 self.to_screen('Downloading just video %s because of --no-playlist' % video_id)
2783
2784 live_broadcast_details = traverse_obj(microformats, (..., 'liveBroadcastDetails'))
2785 is_live = get_first(video_details, 'isLive')
2786 if is_live is None:
2787 is_live = get_first(live_broadcast_details, 'isLiveNow')
2788
2789 streaming_data = traverse_obj(player_responses, (..., 'streamingData'), default=[])
2790 formats = list(self._extract_formats(streaming_data, video_id, player_url, is_live))
2791
2792 if not formats:
2793 if not self.get_param('allow_unplayable_formats') and traverse_obj(streaming_data, (..., 'licenseInfos')):
2794 self.raise_no_formats(
2795 'This video is DRM protected.', expected=True)
2796 pemr = get_first(
2797 playability_statuses,
2798 ('errorScreen', 'playerErrorMessageRenderer'), expected_type=dict) or {}
2799 reason = self._get_text(pemr, 'reason') or get_first(playability_statuses, 'reason')
2800 subreason = clean_html(self._get_text(pemr, 'subreason') or '')
2801 if subreason:
2802 if subreason == 'The uploader has not made this video available in your country.':
2803 countries = get_first(microformats, 'availableCountries')
2804 if not countries:
2805 regions_allowed = search_meta('regionsAllowed')
2806 countries = regions_allowed.split(',') if regions_allowed else None
2807 self.raise_geo_restricted(subreason, countries, metadata_available=True)
2808 reason += f'. {subreason}'
2809 if reason:
2810 self.raise_no_formats(reason, expected=True)
2811
2812 for f in formats:
2813 if '&c=WEB&' in f['url'] and '&ratebypass=yes&' not in f['url']: # throttled
2814 f['source_preference'] = -10
2815 # TODO: this method is not reliable
2816 f['format_note'] = format_field(f, 'format_note', '%s ') + '(maybe throttled)'
2817
2818 # Source is given priority since formats that throttle are given lower source_preference
2819 # When throttling issue is fully fixed, remove this
2820 self._sort_formats(formats, ('quality', 'height', 'fps', 'source'))
2821
2822 keywords = get_first(video_details, 'keywords', expected_type=list) or []
2823 if not keywords and webpage:
2824 keywords = [
2825 unescapeHTML(m.group('content'))
2826 for m in re.finditer(self._meta_regex('og:video:tag'), webpage)]
2827 for keyword in keywords:
2828 if keyword.startswith('yt:stretch='):
2829 mobj = re.search(r'(\d+)\s*:\s*(\d+)', keyword)
2830 if mobj:
2831 # NB: float is intentional for forcing float division
2832 w, h = (float(v) for v in mobj.groups())
2833 if w > 0 and h > 0:
2834 ratio = w / h
2835 for f in formats:
2836 if f.get('vcodec') != 'none':
2837 f['stretched_ratio'] = ratio
2838 break
2839
2840 thumbnails = []
2841 thumbnail_dicts = traverse_obj(
2842 (video_details, microformats), (..., ..., 'thumbnail', 'thumbnails', ...),
2843 expected_type=dict, default=[])
2844 for thumbnail in thumbnail_dicts:
2845 thumbnail_url = thumbnail.get('url')
2846 if not thumbnail_url:
2847 continue
2848 # Sometimes youtube gives a wrong thumbnail URL. See:
2849 # https://github.com/yt-dlp/yt-dlp/issues/233
2850 # https://github.com/ytdl-org/youtube-dl/issues/28023
2851 if 'maxresdefault' in thumbnail_url:
2852 thumbnail_url = thumbnail_url.split('?')[0]
2853 thumbnails.append({
2854 'url': thumbnail_url,
2855 'height': int_or_none(thumbnail.get('height')),
2856 'width': int_or_none(thumbnail.get('width')),
2857 })
2858 thumbnail_url = search_meta(['og:image', 'twitter:image'])
2859 if thumbnail_url:
2860 thumbnails.append({
2861 'url': thumbnail_url,
2862 })
2863 # The best resolution thumbnails sometimes does not appear in the webpage
2864 # See: https://github.com/ytdl-org/youtube-dl/issues/29049, https://github.com/yt-dlp/yt-dlp/issues/340
2865 # List of possible thumbnails - Ref: <https://stackoverflow.com/a/20542029>
2866 hq_thumbnail_names = ['maxresdefault', 'hq720', 'sddefault', 'sd1', 'sd2', 'sd3']
2867 # TODO: Test them also? - For some videos, even these don't exist
2868 guaranteed_thumbnail_names = [
2869 'hqdefault', 'hq1', 'hq2', 'hq3', '0',
2870 'mqdefault', 'mq1', 'mq2', 'mq3',
2871 'default', '1', '2', '3'
2872 ]
2873 thumbnail_names = hq_thumbnail_names + guaranteed_thumbnail_names
2874 n_thumbnail_names = len(thumbnail_names)
2875
2876 thumbnails.extend({
2877 'url': 'https://i.ytimg.com/vi{webp}/{video_id}/{name}{live}.{ext}'.format(
2878 video_id=video_id, name=name, ext=ext,
2879 webp='_webp' if ext == 'webp' else '', live='_live' if is_live else ''),
2880 '_test_url': name in hq_thumbnail_names,
2881 } for name in thumbnail_names for ext in ('webp', 'jpg'))
2882 for thumb in thumbnails:
2883 i = next((i for i, t in enumerate(thumbnail_names) if f'/{video_id}/{t}' in thumb['url']), n_thumbnail_names)
2884 thumb['preference'] = (0 if '.webp' in thumb['url'] else -1) - (2 * i)
2885 self._remove_duplicate_formats(thumbnails)
2886
2887 category = get_first(microformats, 'category') or search_meta('genre')
2888 channel_id = str_or_none(
2889 get_first(video_details, 'channelId')
2890 or get_first(microformats, 'externalChannelId')
2891 or search_meta('channelId'))
2892 duration = int_or_none(
2893 get_first(video_details, 'lengthSeconds')
2894 or get_first(microformats, 'lengthSeconds')
2895 or parse_duration(search_meta('duration'))) or None
2896 owner_profile_url = get_first(microformats, 'ownerProfileUrl')
2897
2898 live_content = get_first(video_details, 'isLiveContent')
2899 is_upcoming = get_first(video_details, 'isUpcoming')
2900 if is_live is None:
2901 if is_upcoming or live_content is False:
2902 is_live = False
2903 if is_upcoming is None and (live_content or is_live):
2904 is_upcoming = False
2905 live_starttime = parse_iso8601(get_first(live_broadcast_details, 'startTimestamp'))
2906 live_endtime = parse_iso8601(get_first(live_broadcast_details, 'endTimestamp'))
2907 if not duration and live_endtime and live_starttime:
2908 duration = live_endtime - live_starttime
2909
2910 info = {
2911 'id': video_id,
2912 'title': self._live_title(video_title) if is_live else video_title,
2913 'formats': formats,
2914 'thumbnails': thumbnails,
2915 'description': video_description,
2916 'upload_date': unified_strdate(
2917 get_first(microformats, 'uploadDate')
2918 or search_meta('uploadDate')),
2919 'uploader': get_first(video_details, 'author'),
2920 'uploader_id': self._search_regex(r'/(?:channel|user)/([^/?&#]+)', owner_profile_url, 'uploader id') if owner_profile_url else None,
2921 'uploader_url': owner_profile_url,
2922 'channel_id': channel_id,
2923 'channel_url': f'https://www.youtube.com/channel/{channel_id}' if channel_id else None,
2924 'duration': duration,
2925 'view_count': int_or_none(
2926 get_first((video_details, microformats), (..., 'viewCount'))
2927 or search_meta('interactionCount')),
2928 'average_rating': float_or_none(get_first(video_details, 'averageRating')),
2929 'age_limit': 18 if (
2930 get_first(microformats, 'isFamilySafe') is False
2931 or search_meta('isFamilyFriendly') == 'false'
2932 or search_meta('og:restrictions:age') == '18+') else 0,
2933 'webpage_url': webpage_url,
2934 'categories': [category] if category else None,
2935 'tags': keywords,
2936 'playable_in_embed': get_first(playability_statuses, 'playableInEmbed'),
2937 'is_live': is_live,
2938 'was_live': (False if is_live or is_upcoming or live_content is False
2939 else None if is_live is None or is_upcoming is None
2940 else live_content),
2941 'live_status': 'is_upcoming' if is_upcoming else None, # rest will be set by YoutubeDL
2942 'release_timestamp': live_starttime,
2943 }
2944
2945 pctr = traverse_obj(player_responses, (..., 'captions', 'playerCaptionsTracklistRenderer'), expected_type=dict)
2946 # Converted into dicts to remove duplicates
2947 captions = {
2948 sub.get('baseUrl'): sub
2949 for sub in traverse_obj(pctr, (..., 'captionTracks', ...), default=[])}
2950 translation_languages = {
2951 lang.get('languageCode'): lang.get('languageName')
2952 for lang in traverse_obj(pctr, (..., 'translationLanguages', ...), default=[])}
2953 subtitles = {}
2954 if pctr:
2955 def process_language(container, base_url, lang_code, sub_name, query):
2956 lang_subs = container.setdefault(lang_code, [])
2957 for fmt in self._SUBTITLE_FORMATS:
2958 query.update({
2959 'fmt': fmt,
2960 })
2961 lang_subs.append({
2962 'ext': fmt,
2963 'url': update_url_query(base_url, query),
2964 'name': sub_name,
2965 })
2966
2967 for base_url, caption_track in captions.items():
2968 if not base_url:
2969 continue
2970 if caption_track.get('kind') != 'asr':
2971 lang_code = (
2972 remove_start(caption_track.get('vssId') or '', '.').replace('.', '-')
2973 or caption_track.get('languageCode'))
2974 if not lang_code:
2975 continue
2976 process_language(
2977 subtitles, base_url, lang_code,
2978 traverse_obj(caption_track, ('name', 'simpleText')),
2979 {})
2980 continue
2981 automatic_captions = {}
2982 for trans_code, trans_name in translation_languages.items():
2983 if not trans_code:
2984 continue
2985 process_language(
2986 automatic_captions, base_url, trans_code,
2987 self._get_text(trans_name, max_runs=1),
2988 {'tlang': trans_code})
2989 info['automatic_captions'] = automatic_captions
2990 info['subtitles'] = subtitles
2991
2992 parsed_url = compat_urllib_parse_urlparse(url)
2993 for component in [parsed_url.fragment, parsed_url.query]:
2994 query = compat_parse_qs(component)
2995 for k, v in query.items():
2996 for d_k, s_ks in [('start', ('start', 't')), ('end', ('end',))]:
2997 d_k += '_time'
2998 if d_k not in info and k in s_ks:
2999 info[d_k] = parse_duration(query[k][0])
3000
3001 # Youtube Music Auto-generated description
3002 if video_description:
3003 mobj = re.search(r'(?s)(?P<track>[^·\n]+)·(?P<artist>[^\n]+)\n+(?P<album>[^\n]+)(?:.+?℗\s*(?P<release_year>\d{4})(?!\d))?(?:.+?Released on\s*:\s*(?P<release_date>\d{4}-\d{2}-\d{2}))?(.+?\nArtist\s*:\s*(?P<clean_artist>[^\n]+))?.+\nAuto-generated by YouTube\.\s*$', video_description)
3004 if mobj:
3005 release_year = mobj.group('release_year')
3006 release_date = mobj.group('release_date')
3007 if release_date:
3008 release_date = release_date.replace('-', '')
3009 if not release_year:
3010 release_year = release_date[:4]
3011 info.update({
3012 'album': mobj.group('album'.strip()),
3013 'artist': mobj.group('clean_artist') or ', '.join(a.strip() for a in mobj.group('artist').split('·')),
3014 'track': mobj.group('track').strip(),
3015 'release_date': release_date,
3016 'release_year': int_or_none(release_year),
3017 })
3018
3019 initial_data = None
3020 if webpage:
3021 initial_data = self._extract_yt_initial_variable(
3022 webpage, self._YT_INITIAL_DATA_RE, video_id,
3023 'yt initial data')
3024 if not initial_data:
3025 headers = self.generate_api_headers(
3026 master_ytcfg, identity_token, self._extract_account_syncid(master_ytcfg),
3027 session_index=self._extract_session_index(master_ytcfg))
3028
3029 initial_data = self._extract_response(
3030 item_id=video_id, ep='next', fatal=False,
3031 ytcfg=master_ytcfg, headers=headers, query={'videoId': video_id},
3032 note='Downloading initial data API JSON')
3033
3034 try:
3035 # This will error if there is no livechat
3036 initial_data['contents']['twoColumnWatchNextResults']['conversationBar']['liveChatRenderer']['continuations'][0]['reloadContinuationData']['continuation']
3037 info['subtitles']['live_chat'] = [{
3038 'url': 'https://www.youtube.com/watch?v=%s' % video_id, # url is needed to set cookies
3039 'video_id': video_id,
3040 'ext': 'json',
3041 'protocol': 'youtube_live_chat' if is_live or is_upcoming else 'youtube_live_chat_replay',
3042 }]
3043 except (KeyError, IndexError, TypeError):
3044 pass
3045
3046 if initial_data:
3047 info['chapters'] = (
3048 self._extract_chapters_from_json(initial_data, duration)
3049 or self._extract_chapters_from_engagement_panel(initial_data, duration)
3050 or None)
3051
3052 contents = try_get(
3053 initial_data,
3054 lambda x: x['contents']['twoColumnWatchNextResults']['results']['results']['contents'],
3055 list) or []
3056 for content in contents:
3057 vpir = content.get('videoPrimaryInfoRenderer')
3058 if vpir:
3059 stl = vpir.get('superTitleLink')
3060 if stl:
3061 stl = self._get_text(stl)
3062 if try_get(
3063 vpir,
3064 lambda x: x['superTitleIcon']['iconType']) == 'LOCATION_PIN':
3065 info['location'] = stl
3066 else:
3067 mobj = re.search(r'(.+?)\s*S(\d+)\s*•\s*E(\d+)', stl)
3068 if mobj:
3069 info.update({
3070 'series': mobj.group(1),
3071 'season_number': int(mobj.group(2)),
3072 'episode_number': int(mobj.group(3)),
3073 })
3074 for tlb in (try_get(
3075 vpir,
3076 lambda x: x['videoActions']['menuRenderer']['topLevelButtons'],
3077 list) or []):
3078 tbr = tlb.get('toggleButtonRenderer') or {}
3079 for getter, regex in [(
3080 lambda x: x['defaultText']['accessibility']['accessibilityData'],
3081 r'(?P<count>[\d,]+)\s*(?P<type>(?:dis)?like)'), ([
3082 lambda x: x['accessibility'],
3083 lambda x: x['accessibilityData']['accessibilityData'],
3084 ], r'(?P<type>(?:dis)?like) this video along with (?P<count>[\d,]+) other people')]:
3085 label = (try_get(tbr, getter, dict) or {}).get('label')
3086 if label:
3087 mobj = re.match(regex, label)
3088 if mobj:
3089 info[mobj.group('type') + '_count'] = str_to_int(mobj.group('count'))
3090 break
3091 sbr_tooltip = try_get(
3092 vpir, lambda x: x['sentimentBar']['sentimentBarRenderer']['tooltip'])
3093 if sbr_tooltip:
3094 like_count, dislike_count = sbr_tooltip.split(' / ')
3095 info.update({
3096 'like_count': str_to_int(like_count),
3097 'dislike_count': str_to_int(dislike_count),
3098 })
3099 vsir = content.get('videoSecondaryInfoRenderer')
3100 if vsir:
3101 info['channel'] = self._get_text(vsir, ('owner', 'videoOwnerRenderer', 'title'))
3102 rows = try_get(
3103 vsir,
3104 lambda x: x['metadataRowContainer']['metadataRowContainerRenderer']['rows'],
3105 list) or []
3106 multiple_songs = False
3107 for row in rows:
3108 if try_get(row, lambda x: x['metadataRowRenderer']['hasDividerLine']) is True:
3109 multiple_songs = True
3110 break
3111 for row in rows:
3112 mrr = row.get('metadataRowRenderer') or {}
3113 mrr_title = mrr.get('title')
3114 if not mrr_title:
3115 continue
3116 mrr_title = self._get_text(mrr, 'title')
3117 mrr_contents_text = self._get_text(mrr, ('contents', 0))
3118 if mrr_title == 'License':
3119 info['license'] = mrr_contents_text
3120 elif not multiple_songs:
3121 if mrr_title == 'Album':
3122 info['album'] = mrr_contents_text
3123 elif mrr_title == 'Artist':
3124 info['artist'] = mrr_contents_text
3125 elif mrr_title == 'Song':
3126 info['track'] = mrr_contents_text
3127
3128 fallbacks = {
3129 'channel': 'uploader',
3130 'channel_id': 'uploader_id',
3131 'channel_url': 'uploader_url',
3132 }
3133 for to, frm in fallbacks.items():
3134 if not info.get(to):
3135 info[to] = info.get(frm)
3136
3137 for s_k, d_k in [('artist', 'creator'), ('track', 'alt_title')]:
3138 v = info.get(s_k)
3139 if v:
3140 info[d_k] = v
3141
3142 is_private = get_first(video_details, 'isPrivate', expected_type=bool)
3143 is_unlisted = get_first(microformats, 'isUnlisted', expected_type=bool)
3144 is_membersonly = None
3145 is_premium = None
3146 if initial_data and is_private is not None:
3147 is_membersonly = False
3148 is_premium = False
3149 contents = try_get(initial_data, lambda x: x['contents']['twoColumnWatchNextResults']['results']['results']['contents'], list) or []
3150 badge_labels = set()
3151 for content in contents:
3152 if not isinstance(content, dict):
3153 continue
3154 badge_labels.update(self._extract_badges(content.get('videoPrimaryInfoRenderer')))
3155 for badge_label in badge_labels:
3156 if badge_label.lower() == 'members only':
3157 is_membersonly = True
3158 elif badge_label.lower() == 'premium':
3159 is_premium = True
3160 elif badge_label.lower() == 'unlisted':
3161 is_unlisted = True
3162
3163 info['availability'] = self._availability(
3164 is_private=is_private,
3165 needs_premium=is_premium,
3166 needs_subscription=is_membersonly,
3167 needs_auth=info['age_limit'] >= 18,
3168 is_unlisted=None if is_private is None else is_unlisted)
3169
3170 # get xsrf for annotations or comments
3171 get_annotations = self.get_param('writeannotations', False)
3172 get_comments = self.get_param('getcomments', False)
3173 if get_annotations or get_comments:
3174 xsrf_token = None
3175 if master_ytcfg:
3176 xsrf_token = try_get(master_ytcfg, lambda x: x['XSRF_TOKEN'], compat_str)
3177 if not xsrf_token:
3178 xsrf_token = self._search_regex(
3179 r'([\'"])XSRF_TOKEN\1\s*:\s*([\'"])(?P<xsrf_token>(?:(?!\2).)+)\2',
3180 webpage, 'xsrf token', group='xsrf_token', fatal=False)
3181
3182 # annotations
3183 if get_annotations:
3184 invideo_url = get_first(
3185 player_responses,
3186 ('annotations', 0, 'playerAnnotationsUrlsRenderer', 'invideoUrl'),
3187 expected_type=str)
3188 if xsrf_token and invideo_url:
3189 xsrf_field_name = None
3190 if master_ytcfg:
3191 xsrf_field_name = try_get(master_ytcfg, lambda x: x['XSRF_FIELD_NAME'], compat_str)
3192 if not xsrf_field_name:
3193 xsrf_field_name = self._search_regex(
3194 r'([\'"])XSRF_FIELD_NAME\1\s*:\s*([\'"])(?P<xsrf_field_name>\w+)\2',
3195 webpage, 'xsrf field name',
3196 group='xsrf_field_name', default='session_token')
3197 info['annotations'] = self._download_webpage(
3198 self._proto_relative_url(invideo_url),
3199 video_id, note='Downloading annotations',
3200 errnote='Unable to download video annotations', fatal=False,
3201 data=urlencode_postdata({xsrf_field_name: xsrf_token}))
3202
3203 if get_comments:
3204 info['__post_extractor'] = lambda: self._extract_comments(master_ytcfg, video_id, contents, webpage)
3205
3206 self.mark_watched(video_id, player_responses)
3207
3208 return info
3209
3210
3211 class YoutubeTabIE(YoutubeBaseInfoExtractor):
3212 IE_DESC = 'YouTube.com tab'
3213 _VALID_URL = r'''(?x)
3214 https?://
3215 (?:\w+\.)?
3216 (?:
3217 youtube(?:kids)?\.com|
3218 invidio\.us
3219 )/
3220 (?:
3221 (?P<channel_type>channel|c|user|browse)/|
3222 (?P<not_channel>
3223 feed/|hashtag/|
3224 (?:playlist|watch)\?.*?\blist=
3225 )|
3226 (?!(?:%s)\b) # Direct URLs
3227 )
3228 (?P<id>[^/?\#&]+)
3229 ''' % YoutubeBaseInfoExtractor._RESERVED_NAMES
3230 IE_NAME = 'youtube:tab'
3231
3232 _TESTS = [{
3233 'note': 'playlists, multipage',
3234 'url': 'https://www.youtube.com/c/ИгорьКлейнер/playlists?view=1&flow=grid',
3235 'playlist_mincount': 94,
3236 'info_dict': {
3237 'id': 'UCqj7Cz7revf5maW9g5pgNcg',
3238 'title': 'Игорь Клейнер - Playlists',
3239 'description': 'md5:be97ee0f14ee314f1f002cf187166ee2',
3240 'uploader': 'Игорь Клейнер',
3241 'uploader_id': 'UCqj7Cz7revf5maW9g5pgNcg',
3242 },
3243 }, {
3244 'note': 'playlists, multipage, different order',
3245 'url': 'https://www.youtube.com/user/igorkle1/playlists?view=1&sort=dd',
3246 'playlist_mincount': 94,
3247 'info_dict': {
3248 'id': 'UCqj7Cz7revf5maW9g5pgNcg',
3249 'title': 'Игорь Клейнер - Playlists',
3250 'description': 'md5:be97ee0f14ee314f1f002cf187166ee2',
3251 'uploader_id': 'UCqj7Cz7revf5maW9g5pgNcg',
3252 'uploader': 'Игорь Клейнер',
3253 },
3254 }, {
3255 'note': 'playlists, series',
3256 'url': 'https://www.youtube.com/c/3blue1brown/playlists?view=50&sort=dd&shelf_id=3',
3257 'playlist_mincount': 5,
3258 'info_dict': {
3259 'id': 'UCYO_jab_esuFRV4b17AJtAw',
3260 'title': '3Blue1Brown - Playlists',
3261 'description': 'md5:e1384e8a133307dd10edee76e875d62f',
3262 'uploader_id': 'UCYO_jab_esuFRV4b17AJtAw',
3263 'uploader': '3Blue1Brown',
3264 },
3265 }, {
3266 'note': 'playlists, singlepage',
3267 'url': 'https://www.youtube.com/user/ThirstForScience/playlists',
3268 'playlist_mincount': 4,
3269 'info_dict': {
3270 'id': 'UCAEtajcuhQ6an9WEzY9LEMQ',
3271 'title': 'ThirstForScience - Playlists',
3272 'description': 'md5:609399d937ea957b0f53cbffb747a14c',
3273 'uploader': 'ThirstForScience',
3274 'uploader_id': 'UCAEtajcuhQ6an9WEzY9LEMQ',
3275 }
3276 }, {
3277 'url': 'https://www.youtube.com/c/ChristophLaimer/playlists',
3278 'only_matching': True,
3279 }, {
3280 'note': 'basic, single video playlist',
3281 'url': 'https://www.youtube.com/playlist?list=PL4lCao7KL_QFVb7Iudeipvc2BCavECqzc',
3282 'info_dict': {
3283 'uploader_id': 'UCmlqkdCBesrv2Lak1mF_MxA',
3284 'uploader': 'Sergey M.',
3285 'id': 'PL4lCao7KL_QFVb7Iudeipvc2BCavECqzc',
3286 'title': 'youtube-dl public playlist',
3287 },
3288 'playlist_count': 1,
3289 }, {
3290 'note': 'empty playlist',
3291 'url': 'https://www.youtube.com/playlist?list=PL4lCao7KL_QFodcLWhDpGCYnngnHtQ-Xf',
3292 'info_dict': {
3293 'uploader_id': 'UCmlqkdCBesrv2Lak1mF_MxA',
3294 'uploader': 'Sergey M.',
3295 'id': 'PL4lCao7KL_QFodcLWhDpGCYnngnHtQ-Xf',
3296 'title': 'youtube-dl empty playlist',
3297 },
3298 'playlist_count': 0,
3299 }, {
3300 'note': 'Home tab',
3301 'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w/featured',
3302 'info_dict': {
3303 'id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
3304 'title': 'lex will - Home',
3305 'description': 'md5:2163c5d0ff54ed5f598d6a7e6211e488',
3306 'uploader': 'lex will',
3307 'uploader_id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
3308 },
3309 'playlist_mincount': 2,
3310 }, {
3311 'note': 'Videos tab',
3312 'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w/videos',
3313 'info_dict': {
3314 'id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
3315 'title': 'lex will - Videos',
3316 'description': 'md5:2163c5d0ff54ed5f598d6a7e6211e488',
3317 'uploader': 'lex will',
3318 'uploader_id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
3319 },
3320 'playlist_mincount': 975,
3321 }, {
3322 'note': 'Videos tab, sorted by popular',
3323 'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w/videos?view=0&sort=p&flow=grid',
3324 'info_dict': {
3325 'id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
3326 'title': 'lex will - Videos',
3327 'description': 'md5:2163c5d0ff54ed5f598d6a7e6211e488',
3328 'uploader': 'lex will',
3329 'uploader_id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
3330 },
3331 'playlist_mincount': 199,
3332 }, {
3333 'note': 'Playlists tab',
3334 'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w/playlists',
3335 'info_dict': {
3336 'id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
3337 'title': 'lex will - Playlists',
3338 'description': 'md5:2163c5d0ff54ed5f598d6a7e6211e488',
3339 'uploader': 'lex will',
3340 'uploader_id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
3341 },
3342 'playlist_mincount': 17,
3343 }, {
3344 'note': 'Community tab',
3345 'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w/community',
3346 'info_dict': {
3347 'id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
3348 'title': 'lex will - Community',
3349 'description': 'md5:2163c5d0ff54ed5f598d6a7e6211e488',
3350 'uploader': 'lex will',
3351 'uploader_id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
3352 },
3353 'playlist_mincount': 18,
3354 }, {
3355 'note': 'Channels tab',
3356 'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w/channels',
3357 'info_dict': {
3358 'id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
3359 'title': 'lex will - Channels',
3360 'description': 'md5:2163c5d0ff54ed5f598d6a7e6211e488',
3361 'uploader': 'lex will',
3362 'uploader_id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
3363 },
3364 'playlist_mincount': 12,
3365 }, {
3366 'note': 'Search tab',
3367 'url': 'https://www.youtube.com/c/3blue1brown/search?query=linear%20algebra',
3368 'playlist_mincount': 40,
3369 'info_dict': {
3370 'id': 'UCYO_jab_esuFRV4b17AJtAw',
3371 'title': '3Blue1Brown - Search - linear algebra',
3372 'description': 'md5:e1384e8a133307dd10edee76e875d62f',
3373 'uploader': '3Blue1Brown',
3374 'uploader_id': 'UCYO_jab_esuFRV4b17AJtAw',
3375 },
3376 }, {
3377 'url': 'https://invidio.us/channel/UCmlqkdCBesrv2Lak1mF_MxA',
3378 'only_matching': True,
3379 }, {
3380 'url': 'https://www.youtubekids.com/channel/UCmlqkdCBesrv2Lak1mF_MxA',
3381 'only_matching': True,
3382 }, {
3383 'url': 'https://music.youtube.com/channel/UCmlqkdCBesrv2Lak1mF_MxA',
3384 'only_matching': True,
3385 }, {
3386 'note': 'Playlist with deleted videos (#651). As a bonus, the video #51 is also twice in this list.',
3387 'url': 'https://www.youtube.com/playlist?list=PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC',
3388 'info_dict': {
3389 'title': '29C3: Not my department',
3390 'id': 'PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC',
3391 'uploader': 'Christiaan008',
3392 'uploader_id': 'UCEPzS1rYsrkqzSLNp76nrcg',
3393 'description': 'md5:a14dc1a8ef8307a9807fe136a0660268',
3394 },
3395 'playlist_count': 96,
3396 }, {
3397 'note': 'Large playlist',
3398 'url': 'https://www.youtube.com/playlist?list=UUBABnxM4Ar9ten8Mdjj1j0Q',
3399 'info_dict': {
3400 'title': 'Uploads from Cauchemar',
3401 'id': 'UUBABnxM4Ar9ten8Mdjj1j0Q',
3402 'uploader': 'Cauchemar',
3403 'uploader_id': 'UCBABnxM4Ar9ten8Mdjj1j0Q',
3404 },
3405 'playlist_mincount': 1123,
3406 }, {
3407 'note': 'even larger playlist, 8832 videos',
3408 'url': 'http://www.youtube.com/user/NASAgovVideo/videos',
3409 'only_matching': True,
3410 }, {
3411 'note': 'Buggy playlist: the webpage has a "Load more" button but it doesn\'t have more videos',
3412 'url': 'https://www.youtube.com/playlist?list=UUXw-G3eDE9trcvY2sBMM_aA',
3413 'info_dict': {
3414 'title': 'Uploads from Interstellar Movie',
3415 'id': 'UUXw-G3eDE9trcvY2sBMM_aA',
3416 'uploader': 'Interstellar Movie',
3417 'uploader_id': 'UCXw-G3eDE9trcvY2sBMM_aA',
3418 },
3419 'playlist_mincount': 21,
3420 }, {
3421 'note': 'Playlist with "show unavailable videos" button',
3422 'url': 'https://www.youtube.com/playlist?list=UUTYLiWFZy8xtPwxFwX9rV7Q',
3423 'info_dict': {
3424 'title': 'Uploads from Phim Siêu Nhân Nhật Bản',
3425 'id': 'UUTYLiWFZy8xtPwxFwX9rV7Q',
3426 'uploader': 'Phim Siêu Nhân Nhật Bản',
3427 'uploader_id': 'UCTYLiWFZy8xtPwxFwX9rV7Q',
3428 },
3429 'playlist_mincount': 200,
3430 }, {
3431 'note': 'Playlist with unavailable videos in page 7',
3432 'url': 'https://www.youtube.com/playlist?list=UU8l9frL61Yl5KFOl87nIm2w',
3433 'info_dict': {
3434 'title': 'Uploads from BlankTV',
3435 'id': 'UU8l9frL61Yl5KFOl87nIm2w',
3436 'uploader': 'BlankTV',
3437 'uploader_id': 'UC8l9frL61Yl5KFOl87nIm2w',
3438 },
3439 'playlist_mincount': 1000,
3440 }, {
3441 'note': 'https://github.com/ytdl-org/youtube-dl/issues/21844',
3442 'url': 'https://www.youtube.com/playlist?list=PLzH6n4zXuckpfMu_4Ff8E7Z1behQks5ba',
3443 'info_dict': {
3444 'title': 'Data Analysis with Dr Mike Pound',
3445 'id': 'PLzH6n4zXuckpfMu_4Ff8E7Z1behQks5ba',
3446 'uploader_id': 'UC9-y-6csu5WGm29I7JiwpnA',
3447 'uploader': 'Computerphile',
3448 'description': 'md5:7f567c574d13d3f8c0954d9ffee4e487',
3449 },
3450 'playlist_mincount': 11,
3451 }, {
3452 'url': 'https://invidio.us/playlist?list=PL4lCao7KL_QFVb7Iudeipvc2BCavECqzc',
3453 'only_matching': True,
3454 }, {
3455 'note': 'Playlist URL that does not actually serve a playlist',
3456 'url': 'https://www.youtube.com/watch?v=FqZTN594JQw&list=PLMYEtVRpaqY00V9W81Cwmzp6N6vZqfUKD4',
3457 'info_dict': {
3458 'id': 'FqZTN594JQw',
3459 'ext': 'webm',
3460 'title': "Smiley's People 01 detective, Adventure Series, Action",
3461 'uploader': 'STREEM',
3462 'uploader_id': 'UCyPhqAZgwYWZfxElWVbVJng',
3463 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UCyPhqAZgwYWZfxElWVbVJng',
3464 'upload_date': '20150526',
3465 'license': 'Standard YouTube License',
3466 'description': 'md5:507cdcb5a49ac0da37a920ece610be80',
3467 'categories': ['People & Blogs'],
3468 'tags': list,
3469 'view_count': int,
3470 'like_count': int,
3471 'dislike_count': int,
3472 },
3473 'params': {
3474 'skip_download': True,
3475 },
3476 'skip': 'This video is not available.',
3477 'add_ie': [YoutubeIE.ie_key()],
3478 }, {
3479 'url': 'https://www.youtubekids.com/watch?v=Agk7R8I8o5U&list=PUZ6jURNr1WQZCNHF0ao-c0g',
3480 'only_matching': True,
3481 }, {
3482 'url': 'https://www.youtube.com/watch?v=MuAGGZNfUkU&list=RDMM',
3483 'only_matching': True,
3484 }, {
3485 'url': 'https://www.youtube.com/channel/UCoMdktPbSTixAyNGwb-UYkQ/live',
3486 'info_dict': {
3487 'id': '3yImotZU3tw', # This will keep changing
3488 'ext': 'mp4',
3489 'title': compat_str,
3490 'uploader': 'Sky News',
3491 'uploader_id': 'skynews',
3492 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/skynews',
3493 'upload_date': r're:\d{8}',
3494 'description': compat_str,
3495 'categories': ['News & Politics'],
3496 'tags': list,
3497 'like_count': int,
3498 'dislike_count': int,
3499 },
3500 'params': {
3501 'skip_download': True,
3502 },
3503 'expected_warnings': ['Downloading just video ', 'Ignoring subtitle tracks found in '],
3504 }, {
3505 'url': 'https://www.youtube.com/user/TheYoungTurks/live',
3506 'info_dict': {
3507 'id': 'a48o2S1cPoo',
3508 'ext': 'mp4',
3509 'title': 'The Young Turks - Live Main Show',
3510 'uploader': 'The Young Turks',
3511 'uploader_id': 'TheYoungTurks',
3512 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/TheYoungTurks',
3513 'upload_date': '20150715',
3514 'license': 'Standard YouTube License',
3515 'description': 'md5:438179573adcdff3c97ebb1ee632b891',
3516 'categories': ['News & Politics'],
3517 'tags': ['Cenk Uygur (TV Program Creator)', 'The Young Turks (Award-Winning Work)', 'Talk Show (TV Genre)'],
3518 'like_count': int,
3519 'dislike_count': int,
3520 },
3521 'params': {
3522 'skip_download': True,
3523 },
3524 'only_matching': True,
3525 }, {
3526 'url': 'https://www.youtube.com/channel/UC1yBKRuGpC1tSM73A0ZjYjQ/live',
3527 'only_matching': True,
3528 }, {
3529 'url': 'https://www.youtube.com/c/CommanderVideoHq/live',
3530 'only_matching': True,
3531 }, {
3532 'note': 'A channel that is not live. Should raise error',
3533 'url': 'https://www.youtube.com/user/numberphile/live',
3534 'only_matching': True,
3535 }, {
3536 'url': 'https://www.youtube.com/feed/trending',
3537 'only_matching': True,
3538 }, {
3539 'url': 'https://www.youtube.com/feed/library',
3540 'only_matching': True,
3541 }, {
3542 'url': 'https://www.youtube.com/feed/history',
3543 'only_matching': True,
3544 }, {
3545 'url': 'https://www.youtube.com/feed/subscriptions',
3546 'only_matching': True,
3547 }, {
3548 'url': 'https://www.youtube.com/feed/watch_later',
3549 'only_matching': True,
3550 }, {
3551 'note': 'Recommended - redirects to home page',
3552 'url': 'https://www.youtube.com/feed/recommended',
3553 'only_matching': True,
3554 }, {
3555 'note': 'inline playlist with not always working continuations',
3556 'url': 'https://www.youtube.com/watch?v=UC6u0Tct-Fo&list=PL36D642111D65BE7C',
3557 'only_matching': True,
3558 }, {
3559 'url': 'https://www.youtube.com/course?list=ECUl4u3cNGP61MdtwGTqZA0MreSaDybji8',
3560 'only_matching': True,
3561 }, {
3562 'url': 'https://www.youtube.com/course',
3563 'only_matching': True,
3564 }, {
3565 'url': 'https://www.youtube.com/zsecurity',
3566 'only_matching': True,
3567 }, {
3568 'url': 'http://www.youtube.com/NASAgovVideo/videos',
3569 'only_matching': True,
3570 }, {
3571 'url': 'https://www.youtube.com/TheYoungTurks/live',
3572 'only_matching': True,
3573 }, {
3574 'url': 'https://www.youtube.com/hashtag/cctv9',
3575 'info_dict': {
3576 'id': 'cctv9',
3577 'title': '#cctv9',
3578 },
3579 'playlist_mincount': 350,
3580 }, {
3581 'url': 'https://www.youtube.com/watch?list=PLW4dVinRY435CBE_JD3t-0SRXKfnZHS1P&feature=youtu.be&v=M9cJMXmQ_ZU',
3582 'only_matching': True,
3583 }, {
3584 'note': 'Requires Premium: should request additional YTM-info webpage (and have format 141) for videos in playlist',
3585 'url': 'https://music.youtube.com/playlist?list=PLRBp0Fe2GpgmgoscNFLxNyBVSFVdYmFkq',
3586 'only_matching': True
3587 }, {
3588 'note': '/browse/ should redirect to /channel/',
3589 'url': 'https://music.youtube.com/browse/UC1a8OFewdjuLq6KlF8M_8Ng',
3590 'only_matching': True
3591 }, {
3592 'note': 'VLPL, should redirect to playlist?list=PL...',
3593 'url': 'https://music.youtube.com/browse/VLPLRBp0Fe2GpgmgoscNFLxNyBVSFVdYmFkq',
3594 'info_dict': {
3595 'id': 'PLRBp0Fe2GpgmgoscNFLxNyBVSFVdYmFkq',
3596 'uploader': 'NoCopyrightSounds',
3597 'description': 'Providing you with copyright free / safe music for gaming, live streaming, studying and more!',
3598 'uploader_id': 'UC_aEa8K-EOJ3D6gOs7HcyNg',
3599 'title': 'NCS Releases',
3600 },
3601 'playlist_mincount': 166,
3602 }, {
3603 'note': 'Topic, should redirect to playlist?list=UU...',
3604 'url': 'https://music.youtube.com/browse/UC9ALqqC4aIeG5iDs7i90Bfw',
3605 'info_dict': {
3606 'id': 'UU9ALqqC4aIeG5iDs7i90Bfw',
3607 'uploader_id': 'UC9ALqqC4aIeG5iDs7i90Bfw',
3608 'title': 'Uploads from Royalty Free Music - Topic',
3609 'uploader': 'Royalty Free Music - Topic',
3610 },
3611 'expected_warnings': [
3612 'A channel/user page was given',
3613 'The URL does not have a videos tab',
3614 ],
3615 'playlist_mincount': 101,
3616 }, {
3617 'note': 'Topic without a UU playlist',
3618 'url': 'https://www.youtube.com/channel/UCtFRv9O2AHqOZjjynzrv-xg',
3619 'info_dict': {
3620 'id': 'UCtFRv9O2AHqOZjjynzrv-xg',
3621 'title': 'UCtFRv9O2AHqOZjjynzrv-xg',
3622 },
3623 'expected_warnings': [
3624 'A channel/user page was given',
3625 'The URL does not have a videos tab',
3626 'Falling back to channel URL',
3627 ],
3628 'playlist_mincount': 9,
3629 }, {
3630 'note': 'Youtube music Album',
3631 'url': 'https://music.youtube.com/browse/MPREb_gTAcphH99wE',
3632 'info_dict': {
3633 'id': 'OLAK5uy_l1m0thk3g31NmIIz_vMIbWtyv7eZixlH0',
3634 'title': 'Album - Royalty Free Music Library V2 (50 Songs)',
3635 },
3636 'playlist_count': 50,
3637 }, {
3638 'note': 'unlisted single video playlist',
3639 'url': 'https://www.youtube.com/playlist?list=PLwL24UFy54GrB3s2KMMfjZscDi1x5Dajf',
3640 'info_dict': {
3641 'uploader_id': 'UC9zHu_mHU96r19o-wV5Qs1Q',
3642 'uploader': 'colethedj',
3643 'id': 'PLwL24UFy54GrB3s2KMMfjZscDi1x5Dajf',
3644 'title': 'yt-dlp unlisted playlist test',
3645 'availability': 'unlisted'
3646 },
3647 'playlist_count': 1,
3648 }]
3649
3650 @classmethod
3651 def suitable(cls, url):
3652 return False if YoutubeIE.suitable(url) else super(
3653 YoutubeTabIE, cls).suitable(url)
3654
3655 def _extract_channel_id(self, webpage):
3656 channel_id = self._html_search_meta(
3657 'channelId', webpage, 'channel id', default=None)
3658 if channel_id:
3659 return channel_id
3660 channel_url = self._html_search_meta(
3661 ('og:url', 'al:ios:url', 'al:android:url', 'al:web:url',
3662 'twitter:url', 'twitter:app:url:iphone', 'twitter:app:url:ipad',
3663 'twitter:app:url:googleplay'), webpage, 'channel url')
3664 return self._search_regex(
3665 r'https?://(?:www\.)?youtube\.com/channel/([^/?#&])+',
3666 channel_url, 'channel id')
3667
3668 @staticmethod
3669 def _extract_basic_item_renderer(item):
3670 # Modified from _extract_grid_item_renderer
3671 known_basic_renderers = (
3672 'playlistRenderer', 'videoRenderer', 'channelRenderer', 'showRenderer'
3673 )
3674 for key, renderer in item.items():
3675 if not isinstance(renderer, dict):
3676 continue
3677 elif key in known_basic_renderers:
3678 return renderer
3679 elif key.startswith('grid') and key.endswith('Renderer'):
3680 return renderer
3681
    def _grid_entries(self, grid_renderer):
        """Yield entries for every item of a gridRenderer.

        Each item may be a playlist, a video, a channel, or carry a generic
        navigation endpoint URL; the first matching kind wins per item.
        """
        for item in grid_renderer['items']:
            if not isinstance(item, dict):
                continue
            renderer = self._extract_basic_item_renderer(item)
            if not isinstance(renderer, dict):
                continue
            title = self._get_text(renderer, 'title')

            # playlist
            playlist_id = renderer.get('playlistId')
            if playlist_id:
                yield self.url_result(
                    'https://www.youtube.com/playlist?list=%s' % playlist_id,
                    ie=YoutubeTabIE.ie_key(), video_id=playlist_id,
                    video_title=title)
                continue
            # video
            video_id = renderer.get('videoId')
            if video_id:
                yield self._extract_video(renderer)
                continue
            # channel
            channel_id = renderer.get('channelId')
            if channel_id:
                yield self.url_result(
                    'https://www.youtube.com/channel/%s' % channel_id,
                    ie=YoutubeTabIE.ie_key(), video_title=title)
                continue
            # generic endpoint URL support
            ep_url = urljoin('https://www.youtube.com/', try_get(
                renderer, lambda x: x['navigationEndpoint']['commandMetadata']['webCommandMetadata']['url'],
                compat_str))
            if ep_url:
                # Delegate to the first extractor that recognizes the URL
                for ie in (YoutubeTabIE, YoutubePlaylistIE, YoutubeIE):
                    if ie.suitable(ep_url):
                        yield self.url_result(
                            ep_url, ie=ie.ie_key(), video_id=ie._match_id(ep_url), video_title=title)
                        break
3721
3722 def _shelf_entries_from_content(self, shelf_renderer):
3723 content = shelf_renderer.get('content')
3724 if not isinstance(content, dict):
3725 return
3726 renderer = content.get('gridRenderer') or content.get('expandedShelfContentsRenderer')
3727 if renderer:
3728 # TODO: add support for nested playlists so each shelf is processed
3729 # as separate playlist
3730 # TODO: this includes only first N items
3731 for entry in self._grid_entries(renderer):
3732 yield entry
3733 renderer = content.get('horizontalListRenderer')
3734 if renderer:
3735 # TODO
3736 pass
3737
3738 def _shelf_entries(self, shelf_renderer, skip_channels=False):
3739 ep = try_get(
3740 shelf_renderer, lambda x: x['endpoint']['commandMetadata']['webCommandMetadata']['url'],
3741 compat_str)
3742 shelf_url = urljoin('https://www.youtube.com', ep)
3743 if shelf_url:
3744 # Skipping links to another channels, note that checking for
3745 # endpoint.commandMetadata.webCommandMetadata.webPageTypwebPageType == WEB_PAGE_TYPE_CHANNEL
3746 # will not work
3747 if skip_channels and '/channels?' in shelf_url:
3748 return
3749 title = self._get_text(shelf_renderer, 'title')
3750 yield self.url_result(shelf_url, video_title=title)
3751 # Shelf may not contain shelf URL, fallback to extraction from content
3752 for entry in self._shelf_entries_from_content(shelf_renderer):
3753 yield entry
3754
3755 def _playlist_entries(self, video_list_renderer):
3756 for content in video_list_renderer['contents']:
3757 if not isinstance(content, dict):
3758 continue
3759 renderer = content.get('playlistVideoRenderer') or content.get('playlistPanelVideoRenderer')
3760 if not isinstance(renderer, dict):
3761 continue
3762 video_id = renderer.get('videoId')
3763 if not video_id:
3764 continue
3765 yield self._extract_video(renderer)
3766
3767 def _rich_entries(self, rich_grid_renderer):
3768 renderer = try_get(
3769 rich_grid_renderer, lambda x: x['content']['videoRenderer'], dict) or {}
3770 video_id = renderer.get('videoId')
3771 if not video_id:
3772 return
3773 yield self._extract_video(renderer)
3774
3775 def _video_entry(self, video_renderer):
3776 video_id = video_renderer.get('videoId')
3777 if video_id:
3778 return self._extract_video(video_renderer)
3779
    def _post_thread_entries(self, post_thread_renderer):
        """Yield entries for a community post: its video/playlist attachment
        and any YouTube video links embedded in the post text."""
        post_renderer = try_get(
            post_thread_renderer, lambda x: x['post']['backstagePostRenderer'], dict)
        if not post_renderer:
            return
        # video attachment
        video_renderer = try_get(
            post_renderer, lambda x: x['backstageAttachment']['videoRenderer'], dict) or {}
        video_id = video_renderer.get('videoId')
        if video_id:
            entry = self._extract_video(video_renderer)
            if entry:
                yield entry
        # playlist attachment
        playlist_id = try_get(
            post_renderer, lambda x: x['backstageAttachment']['playlistRenderer']['playlistId'], compat_str)
        if playlist_id:
            yield self.url_result(
                'https://www.youtube.com/playlist?list=%s' % playlist_id,
                ie=YoutubeTabIE.ie_key(), video_id=playlist_id)
        # inline video links
        runs = try_get(post_renderer, lambda x: x['contentText']['runs'], list) or []
        for run in runs:
            if not isinstance(run, dict):
                continue
            ep_url = try_get(
                run, lambda x: x['navigationEndpoint']['urlEndpoint']['url'], compat_str)
            if not ep_url:
                continue
            if not YoutubeIE.suitable(ep_url):
                continue
            ep_video_id = YoutubeIE._match_id(ep_url)
            # Skip links that merely point at the attached video again
            if video_id == ep_video_id:
                continue
            yield self.url_result(ep_url, ie=YoutubeIE.ie_key(), video_id=ep_video_id)
3815
3816 def _post_thread_continuation_entries(self, post_thread_continuation):
3817 contents = post_thread_continuation.get('contents')
3818 if not isinstance(contents, list):
3819 return
3820 for content in contents:
3821 renderer = content.get('backstagePostThreadRenderer')
3822 if not isinstance(renderer, dict):
3823 continue
3824 for entry in self._post_thread_entries(renderer):
3825 yield entry
3826
3827 r''' # unused
3828 def _rich_grid_entries(self, contents):
3829 for content in contents:
3830 video_renderer = try_get(content, lambda x: x['richItemRenderer']['content']['videoRenderer'], dict)
3831 if video_renderer:
3832 entry = self._video_entry(video_renderer)
3833 if entry:
3834 yield entry
3835 '''
    def _entries(self, tab, item_id, identity_token, account_syncid, ytcfg):
        """Yield all entries of the selected tab, following continuations.

        The first page comes from the already-downloaded renderer tree;
        subsequent pages are fetched from the InnerTube API until no further
        continuation token is found.
        """

        def extract_entries(parent_renderer):  # this needs to be called again for continuation to work with feeds
            contents = try_get(parent_renderer, lambda x: x['contents'], list) or []
            for content in contents:
                if not isinstance(content, dict):
                    continue
                is_renderer = try_get(content, lambda x: x['itemSectionRenderer'], dict)
                if not is_renderer:
                    renderer = content.get('richItemRenderer')
                    if renderer:
                        for entry in self._rich_entries(renderer):
                            yield entry
                        continuation_list[0] = self._extract_continuation(parent_renderer)
                    continue
                isr_contents = try_get(is_renderer, lambda x: x['contents'], list) or []
                for isr_content in isr_contents:
                    if not isinstance(isr_content, dict):
                        continue

                    # Map renderer key -> generator of entries for that renderer
                    known_renderers = {
                        'playlistVideoListRenderer': self._playlist_entries,
                        'gridRenderer': self._grid_entries,
                        'shelfRenderer': lambda x: self._shelf_entries(x, tab.get('title') != 'Channels'),
                        'backstagePostThreadRenderer': self._post_thread_entries,
                        'videoRenderer': lambda x: [self._video_entry(x)],
                    }
                    for key, renderer in isr_content.items():
                        if key not in known_renderers:
                            continue
                        for entry in known_renderers[key](renderer):
                            if entry:
                                yield entry
                        continuation_list[0] = self._extract_continuation(renderer)
                        break

                if not continuation_list[0]:
                    continuation_list[0] = self._extract_continuation(is_renderer)

            if not continuation_list[0]:
                continuation_list[0] = self._extract_continuation(parent_renderer)

        continuation_list = [None]  # single-item list used instead of `nonlocal` (Python 2 legacy)
        tab_content = try_get(tab, lambda x: x['content'], dict)
        if not tab_content:
            return
        parent_renderer = (
            try_get(tab_content, lambda x: x['sectionListRenderer'], dict)
            or try_get(tab_content, lambda x: x['richGridRenderer'], dict) or {})
        for entry in extract_entries(parent_renderer):
            yield entry
        continuation = continuation_list[0]
        visitor_data = None

        for page_num in itertools.count(1):
            if not continuation:
                break
            headers = self.generate_api_headers(ytcfg, identity_token, account_syncid, visitor_data)
            response = self._extract_response(
                item_id='%s page %s' % (item_id, page_num),
                query=continuation, headers=headers, ytcfg=ytcfg,
                check_get_keys=('continuationContents', 'onResponseReceivedActions', 'onResponseReceivedEndpoints'))

            if not response:
                break
            # Carry visitorData across requests so pagination stays consistent
            visitor_data = try_get(
                response, lambda x: x['responseContext']['visitorData'], compat_str) or visitor_data

            # Old-style continuation responses ('continuationContents')
            known_continuation_renderers = {
                'playlistVideoListContinuation': self._playlist_entries,
                'gridContinuation': self._grid_entries,
                'itemSectionContinuation': self._post_thread_continuation_entries,
                'sectionListContinuation': extract_entries,  # for feeds
            }
            continuation_contents = try_get(
                response, lambda x: x['continuationContents'], dict) or {}
            continuation_renderer = None
            for key, value in continuation_contents.items():
                if key not in known_continuation_renderers:
                    continue
                continuation_renderer = value
                continuation_list = [None]
                for entry in known_continuation_renderers[key](continuation_renderer):
                    yield entry
                continuation = continuation_list[0] or self._extract_continuation(continuation_renderer)
                break
            if continuation_renderer:
                continue

            # New-style continuation responses (onResponseReceived*):
            # (handler, key under which the items must be re-wrapped)
            known_renderers = {
                'gridPlaylistRenderer': (self._grid_entries, 'items'),
                'gridVideoRenderer': (self._grid_entries, 'items'),
                'gridChannelRenderer': (self._grid_entries, 'items'),
                'playlistVideoRenderer': (self._playlist_entries, 'contents'),
                'itemSectionRenderer': (extract_entries, 'contents'),  # for feeds
                'richItemRenderer': (extract_entries, 'contents'),  # for hashtag
                'backstagePostThreadRenderer': (self._post_thread_continuation_entries, 'contents')
            }
            on_response_received = dict_get(response, ('onResponseReceivedActions', 'onResponseReceivedEndpoints'))
            continuation_items = try_get(
                on_response_received, lambda x: x[0]['appendContinuationItemsAction']['continuationItems'], list)
            continuation_item = try_get(continuation_items, lambda x: x[0], dict) or {}
            video_items_renderer = None
            for key, value in continuation_item.items():
                if key not in known_renderers:
                    continue
                video_items_renderer = {known_renderers[key][1]: continuation_items}
                continuation_list = [None]
                for entry in known_renderers[key][0](video_items_renderer):
                    yield entry
                continuation = continuation_list[0] or self._extract_continuation(video_items_renderer)
                break
            if video_items_renderer:
                continue
            break
3951
3952 @staticmethod
3953 def _extract_selected_tab(tabs):
3954 for tab in tabs:
3955 renderer = dict_get(tab, ('tabRenderer', 'expandableTabRenderer')) or {}
3956 if renderer.get('selected') is True:
3957 return renderer
3958 else:
3959 raise ExtractorError('Unable to find selected tab')
3960
3961 @classmethod
3962 def _extract_uploader(cls, data):
3963 uploader = {}
3964 renderer = cls._extract_sidebar_info_renderer(data, 'playlistSidebarSecondaryInfoRenderer') or {}
3965 owner = try_get(
3966 renderer, lambda x: x['videoOwner']['videoOwnerRenderer']['title']['runs'][0], dict)
3967 if owner:
3968 uploader['uploader'] = owner.get('text')
3969 uploader['uploader_id'] = try_get(
3970 owner, lambda x: x['navigationEndpoint']['browseEndpoint']['browseId'], compat_str)
3971 uploader['uploader_url'] = urljoin(
3972 'https://www.youtube.com/',
3973 try_get(owner, lambda x: x['navigationEndpoint']['browseEndpoint']['canonicalBaseUrl'], compat_str))
3974 return {k: v for k, v in uploader.items() if v is not None}
3975
    def _extract_from_tabs(self, item_id, webpage, data, tabs):
        """Build the playlist result for a tabbed page (channel/playlist/hashtag).

        Collects metadata from channelMetadataRenderer or
        playlistMetadataRenderer, then delegates entry extraction to
        _entries() for the currently selected tab.
        """
        playlist_id = title = description = channel_url = channel_name = channel_id = None
        thumbnails_list = tags = []

        selected_tab = self._extract_selected_tab(tabs)
        renderer = try_get(
            data, lambda x: x['metadata']['channelMetadataRenderer'], dict)
        if renderer:
            channel_name = renderer.get('title')
            channel_url = renderer.get('channelUrl')
            channel_id = renderer.get('externalId')
        else:
            # Playlist pages carry their metadata under a different renderer
            renderer = try_get(
                data, lambda x: x['metadata']['playlistMetadataRenderer'], dict)

        if renderer:
            title = renderer.get('title')
            description = renderer.get('description', '')
            playlist_id = channel_id
            tags = renderer.get('keywords', '').split()
            # Prefer the avatar; fall back to the sidebar playlist thumbnail
            thumbnails_list = (
                try_get(renderer, lambda x: x['avatar']['thumbnails'], list)
                or try_get(
                    self._extract_sidebar_info_renderer(data, 'playlistSidebarPrimaryInfoRenderer'),
                    lambda x: x['thumbnailRenderer']['playlistVideoThumbnailRenderer']['thumbnail']['thumbnails'],
                    list)
                or [])

        thumbnails = []
        for t in thumbnails_list:
            if not isinstance(t, dict):
                continue
            thumbnail_url = url_or_none(t.get('url'))
            if not thumbnail_url:
                continue
            thumbnails.append({
                'url': thumbnail_url,
                'width': int_or_none(t.get('width')),
                'height': int_or_none(t.get('height')),
            })
        if playlist_id is None:
            playlist_id = item_id
        if title is None:
            # Hashtag pages have no metadata renderer; use the header or the id
            title = (
                try_get(data, lambda x: x['header']['hashtagHeaderRenderer']['hashtag']['simpleText'])
                or playlist_id)
        title += format_field(selected_tab, 'title', ' - %s')
        title += format_field(selected_tab, 'expandedText', ' - %s')
        metadata = {
            'playlist_id': playlist_id,
            'playlist_title': title,
            'playlist_description': description,
            'uploader': channel_name,
            'uploader_id': channel_id,
            'uploader_url': channel_url,
            'thumbnails': thumbnails,
            'tags': tags,
        }
        availability = self._extract_availability(data)
        if availability:
            metadata['availability'] = availability
        if not channel_id:
            # No channel metadata: take uploader info from the playlist sidebar
            metadata.update(self._extract_uploader(data))
        metadata.update({
            'channel': metadata['uploader'],
            'channel_id': metadata['uploader_id'],
            'channel_url': metadata['uploader_url']})
        ytcfg = self.extract_ytcfg(item_id, webpage)
        return self.playlist_result(
            self._entries(
                selected_tab, playlist_id,
                self._extract_identity_token(webpage, item_id),
                self._extract_account_syncid(ytcfg, data), ytcfg),
            **metadata)
4050
4051 def _extract_mix_playlist(self, playlist, playlist_id, data, webpage):
4052 first_id = last_id = None
4053 ytcfg = self.extract_ytcfg(playlist_id, webpage)
4054 headers = self.generate_api_headers(
4055 ytcfg, account_syncid=self._extract_account_syncid(ytcfg, data),
4056 identity_token=self._extract_identity_token(webpage, item_id=playlist_id))
4057 for page_num in itertools.count(1):
4058 videos = list(self._playlist_entries(playlist))
4059 if not videos:
4060 return
4061 start = next((i for i, v in enumerate(videos) if v['id'] == last_id), -1) + 1
4062 if start >= len(videos):
4063 return
4064 for video in videos[start:]:
4065 if video['id'] == first_id:
4066 self.to_screen('First video %s found again; Assuming end of Mix' % first_id)
4067 return
4068 yield video
4069 first_id = first_id or videos[0]['id']
4070 last_id = videos[-1]['id']
4071 watch_endpoint = try_get(
4072 playlist, lambda x: x['contents'][-1]['playlistPanelVideoRenderer']['navigationEndpoint']['watchEndpoint'])
4073 query = {
4074 'playlistId': playlist_id,
4075 'videoId': watch_endpoint.get('videoId') or last_id,
4076 'index': watch_endpoint.get('index') or len(videos),
4077 'params': watch_endpoint.get('params') or 'OAE%3D'
4078 }
4079 response = self._extract_response(
4080 item_id='%s page %d' % (playlist_id, page_num),
4081 query=query, ep='next', headers=headers, ytcfg=ytcfg,
4082 check_get_keys='contents'
4083 )
4084 playlist = try_get(
4085 response, lambda x: x['contents']['twoColumnWatchNextResults']['playlist']['playlist'], dict)
4086
4087 def _extract_from_playlist(self, item_id, url, data, playlist, webpage):
4088 title = playlist.get('title') or try_get(
4089 data, lambda x: x['titleText']['simpleText'], compat_str)
4090 playlist_id = playlist.get('playlistId') or item_id
4091
4092 # Delegating everything except mix playlists to regular tab-based playlist URL
4093 playlist_url = urljoin(url, try_get(
4094 playlist, lambda x: x['endpoint']['commandMetadata']['webCommandMetadata']['url'],
4095 compat_str))
4096 if playlist_url and playlist_url != url:
4097 return self.url_result(
4098 playlist_url, ie=YoutubeTabIE.ie_key(), video_id=playlist_id,
4099 video_title=title)
4100
4101 return self.playlist_result(
4102 self._extract_mix_playlist(playlist, playlist_id, data, webpage),
4103 playlist_id=playlist_id, playlist_title=title)
4104
4105 def _extract_availability(self, data):
4106 """
4107 Gets the availability of a given playlist/tab.
4108 Note: Unless YouTube tells us explicitly, we do not assume it is public
4109 @param data: response
4110 """
4111 is_private = is_unlisted = None
4112 renderer = self._extract_sidebar_info_renderer(data, 'playlistSidebarPrimaryInfoRenderer') or {}
4113 badge_labels = self._extract_badges(renderer)
4114
4115 # Personal playlists, when authenticated, have a dropdown visibility selector instead of a badge
4116 privacy_dropdown_entries = try_get(
4117 renderer, lambda x: x['privacyForm']['dropdownFormFieldRenderer']['dropdown']['dropdownRenderer']['entries'], list) or []
4118 for renderer_dict in privacy_dropdown_entries:
4119 is_selected = try_get(
4120 renderer_dict, lambda x: x['privacyDropdownItemRenderer']['isSelected'], bool) or False
4121 if not is_selected:
4122 continue
4123 label = self._get_text(renderer_dict, ('privacyDropdownItemRenderer', 'label'))
4124 if label:
4125 badge_labels.add(label.lower())
4126 break
4127
4128 for badge_label in badge_labels:
4129 if badge_label == 'unlisted':
4130 is_unlisted = True
4131 elif badge_label == 'private':
4132 is_private = True
4133 elif badge_label == 'public':
4134 is_unlisted = is_private = False
4135 return self._availability(is_private, False, False, False, is_unlisted)
4136
4137 @staticmethod
4138 def _extract_sidebar_info_renderer(data, info_renderer, expected_type=dict):
4139 sidebar_renderer = try_get(
4140 data, lambda x: x['sidebar']['playlistSidebarRenderer']['items'], list) or []
4141 for item in sidebar_renderer:
4142 renderer = try_get(item, lambda x: x[info_renderer], expected_type)
4143 if renderer:
4144 return renderer
4145
    def _reload_with_unavailable_videos(self, item_id, data, webpage):
        """
        Get playlist with unavailable videos if the 'show unavailable videos' button exists.

        Returns the API response, or None when the sidebar renderer is missing.
        """
        browse_id = params = None
        renderer = self._extract_sidebar_info_renderer(data, 'playlistSidebarPrimaryInfoRenderer')
        if not renderer:
            return
        menu_renderer = try_get(
            renderer, lambda x: x['menu']['menuRenderer']['items'], list) or []
        for menu_item in menu_renderer:
            if not isinstance(menu_item, dict):
                continue
            nav_item_renderer = menu_item.get('menuNavigationItemRenderer')
            # The button is identified by its (English) label text
            text = try_get(
                nav_item_renderer, lambda x: x['text']['simpleText'], compat_str)
            if not text or text.lower() != 'show unavailable videos':
                continue
            browse_endpoint = try_get(
                nav_item_renderer, lambda x: x['navigationEndpoint']['browseEndpoint'], dict) or {}
            browse_id = browse_endpoint.get('browseId')
            params = browse_endpoint.get('params')
            break

        ytcfg = self.extract_ytcfg(item_id, webpage)
        headers = self.generate_api_headers(
            ytcfg, account_syncid=self._extract_account_syncid(ytcfg, data),
            identity_token=self._extract_identity_token(webpage, item_id=item_id),
            visitor_data=try_get(
                self._extract_context(ytcfg), lambda x: x['client']['visitorData'], compat_str))
        # Defaults used when the menu button did not supply explicit values
        query = {
            'params': params or 'wgYCCAA=',
            'browseId': browse_id or 'VL%s' % item_id
        }
        return self._extract_response(
            item_id=item_id, headers=headers, query=query,
            check_get_keys='contents', fatal=False, ytcfg=ytcfg,
            note='Downloading API JSON with unavailable videos')
4184
4185 def _extract_webpage(self, url, item_id):
4186 retries = self.get_param('extractor_retries', 3)
4187 count = -1
4188 last_error = 'Incomplete yt initial data recieved'
4189 while count < retries:
4190 count += 1
4191 # Sometimes youtube returns a webpage with incomplete ytInitialData
4192 # See: https://github.com/yt-dlp/yt-dlp/issues/116
4193 if count:
4194 self.report_warning('%s. Retrying ...' % last_error)
4195 webpage = self._download_webpage(
4196 url, item_id,
4197 'Downloading webpage%s' % (' (retry #%d)' % count if count else ''))
4198 data = self.extract_yt_initial_data(item_id, webpage)
4199 if data.get('contents') or data.get('currentVideoEndpoint'):
4200 break
4201 # Extract alerts here only when there is error
4202 self._extract_and_report_alerts(data)
4203 if count >= retries:
4204 raise ExtractorError(last_error)
4205 return webpage, data
4206
4207 @staticmethod
4208 def _smuggle_data(entries, data):
4209 for entry in entries:
4210 if data:
4211 entry['url'] = smuggle_url(entry['url'], data)
4212 yield entry
4213
4214 def _real_extract(self, url):
4215 url, smuggled_data = unsmuggle_url(url, {})
4216 if self.is_music_url(url):
4217 smuggled_data['is_music_url'] = True
4218 info_dict = self.__real_extract(url, smuggled_data)
4219 if info_dict.get('entries'):
4220 info_dict['entries'] = self._smuggle_data(info_dict['entries'], smuggled_data)
4221 return info_dict
4222
    # Splits a matched URL into the part before the tab, the tab itself and the
    # rest, reusing _VALID_URL's named groups; the conditional (?(channel_type)...)
    # only allows a /tab segment on channel-type URLs. Used to rebuild the URL
    # with a normalized (lower-cased) tab name.
    _url_re = re.compile(r'(?P<pre>%s)(?(channel_type)(?P<tab>/\w+))?(?P<post>.*)$' % _VALID_URL)
4224
    def __real_extract(self, url, smuggled_data):
        """Extract a tab/playlist/watch page, normalizing the URL first.

        Rewrites the URL (www.youtube.com host, lower-cased tab, music-URL
        redirections), then dispatches on the downloaded ytInitialData:
        tabs -> _extract_from_tabs, watch-page playlist ->
        _extract_from_playlist, otherwise falls back to a single video.
        Raises ExtractorError when the page cannot be recognized.
        """
        item_id = self._match_id(url)
        # Force the canonical host; other hosts can return different data
        url = compat_urlparse.urlunparse(
            compat_urlparse.urlparse(url)._replace(netloc='www.youtube.com'))
        compat_opts = self.get_param('compat_opts', [])

        def get_mobj(url):
            # Group dict of _url_re with None groups normalized to ''
            mobj = self._url_re.match(url).groupdict()
            mobj.update((k, '') for k, v in mobj.items() if v is None)
            return mobj

        mobj = get_mobj(url)
        # Youtube returns incomplete data if tabname is not lower case
        pre, tab, post, is_channel = mobj['pre'], mobj['tab'].lower(), mobj['post'], not mobj['not_channel']

        if is_channel:
            if smuggled_data.get('is_music_url'):
                if item_id[:2] == 'VL':
                    # Youtube music VL channels have an equivalent playlist
                    item_id = item_id[2:]
                    pre, tab, post, is_channel = 'https://www.youtube.com/playlist?list=%s' % item_id, '', '', False
                elif item_id[:2] == 'MP':
                    # Youtube music albums (/channel/MP...) have a OLAK playlist that can be extracted from the webpage
                    item_id = self._search_regex(
                        r'\\x22audioPlaylistId\\x22:\\x22([0-9A-Za-z_-]+)\\x22',
                        self._download_webpage('https://music.youtube.com/channel/%s' % item_id, item_id),
                        'playlist id')
                    pre, tab, post, is_channel = 'https://www.youtube.com/playlist?list=%s' % item_id, '', '', False
                elif mobj['channel_type'] == 'browse':
                    # Youtube music /browse/ should be changed to /channel/
                    pre = 'https://www.youtube.com/channel/%s' % item_id
        if is_channel and not tab and 'no-youtube-channel-redirect' not in compat_opts:
            # Home URLs should redirect to /videos/
            self.report_warning(
                'A channel/user page was given. All the channel\'s videos will be downloaded. '
                'To download only the videos in the home page, add a "/featured" to the URL')
            tab = '/videos'

        # Re-match against the rewritten URL
        url = ''.join((pre, tab, post))
        mobj = get_mobj(url)

        # Handle both video/playlist URLs
        qs = parse_qs(url)
        video_id = qs.get('v', [None])[0]
        playlist_id = qs.get('list', [None])[0]

        if not video_id and mobj['not_channel'].startswith('watch'):
            if not playlist_id:
                # If there is neither video or playlist ids, youtube redirects to home page, which is undesirable
                raise ExtractorError('Unable to recognize tab page')
            # Common mistake: https://www.youtube.com/watch?list=playlist_id
            self.report_warning('A video URL was given without video ID. Trying to download playlist %s' % playlist_id)
            url = 'https://www.youtube.com/playlist?list=%s' % playlist_id
            mobj = get_mobj(url)

        if video_id and playlist_id:
            if self.get_param('noplaylist'):
                self.to_screen('Downloading just video %s because of --no-playlist' % video_id)
                return self.url_result(video_id, ie=YoutubeIE.ie_key(), video_id=video_id)
            self.to_screen('Downloading playlist %s; add --no-playlist to just download video %s' % (playlist_id, video_id))

        webpage, data = self._extract_webpage(url, item_id)

        tabs = try_get(
            data, lambda x: x['contents']['twoColumnBrowseResultsRenderer']['tabs'], list)
        if tabs:
            selected_tab = self._extract_selected_tab(tabs)
            tab_name = selected_tab.get('title', '')
            if 'no-youtube-channel-redirect' not in compat_opts:
                if mobj['tab'] == '/live':
                    # Live tab should have redirected to the video
                    raise ExtractorError('The channel is not currently live', expected=True)
                if mobj['tab'] == '/videos' and tab_name.lower() != mobj['tab'][1:]:
                    if not mobj['not_channel'] and item_id[:2] == 'UC':
                        # Topic channels don't have /videos. Use the equivalent playlist instead
                        self.report_warning('The URL does not have a %s tab. Trying to redirect to playlist UU%s instead' % (mobj['tab'][1:], item_id[2:]))
                        pl_id = 'UU%s' % item_id[2:]
                        pl_url = 'https://www.youtube.com/playlist?list=%s%s' % (pl_id, mobj['post'])
                        try:
                            pl_webpage, pl_data = self._extract_webpage(pl_url, pl_id)
                            for alert_type, alert_message in self._extract_alerts(pl_data):
                                if alert_type == 'error':
                                    raise ExtractorError('Youtube said: %s' % alert_message)
                            item_id, url, webpage, data = pl_id, pl_url, pl_webpage, pl_data
                        except ExtractorError:
                            self.report_warning('The playlist gave error. Falling back to channel URL')
                    else:
                        self.report_warning('The URL does not have a %s tab. %s is being downloaded instead' % (mobj['tab'][1:], tab_name))

        self.write_debug('Final URL: %s' % url)

        # YouTube sometimes provides a button to reload playlist with unavailable videos.
        if 'no-youtube-unavailable-videos' not in compat_opts:
            data = self._reload_with_unavailable_videos(item_id, data, webpage) or data
        self._extract_and_report_alerts(data)
        tabs = try_get(
            data, lambda x: x['contents']['twoColumnBrowseResultsRenderer']['tabs'], list)
        if tabs:
            return self._extract_from_tabs(item_id, webpage, data, tabs)

        playlist = try_get(
            data, lambda x: x['contents']['twoColumnWatchNextResults']['playlist']['playlist'], dict)
        if playlist:
            return self._extract_from_playlist(item_id, url, data, playlist, webpage)

        # Last resort: a lone video endpoint (or the video id parsed earlier)
        video_id = try_get(
            data, lambda x: x['currentVideoEndpoint']['watchEndpoint']['videoId'],
            compat_str) or video_id
        if video_id:
            if mobj['tab'] != '/live':  # live tab is expected to redirect to video
                self.report_warning('Unable to recognize playlist. Downloading just video %s' % video_id)
            return self.url_result(video_id, ie=YoutubeIE.ie_key(), video_id=video_id)

        raise ExtractorError('Unable to recognize tab page')
4339
4340
class YoutubePlaylistIE(InfoExtractor):
    IE_DESC = 'YouTube.com playlists'
    # Matches bare playlist ids as well as youtube/invidious URLs carrying list=
    _VALID_URL = r'''(?x)(?:
                        (?:https?://)?
                        (?:\w+\.)?
                        (?:
                            (?:
                                youtube(?:kids)?\.com|
                                invidio\.us
                            )
                            /.*?\?.*?\blist=
                        )?
                        (?P<id>%(playlist_id)s)
                     )''' % {'playlist_id': YoutubeBaseInfoExtractor._PLAYLIST_ID_RE}
    IE_NAME = 'youtube:playlist'
    _TESTS = [{
        'note': 'issue #673',
        'url': 'PLBB231211A4F62143',
        'info_dict': {
            'title': '[OLD]Team Fortress 2 (Class-based LP)',
            'id': 'PLBB231211A4F62143',
            'uploader': 'Wickydoo',
            'uploader_id': 'UCKSpbfbl5kRQpTdL7kMc-1Q',
            'description': 'md5:8fa6f52abb47a9552002fa3ddfc57fc2',
        },
        'playlist_mincount': 29,
    }, {
        'url': 'PLtPgu7CB4gbY9oDN3drwC3cMbJggS7dKl',
        'info_dict': {
            'title': 'YDL_safe_search',
            'id': 'PLtPgu7CB4gbY9oDN3drwC3cMbJggS7dKl',
        },
        'playlist_count': 2,
        'skip': 'This playlist is private',
    }, {
        'note': 'embedded',
        'url': 'https://www.youtube.com/embed/videoseries?list=PL6IaIsEjSbf96XFRuNccS_RuEXwNdsoEu',
        'playlist_count': 4,
        'info_dict': {
            'title': 'JODA15',
            'id': 'PL6IaIsEjSbf96XFRuNccS_RuEXwNdsoEu',
            'uploader': 'milan',
            'uploader_id': 'UCEI1-PVPcYXjB73Hfelbmaw',
        }
    }, {
        'url': 'http://www.youtube.com/embed/_xDOZElKyNU?list=PLsyOSbh5bs16vubvKePAQ1x3PhKavfBIl',
        'playlist_mincount': 654,
        'info_dict': {
            'title': '2018 Chinese New Singles (11/6 updated)',
            'id': 'PLsyOSbh5bs16vubvKePAQ1x3PhKavfBIl',
            'uploader': 'LBK',
            'uploader_id': 'UC21nz3_MesPLqtDqwdvnoxA',
            'description': 'md5:da521864744d60a198e3a88af4db0d9d',
        }
    }, {
        'url': 'TLGGrESM50VT6acwMjAyMjAxNw',
        'only_matching': True,
    }, {
        # music album playlist
        'url': 'OLAK5uy_m4xAFdmMC5rX3Ji3g93pQe3hqLZw_9LhM',
        'only_matching': True,
    }]

    @classmethod
    def suitable(cls, url):
        # Defer to YoutubeTabIE whenever it matches the URL
        if YoutubeTabIE.suitable(url):
            return False
        # Hack for lazy extractors until more generic solution is implemented
        # (see #28780)
        from .youtube import parse_qs
        qs = parse_qs(url)
        # URLs that carry a video id are left to other extractors
        if qs.get('v', [None])[0]:
            return False
        return super(YoutubePlaylistIE, cls).suitable(url)

    def _real_extract(self, url):
        """Normalize to a canonical /playlist URL and delegate to YoutubeTabIE."""
        playlist_id = self._match_id(url)
        is_music_url = YoutubeBaseInfoExtractor.is_music_url(url)
        # Keep any original query parameters; bare ids only get list=<id>
        url = update_url_query(
            'https://www.youtube.com/playlist',
            parse_qs(url) or {'list': playlist_id})
        if is_music_url:
            url = smuggle_url(url, {'is_music_url': True})
        return self.url_result(url, ie=YoutubeTabIE.ie_key(), video_id=playlist_id)
4425
4426
class YoutubeYtBeIE(InfoExtractor):
    IE_DESC = 'youtu.be'
    # Only matches youtu.be short links that also carry a list= parameter;
    # presumably plain youtu.be video links are handled by another extractor
    _VALID_URL = r'https?://youtu\.be/(?P<id>[0-9A-Za-z_-]{11})/*?.*?\blist=(?P<playlist_id>%(playlist_id)s)' % {'playlist_id': YoutubeBaseInfoExtractor._PLAYLIST_ID_RE}
    _TESTS = [{
        'url': 'https://youtu.be/yeWKywCrFtk?list=PL2qgrgXsNUG5ig9cat4ohreBjYLAPC0J5',
        'info_dict': {
            'id': 'yeWKywCrFtk',
            'ext': 'mp4',
            'title': 'Small Scale Baler and Braiding Rugs',
            'uploader': 'Backus-Page House Museum',
            'uploader_id': 'backuspagemuseum',
            'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/backuspagemuseum',
            'upload_date': '20161008',
            'description': 'md5:800c0c78d5eb128500bffd4f0b4f2e8a',
            'categories': ['Nonprofits & Activism'],
            'tags': list,
            'like_count': int,
            'dislike_count': int,
        },
        'params': {
            'noplaylist': True,
            'skip_download': True,
        },
    }, {
        'url': 'https://youtu.be/uWyaPkt-VOI?list=PL9D9FC436B881BA21',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Rebuild a full watch URL with both ids and delegate to YoutubeTabIE."""
        mobj = self._match_valid_url(url)
        video_id = mobj.group('id')
        playlist_id = mobj.group('playlist_id')
        return self.url_result(
            update_url_query('https://www.youtube.com/watch', {
                'v': video_id,
                'list': playlist_id,
                'feature': 'youtu.be',
            }), ie=YoutubeTabIE.ie_key(), video_id=playlist_id)
4465
4466
class YoutubeYtUserIE(InfoExtractor):
    IE_DESC = 'YouTube.com user videos, URL or "ytuser" keyword'
    _VALID_URL = r'ytuser:(?P<id>.+)'
    _TESTS = [{
        'url': 'ytuser:phihag',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        # Turn the ytuser: shorthand into a canonical /user/ URL and hand
        # it off to the tab extractor
        user_id = self._match_id(url)
        user_url = 'https://www.youtube.com/user/%s' % user_id
        return self.url_result(user_url, ie=YoutubeTabIE.ie_key(), video_id=user_id)
4480
4481
class YoutubeFavouritesIE(YoutubeBaseInfoExtractor):
    IE_NAME = 'youtube:favorites'
    IE_DESC = 'YouTube.com liked videos, ":ytfav" for short (requires authentication)'
    _VALID_URL = r':ytfav(?:ou?rite)?s?'
    _LOGIN_REQUIRED = True
    _TESTS = [{
        'url': ':ytfav',
        'only_matching': True,
    }, {
        'url': ':ytfavorites',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        # Liked videos are exposed as the special "LL" playlist
        liked_url = 'https://www.youtube.com/playlist?list=LL'
        return self.url_result(liked_url, ie=YoutubeTabIE.ie_key())
4499
4500
class YoutubeSearchIE(SearchInfoExtractor, YoutubeTabIE):
    IE_DESC = 'YouTube.com searches, "ytsearch" keyword'
    # there doesn't appear to be a real limit, for example if you search for
    # 'python' you get more than 8.000.000 results
    _MAX_RESULTS = float('inf')
    IE_NAME = 'youtube:search'
    _SEARCH_KEY = 'ytsearch'
    # Extra search params sent in the API request body (set by subclasses)
    _SEARCH_PARAMS = None
    _TESTS = []

    def _entries(self, query, n):
        """Yield up to `n` video results for `query`, following continuations."""
        data = {'query': query}
        if self._SEARCH_PARAMS:
            data['params'] = self._SEARCH_PARAMS
        total = 0
        continuation = {}
        for page_num in itertools.count(1):
            # `continuation` carries the token that requests the next page
            data.update(continuation)
            search = self._extract_response(
                item_id='query "%s" page %s' % (query, page_num), ep='search', query=data,
                check_get_keys=('contents', 'onResponseReceivedCommands')
            )
            if not search:
                break
            # First page nests results under sectionListRenderer; continuation
            # responses deliver them via appendContinuationItemsAction instead
            slr_contents = try_get(
                search,
                (lambda x: x['contents']['twoColumnSearchResultsRenderer']['primaryContents']['sectionListRenderer']['contents'],
                 lambda x: x['onResponseReceivedCommands'][0]['appendContinuationItemsAction']['continuationItems']),
                list)
            if not slr_contents:
                break

            # Youtube sometimes adds promoted content to searches,
            # changing the index location of videos and token.
            # So we search through all entries till we find them.
            continuation = None
            for slr_content in slr_contents:
                if not continuation:
                    continuation = self._extract_continuation({'contents': [slr_content]})

                isr_contents = try_get(
                    slr_content,
                    lambda x: x['itemSectionRenderer']['contents'],
                    list)
                if not isr_contents:
                    continue
                for content in isr_contents:
                    # Skip anything that is not a plain video result
                    if not isinstance(content, dict):
                        continue
                    video = content.get('videoRenderer')
                    if not isinstance(video, dict):
                        continue
                    video_id = video.get('videoId')
                    if not video_id:
                        continue

                    yield self._extract_video(video)
                    total += 1
                    if total == n:
                        return

            # No continuation token found on this page -> no more results
            if not continuation:
                break

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""
        return self.playlist_result(self._entries(query, n), query, query)
4568
4569
class YoutubeSearchDateIE(YoutubeSearchIE):
    IE_NAME = YoutubeSearchIE.IE_NAME + ':date'
    _SEARCH_KEY = 'ytsearchdate'
    IE_DESC = 'YouTube.com searches, newest videos first, "ytsearchdate" keyword'
    # Opaque search param requesting newest-first ordering (per IE_DESC)
    _SEARCH_PARAMS = 'CAI%3D'
4575
4576
class YoutubeSearchURLIE(YoutubeSearchIE):
    IE_DESC = 'YouTube.com search URLs'
    IE_NAME = YoutubeSearchIE.IE_NAME + '_url'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/results\?(.*?&)?(?:search_query|q)=(?:[^&]+)(?:[&]|$)'
    # _MAX_RESULTS = 100
    _TESTS = [{
        'url': 'https://www.youtube.com/results?baz=bar&search_query=youtube-dl+test+video&filters=video&lclk=video',
        'playlist_mincount': 5,
        'info_dict': {
            'id': 'youtube-dl test video',
            'title': 'youtube-dl test video',
        }
    }, {
        'url': 'https://www.youtube.com/results?q=test&sp=EgQIBBgB',
        'only_matching': True,
    }]

    @classmethod
    def _make_valid_url(cls):
        # Search URLs are matched directly by _VALID_URL, not built from a key
        return cls._VALID_URL

    def _real_extract(self, url):
        """Run the search described by the URL's query string."""
        params = parse_qs(url)
        # _VALID_URL guarantees either search_query= or q= is present
        search_terms = (params.get('search_query') or params.get('q'))[0]
        self._SEARCH_PARAMS = params.get('sp', ('',))[0]
        return self._get_n_results(search_terms, self._MAX_RESULTS)
4603
4604
class YoutubeFeedsInfoExtractor(YoutubeTabIE):
    """
    Base class for feed extractors
    Subclasses must define the _FEED_NAME property.
    """
    _LOGIN_REQUIRED = True
    _TESTS = []

    @property
    def IE_NAME(self):
        # Derived from the subclass feed name, e.g. 'youtube:history'
        return 'youtube:%s' % self._FEED_NAME

    def _real_extract(self, url):
        # All feeds live under /feed/<name>; delegate to the tab extractor
        feed_url = 'https://www.youtube.com/feed/%s' % self._FEED_NAME
        return self.url_result(feed_url, ie=YoutubeTabIE.ie_key())
4621
4622
class YoutubeWatchLaterIE(InfoExtractor):
    IE_NAME = 'youtube:watchlater'
    IE_DESC = 'Youtube watch later list, ":ytwatchlater" for short (requires authentication)'
    _VALID_URL = r':ytwatchlater'
    _TESTS = [{
        'url': ':ytwatchlater',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        # The watch-later list is exposed as the special "WL" playlist
        watch_later_url = 'https://www.youtube.com/playlist?list=WL'
        return self.url_result(watch_later_url, ie=YoutubeTabIE.ie_key())
4635
4636
class YoutubeRecommendedIE(YoutubeFeedsInfoExtractor):
    IE_DESC = 'YouTube.com recommended videos, ":ytrec" for short (requires authentication)'
    # Also matches the bare youtube.com homepage (query/fragment or nothing after /)
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/?(?:[?#]|$)|:ytrec(?:ommended)?'
    _FEED_NAME = 'recommended'
    _LOGIN_REQUIRED = False  # overrides the base class: the feed also works logged out
    _TESTS = [{
        'url': ':ytrec',
        'only_matching': True,
    }, {
        'url': ':ytrecommended',
        'only_matching': True,
    }, {
        'url': 'https://youtube.com',
        'only_matching': True,
    }]
4652
4653
class YoutubeSubscriptionsIE(YoutubeFeedsInfoExtractor):
    IE_DESC = 'YouTube.com subscriptions feed, ":ytsubs" for short (requires authentication)'
    # Accepts :ytsub, :ytsubs, :ytsubscription and :ytsubscriptions
    _VALID_URL = r':ytsub(?:scription)?s?'
    _FEED_NAME = 'subscriptions'
    _TESTS = [{
        'url': ':ytsubs',
        'only_matching': True,
    }, {
        'url': ':ytsubscriptions',
        'only_matching': True,
    }]
4665
4666
class YoutubeHistoryIE(YoutubeFeedsInfoExtractor):
    IE_DESC = 'Youtube watch history, ":ythis" for short (requires authentication)'
    # Accepts both :ythis and :ythistory
    _VALID_URL = r':ythis(?:tory)?'
    _FEED_NAME = 'history'
    _TESTS = [{
        'url': ':ythistory',
        'only_matching': True,
    }]
4675
4676
class YoutubeTruncatedURLIE(InfoExtractor):
    IE_NAME = 'youtube:truncated_url'
    IE_DESC = False  # Do not list
    # Matches watch/attribution URLs that carry only auxiliary parameters and
    # no video id -- typically the result of an unquoted '&' in the shell
    # swallowing the v= part (see the error message below)
    _VALID_URL = r'''(?x)
        (?:https?://)?
        (?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie)?\.com/
        (?:watch\?(?:
            feature=[a-z_]+|
            annotation_id=annotation_[^&]+|
            x-yt-cl=[0-9]+|
            hl=[^&]*|
            t=[0-9]+
        )?
        |
            attribution_link\?a=[^&]+
        )
        $
    '''

    _TESTS = [{
        'url': 'https://www.youtube.com/watch?annotation_id=annotation_3951667041',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/watch?',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/watch?x-yt-cl=84503534',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/watch?feature=foo',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/watch?hl=en-GB',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/watch?t=2372',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        # Always an error: these URLs cannot identify a video
        raise ExtractorError(
            'Did you forget to quote the URL? Remember that & is a meta '
            'character in most shells, so you want to put the URL in quotes, '
            'like youtube-dl '
            '"https://www.youtube.com/watch?feature=foo&v=BaW_jenozKc" '
            ' or simply youtube-dl BaW_jenozKc .',
            expected=True)
4724
4725
class YoutubeTruncatedIDIE(InfoExtractor):
    IE_NAME = 'youtube:truncated_id'
    IE_DESC = False  # Do not list
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/watch\?v=(?P<id>[0-9A-Za-z_-]{1,10})$'

    _TESTS = [{
        'url': 'https://www.youtube.com/watch?v=N_708QY7Ob',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        # A valid video id is 11 characters; anything shorter is a cut-off URL
        video_id = self._match_id(url)
        message = 'Incomplete YouTube ID %s. URL %s looks truncated.' % (video_id, url)
        raise ExtractorError(message, expected=True)