]> jfr.im git - yt-dlp.git/blob - yt_dlp/extractor/youtube.py
[bitchute] Fix test (#758)
[yt-dlp.git] / yt_dlp / extractor / youtube.py
1 # coding: utf-8
2
3 from __future__ import unicode_literals
4
5 import base64
6 import calendar
7 import copy
8 import datetime
9 import hashlib
10 import itertools
11 import json
12 import os.path
13 import random
14 import re
15 import time
16 import traceback
17
18 from .common import InfoExtractor, SearchInfoExtractor
19 from ..compat import (
20 compat_chr,
21 compat_HTTPError,
22 compat_parse_qs,
23 compat_str,
24 compat_urllib_parse_unquote_plus,
25 compat_urllib_parse_urlencode,
26 compat_urllib_parse_urlparse,
27 compat_urlparse,
28 )
29 from ..jsinterp import JSInterpreter
30 from ..utils import (
31 bytes_to_intlist,
32 clean_html,
33 datetime_from_str,
34 dict_get,
35 error_to_compat_str,
36 ExtractorError,
37 float_or_none,
38 format_field,
39 int_or_none,
40 intlist_to_bytes,
41 is_html,
42 mimetype2ext,
43 network_exceptions,
44 orderedSet,
45 parse_codecs,
46 parse_count,
47 parse_duration,
48 parse_iso8601,
49 parse_qs,
50 qualities,
51 remove_start,
52 smuggle_url,
53 str_or_none,
54 str_to_int,
55 traverse_obj,
56 try_get,
57 unescapeHTML,
58 unified_strdate,
59 unsmuggle_url,
60 update_url_query,
61 url_or_none,
62 urlencode_postdata,
63 urljoin,
64 variadic,
65 )
66
67
# any clients starting with _ cannot be explicitly requested by the user
# Per-client innertube configuration: API key, request context (name/version)
# and the numeric client id sent in the X-YouTube-Client-Name header.
# Missing keys (API key, host, 'hl', 'priority') are filled in by
# build_innertube_clients() below.
INNERTUBE_CLIENTS = {
    'web': {
        'INNERTUBE_API_KEY': 'AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8',
        'INNERTUBE_CONTEXT': {
            'client': {
                'clientName': 'WEB',
                'clientVersion': '2.20210622.10.00',
            }
        },
        'INNERTUBE_CONTEXT_CLIENT_NAME': 1
    },
    'web_embedded': {
        'INNERTUBE_API_KEY': 'AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8',
        'INNERTUBE_CONTEXT': {
            'client': {
                'clientName': 'WEB_EMBEDDED_PLAYER',
                'clientVersion': '1.20210620.0.1',
            },
        },
        'INNERTUBE_CONTEXT_CLIENT_NAME': 56
    },
    'web_music': {
        'INNERTUBE_API_KEY': 'AIzaSyC9XL3ZjWddXya6X74dJoCTL-WEYFDNX30',
        'INNERTUBE_HOST': 'music.youtube.com',
        'INNERTUBE_CONTEXT': {
            'client': {
                'clientName': 'WEB_REMIX',
                'clientVersion': '1.20210621.00.00',
            }
        },
        'INNERTUBE_CONTEXT_CLIENT_NAME': 67,
    },
    'web_creator': {
        'INNERTUBE_API_KEY': 'AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8',
        'INNERTUBE_CONTEXT': {
            'client': {
                'clientName': 'WEB_CREATOR',
                'clientVersion': '1.20210621.00.00',
            }
        },
        'INNERTUBE_CONTEXT_CLIENT_NAME': 62,
    },
    'android': {
        'INNERTUBE_API_KEY': 'AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8',
        'INNERTUBE_CONTEXT': {
            'client': {
                'clientName': 'ANDROID',
                'clientVersion': '16.20',
            }
        },
        'INNERTUBE_CONTEXT_CLIENT_NAME': 3,
    },
    'android_embedded': {
        'INNERTUBE_API_KEY': 'AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8',
        'INNERTUBE_CONTEXT': {
            'client': {
                'clientName': 'ANDROID_EMBEDDED_PLAYER',
                'clientVersion': '16.20',
            },
        },
        'INNERTUBE_CONTEXT_CLIENT_NAME': 55
    },
    'android_music': {
        'INNERTUBE_API_KEY': 'AIzaSyC9XL3ZjWddXya6X74dJoCTL-WEYFDNX30',
        'INNERTUBE_HOST': 'music.youtube.com',
        'INNERTUBE_CONTEXT': {
            'client': {
                'clientName': 'ANDROID_MUSIC',
                'clientVersion': '4.32',
            }
        },
        'INNERTUBE_CONTEXT_CLIENT_NAME': 21,
    },
    # no API key here: build_innertube_clients() supplies the default one
    'android_creator': {
        'INNERTUBE_CONTEXT': {
            'client': {
                'clientName': 'ANDROID_CREATOR',
                'clientVersion': '21.24.100',
            },
        },
        'INNERTUBE_CONTEXT_CLIENT_NAME': 14
    },
    # ios has HLS live streams
    # See: https://github.com/TeamNewPipe/NewPipeExtractor/issues/680
    'ios': {
        'INNERTUBE_API_KEY': 'AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8',
        'INNERTUBE_CONTEXT': {
            'client': {
                'clientName': 'IOS',
                'clientVersion': '16.20',
            }
        },
        'INNERTUBE_CONTEXT_CLIENT_NAME': 5
    },
    'ios_embedded': {
        'INNERTUBE_API_KEY': 'AIzaSyDCU8hByM-4DrUqRUYnGn-3llEO78bcxq8',
        'INNERTUBE_CONTEXT': {
            'client': {
                'clientName': 'IOS_MESSAGES_EXTENSION',
                'clientVersion': '16.20',
            },
        },
        'INNERTUBE_CONTEXT_CLIENT_NAME': 66
    },
    'ios_music': {
        'INNERTUBE_API_KEY': 'AIzaSyDK3iBpDP9nHVTk2qL73FLJICfOC3c51Og',
        'INNERTUBE_HOST': 'music.youtube.com',
        'INNERTUBE_CONTEXT': {
            'client': {
                'clientName': 'IOS_MUSIC',
                'clientVersion': '4.32',
            },
        },
        'INNERTUBE_CONTEXT_CLIENT_NAME': 26
    },
    'ios_creator': {
        'INNERTUBE_CONTEXT': {
            'client': {
                'clientName': 'IOS_CREATOR',
                'clientVersion': '21.24.100',
            },
        },
        'INNERTUBE_CONTEXT_CLIENT_NAME': 15
    },
    # mweb has 'ultralow' formats
    # See: https://github.com/yt-dlp/yt-dlp/pull/557
    'mweb': {
        'INNERTUBE_API_KEY': 'AIzaSyDCU8hByM-4DrUqRUYnGn-3llEO78bcxq8',
        'INNERTUBE_CONTEXT': {
            'client': {
                'clientName': 'MWEB',
                'clientVersion': '2.20210721.07.00',
            }
        },
        'INNERTUBE_CONTEXT_CLIENT_NAME': 2
    },
}
206
207
def build_innertube_clients():
    """Normalize INNERTUBE_CLIENTS in place and derive '*_agegate' variants.

    Fills in missing API key / host / language, computes a download
    'priority' per client, and clones each base client into an EMBED
    (agegate-bypass) variant.
    """
    embed_context = {
        'embedUrl': 'https://google.com',  # Can be any valid URL
    }
    main_clients = ('android', 'web', 'ios', 'mweb')
    rank = qualities(main_clients[::-1])

    for name, cfg in tuple(INNERTUBE_CLIENTS.items()):
        cfg.setdefault('INNERTUBE_API_KEY', 'AIzaSyDCU8hByM-4DrUqRUYnGn-3llEO78bcxq8')
        cfg.setdefault('INNERTUBE_HOST', 'www.youtube.com')
        cfg['INNERTUBE_CONTEXT']['client'].setdefault('hl', 'en')
        # Base priority comes from the client family ('web' from 'web_music' etc.)
        cfg['priority'] = 10 * rank(name.split('_', 1)[0])

        if name in main_clients:
            # Derive the agegate variant, slightly below the base client
            agegate_cfg = copy.deepcopy(cfg)
            agegate_cfg['INNERTUBE_CONTEXT']['client']['clientScreen'] = 'EMBED'
            agegate_cfg['INNERTUBE_CONTEXT']['thirdParty'] = embed_context
            agegate_cfg['priority'] -= 1
            INNERTUBE_CLIENTS[f'{name}_agegate'] = agegate_cfg
        elif name.endswith('_embedded'):
            cfg['INNERTUBE_CONTEXT']['thirdParty'] = embed_context
            cfg['priority'] -= 2
        else:
            cfg['priority'] -= 3


build_innertube_clients()
234
235
class YoutubeBaseInfoExtractor(InfoExtractor):
    """Provide base functions for Youtube extractors"""

    # Path components that are never a channel/user name; used to
    # disambiguate URLs in the _VALID_URL patterns of subclasses.
    _RESERVED_NAMES = (
        r'channel|c|user|playlist|watch|w|v|embed|e|watch_popup|'
        r'shorts|movies|results|shared|hashtag|trending|feed|feeds|'
        r'browse|oembed|get_video_info|iframe_api|s/player|'
        r'storefront|oops|index|account|reporthistory|t/terms|about|upload|signin|logout')

    # Matches every known playlist-id flavour (PL/LL/UU/RD/... prefixes
    # plus the special ids RDMM, WL, LL, LM)
    _PLAYLIST_ID_RE = r'(?:(?:PL|LL|EC|UU|FL|RD|UL|TL|PU|OLAK5uy_)[0-9A-Za-z-_]{10,}|RDMM|WL|LL|LM)'

    _NETRC_MACHINE = 'youtube'

    # If True it will raise an error if no login info is provided
    _LOGIN_REQUIRED = False

    r''' # Unused since login is broken
    _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
    _TWOFACTOR_URL = 'https://accounts.google.com/signin/challenge'

    _LOOKUP_URL = 'https://accounts.google.com/_/signin/sl/lookup'
    _CHALLENGE_URL = 'https://accounts.google.com/_/signin/sl/challenge'
    _TFA_URL = 'https://accounts.google.com/_/signin/challenge?hl=en&TL={0}'
    '''
260
261 def _login(self):
262 """
263 Attempt to log in to YouTube.
264 True is returned if successful or skipped.
265 False is returned if login failed.
266
267 If _LOGIN_REQUIRED is set and no authentication was provided, an error is raised.
268 """
269
270 def warn(message):
271 self.report_warning(message)
272
273 # username+password login is broken
274 if (self._LOGIN_REQUIRED
275 and self.get_param('cookiefile') is None
276 and self.get_param('cookiesfrombrowser') is None):
277 self.raise_login_required(
278 'Login details are needed to download this content', method='cookies')
279 username, password = self._get_login_info()
280 if username:
281 warn('Logging in using username and password is broken. %s' % self._LOGIN_HINTS['cookies'])
282 return
283
284 # Everything below this is broken!
285 r'''
286 # No authentication to be performed
287 if username is None:
288 if self._LOGIN_REQUIRED and self.get_param('cookiefile') is None:
289 raise ExtractorError('No login info available, needed for using %s.' % self.IE_NAME, expected=True)
290 # if self.get_param('cookiefile'): # TODO remove 'and False' later - too many people using outdated cookies and open issues, remind them.
291 # self.to_screen('[Cookies] Reminder - Make sure to always use up to date cookies!')
292 return True
293
294 login_page = self._download_webpage(
295 self._LOGIN_URL, None,
296 note='Downloading login page',
297 errnote='unable to fetch login page', fatal=False)
298 if login_page is False:
299 return
300
301 login_form = self._hidden_inputs(login_page)
302
303 def req(url, f_req, note, errnote):
304 data = login_form.copy()
305 data.update({
306 'pstMsg': 1,
307 'checkConnection': 'youtube',
308 'checkedDomains': 'youtube',
309 'hl': 'en',
310 'deviceinfo': '[null,null,null,[],null,"US",null,null,[],"GlifWebSignIn",null,[null,null,[]]]',
311 'f.req': json.dumps(f_req),
312 'flowName': 'GlifWebSignIn',
313 'flowEntry': 'ServiceLogin',
314 # TODO: reverse actual botguard identifier generation algo
315 'bgRequest': '["identifier",""]',
316 })
317 return self._download_json(
318 url, None, note=note, errnote=errnote,
319 transform_source=lambda s: re.sub(r'^[^[]*', '', s),
320 fatal=False,
321 data=urlencode_postdata(data), headers={
322 'Content-Type': 'application/x-www-form-urlencoded;charset=utf-8',
323 'Google-Accounts-XSRF': 1,
324 })
325
326 lookup_req = [
327 username,
328 None, [], None, 'US', None, None, 2, False, True,
329 [
330 None, None,
331 [2, 1, None, 1,
332 'https://accounts.google.com/ServiceLogin?passive=true&continue=https%3A%2F%2Fwww.youtube.com%2Fsignin%3Fnext%3D%252F%26action_handle_signin%3Dtrue%26hl%3Den%26app%3Ddesktop%26feature%3Dsign_in_button&hl=en&service=youtube&uilel=3&requestPath=%2FServiceLogin&Page=PasswordSeparationSignIn',
333 None, [], 4],
334 1, [None, None, []], None, None, None, True
335 ],
336 username,
337 ]
338
339 lookup_results = req(
340 self._LOOKUP_URL, lookup_req,
341 'Looking up account info', 'Unable to look up account info')
342
343 if lookup_results is False:
344 return False
345
346 user_hash = try_get(lookup_results, lambda x: x[0][2], compat_str)
347 if not user_hash:
348 warn('Unable to extract user hash')
349 return False
350
351 challenge_req = [
352 user_hash,
353 None, 1, None, [1, None, None, None, [password, None, True]],
354 [
355 None, None, [2, 1, None, 1, 'https://accounts.google.com/ServiceLogin?passive=true&continue=https%3A%2F%2Fwww.youtube.com%2Fsignin%3Fnext%3D%252F%26action_handle_signin%3Dtrue%26hl%3Den%26app%3Ddesktop%26feature%3Dsign_in_button&hl=en&service=youtube&uilel=3&requestPath=%2FServiceLogin&Page=PasswordSeparationSignIn', None, [], 4],
356 1, [None, None, []], None, None, None, True
357 ]]
358
359 challenge_results = req(
360 self._CHALLENGE_URL, challenge_req,
361 'Logging in', 'Unable to log in')
362
363 if challenge_results is False:
364 return
365
366 login_res = try_get(challenge_results, lambda x: x[0][5], list)
367 if login_res:
368 login_msg = try_get(login_res, lambda x: x[5], compat_str)
369 warn(
370 'Unable to login: %s' % 'Invalid password'
371 if login_msg == 'INCORRECT_ANSWER_ENTERED' else login_msg)
372 return False
373
374 res = try_get(challenge_results, lambda x: x[0][-1], list)
375 if not res:
376 warn('Unable to extract result entry')
377 return False
378
379 login_challenge = try_get(res, lambda x: x[0][0], list)
380 if login_challenge:
381 challenge_str = try_get(login_challenge, lambda x: x[2], compat_str)
382 if challenge_str == 'TWO_STEP_VERIFICATION':
383 # SEND_SUCCESS - TFA code has been successfully sent to phone
384 # QUOTA_EXCEEDED - reached the limit of TFA codes
385 status = try_get(login_challenge, lambda x: x[5], compat_str)
386 if status == 'QUOTA_EXCEEDED':
387 warn('Exceeded the limit of TFA codes, try later')
388 return False
389
390 tl = try_get(challenge_results, lambda x: x[1][2], compat_str)
391 if not tl:
392 warn('Unable to extract TL')
393 return False
394
395 tfa_code = self._get_tfa_info('2-step verification code')
396
397 if not tfa_code:
398 warn(
399 'Two-factor authentication required. Provide it either interactively or with --twofactor <code>'
400 '(Note that only TOTP (Google Authenticator App) codes work at this time.)')
401 return False
402
403 tfa_code = remove_start(tfa_code, 'G-')
404
405 tfa_req = [
406 user_hash, None, 2, None,
407 [
408 9, None, None, None, None, None, None, None,
409 [None, tfa_code, True, 2]
410 ]]
411
412 tfa_results = req(
413 self._TFA_URL.format(tl), tfa_req,
414 'Submitting TFA code', 'Unable to submit TFA code')
415
416 if tfa_results is False:
417 return False
418
419 tfa_res = try_get(tfa_results, lambda x: x[0][5], list)
420 if tfa_res:
421 tfa_msg = try_get(tfa_res, lambda x: x[5], compat_str)
422 warn(
423 'Unable to finish TFA: %s' % 'Invalid TFA code'
424 if tfa_msg == 'INCORRECT_ANSWER_ENTERED' else tfa_msg)
425 return False
426
427 check_cookie_url = try_get(
428 tfa_results, lambda x: x[0][-1][2], compat_str)
429 else:
430 CHALLENGES = {
431 'LOGIN_CHALLENGE': "This device isn't recognized. For your security, Google wants to make sure it's really you.",
432 'USERNAME_RECOVERY': 'Please provide additional information to aid in the recovery process.',
433 'REAUTH': "There is something unusual about your activity. For your security, Google wants to make sure it's really you.",
434 }
435 challenge = CHALLENGES.get(
436 challenge_str,
437 '%s returned error %s.' % (self.IE_NAME, challenge_str))
438 warn('%s\nGo to https://accounts.google.com/, login and solve a challenge.' % challenge)
439 return False
440 else:
441 check_cookie_url = try_get(res, lambda x: x[2], compat_str)
442
443 if not check_cookie_url:
444 warn('Unable to extract CheckCookie URL')
445 return False
446
447 check_cookie_results = self._download_webpage(
448 check_cookie_url, None, 'Checking cookie', fatal=False)
449
450 if check_cookie_results is False:
451 return False
452
453 if 'https://myaccount.google.com/' not in check_cookie_results:
454 warn('Unable to log in')
455 return False
456
457 return True
458 '''
459
460 def _initialize_consent(self):
461 cookies = self._get_cookies('https://www.youtube.com/')
462 if cookies.get('__Secure-3PSID'):
463 return
464 consent_id = None
465 consent = cookies.get('CONSENT')
466 if consent:
467 if 'YES' in consent.value:
468 return
469 consent_id = self._search_regex(
470 r'PENDING\+(\d+)', consent.value, 'consent', default=None)
471 if not consent_id:
472 consent_id = random.randint(100, 999)
473 self._set_cookie('.youtube.com', 'CONSENT', 'YES+cb.20210328-17-p0.en+FX+%s' % consent_id)
474
475 def _real_initialize(self):
476 self._initialize_consent()
477 if self._downloader is None:
478 return
479 if not self._login():
480 return
481
    # Matches the JSON blob assigned to ytInitialData in watch/browse pages
    _YT_INITIAL_DATA_RE = r'(?:window\s*\[\s*["\']ytInitialData["\']\s*\]|ytInitialData)\s*=\s*({.+?})\s*;'
    # Matches the JSON blob assigned to ytInitialPlayerResponse
    _YT_INITIAL_PLAYER_RESPONSE_RE = r'ytInitialPlayerResponse\s*=\s*({.+?})\s*;'
    # Tokens that mark the end of the initial-data assignment in the page
    _YT_INITIAL_BOUNDARY_RE = r'(?:var\s+meta|</script|\n)'
485
486 def _get_default_ytcfg(self, client='web'):
487 return copy.deepcopy(INNERTUBE_CLIENTS[client])
488
489 def _get_innertube_host(self, client='web'):
490 return INNERTUBE_CLIENTS[client]['INNERTUBE_HOST']
491
492 def _ytcfg_get_safe(self, ytcfg, getter, expected_type=None, default_client='web'):
493 # try_get but with fallback to default ytcfg client values when present
494 _func = lambda y: try_get(y, getter, expected_type)
495 return _func(ytcfg) or _func(self._get_default_ytcfg(default_client))
496
497 def _extract_client_name(self, ytcfg, default_client='web'):
498 return self._ytcfg_get_safe(
499 ytcfg, (lambda x: x['INNERTUBE_CLIENT_NAME'],
500 lambda x: x['INNERTUBE_CONTEXT']['client']['clientName']), compat_str, default_client)
501
502 @staticmethod
503 def _extract_session_index(*data):
504 for ytcfg in data:
505 session_index = int_or_none(try_get(ytcfg, lambda x: x['SESSION_INDEX']))
506 if session_index is not None:
507 return session_index
508
509 def _extract_client_version(self, ytcfg, default_client='web'):
510 return self._ytcfg_get_safe(
511 ytcfg, (lambda x: x['INNERTUBE_CLIENT_VERSION'],
512 lambda x: x['INNERTUBE_CONTEXT']['client']['clientVersion']), compat_str, default_client)
513
514 def _extract_api_key(self, ytcfg=None, default_client='web'):
515 return self._ytcfg_get_safe(ytcfg, lambda x: x['INNERTUBE_API_KEY'], compat_str, default_client)
516
517 def _extract_context(self, ytcfg=None, default_client='web'):
518 _get_context = lambda y: try_get(y, lambda x: x['INNERTUBE_CONTEXT'], dict)
519 context = _get_context(ytcfg)
520 if context:
521 return context
522
523 context = _get_context(self._get_default_ytcfg(default_client))
524 if not ytcfg:
525 return context
526
527 # Recreate the client context (required)
528 context['client'].update({
529 'clientVersion': self._extract_client_version(ytcfg, default_client),
530 'clientName': self._extract_client_name(ytcfg, default_client),
531 })
532 visitor_data = try_get(ytcfg, lambda x: x['VISITOR_DATA'], compat_str)
533 if visitor_data:
534 context['client']['visitorData'] = visitor_data
535 return context
536
    # Cached SAPISID cookie value: None = not yet looked up, False = known absent
    _SAPISID = None

    def _generate_sapisidhash_header(self, origin='https://www.youtube.com'):
        """Return a 'SAPISIDHASH <ts>_<sha1>' Authorization header value,
        or None when no SAPISID/__Secure-3PAPISID cookie is available."""
        time_now = round(time.time())
        if self._SAPISID is None:
            yt_cookies = self._get_cookies('https://www.youtube.com')
            # Sometimes SAPISID cookie isn't present but __Secure-3PAPISID is.
            # See: https://github.com/yt-dlp/yt-dlp/issues/393
            sapisid_cookie = dict_get(
                yt_cookies, ('__Secure-3PAPISID', 'SAPISID'))
            if sapisid_cookie and sapisid_cookie.value:
                self._SAPISID = sapisid_cookie.value
                self.write_debug('Extracted SAPISID cookie')
                # SAPISID cookie is required if not already present
                if not yt_cookies.get('SAPISID'):
                    self.write_debug('Copying __Secure-3PAPISID cookie to SAPISID cookie')
                    self._set_cookie(
                        '.youtube.com', 'SAPISID', self._SAPISID, secure=True, expire_time=time_now + 3600)
            else:
                # Remember the negative result so the cookie jar is probed only once
                self._SAPISID = False
        if not self._SAPISID:
            return None
        # SAPISIDHASH algorithm from https://stackoverflow.com/a/32065323
        sapisidhash = hashlib.sha1(
            f'{time_now} {self._SAPISID} {origin}'.encode('utf-8')).hexdigest()
        return f'SAPISIDHASH {time_now}_{sapisidhash}'
563
564 def _call_api(self, ep, query, video_id, fatal=True, headers=None,
565 note='Downloading API JSON', errnote='Unable to download API page',
566 context=None, api_key=None, api_hostname=None, default_client='web'):
567
568 data = {'context': context} if context else {'context': self._extract_context(default_client=default_client)}
569 data.update(query)
570 real_headers = self.generate_api_headers(default_client=default_client)
571 real_headers.update({'content-type': 'application/json'})
572 if headers:
573 real_headers.update(headers)
574 return self._download_json(
575 'https://%s/youtubei/v1/%s' % (api_hostname or self._get_innertube_host(default_client), ep),
576 video_id=video_id, fatal=fatal, note=note, errnote=errnote,
577 data=json.dumps(data).encode('utf8'), headers=real_headers,
578 query={'key': api_key or self._extract_api_key()})
579
580 def extract_yt_initial_data(self, video_id, webpage):
581 return self._parse_json(
582 self._search_regex(
583 (r'%s\s*%s' % (self._YT_INITIAL_DATA_RE, self._YT_INITIAL_BOUNDARY_RE),
584 self._YT_INITIAL_DATA_RE), webpage, 'yt initial data'),
585 video_id)
586
587 def _extract_identity_token(self, webpage, item_id):
588 if not webpage:
589 return None
590 ytcfg = self.extract_ytcfg(item_id, webpage)
591 if ytcfg:
592 token = try_get(ytcfg, lambda x: x['ID_TOKEN'], compat_str)
593 if token:
594 return token
595 return self._search_regex(
596 r'\bID_TOKEN["\']\s*:\s*["\'](.+?)["\']', webpage,
597 'identity token', default=None)
598
599 @staticmethod
600 def _extract_account_syncid(*args):
601 """
602 Extract syncId required to download private playlists of secondary channels
603 @params response and/or ytcfg
604 """
605 for data in args:
606 # ytcfg includes channel_syncid if on secondary channel
607 delegated_sid = try_get(data, lambda x: x['DELEGATED_SESSION_ID'], compat_str)
608 if delegated_sid:
609 return delegated_sid
610 sync_ids = (try_get(
611 data, (lambda x: x['responseContext']['mainAppWebResponseContext']['datasyncId'],
612 lambda x: x['DATASYNC_ID']), compat_str) or '').split("||")
613 if len(sync_ids) >= 2 and sync_ids[1]:
614 # datasyncid is of the form "channel_syncid||user_syncid" for secondary channel
615 # and just "user_syncid||" for primary channel. We only want the channel_syncid
616 return sync_ids[0]
617
618 def extract_ytcfg(self, video_id, webpage):
619 if not webpage:
620 return {}
621 return self._parse_json(
622 self._search_regex(
623 r'ytcfg\.set\s*\(\s*({.+?})\s*\)\s*;', webpage, 'ytcfg',
624 default='{}'), video_id, fatal=False) or {}
625
626 def generate_api_headers(
627 self, ytcfg=None, identity_token=None, account_syncid=None,
628 visitor_data=None, api_hostname=None, default_client='web', session_index=None):
629 origin = 'https://' + (api_hostname if api_hostname else self._get_innertube_host(default_client))
630 headers = {
631 'X-YouTube-Client-Name': compat_str(
632 self._ytcfg_get_safe(ytcfg, lambda x: x['INNERTUBE_CONTEXT_CLIENT_NAME'], default_client=default_client)),
633 'X-YouTube-Client-Version': self._extract_client_version(ytcfg, default_client),
634 'Origin': origin
635 }
636 if not visitor_data and ytcfg:
637 visitor_data = try_get(
638 self._extract_context(ytcfg, default_client), lambda x: x['client']['visitorData'], compat_str)
639 if identity_token:
640 headers['X-Youtube-Identity-Token'] = identity_token
641 if account_syncid:
642 headers['X-Goog-PageId'] = account_syncid
643 if session_index is None and ytcfg:
644 session_index = self._extract_session_index(ytcfg)
645 if account_syncid or session_index is not None:
646 headers['X-Goog-AuthUser'] = session_index if session_index is not None else 0
647 if visitor_data:
648 headers['X-Goog-Visitor-Id'] = visitor_data
649 auth = self._generate_sapisidhash_header(origin)
650 if auth is not None:
651 headers['Authorization'] = auth
652 headers['X-Origin'] = origin
653 return headers
654
655 @staticmethod
656 def _build_api_continuation_query(continuation, ctp=None):
657 query = {
658 'continuation': continuation
659 }
660 # TODO: Inconsistency with clickTrackingParams.
661 # Currently we have a fixed ctp contained within context (from ytcfg)
662 # and a ctp in root query for continuation.
663 if ctp:
664 query['clickTracking'] = {'clickTrackingParams': ctp}
665 return query
666
667 @classmethod
668 def _extract_next_continuation_data(cls, renderer):
669 next_continuation = try_get(
670 renderer, (lambda x: x['continuations'][0]['nextContinuationData'],
671 lambda x: x['continuation']['reloadContinuationData']), dict)
672 if not next_continuation:
673 return
674 continuation = next_continuation.get('continuation')
675 if not continuation:
676 return
677 ctp = next_continuation.get('clickTrackingParams')
678 return cls._build_api_continuation_query(continuation, ctp)
679
680 @classmethod
681 def _extract_continuation_ep_data(cls, continuation_ep: dict):
682 if isinstance(continuation_ep, dict):
683 continuation = try_get(
684 continuation_ep, lambda x: x['continuationCommand']['token'], compat_str)
685 if not continuation:
686 return
687 ctp = continuation_ep.get('clickTrackingParams')
688 return cls._build_api_continuation_query(continuation, ctp)
689
690 @classmethod
691 def _extract_continuation(cls, renderer):
692 next_continuation = cls._extract_next_continuation_data(renderer)
693 if next_continuation:
694 return next_continuation
695
696 contents = []
697 for key in ('contents', 'items'):
698 contents.extend(try_get(renderer, lambda x: x[key], list) or [])
699
700 for content in contents:
701 if not isinstance(content, dict):
702 continue
703 continuation_ep = try_get(
704 content, (lambda x: x['continuationItemRenderer']['continuationEndpoint'],
705 lambda x: x['continuationItemRenderer']['button']['buttonRenderer']['command']),
706 dict)
707 continuation = cls._extract_continuation_ep_data(continuation_ep)
708 if continuation:
709 return continuation
710
711 @classmethod
712 def _extract_alerts(cls, data):
713 for alert_dict in try_get(data, lambda x: x['alerts'], list) or []:
714 if not isinstance(alert_dict, dict):
715 continue
716 for alert in alert_dict.values():
717 alert_type = alert.get('type')
718 if not alert_type:
719 continue
720 message = cls._get_text(alert, 'text')
721 if message:
722 yield alert_type, message
723
724 def _report_alerts(self, alerts, expected=True, fatal=True):
725 errors = []
726 warnings = []
727 for alert_type, alert_message in alerts:
728 if alert_type.lower() == 'error' and fatal:
729 errors.append([alert_type, alert_message])
730 else:
731 warnings.append([alert_type, alert_message])
732
733 for alert_type, alert_message in (warnings + errors[:-1]):
734 self.report_warning('YouTube said: %s - %s' % (alert_type, alert_message))
735 if errors:
736 raise ExtractorError('YouTube said: %s' % errors[-1][1], expected=expected)
737
738 def _extract_and_report_alerts(self, data, *args, **kwargs):
739 return self._report_alerts(self._extract_alerts(data), *args, **kwargs)
740
741 def _extract_badges(self, renderer: dict):
742 badges = set()
743 for badge in try_get(renderer, lambda x: x['badges'], list) or []:
744 label = try_get(badge, lambda x: x['metadataBadgeRenderer']['label'], compat_str)
745 if label:
746 badges.add(label.lower())
747 return badges
748
    @staticmethod
    def _get_text(data, *path_list, max_runs=None):
        """Extract display text from innertube text objects.

        Each *path* in *path_list* is resolved against *data* via traverse_obj;
        the first object yielding a 'simpleText' string or joinable 'runs'
        wins. *max_runs* caps how many runs are concatenated.
        """
        for path in path_list or [None]:
            if path is None:
                # No path given: inspect *data* itself
                obj = [data]
            else:
                obj = traverse_obj(data, path, default=[])
                # A path without branching (no ... or alternative keys) yields
                # a single object rather than a list — wrap for uniform handling
                if not any(key is ... or isinstance(key, (list, tuple)) for key in variadic(path)):
                    obj = [obj]
            for item in obj:
                text = try_get(item, lambda x: x['simpleText'], compat_str)
                if text:
                    return text
                runs = try_get(item, lambda x: x['runs'], list) or []
                if not runs and isinstance(item, list):
                    # The item may already be a bare list of runs
                    runs = item

                runs = runs[:min(len(runs), max_runs or len(runs))]
                text = ''.join(traverse_obj(runs, (..., 'text'), expected_type=str, default=[]))
                if text:
                    return text
770
    def _extract_response(self, item_id, query, note='Downloading API JSON', headers=None,
                          ytcfg=None, check_get_keys=None, ep='browse', fatal=True, api_hostname=None,
                          default_client='web'):
        """Call the innertube API with retries.

        Retries up to the 'extractor_retries' param on network errors (except
        HTTP 403/429) and on responses missing all of *check_get_keys*.
        Returns the parsed response, or None when non-fatal and exhausted.
        """
        response = None
        last_error = None
        count = -1
        retries = self.get_param('extractor_retries', 3)
        if check_get_keys is None:
            check_get_keys = []
        while count < retries:
            count += 1
            if last_error:
                self.report_warning('%s. Retrying ...' % last_error)
            try:
                response = self._call_api(
                    ep=ep, fatal=True, headers=headers,
                    video_id=item_id, query=query,
                    context=self._extract_context(ytcfg, default_client),
                    api_key=self._extract_api_key(ytcfg, default_client),
                    api_hostname=api_hostname, default_client=default_client,
                    note='%s%s' % (note, ' (retry #%d)' % count if count else ''))
            except ExtractorError as e:
                if isinstance(e.cause, network_exceptions):
                    # Surface YouTube's own JSON error message when the error
                    # body is JSON (not an HTML error page)
                    if isinstance(e.cause, compat_HTTPError) and not is_html(e.cause.read(512)):
                        e.cause.seek(0)
                        yt_error = try_get(
                            self._parse_json(e.cause.read().decode(), item_id, fatal=False),
                            lambda x: x['error']['message'], compat_str)
                        if yt_error:
                            self._report_alerts([('ERROR', yt_error)], fatal=False)
                    # Downloading page may result in intermittent 5xx HTTP error
                    # Sometimes a 404 is also received. See: https://github.com/ytdl-org/youtube-dl/issues/28289
                    # We also want to catch all other network exceptions since errors in later pages can be troublesome
                    # See https://github.com/yt-dlp/yt-dlp/issues/507#issuecomment-880188210
                    if not isinstance(e.cause, compat_HTTPError) or e.cause.code not in (403, 429):
                        last_error = error_to_compat_str(e.cause or e)
                        if count < retries:
                            continue
                if fatal:
                    raise
                else:
                    self.report_warning(error_to_compat_str(e))
                    return

            else:
                # Youtube may send alerts if there was an issue with the continuation page
                try:
                    self._extract_and_report_alerts(response, expected=False)
                except ExtractorError as e:
                    if fatal:
                        raise
                    self.report_warning(error_to_compat_str(e))
                    return
                if not check_get_keys or dict_get(response, check_get_keys):
                    break
                # Youtube sometimes sends incomplete data
                # See: https://github.com/ytdl-org/youtube-dl/issues/28194
                last_error = 'Incomplete data received'
                if count >= retries:
                    if fatal:
                        raise ExtractorError(last_error)
                    else:
                        self.report_warning(last_error)
                        return
        return response
836
837 @staticmethod
838 def is_music_url(url):
839 return re.match(r'https?://music\.youtube\.com/', url) is not None
840
841 def _extract_video(self, renderer):
842 video_id = renderer.get('videoId')
843 title = self._get_text(renderer, 'title')
844 description = self._get_text(renderer, 'descriptionSnippet')
845 duration = parse_duration(self._get_text(
846 renderer, 'lengthText', ('thumbnailOverlays', ..., 'thumbnailOverlayTimeStatusRenderer', 'text')))
847 view_count_text = self._get_text(renderer, 'viewCountText') or ''
848 view_count = str_to_int(self._search_regex(
849 r'^([\d,]+)', re.sub(r'\s', '', view_count_text),
850 'view count', default=None))
851
852 uploader = self._get_text(renderer, 'ownerText', 'shortBylineText')
853
854 return {
855 '_type': 'url',
856 'ie_key': YoutubeIE.ie_key(),
857 'id': video_id,
858 'url': video_id,
859 'title': title,
860 'description': description,
861 'duration': duration,
862 'view_count': view_count,
863 'uploader': uploader,
864 }
865
866
867 class YoutubeIE(YoutubeBaseInfoExtractor):
868 IE_DESC = 'YouTube.com'
869 _INVIDIOUS_SITES = (
870 # invidious-redirect websites
871 r'(?:www\.)?redirect\.invidious\.io',
872 r'(?:(?:www|dev)\.)?invidio\.us',
873 # Invidious instances taken from https://github.com/iv-org/documentation/blob/master/Invidious-Instances.md
874 r'(?:www\.)?invidious\.pussthecat\.org',
875 r'(?:www\.)?invidious\.zee\.li',
876 r'(?:www\.)?invidious\.ethibox\.fr',
877 r'(?:www\.)?invidious\.3o7z6yfxhbw7n3za4rss6l434kmv55cgw2vuziwuigpwegswvwzqipyd\.onion',
878 # youtube-dl invidious instances list
879 r'(?:(?:www|no)\.)?invidiou\.sh',
880 r'(?:(?:www|fi)\.)?invidious\.snopyta\.org',
881 r'(?:www\.)?invidious\.kabi\.tk',
882 r'(?:www\.)?invidious\.mastodon\.host',
883 r'(?:www\.)?invidious\.zapashcanon\.fr',
884 r'(?:www\.)?(?:invidious(?:-us)?|piped)\.kavin\.rocks',
885 r'(?:www\.)?invidious\.tinfoil-hat\.net',
886 r'(?:www\.)?invidious\.himiko\.cloud',
887 r'(?:www\.)?invidious\.reallyancient\.tech',
888 r'(?:www\.)?invidious\.tube',
889 r'(?:www\.)?invidiou\.site',
890 r'(?:www\.)?invidious\.site',
891 r'(?:www\.)?invidious\.xyz',
892 r'(?:www\.)?invidious\.nixnet\.xyz',
893 r'(?:www\.)?invidious\.048596\.xyz',
894 r'(?:www\.)?invidious\.drycat\.fr',
895 r'(?:www\.)?inv\.skyn3t\.in',
896 r'(?:www\.)?tube\.poal\.co',
897 r'(?:www\.)?tube\.connect\.cafe',
898 r'(?:www\.)?vid\.wxzm\.sx',
899 r'(?:www\.)?vid\.mint\.lgbt',
900 r'(?:www\.)?vid\.puffyan\.us',
901 r'(?:www\.)?yewtu\.be',
902 r'(?:www\.)?yt\.elukerio\.org',
903 r'(?:www\.)?yt\.lelux\.fi',
904 r'(?:www\.)?invidious\.ggc-project\.de',
905 r'(?:www\.)?yt\.maisputain\.ovh',
906 r'(?:www\.)?ytprivate\.com',
907 r'(?:www\.)?invidious\.13ad\.de',
908 r'(?:www\.)?invidious\.toot\.koeln',
909 r'(?:www\.)?invidious\.fdn\.fr',
910 r'(?:www\.)?watch\.nettohikari\.com',
911 r'(?:www\.)?invidious\.namazso\.eu',
912 r'(?:www\.)?invidious\.silkky\.cloud',
913 r'(?:www\.)?invidious\.exonip\.de',
914 r'(?:www\.)?invidious\.riverside\.rocks',
915 r'(?:www\.)?invidious\.blamefran\.net',
916 r'(?:www\.)?invidious\.moomoo\.de',
917 r'(?:www\.)?ytb\.trom\.tf',
918 r'(?:www\.)?yt\.cyberhost\.uk',
919 r'(?:www\.)?kgg2m7yk5aybusll\.onion',
920 r'(?:www\.)?qklhadlycap4cnod\.onion',
921 r'(?:www\.)?axqzx4s6s54s32yentfqojs3x5i7faxza6xo3ehd4bzzsg2ii4fv2iid\.onion',
922 r'(?:www\.)?c7hqkpkpemu6e7emz5b4vyz7idjgdvgaaa3dyimmeojqbgpea3xqjoid\.onion',
923 r'(?:www\.)?fz253lmuao3strwbfbmx46yu7acac2jz27iwtorgmbqlkurlclmancad\.onion',
924 r'(?:www\.)?invidious\.l4qlywnpwqsluw65ts7md3khrivpirse744un3x7mlskqauz5pyuzgqd\.onion',
925 r'(?:www\.)?owxfohz4kjyv25fvlqilyxast7inivgiktls3th44jhk3ej3i7ya\.b32\.i2p',
926 r'(?:www\.)?4l2dgddgsrkf2ous66i6seeyi6etzfgrue332grh2n7madpwopotugyd\.onion',
927 r'(?:www\.)?w6ijuptxiku4xpnnaetxvnkc5vqcdu7mgns2u77qefoixi63vbvnpnqd\.onion',
928 r'(?:www\.)?kbjggqkzv65ivcqj6bumvp337z6264huv5kpkwuv6gu5yjiskvan7fad\.onion',
929 r'(?:www\.)?grwp24hodrefzvjjuccrkw3mjq4tzhaaq32amf33dzpmuxe7ilepcmad\.onion',
930 r'(?:www\.)?hpniueoejy4opn7bc4ftgazyqjoeqwlvh2uiku2xqku6zpoa4bf5ruid\.onion',
931 )
    # Verbose-mode (?x) URL pattern. Matches every supported URL shape —
    # the main site and its aliases, embeds/shortlinks, the Invidious
    # mirrors interpolated below — as well as a bare 11-character video ID.
    # Group 1 captures the optional scheme/host prefix; the conditional
    # (?(1).+)? only allows trailing junk when that prefix was present.
    _VALID_URL = r"""(?x)^
                     (
                         (?:https?://|//)                                    # http(s):// or protocol-independent URL
                         (?:(?:(?:(?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie|kids)?\.com|
                            (?:www\.)?deturl\.com/www\.youtube\.com|
                            (?:www\.)?pwnyoutube\.com|
                            (?:www\.)?hooktube\.com|
                            (?:www\.)?yourepeat\.com|
                            tube\.majestyc\.net|
                            %(invidious)s|
                            youtube\.googleapis\.com)/                        # the various hostnames, with wildcard subdomains
                         (?:.*?\#/)?                                          # handle anchor (#/) redirect urls
                         (?:                                                  # the various things that can precede the ID:
                             (?:(?:v|embed|e|shorts)/(?!videoseries))         # v/ or embed/ or e/ or shorts/
                             |(?:                                             # or the v= param in all its forms
                                 (?:(?:watch|movie)(?:_popup)?(?:\.php)?/?)?  # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
                                 (?:\?|\#!?)                                  # the params delimiter ? or # or #!
                                 (?:.*?[&;])??                                # any other preceding param (like /?s=tuff&v=xxxx or ?s=tuff&amp;v=V36LpHqtcDY)
                                 v=
                             )
                         ))
                         |(?:
                            youtu\.be|                                        # just youtu.be/xxxx
                            vid\.plus|                                        # or vid.plus/xxxx
                            zwearz\.com/watch|                                # or zwearz.com/watch/xxxx
                            %(invidious)s
                         )/
                         |(?:www\.)?cleanvideosearch\.com/media/action/yt/watch\?videoId=
                         )
                     )?                                                       # all until now is optional -> you can pass the naked ID
                     (?P<id>[0-9A-Za-z_-]{11})                                # here is it! the YouTube video ID
                     (?(1).+)?                                                # if we found the ID, everything can follow
                     (?:\#|$)""" % {
        'invidious': '|'.join(_INVIDIOUS_SITES),
    }
    # Regexes, tried in order, that extract the player id (named group
    # 'id') from the URL of YouTube's player JavaScript file.
    _PLAYER_INFO_RE = (
        r'/s/player/(?P<id>[a-zA-Z0-9_-]{8,})/player',
        r'/(?P<id>[a-zA-Z0-9_-]{8,})/player(?:_ias\.vflset(?:/[a-zA-Z]{2,3}_[a-zA-Z]{2,3})?|-plasma-ias-(?:phone|tablet)-[a-z]{2}_[A-Z]{2}\.vflset)/base\.js$',
        r'\b(?P<id>vfl[a-zA-Z0-9_-]+)\b.*?\.js$',
    )
    # Static metadata for known itags, keyed by itag as a string. Each
    # entry carries container/codec/resolution hints for that itag;
    # presumably merged with what the API reports at extraction time —
    # the consumer is outside this view. Negative 'preference' values
    # de-prioritize niche variants (3D, HLS) in format selection.
    _formats = {
        '5': {'ext': 'flv', 'width': 400, 'height': 240, 'acodec': 'mp3', 'abr': 64, 'vcodec': 'h263'},
        '6': {'ext': 'flv', 'width': 450, 'height': 270, 'acodec': 'mp3', 'abr': 64, 'vcodec': 'h263'},
        '13': {'ext': '3gp', 'acodec': 'aac', 'vcodec': 'mp4v'},
        '17': {'ext': '3gp', 'width': 176, 'height': 144, 'acodec': 'aac', 'abr': 24, 'vcodec': 'mp4v'},
        '18': {'ext': 'mp4', 'width': 640, 'height': 360, 'acodec': 'aac', 'abr': 96, 'vcodec': 'h264'},
        '22': {'ext': 'mp4', 'width': 1280, 'height': 720, 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264'},
        '34': {'ext': 'flv', 'width': 640, 'height': 360, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'},
        '35': {'ext': 'flv', 'width': 854, 'height': 480, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'},
        # itag 36 videos are either 320x180 (BaW_jenozKc) or 320x240 (__2ABJjxzNo), abr varies as well
        '36': {'ext': '3gp', 'width': 320, 'acodec': 'aac', 'vcodec': 'mp4v'},
        '37': {'ext': 'mp4', 'width': 1920, 'height': 1080, 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264'},
        '38': {'ext': 'mp4', 'width': 4096, 'height': 3072, 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264'},
        '43': {'ext': 'webm', 'width': 640, 'height': 360, 'acodec': 'vorbis', 'abr': 128, 'vcodec': 'vp8'},
        '44': {'ext': 'webm', 'width': 854, 'height': 480, 'acodec': 'vorbis', 'abr': 128, 'vcodec': 'vp8'},
        '45': {'ext': 'webm', 'width': 1280, 'height': 720, 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8'},
        '46': {'ext': 'webm', 'width': 1920, 'height': 1080, 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8'},
        '59': {'ext': 'mp4', 'width': 854, 'height': 480, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'},
        '78': {'ext': 'mp4', 'width': 854, 'height': 480, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'},


        # 3D videos
        '82': {'ext': 'mp4', 'height': 360, 'format_note': '3D', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -20},
        '83': {'ext': 'mp4', 'height': 480, 'format_note': '3D', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -20},
        '84': {'ext': 'mp4', 'height': 720, 'format_note': '3D', 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264', 'preference': -20},
        '85': {'ext': 'mp4', 'height': 1080, 'format_note': '3D', 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264', 'preference': -20},
        '100': {'ext': 'webm', 'height': 360, 'format_note': '3D', 'acodec': 'vorbis', 'abr': 128, 'vcodec': 'vp8', 'preference': -20},
        '101': {'ext': 'webm', 'height': 480, 'format_note': '3D', 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8', 'preference': -20},
        '102': {'ext': 'webm', 'height': 720, 'format_note': '3D', 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8', 'preference': -20},

        # Apple HTTP Live Streaming
        '91': {'ext': 'mp4', 'height': 144, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 48, 'vcodec': 'h264', 'preference': -10},
        '92': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 48, 'vcodec': 'h264', 'preference': -10},
        '93': {'ext': 'mp4', 'height': 360, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -10},
        '94': {'ext': 'mp4', 'height': 480, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -10},
        '95': {'ext': 'mp4', 'height': 720, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 256, 'vcodec': 'h264', 'preference': -10},
        '96': {'ext': 'mp4', 'height': 1080, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 256, 'vcodec': 'h264', 'preference': -10},
        '132': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 48, 'vcodec': 'h264', 'preference': -10},
        '151': {'ext': 'mp4', 'height': 72, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 24, 'vcodec': 'h264', 'preference': -10},

        # DASH mp4 video
        '133': {'ext': 'mp4', 'height': 240, 'format_note': 'DASH video', 'vcodec': 'h264'},
        '134': {'ext': 'mp4', 'height': 360, 'format_note': 'DASH video', 'vcodec': 'h264'},
        '135': {'ext': 'mp4', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'h264'},
        '136': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'h264'},
        '137': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'h264'},
        '138': {'ext': 'mp4', 'format_note': 'DASH video', 'vcodec': 'h264'},  # Height can vary (https://github.com/ytdl-org/youtube-dl/issues/4559)
        '160': {'ext': 'mp4', 'height': 144, 'format_note': 'DASH video', 'vcodec': 'h264'},
        '212': {'ext': 'mp4', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'h264'},
        '264': {'ext': 'mp4', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'h264'},
        '298': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'h264', 'fps': 60},
        '299': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'h264', 'fps': 60},
        '266': {'ext': 'mp4', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'h264'},

        # Dash mp4 audio
        '139': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 48, 'container': 'm4a_dash'},
        '140': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 128, 'container': 'm4a_dash'},
        '141': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 256, 'container': 'm4a_dash'},
        '256': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'container': 'm4a_dash'},
        '258': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'container': 'm4a_dash'},
        '325': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'dtse', 'container': 'm4a_dash'},
        '328': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'ec-3', 'container': 'm4a_dash'},

        # Dash webm
        '167': {'ext': 'webm', 'height': 360, 'width': 640, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
        '168': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
        '169': {'ext': 'webm', 'height': 720, 'width': 1280, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
        '170': {'ext': 'webm', 'height': 1080, 'width': 1920, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
        '218': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
        '219': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
        '278': {'ext': 'webm', 'height': 144, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp9'},
        '242': {'ext': 'webm', 'height': 240, 'format_note': 'DASH video', 'vcodec': 'vp9'},
        '243': {'ext': 'webm', 'height': 360, 'format_note': 'DASH video', 'vcodec': 'vp9'},
        '244': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'vp9'},
        '245': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'vp9'},
        '246': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'vp9'},
        '247': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'vp9'},
        '248': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'vp9'},
        '271': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'vp9'},
        # itag 272 videos are either 3840x2160 (e.g. RtoitU2A-3E) or 7680x4320 (sLprVF6d7Ug)
        '272': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'vp9'},
        '302': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60},
        '303': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60},
        '308': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60},
        '313': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'vp9'},
        '315': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60},

        # Dash webm audio
        '171': {'ext': 'webm', 'acodec': 'vorbis', 'format_note': 'DASH audio', 'abr': 128},
        '172': {'ext': 'webm', 'acodec': 'vorbis', 'format_note': 'DASH audio', 'abr': 256},

        # Dash webm audio with opus inside
        '249': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 50},
        '250': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 70},
        '251': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 160},

        # RTMP (unnamed)
        '_rtmp': {'protocol': 'rtmp'},

        # av01 video only formats sometimes served with "unknown" codecs
        '394': {'acodec': 'none', 'vcodec': 'av01.0.05M.08'},
        '395': {'acodec': 'none', 'vcodec': 'av01.0.05M.08'},
        '396': {'acodec': 'none', 'vcodec': 'av01.0.05M.08'},
        '397': {'acodec': 'none', 'vcodec': 'av01.0.05M.08'},
    }
    # Subtitle serialization formats supported for extraction.
    _SUBTITLE_FORMATS = ('json3', 'srv1', 'srv2', 'srv3', 'ttml', 'vtt')

    # Opt out of the generic geo-restriction bypass provided by the base
    # extractor class — NOTE(review): presumably because YouTube
    # geo-handling is done elsewhere in this extractor; confirm against
    # the InfoExtractor base class.
    _GEO_BYPASS = False

    # Extractor name shown to users (e.g. in format/extractor listings).
    IE_NAME = 'youtube'
1082 _TESTS = [
1083 {
1084 'url': 'https://www.youtube.com/watch?v=BaW_jenozKc&t=1s&end=9',
1085 'info_dict': {
1086 'id': 'BaW_jenozKc',
1087 'ext': 'mp4',
1088 'title': 'youtube-dl test video "\'/\\ä↭𝕐',
1089 'uploader': 'Philipp Hagemeister',
1090 'uploader_id': 'phihag',
1091 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/phihag',
1092 'channel_id': 'UCLqxVugv74EIW3VWh2NOa3Q',
1093 'channel_url': r're:https?://(?:www\.)?youtube\.com/channel/UCLqxVugv74EIW3VWh2NOa3Q',
1094 'upload_date': '20121002',
1095 'description': 'test chars: "\'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de .',
1096 'categories': ['Science & Technology'],
1097 'tags': ['youtube-dl'],
1098 'duration': 10,
1099 'view_count': int,
1100 'like_count': int,
1101 'dislike_count': int,
1102 'start_time': 1,
1103 'end_time': 9,
1104 }
1105 },
1106 {
1107 'url': '//www.YouTube.com/watch?v=yZIXLfi8CZQ',
1108 'note': 'Embed-only video (#1746)',
1109 'info_dict': {
1110 'id': 'yZIXLfi8CZQ',
1111 'ext': 'mp4',
1112 'upload_date': '20120608',
1113 'title': 'Principal Sexually Assaults A Teacher - Episode 117 - 8th June 2012',
1114 'description': 'md5:09b78bd971f1e3e289601dfba15ca4f7',
1115 'uploader': 'SET India',
1116 'uploader_id': 'setindia',
1117 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/setindia',
1118 'age_limit': 18,
1119 },
1120 'skip': 'Private video',
1121 },
1122 {
1123 'url': 'https://www.youtube.com/watch?v=BaW_jenozKc&v=yZIXLfi8CZQ',
1124 'note': 'Use the first video ID in the URL',
1125 'info_dict': {
1126 'id': 'BaW_jenozKc',
1127 'ext': 'mp4',
1128 'title': 'youtube-dl test video "\'/\\ä↭𝕐',
1129 'uploader': 'Philipp Hagemeister',
1130 'uploader_id': 'phihag',
1131 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/phihag',
1132 'upload_date': '20121002',
1133 'description': 'test chars: "\'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de .',
1134 'categories': ['Science & Technology'],
1135 'tags': ['youtube-dl'],
1136 'duration': 10,
1137 'view_count': int,
1138 'like_count': int,
1139 'dislike_count': int,
1140 },
1141 'params': {
1142 'skip_download': True,
1143 },
1144 },
1145 {
1146 'url': 'https://www.youtube.com/watch?v=a9LDPn-MO4I',
1147 'note': '256k DASH audio (format 141) via DASH manifest',
1148 'info_dict': {
1149 'id': 'a9LDPn-MO4I',
1150 'ext': 'm4a',
1151 'upload_date': '20121002',
1152 'uploader_id': '8KVIDEO',
1153 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/8KVIDEO',
1154 'description': '',
1155 'uploader': '8KVIDEO',
1156 'title': 'UHDTV TEST 8K VIDEO.mp4'
1157 },
1158 'params': {
1159 'youtube_include_dash_manifest': True,
1160 'format': '141',
1161 },
1162 'skip': 'format 141 not served anymore',
1163 },
1164 # DASH manifest with encrypted signature
1165 {
1166 'url': 'https://www.youtube.com/watch?v=IB3lcPjvWLA',
1167 'info_dict': {
1168 'id': 'IB3lcPjvWLA',
1169 'ext': 'm4a',
1170 'title': 'Afrojack, Spree Wilson - The Spark (Official Music Video) ft. Spree Wilson',
1171 'description': 'md5:8f5e2b82460520b619ccac1f509d43bf',
1172 'duration': 244,
1173 'uploader': 'AfrojackVEVO',
1174 'uploader_id': 'AfrojackVEVO',
1175 'upload_date': '20131011',
1176 'abr': 129.495,
1177 },
1178 'params': {
1179 'youtube_include_dash_manifest': True,
1180 'format': '141/bestaudio[ext=m4a]',
1181 },
1182 },
1183 # Age-gate videos. See https://github.com/yt-dlp/yt-dlp/pull/575#issuecomment-888837000
1184 {
1185 'note': 'Embed allowed age-gate video',
1186 'url': 'https://youtube.com/watch?v=HtVdAasjOgU',
1187 'info_dict': {
1188 'id': 'HtVdAasjOgU',
1189 'ext': 'mp4',
1190 'title': 'The Witcher 3: Wild Hunt - The Sword Of Destiny Trailer',
1191 'description': r're:(?s).{100,}About the Game\n.*?The Witcher 3: Wild Hunt.{100,}',
1192 'duration': 142,
1193 'uploader': 'The Witcher',
1194 'uploader_id': 'WitcherGame',
1195 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/WitcherGame',
1196 'upload_date': '20140605',
1197 'age_limit': 18,
1198 },
1199 },
1200 {
1201 'note': 'Age-gate video with embed allowed in public site',
1202 'url': 'https://youtube.com/watch?v=HsUATh_Nc2U',
1203 'info_dict': {
1204 'id': 'HsUATh_Nc2U',
1205 'ext': 'mp4',
1206 'title': 'Godzilla 2 (Official Video)',
1207 'description': 'md5:bf77e03fcae5529475e500129b05668a',
1208 'upload_date': '20200408',
1209 'uploader_id': 'FlyingKitty900',
1210 'uploader': 'FlyingKitty',
1211 'age_limit': 18,
1212 },
1213 },
1214 {
1215 'note': 'Age-gate video embedable only with clientScreen=EMBED',
1216 'url': 'https://youtube.com/watch?v=Tq92D6wQ1mg',
1217 'info_dict': {
1218 'id': 'Tq92D6wQ1mg',
1219 'title': '[MMD] Adios - EVERGLOW [+Motion DL]',
1220 'ext': 'mp4',
1221 'upload_date': '20191227',
1222 'uploader_id': 'UC1yoRdFoFJaCY-AGfD9W0wQ',
1223 'uploader': 'Projekt Melody',
1224 'description': 'md5:17eccca93a786d51bc67646756894066',
1225 'age_limit': 18,
1226 },
1227 },
1228 {
1229 'note': 'Non-Agegated non-embeddable video',
1230 'url': 'https://youtube.com/watch?v=MeJVWBSsPAY',
1231 'info_dict': {
1232 'id': 'MeJVWBSsPAY',
1233 'ext': 'mp4',
1234 'title': 'OOMPH! - Such Mich Find Mich (Lyrics)',
1235 'uploader': 'Herr Lurik',
1236 'uploader_id': 'st3in234',
1237 'description': 'Fan Video. Music & Lyrics by OOMPH!.',
1238 'upload_date': '20130730',
1239 },
1240 },
1241 {
1242 'note': 'Non-bypassable age-gated video',
1243 'url': 'https://youtube.com/watch?v=Cr381pDsSsA',
1244 'only_matching': True,
1245 },
1246 # video_info is None (https://github.com/ytdl-org/youtube-dl/issues/4421)
1247 # YouTube Red ad is not captured for creator
1248 {
1249 'url': '__2ABJjxzNo',
1250 'info_dict': {
1251 'id': '__2ABJjxzNo',
1252 'ext': 'mp4',
1253 'duration': 266,
1254 'upload_date': '20100430',
1255 'uploader_id': 'deadmau5',
1256 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/deadmau5',
1257 'creator': 'deadmau5',
1258 'description': 'md5:6cbcd3a92ce1bc676fc4d6ab4ace2336',
1259 'uploader': 'deadmau5',
1260 'title': 'Deadmau5 - Some Chords (HD)',
1261 'alt_title': 'Some Chords',
1262 },
1263 'expected_warnings': [
1264 'DASH manifest missing',
1265 ]
1266 },
1267 # Olympics (https://github.com/ytdl-org/youtube-dl/issues/4431)
1268 {
1269 'url': 'lqQg6PlCWgI',
1270 'info_dict': {
1271 'id': 'lqQg6PlCWgI',
1272 'ext': 'mp4',
1273 'duration': 6085,
1274 'upload_date': '20150827',
1275 'uploader_id': 'olympic',
1276 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/olympic',
1277 'description': 'HO09 - Women - GER-AUS - Hockey - 31 July 2012 - London 2012 Olympic Games',
1278 'uploader': 'Olympics',
1279 'title': 'Hockey - Women - GER-AUS - London 2012 Olympic Games',
1280 },
1281 'params': {
1282 'skip_download': 'requires avconv',
1283 }
1284 },
1285 # Non-square pixels
1286 {
1287 'url': 'https://www.youtube.com/watch?v=_b-2C3KPAM0',
1288 'info_dict': {
1289 'id': '_b-2C3KPAM0',
1290 'ext': 'mp4',
1291 'stretched_ratio': 16 / 9.,
1292 'duration': 85,
1293 'upload_date': '20110310',
1294 'uploader_id': 'AllenMeow',
1295 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/AllenMeow',
1296 'description': 'made by Wacom from Korea | 字幕&加油添醋 by TY\'s Allen | 感謝heylisa00cavey1001同學熱情提供梗及翻譯',
1297 'uploader': '孫ᄋᄅ',
1298 'title': '[A-made] 變態妍字幕版 太妍 我就是這樣的人',
1299 },
1300 },
1301 # url_encoded_fmt_stream_map is empty string
1302 {
1303 'url': 'qEJwOuvDf7I',
1304 'info_dict': {
1305 'id': 'qEJwOuvDf7I',
1306 'ext': 'webm',
1307 'title': 'Обсуждение судебной практики по выборам 14 сентября 2014 года в Санкт-Петербурге',
1308 'description': '',
1309 'upload_date': '20150404',
1310 'uploader_id': 'spbelect',
1311 'uploader': 'Наблюдатели Петербурга',
1312 },
1313 'params': {
1314 'skip_download': 'requires avconv',
1315 },
1316 'skip': 'This live event has ended.',
1317 },
1318 # Extraction from multiple DASH manifests (https://github.com/ytdl-org/youtube-dl/pull/6097)
1319 {
1320 'url': 'https://www.youtube.com/watch?v=FIl7x6_3R5Y',
1321 'info_dict': {
1322 'id': 'FIl7x6_3R5Y',
1323 'ext': 'webm',
1324 'title': 'md5:7b81415841e02ecd4313668cde88737a',
1325 'description': 'md5:116377fd2963b81ec4ce64b542173306',
1326 'duration': 220,
1327 'upload_date': '20150625',
1328 'uploader_id': 'dorappi2000',
1329 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/dorappi2000',
1330 'uploader': 'dorappi2000',
1331 'formats': 'mincount:31',
1332 },
1333 'skip': 'not actual anymore',
1334 },
1335 # DASH manifest with segment_list
1336 {
1337 'url': 'https://www.youtube.com/embed/CsmdDsKjzN8',
1338 'md5': '8ce563a1d667b599d21064e982ab9e31',
1339 'info_dict': {
1340 'id': 'CsmdDsKjzN8',
1341 'ext': 'mp4',
1342 'upload_date': '20150501', # According to '<meta itemprop="datePublished"', but in other places it's 20150510
1343 'uploader': 'Airtek',
1344 'description': 'Retransmisión en directo de la XVIII media maratón de Zaragoza.',
1345 'uploader_id': 'UCzTzUmjXxxacNnL8I3m4LnQ',
1346 'title': 'Retransmisión XVIII Media maratón Zaragoza 2015',
1347 },
1348 'params': {
1349 'youtube_include_dash_manifest': True,
1350 'format': '135', # bestvideo
1351 },
1352 'skip': 'This live event has ended.',
1353 },
1354 {
1355 # Multifeed videos (multiple cameras), URL is for Main Camera
1356 'url': 'https://www.youtube.com/watch?v=jvGDaLqkpTg',
1357 'info_dict': {
1358 'id': 'jvGDaLqkpTg',
1359 'title': 'Tom Clancy Free Weekend Rainbow Whatever',
1360 'description': 'md5:e03b909557865076822aa169218d6a5d',
1361 },
1362 'playlist': [{
1363 'info_dict': {
1364 'id': 'jvGDaLqkpTg',
1365 'ext': 'mp4',
1366 'title': 'Tom Clancy Free Weekend Rainbow Whatever (Main Camera)',
1367 'description': 'md5:e03b909557865076822aa169218d6a5d',
1368 'duration': 10643,
1369 'upload_date': '20161111',
1370 'uploader': 'Team PGP',
1371 'uploader_id': 'UChORY56LMMETTuGjXaJXvLg',
1372 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UChORY56LMMETTuGjXaJXvLg',
1373 },
1374 }, {
1375 'info_dict': {
1376 'id': '3AKt1R1aDnw',
1377 'ext': 'mp4',
1378 'title': 'Tom Clancy Free Weekend Rainbow Whatever (Camera 2)',
1379 'description': 'md5:e03b909557865076822aa169218d6a5d',
1380 'duration': 10991,
1381 'upload_date': '20161111',
1382 'uploader': 'Team PGP',
1383 'uploader_id': 'UChORY56LMMETTuGjXaJXvLg',
1384 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UChORY56LMMETTuGjXaJXvLg',
1385 },
1386 }, {
1387 'info_dict': {
1388 'id': 'RtAMM00gpVc',
1389 'ext': 'mp4',
1390 'title': 'Tom Clancy Free Weekend Rainbow Whatever (Camera 3)',
1391 'description': 'md5:e03b909557865076822aa169218d6a5d',
1392 'duration': 10995,
1393 'upload_date': '20161111',
1394 'uploader': 'Team PGP',
1395 'uploader_id': 'UChORY56LMMETTuGjXaJXvLg',
1396 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UChORY56LMMETTuGjXaJXvLg',
1397 },
1398 }, {
1399 'info_dict': {
1400 'id': '6N2fdlP3C5U',
1401 'ext': 'mp4',
1402 'title': 'Tom Clancy Free Weekend Rainbow Whatever (Camera 4)',
1403 'description': 'md5:e03b909557865076822aa169218d6a5d',
1404 'duration': 10990,
1405 'upload_date': '20161111',
1406 'uploader': 'Team PGP',
1407 'uploader_id': 'UChORY56LMMETTuGjXaJXvLg',
1408 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UChORY56LMMETTuGjXaJXvLg',
1409 },
1410 }],
1411 'params': {
1412 'skip_download': True,
1413 },
1414 'skip': 'Not multifeed anymore',
1415 },
1416 {
1417 # Multifeed video with comma in title (see https://github.com/ytdl-org/youtube-dl/issues/8536)
1418 'url': 'https://www.youtube.com/watch?v=gVfLd0zydlo',
1419 'info_dict': {
1420 'id': 'gVfLd0zydlo',
1421 'title': 'DevConf.cz 2016 Day 2 Workshops 1 14:00 - 15:30',
1422 },
1423 'playlist_count': 2,
1424 'skip': 'Not multifeed anymore',
1425 },
1426 {
1427 'url': 'https://vid.plus/FlRa-iH7PGw',
1428 'only_matching': True,
1429 },
1430 {
1431 'url': 'https://zwearz.com/watch/9lWxNJF-ufM/electra-woman-dyna-girl-official-trailer-grace-helbig.html',
1432 'only_matching': True,
1433 },
1434 {
1435 # Title with JS-like syntax "};" (see https://github.com/ytdl-org/youtube-dl/issues/7468)
1436 # Also tests cut-off URL expansion in video description (see
1437 # https://github.com/ytdl-org/youtube-dl/issues/1892,
1438 # https://github.com/ytdl-org/youtube-dl/issues/8164)
1439 'url': 'https://www.youtube.com/watch?v=lsguqyKfVQg',
1440 'info_dict': {
1441 'id': 'lsguqyKfVQg',
1442 'ext': 'mp4',
1443 'title': '{dark walk}; Loki/AC/Dishonored; collab w/Elflover21',
1444 'alt_title': 'Dark Walk',
1445 'description': 'md5:8085699c11dc3f597ce0410b0dcbb34a',
1446 'duration': 133,
1447 'upload_date': '20151119',
1448 'uploader_id': 'IronSoulElf',
1449 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/IronSoulElf',
1450 'uploader': 'IronSoulElf',
1451 'creator': 'Todd Haberman;\nDaniel Law Heath and Aaron Kaplan',
1452 'track': 'Dark Walk',
1453 'artist': 'Todd Haberman;\nDaniel Law Heath and Aaron Kaplan',
1454 'album': 'Position Music - Production Music Vol. 143 - Dark Walk',
1455 },
1456 'params': {
1457 'skip_download': True,
1458 },
1459 },
1460 {
1461 # Tags with '};' (see https://github.com/ytdl-org/youtube-dl/issues/7468)
1462 'url': 'https://www.youtube.com/watch?v=Ms7iBXnlUO8',
1463 'only_matching': True,
1464 },
1465 {
1466 # Video with yt:stretch=17:0
1467 'url': 'https://www.youtube.com/watch?v=Q39EVAstoRM',
1468 'info_dict': {
1469 'id': 'Q39EVAstoRM',
1470 'ext': 'mp4',
1471 'title': 'Clash Of Clans#14 Dicas De Ataque Para CV 4',
1472 'description': 'md5:ee18a25c350637c8faff806845bddee9',
1473 'upload_date': '20151107',
1474 'uploader_id': 'UCCr7TALkRbo3EtFzETQF1LA',
1475 'uploader': 'CH GAMER DROID',
1476 },
1477 'params': {
1478 'skip_download': True,
1479 },
1480 'skip': 'This video does not exist.',
1481 },
1482 {
1483 # Video with incomplete 'yt:stretch=16:'
1484 'url': 'https://www.youtube.com/watch?v=FRhJzUSJbGI',
1485 'only_matching': True,
1486 },
1487 {
1488 # Video licensed under Creative Commons
1489 'url': 'https://www.youtube.com/watch?v=M4gD1WSo5mA',
1490 'info_dict': {
1491 'id': 'M4gD1WSo5mA',
1492 'ext': 'mp4',
1493 'title': 'md5:e41008789470fc2533a3252216f1c1d1',
1494 'description': 'md5:a677553cf0840649b731a3024aeff4cc',
1495 'duration': 721,
1496 'upload_date': '20150127',
1497 'uploader_id': 'BerkmanCenter',
1498 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/BerkmanCenter',
1499 'uploader': 'The Berkman Klein Center for Internet & Society',
1500 'license': 'Creative Commons Attribution license (reuse allowed)',
1501 },
1502 'params': {
1503 'skip_download': True,
1504 },
1505 },
1506 {
1507 # Channel-like uploader_url
1508 'url': 'https://www.youtube.com/watch?v=eQcmzGIKrzg',
1509 'info_dict': {
1510 'id': 'eQcmzGIKrzg',
1511 'ext': 'mp4',
1512 'title': 'Democratic Socialism and Foreign Policy | Bernie Sanders',
1513 'description': 'md5:13a2503d7b5904ef4b223aa101628f39',
1514 'duration': 4060,
1515 'upload_date': '20151119',
1516 'uploader': 'Bernie Sanders',
1517 'uploader_id': 'UCH1dpzjCEiGAt8CXkryhkZg',
1518 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UCH1dpzjCEiGAt8CXkryhkZg',
1519 'license': 'Creative Commons Attribution license (reuse allowed)',
1520 },
1521 'params': {
1522 'skip_download': True,
1523 },
1524 },
1525 {
1526 'url': 'https://www.youtube.com/watch?feature=player_embedded&amp;amp;v=V36LpHqtcDY',
1527 'only_matching': True,
1528 },
1529 {
1530 # YouTube Red paid video (https://github.com/ytdl-org/youtube-dl/issues/10059)
1531 'url': 'https://www.youtube.com/watch?v=i1Ko8UG-Tdo',
1532 'only_matching': True,
1533 },
1534 {
1535 # Rental video preview
1536 'url': 'https://www.youtube.com/watch?v=yYr8q0y5Jfg',
1537 'info_dict': {
1538 'id': 'uGpuVWrhIzE',
1539 'ext': 'mp4',
1540 'title': 'Piku - Trailer',
1541 'description': 'md5:c36bd60c3fd6f1954086c083c72092eb',
1542 'upload_date': '20150811',
1543 'uploader': 'FlixMatrix',
1544 'uploader_id': 'FlixMatrixKaravan',
1545 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/FlixMatrixKaravan',
1546 'license': 'Standard YouTube License',
1547 },
1548 'params': {
1549 'skip_download': True,
1550 },
1551 'skip': 'This video is not available.',
1552 },
1553 {
1554 # YouTube Red video with episode data
1555 'url': 'https://www.youtube.com/watch?v=iqKdEhx-dD4',
1556 'info_dict': {
1557 'id': 'iqKdEhx-dD4',
1558 'ext': 'mp4',
1559 'title': 'Isolation - Mind Field (Ep 1)',
1560 'description': 'md5:f540112edec5d09fc8cc752d3d4ba3cd',
1561 'duration': 2085,
1562 'upload_date': '20170118',
1563 'uploader': 'Vsauce',
1564 'uploader_id': 'Vsauce',
1565 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/Vsauce',
1566 'series': 'Mind Field',
1567 'season_number': 1,
1568 'episode_number': 1,
1569 },
1570 'params': {
1571 'skip_download': True,
1572 },
1573 'expected_warnings': [
1574 'Skipping DASH manifest',
1575 ],
1576 },
1577 {
1578 # The following content has been identified by the YouTube community
1579 # as inappropriate or offensive to some audiences.
1580 'url': 'https://www.youtube.com/watch?v=6SJNVb0GnPI',
1581 'info_dict': {
1582 'id': '6SJNVb0GnPI',
1583 'ext': 'mp4',
1584 'title': 'Race Differences in Intelligence',
1585 'description': 'md5:5d161533167390427a1f8ee89a1fc6f1',
1586 'duration': 965,
1587 'upload_date': '20140124',
1588 'uploader': 'New Century Foundation',
1589 'uploader_id': 'UCEJYpZGqgUob0zVVEaLhvVg',
1590 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UCEJYpZGqgUob0zVVEaLhvVg',
1591 },
1592 'params': {
1593 'skip_download': True,
1594 },
1595 'skip': 'This video has been removed for violating YouTube\'s policy on hate speech.',
1596 },
1597 {
1598 # itag 212
1599 'url': '1t24XAntNCY',
1600 'only_matching': True,
1601 },
1602 {
1603 # geo restricted to JP
1604 'url': 'sJL6WA-aGkQ',
1605 'only_matching': True,
1606 },
1607 {
1608 'url': 'https://invidio.us/watch?v=BaW_jenozKc',
1609 'only_matching': True,
1610 },
1611 {
1612 'url': 'https://redirect.invidious.io/watch?v=BaW_jenozKc',
1613 'only_matching': True,
1614 },
1615 {
1616 # from https://nitter.pussthecat.org/YouTube/status/1360363141947944964#m
1617 'url': 'https://redirect.invidious.io/Yh0AhrY9GjA',
1618 'only_matching': True,
1619 },
1620 {
1621 # DRM protected
1622 'url': 'https://www.youtube.com/watch?v=s7_qI6_mIXc',
1623 'only_matching': True,
1624 },
1625 {
1626 # Video with unsupported adaptive stream type formats
1627 'url': 'https://www.youtube.com/watch?v=Z4Vy8R84T1U',
1628 'info_dict': {
1629 'id': 'Z4Vy8R84T1U',
1630 'ext': 'mp4',
1631 'title': 'saman SMAN 53 Jakarta(Sancety) opening COFFEE4th at SMAN 53 Jakarta',
1632 'description': 'md5:d41d8cd98f00b204e9800998ecf8427e',
1633 'duration': 433,
1634 'upload_date': '20130923',
1635 'uploader': 'Amelia Putri Harwita',
1636 'uploader_id': 'UCpOxM49HJxmC1qCalXyB3_Q',
1637 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UCpOxM49HJxmC1qCalXyB3_Q',
1638 'formats': 'maxcount:10',
1639 },
1640 'params': {
1641 'skip_download': True,
1642 'youtube_include_dash_manifest': False,
1643 },
1644 'skip': 'not actual anymore',
1645 },
1646 {
1647 # Youtube Music Auto-generated description
1648 'url': 'https://music.youtube.com/watch?v=MgNrAu2pzNs',
1649 'info_dict': {
1650 'id': 'MgNrAu2pzNs',
1651 'ext': 'mp4',
1652 'title': 'Voyeur Girl',
1653 'description': 'md5:7ae382a65843d6df2685993e90a8628f',
1654 'upload_date': '20190312',
1655 'uploader': 'Stephen - Topic',
1656 'uploader_id': 'UC-pWHpBjdGG69N9mM2auIAA',
1657 'artist': 'Stephen',
1658 'track': 'Voyeur Girl',
1659 'album': 'it\'s too much love to know my dear',
1660 'release_date': '20190313',
1661 'release_year': 2019,
1662 },
1663 'params': {
1664 'skip_download': True,
1665 },
1666 },
1667 {
1668 'url': 'https://www.youtubekids.com/watch?v=3b8nCWDgZ6Q',
1669 'only_matching': True,
1670 },
1671 {
1672 # invalid -> valid video id redirection
1673 'url': 'DJztXj2GPfl',
1674 'info_dict': {
1675 'id': 'DJztXj2GPfk',
1676 'ext': 'mp4',
1677 'title': 'Panjabi MC - Mundian To Bach Ke (The Dictator Soundtrack)',
1678 'description': 'md5:bf577a41da97918e94fa9798d9228825',
1679 'upload_date': '20090125',
1680 'uploader': 'Prochorowka',
1681 'uploader_id': 'Prochorowka',
1682 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/Prochorowka',
1683 'artist': 'Panjabi MC',
1684 'track': 'Beware of the Boys (Mundian to Bach Ke) - Motivo Hi-Lectro Remix',
1685 'album': 'Beware of the Boys (Mundian To Bach Ke)',
1686 },
1687 'params': {
1688 'skip_download': True,
1689 },
1690 'skip': 'Video unavailable',
1691 },
1692 {
1693 # empty description results in an empty string
1694 'url': 'https://www.youtube.com/watch?v=x41yOUIvK2k',
1695 'info_dict': {
1696 'id': 'x41yOUIvK2k',
1697 'ext': 'mp4',
1698 'title': 'IMG 3456',
1699 'description': '',
1700 'upload_date': '20170613',
1701 'uploader_id': 'ElevageOrVert',
1702 'uploader': 'ElevageOrVert',
1703 },
1704 'params': {
1705 'skip_download': True,
1706 },
1707 },
1708 {
1709 # with '};' inside yt initial data (see [1])
1710 # see [2] for an example with '};' inside ytInitialPlayerResponse
1711 # 1. https://github.com/ytdl-org/youtube-dl/issues/27093
1712 # 2. https://github.com/ytdl-org/youtube-dl/issues/27216
1713 'url': 'https://www.youtube.com/watch?v=CHqg6qOn4no',
1714 'info_dict': {
1715 'id': 'CHqg6qOn4no',
1716 'ext': 'mp4',
1717 'title': 'Part 77 Sort a list of simple types in c#',
1718 'description': 'md5:b8746fa52e10cdbf47997903f13b20dc',
1719 'upload_date': '20130831',
1720 'uploader_id': 'kudvenkat',
1721 'uploader': 'kudvenkat',
1722 },
1723 'params': {
1724 'skip_download': True,
1725 },
1726 },
1727 {
1728 # another example of '};' in ytInitialData
1729 'url': 'https://www.youtube.com/watch?v=gVfgbahppCY',
1730 'only_matching': True,
1731 },
1732 {
1733 'url': 'https://www.youtube.com/watch_popup?v=63RmMXCd_bQ',
1734 'only_matching': True,
1735 },
1736 {
1737 # https://github.com/ytdl-org/youtube-dl/pull/28094
1738 'url': 'OtqTfy26tG0',
1739 'info_dict': {
1740 'id': 'OtqTfy26tG0',
1741 'ext': 'mp4',
1742 'title': 'Burn Out',
1743 'description': 'md5:8d07b84dcbcbfb34bc12a56d968b6131',
1744 'upload_date': '20141120',
1745 'uploader': 'The Cinematic Orchestra - Topic',
1746 'uploader_id': 'UCIzsJBIyo8hhpFm1NK0uLgw',
1747 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UCIzsJBIyo8hhpFm1NK0uLgw',
1748 'artist': 'The Cinematic Orchestra',
1749 'track': 'Burn Out',
1750 'album': 'Every Day',
1751 'release_data': None,
1752 'release_year': None,
1753 },
1754 'params': {
1755 'skip_download': True,
1756 },
1757 },
1758 {
1759 # controversial video, only works with bpctr when authenticated with cookies
1760 'url': 'https://www.youtube.com/watch?v=nGC3D_FkCmg',
1761 'only_matching': True,
1762 },
1763 {
1764 # controversial video, requires bpctr/contentCheckOk
1765 'url': 'https://www.youtube.com/watch?v=SZJvDhaSDnc',
1766 'info_dict': {
1767 'id': 'SZJvDhaSDnc',
1768 'ext': 'mp4',
1769 'title': 'San Diego teen commits suicide after bullying over embarrassing video',
1770 'channel_id': 'UC-SJ6nODDmufqBzPBwCvYvQ',
1771 'uploader': 'CBS This Morning',
1772 'uploader_id': 'CBSThisMorning',
1773 'upload_date': '20140716',
1774 'description': 'md5:acde3a73d3f133fc97e837a9f76b53b7'
1775 }
1776 },
1777 {
1778 # restricted location, https://github.com/ytdl-org/youtube-dl/issues/28685
1779 'url': 'cBvYw8_A0vQ',
1780 'info_dict': {
1781 'id': 'cBvYw8_A0vQ',
1782 'ext': 'mp4',
1783 'title': '4K Ueno Okachimachi Street Scenes 上野御徒町歩き',
1784 'description': 'md5:ea770e474b7cd6722b4c95b833c03630',
1785 'upload_date': '20201120',
1786 'uploader': 'Walk around Japan',
1787 'uploader_id': 'UC3o_t8PzBmXf5S9b7GLx1Mw',
1788 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UC3o_t8PzBmXf5S9b7GLx1Mw',
1789 },
1790 'params': {
1791 'skip_download': True,
1792 },
1793 }, {
1794 # Has multiple audio streams
1795 'url': 'WaOKSUlf4TM',
1796 'only_matching': True
1797 }, {
1798 # Requires Premium: has format 141 when requested using YTM url
1799 'url': 'https://music.youtube.com/watch?v=XclachpHxis',
1800 'only_matching': True
1801 }, {
1802 # multiple subtitles with same lang_code
1803 'url': 'https://www.youtube.com/watch?v=wsQiKKfKxug',
1804 'only_matching': True,
1805 }, {
1806 # Force use android client fallback
1807 'url': 'https://www.youtube.com/watch?v=YOelRv7fMxY',
1808 'info_dict': {
1809 'id': 'YOelRv7fMxY',
1810 'title': 'DIGGING A SECRET TUNNEL Part 1',
1811 'ext': '3gp',
1812 'upload_date': '20210624',
1813 'channel_id': 'UCp68_FLety0O-n9QU6phsgw',
1814 'uploader': 'colinfurze',
1815 'uploader_id': 'colinfurze',
1816 'channel_url': r're:https?://(?:www\.)?youtube\.com/channel/UCp68_FLety0O-n9QU6phsgw',
1817 'description': 'md5:b5096f56af7ccd7a555c84db81738b22'
1818 },
1819 'params': {
1820 'format': '17', # 3gp format available on android
1821 'extractor_args': {'youtube': {'player_client': ['android']}},
1822 },
1823 },
1824 {
1825 # Skip download of additional client configs (remix client config in this case)
1826 'url': 'https://music.youtube.com/watch?v=MgNrAu2pzNs',
1827 'only_matching': True,
1828 'params': {
1829 'extractor_args': {'youtube': {'player_skip': ['configs']}},
1830 },
1831 }, {
1832 # shorts
1833 'url': 'https://www.youtube.com/shorts/BGQWPY4IigY',
1834 'only_matching': True,
1835 },
1836 ]
1837
1838 @classmethod
1839 def suitable(cls, url):
1840 from ..utils import parse_qs
1841
1842 qs = parse_qs(url)
1843 if qs.get('list', [None])[0]:
1844 return False
1845 return super(YoutubeIE, cls).suitable(url)
1846
    def __init__(self, *args, **kwargs):
        # Per-instance caches used by the signature-decryption machinery:
        #   _code_cache: player id -> downloaded player JS source
        #   _player_cache: (player_url, sig spec id) -> extracted signature function
        super(YoutubeIE, self).__init__(*args, **kwargs)
        self._code_cache = {}
        self._player_cache = {}
1851
1852 def _extract_player_url(self, ytcfg=None, webpage=None):
1853 player_url = try_get(ytcfg, (lambda x: x['PLAYER_JS_URL']), str)
1854 if not player_url and webpage:
1855 player_url = self._search_regex(
1856 r'"(?:PLAYER_JS_URL|jsUrl)"\s*:\s*"([^"]+)"',
1857 webpage, 'player URL', fatal=False)
1858 if not player_url:
1859 return None
1860 if player_url.startswith('//'):
1861 player_url = 'https:' + player_url
1862 elif not re.match(r'https?://', player_url):
1863 player_url = compat_urlparse.urljoin(
1864 'https://www.youtube.com', player_url)
1865 return player_url
1866
1867 def _signature_cache_id(self, example_sig):
1868 """ Return a string representation of a signature """
1869 return '.'.join(compat_str(len(part)) for part in example_sig.split('.'))
1870
1871 @classmethod
1872 def _extract_player_info(cls, player_url):
1873 for player_re in cls._PLAYER_INFO_RE:
1874 id_m = re.search(player_re, player_url)
1875 if id_m:
1876 break
1877 else:
1878 raise ExtractorError('Cannot identify player %r' % player_url)
1879 return id_m.group('id')
1880
1881 def _load_player(self, video_id, player_url, fatal=True) -> bool:
1882 player_id = self._extract_player_info(player_url)
1883 if player_id not in self._code_cache:
1884 self._code_cache[player_id] = self._download_webpage(
1885 player_url, video_id, fatal=fatal,
1886 note='Downloading player ' + player_id,
1887 errnote='Download of %s failed' % player_url)
1888 return player_id in self._code_cache
1889
1890 def _extract_signature_function(self, video_id, player_url, example_sig):
1891 player_id = self._extract_player_info(player_url)
1892
1893 # Read from filesystem cache
1894 func_id = 'js_%s_%s' % (
1895 player_id, self._signature_cache_id(example_sig))
1896 assert os.path.basename(func_id) == func_id
1897
1898 cache_spec = self._downloader.cache.load('youtube-sigfuncs', func_id)
1899 if cache_spec is not None:
1900 return lambda s: ''.join(s[i] for i in cache_spec)
1901
1902 if self._load_player(video_id, player_url):
1903 code = self._code_cache[player_id]
1904 res = self._parse_sig_js(code)
1905
1906 test_string = ''.join(map(compat_chr, range(len(example_sig))))
1907 cache_res = res(test_string)
1908 cache_spec = [ord(c) for c in cache_res]
1909
1910 self._downloader.cache.store('youtube-sigfuncs', func_id, cache_spec)
1911 return res
1912
    def _print_sig_code(self, func, example_sig):
        """Print Python code equivalent to the extracted signature function.

        Runs `func` on a probe string, recovers the index permutation it
        performs, and prints it back as compact slice/index expressions
        (used with --youtube-print-sig-code for debugging/porting).
        """
        def gen_sig_code(idxs):
            def _genslice(start, end, step):
                # Render a contiguous run of indices as a Python slice expression
                starts = '' if start == 0 else str(start)
                ends = (':%d' % (end + step)) if end + step >= 0 else ':'
                steps = '' if step == 1 else (':%d' % step)
                return 's[%s%s%s]' % (starts, ends, steps)

            step = None
            # Quelch pyflakes warnings - start will be set when step is set
            start = '(Never used)'
            for i, prev in zip(idxs[1:], idxs[:-1]):
                if step is not None:
                    # Currently inside a run: extend it, or flush it as a slice
                    if i - prev == step:
                        continue
                    yield _genslice(start, prev, step)
                    step = None
                    continue
                if i - prev in [-1, 1]:
                    # Two adjacent indices start a new ascending/descending run
                    step = i - prev
                    start = prev
                    continue
                else:
                    yield 's[%d]' % prev
            # Flush the final element or the final open run
            if step is None:
                yield 's[%d]' % i
            else:
                yield _genslice(start, i, step)

        # Probe with the characters 0..n-1 so the output reveals the permutation
        test_string = ''.join(map(compat_chr, range(len(example_sig))))
        cache_res = func(test_string)
        cache_spec = [ord(c) for c in cache_res]
        expr_code = ' + '.join(gen_sig_code(cache_spec))
        signature_id_tuple = '(%s)' % (
            ', '.join(compat_str(len(p)) for p in example_sig.split('.')))
        code = ('if tuple(len(p) for p in s.split(\'.\')) == %s:\n'
                '    return %s\n') % (signature_id_tuple, expr_code)
        self.to_screen('Extracted signature function:\n' + code)
1951
    def _parse_sig_js(self, jscode):
        """Locate the signature-scrambling function inside the player JS.

        Returns a callable that maps a scrambled signature string to the
        decrypted one, backed by the pure-Python JSInterpreter.
        """
        funcname = self._search_regex(
            (r'\b[cs]\s*&&\s*[adf]\.set\([^,]+\s*,\s*encodeURIComponent\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\(',
             r'\b[a-zA-Z0-9]+\s*&&\s*[a-zA-Z0-9]+\.set\([^,]+\s*,\s*encodeURIComponent\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\(',
             r'\bm=(?P<sig>[a-zA-Z0-9$]{2,})\(decodeURIComponent\(h\.s\)\)',
             r'\bc&&\(c=(?P<sig>[a-zA-Z0-9$]{2,})\(decodeURIComponent\(c\)\)',
             r'(?:\b|[^a-zA-Z0-9$])(?P<sig>[a-zA-Z0-9$]{2,})\s*=\s*function\(\s*a\s*\)\s*{\s*a\s*=\s*a\.split\(\s*""\s*\);[a-zA-Z0-9$]{2}\.[a-zA-Z0-9$]{2}\(a,\d+\)',
             r'(?:\b|[^a-zA-Z0-9$])(?P<sig>[a-zA-Z0-9$]{2,})\s*=\s*function\(\s*a\s*\)\s*{\s*a\s*=\s*a\.split\(\s*""\s*\)',
             r'(?P<sig>[a-zA-Z0-9$]+)\s*=\s*function\(\s*a\s*\)\s*{\s*a\s*=\s*a\.split\(\s*""\s*\)',
             # Obsolete patterns
             r'(["\'])signature\1\s*,\s*(?P<sig>[a-zA-Z0-9$]+)\(',
             r'\.sig\|\|(?P<sig>[a-zA-Z0-9$]+)\(',
             r'yt\.akamaized\.net/\)\s*\|\|\s*.*?\s*[cs]\s*&&\s*[adf]\.set\([^,]+\s*,\s*(?:encodeURIComponent\s*\()?\s*(?P<sig>[a-zA-Z0-9$]+)\(',
             r'\b[cs]\s*&&\s*[adf]\.set\([^,]+\s*,\s*(?P<sig>[a-zA-Z0-9$]+)\(',
             r'\b[a-zA-Z0-9]+\s*&&\s*[a-zA-Z0-9]+\.set\([^,]+\s*,\s*(?P<sig>[a-zA-Z0-9$]+)\(',
             r'\bc\s*&&\s*a\.set\([^,]+\s*,\s*\([^)]*\)\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\(',
             r'\bc\s*&&\s*[a-zA-Z0-9]+\.set\([^,]+\s*,\s*\([^)]*\)\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\(',
             r'\bc\s*&&\s*[a-zA-Z0-9]+\.set\([^,]+\s*,\s*\([^)]*\)\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\('),
            jscode, 'Initial JS player signature function name', group='sig')

        jsi = JSInterpreter(jscode)
        initial_function = jsi.extract_function(funcname)
        # The JS function takes the signature as its single argument
        return lambda s: initial_function([s])
1975
1976 def _decrypt_signature(self, s, video_id, player_url):
1977 """Turn the encrypted s field into a working signature"""
1978
1979 if player_url is None:
1980 raise ExtractorError('Cannot decrypt signature without player_url')
1981
1982 try:
1983 player_id = (player_url, self._signature_cache_id(s))
1984 if player_id not in self._player_cache:
1985 func = self._extract_signature_function(
1986 video_id, player_url, s
1987 )
1988 self._player_cache[player_id] = func
1989 func = self._player_cache[player_id]
1990 if self.get_param('youtube_print_sig_code'):
1991 self._print_sig_code(func, s)
1992 return func(s)
1993 except Exception as e:
1994 tb = traceback.format_exc()
1995 raise ExtractorError(
1996 'Signature extraction failed: ' + tb, cause=e)
1997
1998 def _extract_signature_timestamp(self, video_id, player_url, ytcfg=None, fatal=False):
1999 """
2000 Extract signatureTimestamp (sts)
2001 Required to tell API what sig/player version is in use.
2002 """
2003 sts = None
2004 if isinstance(ytcfg, dict):
2005 sts = int_or_none(ytcfg.get('STS'))
2006
2007 if not sts:
2008 # Attempt to extract from player
2009 if player_url is None:
2010 error_msg = 'Cannot extract signature timestamp without player_url.'
2011 if fatal:
2012 raise ExtractorError(error_msg)
2013 self.report_warning(error_msg)
2014 return
2015 if self._load_player(video_id, player_url, fatal=fatal):
2016 player_id = self._extract_player_info(player_url)
2017 code = self._code_cache[player_id]
2018 sts = int_or_none(self._search_regex(
2019 r'(?:signatureTimestamp|sts)\s*:\s*(?P<sts>[0-9]{5})', code,
2020 'JS player signature timestamp', group='sts', fatal=fatal))
2021 return sts
2022
2023 def _mark_watched(self, video_id, player_responses):
2024 playback_url = traverse_obj(
2025 player_responses, (..., 'playbackTracking', 'videostatsPlaybackUrl', 'baseUrl'),
2026 expected_type=url_or_none, get_all=False)
2027 if not playback_url:
2028 self.report_warning('Unable to mark watched')
2029 return
2030 parsed_playback_url = compat_urlparse.urlparse(playback_url)
2031 qs = compat_urlparse.parse_qs(parsed_playback_url.query)
2032
2033 # cpn generation algorithm is reverse engineered from base.js.
2034 # In fact it works even with dummy cpn.
2035 CPN_ALPHABET = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789-_'
2036 cpn = ''.join((CPN_ALPHABET[random.randint(0, 256) & 63] for _ in range(0, 16)))
2037
2038 qs.update({
2039 'ver': ['2'],
2040 'cpn': [cpn],
2041 })
2042 playback_url = compat_urlparse.urlunparse(
2043 parsed_playback_url._replace(query=compat_urllib_parse_urlencode(qs, True)))
2044
2045 self._download_webpage(
2046 playback_url, video_id, 'Marking watched',
2047 'Unable to mark watched', fatal=False)
2048
2049 @staticmethod
2050 def _extract_urls(webpage):
2051 # Embedded YouTube player
2052 entries = [
2053 unescapeHTML(mobj.group('url'))
2054 for mobj in re.finditer(r'''(?x)
2055 (?:
2056 <iframe[^>]+?src=|
2057 data-video-url=|
2058 <embed[^>]+?src=|
2059 embedSWF\(?:\s*|
2060 <object[^>]+data=|
2061 new\s+SWFObject\(
2062 )
2063 (["\'])
2064 (?P<url>(?:https?:)?//(?:www\.)?youtube(?:-nocookie)?\.com/
2065 (?:embed|v|p)/[0-9A-Za-z_-]{11}.*?)
2066 \1''', webpage)]
2067
2068 # lazyYT YouTube embed
2069 entries.extend(list(map(
2070 unescapeHTML,
2071 re.findall(r'class="lazyYT" data-youtube-id="([^"]+)"', webpage))))
2072
2073 # Wordpress "YouTube Video Importer" plugin
2074 matches = re.findall(r'''(?x)<div[^>]+
2075 class=(?P<q1>[\'"])[^\'"]*\byvii_single_video_player\b[^\'"]*(?P=q1)[^>]+
2076 data-video_id=(?P<q2>[\'"])([^\'"]+)(?P=q2)''', webpage)
2077 entries.extend(m[-1] for m in matches)
2078
2079 return entries
2080
2081 @staticmethod
2082 def _extract_url(webpage):
2083 urls = YoutubeIE._extract_urls(webpage)
2084 return urls[0] if urls else None
2085
2086 @classmethod
2087 def extract_id(cls, url):
2088 mobj = re.match(cls._VALID_URL, url, re.VERBOSE)
2089 if mobj is None:
2090 raise ExtractorError('Invalid URL: %s' % url)
2091 return mobj.group('id')
2092
2093 def _extract_chapters_from_json(self, data, duration):
2094 chapter_list = traverse_obj(
2095 data, (
2096 'playerOverlays', 'playerOverlayRenderer', 'decoratedPlayerBarRenderer',
2097 'decoratedPlayerBarRenderer', 'playerBar', 'chapteredPlayerBarRenderer', 'chapters'
2098 ), expected_type=list)
2099
2100 return self._extract_chapters(
2101 chapter_list,
2102 chapter_time=lambda chapter: float_or_none(
2103 traverse_obj(chapter, ('chapterRenderer', 'timeRangeStartMillis')), scale=1000),
2104 chapter_title=lambda chapter: traverse_obj(
2105 chapter, ('chapterRenderer', 'title', 'simpleText'), expected_type=str),
2106 duration=duration)
2107
2108 def _extract_chapters_from_engagement_panel(self, data, duration):
2109 content_list = traverse_obj(
2110 data,
2111 ('engagementPanels', ..., 'engagementPanelSectionListRenderer', 'content', 'macroMarkersListRenderer', 'contents'),
2112 expected_type=list, default=[])
2113 chapter_time = lambda chapter: parse_duration(self._get_text(chapter, 'timeDescription'))
2114 chapter_title = lambda chapter: self._get_text(chapter, 'title')
2115
2116 return next((
2117 filter(None, (
2118 self._extract_chapters(
2119 traverse_obj(contents, (..., 'macroMarkersListItemRenderer')),
2120 chapter_time, chapter_title, duration)
2121 for contents in content_list
2122 ))), [])
2123
2124 def _extract_chapters(self, chapter_list, chapter_time, chapter_title, duration):
2125 chapters = []
2126 last_chapter = {'start_time': 0}
2127 for idx, chapter in enumerate(chapter_list or []):
2128 title = chapter_title(chapter)
2129 start_time = chapter_time(chapter)
2130 if start_time is None:
2131 continue
2132 last_chapter['end_time'] = start_time
2133 if start_time < last_chapter['start_time']:
2134 if idx == 1:
2135 chapters.pop()
2136 self.report_warning('Invalid start time for chapter "%s"' % last_chapter['title'])
2137 else:
2138 self.report_warning(f'Invalid start time for chapter "{title}"')
2139 continue
2140 last_chapter = {'start_time': start_time, 'title': title}
2141 chapters.append(last_chapter)
2142 last_chapter['end_time'] = duration
2143 return chapters
2144
2145 def _extract_yt_initial_variable(self, webpage, regex, video_id, name):
2146 return self._parse_json(self._search_regex(
2147 (r'%s\s*%s' % (regex, self._YT_INITIAL_BOUNDARY_RE),
2148 regex), webpage, name, default='{}'), video_id, fatal=False)
2149
2150 @staticmethod
2151 def parse_time_text(time_text):
2152 """
2153 Parse the comment time text
2154 time_text is in the format 'X units ago (edited)'
2155 """
2156 time_text_split = time_text.split(' ')
2157 if len(time_text_split) >= 3:
2158 try:
2159 return datetime_from_str('now-%s%s' % (time_text_split[0], time_text_split[1]), precision='auto')
2160 except ValueError:
2161 return None
2162
2163 def _extract_comment(self, comment_renderer, parent=None):
2164 comment_id = comment_renderer.get('commentId')
2165 if not comment_id:
2166 return
2167
2168 text = self._get_text(comment_renderer, 'contentText')
2169
2170 # note: timestamp is an estimate calculated from the current time and time_text
2171 time_text = self._get_text(comment_renderer, 'publishedTimeText') or ''
2172 time_text_dt = self.parse_time_text(time_text)
2173 if isinstance(time_text_dt, datetime.datetime):
2174 timestamp = calendar.timegm(time_text_dt.timetuple())
2175 author = self._get_text(comment_renderer, 'authorText')
2176 author_id = try_get(comment_renderer,
2177 lambda x: x['authorEndpoint']['browseEndpoint']['browseId'], compat_str)
2178
2179 votes = parse_count(try_get(comment_renderer, (lambda x: x['voteCount']['simpleText'],
2180 lambda x: x['likeCount']), compat_str)) or 0
2181 author_thumbnail = try_get(comment_renderer,
2182 lambda x: x['authorThumbnail']['thumbnails'][-1]['url'], compat_str)
2183
2184 author_is_uploader = try_get(comment_renderer, lambda x: x['authorIsChannelOwner'], bool)
2185 is_favorited = 'creatorHeart' in (try_get(
2186 comment_renderer, lambda x: x['actionButtons']['commentActionButtonsRenderer'], dict) or {})
2187 return {
2188 'id': comment_id,
2189 'text': text,
2190 'timestamp': timestamp,
2191 'time_text': time_text,
2192 'like_count': votes,
2193 'is_favorited': is_favorited,
2194 'author': author,
2195 'author_id': author_id,
2196 'author_thumbnail': author_thumbnail,
2197 'author_is_uploader': author_is_uploader,
2198 'parent': parent or 'root'
2199 }
2200
    def _comment_entries(self, root_continuation_data, identity_token, account_syncid,
                         ytcfg, video_id, parent=None, comment_counts=None):
        """Yield comment info dicts, plus (at most once) an int: the estimated total.

        comment_counts is a 3-element list shared across recursive calls:
        [comments downloaded so far, estimated total, current reply-thread index].
        Reply threads recurse into this method once (YouTube max depth is 2).
        """

        def extract_header(contents):
            # Parse the comments header: yields the expected total count and the
            # continuation matching the requested sort order (top/new)
            _total_comments = 0
            _continuation = None
            for content in contents:
                comments_header_renderer = try_get(content, lambda x: x['commentsHeaderRenderer'])
                expected_comment_count = parse_count(self._get_text(
                    comments_header_renderer, 'countText', 'commentsCount', max_runs=1))

                if expected_comment_count:
                    comment_counts[1] = expected_comment_count
                    self.to_screen('Downloading ~%d comments' % expected_comment_count)
                    _total_comments = comment_counts[1]
                sort_mode_str = self._configuration_arg('comment_sort', [''])[0]
                comment_sort_index = int(sort_mode_str != 'top')  # 1 = new, 0 = top

                sort_menu_item = try_get(
                    comments_header_renderer,
                    lambda x: x['sortMenu']['sortFilterSubMenuRenderer']['subMenuItems'][comment_sort_index], dict) or {}
                sort_continuation_ep = sort_menu_item.get('serviceEndpoint') or {}

                _continuation = self._extract_continuation_ep_data(sort_continuation_ep) or self._extract_continuation(sort_menu_item)
                if not _continuation:
                    continue

                sort_text = sort_menu_item.get('title')
                if isinstance(sort_text, compat_str):
                    sort_text = sort_text.lower()
                else:
                    sort_text = 'top comments' if comment_sort_index == 0 else 'newest first'
                self.to_screen('Sorting comments by %s' % sort_text)
                break
            return _total_comments, _continuation

        def extract_thread(contents):
            # Yield each top-level comment, then recurse into its reply thread
            if not parent:
                comment_counts[2] = 0
            for content in contents:
                comment_thread_renderer = try_get(content, lambda x: x['commentThreadRenderer'])
                comment_renderer = try_get(
                    comment_thread_renderer, (lambda x: x['comment']['commentRenderer'], dict)) or try_get(
                    content, (lambda x: x['commentRenderer'], dict))

                if not comment_renderer:
                    continue
                comment = self._extract_comment(comment_renderer, parent)
                if not comment:
                    continue
                comment_counts[0] += 1
                yield comment
                # Attempt to get the replies
                comment_replies_renderer = try_get(
                    comment_thread_renderer, lambda x: x['replies']['commentRepliesRenderer'], dict)

                if comment_replies_renderer:
                    comment_counts[2] += 1
                    comment_entries_iter = self._comment_entries(
                        comment_replies_renderer, identity_token, account_syncid, ytcfg,
                        video_id, parent=comment.get('id'), comment_counts=comment_counts)

                    for reply_comment in comment_entries_iter:
                        yield reply_comment

        # YouTube comments have a max depth of 2
        max_depth = int_or_none(self._configuration_arg('max_comment_depth', [''])[0]) or float('inf')
        if max_depth == 1 and parent:
            return
        if not comment_counts:
            # comment so far, est. total comments, current comment thread #
            comment_counts = [0, 0, 0]

        continuation = self._extract_continuation(root_continuation_data)
        if continuation and len(continuation['continuation']) < 27:
            # Short tokens are pre-innertube continuations; synthesise a new-style one
            self.write_debug('Detected old API continuation token. Generating new API compatible token.')
            continuation_token = self._generate_comment_continuation(video_id)
            continuation = self._build_api_continuation_query(continuation_token, None)

        visitor_data = None
        is_first_continuation = parent is None

        # Page through /next responses until no further continuation is returned
        for page_num in itertools.count(0):
            if not continuation:
                break
            headers = self.generate_api_headers(ytcfg, identity_token, account_syncid, visitor_data)
            comment_prog_str = '(%d/%d)' % (comment_counts[0], comment_counts[1])
            if page_num == 0:
                if is_first_continuation:
                    note_prefix = 'Downloading comment section API JSON'
                else:
                    note_prefix = '    Downloading comment API JSON reply thread %d %s' % (
                        comment_counts[2], comment_prog_str)
            else:
                note_prefix = '%sDownloading comment%s API JSON page %d %s' % (
                    '       ' if parent else '', ' replies' if parent else '',
                    page_num, comment_prog_str)

            response = self._extract_response(
                item_id=None, query=continuation,
                ep='next', ytcfg=ytcfg, headers=headers, note=note_prefix,
                check_get_keys=('onResponseReceivedEndpoints', 'continuationContents'))
            if not response:
                break
            # Carry visitorData forward so subsequent pages share the session
            visitor_data = try_get(
                response,
                lambda x: x['responseContext']['webResponseContextExtensionData']['ytConfigData']['visitorData'],
                compat_str) or visitor_data

            continuation_contents = dict_get(response, ('onResponseReceivedEndpoints', 'continuationContents'))

            continuation = None
            if isinstance(continuation_contents, list):
                for continuation_section in continuation_contents:
                    if not isinstance(continuation_section, dict):
                        continue
                    continuation_items = try_get(
                        continuation_section,
                        (lambda x: x['reloadContinuationItemsCommand']['continuationItems'],
                         lambda x: x['appendContinuationItemsAction']['continuationItems']),
                        list) or []
                    if is_first_continuation:
                        total_comments, continuation = extract_header(continuation_items)
                        if total_comments:
                            yield total_comments
                        is_first_continuation = False
                        if continuation:
                            break
                        continue
                    count = 0
                    for count, entry in enumerate(extract_thread(continuation_items)):
                        yield entry
                    continuation = self._extract_continuation({'contents': continuation_items})
                    if continuation:
                        # Sometimes YouTube provides a continuation without any comments
                        # In most cases we end up just downloading these with very little comments to come.
                        if count == 0:
                            if not parent:
                                self.report_warning('No comments received - assuming end of comments')
                            continuation = None
                        break

            # Deprecated response structure
            elif isinstance(continuation_contents, dict):
                known_continuation_renderers = ('itemSectionContinuation', 'commentRepliesContinuation')
                for key, continuation_renderer in continuation_contents.items():
                    if key not in known_continuation_renderers:
                        continue
                    if not isinstance(continuation_renderer, dict):
                        continue
                    if is_first_continuation:
                        header_continuation_items = [continuation_renderer.get('header') or {}]
                        total_comments, continuation = extract_header(header_continuation_items)
                        if total_comments:
                            yield total_comments
                        is_first_continuation = False
                        if continuation:
                            break

                    # Sometimes YouTube provides a continuation without any comments
                    # In most cases we end up just downloading these with very little comments to come.
                    count = 0
                    for count, entry in enumerate(extract_thread(continuation_renderer.get('contents') or {})):
                        yield entry
                    continuation = self._extract_continuation(continuation_renderer)
                    if count == 0:
                        if not parent:
                            self.report_warning('No comments received - assuming end of comments')
                        continuation = None
                    break
2371
2372 @staticmethod
2373 def _generate_comment_continuation(video_id):
2374 """
2375 Generates initial comment section continuation token from given video id
2376 """
2377 b64_vid_id = base64.b64encode(bytes(video_id.encode('utf-8')))
2378 parts = ('Eg0SCw==', b64_vid_id, 'GAYyJyIRIgs=', b64_vid_id, 'MAB4AjAAQhBjb21tZW50cy1zZWN0aW9u')
2379 new_continuation_intlist = list(itertools.chain.from_iterable(
2380 [bytes_to_intlist(base64.b64decode(part)) for part in parts]))
2381 return base64.b64encode(intlist_to_bytes(new_continuation_intlist)).decode('utf-8')
2382
    def _extract_comments(self, ytcfg, video_id, contents, webpage):
        """Entry for comment extraction"""
        def _real_comment_extract(contents):
            # Yields an int first (estimated total), then comment info dicts
            if isinstance(contents, list):
                for entry in contents:
                    for key, renderer in entry.items():
                        if key not in known_entry_comment_renderers:
                            continue
                        yield from self._comment_entries(
                            renderer, video_id=video_id, ytcfg=ytcfg,
                            identity_token=self._extract_identity_token(webpage, item_id=video_id),
                            account_syncid=self._extract_account_syncid(ytcfg))
                        break
        comments = []
        known_entry_comment_renderers = ('itemSectionRenderer',)
        estimated_total = 0
        # max_comments caps how many we collect (unlimited when unset)
        max_comments = int_or_none(self._configuration_arg('max_comments', [''])[0]) or float('inf')
        # Force English regardless of account setting to prevent parsing issues
        # See: https://github.com/yt-dlp/yt-dlp/issues/532
        ytcfg = copy.deepcopy(ytcfg)
        traverse_obj(
            ytcfg, ('INNERTUBE_CONTEXT', 'client'), expected_type=dict, default={})['hl'] = 'en'
        try:
            for comment in _real_comment_extract(contents):
                if len(comments) >= max_comments:
                    break
                if isinstance(comment, int):
                    # int items are the estimated-total marker, not a comment
                    estimated_total = comment
                    continue
                comments.append(comment)
        except KeyboardInterrupt:
            # Ctrl-C keeps whatever was downloaded so far
            self.to_screen('Interrupted by user')
        self.to_screen('Downloaded %d/%d comments' % (len(comments), estimated_total))
        return {
            'comments': comments,
            'comment_count': len(comments),
        }
2420
2421 @staticmethod
2422 def _generate_player_context(sts=None):
2423 context = {
2424 'html5Preference': 'HTML5_PREF_WANTS',
2425 }
2426 if sts is not None:
2427 context['signatureTimestamp'] = sts
2428 return {
2429 'playbackContext': {
2430 'contentPlaybackContext': context
2431 },
2432 'contentCheckOk': True,
2433 'racyCheckOk': True
2434 }
2435
2436 @staticmethod
2437 def _is_agegated(player_response):
2438 if traverse_obj(player_response, ('playabilityStatus', 'desktopLegacyAgeGateReason')):
2439 return True
2440
2441 reasons = traverse_obj(player_response, ('playabilityStatus', ('status', 'reason')), default=[])
2442 AGE_GATE_REASONS = (
2443 'confirm your age', 'age-restricted', 'inappropriate', # reason
2444 'age_verification_required', 'age_check_required', # status
2445 )
2446 return any(expected in reason for expected in AGE_GATE_REASONS for reason in reasons)
2447
2448 @staticmethod
2449 def _is_unplayable(player_response):
2450 return traverse_obj(player_response, ('playabilityStatus', 'status')) == 'UNPLAYABLE'
2451
2452 def _extract_player_response(self, client, video_id, master_ytcfg, player_ytcfg, identity_token, player_url, initial_pr):
2453
2454 session_index = self._extract_session_index(player_ytcfg, master_ytcfg)
2455 syncid = self._extract_account_syncid(player_ytcfg, master_ytcfg, initial_pr)
2456 sts = self._extract_signature_timestamp(video_id, player_url, master_ytcfg, fatal=False)
2457 headers = self.generate_api_headers(
2458 player_ytcfg, identity_token, syncid,
2459 default_client=client, session_index=session_index)
2460
2461 yt_query = {'videoId': video_id}
2462 yt_query.update(self._generate_player_context(sts))
2463 return self._extract_response(
2464 item_id=video_id, ep='player', query=yt_query,
2465 ytcfg=player_ytcfg, headers=headers, fatal=True,
2466 default_client=client,
2467 note='Downloading %s player API JSON' % client.replace('_', ' ').strip()
2468 ) or None
2469
2470 def _get_requested_clients(self, url, smuggled_data):
2471 requested_clients = []
2472 allowed_clients = sorted(
2473 [client for client in INNERTUBE_CLIENTS.keys() if client[:1] != '_'],
2474 key=lambda client: INNERTUBE_CLIENTS[client]['priority'], reverse=True)
2475 for client in self._configuration_arg('player_client'):
2476 if client in allowed_clients:
2477 requested_clients.append(client)
2478 elif client == 'all':
2479 requested_clients.extend(allowed_clients)
2480 else:
2481 self.report_warning(f'Skipping unsupported client {client}')
2482 if not requested_clients:
2483 requested_clients = ['android', 'web']
2484
2485 if smuggled_data.get('is_music_url') or self.is_music_url(url):
2486 requested_clients.extend(
2487 f'{client}_music' for client in requested_clients if f'{client}_music' in INNERTUBE_CLIENTS)
2488
2489 return orderedSet(requested_clients)
2490
2491 def _extract_player_ytcfg(self, client, video_id):
2492 url = {
2493 'web_music': 'https://music.youtube.com',
2494 'web_embedded': f'https://www.youtube.com/embed/{video_id}?html5=1'
2495 }.get(client)
2496 if not url:
2497 return {}
2498 webpage = self._download_webpage(url, video_id, fatal=False, note=f'Downloading {client} config')
2499 return self.extract_ytcfg(video_id, webpage) or {}
2500
    def _extract_player_responses(self, clients, video_id, webpage, master_ytcfg, player_url, identity_token):
        """Yield player API responses for each requested client, in priority order.

        Agegate/creator fallback clients may be appended while iterating.
        Errors are deferred: only raised if no response at all was yielded.
        """
        initial_pr = None
        if webpage:
            initial_pr = self._extract_yt_initial_variable(
                webpage, self._YT_INITIAL_PLAYER_RESPONSE_RE,
                video_id, 'initial player response')

        original_clients = clients
        clients = clients[::-1]  # pop() takes from the end, so reverse to keep priority order

        def append_client(client_name):
            # Queue a fallback client unless the user already requested it
            if client_name in INNERTUBE_CLIENTS and client_name not in original_clients:
                clients.append(client_name)

        # Android player_response does not have microFormats which are needed for
        # extraction of some data. So we return the initial_pr with formats
        # stripped out even if not requested by the user
        # See: https://github.com/yt-dlp/yt-dlp/issues/501
        yielded_pr = False
        if initial_pr:
            pr = dict(initial_pr)
            pr['streamingData'] = None
            yielded_pr = True
            yield pr

        last_error = None
        while clients:
            client = clients.pop()
            # The master ytcfg is only valid for the plain web client
            player_ytcfg = master_ytcfg if client == 'web' else {}
            if 'configs' not in self._configuration_arg('player_skip'):
                player_ytcfg = self._extract_player_ytcfg(client, video_id) or player_ytcfg

            try:
                pr = initial_pr if client == 'web' and initial_pr else self._extract_player_response(
                    client, video_id, player_ytcfg or master_ytcfg, player_ytcfg, identity_token, player_url, initial_pr)
            except ExtractorError as e:
                # Defer the error; another client may still succeed
                if last_error:
                    self.report_warning(last_error)
                last_error = e
                continue

            if pr:
                yielded_pr = True
                yield pr

            # creator clients can bypass AGE_VERIFICATION_REQUIRED if logged in
            if client.endswith('_agegate') and self._is_unplayable(pr) and self._generate_sapisidhash_header():
                append_client(client.replace('_agegate', '_creator'))
            elif self._is_agegated(pr):
                append_client(f'{client}_agegate')

        if last_error:
            if not yielded_pr:
                raise last_error
            self.report_warning(last_error)
2556
    def _extract_formats(self, streaming_data, video_id, player_url, is_live):
        """
        Yield format dicts from the streamingData of all player responses.

        Progressive/adaptive formats are emitted first (deciphering signatures
        when needed), then HLS and DASH manifest formats, skipping any itag
        already seen so clients don't produce duplicates.
        """
        itags, stream_ids = [], []
        # itag -> quality string and height -> quality string, used later to
        # guess the quality of manifest-only formats
        itag_qualities, res_qualities = {}, {}
        q = qualities([
            # Normally tiny is the smallest video-only formats. But
            # audio-only formats with unknown quality may get tagged as tiny
            'tiny',
            'audio_quality_ultralow', 'audio_quality_low', 'audio_quality_medium', 'audio_quality_high', # Audio only formats
            'small', 'medium', 'large', 'hd720', 'hd1080', 'hd1440', 'hd2160', 'hd2880', 'highres'
        ])
        streaming_formats = traverse_obj(streaming_data, (..., ('formats', 'adaptiveFormats'), ...), default=[])

        for fmt in streaming_formats:
            # Skip DRM-protected and live-segment (targetDurationSec) entries
            if fmt.get('targetDurationSec') or fmt.get('drmFamilies'):
                continue

            itag = str_or_none(fmt.get('itag'))
            audio_track = fmt.get('audioTrack') or {}
            # Dedup key: same itag may appear once per audio track
            stream_id = '%s.%s' % (itag or '', audio_track.get('id', ''))
            if stream_id in stream_ids:
                continue

            quality = fmt.get('quality')
            height = int_or_none(fmt.get('height'))
            if quality == 'tiny' or not quality:
                quality = fmt.get('audioQuality', '').lower() or quality
            # The 3gp format (17) in android client has a quality of "small",
            # but is actually worse than other formats
            if itag == '17':
                quality = 'tiny'
            if quality:
                if itag:
                    itag_qualities[itag] = quality
                if height:
                    res_qualities[height] = quality
            # FORMAT_STREAM_TYPE_OTF(otf=1) requires downloading the init fragment
            # (adding `&sq=0` to the URL) and parsing emsg box to determine the
            # number of fragment that would subsequently requested with (`&sq=N`)
            if fmt.get('type') == 'FORMAT_STREAM_TYPE_OTF':
                continue

            fmt_url = fmt.get('url')
            if not fmt_url:
                # No direct URL: decipher the signatureCipher query string
                sc = compat_parse_qs(fmt.get('signatureCipher'))
                fmt_url = url_or_none(try_get(sc, lambda x: x['url'][0]))
                encrypted_sig = try_get(sc, lambda x: x['s'][0])
                if not (sc and fmt_url and encrypted_sig):
                    continue
                if not player_url:
                    # Cannot decrypt the signature without the player JS
                    continue
                signature = self._decrypt_signature(sc['s'][0], video_id, player_url)
                sp = try_get(sc, lambda x: x['sp'][0]) or 'signature'
                fmt_url += '&' + sp + '=' + signature

            if itag:
                itags.append(itag)
                stream_ids.append(stream_id)

            tbr = float_or_none(
                fmt.get('averageBitrate') or fmt.get('bitrate'), 1000)
            dct = {
                'asr': int_or_none(fmt.get('audioSampleRate')),
                'filesize': int_or_none(fmt.get('contentLength')),
                'format_id': itag,
                'format_note': ', '.join(filter(None, (
                    audio_track.get('displayName'),
                    fmt.get('qualityLabel') or quality.replace('audio_quality_', '')))),
                'fps': int_or_none(fmt.get('fps')),
                'height': height,
                'quality': q(quality),
                'tbr': tbr,
                'url': fmt_url,
                'width': int_or_none(fmt.get('width')),
                'language': audio_track.get('id', '').split('.')[0],
            }
            mime_mobj = re.match(
                r'((?:[^/]+)/(?:[^;]+))(?:;\s*codecs="([^"]+)")?', fmt.get('mimeType') or '')
            if mime_mobj:
                dct['ext'] = mimetype2ext(mime_mobj.group(1))
                dct.update(parse_codecs(mime_mobj.group(2)))
            no_audio = dct.get('acodec') == 'none'
            no_video = dct.get('vcodec') == 'none'
            if no_audio:
                dct['vbr'] = tbr
            if no_video:
                dct['abr'] = tbr
            if no_audio or no_video:
                dct['downloader_options'] = {
                    # Youtube throttles chunks >~10M
                    'http_chunk_size': 10485760,
                }
            if dct.get('ext'):
                dct['container'] = dct['ext'] + '_dash'
            yield dct

        skip_manifests = self._configuration_arg('skip')
        get_dash = (
            (not is_live or self._configuration_arg('include_live_dash'))
            and 'dash' not in skip_manifests and self.get_param('youtube_include_dash_manifest', True))
        get_hls = 'hls' not in skip_manifests and self.get_param('youtube_include_hls_manifest', True)

        def guess_quality(f):
            # Map a manifest format back to a known quality via itag or height
            for val, qdict in ((f.get('format_id'), itag_qualities), (f.get('height'), res_qualities)):
                if val in qdict:
                    return q(qdict[val])
            return -1

        for sd in streaming_data:
            hls_manifest_url = get_hls and sd.get('hlsManifestUrl')
            if hls_manifest_url:
                for f in self._extract_m3u8_formats(hls_manifest_url, video_id, 'mp4', fatal=False):
                    itag = self._search_regex(
                        r'/itag/(\d+)', f['url'], 'itag', default=None)
                    if itag in itags:
                        continue
                    if itag:
                        f['format_id'] = itag
                        itags.append(itag)
                    f['quality'] = guess_quality(f)
                    yield f

            dash_manifest_url = get_dash and sd.get('dashManifestUrl')
            if dash_manifest_url:
                for f in self._extract_mpd_formats(dash_manifest_url, video_id, fatal=False):
                    itag = f['format_id']
                    if itag in itags:
                        continue
                    if itag:
                        itags.append(itag)
                    f['quality'] = guess_quality(f)
                    filesize = int_or_none(self._search_regex(
                        r'/clen/(\d+)', f.get('fragment_base_url')
                        or f['url'], 'file size', default=None))
                    if filesize:
                        f['filesize'] = filesize
                    yield f
2693
2694 def _real_extract(self, url):
2695 url, smuggled_data = unsmuggle_url(url, {})
2696 video_id = self._match_id(url)
2697
2698 base_url = self.http_scheme() + '//www.youtube.com/'
2699 webpage_url = base_url + 'watch?v=' + video_id
2700 webpage = self._download_webpage(
2701 webpage_url + '&bpctr=9999999999&has_verified=1', video_id, fatal=False)
2702
2703 master_ytcfg = self.extract_ytcfg(video_id, webpage) or self._get_default_ytcfg()
2704 player_url = self._extract_player_url(master_ytcfg, webpage)
2705 identity_token = self._extract_identity_token(webpage, video_id)
2706
2707 player_responses = list(self._extract_player_responses(
2708 self._get_requested_clients(url, smuggled_data),
2709 video_id, webpage, master_ytcfg, player_url, identity_token))
2710
2711 get_first = lambda obj, keys, **kwargs: traverse_obj(obj, (..., *variadic(keys)), **kwargs, get_all=False)
2712
2713 playability_statuses = traverse_obj(
2714 player_responses, (..., 'playabilityStatus'), expected_type=dict, default=[])
2715
2716 trailer_video_id = get_first(
2717 playability_statuses,
2718 ('errorScreen', 'playerLegacyDesktopYpcTrailerRenderer', 'trailerVideoId'),
2719 expected_type=str)
2720 if trailer_video_id:
2721 return self.url_result(
2722 trailer_video_id, self.ie_key(), trailer_video_id)
2723
2724 search_meta = ((lambda x: self._html_search_meta(x, webpage, default=None))
2725 if webpage else (lambda x: None))
2726
2727 video_details = traverse_obj(
2728 player_responses, (..., 'videoDetails'), expected_type=dict, default=[])
2729 microformats = traverse_obj(
2730 player_responses, (..., 'microformat', 'playerMicroformatRenderer'),
2731 expected_type=dict, default=[])
2732 video_title = (
2733 get_first(video_details, 'title')
2734 or self._get_text(microformats, (..., 'title'))
2735 or search_meta(['og:title', 'twitter:title', 'title']))
2736 video_description = get_first(video_details, 'shortDescription')
2737
2738 if not smuggled_data.get('force_singlefeed', False):
2739 if not self.get_param('noplaylist'):
2740 multifeed_metadata_list = get_first(
2741 player_responses,
2742 ('multicamera', 'playerLegacyMulticameraRenderer', 'metadataList'),
2743 expected_type=str)
2744 if multifeed_metadata_list:
2745 entries = []
2746 feed_ids = []
2747 for feed in multifeed_metadata_list.split(','):
2748 # Unquote should take place before split on comma (,) since textual
2749 # fields may contain comma as well (see
2750 # https://github.com/ytdl-org/youtube-dl/issues/8536)
2751 feed_data = compat_parse_qs(
2752 compat_urllib_parse_unquote_plus(feed))
2753
2754 def feed_entry(name):
2755 return try_get(
2756 feed_data, lambda x: x[name][0], compat_str)
2757
2758 feed_id = feed_entry('id')
2759 if not feed_id:
2760 continue
2761 feed_title = feed_entry('title')
2762 title = video_title
2763 if feed_title:
2764 title += ' (%s)' % feed_title
2765 entries.append({
2766 '_type': 'url_transparent',
2767 'ie_key': 'Youtube',
2768 'url': smuggle_url(
2769 '%swatch?v=%s' % (base_url, feed_data['id'][0]),
2770 {'force_singlefeed': True}),
2771 'title': title,
2772 })
2773 feed_ids.append(feed_id)
2774 self.to_screen(
2775 'Downloading multifeed video (%s) - add --no-playlist to just download video %s'
2776 % (', '.join(feed_ids), video_id))
2777 return self.playlist_result(
2778 entries, video_id, video_title, video_description)
2779 else:
2780 self.to_screen('Downloading just video %s because of --no-playlist' % video_id)
2781
2782 live_broadcast_details = traverse_obj(microformats, (..., 'liveBroadcastDetails'))
2783 is_live = get_first(video_details, 'isLive')
2784 if is_live is None:
2785 is_live = get_first(live_broadcast_details, 'isLiveNow')
2786
2787 streaming_data = traverse_obj(player_responses, (..., 'streamingData'), default=[])
2788 formats = list(self._extract_formats(streaming_data, video_id, player_url, is_live))
2789
2790 if not formats:
2791 if not self.get_param('allow_unplayable_formats') and traverse_obj(streaming_data, (..., 'licenseInfos')):
2792 self.raise_no_formats(
2793 'This video is DRM protected.', expected=True)
2794 pemr = get_first(
2795 playability_statuses,
2796 ('errorScreen', 'playerErrorMessageRenderer'), expected_type=dict) or {}
2797 reason = self._get_text(pemr, 'reason') or get_first(playability_statuses, 'reason')
2798 subreason = clean_html(self._get_text(pemr, 'subreason') or '')
2799 if subreason:
2800 if subreason == 'The uploader has not made this video available in your country.':
2801 countries = get_first(microformats, 'availableCountries')
2802 if not countries:
2803 regions_allowed = search_meta('regionsAllowed')
2804 countries = regions_allowed.split(',') if regions_allowed else None
2805 self.raise_geo_restricted(subreason, countries, metadata_available=True)
2806 reason += f'. {subreason}'
2807 if reason:
2808 self.raise_no_formats(reason, expected=True)
2809
2810 for f in formats:
2811 if '&c=WEB&' in f['url'] and '&ratebypass=yes&' not in f['url']: # throttled
2812 f['source_preference'] = -10
2813 # TODO: this method is not reliable
2814 f['format_note'] = format_field(f, 'format_note', '%s ') + '(maybe throttled)'
2815
2816 # Source is given priority since formats that throttle are given lower source_preference
2817 # When throttling issue is fully fixed, remove this
2818 self._sort_formats(formats, ('quality', 'height', 'fps', 'source'))
2819
2820 keywords = get_first(video_details, 'keywords', expected_type=list) or []
2821 if not keywords and webpage:
2822 keywords = [
2823 unescapeHTML(m.group('content'))
2824 for m in re.finditer(self._meta_regex('og:video:tag'), webpage)]
2825 for keyword in keywords:
2826 if keyword.startswith('yt:stretch='):
2827 mobj = re.search(r'(\d+)\s*:\s*(\d+)', keyword)
2828 if mobj:
2829 # NB: float is intentional for forcing float division
2830 w, h = (float(v) for v in mobj.groups())
2831 if w > 0 and h > 0:
2832 ratio = w / h
2833 for f in formats:
2834 if f.get('vcodec') != 'none':
2835 f['stretched_ratio'] = ratio
2836 break
2837
2838 thumbnails = []
2839 thumbnail_dicts = traverse_obj(
2840 (video_details, microformats), (..., ..., 'thumbnail', 'thumbnails', ...),
2841 expected_type=dict, default=[])
2842 for thumbnail in thumbnail_dicts:
2843 thumbnail_url = thumbnail.get('url')
2844 if not thumbnail_url:
2845 continue
2846 # Sometimes youtube gives a wrong thumbnail URL. See:
2847 # https://github.com/yt-dlp/yt-dlp/issues/233
2848 # https://github.com/ytdl-org/youtube-dl/issues/28023
2849 if 'maxresdefault' in thumbnail_url:
2850 thumbnail_url = thumbnail_url.split('?')[0]
2851 thumbnails.append({
2852 'url': thumbnail_url,
2853 'height': int_or_none(thumbnail.get('height')),
2854 'width': int_or_none(thumbnail.get('width')),
2855 })
2856 thumbnail_url = search_meta(['og:image', 'twitter:image'])
2857 if thumbnail_url:
2858 thumbnails.append({
2859 'url': thumbnail_url,
2860 })
2861 # The best resolution thumbnails sometimes does not appear in the webpage
2862 # See: https://github.com/ytdl-org/youtube-dl/issues/29049, https://github.com/yt-dlp/yt-dlp/issues/340
2863 # List of possible thumbnails - Ref: <https://stackoverflow.com/a/20542029>
2864 hq_thumbnail_names = ['maxresdefault', 'hq720', 'sddefault', 'sd1', 'sd2', 'sd3']
2865 # TODO: Test them also? - For some videos, even these don't exist
2866 guaranteed_thumbnail_names = [
2867 'hqdefault', 'hq1', 'hq2', 'hq3', '0',
2868 'mqdefault', 'mq1', 'mq2', 'mq3',
2869 'default', '1', '2', '3'
2870 ]
2871 thumbnail_names = hq_thumbnail_names + guaranteed_thumbnail_names
2872 n_thumbnail_names = len(thumbnail_names)
2873
2874 thumbnails.extend({
2875 'url': 'https://i.ytimg.com/vi{webp}/{video_id}/{name}{live}.{ext}'.format(
2876 video_id=video_id, name=name, ext=ext,
2877 webp='_webp' if ext == 'webp' else '', live='_live' if is_live else ''),
2878 '_test_url': name in hq_thumbnail_names,
2879 } for name in thumbnail_names for ext in ('webp', 'jpg'))
2880 for thumb in thumbnails:
2881 i = next((i for i, t in enumerate(thumbnail_names) if f'/{video_id}/{t}' in thumb['url']), n_thumbnail_names)
2882 thumb['preference'] = (0 if '.webp' in thumb['url'] else -1) - (2 * i)
2883 self._remove_duplicate_formats(thumbnails)
2884
2885 category = get_first(microformats, 'category') or search_meta('genre')
2886 channel_id = str_or_none(
2887 get_first(video_details, 'channelId')
2888 or get_first(microformats, 'externalChannelId')
2889 or search_meta('channelId'))
2890 duration = int_or_none(
2891 get_first(video_details, 'lengthSeconds')
2892 or get_first(microformats, 'lengthSeconds')
2893 or parse_duration(search_meta('duration'))) or None
2894 owner_profile_url = get_first(microformats, 'ownerProfileUrl')
2895
2896 live_content = get_first(video_details, 'isLiveContent')
2897 is_upcoming = get_first(video_details, 'isUpcoming')
2898 if is_live is None:
2899 if is_upcoming or live_content is False:
2900 is_live = False
2901 if is_upcoming is None and (live_content or is_live):
2902 is_upcoming = False
2903 live_starttime = parse_iso8601(get_first(live_broadcast_details, 'startTimestamp'))
2904 live_endtime = parse_iso8601(get_first(live_broadcast_details, 'endTimestamp'))
2905 if not duration and live_endtime and live_starttime:
2906 duration = live_endtime - live_starttime
2907
2908 info = {
2909 'id': video_id,
2910 'title': self._live_title(video_title) if is_live else video_title,
2911 'formats': formats,
2912 'thumbnails': thumbnails,
2913 'description': video_description,
2914 'upload_date': unified_strdate(
2915 get_first(microformats, 'uploadDate')
2916 or search_meta('uploadDate')),
2917 'uploader': get_first(video_details, 'author'),
2918 'uploader_id': self._search_regex(r'/(?:channel|user)/([^/?&#]+)', owner_profile_url, 'uploader id') if owner_profile_url else None,
2919 'uploader_url': owner_profile_url,
2920 'channel_id': channel_id,
2921 'channel_url': f'https://www.youtube.com/channel/{channel_id}' if channel_id else None,
2922 'duration': duration,
2923 'view_count': int_or_none(
2924 get_first((video_details, microformats), (..., 'viewCount'))
2925 or search_meta('interactionCount')),
2926 'average_rating': float_or_none(get_first(video_details, 'averageRating')),
2927 'age_limit': 18 if (
2928 get_first(microformats, 'isFamilySafe') is False
2929 or search_meta('isFamilyFriendly') == 'false'
2930 or search_meta('og:restrictions:age') == '18+') else 0,
2931 'webpage_url': webpage_url,
2932 'categories': [category] if category else None,
2933 'tags': keywords,
2934 'playable_in_embed': get_first(playability_statuses, 'playableInEmbed'),
2935 'is_live': is_live,
2936 'was_live': (False if is_live or is_upcoming or live_content is False
2937 else None if is_live is None or is_upcoming is None
2938 else live_content),
2939 'live_status': 'is_upcoming' if is_upcoming else None, # rest will be set by YoutubeDL
2940 'release_timestamp': live_starttime,
2941 }
2942
2943 pctr = traverse_obj(player_responses, (..., 'captions', 'playerCaptionsTracklistRenderer'), expected_type=dict)
2944 # Converted into dicts to remove duplicates
2945 captions = {
2946 sub.get('baseUrl'): sub
2947 for sub in traverse_obj(pctr, (..., 'captionTracks', ...), default=[])}
2948 translation_languages = {
2949 lang.get('languageCode'): lang.get('languageName')
2950 for lang in traverse_obj(pctr, (..., 'translationLanguages', ...), default=[])}
2951 subtitles = {}
2952 if pctr:
2953 def process_language(container, base_url, lang_code, sub_name, query):
2954 lang_subs = container.setdefault(lang_code, [])
2955 for fmt in self._SUBTITLE_FORMATS:
2956 query.update({
2957 'fmt': fmt,
2958 })
2959 lang_subs.append({
2960 'ext': fmt,
2961 'url': update_url_query(base_url, query),
2962 'name': sub_name,
2963 })
2964
2965 for base_url, caption_track in captions.items():
2966 if not base_url:
2967 continue
2968 if caption_track.get('kind') != 'asr':
2969 lang_code = (
2970 remove_start(caption_track.get('vssId') or '', '.').replace('.', '-')
2971 or caption_track.get('languageCode'))
2972 if not lang_code:
2973 continue
2974 process_language(
2975 subtitles, base_url, lang_code,
2976 traverse_obj(caption_track, ('name', 'simpleText')),
2977 {})
2978 continue
2979 automatic_captions = {}
2980 for trans_code, trans_name in translation_languages.items():
2981 if not trans_code:
2982 continue
2983 process_language(
2984 automatic_captions, base_url, trans_code,
2985 self._get_text(trans_name, max_runs=1),
2986 {'tlang': trans_code})
2987 info['automatic_captions'] = automatic_captions
2988 info['subtitles'] = subtitles
2989
2990 parsed_url = compat_urllib_parse_urlparse(url)
2991 for component in [parsed_url.fragment, parsed_url.query]:
2992 query = compat_parse_qs(component)
2993 for k, v in query.items():
2994 for d_k, s_ks in [('start', ('start', 't')), ('end', ('end',))]:
2995 d_k += '_time'
2996 if d_k not in info and k in s_ks:
2997 info[d_k] = parse_duration(query[k][0])
2998
2999 # Youtube Music Auto-generated description
3000 if video_description:
3001 mobj = re.search(r'(?s)(?P<track>[^·\n]+)·(?P<artist>[^\n]+)\n+(?P<album>[^\n]+)(?:.+?℗\s*(?P<release_year>\d{4})(?!\d))?(?:.+?Released on\s*:\s*(?P<release_date>\d{4}-\d{2}-\d{2}))?(.+?\nArtist\s*:\s*(?P<clean_artist>[^\n]+))?.+\nAuto-generated by YouTube\.\s*$', video_description)
3002 if mobj:
3003 release_year = mobj.group('release_year')
3004 release_date = mobj.group('release_date')
3005 if release_date:
3006 release_date = release_date.replace('-', '')
3007 if not release_year:
3008 release_year = release_date[:4]
3009 info.update({
3010 'album': mobj.group('album'.strip()),
3011 'artist': mobj.group('clean_artist') or ', '.join(a.strip() for a in mobj.group('artist').split('·')),
3012 'track': mobj.group('track').strip(),
3013 'release_date': release_date,
3014 'release_year': int_or_none(release_year),
3015 })
3016
3017 initial_data = None
3018 if webpage:
3019 initial_data = self._extract_yt_initial_variable(
3020 webpage, self._YT_INITIAL_DATA_RE, video_id,
3021 'yt initial data')
3022 if not initial_data:
3023 headers = self.generate_api_headers(
3024 master_ytcfg, identity_token, self._extract_account_syncid(master_ytcfg),
3025 session_index=self._extract_session_index(master_ytcfg))
3026
3027 initial_data = self._extract_response(
3028 item_id=video_id, ep='next', fatal=False,
3029 ytcfg=master_ytcfg, headers=headers, query={'videoId': video_id},
3030 note='Downloading initial data API JSON')
3031
3032 try:
3033 # This will error if there is no livechat
3034 initial_data['contents']['twoColumnWatchNextResults']['conversationBar']['liveChatRenderer']['continuations'][0]['reloadContinuationData']['continuation']
3035 info['subtitles']['live_chat'] = [{
3036 'url': 'https://www.youtube.com/watch?v=%s' % video_id, # url is needed to set cookies
3037 'video_id': video_id,
3038 'ext': 'json',
3039 'protocol': 'youtube_live_chat' if is_live or is_upcoming else 'youtube_live_chat_replay',
3040 }]
3041 except (KeyError, IndexError, TypeError):
3042 pass
3043
3044 if initial_data:
3045 info['chapters'] = (
3046 self._extract_chapters_from_json(initial_data, duration)
3047 or self._extract_chapters_from_engagement_panel(initial_data, duration)
3048 or None)
3049
3050 contents = try_get(
3051 initial_data,
3052 lambda x: x['contents']['twoColumnWatchNextResults']['results']['results']['contents'],
3053 list) or []
3054 for content in contents:
3055 vpir = content.get('videoPrimaryInfoRenderer')
3056 if vpir:
3057 stl = vpir.get('superTitleLink')
3058 if stl:
3059 stl = self._get_text(stl)
3060 if try_get(
3061 vpir,
3062 lambda x: x['superTitleIcon']['iconType']) == 'LOCATION_PIN':
3063 info['location'] = stl
3064 else:
3065 mobj = re.search(r'(.+?)\s*S(\d+)\s*•\s*E(\d+)', stl)
3066 if mobj:
3067 info.update({
3068 'series': mobj.group(1),
3069 'season_number': int(mobj.group(2)),
3070 'episode_number': int(mobj.group(3)),
3071 })
3072 for tlb in (try_get(
3073 vpir,
3074 lambda x: x['videoActions']['menuRenderer']['topLevelButtons'],
3075 list) or []):
3076 tbr = tlb.get('toggleButtonRenderer') or {}
3077 for getter, regex in [(
3078 lambda x: x['defaultText']['accessibility']['accessibilityData'],
3079 r'(?P<count>[\d,]+)\s*(?P<type>(?:dis)?like)'), ([
3080 lambda x: x['accessibility'],
3081 lambda x: x['accessibilityData']['accessibilityData'],
3082 ], r'(?P<type>(?:dis)?like) this video along with (?P<count>[\d,]+) other people')]:
3083 label = (try_get(tbr, getter, dict) or {}).get('label')
3084 if label:
3085 mobj = re.match(regex, label)
3086 if mobj:
3087 info[mobj.group('type') + '_count'] = str_to_int(mobj.group('count'))
3088 break
3089 sbr_tooltip = try_get(
3090 vpir, lambda x: x['sentimentBar']['sentimentBarRenderer']['tooltip'])
3091 if sbr_tooltip:
3092 like_count, dislike_count = sbr_tooltip.split(' / ')
3093 info.update({
3094 'like_count': str_to_int(like_count),
3095 'dislike_count': str_to_int(dislike_count),
3096 })
3097 vsir = content.get('videoSecondaryInfoRenderer')
3098 if vsir:
3099 info['channel'] = self._get_text(vsir, ('owner', 'videoOwnerRenderer', 'title'))
3100 rows = try_get(
3101 vsir,
3102 lambda x: x['metadataRowContainer']['metadataRowContainerRenderer']['rows'],
3103 list) or []
3104 multiple_songs = False
3105 for row in rows:
3106 if try_get(row, lambda x: x['metadataRowRenderer']['hasDividerLine']) is True:
3107 multiple_songs = True
3108 break
3109 for row in rows:
3110 mrr = row.get('metadataRowRenderer') or {}
3111 mrr_title = mrr.get('title')
3112 if not mrr_title:
3113 continue
3114 mrr_title = self._get_text(mrr, 'title')
3115 mrr_contents_text = self._get_text(mrr, ('contents', 0))
3116 if mrr_title == 'License':
3117 info['license'] = mrr_contents_text
3118 elif not multiple_songs:
3119 if mrr_title == 'Album':
3120 info['album'] = mrr_contents_text
3121 elif mrr_title == 'Artist':
3122 info['artist'] = mrr_contents_text
3123 elif mrr_title == 'Song':
3124 info['track'] = mrr_contents_text
3125
3126 fallbacks = {
3127 'channel': 'uploader',
3128 'channel_id': 'uploader_id',
3129 'channel_url': 'uploader_url',
3130 }
3131 for to, frm in fallbacks.items():
3132 if not info.get(to):
3133 info[to] = info.get(frm)
3134
3135 for s_k, d_k in [('artist', 'creator'), ('track', 'alt_title')]:
3136 v = info.get(s_k)
3137 if v:
3138 info[d_k] = v
3139
3140 is_private = get_first(video_details, 'isPrivate', expected_type=bool)
3141 is_unlisted = get_first(microformats, 'isUnlisted', expected_type=bool)
3142 is_membersonly = None
3143 is_premium = None
3144 if initial_data and is_private is not None:
3145 is_membersonly = False
3146 is_premium = False
3147 contents = try_get(initial_data, lambda x: x['contents']['twoColumnWatchNextResults']['results']['results']['contents'], list) or []
3148 badge_labels = set()
3149 for content in contents:
3150 if not isinstance(content, dict):
3151 continue
3152 badge_labels.update(self._extract_badges(content.get('videoPrimaryInfoRenderer')))
3153 for badge_label in badge_labels:
3154 if badge_label.lower() == 'members only':
3155 is_membersonly = True
3156 elif badge_label.lower() == 'premium':
3157 is_premium = True
3158 elif badge_label.lower() == 'unlisted':
3159 is_unlisted = True
3160
3161 info['availability'] = self._availability(
3162 is_private=is_private,
3163 needs_premium=is_premium,
3164 needs_subscription=is_membersonly,
3165 needs_auth=info['age_limit'] >= 18,
3166 is_unlisted=None if is_private is None else is_unlisted)
3167
3168 # get xsrf for annotations or comments
3169 get_annotations = self.get_param('writeannotations', False)
3170 get_comments = self.get_param('getcomments', False)
3171 if get_annotations or get_comments:
3172 xsrf_token = None
3173 if master_ytcfg:
3174 xsrf_token = try_get(master_ytcfg, lambda x: x['XSRF_TOKEN'], compat_str)
3175 if not xsrf_token:
3176 xsrf_token = self._search_regex(
3177 r'([\'"])XSRF_TOKEN\1\s*:\s*([\'"])(?P<xsrf_token>(?:(?!\2).)+)\2',
3178 webpage, 'xsrf token', group='xsrf_token', fatal=False)
3179
3180 # annotations
3181 if get_annotations:
3182 invideo_url = get_first(
3183 player_responses,
3184 ('annotations', 0, 'playerAnnotationsUrlsRenderer', 'invideoUrl'),
3185 expected_type=str)
3186 if xsrf_token and invideo_url:
3187 xsrf_field_name = None
3188 if master_ytcfg:
3189 xsrf_field_name = try_get(master_ytcfg, lambda x: x['XSRF_FIELD_NAME'], compat_str)
3190 if not xsrf_field_name:
3191 xsrf_field_name = self._search_regex(
3192 r'([\'"])XSRF_FIELD_NAME\1\s*:\s*([\'"])(?P<xsrf_field_name>\w+)\2',
3193 webpage, 'xsrf field name',
3194 group='xsrf_field_name', default='session_token')
3195 info['annotations'] = self._download_webpage(
3196 self._proto_relative_url(invideo_url),
3197 video_id, note='Downloading annotations',
3198 errnote='Unable to download video annotations', fatal=False,
3199 data=urlencode_postdata({xsrf_field_name: xsrf_token}))
3200
3201 if get_comments:
3202 info['__post_extractor'] = lambda: self._extract_comments(master_ytcfg, video_id, contents, webpage)
3203
3204 self.mark_watched(video_id, player_responses)
3205
3206 return info
3207
3208
3209 class YoutubeTabIE(YoutubeBaseInfoExtractor):
3210 IE_DESC = 'YouTube.com tab'
3211 _VALID_URL = r'''(?x)
3212 https?://
3213 (?:\w+\.)?
3214 (?:
3215 youtube(?:kids)?\.com|
3216 invidio\.us
3217 )/
3218 (?:
3219 (?P<channel_type>channel|c|user|browse)/|
3220 (?P<not_channel>
3221 feed/|hashtag/|
3222 (?:playlist|watch)\?.*?\blist=
3223 )|
3224 (?!(?:%s)\b) # Direct URLs
3225 )
3226 (?P<id>[^/?\#&]+)
3227 ''' % YoutubeBaseInfoExtractor._RESERVED_NAMES
3228 IE_NAME = 'youtube:tab'
3229
3230 _TESTS = [{
3231 'note': 'playlists, multipage',
3232 'url': 'https://www.youtube.com/c/ИгорьКлейнер/playlists?view=1&flow=grid',
3233 'playlist_mincount': 94,
3234 'info_dict': {
3235 'id': 'UCqj7Cz7revf5maW9g5pgNcg',
3236 'title': 'Игорь Клейнер - Playlists',
3237 'description': 'md5:be97ee0f14ee314f1f002cf187166ee2',
3238 'uploader': 'Игорь Клейнер',
3239 'uploader_id': 'UCqj7Cz7revf5maW9g5pgNcg',
3240 },
3241 }, {
3242 'note': 'playlists, multipage, different order',
3243 'url': 'https://www.youtube.com/user/igorkle1/playlists?view=1&sort=dd',
3244 'playlist_mincount': 94,
3245 'info_dict': {
3246 'id': 'UCqj7Cz7revf5maW9g5pgNcg',
3247 'title': 'Игорь Клейнер - Playlists',
3248 'description': 'md5:be97ee0f14ee314f1f002cf187166ee2',
3249 'uploader_id': 'UCqj7Cz7revf5maW9g5pgNcg',
3250 'uploader': 'Игорь Клейнер',
3251 },
3252 }, {
3253 'note': 'playlists, series',
3254 'url': 'https://www.youtube.com/c/3blue1brown/playlists?view=50&sort=dd&shelf_id=3',
3255 'playlist_mincount': 5,
3256 'info_dict': {
3257 'id': 'UCYO_jab_esuFRV4b17AJtAw',
3258 'title': '3Blue1Brown - Playlists',
3259 'description': 'md5:e1384e8a133307dd10edee76e875d62f',
3260 'uploader_id': 'UCYO_jab_esuFRV4b17AJtAw',
3261 'uploader': '3Blue1Brown',
3262 },
3263 }, {
3264 'note': 'playlists, singlepage',
3265 'url': 'https://www.youtube.com/user/ThirstForScience/playlists',
3266 'playlist_mincount': 4,
3267 'info_dict': {
3268 'id': 'UCAEtajcuhQ6an9WEzY9LEMQ',
3269 'title': 'ThirstForScience - Playlists',
3270 'description': 'md5:609399d937ea957b0f53cbffb747a14c',
3271 'uploader': 'ThirstForScience',
3272 'uploader_id': 'UCAEtajcuhQ6an9WEzY9LEMQ',
3273 }
3274 }, {
3275 'url': 'https://www.youtube.com/c/ChristophLaimer/playlists',
3276 'only_matching': True,
3277 }, {
3278 'note': 'basic, single video playlist',
3279 'url': 'https://www.youtube.com/playlist?list=PL4lCao7KL_QFVb7Iudeipvc2BCavECqzc',
3280 'info_dict': {
3281 'uploader_id': 'UCmlqkdCBesrv2Lak1mF_MxA',
3282 'uploader': 'Sergey M.',
3283 'id': 'PL4lCao7KL_QFVb7Iudeipvc2BCavECqzc',
3284 'title': 'youtube-dl public playlist',
3285 },
3286 'playlist_count': 1,
3287 }, {
3288 'note': 'empty playlist',
3289 'url': 'https://www.youtube.com/playlist?list=PL4lCao7KL_QFodcLWhDpGCYnngnHtQ-Xf',
3290 'info_dict': {
3291 'uploader_id': 'UCmlqkdCBesrv2Lak1mF_MxA',
3292 'uploader': 'Sergey M.',
3293 'id': 'PL4lCao7KL_QFodcLWhDpGCYnngnHtQ-Xf',
3294 'title': 'youtube-dl empty playlist',
3295 },
3296 'playlist_count': 0,
3297 }, {
3298 'note': 'Home tab',
3299 'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w/featured',
3300 'info_dict': {
3301 'id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
3302 'title': 'lex will - Home',
3303 'description': 'md5:2163c5d0ff54ed5f598d6a7e6211e488',
3304 'uploader': 'lex will',
3305 'uploader_id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
3306 },
3307 'playlist_mincount': 2,
3308 }, {
3309 'note': 'Videos tab',
3310 'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w/videos',
3311 'info_dict': {
3312 'id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
3313 'title': 'lex will - Videos',
3314 'description': 'md5:2163c5d0ff54ed5f598d6a7e6211e488',
3315 'uploader': 'lex will',
3316 'uploader_id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
3317 },
3318 'playlist_mincount': 975,
3319 }, {
3320 'note': 'Videos tab, sorted by popular',
3321 'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w/videos?view=0&sort=p&flow=grid',
3322 'info_dict': {
3323 'id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
3324 'title': 'lex will - Videos',
3325 'description': 'md5:2163c5d0ff54ed5f598d6a7e6211e488',
3326 'uploader': 'lex will',
3327 'uploader_id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
3328 },
3329 'playlist_mincount': 199,
3330 }, {
3331 'note': 'Playlists tab',
3332 'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w/playlists',
3333 'info_dict': {
3334 'id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
3335 'title': 'lex will - Playlists',
3336 'description': 'md5:2163c5d0ff54ed5f598d6a7e6211e488',
3337 'uploader': 'lex will',
3338 'uploader_id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
3339 },
3340 'playlist_mincount': 17,
3341 }, {
3342 'note': 'Community tab',
3343 'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w/community',
3344 'info_dict': {
3345 'id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
3346 'title': 'lex will - Community',
3347 'description': 'md5:2163c5d0ff54ed5f598d6a7e6211e488',
3348 'uploader': 'lex will',
3349 'uploader_id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
3350 },
3351 'playlist_mincount': 18,
3352 }, {
3353 'note': 'Channels tab',
3354 'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w/channels',
3355 'info_dict': {
3356 'id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
3357 'title': 'lex will - Channels',
3358 'description': 'md5:2163c5d0ff54ed5f598d6a7e6211e488',
3359 'uploader': 'lex will',
3360 'uploader_id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
3361 },
3362 'playlist_mincount': 12,
3363 }, {
3364 'note': 'Search tab',
3365 'url': 'https://www.youtube.com/c/3blue1brown/search?query=linear%20algebra',
3366 'playlist_mincount': 40,
3367 'info_dict': {
3368 'id': 'UCYO_jab_esuFRV4b17AJtAw',
3369 'title': '3Blue1Brown - Search - linear algebra',
3370 'description': 'md5:e1384e8a133307dd10edee76e875d62f',
3371 'uploader': '3Blue1Brown',
3372 'uploader_id': 'UCYO_jab_esuFRV4b17AJtAw',
3373 },
3374 }, {
3375 'url': 'https://invidio.us/channel/UCmlqkdCBesrv2Lak1mF_MxA',
3376 'only_matching': True,
3377 }, {
3378 'url': 'https://www.youtubekids.com/channel/UCmlqkdCBesrv2Lak1mF_MxA',
3379 'only_matching': True,
3380 }, {
3381 'url': 'https://music.youtube.com/channel/UCmlqkdCBesrv2Lak1mF_MxA',
3382 'only_matching': True,
3383 }, {
3384 'note': 'Playlist with deleted videos (#651). As a bonus, the video #51 is also twice in this list.',
3385 'url': 'https://www.youtube.com/playlist?list=PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC',
3386 'info_dict': {
3387 'title': '29C3: Not my department',
3388 'id': 'PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC',
3389 'uploader': 'Christiaan008',
3390 'uploader_id': 'UCEPzS1rYsrkqzSLNp76nrcg',
3391 'description': 'md5:a14dc1a8ef8307a9807fe136a0660268',
3392 },
3393 'playlist_count': 96,
3394 }, {
3395 'note': 'Large playlist',
3396 'url': 'https://www.youtube.com/playlist?list=UUBABnxM4Ar9ten8Mdjj1j0Q',
3397 'info_dict': {
3398 'title': 'Uploads from Cauchemar',
3399 'id': 'UUBABnxM4Ar9ten8Mdjj1j0Q',
3400 'uploader': 'Cauchemar',
3401 'uploader_id': 'UCBABnxM4Ar9ten8Mdjj1j0Q',
3402 },
3403 'playlist_mincount': 1123,
3404 }, {
3405 'note': 'even larger playlist, 8832 videos',
3406 'url': 'http://www.youtube.com/user/NASAgovVideo/videos',
3407 'only_matching': True,
3408 }, {
3409 'note': 'Buggy playlist: the webpage has a "Load more" button but it doesn\'t have more videos',
3410 'url': 'https://www.youtube.com/playlist?list=UUXw-G3eDE9trcvY2sBMM_aA',
3411 'info_dict': {
3412 'title': 'Uploads from Interstellar Movie',
3413 'id': 'UUXw-G3eDE9trcvY2sBMM_aA',
3414 'uploader': 'Interstellar Movie',
3415 'uploader_id': 'UCXw-G3eDE9trcvY2sBMM_aA',
3416 },
3417 'playlist_mincount': 21,
3418 }, {
3419 'note': 'Playlist with "show unavailable videos" button',
3420 'url': 'https://www.youtube.com/playlist?list=UUTYLiWFZy8xtPwxFwX9rV7Q',
3421 'info_dict': {
3422 'title': 'Uploads from Phim Siêu Nhân Nhật Bản',
3423 'id': 'UUTYLiWFZy8xtPwxFwX9rV7Q',
3424 'uploader': 'Phim Siêu Nhân Nhật Bản',
3425 'uploader_id': 'UCTYLiWFZy8xtPwxFwX9rV7Q',
3426 },
3427 'playlist_mincount': 200,
3428 }, {
3429 'note': 'Playlist with unavailable videos in page 7',
3430 'url': 'https://www.youtube.com/playlist?list=UU8l9frL61Yl5KFOl87nIm2w',
3431 'info_dict': {
3432 'title': 'Uploads from BlankTV',
3433 'id': 'UU8l9frL61Yl5KFOl87nIm2w',
3434 'uploader': 'BlankTV',
3435 'uploader_id': 'UC8l9frL61Yl5KFOl87nIm2w',
3436 },
3437 'playlist_mincount': 1000,
3438 }, {
3439 'note': 'https://github.com/ytdl-org/youtube-dl/issues/21844',
3440 'url': 'https://www.youtube.com/playlist?list=PLzH6n4zXuckpfMu_4Ff8E7Z1behQks5ba',
3441 'info_dict': {
3442 'title': 'Data Analysis with Dr Mike Pound',
3443 'id': 'PLzH6n4zXuckpfMu_4Ff8E7Z1behQks5ba',
3444 'uploader_id': 'UC9-y-6csu5WGm29I7JiwpnA',
3445 'uploader': 'Computerphile',
3446 'description': 'md5:7f567c574d13d3f8c0954d9ffee4e487',
3447 },
3448 'playlist_mincount': 11,
3449 }, {
3450 'url': 'https://invidio.us/playlist?list=PL4lCao7KL_QFVb7Iudeipvc2BCavECqzc',
3451 'only_matching': True,
3452 }, {
3453 'note': 'Playlist URL that does not actually serve a playlist',
3454 'url': 'https://www.youtube.com/watch?v=FqZTN594JQw&list=PLMYEtVRpaqY00V9W81Cwmzp6N6vZqfUKD4',
3455 'info_dict': {
3456 'id': 'FqZTN594JQw',
3457 'ext': 'webm',
3458 'title': "Smiley's People 01 detective, Adventure Series, Action",
3459 'uploader': 'STREEM',
3460 'uploader_id': 'UCyPhqAZgwYWZfxElWVbVJng',
3461 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UCyPhqAZgwYWZfxElWVbVJng',
3462 'upload_date': '20150526',
3463 'license': 'Standard YouTube License',
3464 'description': 'md5:507cdcb5a49ac0da37a920ece610be80',
3465 'categories': ['People & Blogs'],
3466 'tags': list,
3467 'view_count': int,
3468 'like_count': int,
3469 'dislike_count': int,
3470 },
3471 'params': {
3472 'skip_download': True,
3473 },
3474 'skip': 'This video is not available.',
3475 'add_ie': [YoutubeIE.ie_key()],
3476 }, {
3477 'url': 'https://www.youtubekids.com/watch?v=Agk7R8I8o5U&list=PUZ6jURNr1WQZCNHF0ao-c0g',
3478 'only_matching': True,
3479 }, {
3480 'url': 'https://www.youtube.com/watch?v=MuAGGZNfUkU&list=RDMM',
3481 'only_matching': True,
3482 }, {
3483 'url': 'https://www.youtube.com/channel/UCoMdktPbSTixAyNGwb-UYkQ/live',
3484 'info_dict': {
3485 'id': '3yImotZU3tw', # This will keep changing
3486 'ext': 'mp4',
3487 'title': compat_str,
3488 'uploader': 'Sky News',
3489 'uploader_id': 'skynews',
3490 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/skynews',
3491 'upload_date': r're:\d{8}',
3492 'description': compat_str,
3493 'categories': ['News & Politics'],
3494 'tags': list,
3495 'like_count': int,
3496 'dislike_count': int,
3497 },
3498 'params': {
3499 'skip_download': True,
3500 },
3501 'expected_warnings': ['Downloading just video ', 'Ignoring subtitle tracks found in '],
3502 }, {
3503 'url': 'https://www.youtube.com/user/TheYoungTurks/live',
3504 'info_dict': {
3505 'id': 'a48o2S1cPoo',
3506 'ext': 'mp4',
3507 'title': 'The Young Turks - Live Main Show',
3508 'uploader': 'The Young Turks',
3509 'uploader_id': 'TheYoungTurks',
3510 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/TheYoungTurks',
3511 'upload_date': '20150715',
3512 'license': 'Standard YouTube License',
3513 'description': 'md5:438179573adcdff3c97ebb1ee632b891',
3514 'categories': ['News & Politics'],
3515 'tags': ['Cenk Uygur (TV Program Creator)', 'The Young Turks (Award-Winning Work)', 'Talk Show (TV Genre)'],
3516 'like_count': int,
3517 'dislike_count': int,
3518 },
3519 'params': {
3520 'skip_download': True,
3521 },
3522 'only_matching': True,
3523 }, {
3524 'url': 'https://www.youtube.com/channel/UC1yBKRuGpC1tSM73A0ZjYjQ/live',
3525 'only_matching': True,
3526 }, {
3527 'url': 'https://www.youtube.com/c/CommanderVideoHq/live',
3528 'only_matching': True,
3529 }, {
3530 'note': 'A channel that is not live. Should raise error',
3531 'url': 'https://www.youtube.com/user/numberphile/live',
3532 'only_matching': True,
3533 }, {
3534 'url': 'https://www.youtube.com/feed/trending',
3535 'only_matching': True,
3536 }, {
3537 'url': 'https://www.youtube.com/feed/library',
3538 'only_matching': True,
3539 }, {
3540 'url': 'https://www.youtube.com/feed/history',
3541 'only_matching': True,
3542 }, {
3543 'url': 'https://www.youtube.com/feed/subscriptions',
3544 'only_matching': True,
3545 }, {
3546 'url': 'https://www.youtube.com/feed/watch_later',
3547 'only_matching': True,
3548 }, {
3549 'note': 'Recommended - redirects to home page',
3550 'url': 'https://www.youtube.com/feed/recommended',
3551 'only_matching': True,
3552 }, {
3553 'note': 'inline playlist with not always working continuations',
3554 'url': 'https://www.youtube.com/watch?v=UC6u0Tct-Fo&list=PL36D642111D65BE7C',
3555 'only_matching': True,
3556 }, {
3557 'url': 'https://www.youtube.com/course?list=ECUl4u3cNGP61MdtwGTqZA0MreSaDybji8',
3558 'only_matching': True,
3559 }, {
3560 'url': 'https://www.youtube.com/course',
3561 'only_matching': True,
3562 }, {
3563 'url': 'https://www.youtube.com/zsecurity',
3564 'only_matching': True,
3565 }, {
3566 'url': 'http://www.youtube.com/NASAgovVideo/videos',
3567 'only_matching': True,
3568 }, {
3569 'url': 'https://www.youtube.com/TheYoungTurks/live',
3570 'only_matching': True,
3571 }, {
3572 'url': 'https://www.youtube.com/hashtag/cctv9',
3573 'info_dict': {
3574 'id': 'cctv9',
3575 'title': '#cctv9',
3576 },
3577 'playlist_mincount': 350,
3578 }, {
3579 'url': 'https://www.youtube.com/watch?list=PLW4dVinRY435CBE_JD3t-0SRXKfnZHS1P&feature=youtu.be&v=M9cJMXmQ_ZU',
3580 'only_matching': True,
3581 }, {
3582 'note': 'Requires Premium: should request additional YTM-info webpage (and have format 141) for videos in playlist',
3583 'url': 'https://music.youtube.com/playlist?list=PLRBp0Fe2GpgmgoscNFLxNyBVSFVdYmFkq',
3584 'only_matching': True
3585 }, {
3586 'note': '/browse/ should redirect to /channel/',
3587 'url': 'https://music.youtube.com/browse/UC1a8OFewdjuLq6KlF8M_8Ng',
3588 'only_matching': True
3589 }, {
3590 'note': 'VLPL, should redirect to playlist?list=PL...',
3591 'url': 'https://music.youtube.com/browse/VLPLRBp0Fe2GpgmgoscNFLxNyBVSFVdYmFkq',
3592 'info_dict': {
3593 'id': 'PLRBp0Fe2GpgmgoscNFLxNyBVSFVdYmFkq',
3594 'uploader': 'NoCopyrightSounds',
3595 'description': 'Providing you with copyright free / safe music for gaming, live streaming, studying and more!',
3596 'uploader_id': 'UC_aEa8K-EOJ3D6gOs7HcyNg',
3597 'title': 'NCS Releases',
3598 },
3599 'playlist_mincount': 166,
3600 }, {
3601 'note': 'Topic, should redirect to playlist?list=UU...',
3602 'url': 'https://music.youtube.com/browse/UC9ALqqC4aIeG5iDs7i90Bfw',
3603 'info_dict': {
3604 'id': 'UU9ALqqC4aIeG5iDs7i90Bfw',
3605 'uploader_id': 'UC9ALqqC4aIeG5iDs7i90Bfw',
3606 'title': 'Uploads from Royalty Free Music - Topic',
3607 'uploader': 'Royalty Free Music - Topic',
3608 },
3609 'expected_warnings': [
3610 'A channel/user page was given',
3611 'The URL does not have a videos tab',
3612 ],
3613 'playlist_mincount': 101,
3614 }, {
3615 'note': 'Topic without a UU playlist',
3616 'url': 'https://www.youtube.com/channel/UCtFRv9O2AHqOZjjynzrv-xg',
3617 'info_dict': {
3618 'id': 'UCtFRv9O2AHqOZjjynzrv-xg',
3619 'title': 'UCtFRv9O2AHqOZjjynzrv-xg',
3620 },
3621 'expected_warnings': [
3622 'A channel/user page was given',
3623 'The URL does not have a videos tab',
3624 'Falling back to channel URL',
3625 ],
3626 'playlist_mincount': 9,
3627 }, {
3628 'note': 'Youtube music Album',
3629 'url': 'https://music.youtube.com/browse/MPREb_gTAcphH99wE',
3630 'info_dict': {
3631 'id': 'OLAK5uy_l1m0thk3g31NmIIz_vMIbWtyv7eZixlH0',
3632 'title': 'Album - Royalty Free Music Library V2 (50 Songs)',
3633 },
3634 'playlist_count': 50,
3635 }, {
3636 'note': 'unlisted single video playlist',
3637 'url': 'https://www.youtube.com/playlist?list=PLwL24UFy54GrB3s2KMMfjZscDi1x5Dajf',
3638 'info_dict': {
3639 'uploader_id': 'UC9zHu_mHU96r19o-wV5Qs1Q',
3640 'uploader': 'colethedj',
3641 'id': 'PLwL24UFy54GrB3s2KMMfjZscDi1x5Dajf',
3642 'title': 'yt-dlp unlisted playlist test',
3643 'availability': 'unlisted'
3644 },
3645 'playlist_count': 1,
3646 }]
3647
3648 @classmethod
3649 def suitable(cls, url):
3650 return False if YoutubeIE.suitable(url) else super(
3651 YoutubeTabIE, cls).suitable(url)
3652
3653 def _extract_channel_id(self, webpage):
3654 channel_id = self._html_search_meta(
3655 'channelId', webpage, 'channel id', default=None)
3656 if channel_id:
3657 return channel_id
3658 channel_url = self._html_search_meta(
3659 ('og:url', 'al:ios:url', 'al:android:url', 'al:web:url',
3660 'twitter:url', 'twitter:app:url:iphone', 'twitter:app:url:ipad',
3661 'twitter:app:url:googleplay'), webpage, 'channel url')
3662 return self._search_regex(
3663 r'https?://(?:www\.)?youtube\.com/channel/([^/?#&])+',
3664 channel_url, 'channel id')
3665
3666 @staticmethod
3667 def _extract_basic_item_renderer(item):
3668 # Modified from _extract_grid_item_renderer
3669 known_basic_renderers = (
3670 'playlistRenderer', 'videoRenderer', 'channelRenderer', 'showRenderer'
3671 )
3672 for key, renderer in item.items():
3673 if not isinstance(renderer, dict):
3674 continue
3675 elif key in known_basic_renderers:
3676 return renderer
3677 elif key.startswith('grid') and key.endswith('Renderer'):
3678 return renderer
3679
    def _grid_entries(self, grid_renderer):
        """Yield url_result/video entries for every item of a gridRenderer.

        Each item is unwrapped via _extract_basic_item_renderer and then
        dispatched on which id field it carries (playlistId, videoId,
        channelId, in that order); items with none of them fall back to
        their navigation endpoint URL.
        """
        for item in grid_renderer['items']:
            if not isinstance(item, dict):
                continue
            renderer = self._extract_basic_item_renderer(item)
            if not isinstance(renderer, dict):
                continue
            title = self._get_text(renderer, 'title')

            # playlist
            playlist_id = renderer.get('playlistId')
            if playlist_id:
                yield self.url_result(
                    'https://www.youtube.com/playlist?list=%s' % playlist_id,
                    ie=YoutubeTabIE.ie_key(), video_id=playlist_id,
                    video_title=title)
                continue
            # video
            video_id = renderer.get('videoId')
            if video_id:
                yield self._extract_video(renderer)
                continue
            # channel
            channel_id = renderer.get('channelId')
            if channel_id:
                yield self.url_result(
                    'https://www.youtube.com/channel/%s' % channel_id,
                    ie=YoutubeTabIE.ie_key(), video_title=title)
                continue
            # generic endpoint URL support: the first extractor in the tuple
            # that accepts the URL wins
            ep_url = urljoin('https://www.youtube.com/', try_get(
                renderer, lambda x: x['navigationEndpoint']['commandMetadata']['webCommandMetadata']['url'],
                compat_str))
            if ep_url:
                for ie in (YoutubeTabIE, YoutubePlaylistIE, YoutubeIE):
                    if ie.suitable(ep_url):
                        yield self.url_result(
                            ep_url, ie=ie.ie_key(), video_id=ie._match_id(ep_url), video_title=title)
                        break
3719
3720 def _shelf_entries_from_content(self, shelf_renderer):
3721 content = shelf_renderer.get('content')
3722 if not isinstance(content, dict):
3723 return
3724 renderer = content.get('gridRenderer') or content.get('expandedShelfContentsRenderer')
3725 if renderer:
3726 # TODO: add support for nested playlists so each shelf is processed
3727 # as separate playlist
3728 # TODO: this includes only first N items
3729 for entry in self._grid_entries(renderer):
3730 yield entry
3731 renderer = content.get('horizontalListRenderer')
3732 if renderer:
3733 # TODO
3734 pass
3735
3736 def _shelf_entries(self, shelf_renderer, skip_channels=False):
3737 ep = try_get(
3738 shelf_renderer, lambda x: x['endpoint']['commandMetadata']['webCommandMetadata']['url'],
3739 compat_str)
3740 shelf_url = urljoin('https://www.youtube.com', ep)
3741 if shelf_url:
3742 # Skipping links to another channels, note that checking for
3743 # endpoint.commandMetadata.webCommandMetadata.webPageTypwebPageType == WEB_PAGE_TYPE_CHANNEL
3744 # will not work
3745 if skip_channels and '/channels?' in shelf_url:
3746 return
3747 title = self._get_text(shelf_renderer, 'title')
3748 yield self.url_result(shelf_url, video_title=title)
3749 # Shelf may not contain shelf URL, fallback to extraction from content
3750 for entry in self._shelf_entries_from_content(shelf_renderer):
3751 yield entry
3752
3753 def _playlist_entries(self, video_list_renderer):
3754 for content in video_list_renderer['contents']:
3755 if not isinstance(content, dict):
3756 continue
3757 renderer = content.get('playlistVideoRenderer') or content.get('playlistPanelVideoRenderer')
3758 if not isinstance(renderer, dict):
3759 continue
3760 video_id = renderer.get('videoId')
3761 if not video_id:
3762 continue
3763 yield self._extract_video(renderer)
3764
3765 def _rich_entries(self, rich_grid_renderer):
3766 renderer = try_get(
3767 rich_grid_renderer, lambda x: x['content']['videoRenderer'], dict) or {}
3768 video_id = renderer.get('videoId')
3769 if not video_id:
3770 return
3771 yield self._extract_video(renderer)
3772
3773 def _video_entry(self, video_renderer):
3774 video_id = video_renderer.get('videoId')
3775 if video_id:
3776 return self._extract_video(video_renderer)
3777
    def _post_thread_entries(self, post_thread_renderer):
        """Yield entries for a community post: its video attachment, its
        playlist attachment, and any video links inside the post text
        (skipping the link that duplicates the video attachment)."""
        post_renderer = try_get(
            post_thread_renderer, lambda x: x['post']['backstagePostRenderer'], dict)
        if not post_renderer:
            return
        # video attachment
        video_renderer = try_get(
            post_renderer, lambda x: x['backstageAttachment']['videoRenderer'], dict) or {}
        video_id = video_renderer.get('videoId')
        if video_id:
            entry = self._extract_video(video_renderer)
            if entry:
                yield entry
        # playlist attachment
        playlist_id = try_get(
            post_renderer, lambda x: x['backstageAttachment']['playlistRenderer']['playlistId'], compat_str)
        if playlist_id:
            yield self.url_result(
                'https://www.youtube.com/playlist?list=%s' % playlist_id,
                ie=YoutubeTabIE.ie_key(), video_id=playlist_id)
        # inline video links in the post text
        runs = try_get(post_renderer, lambda x: x['contentText']['runs'], list) or []
        for run in runs:
            if not isinstance(run, dict):
                continue
            ep_url = try_get(
                run, lambda x: x['navigationEndpoint']['urlEndpoint']['url'], compat_str)
            if not ep_url:
                continue
            if not YoutubeIE.suitable(ep_url):
                continue
            ep_video_id = YoutubeIE._match_id(ep_url)
            if video_id == ep_video_id:
                # already yielded above as the video attachment
                continue
            yield self.url_result(ep_url, ie=YoutubeIE.ie_key(), video_id=ep_video_id)
3813
3814 def _post_thread_continuation_entries(self, post_thread_continuation):
3815 contents = post_thread_continuation.get('contents')
3816 if not isinstance(contents, list):
3817 return
3818 for content in contents:
3819 renderer = content.get('backstagePostThreadRenderer')
3820 if not isinstance(renderer, dict):
3821 continue
3822 for entry in self._post_thread_entries(renderer):
3823 yield entry
3824
3825 r''' # unused
3826 def _rich_grid_entries(self, contents):
3827 for content in contents:
3828 video_renderer = try_get(content, lambda x: x['richItemRenderer']['content']['videoRenderer'], dict)
3829 if video_renderer:
3830 entry = self._video_entry(video_renderer)
3831 if entry:
3832 yield entry
3833 '''
    def _entries(self, tab, item_id, identity_token, account_syncid, ytcfg):
        """Yield all entries of a tab, following continuations across
        successive InnerTube API pages until no continuation remains.
        """

        def extract_entries(parent_renderer):  # this needs to called again for continuation to work with feeds
            contents = try_get(parent_renderer, lambda x: x['contents'], list) or []
            for content in contents:
                if not isinstance(content, dict):
                    continue
                is_renderer = try_get(content, lambda x: x['itemSectionRenderer'], dict)
                if not is_renderer:
                    # not an item section: may still be a rich grid item
                    renderer = content.get('richItemRenderer')
                    if renderer:
                        for entry in self._rich_entries(renderer):
                            yield entry
                        continuation_list[0] = self._extract_continuation(parent_renderer)
                    continue
                isr_contents = try_get(is_renderer, lambda x: x['contents'], list) or []
                for isr_content in isr_contents:
                    if not isinstance(isr_content, dict):
                        continue

                    # dispatch on the first known renderer key found
                    known_renderers = {
                        'playlistVideoListRenderer': self._playlist_entries,
                        'gridRenderer': self._grid_entries,
                        'shelfRenderer': lambda x: self._shelf_entries(x, tab.get('title') != 'Channels'),
                        'backstagePostThreadRenderer': self._post_thread_entries,
                        'videoRenderer': lambda x: [self._video_entry(x)],
                    }
                    for key, renderer in isr_content.items():
                        if key not in known_renderers:
                            continue
                        for entry in known_renderers[key](renderer):
                            if entry:
                                yield entry
                        continuation_list[0] = self._extract_continuation(renderer)
                        break

                    if not continuation_list[0]:
                        continuation_list[0] = self._extract_continuation(is_renderer)

            if not continuation_list[0]:
                continuation_list[0] = self._extract_continuation(parent_renderer)

        # single-element list used as a writable cell because Python 2
        # does not support nonlocal
        continuation_list = [None]
        tab_content = try_get(tab, lambda x: x['content'], dict)
        if not tab_content:
            return
        parent_renderer = (
            try_get(tab_content, lambda x: x['sectionListRenderer'], dict)
            or try_get(tab_content, lambda x: x['richGridRenderer'], dict) or {})
        for entry in extract_entries(parent_renderer):
            yield entry
        continuation = continuation_list[0]
        visitor_data = None

        for page_num in itertools.count(1):
            if not continuation:
                break
            headers = self.generate_api_headers(ytcfg, identity_token, account_syncid, visitor_data)
            response = self._extract_response(
                item_id='%s page %s' % (item_id, page_num),
                query=continuation, headers=headers, ytcfg=ytcfg,
                check_get_keys=('continuationContents', 'onResponseReceivedActions', 'onResponseReceivedEndpoints'))

            if not response:
                break
            # carry the visitorData forward so subsequent requests stay
            # on the same session
            visitor_data = try_get(
                response, lambda x: x['responseContext']['visitorData'], compat_str) or visitor_data

            # first response shape: 'continuationContents'
            known_continuation_renderers = {
                'playlistVideoListContinuation': self._playlist_entries,
                'gridContinuation': self._grid_entries,
                'itemSectionContinuation': self._post_thread_continuation_entries,
                'sectionListContinuation': extract_entries,  # for feeds
            }
            continuation_contents = try_get(
                response, lambda x: x['continuationContents'], dict) or {}
            continuation_renderer = None
            for key, value in continuation_contents.items():
                if key not in known_continuation_renderers:
                    continue
                continuation_renderer = value
                continuation_list = [None]
                for entry in known_continuation_renderers[key](continuation_renderer):
                    yield entry
                continuation = continuation_list[0] or self._extract_continuation(continuation_renderer)
                break
            if continuation_renderer:
                continue

            # second response shape: 'onResponseReceived*' with appended
            # continuation items; each handler expects them wrapped under
            # a specific key
            known_renderers = {
                'gridPlaylistRenderer': (self._grid_entries, 'items'),
                'gridVideoRenderer': (self._grid_entries, 'items'),
                'gridChannelRenderer': (self._grid_entries, 'items'),
                'playlistVideoRenderer': (self._playlist_entries, 'contents'),
                'itemSectionRenderer': (extract_entries, 'contents'),  # for feeds
                'richItemRenderer': (extract_entries, 'contents'),  # for hashtag
                'backstagePostThreadRenderer': (self._post_thread_continuation_entries, 'contents')
            }
            on_response_received = dict_get(response, ('onResponseReceivedActions', 'onResponseReceivedEndpoints'))
            continuation_items = try_get(
                on_response_received, lambda x: x[0]['appendContinuationItemsAction']['continuationItems'], list)
            continuation_item = try_get(continuation_items, lambda x: x[0], dict) or {}
            video_items_renderer = None
            for key, value in continuation_item.items():
                if key not in known_renderers:
                    continue
                video_items_renderer = {known_renderers[key][1]: continuation_items}
                continuation_list = [None]
                for entry in known_renderers[key][0](video_items_renderer):
                    yield entry
                continuation = continuation_list[0] or self._extract_continuation(video_items_renderer)
                break
            if video_items_renderer:
                continue
            # neither response shape matched: stop paging
            break
3949
3950 @staticmethod
3951 def _extract_selected_tab(tabs):
3952 for tab in tabs:
3953 renderer = dict_get(tab, ('tabRenderer', 'expandableTabRenderer')) or {}
3954 if renderer.get('selected') is True:
3955 return renderer
3956 else:
3957 raise ExtractorError('Unable to find selected tab')
3958
3959 @classmethod
3960 def _extract_uploader(cls, data):
3961 uploader = {}
3962 renderer = cls._extract_sidebar_info_renderer(data, 'playlistSidebarSecondaryInfoRenderer') or {}
3963 owner = try_get(
3964 renderer, lambda x: x['videoOwner']['videoOwnerRenderer']['title']['runs'][0], dict)
3965 if owner:
3966 uploader['uploader'] = owner.get('text')
3967 uploader['uploader_id'] = try_get(
3968 owner, lambda x: x['navigationEndpoint']['browseEndpoint']['browseId'], compat_str)
3969 uploader['uploader_url'] = urljoin(
3970 'https://www.youtube.com/',
3971 try_get(owner, lambda x: x['navigationEndpoint']['browseEndpoint']['canonicalBaseUrl'], compat_str))
3972 return {k: v for k, v in uploader.items() if v is not None}
3973
    def _extract_from_tabs(self, item_id, webpage, data, tabs):
        """Build the playlist result for a tabbed (channel/playlist) page.

        Metadata comes from channelMetadataRenderer when present, otherwise
        from playlistMetadataRenderer; the entries come from the currently
        selected tab.
        """
        playlist_id = title = description = channel_url = channel_name = channel_id = None
        thumbnails_list = tags = []

        selected_tab = self._extract_selected_tab(tabs)
        renderer = try_get(
            data, lambda x: x['metadata']['channelMetadataRenderer'], dict)
        if renderer:
            channel_name = renderer.get('title')
            channel_url = renderer.get('channelUrl')
            channel_id = renderer.get('externalId')
        else:
            renderer = try_get(
                data, lambda x: x['metadata']['playlistMetadataRenderer'], dict)

        if renderer:
            title = renderer.get('title')
            description = renderer.get('description', '')
            # channel pages: the channel id doubles as the playlist id;
            # pure playlist pages leave channel_id as None here
            playlist_id = channel_id
            tags = renderer.get('keywords', '').split()
            # channel avatar thumbnails, or the sidebar's playlist video
            # thumbnail as fallback
            thumbnails_list = (
                try_get(renderer, lambda x: x['avatar']['thumbnails'], list)
                or try_get(
                    self._extract_sidebar_info_renderer(data, 'playlistSidebarPrimaryInfoRenderer'),
                    lambda x: x['thumbnailRenderer']['playlistVideoThumbnailRenderer']['thumbnail']['thumbnails'],
                    list)
                or [])

        thumbnails = []
        for t in thumbnails_list:
            if not isinstance(t, dict):
                continue
            thumbnail_url = url_or_none(t.get('url'))
            if not thumbnail_url:
                continue
            thumbnails.append({
                'url': thumbnail_url,
                'width': int_or_none(t.get('width')),
                'height': int_or_none(t.get('height')),
            })
        if playlist_id is None:
            playlist_id = item_id
        if title is None:
            # hashtag pages carry their title in hashtagHeaderRenderer
            title = (
                try_get(data, lambda x: x['header']['hashtagHeaderRenderer']['hashtag']['simpleText'])
                or playlist_id)
        title += format_field(selected_tab, 'title', ' - %s')
        title += format_field(selected_tab, 'expandedText', ' - %s')
        metadata = {
            'playlist_id': playlist_id,
            'playlist_title': title,
            'playlist_description': description,
            'uploader': channel_name,
            'uploader_id': channel_id,
            'uploader_url': channel_url,
            'thumbnails': thumbnails,
            'tags': tags,
        }
        availability = self._extract_availability(data)
        if availability:
            metadata['availability'] = availability
        if not channel_id:
            # playlist pages: resolve the uploader from the sidebar instead
            metadata.update(self._extract_uploader(data))
        metadata.update({
            'channel': metadata['uploader'],
            'channel_id': metadata['uploader_id'],
            'channel_url': metadata['uploader_url']})
        ytcfg = self.extract_ytcfg(item_id, webpage)
        return self.playlist_result(
            self._entries(
                selected_tab, playlist_id,
                self._extract_identity_token(webpage, item_id),
                self._extract_account_syncid(ytcfg, data), ytcfg),
            **metadata)
4048
    def _extract_mix_playlist(self, playlist, playlist_id, data, webpage):
        """Yield videos of a Mix playlist, page by page.

        Mixes repeat indefinitely; extraction stops when a page yields no
        new videos or when the very first video comes around again.
        """
        first_id = last_id = None
        ytcfg = self.extract_ytcfg(playlist_id, webpage)
        headers = self.generate_api_headers(
            ytcfg, account_syncid=self._extract_account_syncid(ytcfg, data),
            identity_token=self._extract_identity_token(webpage, item_id=playlist_id))
        for page_num in itertools.count(1):
            videos = list(self._playlist_entries(playlist))
            if not videos:
                return
            # skip past the last video yielded from the previous page
            start = next((i for i, v in enumerate(videos) if v['id'] == last_id), -1) + 1
            if start >= len(videos):
                return
            for video in videos[start:]:
                if video['id'] == first_id:
                    self.to_screen('First video %s found again; Assuming end of Mix' % first_id)
                    return
                yield video
            first_id = first_id or videos[0]['id']
            last_id = videos[-1]['id']
            # NOTE(review): watch_endpoint may be None if the last playlist
            # entry lacks a watchEndpoint; .get() below would then raise —
            # confirm the API always provides it
            watch_endpoint = try_get(
                playlist, lambda x: x['contents'][-1]['playlistPanelVideoRenderer']['navigationEndpoint']['watchEndpoint'])
            query = {
                'playlistId': playlist_id,
                'videoId': watch_endpoint.get('videoId') or last_id,
                'index': watch_endpoint.get('index') or len(videos),
                'params': watch_endpoint.get('params') or 'OAE%3D'
            }
            response = self._extract_response(
                item_id='%s page %d' % (playlist_id, page_num),
                query=query, ep='next', headers=headers, ytcfg=ytcfg,
                check_get_keys='contents'
            )
            playlist = try_get(
                response, lambda x: x['contents']['twoColumnWatchNextResults']['playlist']['playlist'], dict)
4084
4085 def _extract_from_playlist(self, item_id, url, data, playlist, webpage):
4086 title = playlist.get('title') or try_get(
4087 data, lambda x: x['titleText']['simpleText'], compat_str)
4088 playlist_id = playlist.get('playlistId') or item_id
4089
4090 # Delegating everything except mix playlists to regular tab-based playlist URL
4091 playlist_url = urljoin(url, try_get(
4092 playlist, lambda x: x['endpoint']['commandMetadata']['webCommandMetadata']['url'],
4093 compat_str))
4094 if playlist_url and playlist_url != url:
4095 return self.url_result(
4096 playlist_url, ie=YoutubeTabIE.ie_key(), video_id=playlist_id,
4097 video_title=title)
4098
4099 return self.playlist_result(
4100 self._extract_mix_playlist(playlist, playlist_id, data, webpage),
4101 playlist_id=playlist_id, playlist_title=title)
4102
4103 def _extract_availability(self, data):
4104 """
4105 Gets the availability of a given playlist/tab.
4106 Note: Unless YouTube tells us explicitly, we do not assume it is public
4107 @param data: response
4108 """
4109 is_private = is_unlisted = None
4110 renderer = self._extract_sidebar_info_renderer(data, 'playlistSidebarPrimaryInfoRenderer') or {}
4111 badge_labels = self._extract_badges(renderer)
4112
4113 # Personal playlists, when authenticated, have a dropdown visibility selector instead of a badge
4114 privacy_dropdown_entries = try_get(
4115 renderer, lambda x: x['privacyForm']['dropdownFormFieldRenderer']['dropdown']['dropdownRenderer']['entries'], list) or []
4116 for renderer_dict in privacy_dropdown_entries:
4117 is_selected = try_get(
4118 renderer_dict, lambda x: x['privacyDropdownItemRenderer']['isSelected'], bool) or False
4119 if not is_selected:
4120 continue
4121 label = self._get_text(renderer_dict, ('privacyDropdownItemRenderer', 'label'))
4122 if label:
4123 badge_labels.add(label.lower())
4124 break
4125
4126 for badge_label in badge_labels:
4127 if badge_label == 'unlisted':
4128 is_unlisted = True
4129 elif badge_label == 'private':
4130 is_private = True
4131 elif badge_label == 'public':
4132 is_unlisted = is_private = False
4133 return self._availability(is_private, False, False, False, is_unlisted)
4134
4135 @staticmethod
4136 def _extract_sidebar_info_renderer(data, info_renderer, expected_type=dict):
4137 sidebar_renderer = try_get(
4138 data, lambda x: x['sidebar']['playlistSidebarRenderer']['items'], list) or []
4139 for item in sidebar_renderer:
4140 renderer = try_get(item, lambda x: x[info_renderer], expected_type)
4141 if renderer:
4142 return renderer
4143
    def _reload_with_unavailable_videos(self, item_id, data, webpage):
        """
        Get playlist with unavailable videos if the 'show unavailable videos' button exists.
        """
        browse_id = params = None
        renderer = self._extract_sidebar_info_renderer(data, 'playlistSidebarPrimaryInfoRenderer')
        if not renderer:
            return
        # look for the 'show unavailable videos' entry in the sidebar menu
        # to pick up its browseId/params
        menu_renderer = try_get(
            renderer, lambda x: x['menu']['menuRenderer']['items'], list) or []
        for menu_item in menu_renderer:
            if not isinstance(menu_item, dict):
                continue
            nav_item_renderer = menu_item.get('menuNavigationItemRenderer')
            text = try_get(
                nav_item_renderer, lambda x: x['text']['simpleText'], compat_str)
            if not text or text.lower() != 'show unavailable videos':
                continue
            browse_endpoint = try_get(
                nav_item_renderer, lambda x: x['navigationEndpoint']['browseEndpoint'], dict) or {}
            browse_id = browse_endpoint.get('browseId')
            params = browse_endpoint.get('params')
            break

        ytcfg = self.extract_ytcfg(item_id, webpage)
        headers = self.generate_api_headers(
            ytcfg, account_syncid=self._extract_account_syncid(ytcfg, data),
            identity_token=self._extract_identity_token(webpage, item_id=item_id),
            visitor_data=try_get(
                self._extract_context(ytcfg), lambda x: x['client']['visitorData'], compat_str))
        # fall back to default params/browseId when the menu entry did not
        # provide them
        query = {
            'params': params or 'wgYCCAA=',
            'browseId': browse_id or 'VL%s' % item_id
        }
        return self._extract_response(
            item_id=item_id, headers=headers, query=query,
            check_get_keys='contents', fatal=False, ytcfg=ytcfg,
            note='Downloading API JSON with unavailable videos')
4182
4183 def _extract_webpage(self, url, item_id):
4184 retries = self.get_param('extractor_retries', 3)
4185 count = -1
4186 last_error = 'Incomplete yt initial data recieved'
4187 while count < retries:
4188 count += 1
4189 # Sometimes youtube returns a webpage with incomplete ytInitialData
4190 # See: https://github.com/yt-dlp/yt-dlp/issues/116
4191 if count:
4192 self.report_warning('%s. Retrying ...' % last_error)
4193 webpage = self._download_webpage(
4194 url, item_id,
4195 'Downloading webpage%s' % (' (retry #%d)' % count if count else ''))
4196 data = self.extract_yt_initial_data(item_id, webpage)
4197 if data.get('contents') or data.get('currentVideoEndpoint'):
4198 break
4199 # Extract alerts here only when there is error
4200 self._extract_and_report_alerts(data)
4201 if count >= retries:
4202 raise ExtractorError(last_error)
4203 return webpage, data
4204
4205 @staticmethod
4206 def _smuggle_data(entries, data):
4207 for entry in entries:
4208 if data:
4209 entry['url'] = smuggle_url(entry['url'], data)
4210 yield entry
4211
4212 def _real_extract(self, url):
4213 url, smuggled_data = unsmuggle_url(url, {})
4214 if self.is_music_url(url):
4215 smuggled_data['is_music_url'] = True
4216 info_dict = self.__real_extract(url, smuggled_data)
4217 if info_dict.get('entries'):
4218 info_dict['entries'] = self._smuggle_data(info_dict['entries'], smuggled_data)
4219 return info_dict
4220
    # Splits a matched URL into (pre)(tab)(post). The conditional group only
    # captures '/tab' when _VALID_URL's 'channel_type' group matched; the
    # groups are consumed by get_mobj() in __real_extract below.
    _url_re = re.compile(r'(?P<pre>%s)(?(channel_type)(?P<tab>/\w+))?(?P<post>.*)$' % _VALID_URL)
4222
    def __real_extract(self, url, smuggled_data):
        """
        Core tab-page extractor: normalize the URL (host, music variants,
        channel home redirect), download the page data, then dispatch to the
        tabs / playlist / single-video handlers depending on what is present.
        """
        item_id = self._match_id(url)
        # Force the canonical host before any further URL processing
        url = compat_urlparse.urlunparse(
            compat_urlparse.urlparse(url)._replace(netloc='www.youtube.com'))
        compat_opts = self.get_param('compat_opts', [])

        def get_mobj(url):
            # Split URL into pre/tab/post via _url_re; unmatched groups become ''
            mobj = self._url_re.match(url).groupdict()
            mobj.update((k, '') for k, v in mobj.items() if v is None)
            return mobj

        mobj = get_mobj(url)
        # Youtube returns incomplete data if tabname is not lower case
        pre, tab, post, is_channel = mobj['pre'], mobj['tab'].lower(), mobj['post'], not mobj['not_channel']

        if is_channel:
            if smuggled_data.get('is_music_url'):
                if item_id[:2] == 'VL':
                    # Youtube music VL channels have an equivalent playlist
                    item_id = item_id[2:]
                    pre, tab, post, is_channel = 'https://www.youtube.com/playlist?list=%s' % item_id, '', '', False
                elif item_id[:2] == 'MP':
                    # Youtube music albums (/channel/MP...) have a OLAK playlist that can be extracted from the webpage
                    item_id = self._search_regex(
                        r'\\x22audioPlaylistId\\x22:\\x22([0-9A-Za-z_-]+)\\x22',
                        self._download_webpage('https://music.youtube.com/channel/%s' % item_id, item_id),
                        'playlist id')
                    pre, tab, post, is_channel = 'https://www.youtube.com/playlist?list=%s' % item_id, '', '', False
                elif mobj['channel_type'] == 'browse':
                    # Youtube music /browse/ should be changed to /channel/
                    pre = 'https://www.youtube.com/channel/%s' % item_id
        if is_channel and not tab and 'no-youtube-channel-redirect' not in compat_opts:
            # Home URLs should redirect to /videos/
            self.report_warning(
                'A channel/user page was given. All the channel\'s videos will be downloaded. '
                'To download only the videos in the home page, add a "/featured" to the URL')
            tab = '/videos'

        # Rebuild the (possibly rewritten) URL and re-parse it
        url = ''.join((pre, tab, post))
        mobj = get_mobj(url)

        # Handle both video/playlist URLs
        qs = parse_qs(url)
        video_id = qs.get('v', [None])[0]
        playlist_id = qs.get('list', [None])[0]

        if not video_id and mobj['not_channel'].startswith('watch'):
            if not playlist_id:
                # If there is neither video or playlist ids, youtube redirects to home page, which is undesirable
                raise ExtractorError('Unable to recognize tab page')
            # Common mistake: https://www.youtube.com/watch?list=playlist_id
            self.report_warning('A video URL was given without video ID. Trying to download playlist %s' % playlist_id)
            url = 'https://www.youtube.com/playlist?list=%s' % playlist_id
            mobj = get_mobj(url)

        if video_id and playlist_id:
            if self.get_param('noplaylist'):
                self.to_screen('Downloading just video %s because of --no-playlist' % video_id)
                return self.url_result(video_id, ie=YoutubeIE.ie_key(), video_id=video_id)
            self.to_screen('Downloading playlist %s; add --no-playlist to just download video %s' % (playlist_id, video_id))

        webpage, data = self._extract_webpage(url, item_id)

        tabs = try_get(
            data, lambda x: x['contents']['twoColumnBrowseResultsRenderer']['tabs'], list)
        if tabs:
            selected_tab = self._extract_selected_tab(tabs)
            tab_name = selected_tab.get('title', '')
            if 'no-youtube-channel-redirect' not in compat_opts:
                if mobj['tab'] == '/live':
                    # Live tab should have redirected to the video
                    raise ExtractorError('The channel is not currently live', expected=True)
                if mobj['tab'] == '/videos' and tab_name.lower() != mobj['tab'][1:]:
                    if not mobj['not_channel'] and item_id[:2] == 'UC':
                        # Topic channels don't have /videos. Use the equivalent playlist instead
                        self.report_warning('The URL does not have a %s tab. Trying to redirect to playlist UU%s instead' % (mobj['tab'][1:], item_id[2:]))
                        pl_id = 'UU%s' % item_id[2:]
                        pl_url = 'https://www.youtube.com/playlist?list=%s%s' % (pl_id, mobj['post'])
                        try:
                            pl_webpage, pl_data = self._extract_webpage(pl_url, pl_id)
                            for alert_type, alert_message in self._extract_alerts(pl_data):
                                if alert_type == 'error':
                                    raise ExtractorError('Youtube said: %s' % alert_message)
                            # Playlist is valid: switch everything over to it
                            item_id, url, webpage, data = pl_id, pl_url, pl_webpage, pl_data
                        except ExtractorError:
                            self.report_warning('The playlist gave error. Falling back to channel URL')
                    else:
                        self.report_warning('The URL does not have a %s tab. %s is being downloaded instead' % (mobj['tab'][1:], tab_name))

        self.write_debug('Final URL: %s' % url)

        # YouTube sometimes provides a button to reload playlist with unavailable videos.
        if 'no-youtube-unavailable-videos' not in compat_opts:
            data = self._reload_with_unavailable_videos(item_id, data, webpage) or data
        self._extract_and_report_alerts(data)
        # Re-read tabs: data may have been replaced by the reload above
        tabs = try_get(
            data, lambda x: x['contents']['twoColumnBrowseResultsRenderer']['tabs'], list)
        if tabs:
            return self._extract_from_tabs(item_id, webpage, data, tabs)

        playlist = try_get(
            data, lambda x: x['contents']['twoColumnWatchNextResults']['playlist']['playlist'], dict)
        if playlist:
            return self._extract_from_playlist(item_id, url, data, playlist, webpage)

        video_id = try_get(
            data, lambda x: x['currentVideoEndpoint']['watchEndpoint']['videoId'],
            compat_str) or video_id
        if video_id:
            if mobj['tab'] != '/live':  # live tab is expected to redirect to video
                self.report_warning('Unable to recognize playlist. Downloading just video %s' % video_id)
            return self.url_result(video_id, ie=YoutubeIE.ie_key(), video_id=video_id)

        raise ExtractorError('Unable to recognize tab page')
4337
4338
class YoutubePlaylistIE(InfoExtractor):
    IE_DESC = 'YouTube.com playlists'
    _VALID_URL = r'''(?x)(?:
                        (?:https?://)?
                        (?:\w+\.)?
                        (?:
                            (?:
                                youtube(?:kids)?\.com|
                                invidio\.us
                            )
                            /.*?\?.*?\blist=
                        )?
                        (?P<id>%(playlist_id)s)
                     )''' % {'playlist_id': YoutubeBaseInfoExtractor._PLAYLIST_ID_RE}
    IE_NAME = 'youtube:playlist'
    _TESTS = [{
        'note': 'issue #673',
        'url': 'PLBB231211A4F62143',
        'info_dict': {
            'title': '[OLD]Team Fortress 2 (Class-based LP)',
            'id': 'PLBB231211A4F62143',
            'uploader': 'Wickydoo',
            'uploader_id': 'UCKSpbfbl5kRQpTdL7kMc-1Q',
            'description': 'md5:8fa6f52abb47a9552002fa3ddfc57fc2',
        },
        'playlist_mincount': 29,
    }, {
        'url': 'PLtPgu7CB4gbY9oDN3drwC3cMbJggS7dKl',
        'info_dict': {
            'title': 'YDL_safe_search',
            'id': 'PLtPgu7CB4gbY9oDN3drwC3cMbJggS7dKl',
        },
        'playlist_count': 2,
        'skip': 'This playlist is private',
    }, {
        'note': 'embedded',
        'url': 'https://www.youtube.com/embed/videoseries?list=PL6IaIsEjSbf96XFRuNccS_RuEXwNdsoEu',
        'playlist_count': 4,
        'info_dict': {
            'title': 'JODA15',
            'id': 'PL6IaIsEjSbf96XFRuNccS_RuEXwNdsoEu',
            'uploader': 'milan',
            'uploader_id': 'UCEI1-PVPcYXjB73Hfelbmaw',
        }
    }, {
        'url': 'http://www.youtube.com/embed/_xDOZElKyNU?list=PLsyOSbh5bs16vubvKePAQ1x3PhKavfBIl',
        'playlist_mincount': 654,
        'info_dict': {
            'title': '2018 Chinese New Singles (11/6 updated)',
            'id': 'PLsyOSbh5bs16vubvKePAQ1x3PhKavfBIl',
            'uploader': 'LBK',
            'uploader_id': 'UC21nz3_MesPLqtDqwdvnoxA',
            'description': 'md5:da521864744d60a198e3a88af4db0d9d',
        }
    }, {
        'url': 'TLGGrESM50VT6acwMjAyMjAxNw',
        'only_matching': True,
    }, {
        # music album playlist
        'url': 'OLAK5uy_m4xAFdmMC5rX3Ji3g93pQe3hqLZw_9LhM',
        'only_matching': True,
    }]

    @classmethod
    def suitable(cls, url):
        # Anything the tab extractor handles is not ours
        if YoutubeTabIE.suitable(url):
            return False
        # Hack for lazy extractors until more generic solution is implemented
        # (see #28780)
        from .youtube import parse_qs
        has_video_id = parse_qs(url).get('v', [None])[0]
        return False if has_video_id else super(YoutubePlaylistIE, cls).suitable(url)

    def _real_extract(self, url):
        playlist_id = self._match_id(url)
        # Record the music-domain hint before the URL is rewritten below
        from_music_domain = YoutubeBaseInfoExtractor.is_music_url(url)
        canonical = update_url_query(
            'https://www.youtube.com/playlist',
            parse_qs(url) or {'list': playlist_id})
        if from_music_domain:
            canonical = smuggle_url(canonical, {'is_music_url': True})
        return self.url_result(canonical, ie=YoutubeTabIE.ie_key(), video_id=playlist_id)
4423
4424
class YoutubeYtBeIE(InfoExtractor):
    IE_DESC = 'youtu.be'
    _VALID_URL = r'https?://youtu\.be/(?P<id>[0-9A-Za-z_-]{11})/*?.*?\blist=(?P<playlist_id>%(playlist_id)s)' % {'playlist_id': YoutubeBaseInfoExtractor._PLAYLIST_ID_RE}
    _TESTS = [{
        'url': 'https://youtu.be/yeWKywCrFtk?list=PL2qgrgXsNUG5ig9cat4ohreBjYLAPC0J5',
        'info_dict': {
            'id': 'yeWKywCrFtk',
            'ext': 'mp4',
            'title': 'Small Scale Baler and Braiding Rugs',
            'uploader': 'Backus-Page House Museum',
            'uploader_id': 'backuspagemuseum',
            'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/backuspagemuseum',
            'upload_date': '20161008',
            'description': 'md5:800c0c78d5eb128500bffd4f0b4f2e8a',
            'categories': ['Nonprofits & Activism'],
            'tags': list,
            'like_count': int,
            'dislike_count': int,
        },
        'params': {
            'noplaylist': True,
            'skip_download': True,
        },
    }, {
        'url': 'https://youtu.be/uWyaPkt-VOI?list=PL9D9FC436B881BA21',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        # Rewrite the youtu.be short link as a full watch URL for the tab extractor
        mobj = self._match_valid_url(url)
        video_id, playlist_id = mobj.group('id', 'playlist_id')
        watch_url = update_url_query('https://www.youtube.com/watch', {
            'v': video_id,
            'list': playlist_id,
            'feature': 'youtu.be',
        })
        return self.url_result(watch_url, ie=YoutubeTabIE.ie_key(), video_id=playlist_id)
4463
4464
class YoutubeYtUserIE(InfoExtractor):
    IE_DESC = 'YouTube.com user videos, URL or "ytuser" keyword'
    _VALID_URL = r'ytuser:(?P<id>.+)'
    _TESTS = [{
        'url': 'ytuser:phihag',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        # Map the 'ytuser:NAME' shorthand onto the canonical /user/ page
        user = self._match_id(url)
        return self.url_result(
            'https://www.youtube.com/user/%s' % user,
            ie=YoutubeTabIE.ie_key(), video_id=user)
4478
4479
class YoutubeFavouritesIE(YoutubeBaseInfoExtractor):
    IE_NAME = 'youtube:favorites'
    IE_DESC = 'YouTube.com liked videos, ":ytfav" for short (requires authentication)'
    _VALID_URL = r':ytfav(?:ou?rite)?s?'
    _LOGIN_REQUIRED = True
    _TESTS = [{
        'url': ':ytfav',
        'only_matching': True,
    }, {
        'url': ':ytfavorites',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        # 'LL' is the liked-videos list (see IE_DESC); delegate to the tab extractor
        return self.url_result(
            'https://www.youtube.com/playlist?list=LL',
            ie=YoutubeTabIE.ie_key())
4497
4498
class YoutubeSearchIE(SearchInfoExtractor, YoutubeTabIE):
    IE_DESC = 'YouTube.com searches, "ytsearch" keyword'
    # there doesn't appear to be a real limit, for example if you search for
    # 'python' you get more than 8.000.000 results
    _MAX_RESULTS = float('inf')
    IE_NAME = 'youtube:search'
    _SEARCH_KEY = 'ytsearch'
    # Extra innertube search params; overridden by subclasses (e.g. ytsearchdate)
    _SEARCH_PARAMS = None
    _TESTS = []

    def _entries(self, query, n):
        """Yield up to *n* video results for *query*, paging via continuations."""
        data = {'query': query}
        if self._SEARCH_PARAMS:
            data['params'] = self._SEARCH_PARAMS
        total = 0
        continuation = {}
        for page_num in itertools.count(1):
            data.update(continuation)
            search = self._extract_response(
                item_id='query "%s" page %s' % (query, page_num), ep='search', query=data,
                check_get_keys=('contents', 'onResponseReceivedCommands')
            )
            if not search:
                break
            # First page and continuation pages nest the result list differently
            slr_contents = try_get(
                search,
                (lambda x: x['contents']['twoColumnSearchResultsRenderer']['primaryContents']['sectionListRenderer']['contents'],
                 lambda x: x['onResponseReceivedCommands'][0]['appendContinuationItemsAction']['continuationItems']),
                list)
            if not slr_contents:
                break

            # Youtube sometimes adds promoted content to searches,
            # changing the index location of videos and token.
            # So we search through all entries till we find them.
            continuation = None
            for slr_content in slr_contents:
                if not continuation:
                    continuation = self._extract_continuation({'contents': [slr_content]})

                isr_contents = try_get(
                    slr_content,
                    lambda x: x['itemSectionRenderer']['contents'],
                    list)
                if not isr_contents:
                    continue
                for content in isr_contents:
                    if not isinstance(content, dict):
                        continue
                    # Only plain video results count; promoted/other renderers are skipped
                    video = content.get('videoRenderer')
                    if not isinstance(video, dict):
                        continue
                    video_id = video.get('videoId')
                    if not video_id:
                        continue

                    yield self._extract_video(video)
                    total += 1
                    if total == n:
                        return

            if not continuation:
                # No continuation token anywhere on this page => last page
                break

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""
        return self.playlist_result(self._entries(query, n), query, query)
4566
4567
class YoutubeSearchDateIE(YoutubeSearchIE):
    IE_NAME = YoutubeSearchIE.IE_NAME + ':date'
    _SEARCH_KEY = 'ytsearchdate'
    IE_DESC = 'YouTube.com searches, newest videos first, "ytsearchdate" keyword'
    # URL-encoded search filter; per IE_DESC this sorts results newest-first
    _SEARCH_PARAMS = 'CAI%3D'
4573
4574
class YoutubeSearchURLIE(YoutubeSearchIE):
    IE_DESC = 'YouTube.com search URLs'
    IE_NAME = YoutubeSearchIE.IE_NAME + '_url'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/results\?(.*?&)?(?:search_query|q)=(?:[^&]+)(?:[&]|$)'
    # _MAX_RESULTS = 100
    _TESTS = [{
        'url': 'https://www.youtube.com/results?baz=bar&search_query=youtube-dl+test+video&filters=video&lclk=video',
        'playlist_mincount': 5,
        'info_dict': {
            'id': 'youtube-dl test video',
            'title': 'youtube-dl test video',
        }
    }, {
        'url': 'https://www.youtube.com/results?q=test&sp=EgQIBBgB',
        'only_matching': True,
    }]

    @classmethod
    def _make_valid_url(cls):
        # The URL pattern itself is the match target (no keyword prefix)
        return cls._VALID_URL

    def _real_extract(self, url):
        params = parse_qs(url)
        # 'search_query' takes precedence over its shorter alias 'q'
        search_terms = (params.get('search_query') or params.get('q'))[0]
        self._SEARCH_PARAMS = params.get('sp', ('',))[0]
        return self._get_n_results(search_terms, self._MAX_RESULTS)
4601
4602
class YoutubeFeedsInfoExtractor(YoutubeTabIE):
    """
    Base class for feed extractors
    Subclasses must define the _FEED_NAME property.
    """
    _LOGIN_REQUIRED = True
    _TESTS = []

    @property
    def IE_NAME(self):
        # Derived from the feed, e.g. 'youtube:history'
        return 'youtube:' + self._FEED_NAME

    def _real_extract(self, url):
        feed_url = 'https://www.youtube.com/feed/%s' % self._FEED_NAME
        return self.url_result(feed_url, ie=YoutubeTabIE.ie_key())
4619
4620
class YoutubeWatchLaterIE(InfoExtractor):
    IE_NAME = 'youtube:watchlater'
    IE_DESC = 'Youtube watch later list, ":ytwatchlater" for short (requires authentication)'
    _VALID_URL = r':ytwatchlater'
    _TESTS = [{
        'url': ':ytwatchlater',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        # 'WL' is the watch-later list (see IE_DESC); delegate to the tab extractor
        return self.url_result(
            'https://www.youtube.com/playlist?list=WL', ie=YoutubeTabIE.ie_key())
4633
4634
class YoutubeRecommendedIE(YoutubeFeedsInfoExtractor):
    IE_DESC = 'YouTube.com recommended videos, ":ytrec" for short (requires authentication)'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/?(?:[?#]|$)|:ytrec(?:ommended)?'
    _FEED_NAME = 'recommended'
    _LOGIN_REQUIRED = False  # overrides the base class's login requirement
    _TESTS = [{
        'url': ':ytrec',
        'only_matching': True,
    }, {
        'url': ':ytrecommended',
        'only_matching': True,
    }, {
        'url': 'https://youtube.com',
        'only_matching': True,
    }]
4650
4651
class YoutubeSubscriptionsIE(YoutubeFeedsInfoExtractor):
    IE_DESC = 'YouTube.com subscriptions feed, ":ytsubs" for short (requires authentication)'
    _VALID_URL = r':ytsub(?:scription)?s?'
    _FEED_NAME = 'subscriptions'  # extracted via /feed/subscriptions (see base class)
    _TESTS = [{
        'url': ':ytsubs',
        'only_matching': True,
    }, {
        'url': ':ytsubscriptions',
        'only_matching': True,
    }]
4663
4664
class YoutubeHistoryIE(YoutubeFeedsInfoExtractor):
    IE_DESC = 'Youtube watch history, ":ythis" for short (requires authentication)'
    _VALID_URL = r':ythis(?:tory)?'
    _FEED_NAME = 'history'  # extracted via /feed/history (see base class)
    _TESTS = [{
        'url': ':ythistory',
        'only_matching': True,
    }]
4673
4674
class YoutubeTruncatedURLIE(InfoExtractor):
    """Catch watch URLs whose 'v' parameter was lost (typically an unquoted '&' in the shell)."""
    IE_NAME = 'youtube:truncated_url'
    IE_DESC = False  # Do not list
    _VALID_URL = r'''(?x)
        (?:https?://)?
        (?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie)?\.com/
        (?:watch\?(?:
            feature=[a-z_]+|
            annotation_id=annotation_[^&]+|
            x-yt-cl=[0-9]+|
            hl=[^&]*|
            t=[0-9]+
        )?
        |
            attribution_link\?a=[^&]+
        )
        $
    '''

    _TESTS = [{
        'url': 'https://www.youtube.com/watch?annotation_id=annotation_3951667041',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/watch?',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/watch?x-yt-cl=84503534',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/watch?feature=foo',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/watch?hl=en-GB',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/watch?t=2372',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        # Always an error: there is nothing to extract from a video-less watch URL
        raise ExtractorError(
            'Did you forget to quote the URL? Remember that & is a meta '
            'character in most shells, so you want to put the URL in quotes, '
            'like youtube-dl '
            '"https://www.youtube.com/watch?feature=foo&v=BaW_jenozKc" '
            ' or simply youtube-dl BaW_jenozKc .',
            expected=True)
4722
4723
class YoutubeTruncatedIDIE(InfoExtractor):
    """Catch watch URLs whose video ID is shorter than the 11 characters a full ID has."""
    IE_NAME = 'youtube:truncated_id'
    IE_DESC = False  # Do not list
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/watch\?v=(?P<id>[0-9A-Za-z_-]{1,10})$'

    _TESTS = [{
        'url': 'https://www.youtube.com/watch?v=N_708QY7Ob',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        # Always an error: a truncated ID can never resolve to a video
        video_id = self._match_id(url)
        raise ExtractorError(
            'Incomplete YouTube ID %s. URL %s looks truncated.' % (video_id, url),
            expected=True)