]> jfr.im git - yt-dlp.git/blame_incremental - yt_dlp/extractor/youtube.py
[youtube] Add `thirdParty` to agegate clients (#577)
[yt-dlp.git] / yt_dlp / extractor / youtube.py
... / ...
CommitLineData
1# coding: utf-8
2
3from __future__ import unicode_literals
4
5import base64
6import calendar
7import copy
8import datetime
9import hashlib
10import itertools
11import json
12import os.path
13import random
14import re
15import time
16import traceback
17
18from .common import InfoExtractor, SearchInfoExtractor
19from ..compat import (
20 compat_chr,
21 compat_HTTPError,
22 compat_parse_qs,
23 compat_str,
24 compat_urllib_parse_unquote_plus,
25 compat_urllib_parse_urlencode,
26 compat_urllib_parse_urlparse,
27 compat_urlparse,
28)
29from ..jsinterp import JSInterpreter
30from ..utils import (
31 bytes_to_intlist,
32 clean_html,
33 datetime_from_str,
34 dict_get,
35 error_to_compat_str,
36 ExtractorError,
37 float_or_none,
38 format_field,
39 int_or_none,
40 intlist_to_bytes,
41 mimetype2ext,
42 network_exceptions,
43 orderedSet,
44 parse_codecs,
45 parse_count,
46 parse_duration,
47 parse_iso8601,
48 qualities,
49 remove_start,
50 smuggle_url,
51 str_or_none,
52 str_to_int,
53 traverse_obj,
54 try_get,
55 unescapeHTML,
56 unified_strdate,
57 unsmuggle_url,
58 update_url_query,
59 url_or_none,
60 urlencode_postdata,
61 urljoin,
62 variadic,
63)
64
65
def parse_qs(url):
    """Return the query string of *url* parsed into a dict of value lists."""
    query_string = compat_urlparse.urlparse(url).query
    return compat_urlparse.parse_qs(query_string)
68
69
# any clients starting with _ cannot be explicitly requested by the user
# Base InnerTube client configurations. build_innertube_clients() (below) fills
# in defaults (API key, host, 'hl') and derives '_agegate' variants, so entries
# here only need the values that differ per client.
INNERTUBE_CLIENTS = {
    'web': {
        'INNERTUBE_API_KEY': 'AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8',
        'INNERTUBE_CONTEXT': {
            'client': {
                'clientName': 'WEB',
                'clientVersion': '2.20210622.10.00',
            }
        },
        'INNERTUBE_CONTEXT_CLIENT_NAME': 1
    },
    'web_embedded': {
        'INNERTUBE_API_KEY': 'AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8',
        'INNERTUBE_CONTEXT': {
            'client': {
                'clientName': 'WEB_EMBEDDED_PLAYER',
                'clientVersion': '1.20210620.0.1',
            },
        },
        'INNERTUBE_CONTEXT_CLIENT_NAME': 56
    },
    'web_music': {
        'INNERTUBE_API_KEY': 'AIzaSyC9XL3ZjWddXya6X74dJoCTL-WEYFDNX30',
        'INNERTUBE_HOST': 'music.youtube.com',
        'INNERTUBE_CONTEXT': {
            'client': {
                'clientName': 'WEB_REMIX',
                'clientVersion': '1.20210621.00.00',
            }
        },
        'INNERTUBE_CONTEXT_CLIENT_NAME': 67,
    },
    'android': {
        'INNERTUBE_API_KEY': 'AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8',
        'INNERTUBE_CONTEXT': {
            'client': {
                'clientName': 'ANDROID',
                'clientVersion': '16.20',
            }
        },
        'INNERTUBE_CONTEXT_CLIENT_NAME': 3,
    },
    'android_embedded': {
        'INNERTUBE_API_KEY': 'AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8',
        'INNERTUBE_CONTEXT': {
            'client': {
                'clientName': 'ANDROID_EMBEDDED_PLAYER',
                'clientVersion': '16.20',
            },
        },
        'INNERTUBE_CONTEXT_CLIENT_NAME': 55
    },
    'android_music': {
        'INNERTUBE_API_KEY': 'AIzaSyC9XL3ZjWddXya6X74dJoCTL-WEYFDNX30',
        'INNERTUBE_HOST': 'music.youtube.com',
        'INNERTUBE_CONTEXT': {
            'client': {
                'clientName': 'ANDROID_MUSIC',
                'clientVersion': '4.32',
            }
        },
        'INNERTUBE_CONTEXT_CLIENT_NAME': 21,
    },
    'ios': {
        'INNERTUBE_API_KEY': 'AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8',
        'INNERTUBE_CONTEXT': {
            'client': {
                'clientName': 'IOS',
                'clientVersion': '16.20',
            }
        },
        'INNERTUBE_CONTEXT_CLIENT_NAME': 5
    },
    'ios_embedded': {
        'INNERTUBE_API_KEY': 'AIzaSyDCU8hByM-4DrUqRUYnGn-3llEO78bcxq8',
        'INNERTUBE_CONTEXT': {
            'client': {
                'clientName': 'IOS_MESSAGES_EXTENSION',
                'clientVersion': '16.20',
            },
        },
        'INNERTUBE_CONTEXT_CLIENT_NAME': 66
    },
    'ios_music': {
        'INNERTUBE_API_KEY': 'AIzaSyDK3iBpDP9nHVTk2qL73FLJICfOC3c51Og',
        'INNERTUBE_HOST': 'music.youtube.com',
        'INNERTUBE_CONTEXT': {
            'client': {
                'clientName': 'IOS_MUSIC',
                'clientVersion': '4.32',
            },
        },
        'INNERTUBE_CONTEXT_CLIENT_NAME': 26
    },
    'mweb': {
        'INNERTUBE_API_KEY': 'AIzaSyDCU8hByM-4DrUqRUYnGn-3llEO78bcxq8',
        'INNERTUBE_CONTEXT': {
            'client': {
                'clientName': 'MWEB',
                'clientVersion': '2.20210721.07.00',
            }
        },
        'INNERTUBE_CONTEXT_CLIENT_NAME': 2
    },
}
176
177
def build_innertube_clients():
    """Post-process INNERTUBE_CLIENTS in place: fill defaults, assign priorities
    and derive the '<client>_agegate' variants for each base client."""
    # Shared by all embedded/agegate variants (deliberately the same dict object).
    embed_third_party = {
        'embedUrl': 'https://google.com',  # Can be any valid URL
    }
    base_clients = ('android', 'web', 'ios', 'mweb')
    # Reversed so that earlier entries in base_clients get the higher priority
    priority = qualities(base_clients[::-1])

    # Snapshot the items since new '<client>_agegate' keys are inserted below
    for client_name, cfg in tuple(INNERTUBE_CLIENTS.items()):
        cfg.setdefault('INNERTUBE_API_KEY', 'AIzaSyDCU8hByM4DrUqRUYnGn3llEO78bcxq8')
        cfg.setdefault('INNERTUBE_HOST', 'www.youtube.com')
        cfg['INNERTUBE_CONTEXT']['client'].setdefault('hl', 'en')
        cfg['priority'] = 10 * priority(client_name.split('_', 1)[0])

        if client_name in base_clients:
            agegate_cfg = copy.deepcopy(cfg)
            INNERTUBE_CLIENTS[f'{client_name}_agegate'] = agegate_cfg
            agegate_cfg['INNERTUBE_CONTEXT']['client']['clientScreen'] = 'EMBED'
            agegate_cfg['INNERTUBE_CONTEXT']['thirdParty'] = embed_third_party
            agegate_cfg['priority'] -= 1
        elif client_name.endswith('_embedded'):
            cfg['INNERTUBE_CONTEXT']['thirdParty'] = embed_third_party
            cfg['priority'] -= 2
        else:
            cfg['priority'] -= 3


build_innertube_clients()
204
205
class YoutubeBaseInfoExtractor(InfoExtractor):
    """Provide base functions for Youtube extractors"""

    # Google account login/challenge endpoints (used only by the dead code in _login)
    _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
    _TWOFACTOR_URL = 'https://accounts.google.com/signin/challenge'

    _LOOKUP_URL = 'https://accounts.google.com/_/signin/sl/lookup'
    _CHALLENGE_URL = 'https://accounts.google.com/_/signin/sl/challenge'
    _TFA_URL = 'https://accounts.google.com/_/signin/challenge?hl=en&TL={0}'

    # Path components that can never be a channel/user name
    _RESERVED_NAMES = (
        r'channel|c|user|browse|playlist|watch|w|v|embed|e|watch_popup|shorts|'
        r'movies|results|shared|hashtag|trending|feed|feeds|oembed|get_video_info|'
        r'storefront|oops|index|account|reporthistory|t/terms|about|upload|signin|logout')

    _NETRC_MACHINE = 'youtube'
    # If True it will raise an error if no login info is provided
    _LOGIN_REQUIRED = False

    _PLAYLIST_ID_RE = r'(?:(?:PL|LL|EC|UU|FL|RD|UL|TL|PU|OLAK5uy_)[0-9A-Za-z-_]{10,}|RDMM|WL|LL|LM)'

    def _login(self):
        """
        Attempt to log in to YouTube.
        True is returned if successful or skipped.
        False is returned if login failed.

        If _LOGIN_REQUIRED is set and no authentication was provided, an error is raised.
        """

        def warn(message):
            self.report_warning(message)

        # username+password login is broken
        if (self._LOGIN_REQUIRED
                and self.get_param('cookiefile') is None
                and self.get_param('cookiesfrombrowser') is None):
            self.raise_login_required(
                'Login details are needed to download this content', method='cookies')
        username, password = self._get_login_info()
        if username:
            warn('Logging in using username and password is broken. %s' % self._LOGIN_HINTS['cookies'])
        # Unconditional return: the remainder of this method is intentionally dead
        return

        # Everything below this is broken!
        r'''
        # No authentication to be performed
        if username is None:
            if self._LOGIN_REQUIRED and self.get_param('cookiefile') is None:
                raise ExtractorError('No login info available, needed for using %s.' % self.IE_NAME, expected=True)
            # if self.get_param('cookiefile'):  # TODO remove 'and False' later - too many people using outdated cookies and open issues, remind them.
            #     self.to_screen('[Cookies] Reminder - Make sure to always use up to date cookies!')
            return True

        login_page = self._download_webpage(
            self._LOGIN_URL, None,
            note='Downloading login page',
            errnote='unable to fetch login page', fatal=False)
        if login_page is False:
            return

        login_form = self._hidden_inputs(login_page)

        def req(url, f_req, note, errnote):
            data = login_form.copy()
            data.update({
                'pstMsg': 1,
                'checkConnection': 'youtube',
                'checkedDomains': 'youtube',
                'hl': 'en',
                'deviceinfo': '[null,null,null,[],null,"US",null,null,[],"GlifWebSignIn",null,[null,null,[]]]',
                'f.req': json.dumps(f_req),
                'flowName': 'GlifWebSignIn',
                'flowEntry': 'ServiceLogin',
                # TODO: reverse actual botguard identifier generation algo
                'bgRequest': '["identifier",""]',
            })
            return self._download_json(
                url, None, note=note, errnote=errnote,
                transform_source=lambda s: re.sub(r'^[^[]*', '', s),
                fatal=False,
                data=urlencode_postdata(data), headers={
                    'Content-Type': 'application/x-www-form-urlencoded;charset=utf-8',
                    'Google-Accounts-XSRF': 1,
                })

        lookup_req = [
            username,
            None, [], None, 'US', None, None, 2, False, True,
            [
                None, None,
                [2, 1, None, 1,
                 'https://accounts.google.com/ServiceLogin?passive=true&continue=https%3A%2F%2Fwww.youtube.com%2Fsignin%3Fnext%3D%252F%26action_handle_signin%3Dtrue%26hl%3Den%26app%3Ddesktop%26feature%3Dsign_in_button&hl=en&service=youtube&uilel=3&requestPath=%2FServiceLogin&Page=PasswordSeparationSignIn',
                 None, [], 4],
                1, [None, None, []], None, None, None, True
            ],
            username,
        ]

        lookup_results = req(
            self._LOOKUP_URL, lookup_req,
            'Looking up account info', 'Unable to look up account info')

        if lookup_results is False:
            return False

        user_hash = try_get(lookup_results, lambda x: x[0][2], compat_str)
        if not user_hash:
            warn('Unable to extract user hash')
            return False

        challenge_req = [
            user_hash,
            None, 1, None, [1, None, None, None, [password, None, True]],
            [
                None, None, [2, 1, None, 1, 'https://accounts.google.com/ServiceLogin?passive=true&continue=https%3A%2F%2Fwww.youtube.com%2Fsignin%3Fnext%3D%252F%26action_handle_signin%3Dtrue%26hl%3Den%26app%3Ddesktop%26feature%3Dsign_in_button&hl=en&service=youtube&uilel=3&requestPath=%2FServiceLogin&Page=PasswordSeparationSignIn', None, [], 4],
                1, [None, None, []], None, None, None, True
            ]]

        challenge_results = req(
            self._CHALLENGE_URL, challenge_req,
            'Logging in', 'Unable to log in')

        if challenge_results is False:
            return

        login_res = try_get(challenge_results, lambda x: x[0][5], list)
        if login_res:
            login_msg = try_get(login_res, lambda x: x[5], compat_str)
            warn(
                'Unable to login: %s' % 'Invalid password'
                if login_msg == 'INCORRECT_ANSWER_ENTERED' else login_msg)
            return False

        res = try_get(challenge_results, lambda x: x[0][-1], list)
        if not res:
            warn('Unable to extract result entry')
            return False

        login_challenge = try_get(res, lambda x: x[0][0], list)
        if login_challenge:
            challenge_str = try_get(login_challenge, lambda x: x[2], compat_str)
            if challenge_str == 'TWO_STEP_VERIFICATION':
                # SEND_SUCCESS - TFA code has been successfully sent to phone
                # QUOTA_EXCEEDED - reached the limit of TFA codes
                status = try_get(login_challenge, lambda x: x[5], compat_str)
                if status == 'QUOTA_EXCEEDED':
                    warn('Exceeded the limit of TFA codes, try later')
                    return False

                tl = try_get(challenge_results, lambda x: x[1][2], compat_str)
                if not tl:
                    warn('Unable to extract TL')
                    return False

                tfa_code = self._get_tfa_info('2-step verification code')

                if not tfa_code:
                    warn(
                        'Two-factor authentication required. Provide it either interactively or with --twofactor <code>'
                        '(Note that only TOTP (Google Authenticator App) codes work at this time.)')
                    return False

                tfa_code = remove_start(tfa_code, 'G-')

                tfa_req = [
                    user_hash, None, 2, None,
                    [
                        9, None, None, None, None, None, None, None,
                        [None, tfa_code, True, 2]
                    ]]

                tfa_results = req(
                    self._TFA_URL.format(tl), tfa_req,
                    'Submitting TFA code', 'Unable to submit TFA code')

                if tfa_results is False:
                    return False

                tfa_res = try_get(tfa_results, lambda x: x[0][5], list)
                if tfa_res:
                    tfa_msg = try_get(tfa_res, lambda x: x[5], compat_str)
                    warn(
                        'Unable to finish TFA: %s' % 'Invalid TFA code'
                        if tfa_msg == 'INCORRECT_ANSWER_ENTERED' else tfa_msg)
                    return False

                check_cookie_url = try_get(
                    tfa_results, lambda x: x[0][-1][2], compat_str)
            else:
                CHALLENGES = {
                    'LOGIN_CHALLENGE': "This device isn't recognized. For your security, Google wants to make sure it's really you.",
                    'USERNAME_RECOVERY': 'Please provide additional information to aid in the recovery process.',
                    'REAUTH': "There is something unusual about your activity. For your security, Google wants to make sure it's really you.",
                }
                challenge = CHALLENGES.get(
                    challenge_str,
                    '%s returned error %s.' % (self.IE_NAME, challenge_str))
                warn('%s\nGo to https://accounts.google.com/, login and solve a challenge.' % challenge)
                return False
        else:
            check_cookie_url = try_get(res, lambda x: x[2], compat_str)

        if not check_cookie_url:
            warn('Unable to extract CheckCookie URL')
            return False

        check_cookie_results = self._download_webpage(
            check_cookie_url, None, 'Checking cookie', fatal=False)

        if check_cookie_results is False:
            return False

        if 'https://myaccount.google.com/' not in check_cookie_results:
            warn('Unable to log in')
            return False

        return True
        '''

    def _initialize_consent(self):
        """Set a CONSENT=YES cookie so YouTube does not redirect to the consent page.

        Skipped when the user is logged in (__Secure-3PSID) or consent was already given.
        """
        cookies = self._get_cookies('https://www.youtube.com/')
        if cookies.get('__Secure-3PSID'):
            return
        consent_id = None
        consent = cookies.get('CONSENT')
        if consent:
            if 'YES' in consent.value:
                return
            # Reuse the id from a PENDING consent cookie if present
            consent_id = self._search_regex(
                r'PENDING\+(\d+)', consent.value, 'consent', default=None)
        if not consent_id:
            consent_id = random.randint(100, 999)
        self._set_cookie('.youtube.com', 'CONSENT', 'YES+cb.20210328-17-p0.en+FX+%s' % consent_id)

    def _real_initialize(self):
        # Consent must be set up even when no downloader is attached
        self._initialize_consent()
        if self._downloader is None:
            return
        if not self._login():
            return

    # Regexes for locating the JSON blobs embedded in watch/browse pages
    _YT_INITIAL_DATA_RE = r'(?:window\s*\[\s*["\']ytInitialData["\']\s*\]|ytInitialData)\s*=\s*({.+?})\s*;'
    _YT_INITIAL_PLAYER_RESPONSE_RE = r'ytInitialPlayerResponse\s*=\s*({.+?})\s*;'
    _YT_INITIAL_BOUNDARY_RE = r'(?:var\s+meta|</script|\n)'

    def _get_default_ytcfg(self, client='web'):
        """Return a deep copy of the built-in config for *client* (safe to mutate)."""
        return copy.deepcopy(INNERTUBE_CLIENTS[client])

    def _get_innertube_host(self, client='web'):
        """Return the InnerTube API hostname configured for *client*."""
        return INNERTUBE_CLIENTS[client]['INNERTUBE_HOST']

    def _ytcfg_get_safe(self, ytcfg, getter, expected_type=None, default_client='web'):
        # try_get but with fallback to default ytcfg client values when present
        _func = lambda y: try_get(y, getter, expected_type)
        return _func(ytcfg) or _func(self._get_default_ytcfg(default_client))

    def _extract_client_name(self, ytcfg, default_client='web'):
        """Client name from a page ytcfg, falling back to the built-in config."""
        return (
            try_get(ytcfg, lambda x: x['INNERTUBE_CLIENT_NAME'], compat_str)
            or self._ytcfg_get_safe(
                ytcfg, lambda x: x['INNERTUBE_CONTEXT']['client']['clientName'], compat_str, default_client))

    @staticmethod
    def _extract_session_index(*data):
        """Return the first SESSION_INDEX found in the given ytcfg dicts, else None."""
        for ytcfg in data:
            session_index = int_or_none(try_get(ytcfg, lambda x: x['SESSION_INDEX']))
            if session_index is not None:
                return session_index

    def _extract_client_version(self, ytcfg, default_client='web'):
        """Client version from a page ytcfg, falling back to the built-in config."""
        return (
            try_get(ytcfg, lambda x: x['INNERTUBE_CLIENT_VERSION'], compat_str)
            or self._ytcfg_get_safe(
                ytcfg, lambda x: x['INNERTUBE_CONTEXT']['client']['clientVersion'], compat_str, default_client))

    def _extract_api_key(self, ytcfg=None, default_client='web'):
        """API key from a page ytcfg, falling back to the built-in config."""
        return self._ytcfg_get_safe(ytcfg, lambda x: x['INNERTUBE_API_KEY'], compat_str, default_client)

    def _extract_context(self, ytcfg=None, default_client='web'):
        """Build the InnerTube 'context' object, merging page ytcfg data over defaults."""
        _get_context = lambda y: try_get(y, lambda x: x['INNERTUBE_CONTEXT'], dict)
        context = _get_context(ytcfg)
        if context:
            return context

        context = _get_context(self._get_default_ytcfg(default_client))
        if not ytcfg:
            return context

        # Recreate the client context (required)
        context['client'].update({
            'clientVersion': self._extract_client_version(ytcfg, default_client),
            'clientName': self._extract_client_name(ytcfg, default_client),
        })
        visitor_data = try_get(ytcfg, lambda x: x['VISITOR_DATA'], compat_str)
        if visitor_data:
            context['client']['visitorData'] = visitor_data
        return context

    def _generate_sapisidhash_header(self, origin='https://www.youtube.com'):
        """Return a 'SAPISIDHASH <ts>_<sha1>' Authorization value, or None without cookies."""
        # Sometimes SAPISID cookie isn't present but __Secure-3PAPISID is.
        # See: https://github.com/yt-dlp/yt-dlp/issues/393
        yt_cookies = self._get_cookies('https://www.youtube.com')
        sapisid_cookie = dict_get(
            yt_cookies, ('__Secure-3PAPISID', 'SAPISID'))
        if sapisid_cookie is None or not sapisid_cookie.value:
            return
        time_now = round(time.time())
        # SAPISID cookie is required if not already present
        if not yt_cookies.get('SAPISID'):
            self.write_debug('Copying __Secure-3PAPISID cookie to SAPISID cookie', only_once=True)
            self._set_cookie(
                '.youtube.com', 'SAPISID', sapisid_cookie.value, secure=True, expire_time=time_now + 3600)
        self.write_debug('Extracted SAPISID cookie', only_once=True)
        # SAPISIDHASH algorithm from https://stackoverflow.com/a/32065323
        sapisidhash = hashlib.sha1(
            f'{time_now} {sapisid_cookie.value} {origin}'.encode('utf-8')).hexdigest()
        return f'SAPISIDHASH {time_now}_{sapisidhash}'

    def _call_api(self, ep, query, video_id, fatal=True, headers=None,
                  note='Downloading API JSON', errnote='Unable to download API page',
                  context=None, api_key=None, api_hostname=None, default_client='web'):
        """POST *query* to the InnerTube endpoint *ep* and return the parsed JSON."""

        data = {'context': context} if context else {'context': self._extract_context(default_client=default_client)}
        data.update(query)
        real_headers = self.generate_api_headers(default_client=default_client)
        real_headers.update({'content-type': 'application/json'})
        if headers:
            real_headers.update(headers)
        return self._download_json(
            'https://%s/youtubei/v1/%s' % (api_hostname or self._get_innertube_host(default_client), ep),
            video_id=video_id, fatal=fatal, note=note, errnote=errnote,
            data=json.dumps(data).encode('utf8'), headers=real_headers,
            query={'key': api_key or self._extract_api_key()})

    def extract_yt_initial_data(self, video_id, webpage):
        """Extract and parse the ytInitialData JSON embedded in *webpage*."""
        return self._parse_json(
            self._search_regex(
                (r'%s\s*%s' % (self._YT_INITIAL_DATA_RE, self._YT_INITIAL_BOUNDARY_RE),
                 self._YT_INITIAL_DATA_RE), webpage, 'yt initial data'),
            video_id)

    def _extract_identity_token(self, webpage, item_id):
        """Extract the ID_TOKEN (identity token) from ytcfg or raw page text, if any."""
        if not webpage:
            return None
        ytcfg = self.extract_ytcfg(item_id, webpage)
        if ytcfg:
            token = try_get(ytcfg, lambda x: x['ID_TOKEN'], compat_str)
            if token:
                return token
        # Fall back to scraping the token straight from the page source
        return self._search_regex(
            r'\bID_TOKEN["\']\s*:\s*["\'](.+?)["\']', webpage,
            'identity token', default=None)

    @staticmethod
    def _extract_account_syncid(*args):
        """
        Extract syncId required to download private playlists of secondary channels
        @params response and/or ytcfg
        """
        for data in args:
            # ytcfg includes channel_syncid if on secondary channel
            delegated_sid = try_get(data, lambda x: x['DELEGATED_SESSION_ID'], compat_str)
            if delegated_sid:
                return delegated_sid
            sync_ids = (try_get(
                data, (lambda x: x['responseContext']['mainAppWebResponseContext']['datasyncId'],
                       lambda x: x['DATASYNC_ID']), compat_str) or '').split("||")
            if len(sync_ids) >= 2 and sync_ids[1]:
                # datasyncid is of the form "channel_syncid||user_syncid" for secondary channel
                # and just "user_syncid||" for primary channel. We only want the channel_syncid
                return sync_ids[0]

    def extract_ytcfg(self, video_id, webpage):
        """Parse the ytcfg.set({...}) config object from *webpage*; {} when absent."""
        if not webpage:
            return {}
        return self._parse_json(
            self._search_regex(
                r'ytcfg\.set\s*\(\s*({.+?})\s*\)\s*;', webpage, 'ytcfg',
                default='{}'), video_id, fatal=False) or {}

    def generate_api_headers(
            self, ytcfg=None, identity_token=None, account_syncid=None,
            visitor_data=None, api_hostname=None, default_client='web', session_index=None):
        """Build the HTTP headers (client name/version, auth, visitor id) for API calls."""
        origin = 'https://' + (api_hostname if api_hostname else self._get_innertube_host(default_client))
        headers = {
            'X-YouTube-Client-Name': compat_str(
                self._ytcfg_get_safe(ytcfg, lambda x: x['INNERTUBE_CONTEXT_CLIENT_NAME'], default_client=default_client)),
            'X-YouTube-Client-Version': self._extract_client_version(ytcfg, default_client),
            'Origin': origin
        }
        if not visitor_data and ytcfg:
            visitor_data = try_get(
                self._extract_context(ytcfg, default_client), lambda x: x['client']['visitorData'], compat_str)
        if identity_token:
            headers['X-Youtube-Identity-Token'] = identity_token
        if account_syncid:
            headers['X-Goog-PageId'] = account_syncid
        if session_index is None and ytcfg:
            session_index = self._extract_session_index(ytcfg)
        if account_syncid or session_index is not None:
            headers['X-Goog-AuthUser'] = session_index if session_index is not None else 0
        if visitor_data:
            headers['X-Goog-Visitor-Id'] = visitor_data
        auth = self._generate_sapisidhash_header(origin)
        if auth is not None:
            headers['Authorization'] = auth
            headers['X-Origin'] = origin
        return headers

    @staticmethod
    def _build_api_continuation_query(continuation, ctp=None):
        """Build the query dict for a continuation request."""
        query = {
            'continuation': continuation
        }
        # TODO: Inconsistency with clickTrackingParams.
        # Currently we have a fixed ctp contained within context (from ytcfg)
        # and a ctp in root query for continuation.
        if ctp:
            query['clickTracking'] = {'clickTrackingParams': ctp}
        return query

    @classmethod
    def _extract_next_continuation_data(cls, renderer):
        """Extract a continuation query from the legacy 'continuations' renderer format."""
        next_continuation = try_get(
            renderer, (lambda x: x['continuations'][0]['nextContinuationData'],
                       lambda x: x['continuation']['reloadContinuationData']), dict)
        if not next_continuation:
            return
        continuation = next_continuation.get('continuation')
        if not continuation:
            return
        ctp = next_continuation.get('clickTrackingParams')
        return cls._build_api_continuation_query(continuation, ctp)

    @classmethod
    def _extract_continuation_ep_data(cls, continuation_ep: dict):
        """Extract a continuation query from a continuation endpoint dict."""
        if isinstance(continuation_ep, dict):
            continuation = try_get(
                continuation_ep, lambda x: x['continuationCommand']['token'], compat_str)
            if not continuation:
                return
            ctp = continuation_ep.get('clickTrackingParams')
            return cls._build_api_continuation_query(continuation, ctp)

    @classmethod
    def _extract_continuation(cls, renderer):
        """Find a continuation query in *renderer*, trying the legacy format first."""
        next_continuation = cls._extract_next_continuation_data(renderer)
        if next_continuation:
            return next_continuation

        contents = []
        for key in ('contents', 'items'):
            contents.extend(try_get(renderer, lambda x: x[key], list) or [])

        for content in contents:
            if not isinstance(content, dict):
                continue
            continuation_ep = try_get(
                content, (lambda x: x['continuationItemRenderer']['continuationEndpoint'],
                          lambda x: x['continuationItemRenderer']['button']['buttonRenderer']['command']),
                dict)
            continuation = cls._extract_continuation_ep_data(continuation_ep)
            if continuation:
                return continuation

    @classmethod
    def _extract_alerts(cls, data):
        """Yield (alert_type, message) pairs from a response's 'alerts' list."""
        for alert_dict in try_get(data, lambda x: x['alerts'], list) or []:
            if not isinstance(alert_dict, dict):
                continue
            for alert in alert_dict.values():
                alert_type = alert.get('type')
                if not alert_type:
                    continue
                message = cls._get_text(alert, 'text')
                if message:
                    yield alert_type, message

    def _report_alerts(self, alerts, expected=True):
        """Warn on non-error alerts; raise ExtractorError for the last error alert."""
        errors = []
        warnings = []
        for alert_type, alert_message in alerts:
            if alert_type.lower() == 'error':
                errors.append([alert_type, alert_message])
            else:
                warnings.append([alert_type, alert_message])

        # All but the last error are reported as warnings; only the last one raises
        for alert_type, alert_message in (warnings + errors[:-1]):
            self.report_warning('YouTube said: %s - %s' % (alert_type, alert_message))
        if errors:
            raise ExtractorError('YouTube said: %s' % errors[-1][1], expected=expected)

    def _extract_and_report_alerts(self, data, *args, **kwargs):
        """Convenience wrapper: extract alerts from *data* and report them."""
        return self._report_alerts(self._extract_alerts(data), *args, **kwargs)

    def _extract_badges(self, renderer: dict):
        """Return the set of lower-cased badge labels attached to *renderer*."""
        badges = set()
        for badge in try_get(renderer, lambda x: x['badges'], list) or []:
            label = try_get(badge, lambda x: x['metadataBadgeRenderer']['label'], compat_str)
            if label:
                badges.add(label.lower())
        return badges

    @staticmethod
    def _get_text(data, *path_list, max_runs=None):
        """Extract text from YouTube's text objects ('simpleText' or 'runs').

        Each entry of *path_list* is tried in order via traverse_obj; with no
        paths, *data* itself is treated as the text object. *max_runs* limits
        how many 'runs' are joined.
        """
        for path in path_list or [None]:
            if path is None:
                obj = [data]
            else:
                obj = traverse_obj(data, path, default=[])
                # Wrap a single result so the loop below is uniform
                if not any(key is ... or isinstance(key, (list, tuple)) for key in variadic(path)):
                    obj = [obj]
            for item in obj:
                text = try_get(item, lambda x: x['simpleText'], compat_str)
                if text:
                    return text
                runs = try_get(item, lambda x: x['runs'], list) or []
                if not runs and isinstance(item, list):
                    runs = item

                runs = runs[:min(len(runs), max_runs or len(runs))]
                text = ''.join(traverse_obj(runs, (..., 'text'), expected_type=str, default=[]))
                if text:
                    return text

    def _extract_response(self, item_id, query, note='Downloading API JSON', headers=None,
                          ytcfg=None, check_get_keys=None, ep='browse', fatal=True, api_hostname=None,
                          default_client='web'):
        """Call the InnerTube API with retries.

        Retries on network errors (except HTTP 403/429) and on responses
        missing all of *check_get_keys*. Alerts in the response are reported;
        errors either raise (fatal=True) or warn and return None.
        """
        response = None
        last_error = None
        count = -1
        retries = self.get_param('extractor_retries', 3)
        if check_get_keys is None:
            check_get_keys = []
        while count < retries:
            count += 1
            if last_error:
                self.report_warning('%s. Retrying ...' % last_error)
            try:
                response = self._call_api(
                    ep=ep, fatal=True, headers=headers,
                    video_id=item_id, query=query,
                    context=self._extract_context(ytcfg, default_client),
                    api_key=self._extract_api_key(ytcfg, default_client),
                    api_hostname=api_hostname, default_client=default_client,
                    note='%s%s' % (note, ' (retry #%d)' % count if count else ''))
            except ExtractorError as e:
                if isinstance(e.cause, network_exceptions):
                    # Downloading page may result in intermittent 5xx HTTP error
                    # Sometimes a 404 is also received. See: https://github.com/ytdl-org/youtube-dl/issues/28289
                    # We also want to catch all other network exceptions since errors in later pages can be troublesome
                    # See https://github.com/yt-dlp/yt-dlp/issues/507#issuecomment-880188210
                    if not isinstance(e.cause, compat_HTTPError) or e.cause.code not in (403, 429):
                        last_error = error_to_compat_str(e.cause or e)
                        if count < retries:
                            continue
                if fatal:
                    raise
                else:
                    self.report_warning(error_to_compat_str(e))
                    return

            else:
                # Youtube may send alerts if there was an issue with the continuation page
                try:
                    self._extract_and_report_alerts(response, expected=False)
                except ExtractorError as e:
                    if fatal:
                        raise
                    self.report_warning(error_to_compat_str(e))
                    return
                if not check_get_keys or dict_get(response, check_get_keys):
                    break
                # Youtube sometimes sends incomplete data
                # See: https://github.com/ytdl-org/youtube-dl/issues/28194
                last_error = 'Incomplete data received'
                if count >= retries:
                    if fatal:
                        raise ExtractorError(last_error)
                    else:
                        self.report_warning(last_error)
                        return
        return response

    @staticmethod
    def is_music_url(url):
        """Return True when *url* points at music.youtube.com."""
        return re.match(r'https?://music\.youtube\.com/', url) is not None

    def _extract_video(self, renderer):
        """Build a url-type info dict for a video renderer (search/playlist entry)."""
        video_id = renderer.get('videoId')
        title = self._get_text(renderer, 'title')
        description = self._get_text(renderer, 'descriptionSnippet')
        duration = parse_duration(self._get_text(
            renderer, 'lengthText', ('thumbnailOverlays', ..., 'thumbnailOverlayTimeStatusRenderer', 'text')))
        view_count_text = self._get_text(renderer, 'viewCountText') or ''
        view_count = str_to_int(self._search_regex(
            r'^([\d,]+)', re.sub(r'\s', '', view_count_text),
            'view count', default=None))

        uploader = self._get_text(renderer, 'ownerText', 'shortBylineText')

        return {
            '_type': 'url',
            'ie_key': YoutubeIE.ie_key(),
            'id': video_id,
            'url': video_id,
            'title': title,
            'description': description,
            'duration': duration,
            'view_count': view_count,
            'uploader': uploader,
        }
818
819
820class YoutubeIE(YoutubeBaseInfoExtractor):
821 IE_DESC = 'YouTube.com'
822 _INVIDIOUS_SITES = (
823 # invidious-redirect websites
824 r'(?:www\.)?redirect\.invidious\.io',
825 r'(?:(?:www|dev)\.)?invidio\.us',
826 # Invidious instances taken from https://github.com/iv-org/documentation/blob/master/Invidious-Instances.md
827 r'(?:www\.)?invidious\.pussthecat\.org',
828 r'(?:www\.)?invidious\.zee\.li',
829 r'(?:www\.)?invidious\.ethibox\.fr',
830 r'(?:www\.)?invidious\.3o7z6yfxhbw7n3za4rss6l434kmv55cgw2vuziwuigpwegswvwzqipyd\.onion',
831 # youtube-dl invidious instances list
832 r'(?:(?:www|no)\.)?invidiou\.sh',
833 r'(?:(?:www|fi)\.)?invidious\.snopyta\.org',
834 r'(?:www\.)?invidious\.kabi\.tk',
835 r'(?:www\.)?invidious\.mastodon\.host',
836 r'(?:www\.)?invidious\.zapashcanon\.fr',
837 r'(?:www\.)?(?:invidious(?:-us)?|piped)\.kavin\.rocks',
838 r'(?:www\.)?invidious\.tinfoil-hat\.net',
839 r'(?:www\.)?invidious\.himiko\.cloud',
840 r'(?:www\.)?invidious\.reallyancient\.tech',
841 r'(?:www\.)?invidious\.tube',
842 r'(?:www\.)?invidiou\.site',
843 r'(?:www\.)?invidious\.site',
844 r'(?:www\.)?invidious\.xyz',
845 r'(?:www\.)?invidious\.nixnet\.xyz',
846 r'(?:www\.)?invidious\.048596\.xyz',
847 r'(?:www\.)?invidious\.drycat\.fr',
848 r'(?:www\.)?inv\.skyn3t\.in',
849 r'(?:www\.)?tube\.poal\.co',
850 r'(?:www\.)?tube\.connect\.cafe',
851 r'(?:www\.)?vid\.wxzm\.sx',
852 r'(?:www\.)?vid\.mint\.lgbt',
853 r'(?:www\.)?vid\.puffyan\.us',
854 r'(?:www\.)?yewtu\.be',
855 r'(?:www\.)?yt\.elukerio\.org',
856 r'(?:www\.)?yt\.lelux\.fi',
857 r'(?:www\.)?invidious\.ggc-project\.de',
858 r'(?:www\.)?yt\.maisputain\.ovh',
859 r'(?:www\.)?ytprivate\.com',
860 r'(?:www\.)?invidious\.13ad\.de',
861 r'(?:www\.)?invidious\.toot\.koeln',
862 r'(?:www\.)?invidious\.fdn\.fr',
863 r'(?:www\.)?watch\.nettohikari\.com',
864 r'(?:www\.)?invidious\.namazso\.eu',
865 r'(?:www\.)?invidious\.silkky\.cloud',
866 r'(?:www\.)?invidious\.exonip\.de',
867 r'(?:www\.)?invidious\.riverside\.rocks',
868 r'(?:www\.)?invidious\.blamefran\.net',
869 r'(?:www\.)?invidious\.moomoo\.de',
870 r'(?:www\.)?ytb\.trom\.tf',
871 r'(?:www\.)?yt\.cyberhost\.uk',
872 r'(?:www\.)?kgg2m7yk5aybusll\.onion',
873 r'(?:www\.)?qklhadlycap4cnod\.onion',
874 r'(?:www\.)?axqzx4s6s54s32yentfqojs3x5i7faxza6xo3ehd4bzzsg2ii4fv2iid\.onion',
875 r'(?:www\.)?c7hqkpkpemu6e7emz5b4vyz7idjgdvgaaa3dyimmeojqbgpea3xqjoid\.onion',
876 r'(?:www\.)?fz253lmuao3strwbfbmx46yu7acac2jz27iwtorgmbqlkurlclmancad\.onion',
877 r'(?:www\.)?invidious\.l4qlywnpwqsluw65ts7md3khrivpirse744un3x7mlskqauz5pyuzgqd\.onion',
878 r'(?:www\.)?owxfohz4kjyv25fvlqilyxast7inivgiktls3th44jhk3ej3i7ya\.b32\.i2p',
879 r'(?:www\.)?4l2dgddgsrkf2ous66i6seeyi6etzfgrue332grh2n7madpwopotugyd\.onion',
880 r'(?:www\.)?w6ijuptxiku4xpnnaetxvnkc5vqcdu7mgns2u77qefoixi63vbvnpnqd\.onion',
881 r'(?:www\.)?kbjggqkzv65ivcqj6bumvp337z6264huv5kpkwuv6gu5yjiskvan7fad\.onion',
882 r'(?:www\.)?grwp24hodrefzvjjuccrkw3mjq4tzhaaq32amf33dzpmuxe7ilepcmad\.onion',
883 r'(?:www\.)?hpniueoejy4opn7bc4ftgazyqjoeqwlvh2uiku2xqku6zpoa4bf5ruid\.onion',
884 )
    # Matches youtube.com (and *-nocookie/kids) watch/embed/e URLs in all their
    # historical forms, youtu.be/vid.plus/zwearz shortlinks, every known
    # Invidious mirror (interpolated from _INVIDIOUS_SITES), and also a bare
    # 11-character video ID with no URL around it.  Verbose (?x) regex:
    # whitespace outside character classes is ignored.
    _VALID_URL = r"""(?x)^
                     (
                         (?:https?://|//)                                    # http(s):// or protocol-independent URL
                         (?:(?:(?:(?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie|kids)?\.com|
                            (?:www\.)?deturl\.com/www\.youtube\.com|
                            (?:www\.)?pwnyoutube\.com|
                            (?:www\.)?hooktube\.com|
                            (?:www\.)?yourepeat\.com|
                            tube\.majestyc\.net|
                            %(invidious)s|
                            youtube\.googleapis\.com)/                        # the various hostnames, with wildcard subdomains
                         (?:.*?\#/)?                                          # handle anchor (#/) redirect urls
                         (?:                                                  # the various things that can precede the ID:
                             (?:(?:v|embed|e)/(?!videoseries))                # v/ or embed/ or e/
                             |(?:                                             # or the v= param in all its forms
                                 (?:(?:watch|movie)(?:_popup)?(?:\.php)?/?)?  # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
                                 (?:\?|\#!?)                                  # the params delimiter ? or # or #!
                                 (?:.*?[&;])??                                # any other preceding param (like /?s=tuff&v=xxxx or ?s=tuff&amp;v=V36LpHqtcDY)
                                 v=
                             )
                         ))
                         |(?:
                            youtu\.be|                                        # just youtu.be/xxxx
                            vid\.plus|                                        # or vid.plus/xxxx
                            zwearz\.com/watch|                                # or zwearz.com/watch/xxxx
                            %(invidious)s
                         )/
                         |(?:www\.)?cleanvideosearch\.com/media/action/yt/watch\?videoId=
                         )
                     )?                                                       # all until now is optional -> you can pass the naked ID
                     (?P<id>[0-9A-Za-z_-]{11})                                # here is it! the YouTube video ID
                     (?(1).+)?                                                # if we found the ID, everything can follow
                     (?:\#|$)""" % {
        'invidious': '|'.join(_INVIDIOUS_SITES),
    }
    # Regexes (tried in order) to pull the player build id out of a player JS
    # URL; each alternative captures it in the named group "id".
    _PLAYER_INFO_RE = (
        r'/s/player/(?P<id>[a-zA-Z0-9_-]{8,})/player',
        r'/(?P<id>[a-zA-Z0-9_-]{8,})/player(?:_ias\.vflset(?:/[a-zA-Z]{2,3}_[a-zA-Z]{2,3})?|-plasma-ias-(?:phone|tablet)-[a-z]{2}_[A-Z]{2}\.vflset)/base\.js$',
        r'\b(?P<id>vfl[a-zA-Z0-9_-]+)\b.*?\.js$',
    )
    # Static table of format attributes keyed by itag (YouTube's numeric format
    # id, as a string).  Entries record container, resolution and codec info
    # that is known ahead of time for each itag.
    # NOTE(review): presumably used to fill in fields the API response omits or
    # misreports — confirm against the format-building code elsewhere in file.
    _formats = {
        '5': {'ext': 'flv', 'width': 400, 'height': 240, 'acodec': 'mp3', 'abr': 64, 'vcodec': 'h263'},
        '6': {'ext': 'flv', 'width': 450, 'height': 270, 'acodec': 'mp3', 'abr': 64, 'vcodec': 'h263'},
        '13': {'ext': '3gp', 'acodec': 'aac', 'vcodec': 'mp4v'},
        '17': {'ext': '3gp', 'width': 176, 'height': 144, 'acodec': 'aac', 'abr': 24, 'vcodec': 'mp4v'},
        '18': {'ext': 'mp4', 'width': 640, 'height': 360, 'acodec': 'aac', 'abr': 96, 'vcodec': 'h264'},
        '22': {'ext': 'mp4', 'width': 1280, 'height': 720, 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264'},
        '34': {'ext': 'flv', 'width': 640, 'height': 360, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'},
        '35': {'ext': 'flv', 'width': 854, 'height': 480, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'},
        # itag 36 videos are either 320x180 (BaW_jenozKc) or 320x240 (__2ABJjxzNo), abr varies as well
        '36': {'ext': '3gp', 'width': 320, 'acodec': 'aac', 'vcodec': 'mp4v'},
        '37': {'ext': 'mp4', 'width': 1920, 'height': 1080, 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264'},
        '38': {'ext': 'mp4', 'width': 4096, 'height': 3072, 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264'},
        '43': {'ext': 'webm', 'width': 640, 'height': 360, 'acodec': 'vorbis', 'abr': 128, 'vcodec': 'vp8'},
        '44': {'ext': 'webm', 'width': 854, 'height': 480, 'acodec': 'vorbis', 'abr': 128, 'vcodec': 'vp8'},
        '45': {'ext': 'webm', 'width': 1280, 'height': 720, 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8'},
        '46': {'ext': 'webm', 'width': 1920, 'height': 1080, 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8'},
        '59': {'ext': 'mp4', 'width': 854, 'height': 480, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'},
        '78': {'ext': 'mp4', 'width': 854, 'height': 480, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'},


        # 3D videos
        '82': {'ext': 'mp4', 'height': 360, 'format_note': '3D', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -20},
        '83': {'ext': 'mp4', 'height': 480, 'format_note': '3D', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -20},
        '84': {'ext': 'mp4', 'height': 720, 'format_note': '3D', 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264', 'preference': -20},
        '85': {'ext': 'mp4', 'height': 1080, 'format_note': '3D', 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264', 'preference': -20},
        '100': {'ext': 'webm', 'height': 360, 'format_note': '3D', 'acodec': 'vorbis', 'abr': 128, 'vcodec': 'vp8', 'preference': -20},
        '101': {'ext': 'webm', 'height': 480, 'format_note': '3D', 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8', 'preference': -20},
        '102': {'ext': 'webm', 'height': 720, 'format_note': '3D', 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8', 'preference': -20},

        # Apple HTTP Live Streaming
        '91': {'ext': 'mp4', 'height': 144, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 48, 'vcodec': 'h264', 'preference': -10},
        '92': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 48, 'vcodec': 'h264', 'preference': -10},
        '93': {'ext': 'mp4', 'height': 360, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -10},
        '94': {'ext': 'mp4', 'height': 480, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -10},
        '95': {'ext': 'mp4', 'height': 720, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 256, 'vcodec': 'h264', 'preference': -10},
        '96': {'ext': 'mp4', 'height': 1080, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 256, 'vcodec': 'h264', 'preference': -10},
        '132': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 48, 'vcodec': 'h264', 'preference': -10},
        '151': {'ext': 'mp4', 'height': 72, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 24, 'vcodec': 'h264', 'preference': -10},

        # DASH mp4 video
        '133': {'ext': 'mp4', 'height': 240, 'format_note': 'DASH video', 'vcodec': 'h264'},
        '134': {'ext': 'mp4', 'height': 360, 'format_note': 'DASH video', 'vcodec': 'h264'},
        '135': {'ext': 'mp4', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'h264'},
        '136': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'h264'},
        '137': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'h264'},
        '138': {'ext': 'mp4', 'format_note': 'DASH video', 'vcodec': 'h264'},  # Height can vary (https://github.com/ytdl-org/youtube-dl/issues/4559)
        '160': {'ext': 'mp4', 'height': 144, 'format_note': 'DASH video', 'vcodec': 'h264'},
        '212': {'ext': 'mp4', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'h264'},
        '264': {'ext': 'mp4', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'h264'},
        '298': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'h264', 'fps': 60},
        '299': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'h264', 'fps': 60},
        '266': {'ext': 'mp4', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'h264'},

        # Dash mp4 audio
        '139': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 48, 'container': 'm4a_dash'},
        '140': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 128, 'container': 'm4a_dash'},
        '141': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 256, 'container': 'm4a_dash'},
        '256': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'container': 'm4a_dash'},
        '258': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'container': 'm4a_dash'},
        '325': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'dtse', 'container': 'm4a_dash'},
        '328': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'ec-3', 'container': 'm4a_dash'},

        # Dash webm
        '167': {'ext': 'webm', 'height': 360, 'width': 640, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
        '168': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
        '169': {'ext': 'webm', 'height': 720, 'width': 1280, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
        '170': {'ext': 'webm', 'height': 1080, 'width': 1920, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
        '218': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
        '219': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
        '278': {'ext': 'webm', 'height': 144, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp9'},
        '242': {'ext': 'webm', 'height': 240, 'format_note': 'DASH video', 'vcodec': 'vp9'},
        '243': {'ext': 'webm', 'height': 360, 'format_note': 'DASH video', 'vcodec': 'vp9'},
        '244': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'vp9'},
        '245': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'vp9'},
        '246': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'vp9'},
        '247': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'vp9'},
        '248': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'vp9'},
        '271': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'vp9'},
        # itag 272 videos are either 3840x2160 (e.g. RtoitU2A-3E) or 7680x4320 (sLprVF6d7Ug)
        '272': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'vp9'},
        '302': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60},
        '303': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60},
        '308': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60},
        '313': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'vp9'},
        '315': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60},

        # Dash webm audio
        '171': {'ext': 'webm', 'acodec': 'vorbis', 'format_note': 'DASH audio', 'abr': 128},
        '172': {'ext': 'webm', 'acodec': 'vorbis', 'format_note': 'DASH audio', 'abr': 256},

        # Dash webm audio with opus inside
        '249': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 50},
        '250': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 70},
        '251': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 160},

        # RTMP (unnamed)
        '_rtmp': {'protocol': 'rtmp'},

        # av01 video only formats sometimes served with "unknown" codecs
        '394': {'acodec': 'none', 'vcodec': 'av01.0.05M.08'},
        '395': {'acodec': 'none', 'vcodec': 'av01.0.05M.08'},
        '396': {'acodec': 'none', 'vcodec': 'av01.0.05M.08'},
        '397': {'acodec': 'none', 'vcodec': 'av01.0.05M.08'},
    }
    # Subtitle serialization formats requested from the timedtext endpoint.
    _SUBTITLE_FORMATS = ('json3', 'srv1', 'srv2', 'srv3', 'ttml', 'vtt')

    # Human-readable reason strings that mark a video as age-restricted.
    # NOTE(review): presumably matched against the player response's
    # playability reason — confirm against the extraction code.
    _AGE_GATE_REASONS = (
        'Sign in to confirm your age',
        'This video may be inappropriate for some users.',
        'Sorry, this content is age-restricted.',
        'Please confirm your age.')

    # Machine-readable status codes that likewise signal an age check.
    _AGE_GATE_STATUS_REASONS = (
        'AGE_VERIFICATION_REQUIRED',
        'AGE_CHECK_REQUIRED'
    )

    # NOTE(review): disables the base InfoExtractor's generic geo-bypass
    # mechanism for single-video extraction — confirm intent.
    _GEO_BYPASS = False

    IE_NAME = 'youtube'
1046 _TESTS = [
1047 {
1048 'url': 'https://www.youtube.com/watch?v=BaW_jenozKc&t=1s&end=9',
1049 'info_dict': {
1050 'id': 'BaW_jenozKc',
1051 'ext': 'mp4',
1052 'title': 'youtube-dl test video "\'/\\ä↭𝕐',
1053 'uploader': 'Philipp Hagemeister',
1054 'uploader_id': 'phihag',
1055 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/phihag',
1056 'channel_id': 'UCLqxVugv74EIW3VWh2NOa3Q',
1057 'channel_url': r're:https?://(?:www\.)?youtube\.com/channel/UCLqxVugv74EIW3VWh2NOa3Q',
1058 'upload_date': '20121002',
1059 'description': 'test chars: "\'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de .',
1060 'categories': ['Science & Technology'],
1061 'tags': ['youtube-dl'],
1062 'duration': 10,
1063 'view_count': int,
1064 'like_count': int,
1065 'dislike_count': int,
1066 'start_time': 1,
1067 'end_time': 9,
1068 }
1069 },
1070 {
1071 'url': '//www.YouTube.com/watch?v=yZIXLfi8CZQ',
1072 'note': 'Embed-only video (#1746)',
1073 'info_dict': {
1074 'id': 'yZIXLfi8CZQ',
1075 'ext': 'mp4',
1076 'upload_date': '20120608',
1077 'title': 'Principal Sexually Assaults A Teacher - Episode 117 - 8th June 2012',
1078 'description': 'md5:09b78bd971f1e3e289601dfba15ca4f7',
1079 'uploader': 'SET India',
1080 'uploader_id': 'setindia',
1081 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/setindia',
1082 'age_limit': 18,
1083 },
1084 'skip': 'Private video',
1085 },
1086 {
1087 'url': 'https://www.youtube.com/watch?v=BaW_jenozKc&v=yZIXLfi8CZQ',
1088 'note': 'Use the first video ID in the URL',
1089 'info_dict': {
1090 'id': 'BaW_jenozKc',
1091 'ext': 'mp4',
1092 'title': 'youtube-dl test video "\'/\\ä↭𝕐',
1093 'uploader': 'Philipp Hagemeister',
1094 'uploader_id': 'phihag',
1095 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/phihag',
1096 'upload_date': '20121002',
1097 'description': 'test chars: "\'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de .',
1098 'categories': ['Science & Technology'],
1099 'tags': ['youtube-dl'],
1100 'duration': 10,
1101 'view_count': int,
1102 'like_count': int,
1103 'dislike_count': int,
1104 },
1105 'params': {
1106 'skip_download': True,
1107 },
1108 },
1109 {
1110 'url': 'https://www.youtube.com/watch?v=a9LDPn-MO4I',
1111 'note': '256k DASH audio (format 141) via DASH manifest',
1112 'info_dict': {
1113 'id': 'a9LDPn-MO4I',
1114 'ext': 'm4a',
1115 'upload_date': '20121002',
1116 'uploader_id': '8KVIDEO',
1117 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/8KVIDEO',
1118 'description': '',
1119 'uploader': '8KVIDEO',
1120 'title': 'UHDTV TEST 8K VIDEO.mp4'
1121 },
1122 'params': {
1123 'youtube_include_dash_manifest': True,
1124 'format': '141',
1125 },
1126 'skip': 'format 141 not served anymore',
1127 },
1128 # DASH manifest with encrypted signature
1129 {
1130 'url': 'https://www.youtube.com/watch?v=IB3lcPjvWLA',
1131 'info_dict': {
1132 'id': 'IB3lcPjvWLA',
1133 'ext': 'm4a',
1134 'title': 'Afrojack, Spree Wilson - The Spark (Official Music Video) ft. Spree Wilson',
1135 'description': 'md5:8f5e2b82460520b619ccac1f509d43bf',
1136 'duration': 244,
1137 'uploader': 'AfrojackVEVO',
1138 'uploader_id': 'AfrojackVEVO',
1139 'upload_date': '20131011',
1140 'abr': 129.495,
1141 },
1142 'params': {
1143 'youtube_include_dash_manifest': True,
1144 'format': '141/bestaudio[ext=m4a]',
1145 },
1146 },
1147 # Age-gate videos. See https://github.com/yt-dlp/yt-dlp/pull/575#issuecomment-888837000
1148 {
1149 'note': 'Embed allowed age-gate video',
1150 'url': 'https://youtube.com/watch?v=HtVdAasjOgU',
1151 'info_dict': {
1152 'id': 'HtVdAasjOgU',
1153 'ext': 'mp4',
1154 'title': 'The Witcher 3: Wild Hunt - The Sword Of Destiny Trailer',
1155 'description': r're:(?s).{100,}About the Game\n.*?The Witcher 3: Wild Hunt.{100,}',
1156 'duration': 142,
1157 'uploader': 'The Witcher',
1158 'uploader_id': 'WitcherGame',
1159 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/WitcherGame',
1160 'upload_date': '20140605',
1161 'age_limit': 18,
1162 },
1163 },
1164 {
1165 'note': 'Age-gate video with embed allowed in public site',
1166 'url': 'https://youtube.com/watch?v=HsUATh_Nc2U',
1167 'info_dict': {
1168 'id': 'HsUATh_Nc2U',
1169 'ext': 'mp4',
1170 'title': 'Godzilla 2 (Official Video)',
1171 'description': 'md5:bf77e03fcae5529475e500129b05668a',
1172 'upload_date': '20200408',
1173 'uploader_id': 'FlyingKitty900',
1174 'uploader': 'FlyingKitty',
1175 'age_limit': 18,
1176 },
1177 },
1178 {
1179 'note': 'Age-gate video embedable only with clientScreen=EMBED',
1180 'url': 'https://youtube.com/watch?v=Tq92D6wQ1mg',
1181 'info_dict': {
1182 'id': 'Tq92D6wQ1mg',
1183 'title': '[MMD] Adios - EVERGLOW [+Motion DL]',
1184 'ext': 'mp4','upload_date': '20191227',
1185 'uploader_id': 'UC1yoRdFoFJaCY-AGfD9W0wQ',
1186 'uploader': 'Projekt Melody',
1187 'description': 'md5:17eccca93a786d51bc67646756894066',
1188 'age_limit': 18,
1189 },
1190 },
1191 {
1192 'note': 'Non-Agegated non-embeddable video',
1193 'url': 'https://youtube.com/watch?v=MeJVWBSsPAY',
1194 'info_dict': {
1195 'id': 'MeJVWBSsPAY',
1196 'ext': 'mp4',
1197 'title': 'OOMPH! - Such Mich Find Mich (Lyrics)',
1198 'uploader': 'Herr Lurik',
1199 'uploader_id': 'st3in234',
1200 'description': 'Fan Video. Music & Lyrics by OOMPH!.',
1201 'upload_date': '20130730',
1202 },
1203 },
1204 {
1205 'note': 'Non-bypassable age-gated video',
1206 'url': 'https://youtube.com/watch?v=Cr381pDsSsA',
1207 'only_matching': True,
1208 },
1209 # video_info is None (https://github.com/ytdl-org/youtube-dl/issues/4421)
1210 # YouTube Red ad is not captured for creator
1211 {
1212 'url': '__2ABJjxzNo',
1213 'info_dict': {
1214 'id': '__2ABJjxzNo',
1215 'ext': 'mp4',
1216 'duration': 266,
1217 'upload_date': '20100430',
1218 'uploader_id': 'deadmau5',
1219 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/deadmau5',
1220 'creator': 'deadmau5',
1221 'description': 'md5:6cbcd3a92ce1bc676fc4d6ab4ace2336',
1222 'uploader': 'deadmau5',
1223 'title': 'Deadmau5 - Some Chords (HD)',
1224 'alt_title': 'Some Chords',
1225 },
1226 'expected_warnings': [
1227 'DASH manifest missing',
1228 ]
1229 },
1230 # Olympics (https://github.com/ytdl-org/youtube-dl/issues/4431)
1231 {
1232 'url': 'lqQg6PlCWgI',
1233 'info_dict': {
1234 'id': 'lqQg6PlCWgI',
1235 'ext': 'mp4',
1236 'duration': 6085,
1237 'upload_date': '20150827',
1238 'uploader_id': 'olympic',
1239 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/olympic',
1240 'description': 'HO09 - Women - GER-AUS - Hockey - 31 July 2012 - London 2012 Olympic Games',
1241 'uploader': 'Olympics',
1242 'title': 'Hockey - Women - GER-AUS - London 2012 Olympic Games',
1243 },
1244 'params': {
1245 'skip_download': 'requires avconv',
1246 }
1247 },
1248 # Non-square pixels
1249 {
1250 'url': 'https://www.youtube.com/watch?v=_b-2C3KPAM0',
1251 'info_dict': {
1252 'id': '_b-2C3KPAM0',
1253 'ext': 'mp4',
1254 'stretched_ratio': 16 / 9.,
1255 'duration': 85,
1256 'upload_date': '20110310',
1257 'uploader_id': 'AllenMeow',
1258 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/AllenMeow',
1259 'description': 'made by Wacom from Korea | 字幕&加油添醋 by TY\'s Allen | 感謝heylisa00cavey1001同學熱情提供梗及翻譯',
1260 'uploader': '孫ᄋᄅ',
1261 'title': '[A-made] 變態妍字幕版 太妍 我就是這樣的人',
1262 },
1263 },
1264 # url_encoded_fmt_stream_map is empty string
1265 {
1266 'url': 'qEJwOuvDf7I',
1267 'info_dict': {
1268 'id': 'qEJwOuvDf7I',
1269 'ext': 'webm',
1270 'title': 'Обсуждение судебной практики по выборам 14 сентября 2014 года в Санкт-Петербурге',
1271 'description': '',
1272 'upload_date': '20150404',
1273 'uploader_id': 'spbelect',
1274 'uploader': 'Наблюдатели Петербурга',
1275 },
1276 'params': {
1277 'skip_download': 'requires avconv',
1278 },
1279 'skip': 'This live event has ended.',
1280 },
1281 # Extraction from multiple DASH manifests (https://github.com/ytdl-org/youtube-dl/pull/6097)
1282 {
1283 'url': 'https://www.youtube.com/watch?v=FIl7x6_3R5Y',
1284 'info_dict': {
1285 'id': 'FIl7x6_3R5Y',
1286 'ext': 'webm',
1287 'title': 'md5:7b81415841e02ecd4313668cde88737a',
1288 'description': 'md5:116377fd2963b81ec4ce64b542173306',
1289 'duration': 220,
1290 'upload_date': '20150625',
1291 'uploader_id': 'dorappi2000',
1292 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/dorappi2000',
1293 'uploader': 'dorappi2000',
1294 'formats': 'mincount:31',
1295 },
1296 'skip': 'not actual anymore',
1297 },
1298 # DASH manifest with segment_list
1299 {
1300 'url': 'https://www.youtube.com/embed/CsmdDsKjzN8',
1301 'md5': '8ce563a1d667b599d21064e982ab9e31',
1302 'info_dict': {
1303 'id': 'CsmdDsKjzN8',
1304 'ext': 'mp4',
1305 'upload_date': '20150501', # According to '<meta itemprop="datePublished"', but in other places it's 20150510
1306 'uploader': 'Airtek',
1307 'description': 'Retransmisión en directo de la XVIII media maratón de Zaragoza.',
1308 'uploader_id': 'UCzTzUmjXxxacNnL8I3m4LnQ',
1309 'title': 'Retransmisión XVIII Media maratón Zaragoza 2015',
1310 },
1311 'params': {
1312 'youtube_include_dash_manifest': True,
1313 'format': '135', # bestvideo
1314 },
1315 'skip': 'This live event has ended.',
1316 },
1317 {
1318 # Multifeed videos (multiple cameras), URL is for Main Camera
1319 'url': 'https://www.youtube.com/watch?v=jvGDaLqkpTg',
1320 'info_dict': {
1321 'id': 'jvGDaLqkpTg',
1322 'title': 'Tom Clancy Free Weekend Rainbow Whatever',
1323 'description': 'md5:e03b909557865076822aa169218d6a5d',
1324 },
1325 'playlist': [{
1326 'info_dict': {
1327 'id': 'jvGDaLqkpTg',
1328 'ext': 'mp4',
1329 'title': 'Tom Clancy Free Weekend Rainbow Whatever (Main Camera)',
1330 'description': 'md5:e03b909557865076822aa169218d6a5d',
1331 'duration': 10643,
1332 'upload_date': '20161111',
1333 'uploader': 'Team PGP',
1334 'uploader_id': 'UChORY56LMMETTuGjXaJXvLg',
1335 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UChORY56LMMETTuGjXaJXvLg',
1336 },
1337 }, {
1338 'info_dict': {
1339 'id': '3AKt1R1aDnw',
1340 'ext': 'mp4',
1341 'title': 'Tom Clancy Free Weekend Rainbow Whatever (Camera 2)',
1342 'description': 'md5:e03b909557865076822aa169218d6a5d',
1343 'duration': 10991,
1344 'upload_date': '20161111',
1345 'uploader': 'Team PGP',
1346 'uploader_id': 'UChORY56LMMETTuGjXaJXvLg',
1347 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UChORY56LMMETTuGjXaJXvLg',
1348 },
1349 }, {
1350 'info_dict': {
1351 'id': 'RtAMM00gpVc',
1352 'ext': 'mp4',
1353 'title': 'Tom Clancy Free Weekend Rainbow Whatever (Camera 3)',
1354 'description': 'md5:e03b909557865076822aa169218d6a5d',
1355 'duration': 10995,
1356 'upload_date': '20161111',
1357 'uploader': 'Team PGP',
1358 'uploader_id': 'UChORY56LMMETTuGjXaJXvLg',
1359 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UChORY56LMMETTuGjXaJXvLg',
1360 },
1361 }, {
1362 'info_dict': {
1363 'id': '6N2fdlP3C5U',
1364 'ext': 'mp4',
1365 'title': 'Tom Clancy Free Weekend Rainbow Whatever (Camera 4)',
1366 'description': 'md5:e03b909557865076822aa169218d6a5d',
1367 'duration': 10990,
1368 'upload_date': '20161111',
1369 'uploader': 'Team PGP',
1370 'uploader_id': 'UChORY56LMMETTuGjXaJXvLg',
1371 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UChORY56LMMETTuGjXaJXvLg',
1372 },
1373 }],
1374 'params': {
1375 'skip_download': True,
1376 },
1377 'skip': 'Not multifeed anymore',
1378 },
1379 {
1380 # Multifeed video with comma in title (see https://github.com/ytdl-org/youtube-dl/issues/8536)
1381 'url': 'https://www.youtube.com/watch?v=gVfLd0zydlo',
1382 'info_dict': {
1383 'id': 'gVfLd0zydlo',
1384 'title': 'DevConf.cz 2016 Day 2 Workshops 1 14:00 - 15:30',
1385 },
1386 'playlist_count': 2,
1387 'skip': 'Not multifeed anymore',
1388 },
1389 {
1390 'url': 'https://vid.plus/FlRa-iH7PGw',
1391 'only_matching': True,
1392 },
1393 {
1394 'url': 'https://zwearz.com/watch/9lWxNJF-ufM/electra-woman-dyna-girl-official-trailer-grace-helbig.html',
1395 'only_matching': True,
1396 },
1397 {
1398 # Title with JS-like syntax "};" (see https://github.com/ytdl-org/youtube-dl/issues/7468)
1399 # Also tests cut-off URL expansion in video description (see
1400 # https://github.com/ytdl-org/youtube-dl/issues/1892,
1401 # https://github.com/ytdl-org/youtube-dl/issues/8164)
1402 'url': 'https://www.youtube.com/watch?v=lsguqyKfVQg',
1403 'info_dict': {
1404 'id': 'lsguqyKfVQg',
1405 'ext': 'mp4',
1406 'title': '{dark walk}; Loki/AC/Dishonored; collab w/Elflover21',
1407 'alt_title': 'Dark Walk',
1408 'description': 'md5:8085699c11dc3f597ce0410b0dcbb34a',
1409 'duration': 133,
1410 'upload_date': '20151119',
1411 'uploader_id': 'IronSoulElf',
1412 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/IronSoulElf',
1413 'uploader': 'IronSoulElf',
1414 'creator': 'Todd Haberman;\nDaniel Law Heath and Aaron Kaplan',
1415 'track': 'Dark Walk',
1416 'artist': 'Todd Haberman;\nDaniel Law Heath and Aaron Kaplan',
1417 'album': 'Position Music - Production Music Vol. 143 - Dark Walk',
1418 },
1419 'params': {
1420 'skip_download': True,
1421 },
1422 },
1423 {
1424 # Tags with '};' (see https://github.com/ytdl-org/youtube-dl/issues/7468)
1425 'url': 'https://www.youtube.com/watch?v=Ms7iBXnlUO8',
1426 'only_matching': True,
1427 },
1428 {
1429 # Video with yt:stretch=17:0
1430 'url': 'https://www.youtube.com/watch?v=Q39EVAstoRM',
1431 'info_dict': {
1432 'id': 'Q39EVAstoRM',
1433 'ext': 'mp4',
1434 'title': 'Clash Of Clans#14 Dicas De Ataque Para CV 4',
1435 'description': 'md5:ee18a25c350637c8faff806845bddee9',
1436 'upload_date': '20151107',
1437 'uploader_id': 'UCCr7TALkRbo3EtFzETQF1LA',
1438 'uploader': 'CH GAMER DROID',
1439 },
1440 'params': {
1441 'skip_download': True,
1442 },
1443 'skip': 'This video does not exist.',
1444 },
1445 {
1446 # Video with incomplete 'yt:stretch=16:'
1447 'url': 'https://www.youtube.com/watch?v=FRhJzUSJbGI',
1448 'only_matching': True,
1449 },
1450 {
1451 # Video licensed under Creative Commons
1452 'url': 'https://www.youtube.com/watch?v=M4gD1WSo5mA',
1453 'info_dict': {
1454 'id': 'M4gD1WSo5mA',
1455 'ext': 'mp4',
1456 'title': 'md5:e41008789470fc2533a3252216f1c1d1',
1457 'description': 'md5:a677553cf0840649b731a3024aeff4cc',
1458 'duration': 721,
1459 'upload_date': '20150127',
1460 'uploader_id': 'BerkmanCenter',
1461 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/BerkmanCenter',
1462 'uploader': 'The Berkman Klein Center for Internet & Society',
1463 'license': 'Creative Commons Attribution license (reuse allowed)',
1464 },
1465 'params': {
1466 'skip_download': True,
1467 },
1468 },
1469 {
1470 # Channel-like uploader_url
1471 'url': 'https://www.youtube.com/watch?v=eQcmzGIKrzg',
1472 'info_dict': {
1473 'id': 'eQcmzGIKrzg',
1474 'ext': 'mp4',
1475 'title': 'Democratic Socialism and Foreign Policy | Bernie Sanders',
1476 'description': 'md5:13a2503d7b5904ef4b223aa101628f39',
1477 'duration': 4060,
1478 'upload_date': '20151119',
1479 'uploader': 'Bernie Sanders',
1480 'uploader_id': 'UCH1dpzjCEiGAt8CXkryhkZg',
1481 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UCH1dpzjCEiGAt8CXkryhkZg',
1482 'license': 'Creative Commons Attribution license (reuse allowed)',
1483 },
1484 'params': {
1485 'skip_download': True,
1486 },
1487 },
1488 {
1489 'url': 'https://www.youtube.com/watch?feature=player_embedded&amp;amp;v=V36LpHqtcDY',
1490 'only_matching': True,
1491 },
1492 {
1493 # YouTube Red paid video (https://github.com/ytdl-org/youtube-dl/issues/10059)
1494 'url': 'https://www.youtube.com/watch?v=i1Ko8UG-Tdo',
1495 'only_matching': True,
1496 },
1497 {
1498 # Rental video preview
1499 'url': 'https://www.youtube.com/watch?v=yYr8q0y5Jfg',
1500 'info_dict': {
1501 'id': 'uGpuVWrhIzE',
1502 'ext': 'mp4',
1503 'title': 'Piku - Trailer',
1504 'description': 'md5:c36bd60c3fd6f1954086c083c72092eb',
1505 'upload_date': '20150811',
1506 'uploader': 'FlixMatrix',
1507 'uploader_id': 'FlixMatrixKaravan',
1508 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/FlixMatrixKaravan',
1509 'license': 'Standard YouTube License',
1510 },
1511 'params': {
1512 'skip_download': True,
1513 },
1514 'skip': 'This video is not available.',
1515 },
1516 {
1517 # YouTube Red video with episode data
1518 'url': 'https://www.youtube.com/watch?v=iqKdEhx-dD4',
1519 'info_dict': {
1520 'id': 'iqKdEhx-dD4',
1521 'ext': 'mp4',
1522 'title': 'Isolation - Mind Field (Ep 1)',
1523 'description': 'md5:f540112edec5d09fc8cc752d3d4ba3cd',
1524 'duration': 2085,
1525 'upload_date': '20170118',
1526 'uploader': 'Vsauce',
1527 'uploader_id': 'Vsauce',
1528 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/Vsauce',
1529 'series': 'Mind Field',
1530 'season_number': 1,
1531 'episode_number': 1,
1532 },
1533 'params': {
1534 'skip_download': True,
1535 },
1536 'expected_warnings': [
1537 'Skipping DASH manifest',
1538 ],
1539 },
1540 {
1541 # The following content has been identified by the YouTube community
1542 # as inappropriate or offensive to some audiences.
1543 'url': 'https://www.youtube.com/watch?v=6SJNVb0GnPI',
1544 'info_dict': {
1545 'id': '6SJNVb0GnPI',
1546 'ext': 'mp4',
1547 'title': 'Race Differences in Intelligence',
1548 'description': 'md5:5d161533167390427a1f8ee89a1fc6f1',
1549 'duration': 965,
1550 'upload_date': '20140124',
1551 'uploader': 'New Century Foundation',
1552 'uploader_id': 'UCEJYpZGqgUob0zVVEaLhvVg',
1553 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UCEJYpZGqgUob0zVVEaLhvVg',
1554 },
1555 'params': {
1556 'skip_download': True,
1557 },
1558 'skip': 'This video has been removed for violating YouTube\'s policy on hate speech.',
1559 },
1560 {
1561 # itag 212
1562 'url': '1t24XAntNCY',
1563 'only_matching': True,
1564 },
1565 {
1566 # geo restricted to JP
1567 'url': 'sJL6WA-aGkQ',
1568 'only_matching': True,
1569 },
1570 {
1571 'url': 'https://invidio.us/watch?v=BaW_jenozKc',
1572 'only_matching': True,
1573 },
1574 {
1575 'url': 'https://redirect.invidious.io/watch?v=BaW_jenozKc',
1576 'only_matching': True,
1577 },
1578 {
1579 # from https://nitter.pussthecat.org/YouTube/status/1360363141947944964#m
1580 'url': 'https://redirect.invidious.io/Yh0AhrY9GjA',
1581 'only_matching': True,
1582 },
1583 {
1584 # DRM protected
1585 'url': 'https://www.youtube.com/watch?v=s7_qI6_mIXc',
1586 'only_matching': True,
1587 },
1588 {
1589 # Video with unsupported adaptive stream type formats
1590 'url': 'https://www.youtube.com/watch?v=Z4Vy8R84T1U',
1591 'info_dict': {
1592 'id': 'Z4Vy8R84T1U',
1593 'ext': 'mp4',
1594 'title': 'saman SMAN 53 Jakarta(Sancety) opening COFFEE4th at SMAN 53 Jakarta',
1595 'description': 'md5:d41d8cd98f00b204e9800998ecf8427e',
1596 'duration': 433,
1597 'upload_date': '20130923',
1598 'uploader': 'Amelia Putri Harwita',
1599 'uploader_id': 'UCpOxM49HJxmC1qCalXyB3_Q',
1600 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UCpOxM49HJxmC1qCalXyB3_Q',
1601 'formats': 'maxcount:10',
1602 },
1603 'params': {
1604 'skip_download': True,
1605 'youtube_include_dash_manifest': False,
1606 },
1607 'skip': 'not actual anymore',
1608 },
1609 {
1610 # Youtube Music Auto-generated description
1611 'url': 'https://music.youtube.com/watch?v=MgNrAu2pzNs',
1612 'info_dict': {
1613 'id': 'MgNrAu2pzNs',
1614 'ext': 'mp4',
1615 'title': 'Voyeur Girl',
1616 'description': 'md5:7ae382a65843d6df2685993e90a8628f',
1617 'upload_date': '20190312',
1618 'uploader': 'Stephen - Topic',
1619 'uploader_id': 'UC-pWHpBjdGG69N9mM2auIAA',
1620 'artist': 'Stephen',
1621 'track': 'Voyeur Girl',
1622 'album': 'it\'s too much love to know my dear',
1623 'release_date': '20190313',
1624 'release_year': 2019,
1625 },
1626 'params': {
1627 'skip_download': True,
1628 },
1629 },
1630 {
1631 'url': 'https://www.youtubekids.com/watch?v=3b8nCWDgZ6Q',
1632 'only_matching': True,
1633 },
1634 {
1635 # invalid -> valid video id redirection
1636 'url': 'DJztXj2GPfl',
1637 'info_dict': {
1638 'id': 'DJztXj2GPfk',
1639 'ext': 'mp4',
1640 'title': 'Panjabi MC - Mundian To Bach Ke (The Dictator Soundtrack)',
1641 'description': 'md5:bf577a41da97918e94fa9798d9228825',
1642 'upload_date': '20090125',
1643 'uploader': 'Prochorowka',
1644 'uploader_id': 'Prochorowka',
1645 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/Prochorowka',
1646 'artist': 'Panjabi MC',
1647 'track': 'Beware of the Boys (Mundian to Bach Ke) - Motivo Hi-Lectro Remix',
1648 'album': 'Beware of the Boys (Mundian To Bach Ke)',
1649 },
1650 'params': {
1651 'skip_download': True,
1652 },
1653 'skip': 'Video unavailable',
1654 },
1655 {
1656 # empty description results in an empty string
1657 'url': 'https://www.youtube.com/watch?v=x41yOUIvK2k',
1658 'info_dict': {
1659 'id': 'x41yOUIvK2k',
1660 'ext': 'mp4',
1661 'title': 'IMG 3456',
1662 'description': '',
1663 'upload_date': '20170613',
1664 'uploader_id': 'ElevageOrVert',
1665 'uploader': 'ElevageOrVert',
1666 },
1667 'params': {
1668 'skip_download': True,
1669 },
1670 },
1671 {
1672 # with '};' inside yt initial data (see [1])
1673 # see [2] for an example with '};' inside ytInitialPlayerResponse
1674 # 1. https://github.com/ytdl-org/youtube-dl/issues/27093
1675 # 2. https://github.com/ytdl-org/youtube-dl/issues/27216
1676 'url': 'https://www.youtube.com/watch?v=CHqg6qOn4no',
1677 'info_dict': {
1678 'id': 'CHqg6qOn4no',
1679 'ext': 'mp4',
1680 'title': 'Part 77 Sort a list of simple types in c#',
1681 'description': 'md5:b8746fa52e10cdbf47997903f13b20dc',
1682 'upload_date': '20130831',
1683 'uploader_id': 'kudvenkat',
1684 'uploader': 'kudvenkat',
1685 },
1686 'params': {
1687 'skip_download': True,
1688 },
1689 },
1690 {
1691 # another example of '};' in ytInitialData
1692 'url': 'https://www.youtube.com/watch?v=gVfgbahppCY',
1693 'only_matching': True,
1694 },
1695 {
1696 'url': 'https://www.youtube.com/watch_popup?v=63RmMXCd_bQ',
1697 'only_matching': True,
1698 },
1699 {
1700 # https://github.com/ytdl-org/youtube-dl/pull/28094
1701 'url': 'OtqTfy26tG0',
1702 'info_dict': {
1703 'id': 'OtqTfy26tG0',
1704 'ext': 'mp4',
1705 'title': 'Burn Out',
1706 'description': 'md5:8d07b84dcbcbfb34bc12a56d968b6131',
1707 'upload_date': '20141120',
1708 'uploader': 'The Cinematic Orchestra - Topic',
1709 'uploader_id': 'UCIzsJBIyo8hhpFm1NK0uLgw',
1710 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UCIzsJBIyo8hhpFm1NK0uLgw',
1711 'artist': 'The Cinematic Orchestra',
1712 'track': 'Burn Out',
1713 'album': 'Every Day',
1714 'release_data': None,
1715 'release_year': None,
1716 },
1717 'params': {
1718 'skip_download': True,
1719 },
1720 },
1721 {
1722 # controversial video, only works with bpctr when authenticated with cookies
1723 'url': 'https://www.youtube.com/watch?v=nGC3D_FkCmg',
1724 'only_matching': True,
1725 },
1726 {
1727 # controversial video, requires bpctr/contentCheckOk
1728 'url': 'https://www.youtube.com/watch?v=SZJvDhaSDnc',
1729 'info_dict': {
1730 'id': 'SZJvDhaSDnc',
1731 'ext': 'mp4',
1732 'title': 'San Diego teen commits suicide after bullying over embarrassing video',
1733 'channel_id': 'UC-SJ6nODDmufqBzPBwCvYvQ',
1734 'uploader': 'CBS This Morning',
1735 'uploader_id': 'CBSThisMorning',
1736 'upload_date': '20140716',
1737 'description': 'md5:acde3a73d3f133fc97e837a9f76b53b7'
1738 }
1739 },
1740 {
1741 # restricted location, https://github.com/ytdl-org/youtube-dl/issues/28685
1742 'url': 'cBvYw8_A0vQ',
1743 'info_dict': {
1744 'id': 'cBvYw8_A0vQ',
1745 'ext': 'mp4',
1746 'title': '4K Ueno Okachimachi Street Scenes 上野御徒町歩き',
1747 'description': 'md5:ea770e474b7cd6722b4c95b833c03630',
1748 'upload_date': '20201120',
1749 'uploader': 'Walk around Japan',
1750 'uploader_id': 'UC3o_t8PzBmXf5S9b7GLx1Mw',
1751 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UC3o_t8PzBmXf5S9b7GLx1Mw',
1752 },
1753 'params': {
1754 'skip_download': True,
1755 },
1756 }, {
1757 # Has multiple audio streams
1758 'url': 'WaOKSUlf4TM',
1759 'only_matching': True
1760 }, {
1761 # Requires Premium: has format 141 when requested using YTM url
1762 'url': 'https://music.youtube.com/watch?v=XclachpHxis',
1763 'only_matching': True
1764 }, {
1765 # multiple subtitles with same lang_code
1766 'url': 'https://www.youtube.com/watch?v=wsQiKKfKxug',
1767 'only_matching': True,
1768 }, {
1769 # Force use android client fallback
1770 'url': 'https://www.youtube.com/watch?v=YOelRv7fMxY',
1771 'info_dict': {
1772 'id': 'YOelRv7fMxY',
1773 'title': 'DIGGING A SECRET TUNNEL Part 1',
1774 'ext': '3gp',
1775 'upload_date': '20210624',
1776 'channel_id': 'UCp68_FLety0O-n9QU6phsgw',
1777 'uploader': 'colinfurze',
1778 'uploader_id': 'colinfurze',
1779 'channel_url': r're:https?://(?:www\.)?youtube\.com/channel/UCp68_FLety0O-n9QU6phsgw',
1780 'description': 'md5:b5096f56af7ccd7a555c84db81738b22'
1781 },
1782 'params': {
1783 'format': '17', # 3gp format available on android
1784 'extractor_args': {'youtube': {'player_client': ['android']}},
1785 },
1786 },
1787 {
1788 # Skip download of additional client configs (remix client config in this case)
1789 'url': 'https://music.youtube.com/watch?v=MgNrAu2pzNs',
1790 'only_matching': True,
1791 'params': {
1792 'extractor_args': {'youtube': {'player_skip': ['configs']}},
1793 },
1794 }
1795 ]
1796
1797 @classmethod
1798 def suitable(cls, url):
1799 # Hack for lazy extractors until more generic solution is implemented
1800 # (see #28780)
1801 from .youtube import parse_qs
1802 qs = parse_qs(url)
1803 if qs.get('list', [None])[0]:
1804 return False
1805 return super(YoutubeIE, cls).suitable(url)
1806
    def __init__(self, *args, **kwargs):
        """Initialize the extractor with per-instance caches used by signature extraction."""
        super(YoutubeIE, self).__init__(*args, **kwargs)
        # player id -> downloaded player JS source
        self._code_cache = {}
        # (player_url, sig shape) -> signature decryption callable
        self._player_cache = {}
1811
1812 def _extract_player_url(self, ytcfg=None, webpage=None):
1813 player_url = try_get(ytcfg, (lambda x: x['PLAYER_JS_URL']), str)
1814 if not player_url and webpage:
1815 player_url = self._search_regex(
1816 r'"(?:PLAYER_JS_URL|jsUrl)"\s*:\s*"([^"]+)"',
1817 webpage, 'player URL', fatal=False)
1818 if not player_url:
1819 return None
1820 if player_url.startswith('//'):
1821 player_url = 'https:' + player_url
1822 elif not re.match(r'https?://', player_url):
1823 player_url = compat_urlparse.urljoin(
1824 'https://www.youtube.com', player_url)
1825 return player_url
1826
1827 def _signature_cache_id(self, example_sig):
1828 """ Return a string representation of a signature """
1829 return '.'.join(compat_str(len(part)) for part in example_sig.split('.'))
1830
1831 @classmethod
1832 def _extract_player_info(cls, player_url):
1833 for player_re in cls._PLAYER_INFO_RE:
1834 id_m = re.search(player_re, player_url)
1835 if id_m:
1836 break
1837 else:
1838 raise ExtractorError('Cannot identify player %r' % player_url)
1839 return id_m.group('id')
1840
    def _load_player(self, video_id, player_url, fatal=True) -> bool:
        """Download the player JS (memoized per player id in self._code_cache).

        Returns whether an entry exists for this player id in the cache.
        NOTE(review): with fatal=False a failed download may store a falsy
        value under the key, which still makes this return True — verify.
        """
        player_id = self._extract_player_info(player_url)
        if player_id not in self._code_cache:
            # Cache by player id so multiple videos sharing a player download it once
            self._code_cache[player_id] = self._download_webpage(
                player_url, video_id, fatal=fatal,
                note='Downloading player ' + player_id,
                errnote='Download of %s failed' % player_url)
        return player_id in self._code_cache
1849
    def _extract_signature_function(self, video_id, player_url, example_sig):
        """Return a callable mapping an encrypted sig string to its decrypted form.

        Uses the filesystem cache when available; otherwise downloads the
        player JS, extracts the function, and caches the resulting permutation.
        """
        player_id = self._extract_player_info(player_url)

        # Read from filesystem cache
        # Key combines player id with the sig "shape" (dot-joined part lengths)
        func_id = 'js_%s_%s' % (
            player_id, self._signature_cache_id(example_sig))
        assert os.path.basename(func_id) == func_id

        cache_spec = self._downloader.cache.load('youtube-sigfuncs', func_id)
        if cache_spec is not None:
            # cache_spec is a permutation: output char i comes from input index cache_spec[i]
            return lambda s: ''.join(s[i] for i in cache_spec)

        if self._load_player(video_id, player_url):
            code = self._code_cache[player_id]
            res = self._parse_sig_js(code)

            # Derive the permutation by running the JS function on a string of
            # distinct characters (chr(0), chr(1), ...) of matching length
            test_string = ''.join(map(compat_chr, range(len(example_sig))))
            cache_res = res(test_string)
            cache_spec = [ord(c) for c in cache_res]

            self._downloader.cache.store('youtube-sigfuncs', func_id, cache_spec)
            return res
        # NOTE(review): returns None implicitly when the player could not be
        # loaded (only reachable when _load_player is called non-fatally)
1872
    def _print_sig_code(self, func, example_sig):
        """Print Python source equivalent to the extracted sig function (debug aid,
        enabled via the youtube_print_sig_code param)."""
        def gen_sig_code(idxs):
            def _genslice(start, end, step):
                # Render a run of indices as a Python slice expression
                starts = '' if start == 0 else str(start)
                ends = (':%d' % (end + step)) if end + step >= 0 else ':'
                steps = '' if step == 1 else (':%d' % step)
                return 's[%s%s%s]' % (starts, ends, steps)

            step = None
            # Quelch pyflakes warnings - start will be set when step is set
            start = '(Never used)'
            # Collapse runs of consecutive indices (step +/-1) into slices
            for i, prev in zip(idxs[1:], idxs[:-1]):
                if step is not None:
                    if i - prev == step:
                        continue
                    yield _genslice(start, prev, step)
                    step = None
                    continue
                if i - prev in [-1, 1]:
                    step = i - prev
                    start = prev
                    continue
                else:
                    yield 's[%d]' % prev
            # Flush the final element or the open slice
            if step is None:
                yield 's[%d]' % i
            else:
                yield _genslice(start, i, step)

        # Run the function on a known string to recover the index permutation
        test_string = ''.join(map(compat_chr, range(len(example_sig))))
        cache_res = func(test_string)
        cache_spec = [ord(c) for c in cache_res]
        expr_code = ' + '.join(gen_sig_code(cache_spec))
        signature_id_tuple = '(%s)' % (
            ', '.join(compat_str(len(p)) for p in example_sig.split('.')))
        code = ('if tuple(len(p) for p in s.split(\'.\')) == %s:\n'
                '    return %s\n') % (signature_id_tuple, expr_code)
        self.to_screen('Extracted signature function:\n' + code)
1911
    def _parse_sig_js(self, jscode):
        """Locate the signature-scrambling function in the player JS and return
        a Python callable wrapping it via JSInterpreter."""
        # Patterns are ordered newest-player-first; older ones are kept as fallbacks
        funcname = self._search_regex(
            (r'\b[cs]\s*&&\s*[adf]\.set\([^,]+\s*,\s*encodeURIComponent\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\(',
             r'\b[a-zA-Z0-9]+\s*&&\s*[a-zA-Z0-9]+\.set\([^,]+\s*,\s*encodeURIComponent\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\(',
             r'\bm=(?P<sig>[a-zA-Z0-9$]{2})\(decodeURIComponent\(h\.s\)\)',
             r'\bc&&\(c=(?P<sig>[a-zA-Z0-9$]{2})\(decodeURIComponent\(c\)\)',
             r'(?:\b|[^a-zA-Z0-9$])(?P<sig>[a-zA-Z0-9$]{2})\s*=\s*function\(\s*a\s*\)\s*{\s*a\s*=\s*a\.split\(\s*""\s*\);[a-zA-Z0-9$]{2}\.[a-zA-Z0-9$]{2}\(a,\d+\)',
             r'(?:\b|[^a-zA-Z0-9$])(?P<sig>[a-zA-Z0-9$]{2})\s*=\s*function\(\s*a\s*\)\s*{\s*a\s*=\s*a\.split\(\s*""\s*\)',
             r'(?P<sig>[a-zA-Z0-9$]+)\s*=\s*function\(\s*a\s*\)\s*{\s*a\s*=\s*a\.split\(\s*""\s*\)',
             # Obsolete patterns
             r'(["\'])signature\1\s*,\s*(?P<sig>[a-zA-Z0-9$]+)\(',
             r'\.sig\|\|(?P<sig>[a-zA-Z0-9$]+)\(',
             r'yt\.akamaized\.net/\)\s*\|\|\s*.*?\s*[cs]\s*&&\s*[adf]\.set\([^,]+\s*,\s*(?:encodeURIComponent\s*\()?\s*(?P<sig>[a-zA-Z0-9$]+)\(',
             r'\b[cs]\s*&&\s*[adf]\.set\([^,]+\s*,\s*(?P<sig>[a-zA-Z0-9$]+)\(',
             r'\b[a-zA-Z0-9]+\s*&&\s*[a-zA-Z0-9]+\.set\([^,]+\s*,\s*(?P<sig>[a-zA-Z0-9$]+)\(',
             r'\bc\s*&&\s*a\.set\([^,]+\s*,\s*\([^)]*\)\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\(',
             r'\bc\s*&&\s*[a-zA-Z0-9]+\.set\([^,]+\s*,\s*\([^)]*\)\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\(',
             r'\bc\s*&&\s*[a-zA-Z0-9]+\.set\([^,]+\s*,\s*\([^)]*\)\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\('),
            jscode, 'Initial JS player signature function name', group='sig')

        jsi = JSInterpreter(jscode)
        initial_function = jsi.extract_function(funcname)
        # The JS function takes the sig as a single argument
        return lambda s: initial_function([s])
1935
1936 def _decrypt_signature(self, s, video_id, player_url):
1937 """Turn the encrypted s field into a working signature"""
1938
1939 if player_url is None:
1940 raise ExtractorError('Cannot decrypt signature without player_url')
1941
1942 try:
1943 player_id = (player_url, self._signature_cache_id(s))
1944 if player_id not in self._player_cache:
1945 func = self._extract_signature_function(
1946 video_id, player_url, s
1947 )
1948 self._player_cache[player_id] = func
1949 func = self._player_cache[player_id]
1950 if self.get_param('youtube_print_sig_code'):
1951 self._print_sig_code(func, s)
1952 return func(s)
1953 except Exception as e:
1954 tb = traceback.format_exc()
1955 raise ExtractorError(
1956 'Signature extraction failed: ' + tb, cause=e)
1957
1958 def _extract_signature_timestamp(self, video_id, player_url, ytcfg=None, fatal=False):
1959 """
1960 Extract signatureTimestamp (sts)
1961 Required to tell API what sig/player version is in use.
1962 """
1963 sts = None
1964 if isinstance(ytcfg, dict):
1965 sts = int_or_none(ytcfg.get('STS'))
1966
1967 if not sts:
1968 # Attempt to extract from player
1969 if player_url is None:
1970 error_msg = 'Cannot extract signature timestamp without player_url.'
1971 if fatal:
1972 raise ExtractorError(error_msg)
1973 self.report_warning(error_msg)
1974 return
1975 if self._load_player(video_id, player_url, fatal=fatal):
1976 player_id = self._extract_player_info(player_url)
1977 code = self._code_cache[player_id]
1978 sts = int_or_none(self._search_regex(
1979 r'(?:signatureTimestamp|sts)\s*:\s*(?P<sts>[0-9]{5})', code,
1980 'JS player signature timestamp', group='sts', fatal=fatal))
1981 return sts
1982
1983 def _mark_watched(self, video_id, player_responses):
1984 playback_url = traverse_obj(
1985 player_responses, (..., 'playbackTracking', 'videostatsPlaybackUrl', 'baseUrl'),
1986 expected_type=url_or_none, get_all=False)
1987 if not playback_url:
1988 self.report_warning('Unable to mark watched')
1989 return
1990 parsed_playback_url = compat_urlparse.urlparse(playback_url)
1991 qs = compat_urlparse.parse_qs(parsed_playback_url.query)
1992
1993 # cpn generation algorithm is reverse engineered from base.js.
1994 # In fact it works even with dummy cpn.
1995 CPN_ALPHABET = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789-_'
1996 cpn = ''.join((CPN_ALPHABET[random.randint(0, 256) & 63] for _ in range(0, 16)))
1997
1998 qs.update({
1999 'ver': ['2'],
2000 'cpn': [cpn],
2001 })
2002 playback_url = compat_urlparse.urlunparse(
2003 parsed_playback_url._replace(query=compat_urllib_parse_urlencode(qs, True)))
2004
2005 self._download_webpage(
2006 playback_url, video_id, 'Marking watched',
2007 'Unable to mark watched', fatal=False)
2008
2009 @staticmethod
2010 def _extract_urls(webpage):
2011 # Embedded YouTube player
2012 entries = [
2013 unescapeHTML(mobj.group('url'))
2014 for mobj in re.finditer(r'''(?x)
2015 (?:
2016 <iframe[^>]+?src=|
2017 data-video-url=|
2018 <embed[^>]+?src=|
2019 embedSWF\(?:\s*|
2020 <object[^>]+data=|
2021 new\s+SWFObject\(
2022 )
2023 (["\'])
2024 (?P<url>(?:https?:)?//(?:www\.)?youtube(?:-nocookie)?\.com/
2025 (?:embed|v|p)/[0-9A-Za-z_-]{11}.*?)
2026 \1''', webpage)]
2027
2028 # lazyYT YouTube embed
2029 entries.extend(list(map(
2030 unescapeHTML,
2031 re.findall(r'class="lazyYT" data-youtube-id="([^"]+)"', webpage))))
2032
2033 # Wordpress "YouTube Video Importer" plugin
2034 matches = re.findall(r'''(?x)<div[^>]+
2035 class=(?P<q1>[\'"])[^\'"]*\byvii_single_video_player\b[^\'"]*(?P=q1)[^>]+
2036 data-video_id=(?P<q2>[\'"])([^\'"]+)(?P=q2)''', webpage)
2037 entries.extend(m[-1] for m in matches)
2038
2039 return entries
2040
2041 @staticmethod
2042 def _extract_url(webpage):
2043 urls = YoutubeIE._extract_urls(webpage)
2044 return urls[0] if urls else None
2045
2046 @classmethod
2047 def extract_id(cls, url):
2048 mobj = re.match(cls._VALID_URL, url, re.VERBOSE)
2049 if mobj is None:
2050 raise ExtractorError('Invalid URL: %s' % url)
2051 video_id = mobj.group(2)
2052 return video_id
2053
2054 def _extract_chapters_from_json(self, data, duration):
2055 chapter_list = traverse_obj(
2056 data, (
2057 'playerOverlays', 'playerOverlayRenderer', 'decoratedPlayerBarRenderer',
2058 'decoratedPlayerBarRenderer', 'playerBar', 'chapteredPlayerBarRenderer', 'chapters'
2059 ), expected_type=list)
2060
2061 return self._extract_chapters(
2062 chapter_list,
2063 chapter_time=lambda chapter: float_or_none(
2064 traverse_obj(chapter, ('chapterRenderer', 'timeRangeStartMillis')), scale=1000),
2065 chapter_title=lambda chapter: traverse_obj(
2066 chapter, ('chapterRenderer', 'title', 'simpleText'), expected_type=str),
2067 duration=duration)
2068
    def _extract_chapters_from_engagement_panel(self, data, duration):
        """Extract chapters from the macro-markers engagement panel (fallback source)."""
        content_list = traverse_obj(
            data,
            ('engagementPanels', ..., 'engagementPanelSectionListRenderer', 'content', 'macroMarkersListRenderer', 'contents'),
            expected_type=list, default=[])
        chapter_time = lambda chapter: parse_duration(self._get_text(chapter, 'timeDescription'))
        chapter_title = lambda chapter: self._get_text(chapter, 'title')

        # First panel that yields a non-empty chapter list wins; [] otherwise
        return next((
            filter(None, (
                self._extract_chapters(
                    traverse_obj(contents, (..., 'macroMarkersListItemRenderer')),
                    chapter_time, chapter_title, duration)
                for contents in content_list
            ))), [])
2084
2085 def _extract_chapters(self, chapter_list, chapter_time, chapter_title, duration):
2086 chapters = []
2087 last_chapter = {'start_time': 0}
2088 for idx, chapter in enumerate(chapter_list or []):
2089 title = chapter_title(chapter)
2090 start_time = chapter_time(chapter)
2091 if start_time is None:
2092 continue
2093 last_chapter['end_time'] = start_time
2094 if start_time < last_chapter['start_time']:
2095 if idx == 1:
2096 chapters.pop()
2097 self.report_warning('Invalid start time for chapter "%s"' % last_chapter['title'])
2098 else:
2099 self.report_warning(f'Invalid start time for chapter "{title}"')
2100 continue
2101 last_chapter = {'start_time': start_time, 'title': title}
2102 chapters.append(last_chapter)
2103 last_chapter['end_time'] = duration
2104 return chapters
2105
2106 def _extract_yt_initial_variable(self, webpage, regex, video_id, name):
2107 return self._parse_json(self._search_regex(
2108 (r'%s\s*%s' % (regex, self._YT_INITIAL_BOUNDARY_RE),
2109 regex), webpage, name, default='{}'), video_id, fatal=False)
2110
2111 @staticmethod
2112 def parse_time_text(time_text):
2113 """
2114 Parse the comment time text
2115 time_text is in the format 'X units ago (edited)'
2116 """
2117 time_text_split = time_text.split(' ')
2118 if len(time_text_split) >= 3:
2119 try:
2120 return datetime_from_str('now-%s%s' % (time_text_split[0], time_text_split[1]), precision='auto')
2121 except ValueError:
2122 return None
2123
2124 def _extract_comment(self, comment_renderer, parent=None):
2125 comment_id = comment_renderer.get('commentId')
2126 if not comment_id:
2127 return
2128
2129 text = self._get_text(comment_renderer, 'contentText')
2130
2131 # note: timestamp is an estimate calculated from the current time and time_text
2132 time_text = self._get_text(comment_renderer, 'publishedTimeText') or ''
2133 time_text_dt = self.parse_time_text(time_text)
2134 if isinstance(time_text_dt, datetime.datetime):
2135 timestamp = calendar.timegm(time_text_dt.timetuple())
2136 author = self._get_text(comment_renderer, 'authorText')
2137 author_id = try_get(comment_renderer,
2138 lambda x: x['authorEndpoint']['browseEndpoint']['browseId'], compat_str)
2139
2140 votes = parse_count(try_get(comment_renderer, (lambda x: x['voteCount']['simpleText'],
2141 lambda x: x['likeCount']), compat_str)) or 0
2142 author_thumbnail = try_get(comment_renderer,
2143 lambda x: x['authorThumbnail']['thumbnails'][-1]['url'], compat_str)
2144
2145 author_is_uploader = try_get(comment_renderer, lambda x: x['authorIsChannelOwner'], bool)
2146 is_favorited = 'creatorHeart' in (try_get(
2147 comment_renderer, lambda x: x['actionButtons']['commentActionButtonsRenderer'], dict) or {})
2148 return {
2149 'id': comment_id,
2150 'text': text,
2151 'timestamp': timestamp,
2152 'time_text': time_text,
2153 'like_count': votes,
2154 'is_favorited': is_favorited,
2155 'author': author,
2156 'author_id': author_id,
2157 'author_thumbnail': author_thumbnail,
2158 'author_is_uploader': author_is_uploader,
2159 'parent': parent or 'root'
2160 }
2161
    def _comment_entries(self, root_continuation_data, identity_token, account_syncid,
                         ytcfg, video_id, parent=None, comment_counts=None):
        """Yield comment info dicts for a comment section (recursing into replies).

        The estimated total comment count may be yielded first as a bare int.
        comment_counts is shared across recursion: [comments so far, estimated
        total, current reply-thread number].
        """

        def extract_header(contents):
            # Find the comments header: report the total count and pick the
            # continuation matching the requested sort order
            _total_comments = 0
            _continuation = None
            for content in contents:
                comments_header_renderer = try_get(content, lambda x: x['commentsHeaderRenderer'])
                expected_comment_count = parse_count(self._get_text(
                    comments_header_renderer, 'countText', 'commentsCount', max_runs=1))

                if expected_comment_count:
                    comment_counts[1] = expected_comment_count
                    self.to_screen('Downloading ~%d comments' % expected_comment_count)
                    _total_comments = comment_counts[1]
                sort_mode_str = self._configuration_arg('comment_sort', [''])[0]
                comment_sort_index = int(sort_mode_str != 'top')  # 1 = new, 0 = top

                sort_menu_item = try_get(
                    comments_header_renderer,
                    lambda x: x['sortMenu']['sortFilterSubMenuRenderer']['subMenuItems'][comment_sort_index], dict) or {}
                sort_continuation_ep = sort_menu_item.get('serviceEndpoint') or {}

                _continuation = self._extract_continuation_ep_data(sort_continuation_ep) or self._extract_continuation(sort_menu_item)
                if not _continuation:
                    continue

                sort_text = sort_menu_item.get('title')
                if isinstance(sort_text, compat_str):
                    sort_text = sort_text.lower()
                else:
                    sort_text = 'top comments' if comment_sort_index == 0 else 'newest first'
                self.to_screen('Sorting comments by %s' % sort_text)
                break
            return _total_comments, _continuation

        def extract_thread(contents):
            # Yield each comment in the thread, then recurse into its replies
            if not parent:
                comment_counts[2] = 0
            for content in contents:
                comment_thread_renderer = try_get(content, lambda x: x['commentThreadRenderer'])
                comment_renderer = try_get(
                    comment_thread_renderer, (lambda x: x['comment']['commentRenderer'], dict)) or try_get(
                    content, (lambda x: x['commentRenderer'], dict))

                if not comment_renderer:
                    continue
                comment = self._extract_comment(comment_renderer, parent)
                if not comment:
                    continue
                comment_counts[0] += 1
                yield comment
                # Attempt to get the replies
                comment_replies_renderer = try_get(
                    comment_thread_renderer, lambda x: x['replies']['commentRepliesRenderer'], dict)

                if comment_replies_renderer:
                    comment_counts[2] += 1
                    comment_entries_iter = self._comment_entries(
                        comment_replies_renderer, identity_token, account_syncid, ytcfg,
                        video_id, parent=comment.get('id'), comment_counts=comment_counts)

                    for reply_comment in comment_entries_iter:
                        yield reply_comment

        # YouTube comments have a max depth of 2
        max_depth = int_or_none(self._configuration_arg('max_comment_depth', [''])[0]) or float('inf')
        if max_depth == 1 and parent:
            return
        if not comment_counts:
            # comment so far, est. total comments, current comment thread #
            comment_counts = [0, 0, 0]

        continuation = self._extract_continuation(root_continuation_data)
        # Tokens shorter than 27 chars are from the old API and must be rebuilt
        if continuation and len(continuation['continuation']) < 27:
            self.write_debug('Detected old API continuation token. Generating new API compatible token.')
            continuation_token = self._generate_comment_continuation(video_id)
            continuation = self._build_api_continuation_query(continuation_token, None)

        visitor_data = None
        is_first_continuation = parent is None

        for page_num in itertools.count(0):
            if not continuation:
                break
            headers = self.generate_api_headers(ytcfg, identity_token, account_syncid, visitor_data)
            comment_prog_str = '(%d/%d)' % (comment_counts[0], comment_counts[1])
            if page_num == 0:
                if is_first_continuation:
                    note_prefix = 'Downloading comment section API JSON'
                else:
                    note_prefix = '    Downloading comment API JSON reply thread %d %s' % (
                        comment_counts[2], comment_prog_str)
            else:
                note_prefix = '%sDownloading comment%s API JSON page %d %s' % (
                    '       ' if parent else '', ' replies' if parent else '',
                    page_num, comment_prog_str)

            response = self._extract_response(
                item_id=None, query=continuation,
                ep='next', ytcfg=ytcfg, headers=headers, note=note_prefix,
                check_get_keys=('onResponseReceivedEndpoints', 'continuationContents'))
            if not response:
                break
            # Carry the visitor id forward so pagination stays in one session
            visitor_data = try_get(
                response,
                lambda x: x['responseContext']['webResponseContextExtensionData']['ytConfigData']['visitorData'],
                compat_str) or visitor_data

            continuation_contents = dict_get(response, ('onResponseReceivedEndpoints', 'continuationContents'))

            continuation = None
            if isinstance(continuation_contents, list):
                for continuation_section in continuation_contents:
                    if not isinstance(continuation_section, dict):
                        continue
                    continuation_items = try_get(
                        continuation_section,
                        (lambda x: x['reloadContinuationItemsCommand']['continuationItems'],
                         lambda x: x['appendContinuationItemsAction']['continuationItems']),
                        list) or []
                    if is_first_continuation:
                        total_comments, continuation = extract_header(continuation_items)
                        if total_comments:
                            yield total_comments
                        is_first_continuation = False
                        if continuation:
                            break
                        continue
                    count = 0
                    for count, entry in enumerate(extract_thread(continuation_items)):
                        yield entry
                    continuation = self._extract_continuation({'contents': continuation_items})
                    if continuation:
                        # Sometimes YouTube provides a continuation without any comments
                        # In most cases we end up just downloading these with very little comments to come.
                        if count == 0:
                            if not parent:
                                self.report_warning('No comments received - assuming end of comments')
                            continuation = None
                    break

            # Deprecated response structure
            elif isinstance(continuation_contents, dict):
                known_continuation_renderers = ('itemSectionContinuation', 'commentRepliesContinuation')
                for key, continuation_renderer in continuation_contents.items():
                    if key not in known_continuation_renderers:
                        continue
                    if not isinstance(continuation_renderer, dict):
                        continue
                    if is_first_continuation:
                        header_continuation_items = [continuation_renderer.get('header') or {}]
                        total_comments, continuation = extract_header(header_continuation_items)
                        if total_comments:
                            yield total_comments
                        is_first_continuation = False
                        if continuation:
                            break

                    # Sometimes YouTube provides a continuation without any comments
                    # In most cases we end up just downloading these with very little comments to come.
                    count = 0
                    for count, entry in enumerate(extract_thread(continuation_renderer.get('contents') or {})):
                        yield entry
                    continuation = self._extract_continuation(continuation_renderer)
                    if count == 0:
                        if not parent:
                            self.report_warning('No comments received - assuming end of comments')
                        continuation = None
                    break
2332
2333 @staticmethod
2334 def _generate_comment_continuation(video_id):
2335 """
2336 Generates initial comment section continuation token from given video id
2337 """
2338 b64_vid_id = base64.b64encode(bytes(video_id.encode('utf-8')))
2339 parts = ('Eg0SCw==', b64_vid_id, 'GAYyJyIRIgs=', b64_vid_id, 'MAB4AjAAQhBjb21tZW50cy1zZWN0aW9u')
2340 new_continuation_intlist = list(itertools.chain.from_iterable(
2341 [bytes_to_intlist(base64.b64decode(part)) for part in parts]))
2342 return base64.b64encode(intlist_to_bytes(new_continuation_intlist)).decode('utf-8')
2343
    def _extract_comments(self, ytcfg, video_id, contents, webpage):
        """Entry for comment extraction"""
        def _real_comment_extract(contents):
            # Find the first itemSectionRenderer entry and stream its comments
            if isinstance(contents, list):
                for entry in contents:
                    for key, renderer in entry.items():
                        if key not in known_entry_comment_renderers:
                            continue
                        yield from self._comment_entries(
                            renderer, video_id=video_id, ytcfg=ytcfg,
                            identity_token=self._extract_identity_token(webpage, item_id=video_id),
                            account_syncid=self._extract_account_syncid(ytcfg))
                        break
        comments = []
        known_entry_comment_renderers = ('itemSectionRenderer',)
        estimated_total = 0
        max_comments = int_or_none(self._configuration_arg('max_comments', [''])[0]) or float('inf')
        # Force English regardless of account setting to prevent parsing issues
        # See: https://github.com/yt-dlp/yt-dlp/issues/532
        ytcfg = copy.deepcopy(ytcfg)
        traverse_obj(
            ytcfg, ('INNERTUBE_CONTEXT', 'client'), expected_type=dict, default={})['hl'] = 'en'
        try:
            for comment in _real_comment_extract(contents):
                if len(comments) >= max_comments:
                    break
                # A bare int yielded first is the estimated total comment count
                if isinstance(comment, int):
                    estimated_total = comment
                    continue
                comments.append(comment)
        except KeyboardInterrupt:
            # Allow the user to abort comment download and keep what we have
            self.to_screen('Interrupted by user')
        self.to_screen('Downloaded %d/%d comments' % (len(comments), estimated_total))
        return {
            'comments': comments,
            'comment_count': len(comments),
        }
2381
2382 @staticmethod
2383 def _generate_player_context(sts=None):
2384 context = {
2385 'html5Preference': 'HTML5_PREF_WANTS',
2386 }
2387 if sts is not None:
2388 context['signatureTimestamp'] = sts
2389 return {
2390 'playbackContext': {
2391 'contentPlaybackContext': context
2392 },
2393 'contentCheckOk': True,
2394 'racyCheckOk': True
2395 }
2396
2397 def _is_agegated(self, player_response):
2398 reasons = traverse_obj(player_response, ('playabilityStatus', ('status', 'reason')), default=[])
2399 for reason in reasons:
2400 if reason in self._AGE_GATE_REASONS + self._AGE_GATE_STATUS_REASONS:
2401 return True
2402 if traverse_obj(player_response, ('playabilityStatus', 'desktopLegacyAgeGateReason')) is not None:
2403 return True
2404 return False
2405
    def _extract_player_response(self, client, video_id, master_ytcfg, player_ytcfg, identity_token, player_url, initial_pr):
        """Query the innertube /player endpoint as *client*; return the parsed JSON or None."""

        session_index = self._extract_session_index(player_ytcfg, master_ytcfg)
        syncid = self._extract_account_syncid(player_ytcfg, master_ytcfg, initial_pr)
        # sts pins the player version so the API returns compatible signatures
        sts = self._extract_signature_timestamp(video_id, player_url, master_ytcfg, fatal=False)
        headers = self.generate_api_headers(
            player_ytcfg, identity_token, syncid,
            default_client=client, session_index=session_index)

        yt_query = {'videoId': video_id}
        yt_query.update(self._generate_player_context(sts))
        return self._extract_response(
            item_id=video_id, ep='player', query=yt_query,
            ytcfg=player_ytcfg, headers=headers, fatal=False,
            default_client=client,
            note='Downloading %s player API JSON' % client.replace('_', ' ').strip()
        ) or None
2423
2424 def _get_requested_clients(self, url, smuggled_data):
2425 requested_clients = []
2426 allowed_clients = sorted(
2427 [client for client in INNERTUBE_CLIENTS.keys() if client[:1] != '_'],
2428 key=lambda client: INNERTUBE_CLIENTS[client]['priority'], reverse=True)
2429 for client in self._configuration_arg('player_client'):
2430 if client in allowed_clients:
2431 requested_clients.append(client)
2432 elif client == 'all':
2433 requested_clients.extend(allowed_clients)
2434 else:
2435 self.report_warning(f'Skipping unsupported client {client}')
2436 if not requested_clients:
2437 requested_clients = ['android', 'web']
2438
2439 if smuggled_data.get('is_music_url') or self.is_music_url(url):
2440 requested_clients.extend(
2441 f'{client}_music' for client in requested_clients if not client.endswith('_music'))
2442
2443 return orderedSet(requested_clients)
2444
2445 def _extract_player_ytcfg(self, client, video_id):
2446 url = {
2447 'web_music': 'https://music.youtube.com',
2448 'web_embedded': f'https://www.youtube.com/embed/{video_id}?html5=1'
2449 }.get(client)
2450 if not url:
2451 return {}
2452 webpage = self._download_webpage(url, video_id, fatal=False, note=f'Downloading {client} config')
2453 return self.extract_ytcfg(video_id, webpage) or {}
2454
    def _extract_player_responses(self, clients, video_id, webpage, master_ytcfg, player_url, identity_token):
        """Yield a player API response per requested client, appending agegate
        client variants on the fly when an age gate is detected."""
        initial_pr = None
        if webpage:
            initial_pr = self._extract_yt_initial_variable(
                webpage, self._YT_INITIAL_PLAYER_RESPONSE_RE,
                video_id, 'initial player response')

        original_clients = clients
        # Process as a stack (reversed so pop() preserves requested order),
        # allowing agegate clients to be pushed mid-iteration
        clients = clients[::-1]
        while clients:
            client = clients.pop()
            player_ytcfg = master_ytcfg if client == 'web' else {}
            if 'configs' not in self._configuration_arg('player_skip'):
                player_ytcfg = self._extract_player_ytcfg(client, video_id) or player_ytcfg

            # The web client's response is already embedded in the webpage
            pr = (
                initial_pr if client == 'web' and initial_pr
                else self._extract_player_response(
                    client, video_id, player_ytcfg or master_ytcfg, player_ytcfg, identity_token, player_url, initial_pr))
            if pr:
                yield pr

            if self._is_agegated(pr):
                # Retry with this client's agegate variant, if one exists and
                # was not explicitly requested already
                client = f'{client}_agegate'
                if client in INNERTUBE_CLIENTS and client not in original_clients:
                    clients.append(client)

        # Android player_response does not have microFormats which are needed for
        # extraction of some data. So we return the initial_pr with formats
        # stripped out even if not requested by the user
        # See: https://github.com/yt-dlp/yt-dlp/issues/501
        if initial_pr and 'web' not in original_clients:
            initial_pr['streamingData'] = None
            yield initial_pr
2489
2490 def _extract_formats(self, streaming_data, video_id, player_url, is_live):
2491 itags, stream_ids = [], []
2492 itag_qualities, res_qualities = {}, {}
2493 q = qualities([
2494 # Normally tiny is the smallest video-only formats. But
2495 # audio-only formats with unknown quality may get tagged as tiny
2496 'tiny',
2497 'audio_quality_ultralow', 'audio_quality_low', 'audio_quality_medium', 'audio_quality_high', # Audio only formats
2498 'small', 'medium', 'large', 'hd720', 'hd1080', 'hd1440', 'hd2160', 'hd2880', 'highres'
2499 ])
2500 streaming_formats = traverse_obj(streaming_data, (..., ('formats', 'adaptiveFormats'), ...), default=[])
2501
2502 for fmt in streaming_formats:
2503 if fmt.get('targetDurationSec') or fmt.get('drmFamilies'):
2504 continue
2505
2506 itag = str_or_none(fmt.get('itag'))
2507 audio_track = fmt.get('audioTrack') or {}
2508 stream_id = '%s.%s' % (itag or '', audio_track.get('id', ''))
2509 if stream_id in stream_ids:
2510 continue
2511
2512 quality = fmt.get('quality')
2513 height = int_or_none(fmt.get('height'))
2514 if quality == 'tiny' or not quality:
2515 quality = fmt.get('audioQuality', '').lower() or quality
2516 # The 3gp format (17) in android client has a quality of "small",
2517 # but is actually worse than other formats
2518 if itag == '17':
2519 quality = 'tiny'
2520 if quality:
2521 if itag:
2522 itag_qualities[itag] = quality
2523 if height:
2524 res_qualities[height] = quality
2525 # FORMAT_STREAM_TYPE_OTF(otf=1) requires downloading the init fragment
2526 # (adding `&sq=0` to the URL) and parsing emsg box to determine the
2527 # number of fragment that would subsequently requested with (`&sq=N`)
2528 if fmt.get('type') == 'FORMAT_STREAM_TYPE_OTF':
2529 continue
2530
2531 fmt_url = fmt.get('url')
2532 if not fmt_url:
2533 sc = compat_parse_qs(fmt.get('signatureCipher'))
2534 fmt_url = url_or_none(try_get(sc, lambda x: x['url'][0]))
2535 encrypted_sig = try_get(sc, lambda x: x['s'][0])
2536 if not (sc and fmt_url and encrypted_sig):
2537 continue
2538 if not player_url:
2539 continue
2540 signature = self._decrypt_signature(sc['s'][0], video_id, player_url)
2541 sp = try_get(sc, lambda x: x['sp'][0]) or 'signature'
2542 fmt_url += '&' + sp + '=' + signature
2543
2544 if itag:
2545 itags.append(itag)
2546 stream_ids.append(stream_id)
2547
2548 tbr = float_or_none(
2549 fmt.get('averageBitrate') or fmt.get('bitrate'), 1000)
2550 dct = {
2551 'asr': int_or_none(fmt.get('audioSampleRate')),
2552 'filesize': int_or_none(fmt.get('contentLength')),
2553 'format_id': itag,
2554 'format_note': ', '.join(filter(None, (
2555 audio_track.get('displayName'),
2556 fmt.get('qualityLabel') or quality.replace('audio_quality_', '')))),
2557 'fps': int_or_none(fmt.get('fps')),
2558 'height': height,
2559 'quality': q(quality),
2560 'tbr': tbr,
2561 'url': fmt_url,
2562 'width': int_or_none(fmt.get('width')),
2563 'language': audio_track.get('id', '').split('.')[0],
2564 }
2565 mime_mobj = re.match(
2566 r'((?:[^/]+)/(?:[^;]+))(?:;\s*codecs="([^"]+)")?', fmt.get('mimeType') or '')
2567 if mime_mobj:
2568 dct['ext'] = mimetype2ext(mime_mobj.group(1))
2569 dct.update(parse_codecs(mime_mobj.group(2)))
2570 no_audio = dct.get('acodec') == 'none'
2571 no_video = dct.get('vcodec') == 'none'
2572 if no_audio:
2573 dct['vbr'] = tbr
2574 if no_video:
2575 dct['abr'] = tbr
2576 if no_audio or no_video:
2577 dct['downloader_options'] = {
2578 # Youtube throttles chunks >~10M
2579 'http_chunk_size': 10485760,
2580 }
2581 if dct.get('ext'):
2582 dct['container'] = dct['ext'] + '_dash'
2583 yield dct
2584
2585 skip_manifests = self._configuration_arg('skip')
2586 get_dash = not is_live and 'dash' not in skip_manifests and self.get_param('youtube_include_dash_manifest', True)
2587 get_hls = 'hls' not in skip_manifests and self.get_param('youtube_include_hls_manifest', True)
2588
2589 def guess_quality(f):
2590 for val, qdict in ((f.get('format_id'), itag_qualities), (f.get('height'), res_qualities)):
2591 if val in qdict:
2592 return q(qdict[val])
2593 return -1
2594
2595 for sd in streaming_data:
2596 hls_manifest_url = get_hls and sd.get('hlsManifestUrl')
2597 if hls_manifest_url:
2598 for f in self._extract_m3u8_formats(hls_manifest_url, video_id, 'mp4', fatal=False):
2599 itag = self._search_regex(
2600 r'/itag/(\d+)', f['url'], 'itag', default=None)
2601 if itag in itags:
2602 continue
2603 if itag:
2604 f['format_id'] = itag
2605 itags.append(itag)
2606 f['quality'] = guess_quality(f)
2607 yield f
2608
2609 dash_manifest_url = get_dash and sd.get('dashManifestUrl')
2610 if dash_manifest_url:
2611 for f in self._extract_mpd_formats(dash_manifest_url, video_id, fatal=False):
2612 itag = f['format_id']
2613 if itag in itags:
2614 continue
2615 if itag:
2616 itags.append(itag)
2617 f['quality'] = guess_quality(f)
2618 filesize = int_or_none(self._search_regex(
2619 r'/clen/(\d+)', f.get('fragment_base_url')
2620 or f['url'], 'file size', default=None))
2621 if filesize:
2622 f['filesize'] = filesize
2623 yield f
2624
2625 def _real_extract(self, url):
2626 url, smuggled_data = unsmuggle_url(url, {})
2627 video_id = self._match_id(url)
2628
2629 base_url = self.http_scheme() + '//www.youtube.com/'
2630 webpage_url = base_url + 'watch?v=' + video_id
2631 webpage = self._download_webpage(
2632 webpage_url + '&bpctr=9999999999&has_verified=1', video_id, fatal=False)
2633
2634 master_ytcfg = self.extract_ytcfg(video_id, webpage) or self._get_default_ytcfg()
2635 player_url = self._extract_player_url(master_ytcfg, webpage)
2636 identity_token = self._extract_identity_token(webpage, video_id)
2637
2638 player_responses = list(self._extract_player_responses(
2639 self._get_requested_clients(url, smuggled_data),
2640 video_id, webpage, master_ytcfg, player_url, identity_token))
2641
2642 get_first = lambda obj, keys, **kwargs: traverse_obj(obj, (..., *variadic(keys)), **kwargs, get_all=False)
2643
2644 playability_statuses = traverse_obj(
2645 player_responses, (..., 'playabilityStatus'), expected_type=dict, default=[])
2646
2647 trailer_video_id = get_first(
2648 playability_statuses,
2649 ('errorScreen', 'playerLegacyDesktopYpcTrailerRenderer', 'trailerVideoId'),
2650 expected_type=str)
2651 if trailer_video_id:
2652 return self.url_result(
2653 trailer_video_id, self.ie_key(), trailer_video_id)
2654
2655 search_meta = ((lambda x: self._html_search_meta(x, webpage, default=None))
2656 if webpage else (lambda x: None))
2657
2658 video_details = traverse_obj(
2659 player_responses, (..., 'videoDetails'), expected_type=dict, default=[])
2660 microformats = traverse_obj(
2661 player_responses, (..., 'microformat', 'playerMicroformatRenderer'),
2662 expected_type=dict, default=[])
2663 video_title = (
2664 get_first(video_details, 'title')
2665 or self._get_text(microformats, (..., 'title'))
2666 or search_meta(['og:title', 'twitter:title', 'title']))
2667 video_description = get_first(video_details, 'shortDescription')
2668
2669 if not smuggled_data.get('force_singlefeed', False):
2670 if not self.get_param('noplaylist'):
2671 multifeed_metadata_list = get_first(
2672 player_responses,
2673 ('multicamera', 'playerLegacyMulticameraRenderer', 'metadataList'),
2674 expected_type=str)
2675 if multifeed_metadata_list:
2676 entries = []
2677 feed_ids = []
2678 for feed in multifeed_metadata_list.split(','):
2679 # Unquote should take place before split on comma (,) since textual
2680 # fields may contain comma as well (see
2681 # https://github.com/ytdl-org/youtube-dl/issues/8536)
2682 feed_data = compat_parse_qs(
2683 compat_urllib_parse_unquote_plus(feed))
2684
2685 def feed_entry(name):
2686 return try_get(
2687 feed_data, lambda x: x[name][0], compat_str)
2688
2689 feed_id = feed_entry('id')
2690 if not feed_id:
2691 continue
2692 feed_title = feed_entry('title')
2693 title = video_title
2694 if feed_title:
2695 title += ' (%s)' % feed_title
2696 entries.append({
2697 '_type': 'url_transparent',
2698 'ie_key': 'Youtube',
2699 'url': smuggle_url(
2700 '%swatch?v=%s' % (base_url, feed_data['id'][0]),
2701 {'force_singlefeed': True}),
2702 'title': title,
2703 })
2704 feed_ids.append(feed_id)
2705 self.to_screen(
2706 'Downloading multifeed video (%s) - add --no-playlist to just download video %s'
2707 % (', '.join(feed_ids), video_id))
2708 return self.playlist_result(
2709 entries, video_id, video_title, video_description)
2710 else:
2711 self.to_screen('Downloading just video %s because of --no-playlist' % video_id)
2712
2713 live_broadcast_details = traverse_obj(microformats, (..., 'liveBroadcastDetails'))
2714 is_live = get_first(video_details, 'isLive')
2715 if is_live is None:
2716 is_live = get_first(live_broadcast_details, 'isLiveNow')
2717
2718 streaming_data = traverse_obj(player_responses, (..., 'streamingData'), default=[])
2719 formats = list(self._extract_formats(streaming_data, video_id, player_url, is_live))
2720
2721 if not formats:
2722 if not self.get_param('allow_unplayable_formats') and traverse_obj(streaming_data, (..., 'licenseInfos')):
2723 self.raise_no_formats(
2724 'This video is DRM protected.', expected=True)
2725 pemr = get_first(
2726 playability_statuses,
2727 ('errorScreen', 'playerErrorMessageRenderer'), expected_type=dict) or {}
2728 reason = self._get_text(pemr, 'reason') or get_first(playability_statuses, 'reason')
2729 subreason = clean_html(self._get_text(pemr, 'subreason') or '')
2730 if subreason:
2731 if subreason == 'The uploader has not made this video available in your country.':
2732 countries = get_first(microformats, 'availableCountries')
2733 if not countries:
2734 regions_allowed = search_meta('regionsAllowed')
2735 countries = regions_allowed.split(',') if regions_allowed else None
2736 self.raise_geo_restricted(subreason, countries, metadata_available=True)
2737 reason += f'. {subreason}'
2738 if reason:
2739 self.raise_no_formats(reason, expected=True)
2740
2741 for f in formats:
2742 if '&c=WEB&' in f['url'] and '&ratebypass=yes&' not in f['url']: # throttled
2743 f['source_preference'] = -10
2744 note = f.get('format_note')
2745 f['format_note'] = f'{note} (throttled)' if note else '(throttled)'
2746
2747 # Source is given priority since formats that throttle are given lower source_preference
2748 # When throttling issue is fully fixed, remove this
2749 self._sort_formats(formats, ('quality', 'height', 'fps', 'source'))
2750
2751 keywords = get_first(video_details, 'keywords', expected_type=list) or []
2752 if not keywords and webpage:
2753 keywords = [
2754 unescapeHTML(m.group('content'))
2755 for m in re.finditer(self._meta_regex('og:video:tag'), webpage)]
2756 for keyword in keywords:
2757 if keyword.startswith('yt:stretch='):
2758 mobj = re.search(r'(\d+)\s*:\s*(\d+)', keyword)
2759 if mobj:
2760 # NB: float is intentional for forcing float division
2761 w, h = (float(v) for v in mobj.groups())
2762 if w > 0 and h > 0:
2763 ratio = w / h
2764 for f in formats:
2765 if f.get('vcodec') != 'none':
2766 f['stretched_ratio'] = ratio
2767 break
2768
2769 thumbnails = []
2770 thumbnail_dicts = traverse_obj(
2771 (video_details, microformats), (..., ..., 'thumbnail', 'thumbnails', ...),
2772 expected_type=dict, default=[])
2773 for thumbnail in thumbnail_dicts:
2774 thumbnail_url = thumbnail.get('url')
2775 if not thumbnail_url:
2776 continue
2777 # Sometimes youtube gives a wrong thumbnail URL. See:
2778 # https://github.com/yt-dlp/yt-dlp/issues/233
2779 # https://github.com/ytdl-org/youtube-dl/issues/28023
2780 if 'maxresdefault' in thumbnail_url:
2781 thumbnail_url = thumbnail_url.split('?')[0]
2782 thumbnails.append({
2783 'url': thumbnail_url,
2784 'height': int_or_none(thumbnail.get('height')),
2785 'width': int_or_none(thumbnail.get('width')),
2786 })
2787 thumbnail_url = search_meta(['og:image', 'twitter:image'])
2788 if thumbnail_url:
2789 thumbnails.append({
2790 'url': thumbnail_url,
2791 })
2792 # The best resolution thumbnails sometimes does not appear in the webpage
2793 # See: https://github.com/ytdl-org/youtube-dl/issues/29049, https://github.com/yt-dlp/yt-dlp/issues/340
2794 # List of possible thumbnails - Ref: <https://stackoverflow.com/a/20542029>
2795 hq_thumbnail_names = ['maxresdefault', 'hq720', 'sddefault', 'sd1', 'sd2', 'sd3']
2796 # TODO: Test them also? - For some videos, even these don't exist
2797 guaranteed_thumbnail_names = [
2798 'hqdefault', 'hq1', 'hq2', 'hq3', '0',
2799 'mqdefault', 'mq1', 'mq2', 'mq3',
2800 'default', '1', '2', '3'
2801 ]
2802 thumbnail_names = hq_thumbnail_names + guaranteed_thumbnail_names
2803 n_thumbnail_names = len(thumbnail_names)
2804
2805 thumbnails.extend({
2806 'url': 'https://i.ytimg.com/vi{webp}/{video_id}/{name}{live}.{ext}'.format(
2807 video_id=video_id, name=name, ext=ext,
2808 webp='_webp' if ext == 'webp' else '', live='_live' if is_live else ''),
2809 '_test_url': name in hq_thumbnail_names,
2810 } for name in thumbnail_names for ext in ('webp', 'jpg'))
2811 for thumb in thumbnails:
2812 i = next((i for i, t in enumerate(thumbnail_names) if f'/{video_id}/{t}' in thumb['url']), n_thumbnail_names)
2813 thumb['preference'] = (0 if '.webp' in thumb['url'] else -1) - (2 * i)
2814 self._remove_duplicate_formats(thumbnails)
2815
2816 category = get_first(microformats, 'category') or search_meta('genre')
2817 channel_id = str_or_none(
2818 get_first(video_details, 'channelId')
2819 or get_first(microformats, 'externalChannelId')
2820 or search_meta('channelId'))
2821 duration = int_or_none(
2822 get_first(video_details, 'lengthSeconds')
2823 or get_first(microformats, 'lengthSeconds')
2824 or parse_duration(search_meta('duration'))) or None
2825 owner_profile_url = get_first(microformats, 'ownerProfileUrl')
2826
2827 live_content = get_first(video_details, 'isLiveContent')
2828 is_upcoming = get_first(video_details, 'isUpcoming')
2829 if is_live is None:
2830 if is_upcoming or live_content is False:
2831 is_live = False
2832 if is_upcoming is None and (live_content or is_live):
2833 is_upcoming = False
2834 live_starttime = parse_iso8601(get_first(live_broadcast_details, 'startTimestamp'))
2835 live_endtime = parse_iso8601(get_first(live_broadcast_details, 'endTimestamp'))
2836 if not duration and live_endtime and live_starttime:
2837 duration = live_endtime - live_starttime
2838
2839 info = {
2840 'id': video_id,
2841 'title': self._live_title(video_title) if is_live else video_title,
2842 'formats': formats,
2843 'thumbnails': thumbnails,
2844 'description': video_description,
2845 'upload_date': unified_strdate(
2846 get_first(microformats, 'uploadDate')
2847 or search_meta('uploadDate')),
2848 'uploader': get_first(video_details, 'author'),
2849 'uploader_id': self._search_regex(r'/(?:channel|user)/([^/?&#]+)', owner_profile_url, 'uploader id') if owner_profile_url else None,
2850 'uploader_url': owner_profile_url,
2851 'channel_id': channel_id,
2852 'channel_url': f'https://www.youtube.com/channel/{channel_id}' if channel_id else None,
2853 'duration': duration,
2854 'view_count': int_or_none(
2855 get_first((video_details, microformats), (..., 'viewCount'))
2856 or search_meta('interactionCount')),
2857 'average_rating': float_or_none(get_first(video_details, 'averageRating')),
2858 'age_limit': 18 if (
2859 get_first(microformats, 'isFamilySafe') is False
2860 or search_meta('isFamilyFriendly') == 'false'
2861 or search_meta('og:restrictions:age') == '18+') else 0,
2862 'webpage_url': webpage_url,
2863 'categories': [category] if category else None,
2864 'tags': keywords,
2865 'playable_in_embed': get_first(playability_statuses, 'playableInEmbed'),
2866 'is_live': is_live,
2867 'was_live': (False if is_live or is_upcoming or live_content is False
2868 else None if is_live is None or is_upcoming is None
2869 else live_content),
2870 'live_status': 'is_upcoming' if is_upcoming else None, # rest will be set by YoutubeDL
2871 'release_timestamp': live_starttime,
2872 }
2873
2874 pctr = traverse_obj(player_responses, (..., 'captions', 'playerCaptionsTracklistRenderer'), expected_type=dict)
2875 # Converted into dicts to remove duplicates
2876 captions = {
2877 sub.get('baseUrl'): sub
2878 for sub in traverse_obj(pctr, (..., 'captionTracks', ...), default=[])}
2879 translation_languages = {
2880 lang.get('languageCode'): lang.get('languageName')
2881 for lang in traverse_obj(pctr, (..., 'translationLanguages', ...), default=[])}
2882 subtitles = {}
2883 if pctr:
2884 def process_language(container, base_url, lang_code, sub_name, query):
2885 lang_subs = container.setdefault(lang_code, [])
2886 for fmt in self._SUBTITLE_FORMATS:
2887 query.update({
2888 'fmt': fmt,
2889 })
2890 lang_subs.append({
2891 'ext': fmt,
2892 'url': update_url_query(base_url, query),
2893 'name': sub_name,
2894 })
2895
2896 for base_url, caption_track in captions.items():
2897 if not base_url:
2898 continue
2899 if caption_track.get('kind') != 'asr':
2900 lang_code = (
2901 remove_start(caption_track.get('vssId') or '', '.').replace('.', '-')
2902 or caption_track.get('languageCode'))
2903 if not lang_code:
2904 continue
2905 process_language(
2906 subtitles, base_url, lang_code,
2907 traverse_obj(caption_track, ('name', 'simpleText')),
2908 {})
2909 continue
2910 automatic_captions = {}
2911 for trans_code, trans_name in translation_languages.items():
2912 if not trans_code:
2913 continue
2914 process_language(
2915 automatic_captions, base_url, trans_code,
2916 self._get_text(trans_name, max_runs=1),
2917 {'tlang': trans_code})
2918 info['automatic_captions'] = automatic_captions
2919 info['subtitles'] = subtitles
2920
2921 parsed_url = compat_urllib_parse_urlparse(url)
2922 for component in [parsed_url.fragment, parsed_url.query]:
2923 query = compat_parse_qs(component)
2924 for k, v in query.items():
2925 for d_k, s_ks in [('start', ('start', 't')), ('end', ('end',))]:
2926 d_k += '_time'
2927 if d_k not in info and k in s_ks:
2928 info[d_k] = parse_duration(query[k][0])
2929
2930 # Youtube Music Auto-generated description
2931 if video_description:
2932 mobj = re.search(r'(?s)(?P<track>[^·\n]+)·(?P<artist>[^\n]+)\n+(?P<album>[^\n]+)(?:.+?℗\s*(?P<release_year>\d{4})(?!\d))?(?:.+?Released on\s*:\s*(?P<release_date>\d{4}-\d{2}-\d{2}))?(.+?\nArtist\s*:\s*(?P<clean_artist>[^\n]+))?.+\nAuto-generated by YouTube\.\s*$', video_description)
2933 if mobj:
2934 release_year = mobj.group('release_year')
2935 release_date = mobj.group('release_date')
2936 if release_date:
2937 release_date = release_date.replace('-', '')
2938 if not release_year:
2939 release_year = release_date[:4]
2940 info.update({
2941 'album': mobj.group('album'.strip()),
2942 'artist': mobj.group('clean_artist') or ', '.join(a.strip() for a in mobj.group('artist').split('·')),
2943 'track': mobj.group('track').strip(),
2944 'release_date': release_date,
2945 'release_year': int_or_none(release_year),
2946 })
2947
2948 initial_data = None
2949 if webpage:
2950 initial_data = self._extract_yt_initial_variable(
2951 webpage, self._YT_INITIAL_DATA_RE, video_id,
2952 'yt initial data')
2953 if not initial_data:
2954 headers = self.generate_api_headers(
2955 master_ytcfg, identity_token, self._extract_account_syncid(master_ytcfg),
2956 session_index=self._extract_session_index(master_ytcfg))
2957
2958 initial_data = self._extract_response(
2959 item_id=video_id, ep='next', fatal=False,
2960 ytcfg=master_ytcfg, headers=headers, query={'videoId': video_id},
2961 note='Downloading initial data API JSON')
2962
2963 try:
2964 # This will error if there is no livechat
2965 initial_data['contents']['twoColumnWatchNextResults']['conversationBar']['liveChatRenderer']['continuations'][0]['reloadContinuationData']['continuation']
2966 info['subtitles']['live_chat'] = [{
2967 'url': 'https://www.youtube.com/watch?v=%s' % video_id, # url is needed to set cookies
2968 'video_id': video_id,
2969 'ext': 'json',
2970 'protocol': 'youtube_live_chat' if is_live or is_upcoming else 'youtube_live_chat_replay',
2971 }]
2972 except (KeyError, IndexError, TypeError):
2973 pass
2974
2975 if initial_data:
2976 info['chapters'] = (
2977 self._extract_chapters_from_json(initial_data, duration)
2978 or self._extract_chapters_from_engagement_panel(initial_data, duration)
2979 or None)
2980
2981 contents = try_get(
2982 initial_data,
2983 lambda x: x['contents']['twoColumnWatchNextResults']['results']['results']['contents'],
2984 list) or []
2985 for content in contents:
2986 vpir = content.get('videoPrimaryInfoRenderer')
2987 if vpir:
2988 stl = vpir.get('superTitleLink')
2989 if stl:
2990 stl = self._get_text(stl)
2991 if try_get(
2992 vpir,
2993 lambda x: x['superTitleIcon']['iconType']) == 'LOCATION_PIN':
2994 info['location'] = stl
2995 else:
2996 mobj = re.search(r'(.+?)\s*S(\d+)\s*•\s*E(\d+)', stl)
2997 if mobj:
2998 info.update({
2999 'series': mobj.group(1),
3000 'season_number': int(mobj.group(2)),
3001 'episode_number': int(mobj.group(3)),
3002 })
3003 for tlb in (try_get(
3004 vpir,
3005 lambda x: x['videoActions']['menuRenderer']['topLevelButtons'],
3006 list) or []):
3007 tbr = tlb.get('toggleButtonRenderer') or {}
3008 for getter, regex in [(
3009 lambda x: x['defaultText']['accessibility']['accessibilityData'],
3010 r'(?P<count>[\d,]+)\s*(?P<type>(?:dis)?like)'), ([
3011 lambda x: x['accessibility'],
3012 lambda x: x['accessibilityData']['accessibilityData'],
3013 ], r'(?P<type>(?:dis)?like) this video along with (?P<count>[\d,]+) other people')]:
3014 label = (try_get(tbr, getter, dict) or {}).get('label')
3015 if label:
3016 mobj = re.match(regex, label)
3017 if mobj:
3018 info[mobj.group('type') + '_count'] = str_to_int(mobj.group('count'))
3019 break
3020 sbr_tooltip = try_get(
3021 vpir, lambda x: x['sentimentBar']['sentimentBarRenderer']['tooltip'])
3022 if sbr_tooltip:
3023 like_count, dislike_count = sbr_tooltip.split(' / ')
3024 info.update({
3025 'like_count': str_to_int(like_count),
3026 'dislike_count': str_to_int(dislike_count),
3027 })
3028 vsir = content.get('videoSecondaryInfoRenderer')
3029 if vsir:
3030 info['channel'] = self._get_text(vsir, ('owner', 'videoOwnerRenderer', 'title'))
3031 rows = try_get(
3032 vsir,
3033 lambda x: x['metadataRowContainer']['metadataRowContainerRenderer']['rows'],
3034 list) or []
3035 multiple_songs = False
3036 for row in rows:
3037 if try_get(row, lambda x: x['metadataRowRenderer']['hasDividerLine']) is True:
3038 multiple_songs = True
3039 break
3040 for row in rows:
3041 mrr = row.get('metadataRowRenderer') or {}
3042 mrr_title = mrr.get('title')
3043 if not mrr_title:
3044 continue
3045 mrr_title = self._get_text(mrr, 'title')
3046 mrr_contents_text = self._get_text(mrr, ('contents', 0))
3047 if mrr_title == 'License':
3048 info['license'] = mrr_contents_text
3049 elif not multiple_songs:
3050 if mrr_title == 'Album':
3051 info['album'] = mrr_contents_text
3052 elif mrr_title == 'Artist':
3053 info['artist'] = mrr_contents_text
3054 elif mrr_title == 'Song':
3055 info['track'] = mrr_contents_text
3056
3057 fallbacks = {
3058 'channel': 'uploader',
3059 'channel_id': 'uploader_id',
3060 'channel_url': 'uploader_url',
3061 }
3062 for to, frm in fallbacks.items():
3063 if not info.get(to):
3064 info[to] = info.get(frm)
3065
3066 for s_k, d_k in [('artist', 'creator'), ('track', 'alt_title')]:
3067 v = info.get(s_k)
3068 if v:
3069 info[d_k] = v
3070
3071 is_private = get_first(video_details, 'isPrivate', expected_type=bool)
3072 is_unlisted = get_first(microformats, 'isUnlisted', expected_type=bool)
3073 is_membersonly = None
3074 is_premium = None
3075 if initial_data and is_private is not None:
3076 is_membersonly = False
3077 is_premium = False
3078 contents = try_get(initial_data, lambda x: x['contents']['twoColumnWatchNextResults']['results']['results']['contents'], list) or []
3079 badge_labels = set()
3080 for content in contents:
3081 if not isinstance(content, dict):
3082 continue
3083 badge_labels.update(self._extract_badges(content.get('videoPrimaryInfoRenderer')))
3084 for badge_label in badge_labels:
3085 if badge_label.lower() == 'members only':
3086 is_membersonly = True
3087 elif badge_label.lower() == 'premium':
3088 is_premium = True
3089 elif badge_label.lower() == 'unlisted':
3090 is_unlisted = True
3091
3092 info['availability'] = self._availability(
3093 is_private=is_private,
3094 needs_premium=is_premium,
3095 needs_subscription=is_membersonly,
3096 needs_auth=info['age_limit'] >= 18,
3097 is_unlisted=None if is_private is None else is_unlisted)
3098
3099 # get xsrf for annotations or comments
3100 get_annotations = self.get_param('writeannotations', False)
3101 get_comments = self.get_param('getcomments', False)
3102 if get_annotations or get_comments:
3103 xsrf_token = None
3104 if master_ytcfg:
3105 xsrf_token = try_get(master_ytcfg, lambda x: x['XSRF_TOKEN'], compat_str)
3106 if not xsrf_token:
3107 xsrf_token = self._search_regex(
3108 r'([\'"])XSRF_TOKEN\1\s*:\s*([\'"])(?P<xsrf_token>(?:(?!\2).)+)\2',
3109 webpage, 'xsrf token', group='xsrf_token', fatal=False)
3110
3111 # annotations
3112 if get_annotations:
3113 invideo_url = get_first(
3114 player_responses,
3115 ('annotations', 0, 'playerAnnotationsUrlsRenderer', 'invideoUrl'),
3116 expected_type=str)
3117 if xsrf_token and invideo_url:
3118 xsrf_field_name = None
3119 if master_ytcfg:
3120 xsrf_field_name = try_get(master_ytcfg, lambda x: x['XSRF_FIELD_NAME'], compat_str)
3121 if not xsrf_field_name:
3122 xsrf_field_name = self._search_regex(
3123 r'([\'"])XSRF_FIELD_NAME\1\s*:\s*([\'"])(?P<xsrf_field_name>\w+)\2',
3124 webpage, 'xsrf field name',
3125 group='xsrf_field_name', default='session_token')
3126 info['annotations'] = self._download_webpage(
3127 self._proto_relative_url(invideo_url),
3128 video_id, note='Downloading annotations',
3129 errnote='Unable to download video annotations', fatal=False,
3130 data=urlencode_postdata({xsrf_field_name: xsrf_token}))
3131
3132 if get_comments:
3133 info['__post_extractor'] = lambda: self._extract_comments(master_ytcfg, video_id, contents, webpage)
3134
3135 self.mark_watched(video_id, player_responses)
3136
3137 return info
3138
3139
3140class YoutubeTabIE(YoutubeBaseInfoExtractor):
3141 IE_DESC = 'YouTube.com tab'
3142 _VALID_URL = r'''(?x)
3143 https?://
3144 (?:\w+\.)?
3145 (?:
3146 youtube(?:kids)?\.com|
3147 invidio\.us
3148 )/
3149 (?:
3150 (?P<channel_type>channel|c|user|browse)/|
3151 (?P<not_channel>
3152 feed/|hashtag/|
3153 (?:playlist|watch)\?.*?\blist=
3154 )|
3155 (?!(?:%s)\b) # Direct URLs
3156 )
3157 (?P<id>[^/?\#&]+)
3158 ''' % YoutubeBaseInfoExtractor._RESERVED_NAMES
3159 IE_NAME = 'youtube:tab'
3160
3161 _TESTS = [{
3162 'note': 'playlists, multipage',
3163 'url': 'https://www.youtube.com/c/ИгорьКлейнер/playlists?view=1&flow=grid',
3164 'playlist_mincount': 94,
3165 'info_dict': {
3166 'id': 'UCqj7Cz7revf5maW9g5pgNcg',
3167 'title': 'Игорь Клейнер - Playlists',
3168 'description': 'md5:be97ee0f14ee314f1f002cf187166ee2',
3169 'uploader': 'Игорь Клейнер',
3170 'uploader_id': 'UCqj7Cz7revf5maW9g5pgNcg',
3171 },
3172 }, {
3173 'note': 'playlists, multipage, different order',
3174 'url': 'https://www.youtube.com/user/igorkle1/playlists?view=1&sort=dd',
3175 'playlist_mincount': 94,
3176 'info_dict': {
3177 'id': 'UCqj7Cz7revf5maW9g5pgNcg',
3178 'title': 'Игорь Клейнер - Playlists',
3179 'description': 'md5:be97ee0f14ee314f1f002cf187166ee2',
3180 'uploader_id': 'UCqj7Cz7revf5maW9g5pgNcg',
3181 'uploader': 'Игорь Клейнер',
3182 },
3183 }, {
3184 'note': 'playlists, series',
3185 'url': 'https://www.youtube.com/c/3blue1brown/playlists?view=50&sort=dd&shelf_id=3',
3186 'playlist_mincount': 5,
3187 'info_dict': {
3188 'id': 'UCYO_jab_esuFRV4b17AJtAw',
3189 'title': '3Blue1Brown - Playlists',
3190 'description': 'md5:e1384e8a133307dd10edee76e875d62f',
3191 'uploader_id': 'UCYO_jab_esuFRV4b17AJtAw',
3192 'uploader': '3Blue1Brown',
3193 },
3194 }, {
3195 'note': 'playlists, singlepage',
3196 'url': 'https://www.youtube.com/user/ThirstForScience/playlists',
3197 'playlist_mincount': 4,
3198 'info_dict': {
3199 'id': 'UCAEtajcuhQ6an9WEzY9LEMQ',
3200 'title': 'ThirstForScience - Playlists',
3201 'description': 'md5:609399d937ea957b0f53cbffb747a14c',
3202 'uploader': 'ThirstForScience',
3203 'uploader_id': 'UCAEtajcuhQ6an9WEzY9LEMQ',
3204 }
3205 }, {
3206 'url': 'https://www.youtube.com/c/ChristophLaimer/playlists',
3207 'only_matching': True,
3208 }, {
3209 'note': 'basic, single video playlist',
3210 'url': 'https://www.youtube.com/playlist?list=PL4lCao7KL_QFVb7Iudeipvc2BCavECqzc',
3211 'info_dict': {
3212 'uploader_id': 'UCmlqkdCBesrv2Lak1mF_MxA',
3213 'uploader': 'Sergey M.',
3214 'id': 'PL4lCao7KL_QFVb7Iudeipvc2BCavECqzc',
3215 'title': 'youtube-dl public playlist',
3216 },
3217 'playlist_count': 1,
3218 }, {
3219 'note': 'empty playlist',
3220 'url': 'https://www.youtube.com/playlist?list=PL4lCao7KL_QFodcLWhDpGCYnngnHtQ-Xf',
3221 'info_dict': {
3222 'uploader_id': 'UCmlqkdCBesrv2Lak1mF_MxA',
3223 'uploader': 'Sergey M.',
3224 'id': 'PL4lCao7KL_QFodcLWhDpGCYnngnHtQ-Xf',
3225 'title': 'youtube-dl empty playlist',
3226 },
3227 'playlist_count': 0,
3228 }, {
3229 'note': 'Home tab',
3230 'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w/featured',
3231 'info_dict': {
3232 'id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
3233 'title': 'lex will - Home',
3234 'description': 'md5:2163c5d0ff54ed5f598d6a7e6211e488',
3235 'uploader': 'lex will',
3236 'uploader_id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
3237 },
3238 'playlist_mincount': 2,
3239 }, {
3240 'note': 'Videos tab',
3241 'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w/videos',
3242 'info_dict': {
3243 'id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
3244 'title': 'lex will - Videos',
3245 'description': 'md5:2163c5d0ff54ed5f598d6a7e6211e488',
3246 'uploader': 'lex will',
3247 'uploader_id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
3248 },
3249 'playlist_mincount': 975,
3250 }, {
3251 'note': 'Videos tab, sorted by popular',
3252 'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w/videos?view=0&sort=p&flow=grid',
3253 'info_dict': {
3254 'id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
3255 'title': 'lex will - Videos',
3256 'description': 'md5:2163c5d0ff54ed5f598d6a7e6211e488',
3257 'uploader': 'lex will',
3258 'uploader_id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
3259 },
3260 'playlist_mincount': 199,
3261 }, {
3262 'note': 'Playlists tab',
3263 'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w/playlists',
3264 'info_dict': {
3265 'id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
3266 'title': 'lex will - Playlists',
3267 'description': 'md5:2163c5d0ff54ed5f598d6a7e6211e488',
3268 'uploader': 'lex will',
3269 'uploader_id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
3270 },
3271 'playlist_mincount': 17,
3272 }, {
3273 'note': 'Community tab',
3274 'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w/community',
3275 'info_dict': {
3276 'id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
3277 'title': 'lex will - Community',
3278 'description': 'md5:2163c5d0ff54ed5f598d6a7e6211e488',
3279 'uploader': 'lex will',
3280 'uploader_id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
3281 },
3282 'playlist_mincount': 18,
3283 }, {
3284 'note': 'Channels tab',
3285 'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w/channels',
3286 'info_dict': {
3287 'id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
3288 'title': 'lex will - Channels',
3289 'description': 'md5:2163c5d0ff54ed5f598d6a7e6211e488',
3290 'uploader': 'lex will',
3291 'uploader_id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
3292 },
3293 'playlist_mincount': 12,
3294 }, {
3295 'note': 'Search tab',
3296 'url': 'https://www.youtube.com/c/3blue1brown/search?query=linear%20algebra',
3297 'playlist_mincount': 40,
3298 'info_dict': {
3299 'id': 'UCYO_jab_esuFRV4b17AJtAw',
3300 'title': '3Blue1Brown - Search - linear algebra',
3301 'description': 'md5:e1384e8a133307dd10edee76e875d62f',
3302 'uploader': '3Blue1Brown',
3303 'uploader_id': 'UCYO_jab_esuFRV4b17AJtAw',
3304 },
3305 }, {
3306 'url': 'https://invidio.us/channel/UCmlqkdCBesrv2Lak1mF_MxA',
3307 'only_matching': True,
3308 }, {
3309 'url': 'https://www.youtubekids.com/channel/UCmlqkdCBesrv2Lak1mF_MxA',
3310 'only_matching': True,
3311 }, {
3312 'url': 'https://music.youtube.com/channel/UCmlqkdCBesrv2Lak1mF_MxA',
3313 'only_matching': True,
3314 }, {
3315 'note': 'Playlist with deleted videos (#651). As a bonus, the video #51 is also twice in this list.',
3316 'url': 'https://www.youtube.com/playlist?list=PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC',
3317 'info_dict': {
3318 'title': '29C3: Not my department',
3319 'id': 'PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC',
3320 'uploader': 'Christiaan008',
3321 'uploader_id': 'UCEPzS1rYsrkqzSLNp76nrcg',
3322 'description': 'md5:a14dc1a8ef8307a9807fe136a0660268',
3323 },
3324 'playlist_count': 96,
3325 }, {
3326 'note': 'Large playlist',
3327 'url': 'https://www.youtube.com/playlist?list=UUBABnxM4Ar9ten8Mdjj1j0Q',
3328 'info_dict': {
3329 'title': 'Uploads from Cauchemar',
3330 'id': 'UUBABnxM4Ar9ten8Mdjj1j0Q',
3331 'uploader': 'Cauchemar',
3332 'uploader_id': 'UCBABnxM4Ar9ten8Mdjj1j0Q',
3333 },
3334 'playlist_mincount': 1123,
3335 }, {
3336 'note': 'even larger playlist, 8832 videos',
3337 'url': 'http://www.youtube.com/user/NASAgovVideo/videos',
3338 'only_matching': True,
3339 }, {
3340 'note': 'Buggy playlist: the webpage has a "Load more" button but it doesn\'t have more videos',
3341 'url': 'https://www.youtube.com/playlist?list=UUXw-G3eDE9trcvY2sBMM_aA',
3342 'info_dict': {
3343 'title': 'Uploads from Interstellar Movie',
3344 'id': 'UUXw-G3eDE9trcvY2sBMM_aA',
3345 'uploader': 'Interstellar Movie',
3346 'uploader_id': 'UCXw-G3eDE9trcvY2sBMM_aA',
3347 },
3348 'playlist_mincount': 21,
3349 }, {
3350 'note': 'Playlist with "show unavailable videos" button',
3351 'url': 'https://www.youtube.com/playlist?list=UUTYLiWFZy8xtPwxFwX9rV7Q',
3352 'info_dict': {
3353 'title': 'Uploads from Phim Siêu Nhân Nhật Bản',
3354 'id': 'UUTYLiWFZy8xtPwxFwX9rV7Q',
3355 'uploader': 'Phim Siêu Nhân Nhật Bản',
3356 'uploader_id': 'UCTYLiWFZy8xtPwxFwX9rV7Q',
3357 },
3358 'playlist_mincount': 200,
3359 }, {
3360 'note': 'Playlist with unavailable videos in page 7',
3361 'url': 'https://www.youtube.com/playlist?list=UU8l9frL61Yl5KFOl87nIm2w',
3362 'info_dict': {
3363 'title': 'Uploads from BlankTV',
3364 'id': 'UU8l9frL61Yl5KFOl87nIm2w',
3365 'uploader': 'BlankTV',
3366 'uploader_id': 'UC8l9frL61Yl5KFOl87nIm2w',
3367 },
3368 'playlist_mincount': 1000,
3369 }, {
3370 'note': 'https://github.com/ytdl-org/youtube-dl/issues/21844',
3371 'url': 'https://www.youtube.com/playlist?list=PLzH6n4zXuckpfMu_4Ff8E7Z1behQks5ba',
3372 'info_dict': {
3373 'title': 'Data Analysis with Dr Mike Pound',
3374 'id': 'PLzH6n4zXuckpfMu_4Ff8E7Z1behQks5ba',
3375 'uploader_id': 'UC9-y-6csu5WGm29I7JiwpnA',
3376 'uploader': 'Computerphile',
3377 'description': 'md5:7f567c574d13d3f8c0954d9ffee4e487',
3378 },
3379 'playlist_mincount': 11,
3380 }, {
3381 'url': 'https://invidio.us/playlist?list=PL4lCao7KL_QFVb7Iudeipvc2BCavECqzc',
3382 'only_matching': True,
3383 }, {
3384 'note': 'Playlist URL that does not actually serve a playlist',
3385 'url': 'https://www.youtube.com/watch?v=FqZTN594JQw&list=PLMYEtVRpaqY00V9W81Cwmzp6N6vZqfUKD4',
3386 'info_dict': {
3387 'id': 'FqZTN594JQw',
3388 'ext': 'webm',
3389 'title': "Smiley's People 01 detective, Adventure Series, Action",
3390 'uploader': 'STREEM',
3391 'uploader_id': 'UCyPhqAZgwYWZfxElWVbVJng',
3392 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UCyPhqAZgwYWZfxElWVbVJng',
3393 'upload_date': '20150526',
3394 'license': 'Standard YouTube License',
3395 'description': 'md5:507cdcb5a49ac0da37a920ece610be80',
3396 'categories': ['People & Blogs'],
3397 'tags': list,
3398 'view_count': int,
3399 'like_count': int,
3400 'dislike_count': int,
3401 },
3402 'params': {
3403 'skip_download': True,
3404 },
3405 'skip': 'This video is not available.',
3406 'add_ie': [YoutubeIE.ie_key()],
3407 }, {
3408 'url': 'https://www.youtubekids.com/watch?v=Agk7R8I8o5U&list=PUZ6jURNr1WQZCNHF0ao-c0g',
3409 'only_matching': True,
3410 }, {
3411 'url': 'https://www.youtube.com/watch?v=MuAGGZNfUkU&list=RDMM',
3412 'only_matching': True,
3413 }, {
3414 'url': 'https://www.youtube.com/channel/UCoMdktPbSTixAyNGwb-UYkQ/live',
3415 'info_dict': {
3416 'id': 'FMtPN8yp5LU', # This will keep changing
3417 'ext': 'mp4',
3418 'title': compat_str,
3419 'uploader': 'Sky News',
3420 'uploader_id': 'skynews',
3421 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/skynews',
3422 'upload_date': r're:\d{8}',
3423 'description': compat_str,
3424 'categories': ['News & Politics'],
3425 'tags': list,
3426 'like_count': int,
3427 'dislike_count': int,
3428 },
3429 'params': {
3430 'skip_download': True,
3431 },
3432 'expected_warnings': ['Downloading just video ', 'Ignoring subtitle tracks found in '],
3433 }, {
3434 'url': 'https://www.youtube.com/user/TheYoungTurks/live',
3435 'info_dict': {
3436 'id': 'a48o2S1cPoo',
3437 'ext': 'mp4',
3438 'title': 'The Young Turks - Live Main Show',
3439 'uploader': 'The Young Turks',
3440 'uploader_id': 'TheYoungTurks',
3441 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/TheYoungTurks',
3442 'upload_date': '20150715',
3443 'license': 'Standard YouTube License',
3444 'description': 'md5:438179573adcdff3c97ebb1ee632b891',
3445 'categories': ['News & Politics'],
3446 'tags': ['Cenk Uygur (TV Program Creator)', 'The Young Turks (Award-Winning Work)', 'Talk Show (TV Genre)'],
3447 'like_count': int,
3448 'dislike_count': int,
3449 },
3450 'params': {
3451 'skip_download': True,
3452 },
3453 'only_matching': True,
3454 }, {
3455 'url': 'https://www.youtube.com/channel/UC1yBKRuGpC1tSM73A0ZjYjQ/live',
3456 'only_matching': True,
3457 }, {
3458 'url': 'https://www.youtube.com/c/CommanderVideoHq/live',
3459 'only_matching': True,
3460 }, {
3461 'note': 'A channel that is not live. Should raise error',
3462 'url': 'https://www.youtube.com/user/numberphile/live',
3463 'only_matching': True,
3464 }, {
3465 'url': 'https://www.youtube.com/feed/trending',
3466 'only_matching': True,
3467 }, {
3468 'url': 'https://www.youtube.com/feed/library',
3469 'only_matching': True,
3470 }, {
3471 'url': 'https://www.youtube.com/feed/history',
3472 'only_matching': True,
3473 }, {
3474 'url': 'https://www.youtube.com/feed/subscriptions',
3475 'only_matching': True,
3476 }, {
3477 'url': 'https://www.youtube.com/feed/watch_later',
3478 'only_matching': True,
3479 }, {
3480 'note': 'Recommended - redirects to home page',
3481 'url': 'https://www.youtube.com/feed/recommended',
3482 'only_matching': True,
3483 }, {
3484 'note': 'inline playlist with not always working continuations',
3485 'url': 'https://www.youtube.com/watch?v=UC6u0Tct-Fo&list=PL36D642111D65BE7C',
3486 'only_matching': True,
3487 }, {
3488 'url': 'https://www.youtube.com/course?list=ECUl4u3cNGP61MdtwGTqZA0MreSaDybji8',
3489 'only_matching': True,
3490 }, {
3491 'url': 'https://www.youtube.com/course',
3492 'only_matching': True,
3493 }, {
3494 'url': 'https://www.youtube.com/zsecurity',
3495 'only_matching': True,
3496 }, {
3497 'url': 'http://www.youtube.com/NASAgovVideo/videos',
3498 'only_matching': True,
3499 }, {
3500 'url': 'https://www.youtube.com/TheYoungTurks/live',
3501 'only_matching': True,
3502 }, {
3503 'url': 'https://www.youtube.com/hashtag/cctv9',
3504 'info_dict': {
3505 'id': 'cctv9',
3506 'title': '#cctv9',
3507 },
3508 'playlist_mincount': 350,
3509 }, {
3510 'url': 'https://www.youtube.com/watch?list=PLW4dVinRY435CBE_JD3t-0SRXKfnZHS1P&feature=youtu.be&v=M9cJMXmQ_ZU',
3511 'only_matching': True,
3512 }, {
3513 'note': 'Requires Premium: should request additional YTM-info webpage (and have format 141) for videos in playlist',
3514 'url': 'https://music.youtube.com/playlist?list=PLRBp0Fe2GpgmgoscNFLxNyBVSFVdYmFkq',
3515 'only_matching': True
3516 }, {
3517 'note': '/browse/ should redirect to /channel/',
3518 'url': 'https://music.youtube.com/browse/UC1a8OFewdjuLq6KlF8M_8Ng',
3519 'only_matching': True
3520 }, {
3521 'note': 'VLPL, should redirect to playlist?list=PL...',
3522 'url': 'https://music.youtube.com/browse/VLPLRBp0Fe2GpgmgoscNFLxNyBVSFVdYmFkq',
3523 'info_dict': {
3524 'id': 'PLRBp0Fe2GpgmgoscNFLxNyBVSFVdYmFkq',
3525 'uploader': 'NoCopyrightSounds',
3526 'description': 'Providing you with copyright free / safe music for gaming, live streaming, studying and more!',
3527 'uploader_id': 'UC_aEa8K-EOJ3D6gOs7HcyNg',
3528 'title': 'NCS Releases',
3529 },
3530 'playlist_mincount': 166,
3531 }, {
3532 'note': 'Topic, should redirect to playlist?list=UU...',
3533 'url': 'https://music.youtube.com/browse/UC9ALqqC4aIeG5iDs7i90Bfw',
3534 'info_dict': {
3535 'id': 'UU9ALqqC4aIeG5iDs7i90Bfw',
3536 'uploader_id': 'UC9ALqqC4aIeG5iDs7i90Bfw',
3537 'title': 'Uploads from Royalty Free Music - Topic',
3538 'uploader': 'Royalty Free Music - Topic',
3539 },
3540 'expected_warnings': [
3541 'A channel/user page was given',
3542 'The URL does not have a videos tab',
3543 ],
3544 'playlist_mincount': 101,
3545 }, {
3546 'note': 'Topic without a UU playlist',
3547 'url': 'https://www.youtube.com/channel/UCtFRv9O2AHqOZjjynzrv-xg',
3548 'info_dict': {
3549 'id': 'UCtFRv9O2AHqOZjjynzrv-xg',
3550 'title': 'UCtFRv9O2AHqOZjjynzrv-xg',
3551 },
3552 'expected_warnings': [
3553 'A channel/user page was given',
3554 'The URL does not have a videos tab',
3555 'Falling back to channel URL',
3556 ],
3557 'playlist_mincount': 9,
3558 }, {
3559 'note': 'Youtube music Album',
3560 'url': 'https://music.youtube.com/browse/MPREb_gTAcphH99wE',
3561 'info_dict': {
3562 'id': 'OLAK5uy_l1m0thk3g31NmIIz_vMIbWtyv7eZixlH0',
3563 'title': 'Album - Royalty Free Music Library V2 (50 Songs)',
3564 },
3565 'playlist_count': 50,
3566 }, {
3567 'note': 'unlisted single video playlist',
3568 'url': 'https://www.youtube.com/playlist?list=PLwL24UFy54GrB3s2KMMfjZscDi1x5Dajf',
3569 'info_dict': {
3570 'uploader_id': 'UC9zHu_mHU96r19o-wV5Qs1Q',
3571 'uploader': 'colethedj',
3572 'id': 'PLwL24UFy54GrB3s2KMMfjZscDi1x5Dajf',
3573 'title': 'yt-dlp unlisted playlist test',
3574 'availability': 'unlisted'
3575 },
3576 'playlist_count': 1,
3577 }]
3578
3579 @classmethod
3580 def suitable(cls, url):
3581 return False if YoutubeIE.suitable(url) else super(
3582 YoutubeTabIE, cls).suitable(url)
3583
3584 def _extract_channel_id(self, webpage):
3585 channel_id = self._html_search_meta(
3586 'channelId', webpage, 'channel id', default=None)
3587 if channel_id:
3588 return channel_id
3589 channel_url = self._html_search_meta(
3590 ('og:url', 'al:ios:url', 'al:android:url', 'al:web:url',
3591 'twitter:url', 'twitter:app:url:iphone', 'twitter:app:url:ipad',
3592 'twitter:app:url:googleplay'), webpage, 'channel url')
3593 return self._search_regex(
3594 r'https?://(?:www\.)?youtube\.com/channel/([^/?#&])+',
3595 channel_url, 'channel id')
3596
3597 @staticmethod
3598 def _extract_basic_item_renderer(item):
3599 # Modified from _extract_grid_item_renderer
3600 known_basic_renderers = (
3601 'playlistRenderer', 'videoRenderer', 'channelRenderer', 'showRenderer'
3602 )
3603 for key, renderer in item.items():
3604 if not isinstance(renderer, dict):
3605 continue
3606 elif key in known_basic_renderers:
3607 return renderer
3608 elif key.startswith('grid') and key.endswith('Renderer'):
3609 return renderer
3610
3611 def _grid_entries(self, grid_renderer):
3612 for item in grid_renderer['items']:
3613 if not isinstance(item, dict):
3614 continue
3615 renderer = self._extract_basic_item_renderer(item)
3616 if not isinstance(renderer, dict):
3617 continue
3618 title = self._get_text(renderer, 'title')
3619
3620 # playlist
3621 playlist_id = renderer.get('playlistId')
3622 if playlist_id:
3623 yield self.url_result(
3624 'https://www.youtube.com/playlist?list=%s' % playlist_id,
3625 ie=YoutubeTabIE.ie_key(), video_id=playlist_id,
3626 video_title=title)
3627 continue
3628 # video
3629 video_id = renderer.get('videoId')
3630 if video_id:
3631 yield self._extract_video(renderer)
3632 continue
3633 # channel
3634 channel_id = renderer.get('channelId')
3635 if channel_id:
3636 yield self.url_result(
3637 'https://www.youtube.com/channel/%s' % channel_id,
3638 ie=YoutubeTabIE.ie_key(), video_title=title)
3639 continue
3640 # generic endpoint URL support
3641 ep_url = urljoin('https://www.youtube.com/', try_get(
3642 renderer, lambda x: x['navigationEndpoint']['commandMetadata']['webCommandMetadata']['url'],
3643 compat_str))
3644 if ep_url:
3645 for ie in (YoutubeTabIE, YoutubePlaylistIE, YoutubeIE):
3646 if ie.suitable(ep_url):
3647 yield self.url_result(
3648 ep_url, ie=ie.ie_key(), video_id=ie._match_id(ep_url), video_title=title)
3649 break
3650
3651 def _shelf_entries_from_content(self, shelf_renderer):
3652 content = shelf_renderer.get('content')
3653 if not isinstance(content, dict):
3654 return
3655 renderer = content.get('gridRenderer') or content.get('expandedShelfContentsRenderer')
3656 if renderer:
3657 # TODO: add support for nested playlists so each shelf is processed
3658 # as separate playlist
3659 # TODO: this includes only first N items
3660 for entry in self._grid_entries(renderer):
3661 yield entry
3662 renderer = content.get('horizontalListRenderer')
3663 if renderer:
3664 # TODO
3665 pass
3666
3667 def _shelf_entries(self, shelf_renderer, skip_channels=False):
3668 ep = try_get(
3669 shelf_renderer, lambda x: x['endpoint']['commandMetadata']['webCommandMetadata']['url'],
3670 compat_str)
3671 shelf_url = urljoin('https://www.youtube.com', ep)
3672 if shelf_url:
3673 # Skipping links to another channels, note that checking for
3674 # endpoint.commandMetadata.webCommandMetadata.webPageTypwebPageType == WEB_PAGE_TYPE_CHANNEL
3675 # will not work
3676 if skip_channels and '/channels?' in shelf_url:
3677 return
3678 title = self._get_text(shelf_renderer, 'title')
3679 yield self.url_result(shelf_url, video_title=title)
3680 # Shelf may not contain shelf URL, fallback to extraction from content
3681 for entry in self._shelf_entries_from_content(shelf_renderer):
3682 yield entry
3683
3684 def _playlist_entries(self, video_list_renderer):
3685 for content in video_list_renderer['contents']:
3686 if not isinstance(content, dict):
3687 continue
3688 renderer = content.get('playlistVideoRenderer') or content.get('playlistPanelVideoRenderer')
3689 if not isinstance(renderer, dict):
3690 continue
3691 video_id = renderer.get('videoId')
3692 if not video_id:
3693 continue
3694 yield self._extract_video(renderer)
3695
3696 def _rich_entries(self, rich_grid_renderer):
3697 renderer = try_get(
3698 rich_grid_renderer, lambda x: x['content']['videoRenderer'], dict) or {}
3699 video_id = renderer.get('videoId')
3700 if not video_id:
3701 return
3702 yield self._extract_video(renderer)
3703
3704 def _video_entry(self, video_renderer):
3705 video_id = video_renderer.get('videoId')
3706 if video_id:
3707 return self._extract_video(video_renderer)
3708
3709 def _post_thread_entries(self, post_thread_renderer):
3710 post_renderer = try_get(
3711 post_thread_renderer, lambda x: x['post']['backstagePostRenderer'], dict)
3712 if not post_renderer:
3713 return
3714 # video attachment
3715 video_renderer = try_get(
3716 post_renderer, lambda x: x['backstageAttachment']['videoRenderer'], dict) or {}
3717 video_id = video_renderer.get('videoId')
3718 if video_id:
3719 entry = self._extract_video(video_renderer)
3720 if entry:
3721 yield entry
3722 # playlist attachment
3723 playlist_id = try_get(
3724 post_renderer, lambda x: x['backstageAttachment']['playlistRenderer']['playlistId'], compat_str)
3725 if playlist_id:
3726 yield self.url_result(
3727 'https://www.youtube.com/playlist?list=%s' % playlist_id,
3728 ie=YoutubeTabIE.ie_key(), video_id=playlist_id)
3729 # inline video links
3730 runs = try_get(post_renderer, lambda x: x['contentText']['runs'], list) or []
3731 for run in runs:
3732 if not isinstance(run, dict):
3733 continue
3734 ep_url = try_get(
3735 run, lambda x: x['navigationEndpoint']['urlEndpoint']['url'], compat_str)
3736 if not ep_url:
3737 continue
3738 if not YoutubeIE.suitable(ep_url):
3739 continue
3740 ep_video_id = YoutubeIE._match_id(ep_url)
3741 if video_id == ep_video_id:
3742 continue
3743 yield self.url_result(ep_url, ie=YoutubeIE.ie_key(), video_id=ep_video_id)
3744
3745 def _post_thread_continuation_entries(self, post_thread_continuation):
3746 contents = post_thread_continuation.get('contents')
3747 if not isinstance(contents, list):
3748 return
3749 for content in contents:
3750 renderer = content.get('backstagePostThreadRenderer')
3751 if not isinstance(renderer, dict):
3752 continue
3753 for entry in self._post_thread_entries(renderer):
3754 yield entry
3755
3756 r''' # unused
3757 def _rich_grid_entries(self, contents):
3758 for content in contents:
3759 video_renderer = try_get(content, lambda x: x['richItemRenderer']['content']['videoRenderer'], dict)
3760 if video_renderer:
3761 entry = self._video_entry(video_renderer)
3762 if entry:
3763 yield entry
3764 '''
    def _entries(self, tab, item_id, identity_token, account_syncid, ytcfg):
        """Yield all entries of a tab, following continuations across API pages.

        @param tab              selected tab renderer (from _extract_selected_tab)
        @param item_id          id used in progress messages
        @param identity_token   identity token for API headers (may be None)
        @param account_syncid   account sync id for API headers (may be None)
        @param ytcfg            ytcfg dict extracted from the webpage
        """

        def extract_entries(parent_renderer):  # this needs to be called again for continuation to work with feeds
            contents = try_get(parent_renderer, lambda x: x['contents'], list) or []
            for content in contents:
                if not isinstance(content, dict):
                    continue
                is_renderer = try_get(content, lambda x: x['itemSectionRenderer'], dict)
                if not is_renderer:
                    # not an item section: may still be a rich item (e.g. home feed)
                    renderer = content.get('richItemRenderer')
                    if renderer:
                        for entry in self._rich_entries(renderer):
                            yield entry
                        continuation_list[0] = self._extract_continuation(parent_renderer)
                    continue
                isr_contents = try_get(is_renderer, lambda x: x['contents'], list) or []
                for isr_content in isr_contents:
                    if not isinstance(isr_content, dict):
                        continue

                    # Dispatch each known renderer type to its entry generator
                    known_renderers = {
                        'playlistVideoListRenderer': self._playlist_entries,
                        'gridRenderer': self._grid_entries,
                        'shelfRenderer': lambda x: self._shelf_entries(x, tab.get('title') != 'Channels'),
                        'backstagePostThreadRenderer': self._post_thread_entries,
                        'videoRenderer': lambda x: [self._video_entry(x)],
                    }
                    for key, renderer in isr_content.items():
                        if key not in known_renderers:
                            continue
                        for entry in known_renderers[key](renderer):
                            if entry:
                                yield entry
                        continuation_list[0] = self._extract_continuation(renderer)
                        break

                # Fall back to a continuation attached to the section itself
                if not continuation_list[0]:
                    continuation_list[0] = self._extract_continuation(is_renderer)

            # ... or to one attached to the whole parent renderer
            if not continuation_list[0]:
                continuation_list[0] = self._extract_continuation(parent_renderer)

        # one-element list used as a writable closure cell
        continuation_list = [None]  # Python 2 does not support nonlocal
        tab_content = try_get(tab, lambda x: x['content'], dict)
        if not tab_content:
            return
        parent_renderer = (
            try_get(tab_content, lambda x: x['sectionListRenderer'], dict)
            or try_get(tab_content, lambda x: x['richGridRenderer'], dict) or {})
        for entry in extract_entries(parent_renderer):
            yield entry
        continuation = continuation_list[0]
        visitor_data = None

        # Page through continuations until none remains
        for page_num in itertools.count(1):
            if not continuation:
                break
            headers = self.generate_api_headers(ytcfg, identity_token, account_syncid, visitor_data)
            response = self._extract_response(
                item_id='%s page %s' % (item_id, page_num),
                query=continuation, headers=headers, ytcfg=ytcfg,
                check_get_keys=('continuationContents', 'onResponseReceivedActions', 'onResponseReceivedEndpoints'))

            if not response:
                break
            # Carry the visitor data forward so subsequent requests stay in session
            visitor_data = try_get(
                response, lambda x: x['responseContext']['visitorData'], compat_str) or visitor_data

            # Old-style continuation responses ('continuationContents')
            known_continuation_renderers = {
                'playlistVideoListContinuation': self._playlist_entries,
                'gridContinuation': self._grid_entries,
                'itemSectionContinuation': self._post_thread_continuation_entries,
                'sectionListContinuation': extract_entries,  # for feeds
            }
            continuation_contents = try_get(
                response, lambda x: x['continuationContents'], dict) or {}
            continuation_renderer = None
            for key, value in continuation_contents.items():
                if key not in known_continuation_renderers:
                    continue
                continuation_renderer = value
                continuation_list = [None]
                for entry in known_continuation_renderers[key](continuation_renderer):
                    yield entry
                continuation = continuation_list[0] or self._extract_continuation(continuation_renderer)
                break
            if continuation_renderer:
                continue

            # New-style continuation responses ('onResponseReceived...'):
            # each handler is paired with the key the items must be wrapped under
            known_renderers = {
                'gridPlaylistRenderer': (self._grid_entries, 'items'),
                'gridVideoRenderer': (self._grid_entries, 'items'),
                'gridChannelRenderer': (self._grid_entries, 'items'),
                'playlistVideoRenderer': (self._playlist_entries, 'contents'),
                'itemSectionRenderer': (extract_entries, 'contents'),  # for feeds
                'richItemRenderer': (extract_entries, 'contents'),  # for hashtag
                'backstagePostThreadRenderer': (self._post_thread_continuation_entries, 'contents')
            }
            on_response_received = dict_get(response, ('onResponseReceivedActions', 'onResponseReceivedEndpoints'))
            continuation_items = try_get(
                on_response_received, lambda x: x[0]['appendContinuationItemsAction']['continuationItems'], list)
            continuation_item = try_get(continuation_items, lambda x: x[0], dict) or {}
            video_items_renderer = None
            for key, value in continuation_item.items():
                if key not in known_renderers:
                    continue
                # re-wrap the items so the existing per-renderer handlers apply
                video_items_renderer = {known_renderers[key][1]: continuation_items}
                continuation_list = [None]
                for entry in known_renderers[key][0](video_items_renderer):
                    yield entry
                continuation = continuation_list[0] or self._extract_continuation(video_items_renderer)
                break
            if video_items_renderer:
                continue
            break
3880
3881 @staticmethod
3882 def _extract_selected_tab(tabs):
3883 for tab in tabs:
3884 renderer = dict_get(tab, ('tabRenderer', 'expandableTabRenderer')) or {}
3885 if renderer.get('selected') is True:
3886 return renderer
3887 else:
3888 raise ExtractorError('Unable to find selected tab')
3889
3890 @classmethod
3891 def _extract_uploader(cls, data):
3892 uploader = {}
3893 renderer = cls._extract_sidebar_info_renderer(data, 'playlistSidebarSecondaryInfoRenderer') or {}
3894 owner = try_get(
3895 renderer, lambda x: x['videoOwner']['videoOwnerRenderer']['title']['runs'][0], dict)
3896 if owner:
3897 uploader['uploader'] = owner.get('text')
3898 uploader['uploader_id'] = try_get(
3899 owner, lambda x: x['navigationEndpoint']['browseEndpoint']['browseId'], compat_str)
3900 uploader['uploader_url'] = urljoin(
3901 'https://www.youtube.com/',
3902 try_get(owner, lambda x: x['navigationEndpoint']['browseEndpoint']['canonicalBaseUrl'], compat_str))
3903 return {k: v for k, v in uploader.items() if v is not None}
3904
    def _extract_from_tabs(self, item_id, webpage, data, tabs):
        """Build a playlist result for a tabbed page (channel/playlist/hashtag).

        Collects channel/playlist metadata and thumbnails, then delegates the
        actual entry extraction of the selected tab to _entries().
        """
        playlist_id = title = description = channel_url = channel_name = channel_id = None
        thumbnails_list = tags = []

        selected_tab = self._extract_selected_tab(tabs)
        # Channel pages expose channelMetadataRenderer, playlists expose
        # playlistMetadataRenderer; try the former first
        renderer = try_get(
            data, lambda x: x['metadata']['channelMetadataRenderer'], dict)
        if renderer:
            channel_name = renderer.get('title')
            channel_url = renderer.get('channelUrl')
            channel_id = renderer.get('externalId')
        else:
            renderer = try_get(
                data, lambda x: x['metadata']['playlistMetadataRenderer'], dict)

        if renderer:
            title = renderer.get('title')
            description = renderer.get('description', '')
            # for channels the playlist id is the channel id; None for playlists
            playlist_id = channel_id
            tags = renderer.get('keywords', '').split()
            # Prefer the avatar; fall back to the sidebar's playlist thumbnail
            thumbnails_list = (
                try_get(renderer, lambda x: x['avatar']['thumbnails'], list)
                or try_get(
                    self._extract_sidebar_info_renderer(data, 'playlistSidebarPrimaryInfoRenderer'),
                    lambda x: x['thumbnailRenderer']['playlistVideoThumbnailRenderer']['thumbnail']['thumbnails'],
                    list)
                or [])

        thumbnails = []
        for t in thumbnails_list:
            if not isinstance(t, dict):
                continue
            thumbnail_url = url_or_none(t.get('url'))
            if not thumbnail_url:
                continue
            thumbnails.append({
                'url': thumbnail_url,
                'width': int_or_none(t.get('width')),
                'height': int_or_none(t.get('height')),
            })
        if playlist_id is None:
            playlist_id = item_id
        if title is None:
            # hashtag pages carry their title in a dedicated header renderer
            title = (
                try_get(data, lambda x: x['header']['hashtagHeaderRenderer']['hashtag']['simpleText'])
                or playlist_id)
        # Append the tab name, e.g. "<channel> - Videos"
        title += format_field(selected_tab, 'title', ' - %s')
        title += format_field(selected_tab, 'expandedText', ' - %s')
        metadata = {
            'playlist_id': playlist_id,
            'playlist_title': title,
            'playlist_description': description,
            'uploader': channel_name,
            'uploader_id': channel_id,
            'uploader_url': channel_url,
            'thumbnails': thumbnails,
            'tags': tags,
        }
        availability = self._extract_availability(data)
        if availability:
            metadata['availability'] = availability
        if not channel_id:
            # playlist pages: take uploader info from the sidebar instead
            metadata.update(self._extract_uploader(data))
        metadata.update({
            'channel': metadata['uploader'],
            'channel_id': metadata['uploader_id'],
            'channel_url': metadata['uploader_url']})
        ytcfg = self.extract_ytcfg(item_id, webpage)
        return self.playlist_result(
            self._entries(
                selected_tab, playlist_id,
                self._extract_identity_token(webpage, item_id),
                self._extract_account_syncid(ytcfg, data), ytcfg),
            **metadata)
3979
    def _extract_mix_playlist(self, playlist, playlist_id, data, webpage):
        """Yield videos of an (endless) Mix playlist, stopping when it loops.

        Mixes have no canonical playlist page; each 'next' API call returns
        another overlapping window of videos, so we page until the first
        video comes around again or no new videos appear.
        """
        first_id = last_id = None
        ytcfg = self.extract_ytcfg(playlist_id, webpage)
        headers = self.generate_api_headers(
            ytcfg, account_syncid=self._extract_account_syncid(ytcfg, data),
            identity_token=self._extract_identity_token(webpage, item_id=playlist_id))
        for page_num in itertools.count(1):
            videos = list(self._playlist_entries(playlist))
            if not videos:
                return
            # Windows overlap; resume right after the last video already
            # yielded (-1 + 1 == 0 when last_id is not in this window)
            start = next((i for i, v in enumerate(videos) if v['id'] == last_id), -1) + 1
            if start >= len(videos):
                return
            for video in videos[start:]:
                if video['id'] == first_id:
                    self.to_screen('First video %s found again; Assuming end of Mix' % first_id)
                    return
                yield video
            first_id = first_id or videos[0]['id']
            last_id = videos[-1]['id']
            # Build the next-page query from the last panel video's watch endpoint
            watch_endpoint = try_get(
                playlist, lambda x: x['contents'][-1]['playlistPanelVideoRenderer']['navigationEndpoint']['watchEndpoint'])
            query = {
                'playlistId': playlist_id,
                'videoId': watch_endpoint.get('videoId') or last_id,
                'index': watch_endpoint.get('index') or len(videos),
                'params': watch_endpoint.get('params') or 'OAE%3D'
            }
            response = self._extract_response(
                item_id='%s page %d' % (playlist_id, page_num),
                query=query, ep='next', headers=headers, ytcfg=ytcfg,
                check_get_keys='contents'
            )
            playlist = try_get(
                response, lambda x: x['contents']['twoColumnWatchNextResults']['playlist']['playlist'], dict)
4015
4016 def _extract_from_playlist(self, item_id, url, data, playlist, webpage):
4017 title = playlist.get('title') or try_get(
4018 data, lambda x: x['titleText']['simpleText'], compat_str)
4019 playlist_id = playlist.get('playlistId') or item_id
4020
4021 # Delegating everything except mix playlists to regular tab-based playlist URL
4022 playlist_url = urljoin(url, try_get(
4023 playlist, lambda x: x['endpoint']['commandMetadata']['webCommandMetadata']['url'],
4024 compat_str))
4025 if playlist_url and playlist_url != url:
4026 return self.url_result(
4027 playlist_url, ie=YoutubeTabIE.ie_key(), video_id=playlist_id,
4028 video_title=title)
4029
4030 return self.playlist_result(
4031 self._extract_mix_playlist(playlist, playlist_id, data, webpage),
4032 playlist_id=playlist_id, playlist_title=title)
4033
4034 def _extract_availability(self, data):
4035 """
4036 Gets the availability of a given playlist/tab.
4037 Note: Unless YouTube tells us explicitly, we do not assume it is public
4038 @param data: response
4039 """
4040 is_private = is_unlisted = None
4041 renderer = self._extract_sidebar_info_renderer(data, 'playlistSidebarPrimaryInfoRenderer') or {}
4042 badge_labels = self._extract_badges(renderer)
4043
4044 # Personal playlists, when authenticated, have a dropdown visibility selector instead of a badge
4045 privacy_dropdown_entries = try_get(
4046 renderer, lambda x: x['privacyForm']['dropdownFormFieldRenderer']['dropdown']['dropdownRenderer']['entries'], list) or []
4047 for renderer_dict in privacy_dropdown_entries:
4048 is_selected = try_get(
4049 renderer_dict, lambda x: x['privacyDropdownItemRenderer']['isSelected'], bool) or False
4050 if not is_selected:
4051 continue
4052 label = self._get_text(renderer_dict, ('privacyDropdownItemRenderer', 'label'))
4053 if label:
4054 badge_labels.add(label.lower())
4055 break
4056
4057 for badge_label in badge_labels:
4058 if badge_label == 'unlisted':
4059 is_unlisted = True
4060 elif badge_label == 'private':
4061 is_private = True
4062 elif badge_label == 'public':
4063 is_unlisted = is_private = False
4064 return self._availability(is_private, False, False, False, is_unlisted)
4065
4066 @staticmethod
4067 def _extract_sidebar_info_renderer(data, info_renderer, expected_type=dict):
4068 sidebar_renderer = try_get(
4069 data, lambda x: x['sidebar']['playlistSidebarRenderer']['items'], list) or []
4070 for item in sidebar_renderer:
4071 renderer = try_get(item, lambda x: x[info_renderer], expected_type)
4072 if renderer:
4073 return renderer
4074
4075 def _reload_with_unavailable_videos(self, item_id, data, webpage):
4076 """
4077 Get playlist with unavailable videos if the 'show unavailable videos' button exists.
4078 """
4079 browse_id = params = None
4080 renderer = self._extract_sidebar_info_renderer(data, 'playlistSidebarPrimaryInfoRenderer')
4081 if not renderer:
4082 return
4083 menu_renderer = try_get(
4084 renderer, lambda x: x['menu']['menuRenderer']['items'], list) or []
4085 for menu_item in menu_renderer:
4086 if not isinstance(menu_item, dict):
4087 continue
4088 nav_item_renderer = menu_item.get('menuNavigationItemRenderer')
4089 text = try_get(
4090 nav_item_renderer, lambda x: x['text']['simpleText'], compat_str)
4091 if not text or text.lower() != 'show unavailable videos':
4092 continue
4093 browse_endpoint = try_get(
4094 nav_item_renderer, lambda x: x['navigationEndpoint']['browseEndpoint'], dict) or {}
4095 browse_id = browse_endpoint.get('browseId')
4096 params = browse_endpoint.get('params')
4097 break
4098
4099 ytcfg = self.extract_ytcfg(item_id, webpage)
4100 headers = self.generate_api_headers(
4101 ytcfg, account_syncid=self._extract_account_syncid(ytcfg, data),
4102 identity_token=self._extract_identity_token(webpage, item_id=item_id),
4103 visitor_data=try_get(
4104 self._extract_context(ytcfg), lambda x: x['client']['visitorData'], compat_str))
4105 query = {
4106 'params': params or 'wgYCCAA=',
4107 'browseId': browse_id or 'VL%s' % item_id
4108 }
4109 return self._extract_response(
4110 item_id=item_id, headers=headers, query=query,
4111 check_get_keys='contents', fatal=False, ytcfg=ytcfg,
4112 note='Downloading API JSON with unavailable videos')
4113
4114 def _extract_webpage(self, url, item_id):
4115 retries = self.get_param('extractor_retries', 3)
4116 count = -1
4117 last_error = 'Incomplete yt initial data recieved'
4118 while count < retries:
4119 count += 1
4120 # Sometimes youtube returns a webpage with incomplete ytInitialData
4121 # See: https://github.com/yt-dlp/yt-dlp/issues/116
4122 if count:
4123 self.report_warning('%s. Retrying ...' % last_error)
4124 webpage = self._download_webpage(
4125 url, item_id,
4126 'Downloading webpage%s' % (' (retry #%d)' % count if count else ''))
4127 data = self.extract_yt_initial_data(item_id, webpage)
4128 if data.get('contents') or data.get('currentVideoEndpoint'):
4129 break
4130 # Extract alerts here only when there is error
4131 self._extract_and_report_alerts(data)
4132 if count >= retries:
4133 raise ExtractorError(last_error)
4134 return webpage, data
4135
4136 @staticmethod
4137 def _smuggle_data(entries, data):
4138 for entry in entries:
4139 if data:
4140 entry['url'] = smuggle_url(entry['url'], data)
4141 yield entry
4142
4143 def _real_extract(self, url):
4144 url, smuggled_data = unsmuggle_url(url, {})
4145 if self.is_music_url(url):
4146 smuggled_data['is_music_url'] = True
4147 info_dict = self.__real_extract(url, smuggled_data)
4148 if info_dict.get('entries'):
4149 info_dict['entries'] = self._smuggle_data(info_dict['entries'], smuggled_data)
4150 return info_dict
4151
    # Splits a tab URL into <pre> (everything up to the tab), an optional
    # <tab> path component (e.g. '/videos') and <post> (the remainder).
    # The conditional group (?(channel_type)...) only allows a tab component
    # when the 'channel_type' group of _VALID_URL matched.
    _url_re = re.compile(r'(?P<pre>%s)(?(channel_type)(?P<tab>/\w+))?(?P<post>.*)$' % _VALID_URL)
4153
    def __real_extract(self, url, smuggled_data):
        """
        Extract a tab-style page (channel, playlist, home, watch-with-list).

        Normalizes the URL onto www.youtube.com, applies music-URL and channel
        redirects, then dispatches on what ytInitialData actually contains:
        browse tabs, an embedded watch-next playlist, or a single video
        endpoint. Raises ExtractorError when none of these are recognized.
        """
        item_id = self._match_id(url)
        url = compat_urlparse.urlunparse(
            compat_urlparse.urlparse(url)._replace(netloc='www.youtube.com'))
        compat_opts = self.get_param('compat_opts', [])

        def get_mobj(url):
            # groupdict() of _url_re with None values normalized to ''
            mobj = self._url_re.match(url).groupdict()
            mobj.update((k, '') for k, v in mobj.items() if v is None)
            return mobj

        mobj = get_mobj(url)
        # Youtube returns incomplete data if tabname is not lower case
        pre, tab, post, is_channel = mobj['pre'], mobj['tab'].lower(), mobj['post'], not mobj['not_channel']

        if is_channel:
            if smuggled_data.get('is_music_url'):
                if item_id[:2] == 'VL':
                    # Youtube music VL channels have an equivalent playlist
                    item_id = item_id[2:]
                    pre, tab, post, is_channel = 'https://www.youtube.com/playlist?list=%s' % item_id, '', '', False
                elif item_id[:2] == 'MP':
                    # Youtube music albums (/channel/MP...) have a OLAK playlist that can be extracted from the webpage
                    item_id = self._search_regex(
                        r'\\x22audioPlaylistId\\x22:\\x22([0-9A-Za-z_-]+)\\x22',
                        self._download_webpage('https://music.youtube.com/channel/%s' % item_id, item_id),
                        'playlist id')
                    pre, tab, post, is_channel = 'https://www.youtube.com/playlist?list=%s' % item_id, '', '', False
                elif mobj['channel_type'] == 'browse':
                    # Youtube music /browse/ should be changed to /channel/
                    pre = 'https://www.youtube.com/channel/%s' % item_id
        if is_channel and not tab and 'no-youtube-channel-redirect' not in compat_opts:
            # Home URLs should redirect to /videos/
            self.report_warning(
                'A channel/user page was given. All the channel\'s videos will be downloaded. '
                'To download only the videos in the home page, add a "/featured" to the URL')
            tab = '/videos'

        # Re-parse the (possibly rewritten) URL
        url = ''.join((pre, tab, post))
        mobj = get_mobj(url)

        # Handle both video/playlist URLs
        qs = parse_qs(url)
        video_id = qs.get('v', [None])[0]
        playlist_id = qs.get('list', [None])[0]

        if not video_id and mobj['not_channel'].startswith('watch'):
            if not playlist_id:
                # If there is neither video or playlist ids, youtube redirects to home page, which is undesirable
                raise ExtractorError('Unable to recognize tab page')
            # Common mistake: https://www.youtube.com/watch?list=playlist_id
            self.report_warning('A video URL was given without video ID. Trying to download playlist %s' % playlist_id)
            url = 'https://www.youtube.com/playlist?list=%s' % playlist_id
            mobj = get_mobj(url)

        if video_id and playlist_id:
            if self.get_param('noplaylist'):
                self.to_screen('Downloading just video %s because of --no-playlist' % video_id)
                return self.url_result(video_id, ie=YoutubeIE.ie_key(), video_id=video_id)
            self.to_screen('Downloading playlist %s; add --no-playlist to just download video %s' % (playlist_id, video_id))

        webpage, data = self._extract_webpage(url, item_id)

        tabs = try_get(
            data, lambda x: x['contents']['twoColumnBrowseResultsRenderer']['tabs'], list)
        if tabs:
            selected_tab = self._extract_selected_tab(tabs)
            tab_name = selected_tab.get('title', '')
            if 'no-youtube-channel-redirect' not in compat_opts:
                if mobj['tab'] == '/live':
                    # Live tab should have redirected to the video
                    raise ExtractorError('The channel is not currently live', expected=True)
                if mobj['tab'] == '/videos' and tab_name.lower() != mobj['tab'][1:]:
                    if not mobj['not_channel'] and item_id[:2] == 'UC':
                        # Topic channels don't have /videos. Use the equivalent playlist instead
                        self.report_warning('The URL does not have a %s tab. Trying to redirect to playlist UU%s instead' % (mobj['tab'][1:], item_id[2:]))
                        pl_id = 'UU%s' % item_id[2:]
                        pl_url = 'https://www.youtube.com/playlist?list=%s%s' % (pl_id, mobj['post'])
                        try:
                            pl_webpage, pl_data = self._extract_webpage(pl_url, pl_id)
                            for alert_type, alert_message in self._extract_alerts(pl_data):
                                if alert_type == 'error':
                                    raise ExtractorError('Youtube said: %s' % alert_message)
                            item_id, url, webpage, data = pl_id, pl_url, pl_webpage, pl_data
                        except ExtractorError:
                            self.report_warning('The playlist gave error. Falling back to channel URL')
                    else:
                        self.report_warning('The URL does not have a %s tab. %s is being downloaded instead' % (mobj['tab'][1:], tab_name))

        self.write_debug('Final URL: %s' % url)

        # YouTube sometimes provides a button to reload playlist with unavailable videos.
        if 'no-youtube-unavailable-videos' not in compat_opts:
            data = self._reload_with_unavailable_videos(item_id, data, webpage) or data
        self._extract_and_report_alerts(data)
        tabs = try_get(
            data, lambda x: x['contents']['twoColumnBrowseResultsRenderer']['tabs'], list)
        if tabs:
            return self._extract_from_tabs(item_id, webpage, data, tabs)

        playlist = try_get(
            data, lambda x: x['contents']['twoColumnWatchNextResults']['playlist']['playlist'], dict)
        if playlist:
            return self._extract_from_playlist(item_id, url, data, playlist, webpage)

        video_id = try_get(
            data, lambda x: x['currentVideoEndpoint']['watchEndpoint']['videoId'],
            compat_str) or video_id
        if video_id:
            if mobj['tab'] != '/live':  # live tab is expected to redirect to video
                self.report_warning('Unable to recognize playlist. Downloading just video %s' % video_id)
            return self.url_result(video_id, ie=YoutubeIE.ie_key(), video_id=video_id)

        raise ExtractorError('Unable to recognize tab page')
4268
4269
class YoutubePlaylistIE(InfoExtractor):
    IE_DESC = 'YouTube.com playlists'
    _VALID_URL = r'''(?x)(?:
                        (?:https?://)?
                        (?:\w+\.)?
                        (?:
                            (?:
                                youtube(?:kids)?\.com|
                                invidio\.us
                            )
                            /.*?\?.*?\blist=
                        )?
                        (?P<id>%(playlist_id)s)
                     )''' % {'playlist_id': YoutubeBaseInfoExtractor._PLAYLIST_ID_RE}
    IE_NAME = 'youtube:playlist'
    _TESTS = [{
        'note': 'issue #673',
        'url': 'PLBB231211A4F62143',
        'info_dict': {
            'title': '[OLD]Team Fortress 2 (Class-based LP)',
            'id': 'PLBB231211A4F62143',
            'uploader': 'Wickydoo',
            'uploader_id': 'UCKSpbfbl5kRQpTdL7kMc-1Q',
            'description': 'md5:8fa6f52abb47a9552002fa3ddfc57fc2',
        },
        'playlist_mincount': 29,
    }, {
        'url': 'PLtPgu7CB4gbY9oDN3drwC3cMbJggS7dKl',
        'info_dict': {
            'title': 'YDL_safe_search',
            'id': 'PLtPgu7CB4gbY9oDN3drwC3cMbJggS7dKl',
        },
        'playlist_count': 2,
        'skip': 'This playlist is private',
    }, {
        'note': 'embedded',
        'url': 'https://www.youtube.com/embed/videoseries?list=PL6IaIsEjSbf96XFRuNccS_RuEXwNdsoEu',
        'playlist_count': 4,
        'info_dict': {
            'title': 'JODA15',
            'id': 'PL6IaIsEjSbf96XFRuNccS_RuEXwNdsoEu',
            'uploader': 'milan',
            'uploader_id': 'UCEI1-PVPcYXjB73Hfelbmaw',
        }
    }, {
        'url': 'http://www.youtube.com/embed/_xDOZElKyNU?list=PLsyOSbh5bs16vubvKePAQ1x3PhKavfBIl',
        'playlist_mincount': 654,
        'info_dict': {
            'title': '2018 Chinese New Singles (11/6 updated)',
            'id': 'PLsyOSbh5bs16vubvKePAQ1x3PhKavfBIl',
            'uploader': 'LBK',
            'uploader_id': 'UC21nz3_MesPLqtDqwdvnoxA',
            'description': 'md5:da521864744d60a198e3a88af4db0d9d',
        }
    }, {
        'url': 'TLGGrESM50VT6acwMjAyMjAxNw',
        'only_matching': True,
    }, {
        # music album playlist
        'url': 'OLAK5uy_m4xAFdmMC5rX3Ji3g93pQe3hqLZw_9LhM',
        'only_matching': True,
    }]

    @classmethod
    def suitable(cls, url):
        """Yield to YoutubeTabIE, and refuse URLs that carry a video id."""
        if YoutubeTabIE.suitable(url):
            return False
        # Hack for lazy extractors until more generic solution is implemented
        # (see #28780)
        from .youtube import parse_qs
        if parse_qs(url).get('v', [None])[0]:
            return False
        return super(YoutubePlaylistIE, cls).suitable(url)

    def _real_extract(self, url):
        """Normalize any playlist reference into a canonical /playlist URL."""
        playlist_id = self._match_id(url)
        # Must be checked against the original URL, before it is rewritten
        came_from_music = YoutubeBaseInfoExtractor.is_music_url(url)
        query = parse_qs(url) or {'list': playlist_id}
        url = update_url_query('https://www.youtube.com/playlist', query)
        if came_from_music:
            url = smuggle_url(url, {'is_music_url': True})
        return self.url_result(url, ie=YoutubeTabIE.ie_key(), video_id=playlist_id)
4354
4355
class YoutubeYtBeIE(InfoExtractor):
    IE_DESC = 'youtu.be'
    _VALID_URL = r'https?://youtu\.be/(?P<id>[0-9A-Za-z_-]{11})/*?.*?\blist=(?P<playlist_id>%(playlist_id)s)' % {'playlist_id': YoutubeBaseInfoExtractor._PLAYLIST_ID_RE}
    _TESTS = [{
        'url': 'https://youtu.be/yeWKywCrFtk?list=PL2qgrgXsNUG5ig9cat4ohreBjYLAPC0J5',
        'info_dict': {
            'id': 'yeWKywCrFtk',
            'ext': 'mp4',
            'title': 'Small Scale Baler and Braiding Rugs',
            'uploader': 'Backus-Page House Museum',
            'uploader_id': 'backuspagemuseum',
            'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/backuspagemuseum',
            'upload_date': '20161008',
            'description': 'md5:800c0c78d5eb128500bffd4f0b4f2e8a',
            'categories': ['Nonprofits & Activism'],
            'tags': list,
            'like_count': int,
            'dislike_count': int,
        },
        'params': {
            'noplaylist': True,
            'skip_download': True,
        },
    }, {
        'url': 'https://youtu.be/uWyaPkt-VOI?list=PL9D9FC436B881BA21',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Rewrite a youtu.be short link with ?list= into a full watch URL."""
        mobj = re.match(self._VALID_URL, url)
        video_id, playlist_id = mobj.group('id'), mobj.group('playlist_id')
        watch_url = update_url_query('https://www.youtube.com/watch', {
            'v': video_id,
            'list': playlist_id,
            'feature': 'youtu.be',
        })
        return self.url_result(watch_url, ie=YoutubeTabIE.ie_key(), video_id=playlist_id)
4394
4395
class YoutubeYtUserIE(InfoExtractor):
    IE_DESC = 'YouTube.com user videos, URL or "ytuser" keyword'
    _VALID_URL = r'ytuser:(?P<id>.+)'
    _TESTS = [{
        'url': 'ytuser:phihag',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Map "ytuser:<name>" onto the corresponding /user/ channel page."""
        user_id = self._match_id(url)
        channel_url = 'https://www.youtube.com/user/%s' % user_id
        return self.url_result(channel_url, ie=YoutubeTabIE.ie_key(), video_id=user_id)
4409
4410
class YoutubeFavouritesIE(YoutubeBaseInfoExtractor):
    IE_NAME = 'youtube:favorites'
    IE_DESC = 'YouTube.com liked videos, ":ytfav" for short (requires authentication)'
    _VALID_URL = r':ytfav(?:ou?rite)?s?'
    _LOGIN_REQUIRED = True
    _TESTS = [{
        'url': ':ytfav',
        'only_matching': True,
    }, {
        'url': ':ytfavorites',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Redirect ":ytfav" to the liked-videos (LL) playlist."""
        liked_videos_url = 'https://www.youtube.com/playlist?list=LL'
        return self.url_result(liked_videos_url, ie=YoutubeTabIE.ie_key())
4428
4429
class YoutubeSearchIE(SearchInfoExtractor, YoutubeTabIE):
    IE_DESC = 'YouTube.com searches, "ytsearch" keyword'
    # there doesn't appear to be a real limit, for example if you search for
    # 'python' you get more than 8.000.000 results
    _MAX_RESULTS = float('inf')
    IE_NAME = 'youtube:search'
    _SEARCH_KEY = 'ytsearch'
    # Optional extra 'params' value sent with the innertube search request;
    # None means no extra params (see YoutubeSearchDateIE for an override)
    _SEARCH_PARAMS = None
    _TESTS = []

    def _entries(self, query, n):
        """Yield up to *n* video results for *query*, following search
        continuations page by page via the innertube 'search' endpoint."""
        data = {'query': query}
        if self._SEARCH_PARAMS:
            data['params'] = self._SEARCH_PARAMS
        total = 0
        continuation = {}
        for page_num in itertools.count(1):
            data.update(continuation)
            search = self._extract_response(
                item_id='query "%s" page %s' % (query, page_num), ep='search', query=data,
                check_get_keys=('contents', 'onResponseReceivedCommands')
            )
            if not search:
                break
            # First page carries results under sectionListRenderer;
            # continuation pages under appendContinuationItemsAction
            slr_contents = try_get(
                search,
                (lambda x: x['contents']['twoColumnSearchResultsRenderer']['primaryContents']['sectionListRenderer']['contents'],
                 lambda x: x['onResponseReceivedCommands'][0]['appendContinuationItemsAction']['continuationItems']),
                list)
            if not slr_contents:
                break

            # Youtube sometimes adds promoted content to searches,
            # changing the index location of videos and token.
            # So we search through all entries till we find them.
            continuation = None
            for slr_content in slr_contents:
                if not continuation:
                    continuation = self._extract_continuation({'contents': [slr_content]})

                isr_contents = try_get(
                    slr_content,
                    lambda x: x['itemSectionRenderer']['contents'],
                    list)
                if not isr_contents:
                    continue
                for content in isr_contents:
                    if not isinstance(content, dict):
                        continue
                    video = content.get('videoRenderer')
                    if not isinstance(video, dict):
                        continue
                    video_id = video.get('videoId')
                    if not video_id:
                        continue

                    yield self._extract_video(video)
                    total += 1
                    if total == n:
                        return

            if not continuation:
                # No continuation token anywhere in this page -> last page
                break

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""
        return self.playlist_result(self._entries(query, n), query, query)
4497
4498
class YoutubeSearchDateIE(YoutubeSearchIE):
    # Same search as YoutubeSearchIE, but sorted with newest videos first
    IE_NAME = YoutubeSearchIE.IE_NAME + ':date'
    _SEARCH_KEY = 'ytsearchdate'
    IE_DESC = 'YouTube.com searches, newest videos first, "ytsearchdate" keyword'
    # URL-encoded opaque filter blob passed as the innertube search 'params';
    # presumably encodes "sort by upload date" — confirm against the web UI
    _SEARCH_PARAMS = 'CAI%3D'
4504
4505
class YoutubeSearchURLIE(YoutubeSearchIE):
    IE_DESC = 'YouTube.com search URLs'
    IE_NAME = YoutubeSearchIE.IE_NAME + '_url'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/results\?(.*?&)?(?:search_query|q)=(?:[^&]+)(?:[&]|$)'
    # _MAX_RESULTS = 100
    _TESTS = [{
        'url': 'https://www.youtube.com/results?baz=bar&search_query=youtube-dl+test+video&filters=video&lclk=video',
        'playlist_mincount': 5,
        'info_dict': {
            'id': 'youtube-dl test video',
            'title': 'youtube-dl test video',
        }
    }, {
        'url': 'https://www.youtube.com/results?q=test&sp=EgQIBBgB',
        'only_matching': True,
    }]

    @classmethod
    def _make_valid_url(cls):
        # Search URLs are matched directly by _VALID_URL rather than being
        # built from the _SEARCH_KEY prefix
        return cls._VALID_URL

    def _real_extract(self, url):
        """Pull the search term (and optional 'sp' filter) out of a /results URL."""
        query_dict = compat_parse_qs(compat_urllib_parse_urlparse(url).query)
        search_term = (query_dict.get('search_query') or query_dict.get('q'))[0]
        self._SEARCH_PARAMS = query_dict.get('sp', ('',))[0]
        return self._get_n_results(search_term, self._MAX_RESULTS)
4532
4533
class YoutubeFeedsInfoExtractor(YoutubeTabIE):
    """
    Common base for the YouTube feed extractors.
    Subclasses must define the _FEED_NAME property.
    """
    _LOGIN_REQUIRED = True
    _TESTS = []

    @property
    def IE_NAME(self):
        # Derived from the feed, e.g. 'youtube:recommended'
        return 'youtube:%s' % self._FEED_NAME

    def _real_extract(self, url):
        feed_url = 'https://www.youtube.com/feed/%s' % self._FEED_NAME
        return self.url_result(feed_url, ie=YoutubeTabIE.ie_key())
4550
4551
class YoutubeWatchLaterIE(InfoExtractor):
    IE_NAME = 'youtube:watchlater'
    IE_DESC = 'Youtube watch later list, ":ytwatchlater" for short (requires authentication)'
    _VALID_URL = r':ytwatchlater'
    _TESTS = [{
        'url': ':ytwatchlater',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Redirect ":ytwatchlater" to the special WL playlist."""
        watch_later_url = 'https://www.youtube.com/playlist?list=WL'
        return self.url_result(watch_later_url, ie=YoutubeTabIE.ie_key())
4564
4565
class YoutubeRecommendedIE(YoutubeFeedsInfoExtractor):
    IE_DESC = 'YouTube.com recommended videos, ":ytrec" for short (requires authentication)'
    # Also matches the bare youtube.com homepage (nothing after the host)
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/?(?:[?#]|$)|:ytrec(?:ommended)?'
    _FEED_NAME = 'recommended'  # -> https://www.youtube.com/feed/recommended
    _LOGIN_REQUIRED = False  # overrides the base class: login is not enforced here
    _TESTS = [{
        'url': ':ytrec',
        'only_matching': True,
    }, {
        'url': ':ytrecommended',
        'only_matching': True,
    }, {
        'url': 'https://youtube.com',
        'only_matching': True,
    }]
4581
4582
class YoutubeSubscriptionsIE(YoutubeFeedsInfoExtractor):
    IE_DESC = 'YouTube.com subscriptions feed, ":ytsubs" for short (requires authentication)'
    _VALID_URL = r':ytsub(?:scription)?s?'
    _FEED_NAME = 'subscriptions'  # -> https://www.youtube.com/feed/subscriptions
    _TESTS = [{
        'url': ':ytsubs',
        'only_matching': True,
    }, {
        'url': ':ytsubscriptions',
        'only_matching': True,
    }]
4594
4595
class YoutubeHistoryIE(YoutubeFeedsInfoExtractor):
    IE_DESC = 'Youtube watch history, ":ythis" for short (requires authentication)'
    _VALID_URL = r':ythis(?:tory)?'
    _FEED_NAME = 'history'  # -> https://www.youtube.com/feed/history
    _TESTS = [{
        'url': ':ythistory',
        'only_matching': True,
    }]
4604
4605
class YoutubeTruncatedURLIE(InfoExtractor):
    IE_NAME = 'youtube:truncated_url'
    IE_DESC = False  # Do not list
    # Matches watch/attribution_link URLs that carry auxiliary parameters but
    # no video id — typically because an unquoted '&' was eaten by the shell
    _VALID_URL = r'''(?x)
        (?:https?://)?
        (?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie)?\.com/
        (?:watch\?(?:
            feature=[a-z_]+|
            annotation_id=annotation_[^&]+|
            x-yt-cl=[0-9]+|
            hl=[^&]*|
            t=[0-9]+
        )?
        |
            attribution_link\?a=[^&]+
        )
        $
    '''

    _TESTS = [{
        'url': 'https://www.youtube.com/watch?annotation_id=annotation_3951667041',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/watch?',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/watch?x-yt-cl=84503534',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/watch?feature=foo',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/watch?hl=en-GB',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/watch?t=2372',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        # Nothing can be extracted from such URLs; fail with shell-quoting advice
        raise ExtractorError(
            'Did you forget to quote the URL? Remember that & is a meta '
            'character in most shells, so you want to put the URL in quotes, '
            'like youtube-dl '
            '"https://www.youtube.com/watch?feature=foo&v=BaW_jenozKc" '
            ' or simply youtube-dl BaW_jenozKc .',
            expected=True)
4653
4654
class YoutubeTruncatedIDIE(InfoExtractor):
    IE_NAME = 'youtube:truncated_id'
    IE_DESC = False  # Do not list
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/watch\?v=(?P<id>[0-9A-Za-z_-]{1,10})$'

    _TESTS = [{
        'url': 'https://www.youtube.com/watch?v=N_708QY7Ob',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Always fail: the watch URL's video id is 1-10 chars, i.e. cut short."""
        truncated_id = self._match_id(url)
        raise ExtractorError(
            'Incomplete YouTube ID %s. URL %s looks truncated.' % (truncated_id, url),
            expected=True)