]> jfr.im git - yt-dlp.git/blob - yt_dlp/extractor/youtube.py
[test] Add Python 3.10 (#480)
[yt-dlp.git] / yt_dlp / extractor / youtube.py
1 # coding: utf-8
2
3 from __future__ import unicode_literals
4
5 import base64
6 import calendar
7 import copy
8 import datetime
9 import hashlib
10 import itertools
11 import json
12 import os.path
13 import random
14 import re
15 import time
16 import traceback
17
18 from .common import InfoExtractor, SearchInfoExtractor
19 from ..compat import (
20 compat_chr,
21 compat_HTTPError,
22 compat_parse_qs,
23 compat_str,
24 compat_urllib_parse_unquote_plus,
25 compat_urllib_parse_urlencode,
26 compat_urllib_parse_urlparse,
27 compat_urlparse,
28 )
29 from ..jsinterp import JSInterpreter
30 from ..utils import (
31 bytes_to_intlist,
32 clean_html,
33 datetime_from_str,
34 dict_get,
35 error_to_compat_str,
36 ExtractorError,
37 float_or_none,
38 format_field,
39 int_or_none,
40 intlist_to_bytes,
41 mimetype2ext,
42 network_exceptions,
43 orderedSet,
44 parse_codecs,
45 parse_count,
46 parse_duration,
47 parse_iso8601,
48 qualities,
49 remove_start,
50 smuggle_url,
51 str_or_none,
52 str_to_int,
53 traverse_obj,
54 try_get,
55 unescapeHTML,
56 unified_strdate,
57 unsmuggle_url,
58 update_url_query,
59 url_or_none,
60 urlencode_postdata,
61 urljoin,
62 variadic,
63 )
64
65
def parse_qs(url):
    """Return the query string of *url* parsed into a dict of value lists."""
    query_string = compat_urlparse.urlparse(url).query
    return compat_urlparse.parse_qs(query_string)
69
class YoutubeBaseInfoExtractor(InfoExtractor):
    """Provide base functions for Youtube extractors"""

    # Google account login endpoints (legacy; the username/password flow is
    # broken - see _login below)
    _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
    _TWOFACTOR_URL = 'https://accounts.google.com/signin/challenge'

    _LOOKUP_URL = 'https://accounts.google.com/_/signin/sl/lookup'
    _CHALLENGE_URL = 'https://accounts.google.com/_/signin/sl/challenge'
    _TFA_URL = 'https://accounts.google.com/_/signin/challenge?hl=en&TL={0}'

    # URL path components that can never be a channel/user name
    _RESERVED_NAMES = (
        r'channel|c|user|browse|playlist|watch|w|v|embed|e|watch_popup|shorts|'
        r'movies|results|shared|hashtag|trending|feed|feeds|oembed|get_video_info|'
        r'storefront|oops|index|account|reporthistory|t/terms|about|upload|signin|logout')

    _NETRC_MACHINE = 'youtube'
    # If True it will raise an error if no login info is provided
    _LOGIN_REQUIRED = False

    # Matches all known playlist-ID prefixes plus the special lists
    # (RDMM = "my mix", WL = watch later, LL = liked, LM = liked music)
    _PLAYLIST_ID_RE = r'(?:(?:PL|LL|EC|UU|FL|RD|UL|TL|PU|OLAK5uy_)[0-9A-Za-z-_]{10,}|RDMM|WL|LL|LM)'
90 def _login(self):
91 """
92 Attempt to log in to YouTube.
93 True is returned if successful or skipped.
94 False is returned if login failed.
95
96 If _LOGIN_REQUIRED is set and no authentication was provided, an error is raised.
97 """
98
99 def warn(message):
100 self.report_warning(message)
101
102 # username+password login is broken
103 if (self._LOGIN_REQUIRED
104 and self.get_param('cookiefile') is None
105 and self.get_param('cookiesfrombrowser') is None):
106 self.raise_login_required(
107 'Login details are needed to download this content', method='cookies')
108 username, password = self._get_login_info()
109 if username:
110 warn('Logging in using username and password is broken. %s' % self._LOGIN_HINTS['cookies'])
111 return
112
113 # Everything below this is broken!
114 r'''
115 # No authentication to be performed
116 if username is None:
117 if self._LOGIN_REQUIRED and self.get_param('cookiefile') is None:
118 raise ExtractorError('No login info available, needed for using %s.' % self.IE_NAME, expected=True)
119 # if self.get_param('cookiefile'): # TODO remove 'and False' later - too many people using outdated cookies and open issues, remind them.
120 # self.to_screen('[Cookies] Reminder - Make sure to always use up to date cookies!')
121 return True
122
123 login_page = self._download_webpage(
124 self._LOGIN_URL, None,
125 note='Downloading login page',
126 errnote='unable to fetch login page', fatal=False)
127 if login_page is False:
128 return
129
130 login_form = self._hidden_inputs(login_page)
131
132 def req(url, f_req, note, errnote):
133 data = login_form.copy()
134 data.update({
135 'pstMsg': 1,
136 'checkConnection': 'youtube',
137 'checkedDomains': 'youtube',
138 'hl': 'en',
139 'deviceinfo': '[null,null,null,[],null,"US",null,null,[],"GlifWebSignIn",null,[null,null,[]]]',
140 'f.req': json.dumps(f_req),
141 'flowName': 'GlifWebSignIn',
142 'flowEntry': 'ServiceLogin',
143 # TODO: reverse actual botguard identifier generation algo
144 'bgRequest': '["identifier",""]',
145 })
146 return self._download_json(
147 url, None, note=note, errnote=errnote,
148 transform_source=lambda s: re.sub(r'^[^[]*', '', s),
149 fatal=False,
150 data=urlencode_postdata(data), headers={
151 'Content-Type': 'application/x-www-form-urlencoded;charset=utf-8',
152 'Google-Accounts-XSRF': 1,
153 })
154
155 lookup_req = [
156 username,
157 None, [], None, 'US', None, None, 2, False, True,
158 [
159 None, None,
160 [2, 1, None, 1,
161 'https://accounts.google.com/ServiceLogin?passive=true&continue=https%3A%2F%2Fwww.youtube.com%2Fsignin%3Fnext%3D%252F%26action_handle_signin%3Dtrue%26hl%3Den%26app%3Ddesktop%26feature%3Dsign_in_button&hl=en&service=youtube&uilel=3&requestPath=%2FServiceLogin&Page=PasswordSeparationSignIn',
162 None, [], 4],
163 1, [None, None, []], None, None, None, True
164 ],
165 username,
166 ]
167
168 lookup_results = req(
169 self._LOOKUP_URL, lookup_req,
170 'Looking up account info', 'Unable to look up account info')
171
172 if lookup_results is False:
173 return False
174
175 user_hash = try_get(lookup_results, lambda x: x[0][2], compat_str)
176 if not user_hash:
177 warn('Unable to extract user hash')
178 return False
179
180 challenge_req = [
181 user_hash,
182 None, 1, None, [1, None, None, None, [password, None, True]],
183 [
184 None, None, [2, 1, None, 1, 'https://accounts.google.com/ServiceLogin?passive=true&continue=https%3A%2F%2Fwww.youtube.com%2Fsignin%3Fnext%3D%252F%26action_handle_signin%3Dtrue%26hl%3Den%26app%3Ddesktop%26feature%3Dsign_in_button&hl=en&service=youtube&uilel=3&requestPath=%2FServiceLogin&Page=PasswordSeparationSignIn', None, [], 4],
185 1, [None, None, []], None, None, None, True
186 ]]
187
188 challenge_results = req(
189 self._CHALLENGE_URL, challenge_req,
190 'Logging in', 'Unable to log in')
191
192 if challenge_results is False:
193 return
194
195 login_res = try_get(challenge_results, lambda x: x[0][5], list)
196 if login_res:
197 login_msg = try_get(login_res, lambda x: x[5], compat_str)
198 warn(
199 'Unable to login: %s' % 'Invalid password'
200 if login_msg == 'INCORRECT_ANSWER_ENTERED' else login_msg)
201 return False
202
203 res = try_get(challenge_results, lambda x: x[0][-1], list)
204 if not res:
205 warn('Unable to extract result entry')
206 return False
207
208 login_challenge = try_get(res, lambda x: x[0][0], list)
209 if login_challenge:
210 challenge_str = try_get(login_challenge, lambda x: x[2], compat_str)
211 if challenge_str == 'TWO_STEP_VERIFICATION':
212 # SEND_SUCCESS - TFA code has been successfully sent to phone
213 # QUOTA_EXCEEDED - reached the limit of TFA codes
214 status = try_get(login_challenge, lambda x: x[5], compat_str)
215 if status == 'QUOTA_EXCEEDED':
216 warn('Exceeded the limit of TFA codes, try later')
217 return False
218
219 tl = try_get(challenge_results, lambda x: x[1][2], compat_str)
220 if not tl:
221 warn('Unable to extract TL')
222 return False
223
224 tfa_code = self._get_tfa_info('2-step verification code')
225
226 if not tfa_code:
227 warn(
228 'Two-factor authentication required. Provide it either interactively or with --twofactor <code>'
229 '(Note that only TOTP (Google Authenticator App) codes work at this time.)')
230 return False
231
232 tfa_code = remove_start(tfa_code, 'G-')
233
234 tfa_req = [
235 user_hash, None, 2, None,
236 [
237 9, None, None, None, None, None, None, None,
238 [None, tfa_code, True, 2]
239 ]]
240
241 tfa_results = req(
242 self._TFA_URL.format(tl), tfa_req,
243 'Submitting TFA code', 'Unable to submit TFA code')
244
245 if tfa_results is False:
246 return False
247
248 tfa_res = try_get(tfa_results, lambda x: x[0][5], list)
249 if tfa_res:
250 tfa_msg = try_get(tfa_res, lambda x: x[5], compat_str)
251 warn(
252 'Unable to finish TFA: %s' % 'Invalid TFA code'
253 if tfa_msg == 'INCORRECT_ANSWER_ENTERED' else tfa_msg)
254 return False
255
256 check_cookie_url = try_get(
257 tfa_results, lambda x: x[0][-1][2], compat_str)
258 else:
259 CHALLENGES = {
260 'LOGIN_CHALLENGE': "This device isn't recognized. For your security, Google wants to make sure it's really you.",
261 'USERNAME_RECOVERY': 'Please provide additional information to aid in the recovery process.',
262 'REAUTH': "There is something unusual about your activity. For your security, Google wants to make sure it's really you.",
263 }
264 challenge = CHALLENGES.get(
265 challenge_str,
266 '%s returned error %s.' % (self.IE_NAME, challenge_str))
267 warn('%s\nGo to https://accounts.google.com/, login and solve a challenge.' % challenge)
268 return False
269 else:
270 check_cookie_url = try_get(res, lambda x: x[2], compat_str)
271
272 if not check_cookie_url:
273 warn('Unable to extract CheckCookie URL')
274 return False
275
276 check_cookie_results = self._download_webpage(
277 check_cookie_url, None, 'Checking cookie', fatal=False)
278
279 if check_cookie_results is False:
280 return False
281
282 if 'https://myaccount.google.com/' not in check_cookie_results:
283 warn('Unable to log in')
284 return False
285
286 return True
287 '''
288
289 def _initialize_consent(self):
290 cookies = self._get_cookies('https://www.youtube.com/')
291 if cookies.get('__Secure-3PSID'):
292 return
293 consent_id = None
294 consent = cookies.get('CONSENT')
295 if consent:
296 if 'YES' in consent.value:
297 return
298 consent_id = self._search_regex(
299 r'PENDING\+(\d+)', consent.value, 'consent', default=None)
300 if not consent_id:
301 consent_id = random.randint(100, 999)
302 self._set_cookie('.youtube.com', 'CONSENT', 'YES+cb.20210328-17-p0.en+FX+%s' % consent_id)
303
304 def _real_initialize(self):
305 self._initialize_consent()
306 if self._downloader is None:
307 return
308 if not self._login():
309 return
310
    # Regexes locating the big JSON blobs embedded in watch/browse pages
    _YT_INITIAL_DATA_RE = r'(?:window\s*\[\s*["\']ytInitialData["\']\s*\]|ytInitialData)\s*=\s*({.+?})\s*;'
    _YT_INITIAL_PLAYER_RESPONSE_RE = r'ytInitialPlayerResponse\s*=\s*({.+?})\s*;'
    # Matches what follows the JSON object, delimiting the non-greedy match
    _YT_INITIAL_BOUNDARY_RE = r'(?:var\s+meta|</script|\n)'

    # Built-in innertube configuration per client, used as fallback when the
    # page's own ytcfg is unavailable or lacks a value (see _ytcfg_get_safe)
    _YT_DEFAULT_YTCFGS = {
        'WEB': {
            'INNERTUBE_API_VERSION': 'v1',
            'INNERTUBE_CLIENT_NAME': 'WEB',
            'INNERTUBE_CLIENT_VERSION': '2.20210622.10.00',
            'INNERTUBE_API_KEY': 'AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8',
            'INNERTUBE_CONTEXT': {
                'client': {
                    'clientName': 'WEB',
                    'clientVersion': '2.20210622.10.00',
                    'hl': 'en',
                }
            },
            'INNERTUBE_CONTEXT_CLIENT_NAME': 1
        },
        'WEB_REMIX': {
            'INNERTUBE_API_VERSION': 'v1',
            'INNERTUBE_CLIENT_NAME': 'WEB_REMIX',
            'INNERTUBE_CLIENT_VERSION': '1.20210621.00.00',
            'INNERTUBE_API_KEY': 'AIzaSyC9XL3ZjWddXya6X74dJoCTL-WEYFDNX30',
            'INNERTUBE_CONTEXT': {
                'client': {
                    'clientName': 'WEB_REMIX',
                    'clientVersion': '1.20210621.00.00',
                    'hl': 'en',
                }
            },
            'INNERTUBE_CONTEXT_CLIENT_NAME': 67
        },
        'WEB_EMBEDDED_PLAYER': {
            'INNERTUBE_API_VERSION': 'v1',
            'INNERTUBE_CLIENT_NAME': 'WEB_EMBEDDED_PLAYER',
            'INNERTUBE_CLIENT_VERSION': '1.20210620.0.1',
            'INNERTUBE_API_KEY': 'AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8',
            'INNERTUBE_CONTEXT': {
                'client': {
                    'clientName': 'WEB_EMBEDDED_PLAYER',
                    'clientVersion': '1.20210620.0.1',
                    'hl': 'en',
                }
            },
            'INNERTUBE_CONTEXT_CLIENT_NAME': 56
        },
        'ANDROID': {
            'INNERTUBE_API_VERSION': 'v1',
            'INNERTUBE_CLIENT_NAME': 'ANDROID',
            'INNERTUBE_CLIENT_VERSION': '16.20',
            'INNERTUBE_API_KEY': 'AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8',
            'INNERTUBE_CONTEXT': {
                'client': {
                    'clientName': 'ANDROID',
                    'clientVersion': '16.20',
                    'hl': 'en',
                }
            },
            'INNERTUBE_CONTEXT_CLIENT_NAME': 3
        },
        'ANDROID_EMBEDDED_PLAYER': {
            'INNERTUBE_API_VERSION': 'v1',
            'INNERTUBE_CLIENT_NAME': 'ANDROID_EMBEDDED_PLAYER',
            'INNERTUBE_CLIENT_VERSION': '16.20',
            'INNERTUBE_API_KEY': 'AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8',
            'INNERTUBE_CONTEXT': {
                'client': {
                    'clientName': 'ANDROID_EMBEDDED_PLAYER',
                    'clientVersion': '16.20',
                    'hl': 'en',
                }
            },
            'INNERTUBE_CONTEXT_CLIENT_NAME': 55
        },
        'ANDROID_MUSIC': {
            'INNERTUBE_API_VERSION': 'v1',
            'INNERTUBE_CLIENT_NAME': 'ANDROID_MUSIC',
            'INNERTUBE_CLIENT_VERSION': '4.32',
            'INNERTUBE_API_KEY': 'AIzaSyC9XL3ZjWddXya6X74dJoCTL-WEYFDNX30',
            'INNERTUBE_CONTEXT': {
                'client': {
                    'clientName': 'ANDROID_MUSIC',
                    'clientVersion': '4.32',
                    'hl': 'en',
                }
            },
            'INNERTUBE_CONTEXT_CLIENT_NAME': 21
        },
        'IOS': {
            'INNERTUBE_API_VERSION': 'v1',
            'INNERTUBE_CLIENT_NAME': 'IOS',
            'INNERTUBE_CLIENT_VERSION': '16.20',
            'INNERTUBE_API_KEY': 'AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8',
            'INNERTUBE_CONTEXT': {
                'client': {
                    'clientName': 'IOS',
                    'clientVersion': '16.20',
                    'hl': 'en',
                }
            },
            'INNERTUBE_CONTEXT_CLIENT_NAME': 5
        },
        'IOS_MUSIC': {
            'INNERTUBE_API_VERSION': 'v1',
            'INNERTUBE_CLIENT_NAME': 'IOS_MUSIC',
            'INNERTUBE_CLIENT_VERSION': '4.32',
            'INNERTUBE_API_KEY': 'AIzaSyDK3iBpDP9nHVTk2qL73FLJICfOC3c51Og',
            'INNERTUBE_CONTEXT': {
                'client': {
                    'clientName': 'IOS_MUSIC',
                    'clientVersion': '4.32',
                    'hl': 'en',
                }
            },
            'INNERTUBE_CONTEXT_CLIENT_NAME': 26
        },
        'IOS_MESSAGES_EXTENSION': {
            'INNERTUBE_API_VERSION': 'v1',
            'INNERTUBE_CLIENT_NAME': 'IOS_MESSAGES_EXTENSION',
            'INNERTUBE_CLIENT_VERSION': '16.20',
            'INNERTUBE_API_KEY': 'AIzaSyDCU8hByM-4DrUqRUYnGn-3llEO78bcxq8',
            'INNERTUBE_CONTEXT': {
                'client': {
                    'clientName': 'IOS_MESSAGES_EXTENSION',
                    'clientVersion': '16.20',
                    'hl': 'en',
                }
            },
            'INNERTUBE_CONTEXT_CLIENT_NAME': 66
        }
    }

    # API host per client; clients not listed here use the WEB host
    # (see _get_innertube_host)
    _YT_DEFAULT_INNERTUBE_HOSTS = {
        'DIRECT': 'youtubei.googleapis.com',
        'WEB': 'www.youtube.com',
        'WEB_REMIX': 'music.youtube.com',
        'ANDROID_MUSIC': 'music.youtube.com'
    }

    # clients starting with _ cannot be explicitly requested by the user
    _YT_CLIENTS = {
        'web': 'WEB',
        'web_music': 'WEB_REMIX',
        '_web_embedded': 'WEB_EMBEDDED_PLAYER',
        '_web_agegate': 'TVHTML5',
        'android': 'ANDROID',
        'android_music': 'ANDROID_MUSIC',
        '_android_embedded': 'ANDROID_EMBEDDED_PLAYER',
        '_android_agegate': 'ANDROID',
        'ios': 'IOS',
        'ios_music': 'IOS_MUSIC',
        '_ios_embedded': 'IOS_MESSAGES_EXTENSION',
        '_ios_agegate': 'IOS'
    }
467
468 def _get_default_ytcfg(self, client='WEB'):
469 if client in self._YT_DEFAULT_YTCFGS:
470 return copy.deepcopy(self._YT_DEFAULT_YTCFGS[client])
471 self.write_debug(f'INNERTUBE default client {client} does not exist - falling back to WEB client.')
472 return copy.deepcopy(self._YT_DEFAULT_YTCFGS['WEB'])
473
474 def _get_innertube_host(self, client='WEB'):
475 return dict_get(self._YT_DEFAULT_INNERTUBE_HOSTS, (client, 'WEB'))
476
477 def _ytcfg_get_safe(self, ytcfg, getter, expected_type=None, default_client='WEB'):
478 # try_get but with fallback to default ytcfg client values when present
479 _func = lambda y: try_get(y, getter, expected_type)
480 return _func(ytcfg) or _func(self._get_default_ytcfg(default_client))
481
482 def _extract_client_name(self, ytcfg, default_client='WEB'):
483 return self._ytcfg_get_safe(ytcfg, lambda x: x['INNERTUBE_CLIENT_NAME'], compat_str, default_client)
484
485 @staticmethod
486 def _extract_session_index(*data):
487 for ytcfg in data:
488 session_index = int_or_none(try_get(ytcfg, lambda x: x['SESSION_INDEX']))
489 if session_index is not None:
490 return session_index
491
492 def _extract_client_version(self, ytcfg, default_client='WEB'):
493 return self._ytcfg_get_safe(ytcfg, lambda x: x['INNERTUBE_CLIENT_VERSION'], compat_str, default_client)
494
495 def _extract_api_key(self, ytcfg=None, default_client='WEB'):
496 return self._ytcfg_get_safe(ytcfg, lambda x: x['INNERTUBE_API_KEY'], compat_str, default_client)
497
498 def _extract_context(self, ytcfg=None, default_client='WEB'):
499 _get_context = lambda y: try_get(y, lambda x: x['INNERTUBE_CONTEXT'], dict)
500 context = _get_context(ytcfg)
501 if context:
502 return context
503
504 context = _get_context(self._get_default_ytcfg(default_client))
505 if not ytcfg:
506 return context
507
508 # Recreate the client context (required)
509 context['client'].update({
510 'clientVersion': self._extract_client_version(ytcfg, default_client),
511 'clientName': self._extract_client_name(ytcfg, default_client),
512 })
513 visitor_data = try_get(ytcfg, lambda x: x['VISITOR_DATA'], compat_str)
514 if visitor_data:
515 context['client']['visitorData'] = visitor_data
516 return context
517
518 def _generate_sapisidhash_header(self, origin='https://www.youtube.com'):
519 # Sometimes SAPISID cookie isn't present but __Secure-3PAPISID is.
520 # See: https://github.com/yt-dlp/yt-dlp/issues/393
521 yt_cookies = self._get_cookies('https://www.youtube.com')
522 sapisid_cookie = dict_get(
523 yt_cookies, ('__Secure-3PAPISID', 'SAPISID'))
524 if sapisid_cookie is None or not sapisid_cookie.value:
525 return
526 time_now = round(time.time())
527 # SAPISID cookie is required if not already present
528 if not yt_cookies.get('SAPISID'):
529 self.write_debug('Copying __Secure-3PAPISID cookie to SAPISID cookie', only_once=True)
530 self._set_cookie(
531 '.youtube.com', 'SAPISID', sapisid_cookie.value, secure=True, expire_time=time_now + 3600)
532 self.write_debug('Extracted SAPISID cookie', only_once=True)
533 # SAPISIDHASH algorithm from https://stackoverflow.com/a/32065323
534 sapisidhash = hashlib.sha1(
535 f'{time_now} {sapisid_cookie.value} {origin}'.encode('utf-8')).hexdigest()
536 return f'SAPISIDHASH {time_now}_{sapisidhash}'
537
538 def _call_api(self, ep, query, video_id, fatal=True, headers=None,
539 note='Downloading API JSON', errnote='Unable to download API page',
540 context=None, api_key=None, api_hostname=None, default_client='WEB'):
541
542 data = {'context': context} if context else {'context': self._extract_context(default_client=default_client)}
543 data.update(query)
544 real_headers = self.generate_api_headers(default_client=default_client)
545 real_headers.update({'content-type': 'application/json'})
546 if headers:
547 real_headers.update(headers)
548 return self._download_json(
549 'https://%s/youtubei/v1/%s' % (api_hostname or self._get_innertube_host(default_client), ep),
550 video_id=video_id, fatal=fatal, note=note, errnote=errnote,
551 data=json.dumps(data).encode('utf8'), headers=real_headers,
552 query={'key': api_key or self._extract_api_key()})
553
554 def extract_yt_initial_data(self, video_id, webpage):
555 return self._parse_json(
556 self._search_regex(
557 (r'%s\s*%s' % (self._YT_INITIAL_DATA_RE, self._YT_INITIAL_BOUNDARY_RE),
558 self._YT_INITIAL_DATA_RE), webpage, 'yt initial data'),
559 video_id)
560
561 def _extract_identity_token(self, webpage, item_id):
562 if not webpage:
563 return None
564 ytcfg = self.extract_ytcfg(item_id, webpage)
565 if ytcfg:
566 token = try_get(ytcfg, lambda x: x['ID_TOKEN'], compat_str)
567 if token:
568 return token
569 return self._search_regex(
570 r'\bID_TOKEN["\']\s*:\s*["\'](.+?)["\']', webpage,
571 'identity token', default=None)
572
573 @staticmethod
574 def _extract_account_syncid(*args):
575 """
576 Extract syncId required to download private playlists of secondary channels
577 @params response and/or ytcfg
578 """
579 for data in args:
580 # ytcfg includes channel_syncid if on secondary channel
581 delegated_sid = try_get(data, lambda x: x['DELEGATED_SESSION_ID'], compat_str)
582 if delegated_sid:
583 return delegated_sid
584 sync_ids = (try_get(
585 data, (lambda x: x['responseContext']['mainAppWebResponseContext']['datasyncId'],
586 lambda x: x['DATASYNC_ID']), compat_str) or '').split("||")
587 if len(sync_ids) >= 2 and sync_ids[1]:
588 # datasyncid is of the form "channel_syncid||user_syncid" for secondary channel
589 # and just "user_syncid||" for primary channel. We only want the channel_syncid
590 return sync_ids[0]
591
592 def extract_ytcfg(self, video_id, webpage):
593 if not webpage:
594 return {}
595 return self._parse_json(
596 self._search_regex(
597 r'ytcfg\.set\s*\(\s*({.+?})\s*\)\s*;', webpage, 'ytcfg',
598 default='{}'), video_id, fatal=False) or {}
599
600 def generate_api_headers(
601 self, ytcfg=None, identity_token=None, account_syncid=None,
602 visitor_data=None, api_hostname=None, default_client='WEB', session_index=None):
603 origin = 'https://' + (api_hostname if api_hostname else self._get_innertube_host(default_client))
604 headers = {
605 'X-YouTube-Client-Name': compat_str(
606 self._ytcfg_get_safe(ytcfg, lambda x: x['INNERTUBE_CONTEXT_CLIENT_NAME'], default_client=default_client)),
607 'X-YouTube-Client-Version': self._extract_client_version(ytcfg, default_client),
608 'Origin': origin
609 }
610 if not visitor_data and ytcfg:
611 visitor_data = try_get(
612 self._extract_context(ytcfg, default_client), lambda x: x['client']['visitorData'], compat_str)
613 if identity_token:
614 headers['X-Youtube-Identity-Token'] = identity_token
615 if account_syncid:
616 headers['X-Goog-PageId'] = account_syncid
617 if session_index is None and ytcfg:
618 session_index = self._extract_session_index(ytcfg)
619 if account_syncid or session_index is not None:
620 headers['X-Goog-AuthUser'] = session_index if session_index is not None else 0
621 if visitor_data:
622 headers['X-Goog-Visitor-Id'] = visitor_data
623 auth = self._generate_sapisidhash_header(origin)
624 if auth is not None:
625 headers['Authorization'] = auth
626 headers['X-Origin'] = origin
627 return headers
628
629 @staticmethod
630 def _build_api_continuation_query(continuation, ctp=None):
631 query = {
632 'continuation': continuation
633 }
634 # TODO: Inconsistency with clickTrackingParams.
635 # Currently we have a fixed ctp contained within context (from ytcfg)
636 # and a ctp in root query for continuation.
637 if ctp:
638 query['clickTracking'] = {'clickTrackingParams': ctp}
639 return query
640
641 @classmethod
642 def _extract_next_continuation_data(cls, renderer):
643 next_continuation = try_get(
644 renderer, (lambda x: x['continuations'][0]['nextContinuationData'],
645 lambda x: x['continuation']['reloadContinuationData']), dict)
646 if not next_continuation:
647 return
648 continuation = next_continuation.get('continuation')
649 if not continuation:
650 return
651 ctp = next_continuation.get('clickTrackingParams')
652 return cls._build_api_continuation_query(continuation, ctp)
653
654 @classmethod
655 def _extract_continuation_ep_data(cls, continuation_ep: dict):
656 if isinstance(continuation_ep, dict):
657 continuation = try_get(
658 continuation_ep, lambda x: x['continuationCommand']['token'], compat_str)
659 if not continuation:
660 return
661 ctp = continuation_ep.get('clickTrackingParams')
662 return cls._build_api_continuation_query(continuation, ctp)
663
664 @classmethod
665 def _extract_continuation(cls, renderer):
666 next_continuation = cls._extract_next_continuation_data(renderer)
667 if next_continuation:
668 return next_continuation
669
670 contents = []
671 for key in ('contents', 'items'):
672 contents.extend(try_get(renderer, lambda x: x[key], list) or [])
673
674 for content in contents:
675 if not isinstance(content, dict):
676 continue
677 continuation_ep = try_get(
678 content, (lambda x: x['continuationItemRenderer']['continuationEndpoint'],
679 lambda x: x['continuationItemRenderer']['button']['buttonRenderer']['command']),
680 dict)
681 continuation = cls._extract_continuation_ep_data(continuation_ep)
682 if continuation:
683 return continuation
684
685 @classmethod
686 def _extract_alerts(cls, data):
687 for alert_dict in try_get(data, lambda x: x['alerts'], list) or []:
688 if not isinstance(alert_dict, dict):
689 continue
690 for alert in alert_dict.values():
691 alert_type = alert.get('type')
692 if not alert_type:
693 continue
694 message = cls._get_text(alert.get('text'))
695 if message:
696 yield alert_type, message
697
698 def _report_alerts(self, alerts, expected=True):
699 errors = []
700 warnings = []
701 for alert_type, alert_message in alerts:
702 if alert_type.lower() == 'error':
703 errors.append([alert_type, alert_message])
704 else:
705 warnings.append([alert_type, alert_message])
706
707 for alert_type, alert_message in (warnings + errors[:-1]):
708 self.report_warning('YouTube said: %s - %s' % (alert_type, alert_message))
709 if errors:
710 raise ExtractorError('YouTube said: %s' % errors[-1][1], expected=expected)
711
712 def _extract_and_report_alerts(self, data, *args, **kwargs):
713 return self._report_alerts(self._extract_alerts(data), *args, **kwargs)
714
715 def _extract_badges(self, renderer: dict):
716 badges = set()
717 for badge in try_get(renderer, lambda x: x['badges'], list) or []:
718 label = try_get(badge, lambda x: x['metadataBadgeRenderer']['label'], compat_str)
719 if label:
720 badges.add(label.lower())
721 return badges
722
    @staticmethod
    def _get_text(data, getter=None, max_runs=None):
        """Extract display text from a YouTube text object.

        Such objects are either {'simpleText': str} or {'runs': [{'text': str}, ...]}.
        getter: optional accessor(s), as accepted by try_get, applied to *data*;
            each candidate is tried in turn until one yields text.
        max_runs: join at most this many runs (all when None/0).
        """
        for get in variadic(getter):
            d = try_get(data, get) if get is not None else data
            # Simple form first
            text = try_get(d, lambda x: x['simpleText'], compat_str)
            if text:
                return text
            runs = try_get(d, lambda x: x['runs'], list) or []
            if not runs and isinstance(d, list):
                # A bare list is treated as the runs list itself
                runs = d

            def get_runs(runs):
                for run in runs[:min(len(runs), max_runs or len(runs))]:
                    yield try_get(run, lambda x: x['text'], compat_str) or ''

            text = ''.join(get_runs(runs))
            if text:
                return text
741
    def _extract_response(self, item_id, query, note='Downloading API JSON', headers=None,
                          ytcfg=None, check_get_keys=None, ep='browse', fatal=True, api_hostname=None,
                          default_client='WEB'):
        """Call the innertube API endpoint *ep* with retries.

        Retries (up to the 'extractor_retries' param, default 3) on network
        errors other than HTTP 403/429, and on responses missing all of
        *check_get_keys*. Alerts in the response are reported; on failure
        either raises (fatal=True) or warns and returns None.
        """
        response = None
        last_error = None
        count = -1
        retries = self.get_param('extractor_retries', 3)
        if check_get_keys is None:
            check_get_keys = []
        while count < retries:
            count += 1
            if last_error:
                self.report_warning('%s. Retrying ...' % last_error)
            try:
                response = self._call_api(
                    ep=ep, fatal=True, headers=headers,
                    video_id=item_id, query=query,
                    context=self._extract_context(ytcfg, default_client),
                    api_key=self._extract_api_key(ytcfg, default_client),
                    api_hostname=api_hostname, default_client=default_client,
                    note='%s%s' % (note, ' (retry #%d)' % count if count else ''))
            except ExtractorError as e:
                if isinstance(e.cause, network_exceptions):
                    # Downloading page may result in intermittent 5xx HTTP error
                    # Sometimes a 404 is also received. See: https://github.com/ytdl-org/youtube-dl/issues/28289
                    # We also want to catch all other network exceptions since errors in later pages can be troublesome
                    # See https://github.com/yt-dlp/yt-dlp/issues/507#issuecomment-880188210
                    if not isinstance(e.cause, compat_HTTPError) or e.cause.code not in (403, 429):
                        last_error = error_to_compat_str(e.cause or e)
                        if count < retries:
                            continue
                if fatal:
                    raise
                else:
                    self.report_warning(error_to_compat_str(e))
                    return

            else:
                # Youtube may send alerts if there was an issue with the continuation page
                try:
                    self._extract_and_report_alerts(response, expected=False)
                except ExtractorError as e:
                    if fatal:
                        raise
                    self.report_warning(error_to_compat_str(e))
                    return
                if not check_get_keys or dict_get(response, check_get_keys):
                    break
                # Youtube sometimes sends incomplete data
                # See: https://github.com/ytdl-org/youtube-dl/issues/28194
                last_error = 'Incomplete data received'
                if count >= retries:
                    if fatal:
                        raise ExtractorError(last_error)
                    else:
                        self.report_warning(last_error)
                        return
        return response
800
801 @staticmethod
802 def is_music_url(url):
803 return re.match(r'https?://music\.youtube\.com/', url) is not None
804
805 def _extract_video(self, renderer):
806 video_id = renderer.get('videoId')
807 title = self._get_text(renderer.get('title'))
808 description = self._get_text(renderer.get('descriptionSnippet'))
809 duration = parse_duration(self._get_text(renderer.get('lengthText')))
810 view_count_text = self._get_text(renderer.get('viewCountText')) or ''
811 view_count = str_to_int(self._search_regex(
812 r'^([\d,]+)', re.sub(r'\s', '', view_count_text),
813 'view count', default=None))
814
815 uploader = self._get_text(renderer, (lambda x: x['ownerText'], lambda x: x['shortBylineText']))
816
817 return {
818 '_type': 'url',
819 'ie_key': YoutubeIE.ie_key(),
820 'id': video_id,
821 'url': video_id,
822 'title': title,
823 'description': description,
824 'duration': duration,
825 'view_count': view_count,
826 'uploader': uploader,
827 }
828
829
830 class YoutubeIE(YoutubeBaseInfoExtractor):
831 IE_DESC = 'YouTube.com'
832 _INVIDIOUS_SITES = (
833 # invidious-redirect websites
834 r'(?:www\.)?redirect\.invidious\.io',
835 r'(?:(?:www|dev)\.)?invidio\.us',
836 # Invidious instances taken from https://github.com/iv-org/documentation/blob/master/Invidious-Instances.md
837 r'(?:www\.)?invidious\.pussthecat\.org',
838 r'(?:www\.)?invidious\.zee\.li',
839 r'(?:www\.)?invidious\.ethibox\.fr',
840 r'(?:www\.)?invidious\.3o7z6yfxhbw7n3za4rss6l434kmv55cgw2vuziwuigpwegswvwzqipyd\.onion',
841 # youtube-dl invidious instances list
842 r'(?:(?:www|no)\.)?invidiou\.sh',
843 r'(?:(?:www|fi)\.)?invidious\.snopyta\.org',
844 r'(?:www\.)?invidious\.kabi\.tk',
845 r'(?:www\.)?invidious\.mastodon\.host',
846 r'(?:www\.)?invidious\.zapashcanon\.fr',
847 r'(?:www\.)?(?:invidious(?:-us)?|piped)\.kavin\.rocks',
848 r'(?:www\.)?invidious\.tinfoil-hat\.net',
849 r'(?:www\.)?invidious\.himiko\.cloud',
850 r'(?:www\.)?invidious\.reallyancient\.tech',
851 r'(?:www\.)?invidious\.tube',
852 r'(?:www\.)?invidiou\.site',
853 r'(?:www\.)?invidious\.site',
854 r'(?:www\.)?invidious\.xyz',
855 r'(?:www\.)?invidious\.nixnet\.xyz',
856 r'(?:www\.)?invidious\.048596\.xyz',
857 r'(?:www\.)?invidious\.drycat\.fr',
858 r'(?:www\.)?inv\.skyn3t\.in',
859 r'(?:www\.)?tube\.poal\.co',
860 r'(?:www\.)?tube\.connect\.cafe',
861 r'(?:www\.)?vid\.wxzm\.sx',
862 r'(?:www\.)?vid\.mint\.lgbt',
863 r'(?:www\.)?vid\.puffyan\.us',
864 r'(?:www\.)?yewtu\.be',
865 r'(?:www\.)?yt\.elukerio\.org',
866 r'(?:www\.)?yt\.lelux\.fi',
867 r'(?:www\.)?invidious\.ggc-project\.de',
868 r'(?:www\.)?yt\.maisputain\.ovh',
869 r'(?:www\.)?ytprivate\.com',
870 r'(?:www\.)?invidious\.13ad\.de',
871 r'(?:www\.)?invidious\.toot\.koeln',
872 r'(?:www\.)?invidious\.fdn\.fr',
873 r'(?:www\.)?watch\.nettohikari\.com',
874 r'(?:www\.)?invidious\.namazso\.eu',
875 r'(?:www\.)?invidious\.silkky\.cloud',
876 r'(?:www\.)?invidious\.exonip\.de',
877 r'(?:www\.)?invidious\.riverside\.rocks',
878 r'(?:www\.)?invidious\.blamefran\.net',
879 r'(?:www\.)?invidious\.moomoo\.de',
880 r'(?:www\.)?ytb\.trom\.tf',
881 r'(?:www\.)?yt\.cyberhost\.uk',
882 r'(?:www\.)?kgg2m7yk5aybusll\.onion',
883 r'(?:www\.)?qklhadlycap4cnod\.onion',
884 r'(?:www\.)?axqzx4s6s54s32yentfqojs3x5i7faxza6xo3ehd4bzzsg2ii4fv2iid\.onion',
885 r'(?:www\.)?c7hqkpkpemu6e7emz5b4vyz7idjgdvgaaa3dyimmeojqbgpea3xqjoid\.onion',
886 r'(?:www\.)?fz253lmuao3strwbfbmx46yu7acac2jz27iwtorgmbqlkurlclmancad\.onion',
887 r'(?:www\.)?invidious\.l4qlywnpwqsluw65ts7md3khrivpirse744un3x7mlskqauz5pyuzgqd\.onion',
888 r'(?:www\.)?owxfohz4kjyv25fvlqilyxast7inivgiktls3th44jhk3ej3i7ya\.b32\.i2p',
889 r'(?:www\.)?4l2dgddgsrkf2ous66i6seeyi6etzfgrue332grh2n7madpwopotugyd\.onion',
890 r'(?:www\.)?w6ijuptxiku4xpnnaetxvnkc5vqcdu7mgns2u77qefoixi63vbvnpnqd\.onion',
891 r'(?:www\.)?kbjggqkzv65ivcqj6bumvp337z6264huv5kpkwuv6gu5yjiskvan7fad\.onion',
892 r'(?:www\.)?grwp24hodrefzvjjuccrkw3mjq4tzhaaq32amf33dzpmuxe7ilepcmad\.onion',
893 r'(?:www\.)?hpniueoejy4opn7bc4ftgazyqjoeqwlvh2uiku2xqku6zpoa4bf5ruid\.onion',
894 )
    # Verbose (?x) regex matching every URL shape this extractor accepts:
    # youtube.com/watch, /embed, /e, /v, youtu.be short links, proxy/mirror
    # hosts, Invidious instances (interpolated below), and bare 11-character
    # video IDs. Group 1 captures the optional scheme+host prefix; the
    # conditional (?(1).+)? permits trailing text only when a prefix matched,
    # so a naked ID must be exactly 11 characters.
    _VALID_URL = r"""(?x)^
                     (
                         (?:https?://|//)                                    # http(s):// or protocol-independent URL
                         (?:(?:(?:(?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie|kids)?\.com|
                            (?:www\.)?deturl\.com/www\.youtube\.com|
                            (?:www\.)?pwnyoutube\.com|
                            (?:www\.)?hooktube\.com|
                            (?:www\.)?yourepeat\.com|
                            tube\.majestyc\.net|
                            %(invidious)s|
                            youtube\.googleapis\.com)/                        # the various hostnames, with wildcard subdomains
                         (?:.*?\#/)?                                          # handle anchor (#/) redirect urls
                         (?:                                                  # the various things that can precede the ID:
                             (?:(?:v|embed|e)/(?!videoseries))                # v/ or embed/ or e/
                             |(?:                                             # or the v= param in all its forms
                                 (?:(?:watch|movie)(?:_popup)?(?:\.php)?/?)?  # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
                                 (?:\?|\#!?)                                  # the params delimiter ? or # or #!
                                 (?:.*?[&;])??                                # any other preceding param (like /?s=tuff&v=xxxx or ?s=tuff&amp;v=V36LpHqtcDY)
                                 v=
                             )
                         ))
                         |(?:
                            youtu\.be|                                        # just youtu.be/xxxx
                            vid\.plus|                                        # or vid.plus/xxxx
                            zwearz\.com/watch|                                # or zwearz.com/watch/xxxx
                            %(invidious)s
                         )/
                         |(?:www\.)?cleanvideosearch\.com/media/action/yt/watch\?videoId=
                         )
                     )?                                                       # all until now is optional -> you can pass the naked ID
                     (?P<id>[0-9A-Za-z_-]{11})                                # here is it! the YouTube video ID
                     (?(1).+)?                                                # if we found the ID, everything can follow
                     (?:\#|$)""" % {
        'invidious': '|'.join(_INVIDIOUS_SITES),
    }
    # Patterns, tried in order, that pull the player version hash (named
    # group 'id') out of a player JS URL. The last pattern handles legacy
    # 'vfl...'-style player names.
    _PLAYER_INFO_RE = (
        r'/s/player/(?P<id>[a-zA-Z0-9_-]{8,})/player',
        r'/(?P<id>[a-zA-Z0-9_-]{8,})/player(?:_ias\.vflset(?:/[a-zA-Z]{2,3}_[a-zA-Z]{2,3})?|-plasma-ias-(?:phone|tablet)-[a-z]{2}_[A-Z]{2}\.vflset)/base\.js$',
        r'\b(?P<id>vfl[a-zA-Z0-9_-]+)\b.*?\.js$',
    )
    # Static itag -> format-metadata table used to supply fields that
    # YouTube's own responses omit or misreport (codecs, resolution, abr).
    # Negative 'preference' values demote niche variants: -20 for 3D,
    # -10 for the HLS renditions.
    _formats = {
        '5': {'ext': 'flv', 'width': 400, 'height': 240, 'acodec': 'mp3', 'abr': 64, 'vcodec': 'h263'},
        '6': {'ext': 'flv', 'width': 450, 'height': 270, 'acodec': 'mp3', 'abr': 64, 'vcodec': 'h263'},
        '13': {'ext': '3gp', 'acodec': 'aac', 'vcodec': 'mp4v'},
        '17': {'ext': '3gp', 'width': 176, 'height': 144, 'acodec': 'aac', 'abr': 24, 'vcodec': 'mp4v'},
        '18': {'ext': 'mp4', 'width': 640, 'height': 360, 'acodec': 'aac', 'abr': 96, 'vcodec': 'h264'},
        '22': {'ext': 'mp4', 'width': 1280, 'height': 720, 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264'},
        '34': {'ext': 'flv', 'width': 640, 'height': 360, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'},
        '35': {'ext': 'flv', 'width': 854, 'height': 480, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'},
        # itag 36 videos are either 320x180 (BaW_jenozKc) or 320x240 (__2ABJjxzNo), abr varies as well
        '36': {'ext': '3gp', 'width': 320, 'acodec': 'aac', 'vcodec': 'mp4v'},
        '37': {'ext': 'mp4', 'width': 1920, 'height': 1080, 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264'},
        '38': {'ext': 'mp4', 'width': 4096, 'height': 3072, 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264'},
        '43': {'ext': 'webm', 'width': 640, 'height': 360, 'acodec': 'vorbis', 'abr': 128, 'vcodec': 'vp8'},
        '44': {'ext': 'webm', 'width': 854, 'height': 480, 'acodec': 'vorbis', 'abr': 128, 'vcodec': 'vp8'},
        '45': {'ext': 'webm', 'width': 1280, 'height': 720, 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8'},
        '46': {'ext': 'webm', 'width': 1920, 'height': 1080, 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8'},
        '59': {'ext': 'mp4', 'width': 854, 'height': 480, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'},
        '78': {'ext': 'mp4', 'width': 854, 'height': 480, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'},


        # 3D videos
        '82': {'ext': 'mp4', 'height': 360, 'format_note': '3D', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -20},
        '83': {'ext': 'mp4', 'height': 480, 'format_note': '3D', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -20},
        '84': {'ext': 'mp4', 'height': 720, 'format_note': '3D', 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264', 'preference': -20},
        '85': {'ext': 'mp4', 'height': 1080, 'format_note': '3D', 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264', 'preference': -20},
        '100': {'ext': 'webm', 'height': 360, 'format_note': '3D', 'acodec': 'vorbis', 'abr': 128, 'vcodec': 'vp8', 'preference': -20},
        '101': {'ext': 'webm', 'height': 480, 'format_note': '3D', 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8', 'preference': -20},
        '102': {'ext': 'webm', 'height': 720, 'format_note': '3D', 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8', 'preference': -20},

        # Apple HTTP Live Streaming
        '91': {'ext': 'mp4', 'height': 144, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 48, 'vcodec': 'h264', 'preference': -10},
        '92': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 48, 'vcodec': 'h264', 'preference': -10},
        '93': {'ext': 'mp4', 'height': 360, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -10},
        '94': {'ext': 'mp4', 'height': 480, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -10},
        '95': {'ext': 'mp4', 'height': 720, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 256, 'vcodec': 'h264', 'preference': -10},
        '96': {'ext': 'mp4', 'height': 1080, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 256, 'vcodec': 'h264', 'preference': -10},
        '132': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 48, 'vcodec': 'h264', 'preference': -10},
        '151': {'ext': 'mp4', 'height': 72, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 24, 'vcodec': 'h264', 'preference': -10},

        # DASH mp4 video
        '133': {'ext': 'mp4', 'height': 240, 'format_note': 'DASH video', 'vcodec': 'h264'},
        '134': {'ext': 'mp4', 'height': 360, 'format_note': 'DASH video', 'vcodec': 'h264'},
        '135': {'ext': 'mp4', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'h264'},
        '136': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'h264'},
        '137': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'h264'},
        '138': {'ext': 'mp4', 'format_note': 'DASH video', 'vcodec': 'h264'},  # Height can vary (https://github.com/ytdl-org/youtube-dl/issues/4559)
        '160': {'ext': 'mp4', 'height': 144, 'format_note': 'DASH video', 'vcodec': 'h264'},
        '212': {'ext': 'mp4', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'h264'},
        '264': {'ext': 'mp4', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'h264'},
        '298': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'h264', 'fps': 60},
        '299': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'h264', 'fps': 60},
        '266': {'ext': 'mp4', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'h264'},

        # Dash mp4 audio
        '139': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 48, 'container': 'm4a_dash'},
        '140': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 128, 'container': 'm4a_dash'},
        '141': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 256, 'container': 'm4a_dash'},
        '256': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'container': 'm4a_dash'},
        '258': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'container': 'm4a_dash'},
        '325': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'dtse', 'container': 'm4a_dash'},
        '328': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'ec-3', 'container': 'm4a_dash'},

        # Dash webm
        '167': {'ext': 'webm', 'height': 360, 'width': 640, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
        '168': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
        '169': {'ext': 'webm', 'height': 720, 'width': 1280, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
        '170': {'ext': 'webm', 'height': 1080, 'width': 1920, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
        '218': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
        '219': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
        '278': {'ext': 'webm', 'height': 144, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp9'},
        '242': {'ext': 'webm', 'height': 240, 'format_note': 'DASH video', 'vcodec': 'vp9'},
        '243': {'ext': 'webm', 'height': 360, 'format_note': 'DASH video', 'vcodec': 'vp9'},
        '244': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'vp9'},
        '245': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'vp9'},
        '246': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'vp9'},
        '247': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'vp9'},
        '248': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'vp9'},
        '271': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'vp9'},
        # itag 272 videos are either 3840x2160 (e.g. RtoitU2A-3E) or 7680x4320 (sLprVF6d7Ug)
        '272': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'vp9'},
        '302': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60},
        '303': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60},
        '308': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60},
        '313': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'vp9'},
        '315': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60},

        # Dash webm audio
        '171': {'ext': 'webm', 'acodec': 'vorbis', 'format_note': 'DASH audio', 'abr': 128},
        '172': {'ext': 'webm', 'acodec': 'vorbis', 'format_note': 'DASH audio', 'abr': 256},

        # Dash webm audio with opus inside
        '249': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 50},
        '250': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 70},
        '251': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 160},

        # RTMP (unnamed)
        '_rtmp': {'protocol': 'rtmp'},

        # av01 video only formats sometimes served with "unknown" codecs
        '394': {'acodec': 'none', 'vcodec': 'av01.0.05M.08'},
        '395': {'acodec': 'none', 'vcodec': 'av01.0.05M.08'},
        '396': {'acodec': 'none', 'vcodec': 'av01.0.05M.08'},
        '397': {'acodec': 'none', 'vcodec': 'av01.0.05M.08'},
    }
    # Subtitle serialization formats this extractor can request.
    # NOTE(review): order presumably reflects preference — confirm at the
    # use site, which is outside this chunk.
    _SUBTITLE_FORMATS = ('json3', 'srv1', 'srv2', 'srv3', 'ttml', 'vtt')

    # Playability-status reason strings that signal an age-gated video.
    _AGE_GATE_REASONS = (
        'Sign in to confirm your age',
        'This video may be inappropriate for some users.',
        'Sorry, this content is age-restricted.')

    # Opt out of InfoExtractor's generic geo-restriction bypass; geo
    # handling for YouTube is done elsewhere in this extractor.
    _GEO_BYPASS = False

    IE_NAME = 'youtube'
1050 _TESTS = [
1051 {
1052 'url': 'https://www.youtube.com/watch?v=BaW_jenozKc&t=1s&end=9',
1053 'info_dict': {
1054 'id': 'BaW_jenozKc',
1055 'ext': 'mp4',
1056 'title': 'youtube-dl test video "\'/\\ä↭𝕐',
1057 'uploader': 'Philipp Hagemeister',
1058 'uploader_id': 'phihag',
1059 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/phihag',
1060 'channel_id': 'UCLqxVugv74EIW3VWh2NOa3Q',
1061 'channel_url': r're:https?://(?:www\.)?youtube\.com/channel/UCLqxVugv74EIW3VWh2NOa3Q',
1062 'upload_date': '20121002',
1063 'description': 'test chars: "\'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de .',
1064 'categories': ['Science & Technology'],
1065 'tags': ['youtube-dl'],
1066 'duration': 10,
1067 'view_count': int,
1068 'like_count': int,
1069 'dislike_count': int,
1070 'start_time': 1,
1071 'end_time': 9,
1072 }
1073 },
1074 {
1075 'url': '//www.YouTube.com/watch?v=yZIXLfi8CZQ',
1076 'note': 'Embed-only video (#1746)',
1077 'info_dict': {
1078 'id': 'yZIXLfi8CZQ',
1079 'ext': 'mp4',
1080 'upload_date': '20120608',
1081 'title': 'Principal Sexually Assaults A Teacher - Episode 117 - 8th June 2012',
1082 'description': 'md5:09b78bd971f1e3e289601dfba15ca4f7',
1083 'uploader': 'SET India',
1084 'uploader_id': 'setindia',
1085 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/setindia',
1086 'age_limit': 18,
1087 },
1088 'skip': 'Private video',
1089 },
1090 {
1091 'url': 'https://www.youtube.com/watch?v=BaW_jenozKc&v=yZIXLfi8CZQ',
1092 'note': 'Use the first video ID in the URL',
1093 'info_dict': {
1094 'id': 'BaW_jenozKc',
1095 'ext': 'mp4',
1096 'title': 'youtube-dl test video "\'/\\ä↭𝕐',
1097 'uploader': 'Philipp Hagemeister',
1098 'uploader_id': 'phihag',
1099 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/phihag',
1100 'upload_date': '20121002',
1101 'description': 'test chars: "\'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de .',
1102 'categories': ['Science & Technology'],
1103 'tags': ['youtube-dl'],
1104 'duration': 10,
1105 'view_count': int,
1106 'like_count': int,
1107 'dislike_count': int,
1108 },
1109 'params': {
1110 'skip_download': True,
1111 },
1112 },
1113 {
1114 'url': 'https://www.youtube.com/watch?v=a9LDPn-MO4I',
1115 'note': '256k DASH audio (format 141) via DASH manifest',
1116 'info_dict': {
1117 'id': 'a9LDPn-MO4I',
1118 'ext': 'm4a',
1119 'upload_date': '20121002',
1120 'uploader_id': '8KVIDEO',
1121 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/8KVIDEO',
1122 'description': '',
1123 'uploader': '8KVIDEO',
1124 'title': 'UHDTV TEST 8K VIDEO.mp4'
1125 },
1126 'params': {
1127 'youtube_include_dash_manifest': True,
1128 'format': '141',
1129 },
1130 'skip': 'format 141 not served anymore',
1131 },
1132 # DASH manifest with encrypted signature
1133 {
1134 'url': 'https://www.youtube.com/watch?v=IB3lcPjvWLA',
1135 'info_dict': {
1136 'id': 'IB3lcPjvWLA',
1137 'ext': 'm4a',
1138 'title': 'Afrojack, Spree Wilson - The Spark (Official Music Video) ft. Spree Wilson',
1139 'description': 'md5:8f5e2b82460520b619ccac1f509d43bf',
1140 'duration': 244,
1141 'uploader': 'AfrojackVEVO',
1142 'uploader_id': 'AfrojackVEVO',
1143 'upload_date': '20131011',
1144 'abr': 129.495,
1145 },
1146 'params': {
1147 'youtube_include_dash_manifest': True,
1148 'format': '141/bestaudio[ext=m4a]',
1149 },
1150 },
1151 # Normal age-gate video (embed allowed)
1152 {
1153 'url': 'https://youtube.com/watch?v=HtVdAasjOgU',
1154 'info_dict': {
1155 'id': 'HtVdAasjOgU',
1156 'ext': 'mp4',
1157 'title': 'The Witcher 3: Wild Hunt - The Sword Of Destiny Trailer',
1158 'description': r're:(?s).{100,}About the Game\n.*?The Witcher 3: Wild Hunt.{100,}',
1159 'duration': 142,
1160 'uploader': 'The Witcher',
1161 'uploader_id': 'WitcherGame',
1162 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/WitcherGame',
1163 'upload_date': '20140605',
1164 'age_limit': 18,
1165 },
1166 },
1167 # video_info is None (https://github.com/ytdl-org/youtube-dl/issues/4421)
1168 # YouTube Red ad is not captured for creator
1169 {
1170 'url': '__2ABJjxzNo',
1171 'info_dict': {
1172 'id': '__2ABJjxzNo',
1173 'ext': 'mp4',
1174 'duration': 266,
1175 'upload_date': '20100430',
1176 'uploader_id': 'deadmau5',
1177 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/deadmau5',
1178 'creator': 'deadmau5',
1179 'description': 'md5:6cbcd3a92ce1bc676fc4d6ab4ace2336',
1180 'uploader': 'deadmau5',
1181 'title': 'Deadmau5 - Some Chords (HD)',
1182 'alt_title': 'Some Chords',
1183 },
1184 'expected_warnings': [
1185 'DASH manifest missing',
1186 ]
1187 },
1188 # Olympics (https://github.com/ytdl-org/youtube-dl/issues/4431)
1189 {
1190 'url': 'lqQg6PlCWgI',
1191 'info_dict': {
1192 'id': 'lqQg6PlCWgI',
1193 'ext': 'mp4',
1194 'duration': 6085,
1195 'upload_date': '20150827',
1196 'uploader_id': 'olympic',
1197 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/olympic',
1198 'description': 'HO09 - Women - GER-AUS - Hockey - 31 July 2012 - London 2012 Olympic Games',
1199 'uploader': 'Olympics',
1200 'title': 'Hockey - Women - GER-AUS - London 2012 Olympic Games',
1201 },
1202 'params': {
1203 'skip_download': 'requires avconv',
1204 }
1205 },
1206 # Non-square pixels
1207 {
1208 'url': 'https://www.youtube.com/watch?v=_b-2C3KPAM0',
1209 'info_dict': {
1210 'id': '_b-2C3KPAM0',
1211 'ext': 'mp4',
1212 'stretched_ratio': 16 / 9.,
1213 'duration': 85,
1214 'upload_date': '20110310',
1215 'uploader_id': 'AllenMeow',
1216 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/AllenMeow',
1217 'description': 'made by Wacom from Korea | 字幕&加油添醋 by TY\'s Allen | 感謝heylisa00cavey1001同學熱情提供梗及翻譯',
1218 'uploader': '孫ᄋᄅ',
1219 'title': '[A-made] 變態妍字幕版 太妍 我就是這樣的人',
1220 },
1221 },
1222 # url_encoded_fmt_stream_map is empty string
1223 {
1224 'url': 'qEJwOuvDf7I',
1225 'info_dict': {
1226 'id': 'qEJwOuvDf7I',
1227 'ext': 'webm',
1228 'title': 'Обсуждение судебной практики по выборам 14 сентября 2014 года в Санкт-Петербурге',
1229 'description': '',
1230 'upload_date': '20150404',
1231 'uploader_id': 'spbelect',
1232 'uploader': 'Наблюдатели Петербурга',
1233 },
1234 'params': {
1235 'skip_download': 'requires avconv',
1236 },
1237 'skip': 'This live event has ended.',
1238 },
1239 # Extraction from multiple DASH manifests (https://github.com/ytdl-org/youtube-dl/pull/6097)
1240 {
1241 'url': 'https://www.youtube.com/watch?v=FIl7x6_3R5Y',
1242 'info_dict': {
1243 'id': 'FIl7x6_3R5Y',
1244 'ext': 'webm',
1245 'title': 'md5:7b81415841e02ecd4313668cde88737a',
1246 'description': 'md5:116377fd2963b81ec4ce64b542173306',
1247 'duration': 220,
1248 'upload_date': '20150625',
1249 'uploader_id': 'dorappi2000',
1250 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/dorappi2000',
1251 'uploader': 'dorappi2000',
1252 'formats': 'mincount:31',
1253 },
1254 'skip': 'not actual anymore',
1255 },
1256 # DASH manifest with segment_list
1257 {
1258 'url': 'https://www.youtube.com/embed/CsmdDsKjzN8',
1259 'md5': '8ce563a1d667b599d21064e982ab9e31',
1260 'info_dict': {
1261 'id': 'CsmdDsKjzN8',
1262 'ext': 'mp4',
1263 'upload_date': '20150501', # According to '<meta itemprop="datePublished"', but in other places it's 20150510
1264 'uploader': 'Airtek',
1265 'description': 'Retransmisión en directo de la XVIII media maratón de Zaragoza.',
1266 'uploader_id': 'UCzTzUmjXxxacNnL8I3m4LnQ',
1267 'title': 'Retransmisión XVIII Media maratón Zaragoza 2015',
1268 },
1269 'params': {
1270 'youtube_include_dash_manifest': True,
1271 'format': '135', # bestvideo
1272 },
1273 'skip': 'This live event has ended.',
1274 },
1275 {
1276 # Multifeed videos (multiple cameras), URL is for Main Camera
1277 'url': 'https://www.youtube.com/watch?v=jvGDaLqkpTg',
1278 'info_dict': {
1279 'id': 'jvGDaLqkpTg',
1280 'title': 'Tom Clancy Free Weekend Rainbow Whatever',
1281 'description': 'md5:e03b909557865076822aa169218d6a5d',
1282 },
1283 'playlist': [{
1284 'info_dict': {
1285 'id': 'jvGDaLqkpTg',
1286 'ext': 'mp4',
1287 'title': 'Tom Clancy Free Weekend Rainbow Whatever (Main Camera)',
1288 'description': 'md5:e03b909557865076822aa169218d6a5d',
1289 'duration': 10643,
1290 'upload_date': '20161111',
1291 'uploader': 'Team PGP',
1292 'uploader_id': 'UChORY56LMMETTuGjXaJXvLg',
1293 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UChORY56LMMETTuGjXaJXvLg',
1294 },
1295 }, {
1296 'info_dict': {
1297 'id': '3AKt1R1aDnw',
1298 'ext': 'mp4',
1299 'title': 'Tom Clancy Free Weekend Rainbow Whatever (Camera 2)',
1300 'description': 'md5:e03b909557865076822aa169218d6a5d',
1301 'duration': 10991,
1302 'upload_date': '20161111',
1303 'uploader': 'Team PGP',
1304 'uploader_id': 'UChORY56LMMETTuGjXaJXvLg',
1305 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UChORY56LMMETTuGjXaJXvLg',
1306 },
1307 }, {
1308 'info_dict': {
1309 'id': 'RtAMM00gpVc',
1310 'ext': 'mp4',
1311 'title': 'Tom Clancy Free Weekend Rainbow Whatever (Camera 3)',
1312 'description': 'md5:e03b909557865076822aa169218d6a5d',
1313 'duration': 10995,
1314 'upload_date': '20161111',
1315 'uploader': 'Team PGP',
1316 'uploader_id': 'UChORY56LMMETTuGjXaJXvLg',
1317 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UChORY56LMMETTuGjXaJXvLg',
1318 },
1319 }, {
1320 'info_dict': {
1321 'id': '6N2fdlP3C5U',
1322 'ext': 'mp4',
1323 'title': 'Tom Clancy Free Weekend Rainbow Whatever (Camera 4)',
1324 'description': 'md5:e03b909557865076822aa169218d6a5d',
1325 'duration': 10990,
1326 'upload_date': '20161111',
1327 'uploader': 'Team PGP',
1328 'uploader_id': 'UChORY56LMMETTuGjXaJXvLg',
1329 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UChORY56LMMETTuGjXaJXvLg',
1330 },
1331 }],
1332 'params': {
1333 'skip_download': True,
1334 },
1335 },
1336 {
1337 # Multifeed video with comma in title (see https://github.com/ytdl-org/youtube-dl/issues/8536)
1338 'url': 'https://www.youtube.com/watch?v=gVfLd0zydlo',
1339 'info_dict': {
1340 'id': 'gVfLd0zydlo',
1341 'title': 'DevConf.cz 2016 Day 2 Workshops 1 14:00 - 15:30',
1342 },
1343 'playlist_count': 2,
1344 'skip': 'Not multifeed anymore',
1345 },
1346 {
1347 'url': 'https://vid.plus/FlRa-iH7PGw',
1348 'only_matching': True,
1349 },
1350 {
1351 'url': 'https://zwearz.com/watch/9lWxNJF-ufM/electra-woman-dyna-girl-official-trailer-grace-helbig.html',
1352 'only_matching': True,
1353 },
1354 {
1355 # Title with JS-like syntax "};" (see https://github.com/ytdl-org/youtube-dl/issues/7468)
1356 # Also tests cut-off URL expansion in video description (see
1357 # https://github.com/ytdl-org/youtube-dl/issues/1892,
1358 # https://github.com/ytdl-org/youtube-dl/issues/8164)
1359 'url': 'https://www.youtube.com/watch?v=lsguqyKfVQg',
1360 'info_dict': {
1361 'id': 'lsguqyKfVQg',
1362 'ext': 'mp4',
1363 'title': '{dark walk}; Loki/AC/Dishonored; collab w/Elflover21',
1364 'alt_title': 'Dark Walk',
1365 'description': 'md5:8085699c11dc3f597ce0410b0dcbb34a',
1366 'duration': 133,
1367 'upload_date': '20151119',
1368 'uploader_id': 'IronSoulElf',
1369 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/IronSoulElf',
1370 'uploader': 'IronSoulElf',
1371 'creator': 'Todd Haberman;\nDaniel Law Heath and Aaron Kaplan',
1372 'track': 'Dark Walk',
1373 'artist': 'Todd Haberman;\nDaniel Law Heath and Aaron Kaplan',
1374 'album': 'Position Music - Production Music Vol. 143 - Dark Walk',
1375 },
1376 'params': {
1377 'skip_download': True,
1378 },
1379 },
1380 {
1381 # Tags with '};' (see https://github.com/ytdl-org/youtube-dl/issues/7468)
1382 'url': 'https://www.youtube.com/watch?v=Ms7iBXnlUO8',
1383 'only_matching': True,
1384 },
1385 {
1386 # Video with yt:stretch=17:0
1387 'url': 'https://www.youtube.com/watch?v=Q39EVAstoRM',
1388 'info_dict': {
1389 'id': 'Q39EVAstoRM',
1390 'ext': 'mp4',
1391 'title': 'Clash Of Clans#14 Dicas De Ataque Para CV 4',
1392 'description': 'md5:ee18a25c350637c8faff806845bddee9',
1393 'upload_date': '20151107',
1394 'uploader_id': 'UCCr7TALkRbo3EtFzETQF1LA',
1395 'uploader': 'CH GAMER DROID',
1396 },
1397 'params': {
1398 'skip_download': True,
1399 },
1400 'skip': 'This video does not exist.',
1401 },
1402 {
1403 # Video with incomplete 'yt:stretch=16:'
1404 'url': 'https://www.youtube.com/watch?v=FRhJzUSJbGI',
1405 'only_matching': True,
1406 },
1407 {
1408 # Video licensed under Creative Commons
1409 'url': 'https://www.youtube.com/watch?v=M4gD1WSo5mA',
1410 'info_dict': {
1411 'id': 'M4gD1WSo5mA',
1412 'ext': 'mp4',
1413 'title': 'md5:e41008789470fc2533a3252216f1c1d1',
1414 'description': 'md5:a677553cf0840649b731a3024aeff4cc',
1415 'duration': 721,
1416 'upload_date': '20150127',
1417 'uploader_id': 'BerkmanCenter',
1418 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/BerkmanCenter',
1419 'uploader': 'The Berkman Klein Center for Internet & Society',
1420 'license': 'Creative Commons Attribution license (reuse allowed)',
1421 },
1422 'params': {
1423 'skip_download': True,
1424 },
1425 },
1426 {
1427 # Channel-like uploader_url
1428 'url': 'https://www.youtube.com/watch?v=eQcmzGIKrzg',
1429 'info_dict': {
1430 'id': 'eQcmzGIKrzg',
1431 'ext': 'mp4',
1432 'title': 'Democratic Socialism and Foreign Policy | Bernie Sanders',
1433 'description': 'md5:13a2503d7b5904ef4b223aa101628f39',
1434 'duration': 4060,
1435 'upload_date': '20151119',
1436 'uploader': 'Bernie Sanders',
1437 'uploader_id': 'UCH1dpzjCEiGAt8CXkryhkZg',
1438 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UCH1dpzjCEiGAt8CXkryhkZg',
1439 'license': 'Creative Commons Attribution license (reuse allowed)',
1440 },
1441 'params': {
1442 'skip_download': True,
1443 },
1444 },
1445 {
1446 'url': 'https://www.youtube.com/watch?feature=player_embedded&amp;amp;v=V36LpHqtcDY',
1447 'only_matching': True,
1448 },
1449 {
1450 # YouTube Red paid video (https://github.com/ytdl-org/youtube-dl/issues/10059)
1451 'url': 'https://www.youtube.com/watch?v=i1Ko8UG-Tdo',
1452 'only_matching': True,
1453 },
1454 {
1455 # Rental video preview
1456 'url': 'https://www.youtube.com/watch?v=yYr8q0y5Jfg',
1457 'info_dict': {
1458 'id': 'uGpuVWrhIzE',
1459 'ext': 'mp4',
1460 'title': 'Piku - Trailer',
1461 'description': 'md5:c36bd60c3fd6f1954086c083c72092eb',
1462 'upload_date': '20150811',
1463 'uploader': 'FlixMatrix',
1464 'uploader_id': 'FlixMatrixKaravan',
1465 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/FlixMatrixKaravan',
1466 'license': 'Standard YouTube License',
1467 },
1468 'params': {
1469 'skip_download': True,
1470 },
1471 'skip': 'This video is not available.',
1472 },
1473 {
1474 # YouTube Red video with episode data
1475 'url': 'https://www.youtube.com/watch?v=iqKdEhx-dD4',
1476 'info_dict': {
1477 'id': 'iqKdEhx-dD4',
1478 'ext': 'mp4',
1479 'title': 'Isolation - Mind Field (Ep 1)',
1480 'description': 'md5:f540112edec5d09fc8cc752d3d4ba3cd',
1481 'duration': 2085,
1482 'upload_date': '20170118',
1483 'uploader': 'Vsauce',
1484 'uploader_id': 'Vsauce',
1485 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/Vsauce',
1486 'series': 'Mind Field',
1487 'season_number': 1,
1488 'episode_number': 1,
1489 },
1490 'params': {
1491 'skip_download': True,
1492 },
1493 'expected_warnings': [
1494 'Skipping DASH manifest',
1495 ],
1496 },
1497 {
1498 # The following content has been identified by the YouTube community
1499 # as inappropriate or offensive to some audiences.
1500 'url': 'https://www.youtube.com/watch?v=6SJNVb0GnPI',
1501 'info_dict': {
1502 'id': '6SJNVb0GnPI',
1503 'ext': 'mp4',
1504 'title': 'Race Differences in Intelligence',
1505 'description': 'md5:5d161533167390427a1f8ee89a1fc6f1',
1506 'duration': 965,
1507 'upload_date': '20140124',
1508 'uploader': 'New Century Foundation',
1509 'uploader_id': 'UCEJYpZGqgUob0zVVEaLhvVg',
1510 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UCEJYpZGqgUob0zVVEaLhvVg',
1511 },
1512 'params': {
1513 'skip_download': True,
1514 },
1515 'skip': 'This video has been removed for violating YouTube\'s policy on hate speech.',
1516 },
1517 {
1518 # itag 212
1519 'url': '1t24XAntNCY',
1520 'only_matching': True,
1521 },
1522 {
1523 # geo restricted to JP
1524 'url': 'sJL6WA-aGkQ',
1525 'only_matching': True,
1526 },
1527 {
1528 'url': 'https://invidio.us/watch?v=BaW_jenozKc',
1529 'only_matching': True,
1530 },
1531 {
1532 'url': 'https://redirect.invidious.io/watch?v=BaW_jenozKc',
1533 'only_matching': True,
1534 },
1535 {
1536 # from https://nitter.pussthecat.org/YouTube/status/1360363141947944964#m
1537 'url': 'https://redirect.invidious.io/Yh0AhrY9GjA',
1538 'only_matching': True,
1539 },
1540 {
1541 # DRM protected
1542 'url': 'https://www.youtube.com/watch?v=s7_qI6_mIXc',
1543 'only_matching': True,
1544 },
1545 {
1546 # Video with unsupported adaptive stream type formats
1547 'url': 'https://www.youtube.com/watch?v=Z4Vy8R84T1U',
1548 'info_dict': {
1549 'id': 'Z4Vy8R84T1U',
1550 'ext': 'mp4',
1551 'title': 'saman SMAN 53 Jakarta(Sancety) opening COFFEE4th at SMAN 53 Jakarta',
1552 'description': 'md5:d41d8cd98f00b204e9800998ecf8427e',
1553 'duration': 433,
1554 'upload_date': '20130923',
1555 'uploader': 'Amelia Putri Harwita',
1556 'uploader_id': 'UCpOxM49HJxmC1qCalXyB3_Q',
1557 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UCpOxM49HJxmC1qCalXyB3_Q',
1558 'formats': 'maxcount:10',
1559 },
1560 'params': {
1561 'skip_download': True,
1562 'youtube_include_dash_manifest': False,
1563 },
1564 'skip': 'not actual anymore',
1565 },
1566 {
1567 # Youtube Music Auto-generated description
1568 'url': 'https://music.youtube.com/watch?v=MgNrAu2pzNs',
1569 'info_dict': {
1570 'id': 'MgNrAu2pzNs',
1571 'ext': 'mp4',
1572 'title': 'Voyeur Girl',
1573 'description': 'md5:7ae382a65843d6df2685993e90a8628f',
1574 'upload_date': '20190312',
1575 'uploader': 'Stephen - Topic',
1576 'uploader_id': 'UC-pWHpBjdGG69N9mM2auIAA',
1577 'artist': 'Stephen',
1578 'track': 'Voyeur Girl',
1579 'album': 'it\'s too much love to know my dear',
1580 'release_date': '20190313',
1581 'release_year': 2019,
1582 },
1583 'params': {
1584 'skip_download': True,
1585 },
1586 },
1587 {
1588 'url': 'https://www.youtubekids.com/watch?v=3b8nCWDgZ6Q',
1589 'only_matching': True,
1590 },
1591 {
1592 # invalid -> valid video id redirection
1593 'url': 'DJztXj2GPfl',
1594 'info_dict': {
1595 'id': 'DJztXj2GPfk',
1596 'ext': 'mp4',
1597 'title': 'Panjabi MC - Mundian To Bach Ke (The Dictator Soundtrack)',
1598 'description': 'md5:bf577a41da97918e94fa9798d9228825',
1599 'upload_date': '20090125',
1600 'uploader': 'Prochorowka',
1601 'uploader_id': 'Prochorowka',
1602 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/Prochorowka',
1603 'artist': 'Panjabi MC',
1604 'track': 'Beware of the Boys (Mundian to Bach Ke) - Motivo Hi-Lectro Remix',
1605 'album': 'Beware of the Boys (Mundian To Bach Ke)',
1606 },
1607 'params': {
1608 'skip_download': True,
1609 },
1610 'skip': 'Video unavailable',
1611 },
1612 {
1613 # empty description results in an empty string
1614 'url': 'https://www.youtube.com/watch?v=x41yOUIvK2k',
1615 'info_dict': {
1616 'id': 'x41yOUIvK2k',
1617 'ext': 'mp4',
1618 'title': 'IMG 3456',
1619 'description': '',
1620 'upload_date': '20170613',
1621 'uploader_id': 'ElevageOrVert',
1622 'uploader': 'ElevageOrVert',
1623 },
1624 'params': {
1625 'skip_download': True,
1626 },
1627 },
1628 {
1629 # with '};' inside yt initial data (see [1])
1630 # see [2] for an example with '};' inside ytInitialPlayerResponse
1631 # 1. https://github.com/ytdl-org/youtube-dl/issues/27093
1632 # 2. https://github.com/ytdl-org/youtube-dl/issues/27216
1633 'url': 'https://www.youtube.com/watch?v=CHqg6qOn4no',
1634 'info_dict': {
1635 'id': 'CHqg6qOn4no',
1636 'ext': 'mp4',
1637 'title': 'Part 77 Sort a list of simple types in c#',
1638 'description': 'md5:b8746fa52e10cdbf47997903f13b20dc',
1639 'upload_date': '20130831',
1640 'uploader_id': 'kudvenkat',
1641 'uploader': 'kudvenkat',
1642 },
1643 'params': {
1644 'skip_download': True,
1645 },
1646 },
1647 {
1648 # another example of '};' in ytInitialData
1649 'url': 'https://www.youtube.com/watch?v=gVfgbahppCY',
1650 'only_matching': True,
1651 },
1652 {
1653 'url': 'https://www.youtube.com/watch_popup?v=63RmMXCd_bQ',
1654 'only_matching': True,
1655 },
1656 {
1657 # https://github.com/ytdl-org/youtube-dl/pull/28094
1658 'url': 'OtqTfy26tG0',
1659 'info_dict': {
1660 'id': 'OtqTfy26tG0',
1661 'ext': 'mp4',
1662 'title': 'Burn Out',
1663 'description': 'md5:8d07b84dcbcbfb34bc12a56d968b6131',
1664 'upload_date': '20141120',
1665 'uploader': 'The Cinematic Orchestra - Topic',
1666 'uploader_id': 'UCIzsJBIyo8hhpFm1NK0uLgw',
1667 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UCIzsJBIyo8hhpFm1NK0uLgw',
1668 'artist': 'The Cinematic Orchestra',
1669 'track': 'Burn Out',
1670 'album': 'Every Day',
1671 'release_data': None,
1672 'release_year': None,
1673 },
1674 'params': {
1675 'skip_download': True,
1676 },
1677 },
1678 {
1679 # controversial video, only works with bpctr when authenticated with cookies
1680 'url': 'https://www.youtube.com/watch?v=nGC3D_FkCmg',
1681 'only_matching': True,
1682 },
1683 {
1684 # controversial video, requires bpctr/contentCheckOk
1685 'url': 'https://www.youtube.com/watch?v=SZJvDhaSDnc',
1686 'info_dict': {
1687 'id': 'SZJvDhaSDnc',
1688 'ext': 'mp4',
1689 'title': 'San Diego teen commits suicide after bullying over embarrassing video',
1690 'channel_id': 'UC-SJ6nODDmufqBzPBwCvYvQ',
1691 'uploader': 'CBS This Morning',
1692 'uploader_id': 'CBSThisMorning',
1693 'upload_date': '20140716',
1694 'description': 'md5:acde3a73d3f133fc97e837a9f76b53b7'
1695 }
1696 },
1697 {
1698 # restricted location, https://github.com/ytdl-org/youtube-dl/issues/28685
1699 'url': 'cBvYw8_A0vQ',
1700 'info_dict': {
1701 'id': 'cBvYw8_A0vQ',
1702 'ext': 'mp4',
1703 'title': '4K Ueno Okachimachi Street Scenes 上野御徒町歩き',
1704 'description': 'md5:ea770e474b7cd6722b4c95b833c03630',
1705 'upload_date': '20201120',
1706 'uploader': 'Walk around Japan',
1707 'uploader_id': 'UC3o_t8PzBmXf5S9b7GLx1Mw',
1708 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UC3o_t8PzBmXf5S9b7GLx1Mw',
1709 },
1710 'params': {
1711 'skip_download': True,
1712 },
1713 }, {
1714 # Has multiple audio streams
1715 'url': 'WaOKSUlf4TM',
1716 'only_matching': True
1717 }, {
1718 # Requires Premium: has format 141 when requested using YTM url
1719 'url': 'https://music.youtube.com/watch?v=XclachpHxis',
1720 'only_matching': True
1721 }, {
1722 # multiple subtitles with same lang_code
1723 'url': 'https://www.youtube.com/watch?v=wsQiKKfKxug',
1724 'only_matching': True,
1725 }, {
1726 # Force use android client fallback
1727 'url': 'https://www.youtube.com/watch?v=YOelRv7fMxY',
1728 'info_dict': {
1729 'id': 'YOelRv7fMxY',
1730 'title': 'DIGGING A SECRET TUNNEL Part 1',
1731 'ext': '3gp',
1732 'upload_date': '20210624',
1733 'channel_id': 'UCp68_FLety0O-n9QU6phsgw',
1734 'uploader': 'colinfurze',
1735 'uploader_id': 'colinfurze',
1736 'channel_url': r're:https?://(?:www\.)?youtube\.com/channel/UCp68_FLety0O-n9QU6phsgw',
1737 'description': 'md5:b5096f56af7ccd7a555c84db81738b22'
1738 },
1739 'params': {
1740 'format': '17', # 3gp format available on android
1741 'extractor_args': {'youtube': {'player_client': ['android']}},
1742 },
1743 },
1744 {
1745 # Skip download of additional client configs (remix client config in this case)
1746 'url': 'https://music.youtube.com/watch?v=MgNrAu2pzNs',
1747 'only_matching': True,
1748 'params': {
1749 'extractor_args': {'youtube': {'player_skip': ['configs']}},
1750 },
1751 }
1752 ]
1753
1754 @classmethod
1755 def suitable(cls, url):
1756 # Hack for lazy extractors until more generic solution is implemented
1757 # (see #28780)
1758 from .youtube import parse_qs
1759 qs = parse_qs(url)
1760 if qs.get('list', [None])[0]:
1761 return False
1762 return super(YoutubeIE, cls).suitable(url)
1763
1764 def __init__(self, *args, **kwargs):
1765 super(YoutubeIE, self).__init__(*args, **kwargs)
1766 self._code_cache = {}
1767 self._player_cache = {}
1768
1769 def _extract_player_url(self, ytcfg=None, webpage=None):
1770 player_url = try_get(ytcfg, (lambda x: x['PLAYER_JS_URL']), str)
1771 if not player_url and webpage:
1772 player_url = self._search_regex(
1773 r'"(?:PLAYER_JS_URL|jsUrl)"\s*:\s*"([^"]+)"',
1774 webpage, 'player URL', fatal=False)
1775 if not player_url:
1776 return None
1777 if player_url.startswith('//'):
1778 player_url = 'https:' + player_url
1779 elif not re.match(r'https?://', player_url):
1780 player_url = compat_urlparse.urljoin(
1781 'https://www.youtube.com', player_url)
1782 return player_url
1783
1784 def _signature_cache_id(self, example_sig):
1785 """ Return a string representation of a signature """
1786 return '.'.join(compat_str(len(part)) for part in example_sig.split('.'))
1787
1788 @classmethod
1789 def _extract_player_info(cls, player_url):
1790 for player_re in cls._PLAYER_INFO_RE:
1791 id_m = re.search(player_re, player_url)
1792 if id_m:
1793 break
1794 else:
1795 raise ExtractorError('Cannot identify player %r' % player_url)
1796 return id_m.group('id')
1797
1798 def _load_player(self, video_id, player_url, fatal=True) -> bool:
1799 player_id = self._extract_player_info(player_url)
1800 if player_id not in self._code_cache:
1801 self._code_cache[player_id] = self._download_webpage(
1802 player_url, video_id, fatal=fatal,
1803 note='Downloading player ' + player_id,
1804 errnote='Download of %s failed' % player_url)
1805 return player_id in self._code_cache
1806
1807 def _extract_signature_function(self, video_id, player_url, example_sig):
1808 player_id = self._extract_player_info(player_url)
1809
1810 # Read from filesystem cache
1811 func_id = 'js_%s_%s' % (
1812 player_id, self._signature_cache_id(example_sig))
1813 assert os.path.basename(func_id) == func_id
1814
1815 cache_spec = self._downloader.cache.load('youtube-sigfuncs', func_id)
1816 if cache_spec is not None:
1817 return lambda s: ''.join(s[i] for i in cache_spec)
1818
1819 if self._load_player(video_id, player_url):
1820 code = self._code_cache[player_id]
1821 res = self._parse_sig_js(code)
1822
1823 test_string = ''.join(map(compat_chr, range(len(example_sig))))
1824 cache_res = res(test_string)
1825 cache_spec = [ord(c) for c in cache_res]
1826
1827 self._downloader.cache.store('youtube-sigfuncs', func_id, cache_spec)
1828 return res
1829
    def _print_sig_code(self, func, example_sig):
        """Print Python source equivalent to the extracted signature function.

        Runs *func* on a probe string, recovers the character permutation it
        applies, and renders it as compact slice/index expressions (used for
        the --youtube-print-sig-code style debugging output).
        """
        def gen_sig_code(idxs):
            # Compress runs of consecutive (step +/-1) indices into slices
            def _genslice(start, end, step):
                starts = '' if start == 0 else str(start)
                ends = (':%d' % (end + step)) if end + step >= 0 else ':'
                steps = '' if step == 1 else (':%d' % step)
                return 's[%s%s%s]' % (starts, ends, steps)

            step = None
            # Quelch pyflakes warnings - start will be set when step is set
            start = '(Never used)'
            for i, prev in zip(idxs[1:], idxs[:-1]):
                if step is not None:
                    # Inside a constant-stride run: extend it or close it out
                    if i - prev == step:
                        continue
                    yield _genslice(start, prev, step)
                    step = None
                    continue
                if i - prev in [-1, 1]:
                    # Two adjacent indices open a new +/-1 stride run
                    step = i - prev
                    start = prev
                    continue
                else:
                    yield 's[%d]' % prev
            # NOTE(review): `i` is the last loop variable — assumes idxs has at
            # least two entries (always true for real signatures)
            if step is None:
                yield 's[%d]' % i
            else:
                yield _genslice(start, i, step)

        # Probe with the identity string chr(0)..chr(n-1) to read off the permutation
        test_string = ''.join(map(compat_chr, range(len(example_sig))))
        cache_res = func(test_string)
        cache_spec = [ord(c) for c in cache_res]
        expr_code = ' + '.join(gen_sig_code(cache_spec))
        signature_id_tuple = '(%s)' % (
            ', '.join(compat_str(len(p)) for p in example_sig.split('.')))
        code = ('if tuple(len(p) for p in s.split(\'.\')) == %s:\n'
                '    return %s\n') % (signature_id_tuple, expr_code)
        self.to_screen('Extracted signature function:\n' + code)
1868
    def _parse_sig_js(self, jscode):
        """Locate the signature-decrypt function in player JS and wrap it.

        Tries a list of regexes (newest player layouts first) to find the
        function name, then interprets it with JSInterpreter. Returns a
        callable mapping an encrypted signature string to the decrypted one.
        """
        funcname = self._search_regex(
            (r'\b[cs]\s*&&\s*[adf]\.set\([^,]+\s*,\s*encodeURIComponent\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\(',
             r'\b[a-zA-Z0-9]+\s*&&\s*[a-zA-Z0-9]+\.set\([^,]+\s*,\s*encodeURIComponent\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\(',
             r'\bm=(?P<sig>[a-zA-Z0-9$]{2})\(decodeURIComponent\(h\.s\)\)',
             r'\bc&&\(c=(?P<sig>[a-zA-Z0-9$]{2})\(decodeURIComponent\(c\)\)',
             r'(?:\b|[^a-zA-Z0-9$])(?P<sig>[a-zA-Z0-9$]{2})\s*=\s*function\(\s*a\s*\)\s*{\s*a\s*=\s*a\.split\(\s*""\s*\);[a-zA-Z0-9$]{2}\.[a-zA-Z0-9$]{2}\(a,\d+\)',
             r'(?:\b|[^a-zA-Z0-9$])(?P<sig>[a-zA-Z0-9$]{2})\s*=\s*function\(\s*a\s*\)\s*{\s*a\s*=\s*a\.split\(\s*""\s*\)',
             r'(?P<sig>[a-zA-Z0-9$]+)\s*=\s*function\(\s*a\s*\)\s*{\s*a\s*=\s*a\.split\(\s*""\s*\)',
             # Obsolete patterns
             r'(["\'])signature\1\s*,\s*(?P<sig>[a-zA-Z0-9$]+)\(',
             r'\.sig\|\|(?P<sig>[a-zA-Z0-9$]+)\(',
             r'yt\.akamaized\.net/\)\s*\|\|\s*.*?\s*[cs]\s*&&\s*[adf]\.set\([^,]+\s*,\s*(?:encodeURIComponent\s*\()?\s*(?P<sig>[a-zA-Z0-9$]+)\(',
             r'\b[cs]\s*&&\s*[adf]\.set\([^,]+\s*,\s*(?P<sig>[a-zA-Z0-9$]+)\(',
             r'\b[a-zA-Z0-9]+\s*&&\s*[a-zA-Z0-9]+\.set\([^,]+\s*,\s*(?P<sig>[a-zA-Z0-9$]+)\(',
             r'\bc\s*&&\s*a\.set\([^,]+\s*,\s*\([^)]*\)\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\(',
             r'\bc\s*&&\s*[a-zA-Z0-9]+\.set\([^,]+\s*,\s*\([^)]*\)\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\(',
             r'\bc\s*&&\s*[a-zA-Z0-9]+\.set\([^,]+\s*,\s*\([^)]*\)\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\('),
            jscode, 'Initial JS player signature function name', group='sig')

        jsi = JSInterpreter(jscode)
        initial_function = jsi.extract_function(funcname)
        # The JS function takes its argument as a single-element args list
        return lambda s: initial_function([s])
1892
1893 def _decrypt_signature(self, s, video_id, player_url):
1894 """Turn the encrypted s field into a working signature"""
1895
1896 if player_url is None:
1897 raise ExtractorError('Cannot decrypt signature without player_url')
1898
1899 try:
1900 player_id = (player_url, self._signature_cache_id(s))
1901 if player_id not in self._player_cache:
1902 func = self._extract_signature_function(
1903 video_id, player_url, s
1904 )
1905 self._player_cache[player_id] = func
1906 func = self._player_cache[player_id]
1907 if self.get_param('youtube_print_sig_code'):
1908 self._print_sig_code(func, s)
1909 return func(s)
1910 except Exception as e:
1911 tb = traceback.format_exc()
1912 raise ExtractorError(
1913 'Signature extraction failed: ' + tb, cause=e)
1914
1915 def _extract_signature_timestamp(self, video_id, player_url, ytcfg=None, fatal=False):
1916 """
1917 Extract signatureTimestamp (sts)
1918 Required to tell API what sig/player version is in use.
1919 """
1920 sts = None
1921 if isinstance(ytcfg, dict):
1922 sts = int_or_none(ytcfg.get('STS'))
1923
1924 if not sts:
1925 # Attempt to extract from player
1926 if player_url is None:
1927 error_msg = 'Cannot extract signature timestamp without player_url.'
1928 if fatal:
1929 raise ExtractorError(error_msg)
1930 self.report_warning(error_msg)
1931 return
1932 if self._load_player(video_id, player_url, fatal=fatal):
1933 player_id = self._extract_player_info(player_url)
1934 code = self._code_cache[player_id]
1935 sts = int_or_none(self._search_regex(
1936 r'(?:signatureTimestamp|sts)\s*:\s*(?P<sts>[0-9]{5})', code,
1937 'JS player signature timestamp', group='sts', fatal=fatal))
1938 return sts
1939
1940 def _mark_watched(self, video_id, player_responses):
1941 playback_url = traverse_obj(
1942 player_responses, (..., 'playbackTracking', 'videostatsPlaybackUrl', 'baseUrl'),
1943 expected_type=url_or_none, get_all=False)
1944 if not playback_url:
1945 self.report_warning('Unable to mark watched')
1946 return
1947 parsed_playback_url = compat_urlparse.urlparse(playback_url)
1948 qs = compat_urlparse.parse_qs(parsed_playback_url.query)
1949
1950 # cpn generation algorithm is reverse engineered from base.js.
1951 # In fact it works even with dummy cpn.
1952 CPN_ALPHABET = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789-_'
1953 cpn = ''.join((CPN_ALPHABET[random.randint(0, 256) & 63] for _ in range(0, 16)))
1954
1955 qs.update({
1956 'ver': ['2'],
1957 'cpn': [cpn],
1958 })
1959 playback_url = compat_urlparse.urlunparse(
1960 parsed_playback_url._replace(query=compat_urllib_parse_urlencode(qs, True)))
1961
1962 self._download_webpage(
1963 playback_url, video_id, 'Marking watched',
1964 'Unable to mark watched', fatal=False)
1965
1966 @staticmethod
1967 def _extract_urls(webpage):
1968 # Embedded YouTube player
1969 entries = [
1970 unescapeHTML(mobj.group('url'))
1971 for mobj in re.finditer(r'''(?x)
1972 (?:
1973 <iframe[^>]+?src=|
1974 data-video-url=|
1975 <embed[^>]+?src=|
1976 embedSWF\(?:\s*|
1977 <object[^>]+data=|
1978 new\s+SWFObject\(
1979 )
1980 (["\'])
1981 (?P<url>(?:https?:)?//(?:www\.)?youtube(?:-nocookie)?\.com/
1982 (?:embed|v|p)/[0-9A-Za-z_-]{11}.*?)
1983 \1''', webpage)]
1984
1985 # lazyYT YouTube embed
1986 entries.extend(list(map(
1987 unescapeHTML,
1988 re.findall(r'class="lazyYT" data-youtube-id="([^"]+)"', webpage))))
1989
1990 # Wordpress "YouTube Video Importer" plugin
1991 matches = re.findall(r'''(?x)<div[^>]+
1992 class=(?P<q1>[\'"])[^\'"]*\byvii_single_video_player\b[^\'"]*(?P=q1)[^>]+
1993 data-video_id=(?P<q2>[\'"])([^\'"]+)(?P=q2)''', webpage)
1994 entries.extend(m[-1] for m in matches)
1995
1996 return entries
1997
1998 @staticmethod
1999 def _extract_url(webpage):
2000 urls = YoutubeIE._extract_urls(webpage)
2001 return urls[0] if urls else None
2002
2003 @classmethod
2004 def extract_id(cls, url):
2005 mobj = re.match(cls._VALID_URL, url, re.VERBOSE)
2006 if mobj is None:
2007 raise ExtractorError('Invalid URL: %s' % url)
2008 video_id = mobj.group(2)
2009 return video_id
2010
2011 def _extract_chapters_from_json(self, data, duration):
2012 chapter_list = traverse_obj(
2013 data, (
2014 'playerOverlays', 'playerOverlayRenderer', 'decoratedPlayerBarRenderer',
2015 'decoratedPlayerBarRenderer', 'playerBar', 'chapteredPlayerBarRenderer', 'chapters'
2016 ), expected_type=list)
2017
2018 return self._extract_chapters(
2019 chapter_list,
2020 chapter_time=lambda chapter: float_or_none(
2021 traverse_obj(chapter, ('chapterRenderer', 'timeRangeStartMillis')), scale=1000),
2022 chapter_title=lambda chapter: traverse_obj(
2023 chapter, ('chapterRenderer', 'title', 'simpleText'), expected_type=str),
2024 duration=duration)
2025
2026 def _extract_chapters_from_engagement_panel(self, data, duration):
2027 content_list = traverse_obj(
2028 data,
2029 ('engagementPanels', ..., 'engagementPanelSectionListRenderer', 'content', 'macroMarkersListRenderer', 'contents'),
2030 expected_type=list, default=[])
2031 chapter_time = lambda chapter: parse_duration(self._get_text(chapter.get('timeDescription')))
2032 chapter_title = lambda chapter: self._get_text(chapter.get('title'))
2033
2034 return next((
2035 filter(None, (
2036 self._extract_chapters(
2037 traverse_obj(contents, (..., 'macroMarkersListItemRenderer')),
2038 chapter_time, chapter_title, duration)
2039 for contents in content_list
2040 ))), [])
2041
2042 def _extract_chapters(self, chapter_list, chapter_time, chapter_title, duration):
2043 chapters = []
2044 last_chapter = {'start_time': 0}
2045 for idx, chapter in enumerate(chapter_list or []):
2046 title = chapter_title(chapter)
2047 start_time = chapter_time(chapter)
2048 if start_time is None:
2049 continue
2050 last_chapter['end_time'] = start_time
2051 if start_time < last_chapter['start_time']:
2052 if idx == 1:
2053 chapters.pop()
2054 self.report_warning('Invalid start time for chapter "%s"' % last_chapter['title'])
2055 else:
2056 self.report_warning(f'Invalid start time for chapter "{title}"')
2057 continue
2058 last_chapter = {'start_time': start_time, 'title': title}
2059 chapters.append(last_chapter)
2060 last_chapter['end_time'] = duration
2061 return chapters
2062
2063 def _extract_yt_initial_variable(self, webpage, regex, video_id, name):
2064 return self._parse_json(self._search_regex(
2065 (r'%s\s*%s' % (regex, self._YT_INITIAL_BOUNDARY_RE),
2066 regex), webpage, name, default='{}'), video_id, fatal=False)
2067
2068 @staticmethod
2069 def parse_time_text(time_text):
2070 """
2071 Parse the comment time text
2072 time_text is in the format 'X units ago (edited)'
2073 """
2074 time_text_split = time_text.split(' ')
2075 if len(time_text_split) >= 3:
2076 try:
2077 return datetime_from_str('now-%s%s' % (time_text_split[0], time_text_split[1]), precision='auto')
2078 except ValueError:
2079 return None
2080
2081 def _extract_comment(self, comment_renderer, parent=None):
2082 comment_id = comment_renderer.get('commentId')
2083 if not comment_id:
2084 return
2085
2086 text = self._get_text(comment_renderer.get('contentText'))
2087
2088 # note: timestamp is an estimate calculated from the current time and time_text
2089 time_text = self._get_text(comment_renderer.get('publishedTimeText')) or ''
2090 time_text_dt = self.parse_time_text(time_text)
2091 if isinstance(time_text_dt, datetime.datetime):
2092 timestamp = calendar.timegm(time_text_dt.timetuple())
2093 author = self._get_text(comment_renderer.get('authorText'))
2094 author_id = try_get(comment_renderer,
2095 lambda x: x['authorEndpoint']['browseEndpoint']['browseId'], compat_str)
2096
2097 votes = parse_count(try_get(comment_renderer, (lambda x: x['voteCount']['simpleText'],
2098 lambda x: x['likeCount']), compat_str)) or 0
2099 author_thumbnail = try_get(comment_renderer,
2100 lambda x: x['authorThumbnail']['thumbnails'][-1]['url'], compat_str)
2101
2102 author_is_uploader = try_get(comment_renderer, lambda x: x['authorIsChannelOwner'], bool)
2103 is_favorited = 'creatorHeart' in (try_get(
2104 comment_renderer, lambda x: x['actionButtons']['commentActionButtonsRenderer'], dict) or {})
2105 return {
2106 'id': comment_id,
2107 'text': text,
2108 'timestamp': timestamp,
2109 'time_text': time_text,
2110 'like_count': votes,
2111 'is_favorited': is_favorited,
2112 'author': author,
2113 'author_id': author_id,
2114 'author_thumbnail': author_thumbnail,
2115 'author_is_uploader': author_is_uploader,
2116 'parent': parent or 'root'
2117 }
2118
    def _comment_entries(self, root_continuation_data, identity_token, account_syncid,
                         ytcfg, video_id, parent=None, comment_counts=None):
        """Generator over comment info dicts for a video (or a reply thread).

        Follows the continuation chain starting at root_continuation_data and
        yields each extracted comment, recursing into reply threads (YouTube
        comments have a max depth of 2). Integer items may also be yielded:
        they are the estimated total comment count parsed from the section
        header, and callers must filter them out. comment_counts is the
        mutable [downloaded so far, estimated total, current thread #] state
        shared with recursive calls.
        """

        def extract_header(contents):
            # Parse the comments-section header: total comment count and the
            # continuation matching the requested sort order (top/new)
            _total_comments = 0
            _continuation = None
            for content in contents:
                comments_header_renderer = try_get(content, lambda x: x['commentsHeaderRenderer'])
                expected_comment_count = parse_count(self._get_text(
                    comments_header_renderer, (lambda x: x['countText'], lambda x: x['commentsCount']), max_runs=1))

                if expected_comment_count:
                    comment_counts[1] = expected_comment_count
                    self.to_screen('Downloading ~%d comments' % expected_comment_count)
                    _total_comments = comment_counts[1]
                sort_mode_str = self._configuration_arg('comment_sort', [''])[0]
                comment_sort_index = int(sort_mode_str != 'top')  # 1 = new, 0 = top

                sort_menu_item = try_get(
                    comments_header_renderer,
                    lambda x: x['sortMenu']['sortFilterSubMenuRenderer']['subMenuItems'][comment_sort_index], dict) or {}
                sort_continuation_ep = sort_menu_item.get('serviceEndpoint') or {}

                _continuation = self._extract_continuation_ep_data(sort_continuation_ep) or self._extract_continuation(sort_menu_item)
                if not _continuation:
                    continue

                sort_text = sort_menu_item.get('title')
                if isinstance(sort_text, compat_str):
                    sort_text = sort_text.lower()
                else:
                    sort_text = 'top comments' if comment_sort_index == 0 else 'newest first'
                self.to_screen('Sorting comments by %s' % sort_text)
                break
            return _total_comments, _continuation

        def extract_thread(contents):
            # Yield every comment in the thread, then recurse into its replies
            if not parent:
                comment_counts[2] = 0
            for content in contents:
                comment_thread_renderer = try_get(content, lambda x: x['commentThreadRenderer'])
                comment_renderer = try_get(
                    comment_thread_renderer, (lambda x: x['comment']['commentRenderer'], dict)) or try_get(
                    content, (lambda x: x['commentRenderer'], dict))

                if not comment_renderer:
                    continue
                comment = self._extract_comment(comment_renderer, parent)
                if not comment:
                    continue
                comment_counts[0] += 1
                yield comment
                # Attempt to get the replies
                comment_replies_renderer = try_get(
                    comment_thread_renderer, lambda x: x['replies']['commentRepliesRenderer'], dict)

                if comment_replies_renderer:
                    comment_counts[2] += 1
                    comment_entries_iter = self._comment_entries(
                        comment_replies_renderer, identity_token, account_syncid, ytcfg,
                        video_id, parent=comment.get('id'), comment_counts=comment_counts)

                    for reply_comment in comment_entries_iter:
                        yield reply_comment

        # YouTube comments have a max depth of 2
        max_depth = int_or_none(self._configuration_arg('max_comment_depth', [''])[0]) or float('inf')
        if max_depth == 1 and parent:
            return
        if not comment_counts:
            # comment so far, est. total comments, current comment thread #
            comment_counts = [0, 0, 0]

        continuation = self._extract_continuation(root_continuation_data)
        if continuation and len(continuation['continuation']) < 27:
            # Tokens shorter than 27 chars belong to the old API and must be regenerated
            self.write_debug('Detected old API continuation token. Generating new API compatible token.')
            continuation_token = self._generate_comment_continuation(video_id)
            continuation = self._build_api_continuation_query(continuation_token, None)

        visitor_data = None
        is_first_continuation = parent is None

        for page_num in itertools.count(0):
            if not continuation:
                break
            headers = self.generate_api_headers(ytcfg, identity_token, account_syncid, visitor_data)
            comment_prog_str = '(%d/%d)' % (comment_counts[0], comment_counts[1])
            if page_num == 0:
                if is_first_continuation:
                    note_prefix = 'Downloading comment section API JSON'
                else:
                    note_prefix = '    Downloading comment API JSON reply thread %d %s' % (
                        comment_counts[2], comment_prog_str)
            else:
                note_prefix = '%sDownloading comment%s API JSON page %d %s' % (
                    '       ' if parent else '', ' replies' if parent else '',
                    page_num, comment_prog_str)

            response = self._extract_response(
                item_id=None, query=continuation,
                ep='next', ytcfg=ytcfg, headers=headers, note=note_prefix,
                check_get_keys=('onResponseReceivedEndpoints', 'continuationContents'))
            if not response:
                break
            # Carry visitorData between requests so pagination stays consistent
            visitor_data = try_get(
                response,
                lambda x: x['responseContext']['webResponseContextExtensionData']['ytConfigData']['visitorData'],
                compat_str) or visitor_data

            continuation_contents = dict_get(response, ('onResponseReceivedEndpoints', 'continuationContents'))

            continuation = None
            if isinstance(continuation_contents, list):
                for continuation_section in continuation_contents:
                    if not isinstance(continuation_section, dict):
                        continue
                    continuation_items = try_get(
                        continuation_section,
                        (lambda x: x['reloadContinuationItemsCommand']['continuationItems'],
                         lambda x: x['appendContinuationItemsAction']['continuationItems']),
                        list) or []
                    if is_first_continuation:
                        total_comments, continuation = extract_header(continuation_items)
                        if total_comments:
                            yield total_comments
                        is_first_continuation = False
                        if continuation:
                            break
                        continue
                    count = 0
                    for count, entry in enumerate(extract_thread(continuation_items)):
                        yield entry
                    continuation = self._extract_continuation({'contents': continuation_items})
                    if continuation:
                        # Sometimes YouTube provides a continuation without any comments
                        # In most cases we end up just downloading these with very little comments to come.
                        if count == 0:
                            if not parent:
                                self.report_warning('No comments received - assuming end of comments')
                            continuation = None
                        break

            # Deprecated response structure
            elif isinstance(continuation_contents, dict):
                known_continuation_renderers = ('itemSectionContinuation', 'commentRepliesContinuation')
                for key, continuation_renderer in continuation_contents.items():
                    if key not in known_continuation_renderers:
                        continue
                    if not isinstance(continuation_renderer, dict):
                        continue
                    if is_first_continuation:
                        header_continuation_items = [continuation_renderer.get('header') or {}]
                        total_comments, continuation = extract_header(header_continuation_items)
                        if total_comments:
                            yield total_comments
                        is_first_continuation = False
                        if continuation:
                            break

                    # Sometimes YouTube provides a continuation without any comments
                    # In most cases we end up just downloading these with very little comments to come.
                    count = 0
                    for count, entry in enumerate(extract_thread(continuation_renderer.get('contents') or {})):
                        yield entry
                    continuation = self._extract_continuation(continuation_renderer)
                    if count == 0:
                        if not parent:
                            self.report_warning('No comments received - assuming end of comments')
                        continuation = None
                    break
2289
2290 @staticmethod
2291 def _generate_comment_continuation(video_id):
2292 """
2293 Generates initial comment section continuation token from given video id
2294 """
2295 b64_vid_id = base64.b64encode(bytes(video_id.encode('utf-8')))
2296 parts = ('Eg0SCw==', b64_vid_id, 'GAYyJyIRIgs=', b64_vid_id, 'MAB4AjAAQhBjb21tZW50cy1zZWN0aW9u')
2297 new_continuation_intlist = list(itertools.chain.from_iterable(
2298 [bytes_to_intlist(base64.b64decode(part)) for part in parts]))
2299 return base64.b64encode(intlist_to_bytes(new_continuation_intlist)).decode('utf-8')
2300
2301 def _extract_comments(self, ytcfg, video_id, contents, webpage):
2302 """Entry for comment extraction"""
2303 def _real_comment_extract(contents):
2304 if isinstance(contents, list):
2305 for entry in contents:
2306 for key, renderer in entry.items():
2307 if key not in known_entry_comment_renderers:
2308 continue
2309 yield from self._comment_entries(
2310 renderer, video_id=video_id, ytcfg=ytcfg,
2311 identity_token=self._extract_identity_token(webpage, item_id=video_id),
2312 account_syncid=self._extract_account_syncid(ytcfg))
2313 break
2314 comments = []
2315 known_entry_comment_renderers = ('itemSectionRenderer',)
2316 estimated_total = 0
2317 max_comments = int_or_none(self._configuration_arg('max_comments', [''])[0]) or float('inf')
2318
2319 try:
2320 for comment in _real_comment_extract(contents):
2321 if len(comments) >= max_comments:
2322 break
2323 if isinstance(comment, int):
2324 estimated_total = comment
2325 continue
2326 comments.append(comment)
2327 except KeyboardInterrupt:
2328 self.to_screen('Interrupted by user')
2329 self.to_screen('Downloaded %d/%d comments' % (len(comments), estimated_total))
2330 return {
2331 'comments': comments,
2332 'comment_count': len(comments),
2333 }
2334
2335 @staticmethod
2336 def _generate_player_context(sts=None):
2337 context = {
2338 'html5Preference': 'HTML5_PREF_WANTS',
2339 }
2340 if sts is not None:
2341 context['signatureTimestamp'] = sts
2342 return {
2343 'playbackContext': {
2344 'contentPlaybackContext': context
2345 },
2346 'contentCheckOk': True,
2347 'racyCheckOk': True
2348 }
2349
2350 @staticmethod
2351 def _get_video_info_params(video_id, client='TVHTML5'):
2352 GVI_CLIENTS = {
2353 'ANDROID': {
2354 'c': 'ANDROID',
2355 'cver': '16.20',
2356 },
2357 'TVHTML5': {
2358 'c': 'TVHTML5',
2359 'cver': '6.20180913',
2360 },
2361 'IOS': {
2362 'c': 'IOS',
2363 'cver': '16.20'
2364 }
2365 }
2366 query = {
2367 'video_id': video_id,
2368 'eurl': 'https://youtube.googleapis.com/v/' + video_id,
2369 'html5': '1'
2370 }
2371 query.update(GVI_CLIENTS.get(client))
2372 return query
2373
2374 def _extract_player_response(self, client, video_id, master_ytcfg, player_ytcfg, identity_token, player_url, initial_pr):
2375
2376 session_index = self._extract_session_index(player_ytcfg, master_ytcfg)
2377 syncid = self._extract_account_syncid(player_ytcfg, master_ytcfg, initial_pr)
2378 sts = self._extract_signature_timestamp(video_id, player_url, master_ytcfg, fatal=False)
2379 headers = self.generate_api_headers(
2380 player_ytcfg, identity_token, syncid,
2381 default_client=self._YT_CLIENTS[client], session_index=session_index)
2382
2383 yt_query = {'videoId': video_id}
2384 yt_query.update(self._generate_player_context(sts))
2385 return self._extract_response(
2386 item_id=video_id, ep='player', query=yt_query,
2387 ytcfg=player_ytcfg, headers=headers, fatal=False,
2388 default_client=self._YT_CLIENTS[client],
2389 note='Downloading %s player API JSON' % client.replace('_', ' ').strip()
2390 ) or None
2391
    def _extract_age_gated_player_response(self, client, video_id, ytcfg, identity_token, player_url, initial_pr):
        """Retry the /player request for an age-gated video via the embed workaround.

        Returns a player response JSON, or None when no workaround applies.
        """
        # get_video_info endpoint seems to be completely dead
        gvi_client = None  # self._YT_CLIENTS.get(f'_{client}_agegate')
        if gvi_client:
            # Dead branch kept for reference: fetch player_response via get_video_info
            pr = self._parse_json(traverse_obj(
                compat_parse_qs(self._download_webpage(
                    self.http_scheme() + '//www.youtube.com/get_video_info', video_id,
                    'Refetching age-gated %s info webpage' % gvi_client.lower(),
                    'unable to download video info webpage', fatal=False,
                    query=self._get_video_info_params(video_id, client=gvi_client))),
                ('player_response', 0), expected_type=str) or '{}', video_id)
            if pr:
                return pr
            self.report_warning('Falling back to embedded-only age-gate workaround')

        # The workaround needs an '_<client>_embedded' variant to exist
        if not self._YT_CLIENTS.get(f'_{client}_embedded'):
            return
        embed_webpage = None
        if client == 'web' and 'configs' not in self._configuration_arg('player_skip'):
            embed_webpage = self._download_webpage(
                'https://www.youtube.com/embed/%s?html5=1' % video_id,
                video_id=video_id, note=f'Downloading age-gated {client} embed config')

        ytcfg_age = self.extract_ytcfg(video_id, embed_webpage) or {}
        # If we extracted the embed webpage, it'll tell us if we can view the video
        embedded_pr = self._parse_json(
            traverse_obj(ytcfg_age, ('PLAYER_VARS', 'embedded_player_response'), expected_type=str) or '{}',
            video_id=video_id)
        embedded_ps_reason = traverse_obj(embedded_pr, ('playabilityStatus', 'reason'), expected_type=str) or ''
        # Still age-gated even when embedded: the workaround cannot help
        if embedded_ps_reason in self._AGE_GATE_REASONS:
            return
        return self._extract_player_response(
            f'_{client}_embedded', video_id,
            ytcfg_age or ytcfg, ytcfg_age if client == 'web' else {},
            identity_token, player_url, initial_pr)
2427
2428 def _get_requested_clients(self, url, smuggled_data):
2429 requested_clients = [client for client in self._configuration_arg('player_client')
2430 if client[:0] != '_' and client in self._YT_CLIENTS]
2431 if not requested_clients:
2432 requested_clients = ['android', 'web']
2433
2434 if smuggled_data.get('is_music_url') or self.is_music_url(url):
2435 requested_clients.extend(
2436 f'{client}_music' for client in requested_clients if not client.endswith('_music'))
2437
2438 return orderedSet(requested_clients)
2439
    def _extract_player_responses(self, clients, video_id, webpage, master_ytcfg, player_url, identity_token):
        """Yield player response JSONs for each requested client.

        Reuses the webpage-embedded initial response for the 'web' client and
        falls back to the age-gate workaround when a response reports an
        age-gate playability reason.
        """
        initial_pr = None
        if webpage:
            initial_pr = self._extract_yt_initial_variable(
                webpage, self._YT_INITIAL_PLAYER_RESPONSE_RE,
                video_id, 'initial player response')

        for client in clients:
            player_ytcfg = master_ytcfg if client == 'web' else {}
            if client == 'web' and initial_pr:
                # The watch page already contains the web player response
                pr = initial_pr
            else:
                if client == 'web_music' and 'configs' not in self._configuration_arg('player_skip'):
                    ytm_webpage = self._download_webpage(
                        'https://music.youtube.com',
                        video_id, fatal=False, note='Downloading remix client config')
                    player_ytcfg = self.extract_ytcfg(video_id, ytm_webpage) or {}
                pr = self._extract_player_response(
                    client, video_id, player_ytcfg or master_ytcfg, player_ytcfg, identity_token, player_url, initial_pr)
            if pr:
                yield pr
            # Age-gated: retry through the embedded-client workaround
            if traverse_obj(pr, ('playabilityStatus', 'reason')) in self._AGE_GATE_REASONS:
                pr = self._extract_age_gated_player_response(
                    client, video_id, player_ytcfg or master_ytcfg, identity_token, player_url, initial_pr)
                if pr:
                    yield pr
        # Android player_response does not have microFormats which are needed for
        # extraction of some data. So we return the initial_pr with formats
        # stripped out even if not requested by the user
        # See: https://github.com/yt-dlp/yt-dlp/issues/501
        if initial_pr and 'web' not in clients:
            initial_pr['streamingData'] = None
            yield initial_pr
2473
2474 def _extract_formats(self, streaming_data, video_id, player_url, is_live):
2475 itags, stream_ids = [], []
2476 itag_qualities = {}
2477 q = qualities([
2478 # "tiny" is the smallest video-only format. But some audio-only formats
2479 # was also labeled "tiny". It is not clear if such formats still exist
2480 'tiny', 'audio_quality_low', 'audio_quality_medium', 'audio_quality_high', # Audio only formats
2481 'small', 'medium', 'large', 'hd720', 'hd1080', 'hd1440', 'hd2160', 'hd2880', 'highres'
2482 ])
2483 streaming_formats = traverse_obj(streaming_data, (..., ('formats', 'adaptiveFormats'), ...), default=[])
2484
2485 for fmt in streaming_formats:
2486 if fmt.get('targetDurationSec') or fmt.get('drmFamilies'):
2487 continue
2488
2489 itag = str_or_none(fmt.get('itag'))
2490 audio_track = fmt.get('audioTrack') or {}
2491 stream_id = '%s.%s' % (itag or '', audio_track.get('id', ''))
2492 if stream_id in stream_ids:
2493 continue
2494
2495 quality = fmt.get('quality')
2496 if quality == 'tiny' or not quality:
2497 quality = fmt.get('audioQuality', '').lower() or quality
2498 if itag and quality:
2499 itag_qualities[itag] = quality
2500 # FORMAT_STREAM_TYPE_OTF(otf=1) requires downloading the init fragment
2501 # (adding `&sq=0` to the URL) and parsing emsg box to determine the
2502 # number of fragment that would subsequently requested with (`&sq=N`)
2503 if fmt.get('type') == 'FORMAT_STREAM_TYPE_OTF':
2504 continue
2505
2506 fmt_url = fmt.get('url')
2507 if not fmt_url:
2508 sc = compat_parse_qs(fmt.get('signatureCipher'))
2509 fmt_url = url_or_none(try_get(sc, lambda x: x['url'][0]))
2510 encrypted_sig = try_get(sc, lambda x: x['s'][0])
2511 if not (sc and fmt_url and encrypted_sig):
2512 continue
2513 if not player_url:
2514 continue
2515 signature = self._decrypt_signature(sc['s'][0], video_id, player_url)
2516 sp = try_get(sc, lambda x: x['sp'][0]) or 'signature'
2517 fmt_url += '&' + sp + '=' + signature
2518
2519 if itag:
2520 itags.append(itag)
2521 stream_ids.append(stream_id)
2522
2523 tbr = float_or_none(
2524 fmt.get('averageBitrate') or fmt.get('bitrate'), 1000)
2525 dct = {
2526 'asr': int_or_none(fmt.get('audioSampleRate')),
2527 'filesize': int_or_none(fmt.get('contentLength')),
2528 'format_id': itag,
2529 'format_note': ', '.join(filter(None, (
2530 audio_track.get('displayName'), fmt.get('qualityLabel') or quality))),
2531 'fps': int_or_none(fmt.get('fps')),
2532 'height': int_or_none(fmt.get('height')),
2533 'quality': q(quality),
2534 'tbr': tbr,
2535 'url': fmt_url,
2536 'width': fmt.get('width'),
2537 'language': audio_track.get('id', '').split('.')[0],
2538 }
2539 mime_mobj = re.match(
2540 r'((?:[^/]+)/(?:[^;]+))(?:;\s*codecs="([^"]+)")?', fmt.get('mimeType') or '')
2541 if mime_mobj:
2542 dct['ext'] = mimetype2ext(mime_mobj.group(1))
2543 dct.update(parse_codecs(mime_mobj.group(2)))
2544 # The 3gp format in android client has a quality of "small",
2545 # but is actually worse than all other formats
2546 if dct['ext'] == '3gp':
2547 dct['quality'] = q('tiny')
2548 dct['preference'] = -10
2549 no_audio = dct.get('acodec') == 'none'
2550 no_video = dct.get('vcodec') == 'none'
2551 if no_audio:
2552 dct['vbr'] = tbr
2553 if no_video:
2554 dct['abr'] = tbr
2555 if no_audio or no_video:
2556 dct['downloader_options'] = {
2557 # Youtube throttles chunks >~10M
2558 'http_chunk_size': 10485760,
2559 }
2560 if dct.get('ext'):
2561 dct['container'] = dct['ext'] + '_dash'
2562 yield dct
2563
2564 skip_manifests = self._configuration_arg('skip')
2565 get_dash = not is_live and 'dash' not in skip_manifests and self.get_param('youtube_include_dash_manifest', True)
2566 get_hls = 'hls' not in skip_manifests and self.get_param('youtube_include_hls_manifest', True)
2567
2568 for sd in streaming_data:
2569 hls_manifest_url = get_hls and sd.get('hlsManifestUrl')
2570 if hls_manifest_url:
2571 for f in self._extract_m3u8_formats(
2572 hls_manifest_url, video_id, 'mp4', fatal=False):
2573 itag = self._search_regex(
2574 r'/itag/(\d+)', f['url'], 'itag', default=None)
2575 if itag in itags:
2576 continue
2577 if itag:
2578 f['format_id'] = itag
2579 itags.append(itag)
2580 yield f
2581
2582 dash_manifest_url = get_dash and sd.get('dashManifestUrl')
2583 if dash_manifest_url:
2584 for f in self._extract_mpd_formats(
2585 dash_manifest_url, video_id, fatal=False):
2586 itag = f['format_id']
2587 if itag in itags:
2588 continue
2589 if itag:
2590 itags.append(itag)
2591 if itag in itag_qualities:
2592 f['quality'] = q(itag_qualities[itag])
2593 filesize = int_or_none(self._search_regex(
2594 r'/clen/(\d+)', f.get('fragment_base_url')
2595 or f['url'], 'file size', default=None))
2596 if filesize:
2597 f['filesize'] = filesize
2598 yield f
2599
2600 def _real_extract(self, url):
2601 url, smuggled_data = unsmuggle_url(url, {})
2602 video_id = self._match_id(url)
2603
2604 base_url = self.http_scheme() + '//www.youtube.com/'
2605 webpage_url = base_url + 'watch?v=' + video_id
2606 webpage = self._download_webpage(
2607 webpage_url + '&bpctr=9999999999&has_verified=1', video_id, fatal=False)
2608
2609 master_ytcfg = self.extract_ytcfg(video_id, webpage) or self._get_default_ytcfg()
2610 player_url = self._extract_player_url(master_ytcfg, webpage)
2611 identity_token = self._extract_identity_token(webpage, video_id)
2612
2613 player_responses = list(self._extract_player_responses(
2614 self._get_requested_clients(url, smuggled_data),
2615 video_id, webpage, master_ytcfg, player_url, identity_token))
2616
2617 get_first = lambda obj, keys, **kwargs: traverse_obj(obj, (..., *variadic(keys)), **kwargs, get_all=False)
2618
2619 playability_statuses = traverse_obj(
2620 player_responses, (..., 'playabilityStatus'), expected_type=dict, default=[])
2621
2622 trailer_video_id = get_first(
2623 playability_statuses,
2624 ('errorScreen', 'playerLegacyDesktopYpcTrailerRenderer', 'trailerVideoId'),
2625 expected_type=str)
2626 if trailer_video_id:
2627 return self.url_result(
2628 trailer_video_id, self.ie_key(), trailer_video_id)
2629
2630 search_meta = ((lambda x: self._html_search_meta(x, webpage, default=None))
2631 if webpage else (lambda x: None))
2632
2633 video_details = traverse_obj(
2634 player_responses, (..., 'videoDetails'), expected_type=dict, default=[])
2635 microformats = traverse_obj(
2636 player_responses, (..., 'microformat', 'playerMicroformatRenderer'),
2637 expected_type=dict, default=[])
2638 video_title = (
2639 get_first(video_details, 'title')
2640 or self._get_text(microformats, (..., 'title'))
2641 or search_meta(['og:title', 'twitter:title', 'title']))
2642 video_description = get_first(video_details, 'shortDescription')
2643
2644 if not smuggled_data.get('force_singlefeed', False):
2645 if not self.get_param('noplaylist'):
2646 multifeed_metadata_list = get_first(
2647 player_responses,
2648 ('multicamera', 'playerLegacyMulticameraRenderer', 'metadataList'),
2649 expected_type=str)
2650 if multifeed_metadata_list:
2651 entries = []
2652 feed_ids = []
2653 for feed in multifeed_metadata_list.split(','):
2654 # Unquote should take place before split on comma (,) since textual
2655 # fields may contain comma as well (see
2656 # https://github.com/ytdl-org/youtube-dl/issues/8536)
2657 feed_data = compat_parse_qs(
2658 compat_urllib_parse_unquote_plus(feed))
2659
2660 def feed_entry(name):
2661 return try_get(
2662 feed_data, lambda x: x[name][0], compat_str)
2663
2664 feed_id = feed_entry('id')
2665 if not feed_id:
2666 continue
2667 feed_title = feed_entry('title')
2668 title = video_title
2669 if feed_title:
2670 title += ' (%s)' % feed_title
2671 entries.append({
2672 '_type': 'url_transparent',
2673 'ie_key': 'Youtube',
2674 'url': smuggle_url(
2675 '%swatch?v=%s' % (base_url, feed_data['id'][0]),
2676 {'force_singlefeed': True}),
2677 'title': title,
2678 })
2679 feed_ids.append(feed_id)
2680 self.to_screen(
2681 'Downloading multifeed video (%s) - add --no-playlist to just download video %s'
2682 % (', '.join(feed_ids), video_id))
2683 return self.playlist_result(
2684 entries, video_id, video_title, video_description)
2685 else:
2686 self.to_screen('Downloading just video %s because of --no-playlist' % video_id)
2687
2688 live_broadcast_details = traverse_obj(microformats, (..., 'liveBroadcastDetails'))
2689 is_live = get_first(video_details, 'isLive')
2690 if is_live is None:
2691 is_live = get_first(live_broadcast_details, 'isLiveNow')
2692
2693 streaming_data = traverse_obj(player_responses, (..., 'streamingData'), default=[])
2694 formats = list(self._extract_formats(streaming_data, video_id, player_url, is_live))
2695
2696 if not formats:
2697 if not self.get_param('allow_unplayable_formats') and traverse_obj(streaming_data, (..., 'licenseInfos')):
2698 self.raise_no_formats(
2699 'This video is DRM protected.', expected=True)
2700 pemr = get_first(
2701 playability_statuses,
2702 ('errorScreen', 'playerErrorMessageRenderer'), expected_type=dict) or {}
2703 reason = self._get_text(pemr, 'reason') or get_first(playability_statuses, 'reason')
2704 subreason = clean_html(self._get_text(pemr, 'subreason') or '')
2705 if subreason:
2706 if subreason == 'The uploader has not made this video available in your country.':
2707 countries = get_first(microformats, 'availableCountries')
2708 if not countries:
2709 regions_allowed = search_meta('regionsAllowed')
2710 countries = regions_allowed.split(',') if regions_allowed else None
2711 self.raise_geo_restricted(subreason, countries, metadata_available=True)
2712 reason += f'. {subreason}'
2713 if reason:
2714 self.raise_no_formats(reason, expected=True)
2715
2716 for f in formats:
2717 # TODO: detect if throttled
2718 if '&n=' in f['url']: # possibly throttled
2719 f['source_preference'] = -10
2720 # note = f.get('format_note')
2721 # f['format_note'] = f'{note} (throttled)' if note else '(throttled)'
2722
2723 self._sort_formats(formats)
2724
2725 keywords = get_first(video_details, 'keywords', expected_type=list) or []
2726 if not keywords and webpage:
2727 keywords = [
2728 unescapeHTML(m.group('content'))
2729 for m in re.finditer(self._meta_regex('og:video:tag'), webpage)]
2730 for keyword in keywords:
2731 if keyword.startswith('yt:stretch='):
2732 mobj = re.search(r'(\d+)\s*:\s*(\d+)', keyword)
2733 if mobj:
2734 # NB: float is intentional for forcing float division
2735 w, h = (float(v) for v in mobj.groups())
2736 if w > 0 and h > 0:
2737 ratio = w / h
2738 for f in formats:
2739 if f.get('vcodec') != 'none':
2740 f['stretched_ratio'] = ratio
2741 break
2742
2743 thumbnails = []
2744 thumbnail_dicts = traverse_obj(
2745 (video_details, microformats), (..., ..., 'thumbnail', 'thumbnails', ...),
2746 expected_type=dict, default=[])
2747 for thumbnail in thumbnail_dicts:
2748 thumbnail_url = thumbnail.get('url')
2749 if not thumbnail_url:
2750 continue
2751 # Sometimes youtube gives a wrong thumbnail URL. See:
2752 # https://github.com/yt-dlp/yt-dlp/issues/233
2753 # https://github.com/ytdl-org/youtube-dl/issues/28023
2754 if 'maxresdefault' in thumbnail_url:
2755 thumbnail_url = thumbnail_url.split('?')[0]
2756 thumbnails.append({
2757 'url': thumbnail_url,
2758 'height': int_or_none(thumbnail.get('height')),
2759 'width': int_or_none(thumbnail.get('width')),
2760 })
2761 thumbnail_url = search_meta(['og:image', 'twitter:image'])
2762 if thumbnail_url:
2763 thumbnails.append({
2764 'url': thumbnail_url,
2765 })
2766 # The best resolution thumbnails sometimes does not appear in the webpage
2767 # See: https://github.com/ytdl-org/youtube-dl/issues/29049, https://github.com/yt-dlp/yt-dlp/issues/340
2768 # List of possible thumbnails - Ref: <https://stackoverflow.com/a/20542029>
2769 hq_thumbnail_names = ['maxresdefault', 'hq720', 'sddefault', 'sd1', 'sd2', 'sd3']
2770 # TODO: Test them also? - For some videos, even these don't exist
2771 guaranteed_thumbnail_names = [
2772 'hqdefault', 'hq1', 'hq2', 'hq3', '0',
2773 'mqdefault', 'mq1', 'mq2', 'mq3',
2774 'default', '1', '2', '3'
2775 ]
2776 thumbnail_names = hq_thumbnail_names + guaranteed_thumbnail_names
2777 n_thumbnail_names = len(thumbnail_names)
2778
2779 thumbnails.extend({
2780 'url': 'https://i.ytimg.com/vi{webp}/{video_id}/{name}{live}.{ext}'.format(
2781 video_id=video_id, name=name, ext=ext,
2782 webp='_webp' if ext == 'webp' else '', live='_live' if is_live else ''),
2783 '_test_url': name in hq_thumbnail_names,
2784 } for name in thumbnail_names for ext in ('webp', 'jpg'))
2785 for thumb in thumbnails:
2786 i = next((i for i, t in enumerate(thumbnail_names) if f'/{video_id}/{t}' in thumb['url']), n_thumbnail_names)
2787 thumb['preference'] = (0 if '.webp' in thumb['url'] else -1) - (2 * i)
2788 self._remove_duplicate_formats(thumbnails)
2789
2790 category = get_first(microformats, 'category') or search_meta('genre')
2791 channel_id = str_or_none(
2792 get_first(video_details, 'channelId')
2793 or get_first(microformats, 'externalChannelId')
2794 or search_meta('channelId'))
2795 duration = int_or_none(
2796 get_first(video_details, 'lengthSeconds')
2797 or get_first(microformats, 'lengthSeconds')
2798 or parse_duration(search_meta('duration'))) or None
2799 owner_profile_url = get_first(microformats, 'ownerProfileUrl')
2800
2801 live_content = get_first(video_details, 'isLiveContent')
2802 is_upcoming = get_first(video_details, 'isUpcoming')
2803 if is_live is None:
2804 if is_upcoming or live_content is False:
2805 is_live = False
2806 if is_upcoming is None and (live_content or is_live):
2807 is_upcoming = False
2808 live_starttime = parse_iso8601(get_first(live_broadcast_details, 'startTimestamp'))
2809 live_endtime = parse_iso8601(get_first(live_broadcast_details, 'endTimestamp'))
2810 if not duration and live_endtime and live_starttime:
2811 duration = live_endtime - live_starttime
2812
2813 info = {
2814 'id': video_id,
2815 'title': self._live_title(video_title) if is_live else video_title,
2816 'formats': formats,
2817 'thumbnails': thumbnails,
2818 'description': video_description,
2819 'upload_date': unified_strdate(
2820 get_first(microformats, 'uploadDate')
2821 or search_meta('uploadDate')),
2822 'uploader': get_first(video_details, 'author'),
2823 'uploader_id': self._search_regex(r'/(?:channel|user)/([^/?&#]+)', owner_profile_url, 'uploader id') if owner_profile_url else None,
2824 'uploader_url': owner_profile_url,
2825 'channel_id': channel_id,
2826 'channel_url': f'https://www.youtube.com/channel/{channel_id}' if channel_id else None,
2827 'duration': duration,
2828 'view_count': int_or_none(
2829 get_first((video_details, microformats), (..., 'viewCount'))
2830 or search_meta('interactionCount')),
2831 'average_rating': float_or_none(get_first(video_details, 'averageRating')),
2832 'age_limit': 18 if (
2833 get_first(microformats, 'isFamilySafe') is False
2834 or search_meta('isFamilyFriendly') == 'false'
2835 or search_meta('og:restrictions:age') == '18+') else 0,
2836 'webpage_url': webpage_url,
2837 'categories': [category] if category else None,
2838 'tags': keywords,
2839 'playable_in_embed': get_first(playability_statuses, 'playableInEmbed'),
2840 'is_live': is_live,
2841 'was_live': (False if is_live or is_upcoming or live_content is False
2842 else None if is_live is None or is_upcoming is None
2843 else live_content),
2844 'live_status': 'is_upcoming' if is_upcoming else None, # rest will be set by YoutubeDL
2845 'release_timestamp': live_starttime,
2846 }
2847
2848 pctr = traverse_obj(player_responses, (..., 'captions', 'playerCaptionsTracklistRenderer'), expected_type=dict)
2849 # Converted into dicts to remove duplicates
2850 captions = {
2851 sub.get('baseUrl'): sub
2852 for sub in traverse_obj(pctr, (..., 'captionTracks', ...), default=[])}
2853 translation_languages = {
2854 lang.get('languageCode'): lang.get('languageName')
2855 for lang in traverse_obj(pctr, (..., 'translationLanguages', ...), default=[])}
2856 subtitles = {}
2857 if pctr:
2858 def process_language(container, base_url, lang_code, sub_name, query):
2859 lang_subs = container.setdefault(lang_code, [])
2860 for fmt in self._SUBTITLE_FORMATS:
2861 query.update({
2862 'fmt': fmt,
2863 })
2864 lang_subs.append({
2865 'ext': fmt,
2866 'url': update_url_query(base_url, query),
2867 'name': sub_name,
2868 })
2869
2870 for base_url, caption_track in captions.items():
2871 if not base_url:
2872 continue
2873 if caption_track.get('kind') != 'asr':
2874 lang_code = (
2875 remove_start(caption_track.get('vssId') or '', '.').replace('.', '-')
2876 or caption_track.get('languageCode'))
2877 if not lang_code:
2878 continue
2879 process_language(
2880 subtitles, base_url, lang_code,
2881 traverse_obj(caption_track, ('name', 'simpleText')),
2882 {})
2883 continue
2884 automatic_captions = {}
2885 for trans_code, trans_name in translation_languages.items():
2886 if not trans_code:
2887 continue
2888 process_language(
2889 automatic_captions, base_url, trans_code,
2890 self._get_text(trans_name, max_runs=1),
2891 {'tlang': trans_code})
2892 info['automatic_captions'] = automatic_captions
2893 info['subtitles'] = subtitles
2894
2895 parsed_url = compat_urllib_parse_urlparse(url)
2896 for component in [parsed_url.fragment, parsed_url.query]:
2897 query = compat_parse_qs(component)
2898 for k, v in query.items():
2899 for d_k, s_ks in [('start', ('start', 't')), ('end', ('end',))]:
2900 d_k += '_time'
2901 if d_k not in info and k in s_ks:
2902 info[d_k] = parse_duration(query[k][0])
2903
2904 # Youtube Music Auto-generated description
2905 if video_description:
2906 mobj = re.search(r'(?s)(?P<track>[^·\n]+)·(?P<artist>[^\n]+)\n+(?P<album>[^\n]+)(?:.+?℗\s*(?P<release_year>\d{4})(?!\d))?(?:.+?Released on\s*:\s*(?P<release_date>\d{4}-\d{2}-\d{2}))?(.+?\nArtist\s*:\s*(?P<clean_artist>[^\n]+))?.+\nAuto-generated by YouTube\.\s*$', video_description)
2907 if mobj:
2908 release_year = mobj.group('release_year')
2909 release_date = mobj.group('release_date')
2910 if release_date:
2911 release_date = release_date.replace('-', '')
2912 if not release_year:
2913 release_year = release_date[:4]
2914 info.update({
2915 'album': mobj.group('album'.strip()),
2916 'artist': mobj.group('clean_artist') or ', '.join(a.strip() for a in mobj.group('artist').split('·')),
2917 'track': mobj.group('track').strip(),
2918 'release_date': release_date,
2919 'release_year': int_or_none(release_year),
2920 })
2921
2922 initial_data = None
2923 if webpage:
2924 initial_data = self._extract_yt_initial_variable(
2925 webpage, self._YT_INITIAL_DATA_RE, video_id,
2926 'yt initial data')
2927 if not initial_data:
2928 headers = self.generate_api_headers(
2929 master_ytcfg, identity_token, self._extract_account_syncid(master_ytcfg),
2930 session_index=self._extract_session_index(master_ytcfg))
2931
2932 initial_data = self._extract_response(
2933 item_id=video_id, ep='next', fatal=False,
2934 ytcfg=master_ytcfg, headers=headers, query={'videoId': video_id},
2935 note='Downloading initial data API JSON')
2936
2937 try:
2938 # This will error if there is no livechat
2939 initial_data['contents']['twoColumnWatchNextResults']['conversationBar']['liveChatRenderer']['continuations'][0]['reloadContinuationData']['continuation']
2940 info['subtitles']['live_chat'] = [{
2941 'url': 'https://www.youtube.com/watch?v=%s' % video_id, # url is needed to set cookies
2942 'video_id': video_id,
2943 'ext': 'json',
2944 'protocol': 'youtube_live_chat' if is_live or is_upcoming else 'youtube_live_chat_replay',
2945 }]
2946 except (KeyError, IndexError, TypeError):
2947 pass
2948
2949 if initial_data:
2950 info['chapters'] = (
2951 self._extract_chapters_from_json(initial_data, duration)
2952 or self._extract_chapters_from_engagement_panel(initial_data, duration)
2953 or None)
2954
2955 contents = try_get(
2956 initial_data,
2957 lambda x: x['contents']['twoColumnWatchNextResults']['results']['results']['contents'],
2958 list) or []
2959 for content in contents:
2960 vpir = content.get('videoPrimaryInfoRenderer')
2961 if vpir:
2962 stl = vpir.get('superTitleLink')
2963 if stl:
2964 stl = self._get_text(stl)
2965 if try_get(
2966 vpir,
2967 lambda x: x['superTitleIcon']['iconType']) == 'LOCATION_PIN':
2968 info['location'] = stl
2969 else:
2970 mobj = re.search(r'(.+?)\s*S(\d+)\s*•\s*E(\d+)', stl)
2971 if mobj:
2972 info.update({
2973 'series': mobj.group(1),
2974 'season_number': int(mobj.group(2)),
2975 'episode_number': int(mobj.group(3)),
2976 })
2977 for tlb in (try_get(
2978 vpir,
2979 lambda x: x['videoActions']['menuRenderer']['topLevelButtons'],
2980 list) or []):
2981 tbr = tlb.get('toggleButtonRenderer') or {}
2982 for getter, regex in [(
2983 lambda x: x['defaultText']['accessibility']['accessibilityData'],
2984 r'(?P<count>[\d,]+)\s*(?P<type>(?:dis)?like)'), ([
2985 lambda x: x['accessibility'],
2986 lambda x: x['accessibilityData']['accessibilityData'],
2987 ], r'(?P<type>(?:dis)?like) this video along with (?P<count>[\d,]+) other people')]:
2988 label = (try_get(tbr, getter, dict) or {}).get('label')
2989 if label:
2990 mobj = re.match(regex, label)
2991 if mobj:
2992 info[mobj.group('type') + '_count'] = str_to_int(mobj.group('count'))
2993 break
2994 sbr_tooltip = try_get(
2995 vpir, lambda x: x['sentimentBar']['sentimentBarRenderer']['tooltip'])
2996 if sbr_tooltip:
2997 like_count, dislike_count = sbr_tooltip.split(' / ')
2998 info.update({
2999 'like_count': str_to_int(like_count),
3000 'dislike_count': str_to_int(dislike_count),
3001 })
3002 vsir = content.get('videoSecondaryInfoRenderer')
3003 if vsir:
3004 info['channel'] = self._get_text(try_get(
3005 vsir,
3006 lambda x: x['owner']['videoOwnerRenderer']['title'],
3007 dict))
3008 rows = try_get(
3009 vsir,
3010 lambda x: x['metadataRowContainer']['metadataRowContainerRenderer']['rows'],
3011 list) or []
3012 multiple_songs = False
3013 for row in rows:
3014 if try_get(row, lambda x: x['metadataRowRenderer']['hasDividerLine']) is True:
3015 multiple_songs = True
3016 break
3017 for row in rows:
3018 mrr = row.get('metadataRowRenderer') or {}
3019 mrr_title = mrr.get('title')
3020 if not mrr_title:
3021 continue
3022 mrr_title = self._get_text(mrr['title'])
3023 mrr_contents_text = self._get_text(mrr['contents'][0])
3024 if mrr_title == 'License':
3025 info['license'] = mrr_contents_text
3026 elif not multiple_songs:
3027 if mrr_title == 'Album':
3028 info['album'] = mrr_contents_text
3029 elif mrr_title == 'Artist':
3030 info['artist'] = mrr_contents_text
3031 elif mrr_title == 'Song':
3032 info['track'] = mrr_contents_text
3033
3034 fallbacks = {
3035 'channel': 'uploader',
3036 'channel_id': 'uploader_id',
3037 'channel_url': 'uploader_url',
3038 }
3039 for to, frm in fallbacks.items():
3040 if not info.get(to):
3041 info[to] = info.get(frm)
3042
3043 for s_k, d_k in [('artist', 'creator'), ('track', 'alt_title')]:
3044 v = info.get(s_k)
3045 if v:
3046 info[d_k] = v
3047
3048 is_private = get_first(video_details, 'isPrivate', expected_type=bool)
3049 is_unlisted = get_first(microformats, 'isUnlisted', expected_type=bool)
3050 is_membersonly = None
3051 is_premium = None
3052 if initial_data and is_private is not None:
3053 is_membersonly = False
3054 is_premium = False
3055 contents = try_get(initial_data, lambda x: x['contents']['twoColumnWatchNextResults']['results']['results']['contents'], list) or []
3056 badge_labels = set()
3057 for content in contents:
3058 if not isinstance(content, dict):
3059 continue
3060 badge_labels.update(self._extract_badges(content.get('videoPrimaryInfoRenderer')))
3061 for badge_label in badge_labels:
3062 if badge_label.lower() == 'members only':
3063 is_membersonly = True
3064 elif badge_label.lower() == 'premium':
3065 is_premium = True
3066 elif badge_label.lower() == 'unlisted':
3067 is_unlisted = True
3068
3069 info['availability'] = self._availability(
3070 is_private=is_private,
3071 needs_premium=is_premium,
3072 needs_subscription=is_membersonly,
3073 needs_auth=info['age_limit'] >= 18,
3074 is_unlisted=None if is_private is None else is_unlisted)
3075
3076 # get xsrf for annotations or comments
3077 get_annotations = self.get_param('writeannotations', False)
3078 get_comments = self.get_param('getcomments', False)
3079 if get_annotations or get_comments:
3080 xsrf_token = None
3081 if master_ytcfg:
3082 xsrf_token = try_get(master_ytcfg, lambda x: x['XSRF_TOKEN'], compat_str)
3083 if not xsrf_token:
3084 xsrf_token = self._search_regex(
3085 r'([\'"])XSRF_TOKEN\1\s*:\s*([\'"])(?P<xsrf_token>(?:(?!\2).)+)\2',
3086 webpage, 'xsrf token', group='xsrf_token', fatal=False)
3087
3088 # annotations
3089 if get_annotations:
3090 invideo_url = get_first(
3091 player_responses,
3092 ('annotations', 0, 'playerAnnotationsUrlsRenderer', 'invideoUrl'),
3093 expected_type=str)
3094 if xsrf_token and invideo_url:
3095 xsrf_field_name = None
3096 if master_ytcfg:
3097 xsrf_field_name = try_get(master_ytcfg, lambda x: x['XSRF_FIELD_NAME'], compat_str)
3098 if not xsrf_field_name:
3099 xsrf_field_name = self._search_regex(
3100 r'([\'"])XSRF_FIELD_NAME\1\s*:\s*([\'"])(?P<xsrf_field_name>\w+)\2',
3101 webpage, 'xsrf field name',
3102 group='xsrf_field_name', default='session_token')
3103 info['annotations'] = self._download_webpage(
3104 self._proto_relative_url(invideo_url),
3105 video_id, note='Downloading annotations',
3106 errnote='Unable to download video annotations', fatal=False,
3107 data=urlencode_postdata({xsrf_field_name: xsrf_token}))
3108
3109 if get_comments:
3110 info['__post_extractor'] = lambda: self._extract_comments(master_ytcfg, video_id, contents, webpage)
3111
3112 self.mark_watched(video_id, player_responses)
3113
3114 return info
3115
3116
3117 class YoutubeTabIE(YoutubeBaseInfoExtractor):
3118 IE_DESC = 'YouTube.com tab'
3119 _VALID_URL = r'''(?x)
3120 https?://
3121 (?:\w+\.)?
3122 (?:
3123 youtube(?:kids)?\.com|
3124 invidio\.us
3125 )/
3126 (?:
3127 (?P<channel_type>channel|c|user|browse)/|
3128 (?P<not_channel>
3129 feed/|hashtag/|
3130 (?:playlist|watch)\?.*?\blist=
3131 )|
3132 (?!(?:%s)\b) # Direct URLs
3133 )
3134 (?P<id>[^/?\#&]+)
3135 ''' % YoutubeBaseInfoExtractor._RESERVED_NAMES
3136 IE_NAME = 'youtube:tab'
3137
3138 _TESTS = [{
3139 'note': 'playlists, multipage',
3140 'url': 'https://www.youtube.com/c/ИгорьКлейнер/playlists?view=1&flow=grid',
3141 'playlist_mincount': 94,
3142 'info_dict': {
3143 'id': 'UCqj7Cz7revf5maW9g5pgNcg',
3144 'title': 'Игорь Клейнер - Playlists',
3145 'description': 'md5:be97ee0f14ee314f1f002cf187166ee2',
3146 'uploader': 'Игорь Клейнер',
3147 'uploader_id': 'UCqj7Cz7revf5maW9g5pgNcg',
3148 },
3149 }, {
3150 'note': 'playlists, multipage, different order',
3151 'url': 'https://www.youtube.com/user/igorkle1/playlists?view=1&sort=dd',
3152 'playlist_mincount': 94,
3153 'info_dict': {
3154 'id': 'UCqj7Cz7revf5maW9g5pgNcg',
3155 'title': 'Игорь Клейнер - Playlists',
3156 'description': 'md5:be97ee0f14ee314f1f002cf187166ee2',
3157 'uploader_id': 'UCqj7Cz7revf5maW9g5pgNcg',
3158 'uploader': 'Игорь Клейнер',
3159 },
3160 }, {
3161 'note': 'playlists, series',
3162 'url': 'https://www.youtube.com/c/3blue1brown/playlists?view=50&sort=dd&shelf_id=3',
3163 'playlist_mincount': 5,
3164 'info_dict': {
3165 'id': 'UCYO_jab_esuFRV4b17AJtAw',
3166 'title': '3Blue1Brown - Playlists',
3167 'description': 'md5:e1384e8a133307dd10edee76e875d62f',
3168 'uploader_id': 'UCYO_jab_esuFRV4b17AJtAw',
3169 'uploader': '3Blue1Brown',
3170 },
3171 }, {
3172 'note': 'playlists, singlepage',
3173 'url': 'https://www.youtube.com/user/ThirstForScience/playlists',
3174 'playlist_mincount': 4,
3175 'info_dict': {
3176 'id': 'UCAEtajcuhQ6an9WEzY9LEMQ',
3177 'title': 'ThirstForScience - Playlists',
3178 'description': 'md5:609399d937ea957b0f53cbffb747a14c',
3179 'uploader': 'ThirstForScience',
3180 'uploader_id': 'UCAEtajcuhQ6an9WEzY9LEMQ',
3181 }
3182 }, {
3183 'url': 'https://www.youtube.com/c/ChristophLaimer/playlists',
3184 'only_matching': True,
3185 }, {
3186 'note': 'basic, single video playlist',
3187 'url': 'https://www.youtube.com/playlist?list=PL4lCao7KL_QFVb7Iudeipvc2BCavECqzc',
3188 'info_dict': {
3189 'uploader_id': 'UCmlqkdCBesrv2Lak1mF_MxA',
3190 'uploader': 'Sergey M.',
3191 'id': 'PL4lCao7KL_QFVb7Iudeipvc2BCavECqzc',
3192 'title': 'youtube-dl public playlist',
3193 },
3194 'playlist_count': 1,
3195 }, {
3196 'note': 'empty playlist',
3197 'url': 'https://www.youtube.com/playlist?list=PL4lCao7KL_QFodcLWhDpGCYnngnHtQ-Xf',
3198 'info_dict': {
3199 'uploader_id': 'UCmlqkdCBesrv2Lak1mF_MxA',
3200 'uploader': 'Sergey M.',
3201 'id': 'PL4lCao7KL_QFodcLWhDpGCYnngnHtQ-Xf',
3202 'title': 'youtube-dl empty playlist',
3203 },
3204 'playlist_count': 0,
3205 }, {
3206 'note': 'Home tab',
3207 'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w/featured',
3208 'info_dict': {
3209 'id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
3210 'title': 'lex will - Home',
3211 'description': 'md5:2163c5d0ff54ed5f598d6a7e6211e488',
3212 'uploader': 'lex will',
3213 'uploader_id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
3214 },
3215 'playlist_mincount': 2,
3216 }, {
3217 'note': 'Videos tab',
3218 'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w/videos',
3219 'info_dict': {
3220 'id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
3221 'title': 'lex will - Videos',
3222 'description': 'md5:2163c5d0ff54ed5f598d6a7e6211e488',
3223 'uploader': 'lex will',
3224 'uploader_id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
3225 },
3226 'playlist_mincount': 975,
3227 }, {
3228 'note': 'Videos tab, sorted by popular',
3229 'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w/videos?view=0&sort=p&flow=grid',
3230 'info_dict': {
3231 'id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
3232 'title': 'lex will - Videos',
3233 'description': 'md5:2163c5d0ff54ed5f598d6a7e6211e488',
3234 'uploader': 'lex will',
3235 'uploader_id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
3236 },
3237 'playlist_mincount': 199,
3238 }, {
3239 'note': 'Playlists tab',
3240 'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w/playlists',
3241 'info_dict': {
3242 'id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
3243 'title': 'lex will - Playlists',
3244 'description': 'md5:2163c5d0ff54ed5f598d6a7e6211e488',
3245 'uploader': 'lex will',
3246 'uploader_id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
3247 },
3248 'playlist_mincount': 17,
3249 }, {
3250 'note': 'Community tab',
3251 'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w/community',
3252 'info_dict': {
3253 'id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
3254 'title': 'lex will - Community',
3255 'description': 'md5:2163c5d0ff54ed5f598d6a7e6211e488',
3256 'uploader': 'lex will',
3257 'uploader_id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
3258 },
3259 'playlist_mincount': 18,
3260 }, {
3261 'note': 'Channels tab',
3262 'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w/channels',
3263 'info_dict': {
3264 'id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
3265 'title': 'lex will - Channels',
3266 'description': 'md5:2163c5d0ff54ed5f598d6a7e6211e488',
3267 'uploader': 'lex will',
3268 'uploader_id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
3269 },
3270 'playlist_mincount': 12,
3271 }, {
3272 'note': 'Search tab',
3273 'url': 'https://www.youtube.com/c/3blue1brown/search?query=linear%20algebra',
3274 'playlist_mincount': 40,
3275 'info_dict': {
3276 'id': 'UCYO_jab_esuFRV4b17AJtAw',
3277 'title': '3Blue1Brown - Search - linear algebra',
3278 'description': 'md5:e1384e8a133307dd10edee76e875d62f',
3279 'uploader': '3Blue1Brown',
3280 'uploader_id': 'UCYO_jab_esuFRV4b17AJtAw',
3281 },
3282 }, {
3283 'url': 'https://invidio.us/channel/UCmlqkdCBesrv2Lak1mF_MxA',
3284 'only_matching': True,
3285 }, {
3286 'url': 'https://www.youtubekids.com/channel/UCmlqkdCBesrv2Lak1mF_MxA',
3287 'only_matching': True,
3288 }, {
3289 'url': 'https://music.youtube.com/channel/UCmlqkdCBesrv2Lak1mF_MxA',
3290 'only_matching': True,
3291 }, {
3292 'note': 'Playlist with deleted videos (#651). As a bonus, the video #51 is also twice in this list.',
3293 'url': 'https://www.youtube.com/playlist?list=PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC',
3294 'info_dict': {
3295 'title': '29C3: Not my department',
3296 'id': 'PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC',
3297 'uploader': 'Christiaan008',
3298 'uploader_id': 'UCEPzS1rYsrkqzSLNp76nrcg',
3299 'description': 'md5:a14dc1a8ef8307a9807fe136a0660268',
3300 },
3301 'playlist_count': 96,
3302 }, {
3303 'note': 'Large playlist',
3304 'url': 'https://www.youtube.com/playlist?list=UUBABnxM4Ar9ten8Mdjj1j0Q',
3305 'info_dict': {
3306 'title': 'Uploads from Cauchemar',
3307 'id': 'UUBABnxM4Ar9ten8Mdjj1j0Q',
3308 'uploader': 'Cauchemar',
3309 'uploader_id': 'UCBABnxM4Ar9ten8Mdjj1j0Q',
3310 },
3311 'playlist_mincount': 1123,
3312 }, {
3313 'note': 'even larger playlist, 8832 videos',
3314 'url': 'http://www.youtube.com/user/NASAgovVideo/videos',
3315 'only_matching': True,
3316 }, {
3317 'note': 'Buggy playlist: the webpage has a "Load more" button but it doesn\'t have more videos',
3318 'url': 'https://www.youtube.com/playlist?list=UUXw-G3eDE9trcvY2sBMM_aA',
3319 'info_dict': {
3320 'title': 'Uploads from Interstellar Movie',
3321 'id': 'UUXw-G3eDE9trcvY2sBMM_aA',
3322 'uploader': 'Interstellar Movie',
3323 'uploader_id': 'UCXw-G3eDE9trcvY2sBMM_aA',
3324 },
3325 'playlist_mincount': 21,
3326 }, {
3327 'note': 'Playlist with "show unavailable videos" button',
3328 'url': 'https://www.youtube.com/playlist?list=UUTYLiWFZy8xtPwxFwX9rV7Q',
3329 'info_dict': {
3330 'title': 'Uploads from Phim Siêu Nhân Nhật Bản',
3331 'id': 'UUTYLiWFZy8xtPwxFwX9rV7Q',
3332 'uploader': 'Phim Siêu Nhân Nhật Bản',
3333 'uploader_id': 'UCTYLiWFZy8xtPwxFwX9rV7Q',
3334 },
3335 'playlist_mincount': 200,
3336 }, {
3337 'note': 'Playlist with unavailable videos in page 7',
3338 'url': 'https://www.youtube.com/playlist?list=UU8l9frL61Yl5KFOl87nIm2w',
3339 'info_dict': {
3340 'title': 'Uploads from BlankTV',
3341 'id': 'UU8l9frL61Yl5KFOl87nIm2w',
3342 'uploader': 'BlankTV',
3343 'uploader_id': 'UC8l9frL61Yl5KFOl87nIm2w',
3344 },
3345 'playlist_mincount': 1000,
3346 }, {
3347 'note': 'https://github.com/ytdl-org/youtube-dl/issues/21844',
3348 'url': 'https://www.youtube.com/playlist?list=PLzH6n4zXuckpfMu_4Ff8E7Z1behQks5ba',
3349 'info_dict': {
3350 'title': 'Data Analysis with Dr Mike Pound',
3351 'id': 'PLzH6n4zXuckpfMu_4Ff8E7Z1behQks5ba',
3352 'uploader_id': 'UC9-y-6csu5WGm29I7JiwpnA',
3353 'uploader': 'Computerphile',
3354 'description': 'md5:7f567c574d13d3f8c0954d9ffee4e487',
3355 },
3356 'playlist_mincount': 11,
3357 }, {
3358 'url': 'https://invidio.us/playlist?list=PL4lCao7KL_QFVb7Iudeipvc2BCavECqzc',
3359 'only_matching': True,
3360 }, {
3361 'note': 'Playlist URL that does not actually serve a playlist',
3362 'url': 'https://www.youtube.com/watch?v=FqZTN594JQw&list=PLMYEtVRpaqY00V9W81Cwmzp6N6vZqfUKD4',
3363 'info_dict': {
3364 'id': 'FqZTN594JQw',
3365 'ext': 'webm',
3366 'title': "Smiley's People 01 detective, Adventure Series, Action",
3367 'uploader': 'STREEM',
3368 'uploader_id': 'UCyPhqAZgwYWZfxElWVbVJng',
3369 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UCyPhqAZgwYWZfxElWVbVJng',
3370 'upload_date': '20150526',
3371 'license': 'Standard YouTube License',
3372 'description': 'md5:507cdcb5a49ac0da37a920ece610be80',
3373 'categories': ['People & Blogs'],
3374 'tags': list,
3375 'view_count': int,
3376 'like_count': int,
3377 'dislike_count': int,
3378 },
3379 'params': {
3380 'skip_download': True,
3381 },
3382 'skip': 'This video is not available.',
3383 'add_ie': [YoutubeIE.ie_key()],
3384 }, {
3385 'url': 'https://www.youtubekids.com/watch?v=Agk7R8I8o5U&list=PUZ6jURNr1WQZCNHF0ao-c0g',
3386 'only_matching': True,
3387 }, {
3388 'url': 'https://www.youtube.com/watch?v=MuAGGZNfUkU&list=RDMM',
3389 'only_matching': True,
3390 }, {
3391 'url': 'https://www.youtube.com/channel/UCoMdktPbSTixAyNGwb-UYkQ/live',
3392 'info_dict': {
3393 'id': 'FMtPN8yp5LU', # This will keep changing
3394 'ext': 'mp4',
3395 'title': compat_str,
3396 'uploader': 'Sky News',
3397 'uploader_id': 'skynews',
3398 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/skynews',
3399 'upload_date': r're:\d{8}',
3400 'description': compat_str,
3401 'categories': ['News & Politics'],
3402 'tags': list,
3403 'like_count': int,
3404 'dislike_count': int,
3405 },
3406 'params': {
3407 'skip_download': True,
3408 },
3409 'expected_warnings': ['Downloading just video ', 'Ignoring subtitle tracks found in '],
3410 }, {
3411 'url': 'https://www.youtube.com/user/TheYoungTurks/live',
3412 'info_dict': {
3413 'id': 'a48o2S1cPoo',
3414 'ext': 'mp4',
3415 'title': 'The Young Turks - Live Main Show',
3416 'uploader': 'The Young Turks',
3417 'uploader_id': 'TheYoungTurks',
3418 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/TheYoungTurks',
3419 'upload_date': '20150715',
3420 'license': 'Standard YouTube License',
3421 'description': 'md5:438179573adcdff3c97ebb1ee632b891',
3422 'categories': ['News & Politics'],
3423 'tags': ['Cenk Uygur (TV Program Creator)', 'The Young Turks (Award-Winning Work)', 'Talk Show (TV Genre)'],
3424 'like_count': int,
3425 'dislike_count': int,
3426 },
3427 'params': {
3428 'skip_download': True,
3429 },
3430 'only_matching': True,
3431 }, {
3432 'url': 'https://www.youtube.com/channel/UC1yBKRuGpC1tSM73A0ZjYjQ/live',
3433 'only_matching': True,
3434 }, {
3435 'url': 'https://www.youtube.com/c/CommanderVideoHq/live',
3436 'only_matching': True,
3437 }, {
3438 'note': 'A channel that is not live. Should raise error',
3439 'url': 'https://www.youtube.com/user/numberphile/live',
3440 'only_matching': True,
3441 }, {
3442 'url': 'https://www.youtube.com/feed/trending',
3443 'only_matching': True,
3444 }, {
3445 'url': 'https://www.youtube.com/feed/library',
3446 'only_matching': True,
3447 }, {
3448 'url': 'https://www.youtube.com/feed/history',
3449 'only_matching': True,
3450 }, {
3451 'url': 'https://www.youtube.com/feed/subscriptions',
3452 'only_matching': True,
3453 }, {
3454 'url': 'https://www.youtube.com/feed/watch_later',
3455 'only_matching': True,
3456 }, {
3457 'note': 'Recommended - redirects to home page',
3458 'url': 'https://www.youtube.com/feed/recommended',
3459 'only_matching': True,
3460 }, {
3461 'note': 'inline playlist with not always working continuations',
3462 'url': 'https://www.youtube.com/watch?v=UC6u0Tct-Fo&list=PL36D642111D65BE7C',
3463 'only_matching': True,
3464 }, {
3465 'url': 'https://www.youtube.com/course?list=ECUl4u3cNGP61MdtwGTqZA0MreSaDybji8',
3466 'only_matching': True,
3467 }, {
3468 'url': 'https://www.youtube.com/course',
3469 'only_matching': True,
3470 }, {
3471 'url': 'https://www.youtube.com/zsecurity',
3472 'only_matching': True,
3473 }, {
3474 'url': 'http://www.youtube.com/NASAgovVideo/videos',
3475 'only_matching': True,
3476 }, {
3477 'url': 'https://www.youtube.com/TheYoungTurks/live',
3478 'only_matching': True,
3479 }, {
3480 'url': 'https://www.youtube.com/hashtag/cctv9',
3481 'info_dict': {
3482 'id': 'cctv9',
3483 'title': '#cctv9',
3484 },
3485 'playlist_mincount': 350,
3486 }, {
3487 'url': 'https://www.youtube.com/watch?list=PLW4dVinRY435CBE_JD3t-0SRXKfnZHS1P&feature=youtu.be&v=M9cJMXmQ_ZU',
3488 'only_matching': True,
3489 }, {
3490 'note': 'Requires Premium: should request additional YTM-info webpage (and have format 141) for videos in playlist',
3491 'url': 'https://music.youtube.com/playlist?list=PLRBp0Fe2GpgmgoscNFLxNyBVSFVdYmFkq',
3492 'only_matching': True
3493 }, {
3494 'note': '/browse/ should redirect to /channel/',
3495 'url': 'https://music.youtube.com/browse/UC1a8OFewdjuLq6KlF8M_8Ng',
3496 'only_matching': True
3497 }, {
3498 'note': 'VLPL, should redirect to playlist?list=PL...',
3499 'url': 'https://music.youtube.com/browse/VLPLRBp0Fe2GpgmgoscNFLxNyBVSFVdYmFkq',
3500 'info_dict': {
3501 'id': 'PLRBp0Fe2GpgmgoscNFLxNyBVSFVdYmFkq',
3502 'uploader': 'NoCopyrightSounds',
3503 'description': 'Providing you with copyright free / safe music for gaming, live streaming, studying and more!',
3504 'uploader_id': 'UC_aEa8K-EOJ3D6gOs7HcyNg',
3505 'title': 'NCS Releases',
3506 },
3507 'playlist_mincount': 166,
3508 }, {
3509 'note': 'Topic, should redirect to playlist?list=UU...',
3510 'url': 'https://music.youtube.com/browse/UC9ALqqC4aIeG5iDs7i90Bfw',
3511 'info_dict': {
3512 'id': 'UU9ALqqC4aIeG5iDs7i90Bfw',
3513 'uploader_id': 'UC9ALqqC4aIeG5iDs7i90Bfw',
3514 'title': 'Uploads from Royalty Free Music - Topic',
3515 'uploader': 'Royalty Free Music - Topic',
3516 },
3517 'expected_warnings': [
3518 'A channel/user page was given',
3519 'The URL does not have a videos tab',
3520 ],
3521 'playlist_mincount': 101,
3522 }, {
3523 'note': 'Topic without a UU playlist',
3524 'url': 'https://www.youtube.com/channel/UCtFRv9O2AHqOZjjynzrv-xg',
3525 'info_dict': {
3526 'id': 'UCtFRv9O2AHqOZjjynzrv-xg',
3527 'title': 'UCtFRv9O2AHqOZjjynzrv-xg',
3528 },
3529 'expected_warnings': [
3530 'A channel/user page was given',
3531 'The URL does not have a videos tab',
3532 'Falling back to channel URL',
3533 ],
3534 'playlist_mincount': 9,
3535 }, {
3536 'note': 'Youtube music Album',
3537 'url': 'https://music.youtube.com/browse/MPREb_gTAcphH99wE',
3538 'info_dict': {
3539 'id': 'OLAK5uy_l1m0thk3g31NmIIz_vMIbWtyv7eZixlH0',
3540 'title': 'Album - Royalty Free Music Library V2 (50 Songs)',
3541 },
3542 'playlist_count': 50,
3543 }, {
3544 'note': 'unlisted single video playlist',
3545 'url': 'https://www.youtube.com/playlist?list=PLwL24UFy54GrB3s2KMMfjZscDi1x5Dajf',
3546 'info_dict': {
3547 'uploader_id': 'UC9zHu_mHU96r19o-wV5Qs1Q',
3548 'uploader': 'colethedj',
3549 'id': 'PLwL24UFy54GrB3s2KMMfjZscDi1x5Dajf',
3550 'title': 'yt-dlp unlisted playlist test',
3551 'availability': 'unlisted'
3552 },
3553 'playlist_count': 1,
3554 }]
3555
3556 @classmethod
3557 def suitable(cls, url):
3558 return False if YoutubeIE.suitable(url) else super(
3559 YoutubeTabIE, cls).suitable(url)
3560
3561 def _extract_channel_id(self, webpage):
3562 channel_id = self._html_search_meta(
3563 'channelId', webpage, 'channel id', default=None)
3564 if channel_id:
3565 return channel_id
3566 channel_url = self._html_search_meta(
3567 ('og:url', 'al:ios:url', 'al:android:url', 'al:web:url',
3568 'twitter:url', 'twitter:app:url:iphone', 'twitter:app:url:ipad',
3569 'twitter:app:url:googleplay'), webpage, 'channel url')
3570 return self._search_regex(
3571 r'https?://(?:www\.)?youtube\.com/channel/([^/?#&])+',
3572 channel_url, 'channel id')
3573
3574 @staticmethod
3575 def _extract_basic_item_renderer(item):
3576 # Modified from _extract_grid_item_renderer
3577 known_basic_renderers = (
3578 'playlistRenderer', 'videoRenderer', 'channelRenderer', 'showRenderer'
3579 )
3580 for key, renderer in item.items():
3581 if not isinstance(renderer, dict):
3582 continue
3583 elif key in known_basic_renderers:
3584 return renderer
3585 elif key.startswith('grid') and key.endswith('Renderer'):
3586 return renderer
3587
3588 def _grid_entries(self, grid_renderer):
3589 for item in grid_renderer['items']:
3590 if not isinstance(item, dict):
3591 continue
3592 renderer = self._extract_basic_item_renderer(item)
3593 if not isinstance(renderer, dict):
3594 continue
3595 title = self._get_text(renderer.get('title'))
3596
3597 # playlist
3598 playlist_id = renderer.get('playlistId')
3599 if playlist_id:
3600 yield self.url_result(
3601 'https://www.youtube.com/playlist?list=%s' % playlist_id,
3602 ie=YoutubeTabIE.ie_key(), video_id=playlist_id,
3603 video_title=title)
3604 continue
3605 # video
3606 video_id = renderer.get('videoId')
3607 if video_id:
3608 yield self._extract_video(renderer)
3609 continue
3610 # channel
3611 channel_id = renderer.get('channelId')
3612 if channel_id:
3613 yield self.url_result(
3614 'https://www.youtube.com/channel/%s' % channel_id,
3615 ie=YoutubeTabIE.ie_key(), video_title=title)
3616 continue
3617 # generic endpoint URL support
3618 ep_url = urljoin('https://www.youtube.com/', try_get(
3619 renderer, lambda x: x['navigationEndpoint']['commandMetadata']['webCommandMetadata']['url'],
3620 compat_str))
3621 if ep_url:
3622 for ie in (YoutubeTabIE, YoutubePlaylistIE, YoutubeIE):
3623 if ie.suitable(ep_url):
3624 yield self.url_result(
3625 ep_url, ie=ie.ie_key(), video_id=ie._match_id(ep_url), video_title=title)
3626 break
3627
3628 def _shelf_entries_from_content(self, shelf_renderer):
3629 content = shelf_renderer.get('content')
3630 if not isinstance(content, dict):
3631 return
3632 renderer = content.get('gridRenderer') or content.get('expandedShelfContentsRenderer')
3633 if renderer:
3634 # TODO: add support for nested playlists so each shelf is processed
3635 # as separate playlist
3636 # TODO: this includes only first N items
3637 for entry in self._grid_entries(renderer):
3638 yield entry
3639 renderer = content.get('horizontalListRenderer')
3640 if renderer:
3641 # TODO
3642 pass
3643
3644 def _shelf_entries(self, shelf_renderer, skip_channels=False):
3645 ep = try_get(
3646 shelf_renderer, lambda x: x['endpoint']['commandMetadata']['webCommandMetadata']['url'],
3647 compat_str)
3648 shelf_url = urljoin('https://www.youtube.com', ep)
3649 if shelf_url:
3650 # Skipping links to another channels, note that checking for
3651 # endpoint.commandMetadata.webCommandMetadata.webPageTypwebPageType == WEB_PAGE_TYPE_CHANNEL
3652 # will not work
3653 if skip_channels and '/channels?' in shelf_url:
3654 return
3655 title = self._get_text(shelf_renderer, lambda x: x['title'])
3656 yield self.url_result(shelf_url, video_title=title)
3657 # Shelf may not contain shelf URL, fallback to extraction from content
3658 for entry in self._shelf_entries_from_content(shelf_renderer):
3659 yield entry
3660
3661 def _playlist_entries(self, video_list_renderer):
3662 for content in video_list_renderer['contents']:
3663 if not isinstance(content, dict):
3664 continue
3665 renderer = content.get('playlistVideoRenderer') or content.get('playlistPanelVideoRenderer')
3666 if not isinstance(renderer, dict):
3667 continue
3668 video_id = renderer.get('videoId')
3669 if not video_id:
3670 continue
3671 yield self._extract_video(renderer)
3672
3673 def _rich_entries(self, rich_grid_renderer):
3674 renderer = try_get(
3675 rich_grid_renderer, lambda x: x['content']['videoRenderer'], dict) or {}
3676 video_id = renderer.get('videoId')
3677 if not video_id:
3678 return
3679 yield self._extract_video(renderer)
3680
3681 def _video_entry(self, video_renderer):
3682 video_id = video_renderer.get('videoId')
3683 if video_id:
3684 return self._extract_video(video_renderer)
3685
3686 def _post_thread_entries(self, post_thread_renderer):
3687 post_renderer = try_get(
3688 post_thread_renderer, lambda x: x['post']['backstagePostRenderer'], dict)
3689 if not post_renderer:
3690 return
3691 # video attachment
3692 video_renderer = try_get(
3693 post_renderer, lambda x: x['backstageAttachment']['videoRenderer'], dict) or {}
3694 video_id = video_renderer.get('videoId')
3695 if video_id:
3696 entry = self._extract_video(video_renderer)
3697 if entry:
3698 yield entry
3699 # playlist attachment
3700 playlist_id = try_get(
3701 post_renderer, lambda x: x['backstageAttachment']['playlistRenderer']['playlistId'], compat_str)
3702 if playlist_id:
3703 yield self.url_result(
3704 'https://www.youtube.com/playlist?list=%s' % playlist_id,
3705 ie=YoutubeTabIE.ie_key(), video_id=playlist_id)
3706 # inline video links
3707 runs = try_get(post_renderer, lambda x: x['contentText']['runs'], list) or []
3708 for run in runs:
3709 if not isinstance(run, dict):
3710 continue
3711 ep_url = try_get(
3712 run, lambda x: x['navigationEndpoint']['urlEndpoint']['url'], compat_str)
3713 if not ep_url:
3714 continue
3715 if not YoutubeIE.suitable(ep_url):
3716 continue
3717 ep_video_id = YoutubeIE._match_id(ep_url)
3718 if video_id == ep_video_id:
3719 continue
3720 yield self.url_result(ep_url, ie=YoutubeIE.ie_key(), video_id=ep_video_id)
3721
3722 def _post_thread_continuation_entries(self, post_thread_continuation):
3723 contents = post_thread_continuation.get('contents')
3724 if not isinstance(contents, list):
3725 return
3726 for content in contents:
3727 renderer = content.get('backstagePostThreadRenderer')
3728 if not isinstance(renderer, dict):
3729 continue
3730 for entry in self._post_thread_entries(renderer):
3731 yield entry
3732
3733 r''' # unused
3734 def _rich_grid_entries(self, contents):
3735 for content in contents:
3736 video_renderer = try_get(content, lambda x: x['richItemRenderer']['content']['videoRenderer'], dict)
3737 if video_renderer:
3738 entry = self._video_entry(video_renderer)
3739 if entry:
3740 yield entry
3741 '''
    def _entries(self, tab, item_id, identity_token, account_syncid, ytcfg):
        # Yield all entries of a tab, transparently following API continuations
        # until the data is exhausted. `continuation_list` is a one-element
        # list used as a poor-man's `nonlocal` so that extract_entries() can
        # hand the next continuation token back to this generator.

        def extract_entries(parent_renderer):  # this needs to be called again for continuation to work with feeds
            contents = try_get(parent_renderer, lambda x: x['contents'], list) or []
            for content in contents:
                if not isinstance(content, dict):
                    continue
                is_renderer = try_get(content, lambda x: x['itemSectionRenderer'], dict)
                if not is_renderer:
                    # Not an item section; may still be a rich item (e.g. home feed)
                    renderer = content.get('richItemRenderer')
                    if renderer:
                        for entry in self._rich_entries(renderer):
                            yield entry
                        continuation_list[0] = self._extract_continuation(parent_renderer)
                    continue
                isr_contents = try_get(is_renderer, lambda x: x['contents'], list) or []
                for isr_content in isr_contents:
                    if not isinstance(isr_content, dict):
                        continue

                    # Dispatch table: renderer key -> generator of entries
                    known_renderers = {
                        'playlistVideoListRenderer': self._playlist_entries,
                        'gridRenderer': self._grid_entries,
                        'shelfRenderer': lambda x: self._shelf_entries(x, tab.get('title') != 'Channels'),
                        'backstagePostThreadRenderer': self._post_thread_entries,
                        'videoRenderer': lambda x: [self._video_entry(x)],
                    }
                    for key, renderer in isr_content.items():
                        if key not in known_renderers:
                            continue
                        for entry in known_renderers[key](renderer):
                            if entry:
                                yield entry
                        continuation_list[0] = self._extract_continuation(renderer)
                        break

                if not continuation_list[0]:
                    continuation_list[0] = self._extract_continuation(is_renderer)

            if not continuation_list[0]:
                continuation_list[0] = self._extract_continuation(parent_renderer)

        continuation_list = [None]  # Python 2 does not support nonlocal
        tab_content = try_get(tab, lambda x: x['content'], dict)
        if not tab_content:
            return
        parent_renderer = (
            try_get(tab_content, lambda x: x['sectionListRenderer'], dict)
            or try_get(tab_content, lambda x: x['richGridRenderer'], dict) or {})
        # First page comes from the webpage data itself
        for entry in extract_entries(parent_renderer):
            yield entry
        continuation = continuation_list[0]
        visitor_data = None

        # Subsequent pages are fetched through the browse API
        for page_num in itertools.count(1):
            if not continuation:
                break
            headers = self.generate_api_headers(ytcfg, identity_token, account_syncid, visitor_data)
            response = self._extract_response(
                item_id='%s page %s' % (item_id, page_num),
                query=continuation, headers=headers, ytcfg=ytcfg,
                check_get_keys=('continuationContents', 'onResponseReceivedActions', 'onResponseReceivedEndpoints'))

            if not response:
                break
            # Carry the visitorData forward so pagination stays consistent
            visitor_data = try_get(
                response, lambda x: x['responseContext']['visitorData'], compat_str) or visitor_data

            # Old-style continuation: data under 'continuationContents'
            known_continuation_renderers = {
                'playlistVideoListContinuation': self._playlist_entries,
                'gridContinuation': self._grid_entries,
                'itemSectionContinuation': self._post_thread_continuation_entries,
                'sectionListContinuation': extract_entries,  # for feeds
            }
            continuation_contents = try_get(
                response, lambda x: x['continuationContents'], dict) or {}
            continuation_renderer = None
            for key, value in continuation_contents.items():
                if key not in known_continuation_renderers:
                    continue
                continuation_renderer = value
                continuation_list = [None]
                for entry in known_continuation_renderers[key](continuation_renderer):
                    yield entry
                continuation = continuation_list[0] or self._extract_continuation(continuation_renderer)
                break
            if continuation_renderer:
                continue

            # New-style continuation: items under 'onResponseReceived*'.
            # The handler also needs to know which key wraps the item list.
            known_renderers = {
                'gridPlaylistRenderer': (self._grid_entries, 'items'),
                'gridVideoRenderer': (self._grid_entries, 'items'),
                'gridChannelRenderer': (self._grid_entries, 'items'),
                'playlistVideoRenderer': (self._playlist_entries, 'contents'),
                'itemSectionRenderer': (extract_entries, 'contents'),  # for feeds
                'richItemRenderer': (extract_entries, 'contents'),  # for hashtag
                'backstagePostThreadRenderer': (self._post_thread_continuation_entries, 'contents')
            }
            on_response_received = dict_get(response, ('onResponseReceivedActions', 'onResponseReceivedEndpoints'))
            continuation_items = try_get(
                on_response_received, lambda x: x[0]['appendContinuationItemsAction']['continuationItems'], list)
            continuation_item = try_get(continuation_items, lambda x: x[0], dict) or {}
            video_items_renderer = None
            for key, value in continuation_item.items():
                if key not in known_renderers:
                    continue
                # Re-wrap the item list so the page-1 handlers can be reused
                video_items_renderer = {known_renderers[key][1]: continuation_items}
                continuation_list = [None]
                for entry in known_renderers[key][0](video_items_renderer):
                    yield entry
                continuation = continuation_list[0] or self._extract_continuation(video_items_renderer)
                break
            if video_items_renderer:
                continue
            break
3857
3858 @staticmethod
3859 def _extract_selected_tab(tabs):
3860 for tab in tabs:
3861 renderer = dict_get(tab, ('tabRenderer', 'expandableTabRenderer')) or {}
3862 if renderer.get('selected') is True:
3863 return renderer
3864 else:
3865 raise ExtractorError('Unable to find selected tab')
3866
3867 @classmethod
3868 def _extract_uploader(cls, data):
3869 uploader = {}
3870 renderer = cls._extract_sidebar_info_renderer(data, 'playlistSidebarSecondaryInfoRenderer') or {}
3871 owner = try_get(
3872 renderer, lambda x: x['videoOwner']['videoOwnerRenderer']['title']['runs'][0], dict)
3873 if owner:
3874 uploader['uploader'] = owner.get('text')
3875 uploader['uploader_id'] = try_get(
3876 owner, lambda x: x['navigationEndpoint']['browseEndpoint']['browseId'], compat_str)
3877 uploader['uploader_url'] = urljoin(
3878 'https://www.youtube.com/',
3879 try_get(owner, lambda x: x['navigationEndpoint']['browseEndpoint']['canonicalBaseUrl'], compat_str))
3880 return {k: v for k, v in uploader.items() if v is not None}
3881
    def _extract_from_tabs(self, item_id, webpage, data, tabs):
        # Assemble playlist-level metadata (title, uploader, thumbnails, tags,
        # availability) from the channel/playlist metadata renderers and
        # return a playlist result over the selected tab's entries.
        playlist_id = title = description = channel_url = channel_name = channel_id = None
        thumbnails_list = tags = []

        selected_tab = self._extract_selected_tab(tabs)
        # Channel pages carry a channelMetadataRenderer; playlist pages a
        # playlistMetadataRenderer. Try the channel variant first.
        renderer = try_get(
            data, lambda x: x['metadata']['channelMetadataRenderer'], dict)
        if renderer:
            channel_name = renderer.get('title')
            channel_url = renderer.get('channelUrl')
            channel_id = renderer.get('externalId')
        else:
            renderer = try_get(
                data, lambda x: x['metadata']['playlistMetadataRenderer'], dict)

        if renderer:
            title = renderer.get('title')
            description = renderer.get('description', '')
            # NOTE(review): for playlist pages channel_id is still None here,
            # so playlist_id falls through to item_id below — confirm intended
            playlist_id = channel_id
            tags = renderer.get('keywords', '').split()
            thumbnails_list = (
                try_get(renderer, lambda x: x['avatar']['thumbnails'], list)
                or try_get(
                    self._extract_sidebar_info_renderer(data, 'playlistSidebarPrimaryInfoRenderer'),
                    lambda x: x['thumbnailRenderer']['playlistVideoThumbnailRenderer']['thumbnail']['thumbnails'],
                    list)
                or [])

        # Normalize raw thumbnail dicts, dropping entries without a valid URL
        thumbnails = []
        for t in thumbnails_list:
            if not isinstance(t, dict):
                continue
            thumbnail_url = url_or_none(t.get('url'))
            if not thumbnail_url:
                continue
            thumbnails.append({
                'url': thumbnail_url,
                'width': int_or_none(t.get('width')),
                'height': int_or_none(t.get('height')),
            })
        if playlist_id is None:
            playlist_id = item_id
        if title is None:
            # Hashtag pages have their title under hashtagHeaderRenderer
            title = (
                try_get(data, lambda x: x['header']['hashtagHeaderRenderer']['hashtag']['simpleText'])
                or playlist_id)
        # Append tab name(s) so e.g. "<channel> - Videos" is produced
        title += format_field(selected_tab, 'title', ' - %s')
        title += format_field(selected_tab, 'expandedText', ' - %s')
        metadata = {
            'playlist_id': playlist_id,
            'playlist_title': title,
            'playlist_description': description,
            'uploader': channel_name,
            'uploader_id': channel_id,
            'uploader_url': channel_url,
            'thumbnails': thumbnails,
            'tags': tags,
        }
        availability = self._extract_availability(data)
        if availability:
            metadata['availability'] = availability
        if not channel_id:
            # Playlist pages: pull uploader info from the sidebar instead
            metadata.update(self._extract_uploader(data))
        metadata.update({
            'channel': metadata['uploader'],
            'channel_id': metadata['uploader_id'],
            'channel_url': metadata['uploader_url']})
        ytcfg = self.extract_ytcfg(item_id, webpage)
        return self.playlist_result(
            self._entries(
                selected_tab, playlist_id,
                self._extract_identity_token(webpage, item_id),
                self._extract_account_syncid(ytcfg, data), ytcfg),
            **metadata)
3956
    def _extract_mix_playlist(self, playlist, playlist_id, data, webpage):
        # Mix playlists are auto-generated and effectively endless; page
        # through the watch 'next' endpoint until the videos start repeating
        # or no new ones are returned.
        first_id = last_id = None
        ytcfg = self.extract_ytcfg(playlist_id, webpage)
        headers = self.generate_api_headers(
            ytcfg, account_syncid=self._extract_account_syncid(ytcfg, data),
            identity_token=self._extract_identity_token(webpage, item_id=playlist_id))
        for page_num in itertools.count(1):
            videos = list(self._playlist_entries(playlist))
            if not videos:
                return
            # Resume right after the last video yielded on the previous page
            # (pages may overlap); -1 + 1 == 0 when last_id is not found
            start = next((i for i, v in enumerate(videos) if v['id'] == last_id), -1) + 1
            if start >= len(videos):
                return
            for video in videos[start:]:
                if video['id'] == first_id:
                    # The mix wrapped around to the beginning
                    self.to_screen('First video %s found again; Assuming end of Mix' % first_id)
                    return
                yield video
            first_id = first_id or videos[0]['id']
            last_id = videos[-1]['id']
            # Continue from the last entry's watch endpoint
            # NOTE(review): watch_endpoint may be None if the lookup fails,
            # making the .get() calls below raise — confirm upstream data
            watch_endpoint = try_get(
                playlist, lambda x: x['contents'][-1]['playlistPanelVideoRenderer']['navigationEndpoint']['watchEndpoint'])
            query = {
                'playlistId': playlist_id,
                'videoId': watch_endpoint.get('videoId') or last_id,
                'index': watch_endpoint.get('index') or len(videos),
                'params': watch_endpoint.get('params') or 'OAE%3D'
            }
            response = self._extract_response(
                item_id='%s page %d' % (playlist_id, page_num),
                query=query, ep='next', headers=headers, ytcfg=ytcfg,
                check_get_keys='contents'
            )
            playlist = try_get(
                response, lambda x: x['contents']['twoColumnWatchNextResults']['playlist']['playlist'], dict)
3992
3993 def _extract_from_playlist(self, item_id, url, data, playlist, webpage):
3994 title = playlist.get('title') or try_get(
3995 data, lambda x: x['titleText']['simpleText'], compat_str)
3996 playlist_id = playlist.get('playlistId') or item_id
3997
3998 # Delegating everything except mix playlists to regular tab-based playlist URL
3999 playlist_url = urljoin(url, try_get(
4000 playlist, lambda x: x['endpoint']['commandMetadata']['webCommandMetadata']['url'],
4001 compat_str))
4002 if playlist_url and playlist_url != url:
4003 return self.url_result(
4004 playlist_url, ie=YoutubeTabIE.ie_key(), video_id=playlist_id,
4005 video_title=title)
4006
4007 return self.playlist_result(
4008 self._extract_mix_playlist(playlist, playlist_id, data, webpage),
4009 playlist_id=playlist_id, playlist_title=title)
4010
4011 def _extract_availability(self, data):
4012 """
4013 Gets the availability of a given playlist/tab.
4014 Note: Unless YouTube tells us explicitly, we do not assume it is public
4015 @param data: response
4016 """
4017 is_private = is_unlisted = None
4018 renderer = self._extract_sidebar_info_renderer(data, 'playlistSidebarPrimaryInfoRenderer') or {}
4019 badge_labels = self._extract_badges(renderer)
4020
4021 # Personal playlists, when authenticated, have a dropdown visibility selector instead of a badge
4022 privacy_dropdown_entries = try_get(
4023 renderer, lambda x: x['privacyForm']['dropdownFormFieldRenderer']['dropdown']['dropdownRenderer']['entries'], list) or []
4024 for renderer_dict in privacy_dropdown_entries:
4025 is_selected = try_get(
4026 renderer_dict, lambda x: x['privacyDropdownItemRenderer']['isSelected'], bool) or False
4027 if not is_selected:
4028 continue
4029 label = self._get_text(
4030 try_get(renderer_dict, lambda x: x['privacyDropdownItemRenderer']['label'], dict) or [])
4031 if label:
4032 badge_labels.add(label.lower())
4033 break
4034
4035 for badge_label in badge_labels:
4036 if badge_label == 'unlisted':
4037 is_unlisted = True
4038 elif badge_label == 'private':
4039 is_private = True
4040 elif badge_label == 'public':
4041 is_unlisted = is_private = False
4042 return self._availability(is_private, False, False, False, is_unlisted)
4043
4044 @staticmethod
4045 def _extract_sidebar_info_renderer(data, info_renderer, expected_type=dict):
4046 sidebar_renderer = try_get(
4047 data, lambda x: x['sidebar']['playlistSidebarRenderer']['items'], list) or []
4048 for item in sidebar_renderer:
4049 renderer = try_get(item, lambda x: x[info_renderer], expected_type)
4050 if renderer:
4051 return renderer
4052
4053 def _reload_with_unavailable_videos(self, item_id, data, webpage):
4054 """
4055 Get playlist with unavailable videos if the 'show unavailable videos' button exists.
4056 """
4057 browse_id = params = None
4058 renderer = self._extract_sidebar_info_renderer(data, 'playlistSidebarPrimaryInfoRenderer')
4059 if not renderer:
4060 return
4061 menu_renderer = try_get(
4062 renderer, lambda x: x['menu']['menuRenderer']['items'], list) or []
4063 for menu_item in menu_renderer:
4064 if not isinstance(menu_item, dict):
4065 continue
4066 nav_item_renderer = menu_item.get('menuNavigationItemRenderer')
4067 text = try_get(
4068 nav_item_renderer, lambda x: x['text']['simpleText'], compat_str)
4069 if not text or text.lower() != 'show unavailable videos':
4070 continue
4071 browse_endpoint = try_get(
4072 nav_item_renderer, lambda x: x['navigationEndpoint']['browseEndpoint'], dict) or {}
4073 browse_id = browse_endpoint.get('browseId')
4074 params = browse_endpoint.get('params')
4075 break
4076
4077 ytcfg = self.extract_ytcfg(item_id, webpage)
4078 headers = self.generate_api_headers(
4079 ytcfg, account_syncid=self._extract_account_syncid(ytcfg, data),
4080 identity_token=self._extract_identity_token(webpage, item_id=item_id),
4081 visitor_data=try_get(
4082 self._extract_context(ytcfg), lambda x: x['client']['visitorData'], compat_str))
4083 query = {
4084 'params': params or 'wgYCCAA=',
4085 'browseId': browse_id or 'VL%s' % item_id
4086 }
4087 return self._extract_response(
4088 item_id=item_id, headers=headers, query=query,
4089 check_get_keys='contents', fatal=False, ytcfg=ytcfg,
4090 note='Downloading API JSON with unavailable videos')
4091
4092 def _extract_webpage(self, url, item_id):
4093 retries = self.get_param('extractor_retries', 3)
4094 count = -1
4095 last_error = 'Incomplete yt initial data recieved'
4096 while count < retries:
4097 count += 1
4098 # Sometimes youtube returns a webpage with incomplete ytInitialData
4099 # See: https://github.com/yt-dlp/yt-dlp/issues/116
4100 if count:
4101 self.report_warning('%s. Retrying ...' % last_error)
4102 webpage = self._download_webpage(
4103 url, item_id,
4104 'Downloading webpage%s' % (' (retry #%d)' % count if count else ''))
4105 data = self.extract_yt_initial_data(item_id, webpage)
4106 if data.get('contents') or data.get('currentVideoEndpoint'):
4107 break
4108 # Extract alerts here only when there is error
4109 self._extract_and_report_alerts(data)
4110 if count >= retries:
4111 raise ExtractorError(last_error)
4112 return webpage, data
4113
4114 @staticmethod
4115 def _smuggle_data(entries, data):
4116 for entry in entries:
4117 if data:
4118 entry['url'] = smuggle_url(entry['url'], data)
4119 yield entry
4120
4121 def _real_extract(self, url):
4122 url, smuggled_data = unsmuggle_url(url, {})
4123 if self.is_music_url(url):
4124 smuggled_data['is_music_url'] = True
4125 info_dict = self.__real_extract(url, smuggled_data)
4126 if info_dict.get('entries'):
4127 info_dict['entries'] = self._smuggle_data(info_dict['entries'], smuggled_data)
4128 return info_dict
4129
    # Splits a matching URL into (pre)(tab)(post) so the tab name can be
    # normalized; reuses _VALID_URL (and its named groups) as the prefix.
    _url_re = re.compile(r'(?P<pre>%s)(?(channel_type)(?P<tab>/\w+))?(?P<post>.*)$' % _VALID_URL)
4131
    def __real_extract(self, url, smuggled_data):
        """Resolve a tab/playlist/watch URL and dispatch to the proper extraction path.

        Normalizes the URL onto www.youtube.com, rewrites music/channel URLs to
        their equivalent playlist or tab URL where needed, downloads the page,
        then extracts from the tab renderer, an embedded playlist, or falls back
        to a single video.
        """
        item_id = self._match_id(url)
        # Force the canonical host; other hosts (music., m., ...) can return
        # different or incomplete data.
        url = compat_urlparse.urlunparse(
            compat_urlparse.urlparse(url)._replace(netloc='www.youtube.com'))
        compat_opts = self.get_param('compat_opts', [])

        def get_mobj(url):
            # Re-match the (possibly rewritten) URL; map unmatched groups to ''
            # so callers can use them without None checks.
            mobj = self._url_re.match(url).groupdict()
            mobj.update((k, '') for k, v in mobj.items() if v is None)
            return mobj

        mobj = get_mobj(url)
        # Youtube returns incomplete data if tabname is not lower case
        pre, tab, post, is_channel = mobj['pre'], mobj['tab'].lower(), mobj['post'], not mobj['not_channel']

        if is_channel:
            if smuggled_data.get('is_music_url'):
                if item_id[:2] == 'VL':
                    # Youtube music VL channels have an equivalent playlist
                    item_id = item_id[2:]
                    pre, tab, post, is_channel = 'https://www.youtube.com/playlist?list=%s' % item_id, '', '', False
                elif item_id[:2] == 'MP':
                    # Youtube music albums (/channel/MP...) have a OLAK playlist that can be extracted from the webpage
                    item_id = self._search_regex(
                        r'\\x22audioPlaylistId\\x22:\\x22([0-9A-Za-z_-]+)\\x22',
                        self._download_webpage('https://music.youtube.com/channel/%s' % item_id, item_id),
                        'playlist id')
                    pre, tab, post, is_channel = 'https://www.youtube.com/playlist?list=%s' % item_id, '', '', False
                elif mobj['channel_type'] == 'browse':
                    # Youtube music /browse/ should be changed to /channel/
                    pre = 'https://www.youtube.com/channel/%s' % item_id
        if is_channel and not tab and 'no-youtube-channel-redirect' not in compat_opts:
            # Home URLs should redirect to /videos/
            self.report_warning(
                'A channel/user page was given. All the channel\'s videos will be downloaded. '
                'To download only the videos in the home page, add a "/featured" to the URL')
            tab = '/videos'

        # Rebuild the URL from the (possibly rewritten) parts and re-match it.
        url = ''.join((pre, tab, post))
        mobj = get_mobj(url)

        # Handle both video/playlist URLs
        qs = parse_qs(url)
        video_id = qs.get('v', [None])[0]
        playlist_id = qs.get('list', [None])[0]

        if not video_id and mobj['not_channel'].startswith('watch'):
            if not playlist_id:
                # If there is neither video or playlist ids, youtube redirects to home page, which is undesirable
                raise ExtractorError('Unable to recognize tab page')
            # Common mistake: https://www.youtube.com/watch?list=playlist_id
            self.report_warning('A video URL was given without video ID. Trying to download playlist %s' % playlist_id)
            url = 'https://www.youtube.com/playlist?list=%s' % playlist_id
            mobj = get_mobj(url)

        if video_id and playlist_id:
            if self.get_param('noplaylist'):
                self.to_screen('Downloading just video %s because of --no-playlist' % video_id)
                return self.url_result(video_id, ie=YoutubeIE.ie_key(), video_id=video_id)
            self.to_screen('Downloading playlist %s; add --no-playlist to just download video %s' % (playlist_id, video_id))

        webpage, data = self._extract_webpage(url, item_id)

        tabs = try_get(
            data, lambda x: x['contents']['twoColumnBrowseResultsRenderer']['tabs'], list)
        if tabs:
            selected_tab = self._extract_selected_tab(tabs)
            tab_name = selected_tab.get('title', '')
            if 'no-youtube-channel-redirect' not in compat_opts:
                if mobj['tab'] == '/live':
                    # Live tab should have redirected to the video
                    raise ExtractorError('The channel is not currently live', expected=True)
                if mobj['tab'] == '/videos' and tab_name.lower() != mobj['tab'][1:]:
                    if not mobj['not_channel'] and item_id[:2] == 'UC':
                        # Topic channels don't have /videos. Use the equivalent playlist instead
                        self.report_warning('The URL does not have a %s tab. Trying to redirect to playlist UU%s instead' % (mobj['tab'][1:], item_id[2:]))
                        pl_id = 'UU%s' % item_id[2:]
                        pl_url = 'https://www.youtube.com/playlist?list=%s%s' % (pl_id, mobj['post'])
                        try:
                            pl_webpage, pl_data = self._extract_webpage(pl_url, pl_id)
                            for alert_type, alert_message in self._extract_alerts(pl_data):
                                if alert_type == 'error':
                                    raise ExtractorError('Youtube said: %s' % alert_message)
                            item_id, url, webpage, data = pl_id, pl_url, pl_webpage, pl_data
                        except ExtractorError:
                            self.report_warning('The playlist gave error. Falling back to channel URL')
                    else:
                        self.report_warning('The URL does not have a %s tab. %s is being downloaded instead' % (mobj['tab'][1:], tab_name))

        self.write_debug('Final URL: %s' % url)

        # YouTube sometimes provides a button to reload playlist with unavailable videos.
        if 'no-youtube-unavailable-videos' not in compat_opts:
            data = self._reload_with_unavailable_videos(item_id, data, webpage) or data
        self._extract_and_report_alerts(data)
        # Re-read tabs: the reload above may have replaced `data`.
        tabs = try_get(
            data, lambda x: x['contents']['twoColumnBrowseResultsRenderer']['tabs'], list)
        if tabs:
            return self._extract_from_tabs(item_id, webpage, data, tabs)

        # No tabs: try an embedded watch-page playlist next.
        playlist = try_get(
            data, lambda x: x['contents']['twoColumnWatchNextResults']['playlist']['playlist'], dict)
        if playlist:
            return self._extract_from_playlist(item_id, url, data, playlist, webpage)

        # Last resort: a single video endpoint (or the video id parsed earlier).
        video_id = try_get(
            data, lambda x: x['currentVideoEndpoint']['watchEndpoint']['videoId'],
            compat_str) or video_id
        if video_id:
            if mobj['tab'] != '/live':  # live tab is expected to redirect to video
                self.report_warning('Unable to recognize playlist. Downloading just video %s' % video_id)
            return self.url_result(video_id, ie=YoutubeIE.ie_key(), video_id=video_id)

        raise ExtractorError('Unable to recognize tab page')
4246
4247
class YoutubePlaylistIE(InfoExtractor):
    IE_DESC = 'YouTube.com playlists'
    _VALID_URL = r'''(?x)(?:
                        (?:https?://)?
                        (?:\w+\.)?
                        (?:
                            (?:
                                youtube(?:kids)?\.com|
                                invidio\.us
                            )
                            /.*?\?.*?\blist=
                        )?
                        (?P<id>%(playlist_id)s)
                     )''' % {'playlist_id': YoutubeBaseInfoExtractor._PLAYLIST_ID_RE}
    IE_NAME = 'youtube:playlist'
    _TESTS = [{
        'note': 'issue #673',
        'url': 'PLBB231211A4F62143',
        'info_dict': {
            'title': '[OLD]Team Fortress 2 (Class-based LP)',
            'id': 'PLBB231211A4F62143',
            'uploader': 'Wickydoo',
            'uploader_id': 'UCKSpbfbl5kRQpTdL7kMc-1Q',
            'description': 'md5:8fa6f52abb47a9552002fa3ddfc57fc2',
        },
        'playlist_mincount': 29,
    }, {
        'url': 'PLtPgu7CB4gbY9oDN3drwC3cMbJggS7dKl',
        'info_dict': {
            'title': 'YDL_safe_search',
            'id': 'PLtPgu7CB4gbY9oDN3drwC3cMbJggS7dKl',
        },
        'playlist_count': 2,
        'skip': 'This playlist is private',
    }, {
        'note': 'embedded',
        'url': 'https://www.youtube.com/embed/videoseries?list=PL6IaIsEjSbf96XFRuNccS_RuEXwNdsoEu',
        'playlist_count': 4,
        'info_dict': {
            'title': 'JODA15',
            'id': 'PL6IaIsEjSbf96XFRuNccS_RuEXwNdsoEu',
            'uploader': 'milan',
            'uploader_id': 'UCEI1-PVPcYXjB73Hfelbmaw',
        }
    }, {
        'url': 'http://www.youtube.com/embed/_xDOZElKyNU?list=PLsyOSbh5bs16vubvKePAQ1x3PhKavfBIl',
        'playlist_mincount': 654,
        'info_dict': {
            'title': '2018 Chinese New Singles (11/6 updated)',
            'id': 'PLsyOSbh5bs16vubvKePAQ1x3PhKavfBIl',
            'uploader': 'LBK',
            'uploader_id': 'UC21nz3_MesPLqtDqwdvnoxA',
            'description': 'md5:da521864744d60a198e3a88af4db0d9d',
        }
    }, {
        'url': 'TLGGrESM50VT6acwMjAyMjAxNw',
        'only_matching': True,
    }, {
        # music album playlist
        'url': 'OLAK5uy_m4xAFdmMC5rX3Ji3g93pQe3hqLZw_9LhM',
        'only_matching': True,
    }]

    @classmethod
    def suitable(cls, url):
        """Decline URLs that YoutubeTabIE handles or that carry a video id."""
        if YoutubeTabIE.suitable(url):
            return False
        # Hack for lazy extractors until more generic solution is implemented
        # (see #28780)
        from .youtube import parse_qs
        if parse_qs(url).get('v', [None])[0]:
            return False
        return super(YoutubePlaylistIE, cls).suitable(url)

    def _real_extract(self, url):
        # Rewrite everything as a canonical playlist URL and hand it over
        # to the tab extractor, preserving the music-URL marker.
        playlist_id = self._match_id(url)
        is_music_url = YoutubeBaseInfoExtractor.is_music_url(url)
        query = parse_qs(url) or {'list': playlist_id}
        url = update_url_query('https://www.youtube.com/playlist', query)
        if is_music_url:
            url = smuggle_url(url, {'is_music_url': True})
        return self.url_result(url, ie=YoutubeTabIE.ie_key(), video_id=playlist_id)
4332
4333
class YoutubeYtBeIE(InfoExtractor):
    IE_DESC = 'youtu.be'
    _VALID_URL = r'https?://youtu\.be/(?P<id>[0-9A-Za-z_-]{11})/*?.*?\blist=(?P<playlist_id>%(playlist_id)s)' % {'playlist_id': YoutubeBaseInfoExtractor._PLAYLIST_ID_RE}
    _TESTS = [{
        'url': 'https://youtu.be/yeWKywCrFtk?list=PL2qgrgXsNUG5ig9cat4ohreBjYLAPC0J5',
        'info_dict': {
            'id': 'yeWKywCrFtk',
            'ext': 'mp4',
            'title': 'Small Scale Baler and Braiding Rugs',
            'uploader': 'Backus-Page House Museum',
            'uploader_id': 'backuspagemuseum',
            'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/backuspagemuseum',
            'upload_date': '20161008',
            'description': 'md5:800c0c78d5eb128500bffd4f0b4f2e8a',
            'categories': ['Nonprofits & Activism'],
            'tags': list,
            'like_count': int,
            'dislike_count': int,
        },
        'params': {
            'noplaylist': True,
            'skip_download': True,
        },
    }, {
        'url': 'https://youtu.be/uWyaPkt-VOI?list=PL9D9FC436B881BA21',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        # Rewrite the short youtu.be form as a full watch URL and let the
        # tab extractor handle the video+playlist combination.
        match = re.match(self._VALID_URL, url)
        video_id, playlist_id = match.group('id'), match.group('playlist_id')
        watch_url = update_url_query('https://www.youtube.com/watch', {
            'v': video_id,
            'list': playlist_id,
            'feature': 'youtu.be',
        })
        return self.url_result(watch_url, ie=YoutubeTabIE.ie_key(), video_id=playlist_id)
4372
4373
class YoutubeYtUserIE(InfoExtractor):
    IE_DESC = 'YouTube.com user videos, URL or "ytuser" keyword'
    _VALID_URL = r'ytuser:(?P<id>.+)'
    _TESTS = [{
        'url': 'ytuser:phihag',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        # Expand the 'ytuser:' shorthand into a full user URL for the tab extractor.
        user_id = self._match_id(url)
        user_url = 'https://www.youtube.com/user/%s' % user_id
        return self.url_result(user_url, ie=YoutubeTabIE.ie_key(), video_id=user_id)
4387
4388
class YoutubeFavouritesIE(YoutubeBaseInfoExtractor):
    IE_NAME = 'youtube:favorites'
    IE_DESC = 'YouTube.com liked videos, ":ytfav" for short (requires authentication)'
    _VALID_URL = r':ytfav(?:ou?rite)?s?'
    _LOGIN_REQUIRED = True
    _TESTS = [{
        'url': ':ytfav',
        'only_matching': True,
    }, {
        'url': ':ytfavorites',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        # Liked videos live in the special 'LL' playlist; delegate to the tab extractor.
        liked_url = 'https://www.youtube.com/playlist?list=LL'
        return self.url_result(liked_url, ie=YoutubeTabIE.ie_key())
4406
4407
class YoutubeSearchIE(SearchInfoExtractor, YoutubeTabIE):
    IE_DESC = 'YouTube.com searches, "ytsearch" keyword'
    # there doesn't appear to be a real limit, for example if you search for
    # 'python' you get more than 8.000.000 results
    _MAX_RESULTS = float('inf')
    IE_NAME = 'youtube:search'
    _SEARCH_KEY = 'ytsearch'
    # Optional protobuf-encoded filter appended to the search request;
    # subclasses (e.g. YoutubeSearchDateIE) override this.
    _SEARCH_PARAMS = None
    _TESTS = []

    def _entries(self, query, n):
        """Yield up to *n* video results for *query*, paging through the search API."""
        data = {'query': query}
        if self._SEARCH_PARAMS:
            data['params'] = self._SEARCH_PARAMS
        total = 0
        continuation = {}
        for page_num in itertools.count(1):
            # Merge the continuation token (if any) into the request payload.
            data.update(continuation)
            search = self._extract_response(
                item_id='query "%s" page %s' % (query, page_num), ep='search', query=data,
                check_get_keys=('contents', 'onResponseReceivedCommands')
            )
            if not search:
                break
            # First page and continuation pages nest the results differently.
            slr_contents = try_get(
                search,
                (lambda x: x['contents']['twoColumnSearchResultsRenderer']['primaryContents']['sectionListRenderer']['contents'],
                 lambda x: x['onResponseReceivedCommands'][0]['appendContinuationItemsAction']['continuationItems']),
                list)
            if not slr_contents:
                break

            # Youtube sometimes adds promoted content to searches,
            # changing the index location of videos and token.
            # So we search through all entries till we find them.
            continuation = None
            for slr_content in slr_contents:
                if not continuation:
                    continuation = self._extract_continuation({'contents': [slr_content]})

                isr_contents = try_get(
                    slr_content,
                    lambda x: x['itemSectionRenderer']['contents'],
                    list)
                if not isr_contents:
                    continue
                for content in isr_contents:
                    if not isinstance(content, dict):
                        continue
                    video = content.get('videoRenderer')
                    if not isinstance(video, dict):
                        continue
                    video_id = video.get('videoId')
                    if not video_id:
                        continue

                    yield self._extract_video(video)
                    total += 1
                    if total == n:
                        return

            # No continuation token found anywhere on this page: we are done.
            if not continuation:
                break

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""
        return self.playlist_result(self._entries(query, n), query, query)
4475
4476
class YoutubeSearchDateIE(YoutubeSearchIE):
    # Same search machinery as YoutubeSearchIE, but with a search filter
    # requesting newest-first ordering (see IE_DESC).
    IE_NAME = YoutubeSearchIE.IE_NAME + ':date'
    _SEARCH_KEY = 'ytsearchdate'
    IE_DESC = 'YouTube.com searches, newest videos first, "ytsearchdate" keyword'
    # URL-encoded search params blob ('%3D' is '=')
    _SEARCH_PARAMS = 'CAI%3D'
4482
4483
class YoutubeSearchURLIE(YoutubeSearchIE):
    IE_DESC = 'YouTube.com search URLs'
    IE_NAME = YoutubeSearchIE.IE_NAME + '_url'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/results\?(.*?&)?(?:search_query|q)=(?:[^&]+)(?:[&]|$)'
    # _MAX_RESULTS = 100
    _TESTS = [{
        'url': 'https://www.youtube.com/results?baz=bar&search_query=youtube-dl+test+video&filters=video&lclk=video',
        'playlist_mincount': 5,
        'info_dict': {
            'id': 'youtube-dl test video',
            'title': 'youtube-dl test video',
        }
    }, {
        'url': 'https://www.youtube.com/results?q=test&sp=EgQIBBgB',
        'only_matching': True,
    }]

    @classmethod
    def _make_valid_url(cls):
        return cls._VALID_URL

    def _real_extract(self, url):
        # Pull the query ('search_query' or 'q') and the optional 'sp' filter
        # out of the URL, then run it through the regular search machinery.
        params = compat_parse_qs(compat_urllib_parse_urlparse(url).query)
        query = (params.get('search_query') or params.get('q'))[0]
        self._SEARCH_PARAMS = params.get('sp', ('',))[0]
        return self._get_n_results(query, self._MAX_RESULTS)
4510
4511
class YoutubeFeedsInfoExtractor(YoutubeTabIE):
    """
    Base class for feed extractors
    Subclasses must define the _FEED_NAME property.
    """
    _LOGIN_REQUIRED = True
    _TESTS = []

    @property
    def IE_NAME(self):
        # Derive the extractor name from the feed it serves.
        return 'youtube:%s' % self._FEED_NAME

    def _real_extract(self, url):
        # All feeds live under /feed/<name>; hand off to the tab extractor.
        feed_url = 'https://www.youtube.com/feed/%s' % self._FEED_NAME
        return self.url_result(feed_url, ie=YoutubeTabIE.ie_key())
4528
4529
class YoutubeWatchLaterIE(InfoExtractor):
    IE_NAME = 'youtube:watchlater'
    IE_DESC = 'Youtube watch later list, ":ytwatchlater" for short (requires authentication)'
    _VALID_URL = r':ytwatchlater'
    _TESTS = [{
        'url': ':ytwatchlater',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        # Watch-later is the special 'WL' playlist; delegate to the tab extractor.
        watch_later_url = 'https://www.youtube.com/playlist?list=WL'
        return self.url_result(watch_later_url, ie=YoutubeTabIE.ie_key())
4542
4543
class YoutubeRecommendedIE(YoutubeFeedsInfoExtractor):
    # Feed extractor for the 'recommended' feed (the YouTube home page).
    IE_DESC = 'YouTube.com recommended videos, ":ytrec" for short (requires authentication)'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/?(?:[?#]|$)|:ytrec(?:ommended)?'
    _FEED_NAME = 'recommended'
    # Overrides the base class: recommendations also work while logged out.
    _LOGIN_REQUIRED = False
    _TESTS = [{
        'url': ':ytrec',
        'only_matching': True,
    }, {
        'url': ':ytrecommended',
        'only_matching': True,
    }, {
        'url': 'https://youtube.com',
        'only_matching': True,
    }]
4559
4560
class YoutubeSubscriptionsIE(YoutubeFeedsInfoExtractor):
    # Feed extractor for the authenticated user's subscriptions feed.
    IE_DESC = 'YouTube.com subscriptions feed, ":ytsubs" for short (requires authentication)'
    _VALID_URL = r':ytsub(?:scription)?s?'
    _FEED_NAME = 'subscriptions'
    _TESTS = [{
        'url': ':ytsubs',
        'only_matching': True,
    }, {
        'url': ':ytsubscriptions',
        'only_matching': True,
    }]
4572
4573
class YoutubeHistoryIE(YoutubeFeedsInfoExtractor):
    # Feed extractor for the authenticated user's watch history.
    IE_DESC = 'Youtube watch history, ":ythis" for short (requires authentication)'
    _VALID_URL = r':ythis(?:tory)?'
    _FEED_NAME = 'history'
    _TESTS = [{
        'url': ':ythistory',
        'only_matching': True,
    }]
4582
4583
class YoutubeTruncatedURLIE(InfoExtractor):
    """Catches watch URLs whose '&v=...' part was eaten by an unquoted shell '&'
    and raises a helpful error instead of silently downloading nothing."""
    IE_NAME = 'youtube:truncated_url'
    IE_DESC = False  # Do not list
    _VALID_URL = r'''(?x)
        (?:https?://)?
        (?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie)?\.com/
        (?:watch\?(?:
            feature=[a-z_]+|
            annotation_id=annotation_[^&]+|
            x-yt-cl=[0-9]+|
            hl=[^&]*|
            t=[0-9]+
        )?
        |
            attribution_link\?a=[^&]+
        )
        $
    '''

    _TESTS = [{
        'url': 'https://www.youtube.com/watch?annotation_id=annotation_3951667041',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/watch?',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/watch?x-yt-cl=84503534',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/watch?feature=foo',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/watch?hl=en-GB',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/watch?t=2372',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        # Message fix: refer to this program ('yt-dlp'), not 'youtube-dl',
        # and drop the stray double space before 'or simply'.
        raise ExtractorError(
            'Did you forget to quote the URL? Remember that & is a meta '
            'character in most shells, so you want to put the URL in quotes, '
            'like yt-dlp '
            '"https://www.youtube.com/watch?feature=foo&v=BaW_jenozKc" '
            'or simply yt-dlp BaW_jenozKc .',
            expected=True)
4631
4632
class YoutubeTruncatedIDIE(InfoExtractor):
    """Catches watch URLs whose video id is shorter than the 11 characters
    YouTube uses and reports them as truncated."""
    IE_NAME = 'youtube:truncated_id'
    IE_DESC = False  # Do not list
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/watch\?v=(?P<id>[0-9A-Za-z_-]{1,10})$'

    _TESTS = [{
        'url': 'https://www.youtube.com/watch?v=N_708QY7Ob',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        video_id = self._match_id(url)
        message = 'Incomplete YouTube ID %s. URL %s looks truncated.' % (video_id, url)
        raise ExtractorError(message, expected=True)