]> jfr.im git - yt-dlp.git/blob - yt_dlp/extractor/youtube.py
[youtube] Fix session index extraction and headers for non-web player clients (#526)
[yt-dlp.git] / yt_dlp / extractor / youtube.py
1 # coding: utf-8
2
3 from __future__ import unicode_literals
4
5 import base64
6 import calendar
7 import copy
8 import hashlib
9 import itertools
10 import json
11 import os.path
12 import random
13 import re
14 import time
15 import traceback
16
17 from .common import InfoExtractor, SearchInfoExtractor
18 from ..compat import (
19 compat_chr,
20 compat_HTTPError,
21 compat_parse_qs,
22 compat_str,
23 compat_urllib_parse_unquote_plus,
24 compat_urllib_parse_urlencode,
25 compat_urllib_parse_urlparse,
26 compat_urlparse,
27 )
28 from ..jsinterp import JSInterpreter
29 from ..utils import (
30 bool_or_none,
31 bytes_to_intlist,
32 clean_html,
33 dict_get,
34 datetime_from_str,
35 error_to_compat_str,
36 ExtractorError,
37 format_field,
38 float_or_none,
39 int_or_none,
40 intlist_to_bytes,
41 mimetype2ext,
42 parse_codecs,
43 parse_count,
44 parse_duration,
45 qualities,
46 remove_start,
47 smuggle_url,
48 str_or_none,
49 str_to_int,
50 try_get,
51 unescapeHTML,
52 unified_strdate,
53 unsmuggle_url,
54 update_url_query,
55 url_or_none,
56 urlencode_postdata,
57 urljoin
58 )
59
60
def parse_qs(url):
    """Return the query-string parameters of *url* as a dict of value lists."""
    parsed = compat_urlparse.urlparse(url)
    return compat_urlparse.parse_qs(parsed.query)
63
64
class YoutubeBaseInfoExtractor(InfoExtractor):
    """Provide base functions for Youtube extractors"""

    # Google account login endpoints (legacy flow; username/password login is broken)
    _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
    _TWOFACTOR_URL = 'https://accounts.google.com/signin/challenge'

    _LOOKUP_URL = 'https://accounts.google.com/_/signin/sl/lookup'
    _CHALLENGE_URL = 'https://accounts.google.com/_/signin/sl/challenge'
    _TFA_URL = 'https://accounts.google.com/_/signin/challenge?hl=en&TL={0}'

    # URL path components that can never be a channel/user name
    _RESERVED_NAMES = (
        r'channel|c|user|browse|playlist|watch|w|v|embed|e|watch_popup|shorts|'
        r'movies|results|shared|hashtag|trending|feed|feeds|oembed|get_video_info|'
        r'storefront|oops|index|account|reporthistory|t/terms|about|upload|signin|logout')

    _NETRC_MACHINE = 'youtube'
    # If True it will raise an error if no login info is provided
    _LOGIN_REQUIRED = False

    # Matches prefixed playlist IDs plus the special mix/watch-later/liked lists
    _PLAYLIST_ID_RE = r'(?:(?:PL|LL|EC|UU|FL|RD|UL|TL|PU|OLAK5uy_)[0-9A-Za-z-_]{10,}|RDMM|WL|LL|LM)'
85 def _login(self):
86 """
87 Attempt to log in to YouTube.
88 True is returned if successful or skipped.
89 False is returned if login failed.
90
91 If _LOGIN_REQUIRED is set and no authentication was provided, an error is raised.
92 """
93
94 def warn(message):
95 self.report_warning(message)
96
97 # username+password login is broken
98 if self._LOGIN_REQUIRED and self.get_param('cookiefile') is None:
99 self.raise_login_required(
100 'Login details are needed to download this content', method='cookies')
101 username, password = self._get_login_info()
102 if username:
103 warn('Logging in using username and password is broken. %s' % self._LOGIN_HINTS['cookies'])
104 return
105
106 # Everything below this is broken!
107 r'''
108 # No authentication to be performed
109 if username is None:
110 if self._LOGIN_REQUIRED and self.get_param('cookiefile') is None:
111 raise ExtractorError('No login info available, needed for using %s.' % self.IE_NAME, expected=True)
112 # if self.get_param('cookiefile'): # TODO remove 'and False' later - too many people using outdated cookies and open issues, remind them.
113 # self.to_screen('[Cookies] Reminder - Make sure to always use up to date cookies!')
114 return True
115
116 login_page = self._download_webpage(
117 self._LOGIN_URL, None,
118 note='Downloading login page',
119 errnote='unable to fetch login page', fatal=False)
120 if login_page is False:
121 return
122
123 login_form = self._hidden_inputs(login_page)
124
125 def req(url, f_req, note, errnote):
126 data = login_form.copy()
127 data.update({
128 'pstMsg': 1,
129 'checkConnection': 'youtube',
130 'checkedDomains': 'youtube',
131 'hl': 'en',
132 'deviceinfo': '[null,null,null,[],null,"US",null,null,[],"GlifWebSignIn",null,[null,null,[]]]',
133 'f.req': json.dumps(f_req),
134 'flowName': 'GlifWebSignIn',
135 'flowEntry': 'ServiceLogin',
136 # TODO: reverse actual botguard identifier generation algo
137 'bgRequest': '["identifier",""]',
138 })
139 return self._download_json(
140 url, None, note=note, errnote=errnote,
141 transform_source=lambda s: re.sub(r'^[^[]*', '', s),
142 fatal=False,
143 data=urlencode_postdata(data), headers={
144 'Content-Type': 'application/x-www-form-urlencoded;charset=utf-8',
145 'Google-Accounts-XSRF': 1,
146 })
147
148 lookup_req = [
149 username,
150 None, [], None, 'US', None, None, 2, False, True,
151 [
152 None, None,
153 [2, 1, None, 1,
154 'https://accounts.google.com/ServiceLogin?passive=true&continue=https%3A%2F%2Fwww.youtube.com%2Fsignin%3Fnext%3D%252F%26action_handle_signin%3Dtrue%26hl%3Den%26app%3Ddesktop%26feature%3Dsign_in_button&hl=en&service=youtube&uilel=3&requestPath=%2FServiceLogin&Page=PasswordSeparationSignIn',
155 None, [], 4],
156 1, [None, None, []], None, None, None, True
157 ],
158 username,
159 ]
160
161 lookup_results = req(
162 self._LOOKUP_URL, lookup_req,
163 'Looking up account info', 'Unable to look up account info')
164
165 if lookup_results is False:
166 return False
167
168 user_hash = try_get(lookup_results, lambda x: x[0][2], compat_str)
169 if not user_hash:
170 warn('Unable to extract user hash')
171 return False
172
173 challenge_req = [
174 user_hash,
175 None, 1, None, [1, None, None, None, [password, None, True]],
176 [
177 None, None, [2, 1, None, 1, 'https://accounts.google.com/ServiceLogin?passive=true&continue=https%3A%2F%2Fwww.youtube.com%2Fsignin%3Fnext%3D%252F%26action_handle_signin%3Dtrue%26hl%3Den%26app%3Ddesktop%26feature%3Dsign_in_button&hl=en&service=youtube&uilel=3&requestPath=%2FServiceLogin&Page=PasswordSeparationSignIn', None, [], 4],
178 1, [None, None, []], None, None, None, True
179 ]]
180
181 challenge_results = req(
182 self._CHALLENGE_URL, challenge_req,
183 'Logging in', 'Unable to log in')
184
185 if challenge_results is False:
186 return
187
188 login_res = try_get(challenge_results, lambda x: x[0][5], list)
189 if login_res:
190 login_msg = try_get(login_res, lambda x: x[5], compat_str)
191 warn(
192 'Unable to login: %s' % 'Invalid password'
193 if login_msg == 'INCORRECT_ANSWER_ENTERED' else login_msg)
194 return False
195
196 res = try_get(challenge_results, lambda x: x[0][-1], list)
197 if not res:
198 warn('Unable to extract result entry')
199 return False
200
201 login_challenge = try_get(res, lambda x: x[0][0], list)
202 if login_challenge:
203 challenge_str = try_get(login_challenge, lambda x: x[2], compat_str)
204 if challenge_str == 'TWO_STEP_VERIFICATION':
205 # SEND_SUCCESS - TFA code has been successfully sent to phone
206 # QUOTA_EXCEEDED - reached the limit of TFA codes
207 status = try_get(login_challenge, lambda x: x[5], compat_str)
208 if status == 'QUOTA_EXCEEDED':
209 warn('Exceeded the limit of TFA codes, try later')
210 return False
211
212 tl = try_get(challenge_results, lambda x: x[1][2], compat_str)
213 if not tl:
214 warn('Unable to extract TL')
215 return False
216
217 tfa_code = self._get_tfa_info('2-step verification code')
218
219 if not tfa_code:
220 warn(
221 'Two-factor authentication required. Provide it either interactively or with --twofactor <code>'
222 '(Note that only TOTP (Google Authenticator App) codes work at this time.)')
223 return False
224
225 tfa_code = remove_start(tfa_code, 'G-')
226
227 tfa_req = [
228 user_hash, None, 2, None,
229 [
230 9, None, None, None, None, None, None, None,
231 [None, tfa_code, True, 2]
232 ]]
233
234 tfa_results = req(
235 self._TFA_URL.format(tl), tfa_req,
236 'Submitting TFA code', 'Unable to submit TFA code')
237
238 if tfa_results is False:
239 return False
240
241 tfa_res = try_get(tfa_results, lambda x: x[0][5], list)
242 if tfa_res:
243 tfa_msg = try_get(tfa_res, lambda x: x[5], compat_str)
244 warn(
245 'Unable to finish TFA: %s' % 'Invalid TFA code'
246 if tfa_msg == 'INCORRECT_ANSWER_ENTERED' else tfa_msg)
247 return False
248
249 check_cookie_url = try_get(
250 tfa_results, lambda x: x[0][-1][2], compat_str)
251 else:
252 CHALLENGES = {
253 'LOGIN_CHALLENGE': "This device isn't recognized. For your security, Google wants to make sure it's really you.",
254 'USERNAME_RECOVERY': 'Please provide additional information to aid in the recovery process.',
255 'REAUTH': "There is something unusual about your activity. For your security, Google wants to make sure it's really you.",
256 }
257 challenge = CHALLENGES.get(
258 challenge_str,
259 '%s returned error %s.' % (self.IE_NAME, challenge_str))
260 warn('%s\nGo to https://accounts.google.com/, login and solve a challenge.' % challenge)
261 return False
262 else:
263 check_cookie_url = try_get(res, lambda x: x[2], compat_str)
264
265 if not check_cookie_url:
266 warn('Unable to extract CheckCookie URL')
267 return False
268
269 check_cookie_results = self._download_webpage(
270 check_cookie_url, None, 'Checking cookie', fatal=False)
271
272 if check_cookie_results is False:
273 return False
274
275 if 'https://myaccount.google.com/' not in check_cookie_results:
276 warn('Unable to log in')
277 return False
278
279 return True
280 '''
281
282 def _initialize_consent(self):
283 cookies = self._get_cookies('https://www.youtube.com/')
284 if cookies.get('__Secure-3PSID'):
285 return
286 consent_id = None
287 consent = cookies.get('CONSENT')
288 if consent:
289 if 'YES' in consent.value:
290 return
291 consent_id = self._search_regex(
292 r'PENDING\+(\d+)', consent.value, 'consent', default=None)
293 if not consent_id:
294 consent_id = random.randint(100, 999)
295 self._set_cookie('.youtube.com', 'CONSENT', 'YES+cb.20210328-17-p0.en+FX+%s' % consent_id)
296
297 def _real_initialize(self):
298 self._initialize_consent()
299 if self._downloader is None:
300 return
301 if not self._login():
302 return
303
    # Pattern for the ytInitialData JSON blob embedded in watch/browse pages
    _YT_INITIAL_DATA_RE = r'(?:window\s*\[\s*["\']ytInitialData["\']\s*\]|ytInitialData)\s*=\s*({.+?})\s*;'
    # Pattern for the player response JSON embedded in watch pages
    _YT_INITIAL_PLAYER_RESPONSE_RE = r'ytInitialPlayerResponse\s*=\s*({.+?})\s*;'
    # Tokens that can terminate the JSON blob when the trailing ';' match is ambiguous
    _YT_INITIAL_BOUNDARY_RE = r'(?:var\s+meta|</script|\n)'
307
    # Built-in fallback ytcfg values per innertube client, used when the real
    # ytcfg cannot be extracted from the page.  Keys mirror the variables
    # YouTube sets via ytcfg.set() on its web pages.
    _YT_DEFAULT_YTCFGS = {
        'WEB': {
            'INNERTUBE_API_VERSION': 'v1',
            'INNERTUBE_CLIENT_NAME': 'WEB',
            'INNERTUBE_CLIENT_VERSION': '2.20210622.10.00',
            'INNERTUBE_API_KEY': 'AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8',
            'INNERTUBE_CONTEXT': {
                'client': {
                    'clientName': 'WEB',
                    'clientVersion': '2.20210622.10.00',
                    'hl': 'en',
                }
            },
            'INNERTUBE_CONTEXT_CLIENT_NAME': 1
        },
        'WEB_REMIX': {
            'INNERTUBE_API_VERSION': 'v1',
            'INNERTUBE_CLIENT_NAME': 'WEB_REMIX',
            'INNERTUBE_CLIENT_VERSION': '1.20210621.00.00',
            'INNERTUBE_API_KEY': 'AIzaSyC9XL3ZjWddXya6X74dJoCTL-WEYFDNX30',
            'INNERTUBE_CONTEXT': {
                'client': {
                    'clientName': 'WEB_REMIX',
                    'clientVersion': '1.20210621.00.00',
                    'hl': 'en',
                }
            },
            'INNERTUBE_CONTEXT_CLIENT_NAME': 67
        },
        'WEB_EMBEDDED_PLAYER': {
            'INNERTUBE_API_VERSION': 'v1',
            'INNERTUBE_CLIENT_NAME': 'WEB_EMBEDDED_PLAYER',
            'INNERTUBE_CLIENT_VERSION': '1.20210620.0.1',
            'INNERTUBE_API_KEY': 'AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8',
            'INNERTUBE_CONTEXT': {
                'client': {
                    'clientName': 'WEB_EMBEDDED_PLAYER',
                    'clientVersion': '1.20210620.0.1',
                    'hl': 'en',
                }
            },
            'INNERTUBE_CONTEXT_CLIENT_NAME': 56
        },
        'ANDROID': {
            'INNERTUBE_API_VERSION': 'v1',
            'INNERTUBE_CLIENT_NAME': 'ANDROID',
            'INNERTUBE_CLIENT_VERSION': '16.20',
            'INNERTUBE_API_KEY': 'AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8',
            'INNERTUBE_CONTEXT': {
                'client': {
                    'clientName': 'ANDROID',
                    'clientVersion': '16.20',
                    'hl': 'en',
                }
            },
            # NOTE(review): non-web clients use the client-name string rather
            # than a numeric ID here — confirm against the API's expectations
            'INNERTUBE_CONTEXT_CLIENT_NAME': 'ANDROID'
        },
        'ANDROID_EMBEDDED_PLAYER': {
            'INNERTUBE_API_VERSION': 'v1',
            'INNERTUBE_CLIENT_NAME': 'ANDROID_EMBEDDED_PLAYER',
            'INNERTUBE_CLIENT_VERSION': '16.20',
            'INNERTUBE_API_KEY': 'AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8',
            'INNERTUBE_CONTEXT': {
                'client': {
                    'clientName': 'ANDROID_EMBEDDED_PLAYER',
                    'clientVersion': '16.20',
                    'hl': 'en',
                }
            },
            'INNERTUBE_CONTEXT_CLIENT_NAME': 'ANDROID_EMBEDDED_PLAYER'
        },
        'ANDROID_MUSIC': {
            'INNERTUBE_API_VERSION': 'v1',
            'INNERTUBE_CLIENT_NAME': 'ANDROID_MUSIC',
            'INNERTUBE_CLIENT_VERSION': '4.32',
            'INNERTUBE_API_KEY': 'AIzaSyC9XL3ZjWddXya6X74dJoCTL-WEYFDNX30',
            'INNERTUBE_CONTEXT': {
                'client': {
                    'clientName': 'ANDROID_MUSIC',
                    'clientVersion': '4.32',
                    'hl': 'en',
                }
            },
            'INNERTUBE_CONTEXT_CLIENT_NAME': 'ANDROID_MUSIC'
        }
    }
394
    # API hostname to use per innertube client; clients not listed here fall
    # back to the WEB host (see _get_innertube_host)
    _YT_DEFAULT_INNERTUBE_HOSTS = {
        'DIRECT': 'youtubei.googleapis.com',
        'WEB': 'www.youtube.com',
        'WEB_REMIX': 'music.youtube.com',
        'ANDROID_MUSIC': 'music.youtube.com'
    }
401
402 def _get_default_ytcfg(self, client='WEB'):
403 if client in self._YT_DEFAULT_YTCFGS:
404 return copy.deepcopy(self._YT_DEFAULT_YTCFGS[client])
405 self.write_debug(f'INNERTUBE default client {client} does not exist - falling back to WEB client.')
406 return copy.deepcopy(self._YT_DEFAULT_YTCFGS['WEB'])
407
408 def _get_innertube_host(self, client='WEB'):
409 return dict_get(self._YT_DEFAULT_INNERTUBE_HOSTS, (client, 'WEB'))
410
411 def _ytcfg_get_safe(self, ytcfg, getter, expected_type=None, default_client='WEB'):
412 # try_get but with fallback to default ytcfg client values when present
413 _func = lambda y: try_get(y, getter, expected_type)
414 return _func(ytcfg) or _func(self._get_default_ytcfg(default_client))
415
    def _extract_client_name(self, ytcfg, default_client='WEB'):
        # INNERTUBE_CLIENT_NAME from the page ytcfg, else from the built-in default
        return self._ytcfg_get_safe(ytcfg, lambda x: x['INNERTUBE_CLIENT_NAME'], compat_str, default_client)
418
    @staticmethod
    def _extract_session_index(ytcfg):
        # SESSION_INDEX identifies which logged-in account (multi-login) to act as;
        # None when absent or non-numeric
        return int_or_none(try_get(ytcfg, lambda x: x['SESSION_INDEX']))
422
    def _extract_client_version(self, ytcfg, default_client='WEB'):
        # INNERTUBE_CLIENT_VERSION from the page ytcfg, else from the built-in default
        return self._ytcfg_get_safe(ytcfg, lambda x: x['INNERTUBE_CLIENT_VERSION'], compat_str, default_client)
425
    def _extract_api_key(self, ytcfg=None, default_client='WEB'):
        # INNERTUBE_API_KEY from the page ytcfg, else from the built-in default
        return self._ytcfg_get_safe(ytcfg, lambda x: x['INNERTUBE_API_KEY'], compat_str, default_client)
428
429 def _extract_context(self, ytcfg=None, default_client='WEB'):
430 _get_context = lambda y: try_get(y, lambda x: x['INNERTUBE_CONTEXT'], dict)
431 context = _get_context(ytcfg)
432 if context:
433 return context
434
435 context = _get_context(self._get_default_ytcfg(default_client))
436 if not ytcfg:
437 return context
438
439 # Recreate the client context (required)
440 context['client'].update({
441 'clientVersion': self._extract_client_version(ytcfg, default_client),
442 'clientName': self._extract_client_name(ytcfg, default_client),
443 })
444 visitor_data = try_get(ytcfg, lambda x: x['VISITOR_DATA'], compat_str)
445 if visitor_data:
446 context['client']['visitorData'] = visitor_data
447 return context
448
449 def _generate_sapisidhash_header(self, origin='https://www.youtube.com'):
450 # Sometimes SAPISID cookie isn't present but __Secure-3PAPISID is.
451 # See: https://github.com/yt-dlp/yt-dlp/issues/393
452 yt_cookies = self._get_cookies('https://www.youtube.com')
453 sapisid_cookie = dict_get(
454 yt_cookies, ('__Secure-3PAPISID', 'SAPISID'))
455 if sapisid_cookie is None:
456 return
457 time_now = round(time.time())
458 # SAPISID cookie is required if not already present
459 if not yt_cookies.get('SAPISID'):
460 self._set_cookie(
461 '.youtube.com', 'SAPISID', sapisid_cookie.value, secure=True, expire_time=time_now + 3600)
462 # SAPISIDHASH algorithm from https://stackoverflow.com/a/32065323
463 sapisidhash = hashlib.sha1(
464 f'{time_now} {sapisid_cookie.value} {origin}'.encode('utf-8')).hexdigest()
465 return f'SAPISIDHASH {time_now}_{sapisidhash}'
466
467 def _call_api(self, ep, query, video_id, fatal=True, headers=None,
468 note='Downloading API JSON', errnote='Unable to download API page',
469 context=None, api_key=None, api_hostname=None, default_client='WEB'):
470
471 data = {'context': context} if context else {'context': self._extract_context(default_client=default_client)}
472 data.update(query)
473 real_headers = self._generate_api_headers(client=default_client)
474 real_headers.update({'content-type': 'application/json'})
475 if headers:
476 real_headers.update(headers)
477 return self._download_json(
478 'https://%s/youtubei/v1/%s' % (api_hostname or self._get_innertube_host(default_client), ep),
479 video_id=video_id, fatal=fatal, note=note, errnote=errnote,
480 data=json.dumps(data).encode('utf8'), headers=real_headers,
481 query={'key': api_key or self._extract_api_key()})
482
483 def _extract_yt_initial_data(self, video_id, webpage):
484 return self._parse_json(
485 self._search_regex(
486 (r'%s\s*%s' % (self._YT_INITIAL_DATA_RE, self._YT_INITIAL_BOUNDARY_RE),
487 self._YT_INITIAL_DATA_RE), webpage, 'yt initial data'),
488 video_id)
489
490 def _extract_identity_token(self, webpage, item_id):
491 ytcfg = self._extract_ytcfg(item_id, webpage)
492 if ytcfg:
493 token = try_get(ytcfg, lambda x: x['ID_TOKEN'], compat_str)
494 if token:
495 return token
496 return self._search_regex(
497 r'\bID_TOKEN["\']\s*:\s*["\'](.+?)["\']', webpage,
498 'identity token', default=None)
499
500 @staticmethod
501 def _extract_account_syncid(data):
502 """
503 Extract syncId required to download private playlists of secondary channels
504 @param data Either response or ytcfg
505 """
506 sync_ids = (try_get(
507 data, (lambda x: x['responseContext']['mainAppWebResponseContext']['datasyncId'],
508 lambda x: x['DATASYNC_ID']), compat_str) or '').split("||")
509 if len(sync_ids) >= 2 and sync_ids[1]:
510 # datasyncid is of the form "channel_syncid||user_syncid" for secondary channel
511 # and just "user_syncid||" for primary channel. We only want the channel_syncid
512 return sync_ids[0]
513 # ytcfg includes channel_syncid if on secondary channel
514 return data.get('DELEGATED_SESSION_ID')
515
516 def _extract_ytcfg(self, video_id, webpage):
517 if not webpage:
518 return {}
519 return self._parse_json(
520 self._search_regex(
521 r'ytcfg\.set\s*\(\s*({.+?})\s*\)\s*;', webpage, 'ytcfg',
522 default='{}'), video_id, fatal=False) or {}
523
    def _generate_api_headers(self, ytcfg=None, identity_token=None, account_syncid=None,
                              visitor_data=None, api_hostname=None, client='WEB', session_index=None):
        """
        Build the HTTP headers for an innertube API request.

        Client name/version come from *ytcfg* (falling back to the built-in
        defaults for *client*); auth-related headers are added only when the
        corresponding token/cookie data is available.
        """
        origin = 'https://' + (api_hostname if api_hostname else self._get_innertube_host(client))
        headers = {
            'X-YouTube-Client-Name': compat_str(
                self._ytcfg_get_safe(ytcfg, lambda x: x['INNERTUBE_CONTEXT_CLIENT_NAME'], default_client=client)),
            'X-YouTube-Client-Version': self._extract_client_version(ytcfg, client),
            'Origin': origin
        }
        if not visitor_data and ytcfg:
            visitor_data = try_get(
                self._extract_context(ytcfg, client), lambda x: x['client']['visitorData'], compat_str)
        if identity_token:
            headers['X-Youtube-Identity-Token'] = identity_token
        if account_syncid:
            headers['X-Goog-PageId'] = account_syncid
        if session_index is None and ytcfg:
            session_index = self._extract_session_index(ytcfg)
        # X-Goog-AuthUser selects the account in multi-login sessions; default 0
        if account_syncid or session_index is not None:
            headers['X-Goog-AuthUser'] = session_index if session_index is not None else 0
        if visitor_data:
            headers['X-Goog-Visitor-Id'] = visitor_data
        # SAPISIDHASH auth only applies when the relevant cookies exist
        auth = self._generate_sapisidhash_header(origin)
        if auth is not None:
            headers['Authorization'] = auth
            headers['X-Origin'] = origin
        return headers
551
552 @staticmethod
553 def _build_api_continuation_query(continuation, ctp=None):
554 query = {
555 'continuation': continuation
556 }
557 # TODO: Inconsistency with clickTrackingParams.
558 # Currently we have a fixed ctp contained within context (from ytcfg)
559 # and a ctp in root query for continuation.
560 if ctp:
561 query['clickTracking'] = {'clickTrackingParams': ctp}
562 return query
563
564 @classmethod
565 def _continuation_query_ajax_to_api(cls, continuation_query):
566 continuation = dict_get(continuation_query, ('continuation', 'ctoken'))
567 return cls._build_api_continuation_query(continuation, continuation_query.get('itct'))
568
569 @staticmethod
570 def _build_continuation_query(continuation, ctp=None):
571 query = {
572 'ctoken': continuation,
573 'continuation': continuation,
574 }
575 if ctp:
576 query['itct'] = ctp
577 return query
578
579 @classmethod
580 def _extract_next_continuation_data(cls, renderer):
581 next_continuation = try_get(
582 renderer, (lambda x: x['continuations'][0]['nextContinuationData'],
583 lambda x: x['continuation']['reloadContinuationData']), dict)
584 if not next_continuation:
585 return
586 continuation = next_continuation.get('continuation')
587 if not continuation:
588 return
589 ctp = next_continuation.get('clickTrackingParams')
590 return cls._build_continuation_query(continuation, ctp)
591
592 @classmethod
593 def _extract_continuation_ep_data(cls, continuation_ep: dict):
594 if isinstance(continuation_ep, dict):
595 continuation = try_get(
596 continuation_ep, lambda x: x['continuationCommand']['token'], compat_str)
597 if not continuation:
598 return
599 ctp = continuation_ep.get('clickTrackingParams')
600 return cls._build_continuation_query(continuation, ctp)
601
602 @classmethod
603 def _extract_continuation(cls, renderer):
604 next_continuation = cls._extract_next_continuation_data(renderer)
605 if next_continuation:
606 return next_continuation
607 contents = []
608 for key in ('contents', 'items'):
609 contents.extend(try_get(renderer, lambda x: x[key], list) or [])
610 for content in contents:
611 if not isinstance(content, dict):
612 continue
613 continuation_ep = try_get(
614 content, (lambda x: x['continuationItemRenderer']['continuationEndpoint'],
615 lambda x: x['continuationItemRenderer']['button']['buttonRenderer']['command']),
616 dict)
617 continuation = cls._extract_continuation_ep_data(continuation_ep)
618 if continuation:
619 return continuation
620
621 @staticmethod
622 def _extract_alerts(data):
623 for alert_dict in try_get(data, lambda x: x['alerts'], list) or []:
624 if not isinstance(alert_dict, dict):
625 continue
626 for alert in alert_dict.values():
627 alert_type = alert.get('type')
628 if not alert_type:
629 continue
630 message = try_get(alert, lambda x: x['text']['simpleText'], compat_str) or ''
631 if message:
632 yield alert_type, message
633 for run in try_get(alert, lambda x: x['text']['runs'], list) or []:
634 message += try_get(run, lambda x: x['text'], compat_str)
635 if message:
636 yield alert_type, message
637
638 def _report_alerts(self, alerts, expected=True):
639 errors = []
640 warnings = []
641 for alert_type, alert_message in alerts:
642 if alert_type.lower() == 'error':
643 errors.append([alert_type, alert_message])
644 else:
645 warnings.append([alert_type, alert_message])
646
647 for alert_type, alert_message in (warnings + errors[:-1]):
648 self.report_warning('YouTube said: %s - %s' % (alert_type, alert_message))
649 if errors:
650 raise ExtractorError('YouTube said: %s' % errors[-1][1], expected=expected)
651
    def _extract_and_report_alerts(self, data, *args, **kwargs):
        # Convenience wrapper: parse alerts out of *data* and report/raise them
        return self._report_alerts(self._extract_alerts(data), *args, **kwargs)
654
655 def _extract_badges(self, renderer: dict):
656 badges = set()
657 for badge in try_get(renderer, lambda x: x['badges'], list) or []:
658 label = try_get(badge, lambda x: x['metadataBadgeRenderer']['label'], compat_str)
659 if label:
660 badges.add(label.lower())
661 return badges
662
663 @staticmethod
664 def _join_text_entries(runs):
665 text = None
666 for run in runs:
667 if not isinstance(run, dict):
668 continue
669 sub_text = try_get(run, lambda x: x['text'], compat_str)
670 if sub_text:
671 if not text:
672 text = sub_text
673 continue
674 text += sub_text
675 return text
676
    def _extract_response(self, item_id, query, note='Downloading API JSON', headers=None,
                          ytcfg=None, check_get_keys=None, ep='browse', fatal=True, api_hostname=None,
                          default_client='WEB'):
        """
        Call the innertube endpoint *ep*, retrying on transient HTTP errors and
        on incomplete responses.

        @param check_get_keys keys of which at least one must be present in the
               response for it to count as complete; retried otherwise
        Returns the parsed JSON response, or None when failed and not fatal.
        """
        response = None
        last_error = None
        count = -1
        retries = self.get_param('extractor_retries', 3)
        if check_get_keys is None:
            check_get_keys = []
        while count < retries:
            count += 1
            if last_error:
                self.report_warning('%s. Retrying ...' % last_error)
            try:
                # fatal=True here so transport errors surface as ExtractorError
                # and can be classified below; caller's `fatal` applies after retries
                response = self._call_api(
                    ep=ep, fatal=True, headers=headers,
                    video_id=item_id, query=query,
                    context=self._extract_context(ytcfg, default_client),
                    api_key=self._extract_api_key(ytcfg, default_client),
                    api_hostname=api_hostname, default_client=default_client,
                    note='%s%s' % (note, ' (retry #%d)' % count if count else ''))
            except ExtractorError as e:
                if isinstance(e.cause, compat_HTTPError) and e.cause.code in (500, 503, 404):
                    # Downloading page may result in intermittent 5xx HTTP error
                    # Sometimes a 404 is also received. See: https://github.com/ytdl-org/youtube-dl/issues/28289
                    last_error = 'HTTP Error %s' % e.cause.code
                    if count < retries:
                        continue
                if fatal:
                    raise
                else:
                    self.report_warning(error_to_compat_str(e))
                    return

            else:
                # Youtube may send alerts if there was an issue with the continuation page
                try:
                    self._extract_and_report_alerts(response, expected=False)
                except ExtractorError as e:
                    if fatal:
                        raise
                    self.report_warning(error_to_compat_str(e))
                    return
                if not check_get_keys or dict_get(response, check_get_keys):
                    break
                # Youtube sometimes sends incomplete data
                # See: https://github.com/ytdl-org/youtube-dl/issues/28194
                last_error = 'Incomplete data received'
                if count >= retries:
                    if fatal:
                        raise ExtractorError(last_error)
                    else:
                        self.report_warning(last_error)
                        return
        return response
732
733 @staticmethod
734 def is_music_url(url):
735 return re.match(r'https?://music\.youtube\.com/', url) is not None
736
    def _extract_video(self, renderer):
        """Build a flat 'url' result dict from a video renderer (search/playlist item)."""
        video_id = renderer.get('videoId')
        # Title appears either as runs or as simpleText depending on renderer type
        title = try_get(
            renderer,
            (lambda x: x['title']['runs'][0]['text'],
             lambda x: x['title']['simpleText']), compat_str)
        description = try_get(
            renderer, lambda x: x['descriptionSnippet']['runs'][0]['text'],
            compat_str)
        duration = parse_duration(try_get(
            renderer, lambda x: x['lengthText']['simpleText'], compat_str))
        view_count_text = try_get(
            renderer, lambda x: x['viewCountText']['simpleText'], compat_str) or ''
        # Strip whitespace first so "1 234 views"-style localizations parse
        view_count = str_to_int(self._search_regex(
            r'^([\d,]+)', re.sub(r'\s', '', view_count_text),
            'view count', default=None))
        uploader = try_get(
            renderer,
            (lambda x: x['ownerText']['runs'][0]['text'],
             lambda x: x['shortBylineText']['runs'][0]['text']), compat_str)
        return {
            '_type': 'url',
            'ie_key': YoutubeIE.ie_key(),
            'id': video_id,
            'url': video_id,
            'title': title,
            'description': description,
            'duration': duration,
            'view_count': view_count,
            'uploader': uploader,
        }
768
769
770 class YoutubeIE(YoutubeBaseInfoExtractor):
771 IE_DESC = 'YouTube.com'
772 _INVIDIOUS_SITES = (
773 # invidious-redirect websites
774 r'(?:www\.)?redirect\.invidious\.io',
775 r'(?:(?:www|dev)\.)?invidio\.us',
776 # Invidious instances taken from https://github.com/iv-org/documentation/blob/master/Invidious-Instances.md
777 r'(?:www\.)?invidious\.pussthecat\.org',
778 r'(?:www\.)?invidious\.zee\.li',
779 r'(?:www\.)?invidious\.ethibox\.fr',
780 r'(?:www\.)?invidious\.3o7z6yfxhbw7n3za4rss6l434kmv55cgw2vuziwuigpwegswvwzqipyd\.onion',
781 # youtube-dl invidious instances list
782 r'(?:(?:www|no)\.)?invidiou\.sh',
783 r'(?:(?:www|fi)\.)?invidious\.snopyta\.org',
784 r'(?:www\.)?invidious\.kabi\.tk',
785 r'(?:www\.)?invidious\.mastodon\.host',
786 r'(?:www\.)?invidious\.zapashcanon\.fr',
787 r'(?:www\.)?(?:invidious(?:-us)?|piped)\.kavin\.rocks',
788 r'(?:www\.)?invidious\.tinfoil-hat\.net',
789 r'(?:www\.)?invidious\.himiko\.cloud',
790 r'(?:www\.)?invidious\.reallyancient\.tech',
791 r'(?:www\.)?invidious\.tube',
792 r'(?:www\.)?invidiou\.site',
793 r'(?:www\.)?invidious\.site',
794 r'(?:www\.)?invidious\.xyz',
795 r'(?:www\.)?invidious\.nixnet\.xyz',
796 r'(?:www\.)?invidious\.048596\.xyz',
797 r'(?:www\.)?invidious\.drycat\.fr',
798 r'(?:www\.)?inv\.skyn3t\.in',
799 r'(?:www\.)?tube\.poal\.co',
800 r'(?:www\.)?tube\.connect\.cafe',
801 r'(?:www\.)?vid\.wxzm\.sx',
802 r'(?:www\.)?vid\.mint\.lgbt',
803 r'(?:www\.)?vid\.puffyan\.us',
804 r'(?:www\.)?yewtu\.be',
805 r'(?:www\.)?yt\.elukerio\.org',
806 r'(?:www\.)?yt\.lelux\.fi',
807 r'(?:www\.)?invidious\.ggc-project\.de',
808 r'(?:www\.)?yt\.maisputain\.ovh',
809 r'(?:www\.)?ytprivate\.com',
810 r'(?:www\.)?invidious\.13ad\.de',
811 r'(?:www\.)?invidious\.toot\.koeln',
812 r'(?:www\.)?invidious\.fdn\.fr',
813 r'(?:www\.)?watch\.nettohikari\.com',
814 r'(?:www\.)?invidious\.namazso\.eu',
815 r'(?:www\.)?invidious\.silkky\.cloud',
816 r'(?:www\.)?invidious\.exonip\.de',
817 r'(?:www\.)?invidious\.riverside\.rocks',
818 r'(?:www\.)?invidious\.blamefran\.net',
819 r'(?:www\.)?invidious\.moomoo\.de',
820 r'(?:www\.)?ytb\.trom\.tf',
821 r'(?:www\.)?yt\.cyberhost\.uk',
822 r'(?:www\.)?kgg2m7yk5aybusll\.onion',
823 r'(?:www\.)?qklhadlycap4cnod\.onion',
824 r'(?:www\.)?axqzx4s6s54s32yentfqojs3x5i7faxza6xo3ehd4bzzsg2ii4fv2iid\.onion',
825 r'(?:www\.)?c7hqkpkpemu6e7emz5b4vyz7idjgdvgaaa3dyimmeojqbgpea3xqjoid\.onion',
826 r'(?:www\.)?fz253lmuao3strwbfbmx46yu7acac2jz27iwtorgmbqlkurlclmancad\.onion',
827 r'(?:www\.)?invidious\.l4qlywnpwqsluw65ts7md3khrivpirse744un3x7mlskqauz5pyuzgqd\.onion',
828 r'(?:www\.)?owxfohz4kjyv25fvlqilyxast7inivgiktls3th44jhk3ej3i7ya\.b32\.i2p',
829 r'(?:www\.)?4l2dgddgsrkf2ous66i6seeyi6etzfgrue332grh2n7madpwopotugyd\.onion',
830 r'(?:www\.)?w6ijuptxiku4xpnnaetxvnkc5vqcdu7mgns2u77qefoixi63vbvnpnqd\.onion',
831 r'(?:www\.)?kbjggqkzv65ivcqj6bumvp337z6264huv5kpkwuv6gu5yjiskvan7fad\.onion',
832 r'(?:www\.)?grwp24hodrefzvjjuccrkw3mjq4tzhaaq32amf33dzpmuxe7ilepcmad\.onion',
833 r'(?:www\.)?hpniueoejy4opn7bc4ftgazyqjoeqwlvh2uiku2xqku6zpoa4bf5ruid\.onion',
834 )
835 _VALID_URL = r"""(?x)^
836 (
837 (?:https?://|//) # http(s):// or protocol-independent URL
838 (?:(?:(?:(?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie|kids)?\.com|
839 (?:www\.)?deturl\.com/www\.youtube\.com|
840 (?:www\.)?pwnyoutube\.com|
841 (?:www\.)?hooktube\.com|
842 (?:www\.)?yourepeat\.com|
843 tube\.majestyc\.net|
844 %(invidious)s|
845 youtube\.googleapis\.com)/ # the various hostnames, with wildcard subdomains
846 (?:.*?\#/)? # handle anchor (#/) redirect urls
847 (?: # the various things that can precede the ID:
848 (?:(?:v|embed|e)/(?!videoseries)) # v/ or embed/ or e/
849 |(?: # or the v= param in all its forms
850 (?:(?:watch|movie)(?:_popup)?(?:\.php)?/?)? # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
851 (?:\?|\#!?) # the params delimiter ? or # or #!
852 (?:.*?[&;])?? # any other preceding param (like /?s=tuff&v=xxxx or ?s=tuff&amp;v=V36LpHqtcDY)
853 v=
854 )
855 ))
856 |(?:
857 youtu\.be| # just youtu.be/xxxx
858 vid\.plus| # or vid.plus/xxxx
859 zwearz\.com/watch| # or zwearz.com/watch/xxxx
860 %(invidious)s
861 )/
862 |(?:www\.)?cleanvideosearch\.com/media/action/yt/watch\?videoId=
863 )
864 )? # all until now is optional -> you can pass the naked ID
865 (?P<id>[0-9A-Za-z_-]{11}) # here is it! the YouTube video ID
866 (?(1).+)? # if we found the ID, everything can follow
867 (?:\#|$)""" % {
868 'invidious': '|'.join(_INVIDIOUS_SITES),
869 }
870 _PLAYER_INFO_RE = (
871 r'/s/player/(?P<id>[a-zA-Z0-9_-]{8,})/player',
872 r'/(?P<id>[a-zA-Z0-9_-]{8,})/player(?:_ias\.vflset(?:/[a-zA-Z]{2,3}_[a-zA-Z]{2,3})?|-plasma-ias-(?:phone|tablet)-[a-z]{2}_[A-Z]{2}\.vflset)/base\.js$',
873 r'\b(?P<id>vfl[a-zA-Z0-9_-]+)\b.*?\.js$',
874 )
    # Static itag -> format-metadata table (keys are itag strings). Values
    # supplement what the player response reports; fields missing here (e.g.
    # height for audio-only itags, abr for DASH video) are intentional.
    # Negative 'preference' demotes 3D/HLS variants in format selection.
    _formats = {
        '5': {'ext': 'flv', 'width': 400, 'height': 240, 'acodec': 'mp3', 'abr': 64, 'vcodec': 'h263'},
        '6': {'ext': 'flv', 'width': 450, 'height': 270, 'acodec': 'mp3', 'abr': 64, 'vcodec': 'h263'},
        '13': {'ext': '3gp', 'acodec': 'aac', 'vcodec': 'mp4v'},
        '17': {'ext': '3gp', 'width': 176, 'height': 144, 'acodec': 'aac', 'abr': 24, 'vcodec': 'mp4v'},
        '18': {'ext': 'mp4', 'width': 640, 'height': 360, 'acodec': 'aac', 'abr': 96, 'vcodec': 'h264'},
        '22': {'ext': 'mp4', 'width': 1280, 'height': 720, 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264'},
        '34': {'ext': 'flv', 'width': 640, 'height': 360, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'},
        '35': {'ext': 'flv', 'width': 854, 'height': 480, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'},
        # itag 36 videos are either 320x180 (BaW_jenozKc) or 320x240 (__2ABJjxzNo), abr varies as well
        '36': {'ext': '3gp', 'width': 320, 'acodec': 'aac', 'vcodec': 'mp4v'},
        '37': {'ext': 'mp4', 'width': 1920, 'height': 1080, 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264'},
        '38': {'ext': 'mp4', 'width': 4096, 'height': 3072, 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264'},
        '43': {'ext': 'webm', 'width': 640, 'height': 360, 'acodec': 'vorbis', 'abr': 128, 'vcodec': 'vp8'},
        '44': {'ext': 'webm', 'width': 854, 'height': 480, 'acodec': 'vorbis', 'abr': 128, 'vcodec': 'vp8'},
        '45': {'ext': 'webm', 'width': 1280, 'height': 720, 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8'},
        '46': {'ext': 'webm', 'width': 1920, 'height': 1080, 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8'},
        '59': {'ext': 'mp4', 'width': 854, 'height': 480, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'},
        '78': {'ext': 'mp4', 'width': 854, 'height': 480, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'},


        # 3D videos
        '82': {'ext': 'mp4', 'height': 360, 'format_note': '3D', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -20},
        '83': {'ext': 'mp4', 'height': 480, 'format_note': '3D', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -20},
        '84': {'ext': 'mp4', 'height': 720, 'format_note': '3D', 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264', 'preference': -20},
        '85': {'ext': 'mp4', 'height': 1080, 'format_note': '3D', 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264', 'preference': -20},
        '100': {'ext': 'webm', 'height': 360, 'format_note': '3D', 'acodec': 'vorbis', 'abr': 128, 'vcodec': 'vp8', 'preference': -20},
        '101': {'ext': 'webm', 'height': 480, 'format_note': '3D', 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8', 'preference': -20},
        '102': {'ext': 'webm', 'height': 720, 'format_note': '3D', 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8', 'preference': -20},

        # Apple HTTP Live Streaming
        '91': {'ext': 'mp4', 'height': 144, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 48, 'vcodec': 'h264', 'preference': -10},
        '92': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 48, 'vcodec': 'h264', 'preference': -10},
        '93': {'ext': 'mp4', 'height': 360, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -10},
        '94': {'ext': 'mp4', 'height': 480, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -10},
        '95': {'ext': 'mp4', 'height': 720, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 256, 'vcodec': 'h264', 'preference': -10},
        '96': {'ext': 'mp4', 'height': 1080, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 256, 'vcodec': 'h264', 'preference': -10},
        '132': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 48, 'vcodec': 'h264', 'preference': -10},
        '151': {'ext': 'mp4', 'height': 72, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 24, 'vcodec': 'h264', 'preference': -10},

        # DASH mp4 video
        '133': {'ext': 'mp4', 'height': 240, 'format_note': 'DASH video', 'vcodec': 'h264'},
        '134': {'ext': 'mp4', 'height': 360, 'format_note': 'DASH video', 'vcodec': 'h264'},
        '135': {'ext': 'mp4', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'h264'},
        '136': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'h264'},
        '137': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'h264'},
        '138': {'ext': 'mp4', 'format_note': 'DASH video', 'vcodec': 'h264'},  # Height can vary (https://github.com/ytdl-org/youtube-dl/issues/4559)
        '160': {'ext': 'mp4', 'height': 144, 'format_note': 'DASH video', 'vcodec': 'h264'},
        '212': {'ext': 'mp4', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'h264'},
        '264': {'ext': 'mp4', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'h264'},
        '298': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'h264', 'fps': 60},
        '299': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'h264', 'fps': 60},
        '266': {'ext': 'mp4', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'h264'},

        # Dash mp4 audio
        '139': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 48, 'container': 'm4a_dash'},
        '140': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 128, 'container': 'm4a_dash'},
        '141': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 256, 'container': 'm4a_dash'},
        '256': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'container': 'm4a_dash'},
        '258': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'container': 'm4a_dash'},
        '325': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'dtse', 'container': 'm4a_dash'},
        '328': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'ec-3', 'container': 'm4a_dash'},

        # Dash webm
        '167': {'ext': 'webm', 'height': 360, 'width': 640, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
        '168': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
        '169': {'ext': 'webm', 'height': 720, 'width': 1280, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
        '170': {'ext': 'webm', 'height': 1080, 'width': 1920, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
        '218': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
        '219': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
        '278': {'ext': 'webm', 'height': 144, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp9'},
        '242': {'ext': 'webm', 'height': 240, 'format_note': 'DASH video', 'vcodec': 'vp9'},
        '243': {'ext': 'webm', 'height': 360, 'format_note': 'DASH video', 'vcodec': 'vp9'},
        '244': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'vp9'},
        '245': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'vp9'},
        '246': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'vp9'},
        '247': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'vp9'},
        '248': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'vp9'},
        '271': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'vp9'},
        # itag 272 videos are either 3840x2160 (e.g. RtoitU2A-3E) or 7680x4320 (sLprVF6d7Ug)
        '272': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'vp9'},
        '302': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60},
        '303': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60},
        '308': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60},
        '313': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'vp9'},
        '315': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60},

        # Dash webm audio
        '171': {'ext': 'webm', 'acodec': 'vorbis', 'format_note': 'DASH audio', 'abr': 128},
        '172': {'ext': 'webm', 'acodec': 'vorbis', 'format_note': 'DASH audio', 'abr': 256},

        # Dash webm audio with opus inside
        '249': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 50},
        '250': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 70},
        '251': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 160},

        # RTMP (unnamed)
        '_rtmp': {'protocol': 'rtmp'},

        # av01 video only formats sometimes served with "unknown" codecs
        '394': {'acodec': 'none', 'vcodec': 'av01.0.05M.08'},
        '395': {'acodec': 'none', 'vcodec': 'av01.0.05M.08'},
        '396': {'acodec': 'none', 'vcodec': 'av01.0.05M.08'},
        '397': {'acodec': 'none', 'vcodec': 'av01.0.05M.08'},
    }
    # Subtitle serialization formats requested from the timedtext endpoint;
    # presumably the order expresses preference -- confirm against the
    # subtitle-extraction code elsewhere in this file.
    _SUBTITLE_FORMATS = ('json3', 'srv1', 'srv2', 'srv3', 'ttml', 'vtt')

    # Error/notice strings YouTube returns for age-restricted videos; used to
    # detect the age gate from playability status messages.
    _AGE_GATE_REASONS = (
        'Sign in to confirm your age',
        'This video may be inappropriate for some users.',
        'Sorry, this content is age-restricted.')

    # NOTE(review): disables the base InfoExtractor geo-bypass mechanism for
    # this extractor -- confirm against InfoExtractor._initialize_geo_bypass.
    _GEO_BYPASS = False

    IE_NAME = 'youtube'
990 _TESTS = [
991 {
992 'url': 'https://www.youtube.com/watch?v=BaW_jenozKc&t=1s&end=9',
993 'info_dict': {
994 'id': 'BaW_jenozKc',
995 'ext': 'mp4',
996 'title': 'youtube-dl test video "\'/\\ä↭𝕐',
997 'uploader': 'Philipp Hagemeister',
998 'uploader_id': 'phihag',
999 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/phihag',
1000 'channel_id': 'UCLqxVugv74EIW3VWh2NOa3Q',
1001 'channel_url': r're:https?://(?:www\.)?youtube\.com/channel/UCLqxVugv74EIW3VWh2NOa3Q',
1002 'upload_date': '20121002',
1003 'description': 'test chars: "\'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de .',
1004 'categories': ['Science & Technology'],
1005 'tags': ['youtube-dl'],
1006 'duration': 10,
1007 'view_count': int,
1008 'like_count': int,
1009 'dislike_count': int,
1010 'start_time': 1,
1011 'end_time': 9,
1012 }
1013 },
1014 {
1015 'url': '//www.YouTube.com/watch?v=yZIXLfi8CZQ',
1016 'note': 'Embed-only video (#1746)',
1017 'info_dict': {
1018 'id': 'yZIXLfi8CZQ',
1019 'ext': 'mp4',
1020 'upload_date': '20120608',
1021 'title': 'Principal Sexually Assaults A Teacher - Episode 117 - 8th June 2012',
1022 'description': 'md5:09b78bd971f1e3e289601dfba15ca4f7',
1023 'uploader': 'SET India',
1024 'uploader_id': 'setindia',
1025 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/setindia',
1026 'age_limit': 18,
1027 },
1028 'skip': 'Private video',
1029 },
1030 {
1031 'url': 'https://www.youtube.com/watch?v=BaW_jenozKc&v=yZIXLfi8CZQ',
1032 'note': 'Use the first video ID in the URL',
1033 'info_dict': {
1034 'id': 'BaW_jenozKc',
1035 'ext': 'mp4',
1036 'title': 'youtube-dl test video "\'/\\ä↭𝕐',
1037 'uploader': 'Philipp Hagemeister',
1038 'uploader_id': 'phihag',
1039 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/phihag',
1040 'upload_date': '20121002',
1041 'description': 'test chars: "\'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de .',
1042 'categories': ['Science & Technology'],
1043 'tags': ['youtube-dl'],
1044 'duration': 10,
1045 'view_count': int,
1046 'like_count': int,
1047 'dislike_count': int,
1048 },
1049 'params': {
1050 'skip_download': True,
1051 },
1052 },
1053 {
1054 'url': 'https://www.youtube.com/watch?v=a9LDPn-MO4I',
1055 'note': '256k DASH audio (format 141) via DASH manifest',
1056 'info_dict': {
1057 'id': 'a9LDPn-MO4I',
1058 'ext': 'm4a',
1059 'upload_date': '20121002',
1060 'uploader_id': '8KVIDEO',
1061 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/8KVIDEO',
1062 'description': '',
1063 'uploader': '8KVIDEO',
1064 'title': 'UHDTV TEST 8K VIDEO.mp4'
1065 },
1066 'params': {
1067 'youtube_include_dash_manifest': True,
1068 'format': '141',
1069 },
1070 'skip': 'format 141 not served anymore',
1071 },
1072 # DASH manifest with encrypted signature
1073 {
1074 'url': 'https://www.youtube.com/watch?v=IB3lcPjvWLA',
1075 'info_dict': {
1076 'id': 'IB3lcPjvWLA',
1077 'ext': 'm4a',
1078 'title': 'Afrojack, Spree Wilson - The Spark (Official Music Video) ft. Spree Wilson',
1079 'description': 'md5:8f5e2b82460520b619ccac1f509d43bf',
1080 'duration': 244,
1081 'uploader': 'AfrojackVEVO',
1082 'uploader_id': 'AfrojackVEVO',
1083 'upload_date': '20131011',
1084 'abr': 129.495,
1085 },
1086 'params': {
1087 'youtube_include_dash_manifest': True,
1088 'format': '141/bestaudio[ext=m4a]',
1089 },
1090 },
1091 # Controversy video
1092 {
1093 'url': 'https://www.youtube.com/watch?v=T4XJQO3qol8',
1094 'info_dict': {
1095 'id': 'T4XJQO3qol8',
1096 'ext': 'mp4',
1097 'duration': 219,
1098 'upload_date': '20100909',
1099 'uploader': 'Amazing Atheist',
1100 'uploader_id': 'TheAmazingAtheist',
1101 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/TheAmazingAtheist',
1102 'title': 'Burning Everyone\'s Koran',
1103 'description': 'SUBSCRIBE: http://www.youtube.com/saturninefilms \r\n\r\nEven Obama has taken a stand against freedom on this issue: http://www.huffingtonpost.com/2010/09/09/obama-gma-interview-quran_n_710282.html',
1104 }
1105 },
1106 # Normal age-gate video (embed allowed)
1107 {
1108 'url': 'https://youtube.com/watch?v=HtVdAasjOgU',
1109 'info_dict': {
1110 'id': 'HtVdAasjOgU',
1111 'ext': 'mp4',
1112 'title': 'The Witcher 3: Wild Hunt - The Sword Of Destiny Trailer',
1113 'description': r're:(?s).{100,}About the Game\n.*?The Witcher 3: Wild Hunt.{100,}',
1114 'duration': 142,
1115 'uploader': 'The Witcher',
1116 'uploader_id': 'WitcherGame',
1117 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/WitcherGame',
1118 'upload_date': '20140605',
1119 'age_limit': 18,
1120 },
1121 },
1122 # video_info is None (https://github.com/ytdl-org/youtube-dl/issues/4421)
1123 # YouTube Red ad is not captured for creator
1124 {
1125 'url': '__2ABJjxzNo',
1126 'info_dict': {
1127 'id': '__2ABJjxzNo',
1128 'ext': 'mp4',
1129 'duration': 266,
1130 'upload_date': '20100430',
1131 'uploader_id': 'deadmau5',
1132 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/deadmau5',
1133 'creator': 'deadmau5',
1134 'description': 'md5:6cbcd3a92ce1bc676fc4d6ab4ace2336',
1135 'uploader': 'deadmau5',
1136 'title': 'Deadmau5 - Some Chords (HD)',
1137 'alt_title': 'Some Chords',
1138 },
1139 'expected_warnings': [
1140 'DASH manifest missing',
1141 ]
1142 },
1143 # Olympics (https://github.com/ytdl-org/youtube-dl/issues/4431)
1144 {
1145 'url': 'lqQg6PlCWgI',
1146 'info_dict': {
1147 'id': 'lqQg6PlCWgI',
1148 'ext': 'mp4',
1149 'duration': 6085,
1150 'upload_date': '20150827',
1151 'uploader_id': 'olympic',
1152 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/olympic',
1153 'description': 'HO09 - Women - GER-AUS - Hockey - 31 July 2012 - London 2012 Olympic Games',
1154 'uploader': 'Olympic',
1155 'title': 'Hockey - Women - GER-AUS - London 2012 Olympic Games',
1156 },
1157 'params': {
1158 'skip_download': 'requires avconv',
1159 }
1160 },
1161 # Non-square pixels
1162 {
1163 'url': 'https://www.youtube.com/watch?v=_b-2C3KPAM0',
1164 'info_dict': {
1165 'id': '_b-2C3KPAM0',
1166 'ext': 'mp4',
1167 'stretched_ratio': 16 / 9.,
1168 'duration': 85,
1169 'upload_date': '20110310',
1170 'uploader_id': 'AllenMeow',
1171 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/AllenMeow',
1172 'description': 'made by Wacom from Korea | 字幕&加油添醋 by TY\'s Allen | 感謝heylisa00cavey1001同學熱情提供梗及翻譯',
1173 'uploader': '孫ᄋᄅ',
1174 'title': '[A-made] 變態妍字幕版 太妍 我就是這樣的人',
1175 },
1176 },
1177 # url_encoded_fmt_stream_map is empty string
1178 {
1179 'url': 'qEJwOuvDf7I',
1180 'info_dict': {
1181 'id': 'qEJwOuvDf7I',
1182 'ext': 'webm',
1183 'title': 'Обсуждение судебной практики по выборам 14 сентября 2014 года в Санкт-Петербурге',
1184 'description': '',
1185 'upload_date': '20150404',
1186 'uploader_id': 'spbelect',
1187 'uploader': 'Наблюдатели Петербурга',
1188 },
1189 'params': {
1190 'skip_download': 'requires avconv',
1191 },
1192 'skip': 'This live event has ended.',
1193 },
1194 # Extraction from multiple DASH manifests (https://github.com/ytdl-org/youtube-dl/pull/6097)
1195 {
1196 'url': 'https://www.youtube.com/watch?v=FIl7x6_3R5Y',
1197 'info_dict': {
1198 'id': 'FIl7x6_3R5Y',
1199 'ext': 'webm',
1200 'title': 'md5:7b81415841e02ecd4313668cde88737a',
1201 'description': 'md5:116377fd2963b81ec4ce64b542173306',
1202 'duration': 220,
1203 'upload_date': '20150625',
1204 'uploader_id': 'dorappi2000',
1205 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/dorappi2000',
1206 'uploader': 'dorappi2000',
1207 'formats': 'mincount:31',
1208 },
1209 'skip': 'not actual anymore',
1210 },
1211 # DASH manifest with segment_list
1212 {
1213 'url': 'https://www.youtube.com/embed/CsmdDsKjzN8',
1214 'md5': '8ce563a1d667b599d21064e982ab9e31',
1215 'info_dict': {
1216 'id': 'CsmdDsKjzN8',
1217 'ext': 'mp4',
1218 'upload_date': '20150501', # According to '<meta itemprop="datePublished"', but in other places it's 20150510
1219 'uploader': 'Airtek',
1220 'description': 'Retransmisión en directo de la XVIII media maratón de Zaragoza.',
1221 'uploader_id': 'UCzTzUmjXxxacNnL8I3m4LnQ',
1222 'title': 'Retransmisión XVIII Media maratón Zaragoza 2015',
1223 },
1224 'params': {
1225 'youtube_include_dash_manifest': True,
1226 'format': '135', # bestvideo
1227 },
1228 'skip': 'This live event has ended.',
1229 },
1230 {
1231 # Multifeed videos (multiple cameras), URL is for Main Camera
1232 'url': 'https://www.youtube.com/watch?v=jvGDaLqkpTg',
1233 'info_dict': {
1234 'id': 'jvGDaLqkpTg',
1235 'title': 'Tom Clancy Free Weekend Rainbow Whatever',
1236 'description': 'md5:e03b909557865076822aa169218d6a5d',
1237 },
1238 'playlist': [{
1239 'info_dict': {
1240 'id': 'jvGDaLqkpTg',
1241 'ext': 'mp4',
1242 'title': 'Tom Clancy Free Weekend Rainbow Whatever (Main Camera)',
1243 'description': 'md5:e03b909557865076822aa169218d6a5d',
1244 'duration': 10643,
1245 'upload_date': '20161111',
1246 'uploader': 'Team PGP',
1247 'uploader_id': 'UChORY56LMMETTuGjXaJXvLg',
1248 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UChORY56LMMETTuGjXaJXvLg',
1249 },
1250 }, {
1251 'info_dict': {
1252 'id': '3AKt1R1aDnw',
1253 'ext': 'mp4',
1254 'title': 'Tom Clancy Free Weekend Rainbow Whatever (Camera 2)',
1255 'description': 'md5:e03b909557865076822aa169218d6a5d',
1256 'duration': 10991,
1257 'upload_date': '20161111',
1258 'uploader': 'Team PGP',
1259 'uploader_id': 'UChORY56LMMETTuGjXaJXvLg',
1260 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UChORY56LMMETTuGjXaJXvLg',
1261 },
1262 }, {
1263 'info_dict': {
1264 'id': 'RtAMM00gpVc',
1265 'ext': 'mp4',
1266 'title': 'Tom Clancy Free Weekend Rainbow Whatever (Camera 3)',
1267 'description': 'md5:e03b909557865076822aa169218d6a5d',
1268 'duration': 10995,
1269 'upload_date': '20161111',
1270 'uploader': 'Team PGP',
1271 'uploader_id': 'UChORY56LMMETTuGjXaJXvLg',
1272 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UChORY56LMMETTuGjXaJXvLg',
1273 },
1274 }, {
1275 'info_dict': {
1276 'id': '6N2fdlP3C5U',
1277 'ext': 'mp4',
1278 'title': 'Tom Clancy Free Weekend Rainbow Whatever (Camera 4)',
1279 'description': 'md5:e03b909557865076822aa169218d6a5d',
1280 'duration': 10990,
1281 'upload_date': '20161111',
1282 'uploader': 'Team PGP',
1283 'uploader_id': 'UChORY56LMMETTuGjXaJXvLg',
1284 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UChORY56LMMETTuGjXaJXvLg',
1285 },
1286 }],
1287 'params': {
1288 'skip_download': True,
1289 },
1290 },
1291 {
1292 # Multifeed video with comma in title (see https://github.com/ytdl-org/youtube-dl/issues/8536)
1293 'url': 'https://www.youtube.com/watch?v=gVfLd0zydlo',
1294 'info_dict': {
1295 'id': 'gVfLd0zydlo',
1296 'title': 'DevConf.cz 2016 Day 2 Workshops 1 14:00 - 15:30',
1297 },
1298 'playlist_count': 2,
1299 'skip': 'Not multifeed anymore',
1300 },
1301 {
1302 'url': 'https://vid.plus/FlRa-iH7PGw',
1303 'only_matching': True,
1304 },
1305 {
1306 'url': 'https://zwearz.com/watch/9lWxNJF-ufM/electra-woman-dyna-girl-official-trailer-grace-helbig.html',
1307 'only_matching': True,
1308 },
1309 {
1310 # Title with JS-like syntax "};" (see https://github.com/ytdl-org/youtube-dl/issues/7468)
1311 # Also tests cut-off URL expansion in video description (see
1312 # https://github.com/ytdl-org/youtube-dl/issues/1892,
1313 # https://github.com/ytdl-org/youtube-dl/issues/8164)
1314 'url': 'https://www.youtube.com/watch?v=lsguqyKfVQg',
1315 'info_dict': {
1316 'id': 'lsguqyKfVQg',
1317 'ext': 'mp4',
1318 'title': '{dark walk}; Loki/AC/Dishonored; collab w/Elflover21',
1319 'alt_title': 'Dark Walk - Position Music',
1320 'description': 'md5:8085699c11dc3f597ce0410b0dcbb34a',
1321 'duration': 133,
1322 'upload_date': '20151119',
1323 'uploader_id': 'IronSoulElf',
1324 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/IronSoulElf',
1325 'uploader': 'IronSoulElf',
1326 'creator': 'Todd Haberman, Daniel Law Heath and Aaron Kaplan',
1327 'track': 'Dark Walk - Position Music',
1328 'artist': 'Todd Haberman, Daniel Law Heath and Aaron Kaplan',
1329 'album': 'Position Music - Production Music Vol. 143 - Dark Walk',
1330 },
1331 'params': {
1332 'skip_download': True,
1333 },
1334 },
1335 {
1336 # Tags with '};' (see https://github.com/ytdl-org/youtube-dl/issues/7468)
1337 'url': 'https://www.youtube.com/watch?v=Ms7iBXnlUO8',
1338 'only_matching': True,
1339 },
1340 {
1341 # Video with yt:stretch=17:0
1342 'url': 'https://www.youtube.com/watch?v=Q39EVAstoRM',
1343 'info_dict': {
1344 'id': 'Q39EVAstoRM',
1345 'ext': 'mp4',
1346 'title': 'Clash Of Clans#14 Dicas De Ataque Para CV 4',
1347 'description': 'md5:ee18a25c350637c8faff806845bddee9',
1348 'upload_date': '20151107',
1349 'uploader_id': 'UCCr7TALkRbo3EtFzETQF1LA',
1350 'uploader': 'CH GAMER DROID',
1351 },
1352 'params': {
1353 'skip_download': True,
1354 },
1355 'skip': 'This video does not exist.',
1356 },
1357 {
1358 # Video with incomplete 'yt:stretch=16:'
1359 'url': 'https://www.youtube.com/watch?v=FRhJzUSJbGI',
1360 'only_matching': True,
1361 },
1362 {
1363 # Video licensed under Creative Commons
1364 'url': 'https://www.youtube.com/watch?v=M4gD1WSo5mA',
1365 'info_dict': {
1366 'id': 'M4gD1WSo5mA',
1367 'ext': 'mp4',
1368 'title': 'md5:e41008789470fc2533a3252216f1c1d1',
1369 'description': 'md5:a677553cf0840649b731a3024aeff4cc',
1370 'duration': 721,
1371 'upload_date': '20150127',
1372 'uploader_id': 'BerkmanCenter',
1373 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/BerkmanCenter',
1374 'uploader': 'The Berkman Klein Center for Internet & Society',
1375 'license': 'Creative Commons Attribution license (reuse allowed)',
1376 },
1377 'params': {
1378 'skip_download': True,
1379 },
1380 },
1381 {
1382 # Channel-like uploader_url
1383 'url': 'https://www.youtube.com/watch?v=eQcmzGIKrzg',
1384 'info_dict': {
1385 'id': 'eQcmzGIKrzg',
1386 'ext': 'mp4',
1387 'title': 'Democratic Socialism and Foreign Policy | Bernie Sanders',
1388 'description': 'md5:13a2503d7b5904ef4b223aa101628f39',
1389 'duration': 4060,
1390 'upload_date': '20151119',
1391 'uploader': 'Bernie Sanders',
1392 'uploader_id': 'UCH1dpzjCEiGAt8CXkryhkZg',
1393 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UCH1dpzjCEiGAt8CXkryhkZg',
1394 'license': 'Creative Commons Attribution license (reuse allowed)',
1395 },
1396 'params': {
1397 'skip_download': True,
1398 },
1399 },
1400 {
1401 'url': 'https://www.youtube.com/watch?feature=player_embedded&amp;amp;v=V36LpHqtcDY',
1402 'only_matching': True,
1403 },
1404 {
1405 # YouTube Red paid video (https://github.com/ytdl-org/youtube-dl/issues/10059)
1406 'url': 'https://www.youtube.com/watch?v=i1Ko8UG-Tdo',
1407 'only_matching': True,
1408 },
1409 {
1410 # Rental video preview
1411 'url': 'https://www.youtube.com/watch?v=yYr8q0y5Jfg',
1412 'info_dict': {
1413 'id': 'uGpuVWrhIzE',
1414 'ext': 'mp4',
1415 'title': 'Piku - Trailer',
1416 'description': 'md5:c36bd60c3fd6f1954086c083c72092eb',
1417 'upload_date': '20150811',
1418 'uploader': 'FlixMatrix',
1419 'uploader_id': 'FlixMatrixKaravan',
1420 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/FlixMatrixKaravan',
1421 'license': 'Standard YouTube License',
1422 },
1423 'params': {
1424 'skip_download': True,
1425 },
1426 'skip': 'This video is not available.',
1427 },
1428 {
1429 # YouTube Red video with episode data
1430 'url': 'https://www.youtube.com/watch?v=iqKdEhx-dD4',
1431 'info_dict': {
1432 'id': 'iqKdEhx-dD4',
1433 'ext': 'mp4',
1434 'title': 'Isolation - Mind Field (Ep 1)',
1435 'description': 'md5:f540112edec5d09fc8cc752d3d4ba3cd',
1436 'duration': 2085,
1437 'upload_date': '20170118',
1438 'uploader': 'Vsauce',
1439 'uploader_id': 'Vsauce',
1440 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/Vsauce',
1441 'series': 'Mind Field',
1442 'season_number': 1,
1443 'episode_number': 1,
1444 },
1445 'params': {
1446 'skip_download': True,
1447 },
1448 'expected_warnings': [
1449 'Skipping DASH manifest',
1450 ],
1451 },
1452 {
1453 # The following content has been identified by the YouTube community
1454 # as inappropriate or offensive to some audiences.
1455 'url': 'https://www.youtube.com/watch?v=6SJNVb0GnPI',
1456 'info_dict': {
1457 'id': '6SJNVb0GnPI',
1458 'ext': 'mp4',
1459 'title': 'Race Differences in Intelligence',
1460 'description': 'md5:5d161533167390427a1f8ee89a1fc6f1',
1461 'duration': 965,
1462 'upload_date': '20140124',
1463 'uploader': 'New Century Foundation',
1464 'uploader_id': 'UCEJYpZGqgUob0zVVEaLhvVg',
1465 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UCEJYpZGqgUob0zVVEaLhvVg',
1466 },
1467 'params': {
1468 'skip_download': True,
1469 },
1470 'skip': 'This video has been removed for violating YouTube\'s policy on hate speech.',
1471 },
1472 {
1473 # itag 212
1474 'url': '1t24XAntNCY',
1475 'only_matching': True,
1476 },
1477 {
1478 # geo restricted to JP
1479 'url': 'sJL6WA-aGkQ',
1480 'only_matching': True,
1481 },
1482 {
1483 'url': 'https://invidio.us/watch?v=BaW_jenozKc',
1484 'only_matching': True,
1485 },
1486 {
1487 'url': 'https://redirect.invidious.io/watch?v=BaW_jenozKc',
1488 'only_matching': True,
1489 },
1490 {
1491 # from https://nitter.pussthecat.org/YouTube/status/1360363141947944964#m
1492 'url': 'https://redirect.invidious.io/Yh0AhrY9GjA',
1493 'only_matching': True,
1494 },
1495 {
1496 # DRM protected
1497 'url': 'https://www.youtube.com/watch?v=s7_qI6_mIXc',
1498 'only_matching': True,
1499 },
1500 {
1501 # Video with unsupported adaptive stream type formats
1502 'url': 'https://www.youtube.com/watch?v=Z4Vy8R84T1U',
1503 'info_dict': {
1504 'id': 'Z4Vy8R84T1U',
1505 'ext': 'mp4',
1506 'title': 'saman SMAN 53 Jakarta(Sancety) opening COFFEE4th at SMAN 53 Jakarta',
1507 'description': 'md5:d41d8cd98f00b204e9800998ecf8427e',
1508 'duration': 433,
1509 'upload_date': '20130923',
1510 'uploader': 'Amelia Putri Harwita',
1511 'uploader_id': 'UCpOxM49HJxmC1qCalXyB3_Q',
1512 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UCpOxM49HJxmC1qCalXyB3_Q',
1513 'formats': 'maxcount:10',
1514 },
1515 'params': {
1516 'skip_download': True,
1517 'youtube_include_dash_manifest': False,
1518 },
1519 'skip': 'not actual anymore',
1520 },
1521 {
1522 # Youtube Music Auto-generated description
1523 'url': 'https://music.youtube.com/watch?v=MgNrAu2pzNs',
1524 'info_dict': {
1525 'id': 'MgNrAu2pzNs',
1526 'ext': 'mp4',
1527 'title': 'Voyeur Girl',
1528 'description': 'md5:7ae382a65843d6df2685993e90a8628f',
1529 'upload_date': '20190312',
1530 'uploader': 'Stephen - Topic',
1531 'uploader_id': 'UC-pWHpBjdGG69N9mM2auIAA',
1532 'artist': 'Stephen',
1533 'track': 'Voyeur Girl',
1534 'album': 'it\'s too much love to know my dear',
1535 'release_date': '20190313',
1536 'release_year': 2019,
1537 },
1538 'params': {
1539 'skip_download': True,
1540 },
1541 },
1542 {
1543 'url': 'https://www.youtubekids.com/watch?v=3b8nCWDgZ6Q',
1544 'only_matching': True,
1545 },
1546 {
1547 # invalid -> valid video id redirection
1548 'url': 'DJztXj2GPfl',
1549 'info_dict': {
1550 'id': 'DJztXj2GPfk',
1551 'ext': 'mp4',
1552 'title': 'Panjabi MC - Mundian To Bach Ke (The Dictator Soundtrack)',
1553 'description': 'md5:bf577a41da97918e94fa9798d9228825',
1554 'upload_date': '20090125',
1555 'uploader': 'Prochorowka',
1556 'uploader_id': 'Prochorowka',
1557 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/Prochorowka',
1558 'artist': 'Panjabi MC',
1559 'track': 'Beware of the Boys (Mundian to Bach Ke) - Motivo Hi-Lectro Remix',
1560 'album': 'Beware of the Boys (Mundian To Bach Ke)',
1561 },
1562 'params': {
1563 'skip_download': True,
1564 },
1565 'skip': 'Video unavailable',
1566 },
1567 {
1568 # empty description results in an empty string
1569 'url': 'https://www.youtube.com/watch?v=x41yOUIvK2k',
1570 'info_dict': {
1571 'id': 'x41yOUIvK2k',
1572 'ext': 'mp4',
1573 'title': 'IMG 3456',
1574 'description': '',
1575 'upload_date': '20170613',
1576 'uploader_id': 'ElevageOrVert',
1577 'uploader': 'ElevageOrVert',
1578 },
1579 'params': {
1580 'skip_download': True,
1581 },
1582 },
1583 {
1584 # with '};' inside yt initial data (see [1])
1585 # see [2] for an example with '};' inside ytInitialPlayerResponse
1586 # 1. https://github.com/ytdl-org/youtube-dl/issues/27093
1587 # 2. https://github.com/ytdl-org/youtube-dl/issues/27216
1588 'url': 'https://www.youtube.com/watch?v=CHqg6qOn4no',
1589 'info_dict': {
1590 'id': 'CHqg6qOn4no',
1591 'ext': 'mp4',
1592 'title': 'Part 77 Sort a list of simple types in c#',
1593 'description': 'md5:b8746fa52e10cdbf47997903f13b20dc',
1594 'upload_date': '20130831',
1595 'uploader_id': 'kudvenkat',
1596 'uploader': 'kudvenkat',
1597 },
1598 'params': {
1599 'skip_download': True,
1600 },
1601 },
1602 {
1603 # another example of '};' in ytInitialData
1604 'url': 'https://www.youtube.com/watch?v=gVfgbahppCY',
1605 'only_matching': True,
1606 },
1607 {
1608 'url': 'https://www.youtube.com/watch_popup?v=63RmMXCd_bQ',
1609 'only_matching': True,
1610 },
1611 {
1612 # https://github.com/ytdl-org/youtube-dl/pull/28094
1613 'url': 'OtqTfy26tG0',
1614 'info_dict': {
1615 'id': 'OtqTfy26tG0',
1616 'ext': 'mp4',
1617 'title': 'Burn Out',
1618 'description': 'md5:8d07b84dcbcbfb34bc12a56d968b6131',
1619 'upload_date': '20141120',
1620 'uploader': 'The Cinematic Orchestra - Topic',
1621 'uploader_id': 'UCIzsJBIyo8hhpFm1NK0uLgw',
1622 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UCIzsJBIyo8hhpFm1NK0uLgw',
1623 'artist': 'The Cinematic Orchestra',
1624 'track': 'Burn Out',
1625 'album': 'Every Day',
1626 'release_data': None,
1627 'release_year': None,
1628 },
1629 'params': {
1630 'skip_download': True,
1631 },
1632 },
1633 {
1634 # controversial video, only works with bpctr when authenticated with cookies
1635 'url': 'https://www.youtube.com/watch?v=nGC3D_FkCmg',
1636 'only_matching': True,
1637 },
1638 {
1639 # restricted location, https://github.com/ytdl-org/youtube-dl/issues/28685
1640 'url': 'cBvYw8_A0vQ',
1641 'info_dict': {
1642 'id': 'cBvYw8_A0vQ',
1643 'ext': 'mp4',
1644 'title': '4K Ueno Okachimachi Street Scenes 上野御徒町歩き',
1645 'description': 'md5:ea770e474b7cd6722b4c95b833c03630',
1646 'upload_date': '20201120',
1647 'uploader': 'Walk around Japan',
1648 'uploader_id': 'UC3o_t8PzBmXf5S9b7GLx1Mw',
1649 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UC3o_t8PzBmXf5S9b7GLx1Mw',
1650 },
1651 'params': {
1652 'skip_download': True,
1653 },
1654 }, {
1655 # Has multiple audio streams
1656 'url': 'WaOKSUlf4TM',
1657 'only_matching': True
1658 }, {
1659 # Requires Premium: has format 141 when requested using YTM url
1660 'url': 'https://music.youtube.com/watch?v=XclachpHxis',
1661 'only_matching': True
1662 }, {
1663 # multiple subtitles with same lang_code
1664 'url': 'https://www.youtube.com/watch?v=wsQiKKfKxug',
1665 'only_matching': True,
1666 }, {
1667 # Force use android client fallback
1668 'url': 'https://www.youtube.com/watch?v=YOelRv7fMxY',
1669 'info_dict': {
1670 'id': 'YOelRv7fMxY',
1671 'title': 'Digging a Secret Tunnel from my Workshop',
1672 'ext': '3gp',
1673 'upload_date': '20210624',
1674 'channel_id': 'UCp68_FLety0O-n9QU6phsgw',
1675 'uploader': 'colinfurze',
1676 'channel_url': r're:https?://(?:www\.)?youtube\.com/channel/UCp68_FLety0O-n9QU6phsgw',
1677 'description': 'md5:ecb672623246d98c6c562eed6ae798c3'
1678 },
1679 'params': {
1680 'format': '17', # 3gp format available on android
1681 'extractor_args': {'youtube': {'player_client': ['android']}},
1682 },
1683 },
1684 {
1685 # Skip download of additional client configs (remix client config in this case)
1686 'url': 'https://music.youtube.com/watch?v=MgNrAu2pzNs',
1687 'only_matching': True,
1688 'params': {
1689 'extractor_args': {'youtube': {'player_skip': ['configs']}},
1690 },
1691 }
1692 ]
1693
1694 @classmethod
1695 def suitable(cls, url):
1696 # Hack for lazy extractors until more generic solution is implemented
1697 # (see #28780)
1698 from .youtube import parse_qs
1699 qs = parse_qs(url)
1700 if qs.get('list', [None])[0]:
1701 return False
1702 return super(YoutubeIE, cls).suitable(url)
1703
1704 def __init__(self, *args, **kwargs):
1705 super(YoutubeIE, self).__init__(*args, **kwargs)
1706 self._code_cache = {}
1707 self._player_cache = {}
1708
1709 def _extract_player_url(self, ytcfg=None, webpage=None):
1710 player_url = try_get(ytcfg, (lambda x: x['PLAYER_JS_URL']), str)
1711 if not player_url:
1712 player_url = self._search_regex(
1713 r'"(?:PLAYER_JS_URL|jsUrl)"\s*:\s*"([^"]+)"',
1714 webpage, 'player URL', fatal=False)
1715 if player_url.startswith('//'):
1716 player_url = 'https:' + player_url
1717 elif not re.match(r'https?://', player_url):
1718 player_url = compat_urlparse.urljoin(
1719 'https://www.youtube.com', player_url)
1720 return player_url
1721
1722 def _signature_cache_id(self, example_sig):
1723 """ Return a string representation of a signature """
1724 return '.'.join(compat_str(len(part)) for part in example_sig.split('.'))
1725
1726 @classmethod
1727 def _extract_player_info(cls, player_url):
1728 for player_re in cls._PLAYER_INFO_RE:
1729 id_m = re.search(player_re, player_url)
1730 if id_m:
1731 break
1732 else:
1733 raise ExtractorError('Cannot identify player %r' % player_url)
1734 return id_m.group('id')
1735
    def _load_player(self, video_id, player_url, fatal=True) -> bool:
        """Ensure the player JS for player_url is present in self._code_cache.

        Returns whether an entry for this player id exists in the cache.
        NOTE(review): with fatal=False a failed download still stores its falsy
        result, so the membership test below returns True — confirm intended.
        """
        player_id = self._extract_player_info(player_url)
        if player_id not in self._code_cache:
            self._code_cache[player_id] = self._download_webpage(
                player_url, video_id, fatal=fatal,
                note='Downloading player ' + player_id,
                errnote='Download of %s failed' % player_url)
        return player_id in self._code_cache
1744
1745 def _extract_signature_function(self, video_id, player_url, example_sig):
1746 player_id = self._extract_player_info(player_url)
1747
1748 # Read from filesystem cache
1749 func_id = 'js_%s_%s' % (
1750 player_id, self._signature_cache_id(example_sig))
1751 assert os.path.basename(func_id) == func_id
1752
1753 cache_spec = self._downloader.cache.load('youtube-sigfuncs', func_id)
1754 if cache_spec is not None:
1755 return lambda s: ''.join(s[i] for i in cache_spec)
1756
1757 if self._load_player(video_id, player_url):
1758 code = self._code_cache[player_id]
1759 res = self._parse_sig_js(code)
1760
1761 test_string = ''.join(map(compat_chr, range(len(example_sig))))
1762 cache_res = res(test_string)
1763 cache_spec = [ord(c) for c in cache_res]
1764
1765 self._downloader.cache.store('youtube-sigfuncs', func_id, cache_spec)
1766 return res
1767
    def _print_sig_code(self, func, example_sig):
        """Print Python source equivalent to the extracted signature function.

        Applies func to a probe string the same length as example_sig, records
        the resulting index permutation, and compresses runs of consecutive
        indices into slice expressions for display.
        """
        def gen_sig_code(idxs):
            def _genslice(start, end, step):
                # Render s[start:end+step:step], omitting defaulted parts
                starts = '' if start == 0 else str(start)
                ends = (':%d' % (end + step)) if end + step >= 0 else ':'
                steps = '' if step == 1 else (':%d' % step)
                return 's[%s%s%s]' % (starts, ends, steps)

            step = None
            # Quelch pyflakes warnings - start will be set when step is set
            start = '(Never used)'
            for i, prev in zip(idxs[1:], idxs[:-1]):
                if step is not None:
                    # Inside a run: keep extending while the stride matches
                    if i - prev == step:
                        continue
                    yield _genslice(start, prev, step)
                    step = None
                    continue
                if i - prev in [-1, 1]:
                    # Two adjacent indices: open a sliced run
                    step = i - prev
                    start = prev
                    continue
                else:
                    yield 's[%d]' % prev
            # Flush the final element or the still-open run (i survives the loop)
            if step is None:
                yield 's[%d]' % i
            else:
                yield _genslice(start, i, step)

        test_string = ''.join(map(compat_chr, range(len(example_sig))))
        cache_res = func(test_string)
        cache_spec = [ord(c) for c in cache_res]
        expr_code = ' + '.join(gen_sig_code(cache_spec))
        signature_id_tuple = '(%s)' % (
            ', '.join(compat_str(len(p)) for p in example_sig.split('.')))
        code = ('if tuple(len(p) for p in s.split(\'.\')) == %s:\n'
                ' return %s\n') % (signature_id_tuple, expr_code)
        self.to_screen('Extracted signature function:\n' + code)
1806
    def _parse_sig_js(self, jscode):
        """Locate the signature-descrambling function in the player JS and
        return a Python callable implementing it.

        The alternatives below cover the naming patterns YouTube has used over
        time for this function; they must track upstream player JS changes.
        """
        funcname = self._search_regex(
            (r'\b[cs]\s*&&\s*[adf]\.set\([^,]+\s*,\s*encodeURIComponent\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\(',
             r'\b[a-zA-Z0-9]+\s*&&\s*[a-zA-Z0-9]+\.set\([^,]+\s*,\s*encodeURIComponent\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\(',
             r'\bm=(?P<sig>[a-zA-Z0-9$]{2})\(decodeURIComponent\(h\.s\)\)',
             r'\bc&&\(c=(?P<sig>[a-zA-Z0-9$]{2})\(decodeURIComponent\(c\)\)',
             r'(?:\b|[^a-zA-Z0-9$])(?P<sig>[a-zA-Z0-9$]{2})\s*=\s*function\(\s*a\s*\)\s*{\s*a\s*=\s*a\.split\(\s*""\s*\);[a-zA-Z0-9$]{2}\.[a-zA-Z0-9$]{2}\(a,\d+\)',
             r'(?:\b|[^a-zA-Z0-9$])(?P<sig>[a-zA-Z0-9$]{2})\s*=\s*function\(\s*a\s*\)\s*{\s*a\s*=\s*a\.split\(\s*""\s*\)',
             r'(?P<sig>[a-zA-Z0-9$]+)\s*=\s*function\(\s*a\s*\)\s*{\s*a\s*=\s*a\.split\(\s*""\s*\)',
             # Obsolete patterns
             r'(["\'])signature\1\s*,\s*(?P<sig>[a-zA-Z0-9$]+)\(',
             r'\.sig\|\|(?P<sig>[a-zA-Z0-9$]+)\(',
             r'yt\.akamaized\.net/\)\s*\|\|\s*.*?\s*[cs]\s*&&\s*[adf]\.set\([^,]+\s*,\s*(?:encodeURIComponent\s*\()?\s*(?P<sig>[a-zA-Z0-9$]+)\(',
             r'\b[cs]\s*&&\s*[adf]\.set\([^,]+\s*,\s*(?P<sig>[a-zA-Z0-9$]+)\(',
             r'\b[a-zA-Z0-9]+\s*&&\s*[a-zA-Z0-9]+\.set\([^,]+\s*,\s*(?P<sig>[a-zA-Z0-9$]+)\(',
             r'\bc\s*&&\s*a\.set\([^,]+\s*,\s*\([^)]*\)\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\(',
             r'\bc\s*&&\s*[a-zA-Z0-9]+\.set\([^,]+\s*,\s*\([^)]*\)\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\(',
             r'\bc\s*&&\s*[a-zA-Z0-9]+\.set\([^,]+\s*,\s*\([^)]*\)\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\('),
            jscode, 'Initial JS player signature function name', group='sig')

        jsi = JSInterpreter(jscode)
        initial_function = jsi.extract_function(funcname)
        # JSInterpreter functions take their arguments as a list
        return lambda s: initial_function([s])
1830
1831 def _decrypt_signature(self, s, video_id, player_url):
1832 """Turn the encrypted s field into a working signature"""
1833
1834 if player_url is None:
1835 raise ExtractorError('Cannot decrypt signature without player_url')
1836
1837 try:
1838 player_id = (player_url, self._signature_cache_id(s))
1839 if player_id not in self._player_cache:
1840 func = self._extract_signature_function(
1841 video_id, player_url, s
1842 )
1843 self._player_cache[player_id] = func
1844 func = self._player_cache[player_id]
1845 if self.get_param('youtube_print_sig_code'):
1846 self._print_sig_code(func, s)
1847 return func(s)
1848 except Exception as e:
1849 tb = traceback.format_exc()
1850 raise ExtractorError(
1851 'Signature extraction failed: ' + tb, cause=e)
1852
1853 def _extract_signature_timestamp(self, video_id, player_url, ytcfg=None, fatal=False):
1854 """
1855 Extract signatureTimestamp (sts)
1856 Required to tell API what sig/player version is in use.
1857 """
1858 sts = None
1859 if isinstance(ytcfg, dict):
1860 sts = int_or_none(ytcfg.get('STS'))
1861
1862 if not sts:
1863 # Attempt to extract from player
1864 if player_url is None:
1865 error_msg = 'Cannot extract signature timestamp without player_url.'
1866 if fatal:
1867 raise ExtractorError(error_msg)
1868 self.report_warning(error_msg)
1869 return
1870 if self._load_player(video_id, player_url, fatal=fatal):
1871 player_id = self._extract_player_info(player_url)
1872 code = self._code_cache[player_id]
1873 sts = int_or_none(self._search_regex(
1874 r'(?:signatureTimestamp|sts)\s*:\s*(?P<sts>[0-9]{5})', code,
1875 'JS player signature timestamp', group='sts', fatal=fatal))
1876 return sts
1877
1878 def _mark_watched(self, video_id, player_response):
1879 playback_url = url_or_none(try_get(
1880 player_response,
1881 lambda x: x['playbackTracking']['videostatsPlaybackUrl']['baseUrl']))
1882 if not playback_url:
1883 return
1884 parsed_playback_url = compat_urlparse.urlparse(playback_url)
1885 qs = compat_urlparse.parse_qs(parsed_playback_url.query)
1886
1887 # cpn generation algorithm is reverse engineered from base.js.
1888 # In fact it works even with dummy cpn.
1889 CPN_ALPHABET = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789-_'
1890 cpn = ''.join((CPN_ALPHABET[random.randint(0, 256) & 63] for _ in range(0, 16)))
1891
1892 qs.update({
1893 'ver': ['2'],
1894 'cpn': [cpn],
1895 })
1896 playback_url = compat_urlparse.urlunparse(
1897 parsed_playback_url._replace(query=compat_urllib_parse_urlencode(qs, True)))
1898
1899 self._download_webpage(
1900 playback_url, video_id, 'Marking watched',
1901 'Unable to mark watched', fatal=False)
1902
1903 @staticmethod
1904 def _extract_urls(webpage):
1905 # Embedded YouTube player
1906 entries = [
1907 unescapeHTML(mobj.group('url'))
1908 for mobj in re.finditer(r'''(?x)
1909 (?:
1910 <iframe[^>]+?src=|
1911 data-video-url=|
1912 <embed[^>]+?src=|
1913 embedSWF\(?:\s*|
1914 <object[^>]+data=|
1915 new\s+SWFObject\(
1916 )
1917 (["\'])
1918 (?P<url>(?:https?:)?//(?:www\.)?youtube(?:-nocookie)?\.com/
1919 (?:embed|v|p)/[0-9A-Za-z_-]{11}.*?)
1920 \1''', webpage)]
1921
1922 # lazyYT YouTube embed
1923 entries.extend(list(map(
1924 unescapeHTML,
1925 re.findall(r'class="lazyYT" data-youtube-id="([^"]+)"', webpage))))
1926
1927 # Wordpress "YouTube Video Importer" plugin
1928 matches = re.findall(r'''(?x)<div[^>]+
1929 class=(?P<q1>[\'"])[^\'"]*\byvii_single_video_player\b[^\'"]*(?P=q1)[^>]+
1930 data-video_id=(?P<q2>[\'"])([^\'"]+)(?P=q2)''', webpage)
1931 entries.extend(m[-1] for m in matches)
1932
1933 return entries
1934
1935 @staticmethod
1936 def _extract_url(webpage):
1937 urls = YoutubeIE._extract_urls(webpage)
1938 return urls[0] if urls else None
1939
1940 @classmethod
1941 def extract_id(cls, url):
1942 mobj = re.match(cls._VALID_URL, url, re.VERBOSE)
1943 if mobj is None:
1944 raise ExtractorError('Invalid URL: %s' % url)
1945 video_id = mobj.group(2)
1946 return video_id
1947
1948 def _extract_chapters_from_json(self, data, video_id, duration):
1949 chapters_list = try_get(
1950 data,
1951 lambda x: x['playerOverlays']
1952 ['playerOverlayRenderer']
1953 ['decoratedPlayerBarRenderer']
1954 ['decoratedPlayerBarRenderer']
1955 ['playerBar']
1956 ['chapteredPlayerBarRenderer']
1957 ['chapters'],
1958 list)
1959 if not chapters_list:
1960 return
1961
1962 def chapter_time(chapter):
1963 return float_or_none(
1964 try_get(
1965 chapter,
1966 lambda x: x['chapterRenderer']['timeRangeStartMillis'],
1967 int),
1968 scale=1000)
1969 chapters = []
1970 for next_num, chapter in enumerate(chapters_list, start=1):
1971 start_time = chapter_time(chapter)
1972 if start_time is None:
1973 continue
1974 end_time = (chapter_time(chapters_list[next_num])
1975 if next_num < len(chapters_list) else duration)
1976 if end_time is None:
1977 continue
1978 title = try_get(
1979 chapter, lambda x: x['chapterRenderer']['title']['simpleText'],
1980 compat_str)
1981 chapters.append({
1982 'start_time': start_time,
1983 'end_time': end_time,
1984 'title': title,
1985 })
1986 return chapters
1987
1988 def _extract_yt_initial_variable(self, webpage, regex, video_id, name):
1989 return self._parse_json(self._search_regex(
1990 (r'%s\s*%s' % (regex, self._YT_INITIAL_BOUNDARY_RE),
1991 regex), webpage, name, default='{}'), video_id, fatal=False)
1992
1993 @staticmethod
1994 def parse_time_text(time_text):
1995 """
1996 Parse the comment time text
1997 time_text is in the format 'X units ago (edited)'
1998 """
1999 time_text_split = time_text.split(' ')
2000 if len(time_text_split) >= 3:
2001 return datetime_from_str('now-%s%s' % (time_text_split[0], time_text_split[1]), precision='auto')
2002
2003 def _extract_comment(self, comment_renderer, parent=None):
2004 comment_id = comment_renderer.get('commentId')
2005 if not comment_id:
2006 return
2007 comment_text_runs = try_get(comment_renderer, lambda x: x['contentText']['runs']) or []
2008 text = self._join_text_entries(comment_text_runs) or ''
2009 comment_time_text = try_get(comment_renderer, lambda x: x['publishedTimeText']['runs']) or []
2010 time_text = self._join_text_entries(comment_time_text)
2011 # note: timestamp is an estimate calculated from the current time and time_text
2012 timestamp = calendar.timegm(self.parse_time_text(time_text).timetuple())
2013 author = try_get(comment_renderer, lambda x: x['authorText']['simpleText'], compat_str)
2014 author_id = try_get(comment_renderer,
2015 lambda x: x['authorEndpoint']['browseEndpoint']['browseId'], compat_str)
2016 votes = parse_count(try_get(comment_renderer, (lambda x: x['voteCount']['simpleText'],
2017 lambda x: x['likeCount']), compat_str)) or 0
2018 author_thumbnail = try_get(comment_renderer,
2019 lambda x: x['authorThumbnail']['thumbnails'][-1]['url'], compat_str)
2020
2021 author_is_uploader = try_get(comment_renderer, lambda x: x['authorIsChannelOwner'], bool)
2022 is_favorited = 'creatorHeart' in (try_get(
2023 comment_renderer, lambda x: x['actionButtons']['commentActionButtonsRenderer'], dict) or {})
2024 return {
2025 'id': comment_id,
2026 'text': text,
2027 'timestamp': timestamp,
2028 'time_text': time_text,
2029 'like_count': votes,
2030 'is_favorited': is_favorited,
2031 'author': author,
2032 'author_id': author_id,
2033 'author_thumbnail': author_thumbnail,
2034 'author_is_uploader': author_is_uploader,
2035 'parent': parent or 'root'
2036 }
2037
2038 def _comment_entries(self, root_continuation_data, identity_token, account_syncid,
2039 ytcfg, video_id, parent=None, comment_counts=None):
2040
2041 def extract_header(contents):
2042 _total_comments = 0
2043 _continuation = None
2044 for content in contents:
2045 comments_header_renderer = try_get(content, lambda x: x['commentsHeaderRenderer'])
2046 expected_comment_count = try_get(comments_header_renderer,
2047 (lambda x: x['countText']['runs'][0]['text'],
2048 lambda x: x['commentsCount']['runs'][0]['text']),
2049 compat_str)
2050 if expected_comment_count:
2051 comment_counts[1] = str_to_int(expected_comment_count)
2052 self.to_screen('Downloading ~%d comments' % str_to_int(expected_comment_count))
2053 _total_comments = comment_counts[1]
2054 sort_mode_str = self._configuration_arg('comment_sort', [''])[0]
2055 comment_sort_index = int(sort_mode_str != 'top') # 1 = new, 0 = top
2056
2057 sort_menu_item = try_get(
2058 comments_header_renderer,
2059 lambda x: x['sortMenu']['sortFilterSubMenuRenderer']['subMenuItems'][comment_sort_index], dict) or {}
2060 sort_continuation_ep = sort_menu_item.get('serviceEndpoint') or {}
2061
2062 _continuation = self._extract_continuation_ep_data(sort_continuation_ep) or self._extract_continuation(sort_menu_item)
2063 if not _continuation:
2064 continue
2065
2066 sort_text = sort_menu_item.get('title')
2067 if isinstance(sort_text, compat_str):
2068 sort_text = sort_text.lower()
2069 else:
2070 sort_text = 'top comments' if comment_sort_index == 0 else 'newest first'
2071 self.to_screen('Sorting comments by %s' % sort_text)
2072 break
2073 return _total_comments, _continuation
2074
2075 def extract_thread(contents):
2076 if not parent:
2077 comment_counts[2] = 0
2078 for content in contents:
2079 comment_thread_renderer = try_get(content, lambda x: x['commentThreadRenderer'])
2080 comment_renderer = try_get(
2081 comment_thread_renderer, (lambda x: x['comment']['commentRenderer'], dict)) or try_get(
2082 content, (lambda x: x['commentRenderer'], dict))
2083
2084 if not comment_renderer:
2085 continue
2086 comment = self._extract_comment(comment_renderer, parent)
2087 if not comment:
2088 continue
2089 comment_counts[0] += 1
2090 yield comment
2091 # Attempt to get the replies
2092 comment_replies_renderer = try_get(
2093 comment_thread_renderer, lambda x: x['replies']['commentRepliesRenderer'], dict)
2094
2095 if comment_replies_renderer:
2096 comment_counts[2] += 1
2097 comment_entries_iter = self._comment_entries(
2098 comment_replies_renderer, identity_token, account_syncid, ytcfg,
2099 video_id, parent=comment.get('id'), comment_counts=comment_counts)
2100
2101 for reply_comment in comment_entries_iter:
2102 yield reply_comment
2103
2104 # YouTube comments have a max depth of 2
2105 max_depth = int_or_none(self._configuration_arg('max_comment_depth', [''])[0]) or float('inf')
2106 if max_depth == 1 and parent:
2107 return
2108 if not comment_counts:
2109 # comment so far, est. total comments, current comment thread #
2110 comment_counts = [0, 0, 0]
2111
2112 continuation = self._extract_continuation(root_continuation_data)
2113 if continuation and len(continuation['ctoken']) < 27:
2114 self.write_debug('Detected old API continuation token. Generating new API compatible token.')
2115 continuation_token = self._generate_comment_continuation(video_id)
2116 continuation = self._build_continuation_query(continuation_token, None)
2117
2118 visitor_data = None
2119 is_first_continuation = parent is None
2120
2121 for page_num in itertools.count(0):
2122 if not continuation:
2123 break
2124 headers = self._generate_api_headers(ytcfg, identity_token, account_syncid, visitor_data)
2125 comment_prog_str = '(%d/%d)' % (comment_counts[0], comment_counts[1])
2126 if page_num == 0:
2127 if is_first_continuation:
2128 note_prefix = 'Downloading comment section API JSON'
2129 else:
2130 note_prefix = ' Downloading comment API JSON reply thread %d %s' % (
2131 comment_counts[2], comment_prog_str)
2132 else:
2133 note_prefix = '%sDownloading comment%s API JSON page %d %s' % (
2134 ' ' if parent else '', ' replies' if parent else '',
2135 page_num, comment_prog_str)
2136
2137 response = self._extract_response(
2138 item_id=None, query=self._continuation_query_ajax_to_api(continuation),
2139 ep='next', ytcfg=ytcfg, headers=headers, note=note_prefix,
2140 check_get_keys=('onResponseReceivedEndpoints', 'continuationContents'))
2141 if not response:
2142 break
2143 visitor_data = try_get(
2144 response,
2145 lambda x: x['responseContext']['webResponseContextExtensionData']['ytConfigData']['visitorData'],
2146 compat_str) or visitor_data
2147
2148 continuation_contents = dict_get(response, ('onResponseReceivedEndpoints', 'continuationContents'))
2149
2150 continuation = None
2151 if isinstance(continuation_contents, list):
2152 for continuation_section in continuation_contents:
2153 if not isinstance(continuation_section, dict):
2154 continue
2155 continuation_items = try_get(
2156 continuation_section,
2157 (lambda x: x['reloadContinuationItemsCommand']['continuationItems'],
2158 lambda x: x['appendContinuationItemsAction']['continuationItems']),
2159 list) or []
2160 if is_first_continuation:
2161 total_comments, continuation = extract_header(continuation_items)
2162 if total_comments:
2163 yield total_comments
2164 is_first_continuation = False
2165 if continuation:
2166 break
2167 continue
2168 count = 0
2169 for count, entry in enumerate(extract_thread(continuation_items)):
2170 yield entry
2171 continuation = self._extract_continuation({'contents': continuation_items})
2172 if continuation:
2173 # Sometimes YouTube provides a continuation without any comments
2174 # In most cases we end up just downloading these with very little comments to come.
2175 if count == 0:
2176 if not parent:
2177 self.report_warning('No comments received - assuming end of comments')
2178 continuation = None
2179 break
2180
2181 # Deprecated response structure
2182 elif isinstance(continuation_contents, dict):
2183 known_continuation_renderers = ('itemSectionContinuation', 'commentRepliesContinuation')
2184 for key, continuation_renderer in continuation_contents.items():
2185 if key not in known_continuation_renderers:
2186 continue
2187 if not isinstance(continuation_renderer, dict):
2188 continue
2189 if is_first_continuation:
2190 header_continuation_items = [continuation_renderer.get('header') or {}]
2191 total_comments, continuation = extract_header(header_continuation_items)
2192 if total_comments:
2193 yield total_comments
2194 is_first_continuation = False
2195 if continuation:
2196 break
2197
2198 # Sometimes YouTube provides a continuation without any comments
2199 # In most cases we end up just downloading these with very little comments to come.
2200 count = 0
2201 for count, entry in enumerate(extract_thread(continuation_renderer.get('contents') or {})):
2202 yield entry
2203 continuation = self._extract_continuation(continuation_renderer)
2204 if count == 0:
2205 if not parent:
2206 self.report_warning('No comments received - assuming end of comments')
2207 continuation = None
2208 break
2209
2210 @staticmethod
2211 def _generate_comment_continuation(video_id):
2212 """
2213 Generates initial comment section continuation token from given video id
2214 """
2215 b64_vid_id = base64.b64encode(bytes(video_id.encode('utf-8')))
2216 parts = ('Eg0SCw==', b64_vid_id, 'GAYyJyIRIgs=', b64_vid_id, 'MAB4AjAAQhBjb21tZW50cy1zZWN0aW9u')
2217 new_continuation_intlist = list(itertools.chain.from_iterable(
2218 [bytes_to_intlist(base64.b64decode(part)) for part in parts]))
2219 return base64.b64encode(intlist_to_bytes(new_continuation_intlist)).decode('utf-8')
2220
2221 def _extract_comments(self, ytcfg, video_id, contents, webpage):
2222 """Entry for comment extraction"""
2223 def _real_comment_extract(contents):
2224 if isinstance(contents, list):
2225 for entry in contents:
2226 for key, renderer in entry.items():
2227 if key not in known_entry_comment_renderers:
2228 continue
2229 yield from self._comment_entries(
2230 renderer, video_id=video_id, ytcfg=ytcfg,
2231 identity_token=self._extract_identity_token(webpage, item_id=video_id),
2232 account_syncid=self._extract_account_syncid(ytcfg))
2233 break
2234 comments = []
2235 known_entry_comment_renderers = ('itemSectionRenderer',)
2236 estimated_total = 0
2237 max_comments = int_or_none(self._configuration_arg('max_comments', [''])[0]) or float('inf')
2238
2239 try:
2240 for comment in _real_comment_extract(contents):
2241 if len(comments) >= max_comments:
2242 break
2243 if isinstance(comment, int):
2244 estimated_total = comment
2245 continue
2246 comments.append(comment)
2247 except KeyboardInterrupt:
2248 self.to_screen('Interrupted by user')
2249 self.to_screen('Downloaded %d/%d comments' % (len(comments), estimated_total))
2250 return {
2251 'comments': comments,
2252 'comment_count': len(comments),
2253 }
2254
2255 @staticmethod
2256 def _generate_player_context(sts=None):
2257 context = {
2258 'html5Preference': 'HTML5_PREF_WANTS',
2259 }
2260 if sts is not None:
2261 context['signatureTimestamp'] = sts
2262 return {
2263 'playbackContext': {
2264 'contentPlaybackContext': context
2265 }
2266 }
2267
2268 @staticmethod
2269 def _get_video_info_params(video_id, client='TVHTML5'):
2270 GVI_CLIENTS = {
2271 'ANDROID': {
2272 'c': 'ANDROID',
2273 'cver': '16.20',
2274 },
2275 'TVHTML5': {
2276 'c': 'TVHTML5',
2277 'cver': '6.20180913',
2278 }
2279 }
2280 query = {
2281 'video_id': video_id,
2282 'eurl': 'https://youtube.googleapis.com/v/' + video_id,
2283 'html5': '1'
2284 }
2285 query.update(GVI_CLIENTS.get(client))
2286 return query
2287
2288 def _real_extract(self, url):
2289 url, smuggled_data = unsmuggle_url(url, {})
2290 video_id = self._match_id(url)
2291
2292 is_music_url = smuggled_data.get('is_music_url') or self.is_music_url(url)
2293
2294 base_url = self.http_scheme() + '//www.youtube.com/'
2295 webpage_url = base_url + 'watch?v=' + video_id
2296 webpage = self._download_webpage(
2297 webpage_url + '&bpctr=9999999999&has_verified=1', video_id, fatal=False)
2298
2299 ytcfg = self._extract_ytcfg(video_id, webpage) or self._get_default_ytcfg()
2300 identity_token = self._extract_identity_token(webpage, video_id)
2301 syncid = self._extract_account_syncid(ytcfg)
2302 session_index = self._extract_session_index(ytcfg)
2303 headers = self._generate_api_headers(ytcfg, identity_token, syncid, session_index=session_index)
2304 player_url = self._extract_player_url(ytcfg, webpage)
2305
2306 player_client = self._configuration_arg('player_client', [''])[0]
2307 if player_client not in ('web', 'android', ''):
2308 self.report_warning(f'Invalid player_client {player_client} given. Falling back to android client.')
2309 force_mobile_client = player_client != 'web'
2310 player_skip = self._configuration_arg('player_skip')
2311
2312 def get_text(x):
2313 if not x:
2314 return
2315 text = x.get('simpleText')
2316 if text and isinstance(text, compat_str):
2317 return text
2318 runs = x.get('runs')
2319 if not isinstance(runs, list):
2320 return
2321 return ''.join([r['text'] for r in runs if isinstance(r.get('text'), compat_str)])
2322
2323 ytm_streaming_data = {}
2324 if is_music_url:
2325 ytm_webpage = None
2326 sts = self._extract_signature_timestamp(video_id, player_url, ytcfg, fatal=False)
2327 if sts and not force_mobile_client and 'configs' not in player_skip:
2328 ytm_webpage = self._download_webpage(
2329 'https://music.youtube.com',
2330 video_id, fatal=False, note='Downloading remix client config')
2331
2332 ytm_cfg = self._extract_ytcfg(video_id, ytm_webpage) or {}
2333 ytm_client = 'WEB_REMIX'
2334 if not sts or force_mobile_client:
2335 # Android client already has signature descrambled
2336 # See: https://github.com/TeamNewPipe/NewPipeExtractor/issues/562
2337 if not sts:
2338 self.report_warning('Falling back to android remix client for player API.')
2339 ytm_client = 'ANDROID_MUSIC'
2340 ytm_cfg = {}
2341
2342 ytm_headers = self._generate_api_headers(
2343 ytm_cfg, identity_token, syncid,
2344 client=ytm_client, session_index=session_index)
2345 ytm_query = {'videoId': video_id}
2346 ytm_query.update(self._generate_player_context(sts))
2347
2348 ytm_player_response = self._extract_response(
2349 item_id=video_id, ep='player', query=ytm_query,
2350 ytcfg=ytm_cfg, headers=ytm_headers, fatal=False,
2351 default_client=ytm_client,
2352 note='Downloading %sremix player API JSON' % ('android ' if force_mobile_client else ''))
2353 ytm_streaming_data = try_get(ytm_player_response, lambda x: x['streamingData'], dict) or {}
2354
2355 player_response = None
2356 if webpage:
2357 player_response = self._extract_yt_initial_variable(
2358 webpage, self._YT_INITIAL_PLAYER_RESPONSE_RE,
2359 video_id, 'initial player response')
2360
2361 if not player_response or force_mobile_client:
2362 sts = self._extract_signature_timestamp(video_id, player_url, ytcfg, fatal=False)
2363 yt_client = 'WEB'
2364 ytpcfg = ytcfg
2365 ytp_headers = headers
2366 if not sts or force_mobile_client:
2367 # Android client already has signature descrambled
2368 # See: https://github.com/TeamNewPipe/NewPipeExtractor/issues/562
2369 if not sts:
2370 self.report_warning('Falling back to android client for player API.')
2371 yt_client = 'ANDROID'
2372 ytpcfg = {}
2373 ytp_headers = self._generate_api_headers(ytpcfg, identity_token, syncid,
2374 client=yt_client, session_index=session_index)
2375
2376 yt_query = {'videoId': video_id}
2377 yt_query.update(self._generate_player_context(sts))
2378 player_response = self._extract_response(
2379 item_id=video_id, ep='player', query=yt_query,
2380 ytcfg=ytpcfg, headers=ytp_headers, fatal=False,
2381 default_client=yt_client,
2382 note='Downloading %splayer API JSON' % ('android ' if force_mobile_client else '')
2383 ) or player_response
2384
2385 # Age-gate workarounds
2386 playability_status = player_response.get('playabilityStatus') or {}
2387 if playability_status.get('reason') in self._AGE_GATE_REASONS:
2388 gvi_clients = ('ANDROID', 'TVHTML5') if force_mobile_client else ('TVHTML5', 'ANDROID')
2389 for gvi_client in gvi_clients:
2390 pr = self._parse_json(try_get(compat_parse_qs(
2391 self._download_webpage(
2392 base_url + 'get_video_info', video_id,
2393 'Refetching age-gated %s info webpage' % gvi_client.lower(),
2394 'unable to download video info webpage', fatal=False,
2395 query=self._get_video_info_params(video_id, client=gvi_client))),
2396 lambda x: x['player_response'][0],
2397 compat_str) or '{}', video_id)
2398 if pr:
2399 break
2400 if not pr:
2401 self.report_warning('Falling back to embedded-only age-gate workaround.')
2402 embed_webpage = None
2403 sts = self._extract_signature_timestamp(video_id, player_url, ytcfg, fatal=False)
2404 if sts and not force_mobile_client and 'configs' not in player_skip:
2405 embed_webpage = self._download_webpage(
2406 'https://www.youtube.com/embed/%s?html5=1' % video_id,
2407 video_id=video_id, note='Downloading age-gated embed config')
2408
2409 ytcfg_age = self._extract_ytcfg(video_id, embed_webpage) or {}
2410 # If we extracted the embed webpage, it'll tell us if we can view the video
2411 embedded_pr = self._parse_json(
2412 try_get(ytcfg_age, lambda x: x['PLAYER_VARS']['embedded_player_response'], str) or '{}',
2413 video_id=video_id)
2414 embedded_ps_reason = try_get(embedded_pr, lambda x: x['playabilityStatus']['reason'], str) or ''
2415 if embedded_ps_reason not in self._AGE_GATE_REASONS:
2416 yt_client = 'WEB_EMBEDDED_PLAYER'
2417 if not sts or force_mobile_client:
2418 # Android client already has signature descrambled
2419 # See: https://github.com/TeamNewPipe/NewPipeExtractor/issues/562
2420 if not sts:
2421 self.report_warning(
2422 'Falling back to android embedded client for player API (note: some formats may be missing).')
2423 yt_client = 'ANDROID_EMBEDDED_PLAYER'
2424 ytcfg_age = {}
2425
2426 ytage_headers = self._generate_api_headers(
2427 ytcfg_age, identity_token, syncid,
2428 client=yt_client, session_index=session_index)
2429 yt_age_query = {'videoId': video_id}
2430 yt_age_query.update(self._generate_player_context(sts))
2431 pr = self._extract_response(
2432 item_id=video_id, ep='player', query=yt_age_query,
2433 ytcfg=ytcfg_age, headers=ytage_headers, fatal=False,
2434 default_client=yt_client,
2435 note='Downloading %sage-gated player API JSON' % ('android ' if force_mobile_client else '')
2436 ) or {}
2437
2438 if pr:
2439 player_response = pr
2440
2441 trailer_video_id = try_get(
2442 playability_status,
2443 lambda x: x['errorScreen']['playerLegacyDesktopYpcTrailerRenderer']['trailerVideoId'],
2444 compat_str)
2445 if trailer_video_id:
2446 return self.url_result(
2447 trailer_video_id, self.ie_key(), trailer_video_id)
2448
2449 search_meta = (
2450 lambda x: self._html_search_meta(x, webpage, default=None)) \
2451 if webpage else lambda x: None
2452
2453 video_details = player_response.get('videoDetails') or {}
2454 microformat = try_get(
2455 player_response,
2456 lambda x: x['microformat']['playerMicroformatRenderer'],
2457 dict) or {}
2458 video_title = video_details.get('title') \
2459 or get_text(microformat.get('title')) \
2460 or search_meta(['og:title', 'twitter:title', 'title'])
2461 video_description = video_details.get('shortDescription')
2462
2463 if not smuggled_data.get('force_singlefeed', False):
2464 if not self.get_param('noplaylist'):
2465 multifeed_metadata_list = try_get(
2466 player_response,
2467 lambda x: x['multicamera']['playerLegacyMulticameraRenderer']['metadataList'],
2468 compat_str)
2469 if multifeed_metadata_list:
2470 entries = []
2471 feed_ids = []
2472 for feed in multifeed_metadata_list.split(','):
2473 # Unquote should take place before split on comma (,) since textual
2474 # fields may contain comma as well (see
2475 # https://github.com/ytdl-org/youtube-dl/issues/8536)
2476 feed_data = compat_parse_qs(
2477 compat_urllib_parse_unquote_plus(feed))
2478
2479 def feed_entry(name):
2480 return try_get(
2481 feed_data, lambda x: x[name][0], compat_str)
2482
2483 feed_id = feed_entry('id')
2484 if not feed_id:
2485 continue
2486 feed_title = feed_entry('title')
2487 title = video_title
2488 if feed_title:
2489 title += ' (%s)' % feed_title
2490 entries.append({
2491 '_type': 'url_transparent',
2492 'ie_key': 'Youtube',
2493 'url': smuggle_url(
2494 base_url + 'watch?v=' + feed_data['id'][0],
2495 {'force_singlefeed': True}),
2496 'title': title,
2497 })
2498 feed_ids.append(feed_id)
2499 self.to_screen(
2500 'Downloading multifeed video (%s) - add --no-playlist to just download video %s'
2501 % (', '.join(feed_ids), video_id))
2502 return self.playlist_result(
2503 entries, video_id, video_title, video_description)
2504 else:
2505 self.to_screen('Downloading just video %s because of --no-playlist' % video_id)
2506
2507 formats, itags, stream_ids = [], [], []
2508 itag_qualities = {}
2509 q = qualities([
2510 # "tiny" is the smallest video-only format. But some audio-only formats
2511 # was also labeled "tiny". It is not clear if such formats still exist
2512 'tiny', 'audio_quality_low', 'audio_quality_medium', 'audio_quality_high', # Audio only formats
2513 'small', 'medium', 'large', 'hd720', 'hd1080', 'hd1440', 'hd2160', 'hd2880', 'highres'
2514 ])
2515
2516 streaming_data = player_response.get('streamingData') or {}
2517 streaming_formats = streaming_data.get('formats') or []
2518 streaming_formats.extend(streaming_data.get('adaptiveFormats') or [])
2519 streaming_formats.extend(ytm_streaming_data.get('formats') or [])
2520 streaming_formats.extend(ytm_streaming_data.get('adaptiveFormats') or [])
2521
2522 for fmt in streaming_formats:
2523 if fmt.get('targetDurationSec') or fmt.get('drmFamilies'):
2524 continue
2525
2526 itag = str_or_none(fmt.get('itag'))
2527 audio_track = fmt.get('audioTrack') or {}
2528 stream_id = '%s.%s' % (itag or '', audio_track.get('id', ''))
2529 if stream_id in stream_ids:
2530 continue
2531
2532 quality = fmt.get('quality')
2533 if quality == 'tiny' or not quality:
2534 quality = fmt.get('audioQuality', '').lower() or quality
2535 if itag and quality:
2536 itag_qualities[itag] = quality
2537 # FORMAT_STREAM_TYPE_OTF(otf=1) requires downloading the init fragment
2538 # (adding `&sq=0` to the URL) and parsing emsg box to determine the
2539 # number of fragment that would subsequently requested with (`&sq=N`)
2540 if fmt.get('type') == 'FORMAT_STREAM_TYPE_OTF':
2541 continue
2542
2543 fmt_url = fmt.get('url')
2544 if not fmt_url:
2545 sc = compat_parse_qs(fmt.get('signatureCipher'))
2546 fmt_url = url_or_none(try_get(sc, lambda x: x['url'][0]))
2547 encrypted_sig = try_get(sc, lambda x: x['s'][0])
2548 if not (sc and fmt_url and encrypted_sig):
2549 continue
2550 if not player_url:
2551 continue
2552 signature = self._decrypt_signature(sc['s'][0], video_id, player_url)
2553 sp = try_get(sc, lambda x: x['sp'][0]) or 'signature'
2554 fmt_url += '&' + sp + '=' + signature
2555
2556 if itag:
2557 itags.append(itag)
2558 stream_ids.append(stream_id)
2559
2560 tbr = float_or_none(
2561 fmt.get('averageBitrate') or fmt.get('bitrate'), 1000)
2562 dct = {
2563 'asr': int_or_none(fmt.get('audioSampleRate')),
2564 'filesize': int_or_none(fmt.get('contentLength')),
2565 'format_id': itag,
2566 'format_note': audio_track.get('displayName') or fmt.get('qualityLabel') or quality,
2567 'fps': int_or_none(fmt.get('fps')),
2568 'height': int_or_none(fmt.get('height')),
2569 'quality': q(quality),
2570 'tbr': tbr,
2571 'url': fmt_url,
2572 'width': fmt.get('width'),
2573 'language': audio_track.get('id', '').split('.')[0],
2574 }
2575 mime_mobj = re.match(
2576 r'((?:[^/]+)/(?:[^;]+))(?:;\s*codecs="([^"]+)")?', fmt.get('mimeType') or '')
2577 if mime_mobj:
2578 dct['ext'] = mimetype2ext(mime_mobj.group(1))
2579 dct.update(parse_codecs(mime_mobj.group(2)))
2580 # The 3gp format in android client has a quality of "small",
2581 # but is actually worse than all other formats
2582 if dct['ext'] == '3gp':
2583 dct['quality'] = q('tiny')
2584 no_audio = dct.get('acodec') == 'none'
2585 no_video = dct.get('vcodec') == 'none'
2586 if no_audio:
2587 dct['vbr'] = tbr
2588 if no_video:
2589 dct['abr'] = tbr
2590 if no_audio or no_video:
2591 dct['downloader_options'] = {
2592 # Youtube throttles chunks >~10M
2593 'http_chunk_size': 10485760,
2594 }
2595 if dct.get('ext'):
2596 dct['container'] = dct['ext'] + '_dash'
2597 formats.append(dct)
2598
2599 skip_manifests = self._configuration_arg('skip')
2600 get_dash = 'dash' not in skip_manifests and self.get_param('youtube_include_dash_manifest', True)
2601 get_hls = 'hls' not in skip_manifests and self.get_param('youtube_include_hls_manifest', True)
2602
2603 for sd in (streaming_data, ytm_streaming_data):
2604 hls_manifest_url = get_hls and sd.get('hlsManifestUrl')
2605 if hls_manifest_url:
2606 for f in self._extract_m3u8_formats(
2607 hls_manifest_url, video_id, 'mp4', fatal=False):
2608 itag = self._search_regex(
2609 r'/itag/(\d+)', f['url'], 'itag', default=None)
2610 if itag:
2611 f['format_id'] = itag
2612 formats.append(f)
2613
2614 dash_manifest_url = get_dash and sd.get('dashManifestUrl')
2615 if dash_manifest_url:
2616 for f in self._extract_mpd_formats(
2617 dash_manifest_url, video_id, fatal=False):
2618 itag = f['format_id']
2619 if itag in itags:
2620 continue
2621 if itag in itag_qualities:
2622 f['quality'] = q(itag_qualities[itag])
2623 filesize = int_or_none(self._search_regex(
2624 r'/clen/(\d+)', f.get('fragment_base_url')
2625 or f['url'], 'file size', default=None))
2626 if filesize:
2627 f['filesize'] = filesize
2628 formats.append(f)
2629
2630 if not formats:
2631 if not self.get_param('allow_unplayable_formats') and streaming_data.get('licenseInfos'):
2632 self.raise_no_formats(
2633 'This video is DRM protected.', expected=True)
2634 pemr = try_get(
2635 playability_status,
2636 lambda x: x['errorScreen']['playerErrorMessageRenderer'],
2637 dict) or {}
2638 reason = get_text(pemr.get('reason')) or playability_status.get('reason')
2639 subreason = pemr.get('subreason')
2640 if subreason:
2641 subreason = clean_html(get_text(subreason))
2642 if subreason == 'The uploader has not made this video available in your country.':
2643 countries = microformat.get('availableCountries')
2644 if not countries:
2645 regions_allowed = search_meta('regionsAllowed')
2646 countries = regions_allowed.split(',') if regions_allowed else None
2647 self.raise_geo_restricted(subreason, countries, metadata_available=True)
2648 reason += '\n' + subreason
2649 if reason:
2650 self.raise_no_formats(reason, expected=True)
2651
2652 self._sort_formats(formats)
2653
2654 keywords = video_details.get('keywords') or []
2655 if not keywords and webpage:
2656 keywords = [
2657 unescapeHTML(m.group('content'))
2658 for m in re.finditer(self._meta_regex('og:video:tag'), webpage)]
2659 for keyword in keywords:
2660 if keyword.startswith('yt:stretch='):
2661 mobj = re.search(r'(\d+)\s*:\s*(\d+)', keyword)
2662 if mobj:
2663 # NB: float is intentional for forcing float division
2664 w, h = (float(v) for v in mobj.groups())
2665 if w > 0 and h > 0:
2666 ratio = w / h
2667 for f in formats:
2668 if f.get('vcodec') != 'none':
2669 f['stretched_ratio'] = ratio
2670 break
2671
2672 thumbnails = []
2673 for container in (video_details, microformat):
2674 for thumbnail in (try_get(
2675 container,
2676 lambda x: x['thumbnail']['thumbnails'], list) or []):
2677 thumbnail_url = thumbnail.get('url')
2678 if not thumbnail_url:
2679 continue
2680 # Sometimes youtube gives a wrong thumbnail URL. See:
2681 # https://github.com/yt-dlp/yt-dlp/issues/233
2682 # https://github.com/ytdl-org/youtube-dl/issues/28023
2683 if 'maxresdefault' in thumbnail_url:
2684 thumbnail_url = thumbnail_url.split('?')[0]
2685 thumbnails.append({
2686 'url': thumbnail_url,
2687 'height': int_or_none(thumbnail.get('height')),
2688 'width': int_or_none(thumbnail.get('width')),
2689 'preference': 1 if 'maxresdefault' in thumbnail_url else -1
2690 })
2691 thumbnail_url = search_meta(['og:image', 'twitter:image'])
2692 if thumbnail_url:
2693 thumbnails.append({
2694 'url': thumbnail_url,
2695 'preference': 1 if 'maxresdefault' in thumbnail_url else -1
2696 })
2697 # All videos have a maxresdefault thumbnail, but sometimes it does not appear in the webpage
2698 # See: https://github.com/ytdl-org/youtube-dl/issues/29049
2699 thumbnails.append({
2700 'url': 'https://i.ytimg.com/vi/%s/maxresdefault.jpg' % video_id,
2701 'preference': 1,
2702 })
2703 self._remove_duplicate_formats(thumbnails)
2704
2705 category = microformat.get('category') or search_meta('genre')
2706 channel_id = video_details.get('channelId') \
2707 or microformat.get('externalChannelId') \
2708 or search_meta('channelId')
2709 duration = int_or_none(
2710 video_details.get('lengthSeconds')
2711 or microformat.get('lengthSeconds')) \
2712 or parse_duration(search_meta('duration'))
2713 is_live = video_details.get('isLive')
2714 is_upcoming = video_details.get('isUpcoming')
2715 owner_profile_url = microformat.get('ownerProfileUrl')
2716
2717 info = {
2718 'id': video_id,
2719 'title': self._live_title(video_title) if is_live else video_title,
2720 'formats': formats,
2721 'thumbnails': thumbnails,
2722 'description': video_description,
2723 'upload_date': unified_strdate(
2724 microformat.get('uploadDate')
2725 or search_meta('uploadDate')),
2726 'uploader': video_details['author'],
2727 'uploader_id': self._search_regex(r'/(?:channel|user)/([^/?&#]+)', owner_profile_url, 'uploader id') if owner_profile_url else None,
2728 'uploader_url': owner_profile_url,
2729 'channel_id': channel_id,
2730 'channel_url': 'https://www.youtube.com/channel/' + channel_id if channel_id else None,
2731 'duration': duration,
2732 'view_count': int_or_none(
2733 video_details.get('viewCount')
2734 or microformat.get('viewCount')
2735 or search_meta('interactionCount')),
2736 'average_rating': float_or_none(video_details.get('averageRating')),
2737 'age_limit': 18 if (
2738 microformat.get('isFamilySafe') is False
2739 or search_meta('isFamilyFriendly') == 'false'
2740 or search_meta('og:restrictions:age') == '18+') else 0,
2741 'webpage_url': webpage_url,
2742 'categories': [category] if category else None,
2743 'tags': keywords,
2744 'is_live': is_live,
2745 'playable_in_embed': playability_status.get('playableInEmbed'),
2746 'was_live': video_details.get('isLiveContent'),
2747 }
2748
2749 pctr = try_get(
2750 player_response,
2751 lambda x: x['captions']['playerCaptionsTracklistRenderer'], dict)
2752 subtitles = {}
2753 if pctr:
            def process_language(container, base_url, lang_code, sub_name, query):
                # Append one subtitle entry per supported format to
                # container[lang_code] (the list is created on first use).
                # NOTE: both `container` and `query` are mutated in place;
                # `query` has its 'fmt' key overwritten on every iteration.
                lang_subs = container.setdefault(lang_code, [])
                for fmt in self._SUBTITLE_FORMATS:
                    query.update({
                        'fmt': fmt,
                    })
                    lang_subs.append({
                        'ext': fmt,
                        'url': update_url_query(base_url, query),
                        'name': sub_name,
                    })
2765
2766 for caption_track in (pctr.get('captionTracks') or []):
2767 base_url = caption_track.get('baseUrl')
2768 if not base_url:
2769 continue
2770 if caption_track.get('kind') != 'asr':
2771 lang_code = (
2772 remove_start(caption_track.get('vssId') or '', '.').replace('.', '-')
2773 or caption_track.get('languageCode'))
2774 if not lang_code:
2775 continue
2776 process_language(
2777 subtitles, base_url, lang_code,
2778 try_get(caption_track, lambda x: x['name']['simpleText']),
2779 {})
2780 continue
2781 automatic_captions = {}
2782 for translation_language in (pctr.get('translationLanguages') or []):
2783 translation_language_code = translation_language.get('languageCode')
2784 if not translation_language_code:
2785 continue
2786 process_language(
2787 automatic_captions, base_url, translation_language_code,
2788 try_get(translation_language, (
2789 lambda x: x['languageName']['simpleText'],
2790 lambda x: x['languageName']['runs'][0]['text'])),
2791 {'tlang': translation_language_code})
2792 info['automatic_captions'] = automatic_captions
2793 info['subtitles'] = subtitles
2794
2795 parsed_url = compat_urllib_parse_urlparse(url)
2796 for component in [parsed_url.fragment, parsed_url.query]:
2797 query = compat_parse_qs(component)
2798 for k, v in query.items():
2799 for d_k, s_ks in [('start', ('start', 't')), ('end', ('end',))]:
2800 d_k += '_time'
2801 if d_k not in info and k in s_ks:
2802 info[d_k] = parse_duration(query[k][0])
2803
2804 # Youtube Music Auto-generated description
2805 if video_description:
2806 mobj = re.search(r'(?s)(?P<track>[^·\n]+)·(?P<artist>[^\n]+)\n+(?P<album>[^\n]+)(?:.+?℗\s*(?P<release_year>\d{4})(?!\d))?(?:.+?Released on\s*:\s*(?P<release_date>\d{4}-\d{2}-\d{2}))?(.+?\nArtist\s*:\s*(?P<clean_artist>[^\n]+))?.+\nAuto-generated by YouTube\.\s*$', video_description)
2807 if mobj:
2808 release_year = mobj.group('release_year')
2809 release_date = mobj.group('release_date')
2810 if release_date:
2811 release_date = release_date.replace('-', '')
2812 if not release_year:
2813 release_year = release_date[:4]
2814 info.update({
2815 'album': mobj.group('album'.strip()),
2816 'artist': mobj.group('clean_artist') or ', '.join(a.strip() for a in mobj.group('artist').split('·')),
2817 'track': mobj.group('track').strip(),
2818 'release_date': release_date,
2819 'release_year': int_or_none(release_year),
2820 })
2821
2822 initial_data = None
2823 if webpage:
2824 initial_data = self._extract_yt_initial_variable(
2825 webpage, self._YT_INITIAL_DATA_RE, video_id,
2826 'yt initial data')
2827 if not initial_data:
2828 initial_data = self._extract_response(
2829 item_id=video_id, ep='next', fatal=False,
2830 ytcfg=ytcfg, headers=headers, query={'videoId': video_id},
2831 note='Downloading initial data API JSON')
2832
2833 try:
2834 # This will error if there is no livechat
2835 initial_data['contents']['twoColumnWatchNextResults']['conversationBar']['liveChatRenderer']['continuations'][0]['reloadContinuationData']['continuation']
2836 info['subtitles']['live_chat'] = [{
2837 'url': 'https://www.youtube.com/watch?v=%s' % video_id, # url is needed to set cookies
2838 'video_id': video_id,
2839 'ext': 'json',
2840 'protocol': 'youtube_live_chat' if is_live or is_upcoming else 'youtube_live_chat_replay',
2841 }]
2842 except (KeyError, IndexError, TypeError):
2843 pass
2844
2845 if initial_data:
2846 chapters = self._extract_chapters_from_json(
2847 initial_data, video_id, duration)
2848 if not chapters:
2849 for engagment_pannel in (initial_data.get('engagementPanels') or []):
2850 contents = try_get(
2851 engagment_pannel, lambda x: x['engagementPanelSectionListRenderer']['content']['macroMarkersListRenderer']['contents'],
2852 list)
2853 if not contents:
2854 continue
2855
                    def chapter_time(mmlir):
                        # Parse a macroMarkersListItemRenderer's timeDescription
                        # text into seconds via parse_duration; callers treat a
                        # None result (missing/unparseable text) as "no chapter".
                        return parse_duration(
                            get_text(mmlir.get('timeDescription')))
2859
2860 chapters = []
2861 for next_num, content in enumerate(contents, start=1):
2862 mmlir = content.get('macroMarkersListItemRenderer') or {}
2863 start_time = chapter_time(mmlir)
2864 end_time = chapter_time(try_get(
2865 contents, lambda x: x[next_num]['macroMarkersListItemRenderer'])) \
2866 if next_num < len(contents) else duration
2867 if start_time is None or end_time is None:
2868 continue
2869 chapters.append({
2870 'start_time': start_time,
2871 'end_time': end_time,
2872 'title': get_text(mmlir.get('title')),
2873 })
2874 if chapters:
2875 break
2876 if chapters:
2877 info['chapters'] = chapters
2878
2879 contents = try_get(
2880 initial_data,
2881 lambda x: x['contents']['twoColumnWatchNextResults']['results']['results']['contents'],
2882 list) or []
2883 for content in contents:
2884 vpir = content.get('videoPrimaryInfoRenderer')
2885 if vpir:
2886 stl = vpir.get('superTitleLink')
2887 if stl:
2888 stl = get_text(stl)
2889 if try_get(
2890 vpir,
2891 lambda x: x['superTitleIcon']['iconType']) == 'LOCATION_PIN':
2892 info['location'] = stl
2893 else:
2894 mobj = re.search(r'(.+?)\s*S(\d+)\s*•\s*E(\d+)', stl)
2895 if mobj:
2896 info.update({
2897 'series': mobj.group(1),
2898 'season_number': int(mobj.group(2)),
2899 'episode_number': int(mobj.group(3)),
2900 })
2901 for tlb in (try_get(
2902 vpir,
2903 lambda x: x['videoActions']['menuRenderer']['topLevelButtons'],
2904 list) or []):
2905 tbr = tlb.get('toggleButtonRenderer') or {}
2906 for getter, regex in [(
2907 lambda x: x['defaultText']['accessibility']['accessibilityData'],
2908 r'(?P<count>[\d,]+)\s*(?P<type>(?:dis)?like)'), ([
2909 lambda x: x['accessibility'],
2910 lambda x: x['accessibilityData']['accessibilityData'],
2911 ], r'(?P<type>(?:dis)?like) this video along with (?P<count>[\d,]+) other people')]:
2912 label = (try_get(tbr, getter, dict) or {}).get('label')
2913 if label:
2914 mobj = re.match(regex, label)
2915 if mobj:
2916 info[mobj.group('type') + '_count'] = str_to_int(mobj.group('count'))
2917 break
2918 sbr_tooltip = try_get(
2919 vpir, lambda x: x['sentimentBar']['sentimentBarRenderer']['tooltip'])
2920 if sbr_tooltip:
2921 like_count, dislike_count = sbr_tooltip.split(' / ')
2922 info.update({
2923 'like_count': str_to_int(like_count),
2924 'dislike_count': str_to_int(dislike_count),
2925 })
2926 vsir = content.get('videoSecondaryInfoRenderer')
2927 if vsir:
2928 info['channel'] = get_text(try_get(
2929 vsir,
2930 lambda x: x['owner']['videoOwnerRenderer']['title'],
2931 dict))
2932 rows = try_get(
2933 vsir,
2934 lambda x: x['metadataRowContainer']['metadataRowContainerRenderer']['rows'],
2935 list) or []
2936 multiple_songs = False
2937 for row in rows:
2938 if try_get(row, lambda x: x['metadataRowRenderer']['hasDividerLine']) is True:
2939 multiple_songs = True
2940 break
2941 for row in rows:
2942 mrr = row.get('metadataRowRenderer') or {}
2943 mrr_title = mrr.get('title')
2944 if not mrr_title:
2945 continue
2946 mrr_title = get_text(mrr['title'])
2947 mrr_contents_text = get_text(mrr['contents'][0])
2948 if mrr_title == 'License':
2949 info['license'] = mrr_contents_text
2950 elif not multiple_songs:
2951 if mrr_title == 'Album':
2952 info['album'] = mrr_contents_text
2953 elif mrr_title == 'Artist':
2954 info['artist'] = mrr_contents_text
2955 elif mrr_title == 'Song':
2956 info['track'] = mrr_contents_text
2957
2958 fallbacks = {
2959 'channel': 'uploader',
2960 'channel_id': 'uploader_id',
2961 'channel_url': 'uploader_url',
2962 }
2963 for to, frm in fallbacks.items():
2964 if not info.get(to):
2965 info[to] = info.get(frm)
2966
2967 for s_k, d_k in [('artist', 'creator'), ('track', 'alt_title')]:
2968 v = info.get(s_k)
2969 if v:
2970 info[d_k] = v
2971
2972 is_private = bool_or_none(video_details.get('isPrivate'))
2973 is_unlisted = bool_or_none(microformat.get('isUnlisted'))
2974 is_membersonly = None
2975 is_premium = None
2976 if initial_data and is_private is not None:
2977 is_membersonly = False
2978 is_premium = False
2979 contents = try_get(initial_data, lambda x: x['contents']['twoColumnWatchNextResults']['results']['results']['contents'], list) or []
2980 badge_labels = set()
2981 for content in contents:
2982 if not isinstance(content, dict):
2983 continue
2984 badge_labels.update(self._extract_badges(content.get('videoPrimaryInfoRenderer')))
2985 for badge_label in badge_labels:
2986 if badge_label.lower() == 'members only':
2987 is_membersonly = True
2988 elif badge_label.lower() == 'premium':
2989 is_premium = True
2990 elif badge_label.lower() == 'unlisted':
2991 is_unlisted = True
2992
2993 info['availability'] = self._availability(
2994 is_private=is_private,
2995 needs_premium=is_premium,
2996 needs_subscription=is_membersonly,
2997 needs_auth=info['age_limit'] >= 18,
2998 is_unlisted=None if is_private is None else is_unlisted)
2999
3000 # get xsrf for annotations or comments
3001 get_annotations = self.get_param('writeannotations', False)
3002 get_comments = self.get_param('getcomments', False)
3003 if get_annotations or get_comments:
3004 xsrf_token = None
3005 ytcfg = self._extract_ytcfg(video_id, webpage)
3006 if ytcfg:
3007 xsrf_token = try_get(ytcfg, lambda x: x['XSRF_TOKEN'], compat_str)
3008 if not xsrf_token:
3009 xsrf_token = self._search_regex(
3010 r'([\'"])XSRF_TOKEN\1\s*:\s*([\'"])(?P<xsrf_token>(?:(?!\2).)+)\2',
3011 webpage, 'xsrf token', group='xsrf_token', fatal=False)
3012
3013 # annotations
3014 if get_annotations:
3015 invideo_url = try_get(
3016 player_response, lambda x: x['annotations'][0]['playerAnnotationsUrlsRenderer']['invideoUrl'], compat_str)
3017 if xsrf_token and invideo_url:
3018 xsrf_field_name = None
3019 if ytcfg:
3020 xsrf_field_name = try_get(ytcfg, lambda x: x['XSRF_FIELD_NAME'], compat_str)
3021 if not xsrf_field_name:
3022 xsrf_field_name = self._search_regex(
3023 r'([\'"])XSRF_FIELD_NAME\1\s*:\s*([\'"])(?P<xsrf_field_name>\w+)\2',
3024 webpage, 'xsrf field name',
3025 group='xsrf_field_name', default='session_token')
3026 info['annotations'] = self._download_webpage(
3027 self._proto_relative_url(invideo_url),
3028 video_id, note='Downloading annotations',
3029 errnote='Unable to download video annotations', fatal=False,
3030 data=urlencode_postdata({xsrf_field_name: xsrf_token}))
3031
3032 if get_comments:
3033 info['__post_extractor'] = lambda: self._extract_comments(ytcfg, video_id, contents, webpage)
3034
3035 self.mark_watched(video_id, player_response)
3036
3037 return info
3038
3039
3040 class YoutubeTabIE(YoutubeBaseInfoExtractor):
3041 IE_DESC = 'YouTube.com tab'
3042 _VALID_URL = r'''(?x)
3043 https?://
3044 (?:\w+\.)?
3045 (?:
3046 youtube(?:kids)?\.com|
3047 invidio\.us
3048 )/
3049 (?:
3050 (?P<channel_type>channel|c|user|browse)/|
3051 (?P<not_channel>
3052 feed/|hashtag/|
3053 (?:playlist|watch)\?.*?\blist=
3054 )|
3055 (?!(?:%s)\b) # Direct URLs
3056 )
3057 (?P<id>[^/?\#&]+)
3058 ''' % YoutubeBaseInfoExtractor._RESERVED_NAMES
3059 IE_NAME = 'youtube:tab'
3060
3061 _TESTS = [{
3062 'note': 'playlists, multipage',
3063 'url': 'https://www.youtube.com/c/ИгорьКлейнер/playlists?view=1&flow=grid',
3064 'playlist_mincount': 94,
3065 'info_dict': {
3066 'id': 'UCqj7Cz7revf5maW9g5pgNcg',
3067 'title': 'Игорь Клейнер - Playlists',
3068 'description': 'md5:be97ee0f14ee314f1f002cf187166ee2',
3069 'uploader': 'Игорь Клейнер',
3070 'uploader_id': 'UCqj7Cz7revf5maW9g5pgNcg',
3071 },
3072 }, {
3073 'note': 'playlists, multipage, different order',
3074 'url': 'https://www.youtube.com/user/igorkle1/playlists?view=1&sort=dd',
3075 'playlist_mincount': 94,
3076 'info_dict': {
3077 'id': 'UCqj7Cz7revf5maW9g5pgNcg',
3078 'title': 'Игорь Клейнер - Playlists',
3079 'description': 'md5:be97ee0f14ee314f1f002cf187166ee2',
3080 'uploader_id': 'UCqj7Cz7revf5maW9g5pgNcg',
3081 'uploader': 'Игорь Клейнер',
3082 },
3083 }, {
3084 'note': 'playlists, series',
3085 'url': 'https://www.youtube.com/c/3blue1brown/playlists?view=50&sort=dd&shelf_id=3',
3086 'playlist_mincount': 5,
3087 'info_dict': {
3088 'id': 'UCYO_jab_esuFRV4b17AJtAw',
3089 'title': '3Blue1Brown - Playlists',
3090 'description': 'md5:e1384e8a133307dd10edee76e875d62f',
3091 'uploader_id': 'UCYO_jab_esuFRV4b17AJtAw',
3092 'uploader': '3Blue1Brown',
3093 },
3094 }, {
3095 'note': 'playlists, singlepage',
3096 'url': 'https://www.youtube.com/user/ThirstForScience/playlists',
3097 'playlist_mincount': 4,
3098 'info_dict': {
3099 'id': 'UCAEtajcuhQ6an9WEzY9LEMQ',
3100 'title': 'ThirstForScience - Playlists',
3101 'description': 'md5:609399d937ea957b0f53cbffb747a14c',
3102 'uploader': 'ThirstForScience',
3103 'uploader_id': 'UCAEtajcuhQ6an9WEzY9LEMQ',
3104 }
3105 }, {
3106 'url': 'https://www.youtube.com/c/ChristophLaimer/playlists',
3107 'only_matching': True,
3108 }, {
3109 'note': 'basic, single video playlist',
3110 'url': 'https://www.youtube.com/playlist?list=PL4lCao7KL_QFVb7Iudeipvc2BCavECqzc',
3111 'info_dict': {
3112 'uploader_id': 'UCmlqkdCBesrv2Lak1mF_MxA',
3113 'uploader': 'Sergey M.',
3114 'id': 'PL4lCao7KL_QFVb7Iudeipvc2BCavECqzc',
3115 'title': 'youtube-dl public playlist',
3116 },
3117 'playlist_count': 1,
3118 }, {
3119 'note': 'empty playlist',
3120 'url': 'https://www.youtube.com/playlist?list=PL4lCao7KL_QFodcLWhDpGCYnngnHtQ-Xf',
3121 'info_dict': {
3122 'uploader_id': 'UCmlqkdCBesrv2Lak1mF_MxA',
3123 'uploader': 'Sergey M.',
3124 'id': 'PL4lCao7KL_QFodcLWhDpGCYnngnHtQ-Xf',
3125 'title': 'youtube-dl empty playlist',
3126 },
3127 'playlist_count': 0,
3128 }, {
3129 'note': 'Home tab',
3130 'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w/featured',
3131 'info_dict': {
3132 'id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
3133 'title': 'lex will - Home',
3134 'description': 'md5:2163c5d0ff54ed5f598d6a7e6211e488',
3135 'uploader': 'lex will',
3136 'uploader_id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
3137 },
3138 'playlist_mincount': 2,
3139 }, {
3140 'note': 'Videos tab',
3141 'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w/videos',
3142 'info_dict': {
3143 'id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
3144 'title': 'lex will - Videos',
3145 'description': 'md5:2163c5d0ff54ed5f598d6a7e6211e488',
3146 'uploader': 'lex will',
3147 'uploader_id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
3148 },
3149 'playlist_mincount': 975,
3150 }, {
3151 'note': 'Videos tab, sorted by popular',
3152 'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w/videos?view=0&sort=p&flow=grid',
3153 'info_dict': {
3154 'id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
3155 'title': 'lex will - Videos',
3156 'description': 'md5:2163c5d0ff54ed5f598d6a7e6211e488',
3157 'uploader': 'lex will',
3158 'uploader_id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
3159 },
3160 'playlist_mincount': 199,
3161 }, {
3162 'note': 'Playlists tab',
3163 'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w/playlists',
3164 'info_dict': {
3165 'id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
3166 'title': 'lex will - Playlists',
3167 'description': 'md5:2163c5d0ff54ed5f598d6a7e6211e488',
3168 'uploader': 'lex will',
3169 'uploader_id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
3170 },
3171 'playlist_mincount': 17,
3172 }, {
3173 'note': 'Community tab',
3174 'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w/community',
3175 'info_dict': {
3176 'id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
3177 'title': 'lex will - Community',
3178 'description': 'md5:2163c5d0ff54ed5f598d6a7e6211e488',
3179 'uploader': 'lex will',
3180 'uploader_id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
3181 },
3182 'playlist_mincount': 18,
3183 }, {
3184 'note': 'Channels tab',
3185 'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w/channels',
3186 'info_dict': {
3187 'id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
3188 'title': 'lex will - Channels',
3189 'description': 'md5:2163c5d0ff54ed5f598d6a7e6211e488',
3190 'uploader': 'lex will',
3191 'uploader_id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
3192 },
3193 'playlist_mincount': 12,
3194 }, {
3195 'note': 'Search tab',
3196 'url': 'https://www.youtube.com/c/3blue1brown/search?query=linear%20algebra',
3197 'playlist_mincount': 40,
3198 'info_dict': {
3199 'id': 'UCYO_jab_esuFRV4b17AJtAw',
3200 'title': '3Blue1Brown - Search - linear algebra',
3201 'description': 'md5:e1384e8a133307dd10edee76e875d62f',
3202 'uploader': '3Blue1Brown',
3203 'uploader_id': 'UCYO_jab_esuFRV4b17AJtAw',
3204 },
3205 }, {
3206 'url': 'https://invidio.us/channel/UCmlqkdCBesrv2Lak1mF_MxA',
3207 'only_matching': True,
3208 }, {
3209 'url': 'https://www.youtubekids.com/channel/UCmlqkdCBesrv2Lak1mF_MxA',
3210 'only_matching': True,
3211 }, {
3212 'url': 'https://music.youtube.com/channel/UCmlqkdCBesrv2Lak1mF_MxA',
3213 'only_matching': True,
3214 }, {
3215 'note': 'Playlist with deleted videos (#651). As a bonus, the video #51 is also twice in this list.',
3216 'url': 'https://www.youtube.com/playlist?list=PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC',
3217 'info_dict': {
3218 'title': '29C3: Not my department',
3219 'id': 'PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC',
3220 'uploader': 'Christiaan008',
3221 'uploader_id': 'UCEPzS1rYsrkqzSLNp76nrcg',
3222 'description': 'md5:a14dc1a8ef8307a9807fe136a0660268',
3223 },
3224 'playlist_count': 96,
3225 }, {
3226 'note': 'Large playlist',
3227 'url': 'https://www.youtube.com/playlist?list=UUBABnxM4Ar9ten8Mdjj1j0Q',
3228 'info_dict': {
3229 'title': 'Uploads from Cauchemar',
3230 'id': 'UUBABnxM4Ar9ten8Mdjj1j0Q',
3231 'uploader': 'Cauchemar',
3232 'uploader_id': 'UCBABnxM4Ar9ten8Mdjj1j0Q',
3233 },
3234 'playlist_mincount': 1123,
3235 }, {
3236 'note': 'even larger playlist, 8832 videos',
3237 'url': 'http://www.youtube.com/user/NASAgovVideo/videos',
3238 'only_matching': True,
3239 }, {
3240 'note': 'Buggy playlist: the webpage has a "Load more" button but it doesn\'t have more videos',
3241 'url': 'https://www.youtube.com/playlist?list=UUXw-G3eDE9trcvY2sBMM_aA',
3242 'info_dict': {
3243 'title': 'Uploads from Interstellar Movie',
3244 'id': 'UUXw-G3eDE9trcvY2sBMM_aA',
3245 'uploader': 'Interstellar Movie',
3246 'uploader_id': 'UCXw-G3eDE9trcvY2sBMM_aA',
3247 },
3248 'playlist_mincount': 21,
3249 }, {
3250 'note': 'Playlist with "show unavailable videos" button',
3251 'url': 'https://www.youtube.com/playlist?list=UUTYLiWFZy8xtPwxFwX9rV7Q',
3252 'info_dict': {
3253 'title': 'Uploads from Phim Siêu Nhân Nhật Bản',
3254 'id': 'UUTYLiWFZy8xtPwxFwX9rV7Q',
3255 'uploader': 'Phim Siêu Nhân Nhật Bản',
3256 'uploader_id': 'UCTYLiWFZy8xtPwxFwX9rV7Q',
3257 },
3258 'playlist_mincount': 200,
3259 }, {
3260 'note': 'Playlist with unavailable videos in page 7',
3261 'url': 'https://www.youtube.com/playlist?list=UU8l9frL61Yl5KFOl87nIm2w',
3262 'info_dict': {
3263 'title': 'Uploads from BlankTV',
3264 'id': 'UU8l9frL61Yl5KFOl87nIm2w',
3265 'uploader': 'BlankTV',
3266 'uploader_id': 'UC8l9frL61Yl5KFOl87nIm2w',
3267 },
3268 'playlist_mincount': 1000,
3269 }, {
3270 'note': 'https://github.com/ytdl-org/youtube-dl/issues/21844',
3271 'url': 'https://www.youtube.com/playlist?list=PLzH6n4zXuckpfMu_4Ff8E7Z1behQks5ba',
3272 'info_dict': {
3273 'title': 'Data Analysis with Dr Mike Pound',
3274 'id': 'PLzH6n4zXuckpfMu_4Ff8E7Z1behQks5ba',
3275 'uploader_id': 'UC9-y-6csu5WGm29I7JiwpnA',
3276 'uploader': 'Computerphile',
3277 'description': 'md5:7f567c574d13d3f8c0954d9ffee4e487',
3278 },
3279 'playlist_mincount': 11,
3280 }, {
3281 'url': 'https://invidio.us/playlist?list=PL4lCao7KL_QFVb7Iudeipvc2BCavECqzc',
3282 'only_matching': True,
3283 }, {
3284 'note': 'Playlist URL that does not actually serve a playlist',
3285 'url': 'https://www.youtube.com/watch?v=FqZTN594JQw&list=PLMYEtVRpaqY00V9W81Cwmzp6N6vZqfUKD4',
3286 'info_dict': {
3287 'id': 'FqZTN594JQw',
3288 'ext': 'webm',
3289 'title': "Smiley's People 01 detective, Adventure Series, Action",
3290 'uploader': 'STREEM',
3291 'uploader_id': 'UCyPhqAZgwYWZfxElWVbVJng',
3292 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UCyPhqAZgwYWZfxElWVbVJng',
3293 'upload_date': '20150526',
3294 'license': 'Standard YouTube License',
3295 'description': 'md5:507cdcb5a49ac0da37a920ece610be80',
3296 'categories': ['People & Blogs'],
3297 'tags': list,
3298 'view_count': int,
3299 'like_count': int,
3300 'dislike_count': int,
3301 },
3302 'params': {
3303 'skip_download': True,
3304 },
3305 'skip': 'This video is not available.',
3306 'add_ie': [YoutubeIE.ie_key()],
3307 }, {
3308 'url': 'https://www.youtubekids.com/watch?v=Agk7R8I8o5U&list=PUZ6jURNr1WQZCNHF0ao-c0g',
3309 'only_matching': True,
3310 }, {
3311 'url': 'https://www.youtube.com/watch?v=MuAGGZNfUkU&list=RDMM',
3312 'only_matching': True,
3313 }, {
3314 'url': 'https://www.youtube.com/channel/UCoMdktPbSTixAyNGwb-UYkQ/live',
3315 'info_dict': {
3316 'id': 'X1whbWASnNQ', # This will keep changing
3317 'ext': 'mp4',
3318 'title': compat_str,
3319 'uploader': 'Sky News',
3320 'uploader_id': 'skynews',
3321 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/skynews',
3322 'upload_date': r're:\d{8}',
3323 'description': compat_str,
3324 'categories': ['News & Politics'],
3325 'tags': list,
3326 'like_count': int,
3327 'dislike_count': int,
3328 },
3329 'params': {
3330 'skip_download': True,
3331 },
3332 'expected_warnings': ['Downloading just video ', 'Ignoring subtitle tracks found in '],
3333 }, {
3334 'url': 'https://www.youtube.com/user/TheYoungTurks/live',
3335 'info_dict': {
3336 'id': 'a48o2S1cPoo',
3337 'ext': 'mp4',
3338 'title': 'The Young Turks - Live Main Show',
3339 'uploader': 'The Young Turks',
3340 'uploader_id': 'TheYoungTurks',
3341 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/TheYoungTurks',
3342 'upload_date': '20150715',
3343 'license': 'Standard YouTube License',
3344 'description': 'md5:438179573adcdff3c97ebb1ee632b891',
3345 'categories': ['News & Politics'],
3346 'tags': ['Cenk Uygur (TV Program Creator)', 'The Young Turks (Award-Winning Work)', 'Talk Show (TV Genre)'],
3347 'like_count': int,
3348 'dislike_count': int,
3349 },
3350 'params': {
3351 'skip_download': True,
3352 },
3353 'only_matching': True,
3354 }, {
3355 'url': 'https://www.youtube.com/channel/UC1yBKRuGpC1tSM73A0ZjYjQ/live',
3356 'only_matching': True,
3357 }, {
3358 'url': 'https://www.youtube.com/c/CommanderVideoHq/live',
3359 'only_matching': True,
3360 }, {
3361 'note': 'A channel that is not live. Should raise error',
3362 'url': 'https://www.youtube.com/user/numberphile/live',
3363 'only_matching': True,
3364 }, {
3365 'url': 'https://www.youtube.com/feed/trending',
3366 'only_matching': True,
3367 }, {
3368 'url': 'https://www.youtube.com/feed/library',
3369 'only_matching': True,
3370 }, {
3371 'url': 'https://www.youtube.com/feed/history',
3372 'only_matching': True,
3373 }, {
3374 'url': 'https://www.youtube.com/feed/subscriptions',
3375 'only_matching': True,
3376 }, {
3377 'url': 'https://www.youtube.com/feed/watch_later',
3378 'only_matching': True,
3379 }, {
3380 'note': 'Recommended - redirects to home page',
3381 'url': 'https://www.youtube.com/feed/recommended',
3382 'only_matching': True,
3383 }, {
3384 'note': 'inline playlist with not always working continuations',
3385 'url': 'https://www.youtube.com/watch?v=UC6u0Tct-Fo&list=PL36D642111D65BE7C',
3386 'only_matching': True,
3387 }, {
3388 'url': 'https://www.youtube.com/course?list=ECUl4u3cNGP61MdtwGTqZA0MreSaDybji8',
3389 'only_matching': True,
3390 }, {
3391 'url': 'https://www.youtube.com/course',
3392 'only_matching': True,
3393 }, {
3394 'url': 'https://www.youtube.com/zsecurity',
3395 'only_matching': True,
3396 }, {
3397 'url': 'http://www.youtube.com/NASAgovVideo/videos',
3398 'only_matching': True,
3399 }, {
3400 'url': 'https://www.youtube.com/TheYoungTurks/live',
3401 'only_matching': True,
3402 }, {
3403 'url': 'https://www.youtube.com/hashtag/cctv9',
3404 'info_dict': {
3405 'id': 'cctv9',
3406 'title': '#cctv9',
3407 },
3408 'playlist_mincount': 350,
3409 }, {
3410 'url': 'https://www.youtube.com/watch?list=PLW4dVinRY435CBE_JD3t-0SRXKfnZHS1P&feature=youtu.be&v=M9cJMXmQ_ZU',
3411 'only_matching': True,
3412 }, {
3413 'note': 'Requires Premium: should request additional YTM-info webpage (and have format 141) for videos in playlist',
3414 'url': 'https://music.youtube.com/playlist?list=PLRBp0Fe2GpgmgoscNFLxNyBVSFVdYmFkq',
3415 'only_matching': True
3416 }, {
3417 'note': '/browse/ should redirect to /channel/',
3418 'url': 'https://music.youtube.com/browse/UC1a8OFewdjuLq6KlF8M_8Ng',
3419 'only_matching': True
3420 }, {
3421 'note': 'VLPL, should redirect to playlist?list=PL...',
3422 'url': 'https://music.youtube.com/browse/VLPLRBp0Fe2GpgmgoscNFLxNyBVSFVdYmFkq',
3423 'info_dict': {
3424 'id': 'PLRBp0Fe2GpgmgoscNFLxNyBVSFVdYmFkq',
3425 'uploader': 'NoCopyrightSounds',
3426 'description': 'Providing you with copyright free / safe music for gaming, live streaming, studying and more!',
3427 'uploader_id': 'UC_aEa8K-EOJ3D6gOs7HcyNg',
3428 'title': 'NCS Releases',
3429 },
3430 'playlist_mincount': 166,
3431 }, {
3432 'note': 'Topic, should redirect to playlist?list=UU...',
3433 'url': 'https://music.youtube.com/browse/UC9ALqqC4aIeG5iDs7i90Bfw',
3434 'info_dict': {
3435 'id': 'UU9ALqqC4aIeG5iDs7i90Bfw',
3436 'uploader_id': 'UC9ALqqC4aIeG5iDs7i90Bfw',
3437 'title': 'Uploads from Royalty Free Music - Topic',
3438 'uploader': 'Royalty Free Music - Topic',
3439 },
3440 'expected_warnings': [
3441 'A channel/user page was given',
3442 'The URL does not have a videos tab',
3443 ],
3444 'playlist_mincount': 101,
3445 }, {
3446 'note': 'Topic without a UU playlist',
3447 'url': 'https://www.youtube.com/channel/UCtFRv9O2AHqOZjjynzrv-xg',
3448 'info_dict': {
3449 'id': 'UCtFRv9O2AHqOZjjynzrv-xg',
3450 'title': 'UCtFRv9O2AHqOZjjynzrv-xg',
3451 },
3452 'expected_warnings': [
3453 'A channel/user page was given',
3454 'The URL does not have a videos tab',
3455 'Falling back to channel URL',
3456 ],
3457 'playlist_mincount': 9,
3458 }, {
3459 'note': 'Youtube music Album',
3460 'url': 'https://music.youtube.com/browse/MPREb_gTAcphH99wE',
3461 'info_dict': {
3462 'id': 'OLAK5uy_l1m0thk3g31NmIIz_vMIbWtyv7eZixlH0',
3463 'title': 'Album - Royalty Free Music Library V2 (50 Songs)',
3464 },
3465 'playlist_count': 50,
3466 }, {
3467 'note': 'unlisted single video playlist',
3468 'url': 'https://www.youtube.com/playlist?list=PLwL24UFy54GrB3s2KMMfjZscDi1x5Dajf',
3469 'info_dict': {
3470 'uploader_id': 'UC9zHu_mHU96r19o-wV5Qs1Q',
3471 'uploader': 'colethedj',
3472 'id': 'PLwL24UFy54GrB3s2KMMfjZscDi1x5Dajf',
3473 'title': 'yt-dlp unlisted playlist test',
3474 'availability': 'unlisted'
3475 },
3476 'playlist_count': 1,
3477 }]
3478
3479 @classmethod
3480 def suitable(cls, url):
3481 return False if YoutubeIE.suitable(url) else super(
3482 YoutubeTabIE, cls).suitable(url)
3483
3484 def _extract_channel_id(self, webpage):
3485 channel_id = self._html_search_meta(
3486 'channelId', webpage, 'channel id', default=None)
3487 if channel_id:
3488 return channel_id
3489 channel_url = self._html_search_meta(
3490 ('og:url', 'al:ios:url', 'al:android:url', 'al:web:url',
3491 'twitter:url', 'twitter:app:url:iphone', 'twitter:app:url:ipad',
3492 'twitter:app:url:googleplay'), webpage, 'channel url')
3493 return self._search_regex(
3494 r'https?://(?:www\.)?youtube\.com/channel/([^/?#&])+',
3495 channel_url, 'channel id')
3496
3497 @staticmethod
3498 def _extract_basic_item_renderer(item):
3499 # Modified from _extract_grid_item_renderer
3500 known_basic_renderers = (
3501 'playlistRenderer', 'videoRenderer', 'channelRenderer', 'showRenderer'
3502 )
3503 for key, renderer in item.items():
3504 if not isinstance(renderer, dict):
3505 continue
3506 elif key in known_basic_renderers:
3507 return renderer
3508 elif key.startswith('grid') and key.endswith('Renderer'):
3509 return renderer
3510
    def _grid_entries(self, grid_renderer):
        """Yield entries for each item in a grid renderer.

        Handles playlist, video and channel items, plus a generic fallback
        that resolves the item's navigation endpoint URL against the known
        YouTube extractors.
        """
        for item in grid_renderer['items']:
            if not isinstance(item, dict):
                continue
            renderer = self._extract_basic_item_renderer(item)
            if not isinstance(renderer, dict):
                continue
            # Title may live in either runs[0].text or simpleText
            title = try_get(
                renderer, (lambda x: x['title']['runs'][0]['text'],
                           lambda x: x['title']['simpleText']), compat_str)
            # playlist
            playlist_id = renderer.get('playlistId')
            if playlist_id:
                yield self.url_result(
                    'https://www.youtube.com/playlist?list=%s' % playlist_id,
                    ie=YoutubeTabIE.ie_key(), video_id=playlist_id,
                    video_title=title)
                continue
            # video
            video_id = renderer.get('videoId')
            if video_id:
                yield self._extract_video(renderer)
                continue
            # channel
            channel_id = renderer.get('channelId')
            if channel_id:
                # Channel items carry their title in simpleText only
                title = try_get(
                    renderer, lambda x: x['title']['simpleText'], compat_str)
                yield self.url_result(
                    'https://www.youtube.com/channel/%s' % channel_id,
                    ie=YoutubeTabIE.ie_key(), video_title=title)
                continue
            # generic endpoint URL support
            ep_url = urljoin('https://www.youtube.com/', try_get(
                renderer, lambda x: x['navigationEndpoint']['commandMetadata']['webCommandMetadata']['url'],
                compat_str))
            if ep_url:
                # First suitable extractor wins; the title extracted above is reused
                for ie in (YoutubeTabIE, YoutubePlaylistIE, YoutubeIE):
                    if ie.suitable(ep_url):
                        yield self.url_result(
                            ep_url, ie=ie.ie_key(), video_id=ie._match_id(ep_url), video_title=title)
                        break
3553
3554 def _shelf_entries_from_content(self, shelf_renderer):
3555 content = shelf_renderer.get('content')
3556 if not isinstance(content, dict):
3557 return
3558 renderer = content.get('gridRenderer') or content.get('expandedShelfContentsRenderer')
3559 if renderer:
3560 # TODO: add support for nested playlists so each shelf is processed
3561 # as separate playlist
3562 # TODO: this includes only first N items
3563 for entry in self._grid_entries(renderer):
3564 yield entry
3565 renderer = content.get('horizontalListRenderer')
3566 if renderer:
3567 # TODO
3568 pass
3569
3570 def _shelf_entries(self, shelf_renderer, skip_channels=False):
3571 ep = try_get(
3572 shelf_renderer, lambda x: x['endpoint']['commandMetadata']['webCommandMetadata']['url'],
3573 compat_str)
3574 shelf_url = urljoin('https://www.youtube.com', ep)
3575 if shelf_url:
3576 # Skipping links to another channels, note that checking for
3577 # endpoint.commandMetadata.webCommandMetadata.webPageTypwebPageType == WEB_PAGE_TYPE_CHANNEL
3578 # will not work
3579 if skip_channels and '/channels?' in shelf_url:
3580 return
3581 title = try_get(
3582 shelf_renderer, lambda x: x['title']['runs'][0]['text'], compat_str)
3583 yield self.url_result(shelf_url, video_title=title)
3584 # Shelf may not contain shelf URL, fallback to extraction from content
3585 for entry in self._shelf_entries_from_content(shelf_renderer):
3586 yield entry
3587
3588 def _playlist_entries(self, video_list_renderer):
3589 for content in video_list_renderer['contents']:
3590 if not isinstance(content, dict):
3591 continue
3592 renderer = content.get('playlistVideoRenderer') or content.get('playlistPanelVideoRenderer')
3593 if not isinstance(renderer, dict):
3594 continue
3595 video_id = renderer.get('videoId')
3596 if not video_id:
3597 continue
3598 yield self._extract_video(renderer)
3599
3600 def _rich_entries(self, rich_grid_renderer):
3601 renderer = try_get(
3602 rich_grid_renderer, lambda x: x['content']['videoRenderer'], dict) or {}
3603 video_id = renderer.get('videoId')
3604 if not video_id:
3605 return
3606 yield self._extract_video(renderer)
3607
3608 def _video_entry(self, video_renderer):
3609 video_id = video_renderer.get('videoId')
3610 if video_id:
3611 return self._extract_video(video_renderer)
3612
    def _post_thread_entries(self, post_thread_renderer):
        """Yield entries for a community post: its video/playlist attachment
        and any YouTube video links found in the post text."""
        post_renderer = try_get(
            post_thread_renderer, lambda x: x['post']['backstagePostRenderer'], dict)
        if not post_renderer:
            return
        # video attachment
        video_renderer = try_get(
            post_renderer, lambda x: x['backstageAttachment']['videoRenderer'], dict) or {}
        video_id = video_renderer.get('videoId')
        if video_id:
            entry = self._extract_video(video_renderer)
            if entry:
                yield entry
        # playlist attachment
        playlist_id = try_get(
            post_renderer, lambda x: x['backstageAttachment']['playlistRenderer']['playlistId'], compat_str)
        if playlist_id:
            yield self.url_result(
                'https://www.youtube.com/playlist?list=%s' % playlist_id,
                ie=YoutubeTabIE.ie_key(), video_id=playlist_id)
        # inline video links
        runs = try_get(post_renderer, lambda x: x['contentText']['runs'], list) or []
        for run in runs:
            if not isinstance(run, dict):
                continue
            ep_url = try_get(
                run, lambda x: x['navigationEndpoint']['urlEndpoint']['url'], compat_str)
            if not ep_url:
                continue
            if not YoutubeIE.suitable(ep_url):
                continue
            ep_video_id = YoutubeIE._match_id(ep_url)
            # Skip a link that duplicates the attached video extracted above
            if video_id == ep_video_id:
                continue
            yield self.url_result(ep_url, ie=YoutubeIE.ie_key(), video_id=ep_video_id)
3648
3649 def _post_thread_continuation_entries(self, post_thread_continuation):
3650 contents = post_thread_continuation.get('contents')
3651 if not isinstance(contents, list):
3652 return
3653 for content in contents:
3654 renderer = content.get('backstagePostThreadRenderer')
3655 if not isinstance(renderer, dict):
3656 continue
3657 for entry in self._post_thread_entries(renderer):
3658 yield entry
3659
3660 r''' # unused
3661 def _rich_grid_entries(self, contents):
3662 for content in contents:
3663 video_renderer = try_get(content, lambda x: x['richItemRenderer']['content']['videoRenderer'], dict)
3664 if video_renderer:
3665 entry = self._video_entry(video_renderer)
3666 if entry:
3667 yield entry
3668 '''
    def _entries(self, tab, item_id, identity_token, account_syncid, ytcfg):
        """Yield all entries of a tab, following API continuations until exhausted.

        identity_token/account_syncid/ytcfg are forwarded to the innertube
        API requests used for continuation pages.
        """

        def extract_entries(parent_renderer):  # this needs to called again for continuation to work with feeds
            contents = try_get(parent_renderer, lambda x: x['contents'], list) or []
            for content in contents:
                if not isinstance(content, dict):
                    continue
                is_renderer = try_get(content, lambda x: x['itemSectionRenderer'], dict)
                if not is_renderer:
                    # Not an item section: may still be a rich grid item
                    renderer = content.get('richItemRenderer')
                    if renderer:
                        for entry in self._rich_entries(renderer):
                            yield entry
                        continuation_list[0] = self._extract_continuation(parent_renderer)
                    continue
                isr_contents = try_get(is_renderer, lambda x: x['contents'], list) or []
                for isr_content in isr_contents:
                    if not isinstance(isr_content, dict):
                        continue

                    # Dispatch table: renderer key -> entry generator
                    known_renderers = {
                        'playlistVideoListRenderer': self._playlist_entries,
                        'gridRenderer': self._grid_entries,
                        'shelfRenderer': lambda x: self._shelf_entries(x, tab.get('title') != 'Channels'),
                        'backstagePostThreadRenderer': self._post_thread_entries,
                        'videoRenderer': lambda x: [self._video_entry(x)],
                    }
                    for key, renderer in isr_content.items():
                        if key not in known_renderers:
                            continue
                        for entry in known_renderers[key](renderer):
                            if entry:
                                yield entry
                        continuation_list[0] = self._extract_continuation(renderer)
                        break

                    if not continuation_list[0]:
                        continuation_list[0] = self._extract_continuation(is_renderer)

                if not continuation_list[0]:
                    continuation_list[0] = self._extract_continuation(parent_renderer)

        # Single-element list so extract_entries can write the continuation
        continuation_list = [None]  # Python 2 does not support nonlocal
        tab_content = try_get(tab, lambda x: x['content'], dict)
        if not tab_content:
            return
        parent_renderer = (
            try_get(tab_content, lambda x: x['sectionListRenderer'], dict)
            or try_get(tab_content, lambda x: x['richGridRenderer'], dict) or {})
        for entry in extract_entries(parent_renderer):
            yield entry
        continuation = continuation_list[0]
        context = self._extract_context(ytcfg)
        visitor_data = try_get(context, lambda x: x['client']['visitorData'], compat_str)

        for page_num in itertools.count(1):
            if not continuation:
                break
            query = {
                'continuation': continuation['continuation'],
                'clickTracking': {'clickTrackingParams': continuation['itct']}
            }
            headers = self._generate_api_headers(ytcfg, identity_token, account_syncid, visitor_data)
            response = self._extract_response(
                item_id='%s page %s' % (item_id, page_num),
                query=query, headers=headers, ytcfg=ytcfg,
                check_get_keys=('continuationContents', 'onResponseReceivedActions', 'onResponseReceivedEndpoints'))

            if not response:
                break
            # Carry refreshed visitor data forward to subsequent requests
            visitor_data = try_get(
                response, lambda x: x['responseContext']['visitorData'], compat_str) or visitor_data

            # Old-style continuation responses ('continuationContents')
            known_continuation_renderers = {
                'playlistVideoListContinuation': self._playlist_entries,
                'gridContinuation': self._grid_entries,
                'itemSectionContinuation': self._post_thread_continuation_entries,
                'sectionListContinuation': extract_entries,  # for feeds
            }
            continuation_contents = try_get(
                response, lambda x: x['continuationContents'], dict) or {}
            continuation_renderer = None
            for key, value in continuation_contents.items():
                if key not in known_continuation_renderers:
                    continue
                continuation_renderer = value
                continuation_list = [None]
                for entry in known_continuation_renderers[key](continuation_renderer):
                    yield entry
                continuation = continuation_list[0] or self._extract_continuation(continuation_renderer)
                break
            if continuation_renderer:
                continue

            # New-style continuation responses ('onResponseReceived...'):
            # key -> (entry generator, wrapper key the generator expects)
            known_renderers = {
                'gridPlaylistRenderer': (self._grid_entries, 'items'),
                'gridVideoRenderer': (self._grid_entries, 'items'),
                'playlistVideoRenderer': (self._playlist_entries, 'contents'),
                'itemSectionRenderer': (extract_entries, 'contents'),  # for feeds
                'richItemRenderer': (extract_entries, 'contents'),  # for hashtag
                'backstagePostThreadRenderer': (self._post_thread_continuation_entries, 'contents')
            }
            on_response_received = dict_get(response, ('onResponseReceivedActions', 'onResponseReceivedEndpoints'))
            continuation_items = try_get(
                on_response_received, lambda x: x[0]['appendContinuationItemsAction']['continuationItems'], list)
            # The first item decides which generator handles the whole list
            continuation_item = try_get(continuation_items, lambda x: x[0], dict) or {}
            video_items_renderer = None
            for key, value in continuation_item.items():
                if key not in known_renderers:
                    continue
                # Re-wrap the items so the generator sees its expected shape
                video_items_renderer = {known_renderers[key][1]: continuation_items}
                continuation_list = [None]
                for entry in known_renderers[key][0](video_items_renderer):
                    yield entry
                continuation = continuation_list[0] or self._extract_continuation(video_items_renderer)
                break
            if video_items_renderer:
                continue
            break
3788
3789 @staticmethod
3790 def _extract_selected_tab(tabs):
3791 for tab in tabs:
3792 renderer = dict_get(tab, ('tabRenderer', 'expandableTabRenderer')) or {}
3793 if renderer.get('selected') is True:
3794 return renderer
3795 else:
3796 raise ExtractorError('Unable to find selected tab')
3797
3798 @classmethod
3799 def _extract_uploader(cls, data):
3800 uploader = {}
3801 renderer = cls._extract_sidebar_info_renderer(data, 'playlistSidebarSecondaryInfoRenderer') or {}
3802 owner = try_get(
3803 renderer, lambda x: x['videoOwner']['videoOwnerRenderer']['title']['runs'][0], dict)
3804 if owner:
3805 uploader['uploader'] = owner.get('text')
3806 uploader['uploader_id'] = try_get(
3807 owner, lambda x: x['navigationEndpoint']['browseEndpoint']['browseId'], compat_str)
3808 uploader['uploader_url'] = urljoin(
3809 'https://www.youtube.com/',
3810 try_get(owner, lambda x: x['navigationEndpoint']['browseEndpoint']['canonicalBaseUrl'], compat_str))
3811 return {k: v for k, v in uploader.items() if v is not None}
3812
    def _extract_from_tabs(self, item_id, webpage, data, tabs):
        """Build the playlist result (metadata + entries) for a tabbed page
        (channel, playlist or hashtag).

        @param item_id: id of the channel/playlist being extracted
        @param webpage: downloaded HTML (used for identity token and ytcfg)
        @param data: parsed ytInitialData
        @param tabs: tab renderer list; the selected tab provides the entries
        """
        playlist_id = title = description = channel_url = channel_name = channel_id = None
        # NOTE(review): both names alias one shared list here; harmless since
        # each is only ever rebound (never mutated) below
        thumbnails_list = tags = []

        selected_tab = self._extract_selected_tab(tabs)
        renderer = try_get(
            data, lambda x: x['metadata']['channelMetadataRenderer'], dict)
        if renderer:
            channel_name = renderer.get('title')
            channel_url = renderer.get('channelUrl')
            channel_id = renderer.get('externalId')
        else:
            # Not a channel page: fall back to playlist metadata
            renderer = try_get(
                data, lambda x: x['metadata']['playlistMetadataRenderer'], dict)

        if renderer:
            title = renderer.get('title')
            description = renderer.get('description', '')
            playlist_id = channel_id
            tags = renderer.get('keywords', '').split()
            thumbnails_list = (
                try_get(renderer, lambda x: x['avatar']['thumbnails'], list)
                or try_get(
                    self._extract_sidebar_info_renderer(data, 'playlistSidebarPrimaryInfoRenderer'),
                    lambda x: x['thumbnailRenderer']['playlistVideoThumbnailRenderer']['thumbnail']['thumbnails'],
                    list)
                or [])

        thumbnails = []
        for t in thumbnails_list:
            if not isinstance(t, dict):
                continue
            thumbnail_url = url_or_none(t.get('url'))
            if not thumbnail_url:
                continue
            thumbnails.append({
                'url': thumbnail_url,
                'width': int_or_none(t.get('width')),
                'height': int_or_none(t.get('height')),
            })
        if playlist_id is None:
            playlist_id = item_id
        if title is None:
            # Hashtag pages carry their title in the header, not in metadata
            title = (
                try_get(data, lambda x: x['header']['hashtagHeaderRenderer']['hashtag']['simpleText'])
                or playlist_id)
        # Append tab name/expanded text, e.g. "Channel - Videos"
        title += format_field(selected_tab, 'title', ' - %s')
        title += format_field(selected_tab, 'expandedText', ' - %s')
        metadata = {
            'playlist_id': playlist_id,
            'playlist_title': title,
            'playlist_description': description,
            'uploader': channel_name,
            'uploader_id': channel_id,
            'uploader_url': channel_url,
            'thumbnails': thumbnails,
            'tags': tags,
        }
        availability = self._extract_availability(data)
        if availability:
            metadata['availability'] = availability
        if not channel_id:
            # No channel metadata: fall back to sidebar owner information
            metadata.update(self._extract_uploader(data))
        metadata.update({
            'channel': metadata['uploader'],
            'channel_id': metadata['uploader_id'],
            'channel_url': metadata['uploader_url']})
        return self.playlist_result(
            self._entries(
                selected_tab, playlist_id,
                self._extract_identity_token(webpage, item_id),
                self._extract_account_syncid(data),
                self._extract_ytcfg(item_id, webpage)),
            **metadata)
3887
    def _extract_mix_playlist(self, playlist, playlist_id, data, webpage):
        """Yield the videos of an 'infinite' Mix playlist, page by page.

        Stops when a page yields no new videos or when the first video
        re-appears (Mixes loop endlessly).
        """
        first_id = last_id = None
        ytcfg = self._extract_ytcfg(playlist_id, webpage)
        headers = self._generate_api_headers(
            ytcfg, account_syncid=self._extract_account_syncid(data),
            identity_token=self._extract_identity_token(webpage, item_id=playlist_id),
            visitor_data=try_get(self._extract_context(ytcfg), lambda x: x['client']['visitorData'], compat_str))
        for page_num in itertools.count(1):
            videos = list(self._playlist_entries(playlist))
            if not videos:
                return
            # Resume right after the last video yielded on the previous page
            # (pages overlap); -1 + 1 == 0 when last_id is not found
            start = next((i for i, v in enumerate(videos) if v['id'] == last_id), -1) + 1
            if start >= len(videos):
                return
            for video in videos[start:]:
                if video['id'] == first_id:
                    self.to_screen('First video %s found again; Assuming end of Mix' % first_id)
                    return
                yield video
            first_id = first_id or videos[0]['id']
            last_id = videos[-1]['id']
            # NOTE(review): assumes the last panel video always carries a
            # watchEndpoint -- .get below would raise if try_get returns None
            watch_endpoint = try_get(
                playlist, lambda x: x['contents'][-1]['playlistPanelVideoRenderer']['navigationEndpoint']['watchEndpoint'])
            query = {
                'playlistId': playlist_id,
                'videoId': watch_endpoint.get('videoId') or last_id,
                'index': watch_endpoint.get('index') or len(videos),
                'params': watch_endpoint.get('params') or 'OAE%3D'
            }
            response = self._extract_response(
                item_id='%s page %d' % (playlist_id, page_num),
                query=query,
                ep='next',
                headers=headers,
                check_get_keys='contents'
            )
            playlist = try_get(
                response, lambda x: x['contents']['twoColumnWatchNextResults']['playlist']['playlist'], dict)
3926
3927 def _extract_from_playlist(self, item_id, url, data, playlist, webpage):
3928 title = playlist.get('title') or try_get(
3929 data, lambda x: x['titleText']['simpleText'], compat_str)
3930 playlist_id = playlist.get('playlistId') or item_id
3931
3932 # Delegating everything except mix playlists to regular tab-based playlist URL
3933 playlist_url = urljoin(url, try_get(
3934 playlist, lambda x: x['endpoint']['commandMetadata']['webCommandMetadata']['url'],
3935 compat_str))
3936 if playlist_url and playlist_url != url:
3937 return self.url_result(
3938 playlist_url, ie=YoutubeTabIE.ie_key(), video_id=playlist_id,
3939 video_title=title)
3940
3941 return self.playlist_result(
3942 self._extract_mix_playlist(playlist, playlist_id, data, webpage),
3943 playlist_id=playlist_id, playlist_title=title)
3944
    def _extract_availability(self, data):
        """
        Gets the availability of a given playlist/tab.
        Note: Unless YouTube tells us explicitly, we do not assume it is public
        @param data: response
        """
        is_private = is_unlisted = None
        renderer = self._extract_sidebar_info_renderer(data, 'playlistSidebarPrimaryInfoRenderer') or {}
        badge_labels = self._extract_badges(renderer)

        # Personal playlists, when authenticated, have a dropdown visibility selector instead of a badge
        privacy_dropdown_entries = try_get(
            renderer, lambda x: x['privacyForm']['dropdownFormFieldRenderer']['dropdown']['dropdownRenderer']['entries'], list) or []
        for renderer_dict in privacy_dropdown_entries:
            is_selected = try_get(
                renderer_dict, lambda x: x['privacyDropdownItemRenderer']['isSelected'], bool) or False
            if not is_selected:
                continue
            # Treat the selected dropdown label exactly like a badge label
            label = self._join_text_entries(
                try_get(renderer_dict, lambda x: x['privacyDropdownItemRenderer']['label']['runs'], list) or [])
            if label:
                badge_labels.add(label.lower())
            break

        for badge_label in badge_labels:
            if badge_label == 'unlisted':
                is_unlisted = True
            elif badge_label == 'private':
                is_private = True
            elif badge_label == 'public':
                is_unlisted = is_private = False
        return self._availability(is_private, False, False, False, is_unlisted)
3977
3978 @staticmethod
3979 def _extract_sidebar_info_renderer(data, info_renderer, expected_type=dict):
3980 sidebar_renderer = try_get(
3981 data, lambda x: x['sidebar']['playlistSidebarRenderer']['items'], list) or []
3982 for item in sidebar_renderer:
3983 renderer = try_get(item, lambda x: x[info_renderer], expected_type)
3984 if renderer:
3985 return renderer
3986
    def _reload_with_unavailable_videos(self, item_id, data, webpage):
        """
        Get playlist with unavailable videos if the 'show unavailable videos' button exists.
        """
        browse_id = params = None
        renderer = self._extract_sidebar_info_renderer(data, 'playlistSidebarPrimaryInfoRenderer')
        if not renderer:
            return
        menu_renderer = try_get(
            renderer, lambda x: x['menu']['menuRenderer']['items'], list) or []
        for menu_item in menu_renderer:
            if not isinstance(menu_item, dict):
                continue
            nav_item_renderer = menu_item.get('menuNavigationItemRenderer')
            text = try_get(
                nav_item_renderer, lambda x: x['text']['simpleText'], compat_str)
            # The button is matched by its (English) label text
            if not text or text.lower() != 'show unavailable videos':
                continue
            browse_endpoint = try_get(
                nav_item_renderer, lambda x: x['navigationEndpoint']['browseEndpoint'], dict) or {}
            browse_id = browse_endpoint.get('browseId')
            params = browse_endpoint.get('params')
            break

        ytcfg = self._extract_ytcfg(item_id, webpage)
        # NOTE(review): account_syncid is derived from ytcfg here, whereas
        # other call sites pass the ytInitialData -- confirm intentional
        headers = self._generate_api_headers(
            ytcfg, account_syncid=self._extract_account_syncid(ytcfg),
            identity_token=self._extract_identity_token(webpage, item_id=item_id),
            visitor_data=try_get(
                self._extract_context(ytcfg), lambda x: x['client']['visitorData'], compat_str))
        # Defaults used when the button endpoint did not supply them
        query = {
            'params': params or 'wgYCCAA=',
            'browseId': browse_id or 'VL%s' % item_id
        }
        return self._extract_response(
            item_id=item_id, headers=headers, query=query,
            check_get_keys='contents', fatal=False,
            note='Downloading API JSON with unavailable videos')
4025
4026 def _extract_webpage(self, url, item_id):
4027 retries = self.get_param('extractor_retries', 3)
4028 count = -1
4029 last_error = 'Incomplete yt initial data recieved'
4030 while count < retries:
4031 count += 1
4032 # Sometimes youtube returns a webpage with incomplete ytInitialData
4033 # See: https://github.com/yt-dlp/yt-dlp/issues/116
4034 if count:
4035 self.report_warning('%s. Retrying ...' % last_error)
4036 webpage = self._download_webpage(
4037 url, item_id,
4038 'Downloading webpage%s' % (' (retry #%d)' % count if count else ''))
4039 data = self._extract_yt_initial_data(item_id, webpage)
4040 if data.get('contents') or data.get('currentVideoEndpoint'):
4041 break
4042 # Extract alerts here only when there is error
4043 self._extract_and_report_alerts(data)
4044 if count >= retries:
4045 raise ExtractorError(last_error)
4046 return webpage, data
4047
4048 @staticmethod
4049 def _smuggle_data(entries, data):
4050 for entry in entries:
4051 if data:
4052 entry['url'] = smuggle_url(entry['url'], data)
4053 yield entry
4054
4055 def _real_extract(self, url):
4056 url, smuggled_data = unsmuggle_url(url, {})
4057 if self.is_music_url(url):
4058 smuggled_data['is_music_url'] = True
4059 info_dict = self.__real_extract(url, smuggled_data)
4060 if info_dict.get('entries'):
4061 info_dict['entries'] = self._smuggle_data(info_dict['entries'], smuggled_data)
4062 return info_dict
4063
    # Splits a tab URL into (pre)(tab)(post); the optional /<tab> group is
    # only attempted when the 'channel_type' group matched (conditional pattern)
    _url_re = re.compile(r'(?P<pre>%s)(?(channel_type)(?P<tab>/\w+))?(?P<post>.*)$' % _VALID_URL)
4065
4066 def __real_extract(self, url, smuggled_data):
4067 item_id = self._match_id(url)
4068 url = compat_urlparse.urlunparse(
4069 compat_urlparse.urlparse(url)._replace(netloc='www.youtube.com'))
4070 compat_opts = self.get_param('compat_opts', [])
4071
4072 def get_mobj(url):
4073 mobj = self._url_re.match(url).groupdict()
4074 mobj.update((k, '') for k, v in mobj.items() if v is None)
4075 return mobj
4076
4077 mobj = get_mobj(url)
4078 # Youtube returns incomplete data if tabname is not lower case
4079 pre, tab, post, is_channel = mobj['pre'], mobj['tab'].lower(), mobj['post'], not mobj['not_channel']
4080
4081 if is_channel:
4082 if smuggled_data.get('is_music_url'):
4083 if item_id[:2] == 'VL':
4084 # Youtube music VL channels have an equivalent playlist
4085 item_id = item_id[2:]
4086 pre, tab, post, is_channel = 'https://www.youtube.com/playlist?list=%s' % item_id, '', '', False
4087 elif item_id[:2] == 'MP':
4088 # Youtube music albums (/channel/MP...) have a OLAK playlist that can be extracted from the webpage
4089 item_id = self._search_regex(
4090 r'\\x22audioPlaylistId\\x22:\\x22([0-9A-Za-z_-]+)\\x22',
4091 self._download_webpage('https://music.youtube.com/channel/%s' % item_id, item_id),
4092 'playlist id')
4093 pre, tab, post, is_channel = 'https://www.youtube.com/playlist?list=%s' % item_id, '', '', False
4094 elif mobj['channel_type'] == 'browse':
4095 # Youtube music /browse/ should be changed to /channel/
4096 pre = 'https://www.youtube.com/channel/%s' % item_id
4097 if is_channel and not tab and 'no-youtube-channel-redirect' not in compat_opts:
4098 # Home URLs should redirect to /videos/
4099 self.report_warning(
4100 'A channel/user page was given. All the channel\'s videos will be downloaded. '
4101 'To download only the videos in the home page, add a "/featured" to the URL')
4102 tab = '/videos'
4103
4104 url = ''.join((pre, tab, post))
4105 mobj = get_mobj(url)
4106
4107 # Handle both video/playlist URLs
4108 qs = parse_qs(url)
4109 video_id = qs.get('v', [None])[0]
4110 playlist_id = qs.get('list', [None])[0]
4111
4112 if not video_id and mobj['not_channel'].startswith('watch'):
4113 if not playlist_id:
4114 # If there is neither video or playlist ids, youtube redirects to home page, which is undesirable
4115 raise ExtractorError('Unable to recognize tab page')
4116 # Common mistake: https://www.youtube.com/watch?list=playlist_id
4117 self.report_warning('A video URL was given without video ID. Trying to download playlist %s' % playlist_id)
4118 url = 'https://www.youtube.com/playlist?list=%s' % playlist_id
4119 mobj = get_mobj(url)
4120
4121 if video_id and playlist_id:
4122 if self.get_param('noplaylist'):
4123 self.to_screen('Downloading just video %s because of --no-playlist' % video_id)
4124 return self.url_result(video_id, ie=YoutubeIE.ie_key(), video_id=video_id)
4125 self.to_screen('Downloading playlist %s; add --no-playlist to just download video %s' % (playlist_id, video_id))
4126
4127 webpage, data = self._extract_webpage(url, item_id)
4128
4129 tabs = try_get(
4130 data, lambda x: x['contents']['twoColumnBrowseResultsRenderer']['tabs'], list)
4131 if tabs:
4132 selected_tab = self._extract_selected_tab(tabs)
4133 tab_name = selected_tab.get('title', '')
4134 if 'no-youtube-channel-redirect' not in compat_opts:
4135 if mobj['tab'] == '/live':
4136 # Live tab should have redirected to the video
4137 raise ExtractorError('The channel is not currently live', expected=True)
4138 if mobj['tab'] == '/videos' and tab_name.lower() != mobj['tab'][1:]:
4139 if not mobj['not_channel'] and item_id[:2] == 'UC':
4140 # Topic channels don't have /videos. Use the equivalent playlist instead
4141 self.report_warning('The URL does not have a %s tab. Trying to redirect to playlist UU%s instead' % (mobj['tab'][1:], item_id[2:]))
4142 pl_id = 'UU%s' % item_id[2:]
4143 pl_url = 'https://www.youtube.com/playlist?list=%s%s' % (pl_id, mobj['post'])
4144 try:
4145 pl_webpage, pl_data = self._extract_webpage(pl_url, pl_id)
4146 for alert_type, alert_message in self._extract_alerts(pl_data):
4147 if alert_type == 'error':
4148 raise ExtractorError('Youtube said: %s' % alert_message)
4149 item_id, url, webpage, data = pl_id, pl_url, pl_webpage, pl_data
4150 except ExtractorError:
4151 self.report_warning('The playlist gave error. Falling back to channel URL')
4152 else:
4153 self.report_warning('The URL does not have a %s tab. %s is being downloaded instead' % (mobj['tab'][1:], tab_name))
4154
4155 self.write_debug('Final URL: %s' % url)
4156
4157 # YouTube sometimes provides a button to reload playlist with unavailable videos.
4158 if 'no-youtube-unavailable-videos' not in compat_opts:
4159 data = self._reload_with_unavailable_videos(item_id, data, webpage) or data
4160 self._extract_and_report_alerts(data)
4161 tabs = try_get(
4162 data, lambda x: x['contents']['twoColumnBrowseResultsRenderer']['tabs'], list)
4163 if tabs:
4164 return self._extract_from_tabs(item_id, webpage, data, tabs)
4165
4166 playlist = try_get(
4167 data, lambda x: x['contents']['twoColumnWatchNextResults']['playlist']['playlist'], dict)
4168 if playlist:
4169 return self._extract_from_playlist(item_id, url, data, playlist, webpage)
4170
4171 video_id = try_get(
4172 data, lambda x: x['currentVideoEndpoint']['watchEndpoint']['videoId'],
4173 compat_str) or video_id
4174 if video_id:
4175 if mobj['tab'] != '/live': # live tab is expected to redirect to video
4176 self.report_warning('Unable to recognize playlist. Downloading just video %s' % video_id)
4177 return self.url_result(video_id, ie=YoutubeIE.ie_key(), video_id=video_id)
4178
4179 raise ExtractorError('Unable to recognize tab page')
4180
4181
class YoutubePlaylistIE(InfoExtractor):
    IE_DESC = 'YouTube.com playlists'
    _VALID_URL = r'''(?x)(?:
                        (?:https?://)?
                        (?:\w+\.)?
                        (?:
                            (?:
                                youtube(?:kids)?\.com|
                                invidio\.us
                            )
                            /.*?\?.*?\blist=
                        )?
                        (?P<id>%(playlist_id)s)
                     )''' % {'playlist_id': YoutubeBaseInfoExtractor._PLAYLIST_ID_RE}
    IE_NAME = 'youtube:playlist'
    _TESTS = [{
        'note': 'issue #673',
        'url': 'PLBB231211A4F62143',
        'info_dict': {
            'title': '[OLD]Team Fortress 2 (Class-based LP)',
            'id': 'PLBB231211A4F62143',
            'uploader': 'Wickydoo',
            'uploader_id': 'UCKSpbfbl5kRQpTdL7kMc-1Q',
        },
        'playlist_mincount': 29,
    }, {
        'url': 'PLtPgu7CB4gbY9oDN3drwC3cMbJggS7dKl',
        'info_dict': {
            'title': 'YDL_safe_search',
            'id': 'PLtPgu7CB4gbY9oDN3drwC3cMbJggS7dKl',
        },
        'playlist_count': 2,
        'skip': 'This playlist is private',
    }, {
        'note': 'embedded',
        'url': 'https://www.youtube.com/embed/videoseries?list=PL6IaIsEjSbf96XFRuNccS_RuEXwNdsoEu',
        'playlist_count': 4,
        'info_dict': {
            'title': 'JODA15',
            'id': 'PL6IaIsEjSbf96XFRuNccS_RuEXwNdsoEu',
            'uploader': 'milan',
            'uploader_id': 'UCEI1-PVPcYXjB73Hfelbmaw',
        }
    }, {
        'url': 'http://www.youtube.com/embed/_xDOZElKyNU?list=PLsyOSbh5bs16vubvKePAQ1x3PhKavfBIl',
        'playlist_mincount': 982,
        'info_dict': {
            'title': '2018 Chinese New Singles (11/6 updated)',
            'id': 'PLsyOSbh5bs16vubvKePAQ1x3PhKavfBIl',
            'uploader': 'LBK',
            'uploader_id': 'UC21nz3_MesPLqtDqwdvnoxA',
        }
    }, {
        'url': 'TLGGrESM50VT6acwMjAyMjAxNw',
        'only_matching': True,
    }, {
        # music album playlist
        'url': 'OLAK5uy_m4xAFdmMC5rX3Ji3g93pQe3hqLZw_9LhM',
        'only_matching': True,
    }]

    @classmethod
    def suitable(cls, url):
        # Anything YoutubeTabIE claims is not for us
        if YoutubeTabIE.suitable(url):
            return False
        # Hack for lazy extractors until more generic solution is implemented
        # (see #28780)
        from .youtube import parse_qs
        # A URL that carries a video id is a watch URL, not a bare playlist
        if parse_qs(url).get('v', [None])[0]:
            return False
        return super(YoutubePlaylistIE, cls).suitable(url)

    def _real_extract(self, url):
        """Normalize to a canonical playlist URL and delegate to YoutubeTabIE."""
        playlist_id = self._match_id(url)
        canonical_url = update_url_query(
            'https://www.youtube.com/playlist',
            parse_qs(url) or {'list': playlist_id})
        # Preserve the music-domain hint for the tab extractor
        if YoutubeBaseInfoExtractor.is_music_url(url):
            canonical_url = smuggle_url(canonical_url, {'is_music_url': True})
        return self.url_result(
            canonical_url, ie=YoutubeTabIE.ie_key(), video_id=playlist_id)
4264
4265
class YoutubeYtBeIE(InfoExtractor):
    IE_DESC = 'youtu.be'
    _VALID_URL = r'https?://youtu\.be/(?P<id>[0-9A-Za-z_-]{11})/*?.*?\blist=(?P<playlist_id>%(playlist_id)s)' % {'playlist_id': YoutubeBaseInfoExtractor._PLAYLIST_ID_RE}
    _TESTS = [{
        'url': 'https://youtu.be/yeWKywCrFtk?list=PL2qgrgXsNUG5ig9cat4ohreBjYLAPC0J5',
        'info_dict': {
            'id': 'yeWKywCrFtk',
            'ext': 'mp4',
            'title': 'Small Scale Baler and Braiding Rugs',
            'uploader': 'Backus-Page House Museum',
            'uploader_id': 'backuspagemuseum',
            'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/backuspagemuseum',
            'upload_date': '20161008',
            'description': 'md5:800c0c78d5eb128500bffd4f0b4f2e8a',
            'categories': ['Nonprofits & Activism'],
            'tags': list,
            'like_count': int,
            'dislike_count': int,
        },
        'params': {
            'noplaylist': True,
            'skip_download': True,
        },
    }, {
        'url': 'https://youtu.be/uWyaPkt-VOI?list=PL9D9FC436B881BA21',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Expand a youtu.be short link into a full watch URL for YoutubeTabIE."""
        video_id, playlist_id = re.match(self._VALID_URL, url).group('id', 'playlist_id')
        watch_url = update_url_query('https://www.youtube.com/watch', {
            'v': video_id,
            'list': playlist_id,
            'feature': 'youtu.be',
        })
        return self.url_result(
            watch_url, ie=YoutubeTabIE.ie_key(), video_id=playlist_id)
4304
4305
class YoutubeYtUserIE(InfoExtractor):
    IE_DESC = 'YouTube.com user videos, URL or "ytuser" keyword'
    _VALID_URL = r'ytuser:(?P<id>.+)'
    _TESTS = [{
        'url': 'ytuser:phihag',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Turn a "ytuser:<name>" pseudo-URL into a channel-page URL for YoutubeTabIE."""
        user_id = self._match_id(url)
        user_url = 'https://www.youtube.com/user/%s' % user_id
        return self.url_result(user_url, ie=YoutubeTabIE.ie_key(), video_id=user_id)
4319
4320
class YoutubeFavouritesIE(YoutubeBaseInfoExtractor):
    IE_NAME = 'youtube:favorites'
    IE_DESC = 'YouTube.com liked videos, ":ytfav" for short (requires authentication)'
    _VALID_URL = r':ytfav(?:ou?rite)?s?'
    _LOGIN_REQUIRED = True
    _TESTS = [{
        'url': ':ytfav',
        'only_matching': True,
    }, {
        'url': ':ytfavorites',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Redirect ":ytfav" to the liked-videos playlist, handled by YoutubeTabIE."""
        liked_url = 'https://www.youtube.com/playlist?list=LL'
        return self.url_result(liked_url, ie=YoutubeTabIE.ie_key())
4338
4339
class YoutubeSearchIE(SearchInfoExtractor, YoutubeTabIE):
    IE_DESC = 'YouTube.com searches, "ytsearch" keyword'
    # there doesn't appear to be a real limit, for example if you search for
    # 'python' you get more than 8.000.000 results
    _MAX_RESULTS = float('inf')
    IE_NAME = 'youtube:search'
    _SEARCH_KEY = 'ytsearch'
    _SEARCH_PARAMS = None
    _TESTS = []

    def _entries(self, query, n):
        """Yield up to n video results for the given search query."""
        data = {'query': query}
        if self._SEARCH_PARAMS:
            data['params'] = self._SEARCH_PARAMS
        yielded = 0
        for page_num in itertools.count(1):
            search = self._extract_response(
                item_id='query "%s" page %s' % (query, page_num), ep='search', query=data,
                check_get_keys=('contents', 'onResponseReceivedCommands')
            )
            if not search:
                return
            # First page nests results differently from continuation pages
            slr_contents = try_get(
                search,
                (lambda x: x['contents']['twoColumnSearchResultsRenderer']['primaryContents']['sectionListRenderer']['contents'],
                 lambda x: x['onResponseReceivedCommands'][0]['appendContinuationItemsAction']['continuationItems']),
                list)
            if not slr_contents:
                return

            # Youtube sometimes adds promoted content to searches,
            # changing the index location of videos and token.
            # So we search through all entries till we find them.
            continuation_token = None
            for slr_content in slr_contents:
                if continuation_token is None:
                    continuation_token = try_get(
                        slr_content,
                        lambda x: x['continuationItemRenderer']['continuationEndpoint']['continuationCommand']['token'],
                        compat_str)

                isr_contents = try_get(
                    slr_content,
                    lambda x: x['itemSectionRenderer']['contents'],
                    list)
                for content in isr_contents or []:
                    # Skip anything that is not a proper video entry
                    video = content.get('videoRenderer') if isinstance(content, dict) else None
                    if not isinstance(video, dict) or not video.get('videoId'):
                        continue
                    yield self._extract_video(video)
                    yielded += 1
                    if yielded == n:
                        return

            if not continuation_token:
                return
            data['continuation'] = continuation_token

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""
        return self.playlist_result(self._entries(query, n), query)
4409
4410
class YoutubeSearchDateIE(YoutubeSearchIE):
    IE_NAME = YoutubeSearchIE.IE_NAME + ':date'
    _SEARCH_KEY = 'ytsearchdate'
    IE_DESC = 'YouTube.com searches, newest videos first, "ytsearchdate" keyword'
    # URL-encoded 'CAI=' — search-filter param sent with the query; per
    # IE_DESC this makes results come back sorted newest-first
    _SEARCH_PARAMS = 'CAI%3D'
4416
4417
class YoutubeSearchURLIE(YoutubeSearchIE):
    IE_DESC = 'YouTube.com search URLs'
    IE_NAME = YoutubeSearchIE.IE_NAME + '_url'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/results\?(.*?&)?(?:search_query|q)=(?:[^&]+)(?:[&]|$)'
    # _MAX_RESULTS = 100
    _TESTS = [{
        'url': 'https://www.youtube.com/results?baz=bar&search_query=youtube-dl+test+video&filters=video&lclk=video',
        'playlist_mincount': 5,
        'info_dict': {
            'title': 'youtube-dl test video',
        }
    }, {
        'url': 'https://www.youtube.com/results?q=test&sp=EgQIBBgB',
        'only_matching': True,
    }]

    @classmethod
    def _make_valid_url(cls):
        return cls._VALID_URL

    def _real_extract(self, url):
        """Run a search for the query carried in a /results URL."""
        # Use the module-level parse_qs helper (equivalent to
        # compat_parse_qs(compat_urllib_parse_urlparse(url).query)) for
        # consistency with the rest of this file
        qs = parse_qs(url)
        # _VALID_URL guarantees at least one of search_query/q is present
        query = (qs.get('search_query') or qs.get('q'))[0]
        # 'sp' carries the encoded search filters, if any
        self._SEARCH_PARAMS = qs.get('sp', ('',))[0]
        return self._get_n_results(query, self._MAX_RESULTS)
4443
4444
class YoutubeFeedsInfoExtractor(YoutubeTabIE):
    """
    Base class for feed extractors.
    Subclasses must define the _FEED_NAME property.
    """
    _LOGIN_REQUIRED = True
    _TESTS = []

    @property
    def IE_NAME(self):
        # Derive the extractor name from the feed it serves
        return 'youtube:%s' % self._FEED_NAME

    def _real_extract(self, url):
        feed_url = 'https://www.youtube.com/feed/%s' % self._FEED_NAME
        return self.url_result(feed_url, ie=YoutubeTabIE.ie_key())
4461
4462
class YoutubeWatchLaterIE(InfoExtractor):
    IE_NAME = 'youtube:watchlater'
    IE_DESC = 'Youtube watch later list, ":ytwatchlater" for short (requires authentication)'
    _VALID_URL = r':ytwatchlater'
    _TESTS = [{
        'url': ':ytwatchlater',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Redirect ":ytwatchlater" to the WL playlist, handled by YoutubeTabIE."""
        watchlater_url = 'https://www.youtube.com/playlist?list=WL'
        return self.url_result(watchlater_url, ie=YoutubeTabIE.ie_key())
4475
4476
class YoutubeRecommendedIE(YoutubeFeedsInfoExtractor):
    IE_DESC = 'YouTube.com recommended videos, ":ytrec" for short (requires authentication)'
    # Matches the bare youtube.com home page as well as the :ytrec keyword
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/?(?:[?#]|$)|:ytrec(?:ommended)?'
    _FEED_NAME = 'recommended'
    # Recommendations are served to anonymous users too, so override the
    # base class requirement
    _LOGIN_REQUIRED = False
    _TESTS = [{
        'url': ':ytrec',
        'only_matching': True,
    }, {
        'url': ':ytrecommended',
        'only_matching': True,
    }, {
        'url': 'https://youtube.com',
        'only_matching': True,
    }]
4492
4493
class YoutubeSubscriptionsIE(YoutubeFeedsInfoExtractor):
    IE_DESC = 'YouTube.com subscriptions feed, ":ytsubs" for short (requires authentication)'
    _VALID_URL = r':ytsub(?:scription)?s?'
    # Feed name used by the base class to build /feed/subscriptions
    _FEED_NAME = 'subscriptions'
    _TESTS = [{
        'url': ':ytsubs',
        'only_matching': True,
    }, {
        'url': ':ytsubscriptions',
        'only_matching': True,
    }]
4505
4506
class YoutubeHistoryIE(YoutubeFeedsInfoExtractor):
    IE_DESC = 'Youtube watch history, ":ythis" for short (requires authentication)'
    _VALID_URL = r':ythis(?:tory)?'
    # Feed name used by the base class to build /feed/history
    _FEED_NAME = 'history'
    _TESTS = [{
        'url': ':ythistory',
        'only_matching': True,
    }]
4515
4516
class YoutubeTruncatedURLIE(InfoExtractor):
    IE_NAME = 'youtube:truncated_url'
    IE_DESC = False  # Do not list
    _VALID_URL = r'''(?x)
        (?:https?://)?
        (?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie)?\.com/
        (?:watch\?(?:
            feature=[a-z_]+|
            annotation_id=annotation_[^&]+|
            x-yt-cl=[0-9]+|
            hl=[^&]*|
            t=[0-9]+
        )?
        |
            attribution_link\?a=[^&]+
        )
        $
    '''

    _TESTS = [{
        'url': 'https://www.youtube.com/watch?annotation_id=annotation_3951667041',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/watch?',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/watch?x-yt-cl=84503534',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/watch?feature=foo',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/watch?hl=en-GB',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/watch?t=2372',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Always raise: a truncated watch URL usually means an unquoted '&' in the shell."""
        # The hint previously said "youtube-dl"; this project is yt-dlp, so
        # name the correct executable in the user-facing message
        raise ExtractorError(
            'Did you forget to quote the URL? Remember that & is a meta '
            'character in most shells, so you want to put the URL in quotes, '
            'like yt-dlp '
            '"https://www.youtube.com/watch?feature=foo&v=BaW_jenozKc" '
            'or simply yt-dlp BaW_jenozKc .',
            expected=True)
4564
4565
class YoutubeTruncatedIDIE(InfoExtractor):
    IE_NAME = 'youtube:truncated_id'
    IE_DESC = False  # Do not list
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/watch\?v=(?P<id>[0-9A-Za-z_-]{1,10})$'

    _TESTS = [{
        'url': 'https://www.youtube.com/watch?v=N_708QY7Ob',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Always raise: a video id shorter than 11 chars means the URL was cut off."""
        truncated_id = self._match_id(url)
        raise ExtractorError(
            'Incomplete YouTube ID %s. URL %s looks truncated.' % (truncated_id, url),
            expected=True)