]> jfr.im git - yt-dlp.git/blob - yt_dlp/extractor/youtube.py
[RTP] Fix extraction and add subtitles (#497)
[yt-dlp.git] / yt_dlp / extractor / youtube.py
1 # coding: utf-8
2
3 from __future__ import unicode_literals
4
5 import base64
6 import calendar
7 import copy
8 import hashlib
9 import itertools
10 import json
11 import os.path
12 import random
13 import re
14 import time
15 import traceback
16
17 from .common import InfoExtractor, SearchInfoExtractor
18 from ..compat import (
19 compat_chr,
20 compat_HTTPError,
21 compat_parse_qs,
22 compat_str,
23 compat_urllib_parse_unquote_plus,
24 compat_urllib_parse_urlencode,
25 compat_urllib_parse_urlparse,
26 compat_urlparse,
27 )
28 from ..jsinterp import JSInterpreter
29 from ..utils import (
30 bool_or_none,
31 bytes_to_intlist,
32 clean_html,
33 dict_get,
34 datetime_from_str,
35 error_to_compat_str,
36 ExtractorError,
37 format_field,
38 float_or_none,
39 int_or_none,
40 intlist_to_bytes,
41 mimetype2ext,
42 parse_codecs,
43 parse_duration,
44 qualities,
45 remove_start,
46 smuggle_url,
47 str_or_none,
48 str_to_int,
49 try_get,
50 unescapeHTML,
51 unified_strdate,
52 unsmuggle_url,
53 update_url_query,
54 url_or_none,
55 urlencode_postdata,
56 urljoin
57 )
58
59
def parse_qs(url):
    """Return the query-string parameters of *url* as a dict of value lists."""
    parsed_url = compat_urlparse.urlparse(url)
    return compat_urlparse.parse_qs(parsed_url.query)
62
63
class YoutubeBaseInfoExtractor(InfoExtractor):
    """Provide base functions for Youtube extractors"""

    # Google account sign-in endpoints (only used by the currently disabled
    # username/password login flow, see _login)
    _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
    _TWOFACTOR_URL = 'https://accounts.google.com/signin/challenge'

    _LOOKUP_URL = 'https://accounts.google.com/_/signin/sl/lookup'
    _CHALLENGE_URL = 'https://accounts.google.com/_/signin/sl/challenge'
    _TFA_URL = 'https://accounts.google.com/_/signin/challenge?hl=en&TL={0}'

    # Path segments that can never be a channel/user name in a youtube.com URL
    _RESERVED_NAMES = (
        r'channel|c|user|browse|playlist|watch|w|v|embed|e|watch_popup|shorts|'
        r'movies|results|shared|hashtag|trending|feed|feeds|oembed|get_video_info|'
        r'storefront|oops|index|account|reporthistory|t/terms|about|upload|signin|logout')

    _NETRC_MACHINE = 'youtube'
    # If True it will raise an error if no login info is provided
    _LOGIN_REQUIRED = False

    # Matches prefixed playlist IDs plus the special pseudo-playlists
    # (watch later WL, liked LL/LM, personal mix RDMM)
    _PLAYLIST_ID_RE = r'(?:(?:PL|LL|EC|UU|FL|RD|UL|TL|PU|OLAK5uy_)[0-9A-Za-z-_]{10,}|RDMM|WL|LL|LM)'
83
84 def _login(self):
85 """
86 Attempt to log in to YouTube.
87 True is returned if successful or skipped.
88 False is returned if login failed.
89
90 If _LOGIN_REQUIRED is set and no authentication was provided, an error is raised.
91 """
92
93 def warn(message):
94 self.report_warning(message)
95
96 # username+password login is broken
97 if self._LOGIN_REQUIRED and self.get_param('cookiefile') is None:
98 self.raise_login_required(
99 'Login details are needed to download this content', method='cookies')
100 username, password = self._get_login_info()
101 if username:
102 warn('Logging in using username and password is broken. %s' % self._LOGIN_HINTS['cookies'])
103 return
104
105 # Everything below this is broken!
106 r'''
107 # No authentication to be performed
108 if username is None:
109 if self._LOGIN_REQUIRED and self.get_param('cookiefile') is None:
110 raise ExtractorError('No login info available, needed for using %s.' % self.IE_NAME, expected=True)
111 # if self.get_param('cookiefile'): # TODO remove 'and False' later - too many people using outdated cookies and open issues, remind them.
112 # self.to_screen('[Cookies] Reminder - Make sure to always use up to date cookies!')
113 return True
114
115 login_page = self._download_webpage(
116 self._LOGIN_URL, None,
117 note='Downloading login page',
118 errnote='unable to fetch login page', fatal=False)
119 if login_page is False:
120 return
121
122 login_form = self._hidden_inputs(login_page)
123
124 def req(url, f_req, note, errnote):
125 data = login_form.copy()
126 data.update({
127 'pstMsg': 1,
128 'checkConnection': 'youtube',
129 'checkedDomains': 'youtube',
130 'hl': 'en',
131 'deviceinfo': '[null,null,null,[],null,"US",null,null,[],"GlifWebSignIn",null,[null,null,[]]]',
132 'f.req': json.dumps(f_req),
133 'flowName': 'GlifWebSignIn',
134 'flowEntry': 'ServiceLogin',
135 # TODO: reverse actual botguard identifier generation algo
136 'bgRequest': '["identifier",""]',
137 })
138 return self._download_json(
139 url, None, note=note, errnote=errnote,
140 transform_source=lambda s: re.sub(r'^[^[]*', '', s),
141 fatal=False,
142 data=urlencode_postdata(data), headers={
143 'Content-Type': 'application/x-www-form-urlencoded;charset=utf-8',
144 'Google-Accounts-XSRF': 1,
145 })
146
147 lookup_req = [
148 username,
149 None, [], None, 'US', None, None, 2, False, True,
150 [
151 None, None,
152 [2, 1, None, 1,
153 'https://accounts.google.com/ServiceLogin?passive=true&continue=https%3A%2F%2Fwww.youtube.com%2Fsignin%3Fnext%3D%252F%26action_handle_signin%3Dtrue%26hl%3Den%26app%3Ddesktop%26feature%3Dsign_in_button&hl=en&service=youtube&uilel=3&requestPath=%2FServiceLogin&Page=PasswordSeparationSignIn',
154 None, [], 4],
155 1, [None, None, []], None, None, None, True
156 ],
157 username,
158 ]
159
160 lookup_results = req(
161 self._LOOKUP_URL, lookup_req,
162 'Looking up account info', 'Unable to look up account info')
163
164 if lookup_results is False:
165 return False
166
167 user_hash = try_get(lookup_results, lambda x: x[0][2], compat_str)
168 if not user_hash:
169 warn('Unable to extract user hash')
170 return False
171
172 challenge_req = [
173 user_hash,
174 None, 1, None, [1, None, None, None, [password, None, True]],
175 [
176 None, None, [2, 1, None, 1, 'https://accounts.google.com/ServiceLogin?passive=true&continue=https%3A%2F%2Fwww.youtube.com%2Fsignin%3Fnext%3D%252F%26action_handle_signin%3Dtrue%26hl%3Den%26app%3Ddesktop%26feature%3Dsign_in_button&hl=en&service=youtube&uilel=3&requestPath=%2FServiceLogin&Page=PasswordSeparationSignIn', None, [], 4],
177 1, [None, None, []], None, None, None, True
178 ]]
179
180 challenge_results = req(
181 self._CHALLENGE_URL, challenge_req,
182 'Logging in', 'Unable to log in')
183
184 if challenge_results is False:
185 return
186
187 login_res = try_get(challenge_results, lambda x: x[0][5], list)
188 if login_res:
189 login_msg = try_get(login_res, lambda x: x[5], compat_str)
190 warn(
191 'Unable to login: %s' % 'Invalid password'
192 if login_msg == 'INCORRECT_ANSWER_ENTERED' else login_msg)
193 return False
194
195 res = try_get(challenge_results, lambda x: x[0][-1], list)
196 if not res:
197 warn('Unable to extract result entry')
198 return False
199
200 login_challenge = try_get(res, lambda x: x[0][0], list)
201 if login_challenge:
202 challenge_str = try_get(login_challenge, lambda x: x[2], compat_str)
203 if challenge_str == 'TWO_STEP_VERIFICATION':
204 # SEND_SUCCESS - TFA code has been successfully sent to phone
205 # QUOTA_EXCEEDED - reached the limit of TFA codes
206 status = try_get(login_challenge, lambda x: x[5], compat_str)
207 if status == 'QUOTA_EXCEEDED':
208 warn('Exceeded the limit of TFA codes, try later')
209 return False
210
211 tl = try_get(challenge_results, lambda x: x[1][2], compat_str)
212 if not tl:
213 warn('Unable to extract TL')
214 return False
215
216 tfa_code = self._get_tfa_info('2-step verification code')
217
218 if not tfa_code:
219 warn(
220 'Two-factor authentication required. Provide it either interactively or with --twofactor <code>'
221 '(Note that only TOTP (Google Authenticator App) codes work at this time.)')
222 return False
223
224 tfa_code = remove_start(tfa_code, 'G-')
225
226 tfa_req = [
227 user_hash, None, 2, None,
228 [
229 9, None, None, None, None, None, None, None,
230 [None, tfa_code, True, 2]
231 ]]
232
233 tfa_results = req(
234 self._TFA_URL.format(tl), tfa_req,
235 'Submitting TFA code', 'Unable to submit TFA code')
236
237 if tfa_results is False:
238 return False
239
240 tfa_res = try_get(tfa_results, lambda x: x[0][5], list)
241 if tfa_res:
242 tfa_msg = try_get(tfa_res, lambda x: x[5], compat_str)
243 warn(
244 'Unable to finish TFA: %s' % 'Invalid TFA code'
245 if tfa_msg == 'INCORRECT_ANSWER_ENTERED' else tfa_msg)
246 return False
247
248 check_cookie_url = try_get(
249 tfa_results, lambda x: x[0][-1][2], compat_str)
250 else:
251 CHALLENGES = {
252 'LOGIN_CHALLENGE': "This device isn't recognized. For your security, Google wants to make sure it's really you.",
253 'USERNAME_RECOVERY': 'Please provide additional information to aid in the recovery process.',
254 'REAUTH': "There is something unusual about your activity. For your security, Google wants to make sure it's really you.",
255 }
256 challenge = CHALLENGES.get(
257 challenge_str,
258 '%s returned error %s.' % (self.IE_NAME, challenge_str))
259 warn('%s\nGo to https://accounts.google.com/, login and solve a challenge.' % challenge)
260 return False
261 else:
262 check_cookie_url = try_get(res, lambda x: x[2], compat_str)
263
264 if not check_cookie_url:
265 warn('Unable to extract CheckCookie URL')
266 return False
267
268 check_cookie_results = self._download_webpage(
269 check_cookie_url, None, 'Checking cookie', fatal=False)
270
271 if check_cookie_results is False:
272 return False
273
274 if 'https://myaccount.google.com/' not in check_cookie_results:
275 warn('Unable to log in')
276 return False
277
278 return True
279 '''
280
281 def _initialize_consent(self):
282 cookies = self._get_cookies('https://www.youtube.com/')
283 if cookies.get('__Secure-3PSID'):
284 return
285 consent_id = None
286 consent = cookies.get('CONSENT')
287 if consent:
288 if 'YES' in consent.value:
289 return
290 consent_id = self._search_regex(
291 r'PENDING\+(\d+)', consent.value, 'consent', default=None)
292 if not consent_id:
293 consent_id = random.randint(100, 999)
294 self._set_cookie('.youtube.com', 'CONSENT', 'YES+cb.20210328-17-p0.en+FX+%s' % consent_id)
295
    def _real_initialize(self):
        # Runs once before any extraction: set the consent cookie, then
        # attempt login (a no-op unless a downloader is attached)
        self._initialize_consent()
        if self._downloader is None:
            return
        if not self._login():
            return
302
    # Regex locating the ytInitialData JSON blob embedded in watch/browse pages
    _YT_INITIAL_DATA_RE = r'(?:window\s*\[\s*["\']ytInitialData["\']\s*\]|ytInitialData)\s*=\s*({.+?})\s*;'
    # Regex locating the ytInitialPlayerResponse JSON blob
    _YT_INITIAL_PLAYER_RESPONSE_RE = r'ytInitialPlayerResponse\s*=\s*({.+?})\s*;'
    # Tokens that may terminate the JSON assignments matched above
    _YT_INITIAL_BOUNDARY_RE = r'(?:var\s+meta|</script|\n)'

    # Hardcoded fallback innertube configuration per client name; used when a
    # page's own ytcfg is unavailable or is missing one of these fields
    _YT_DEFAULT_YTCFGS = {
        'WEB': {
            'INNERTUBE_API_VERSION': 'v1',
            'INNERTUBE_CLIENT_NAME': 'WEB',
            'INNERTUBE_CLIENT_VERSION': '2.20210622.10.00',
            'INNERTUBE_API_KEY': 'AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8',
            'INNERTUBE_CONTEXT': {
                'client': {
                    'clientName': 'WEB',
                    'clientVersion': '2.20210622.10.00',
                    'hl': 'en',
                }
            },
            'INNERTUBE_CONTEXT_CLIENT_NAME': 1
        },
        'WEB_REMIX': {
            'INNERTUBE_API_VERSION': 'v1',
            'INNERTUBE_CLIENT_NAME': 'WEB_REMIX',
            'INNERTUBE_CLIENT_VERSION': '1.20210621.00.00',
            'INNERTUBE_API_KEY': 'AIzaSyC9XL3ZjWddXya6X74dJoCTL-WEYFDNX30',
            'INNERTUBE_CONTEXT': {
                'client': {
                    'clientName': 'WEB_REMIX',
                    'clientVersion': '1.20210621.00.00',
                    'hl': 'en',
                }
            },
            'INNERTUBE_CONTEXT_CLIENT_NAME': 67
        },
        'WEB_EMBEDDED_PLAYER': {
            'INNERTUBE_API_VERSION': 'v1',
            'INNERTUBE_CLIENT_NAME': 'WEB_EMBEDDED_PLAYER',
            'INNERTUBE_CLIENT_VERSION': '1.20210620.0.1',
            'INNERTUBE_API_KEY': 'AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8',
            'INNERTUBE_CONTEXT': {
                'client': {
                    'clientName': 'WEB_EMBEDDED_PLAYER',
                    'clientVersion': '1.20210620.0.1',
                    'hl': 'en',
                }
            },
            'INNERTUBE_CONTEXT_CLIENT_NAME': 56
        },
        'ANDROID': {
            'INNERTUBE_API_VERSION': 'v1',
            'INNERTUBE_CLIENT_NAME': 'ANDROID',
            'INNERTUBE_CLIENT_VERSION': '16.20',
            'INNERTUBE_API_KEY': 'AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8',
            'INNERTUBE_CONTEXT': {
                'client': {
                    'clientName': 'ANDROID',
                    'clientVersion': '16.20',
                    'hl': 'en',
                }
            },
            'INNERTUBE_CONTEXT_CLIENT_NAME': 'ANDROID'
        },
        'ANDROID_EMBEDDED_PLAYER': {
            'INNERTUBE_API_VERSION': 'v1',
            'INNERTUBE_CLIENT_NAME': 'ANDROID_EMBEDDED_PLAYER',
            'INNERTUBE_CLIENT_VERSION': '16.20',
            'INNERTUBE_API_KEY': 'AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8',
            'INNERTUBE_CONTEXT': {
                'client': {
                    'clientName': 'ANDROID_EMBEDDED_PLAYER',
                    'clientVersion': '16.20',
                    'hl': 'en',
                }
            },
            'INNERTUBE_CONTEXT_CLIENT_NAME': 'ANDROID_EMBEDDED_PLAYER'
        },
        'ANDROID_MUSIC': {
            'INNERTUBE_API_VERSION': 'v1',
            'INNERTUBE_CLIENT_NAME': 'ANDROID_MUSIC',
            'INNERTUBE_CLIENT_VERSION': '4.32',
            'INNERTUBE_API_KEY': 'AIzaSyC9XL3ZjWddXya6X74dJoCTL-WEYFDNX30',
            'INNERTUBE_CONTEXT': {
                'client': {
                    'clientName': 'ANDROID_MUSIC',
                    'clientVersion': '4.32',
                    'hl': 'en',
                }
            },
            'INNERTUBE_CONTEXT_CLIENT_NAME': 'ANDROID_MUSIC'
        }
    }

    # innertube API hostname per client; clients not listed use the WEB host
    _YT_DEFAULT_INNERTUBE_HOSTS = {
        'DIRECT': 'youtubei.googleapis.com',
        'WEB': 'www.youtube.com',
        'WEB_REMIX': 'music.youtube.com',
        'ANDROID_MUSIC': 'music.youtube.com'
    }
400
401 def _get_default_ytcfg(self, client='WEB'):
402 if client in self._YT_DEFAULT_YTCFGS:
403 return copy.deepcopy(self._YT_DEFAULT_YTCFGS[client])
404 self.write_debug(f'INNERTUBE default client {client} does not exist - falling back to WEB client.')
405 return copy.deepcopy(self._YT_DEFAULT_YTCFGS['WEB'])
406
    def _get_innertube_host(self, client='WEB'):
        # Map an innertube client name to its API hostname; unknown clients
        # fall back to the WEB host (www.youtube.com)
        return dict_get(self._YT_DEFAULT_INNERTUBE_HOSTS, (client, 'WEB'))
409
410 def _ytcfg_get_safe(self, ytcfg, getter, expected_type=None, default_client='WEB'):
411 # try_get but with fallback to default ytcfg client values when present
412 _func = lambda y: try_get(y, getter, expected_type)
413 return _func(ytcfg) or _func(self._get_default_ytcfg(default_client))
414
    def _extract_client_name(self, ytcfg, default_client='WEB'):
        # INNERTUBE_CLIENT_NAME (e.g. 'WEB'), with fallback to the default client's value
        return self._ytcfg_get_safe(ytcfg, lambda x: x['INNERTUBE_CLIENT_NAME'], compat_str, default_client)
417
    def _extract_client_version(self, ytcfg, default_client='WEB'):
        # INNERTUBE_CLIENT_VERSION (e.g. '2.20210622.10.00'), with default-client fallback
        return self._ytcfg_get_safe(ytcfg, lambda x: x['INNERTUBE_CLIENT_VERSION'], compat_str, default_client)
420
    def _extract_api_key(self, ytcfg=None, default_client='WEB'):
        # INNERTUBE_API_KEY used as the 'key' query parameter, with default-client fallback
        return self._ytcfg_get_safe(ytcfg, lambda x: x['INNERTUBE_API_KEY'], compat_str, default_client)
423
424 def _extract_context(self, ytcfg=None, default_client='WEB'):
425 _get_context = lambda y: try_get(y, lambda x: x['INNERTUBE_CONTEXT'], dict)
426 context = _get_context(ytcfg)
427 if context:
428 return context
429
430 context = _get_context(self._get_default_ytcfg(default_client))
431 if not ytcfg:
432 return context
433
434 # Recreate the client context (required)
435 context['client'].update({
436 'clientVersion': self._extract_client_version(ytcfg, default_client),
437 'clientName': self._extract_client_name(ytcfg, default_client),
438 })
439 visitor_data = try_get(ytcfg, lambda x: x['VISITOR_DATA'], compat_str)
440 if visitor_data:
441 context['client']['visitorData'] = visitor_data
442 return context
443
444 def _generate_sapisidhash_header(self, origin='https://www.youtube.com'):
445 # Sometimes SAPISID cookie isn't present but __Secure-3PAPISID is.
446 # See: https://github.com/yt-dlp/yt-dlp/issues/393
447 yt_cookies = self._get_cookies('https://www.youtube.com')
448 sapisid_cookie = dict_get(
449 yt_cookies, ('__Secure-3PAPISID', 'SAPISID'))
450 if sapisid_cookie is None:
451 return
452 time_now = round(time.time())
453 # SAPISID cookie is required if not already present
454 if not yt_cookies.get('SAPISID'):
455 self._set_cookie(
456 '.youtube.com', 'SAPISID', sapisid_cookie.value, secure=True, expire_time=time_now + 3600)
457 # SAPISIDHASH algorithm from https://stackoverflow.com/a/32065323
458 sapisidhash = hashlib.sha1(
459 f'{time_now} {sapisid_cookie.value} {origin}'.encode('utf-8')).hexdigest()
460 return f'SAPISIDHASH {time_now}_{sapisidhash}'
461
462 def _call_api(self, ep, query, video_id, fatal=True, headers=None,
463 note='Downloading API JSON', errnote='Unable to download API page',
464 context=None, api_key=None, api_hostname=None, default_client='WEB'):
465
466 data = {'context': context} if context else {'context': self._extract_context(default_client=default_client)}
467 data.update(query)
468 real_headers = self._generate_api_headers(client=default_client)
469 real_headers.update({'content-type': 'application/json'})
470 if headers:
471 real_headers.update(headers)
472 return self._download_json(
473 'https://%s/youtubei/v1/%s' % (api_hostname or self._get_innertube_host(default_client), ep),
474 video_id=video_id, fatal=fatal, note=note, errnote=errnote,
475 data=json.dumps(data).encode('utf8'), headers=real_headers,
476 query={'key': api_key or self._extract_api_key()})
477
478 def _extract_yt_initial_data(self, video_id, webpage):
479 return self._parse_json(
480 self._search_regex(
481 (r'%s\s*%s' % (self._YT_INITIAL_DATA_RE, self._YT_INITIAL_BOUNDARY_RE),
482 self._YT_INITIAL_DATA_RE), webpage, 'yt initial data'),
483 video_id)
484
485 def _extract_identity_token(self, webpage, item_id):
486 ytcfg = self._extract_ytcfg(item_id, webpage)
487 if ytcfg:
488 token = try_get(ytcfg, lambda x: x['ID_TOKEN'], compat_str)
489 if token:
490 return token
491 return self._search_regex(
492 r'\bID_TOKEN["\']\s*:\s*["\'](.+?)["\']', webpage,
493 'identity token', default=None)
494
    @staticmethod
    def _extract_account_syncid(data):
        """
        Extract syncId required to download private playlists of secondary channels
        @param data Either response or ytcfg
        """
        # datasyncId lives in the response context; DATASYNC_ID in ytcfg
        sync_ids = (try_get(
            data, (lambda x: x['responseContext']['mainAppWebResponseContext']['datasyncId'],
                   lambda x: x['DATASYNC_ID']), compat_str) or '').split("||")
        if len(sync_ids) >= 2 and sync_ids[1]:
            # datasyncid is of the form "channel_syncid||user_syncid" for secondary channel
            # and just "user_syncid||" for primary channel. We only want the channel_syncid
            return sync_ids[0]
        # ytcfg includes channel_syncid if on secondary channel
        # NOTE(review): assumes *data* is a dict here (.get) - a non-dict response
        # without the keys above would raise; confirm against callers
        return data.get('DELEGATED_SESSION_ID')
510
511 def _extract_ytcfg(self, video_id, webpage):
512 if not webpage:
513 return {}
514 return self._parse_json(
515 self._search_regex(
516 r'ytcfg\.set\s*\(\s*({.+?})\s*\)\s*;', webpage, 'ytcfg',
517 default='{}'), video_id, fatal=False) or {}
518
    def _generate_api_headers(self, ytcfg=None, identity_token=None, account_syncid=None,
                              visitor_data=None, api_hostname=None, client='WEB'):
        """
        Build the HTTP headers for an innertube API request.

        @param ytcfg           page ytcfg supplying client name/version/visitor data
        @param identity_token  account ID_TOKEN (sent as X-Youtube-Identity-Token)
        @param account_syncid  secondary-channel syncid (sent as X-Goog-PageId)
        @param visitor_data    explicit visitorData; else taken from the ytcfg context
        @param api_hostname    overrides the client's default innertube host in Origin
        @param client          default client used to fill in missing ytcfg values
        """
        origin = 'https://' + (api_hostname if api_hostname else self._get_innertube_host(client))
        headers = {
            'X-YouTube-Client-Name': compat_str(
                self._ytcfg_get_safe(ytcfg, lambda x: x['INNERTUBE_CONTEXT_CLIENT_NAME'], default_client=client)),
            'X-YouTube-Client-Version': self._extract_client_version(ytcfg, client),
            'Origin': origin
        }
        if not visitor_data and ytcfg:
            visitor_data = try_get(
                self._extract_context(ytcfg, client), lambda x: x['client']['visitorData'], compat_str)
        if identity_token:
            headers['X-Youtube-Identity-Token'] = identity_token
        if account_syncid:
            headers['X-Goog-PageId'] = account_syncid
            headers['X-Goog-AuthUser'] = 0
        if visitor_data:
            headers['X-Goog-Visitor-Id'] = visitor_data
        # Cookie-based auth: only attach Authorization/X-Origin when cookies allow it
        auth = self._generate_sapisidhash_header(origin)
        if auth is not None:
            headers['Authorization'] = auth
            headers['X-Origin'] = origin
        return headers
543
544 @staticmethod
545 def _build_api_continuation_query(continuation, ctp=None):
546 query = {
547 'continuation': continuation
548 }
549 # TODO: Inconsistency with clickTrackingParams.
550 # Currently we have a fixed ctp contained within context (from ytcfg)
551 # and a ctp in root query for continuation.
552 if ctp:
553 query['clickTracking'] = {'clickTrackingParams': ctp}
554 return query
555
556 @classmethod
557 def _continuation_query_ajax_to_api(cls, continuation_query):
558 continuation = dict_get(continuation_query, ('continuation', 'ctoken'))
559 return cls._build_api_continuation_query(continuation, continuation_query.get('itct'))
560
561 @staticmethod
562 def _build_continuation_query(continuation, ctp=None):
563 query = {
564 'ctoken': continuation,
565 'continuation': continuation,
566 }
567 if ctp:
568 query['itct'] = ctp
569 return query
570
571 @classmethod
572 def _extract_next_continuation_data(cls, renderer):
573 next_continuation = try_get(
574 renderer, (lambda x: x['continuations'][0]['nextContinuationData'],
575 lambda x: x['continuation']['reloadContinuationData']), dict)
576 if not next_continuation:
577 return
578 continuation = next_continuation.get('continuation')
579 if not continuation:
580 return
581 ctp = next_continuation.get('clickTrackingParams')
582 return cls._build_continuation_query(continuation, ctp)
583
584 @classmethod
585 def _extract_continuation_ep_data(cls, continuation_ep: dict):
586 if isinstance(continuation_ep, dict):
587 continuation = try_get(
588 continuation_ep, lambda x: x['continuationCommand']['token'], compat_str)
589 if not continuation:
590 return
591 ctp = continuation_ep.get('clickTrackingParams')
592 return cls._build_continuation_query(continuation, ctp)
593
594 @classmethod
595 def _extract_continuation(cls, renderer):
596 next_continuation = cls._extract_next_continuation_data(renderer)
597 if next_continuation:
598 return next_continuation
599 contents = []
600 for key in ('contents', 'items'):
601 contents.extend(try_get(renderer, lambda x: x[key], list) or [])
602 for content in contents:
603 if not isinstance(content, dict):
604 continue
605 continuation_ep = try_get(
606 content, (lambda x: x['continuationItemRenderer']['continuationEndpoint'],
607 lambda x: x['continuationItemRenderer']['button']['buttonRenderer']['command']),
608 dict)
609 continuation = cls._extract_continuation_ep_data(continuation_ep)
610 if continuation:
611 return continuation
612
613 @staticmethod
614 def _extract_alerts(data):
615 for alert_dict in try_get(data, lambda x: x['alerts'], list) or []:
616 if not isinstance(alert_dict, dict):
617 continue
618 for alert in alert_dict.values():
619 alert_type = alert.get('type')
620 if not alert_type:
621 continue
622 message = try_get(alert, lambda x: x['text']['simpleText'], compat_str) or ''
623 if message:
624 yield alert_type, message
625 for run in try_get(alert, lambda x: x['text']['runs'], list) or []:
626 message += try_get(run, lambda x: x['text'], compat_str)
627 if message:
628 yield alert_type, message
629
630 def _report_alerts(self, alerts, expected=True):
631 errors = []
632 warnings = []
633 for alert_type, alert_message in alerts:
634 if alert_type.lower() == 'error':
635 errors.append([alert_type, alert_message])
636 else:
637 warnings.append([alert_type, alert_message])
638
639 for alert_type, alert_message in (warnings + errors[:-1]):
640 self.report_warning('YouTube said: %s - %s' % (alert_type, alert_message))
641 if errors:
642 raise ExtractorError('YouTube said: %s' % errors[-1][1], expected=expected)
643
    def _extract_and_report_alerts(self, data, *args, **kwargs):
        # Convenience wrapper: parse alerts from *data* and report them;
        # extra args are forwarded to _report_alerts (e.g. expected=False)
        return self._report_alerts(self._extract_alerts(data), *args, **kwargs)
646
    def _extract_response(self, item_id, query, note='Downloading API JSON', headers=None,
                          ytcfg=None, check_get_keys=None, ep='browse', fatal=True, api_hostname=None,
                          default_client='WEB'):
        """
        Call the innertube API endpoint *ep*, retrying on transient HTTP
        errors (500/503/404) and on incomplete responses.

        @param check_get_keys  keys, at least one of which must be present in
                               the response for it to be considered complete
        @returns the parsed JSON response, or None on non-fatal failure
        """
        response = None
        last_error = None
        count = -1
        retries = self.get_param('extractor_retries', 3)
        if check_get_keys is None:
            check_get_keys = []
        while count < retries:
            count += 1
            if last_error:
                self.report_warning('%s. Retrying ...' % last_error)
            try:
                response = self._call_api(
                    ep=ep, fatal=True, headers=headers,
                    video_id=item_id, query=query,
                    context=self._extract_context(ytcfg, default_client),
                    api_key=self._extract_api_key(ytcfg, default_client),
                    api_hostname=api_hostname, default_client=default_client,
                    note='%s%s' % (note, ' (retry #%d)' % count if count else ''))
            except ExtractorError as e:
                if isinstance(e.cause, compat_HTTPError) and e.cause.code in (500, 503, 404):
                    # Downloading page may result in intermittent 5xx HTTP error
                    # Sometimes a 404 is also received. See: https://github.com/ytdl-org/youtube-dl/issues/28289
                    last_error = 'HTTP Error %s' % e.cause.code
                    if count < retries:
                        continue
                if fatal:
                    raise
                else:
                    self.report_warning(error_to_compat_str(e))
                    return
            else:
                # Youtube may send alerts if there was an issue with the continuation page
                try:
                    self._extract_and_report_alerts(response, expected=False)
                except ExtractorError as e:
                    if fatal:
                        raise
                    self.report_warning(error_to_compat_str(e))
                    return
                if not check_get_keys or dict_get(response, check_get_keys):
                    break
                # Youtube sometimes sends incomplete data
                # See: https://github.com/ytdl-org/youtube-dl/issues/28194
                last_error = 'Incomplete data received'
                if count >= retries:
                    if fatal:
                        raise ExtractorError(last_error)
                    else:
                        self.report_warning(last_error)
                        return
        return response
702
703 @staticmethod
704 def is_music_url(url):
705 return re.match(r'https?://music\.youtube\.com/', url) is not None
706
707 def _extract_video(self, renderer):
708 video_id = renderer.get('videoId')
709 title = try_get(
710 renderer,
711 (lambda x: x['title']['runs'][0]['text'],
712 lambda x: x['title']['simpleText']), compat_str)
713 description = try_get(
714 renderer, lambda x: x['descriptionSnippet']['runs'][0]['text'],
715 compat_str)
716 duration = parse_duration(try_get(
717 renderer, lambda x: x['lengthText']['simpleText'], compat_str))
718 view_count_text = try_get(
719 renderer, lambda x: x['viewCountText']['simpleText'], compat_str) or ''
720 view_count = str_to_int(self._search_regex(
721 r'^([\d,]+)', re.sub(r'\s', '', view_count_text),
722 'view count', default=None))
723 uploader = try_get(
724 renderer,
725 (lambda x: x['ownerText']['runs'][0]['text'],
726 lambda x: x['shortBylineText']['runs'][0]['text']), compat_str)
727 return {
728 '_type': 'url',
729 'ie_key': YoutubeIE.ie_key(),
730 'id': video_id,
731 'url': video_id,
732 'title': title,
733 'description': description,
734 'duration': duration,
735 'view_count': view_count,
736 'uploader': uploader,
737 }
738
739
740 class YoutubeIE(YoutubeBaseInfoExtractor):
741 IE_DESC = 'YouTube.com'
742 _INVIDIOUS_SITES = (
743 # invidious-redirect websites
744 r'(?:www\.)?redirect\.invidious\.io',
745 r'(?:(?:www|dev)\.)?invidio\.us',
746 # Invidious instances taken from https://github.com/iv-org/documentation/blob/master/Invidious-Instances.md
747 r'(?:www\.)?invidious\.pussthecat\.org',
748 r'(?:www\.)?invidious\.zee\.li',
749 r'(?:www\.)?invidious\.ethibox\.fr',
750 r'(?:www\.)?invidious\.3o7z6yfxhbw7n3za4rss6l434kmv55cgw2vuziwuigpwegswvwzqipyd\.onion',
751 # youtube-dl invidious instances list
752 r'(?:(?:www|no)\.)?invidiou\.sh',
753 r'(?:(?:www|fi)\.)?invidious\.snopyta\.org',
754 r'(?:www\.)?invidious\.kabi\.tk',
755 r'(?:www\.)?invidious\.mastodon\.host',
756 r'(?:www\.)?invidious\.zapashcanon\.fr',
757 r'(?:www\.)?(?:invidious(?:-us)?|piped)\.kavin\.rocks',
758 r'(?:www\.)?invidious\.tinfoil-hat\.net',
759 r'(?:www\.)?invidious\.himiko\.cloud',
760 r'(?:www\.)?invidious\.reallyancient\.tech',
761 r'(?:www\.)?invidious\.tube',
762 r'(?:www\.)?invidiou\.site',
763 r'(?:www\.)?invidious\.site',
764 r'(?:www\.)?invidious\.xyz',
765 r'(?:www\.)?invidious\.nixnet\.xyz',
766 r'(?:www\.)?invidious\.048596\.xyz',
767 r'(?:www\.)?invidious\.drycat\.fr',
768 r'(?:www\.)?inv\.skyn3t\.in',
769 r'(?:www\.)?tube\.poal\.co',
770 r'(?:www\.)?tube\.connect\.cafe',
771 r'(?:www\.)?vid\.wxzm\.sx',
772 r'(?:www\.)?vid\.mint\.lgbt',
773 r'(?:www\.)?vid\.puffyan\.us',
774 r'(?:www\.)?yewtu\.be',
775 r'(?:www\.)?yt\.elukerio\.org',
776 r'(?:www\.)?yt\.lelux\.fi',
777 r'(?:www\.)?invidious\.ggc-project\.de',
778 r'(?:www\.)?yt\.maisputain\.ovh',
779 r'(?:www\.)?ytprivate\.com',
780 r'(?:www\.)?invidious\.13ad\.de',
781 r'(?:www\.)?invidious\.toot\.koeln',
782 r'(?:www\.)?invidious\.fdn\.fr',
783 r'(?:www\.)?watch\.nettohikari\.com',
784 r'(?:www\.)?invidious\.namazso\.eu',
785 r'(?:www\.)?invidious\.silkky\.cloud',
786 r'(?:www\.)?invidious\.exonip\.de',
787 r'(?:www\.)?invidious\.riverside\.rocks',
788 r'(?:www\.)?invidious\.blamefran\.net',
789 r'(?:www\.)?invidious\.moomoo\.de',
790 r'(?:www\.)?ytb\.trom\.tf',
791 r'(?:www\.)?yt\.cyberhost\.uk',
792 r'(?:www\.)?kgg2m7yk5aybusll\.onion',
793 r'(?:www\.)?qklhadlycap4cnod\.onion',
794 r'(?:www\.)?axqzx4s6s54s32yentfqojs3x5i7faxza6xo3ehd4bzzsg2ii4fv2iid\.onion',
795 r'(?:www\.)?c7hqkpkpemu6e7emz5b4vyz7idjgdvgaaa3dyimmeojqbgpea3xqjoid\.onion',
796 r'(?:www\.)?fz253lmuao3strwbfbmx46yu7acac2jz27iwtorgmbqlkurlclmancad\.onion',
797 r'(?:www\.)?invidious\.l4qlywnpwqsluw65ts7md3khrivpirse744un3x7mlskqauz5pyuzgqd\.onion',
798 r'(?:www\.)?owxfohz4kjyv25fvlqilyxast7inivgiktls3th44jhk3ej3i7ya\.b32\.i2p',
799 r'(?:www\.)?4l2dgddgsrkf2ous66i6seeyi6etzfgrue332grh2n7madpwopotugyd\.onion',
800 r'(?:www\.)?w6ijuptxiku4xpnnaetxvnkc5vqcdu7mgns2u77qefoixi63vbvnpnqd\.onion',
801 r'(?:www\.)?kbjggqkzv65ivcqj6bumvp337z6264huv5kpkwuv6gu5yjiskvan7fad\.onion',
802 r'(?:www\.)?grwp24hodrefzvjjuccrkw3mjq4tzhaaq32amf33dzpmuxe7ilepcmad\.onion',
803 r'(?:www\.)?hpniueoejy4opn7bc4ftgazyqjoeqwlvh2uiku2xqku6zpoa4bf5ruid\.onion',
804 )
805 _VALID_URL = r"""(?x)^
806 (
807 (?:https?://|//) # http(s):// or protocol-independent URL
808 (?:(?:(?:(?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie|kids)?\.com|
809 (?:www\.)?deturl\.com/www\.youtube\.com|
810 (?:www\.)?pwnyoutube\.com|
811 (?:www\.)?hooktube\.com|
812 (?:www\.)?yourepeat\.com|
813 tube\.majestyc\.net|
814 %(invidious)s|
815 youtube\.googleapis\.com)/ # the various hostnames, with wildcard subdomains
816 (?:.*?\#/)? # handle anchor (#/) redirect urls
817 (?: # the various things that can precede the ID:
818 (?:(?:v|embed|e)/(?!videoseries)) # v/ or embed/ or e/
819 |(?: # or the v= param in all its forms
820 (?:(?:watch|movie)(?:_popup)?(?:\.php)?/?)? # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
821 (?:\?|\#!?) # the params delimiter ? or # or #!
822 (?:.*?[&;])?? # any other preceding param (like /?s=tuff&v=xxxx or ?s=tuff&amp;v=V36LpHqtcDY)
823 v=
824 )
825 ))
826 |(?:
827 youtu\.be| # just youtu.be/xxxx
828 vid\.plus| # or vid.plus/xxxx
829 zwearz\.com/watch| # or zwearz.com/watch/xxxx
830 %(invidious)s
831 )/
832 |(?:www\.)?cleanvideosearch\.com/media/action/yt/watch\?videoId=
833 )
834 )? # all until now is optional -> you can pass the naked ID
835 (?P<id>[0-9A-Za-z_-]{11}) # here is it! the YouTube video ID
836 (?(1).+)? # if we found the ID, everything can follow
837 (?:\#|$)""" % {
838 'invidious': '|'.join(_INVIDIOUS_SITES),
839 }
840 _PLAYER_INFO_RE = (
841 r'/s/player/(?P<id>[a-zA-Z0-9_-]{8,})/player',
842 r'/(?P<id>[a-zA-Z0-9_-]{8,})/player(?:_ias\.vflset(?:/[a-zA-Z]{2,3}_[a-zA-Z]{2,3})?|-plasma-ias-(?:phone|tablet)-[a-z]{2}_[A-Z]{2}\.vflset)/base\.js$',
843 r'\b(?P<id>vfl[a-zA-Z0-9_-]+)\b.*?\.js$',
844 )
    # Hardcoded per-itag format metadata (container, resolution, codecs,
    # bitrate). NOTE(review): values are typical/nominal and can vary per
    # video (see the inline notes on itags 36 and 272) — presumably merged
    # into API-extracted formats; confirm at the use site.
    _formats = {
        '5': {'ext': 'flv', 'width': 400, 'height': 240, 'acodec': 'mp3', 'abr': 64, 'vcodec': 'h263'},
        '6': {'ext': 'flv', 'width': 450, 'height': 270, 'acodec': 'mp3', 'abr': 64, 'vcodec': 'h263'},
        '13': {'ext': '3gp', 'acodec': 'aac', 'vcodec': 'mp4v'},
        '17': {'ext': '3gp', 'width': 176, 'height': 144, 'acodec': 'aac', 'abr': 24, 'vcodec': 'mp4v'},
        '18': {'ext': 'mp4', 'width': 640, 'height': 360, 'acodec': 'aac', 'abr': 96, 'vcodec': 'h264'},
        '22': {'ext': 'mp4', 'width': 1280, 'height': 720, 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264'},
        '34': {'ext': 'flv', 'width': 640, 'height': 360, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'},
        '35': {'ext': 'flv', 'width': 854, 'height': 480, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'},
        # itag 36 videos are either 320x180 (BaW_jenozKc) or 320x240 (__2ABJjxzNo), abr varies as well
        '36': {'ext': '3gp', 'width': 320, 'acodec': 'aac', 'vcodec': 'mp4v'},
        '37': {'ext': 'mp4', 'width': 1920, 'height': 1080, 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264'},
        '38': {'ext': 'mp4', 'width': 4096, 'height': 3072, 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264'},
        '43': {'ext': 'webm', 'width': 640, 'height': 360, 'acodec': 'vorbis', 'abr': 128, 'vcodec': 'vp8'},
        '44': {'ext': 'webm', 'width': 854, 'height': 480, 'acodec': 'vorbis', 'abr': 128, 'vcodec': 'vp8'},
        '45': {'ext': 'webm', 'width': 1280, 'height': 720, 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8'},
        '46': {'ext': 'webm', 'width': 1920, 'height': 1080, 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8'},
        '59': {'ext': 'mp4', 'width': 854, 'height': 480, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'},
        '78': {'ext': 'mp4', 'width': 854, 'height': 480, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'},


        # 3D videos
        '82': {'ext': 'mp4', 'height': 360, 'format_note': '3D', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -20},
        '83': {'ext': 'mp4', 'height': 480, 'format_note': '3D', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -20},
        '84': {'ext': 'mp4', 'height': 720, 'format_note': '3D', 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264', 'preference': -20},
        '85': {'ext': 'mp4', 'height': 1080, 'format_note': '3D', 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264', 'preference': -20},
        '100': {'ext': 'webm', 'height': 360, 'format_note': '3D', 'acodec': 'vorbis', 'abr': 128, 'vcodec': 'vp8', 'preference': -20},
        '101': {'ext': 'webm', 'height': 480, 'format_note': '3D', 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8', 'preference': -20},
        '102': {'ext': 'webm', 'height': 720, 'format_note': '3D', 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8', 'preference': -20},

        # Apple HTTP Live Streaming
        '91': {'ext': 'mp4', 'height': 144, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 48, 'vcodec': 'h264', 'preference': -10},
        '92': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 48, 'vcodec': 'h264', 'preference': -10},
        '93': {'ext': 'mp4', 'height': 360, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -10},
        '94': {'ext': 'mp4', 'height': 480, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -10},
        '95': {'ext': 'mp4', 'height': 720, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 256, 'vcodec': 'h264', 'preference': -10},
        '96': {'ext': 'mp4', 'height': 1080, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 256, 'vcodec': 'h264', 'preference': -10},
        '132': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 48, 'vcodec': 'h264', 'preference': -10},
        '151': {'ext': 'mp4', 'height': 72, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 24, 'vcodec': 'h264', 'preference': -10},

        # DASH mp4 video
        '133': {'ext': 'mp4', 'height': 240, 'format_note': 'DASH video', 'vcodec': 'h264'},
        '134': {'ext': 'mp4', 'height': 360, 'format_note': 'DASH video', 'vcodec': 'h264'},
        '135': {'ext': 'mp4', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'h264'},
        '136': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'h264'},
        '137': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'h264'},
        '138': {'ext': 'mp4', 'format_note': 'DASH video', 'vcodec': 'h264'},  # Height can vary (https://github.com/ytdl-org/youtube-dl/issues/4559)
        '160': {'ext': 'mp4', 'height': 144, 'format_note': 'DASH video', 'vcodec': 'h264'},
        '212': {'ext': 'mp4', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'h264'},
        '264': {'ext': 'mp4', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'h264'},
        '298': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'h264', 'fps': 60},
        '299': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'h264', 'fps': 60},
        '266': {'ext': 'mp4', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'h264'},

        # Dash mp4 audio
        '139': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 48, 'container': 'm4a_dash'},
        '140': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 128, 'container': 'm4a_dash'},
        '141': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 256, 'container': 'm4a_dash'},
        '256': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'container': 'm4a_dash'},
        '258': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'container': 'm4a_dash'},
        '325': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'dtse', 'container': 'm4a_dash'},
        '328': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'ec-3', 'container': 'm4a_dash'},

        # Dash webm
        '167': {'ext': 'webm', 'height': 360, 'width': 640, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
        '168': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
        '169': {'ext': 'webm', 'height': 720, 'width': 1280, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
        '170': {'ext': 'webm', 'height': 1080, 'width': 1920, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
        '218': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
        '219': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
        '278': {'ext': 'webm', 'height': 144, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp9'},
        '242': {'ext': 'webm', 'height': 240, 'format_note': 'DASH video', 'vcodec': 'vp9'},
        '243': {'ext': 'webm', 'height': 360, 'format_note': 'DASH video', 'vcodec': 'vp9'},
        '244': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'vp9'},
        '245': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'vp9'},
        '246': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'vp9'},
        '247': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'vp9'},
        '248': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'vp9'},
        '271': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'vp9'},
        # itag 272 videos are either 3840x2160 (e.g. RtoitU2A-3E) or 7680x4320 (sLprVF6d7Ug)
        '272': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'vp9'},
        '302': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60},
        '303': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60},
        '308': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60},
        '313': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'vp9'},
        '315': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60},

        # Dash webm audio
        '171': {'ext': 'webm', 'acodec': 'vorbis', 'format_note': 'DASH audio', 'abr': 128},
        '172': {'ext': 'webm', 'acodec': 'vorbis', 'format_note': 'DASH audio', 'abr': 256},

        # Dash webm audio with opus inside
        '249': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 50},
        '250': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 70},
        '251': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 160},

        # RTMP (unnamed)
        '_rtmp': {'protocol': 'rtmp'},

        # av01 video only formats sometimes served with "unknown" codecs
        '394': {'acodec': 'none', 'vcodec': 'av01.0.05M.08'},
        '395': {'acodec': 'none', 'vcodec': 'av01.0.05M.08'},
        '396': {'acodec': 'none', 'vcodec': 'av01.0.05M.08'},
        '397': {'acodec': 'none', 'vcodec': 'av01.0.05M.08'},
    }
    # Subtitle formats this extractor knows how to request
    _SUBTITLE_FORMATS = ('json3', 'srv1', 'srv2', 'srv3', 'ttml', 'vtt')

    # Messages that identify an age-restricted video
    # NOTE(review): presumably matched against the playability status of the
    # player response — confirm at the use site
    _AGE_GATE_REASONS = (
        'Sign in to confirm your age',
        'This video may be inappropriate for some users.',
        'Sorry, this content is age-restricted.')

    # Opt out of InfoExtractor's generic geo-bypass mechanism
    _GEO_BYPASS = False

    IE_NAME = 'youtube'
960 _TESTS = [
961 {
962 'url': 'https://www.youtube.com/watch?v=BaW_jenozKc&t=1s&end=9',
963 'info_dict': {
964 'id': 'BaW_jenozKc',
965 'ext': 'mp4',
966 'title': 'youtube-dl test video "\'/\\ä↭𝕐',
967 'uploader': 'Philipp Hagemeister',
968 'uploader_id': 'phihag',
969 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/phihag',
970 'channel_id': 'UCLqxVugv74EIW3VWh2NOa3Q',
971 'channel_url': r're:https?://(?:www\.)?youtube\.com/channel/UCLqxVugv74EIW3VWh2NOa3Q',
972 'upload_date': '20121002',
973 'description': 'test chars: "\'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de .',
974 'categories': ['Science & Technology'],
975 'tags': ['youtube-dl'],
976 'duration': 10,
977 'view_count': int,
978 'like_count': int,
979 'dislike_count': int,
980 'start_time': 1,
981 'end_time': 9,
982 }
983 },
984 {
985 'url': '//www.YouTube.com/watch?v=yZIXLfi8CZQ',
986 'note': 'Embed-only video (#1746)',
987 'info_dict': {
988 'id': 'yZIXLfi8CZQ',
989 'ext': 'mp4',
990 'upload_date': '20120608',
991 'title': 'Principal Sexually Assaults A Teacher - Episode 117 - 8th June 2012',
992 'description': 'md5:09b78bd971f1e3e289601dfba15ca4f7',
993 'uploader': 'SET India',
994 'uploader_id': 'setindia',
995 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/setindia',
996 'age_limit': 18,
997 },
998 'skip': 'Private video',
999 },
1000 {
1001 'url': 'https://www.youtube.com/watch?v=BaW_jenozKc&v=yZIXLfi8CZQ',
1002 'note': 'Use the first video ID in the URL',
1003 'info_dict': {
1004 'id': 'BaW_jenozKc',
1005 'ext': 'mp4',
1006 'title': 'youtube-dl test video "\'/\\ä↭𝕐',
1007 'uploader': 'Philipp Hagemeister',
1008 'uploader_id': 'phihag',
1009 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/phihag',
1010 'upload_date': '20121002',
1011 'description': 'test chars: "\'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de .',
1012 'categories': ['Science & Technology'],
1013 'tags': ['youtube-dl'],
1014 'duration': 10,
1015 'view_count': int,
1016 'like_count': int,
1017 'dislike_count': int,
1018 },
1019 'params': {
1020 'skip_download': True,
1021 },
1022 },
1023 {
1024 'url': 'https://www.youtube.com/watch?v=a9LDPn-MO4I',
1025 'note': '256k DASH audio (format 141) via DASH manifest',
1026 'info_dict': {
1027 'id': 'a9LDPn-MO4I',
1028 'ext': 'm4a',
1029 'upload_date': '20121002',
1030 'uploader_id': '8KVIDEO',
1031 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/8KVIDEO',
1032 'description': '',
1033 'uploader': '8KVIDEO',
1034 'title': 'UHDTV TEST 8K VIDEO.mp4'
1035 },
1036 'params': {
1037 'youtube_include_dash_manifest': True,
1038 'format': '141',
1039 },
1040 'skip': 'format 141 not served anymore',
1041 },
1042 # DASH manifest with encrypted signature
1043 {
1044 'url': 'https://www.youtube.com/watch?v=IB3lcPjvWLA',
1045 'info_dict': {
1046 'id': 'IB3lcPjvWLA',
1047 'ext': 'm4a',
1048 'title': 'Afrojack, Spree Wilson - The Spark (Official Music Video) ft. Spree Wilson',
1049 'description': 'md5:8f5e2b82460520b619ccac1f509d43bf',
1050 'duration': 244,
1051 'uploader': 'AfrojackVEVO',
1052 'uploader_id': 'AfrojackVEVO',
1053 'upload_date': '20131011',
1054 'abr': 129.495,
1055 },
1056 'params': {
1057 'youtube_include_dash_manifest': True,
1058 'format': '141/bestaudio[ext=m4a]',
1059 },
1060 },
1061 # Controversy video
1062 {
1063 'url': 'https://www.youtube.com/watch?v=T4XJQO3qol8',
1064 'info_dict': {
1065 'id': 'T4XJQO3qol8',
1066 'ext': 'mp4',
1067 'duration': 219,
1068 'upload_date': '20100909',
1069 'uploader': 'Amazing Atheist',
1070 'uploader_id': 'TheAmazingAtheist',
1071 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/TheAmazingAtheist',
1072 'title': 'Burning Everyone\'s Koran',
1073 'description': 'SUBSCRIBE: http://www.youtube.com/saturninefilms \r\n\r\nEven Obama has taken a stand against freedom on this issue: http://www.huffingtonpost.com/2010/09/09/obama-gma-interview-quran_n_710282.html',
1074 }
1075 },
1076 # Normal age-gate video (embed allowed)
1077 {
1078 'url': 'https://youtube.com/watch?v=HtVdAasjOgU',
1079 'info_dict': {
1080 'id': 'HtVdAasjOgU',
1081 'ext': 'mp4',
1082 'title': 'The Witcher 3: Wild Hunt - The Sword Of Destiny Trailer',
1083 'description': r're:(?s).{100,}About the Game\n.*?The Witcher 3: Wild Hunt.{100,}',
1084 'duration': 142,
1085 'uploader': 'The Witcher',
1086 'uploader_id': 'WitcherGame',
1087 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/WitcherGame',
1088 'upload_date': '20140605',
1089 'age_limit': 18,
1090 },
1091 },
1092 # video_info is None (https://github.com/ytdl-org/youtube-dl/issues/4421)
1093 # YouTube Red ad is not captured for creator
1094 {
1095 'url': '__2ABJjxzNo',
1096 'info_dict': {
1097 'id': '__2ABJjxzNo',
1098 'ext': 'mp4',
1099 'duration': 266,
1100 'upload_date': '20100430',
1101 'uploader_id': 'deadmau5',
1102 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/deadmau5',
1103 'creator': 'deadmau5',
1104 'description': 'md5:6cbcd3a92ce1bc676fc4d6ab4ace2336',
1105 'uploader': 'deadmau5',
1106 'title': 'Deadmau5 - Some Chords (HD)',
1107 'alt_title': 'Some Chords',
1108 },
1109 'expected_warnings': [
1110 'DASH manifest missing',
1111 ]
1112 },
1113 # Olympics (https://github.com/ytdl-org/youtube-dl/issues/4431)
1114 {
1115 'url': 'lqQg6PlCWgI',
1116 'info_dict': {
1117 'id': 'lqQg6PlCWgI',
1118 'ext': 'mp4',
1119 'duration': 6085,
1120 'upload_date': '20150827',
1121 'uploader_id': 'olympic',
1122 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/olympic',
1123 'description': 'HO09 - Women - GER-AUS - Hockey - 31 July 2012 - London 2012 Olympic Games',
1124 'uploader': 'Olympic',
1125 'title': 'Hockey - Women - GER-AUS - London 2012 Olympic Games',
1126 },
1127 'params': {
1128 'skip_download': 'requires avconv',
1129 }
1130 },
1131 # Non-square pixels
1132 {
1133 'url': 'https://www.youtube.com/watch?v=_b-2C3KPAM0',
1134 'info_dict': {
1135 'id': '_b-2C3KPAM0',
1136 'ext': 'mp4',
1137 'stretched_ratio': 16 / 9.,
1138 'duration': 85,
1139 'upload_date': '20110310',
1140 'uploader_id': 'AllenMeow',
1141 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/AllenMeow',
1142 'description': 'made by Wacom from Korea | 字幕&加油添醋 by TY\'s Allen | 感謝heylisa00cavey1001同學熱情提供梗及翻譯',
1143 'uploader': '孫ᄋᄅ',
1144 'title': '[A-made] 變態妍字幕版 太妍 我就是這樣的人',
1145 },
1146 },
1147 # url_encoded_fmt_stream_map is empty string
1148 {
1149 'url': 'qEJwOuvDf7I',
1150 'info_dict': {
1151 'id': 'qEJwOuvDf7I',
1152 'ext': 'webm',
1153 'title': 'Обсуждение судебной практики по выборам 14 сентября 2014 года в Санкт-Петербурге',
1154 'description': '',
1155 'upload_date': '20150404',
1156 'uploader_id': 'spbelect',
1157 'uploader': 'Наблюдатели Петербурга',
1158 },
1159 'params': {
1160 'skip_download': 'requires avconv',
1161 },
1162 'skip': 'This live event has ended.',
1163 },
1164 # Extraction from multiple DASH manifests (https://github.com/ytdl-org/youtube-dl/pull/6097)
1165 {
1166 'url': 'https://www.youtube.com/watch?v=FIl7x6_3R5Y',
1167 'info_dict': {
1168 'id': 'FIl7x6_3R5Y',
1169 'ext': 'webm',
1170 'title': 'md5:7b81415841e02ecd4313668cde88737a',
1171 'description': 'md5:116377fd2963b81ec4ce64b542173306',
1172 'duration': 220,
1173 'upload_date': '20150625',
1174 'uploader_id': 'dorappi2000',
1175 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/dorappi2000',
1176 'uploader': 'dorappi2000',
1177 'formats': 'mincount:31',
1178 },
1179 'skip': 'not actual anymore',
1180 },
1181 # DASH manifest with segment_list
1182 {
1183 'url': 'https://www.youtube.com/embed/CsmdDsKjzN8',
1184 'md5': '8ce563a1d667b599d21064e982ab9e31',
1185 'info_dict': {
1186 'id': 'CsmdDsKjzN8',
1187 'ext': 'mp4',
1188 'upload_date': '20150501', # According to '<meta itemprop="datePublished"', but in other places it's 20150510
1189 'uploader': 'Airtek',
1190 'description': 'Retransmisión en directo de la XVIII media maratón de Zaragoza.',
1191 'uploader_id': 'UCzTzUmjXxxacNnL8I3m4LnQ',
1192 'title': 'Retransmisión XVIII Media maratón Zaragoza 2015',
1193 },
1194 'params': {
1195 'youtube_include_dash_manifest': True,
1196 'format': '135', # bestvideo
1197 },
1198 'skip': 'This live event has ended.',
1199 },
1200 {
1201 # Multifeed videos (multiple cameras), URL is for Main Camera
1202 'url': 'https://www.youtube.com/watch?v=jvGDaLqkpTg',
1203 'info_dict': {
1204 'id': 'jvGDaLqkpTg',
1205 'title': 'Tom Clancy Free Weekend Rainbow Whatever',
1206 'description': 'md5:e03b909557865076822aa169218d6a5d',
1207 },
1208 'playlist': [{
1209 'info_dict': {
1210 'id': 'jvGDaLqkpTg',
1211 'ext': 'mp4',
1212 'title': 'Tom Clancy Free Weekend Rainbow Whatever (Main Camera)',
1213 'description': 'md5:e03b909557865076822aa169218d6a5d',
1214 'duration': 10643,
1215 'upload_date': '20161111',
1216 'uploader': 'Team PGP',
1217 'uploader_id': 'UChORY56LMMETTuGjXaJXvLg',
1218 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UChORY56LMMETTuGjXaJXvLg',
1219 },
1220 }, {
1221 'info_dict': {
1222 'id': '3AKt1R1aDnw',
1223 'ext': 'mp4',
1224 'title': 'Tom Clancy Free Weekend Rainbow Whatever (Camera 2)',
1225 'description': 'md5:e03b909557865076822aa169218d6a5d',
1226 'duration': 10991,
1227 'upload_date': '20161111',
1228 'uploader': 'Team PGP',
1229 'uploader_id': 'UChORY56LMMETTuGjXaJXvLg',
1230 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UChORY56LMMETTuGjXaJXvLg',
1231 },
1232 }, {
1233 'info_dict': {
1234 'id': 'RtAMM00gpVc',
1235 'ext': 'mp4',
1236 'title': 'Tom Clancy Free Weekend Rainbow Whatever (Camera 3)',
1237 'description': 'md5:e03b909557865076822aa169218d6a5d',
1238 'duration': 10995,
1239 'upload_date': '20161111',
1240 'uploader': 'Team PGP',
1241 'uploader_id': 'UChORY56LMMETTuGjXaJXvLg',
1242 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UChORY56LMMETTuGjXaJXvLg',
1243 },
1244 }, {
1245 'info_dict': {
1246 'id': '6N2fdlP3C5U',
1247 'ext': 'mp4',
1248 'title': 'Tom Clancy Free Weekend Rainbow Whatever (Camera 4)',
1249 'description': 'md5:e03b909557865076822aa169218d6a5d',
1250 'duration': 10990,
1251 'upload_date': '20161111',
1252 'uploader': 'Team PGP',
1253 'uploader_id': 'UChORY56LMMETTuGjXaJXvLg',
1254 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UChORY56LMMETTuGjXaJXvLg',
1255 },
1256 }],
1257 'params': {
1258 'skip_download': True,
1259 },
1260 },
1261 {
1262 # Multifeed video with comma in title (see https://github.com/ytdl-org/youtube-dl/issues/8536)
1263 'url': 'https://www.youtube.com/watch?v=gVfLd0zydlo',
1264 'info_dict': {
1265 'id': 'gVfLd0zydlo',
1266 'title': 'DevConf.cz 2016 Day 2 Workshops 1 14:00 - 15:30',
1267 },
1268 'playlist_count': 2,
1269 'skip': 'Not multifeed anymore',
1270 },
1271 {
1272 'url': 'https://vid.plus/FlRa-iH7PGw',
1273 'only_matching': True,
1274 },
1275 {
1276 'url': 'https://zwearz.com/watch/9lWxNJF-ufM/electra-woman-dyna-girl-official-trailer-grace-helbig.html',
1277 'only_matching': True,
1278 },
1279 {
1280 # Title with JS-like syntax "};" (see https://github.com/ytdl-org/youtube-dl/issues/7468)
1281 # Also tests cut-off URL expansion in video description (see
1282 # https://github.com/ytdl-org/youtube-dl/issues/1892,
1283 # https://github.com/ytdl-org/youtube-dl/issues/8164)
1284 'url': 'https://www.youtube.com/watch?v=lsguqyKfVQg',
1285 'info_dict': {
1286 'id': 'lsguqyKfVQg',
1287 'ext': 'mp4',
1288 'title': '{dark walk}; Loki/AC/Dishonored; collab w/Elflover21',
1289 'alt_title': 'Dark Walk - Position Music',
1290 'description': 'md5:8085699c11dc3f597ce0410b0dcbb34a',
1291 'duration': 133,
1292 'upload_date': '20151119',
1293 'uploader_id': 'IronSoulElf',
1294 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/IronSoulElf',
1295 'uploader': 'IronSoulElf',
1296 'creator': 'Todd Haberman, Daniel Law Heath and Aaron Kaplan',
1297 'track': 'Dark Walk - Position Music',
1298 'artist': 'Todd Haberman, Daniel Law Heath and Aaron Kaplan',
1299 'album': 'Position Music - Production Music Vol. 143 - Dark Walk',
1300 },
1301 'params': {
1302 'skip_download': True,
1303 },
1304 },
1305 {
1306 # Tags with '};' (see https://github.com/ytdl-org/youtube-dl/issues/7468)
1307 'url': 'https://www.youtube.com/watch?v=Ms7iBXnlUO8',
1308 'only_matching': True,
1309 },
1310 {
1311 # Video with yt:stretch=17:0
1312 'url': 'https://www.youtube.com/watch?v=Q39EVAstoRM',
1313 'info_dict': {
1314 'id': 'Q39EVAstoRM',
1315 'ext': 'mp4',
1316 'title': 'Clash Of Clans#14 Dicas De Ataque Para CV 4',
1317 'description': 'md5:ee18a25c350637c8faff806845bddee9',
1318 'upload_date': '20151107',
1319 'uploader_id': 'UCCr7TALkRbo3EtFzETQF1LA',
1320 'uploader': 'CH GAMER DROID',
1321 },
1322 'params': {
1323 'skip_download': True,
1324 },
1325 'skip': 'This video does not exist.',
1326 },
1327 {
1328 # Video with incomplete 'yt:stretch=16:'
1329 'url': 'https://www.youtube.com/watch?v=FRhJzUSJbGI',
1330 'only_matching': True,
1331 },
1332 {
1333 # Video licensed under Creative Commons
1334 'url': 'https://www.youtube.com/watch?v=M4gD1WSo5mA',
1335 'info_dict': {
1336 'id': 'M4gD1WSo5mA',
1337 'ext': 'mp4',
1338 'title': 'md5:e41008789470fc2533a3252216f1c1d1',
1339 'description': 'md5:a677553cf0840649b731a3024aeff4cc',
1340 'duration': 721,
1341 'upload_date': '20150127',
1342 'uploader_id': 'BerkmanCenter',
1343 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/BerkmanCenter',
1344 'uploader': 'The Berkman Klein Center for Internet & Society',
1345 'license': 'Creative Commons Attribution license (reuse allowed)',
1346 },
1347 'params': {
1348 'skip_download': True,
1349 },
1350 },
1351 {
1352 # Channel-like uploader_url
1353 'url': 'https://www.youtube.com/watch?v=eQcmzGIKrzg',
1354 'info_dict': {
1355 'id': 'eQcmzGIKrzg',
1356 'ext': 'mp4',
1357 'title': 'Democratic Socialism and Foreign Policy | Bernie Sanders',
1358 'description': 'md5:13a2503d7b5904ef4b223aa101628f39',
1359 'duration': 4060,
1360 'upload_date': '20151119',
1361 'uploader': 'Bernie Sanders',
1362 'uploader_id': 'UCH1dpzjCEiGAt8CXkryhkZg',
1363 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UCH1dpzjCEiGAt8CXkryhkZg',
1364 'license': 'Creative Commons Attribution license (reuse allowed)',
1365 },
1366 'params': {
1367 'skip_download': True,
1368 },
1369 },
1370 {
1371 'url': 'https://www.youtube.com/watch?feature=player_embedded&amp;amp;v=V36LpHqtcDY',
1372 'only_matching': True,
1373 },
1374 {
1375 # YouTube Red paid video (https://github.com/ytdl-org/youtube-dl/issues/10059)
1376 'url': 'https://www.youtube.com/watch?v=i1Ko8UG-Tdo',
1377 'only_matching': True,
1378 },
1379 {
1380 # Rental video preview
1381 'url': 'https://www.youtube.com/watch?v=yYr8q0y5Jfg',
1382 'info_dict': {
1383 'id': 'uGpuVWrhIzE',
1384 'ext': 'mp4',
1385 'title': 'Piku - Trailer',
1386 'description': 'md5:c36bd60c3fd6f1954086c083c72092eb',
1387 'upload_date': '20150811',
1388 'uploader': 'FlixMatrix',
1389 'uploader_id': 'FlixMatrixKaravan',
1390 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/FlixMatrixKaravan',
1391 'license': 'Standard YouTube License',
1392 },
1393 'params': {
1394 'skip_download': True,
1395 },
1396 'skip': 'This video is not available.',
1397 },
1398 {
1399 # YouTube Red video with episode data
1400 'url': 'https://www.youtube.com/watch?v=iqKdEhx-dD4',
1401 'info_dict': {
1402 'id': 'iqKdEhx-dD4',
1403 'ext': 'mp4',
1404 'title': 'Isolation - Mind Field (Ep 1)',
1405 'description': 'md5:f540112edec5d09fc8cc752d3d4ba3cd',
1406 'duration': 2085,
1407 'upload_date': '20170118',
1408 'uploader': 'Vsauce',
1409 'uploader_id': 'Vsauce',
1410 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/Vsauce',
1411 'series': 'Mind Field',
1412 'season_number': 1,
1413 'episode_number': 1,
1414 },
1415 'params': {
1416 'skip_download': True,
1417 },
1418 'expected_warnings': [
1419 'Skipping DASH manifest',
1420 ],
1421 },
1422 {
1423 # The following content has been identified by the YouTube community
1424 # as inappropriate or offensive to some audiences.
1425 'url': 'https://www.youtube.com/watch?v=6SJNVb0GnPI',
1426 'info_dict': {
1427 'id': '6SJNVb0GnPI',
1428 'ext': 'mp4',
1429 'title': 'Race Differences in Intelligence',
1430 'description': 'md5:5d161533167390427a1f8ee89a1fc6f1',
1431 'duration': 965,
1432 'upload_date': '20140124',
1433 'uploader': 'New Century Foundation',
1434 'uploader_id': 'UCEJYpZGqgUob0zVVEaLhvVg',
1435 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UCEJYpZGqgUob0zVVEaLhvVg',
1436 },
1437 'params': {
1438 'skip_download': True,
1439 },
1440 'skip': 'This video has been removed for violating YouTube\'s policy on hate speech.',
1441 },
1442 {
1443 # itag 212
1444 'url': '1t24XAntNCY',
1445 'only_matching': True,
1446 },
1447 {
1448 # geo restricted to JP
1449 'url': 'sJL6WA-aGkQ',
1450 'only_matching': True,
1451 },
1452 {
1453 'url': 'https://invidio.us/watch?v=BaW_jenozKc',
1454 'only_matching': True,
1455 },
1456 {
1457 'url': 'https://redirect.invidious.io/watch?v=BaW_jenozKc',
1458 'only_matching': True,
1459 },
1460 {
1461 # from https://nitter.pussthecat.org/YouTube/status/1360363141947944964#m
1462 'url': 'https://redirect.invidious.io/Yh0AhrY9GjA',
1463 'only_matching': True,
1464 },
1465 {
1466 # DRM protected
1467 'url': 'https://www.youtube.com/watch?v=s7_qI6_mIXc',
1468 'only_matching': True,
1469 },
1470 {
1471 # Video with unsupported adaptive stream type formats
1472 'url': 'https://www.youtube.com/watch?v=Z4Vy8R84T1U',
1473 'info_dict': {
1474 'id': 'Z4Vy8R84T1U',
1475 'ext': 'mp4',
1476 'title': 'saman SMAN 53 Jakarta(Sancety) opening COFFEE4th at SMAN 53 Jakarta',
1477 'description': 'md5:d41d8cd98f00b204e9800998ecf8427e',
1478 'duration': 433,
1479 'upload_date': '20130923',
1480 'uploader': 'Amelia Putri Harwita',
1481 'uploader_id': 'UCpOxM49HJxmC1qCalXyB3_Q',
1482 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UCpOxM49HJxmC1qCalXyB3_Q',
1483 'formats': 'maxcount:10',
1484 },
1485 'params': {
1486 'skip_download': True,
1487 'youtube_include_dash_manifest': False,
1488 },
1489 'skip': 'not actual anymore',
1490 },
1491 {
1492 # Youtube Music Auto-generated description
1493 'url': 'https://music.youtube.com/watch?v=MgNrAu2pzNs',
1494 'info_dict': {
1495 'id': 'MgNrAu2pzNs',
1496 'ext': 'mp4',
1497 'title': 'Voyeur Girl',
1498 'description': 'md5:7ae382a65843d6df2685993e90a8628f',
1499 'upload_date': '20190312',
1500 'uploader': 'Stephen - Topic',
1501 'uploader_id': 'UC-pWHpBjdGG69N9mM2auIAA',
1502 'artist': 'Stephen',
1503 'track': 'Voyeur Girl',
1504 'album': 'it\'s too much love to know my dear',
1505 'release_date': '20190313',
1506 'release_year': 2019,
1507 },
1508 'params': {
1509 'skip_download': True,
1510 },
1511 },
1512 {
1513 'url': 'https://www.youtubekids.com/watch?v=3b8nCWDgZ6Q',
1514 'only_matching': True,
1515 },
1516 {
1517 # invalid -> valid video id redirection
1518 'url': 'DJztXj2GPfl',
1519 'info_dict': {
1520 'id': 'DJztXj2GPfk',
1521 'ext': 'mp4',
1522 'title': 'Panjabi MC - Mundian To Bach Ke (The Dictator Soundtrack)',
1523 'description': 'md5:bf577a41da97918e94fa9798d9228825',
1524 'upload_date': '20090125',
1525 'uploader': 'Prochorowka',
1526 'uploader_id': 'Prochorowka',
1527 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/Prochorowka',
1528 'artist': 'Panjabi MC',
1529 'track': 'Beware of the Boys (Mundian to Bach Ke) - Motivo Hi-Lectro Remix',
1530 'album': 'Beware of the Boys (Mundian To Bach Ke)',
1531 },
1532 'params': {
1533 'skip_download': True,
1534 },
1535 'skip': 'Video unavailable',
1536 },
1537 {
1538 # empty description results in an empty string
1539 'url': 'https://www.youtube.com/watch?v=x41yOUIvK2k',
1540 'info_dict': {
1541 'id': 'x41yOUIvK2k',
1542 'ext': 'mp4',
1543 'title': 'IMG 3456',
1544 'description': '',
1545 'upload_date': '20170613',
1546 'uploader_id': 'ElevageOrVert',
1547 'uploader': 'ElevageOrVert',
1548 },
1549 'params': {
1550 'skip_download': True,
1551 },
1552 },
1553 {
1554 # with '};' inside yt initial data (see [1])
1555 # see [2] for an example with '};' inside ytInitialPlayerResponse
1556 # 1. https://github.com/ytdl-org/youtube-dl/issues/27093
1557 # 2. https://github.com/ytdl-org/youtube-dl/issues/27216
1558 'url': 'https://www.youtube.com/watch?v=CHqg6qOn4no',
1559 'info_dict': {
1560 'id': 'CHqg6qOn4no',
1561 'ext': 'mp4',
1562 'title': 'Part 77 Sort a list of simple types in c#',
1563 'description': 'md5:b8746fa52e10cdbf47997903f13b20dc',
1564 'upload_date': '20130831',
1565 'uploader_id': 'kudvenkat',
1566 'uploader': 'kudvenkat',
1567 },
1568 'params': {
1569 'skip_download': True,
1570 },
1571 },
1572 {
1573 # another example of '};' in ytInitialData
1574 'url': 'https://www.youtube.com/watch?v=gVfgbahppCY',
1575 'only_matching': True,
1576 },
1577 {
1578 'url': 'https://www.youtube.com/watch_popup?v=63RmMXCd_bQ',
1579 'only_matching': True,
1580 },
1581 {
1582 # https://github.com/ytdl-org/youtube-dl/pull/28094
1583 'url': 'OtqTfy26tG0',
1584 'info_dict': {
1585 'id': 'OtqTfy26tG0',
1586 'ext': 'mp4',
1587 'title': 'Burn Out',
1588 'description': 'md5:8d07b84dcbcbfb34bc12a56d968b6131',
1589 'upload_date': '20141120',
1590 'uploader': 'The Cinematic Orchestra - Topic',
1591 'uploader_id': 'UCIzsJBIyo8hhpFm1NK0uLgw',
1592 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UCIzsJBIyo8hhpFm1NK0uLgw',
1593 'artist': 'The Cinematic Orchestra',
1594 'track': 'Burn Out',
1595 'album': 'Every Day',
1596 'release_data': None,
1597 'release_year': None,
1598 },
1599 'params': {
1600 'skip_download': True,
1601 },
1602 },
1603 {
1604 # controversial video, only works with bpctr when authenticated with cookies
1605 'url': 'https://www.youtube.com/watch?v=nGC3D_FkCmg',
1606 'only_matching': True,
1607 },
1608 {
1609 # restricted location, https://github.com/ytdl-org/youtube-dl/issues/28685
1610 'url': 'cBvYw8_A0vQ',
1611 'info_dict': {
1612 'id': 'cBvYw8_A0vQ',
1613 'ext': 'mp4',
1614 'title': '4K Ueno Okachimachi Street Scenes 上野御徒町歩き',
1615 'description': 'md5:ea770e474b7cd6722b4c95b833c03630',
1616 'upload_date': '20201120',
1617 'uploader': 'Walk around Japan',
1618 'uploader_id': 'UC3o_t8PzBmXf5S9b7GLx1Mw',
1619 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UC3o_t8PzBmXf5S9b7GLx1Mw',
1620 },
1621 'params': {
1622 'skip_download': True,
1623 },
1624 }, {
1625 # Has multiple audio streams
1626 'url': 'WaOKSUlf4TM',
1627 'only_matching': True
1628 }, {
1629 # Requires Premium: has format 141 when requested using YTM url
1630 'url': 'https://music.youtube.com/watch?v=XclachpHxis',
1631 'only_matching': True
1632 }, {
1633 # multiple subtitles with same lang_code
1634 'url': 'https://www.youtube.com/watch?v=wsQiKKfKxug',
1635 'only_matching': True,
1636 }, {
1637 # Force use android client fallback
1638 'url': 'https://www.youtube.com/watch?v=YOelRv7fMxY',
1639 'info_dict': {
1640 'id': 'YOelRv7fMxY',
1641 'title': 'Digging a Secret Tunnel from my Workshop',
1642 'ext': '3gp',
1643 'upload_date': '20210624',
1644 'channel_id': 'UCp68_FLety0O-n9QU6phsgw',
1645 'uploader': 'colinfurze',
1646 'channel_url': r're:https?://(?:www\.)?youtube\.com/channel/UCp68_FLety0O-n9QU6phsgw',
1647 'description': 'md5:ecb672623246d98c6c562eed6ae798c3'
1648 },
1649 'params': {
1650 'format': '17', # 3gp format available on android
1651 'extractor_args': {'youtube': {'player_client': ['android']}},
1652 },
1653 },
1654 {
1655 # Skip download of additional client configs (remix client config in this case)
1656 'url': 'https://music.youtube.com/watch?v=MgNrAu2pzNs',
1657 'only_matching': True,
1658 'params': {
1659 'extractor_args': {'youtube': {'player_skip': ['configs']}},
1660 },
1661 }
1662 ]
1663
1664 @classmethod
1665 def suitable(cls, url):
1666 # Hack for lazy extractors until more generic solution is implemented
1667 # (see #28780)
1668 from .youtube import parse_qs
1669 qs = parse_qs(url)
1670 if qs.get('list', [None])[0]:
1671 return False
1672 return super(YoutubeIE, cls).suitable(url)
1673
1674 def __init__(self, *args, **kwargs):
1675 super(YoutubeIE, self).__init__(*args, **kwargs)
1676 self._code_cache = {}
1677 self._player_cache = {}
1678
1679 def _extract_player_url(self, ytcfg=None, webpage=None):
1680 player_url = try_get(ytcfg, (lambda x: x['PLAYER_JS_URL']), str)
1681 if not player_url:
1682 player_url = self._search_regex(
1683 r'"(?:PLAYER_JS_URL|jsUrl)"\s*:\s*"([^"]+)"',
1684 webpage, 'player URL', fatal=False)
1685 if player_url.startswith('//'):
1686 player_url = 'https:' + player_url
1687 elif not re.match(r'https?://', player_url):
1688 player_url = compat_urlparse.urljoin(
1689 'https://www.youtube.com', player_url)
1690 return player_url
1691
1692 def _signature_cache_id(self, example_sig):
1693 """ Return a string representation of a signature """
1694 return '.'.join(compat_str(len(part)) for part in example_sig.split('.'))
1695
1696 @classmethod
1697 def _extract_player_info(cls, player_url):
1698 for player_re in cls._PLAYER_INFO_RE:
1699 id_m = re.search(player_re, player_url)
1700 if id_m:
1701 break
1702 else:
1703 raise ExtractorError('Cannot identify player %r' % player_url)
1704 return id_m.group('id')
1705
1706 def _load_player(self, video_id, player_url, fatal=True) -> bool:
1707 player_id = self._extract_player_info(player_url)
1708 if player_id not in self._code_cache:
1709 self._code_cache[player_id] = self._download_webpage(
1710 player_url, video_id, fatal=fatal,
1711 note='Downloading player ' + player_id,
1712 errnote='Download of %s failed' % player_url)
1713 return player_id in self._code_cache
1714
1715 def _extract_signature_function(self, video_id, player_url, example_sig):
1716 player_id = self._extract_player_info(player_url)
1717
1718 # Read from filesystem cache
1719 func_id = 'js_%s_%s' % (
1720 player_id, self._signature_cache_id(example_sig))
1721 assert os.path.basename(func_id) == func_id
1722
1723 cache_spec = self._downloader.cache.load('youtube-sigfuncs', func_id)
1724 if cache_spec is not None:
1725 return lambda s: ''.join(s[i] for i in cache_spec)
1726
1727 if self._load_player(video_id, player_url):
1728 code = self._code_cache[player_id]
1729 res = self._parse_sig_js(code)
1730
1731 test_string = ''.join(map(compat_chr, range(len(example_sig))))
1732 cache_res = res(test_string)
1733 cache_spec = [ord(c) for c in cache_res]
1734
1735 self._downloader.cache.store('youtube-sigfuncs', func_id, cache_spec)
1736 return res
1737
    def _print_sig_code(self, func, example_sig):
        """Print Python source equivalent to the extracted signature function.

        Runs func on a probe string, recovers the index permutation and renders
        it as compact slice/index expressions (debug aid for
        --youtube-print-sig-code).
        """
        def gen_sig_code(idxs):
            def _genslice(start, end, step):
                # Render a Python slice expression, omitting redundant parts
                # (leading 0, unit step, open-ended stop)
                starts = '' if start == 0 else str(start)
                ends = (':%d' % (end + step)) if end + step >= 0 else ':'
                steps = '' if step == 1 else (':%d' % step)
                return 's[%s%s%s]' % (starts, ends, steps)

            step = None
            # Quelch pyflakes warnings - start will be set when step is set
            start = '(Never used)'
            for i, prev in zip(idxs[1:], idxs[:-1]):
                # Detect runs of consecutive indices (step +1/-1) and emit them
                # as a single slice; isolated indices are emitted one by one
                if step is not None:
                    if i - prev == step:
                        continue
                    yield _genslice(start, prev, step)
                    step = None
                    continue
                if i - prev in [-1, 1]:
                    step = i - prev
                    start = prev
                    continue
                else:
                    yield 's[%d]' % prev
            # Flush the final element or the run still in progress
            if step is None:
                yield 's[%d]' % i
            else:
                yield _genslice(start, i, step)

        test_string = ''.join(map(compat_chr, range(len(example_sig))))
        cache_res = func(test_string)
        cache_spec = [ord(c) for c in cache_res]
        expr_code = ' + '.join(gen_sig_code(cache_spec))
        signature_id_tuple = '(%s)' % (
            ', '.join(compat_str(len(p)) for p in example_sig.split('.')))
        code = ('if tuple(len(p) for p in s.split(\'.\')) == %s:\n'
                '    return %s\n') % (signature_id_tuple, expr_code)
        self.to_screen('Extracted signature function:\n' + code)
1776
    def _parse_sig_js(self, jscode):
        """Locate the signature-scrambling function in player JS and wrap it.

        Tries a battery of regexes (newest player layouts first, obsolete ones
        last) to find the function name, then interprets it with JSInterpreter.
        Returns a callable mapping an encrypted signature string to its
        decrypted form.
        """
        funcname = self._search_regex(
            (r'\b[cs]\s*&&\s*[adf]\.set\([^,]+\s*,\s*encodeURIComponent\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\(',
             r'\b[a-zA-Z0-9]+\s*&&\s*[a-zA-Z0-9]+\.set\([^,]+\s*,\s*encodeURIComponent\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\(',
             r'\bm=(?P<sig>[a-zA-Z0-9$]{2})\(decodeURIComponent\(h\.s\)\)',
             r'\bc&&\(c=(?P<sig>[a-zA-Z0-9$]{2})\(decodeURIComponent\(c\)\)',
             r'(?:\b|[^a-zA-Z0-9$])(?P<sig>[a-zA-Z0-9$]{2})\s*=\s*function\(\s*a\s*\)\s*{\s*a\s*=\s*a\.split\(\s*""\s*\);[a-zA-Z0-9$]{2}\.[a-zA-Z0-9$]{2}\(a,\d+\)',
             r'(?:\b|[^a-zA-Z0-9$])(?P<sig>[a-zA-Z0-9$]{2})\s*=\s*function\(\s*a\s*\)\s*{\s*a\s*=\s*a\.split\(\s*""\s*\)',
             r'(?P<sig>[a-zA-Z0-9$]+)\s*=\s*function\(\s*a\s*\)\s*{\s*a\s*=\s*a\.split\(\s*""\s*\)',
             # Obsolete patterns
             r'(["\'])signature\1\s*,\s*(?P<sig>[a-zA-Z0-9$]+)\(',
             r'\.sig\|\|(?P<sig>[a-zA-Z0-9$]+)\(',
             r'yt\.akamaized\.net/\)\s*\|\|\s*.*?\s*[cs]\s*&&\s*[adf]\.set\([^,]+\s*,\s*(?:encodeURIComponent\s*\()?\s*(?P<sig>[a-zA-Z0-9$]+)\(',
             r'\b[cs]\s*&&\s*[adf]\.set\([^,]+\s*,\s*(?P<sig>[a-zA-Z0-9$]+)\(',
             r'\b[a-zA-Z0-9]+\s*&&\s*[a-zA-Z0-9]+\.set\([^,]+\s*,\s*(?P<sig>[a-zA-Z0-9$]+)\(',
             r'\bc\s*&&\s*a\.set\([^,]+\s*,\s*\([^)]*\)\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\(',
             r'\bc\s*&&\s*[a-zA-Z0-9]+\.set\([^,]+\s*,\s*\([^)]*\)\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\(',
             r'\bc\s*&&\s*[a-zA-Z0-9]+\.set\([^,]+\s*,\s*\([^)]*\)\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\('),
            jscode, 'Initial JS player signature function name', group='sig')

        jsi = JSInterpreter(jscode)
        initial_function = jsi.extract_function(funcname)
        # The extracted function takes its arguments as a list
        return lambda s: initial_function([s])
1800
1801 def _decrypt_signature(self, s, video_id, player_url):
1802 """Turn the encrypted s field into a working signature"""
1803
1804 if player_url is None:
1805 raise ExtractorError('Cannot decrypt signature without player_url')
1806
1807 try:
1808 player_id = (player_url, self._signature_cache_id(s))
1809 if player_id not in self._player_cache:
1810 func = self._extract_signature_function(
1811 video_id, player_url, s
1812 )
1813 self._player_cache[player_id] = func
1814 func = self._player_cache[player_id]
1815 if self.get_param('youtube_print_sig_code'):
1816 self._print_sig_code(func, s)
1817 return func(s)
1818 except Exception as e:
1819 tb = traceback.format_exc()
1820 raise ExtractorError(
1821 'Signature extraction failed: ' + tb, cause=e)
1822
1823 def _extract_signature_timestamp(self, video_id, player_url, ytcfg=None, fatal=False):
1824 """
1825 Extract signatureTimestamp (sts)
1826 Required to tell API what sig/player version is in use.
1827 """
1828 sts = None
1829 if isinstance(ytcfg, dict):
1830 sts = int_or_none(ytcfg.get('STS'))
1831
1832 if not sts:
1833 # Attempt to extract from player
1834 if player_url is None:
1835 error_msg = 'Cannot extract signature timestamp without player_url.'
1836 if fatal:
1837 raise ExtractorError(error_msg)
1838 self.report_warning(error_msg)
1839 return
1840 if self._load_player(video_id, player_url, fatal=fatal):
1841 player_id = self._extract_player_info(player_url)
1842 code = self._code_cache[player_id]
1843 sts = int_or_none(self._search_regex(
1844 r'(?:signatureTimestamp|sts)\s*:\s*(?P<sts>[0-9]{5})', code,
1845 'JS player signature timestamp', group='sts', fatal=fatal))
1846 return sts
1847
1848 def _mark_watched(self, video_id, player_response):
1849 playback_url = url_or_none(try_get(
1850 player_response,
1851 lambda x: x['playbackTracking']['videostatsPlaybackUrl']['baseUrl']))
1852 if not playback_url:
1853 return
1854 parsed_playback_url = compat_urlparse.urlparse(playback_url)
1855 qs = compat_urlparse.parse_qs(parsed_playback_url.query)
1856
1857 # cpn generation algorithm is reverse engineered from base.js.
1858 # In fact it works even with dummy cpn.
1859 CPN_ALPHABET = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789-_'
1860 cpn = ''.join((CPN_ALPHABET[random.randint(0, 256) & 63] for _ in range(0, 16)))
1861
1862 qs.update({
1863 'ver': ['2'],
1864 'cpn': [cpn],
1865 })
1866 playback_url = compat_urlparse.urlunparse(
1867 parsed_playback_url._replace(query=compat_urllib_parse_urlencode(qs, True)))
1868
1869 self._download_webpage(
1870 playback_url, video_id, 'Marking watched',
1871 'Unable to mark watched', fatal=False)
1872
    @staticmethod
    def _extract_urls(webpage):
        """Return all embedded YouTube URLs/ids found in an arbitrary webpage."""
        # Embedded YouTube player
        entries = [
            unescapeHTML(mobj.group('url'))
            for mobj in re.finditer(r'''(?x)
            (?:
                <iframe[^>]+?src=|
                data-video-url=|
                <embed[^>]+?src=|
                embedSWF\(?:\s*|
                <object[^>]+data=|
                new\s+SWFObject\(
            )
            (["\'])
                (?P<url>(?:https?:)?//(?:www\.)?youtube(?:-nocookie)?\.com/
                (?:embed|v|p)/[0-9A-Za-z_-]{11}.*?)
            \1''', webpage)]

        # lazyYT YouTube embed
        entries.extend(list(map(
            unescapeHTML,
            re.findall(r'class="lazyYT" data-youtube-id="([^"]+)"', webpage))))

        # Wordpress "YouTube Video Importer" plugin
        # NOTE: the last capture group of each match is the raw video id
        matches = re.findall(r'''(?x)<div[^>]+
            class=(?P<q1>[\'"])[^\'"]*\byvii_single_video_player\b[^\'"]*(?P=q1)[^>]+
            data-video_id=(?P<q2>[\'"])([^\'"]+)(?P=q2)''', webpage)
        entries.extend(m[-1] for m in matches)

        return entries
1904
1905 @staticmethod
1906 def _extract_url(webpage):
1907 urls = YoutubeIE._extract_urls(webpage)
1908 return urls[0] if urls else None
1909
1910 @classmethod
1911 def extract_id(cls, url):
1912 mobj = re.match(cls._VALID_URL, url, re.VERBOSE)
1913 if mobj is None:
1914 raise ExtractorError('Invalid URL: %s' % url)
1915 video_id = mobj.group(2)
1916 return video_id
1917
1918 def _extract_chapters_from_json(self, data, video_id, duration):
1919 chapters_list = try_get(
1920 data,
1921 lambda x: x['playerOverlays']
1922 ['playerOverlayRenderer']
1923 ['decoratedPlayerBarRenderer']
1924 ['decoratedPlayerBarRenderer']
1925 ['playerBar']
1926 ['chapteredPlayerBarRenderer']
1927 ['chapters'],
1928 list)
1929 if not chapters_list:
1930 return
1931
1932 def chapter_time(chapter):
1933 return float_or_none(
1934 try_get(
1935 chapter,
1936 lambda x: x['chapterRenderer']['timeRangeStartMillis'],
1937 int),
1938 scale=1000)
1939 chapters = []
1940 for next_num, chapter in enumerate(chapters_list, start=1):
1941 start_time = chapter_time(chapter)
1942 if start_time is None:
1943 continue
1944 end_time = (chapter_time(chapters_list[next_num])
1945 if next_num < len(chapters_list) else duration)
1946 if end_time is None:
1947 continue
1948 title = try_get(
1949 chapter, lambda x: x['chapterRenderer']['title']['simpleText'],
1950 compat_str)
1951 chapters.append({
1952 'start_time': start_time,
1953 'end_time': end_time,
1954 'title': title,
1955 })
1956 return chapters
1957
1958 def _extract_yt_initial_variable(self, webpage, regex, video_id, name):
1959 return self._parse_json(self._search_regex(
1960 (r'%s\s*%s' % (regex, self._YT_INITIAL_BOUNDARY_RE),
1961 regex), webpage, name, default='{}'), video_id, fatal=False)
1962
1963 @staticmethod
1964 def parse_time_text(time_text):
1965 """
1966 Parse the comment time text
1967 time_text is in the format 'X units ago (edited)'
1968 """
1969 time_text_split = time_text.split(' ')
1970 if len(time_text_split) >= 3:
1971 return datetime_from_str('now-%s%s' % (time_text_split[0], time_text_split[1]), precision='auto')
1972
1973 @staticmethod
1974 def _join_text_entries(runs):
1975 text = None
1976 for run in runs:
1977 if not isinstance(run, dict):
1978 continue
1979 sub_text = try_get(run, lambda x: x['text'], compat_str)
1980 if sub_text:
1981 if not text:
1982 text = sub_text
1983 continue
1984 text += sub_text
1985 return text
1986
1987 def _extract_comment(self, comment_renderer, parent=None):
1988 comment_id = comment_renderer.get('commentId')
1989 if not comment_id:
1990 return
1991 comment_text_runs = try_get(comment_renderer, lambda x: x['contentText']['runs']) or []
1992 text = self._join_text_entries(comment_text_runs) or ''
1993 comment_time_text = try_get(comment_renderer, lambda x: x['publishedTimeText']['runs']) or []
1994 time_text = self._join_text_entries(comment_time_text)
1995 timestamp = calendar.timegm(self.parse_time_text(time_text).timetuple())
1996 author = try_get(comment_renderer, lambda x: x['authorText']['simpleText'], compat_str)
1997 author_id = try_get(comment_renderer,
1998 lambda x: x['authorEndpoint']['browseEndpoint']['browseId'], compat_str)
1999 votes = str_to_int(try_get(comment_renderer, (lambda x: x['voteCount']['simpleText'],
2000 lambda x: x['likeCount']), compat_str)) or 0
2001 author_thumbnail = try_get(comment_renderer,
2002 lambda x: x['authorThumbnail']['thumbnails'][-1]['url'], compat_str)
2003
2004 author_is_uploader = try_get(comment_renderer, lambda x: x['authorIsChannelOwner'], bool)
2005 is_favorited = 'creatorHeart' in (try_get(
2006 comment_renderer, lambda x: x['actionButtons']['commentActionButtonsRenderer'], dict) or {})
2007 return {
2008 'id': comment_id,
2009 'text': text,
2010 'timestamp': timestamp,
2011 'time_text': time_text,
2012 'like_count': votes,
2013 'is_favorited': is_favorited,
2014 'author': author,
2015 'author_id': author_id,
2016 'author_thumbnail': author_thumbnail,
2017 'author_is_uploader': author_is_uploader,
2018 'parent': parent or 'root'
2019 }
2020
    def _comment_entries(self, root_continuation_data, identity_token, account_syncid,
                         ytcfg, video_id, parent=None, comment_counts=None):
        """Generator yielding comment info dicts (and the estimated total as an int).

        Follows API continuations page by page; recurses once for reply
        threads (parent set to the parent comment id). comment_counts is a
        shared 3-element list: [comments yielded, estimated total, reply
        thread number].
        """

        def extract_header(contents):
            # Find the comments header: report the expected total count and
            # pick the continuation matching the requested sort order
            _total_comments = 0
            _continuation = None
            for content in contents:
                comments_header_renderer = try_get(content, lambda x: x['commentsHeaderRenderer'])
                expected_comment_count = try_get(comments_header_renderer,
                                                 (lambda x: x['countText']['runs'][0]['text'],
                                                  lambda x: x['commentsCount']['runs'][0]['text']),
                                                 compat_str)
                if expected_comment_count:
                    comment_counts[1] = str_to_int(expected_comment_count)
                    self.to_screen('Downloading ~%d comments' % str_to_int(expected_comment_count))
                    _total_comments = comment_counts[1]
                sort_mode_str = self._configuration_arg('comment_sort', [''])[0]
                comment_sort_index = int(sort_mode_str != 'top')  # 1 = new, 0 = top

                sort_menu_item = try_get(
                    comments_header_renderer,
                    lambda x: x['sortMenu']['sortFilterSubMenuRenderer']['subMenuItems'][comment_sort_index], dict) or {}
                sort_continuation_ep = sort_menu_item.get('serviceEndpoint') or {}

                _continuation = self._extract_continuation_ep_data(sort_continuation_ep) or self._extract_continuation(sort_menu_item)
                if not _continuation:
                    continue

                sort_text = sort_menu_item.get('title')
                if isinstance(sort_text, compat_str):
                    sort_text = sort_text.lower()
                else:
                    sort_text = 'top comments' if comment_sort_index == 0 else 'newest first'
                self.to_screen('Sorting comments by %s' % sort_text)
                break
            return _total_comments, _continuation

        def extract_thread(contents):
            # Yield each comment in the thread, then recurse into its replies
            if not parent:
                comment_counts[2] = 0
            for content in contents:
                comment_thread_renderer = try_get(content, lambda x: x['commentThreadRenderer'])
                comment_renderer = try_get(
                    comment_thread_renderer, (lambda x: x['comment']['commentRenderer'], dict)) or try_get(
                    content, (lambda x: x['commentRenderer'], dict))

                if not comment_renderer:
                    continue
                comment = self._extract_comment(comment_renderer, parent)
                if not comment:
                    continue
                comment_counts[0] += 1
                yield comment
                # Attempt to get the replies
                comment_replies_renderer = try_get(
                    comment_thread_renderer, lambda x: x['replies']['commentRepliesRenderer'], dict)

                if comment_replies_renderer:
                    comment_counts[2] += 1
                    comment_entries_iter = self._comment_entries(
                        comment_replies_renderer, identity_token, account_syncid, ytcfg,
                        video_id, parent=comment.get('id'), comment_counts=comment_counts)

                    for reply_comment in comment_entries_iter:
                        yield reply_comment

        # YouTube comments have a max depth of 2
        max_depth = int_or_none(self._configuration_arg('max_comment_depth', [''])[0]) or float('inf')
        if max_depth == 1 and parent:
            return
        if not comment_counts:
            # comment so far, est. total comments, current comment thread #
            comment_counts = [0, 0, 0]

        continuation = self._extract_continuation(root_continuation_data)
        if continuation and len(continuation['ctoken']) < 27:
            self.write_debug('Detected old API continuation token. Generating new API compatible token.')
            continuation_token = self._generate_comment_continuation(video_id)
            continuation = self._build_continuation_query(continuation_token, None)

        visitor_data = None
        is_first_continuation = parent is None

        for page_num in itertools.count(0):
            if not continuation:
                break
            headers = self._generate_api_headers(ytcfg, identity_token, account_syncid, visitor_data)
            comment_prog_str = '(%d/%d)' % (comment_counts[0], comment_counts[1])
            if page_num == 0:
                if is_first_continuation:
                    note_prefix = 'Downloading comment section API JSON'
                else:
                    note_prefix = '    Downloading comment API JSON reply thread %d %s' % (
                        comment_counts[2], comment_prog_str)
            else:
                note_prefix = '%sDownloading comment%s API JSON page %d %s' % (
                    '       ' if parent else '', ' replies' if parent else '',
                    page_num, comment_prog_str)

            response = self._extract_response(
                item_id=None, query=self._continuation_query_ajax_to_api(continuation),
                ep='next', ytcfg=ytcfg, headers=headers, note=note_prefix,
                check_get_keys=('onResponseReceivedEndpoints', 'continuationContents'))
            if not response:
                break
            # Keep the visitor cookie-equivalent stable across paged requests
            visitor_data = try_get(
                response,
                lambda x: x['responseContext']['webResponseContextExtensionData']['ytConfigData']['visitorData'],
                compat_str) or visitor_data

            continuation_contents = dict_get(response, ('onResponseReceivedEndpoints', 'continuationContents'))

            continuation = None
            if isinstance(continuation_contents, list):
                # Current response structure
                for continuation_section in continuation_contents:
                    if not isinstance(continuation_section, dict):
                        continue
                    continuation_items = try_get(
                        continuation_section,
                        (lambda x: x['reloadContinuationItemsCommand']['continuationItems'],
                         lambda x: x['appendContinuationItemsAction']['continuationItems']),
                        list) or []
                    if is_first_continuation:
                        total_comments, continuation = extract_header(continuation_items)
                        if total_comments:
                            yield total_comments
                        is_first_continuation = False
                        if continuation:
                            break
                        continue
                    count = 0
                    for count, entry in enumerate(extract_thread(continuation_items)):
                        yield entry
                    continuation = self._extract_continuation({'contents': continuation_items})
                    if continuation:
                        # Sometimes YouTube provides a continuation without any comments
                        # In most cases we end up just downloading these with very little comments to come.
                        if count == 0:
                            if not parent:
                                self.report_warning('No comments received - assuming end of comments')
                            continuation = None
                    break

            # Deprecated response structure
            elif isinstance(continuation_contents, dict):
                known_continuation_renderers = ('itemSectionContinuation', 'commentRepliesContinuation')
                for key, continuation_renderer in continuation_contents.items():
                    if key not in known_continuation_renderers:
                        continue
                    if not isinstance(continuation_renderer, dict):
                        continue
                    if is_first_continuation:
                        header_continuation_items = [continuation_renderer.get('header') or {}]
                        total_comments, continuation = extract_header(header_continuation_items)
                        if total_comments:
                            yield total_comments
                        is_first_continuation = False
                        if continuation:
                            break

                    # Sometimes YouTube provides a continuation without any comments
                    # In most cases we end up just downloading these with very little comments to come.
                    count = 0
                    for count, entry in enumerate(extract_thread(continuation_renderer.get('contents') or {})):
                        yield entry
                    continuation = self._extract_continuation(continuation_renderer)
                    if count == 0:
                        if not parent:
                            self.report_warning('No comments received - assuming end of comments')
                        continuation = None
                    break
2192
2193 @staticmethod
2194 def _generate_comment_continuation(video_id):
2195 """
2196 Generates initial comment section continuation token from given video id
2197 """
2198 b64_vid_id = base64.b64encode(bytes(video_id.encode('utf-8')))
2199 parts = ('Eg0SCw==', b64_vid_id, 'GAYyJyIRIgs=', b64_vid_id, 'MAB4AjAAQhBjb21tZW50cy1zZWN0aW9u')
2200 new_continuation_intlist = list(itertools.chain.from_iterable(
2201 [bytes_to_intlist(base64.b64decode(part)) for part in parts]))
2202 return base64.b64encode(intlist_to_bytes(new_continuation_intlist)).decode('utf-8')
2203
2204 def _extract_comments(self, ytcfg, video_id, contents, webpage):
2205 """Entry for comment extraction"""
2206 def _real_comment_extract(contents):
2207 if isinstance(contents, list):
2208 for entry in contents:
2209 for key, renderer in entry.items():
2210 if key not in known_entry_comment_renderers:
2211 continue
2212 yield from self._comment_entries(
2213 renderer, video_id=video_id, ytcfg=ytcfg,
2214 identity_token=self._extract_identity_token(webpage, item_id=video_id),
2215 account_syncid=self._extract_account_syncid(ytcfg))
2216 break
2217 comments = []
2218 known_entry_comment_renderers = ('itemSectionRenderer',)
2219 estimated_total = 0
2220 max_comments = int_or_none(self._configuration_arg('max_comments', [''])[0]) or float('inf')
2221
2222 try:
2223 for comment in _real_comment_extract(contents):
2224 if len(comments) >= max_comments:
2225 break
2226 if isinstance(comment, int):
2227 estimated_total = comment
2228 continue
2229 comments.append(comment)
2230 except KeyboardInterrupt:
2231 self.to_screen('Interrupted by user')
2232 self.to_screen('Downloaded %d/%d comments' % (len(comments), estimated_total))
2233 return {
2234 'comments': comments,
2235 'comment_count': len(comments),
2236 }
2237
2238 @staticmethod
2239 def _generate_player_context(sts=None):
2240 context = {
2241 'html5Preference': 'HTML5_PREF_WANTS',
2242 }
2243 if sts is not None:
2244 context['signatureTimestamp'] = sts
2245 return {
2246 'playbackContext': {
2247 'contentPlaybackContext': context
2248 }
2249 }
2250
2251 @staticmethod
2252 def _get_video_info_params(video_id, client='TVHTML5'):
2253 GVI_CLIENTS = {
2254 'ANDROID': {
2255 'c': 'ANDROID',
2256 'cver': '16.20',
2257 },
2258 'TVHTML5': {
2259 'c': 'TVHTML5',
2260 'cver': '6.20180913',
2261 }
2262 }
2263 query = {
2264 'video_id': video_id,
2265 'eurl': 'https://youtube.googleapis.com/v/' + video_id,
2266 'html5': '1'
2267 }
2268 query.update(GVI_CLIENTS.get(client))
2269 return query
2270
2271 def _real_extract(self, url):
2272 url, smuggled_data = unsmuggle_url(url, {})
2273 video_id = self._match_id(url)
2274
2275 is_music_url = smuggled_data.get('is_music_url') or self.is_music_url(url)
2276
2277 base_url = self.http_scheme() + '//www.youtube.com/'
2278 webpage_url = base_url + 'watch?v=' + video_id
2279 webpage = self._download_webpage(
2280 webpage_url + '&bpctr=9999999999&has_verified=1', video_id, fatal=False)
2281
2282 ytcfg = self._extract_ytcfg(video_id, webpage) or self._get_default_ytcfg()
2283 identity_token = self._extract_identity_token(webpage, video_id)
2284 syncid = self._extract_account_syncid(ytcfg)
2285 headers = self._generate_api_headers(ytcfg, identity_token, syncid)
2286
2287 player_url = self._extract_player_url(ytcfg, webpage)
2288
2289 player_client = self._configuration_arg('player_client', [''])[0]
2290 if player_client not in ('web', 'android', ''):
2291 self.report_warning(f'Invalid player_client {player_client} given. Falling back to android client.')
2292 force_mobile_client = player_client != 'web'
2293 player_skip = self._configuration_arg('player_skip')
2294
2295 def get_text(x):
2296 if not x:
2297 return
2298 text = x.get('simpleText')
2299 if text and isinstance(text, compat_str):
2300 return text
2301 runs = x.get('runs')
2302 if not isinstance(runs, list):
2303 return
2304 return ''.join([r['text'] for r in runs if isinstance(r.get('text'), compat_str)])
2305
2306 ytm_streaming_data = {}
2307 if is_music_url:
2308 ytm_webpage = None
2309 sts = self._extract_signature_timestamp(video_id, player_url, ytcfg, fatal=False)
2310 if sts and not force_mobile_client and 'configs' not in player_skip:
2311 ytm_webpage = self._download_webpage(
2312 'https://music.youtube.com',
2313 video_id, fatal=False, note='Downloading remix client config')
2314
2315 ytm_cfg = self._extract_ytcfg(video_id, ytm_webpage) or {}
2316 ytm_client = 'WEB_REMIX'
2317 if not sts or force_mobile_client:
2318 # Android client already has signature descrambled
2319 # See: https://github.com/TeamNewPipe/NewPipeExtractor/issues/562
2320 if not sts:
2321 self.report_warning('Falling back to android remix client for player API.')
2322 ytm_client = 'ANDROID_MUSIC'
2323 ytm_cfg = {}
2324
2325 ytm_headers = self._generate_api_headers(
2326 ytm_cfg, identity_token, syncid,
2327 client=ytm_client)
2328 ytm_query = {'videoId': video_id}
2329 ytm_query.update(self._generate_player_context(sts))
2330
2331 ytm_player_response = self._extract_response(
2332 item_id=video_id, ep='player', query=ytm_query,
2333 ytcfg=ytm_cfg, headers=ytm_headers, fatal=False,
2334 default_client=ytm_client,
2335 note='Downloading %sremix player API JSON' % ('android ' if force_mobile_client else ''))
2336 ytm_streaming_data = try_get(ytm_player_response, lambda x: x['streamingData'], dict) or {}
2337
2338 player_response = None
2339 if webpage:
2340 player_response = self._extract_yt_initial_variable(
2341 webpage, self._YT_INITIAL_PLAYER_RESPONSE_RE,
2342 video_id, 'initial player response')
2343
2344 if not player_response or force_mobile_client:
2345 sts = self._extract_signature_timestamp(video_id, player_url, ytcfg, fatal=False)
2346 yt_client = 'WEB'
2347 ytpcfg = ytcfg
2348 ytp_headers = headers
2349 if not sts or force_mobile_client:
2350 # Android client already has signature descrambled
2351 # See: https://github.com/TeamNewPipe/NewPipeExtractor/issues/562
2352 if not sts:
2353 self.report_warning('Falling back to android client for player API.')
2354 yt_client = 'ANDROID'
2355 ytpcfg = {}
2356 ytp_headers = self._generate_api_headers(ytpcfg, identity_token, syncid, yt_client)
2357
2358 yt_query = {'videoId': video_id}
2359 yt_query.update(self._generate_player_context(sts))
2360 player_response = self._extract_response(
2361 item_id=video_id, ep='player', query=yt_query,
2362 ytcfg=ytpcfg, headers=ytp_headers, fatal=False,
2363 default_client=yt_client,
2364 note='Downloading %splayer API JSON' % ('android ' if force_mobile_client else '')
2365 ) or player_response
2366
2367 # Age-gate workarounds
2368 playability_status = player_response.get('playabilityStatus') or {}
2369 if playability_status.get('reason') in self._AGE_GATE_REASONS:
2370 gvi_clients = ('ANDROID', 'TVHTML5') if force_mobile_client else ('TVHTML5', 'ANDROID')
2371 for gvi_client in gvi_clients:
2372 pr = self._parse_json(try_get(compat_parse_qs(
2373 self._download_webpage(
2374 base_url + 'get_video_info', video_id,
2375 'Refetching age-gated %s info webpage' % gvi_client.lower(),
2376 'unable to download video info webpage', fatal=False,
2377 query=self._get_video_info_params(video_id, client=gvi_client))),
2378 lambda x: x['player_response'][0],
2379 compat_str) or '{}', video_id)
2380 if pr:
2381 break
2382 if not pr:
2383 self.report_warning('Falling back to embedded-only age-gate workaround.')
2384 embed_webpage = None
2385 sts = self._extract_signature_timestamp(video_id, player_url, ytcfg, fatal=False)
2386 if sts and not force_mobile_client and 'configs' not in player_skip:
2387 embed_webpage = self._download_webpage(
2388 'https://www.youtube.com/embed/%s?html5=1' % video_id,
2389 video_id=video_id, note='Downloading age-gated embed config')
2390
2391 ytcfg_age = self._extract_ytcfg(video_id, embed_webpage) or {}
2392 # If we extracted the embed webpage, it'll tell us if we can view the video
2393 embedded_pr = self._parse_json(
2394 try_get(ytcfg_age, lambda x: x['PLAYER_VARS']['embedded_player_response'], str) or '{}',
2395 video_id=video_id)
2396 embedded_ps_reason = try_get(embedded_pr, lambda x: x['playabilityStatus']['reason'], str) or ''
2397 if embedded_ps_reason not in self._AGE_GATE_REASONS:
2398 yt_client = 'WEB_EMBEDDED_PLAYER'
2399 if not sts or force_mobile_client:
2400 # Android client already has signature descrambled
2401 # See: https://github.com/TeamNewPipe/NewPipeExtractor/issues/562
2402 if not sts:
2403 self.report_warning(
2404 'Falling back to android embedded client for player API (note: some formats may be missing).')
2405 yt_client = 'ANDROID_EMBEDDED_PLAYER'
2406 ytcfg_age = {}
2407
2408 ytage_headers = self._generate_api_headers(
2409 ytcfg_age, identity_token, syncid, client=yt_client)
2410 yt_age_query = {'videoId': video_id}
2411 yt_age_query.update(self._generate_player_context(sts))
2412 pr = self._extract_response(
2413 item_id=video_id, ep='player', query=yt_age_query,
2414 ytcfg=ytcfg_age, headers=ytage_headers, fatal=False,
2415 default_client=yt_client,
2416 note='Downloading %sage-gated player API JSON' % ('android ' if force_mobile_client else '')
2417 ) or {}
2418
2419 if pr:
2420 player_response = pr
2421
2422 trailer_video_id = try_get(
2423 playability_status,
2424 lambda x: x['errorScreen']['playerLegacyDesktopYpcTrailerRenderer']['trailerVideoId'],
2425 compat_str)
2426 if trailer_video_id:
2427 return self.url_result(
2428 trailer_video_id, self.ie_key(), trailer_video_id)
2429
2430 search_meta = (
2431 lambda x: self._html_search_meta(x, webpage, default=None)) \
2432 if webpage else lambda x: None
2433
2434 video_details = player_response.get('videoDetails') or {}
2435 microformat = try_get(
2436 player_response,
2437 lambda x: x['microformat']['playerMicroformatRenderer'],
2438 dict) or {}
2439 video_title = video_details.get('title') \
2440 or get_text(microformat.get('title')) \
2441 or search_meta(['og:title', 'twitter:title', 'title'])
2442 video_description = video_details.get('shortDescription')
2443
2444 if not smuggled_data.get('force_singlefeed', False):
2445 if not self.get_param('noplaylist'):
2446 multifeed_metadata_list = try_get(
2447 player_response,
2448 lambda x: x['multicamera']['playerLegacyMulticameraRenderer']['metadataList'],
2449 compat_str)
2450 if multifeed_metadata_list:
2451 entries = []
2452 feed_ids = []
2453 for feed in multifeed_metadata_list.split(','):
2454 # Unquote should take place before split on comma (,) since textual
2455 # fields may contain comma as well (see
2456 # https://github.com/ytdl-org/youtube-dl/issues/8536)
2457 feed_data = compat_parse_qs(
2458 compat_urllib_parse_unquote_plus(feed))
2459
2460 def feed_entry(name):
2461 return try_get(
2462 feed_data, lambda x: x[name][0], compat_str)
2463
2464 feed_id = feed_entry('id')
2465 if not feed_id:
2466 continue
2467 feed_title = feed_entry('title')
2468 title = video_title
2469 if feed_title:
2470 title += ' (%s)' % feed_title
2471 entries.append({
2472 '_type': 'url_transparent',
2473 'ie_key': 'Youtube',
2474 'url': smuggle_url(
2475 base_url + 'watch?v=' + feed_data['id'][0],
2476 {'force_singlefeed': True}),
2477 'title': title,
2478 })
2479 feed_ids.append(feed_id)
2480 self.to_screen(
2481 'Downloading multifeed video (%s) - add --no-playlist to just download video %s'
2482 % (', '.join(feed_ids), video_id))
2483 return self.playlist_result(
2484 entries, video_id, video_title, video_description)
2485 else:
2486 self.to_screen('Downloading just video %s because of --no-playlist' % video_id)
2487
2488 formats, itags, stream_ids = [], [], []
2489 itag_qualities = {}
2490 q = qualities([
2491 # "tiny" is the smallest video-only format. But some audio-only formats
2492 # was also labeled "tiny". It is not clear if such formats still exist
2493 'tiny', 'audio_quality_low', 'audio_quality_medium', 'audio_quality_high', # Audio only formats
2494 'small', 'medium', 'large', 'hd720', 'hd1080', 'hd1440', 'hd2160', 'hd2880', 'highres'
2495 ])
2496
2497 streaming_data = player_response.get('streamingData') or {}
2498 streaming_formats = streaming_data.get('formats') or []
2499 streaming_formats.extend(streaming_data.get('adaptiveFormats') or [])
2500 streaming_formats.extend(ytm_streaming_data.get('formats') or [])
2501 streaming_formats.extend(ytm_streaming_data.get('adaptiveFormats') or [])
2502
2503 for fmt in streaming_formats:
2504 if fmt.get('targetDurationSec') or fmt.get('drmFamilies'):
2505 continue
2506
2507 itag = str_or_none(fmt.get('itag'))
2508 audio_track = fmt.get('audioTrack') or {}
2509 stream_id = '%s.%s' % (itag or '', audio_track.get('id', ''))
2510 if stream_id in stream_ids:
2511 continue
2512
2513 quality = fmt.get('quality')
2514 if quality == 'tiny' or not quality:
2515 quality = fmt.get('audioQuality', '').lower() or quality
2516 if itag and quality:
2517 itag_qualities[itag] = quality
2518 # FORMAT_STREAM_TYPE_OTF(otf=1) requires downloading the init fragment
2519 # (adding `&sq=0` to the URL) and parsing emsg box to determine the
2520 # number of fragment that would subsequently requested with (`&sq=N`)
2521 if fmt.get('type') == 'FORMAT_STREAM_TYPE_OTF':
2522 continue
2523
2524 fmt_url = fmt.get('url')
2525 if not fmt_url:
2526 sc = compat_parse_qs(fmt.get('signatureCipher'))
2527 fmt_url = url_or_none(try_get(sc, lambda x: x['url'][0]))
2528 encrypted_sig = try_get(sc, lambda x: x['s'][0])
2529 if not (sc and fmt_url and encrypted_sig):
2530 continue
2531 if not player_url:
2532 continue
2533 signature = self._decrypt_signature(sc['s'][0], video_id, player_url)
2534 sp = try_get(sc, lambda x: x['sp'][0]) or 'signature'
2535 fmt_url += '&' + sp + '=' + signature
2536
2537 if itag:
2538 itags.append(itag)
2539 stream_ids.append(stream_id)
2540
2541 tbr = float_or_none(
2542 fmt.get('averageBitrate') or fmt.get('bitrate'), 1000)
2543 dct = {
2544 'asr': int_or_none(fmt.get('audioSampleRate')),
2545 'filesize': int_or_none(fmt.get('contentLength')),
2546 'format_id': itag,
2547 'format_note': audio_track.get('displayName') or fmt.get('qualityLabel') or quality,
2548 'fps': int_or_none(fmt.get('fps')),
2549 'height': int_or_none(fmt.get('height')),
2550 'quality': q(quality),
2551 'tbr': tbr,
2552 'url': fmt_url,
2553 'width': fmt.get('width'),
2554 'language': audio_track.get('id', '').split('.')[0],
2555 }
2556 mime_mobj = re.match(
2557 r'((?:[^/]+)/(?:[^;]+))(?:;\s*codecs="([^"]+)")?', fmt.get('mimeType') or '')
2558 if mime_mobj:
2559 dct['ext'] = mimetype2ext(mime_mobj.group(1))
2560 dct.update(parse_codecs(mime_mobj.group(2)))
2561 # The 3gp format in android client has a quality of "small",
2562 # but is actually worse than all other formats
2563 if dct['ext'] == '3gp':
2564 dct['quality'] = q('tiny')
2565 no_audio = dct.get('acodec') == 'none'
2566 no_video = dct.get('vcodec') == 'none'
2567 if no_audio:
2568 dct['vbr'] = tbr
2569 if no_video:
2570 dct['abr'] = tbr
2571 if no_audio or no_video:
2572 dct['downloader_options'] = {
2573 # Youtube throttles chunks >~10M
2574 'http_chunk_size': 10485760,
2575 }
2576 if dct.get('ext'):
2577 dct['container'] = dct['ext'] + '_dash'
2578 formats.append(dct)
2579
2580 skip_manifests = self._configuration_arg('skip')
2581 get_dash = 'dash' not in skip_manifests and self.get_param('youtube_include_dash_manifest', True)
2582 get_hls = 'hls' not in skip_manifests and self.get_param('youtube_include_hls_manifest', True)
2583
2584 for sd in (streaming_data, ytm_streaming_data):
2585 hls_manifest_url = get_hls and sd.get('hlsManifestUrl')
2586 if hls_manifest_url:
2587 for f in self._extract_m3u8_formats(
2588 hls_manifest_url, video_id, 'mp4', fatal=False):
2589 itag = self._search_regex(
2590 r'/itag/(\d+)', f['url'], 'itag', default=None)
2591 if itag:
2592 f['format_id'] = itag
2593 formats.append(f)
2594
2595 dash_manifest_url = get_dash and sd.get('dashManifestUrl')
2596 if dash_manifest_url:
2597 for f in self._extract_mpd_formats(
2598 dash_manifest_url, video_id, fatal=False):
2599 itag = f['format_id']
2600 if itag in itags:
2601 continue
2602 if itag in itag_qualities:
2603 f['quality'] = q(itag_qualities[itag])
2604 filesize = int_or_none(self._search_regex(
2605 r'/clen/(\d+)', f.get('fragment_base_url')
2606 or f['url'], 'file size', default=None))
2607 if filesize:
2608 f['filesize'] = filesize
2609 formats.append(f)
2610
2611 if not formats:
2612 if not self.get_param('allow_unplayable_formats') and streaming_data.get('licenseInfos'):
2613 self.raise_no_formats(
2614 'This video is DRM protected.', expected=True)
2615 pemr = try_get(
2616 playability_status,
2617 lambda x: x['errorScreen']['playerErrorMessageRenderer'],
2618 dict) or {}
2619 reason = get_text(pemr.get('reason')) or playability_status.get('reason')
2620 subreason = pemr.get('subreason')
2621 if subreason:
2622 subreason = clean_html(get_text(subreason))
2623 if subreason == 'The uploader has not made this video available in your country.':
2624 countries = microformat.get('availableCountries')
2625 if not countries:
2626 regions_allowed = search_meta('regionsAllowed')
2627 countries = regions_allowed.split(',') if regions_allowed else None
2628 self.raise_geo_restricted(subreason, countries, metadata_available=True)
2629 reason += '\n' + subreason
2630 if reason:
2631 self.raise_no_formats(reason, expected=True)
2632
2633 self._sort_formats(formats)
2634
2635 keywords = video_details.get('keywords') or []
2636 if not keywords and webpage:
2637 keywords = [
2638 unescapeHTML(m.group('content'))
2639 for m in re.finditer(self._meta_regex('og:video:tag'), webpage)]
2640 for keyword in keywords:
2641 if keyword.startswith('yt:stretch='):
2642 mobj = re.search(r'(\d+)\s*:\s*(\d+)', keyword)
2643 if mobj:
2644 # NB: float is intentional for forcing float division
2645 w, h = (float(v) for v in mobj.groups())
2646 if w > 0 and h > 0:
2647 ratio = w / h
2648 for f in formats:
2649 if f.get('vcodec') != 'none':
2650 f['stretched_ratio'] = ratio
2651 break
2652
2653 thumbnails = []
2654 for container in (video_details, microformat):
2655 for thumbnail in (try_get(
2656 container,
2657 lambda x: x['thumbnail']['thumbnails'], list) or []):
2658 thumbnail_url = thumbnail.get('url')
2659 if not thumbnail_url:
2660 continue
2661 # Sometimes youtube gives a wrong thumbnail URL. See:
2662 # https://github.com/yt-dlp/yt-dlp/issues/233
2663 # https://github.com/ytdl-org/youtube-dl/issues/28023
2664 if 'maxresdefault' in thumbnail_url:
2665 thumbnail_url = thumbnail_url.split('?')[0]
2666 thumbnails.append({
2667 'url': thumbnail_url,
2668 'height': int_or_none(thumbnail.get('height')),
2669 'width': int_or_none(thumbnail.get('width')),
2670 'preference': 1 if 'maxresdefault' in thumbnail_url else -1
2671 })
2672 thumbnail_url = search_meta(['og:image', 'twitter:image'])
2673 if thumbnail_url:
2674 thumbnails.append({
2675 'url': thumbnail_url,
2676 'preference': 1 if 'maxresdefault' in thumbnail_url else -1
2677 })
2678 # All videos have a maxresdefault thumbnail, but sometimes it does not appear in the webpage
2679 # See: https://github.com/ytdl-org/youtube-dl/issues/29049
2680 thumbnails.append({
2681 'url': 'https://i.ytimg.com/vi/%s/maxresdefault.jpg' % video_id,
2682 'preference': 1,
2683 })
2684 self._remove_duplicate_formats(thumbnails)
2685
2686 category = microformat.get('category') or search_meta('genre')
2687 channel_id = video_details.get('channelId') \
2688 or microformat.get('externalChannelId') \
2689 or search_meta('channelId')
2690 duration = int_or_none(
2691 video_details.get('lengthSeconds')
2692 or microformat.get('lengthSeconds')) \
2693 or parse_duration(search_meta('duration'))
2694 is_live = video_details.get('isLive')
2695 is_upcoming = video_details.get('isUpcoming')
2696 owner_profile_url = microformat.get('ownerProfileUrl')
2697
2698 info = {
2699 'id': video_id,
2700 'title': self._live_title(video_title) if is_live else video_title,
2701 'formats': formats,
2702 'thumbnails': thumbnails,
2703 'description': video_description,
2704 'upload_date': unified_strdate(
2705 microformat.get('uploadDate')
2706 or search_meta('uploadDate')),
2707 'uploader': video_details['author'],
2708 'uploader_id': self._search_regex(r'/(?:channel|user)/([^/?&#]+)', owner_profile_url, 'uploader id') if owner_profile_url else None,
2709 'uploader_url': owner_profile_url,
2710 'channel_id': channel_id,
2711 'channel_url': 'https://www.youtube.com/channel/' + channel_id if channel_id else None,
2712 'duration': duration,
2713 'view_count': int_or_none(
2714 video_details.get('viewCount')
2715 or microformat.get('viewCount')
2716 or search_meta('interactionCount')),
2717 'average_rating': float_or_none(video_details.get('averageRating')),
2718 'age_limit': 18 if (
2719 microformat.get('isFamilySafe') is False
2720 or search_meta('isFamilyFriendly') == 'false'
2721 or search_meta('og:restrictions:age') == '18+') else 0,
2722 'webpage_url': webpage_url,
2723 'categories': [category] if category else None,
2724 'tags': keywords,
2725 'is_live': is_live,
2726 'playable_in_embed': playability_status.get('playableInEmbed'),
2727 'was_live': video_details.get('isLiveContent'),
2728 }
2729
2730 pctr = try_get(
2731 player_response,
2732 lambda x: x['captions']['playerCaptionsTracklistRenderer'], dict)
2733 subtitles = {}
2734 if pctr:
2735 def process_language(container, base_url, lang_code, sub_name, query):
2736 lang_subs = container.setdefault(lang_code, [])
2737 for fmt in self._SUBTITLE_FORMATS:
2738 query.update({
2739 'fmt': fmt,
2740 })
2741 lang_subs.append({
2742 'ext': fmt,
2743 'url': update_url_query(base_url, query),
2744 'name': sub_name,
2745 })
2746
2747 for caption_track in (pctr.get('captionTracks') or []):
2748 base_url = caption_track.get('baseUrl')
2749 if not base_url:
2750 continue
2751 if caption_track.get('kind') != 'asr':
2752 lang_code = (
2753 remove_start(caption_track.get('vssId') or '', '.').replace('.', '-')
2754 or caption_track.get('languageCode'))
2755 if not lang_code:
2756 continue
2757 process_language(
2758 subtitles, base_url, lang_code,
2759 try_get(caption_track, lambda x: x['name']['simpleText']),
2760 {})
2761 continue
2762 automatic_captions = {}
2763 for translation_language in (pctr.get('translationLanguages') or []):
2764 translation_language_code = translation_language.get('languageCode')
2765 if not translation_language_code:
2766 continue
2767 process_language(
2768 automatic_captions, base_url, translation_language_code,
2769 try_get(translation_language, (
2770 lambda x: x['languageName']['simpleText'],
2771 lambda x: x['languageName']['runs'][0]['text'])),
2772 {'tlang': translation_language_code})
2773 info['automatic_captions'] = automatic_captions
2774 info['subtitles'] = subtitles
2775
2776 parsed_url = compat_urllib_parse_urlparse(url)
2777 for component in [parsed_url.fragment, parsed_url.query]:
2778 query = compat_parse_qs(component)
2779 for k, v in query.items():
2780 for d_k, s_ks in [('start', ('start', 't')), ('end', ('end',))]:
2781 d_k += '_time'
2782 if d_k not in info and k in s_ks:
2783 info[d_k] = parse_duration(query[k][0])
2784
2785 # Youtube Music Auto-generated description
2786 if video_description:
2787 mobj = re.search(r'(?s)(?P<track>[^·\n]+)·(?P<artist>[^\n]+)\n+(?P<album>[^\n]+)(?:.+?℗\s*(?P<release_year>\d{4})(?!\d))?(?:.+?Released on\s*:\s*(?P<release_date>\d{4}-\d{2}-\d{2}))?(.+?\nArtist\s*:\s*(?P<clean_artist>[^\n]+))?.+\nAuto-generated by YouTube\.\s*$', video_description)
2788 if mobj:
2789 release_year = mobj.group('release_year')
2790 release_date = mobj.group('release_date')
2791 if release_date:
2792 release_date = release_date.replace('-', '')
2793 if not release_year:
2794 release_year = release_date[:4]
2795 info.update({
2796 'album': mobj.group('album'.strip()),
2797 'artist': mobj.group('clean_artist') or ', '.join(a.strip() for a in mobj.group('artist').split('·')),
2798 'track': mobj.group('track').strip(),
2799 'release_date': release_date,
2800 'release_year': int_or_none(release_year),
2801 })
2802
2803 initial_data = None
2804 if webpage:
2805 initial_data = self._extract_yt_initial_variable(
2806 webpage, self._YT_INITIAL_DATA_RE, video_id,
2807 'yt initial data')
2808 if not initial_data:
2809 initial_data = self._extract_response(
2810 item_id=video_id, ep='next', fatal=False,
2811 ytcfg=ytcfg, headers=headers, query={'videoId': video_id},
2812 note='Downloading initial data API JSON')
2813
2814 try:
2815 # This will error if there is no livechat
2816 initial_data['contents']['twoColumnWatchNextResults']['conversationBar']['liveChatRenderer']['continuations'][0]['reloadContinuationData']['continuation']
2817 info['subtitles']['live_chat'] = [{
2818 'url': 'https://www.youtube.com/watch?v=%s' % video_id, # url is needed to set cookies
2819 'video_id': video_id,
2820 'ext': 'json',
2821 'protocol': 'youtube_live_chat' if is_live or is_upcoming else 'youtube_live_chat_replay',
2822 }]
2823 except (KeyError, IndexError, TypeError):
2824 pass
2825
2826 if initial_data:
2827 chapters = self._extract_chapters_from_json(
2828 initial_data, video_id, duration)
2829 if not chapters:
2830 for engagment_pannel in (initial_data.get('engagementPanels') or []):
2831 contents = try_get(
2832 engagment_pannel, lambda x: x['engagementPanelSectionListRenderer']['content']['macroMarkersListRenderer']['contents'],
2833 list)
2834 if not contents:
2835 continue
2836
2837 def chapter_time(mmlir):
2838 return parse_duration(
2839 get_text(mmlir.get('timeDescription')))
2840
2841 chapters = []
2842 for next_num, content in enumerate(contents, start=1):
2843 mmlir = content.get('macroMarkersListItemRenderer') or {}
2844 start_time = chapter_time(mmlir)
2845 end_time = chapter_time(try_get(
2846 contents, lambda x: x[next_num]['macroMarkersListItemRenderer'])) \
2847 if next_num < len(contents) else duration
2848 if start_time is None or end_time is None:
2849 continue
2850 chapters.append({
2851 'start_time': start_time,
2852 'end_time': end_time,
2853 'title': get_text(mmlir.get('title')),
2854 })
2855 if chapters:
2856 break
2857 if chapters:
2858 info['chapters'] = chapters
2859
2860 contents = try_get(
2861 initial_data,
2862 lambda x: x['contents']['twoColumnWatchNextResults']['results']['results']['contents'],
2863 list) or []
2864 for content in contents:
2865 vpir = content.get('videoPrimaryInfoRenderer')
2866 if vpir:
2867 stl = vpir.get('superTitleLink')
2868 if stl:
2869 stl = get_text(stl)
2870 if try_get(
2871 vpir,
2872 lambda x: x['superTitleIcon']['iconType']) == 'LOCATION_PIN':
2873 info['location'] = stl
2874 else:
2875 mobj = re.search(r'(.+?)\s*S(\d+)\s*•\s*E(\d+)', stl)
2876 if mobj:
2877 info.update({
2878 'series': mobj.group(1),
2879 'season_number': int(mobj.group(2)),
2880 'episode_number': int(mobj.group(3)),
2881 })
2882 for tlb in (try_get(
2883 vpir,
2884 lambda x: x['videoActions']['menuRenderer']['topLevelButtons'],
2885 list) or []):
2886 tbr = tlb.get('toggleButtonRenderer') or {}
2887 for getter, regex in [(
2888 lambda x: x['defaultText']['accessibility']['accessibilityData'],
2889 r'(?P<count>[\d,]+)\s*(?P<type>(?:dis)?like)'), ([
2890 lambda x: x['accessibility'],
2891 lambda x: x['accessibilityData']['accessibilityData'],
2892 ], r'(?P<type>(?:dis)?like) this video along with (?P<count>[\d,]+) other people')]:
2893 label = (try_get(tbr, getter, dict) or {}).get('label')
2894 if label:
2895 mobj = re.match(regex, label)
2896 if mobj:
2897 info[mobj.group('type') + '_count'] = str_to_int(mobj.group('count'))
2898 break
2899 sbr_tooltip = try_get(
2900 vpir, lambda x: x['sentimentBar']['sentimentBarRenderer']['tooltip'])
2901 if sbr_tooltip:
2902 like_count, dislike_count = sbr_tooltip.split(' / ')
2903 info.update({
2904 'like_count': str_to_int(like_count),
2905 'dislike_count': str_to_int(dislike_count),
2906 })
2907 vsir = content.get('videoSecondaryInfoRenderer')
2908 if vsir:
2909 info['channel'] = get_text(try_get(
2910 vsir,
2911 lambda x: x['owner']['videoOwnerRenderer']['title'],
2912 dict))
2913 rows = try_get(
2914 vsir,
2915 lambda x: x['metadataRowContainer']['metadataRowContainerRenderer']['rows'],
2916 list) or []
2917 multiple_songs = False
2918 for row in rows:
2919 if try_get(row, lambda x: x['metadataRowRenderer']['hasDividerLine']) is True:
2920 multiple_songs = True
2921 break
2922 for row in rows:
2923 mrr = row.get('metadataRowRenderer') or {}
2924 mrr_title = mrr.get('title')
2925 if not mrr_title:
2926 continue
2927 mrr_title = get_text(mrr['title'])
2928 mrr_contents_text = get_text(mrr['contents'][0])
2929 if mrr_title == 'License':
2930 info['license'] = mrr_contents_text
2931 elif not multiple_songs:
2932 if mrr_title == 'Album':
2933 info['album'] = mrr_contents_text
2934 elif mrr_title == 'Artist':
2935 info['artist'] = mrr_contents_text
2936 elif mrr_title == 'Song':
2937 info['track'] = mrr_contents_text
2938
2939 fallbacks = {
2940 'channel': 'uploader',
2941 'channel_id': 'uploader_id',
2942 'channel_url': 'uploader_url',
2943 }
2944 for to, frm in fallbacks.items():
2945 if not info.get(to):
2946 info[to] = info.get(frm)
2947
2948 for s_k, d_k in [('artist', 'creator'), ('track', 'alt_title')]:
2949 v = info.get(s_k)
2950 if v:
2951 info[d_k] = v
2952
2953 is_private = bool_or_none(video_details.get('isPrivate'))
2954 is_unlisted = bool_or_none(microformat.get('isUnlisted'))
2955 is_membersonly = None
2956 is_premium = None
2957 if initial_data and is_private is not None:
2958 is_membersonly = False
2959 is_premium = False
2960 contents = try_get(initial_data, lambda x: x['contents']['twoColumnWatchNextResults']['results']['results']['contents'], list)
2961 for content in contents or []:
2962 badges = try_get(content, lambda x: x['videoPrimaryInfoRenderer']['badges'], list)
2963 for badge in badges or []:
2964 label = try_get(badge, lambda x: x['metadataBadgeRenderer']['label']) or ''
2965 if label.lower() == 'members only':
2966 is_membersonly = True
2967 break
2968 elif label.lower() == 'premium':
2969 is_premium = True
2970 break
2971 if is_membersonly or is_premium:
2972 break
2973
2974 # TODO: Add this for playlists
2975 info['availability'] = self._availability(
2976 is_private=is_private,
2977 needs_premium=is_premium,
2978 needs_subscription=is_membersonly,
2979 needs_auth=info['age_limit'] >= 18,
2980 is_unlisted=None if is_private is None else is_unlisted)
2981
2982 # get xsrf for annotations or comments
2983 get_annotations = self.get_param('writeannotations', False)
2984 get_comments = self.get_param('getcomments', False)
2985 if get_annotations or get_comments:
2986 xsrf_token = None
2987 ytcfg = self._extract_ytcfg(video_id, webpage)
2988 if ytcfg:
2989 xsrf_token = try_get(ytcfg, lambda x: x['XSRF_TOKEN'], compat_str)
2990 if not xsrf_token:
2991 xsrf_token = self._search_regex(
2992 r'([\'"])XSRF_TOKEN\1\s*:\s*([\'"])(?P<xsrf_token>(?:(?!\2).)+)\2',
2993 webpage, 'xsrf token', group='xsrf_token', fatal=False)
2994
2995 # annotations
2996 if get_annotations:
2997 invideo_url = try_get(
2998 player_response, lambda x: x['annotations'][0]['playerAnnotationsUrlsRenderer']['invideoUrl'], compat_str)
2999 if xsrf_token and invideo_url:
3000 xsrf_field_name = None
3001 if ytcfg:
3002 xsrf_field_name = try_get(ytcfg, lambda x: x['XSRF_FIELD_NAME'], compat_str)
3003 if not xsrf_field_name:
3004 xsrf_field_name = self._search_regex(
3005 r'([\'"])XSRF_FIELD_NAME\1\s*:\s*([\'"])(?P<xsrf_field_name>\w+)\2',
3006 webpage, 'xsrf field name',
3007 group='xsrf_field_name', default='session_token')
3008 info['annotations'] = self._download_webpage(
3009 self._proto_relative_url(invideo_url),
3010 video_id, note='Downloading annotations',
3011 errnote='Unable to download video annotations', fatal=False,
3012 data=urlencode_postdata({xsrf_field_name: xsrf_token}))
3013
3014 if get_comments:
3015 info['__post_extractor'] = lambda: self._extract_comments(ytcfg, video_id, contents, webpage)
3016
3017 self.mark_watched(video_id, player_response)
3018
3019 return info
3020
3021
3022 class YoutubeTabIE(YoutubeBaseInfoExtractor):
3023 IE_DESC = 'YouTube.com tab'
3024 _VALID_URL = r'''(?x)
3025 https?://
3026 (?:\w+\.)?
3027 (?:
3028 youtube(?:kids)?\.com|
3029 invidio\.us
3030 )/
3031 (?:
3032 (?P<channel_type>channel|c|user|browse)/|
3033 (?P<not_channel>
3034 feed/|hashtag/|
3035 (?:playlist|watch)\?.*?\blist=
3036 )|
3037 (?!(?:%s)\b) # Direct URLs
3038 )
3039 (?P<id>[^/?\#&]+)
3040 ''' % YoutubeBaseInfoExtractor._RESERVED_NAMES
3041 IE_NAME = 'youtube:tab'
3042
3043 _TESTS = [{
3044 'note': 'playlists, multipage',
3045 'url': 'https://www.youtube.com/c/ИгорьКлейнер/playlists?view=1&flow=grid',
3046 'playlist_mincount': 94,
3047 'info_dict': {
3048 'id': 'UCqj7Cz7revf5maW9g5pgNcg',
3049 'title': 'Игорь Клейнер - Playlists',
3050 'description': 'md5:be97ee0f14ee314f1f002cf187166ee2',
3051 'uploader': 'Игорь Клейнер',
3052 'uploader_id': 'UCqj7Cz7revf5maW9g5pgNcg',
3053 },
3054 }, {
3055 'note': 'playlists, multipage, different order',
3056 'url': 'https://www.youtube.com/user/igorkle1/playlists?view=1&sort=dd',
3057 'playlist_mincount': 94,
3058 'info_dict': {
3059 'id': 'UCqj7Cz7revf5maW9g5pgNcg',
3060 'title': 'Игорь Клейнер - Playlists',
3061 'description': 'md5:be97ee0f14ee314f1f002cf187166ee2',
3062 'uploader_id': 'UCqj7Cz7revf5maW9g5pgNcg',
3063 'uploader': 'Игорь Клейнер',
3064 },
3065 }, {
3066 'note': 'playlists, series',
3067 'url': 'https://www.youtube.com/c/3blue1brown/playlists?view=50&sort=dd&shelf_id=3',
3068 'playlist_mincount': 5,
3069 'info_dict': {
3070 'id': 'UCYO_jab_esuFRV4b17AJtAw',
3071 'title': '3Blue1Brown - Playlists',
3072 'description': 'md5:e1384e8a133307dd10edee76e875d62f',
3073 'uploader_id': 'UCYO_jab_esuFRV4b17AJtAw',
3074 'uploader': '3Blue1Brown',
3075 },
3076 }, {
3077 'note': 'playlists, singlepage',
3078 'url': 'https://www.youtube.com/user/ThirstForScience/playlists',
3079 'playlist_mincount': 4,
3080 'info_dict': {
3081 'id': 'UCAEtajcuhQ6an9WEzY9LEMQ',
3082 'title': 'ThirstForScience - Playlists',
3083 'description': 'md5:609399d937ea957b0f53cbffb747a14c',
3084 'uploader': 'ThirstForScience',
3085 'uploader_id': 'UCAEtajcuhQ6an9WEzY9LEMQ',
3086 }
3087 }, {
3088 'url': 'https://www.youtube.com/c/ChristophLaimer/playlists',
3089 'only_matching': True,
3090 }, {
3091 'note': 'basic, single video playlist',
3092 'url': 'https://www.youtube.com/playlist?list=PL4lCao7KL_QFVb7Iudeipvc2BCavECqzc',
3093 'info_dict': {
3094 'uploader_id': 'UCmlqkdCBesrv2Lak1mF_MxA',
3095 'uploader': 'Sergey M.',
3096 'id': 'PL4lCao7KL_QFVb7Iudeipvc2BCavECqzc',
3097 'title': 'youtube-dl public playlist',
3098 },
3099 'playlist_count': 1,
3100 }, {
3101 'note': 'empty playlist',
3102 'url': 'https://www.youtube.com/playlist?list=PL4lCao7KL_QFodcLWhDpGCYnngnHtQ-Xf',
3103 'info_dict': {
3104 'uploader_id': 'UCmlqkdCBesrv2Lak1mF_MxA',
3105 'uploader': 'Sergey M.',
3106 'id': 'PL4lCao7KL_QFodcLWhDpGCYnngnHtQ-Xf',
3107 'title': 'youtube-dl empty playlist',
3108 },
3109 'playlist_count': 0,
3110 }, {
3111 'note': 'Home tab',
3112 'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w/featured',
3113 'info_dict': {
3114 'id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
3115 'title': 'lex will - Home',
3116 'description': 'md5:2163c5d0ff54ed5f598d6a7e6211e488',
3117 'uploader': 'lex will',
3118 'uploader_id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
3119 },
3120 'playlist_mincount': 2,
3121 }, {
3122 'note': 'Videos tab',
3123 'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w/videos',
3124 'info_dict': {
3125 'id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
3126 'title': 'lex will - Videos',
3127 'description': 'md5:2163c5d0ff54ed5f598d6a7e6211e488',
3128 'uploader': 'lex will',
3129 'uploader_id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
3130 },
3131 'playlist_mincount': 975,
3132 }, {
3133 'note': 'Videos tab, sorted by popular',
3134 'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w/videos?view=0&sort=p&flow=grid',
3135 'info_dict': {
3136 'id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
3137 'title': 'lex will - Videos',
3138 'description': 'md5:2163c5d0ff54ed5f598d6a7e6211e488',
3139 'uploader': 'lex will',
3140 'uploader_id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
3141 },
3142 'playlist_mincount': 199,
3143 }, {
3144 'note': 'Playlists tab',
3145 'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w/playlists',
3146 'info_dict': {
3147 'id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
3148 'title': 'lex will - Playlists',
3149 'description': 'md5:2163c5d0ff54ed5f598d6a7e6211e488',
3150 'uploader': 'lex will',
3151 'uploader_id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
3152 },
3153 'playlist_mincount': 17,
3154 }, {
3155 'note': 'Community tab',
3156 'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w/community',
3157 'info_dict': {
3158 'id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
3159 'title': 'lex will - Community',
3160 'description': 'md5:2163c5d0ff54ed5f598d6a7e6211e488',
3161 'uploader': 'lex will',
3162 'uploader_id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
3163 },
3164 'playlist_mincount': 18,
3165 }, {
3166 'note': 'Channels tab',
3167 'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w/channels',
3168 'info_dict': {
3169 'id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
3170 'title': 'lex will - Channels',
3171 'description': 'md5:2163c5d0ff54ed5f598d6a7e6211e488',
3172 'uploader': 'lex will',
3173 'uploader_id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
3174 },
3175 'playlist_mincount': 12,
3176 }, {
3177 'note': 'Search tab',
3178 'url': 'https://www.youtube.com/c/3blue1brown/search?query=linear%20algebra',
3179 'playlist_mincount': 40,
3180 'info_dict': {
3181 'id': 'UCYO_jab_esuFRV4b17AJtAw',
3182 'title': '3Blue1Brown - Search - linear algebra',
3183 'description': 'md5:e1384e8a133307dd10edee76e875d62f',
3184 'uploader': '3Blue1Brown',
3185 'uploader_id': 'UCYO_jab_esuFRV4b17AJtAw',
3186 },
3187 }, {
3188 'url': 'https://invidio.us/channel/UCmlqkdCBesrv2Lak1mF_MxA',
3189 'only_matching': True,
3190 }, {
3191 'url': 'https://www.youtubekids.com/channel/UCmlqkdCBesrv2Lak1mF_MxA',
3192 'only_matching': True,
3193 }, {
3194 'url': 'https://music.youtube.com/channel/UCmlqkdCBesrv2Lak1mF_MxA',
3195 'only_matching': True,
3196 }, {
3197 'note': 'Playlist with deleted videos (#651). As a bonus, the video #51 is also twice in this list.',
3198 'url': 'https://www.youtube.com/playlist?list=PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC',
3199 'info_dict': {
3200 'title': '29C3: Not my department',
3201 'id': 'PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC',
3202 'uploader': 'Christiaan008',
3203 'uploader_id': 'UCEPzS1rYsrkqzSLNp76nrcg',
3204 'description': 'md5:a14dc1a8ef8307a9807fe136a0660268',
3205 },
3206 'playlist_count': 96,
3207 }, {
3208 'note': 'Large playlist',
3209 'url': 'https://www.youtube.com/playlist?list=UUBABnxM4Ar9ten8Mdjj1j0Q',
3210 'info_dict': {
3211 'title': 'Uploads from Cauchemar',
3212 'id': 'UUBABnxM4Ar9ten8Mdjj1j0Q',
3213 'uploader': 'Cauchemar',
3214 'uploader_id': 'UCBABnxM4Ar9ten8Mdjj1j0Q',
3215 },
3216 'playlist_mincount': 1123,
3217 }, {
3218 'note': 'even larger playlist, 8832 videos',
3219 'url': 'http://www.youtube.com/user/NASAgovVideo/videos',
3220 'only_matching': True,
3221 }, {
3222 'note': 'Buggy playlist: the webpage has a "Load more" button but it doesn\'t have more videos',
3223 'url': 'https://www.youtube.com/playlist?list=UUXw-G3eDE9trcvY2sBMM_aA',
3224 'info_dict': {
3225 'title': 'Uploads from Interstellar Movie',
3226 'id': 'UUXw-G3eDE9trcvY2sBMM_aA',
3227 'uploader': 'Interstellar Movie',
3228 'uploader_id': 'UCXw-G3eDE9trcvY2sBMM_aA',
3229 },
3230 'playlist_mincount': 21,
3231 }, {
3232 'note': 'Playlist with "show unavailable videos" button',
3233 'url': 'https://www.youtube.com/playlist?list=UUTYLiWFZy8xtPwxFwX9rV7Q',
3234 'info_dict': {
3235 'title': 'Uploads from Phim Siêu Nhân Nhật Bản',
3236 'id': 'UUTYLiWFZy8xtPwxFwX9rV7Q',
3237 'uploader': 'Phim Siêu Nhân Nhật Bản',
3238 'uploader_id': 'UCTYLiWFZy8xtPwxFwX9rV7Q',
3239 },
3240 'playlist_mincount': 200,
3241 }, {
3242 'note': 'Playlist with unavailable videos in page 7',
3243 'url': 'https://www.youtube.com/playlist?list=UU8l9frL61Yl5KFOl87nIm2w',
3244 'info_dict': {
3245 'title': 'Uploads from BlankTV',
3246 'id': 'UU8l9frL61Yl5KFOl87nIm2w',
3247 'uploader': 'BlankTV',
3248 'uploader_id': 'UC8l9frL61Yl5KFOl87nIm2w',
3249 },
3250 'playlist_mincount': 1000,
3251 }, {
3252 'note': 'https://github.com/ytdl-org/youtube-dl/issues/21844',
3253 'url': 'https://www.youtube.com/playlist?list=PLzH6n4zXuckpfMu_4Ff8E7Z1behQks5ba',
3254 'info_dict': {
3255 'title': 'Data Analysis with Dr Mike Pound',
3256 'id': 'PLzH6n4zXuckpfMu_4Ff8E7Z1behQks5ba',
3257 'uploader_id': 'UC9-y-6csu5WGm29I7JiwpnA',
3258 'uploader': 'Computerphile',
3259 'description': 'md5:7f567c574d13d3f8c0954d9ffee4e487',
3260 },
3261 'playlist_mincount': 11,
3262 }, {
3263 'url': 'https://invidio.us/playlist?list=PL4lCao7KL_QFVb7Iudeipvc2BCavECqzc',
3264 'only_matching': True,
3265 }, {
3266 'note': 'Playlist URL that does not actually serve a playlist',
3267 'url': 'https://www.youtube.com/watch?v=FqZTN594JQw&list=PLMYEtVRpaqY00V9W81Cwmzp6N6vZqfUKD4',
3268 'info_dict': {
3269 'id': 'FqZTN594JQw',
3270 'ext': 'webm',
3271 'title': "Smiley's People 01 detective, Adventure Series, Action",
3272 'uploader': 'STREEM',
3273 'uploader_id': 'UCyPhqAZgwYWZfxElWVbVJng',
3274 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UCyPhqAZgwYWZfxElWVbVJng',
3275 'upload_date': '20150526',
3276 'license': 'Standard YouTube License',
3277 'description': 'md5:507cdcb5a49ac0da37a920ece610be80',
3278 'categories': ['People & Blogs'],
3279 'tags': list,
3280 'view_count': int,
3281 'like_count': int,
3282 'dislike_count': int,
3283 },
3284 'params': {
3285 'skip_download': True,
3286 },
3287 'skip': 'This video is not available.',
3288 'add_ie': [YoutubeIE.ie_key()],
3289 }, {
3290 'url': 'https://www.youtubekids.com/watch?v=Agk7R8I8o5U&list=PUZ6jURNr1WQZCNHF0ao-c0g',
3291 'only_matching': True,
3292 }, {
3293 'url': 'https://www.youtube.com/watch?v=MuAGGZNfUkU&list=RDMM',
3294 'only_matching': True,
3295 }, {
3296 'url': 'https://www.youtube.com/channel/UCoMdktPbSTixAyNGwb-UYkQ/live',
3297 'info_dict': {
3298 'id': 'X1whbWASnNQ', # This will keep changing
3299 'ext': 'mp4',
3300 'title': compat_str,
3301 'uploader': 'Sky News',
3302 'uploader_id': 'skynews',
3303 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/skynews',
3304 'upload_date': r're:\d{8}',
3305 'description': compat_str,
3306 'categories': ['News & Politics'],
3307 'tags': list,
3308 'like_count': int,
3309 'dislike_count': int,
3310 },
3311 'params': {
3312 'skip_download': True,
3313 },
3314 'expected_warnings': ['Downloading just video ', 'Ignoring subtitle tracks found in '],
3315 }, {
3316 'url': 'https://www.youtube.com/user/TheYoungTurks/live',
3317 'info_dict': {
3318 'id': 'a48o2S1cPoo',
3319 'ext': 'mp4',
3320 'title': 'The Young Turks - Live Main Show',
3321 'uploader': 'The Young Turks',
3322 'uploader_id': 'TheYoungTurks',
3323 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/TheYoungTurks',
3324 'upload_date': '20150715',
3325 'license': 'Standard YouTube License',
3326 'description': 'md5:438179573adcdff3c97ebb1ee632b891',
3327 'categories': ['News & Politics'],
3328 'tags': ['Cenk Uygur (TV Program Creator)', 'The Young Turks (Award-Winning Work)', 'Talk Show (TV Genre)'],
3329 'like_count': int,
3330 'dislike_count': int,
3331 },
3332 'params': {
3333 'skip_download': True,
3334 },
3335 'only_matching': True,
3336 }, {
3337 'url': 'https://www.youtube.com/channel/UC1yBKRuGpC1tSM73A0ZjYjQ/live',
3338 'only_matching': True,
3339 }, {
3340 'url': 'https://www.youtube.com/c/CommanderVideoHq/live',
3341 'only_matching': True,
3342 }, {
3343 'note': 'A channel that is not live. Should raise error',
3344 'url': 'https://www.youtube.com/user/numberphile/live',
3345 'only_matching': True,
3346 }, {
3347 'url': 'https://www.youtube.com/feed/trending',
3348 'only_matching': True,
3349 }, {
3350 'url': 'https://www.youtube.com/feed/library',
3351 'only_matching': True,
3352 }, {
3353 'url': 'https://www.youtube.com/feed/history',
3354 'only_matching': True,
3355 }, {
3356 'url': 'https://www.youtube.com/feed/subscriptions',
3357 'only_matching': True,
3358 }, {
3359 'url': 'https://www.youtube.com/feed/watch_later',
3360 'only_matching': True,
3361 }, {
3362 'note': 'Recommended - redirects to home page',
3363 'url': 'https://www.youtube.com/feed/recommended',
3364 'only_matching': True,
3365 }, {
3366 'note': 'inline playlist with not always working continuations',
3367 'url': 'https://www.youtube.com/watch?v=UC6u0Tct-Fo&list=PL36D642111D65BE7C',
3368 'only_matching': True,
3369 }, {
3370 'url': 'https://www.youtube.com/course?list=ECUl4u3cNGP61MdtwGTqZA0MreSaDybji8',
3371 'only_matching': True,
3372 }, {
3373 'url': 'https://www.youtube.com/course',
3374 'only_matching': True,
3375 }, {
3376 'url': 'https://www.youtube.com/zsecurity',
3377 'only_matching': True,
3378 }, {
3379 'url': 'http://www.youtube.com/NASAgovVideo/videos',
3380 'only_matching': True,
3381 }, {
3382 'url': 'https://www.youtube.com/TheYoungTurks/live',
3383 'only_matching': True,
3384 }, {
3385 'url': 'https://www.youtube.com/hashtag/cctv9',
3386 'info_dict': {
3387 'id': 'cctv9',
3388 'title': '#cctv9',
3389 },
3390 'playlist_mincount': 350,
3391 }, {
3392 'url': 'https://www.youtube.com/watch?list=PLW4dVinRY435CBE_JD3t-0SRXKfnZHS1P&feature=youtu.be&v=M9cJMXmQ_ZU',
3393 'only_matching': True,
3394 }, {
3395 'note': 'Requires Premium: should request additional YTM-info webpage (and have format 141) for videos in playlist',
3396 'url': 'https://music.youtube.com/playlist?list=PLRBp0Fe2GpgmgoscNFLxNyBVSFVdYmFkq',
3397 'only_matching': True
3398 }, {
3399 'note': '/browse/ should redirect to /channel/',
3400 'url': 'https://music.youtube.com/browse/UC1a8OFewdjuLq6KlF8M_8Ng',
3401 'only_matching': True
3402 }, {
3403 'note': 'VLPL, should redirect to playlist?list=PL...',
3404 'url': 'https://music.youtube.com/browse/VLPLRBp0Fe2GpgmgoscNFLxNyBVSFVdYmFkq',
3405 'info_dict': {
3406 'id': 'PLRBp0Fe2GpgmgoscNFLxNyBVSFVdYmFkq',
3407 'uploader': 'NoCopyrightSounds',
3408 'description': 'Providing you with copyright free / safe music for gaming, live streaming, studying and more!',
3409 'uploader_id': 'UC_aEa8K-EOJ3D6gOs7HcyNg',
3410 'title': 'NCS Releases',
3411 },
3412 'playlist_mincount': 166,
3413 }, {
3414 'note': 'Topic, should redirect to playlist?list=UU...',
3415 'url': 'https://music.youtube.com/browse/UC9ALqqC4aIeG5iDs7i90Bfw',
3416 'info_dict': {
3417 'id': 'UU9ALqqC4aIeG5iDs7i90Bfw',
3418 'uploader_id': 'UC9ALqqC4aIeG5iDs7i90Bfw',
3419 'title': 'Uploads from Royalty Free Music - Topic',
3420 'uploader': 'Royalty Free Music - Topic',
3421 },
3422 'expected_warnings': [
3423 'A channel/user page was given',
3424 'The URL does not have a videos tab',
3425 ],
3426 'playlist_mincount': 101,
3427 }, {
3428 'note': 'Topic without a UU playlist',
3429 'url': 'https://www.youtube.com/channel/UCtFRv9O2AHqOZjjynzrv-xg',
3430 'info_dict': {
3431 'id': 'UCtFRv9O2AHqOZjjynzrv-xg',
3432 'title': 'UCtFRv9O2AHqOZjjynzrv-xg',
3433 },
3434 'expected_warnings': [
3435 'A channel/user page was given',
3436 'The URL does not have a videos tab',
3437 'Falling back to channel URL',
3438 ],
3439 'playlist_mincount': 9,
3440 }, {
3441 'note': 'Youtube music Album',
3442 'url': 'https://music.youtube.com/browse/MPREb_gTAcphH99wE',
3443 'info_dict': {
3444 'id': 'OLAK5uy_l1m0thk3g31NmIIz_vMIbWtyv7eZixlH0',
3445 'title': 'Album - Royalty Free Music Library V2 (50 Songs)',
3446 },
3447 'playlist_count': 50,
3448 }]
3449
3450 @classmethod
3451 def suitable(cls, url):
3452 return False if YoutubeIE.suitable(url) else super(
3453 YoutubeTabIE, cls).suitable(url)
3454
3455 def _extract_channel_id(self, webpage):
3456 channel_id = self._html_search_meta(
3457 'channelId', webpage, 'channel id', default=None)
3458 if channel_id:
3459 return channel_id
3460 channel_url = self._html_search_meta(
3461 ('og:url', 'al:ios:url', 'al:android:url', 'al:web:url',
3462 'twitter:url', 'twitter:app:url:iphone', 'twitter:app:url:ipad',
3463 'twitter:app:url:googleplay'), webpage, 'channel url')
3464 return self._search_regex(
3465 r'https?://(?:www\.)?youtube\.com/channel/([^/?#&])+',
3466 channel_url, 'channel id')
3467
3468 @staticmethod
3469 def _extract_basic_item_renderer(item):
3470 # Modified from _extract_grid_item_renderer
3471 known_basic_renderers = (
3472 'playlistRenderer', 'videoRenderer', 'channelRenderer', 'showRenderer'
3473 )
3474 for key, renderer in item.items():
3475 if not isinstance(renderer, dict):
3476 continue
3477 elif key in known_basic_renderers:
3478 return renderer
3479 elif key.startswith('grid') and key.endswith('Renderer'):
3480 return renderer
3481
3482 def _grid_entries(self, grid_renderer):
3483 for item in grid_renderer['items']:
3484 if not isinstance(item, dict):
3485 continue
3486 renderer = self._extract_basic_item_renderer(item)
3487 if not isinstance(renderer, dict):
3488 continue
3489 title = try_get(
3490 renderer, (lambda x: x['title']['runs'][0]['text'],
3491 lambda x: x['title']['simpleText']), compat_str)
3492 # playlist
3493 playlist_id = renderer.get('playlistId')
3494 if playlist_id:
3495 yield self.url_result(
3496 'https://www.youtube.com/playlist?list=%s' % playlist_id,
3497 ie=YoutubeTabIE.ie_key(), video_id=playlist_id,
3498 video_title=title)
3499 continue
3500 # video
3501 video_id = renderer.get('videoId')
3502 if video_id:
3503 yield self._extract_video(renderer)
3504 continue
3505 # channel
3506 channel_id = renderer.get('channelId')
3507 if channel_id:
3508 title = try_get(
3509 renderer, lambda x: x['title']['simpleText'], compat_str)
3510 yield self.url_result(
3511 'https://www.youtube.com/channel/%s' % channel_id,
3512 ie=YoutubeTabIE.ie_key(), video_title=title)
3513 continue
3514 # generic endpoint URL support
3515 ep_url = urljoin('https://www.youtube.com/', try_get(
3516 renderer, lambda x: x['navigationEndpoint']['commandMetadata']['webCommandMetadata']['url'],
3517 compat_str))
3518 if ep_url:
3519 for ie in (YoutubeTabIE, YoutubePlaylistIE, YoutubeIE):
3520 if ie.suitable(ep_url):
3521 yield self.url_result(
3522 ep_url, ie=ie.ie_key(), video_id=ie._match_id(ep_url), video_title=title)
3523 break
3524
3525 def _shelf_entries_from_content(self, shelf_renderer):
3526 content = shelf_renderer.get('content')
3527 if not isinstance(content, dict):
3528 return
3529 renderer = content.get('gridRenderer') or content.get('expandedShelfContentsRenderer')
3530 if renderer:
3531 # TODO: add support for nested playlists so each shelf is processed
3532 # as separate playlist
3533 # TODO: this includes only first N items
3534 for entry in self._grid_entries(renderer):
3535 yield entry
3536 renderer = content.get('horizontalListRenderer')
3537 if renderer:
3538 # TODO
3539 pass
3540
3541 def _shelf_entries(self, shelf_renderer, skip_channels=False):
3542 ep = try_get(
3543 shelf_renderer, lambda x: x['endpoint']['commandMetadata']['webCommandMetadata']['url'],
3544 compat_str)
3545 shelf_url = urljoin('https://www.youtube.com', ep)
3546 if shelf_url:
3547 # Skipping links to another channels, note that checking for
3548 # endpoint.commandMetadata.webCommandMetadata.webPageTypwebPageType == WEB_PAGE_TYPE_CHANNEL
3549 # will not work
3550 if skip_channels and '/channels?' in shelf_url:
3551 return
3552 title = try_get(
3553 shelf_renderer, lambda x: x['title']['runs'][0]['text'], compat_str)
3554 yield self.url_result(shelf_url, video_title=title)
3555 # Shelf may not contain shelf URL, fallback to extraction from content
3556 for entry in self._shelf_entries_from_content(shelf_renderer):
3557 yield entry
3558
3559 def _playlist_entries(self, video_list_renderer):
3560 for content in video_list_renderer['contents']:
3561 if not isinstance(content, dict):
3562 continue
3563 renderer = content.get('playlistVideoRenderer') or content.get('playlistPanelVideoRenderer')
3564 if not isinstance(renderer, dict):
3565 continue
3566 video_id = renderer.get('videoId')
3567 if not video_id:
3568 continue
3569 yield self._extract_video(renderer)
3570
3571 def _rich_entries(self, rich_grid_renderer):
3572 renderer = try_get(
3573 rich_grid_renderer, lambda x: x['content']['videoRenderer'], dict) or {}
3574 video_id = renderer.get('videoId')
3575 if not video_id:
3576 return
3577 yield self._extract_video(renderer)
3578
3579 def _video_entry(self, video_renderer):
3580 video_id = video_renderer.get('videoId')
3581 if video_id:
3582 return self._extract_video(video_renderer)
3583
3584 def _post_thread_entries(self, post_thread_renderer):
3585 post_renderer = try_get(
3586 post_thread_renderer, lambda x: x['post']['backstagePostRenderer'], dict)
3587 if not post_renderer:
3588 return
3589 # video attachment
3590 video_renderer = try_get(
3591 post_renderer, lambda x: x['backstageAttachment']['videoRenderer'], dict) or {}
3592 video_id = video_renderer.get('videoId')
3593 if video_id:
3594 entry = self._extract_video(video_renderer)
3595 if entry:
3596 yield entry
3597 # playlist attachment
3598 playlist_id = try_get(
3599 post_renderer, lambda x: x['backstageAttachment']['playlistRenderer']['playlistId'], compat_str)
3600 if playlist_id:
3601 yield self.url_result(
3602 'https://www.youtube.com/playlist?list=%s' % playlist_id,
3603 ie=YoutubeTabIE.ie_key(), video_id=playlist_id)
3604 # inline video links
3605 runs = try_get(post_renderer, lambda x: x['contentText']['runs'], list) or []
3606 for run in runs:
3607 if not isinstance(run, dict):
3608 continue
3609 ep_url = try_get(
3610 run, lambda x: x['navigationEndpoint']['urlEndpoint']['url'], compat_str)
3611 if not ep_url:
3612 continue
3613 if not YoutubeIE.suitable(ep_url):
3614 continue
3615 ep_video_id = YoutubeIE._match_id(ep_url)
3616 if video_id == ep_video_id:
3617 continue
3618 yield self.url_result(ep_url, ie=YoutubeIE.ie_key(), video_id=ep_video_id)
3619
3620 def _post_thread_continuation_entries(self, post_thread_continuation):
3621 contents = post_thread_continuation.get('contents')
3622 if not isinstance(contents, list):
3623 return
3624 for content in contents:
3625 renderer = content.get('backstagePostThreadRenderer')
3626 if not isinstance(renderer, dict):
3627 continue
3628 for entry in self._post_thread_entries(renderer):
3629 yield entry
3630
    # Dead code kept as reference only (explicitly marked unused);
    # richItemRenderer content is handled by _rich_entries instead.
    r''' # unused
    def _rich_grid_entries(self, contents):
        for content in contents:
            video_renderer = try_get(content, lambda x: x['richItemRenderer']['content']['videoRenderer'], dict)
            if video_renderer:
                entry = self._video_entry(video_renderer)
                if entry:
                    yield entry
    '''
    def _entries(self, tab, item_id, identity_token, account_syncid, ytcfg):
        """Yield all entries of a tab, following API continuations until exhausted.

        `continuation_list` is a one-element list used as a writable cell so the
        nested generator can hand the next continuation token back to the outer
        loop without `nonlocal` (kept for Python 2 compatibility).
        """

        def extract_entries(parent_renderer):  # this needs to be called again for continuation to work with feeds
            contents = try_get(parent_renderer, lambda x: x['contents'], list) or []
            for content in contents:
                if not isinstance(content, dict):
                    continue
                is_renderer = try_get(content, lambda x: x['itemSectionRenderer'], dict)
                if not is_renderer:
                    # Not an item section; may still be a rich grid item
                    renderer = content.get('richItemRenderer')
                    if renderer:
                        for entry in self._rich_entries(renderer):
                            yield entry
                        continuation_list[0] = self._extract_continuation(parent_renderer)
                    continue
                isr_contents = try_get(is_renderer, lambda x: x['contents'], list) or []
                for isr_content in isr_contents:
                    if not isinstance(isr_content, dict):
                        continue

                    # Map renderer key -> generator producing its entries
                    known_renderers = {
                        'playlistVideoListRenderer': self._playlist_entries,
                        'gridRenderer': self._grid_entries,
                        'shelfRenderer': lambda x: self._shelf_entries(x, tab.get('title') != 'Channels'),
                        'backstagePostThreadRenderer': self._post_thread_entries,
                        'videoRenderer': lambda x: [self._video_entry(x)],
                    }
                    for key, renderer in isr_content.items():
                        if key not in known_renderers:
                            continue
                        for entry in known_renderers[key](renderer):
                            if entry:
                                yield entry
                        continuation_list[0] = self._extract_continuation(renderer)
                        break

                # Fall back to the item section's own continuation
                if not continuation_list[0]:
                    continuation_list[0] = self._extract_continuation(is_renderer)

            # Fall back to the parent renderer's continuation
            if not continuation_list[0]:
                continuation_list[0] = self._extract_continuation(parent_renderer)

        continuation_list = [None]  # Python 2 does not support nonlocal
        tab_content = try_get(tab, lambda x: x['content'], dict)
        if not tab_content:
            return
        parent_renderer = (
            try_get(tab_content, lambda x: x['sectionListRenderer'], dict)
            or try_get(tab_content, lambda x: x['richGridRenderer'], dict) or {})
        for entry in extract_entries(parent_renderer):
            yield entry
        continuation = continuation_list[0]
        context = self._extract_context(ytcfg)
        visitor_data = try_get(context, lambda x: x['client']['visitorData'], compat_str)

        # Page through the browse API while a continuation token remains
        for page_num in itertools.count(1):
            if not continuation:
                break
            query = {
                'continuation': continuation['continuation'],
                'clickTracking': {'clickTrackingParams': continuation['itct']}
            }
            headers = self._generate_api_headers(ytcfg, identity_token, account_syncid, visitor_data)
            response = self._extract_response(
                item_id='%s page %s' % (item_id, page_num),
                query=query, headers=headers, ytcfg=ytcfg,
                check_get_keys=('continuationContents', 'onResponseReceivedActions', 'onResponseReceivedEndpoints'))

            if not response:
                break
            # Carry the visitorData forward so subsequent requests share the session
            visitor_data = try_get(
                response, lambda x: x['responseContext']['visitorData'], compat_str) or visitor_data

            # Old-style continuation payloads ('continuationContents')
            known_continuation_renderers = {
                'playlistVideoListContinuation': self._playlist_entries,
                'gridContinuation': self._grid_entries,
                'itemSectionContinuation': self._post_thread_continuation_entries,
                'sectionListContinuation': extract_entries,  # for feeds
            }
            continuation_contents = try_get(
                response, lambda x: x['continuationContents'], dict) or {}
            continuation_renderer = None
            for key, value in continuation_contents.items():
                if key not in known_continuation_renderers:
                    continue
                continuation_renderer = value
                continuation_list = [None]
                for entry in known_continuation_renderers[key](continuation_renderer):
                    yield entry
                continuation = continuation_list[0] or self._extract_continuation(continuation_renderer)
                break
            if continuation_renderer:
                continue

            # New-style continuation payloads ('onResponseReceived*'); the second
            # tuple member names the key the handler expects the items under
            known_renderers = {
                'gridPlaylistRenderer': (self._grid_entries, 'items'),
                'gridVideoRenderer': (self._grid_entries, 'items'),
                'playlistVideoRenderer': (self._playlist_entries, 'contents'),
                'itemSectionRenderer': (extract_entries, 'contents'),  # for feeds
                'richItemRenderer': (extract_entries, 'contents'),  # for hashtag
                'backstagePostThreadRenderer': (self._post_thread_continuation_entries, 'contents')
            }
            on_response_received = dict_get(response, ('onResponseReceivedActions', 'onResponseReceivedEndpoints'))
            continuation_items = try_get(
                on_response_received, lambda x: x[0]['appendContinuationItemsAction']['continuationItems'], list)
            continuation_item = try_get(continuation_items, lambda x: x[0], dict) or {}
            video_items_renderer = None
            for key, value in continuation_item.items():
                if key not in known_renderers:
                    continue
                # Re-wrap the items so the existing handlers can consume them
                video_items_renderer = {known_renderers[key][1]: continuation_items}
                continuation_list = [None]
                for entry in known_renderers[key][0](video_items_renderer):
                    yield entry
                continuation = continuation_list[0] or self._extract_continuation(video_items_renderer)
                break
            if video_items_renderer:
                continue
            break
3759
3760 @staticmethod
3761 def _extract_selected_tab(tabs):
3762 for tab in tabs:
3763 renderer = dict_get(tab, ('tabRenderer', 'expandableTabRenderer')) or {}
3764 if renderer.get('selected') is True:
3765 return renderer
3766 else:
3767 raise ExtractorError('Unable to find selected tab')
3768
3769 @staticmethod
3770 def _extract_uploader(data):
3771 uploader = {}
3772 sidebar_renderer = try_get(
3773 data, lambda x: x['sidebar']['playlistSidebarRenderer']['items'], list)
3774 if sidebar_renderer:
3775 for item in sidebar_renderer:
3776 if not isinstance(item, dict):
3777 continue
3778 renderer = item.get('playlistSidebarSecondaryInfoRenderer')
3779 if not isinstance(renderer, dict):
3780 continue
3781 owner = try_get(
3782 renderer, lambda x: x['videoOwner']['videoOwnerRenderer']['title']['runs'][0], dict)
3783 if owner:
3784 uploader['uploader'] = owner.get('text')
3785 uploader['uploader_id'] = try_get(
3786 owner, lambda x: x['navigationEndpoint']['browseEndpoint']['browseId'], compat_str)
3787 uploader['uploader_url'] = urljoin(
3788 'https://www.youtube.com/',
3789 try_get(owner, lambda x: x['navigationEndpoint']['browseEndpoint']['canonicalBaseUrl'], compat_str))
3790 return {k: v for k, v in uploader.items() if v is not None}
3791
    def _extract_from_tabs(self, item_id, webpage, data, tabs):
        """Build a playlist result for the selected tab of a channel/playlist page."""
        playlist_id = title = description = channel_url = channel_name = channel_id = None
        # Both names are rebound below; the shared [] is never mutated
        thumbnails_list = tags = []

        selected_tab = self._extract_selected_tab(tabs)
        # Channel pages carry channelMetadataRenderer, playlist pages
        # playlistMetadataRenderer; either feeds the common block below
        renderer = try_get(
            data, lambda x: x['metadata']['channelMetadataRenderer'], dict)
        if renderer:
            channel_name = renderer.get('title')
            channel_url = renderer.get('channelUrl')
            channel_id = renderer.get('externalId')
        else:
            renderer = try_get(
                data, lambda x: x['metadata']['playlistMetadataRenderer'], dict)

        if renderer:
            title = renderer.get('title')
            description = renderer.get('description', '')
            playlist_id = channel_id
            tags = renderer.get('keywords', '').split()
            # Prefer the avatar; fall back to the sidebar playlist thumbnail
            thumbnails_list = (
                try_get(renderer, lambda x: x['avatar']['thumbnails'], list)
                or try_get(
                    data,
                    lambda x: x['sidebar']['playlistSidebarRenderer']['items'][0]['playlistSidebarPrimaryInfoRenderer']['thumbnailRenderer']['playlistVideoThumbnailRenderer']['thumbnail']['thumbnails'],
                    list)
                or [])

        thumbnails = []
        for t in thumbnails_list:
            if not isinstance(t, dict):
                continue
            thumbnail_url = url_or_none(t.get('url'))
            if not thumbnail_url:
                continue
            thumbnails.append({
                'url': thumbnail_url,
                'width': int_or_none(t.get('width')),
                'height': int_or_none(t.get('height')),
            })
        if playlist_id is None:
            playlist_id = item_id
        if title is None:
            # Hashtag pages have no metadata renderer; use the header or the id
            title = (
                try_get(data, lambda x: x['header']['hashtagHeaderRenderer']['hashtag']['simpleText'])
                or playlist_id)
        title += format_field(selected_tab, 'title', ' - %s')
        title += format_field(selected_tab, 'expandedText', ' - %s')

        metadata = {
            'playlist_id': playlist_id,
            'playlist_title': title,
            'playlist_description': description,
            'uploader': channel_name,
            'uploader_id': channel_id,
            'uploader_url': channel_url,
            'thumbnails': thumbnails,
            'tags': tags,
        }
        # For playlists the uploader comes from the sidebar instead
        if not channel_id:
            metadata.update(self._extract_uploader(data))
        metadata.update({
            'channel': metadata['uploader'],
            'channel_id': metadata['uploader_id'],
            'channel_url': metadata['uploader_url']})
        return self.playlist_result(
            self._entries(
                selected_tab, playlist_id,
                self._extract_identity_token(webpage, item_id),
                self._extract_account_syncid(data),
                self._extract_ytcfg(item_id, webpage)),
            **metadata)
3864
    def _extract_mix_playlist(self, playlist, playlist_id, data, webpage):
        """Yield videos of an 'infinite' mix playlist, stopping when it wraps around.

        Mixes have no real end: each 'next' response overlaps the previous page,
        so entries are de-duplicated against `last_id` and extraction stops when
        the very first video comes around again.
        """
        first_id = last_id = None
        ytcfg = self._extract_ytcfg(playlist_id, webpage)
        headers = self._generate_api_headers(
            ytcfg, account_syncid=self._extract_account_syncid(data),
            identity_token=self._extract_identity_token(webpage, item_id=playlist_id),
            visitor_data=try_get(self._extract_context(ytcfg), lambda x: x['client']['visitorData'], compat_str))
        for page_num in itertools.count(1):
            videos = list(self._playlist_entries(playlist))
            if not videos:
                return
            # Skip everything up to and including the last video of the previous page
            start = next((i for i, v in enumerate(videos) if v['id'] == last_id), -1) + 1
            if start >= len(videos):
                return
            for video in videos[start:]:
                if video['id'] == first_id:
                    self.to_screen('First video %s found again; Assuming end of Mix' % first_id)
                    return
                yield video
            first_id = first_id or videos[0]['id']
            last_id = videos[-1]['id']
            watch_endpoint = try_get(
                playlist, lambda x: x['contents'][-1]['playlistPanelVideoRenderer']['navigationEndpoint']['watchEndpoint'])
            # NOTE(review): watch_endpoint may be None if the last renderer has no
            # navigationEndpoint — the .get() calls below would then raise; confirm
            query = {
                'playlistId': playlist_id,
                'videoId': watch_endpoint.get('videoId') or last_id,
                'index': watch_endpoint.get('index') or len(videos),
                'params': watch_endpoint.get('params') or 'OAE%3D'
            }
            response = self._extract_response(
                item_id='%s page %d' % (playlist_id, page_num),
                query=query,
                ep='next',
                headers=headers,
                check_get_keys='contents'
            )
            playlist = try_get(
                response, lambda x: x['contents']['twoColumnWatchNextResults']['playlist']['playlist'], dict)
3903
3904 def _extract_from_playlist(self, item_id, url, data, playlist, webpage):
3905 title = playlist.get('title') or try_get(
3906 data, lambda x: x['titleText']['simpleText'], compat_str)
3907 playlist_id = playlist.get('playlistId') or item_id
3908
3909 # Delegating everything except mix playlists to regular tab-based playlist URL
3910 playlist_url = urljoin(url, try_get(
3911 playlist, lambda x: x['endpoint']['commandMetadata']['webCommandMetadata']['url'],
3912 compat_str))
3913 if playlist_url and playlist_url != url:
3914 return self.url_result(
3915 playlist_url, ie=YoutubeTabIE.ie_key(), video_id=playlist_id,
3916 video_title=title)
3917
3918 return self.playlist_result(
3919 self._extract_mix_playlist(playlist, playlist_id, data, webpage),
3920 playlist_id=playlist_id, playlist_title=title)
3921
    def _reload_with_unavailable_videos(self, item_id, data, webpage):
        """
        Get playlist with unavailable videos if the 'show unavailable videos' button exists.
        """
        sidebar_renderer = try_get(
            data, lambda x: x['sidebar']['playlistSidebarRenderer']['items'], list)
        if not sidebar_renderer:
            return
        browse_id = params = None
        # Scan the sidebar menu for the 'show unavailable videos' entry
        for item in sidebar_renderer:
            if not isinstance(item, dict):
                continue
            renderer = item.get('playlistSidebarPrimaryInfoRenderer')
            menu_renderer = try_get(
                renderer, lambda x: x['menu']['menuRenderer']['items'], list) or []
            for menu_item in menu_renderer:
                if not isinstance(menu_item, dict):
                    continue
                nav_item_renderer = menu_item.get('menuNavigationItemRenderer')
                text = try_get(
                    nav_item_renderer, lambda x: x['text']['simpleText'], compat_str)
                if not text or text.lower() != 'show unavailable videos':
                    continue
                browse_endpoint = try_get(
                    nav_item_renderer, lambda x: x['navigationEndpoint']['browseEndpoint'], dict) or {}
                browse_id = browse_endpoint.get('browseId')
                params = browse_endpoint.get('params')
                # Only breaks the inner loop; remaining sidebar items are still scanned
                break

        ytcfg = self._extract_ytcfg(item_id, webpage)
        # NOTE(review): account_syncid is extracted from ytcfg here, while other
        # call sites pass the initial data — confirm both sources are valid
        headers = self._generate_api_headers(
            ytcfg, account_syncid=self._extract_account_syncid(ytcfg),
            identity_token=self._extract_identity_token(webpage, item_id=item_id),
            visitor_data=try_get(
                self._extract_context(ytcfg), lambda x: x['client']['visitorData'], compat_str))
        # Defaults reproduce the endpoint of a plain 'VL<playlist id>' browse
        query = {
            'params': params or 'wgYCCAA=',
            'browseId': browse_id or 'VL%s' % item_id
        }
        return self._extract_response(
            item_id=item_id, headers=headers, query=query,
            check_get_keys='contents', fatal=False,
            note='Downloading API JSON with unavailable videos')
3965
3966 def _extract_webpage(self, url, item_id):
3967 retries = self.get_param('extractor_retries', 3)
3968 count = -1
3969 last_error = 'Incomplete yt initial data recieved'
3970 while count < retries:
3971 count += 1
3972 # Sometimes youtube returns a webpage with incomplete ytInitialData
3973 # See: https://github.com/yt-dlp/yt-dlp/issues/116
3974 if count:
3975 self.report_warning('%s. Retrying ...' % last_error)
3976 webpage = self._download_webpage(
3977 url, item_id,
3978 'Downloading webpage%s' % (' (retry #%d)' % count if count else ''))
3979 data = self._extract_yt_initial_data(item_id, webpage)
3980 if data.get('contents') or data.get('currentVideoEndpoint'):
3981 break
3982 # Extract alerts here only when there is error
3983 self._extract_and_report_alerts(data)
3984 if count >= retries:
3985 raise ExtractorError(last_error)
3986 return webpage, data
3987
3988 @staticmethod
3989 def _smuggle_data(entries, data):
3990 for entry in entries:
3991 if data:
3992 entry['url'] = smuggle_url(entry['url'], data)
3993 yield entry
3994
3995 def _real_extract(self, url):
3996 url, smuggled_data = unsmuggle_url(url, {})
3997 if self.is_music_url(url):
3998 smuggled_data['is_music_url'] = True
3999 info_dict = self.__real_extract(url, smuggled_data)
4000 if info_dict.get('entries'):
4001 info_dict['entries'] = self._smuggle_data(info_dict['entries'], smuggled_data)
4002 return info_dict
4003
    # Splits a URL into pre / tab / post around the channel tab segment,
    # reusing the named groups of _VALID_URL (see get_mobj in __real_extract)
    _url_re = re.compile(r'(?P<pre>%s)(?(channel_type)(?P<tab>/\w+))?(?P<post>.*)$' % _VALID_URL)
4005
    def __real_extract(self, url, smuggled_data):
        """Normalize the URL (music redirects, channel tab handling) and dispatch
        to tab, playlist or single-video extraction."""
        item_id = self._match_id(url)
        url = compat_urlparse.urlunparse(
            compat_urlparse.urlparse(url)._replace(netloc='www.youtube.com'))
        compat_opts = self.get_param('compat_opts', [])

        def get_mobj(url):
            # Regex groupdict with None values replaced by '' for easy string ops
            mobj = self._url_re.match(url).groupdict()
            mobj.update((k, '') for k, v in mobj.items() if v is None)
            return mobj

        mobj = get_mobj(url)
        # Youtube returns incomplete data if tabname is not lower case
        pre, tab, post, is_channel = mobj['pre'], mobj['tab'].lower(), mobj['post'], not mobj['not_channel']

        if is_channel:
            if smuggled_data.get('is_music_url'):
                if item_id[:2] == 'VL':
                    # Youtube music VL channels have an equivalent playlist
                    item_id = item_id[2:]
                    pre, tab, post, is_channel = 'https://www.youtube.com/playlist?list=%s' % item_id, '', '', False
                elif item_id[:2] == 'MP':
                    # Youtube music albums (/channel/MP...) have a OLAK playlist that can be extracted from the webpage
                    item_id = self._search_regex(
                        r'\\x22audioPlaylistId\\x22:\\x22([0-9A-Za-z_-]+)\\x22',
                        self._download_webpage('https://music.youtube.com/channel/%s' % item_id, item_id),
                        'playlist id')
                    pre, tab, post, is_channel = 'https://www.youtube.com/playlist?list=%s' % item_id, '', '', False
                elif mobj['channel_type'] == 'browse':
                    # Youtube music /browse/ should be changed to /channel/
                    pre = 'https://www.youtube.com/channel/%s' % item_id
        if is_channel and not tab and 'no-youtube-channel-redirect' not in compat_opts:
            # Home URLs should redirect to /videos/
            self.report_warning(
                'A channel/user page was given. All the channel\'s videos will be downloaded. '
                'To download only the videos in the home page, add a "/featured" to the URL')
            tab = '/videos'

        # Re-match after the rewrites above
        url = ''.join((pre, tab, post))
        mobj = get_mobj(url)

        # Handle both video/playlist URLs
        qs = parse_qs(url)
        video_id = qs.get('v', [None])[0]
        playlist_id = qs.get('list', [None])[0]

        if not video_id and mobj['not_channel'].startswith('watch'):
            if not playlist_id:
                # If there is neither video or playlist ids, youtube redirects to home page, which is undesirable
                raise ExtractorError('Unable to recognize tab page')
            # Common mistake: https://www.youtube.com/watch?list=playlist_id
            self.report_warning('A video URL was given without video ID. Trying to download playlist %s' % playlist_id)
            url = 'https://www.youtube.com/playlist?list=%s' % playlist_id
            mobj = get_mobj(url)

        if video_id and playlist_id:
            if self.get_param('noplaylist'):
                self.to_screen('Downloading just video %s because of --no-playlist' % video_id)
                return self.url_result(video_id, ie=YoutubeIE.ie_key(), video_id=video_id)
            self.to_screen('Downloading playlist %s; add --no-playlist to just download video %s' % (playlist_id, video_id))

        webpage, data = self._extract_webpage(url, item_id)

        tabs = try_get(
            data, lambda x: x['contents']['twoColumnBrowseResultsRenderer']['tabs'], list)
        if tabs:
            selected_tab = self._extract_selected_tab(tabs)
            tab_name = selected_tab.get('title', '')
            if 'no-youtube-channel-redirect' not in compat_opts:
                if mobj['tab'] == '/live':
                    # Live tab should have redirected to the video
                    raise ExtractorError('The channel is not currently live', expected=True)
                if mobj['tab'] == '/videos' and tab_name.lower() != mobj['tab'][1:]:
                    if not mobj['not_channel'] and item_id[:2] == 'UC':
                        # Topic channels don't have /videos. Use the equivalent playlist instead
                        self.report_warning('The URL does not have a %s tab. Trying to redirect to playlist UU%s instead' % (mobj['tab'][1:], item_id[2:]))
                        pl_id = 'UU%s' % item_id[2:]
                        pl_url = 'https://www.youtube.com/playlist?list=%s%s' % (pl_id, mobj['post'])
                        try:
                            pl_webpage, pl_data = self._extract_webpage(pl_url, pl_id)
                            for alert_type, alert_message in self._extract_alerts(pl_data):
                                if alert_type == 'error':
                                    raise ExtractorError('Youtube said: %s' % alert_message)
                            item_id, url, webpage, data = pl_id, pl_url, pl_webpage, pl_data
                        except ExtractorError:
                            self.report_warning('The playlist gave error. Falling back to channel URL')
                    else:
                        self.report_warning('The URL does not have a %s tab. %s is being downloaded instead' % (mobj['tab'][1:], tab_name))

        self.write_debug('Final URL: %s' % url)

        # YouTube sometimes provides a button to reload playlist with unavailable videos.
        if 'no-youtube-unavailable-videos' not in compat_opts:
            data = self._reload_with_unavailable_videos(item_id, data, webpage) or data
        self._extract_and_report_alerts(data)

        # Dispatch order: tabbed page -> watch-page playlist -> single video
        tabs = try_get(
            data, lambda x: x['contents']['twoColumnBrowseResultsRenderer']['tabs'], list)
        if tabs:
            return self._extract_from_tabs(item_id, webpage, data, tabs)

        playlist = try_get(
            data, lambda x: x['contents']['twoColumnWatchNextResults']['playlist']['playlist'], dict)
        if playlist:
            return self._extract_from_playlist(item_id, url, data, playlist, webpage)

        video_id = try_get(
            data, lambda x: x['currentVideoEndpoint']['watchEndpoint']['videoId'],
            compat_str) or video_id
        if video_id:
            if mobj['tab'] != '/live':  # live tab is expected to redirect to video
                self.report_warning('Unable to recognize playlist. Downloading just video %s' % video_id)
            return self.url_result(video_id, ie=YoutubeIE.ie_key(), video_id=video_id)

        raise ExtractorError('Unable to recognize tab page')
4121
4122
class YoutubePlaylistIE(InfoExtractor):
    """Thin extractor for bare playlist ids and playlist URLs.

    Does no extraction itself: it normalizes the input to a canonical
    https://www.youtube.com/playlist URL and delegates to YoutubeTabIE.
    """
    IE_DESC = 'YouTube.com playlists'
    _VALID_URL = r'''(?x)(?:
                        (?:https?://)?
                        (?:\w+\.)?
                        (?:
                            (?:
                                youtube(?:kids)?\.com|
                                invidio\.us
                            )
                            /.*?\?.*?\blist=
                        )?
                        (?P<id>%(playlist_id)s)
                     )''' % {'playlist_id': YoutubeBaseInfoExtractor._PLAYLIST_ID_RE}
    IE_NAME = 'youtube:playlist'
    _TESTS = [{
        'note': 'issue #673',
        'url': 'PLBB231211A4F62143',
        'info_dict': {
            'title': '[OLD]Team Fortress 2 (Class-based LP)',
            'id': 'PLBB231211A4F62143',
            'uploader': 'Wickydoo',
            'uploader_id': 'UCKSpbfbl5kRQpTdL7kMc-1Q',
        },
        'playlist_mincount': 29,
    }, {
        'url': 'PLtPgu7CB4gbY9oDN3drwC3cMbJggS7dKl',
        'info_dict': {
            'title': 'YDL_safe_search',
            'id': 'PLtPgu7CB4gbY9oDN3drwC3cMbJggS7dKl',
        },
        'playlist_count': 2,
        'skip': 'This playlist is private',
    }, {
        'note': 'embedded',
        'url': 'https://www.youtube.com/embed/videoseries?list=PL6IaIsEjSbf96XFRuNccS_RuEXwNdsoEu',
        'playlist_count': 4,
        'info_dict': {
            'title': 'JODA15',
            'id': 'PL6IaIsEjSbf96XFRuNccS_RuEXwNdsoEu',
            'uploader': 'milan',
            'uploader_id': 'UCEI1-PVPcYXjB73Hfelbmaw',
        }
    }, {
        'url': 'http://www.youtube.com/embed/_xDOZElKyNU?list=PLsyOSbh5bs16vubvKePAQ1x3PhKavfBIl',
        'playlist_mincount': 982,
        'info_dict': {
            'title': '2018 Chinese New Singles (11/6 updated)',
            'id': 'PLsyOSbh5bs16vubvKePAQ1x3PhKavfBIl',
            'uploader': 'LBK',
            'uploader_id': 'UC21nz3_MesPLqtDqwdvnoxA',
        }
    }, {
        'url': 'TLGGrESM50VT6acwMjAyMjAxNw',
        'only_matching': True,
    }, {
        # music album playlist
        'url': 'OLAK5uy_m4xAFdmMC5rX3Ji3g93pQe3hqLZw_9LhM',
        'only_matching': True,
    }]

    @classmethod
    def suitable(cls, url):
        # Anything YoutubeTabIE already matches must not be claimed here
        if YoutubeTabIE.suitable(url):
            return False
        # Hack for lazy extractors until more generic solution is implemented
        # (see #28780)
        from .youtube import parse_qs
        qs = parse_qs(url)
        # URLs with a video id belong to YoutubeIE
        if qs.get('v', [None])[0]:
            return False
        return super(YoutubePlaylistIE, cls).suitable(url)

    def _real_extract(self, url):
        """Rewrite to a canonical playlist URL and hand off to YoutubeTabIE."""
        playlist_id = self._match_id(url)
        is_music_url = YoutubeBaseInfoExtractor.is_music_url(url)
        # Keep the original query string when present so extra params survive
        url = update_url_query(
            'https://www.youtube.com/playlist',
            parse_qs(url) or {'list': playlist_id})
        if is_music_url:
            url = smuggle_url(url, {'is_music_url': True})
        return self.url_result(url, ie=YoutubeTabIE.ie_key(), video_id=playlist_id)
4205
4206
class YoutubeYtBeIE(InfoExtractor):
    """youtu.be short links that additionally carry a playlist id."""
    IE_DESC = 'youtu.be'
    _VALID_URL = r'https?://youtu\.be/(?P<id>[0-9A-Za-z_-]{11})/*?.*?\blist=(?P<playlist_id>%(playlist_id)s)' % {'playlist_id': YoutubeBaseInfoExtractor._PLAYLIST_ID_RE}
    _TESTS = [{
        'url': 'https://youtu.be/yeWKywCrFtk?list=PL2qgrgXsNUG5ig9cat4ohreBjYLAPC0J5',
        'info_dict': {
            'id': 'yeWKywCrFtk',
            'ext': 'mp4',
            'title': 'Small Scale Baler and Braiding Rugs',
            'uploader': 'Backus-Page House Museum',
            'uploader_id': 'backuspagemuseum',
            'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/backuspagemuseum',
            'upload_date': '20161008',
            'description': 'md5:800c0c78d5eb128500bffd4f0b4f2e8a',
            'categories': ['Nonprofits & Activism'],
            'tags': list,
            'like_count': int,
            'dislike_count': int,
        },
        'params': {
            'noplaylist': True,
            'skip_download': True,
        },
    }, {
        'url': 'https://youtu.be/uWyaPkt-VOI?list=PL9D9FC436B881BA21',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        # Rewrite the short link as a full watch URL so that YoutubeTabIE
        # sees both the video and the playlist.
        match = re.match(self._VALID_URL, url)
        video_id, playlist_id = match.group('id', 'playlist_id')
        watch_url = update_url_query('https://www.youtube.com/watch', {
            'v': video_id,
            'list': playlist_id,
            'feature': 'youtu.be',
        })
        return self.url_result(watch_url, ie=YoutubeTabIE.ie_key(), video_id=playlist_id)
4245
4246
class YoutubeYtUserIE(InfoExtractor):
    IE_DESC = 'YouTube.com user videos, URL or "ytuser" keyword'
    _VALID_URL = r'ytuser:(?P<id>.+)'
    _TESTS = [{
        'url': 'ytuser:phihag',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        # "ytuser:NAME" is just shorthand for the canonical /user/ page;
        # build that URL and let YoutubeTabIE do the real work.
        user_id = self._match_id(url)
        user_url = 'https://www.youtube.com/user/%s' % user_id
        return self.url_result(user_url, ie=YoutubeTabIE.ie_key(), video_id=user_id)
4260
4261
class YoutubeFavouritesIE(YoutubeBaseInfoExtractor):
    IE_NAME = 'youtube:favorites'
    IE_DESC = 'YouTube.com liked videos, ":ytfav" for short (requires authentication)'
    _VALID_URL = r':ytfav(?:ou?rite)?s?'
    _LOGIN_REQUIRED = True
    _TESTS = [{
        'url': ':ytfav',
        'only_matching': True,
    }, {
        'url': ':ytfavorites',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        # Liked videos live in the special "LL" playlist; delegate to the
        # playlist machinery in YoutubeTabIE.
        liked_url = 'https://www.youtube.com/playlist?list=LL'
        return self.url_result(liked_url, ie=YoutubeTabIE.ie_key())
4279
4280
class YoutubeSearchIE(SearchInfoExtractor, YoutubeTabIE):
    IE_DESC = 'YouTube.com searches, "ytsearch" keyword'
    # there doesn't appear to be a real limit, for example if you search for
    # 'python' you get more than 8.000.000 results
    _MAX_RESULTS = float('inf')
    IE_NAME = 'youtube:search'
    _SEARCH_KEY = 'ytsearch'
    # Extra innertube search parameter blob; subclasses override this to
    # apply filters (see YoutubeSearchDateIE).
    _SEARCH_PARAMS = None
    _TESTS = []

    def _entries(self, query, n):
        """Yield up to n video results for query, following continuation pages."""
        data = {'query': query}
        if self._SEARCH_PARAMS:
            data['params'] = self._SEARCH_PARAMS
        total = 0
        for page_num in itertools.count(1):
            search = self._extract_response(
                item_id='query "%s" page %s' % (query, page_num), ep='search', query=data,
                check_get_keys=('contents', 'onResponseReceivedCommands')
            )
            if not search:
                break
            # First page nests results under twoColumnSearchResultsRenderer;
            # continuation responses deliver them via
            # onResponseReceivedCommands instead.
            slr_contents = try_get(
                search,
                (lambda x: x['contents']['twoColumnSearchResultsRenderer']['primaryContents']['sectionListRenderer']['contents'],
                 lambda x: x['onResponseReceivedCommands'][0]['appendContinuationItemsAction']['continuationItems']),
                list)
            if not slr_contents:
                break

            # Youtube sometimes adds promoted content to searches,
            # changing the index location of videos and token.
            # So we search through all entries till we find them.
            continuation_token = None
            for slr_content in slr_contents:
                # Remember the first continuation token encountered; later
                # entries in the same response never override it.
                if continuation_token is None:
                    continuation_token = try_get(
                        slr_content,
                        lambda x: x['continuationItemRenderer']['continuationEndpoint']['continuationCommand']['token'],
                        compat_str)

                isr_contents = try_get(
                    slr_content,
                    lambda x: x['itemSectionRenderer']['contents'],
                    list)
                if not isr_contents:
                    continue
                for content in isr_contents:
                    # Skip anything that is not a plain video result with an
                    # id (e.g. shelves, promoted items, channel renderers).
                    if not isinstance(content, dict):
                        continue
                    video = content.get('videoRenderer')
                    if not isinstance(video, dict):
                        continue
                    video_id = video.get('videoId')
                    if not video_id:
                        continue

                    yield self._extract_video(video)
                    total += 1
                    if total == n:
                        return

            # No token means this was the last page of results.
            if not continuation_token:
                break
            data['continuation'] = continuation_token

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""
        return self.playlist_result(self._entries(query, n), query)
4350
4351
class YoutubeSearchDateIE(YoutubeSearchIE):
    IE_NAME = YoutubeSearchIE.IE_NAME + ':date'
    _SEARCH_KEY = 'ytsearchdate'
    IE_DESC = 'YouTube.com searches, newest videos first, "ytsearchdate" keyword'
    # URL-encoded 'CAI=' — innertube filter blob selecting newest-first order
    # (presumably a protobuf sort flag; see IE_DESC above)
    _SEARCH_PARAMS = 'CAI%3D'
4357
4358
class YoutubeSearchURLIE(YoutubeSearchIE):
    IE_DESC = 'YouTube.com search URLs'
    IE_NAME = YoutubeSearchIE.IE_NAME + '_url'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/results\?(.*?&)?(?:search_query|q)=(?:[^&]+)(?:[&]|$)'
    # _MAX_RESULTS = 100
    _TESTS = [{
        'url': 'https://www.youtube.com/results?baz=bar&search_query=youtube-dl+test+video&filters=video&lclk=video',
        'playlist_mincount': 5,
        'info_dict': {
            'title': 'youtube-dl test video',
        }
    }, {
        'url': 'https://www.youtube.com/results?q=test&sp=EgQIBBgB',
        'only_matching': True,
    }]

    @classmethod
    def _make_valid_url(cls):
        return cls._VALID_URL

    def _real_extract(self, url):
        # Pull the search terms (and the optional "sp" filter blob) out of
        # the URL, then reuse the regular ytsearch machinery.
        params = compat_parse_qs(compat_urllib_parse_urlparse(url).query)
        query = (params.get('search_query') or params.get('q'))[0]
        self._SEARCH_PARAMS = params.get('sp', ('',))[0]
        return self._get_n_results(query, self._MAX_RESULTS)
4384
4385
class YoutubeFeedsInfoExtractor(YoutubeTabIE):
    """Base class for YouTube feed extractors.

    Subclasses must define the _FEED_NAME property.
    """
    _LOGIN_REQUIRED = True
    _TESTS = []

    @property
    def IE_NAME(self):
        # Derive the extractor name from the feed slug.
        return 'youtube:%s' % self._FEED_NAME

    def _real_extract(self, url):
        # Every feed maps onto a /feed/<name> page handled by YoutubeTabIE.
        feed_url = 'https://www.youtube.com/feed/%s' % self._FEED_NAME
        return self.url_result(feed_url, ie=YoutubeTabIE.ie_key())
4402
4403
class YoutubeWatchLaterIE(InfoExtractor):
    IE_NAME = 'youtube:watchlater'
    IE_DESC = 'Youtube watch later list, ":ytwatchlater" for short (requires authentication)'
    _VALID_URL = r':ytwatchlater'
    _TESTS = [{
        'url': ':ytwatchlater',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        # Watch-later is exposed as the special playlist "WL".
        watch_later_url = 'https://www.youtube.com/playlist?list=WL'
        return self.url_result(watch_later_url, ie=YoutubeTabIE.ie_key())
4416
4417
class YoutubeRecommendedIE(YoutubeFeedsInfoExtractor):
    IE_DESC = 'YouTube.com recommended videos, ":ytrec" for short (requires authentication)'
    # Matches the bare youtube.com homepage as well as the :ytrec shortcut.
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/?(?:[?#]|$)|:ytrec(?:ommended)?'
    _FEED_NAME = 'recommended'
    # Override the base class: recommendations are also served anonymously.
    _LOGIN_REQUIRED = False
    _TESTS = [{
        'url': ':ytrec',
        'only_matching': True,
    }, {
        'url': ':ytrecommended',
        'only_matching': True,
    }, {
        'url': 'https://youtube.com',
        'only_matching': True,
    }]
4433
4434
class YoutubeSubscriptionsIE(YoutubeFeedsInfoExtractor):
    IE_DESC = 'YouTube.com subscriptions feed, ":ytsubs" for short (requires authentication)'
    _VALID_URL = r':ytsub(?:scription)?s?'
    # Feed slug used by YoutubeFeedsInfoExtractor to build the /feed/ URL.
    _FEED_NAME = 'subscriptions'
    _TESTS = [{
        'url': ':ytsubs',
        'only_matching': True,
    }, {
        'url': ':ytsubscriptions',
        'only_matching': True,
    }]
4446
4447
class YoutubeHistoryIE(YoutubeFeedsInfoExtractor):
    IE_DESC = 'Youtube watch history, ":ythis" for short (requires authentication)'
    _VALID_URL = r':ythis(?:tory)?'
    # Feed slug used by YoutubeFeedsInfoExtractor to build the /feed/ URL.
    _FEED_NAME = 'history'
    _TESTS = [{
        'url': ':ythistory',
        'only_matching': True,
    }]
4456
4457
class YoutubeTruncatedURLIE(InfoExtractor):
    IE_NAME = 'youtube:truncated_url'
    IE_DESC = False  # Do not list
    _VALID_URL = r'''(?x)
        (?:https?://)?
        (?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie)?\.com/
        (?:watch\?(?:
            feature=[a-z_]+|
            annotation_id=annotation_[^&]+|
            x-yt-cl=[0-9]+|
            hl=[^&]*|
            t=[0-9]+
        )?
        |
            attribution_link\?a=[^&]+
        )
        $
    '''

    _TESTS = [{
        'url': 'https://www.youtube.com/watch?annotation_id=annotation_3951667041',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/watch?',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/watch?x-yt-cl=84503534',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/watch?feature=foo',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/watch?hl=en-GB',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/watch?t=2372',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        # A watch URL without a v= parameter is almost always the result of
        # an unquoted "&" in the shell, so explain that instead of failing
        # with a cryptic error.
        message = (
            'Did you forget to quote the URL? Remember that & is a meta '
            'character in most shells, so you want to put the URL in quotes, '
            'like youtube-dl '
            '"https://www.youtube.com/watch?feature=foo&v=BaW_jenozKc" '
            ' or simply youtube-dl BaW_jenozKc .')
        raise ExtractorError(message, expected=True)
4505
4506
class YoutubeTruncatedIDIE(InfoExtractor):
    IE_NAME = 'youtube:truncated_id'
    IE_DESC = False  # Do not list
    # Valid video ids are exactly 11 characters; 1-10 characters means the
    # URL was most likely cut short when copied or pasted.
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/watch\?v=(?P<id>[0-9A-Za-z_-]{1,10})$'

    _TESTS = [{
        'url': 'https://www.youtube.com/watch?v=N_708QY7Ob',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        video_id = self._match_id(url)
        message = 'Incomplete YouTube ID %s. URL %s looks truncated.' % (video_id, url)
        raise ExtractorError(message, expected=True)