]> jfr.im git - yt-dlp.git/blob - yt_dlp/extractor/youtube.py
[youtube:comments] Move comment extraction to new API (#466)
[yt-dlp.git] / yt_dlp / extractor / youtube.py
1 # coding: utf-8
2
3 from __future__ import unicode_literals
4
5 import base64
6 import calendar
7 import copy
8 import hashlib
9 import itertools
10 import json
11 import os.path
12 import random
13 import re
14 import time
15 import traceback
16
17 from .common import InfoExtractor, SearchInfoExtractor
18 from ..compat import (
19 compat_chr,
20 compat_HTTPError,
21 compat_parse_qs,
22 compat_str,
23 compat_urllib_parse_unquote_plus,
24 compat_urllib_parse_urlencode,
25 compat_urllib_parse_urlparse,
26 compat_urlparse,
27 )
28 from ..jsinterp import JSInterpreter
29 from ..utils import (
30 bool_or_none,
31 bytes_to_intlist,
32 clean_html,
33 dict_get,
34 datetime_from_str,
35 error_to_compat_str,
36 ExtractorError,
37 format_field,
38 float_or_none,
39 int_or_none,
40 intlist_to_bytes,
41 mimetype2ext,
42 parse_codecs,
43 parse_duration,
44 qualities,
45 remove_start,
46 smuggle_url,
47 str_or_none,
48 str_to_int,
49 try_get,
50 unescapeHTML,
51 unified_strdate,
52 unsmuggle_url,
53 update_url_query,
54 url_or_none,
55 urlencode_postdata,
56 urljoin
57 )
58
59
def parse_qs(url):
    """Return the query-string of *url* parsed into a dict of lists."""
    parsed = compat_urlparse.urlparse(url)
    return compat_urlparse.parse_qs(parsed.query)
62
63
class YoutubeBaseInfoExtractor(InfoExtractor):
    """Provide base functions for Youtube extractors"""
    # Endpoints of the old accounts.google.com web-form login flow.
    # Only referenced by the disabled code in _login().
    _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
    _TWOFACTOR_URL = 'https://accounts.google.com/signin/challenge'

    _LOOKUP_URL = 'https://accounts.google.com/_/signin/sl/lookup'
    _CHALLENGE_URL = 'https://accounts.google.com/_/signin/sl/challenge'
    _TFA_URL = 'https://accounts.google.com/_/signin/challenge?hl=en&TL={0}'

    # youtube.com path components that can never be a vanity channel/user name
    _RESERVED_NAMES = (
        r'channel|c|user|browse|playlist|watch|w|v|embed|e|watch_popup|shorts|'
        r'movies|results|shared|hashtag|trending|feed|feeds|oembed|get_video_info|'
        r'storefront|oops|index|account|reporthistory|t/terms|about|upload|signin|logout')

    _NETRC_MACHINE = 'youtube'
    # If True it will raise an error if no login info is provided
    _LOGIN_REQUIRED = False

    # Matches every known playlist-ID flavour (regular, uploads, liked,
    # mixes, albums, ...) plus the special WL/LL/LM aliases
    _PLAYLIST_ID_RE = r'(?:(?:PL|LL|EC|UU|FL|RD|UL|TL|PU|OLAK5uy_)[0-9A-Za-z-_]{10,}|RDMM|WL|LL|LM)'
83
84 def _login(self):
85 """
86 Attempt to log in to YouTube.
87 True is returned if successful or skipped.
88 False is returned if login failed.
89
90 If _LOGIN_REQUIRED is set and no authentication was provided, an error is raised.
91 """
92
93 def warn(message):
94 self.report_warning(message)
95
96 # username+password login is broken
97 if self._LOGIN_REQUIRED and self.get_param('cookiefile') is None:
98 self.raise_login_required(
99 'Login details are needed to download this content', method='cookies')
100 username, password = self._get_login_info()
101 if username:
102 warn('Logging in using username and password is broken. %s' % self._LOGIN_HINTS['cookies'])
103 return
104
105 # Everything below this is broken!
106 r'''
107 # No authentication to be performed
108 if username is None:
109 if self._LOGIN_REQUIRED and self.get_param('cookiefile') is None:
110 raise ExtractorError('No login info available, needed for using %s.' % self.IE_NAME, expected=True)
111 # if self.get_param('cookiefile'): # TODO remove 'and False' later - too many people using outdated cookies and open issues, remind them.
112 # self.to_screen('[Cookies] Reminder - Make sure to always use up to date cookies!')
113 return True
114
115 login_page = self._download_webpage(
116 self._LOGIN_URL, None,
117 note='Downloading login page',
118 errnote='unable to fetch login page', fatal=False)
119 if login_page is False:
120 return
121
122 login_form = self._hidden_inputs(login_page)
123
124 def req(url, f_req, note, errnote):
125 data = login_form.copy()
126 data.update({
127 'pstMsg': 1,
128 'checkConnection': 'youtube',
129 'checkedDomains': 'youtube',
130 'hl': 'en',
131 'deviceinfo': '[null,null,null,[],null,"US",null,null,[],"GlifWebSignIn",null,[null,null,[]]]',
132 'f.req': json.dumps(f_req),
133 'flowName': 'GlifWebSignIn',
134 'flowEntry': 'ServiceLogin',
135 # TODO: reverse actual botguard identifier generation algo
136 'bgRequest': '["identifier",""]',
137 })
138 return self._download_json(
139 url, None, note=note, errnote=errnote,
140 transform_source=lambda s: re.sub(r'^[^[]*', '', s),
141 fatal=False,
142 data=urlencode_postdata(data), headers={
143 'Content-Type': 'application/x-www-form-urlencoded;charset=utf-8',
144 'Google-Accounts-XSRF': 1,
145 })
146
147 lookup_req = [
148 username,
149 None, [], None, 'US', None, None, 2, False, True,
150 [
151 None, None,
152 [2, 1, None, 1,
153 'https://accounts.google.com/ServiceLogin?passive=true&continue=https%3A%2F%2Fwww.youtube.com%2Fsignin%3Fnext%3D%252F%26action_handle_signin%3Dtrue%26hl%3Den%26app%3Ddesktop%26feature%3Dsign_in_button&hl=en&service=youtube&uilel=3&requestPath=%2FServiceLogin&Page=PasswordSeparationSignIn',
154 None, [], 4],
155 1, [None, None, []], None, None, None, True
156 ],
157 username,
158 ]
159
160 lookup_results = req(
161 self._LOOKUP_URL, lookup_req,
162 'Looking up account info', 'Unable to look up account info')
163
164 if lookup_results is False:
165 return False
166
167 user_hash = try_get(lookup_results, lambda x: x[0][2], compat_str)
168 if not user_hash:
169 warn('Unable to extract user hash')
170 return False
171
172 challenge_req = [
173 user_hash,
174 None, 1, None, [1, None, None, None, [password, None, True]],
175 [
176 None, None, [2, 1, None, 1, 'https://accounts.google.com/ServiceLogin?passive=true&continue=https%3A%2F%2Fwww.youtube.com%2Fsignin%3Fnext%3D%252F%26action_handle_signin%3Dtrue%26hl%3Den%26app%3Ddesktop%26feature%3Dsign_in_button&hl=en&service=youtube&uilel=3&requestPath=%2FServiceLogin&Page=PasswordSeparationSignIn', None, [], 4],
177 1, [None, None, []], None, None, None, True
178 ]]
179
180 challenge_results = req(
181 self._CHALLENGE_URL, challenge_req,
182 'Logging in', 'Unable to log in')
183
184 if challenge_results is False:
185 return
186
187 login_res = try_get(challenge_results, lambda x: x[0][5], list)
188 if login_res:
189 login_msg = try_get(login_res, lambda x: x[5], compat_str)
190 warn(
191 'Unable to login: %s' % 'Invalid password'
192 if login_msg == 'INCORRECT_ANSWER_ENTERED' else login_msg)
193 return False
194
195 res = try_get(challenge_results, lambda x: x[0][-1], list)
196 if not res:
197 warn('Unable to extract result entry')
198 return False
199
200 login_challenge = try_get(res, lambda x: x[0][0], list)
201 if login_challenge:
202 challenge_str = try_get(login_challenge, lambda x: x[2], compat_str)
203 if challenge_str == 'TWO_STEP_VERIFICATION':
204 # SEND_SUCCESS - TFA code has been successfully sent to phone
205 # QUOTA_EXCEEDED - reached the limit of TFA codes
206 status = try_get(login_challenge, lambda x: x[5], compat_str)
207 if status == 'QUOTA_EXCEEDED':
208 warn('Exceeded the limit of TFA codes, try later')
209 return False
210
211 tl = try_get(challenge_results, lambda x: x[1][2], compat_str)
212 if not tl:
213 warn('Unable to extract TL')
214 return False
215
216 tfa_code = self._get_tfa_info('2-step verification code')
217
218 if not tfa_code:
219 warn(
220 'Two-factor authentication required. Provide it either interactively or with --twofactor <code>'
221 '(Note that only TOTP (Google Authenticator App) codes work at this time.)')
222 return False
223
224 tfa_code = remove_start(tfa_code, 'G-')
225
226 tfa_req = [
227 user_hash, None, 2, None,
228 [
229 9, None, None, None, None, None, None, None,
230 [None, tfa_code, True, 2]
231 ]]
232
233 tfa_results = req(
234 self._TFA_URL.format(tl), tfa_req,
235 'Submitting TFA code', 'Unable to submit TFA code')
236
237 if tfa_results is False:
238 return False
239
240 tfa_res = try_get(tfa_results, lambda x: x[0][5], list)
241 if tfa_res:
242 tfa_msg = try_get(tfa_res, lambda x: x[5], compat_str)
243 warn(
244 'Unable to finish TFA: %s' % 'Invalid TFA code'
245 if tfa_msg == 'INCORRECT_ANSWER_ENTERED' else tfa_msg)
246 return False
247
248 check_cookie_url = try_get(
249 tfa_results, lambda x: x[0][-1][2], compat_str)
250 else:
251 CHALLENGES = {
252 'LOGIN_CHALLENGE': "This device isn't recognized. For your security, Google wants to make sure it's really you.",
253 'USERNAME_RECOVERY': 'Please provide additional information to aid in the recovery process.',
254 'REAUTH': "There is something unusual about your activity. For your security, Google wants to make sure it's really you.",
255 }
256 challenge = CHALLENGES.get(
257 challenge_str,
258 '%s returned error %s.' % (self.IE_NAME, challenge_str))
259 warn('%s\nGo to https://accounts.google.com/, login and solve a challenge.' % challenge)
260 return False
261 else:
262 check_cookie_url = try_get(res, lambda x: x[2], compat_str)
263
264 if not check_cookie_url:
265 warn('Unable to extract CheckCookie URL')
266 return False
267
268 check_cookie_results = self._download_webpage(
269 check_cookie_url, None, 'Checking cookie', fatal=False)
270
271 if check_cookie_results is False:
272 return False
273
274 if 'https://myaccount.google.com/' not in check_cookie_results:
275 warn('Unable to log in')
276 return False
277
278 return True
279 '''
280
281 def _initialize_consent(self):
282 cookies = self._get_cookies('https://www.youtube.com/')
283 if cookies.get('__Secure-3PSID'):
284 return
285 consent_id = None
286 consent = cookies.get('CONSENT')
287 if consent:
288 if 'YES' in consent.value:
289 return
290 consent_id = self._search_regex(
291 r'PENDING\+(\d+)', consent.value, 'consent', default=None)
292 if not consent_id:
293 consent_id = random.randint(100, 999)
294 self._set_cookie('.youtube.com', 'CONSENT', 'YES+cb.20210328-17-p0.en+FX+%s' % consent_id)
295
296 def _real_initialize(self):
297 self._initialize_consent()
298 if self._downloader is None:
299 return
300 if not self._login():
301 return
302
    # Regexes for the JSON blobs YouTube embeds into watch/browse pages
    _YT_INITIAL_DATA_RE = r'(?:window\s*\[\s*["\']ytInitialData["\']\s*\]|ytInitialData)\s*=\s*({.+?})\s*;'
    _YT_INITIAL_PLAYER_RESPONSE_RE = r'ytInitialPlayerResponse\s*=\s*({.+?})\s*;'
    # Whatever may terminate the ytInitialData statement in the page source
    _YT_INITIAL_BOUNDARY_RE = r'(?:var\s+meta|</script|\n)'

    # Built-in innertube client configurations, used as fallbacks whenever
    # the corresponding values cannot be read from a page's ytcfg.
    # Key names mirror the ones YouTube itself uses in ytcfg.
    _YT_DEFAULT_YTCFGS = {
        'WEB': {
            'INNERTUBE_API_VERSION': 'v1',
            'INNERTUBE_CLIENT_NAME': 'WEB',
            'INNERTUBE_CLIENT_VERSION': '2.20210622.10.00',
            'INNERTUBE_API_KEY': 'AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8',
            'INNERTUBE_CONTEXT': {
                'client': {
                    'clientName': 'WEB',
                    'clientVersion': '2.20210622.10.00',
                    'hl': 'en',
                }
            },
            'INNERTUBE_CONTEXT_CLIENT_NAME': 1
        },
        # YouTube Music web client
        'WEB_REMIX': {
            'INNERTUBE_API_VERSION': 'v1',
            'INNERTUBE_CLIENT_NAME': 'WEB_REMIX',
            'INNERTUBE_CLIENT_VERSION': '1.20210621.00.00',
            'INNERTUBE_API_KEY': 'AIzaSyC9XL3ZjWddXya6X74dJoCTL-WEYFDNX30',
            'INNERTUBE_CONTEXT': {
                'client': {
                    'clientName': 'WEB_REMIX',
                    'clientVersion': '1.20210621.00.00',
                    'hl': 'en',
                }
            },
            'INNERTUBE_CONTEXT_CLIENT_NAME': 67
        },
        'WEB_EMBEDDED_PLAYER': {
            'INNERTUBE_API_VERSION': 'v1',
            'INNERTUBE_CLIENT_NAME': 'WEB_EMBEDDED_PLAYER',
            'INNERTUBE_CLIENT_VERSION': '1.20210620.0.1',
            'INNERTUBE_API_KEY': 'AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8',
            'INNERTUBE_CONTEXT': {
                'client': {
                    'clientName': 'WEB_EMBEDDED_PLAYER',
                    'clientVersion': '1.20210620.0.1',
                    'hl': 'en',
                }
            },
            'INNERTUBE_CONTEXT_CLIENT_NAME': 56
        },
        'ANDROID': {
            'INNERTUBE_API_VERSION': 'v1',
            'INNERTUBE_CLIENT_NAME': 'ANDROID',
            'INNERTUBE_CLIENT_VERSION': '16.20',
            'INNERTUBE_API_KEY': 'AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8',
            'INNERTUBE_CONTEXT': {
                'client': {
                    'clientName': 'ANDROID',
                    'clientVersion': '16.20',
                    'hl': 'en',
                }
            },
            # NOTE(review): a string here, unlike the numeric ids of the WEB
            # clients above — presumably intentional; confirm before changing
            'INNERTUBE_CONTEXT_CLIENT_NAME': 'ANDROID'
        },
        'ANDROID_EMBEDDED_PLAYER': {
            'INNERTUBE_API_VERSION': 'v1',
            'INNERTUBE_CLIENT_NAME': 'ANDROID_EMBEDDED_PLAYER',
            'INNERTUBE_CLIENT_VERSION': '16.20',
            'INNERTUBE_API_KEY': 'AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8',
            'INNERTUBE_CONTEXT': {
                'client': {
                    'clientName': 'ANDROID_EMBEDDED_PLAYER',
                    'clientVersion': '16.20',
                    'hl': 'en',
                }
            },
            'INNERTUBE_CONTEXT_CLIENT_NAME': 'ANDROID_EMBEDDED_PLAYER'
        },
        'ANDROID_MUSIC': {
            'INNERTUBE_API_VERSION': 'v1',
            'INNERTUBE_CLIENT_NAME': 'ANDROID_MUSIC',
            'INNERTUBE_CLIENT_VERSION': '4.32',
            'INNERTUBE_API_KEY': 'AIzaSyC9XL3ZjWddXya6X74dJoCTL-WEYFDNX30',
            'INNERTUBE_CONTEXT': {
                'client': {
                    'clientName': 'ANDROID_MUSIC',
                    'clientVersion': '4.32',
                    'hl': 'en',
                }
            },
            'INNERTUBE_CONTEXT_CLIENT_NAME': 'ANDROID_MUSIC'
        }
    }

    # Innertube API hostname per client; clients not listed use the WEB host
    # (see _get_innertube_host)
    _YT_DEFAULT_INNERTUBE_HOSTS = {
        'DIRECT': 'youtubei.googleapis.com',
        'WEB': 'www.youtube.com',
        'WEB_REMIX': 'music.youtube.com',
        'ANDROID_MUSIC': 'music.youtube.com'
    }
400
401 def _get_default_ytcfg(self, client='WEB'):
402 if client in self._YT_DEFAULT_YTCFGS:
403 return copy.deepcopy(self._YT_DEFAULT_YTCFGS[client])
404 self.write_debug(f'INNERTUBE default client {client} does not exist - falling back to WEB client.')
405 return copy.deepcopy(self._YT_DEFAULT_YTCFGS['WEB'])
406
407 def _get_innertube_host(self, client='WEB'):
408 return dict_get(self._YT_DEFAULT_INNERTUBE_HOSTS, (client, 'WEB'))
409
410 def _ytcfg_get_safe(self, ytcfg, getter, expected_type=None, default_client='WEB'):
411 # try_get but with fallback to default ytcfg client values when present
412 _func = lambda y: try_get(y, getter, expected_type)
413 return _func(ytcfg) or _func(self._get_default_ytcfg(default_client))
414
415 def _extract_client_name(self, ytcfg, default_client='WEB'):
416 return self._ytcfg_get_safe(ytcfg, lambda x: x['INNERTUBE_CLIENT_NAME'], compat_str, default_client)
417
418 def _extract_client_version(self, ytcfg, default_client='WEB'):
419 return self._ytcfg_get_safe(ytcfg, lambda x: x['INNERTUBE_CLIENT_VERSION'], compat_str, default_client)
420
421 def _extract_api_key(self, ytcfg=None, default_client='WEB'):
422 return self._ytcfg_get_safe(ytcfg, lambda x: x['INNERTUBE_API_KEY'], compat_str, default_client)
423
424 def _extract_context(self, ytcfg=None, default_client='WEB'):
425 _get_context = lambda y: try_get(y, lambda x: x['INNERTUBE_CONTEXT'], dict)
426 context = _get_context(ytcfg)
427 if context:
428 return context
429
430 context = _get_context(self._get_default_ytcfg(default_client))
431 if not ytcfg:
432 return context
433
434 # Recreate the client context (required)
435 context['client'].update({
436 'clientVersion': self._extract_client_version(ytcfg, default_client),
437 'clientName': self._extract_client_name(ytcfg, default_client),
438 })
439 visitor_data = try_get(ytcfg, lambda x: x['VISITOR_DATA'], compat_str)
440 if visitor_data:
441 context['client']['visitorData'] = visitor_data
442 return context
443
    def _generate_sapisidhash_header(self, origin='https://www.youtube.com'):
        """Build the 'SAPISIDHASH <timestamp>_<sha1>' Authorization header
        value, or None when no SAPISID-type cookie is available.

        Side effect: sets a SAPISID cookie (copied from __Secure-3PAPISID)
        when only the latter is present."""
        # Sometimes SAPISID cookie isn't present but __Secure-3PAPISID is.
        # See: https://github.com/yt-dlp/yt-dlp/issues/393
        yt_cookies = self._get_cookies('https://www.youtube.com')
        sapisid_cookie = dict_get(
            yt_cookies, ('__Secure-3PAPISID', 'SAPISID'))
        if sapisid_cookie is None:
            return
        time_now = round(time.time())
        # SAPISID cookie is required if not already present
        if not yt_cookies.get('SAPISID'):
            self._set_cookie(
                '.youtube.com', 'SAPISID', sapisid_cookie.value, secure=True, expire_time=time_now + 3600)
        # SAPISIDHASH algorithm from https://stackoverflow.com/a/32065323
        sapisidhash = hashlib.sha1(
            f'{time_now} {sapisid_cookie.value} {origin}'.encode('utf-8')).hexdigest()
        return f'SAPISIDHASH {time_now}_{sapisidhash}'
461
462 def _call_api(self, ep, query, video_id, fatal=True, headers=None,
463 note='Downloading API JSON', errnote='Unable to download API page',
464 context=None, api_key=None, api_hostname=None, default_client='WEB'):
465
466 data = {'context': context} if context else {'context': self._extract_context(default_client=default_client)}
467 data.update(query)
468 real_headers = self._generate_api_headers(client=default_client)
469 real_headers.update({'content-type': 'application/json'})
470 if headers:
471 real_headers.update(headers)
472 return self._download_json(
473 'https://%s/youtubei/v1/%s' % (api_hostname or self._get_innertube_host(default_client), ep),
474 video_id=video_id, fatal=fatal, note=note, errnote=errnote,
475 data=json.dumps(data).encode('utf8'), headers=real_headers,
476 query={'key': api_key or self._extract_api_key()})
477
478 def _extract_yt_initial_data(self, video_id, webpage):
479 return self._parse_json(
480 self._search_regex(
481 (r'%s\s*%s' % (self._YT_INITIAL_DATA_RE, self._YT_INITIAL_BOUNDARY_RE),
482 self._YT_INITIAL_DATA_RE), webpage, 'yt initial data'),
483 video_id)
484
485 def _extract_identity_token(self, webpage, item_id):
486 ytcfg = self._extract_ytcfg(item_id, webpage)
487 if ytcfg:
488 token = try_get(ytcfg, lambda x: x['ID_TOKEN'], compat_str)
489 if token:
490 return token
491 return self._search_regex(
492 r'\bID_TOKEN["\']\s*:\s*["\'](.+?)["\']', webpage,
493 'identity token', default=None)
494
    @staticmethod
    def _extract_account_syncid(data):
        """
        Extract syncId required to download private playlists of secondary channels
        @param data Either response or ytcfg
        """
        sync_ids = (try_get(
            data, (lambda x: x['responseContext']['mainAppWebResponseContext']['datasyncId'],
                   lambda x: x['DATASYNC_ID']), compat_str) or '').split("||")
        if len(sync_ids) >= 2 and sync_ids[1]:
            # datasyncid is of the form "channel_syncid||user_syncid" for secondary channel
            # and just "user_syncid||" for primary channel. We only want the channel_syncid
            return sync_ids[0]
        # ytcfg includes channel_syncid if on secondary channel
        # NOTE(review): this .get assumes *data* is a dict (it is for both
        # documented inputs) — confirm if other types are ever passed
        return data.get('DELEGATED_SESSION_ID')
510
511 def _extract_ytcfg(self, video_id, webpage):
512 if not webpage:
513 return {}
514 return self._parse_json(
515 self._search_regex(
516 r'ytcfg\.set\s*\(\s*({.+?})\s*\)\s*;', webpage, 'ytcfg',
517 default='{}'), video_id, fatal=False) or {}
518
519 def _generate_api_headers(self, ytcfg=None, identity_token=None, account_syncid=None,
520 visitor_data=None, api_hostname=None, client='WEB'):
521 origin = 'https://' + (api_hostname if api_hostname else self._get_innertube_host(client))
522 headers = {
523 'X-YouTube-Client-Name': compat_str(
524 self._ytcfg_get_safe(ytcfg, lambda x: x['INNERTUBE_CONTEXT_CLIENT_NAME'], default_client=client)),
525 'X-YouTube-Client-Version': self._extract_client_version(ytcfg, client),
526 'Origin': origin
527 }
528 if not visitor_data and ytcfg:
529 visitor_data = try_get(
530 self._extract_context(ytcfg, client), lambda x: x['client']['visitorData'], compat_str)
531 if identity_token:
532 headers['X-Youtube-Identity-Token'] = identity_token
533 if account_syncid:
534 headers['X-Goog-PageId'] = account_syncid
535 headers['X-Goog-AuthUser'] = 0
536 if visitor_data:
537 headers['X-Goog-Visitor-Id'] = visitor_data
538 auth = self._generate_sapisidhash_header(origin)
539 if auth is not None:
540 headers['Authorization'] = auth
541 headers['X-Origin'] = origin
542 return headers
543
544 @staticmethod
545 def _build_api_continuation_query(continuation, ctp=None):
546 query = {
547 'continuation': continuation
548 }
549 # TODO: Inconsistency with clickTrackingParams.
550 # Currently we have a fixed ctp contained within context (from ytcfg)
551 # and a ctp in root query for continuation.
552 if ctp:
553 query['clickTracking'] = {'clickTrackingParams': ctp}
554 return query
555
556 @classmethod
557 def _continuation_query_ajax_to_api(cls, continuation_query):
558 continuation = dict_get(continuation_query, ('continuation', 'ctoken'))
559 return cls._build_api_continuation_query(continuation, continuation_query.get('itct'))
560
561 @staticmethod
562 def _build_continuation_query(continuation, ctp=None):
563 query = {
564 'ctoken': continuation,
565 'continuation': continuation,
566 }
567 if ctp:
568 query['itct'] = ctp
569 return query
570
571 @classmethod
572 def _extract_next_continuation_data(cls, renderer):
573 next_continuation = try_get(
574 renderer, (lambda x: x['continuations'][0]['nextContinuationData'],
575 lambda x: x['continuation']['reloadContinuationData']), dict)
576 if not next_continuation:
577 return
578 continuation = next_continuation.get('continuation')
579 if not continuation:
580 return
581 ctp = next_continuation.get('clickTrackingParams')
582 return cls._build_continuation_query(continuation, ctp)
583
584 @classmethod
585 def _extract_continuation_ep_data(cls, continuation_ep: dict):
586 if isinstance(continuation_ep, dict):
587 continuation = try_get(
588 continuation_ep, lambda x: x['continuationCommand']['token'], compat_str)
589 if not continuation:
590 return
591 ctp = continuation_ep.get('clickTrackingParams')
592 return cls._build_continuation_query(continuation, ctp)
593
594 @classmethod
595 def _extract_continuation(cls, renderer):
596 next_continuation = cls._extract_next_continuation_data(renderer)
597 if next_continuation:
598 return next_continuation
599 contents = []
600 for key in ('contents', 'items'):
601 contents.extend(try_get(renderer, lambda x: x[key], list) or [])
602 for content in contents:
603 if not isinstance(content, dict):
604 continue
605 continuation_ep = try_get(
606 content, (lambda x: x['continuationItemRenderer']['continuationEndpoint'],
607 lambda x: x['continuationItemRenderer']['button']['buttonRenderer']['command']),
608 dict)
609 continuation = cls._extract_continuation_ep_data(continuation_ep)
610 if continuation:
611 return continuation
612
613 @staticmethod
614 def _extract_alerts(data):
615 for alert_dict in try_get(data, lambda x: x['alerts'], list) or []:
616 if not isinstance(alert_dict, dict):
617 continue
618 for alert in alert_dict.values():
619 alert_type = alert.get('type')
620 if not alert_type:
621 continue
622 message = try_get(alert, lambda x: x['text']['simpleText'], compat_str) or ''
623 if message:
624 yield alert_type, message
625 for run in try_get(alert, lambda x: x['text']['runs'], list) or []:
626 message += try_get(run, lambda x: x['text'], compat_str)
627 if message:
628 yield alert_type, message
629
630 def _report_alerts(self, alerts, expected=True):
631 errors = []
632 warnings = []
633 for alert_type, alert_message in alerts:
634 if alert_type.lower() == 'error':
635 errors.append([alert_type, alert_message])
636 else:
637 warnings.append([alert_type, alert_message])
638
639 for alert_type, alert_message in (warnings + errors[:-1]):
640 self.report_warning('YouTube said: %s - %s' % (alert_type, alert_message))
641 if errors:
642 raise ExtractorError('YouTube said: %s' % errors[-1][1], expected=expected)
643
644 def _extract_and_report_alerts(self, data, *args, **kwargs):
645 return self._report_alerts(self._extract_alerts(data), *args, **kwargs)
646
    def _extract_response(self, item_id, query, note='Downloading API JSON', headers=None,
                          ytcfg=None, check_get_keys=None, ep='browse', fatal=True, api_hostname=None,
                          default_client='WEB'):
        """Call the innertube API endpoint *ep*, retrying on intermittent
        HTTP errors (500/503/404) and on incomplete responses (none of
        *check_get_keys* present in the parsed JSON).

        Returns the parsed response, or None when a non-fatal attempt
        ultimately fails.
        """
        response = None
        last_error = None
        count = -1
        retries = self.get_param('extractor_retries', 3)
        if check_get_keys is None:
            check_get_keys = []
        while count < retries:
            count += 1
            if last_error:
                self.report_warning('%s. Retrying ...' % last_error)
            try:
                response = self._call_api(
                    ep=ep, fatal=True, headers=headers,
                    video_id=item_id, query=query,
                    context=self._extract_context(ytcfg, default_client),
                    api_key=self._extract_api_key(ytcfg, default_client),
                    api_hostname=api_hostname, default_client=default_client,
                    note='%s%s' % (note, ' (retry #%d)' % count if count else ''))
            except ExtractorError as e:
                if isinstance(e.cause, compat_HTTPError) and e.cause.code in (500, 503, 404):
                    # Downloading page may result in intermittent 5xx HTTP error
                    # Sometimes a 404 is also recieved. See: https://github.com/ytdl-org/youtube-dl/issues/28289
                    last_error = 'HTTP Error %s' % e.cause.code
                    if count < retries:
                        continue
                # Non-retryable error (or retries exhausted): raise or warn
                if fatal:
                    raise
                else:
                    self.report_warning(error_to_compat_str(e))
                    return

            else:
                # Youtube may send alerts if there was an issue with the continuation page
                try:
                    self._extract_and_report_alerts(response, expected=False)
                except ExtractorError as e:
                    if fatal:
                        raise
                    self.report_warning(error_to_compat_str(e))
                    return
                if not check_get_keys or dict_get(response, check_get_keys):
                    break
                # Youtube sometimes sends incomplete data
                # See: https://github.com/ytdl-org/youtube-dl/issues/28194
                last_error = 'Incomplete data received'
                if count >= retries:
                    if fatal:
                        raise ExtractorError(last_error)
                    else:
                        self.report_warning(last_error)
                        return
        return response
702
703 @staticmethod
704 def is_music_url(url):
705 return re.match(r'https?://music\.youtube\.com/', url) is not None
706
707 def _extract_video(self, renderer):
708 video_id = renderer.get('videoId')
709 title = try_get(
710 renderer,
711 (lambda x: x['title']['runs'][0]['text'],
712 lambda x: x['title']['simpleText']), compat_str)
713 description = try_get(
714 renderer, lambda x: x['descriptionSnippet']['runs'][0]['text'],
715 compat_str)
716 duration = parse_duration(try_get(
717 renderer, lambda x: x['lengthText']['simpleText'], compat_str))
718 view_count_text = try_get(
719 renderer, lambda x: x['viewCountText']['simpleText'], compat_str) or ''
720 view_count = str_to_int(self._search_regex(
721 r'^([\d,]+)', re.sub(r'\s', '', view_count_text),
722 'view count', default=None))
723 uploader = try_get(
724 renderer,
725 (lambda x: x['ownerText']['runs'][0]['text'],
726 lambda x: x['shortBylineText']['runs'][0]['text']), compat_str)
727 return {
728 '_type': 'url',
729 'ie_key': YoutubeIE.ie_key(),
730 'id': video_id,
731 'url': video_id,
732 'title': title,
733 'description': description,
734 'duration': duration,
735 'view_count': view_count,
736 'uploader': uploader,
737 }
738
739
740 class YoutubeIE(YoutubeBaseInfoExtractor):
741 IE_DESC = 'YouTube.com'
742 _INVIDIOUS_SITES = (
743 # invidious-redirect websites
744 r'(?:www\.)?redirect\.invidious\.io',
745 r'(?:(?:www|dev)\.)?invidio\.us',
746 # Invidious instances taken from https://github.com/iv-org/documentation/blob/master/Invidious-Instances.md
747 r'(?:www\.)?invidious\.pussthecat\.org',
748 r'(?:www\.)?invidious\.zee\.li',
749 r'(?:www\.)?invidious\.ethibox\.fr',
750 r'(?:www\.)?invidious\.3o7z6yfxhbw7n3za4rss6l434kmv55cgw2vuziwuigpwegswvwzqipyd\.onion',
751 # youtube-dl invidious instances list
752 r'(?:(?:www|no)\.)?invidiou\.sh',
753 r'(?:(?:www|fi)\.)?invidious\.snopyta\.org',
754 r'(?:www\.)?invidious\.kabi\.tk',
755 r'(?:www\.)?invidious\.mastodon\.host',
756 r'(?:www\.)?invidious\.zapashcanon\.fr',
757 r'(?:www\.)?(?:invidious(?:-us)?|piped)\.kavin\.rocks',
758 r'(?:www\.)?invidious\.tinfoil-hat\.net',
759 r'(?:www\.)?invidious\.himiko\.cloud',
760 r'(?:www\.)?invidious\.reallyancient\.tech',
761 r'(?:www\.)?invidious\.tube',
762 r'(?:www\.)?invidiou\.site',
763 r'(?:www\.)?invidious\.site',
764 r'(?:www\.)?invidious\.xyz',
765 r'(?:www\.)?invidious\.nixnet\.xyz',
766 r'(?:www\.)?invidious\.048596\.xyz',
767 r'(?:www\.)?invidious\.drycat\.fr',
768 r'(?:www\.)?inv\.skyn3t\.in',
769 r'(?:www\.)?tube\.poal\.co',
770 r'(?:www\.)?tube\.connect\.cafe',
771 r'(?:www\.)?vid\.wxzm\.sx',
772 r'(?:www\.)?vid\.mint\.lgbt',
773 r'(?:www\.)?vid\.puffyan\.us',
774 r'(?:www\.)?yewtu\.be',
775 r'(?:www\.)?yt\.elukerio\.org',
776 r'(?:www\.)?yt\.lelux\.fi',
777 r'(?:www\.)?invidious\.ggc-project\.de',
778 r'(?:www\.)?yt\.maisputain\.ovh',
779 r'(?:www\.)?ytprivate\.com',
780 r'(?:www\.)?invidious\.13ad\.de',
781 r'(?:www\.)?invidious\.toot\.koeln',
782 r'(?:www\.)?invidious\.fdn\.fr',
783 r'(?:www\.)?watch\.nettohikari\.com',
784 r'(?:www\.)?invidious\.namazso\.eu',
785 r'(?:www\.)?invidious\.silkky\.cloud',
786 r'(?:www\.)?invidious\.exonip\.de',
787 r'(?:www\.)?invidious\.riverside\.rocks',
788 r'(?:www\.)?invidious\.blamefran\.net',
789 r'(?:www\.)?invidious\.moomoo\.de',
790 r'(?:www\.)?ytb\.trom\.tf',
791 r'(?:www\.)?yt\.cyberhost\.uk',
792 r'(?:www\.)?kgg2m7yk5aybusll\.onion',
793 r'(?:www\.)?qklhadlycap4cnod\.onion',
794 r'(?:www\.)?axqzx4s6s54s32yentfqojs3x5i7faxza6xo3ehd4bzzsg2ii4fv2iid\.onion',
795 r'(?:www\.)?c7hqkpkpemu6e7emz5b4vyz7idjgdvgaaa3dyimmeojqbgpea3xqjoid\.onion',
796 r'(?:www\.)?fz253lmuao3strwbfbmx46yu7acac2jz27iwtorgmbqlkurlclmancad\.onion',
797 r'(?:www\.)?invidious\.l4qlywnpwqsluw65ts7md3khrivpirse744un3x7mlskqauz5pyuzgqd\.onion',
798 r'(?:www\.)?owxfohz4kjyv25fvlqilyxast7inivgiktls3th44jhk3ej3i7ya\.b32\.i2p',
799 r'(?:www\.)?4l2dgddgsrkf2ous66i6seeyi6etzfgrue332grh2n7madpwopotugyd\.onion',
800 r'(?:www\.)?w6ijuptxiku4xpnnaetxvnkc5vqcdu7mgns2u77qefoixi63vbvnpnqd\.onion',
801 r'(?:www\.)?kbjggqkzv65ivcqj6bumvp337z6264huv5kpkwuv6gu5yjiskvan7fad\.onion',
802 r'(?:www\.)?grwp24hodrefzvjjuccrkw3mjq4tzhaaq32amf33dzpmuxe7ilepcmad\.onion',
803 r'(?:www\.)?hpniueoejy4opn7bc4ftgazyqjoeqwlvh2uiku2xqku6zpoa4bf5ruid\.onion',
804 )
805 _VALID_URL = r"""(?x)^
806 (
807 (?:https?://|//) # http(s):// or protocol-independent URL
808 (?:(?:(?:(?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie|kids)?\.com|
809 (?:www\.)?deturl\.com/www\.youtube\.com|
810 (?:www\.)?pwnyoutube\.com|
811 (?:www\.)?hooktube\.com|
812 (?:www\.)?yourepeat\.com|
813 tube\.majestyc\.net|
814 %(invidious)s|
815 youtube\.googleapis\.com)/ # the various hostnames, with wildcard subdomains
816 (?:.*?\#/)? # handle anchor (#/) redirect urls
817 (?: # the various things that can precede the ID:
818 (?:(?:v|embed|e)/(?!videoseries)) # v/ or embed/ or e/
819 |(?: # or the v= param in all its forms
820 (?:(?:watch|movie)(?:_popup)?(?:\.php)?/?)? # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
821 (?:\?|\#!?) # the params delimiter ? or # or #!
822 (?:.*?[&;])?? # any other preceding param (like /?s=tuff&v=xxxx or ?s=tuff&amp;v=V36LpHqtcDY)
823 v=
824 )
825 ))
826 |(?:
827 youtu\.be| # just youtu.be/xxxx
828 vid\.plus| # or vid.plus/xxxx
829 zwearz\.com/watch| # or zwearz.com/watch/xxxx
830 %(invidious)s
831 )/
832 |(?:www\.)?cleanvideosearch\.com/media/action/yt/watch\?videoId=
833 )
834 )? # all until now is optional -> you can pass the naked ID
835 (?P<id>[0-9A-Za-z_-]{11}) # here is it! the YouTube video ID
836 (?(1).+)? # if we found the ID, everything can follow
837 (?:\#|$)""" % {
838 'invidious': '|'.join(_INVIDIOUS_SITES),
839 }
840 _PLAYER_INFO_RE = (
841 r'/s/player/(?P<id>[a-zA-Z0-9_-]{8,})/player',
842 r'/(?P<id>[a-zA-Z0-9_-]{8,})/player(?:_ias\.vflset(?:/[a-zA-Z]{2,3}_[a-zA-Z]{2,3})?|-plasma-ias-(?:phone|tablet)-[a-z]{2}_[A-Z]{2}\.vflset)/base\.js$',
843 r'\b(?P<id>vfl[a-zA-Z0-9_-]+)\b.*?\.js$',
844 )
    _formats = {
        # Static metadata for known YouTube itags, keyed by itag as a string.
        # NOTE(review): presumably merged with/overridden by per-request format
        # data at the use site — confirm there. 'preference' penalties push 3D
        # (-20) and HLS (-10) variants below the plain progressive formats.
        '5': {'ext': 'flv', 'width': 400, 'height': 240, 'acodec': 'mp3', 'abr': 64, 'vcodec': 'h263'},
        '6': {'ext': 'flv', 'width': 450, 'height': 270, 'acodec': 'mp3', 'abr': 64, 'vcodec': 'h263'},
        '13': {'ext': '3gp', 'acodec': 'aac', 'vcodec': 'mp4v'},
        '17': {'ext': '3gp', 'width': 176, 'height': 144, 'acodec': 'aac', 'abr': 24, 'vcodec': 'mp4v'},
        '18': {'ext': 'mp4', 'width': 640, 'height': 360, 'acodec': 'aac', 'abr': 96, 'vcodec': 'h264'},
        '22': {'ext': 'mp4', 'width': 1280, 'height': 720, 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264'},
        '34': {'ext': 'flv', 'width': 640, 'height': 360, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'},
        '35': {'ext': 'flv', 'width': 854, 'height': 480, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'},
        # itag 36 videos are either 320x180 (BaW_jenozKc) or 320x240 (__2ABJjxzNo), abr varies as well
        '36': {'ext': '3gp', 'width': 320, 'acodec': 'aac', 'vcodec': 'mp4v'},
        '37': {'ext': 'mp4', 'width': 1920, 'height': 1080, 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264'},
        '38': {'ext': 'mp4', 'width': 4096, 'height': 3072, 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264'},
        '43': {'ext': 'webm', 'width': 640, 'height': 360, 'acodec': 'vorbis', 'abr': 128, 'vcodec': 'vp8'},
        '44': {'ext': 'webm', 'width': 854, 'height': 480, 'acodec': 'vorbis', 'abr': 128, 'vcodec': 'vp8'},
        '45': {'ext': 'webm', 'width': 1280, 'height': 720, 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8'},
        '46': {'ext': 'webm', 'width': 1920, 'height': 1080, 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8'},
        '59': {'ext': 'mp4', 'width': 854, 'height': 480, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'},
        '78': {'ext': 'mp4', 'width': 854, 'height': 480, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'},


        # 3D videos
        '82': {'ext': 'mp4', 'height': 360, 'format_note': '3D', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -20},
        '83': {'ext': 'mp4', 'height': 480, 'format_note': '3D', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -20},
        '84': {'ext': 'mp4', 'height': 720, 'format_note': '3D', 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264', 'preference': -20},
        '85': {'ext': 'mp4', 'height': 1080, 'format_note': '3D', 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264', 'preference': -20},
        '100': {'ext': 'webm', 'height': 360, 'format_note': '3D', 'acodec': 'vorbis', 'abr': 128, 'vcodec': 'vp8', 'preference': -20},
        '101': {'ext': 'webm', 'height': 480, 'format_note': '3D', 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8', 'preference': -20},
        '102': {'ext': 'webm', 'height': 720, 'format_note': '3D', 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8', 'preference': -20},

        # Apple HTTP Live Streaming
        '91': {'ext': 'mp4', 'height': 144, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 48, 'vcodec': 'h264', 'preference': -10},
        '92': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 48, 'vcodec': 'h264', 'preference': -10},
        '93': {'ext': 'mp4', 'height': 360, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -10},
        '94': {'ext': 'mp4', 'height': 480, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -10},
        '95': {'ext': 'mp4', 'height': 720, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 256, 'vcodec': 'h264', 'preference': -10},
        '96': {'ext': 'mp4', 'height': 1080, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 256, 'vcodec': 'h264', 'preference': -10},
        '132': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 48, 'vcodec': 'h264', 'preference': -10},
        # itag 151 really is 72 pixels tall (ultra-low-quality HLS rendition)
        '151': {'ext': 'mp4', 'height': 72, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 24, 'vcodec': 'h264', 'preference': -10},

        # DASH mp4 video
        '133': {'ext': 'mp4', 'height': 240, 'format_note': 'DASH video', 'vcodec': 'h264'},
        '134': {'ext': 'mp4', 'height': 360, 'format_note': 'DASH video', 'vcodec': 'h264'},
        '135': {'ext': 'mp4', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'h264'},
        '136': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'h264'},
        '137': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'h264'},
        '138': {'ext': 'mp4', 'format_note': 'DASH video', 'vcodec': 'h264'},  # Height can vary (https://github.com/ytdl-org/youtube-dl/issues/4559)
        '160': {'ext': 'mp4', 'height': 144, 'format_note': 'DASH video', 'vcodec': 'h264'},
        '212': {'ext': 'mp4', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'h264'},
        '264': {'ext': 'mp4', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'h264'},
        '298': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'h264', 'fps': 60},
        '299': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'h264', 'fps': 60},
        '266': {'ext': 'mp4', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'h264'},

        # Dash mp4 audio
        '139': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 48, 'container': 'm4a_dash'},
        '140': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 128, 'container': 'm4a_dash'},
        '141': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 256, 'container': 'm4a_dash'},
        '256': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'container': 'm4a_dash'},
        '258': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'container': 'm4a_dash'},
        '325': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'dtse', 'container': 'm4a_dash'},
        '328': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'ec-3', 'container': 'm4a_dash'},

        # Dash webm
        '167': {'ext': 'webm', 'height': 360, 'width': 640, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
        '168': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
        '169': {'ext': 'webm', 'height': 720, 'width': 1280, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
        '170': {'ext': 'webm', 'height': 1080, 'width': 1920, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
        '218': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
        '219': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
        '278': {'ext': 'webm', 'height': 144, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp9'},
        '242': {'ext': 'webm', 'height': 240, 'format_note': 'DASH video', 'vcodec': 'vp9'},
        '243': {'ext': 'webm', 'height': 360, 'format_note': 'DASH video', 'vcodec': 'vp9'},
        '244': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'vp9'},
        '245': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'vp9'},
        '246': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'vp9'},
        '247': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'vp9'},
        '248': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'vp9'},
        '271': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'vp9'},
        # itag 272 videos are either 3840x2160 (e.g. RtoitU2A-3E) or 7680x4320 (sLprVF6d7Ug)
        '272': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'vp9'},
        '302': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60},
        '303': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60},
        '308': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60},
        '313': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'vp9'},
        '315': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60},

        # Dash webm audio
        '171': {'ext': 'webm', 'acodec': 'vorbis', 'format_note': 'DASH audio', 'abr': 128},
        '172': {'ext': 'webm', 'acodec': 'vorbis', 'format_note': 'DASH audio', 'abr': 256},

        # Dash webm audio with opus inside
        '249': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 50},
        '250': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 70},
        '251': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 160},

        # RTMP (unnamed)
        '_rtmp': {'protocol': 'rtmp'},

        # av01 video only formats sometimes served with "unknown" codecs
        '394': {'acodec': 'none', 'vcodec': 'av01.0.05M.08'},
        '395': {'acodec': 'none', 'vcodec': 'av01.0.05M.08'},
        '396': {'acodec': 'none', 'vcodec': 'av01.0.05M.08'},
        '397': {'acodec': 'none', 'vcodec': 'av01.0.05M.08'},
    }
    # Subtitle serialisations known for YouTube's timedtext endpoint.
    # NOTE(review): ordering presumably reflects preference — confirm at use site.
    _SUBTITLE_FORMATS = ('json3', 'srv1', 'srv2', 'srv3', 'ttml', 'vtt')

    # Human-readable reasons YouTube returns for age-restricted content;
    # presumably compared against the playability status to detect an age
    # gate — verify against the caller.
    _AGE_GATE_REASONS = (
        'Sign in to confirm your age',
        'This video may be inappropriate for some users.',
        'Sorry, this content is age-restricted.')

    _GEO_BYPASS = False  # disable the generic geo-bypass machinery for this IE

    IE_NAME = 'youtube'
960 _TESTS = [
961 {
962 'url': 'https://www.youtube.com/watch?v=BaW_jenozKc&t=1s&end=9',
963 'info_dict': {
964 'id': 'BaW_jenozKc',
965 'ext': 'mp4',
966 'title': 'youtube-dl test video "\'/\\ä↭𝕐',
967 'uploader': 'Philipp Hagemeister',
968 'uploader_id': 'phihag',
969 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/phihag',
970 'channel_id': 'UCLqxVugv74EIW3VWh2NOa3Q',
971 'channel_url': r're:https?://(?:www\.)?youtube\.com/channel/UCLqxVugv74EIW3VWh2NOa3Q',
972 'upload_date': '20121002',
973 'description': 'test chars: "\'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de .',
974 'categories': ['Science & Technology'],
975 'tags': ['youtube-dl'],
976 'duration': 10,
977 'view_count': int,
978 'like_count': int,
979 'dislike_count': int,
980 'start_time': 1,
981 'end_time': 9,
982 }
983 },
984 {
985 'url': '//www.YouTube.com/watch?v=yZIXLfi8CZQ',
986 'note': 'Embed-only video (#1746)',
987 'info_dict': {
988 'id': 'yZIXLfi8CZQ',
989 'ext': 'mp4',
990 'upload_date': '20120608',
991 'title': 'Principal Sexually Assaults A Teacher - Episode 117 - 8th June 2012',
992 'description': 'md5:09b78bd971f1e3e289601dfba15ca4f7',
993 'uploader': 'SET India',
994 'uploader_id': 'setindia',
995 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/setindia',
996 'age_limit': 18,
997 },
998 'skip': 'Private video',
999 },
1000 {
1001 'url': 'https://www.youtube.com/watch?v=BaW_jenozKc&v=yZIXLfi8CZQ',
1002 'note': 'Use the first video ID in the URL',
1003 'info_dict': {
1004 'id': 'BaW_jenozKc',
1005 'ext': 'mp4',
1006 'title': 'youtube-dl test video "\'/\\ä↭𝕐',
1007 'uploader': 'Philipp Hagemeister',
1008 'uploader_id': 'phihag',
1009 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/phihag',
1010 'upload_date': '20121002',
1011 'description': 'test chars: "\'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de .',
1012 'categories': ['Science & Technology'],
1013 'tags': ['youtube-dl'],
1014 'duration': 10,
1015 'view_count': int,
1016 'like_count': int,
1017 'dislike_count': int,
1018 },
1019 'params': {
1020 'skip_download': True,
1021 },
1022 },
1023 {
1024 'url': 'https://www.youtube.com/watch?v=a9LDPn-MO4I',
1025 'note': '256k DASH audio (format 141) via DASH manifest',
1026 'info_dict': {
1027 'id': 'a9LDPn-MO4I',
1028 'ext': 'm4a',
1029 'upload_date': '20121002',
1030 'uploader_id': '8KVIDEO',
1031 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/8KVIDEO',
1032 'description': '',
1033 'uploader': '8KVIDEO',
1034 'title': 'UHDTV TEST 8K VIDEO.mp4'
1035 },
1036 'params': {
1037 'youtube_include_dash_manifest': True,
1038 'format': '141',
1039 },
1040 'skip': 'format 141 not served anymore',
1041 },
1042 # DASH manifest with encrypted signature
1043 {
1044 'url': 'https://www.youtube.com/watch?v=IB3lcPjvWLA',
1045 'info_dict': {
1046 'id': 'IB3lcPjvWLA',
1047 'ext': 'm4a',
1048 'title': 'Afrojack, Spree Wilson - The Spark (Official Music Video) ft. Spree Wilson',
1049 'description': 'md5:8f5e2b82460520b619ccac1f509d43bf',
1050 'duration': 244,
1051 'uploader': 'AfrojackVEVO',
1052 'uploader_id': 'AfrojackVEVO',
1053 'upload_date': '20131011',
1054 'abr': 129.495,
1055 },
1056 'params': {
1057 'youtube_include_dash_manifest': True,
1058 'format': '141/bestaudio[ext=m4a]',
1059 },
1060 },
1061 # Controversy video
1062 {
1063 'url': 'https://www.youtube.com/watch?v=T4XJQO3qol8',
1064 'info_dict': {
1065 'id': 'T4XJQO3qol8',
1066 'ext': 'mp4',
1067 'duration': 219,
1068 'upload_date': '20100909',
1069 'uploader': 'Amazing Atheist',
1070 'uploader_id': 'TheAmazingAtheist',
1071 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/TheAmazingAtheist',
1072 'title': 'Burning Everyone\'s Koran',
1073 'description': 'SUBSCRIBE: http://www.youtube.com/saturninefilms \r\n\r\nEven Obama has taken a stand against freedom on this issue: http://www.huffingtonpost.com/2010/09/09/obama-gma-interview-quran_n_710282.html',
1074 }
1075 },
1076 # Normal age-gate video (embed allowed)
1077 {
1078 'url': 'https://youtube.com/watch?v=HtVdAasjOgU',
1079 'info_dict': {
1080 'id': 'HtVdAasjOgU',
1081 'ext': 'mp4',
1082 'title': 'The Witcher 3: Wild Hunt - The Sword Of Destiny Trailer',
1083 'description': r're:(?s).{100,}About the Game\n.*?The Witcher 3: Wild Hunt.{100,}',
1084 'duration': 142,
1085 'uploader': 'The Witcher',
1086 'uploader_id': 'WitcherGame',
1087 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/WitcherGame',
1088 'upload_date': '20140605',
1089 'age_limit': 18,
1090 },
1091 },
1092 # video_info is None (https://github.com/ytdl-org/youtube-dl/issues/4421)
1093 # YouTube Red ad is not captured for creator
1094 {
1095 'url': '__2ABJjxzNo',
1096 'info_dict': {
1097 'id': '__2ABJjxzNo',
1098 'ext': 'mp4',
1099 'duration': 266,
1100 'upload_date': '20100430',
1101 'uploader_id': 'deadmau5',
1102 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/deadmau5',
1103 'creator': 'deadmau5',
1104 'description': 'md5:6cbcd3a92ce1bc676fc4d6ab4ace2336',
1105 'uploader': 'deadmau5',
1106 'title': 'Deadmau5 - Some Chords (HD)',
1107 'alt_title': 'Some Chords',
1108 },
1109 'expected_warnings': [
1110 'DASH manifest missing',
1111 ]
1112 },
1113 # Olympics (https://github.com/ytdl-org/youtube-dl/issues/4431)
1114 {
1115 'url': 'lqQg6PlCWgI',
1116 'info_dict': {
1117 'id': 'lqQg6PlCWgI',
1118 'ext': 'mp4',
1119 'duration': 6085,
1120 'upload_date': '20150827',
1121 'uploader_id': 'olympic',
1122 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/olympic',
1123 'description': 'HO09 - Women - GER-AUS - Hockey - 31 July 2012 - London 2012 Olympic Games',
1124 'uploader': 'Olympic',
1125 'title': 'Hockey - Women - GER-AUS - London 2012 Olympic Games',
1126 },
1127 'params': {
1128 'skip_download': 'requires avconv',
1129 }
1130 },
1131 # Non-square pixels
1132 {
1133 'url': 'https://www.youtube.com/watch?v=_b-2C3KPAM0',
1134 'info_dict': {
1135 'id': '_b-2C3KPAM0',
1136 'ext': 'mp4',
1137 'stretched_ratio': 16 / 9.,
1138 'duration': 85,
1139 'upload_date': '20110310',
1140 'uploader_id': 'AllenMeow',
1141 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/AllenMeow',
1142 'description': 'made by Wacom from Korea | 字幕&加油添醋 by TY\'s Allen | 感謝heylisa00cavey1001同學熱情提供梗及翻譯',
1143 'uploader': '孫ᄋᄅ',
1144 'title': '[A-made] 變態妍字幕版 太妍 我就是這樣的人',
1145 },
1146 },
1147 # url_encoded_fmt_stream_map is empty string
1148 {
1149 'url': 'qEJwOuvDf7I',
1150 'info_dict': {
1151 'id': 'qEJwOuvDf7I',
1152 'ext': 'webm',
1153 'title': 'Обсуждение судебной практики по выборам 14 сентября 2014 года в Санкт-Петербурге',
1154 'description': '',
1155 'upload_date': '20150404',
1156 'uploader_id': 'spbelect',
1157 'uploader': 'Наблюдатели Петербурга',
1158 },
1159 'params': {
1160 'skip_download': 'requires avconv',
1161 },
1162 'skip': 'This live event has ended.',
1163 },
1164 # Extraction from multiple DASH manifests (https://github.com/ytdl-org/youtube-dl/pull/6097)
1165 {
1166 'url': 'https://www.youtube.com/watch?v=FIl7x6_3R5Y',
1167 'info_dict': {
1168 'id': 'FIl7x6_3R5Y',
1169 'ext': 'webm',
1170 'title': 'md5:7b81415841e02ecd4313668cde88737a',
1171 'description': 'md5:116377fd2963b81ec4ce64b542173306',
1172 'duration': 220,
1173 'upload_date': '20150625',
1174 'uploader_id': 'dorappi2000',
1175 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/dorappi2000',
1176 'uploader': 'dorappi2000',
1177 'formats': 'mincount:31',
1178 },
1179 'skip': 'not actual anymore',
1180 },
1181 # DASH manifest with segment_list
1182 {
1183 'url': 'https://www.youtube.com/embed/CsmdDsKjzN8',
1184 'md5': '8ce563a1d667b599d21064e982ab9e31',
1185 'info_dict': {
1186 'id': 'CsmdDsKjzN8',
1187 'ext': 'mp4',
1188 'upload_date': '20150501', # According to '<meta itemprop="datePublished"', but in other places it's 20150510
1189 'uploader': 'Airtek',
1190 'description': 'Retransmisión en directo de la XVIII media maratón de Zaragoza.',
1191 'uploader_id': 'UCzTzUmjXxxacNnL8I3m4LnQ',
1192 'title': 'Retransmisión XVIII Media maratón Zaragoza 2015',
1193 },
1194 'params': {
1195 'youtube_include_dash_manifest': True,
1196 'format': '135', # bestvideo
1197 },
1198 'skip': 'This live event has ended.',
1199 },
1200 {
1201 # Multifeed videos (multiple cameras), URL is for Main Camera
1202 'url': 'https://www.youtube.com/watch?v=jvGDaLqkpTg',
1203 'info_dict': {
1204 'id': 'jvGDaLqkpTg',
1205 'title': 'Tom Clancy Free Weekend Rainbow Whatever',
1206 'description': 'md5:e03b909557865076822aa169218d6a5d',
1207 },
1208 'playlist': [{
1209 'info_dict': {
1210 'id': 'jvGDaLqkpTg',
1211 'ext': 'mp4',
1212 'title': 'Tom Clancy Free Weekend Rainbow Whatever (Main Camera)',
1213 'description': 'md5:e03b909557865076822aa169218d6a5d',
1214 'duration': 10643,
1215 'upload_date': '20161111',
1216 'uploader': 'Team PGP',
1217 'uploader_id': 'UChORY56LMMETTuGjXaJXvLg',
1218 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UChORY56LMMETTuGjXaJXvLg',
1219 },
1220 }, {
1221 'info_dict': {
1222 'id': '3AKt1R1aDnw',
1223 'ext': 'mp4',
1224 'title': 'Tom Clancy Free Weekend Rainbow Whatever (Camera 2)',
1225 'description': 'md5:e03b909557865076822aa169218d6a5d',
1226 'duration': 10991,
1227 'upload_date': '20161111',
1228 'uploader': 'Team PGP',
1229 'uploader_id': 'UChORY56LMMETTuGjXaJXvLg',
1230 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UChORY56LMMETTuGjXaJXvLg',
1231 },
1232 }, {
1233 'info_dict': {
1234 'id': 'RtAMM00gpVc',
1235 'ext': 'mp4',
1236 'title': 'Tom Clancy Free Weekend Rainbow Whatever (Camera 3)',
1237 'description': 'md5:e03b909557865076822aa169218d6a5d',
1238 'duration': 10995,
1239 'upload_date': '20161111',
1240 'uploader': 'Team PGP',
1241 'uploader_id': 'UChORY56LMMETTuGjXaJXvLg',
1242 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UChORY56LMMETTuGjXaJXvLg',
1243 },
1244 }, {
1245 'info_dict': {
1246 'id': '6N2fdlP3C5U',
1247 'ext': 'mp4',
1248 'title': 'Tom Clancy Free Weekend Rainbow Whatever (Camera 4)',
1249 'description': 'md5:e03b909557865076822aa169218d6a5d',
1250 'duration': 10990,
1251 'upload_date': '20161111',
1252 'uploader': 'Team PGP',
1253 'uploader_id': 'UChORY56LMMETTuGjXaJXvLg',
1254 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UChORY56LMMETTuGjXaJXvLg',
1255 },
1256 }],
1257 'params': {
1258 'skip_download': True,
1259 },
1260 },
1261 {
1262 # Multifeed video with comma in title (see https://github.com/ytdl-org/youtube-dl/issues/8536)
1263 'url': 'https://www.youtube.com/watch?v=gVfLd0zydlo',
1264 'info_dict': {
1265 'id': 'gVfLd0zydlo',
1266 'title': 'DevConf.cz 2016 Day 2 Workshops 1 14:00 - 15:30',
1267 },
1268 'playlist_count': 2,
1269 'skip': 'Not multifeed anymore',
1270 },
1271 {
1272 'url': 'https://vid.plus/FlRa-iH7PGw',
1273 'only_matching': True,
1274 },
1275 {
1276 'url': 'https://zwearz.com/watch/9lWxNJF-ufM/electra-woman-dyna-girl-official-trailer-grace-helbig.html',
1277 'only_matching': True,
1278 },
1279 {
1280 # Title with JS-like syntax "};" (see https://github.com/ytdl-org/youtube-dl/issues/7468)
1281 # Also tests cut-off URL expansion in video description (see
1282 # https://github.com/ytdl-org/youtube-dl/issues/1892,
1283 # https://github.com/ytdl-org/youtube-dl/issues/8164)
1284 'url': 'https://www.youtube.com/watch?v=lsguqyKfVQg',
1285 'info_dict': {
1286 'id': 'lsguqyKfVQg',
1287 'ext': 'mp4',
1288 'title': '{dark walk}; Loki/AC/Dishonored; collab w/Elflover21',
1289 'alt_title': 'Dark Walk - Position Music',
1290 'description': 'md5:8085699c11dc3f597ce0410b0dcbb34a',
1291 'duration': 133,
1292 'upload_date': '20151119',
1293 'uploader_id': 'IronSoulElf',
1294 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/IronSoulElf',
1295 'uploader': 'IronSoulElf',
1296 'creator': 'Todd Haberman, Daniel Law Heath and Aaron Kaplan',
1297 'track': 'Dark Walk - Position Music',
1298 'artist': 'Todd Haberman, Daniel Law Heath and Aaron Kaplan',
1299 'album': 'Position Music - Production Music Vol. 143 - Dark Walk',
1300 },
1301 'params': {
1302 'skip_download': True,
1303 },
1304 },
1305 {
1306 # Tags with '};' (see https://github.com/ytdl-org/youtube-dl/issues/7468)
1307 'url': 'https://www.youtube.com/watch?v=Ms7iBXnlUO8',
1308 'only_matching': True,
1309 },
1310 {
1311 # Video with yt:stretch=17:0
1312 'url': 'https://www.youtube.com/watch?v=Q39EVAstoRM',
1313 'info_dict': {
1314 'id': 'Q39EVAstoRM',
1315 'ext': 'mp4',
1316 'title': 'Clash Of Clans#14 Dicas De Ataque Para CV 4',
1317 'description': 'md5:ee18a25c350637c8faff806845bddee9',
1318 'upload_date': '20151107',
1319 'uploader_id': 'UCCr7TALkRbo3EtFzETQF1LA',
1320 'uploader': 'CH GAMER DROID',
1321 },
1322 'params': {
1323 'skip_download': True,
1324 },
1325 'skip': 'This video does not exist.',
1326 },
1327 {
1328 # Video with incomplete 'yt:stretch=16:'
1329 'url': 'https://www.youtube.com/watch?v=FRhJzUSJbGI',
1330 'only_matching': True,
1331 },
1332 {
1333 # Video licensed under Creative Commons
1334 'url': 'https://www.youtube.com/watch?v=M4gD1WSo5mA',
1335 'info_dict': {
1336 'id': 'M4gD1WSo5mA',
1337 'ext': 'mp4',
1338 'title': 'md5:e41008789470fc2533a3252216f1c1d1',
1339 'description': 'md5:a677553cf0840649b731a3024aeff4cc',
1340 'duration': 721,
1341 'upload_date': '20150127',
1342 'uploader_id': 'BerkmanCenter',
1343 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/BerkmanCenter',
1344 'uploader': 'The Berkman Klein Center for Internet & Society',
1345 'license': 'Creative Commons Attribution license (reuse allowed)',
1346 },
1347 'params': {
1348 'skip_download': True,
1349 },
1350 },
1351 {
1352 # Channel-like uploader_url
1353 'url': 'https://www.youtube.com/watch?v=eQcmzGIKrzg',
1354 'info_dict': {
1355 'id': 'eQcmzGIKrzg',
1356 'ext': 'mp4',
1357 'title': 'Democratic Socialism and Foreign Policy | Bernie Sanders',
1358 'description': 'md5:13a2503d7b5904ef4b223aa101628f39',
1359 'duration': 4060,
1360 'upload_date': '20151119',
1361 'uploader': 'Bernie Sanders',
1362 'uploader_id': 'UCH1dpzjCEiGAt8CXkryhkZg',
1363 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UCH1dpzjCEiGAt8CXkryhkZg',
1364 'license': 'Creative Commons Attribution license (reuse allowed)',
1365 },
1366 'params': {
1367 'skip_download': True,
1368 },
1369 },
1370 {
1371 'url': 'https://www.youtube.com/watch?feature=player_embedded&amp;amp;v=V36LpHqtcDY',
1372 'only_matching': True,
1373 },
1374 {
1375 # YouTube Red paid video (https://github.com/ytdl-org/youtube-dl/issues/10059)
1376 'url': 'https://www.youtube.com/watch?v=i1Ko8UG-Tdo',
1377 'only_matching': True,
1378 },
1379 {
1380 # Rental video preview
1381 'url': 'https://www.youtube.com/watch?v=yYr8q0y5Jfg',
1382 'info_dict': {
1383 'id': 'uGpuVWrhIzE',
1384 'ext': 'mp4',
1385 'title': 'Piku - Trailer',
1386 'description': 'md5:c36bd60c3fd6f1954086c083c72092eb',
1387 'upload_date': '20150811',
1388 'uploader': 'FlixMatrix',
1389 'uploader_id': 'FlixMatrixKaravan',
1390 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/FlixMatrixKaravan',
1391 'license': 'Standard YouTube License',
1392 },
1393 'params': {
1394 'skip_download': True,
1395 },
1396 'skip': 'This video is not available.',
1397 },
1398 {
1399 # YouTube Red video with episode data
1400 'url': 'https://www.youtube.com/watch?v=iqKdEhx-dD4',
1401 'info_dict': {
1402 'id': 'iqKdEhx-dD4',
1403 'ext': 'mp4',
1404 'title': 'Isolation - Mind Field (Ep 1)',
1405 'description': 'md5:f540112edec5d09fc8cc752d3d4ba3cd',
1406 'duration': 2085,
1407 'upload_date': '20170118',
1408 'uploader': 'Vsauce',
1409 'uploader_id': 'Vsauce',
1410 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/Vsauce',
1411 'series': 'Mind Field',
1412 'season_number': 1,
1413 'episode_number': 1,
1414 },
1415 'params': {
1416 'skip_download': True,
1417 },
1418 'expected_warnings': [
1419 'Skipping DASH manifest',
1420 ],
1421 },
1422 {
1423 # The following content has been identified by the YouTube community
1424 # as inappropriate or offensive to some audiences.
1425 'url': 'https://www.youtube.com/watch?v=6SJNVb0GnPI',
1426 'info_dict': {
1427 'id': '6SJNVb0GnPI',
1428 'ext': 'mp4',
1429 'title': 'Race Differences in Intelligence',
1430 'description': 'md5:5d161533167390427a1f8ee89a1fc6f1',
1431 'duration': 965,
1432 'upload_date': '20140124',
1433 'uploader': 'New Century Foundation',
1434 'uploader_id': 'UCEJYpZGqgUob0zVVEaLhvVg',
1435 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UCEJYpZGqgUob0zVVEaLhvVg',
1436 },
1437 'params': {
1438 'skip_download': True,
1439 },
1440 'skip': 'This video has been removed for violating YouTube\'s policy on hate speech.',
1441 },
1442 {
1443 # itag 212
1444 'url': '1t24XAntNCY',
1445 'only_matching': True,
1446 },
1447 {
1448 # geo restricted to JP
1449 'url': 'sJL6WA-aGkQ',
1450 'only_matching': True,
1451 },
1452 {
1453 'url': 'https://invidio.us/watch?v=BaW_jenozKc',
1454 'only_matching': True,
1455 },
1456 {
1457 'url': 'https://redirect.invidious.io/watch?v=BaW_jenozKc',
1458 'only_matching': True,
1459 },
1460 {
1461 # from https://nitter.pussthecat.org/YouTube/status/1360363141947944964#m
1462 'url': 'https://redirect.invidious.io/Yh0AhrY9GjA',
1463 'only_matching': True,
1464 },
1465 {
1466 # DRM protected
1467 'url': 'https://www.youtube.com/watch?v=s7_qI6_mIXc',
1468 'only_matching': True,
1469 },
1470 {
1471 # Video with unsupported adaptive stream type formats
1472 'url': 'https://www.youtube.com/watch?v=Z4Vy8R84T1U',
1473 'info_dict': {
1474 'id': 'Z4Vy8R84T1U',
1475 'ext': 'mp4',
1476 'title': 'saman SMAN 53 Jakarta(Sancety) opening COFFEE4th at SMAN 53 Jakarta',
1477 'description': 'md5:d41d8cd98f00b204e9800998ecf8427e',
1478 'duration': 433,
1479 'upload_date': '20130923',
1480 'uploader': 'Amelia Putri Harwita',
1481 'uploader_id': 'UCpOxM49HJxmC1qCalXyB3_Q',
1482 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UCpOxM49HJxmC1qCalXyB3_Q',
1483 'formats': 'maxcount:10',
1484 },
1485 'params': {
1486 'skip_download': True,
1487 'youtube_include_dash_manifest': False,
1488 },
1489 'skip': 'not actual anymore',
1490 },
1491 {
1492 # Youtube Music Auto-generated description
1493 'url': 'https://music.youtube.com/watch?v=MgNrAu2pzNs',
1494 'info_dict': {
1495 'id': 'MgNrAu2pzNs',
1496 'ext': 'mp4',
1497 'title': 'Voyeur Girl',
1498 'description': 'md5:7ae382a65843d6df2685993e90a8628f',
1499 'upload_date': '20190312',
1500 'uploader': 'Stephen - Topic',
1501 'uploader_id': 'UC-pWHpBjdGG69N9mM2auIAA',
1502 'artist': 'Stephen',
1503 'track': 'Voyeur Girl',
1504 'album': 'it\'s too much love to know my dear',
1505 'release_date': '20190313',
1506 'release_year': 2019,
1507 },
1508 'params': {
1509 'skip_download': True,
1510 },
1511 },
1512 {
1513 'url': 'https://www.youtubekids.com/watch?v=3b8nCWDgZ6Q',
1514 'only_matching': True,
1515 },
1516 {
1517 # invalid -> valid video id redirection
1518 'url': 'DJztXj2GPfl',
1519 'info_dict': {
1520 'id': 'DJztXj2GPfk',
1521 'ext': 'mp4',
1522 'title': 'Panjabi MC - Mundian To Bach Ke (The Dictator Soundtrack)',
1523 'description': 'md5:bf577a41da97918e94fa9798d9228825',
1524 'upload_date': '20090125',
1525 'uploader': 'Prochorowka',
1526 'uploader_id': 'Prochorowka',
1527 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/Prochorowka',
1528 'artist': 'Panjabi MC',
1529 'track': 'Beware of the Boys (Mundian to Bach Ke) - Motivo Hi-Lectro Remix',
1530 'album': 'Beware of the Boys (Mundian To Bach Ke)',
1531 },
1532 'params': {
1533 'skip_download': True,
1534 },
1535 'skip': 'Video unavailable',
1536 },
1537 {
1538 # empty description results in an empty string
1539 'url': 'https://www.youtube.com/watch?v=x41yOUIvK2k',
1540 'info_dict': {
1541 'id': 'x41yOUIvK2k',
1542 'ext': 'mp4',
1543 'title': 'IMG 3456',
1544 'description': '',
1545 'upload_date': '20170613',
1546 'uploader_id': 'ElevageOrVert',
1547 'uploader': 'ElevageOrVert',
1548 },
1549 'params': {
1550 'skip_download': True,
1551 },
1552 },
1553 {
1554 # with '};' inside yt initial data (see [1])
1555 # see [2] for an example with '};' inside ytInitialPlayerResponse
1556 # 1. https://github.com/ytdl-org/youtube-dl/issues/27093
1557 # 2. https://github.com/ytdl-org/youtube-dl/issues/27216
1558 'url': 'https://www.youtube.com/watch?v=CHqg6qOn4no',
1559 'info_dict': {
1560 'id': 'CHqg6qOn4no',
1561 'ext': 'mp4',
1562 'title': 'Part 77 Sort a list of simple types in c#',
1563 'description': 'md5:b8746fa52e10cdbf47997903f13b20dc',
1564 'upload_date': '20130831',
1565 'uploader_id': 'kudvenkat',
1566 'uploader': 'kudvenkat',
1567 },
1568 'params': {
1569 'skip_download': True,
1570 },
1571 },
1572 {
1573 # another example of '};' in ytInitialData
1574 'url': 'https://www.youtube.com/watch?v=gVfgbahppCY',
1575 'only_matching': True,
1576 },
1577 {
1578 'url': 'https://www.youtube.com/watch_popup?v=63RmMXCd_bQ',
1579 'only_matching': True,
1580 },
1581 {
1582 # https://github.com/ytdl-org/youtube-dl/pull/28094
1583 'url': 'OtqTfy26tG0',
1584 'info_dict': {
1585 'id': 'OtqTfy26tG0',
1586 'ext': 'mp4',
1587 'title': 'Burn Out',
1588 'description': 'md5:8d07b84dcbcbfb34bc12a56d968b6131',
1589 'upload_date': '20141120',
1590 'uploader': 'The Cinematic Orchestra - Topic',
1591 'uploader_id': 'UCIzsJBIyo8hhpFm1NK0uLgw',
1592 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UCIzsJBIyo8hhpFm1NK0uLgw',
1593 'artist': 'The Cinematic Orchestra',
1594 'track': 'Burn Out',
1595 'album': 'Every Day',
1596 'release_data': None,
1597 'release_year': None,
1598 },
1599 'params': {
1600 'skip_download': True,
1601 },
1602 },
1603 {
1604 # controversial video, only works with bpctr when authenticated with cookies
1605 'url': 'https://www.youtube.com/watch?v=nGC3D_FkCmg',
1606 'only_matching': True,
1607 },
1608 {
1609 # restricted location, https://github.com/ytdl-org/youtube-dl/issues/28685
1610 'url': 'cBvYw8_A0vQ',
1611 'info_dict': {
1612 'id': 'cBvYw8_A0vQ',
1613 'ext': 'mp4',
1614 'title': '4K Ueno Okachimachi Street Scenes 上野御徒町歩き',
1615 'description': 'md5:ea770e474b7cd6722b4c95b833c03630',
1616 'upload_date': '20201120',
1617 'uploader': 'Walk around Japan',
1618 'uploader_id': 'UC3o_t8PzBmXf5S9b7GLx1Mw',
1619 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UC3o_t8PzBmXf5S9b7GLx1Mw',
1620 },
1621 'params': {
1622 'skip_download': True,
1623 },
1624 }, {
1625 # Has multiple audio streams
1626 'url': 'WaOKSUlf4TM',
1627 'only_matching': True
1628 }, {
1629 # Requires Premium: has format 141 when requested using YTM url
1630 'url': 'https://music.youtube.com/watch?v=XclachpHxis',
1631 'only_matching': True
1632 }, {
1633 # multiple subtitles with same lang_code
1634 'url': 'https://www.youtube.com/watch?v=wsQiKKfKxug',
1635 'only_matching': True,
1636 }, {
1637 # Force use android client fallback
1638 'url': 'https://www.youtube.com/watch?v=YOelRv7fMxY',
1639 'info_dict': {
1640 'id': 'YOelRv7fMxY',
1641 'title': 'Digging a Secret Tunnel from my Workshop',
1642 'ext': '3gp',
1643 'upload_date': '20210624',
1644 'channel_id': 'UCp68_FLety0O-n9QU6phsgw',
1645 'uploader': 'colinfurze',
1646 'channel_url': r're:https?://(?:www\.)?youtube\.com/channel/UCp68_FLety0O-n9QU6phsgw',
1647 'description': 'md5:ecb672623246d98c6c562eed6ae798c3'
1648 },
1649 'params': {
1650 'format': '17', # 3gp format available on android
1651 'extractor_args': {'youtube': {'player_client': ['android']}},
1652 },
1653 },
1654 {
1655 # Skip download of additional client configs (remix client config in this case)
1656 'url': 'https://music.youtube.com/watch?v=MgNrAu2pzNs',
1657 'only_matching': True,
1658 'params': {
1659 'extractor_args': {'youtube': {'player_skip': ['configs']}},
1660 },
1661 }
1662 ]
1663
1664 @classmethod
1665 def suitable(cls, url):
1666 # Hack for lazy extractors until more generic solution is implemented
1667 # (see #28780)
1668 from .youtube import parse_qs
1669 qs = parse_qs(url)
1670 if qs.get('list', [None])[0]:
1671 return False
1672 return super(YoutubeIE, cls).suitable(url)
1673
    def __init__(self, *args, **kwargs):
        """Initialize per-instance caches used during signature extraction."""
        super(YoutubeIE, self).__init__(*args, **kwargs)
        # player id -> raw player JS source (populated by _load_player)
        self._code_cache = {}
        # (player_url, signature cache id) -> descrambling function
        # (populated by _decrypt_signature)
        self._player_cache = {}
1678
1679 def _extract_player_url(self, ytcfg=None, webpage=None):
1680 player_url = try_get(ytcfg, (lambda x: x['PLAYER_JS_URL']), str)
1681 if not player_url:
1682 player_url = self._search_regex(
1683 r'"(?:PLAYER_JS_URL|jsUrl)"\s*:\s*"([^"]+)"',
1684 webpage, 'player URL', fatal=False)
1685 if player_url.startswith('//'):
1686 player_url = 'https:' + player_url
1687 elif not re.match(r'https?://', player_url):
1688 player_url = compat_urlparse.urljoin(
1689 'https://www.youtube.com', player_url)
1690 return player_url
1691
1692 def _signature_cache_id(self, example_sig):
1693 """ Return a string representation of a signature """
1694 return '.'.join(compat_str(len(part)) for part in example_sig.split('.'))
1695
1696 @classmethod
1697 def _extract_player_info(cls, player_url):
1698 for player_re in cls._PLAYER_INFO_RE:
1699 id_m = re.search(player_re, player_url)
1700 if id_m:
1701 break
1702 else:
1703 raise ExtractorError('Cannot identify player %r' % player_url)
1704 return id_m.group('id')
1705
1706 def _load_player(self, video_id, player_url, fatal=True) -> bool:
1707 player_id = self._extract_player_info(player_url)
1708 if player_id not in self._code_cache:
1709 self._code_cache[player_id] = self._download_webpage(
1710 player_url, video_id, fatal=fatal,
1711 note='Downloading player ' + player_id,
1712 errnote='Download of %s failed' % player_url)
1713 return player_id in self._code_cache
1714
    def _extract_signature_function(self, video_id, player_url, example_sig):
        """Build (or load from the disk cache) a function that descrambles signatures.

        The cached representation is a list of character indices describing
        the permutation the player JS applies. Returns None if the player
        code could not be loaded.
        """
        player_id = self._extract_player_info(player_url)

        # Read from filesystem cache
        # The key includes the pattern of part lengths: the same player may
        # apply different transforms to signatures of different layouts
        func_id = 'js_%s_%s' % (
            player_id, self._signature_cache_id(example_sig))
        assert os.path.basename(func_id) == func_id

        cache_spec = self._downloader.cache.load('youtube-sigfuncs', func_id)
        if cache_spec is not None:
            return lambda s: ''.join(s[i] for i in cache_spec)

        if self._load_player(video_id, player_url):
            code = self._code_cache[player_id]
            res = self._parse_sig_js(code)

            # Probe the JS function with a string of unique characters to
            # record the permutation it performs, then persist that spec
            test_string = ''.join(map(compat_chr, range(len(example_sig))))
            cache_res = res(test_string)
            cache_spec = [ord(c) for c in cache_res]

            self._downloader.cache.store('youtube-sigfuncs', func_id, cache_spec)
            return res
1737
    def _print_sig_code(self, func, example_sig):
        """Print equivalent Python code for the extracted signature function (debugging aid)."""
        def gen_sig_code(idxs):
            # Compress the index list into slice expressions wherever the
            # indices form arithmetic runs with step +1 or -1
            def _genslice(start, end, step):
                starts = '' if start == 0 else str(start)
                ends = (':%d' % (end + step)) if end + step >= 0 else ':'
                steps = '' if step == 1 else (':%d' % step)
                return 's[%s%s%s]' % (starts, ends, steps)

            step = None
            # Quelch pyflakes warnings - start will be set when step is set
            start = '(Never used)'
            for i, prev in zip(idxs[1:], idxs[:-1]):
                if step is not None:
                    # Inside a run: extend it or flush the slice
                    if i - prev == step:
                        continue
                    yield _genslice(start, prev, step)
                    step = None
                    continue
                if i - prev in [-1, 1]:
                    # Start of a new run
                    step = i - prev
                    start = prev
                    continue
                else:
                    yield 's[%d]' % prev
            # Flush the final element or run
            if step is None:
                yield 's[%d]' % i
            else:
                yield _genslice(start, i, step)

        # Derive the permutation by probing with unique characters
        test_string = ''.join(map(compat_chr, range(len(example_sig))))
        cache_res = func(test_string)
        cache_spec = [ord(c) for c in cache_res]
        expr_code = ' + '.join(gen_sig_code(cache_spec))
        signature_id_tuple = '(%s)' % (
            ', '.join(compat_str(len(p)) for p in example_sig.split('.')))
        code = ('if tuple(len(p) for p in s.split(\'.\')) == %s:\n'
                ' return %s\n') % (signature_id_tuple, expr_code)
        self.to_screen('Extracted signature function:\n' + code)
1776
    def _parse_sig_js(self, jscode):
        """Locate the signature-descrambling function in the player JS and wrap it.

        Returns a callable mapping a scrambled signature string to the
        descrambled one. The regex list covers the function-name patterns
        observed across player versions, newest first.
        """
        funcname = self._search_regex(
            (r'\b[cs]\s*&&\s*[adf]\.set\([^,]+\s*,\s*encodeURIComponent\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\(',
             r'\b[a-zA-Z0-9]+\s*&&\s*[a-zA-Z0-9]+\.set\([^,]+\s*,\s*encodeURIComponent\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\(',
             r'\bm=(?P<sig>[a-zA-Z0-9$]{2})\(decodeURIComponent\(h\.s\)\)',
             r'\bc&&\(c=(?P<sig>[a-zA-Z0-9$]{2})\(decodeURIComponent\(c\)\)',
             r'(?:\b|[^a-zA-Z0-9$])(?P<sig>[a-zA-Z0-9$]{2})\s*=\s*function\(\s*a\s*\)\s*{\s*a\s*=\s*a\.split\(\s*""\s*\);[a-zA-Z0-9$]{2}\.[a-zA-Z0-9$]{2}\(a,\d+\)',
             r'(?:\b|[^a-zA-Z0-9$])(?P<sig>[a-zA-Z0-9$]{2})\s*=\s*function\(\s*a\s*\)\s*{\s*a\s*=\s*a\.split\(\s*""\s*\)',
             r'(?P<sig>[a-zA-Z0-9$]+)\s*=\s*function\(\s*a\s*\)\s*{\s*a\s*=\s*a\.split\(\s*""\s*\)',
             # Obsolete patterns
             r'(["\'])signature\1\s*,\s*(?P<sig>[a-zA-Z0-9$]+)\(',
             r'\.sig\|\|(?P<sig>[a-zA-Z0-9$]+)\(',
             r'yt\.akamaized\.net/\)\s*\|\|\s*.*?\s*[cs]\s*&&\s*[adf]\.set\([^,]+\s*,\s*(?:encodeURIComponent\s*\()?\s*(?P<sig>[a-zA-Z0-9$]+)\(',
             r'\b[cs]\s*&&\s*[adf]\.set\([^,]+\s*,\s*(?P<sig>[a-zA-Z0-9$]+)\(',
             r'\b[a-zA-Z0-9]+\s*&&\s*[a-zA-Z0-9]+\.set\([^,]+\s*,\s*(?P<sig>[a-zA-Z0-9$]+)\(',
             r'\bc\s*&&\s*a\.set\([^,]+\s*,\s*\([^)]*\)\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\(',
             r'\bc\s*&&\s*[a-zA-Z0-9]+\.set\([^,]+\s*,\s*\([^)]*\)\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\(',
             r'\bc\s*&&\s*[a-zA-Z0-9]+\.set\([^,]+\s*,\s*\([^)]*\)\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\('),
            jscode, 'Initial JS player signature function name', group='sig')

        # Interpret the JS and expose the found function as a Python callable;
        # the JS function takes the signature as a single argument
        jsi = JSInterpreter(jscode)
        initial_function = jsi.extract_function(funcname)
        return lambda s: initial_function([s])
1800
1801 def _decrypt_signature(self, s, video_id, player_url):
1802 """Turn the encrypted s field into a working signature"""
1803
1804 if player_url is None:
1805 raise ExtractorError('Cannot decrypt signature without player_url')
1806
1807 try:
1808 player_id = (player_url, self._signature_cache_id(s))
1809 if player_id not in self._player_cache:
1810 func = self._extract_signature_function(
1811 video_id, player_url, s
1812 )
1813 self._player_cache[player_id] = func
1814 func = self._player_cache[player_id]
1815 if self.get_param('youtube_print_sig_code'):
1816 self._print_sig_code(func, s)
1817 return func(s)
1818 except Exception as e:
1819 tb = traceback.format_exc()
1820 raise ExtractorError(
1821 'Signature extraction failed: ' + tb, cause=e)
1822
1823 def _extract_signature_timestamp(self, video_id, player_url, ytcfg=None, fatal=False):
1824 """
1825 Extract signatureTimestamp (sts)
1826 Required to tell API what sig/player version is in use.
1827 """
1828 sts = None
1829 if isinstance(ytcfg, dict):
1830 sts = int_or_none(ytcfg.get('STS'))
1831
1832 if not sts:
1833 # Attempt to extract from player
1834 if player_url is None:
1835 error_msg = 'Cannot extract signature timestamp without player_url.'
1836 if fatal:
1837 raise ExtractorError(error_msg)
1838 self.report_warning(error_msg)
1839 return
1840 if self._load_player(video_id, player_url, fatal=fatal):
1841 player_id = self._extract_player_info(player_url)
1842 code = self._code_cache[player_id]
1843 sts = int_or_none(self._search_regex(
1844 r'(?:signatureTimestamp|sts)\s*:\s*(?P<sts>[0-9]{5})', code,
1845 'JS player signature timestamp', group='sts', fatal=fatal))
1846 return sts
1847
1848 def _mark_watched(self, video_id, player_response):
1849 playback_url = url_or_none(try_get(
1850 player_response,
1851 lambda x: x['playbackTracking']['videostatsPlaybackUrl']['baseUrl']))
1852 if not playback_url:
1853 return
1854 parsed_playback_url = compat_urlparse.urlparse(playback_url)
1855 qs = compat_urlparse.parse_qs(parsed_playback_url.query)
1856
1857 # cpn generation algorithm is reverse engineered from base.js.
1858 # In fact it works even with dummy cpn.
1859 CPN_ALPHABET = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789-_'
1860 cpn = ''.join((CPN_ALPHABET[random.randint(0, 256) & 63] for _ in range(0, 16)))
1861
1862 qs.update({
1863 'ver': ['2'],
1864 'cpn': [cpn],
1865 })
1866 playback_url = compat_urlparse.urlunparse(
1867 parsed_playback_url._replace(query=compat_urllib_parse_urlencode(qs, True)))
1868
1869 self._download_webpage(
1870 playback_url, video_id, 'Marking watched',
1871 'Unable to mark watched', fatal=False)
1872
    @staticmethod
    def _extract_urls(webpage):
        """Return all embedded YouTube URLs/video ids found in an arbitrary webpage."""
        # Embedded YouTube player
        entries = [
            unescapeHTML(mobj.group('url'))
            for mobj in re.finditer(r'''(?x)
            (?:
                <iframe[^>]+?src=|
                data-video-url=|
                <embed[^>]+?src=|
                embedSWF\(?:\s*|
                <object[^>]+data=|
                new\s+SWFObject\(
            )
            (["\'])
            (?P<url>(?:https?:)?//(?:www\.)?youtube(?:-nocookie)?\.com/
            (?:embed|v|p)/[0-9A-Za-z_-]{11}.*?)
            \1''', webpage)]

        # lazyYT YouTube embed
        # NOTE: these matches are bare video ids, not full URLs
        entries.extend(list(map(
            unescapeHTML,
            re.findall(r'class="lazyYT" data-youtube-id="([^"]+)"', webpage))))

        # Wordpress "YouTube Video Importer" plugin
        # (also yields bare video ids via the data-video_id attribute)
        matches = re.findall(r'''(?x)<div[^>]+
            class=(?P<q1>[\'"])[^\'"]*\byvii_single_video_player\b[^\'"]*(?P=q1)[^>]+
            data-video_id=(?P<q2>[\'"])([^\'"]+)(?P=q2)''', webpage)
        entries.extend(m[-1] for m in matches)

        return entries
1904
1905 @staticmethod
1906 def _extract_url(webpage):
1907 urls = YoutubeIE._extract_urls(webpage)
1908 return urls[0] if urls else None
1909
1910 @classmethod
1911 def extract_id(cls, url):
1912 mobj = re.match(cls._VALID_URL, url, re.VERBOSE)
1913 if mobj is None:
1914 raise ExtractorError('Invalid URL: %s' % url)
1915 video_id = mobj.group(2)
1916 return video_id
1917
    def _extract_chapters_from_json(self, data, video_id, duration):
        """Extract chapter markers from the initial-data JSON.

        duration -- total video duration in seconds, used as the end time of
        the final chapter. Returns a list of {start_time, end_time, title}
        dicts, or None when the player bar carries no chapter data.
        """
        chapters_list = try_get(
            data,
            lambda x: x['playerOverlays']
                       ['playerOverlayRenderer']
                       ['decoratedPlayerBarRenderer']
                       ['decoratedPlayerBarRenderer']
                       ['playerBar']
                       ['chapteredPlayerBarRenderer']
                       ['chapters'],
            list)
        if not chapters_list:
            return

        def chapter_time(chapter):
            # timeRangeStartMillis is in milliseconds; scale to seconds
            return float_or_none(
                try_get(
                    chapter,
                    lambda x: x['chapterRenderer']['timeRangeStartMillis'],
                    int),
                scale=1000)
        chapters = []
        for next_num, chapter in enumerate(chapters_list, start=1):
            start_time = chapter_time(chapter)
            if start_time is None:
                continue
            # A chapter ends where the next one starts
            # (the last chapter ends at the video duration)
            end_time = (chapter_time(chapters_list[next_num])
                        if next_num < len(chapters_list) else duration)
            if end_time is None:
                continue
            title = try_get(
                chapter, lambda x: x['chapterRenderer']['title']['simpleText'],
                compat_str)
            chapters.append({
                'start_time': start_time,
                'end_time': end_time,
                'title': title,
            })
        return chapters
1957
1958 def _extract_yt_initial_variable(self, webpage, regex, video_id, name):
1959 return self._parse_json(self._search_regex(
1960 (r'%s\s*%s' % (regex, self._YT_INITIAL_BOUNDARY_RE),
1961 regex), webpage, name, default='{}'), video_id, fatal=False)
1962
1963 @staticmethod
1964 def parse_time_text(time_text):
1965 """
1966 Parse the comment time text
1967 time_text is in the format 'X units ago (edited)'
1968 """
1969 time_text_split = time_text.split(' ')
1970 if len(time_text_split) >= 3:
1971 return datetime_from_str('now-%s%s' % (time_text_split[0], time_text_split[1]), precision='auto')
1972
1973 @staticmethod
1974 def _join_text_entries(runs):
1975 text = None
1976 for run in runs:
1977 if not isinstance(run, dict):
1978 continue
1979 sub_text = try_get(run, lambda x: x['text'], compat_str)
1980 if sub_text:
1981 if not text:
1982 text = sub_text
1983 continue
1984 text += sub_text
1985 return text
1986
1987 def _extract_comment(self, comment_renderer, parent=None):
1988 comment_id = comment_renderer.get('commentId')
1989 if not comment_id:
1990 return
1991 comment_text_runs = try_get(comment_renderer, lambda x: x['contentText']['runs']) or []
1992 text = self._join_text_entries(comment_text_runs) or ''
1993 comment_time_text = try_get(comment_renderer, lambda x: x['publishedTimeText']['runs']) or []
1994 time_text = self._join_text_entries(comment_time_text)
1995 timestamp = calendar.timegm(self.parse_time_text(time_text).timetuple())
1996 author = try_get(comment_renderer, lambda x: x['authorText']['simpleText'], compat_str)
1997 author_id = try_get(comment_renderer,
1998 lambda x: x['authorEndpoint']['browseEndpoint']['browseId'], compat_str)
1999 votes = str_to_int(try_get(comment_renderer, (lambda x: x['voteCount']['simpleText'],
2000 lambda x: x['likeCount']), compat_str)) or 0
2001 author_thumbnail = try_get(comment_renderer,
2002 lambda x: x['authorThumbnail']['thumbnails'][-1]['url'], compat_str)
2003
2004 author_is_uploader = try_get(comment_renderer, lambda x: x['authorIsChannelOwner'], bool)
2005 is_liked = try_get(comment_renderer, lambda x: x['isLiked'], bool)
2006 return {
2007 'id': comment_id,
2008 'text': text,
2009 'timestamp': timestamp,
2010 'time_text': time_text,
2011 'like_count': votes,
2012 'is_favorited': is_liked,
2013 'author': author,
2014 'author_id': author_id,
2015 'author_thumbnail': author_thumbnail,
2016 'author_is_uploader': author_is_uploader,
2017 'parent': parent or 'root'
2018 }
2019
    def _comment_entries(self, root_continuation_data, identity_token, account_syncid,
                         ytcfg, video_id, parent=None, comment_counts=None):
        """Generator over a comment section (or a reply thread when parent is set).

        Yields comment info dicts; additionally yields a single int (the
        estimated total comment count) when the section header is first seen.
        comment_counts is shared mutable state across recursive calls:
        [comments so far, estimated total, current reply-thread number].
        """

        def extract_header(contents):
            # Pull the estimated total and the continuation of the chosen
            # sort order out of the commentsHeaderRenderer
            _total_comments = 0
            _continuation = None
            for content in contents:
                comments_header_renderer = try_get(content, lambda x: x['commentsHeaderRenderer'])
                expected_comment_count = try_get(comments_header_renderer,
                                                 (lambda x: x['countText']['runs'][0]['text'],
                                                  lambda x: x['commentsCount']['runs'][0]['text']),
                                                 compat_str)
                if expected_comment_count:
                    comment_counts[1] = str_to_int(expected_comment_count)
                    self.to_screen('Downloading ~%d comments' % str_to_int(expected_comment_count))
                    _total_comments = comment_counts[1]
                sort_mode_str = self._configuration_arg('comment_sort', [''])[0]
                comment_sort_index = int(sort_mode_str != 'top')  # 1 = new, 0 = top

                sort_menu_item = try_get(
                    comments_header_renderer,
                    lambda x: x['sortMenu']['sortFilterSubMenuRenderer']['subMenuItems'][comment_sort_index], dict) or {}
                sort_continuation_ep = sort_menu_item.get('serviceEndpoint') or {}

                _continuation = self._extract_continuation_ep_data(sort_continuation_ep) or self._extract_continuation(sort_menu_item)
                if not _continuation:
                    continue

                sort_text = sort_menu_item.get('title')
                if isinstance(sort_text, compat_str):
                    sort_text = sort_text.lower()
                else:
                    sort_text = 'top comments' if comment_sort_index == 0 else 'newest first'
                self.to_screen('Sorting comments by %s' % sort_text)
                break
            return _total_comments, _continuation

        def extract_thread(contents):
            # Yield each comment of a page, recursing into reply threads
            if not parent:
                comment_counts[2] = 0
            for content in contents:
                comment_thread_renderer = try_get(content, lambda x: x['commentThreadRenderer'])
                comment_renderer = try_get(
                    comment_thread_renderer, (lambda x: x['comment']['commentRenderer'], dict)) or try_get(
                    content, (lambda x: x['commentRenderer'], dict))

                if not comment_renderer:
                    continue
                comment = self._extract_comment(comment_renderer, parent)
                if not comment:
                    continue
                comment_counts[0] += 1
                yield comment
                # Attempt to get the replies
                comment_replies_renderer = try_get(
                    comment_thread_renderer, lambda x: x['replies']['commentRepliesRenderer'], dict)

                if comment_replies_renderer:
                    comment_counts[2] += 1
                    comment_entries_iter = self._comment_entries(
                        comment_replies_renderer, identity_token, account_syncid, ytcfg,
                        video_id, parent=comment.get('id'), comment_counts=comment_counts)

                    for reply_comment in comment_entries_iter:
                        yield reply_comment

        # YouTube comments have a max depth of 2
        max_depth = int_or_none(self._configuration_arg('max_comment_depth', [''])[0]) or float('inf')
        if max_depth == 1 and parent:
            return
        if not comment_counts:
            # comment so far, est. total comments, current comment thread #
            comment_counts = [0, 0, 0]

        continuation = self._extract_continuation(root_continuation_data)
        if continuation and len(continuation['ctoken']) < 27:
            # Tokens shorter than 27 chars come from the old ajax API and are
            # not accepted by the innertube API; synthesize a compatible one
            self.write_debug('Detected old API continuation token. Generating new API compatible token.')
            continuation_token = self._generate_comment_continuation(video_id)
            continuation = self._build_continuation_query(continuation_token, None)

        visitor_data = None
        is_first_continuation = parent is None

        # Page through the comment section until no continuation remains
        for page_num in itertools.count(0):
            if not continuation:
                break
            headers = self._generate_api_headers(ytcfg, identity_token, account_syncid, visitor_data)
            comment_prog_str = '(%d/%d)' % (comment_counts[0], comment_counts[1])
            if page_num == 0:
                if is_first_continuation:
                    note_prefix = 'Downloading comment section API JSON'
                else:
                    note_prefix = ' Downloading comment API JSON reply thread %d %s' % (
                        comment_counts[2], comment_prog_str)
            else:
                note_prefix = '%sDownloading comment%s API JSON page %d %s' % (
                    ' ' if parent else '', ' replies' if parent else '',
                    page_num, comment_prog_str)

            response = self._extract_response(
                item_id=None, query=self._continuation_query_ajax_to_api(continuation),
                ep='next', ytcfg=ytcfg, headers=headers, note=note_prefix,
                check_get_keys=('onResponseReceivedEndpoints', 'continuationContents'))
            if not response:
                break
            # Carry the visitorData forward so subsequent pages share session
            visitor_data = try_get(
                response,
                lambda x: x['responseContext']['webResponseContextExtensionData']['ytConfigData']['visitorData'],
                compat_str) or visitor_data

            continuation_contents = dict_get(response, ('onResponseReceivedEndpoints', 'continuationContents'))

            continuation = None
            if isinstance(continuation_contents, list):
                # Current (onResponseReceivedEndpoints) response structure
                for continuation_section in continuation_contents:
                    if not isinstance(continuation_section, dict):
                        continue
                    continuation_items = try_get(
                        continuation_section,
                        (lambda x: x['reloadContinuationItemsCommand']['continuationItems'],
                         lambda x: x['appendContinuationItemsAction']['continuationItems']),
                        list) or []
                    if is_first_continuation:
                        total_comments, continuation = extract_header(continuation_items)
                        if total_comments:
                            yield total_comments
                        is_first_continuation = False
                        if continuation:
                            break
                        continue
                    count = 0
                    for count, entry in enumerate(extract_thread(continuation_items)):
                        yield entry
                    # NOTE(review): enumerate starts at 0, so a page with
                    # exactly one comment also hits count == 0 below and is
                    # treated as empty — looks off by one; confirm intent
                    continuation = self._extract_continuation({'contents': continuation_items})
                    if continuation:
                        # Sometimes YouTube provides a continuation without any comments
                        # In most cases we end up just downloading these with very little comments to come.
                        if count == 0:
                            if not parent:
                                self.report_warning('No comments received - assuming end of comments')
                            continuation = None
                        break

            # Deprecated response structure
            elif isinstance(continuation_contents, dict):
                known_continuation_renderers = ('itemSectionContinuation', 'commentRepliesContinuation')
                for key, continuation_renderer in continuation_contents.items():
                    if key not in known_continuation_renderers:
                        continue
                    if not isinstance(continuation_renderer, dict):
                        continue
                    if is_first_continuation:
                        header_continuation_items = [continuation_renderer.get('header') or {}]
                        total_comments, continuation = extract_header(header_continuation_items)
                        if total_comments:
                            yield total_comments
                        is_first_continuation = False
                        if continuation:
                            break

                    # Sometimes YouTube provides a continuation without any comments
                    # In most cases we end up just downloading these with very little comments to come.
                    count = 0
                    for count, entry in enumerate(extract_thread(continuation_renderer.get('contents') or {})):
                        yield entry
                    continuation = self._extract_continuation(continuation_renderer)
                    if count == 0:
                        if not parent:
                            self.report_warning('No comments received - assuming end of comments')
                        continuation = None
                    break
2191
2192 @staticmethod
2193 def _generate_comment_continuation(video_id):
2194 """
2195 Generates initial comment section continuation token from given video id
2196 """
2197 b64_vid_id = base64.b64encode(bytes(video_id.encode('utf-8')))
2198 parts = ('Eg0SCw==', b64_vid_id, 'GAYyJyIRIgs=', b64_vid_id, 'MAB4AjAAQhBjb21tZW50cy1zZWN0aW9u')
2199 new_continuation_intlist = list(itertools.chain.from_iterable(
2200 [bytes_to_intlist(base64.b64decode(part)) for part in parts]))
2201 return base64.b64encode(intlist_to_bytes(new_continuation_intlist)).decode('utf-8')
2202
    def _extract_comments(self, ytcfg, video_id, contents, webpage):
        """Entry for comment extraction

        Walks the itemSectionRenderer entries of the initial data, drains the
        _comment_entries generator (which interleaves comment dicts with a
        single int for the estimated total) and returns
        {'comments': [...], 'comment_count': N}.
        """
        def _real_comment_extract(contents):
            if isinstance(contents, list):
                for entry in contents:
                    for key, renderer in entry.items():
                        if key not in known_entry_comment_renderers:
                            continue
                        yield from self._comment_entries(
                            renderer, video_id=video_id, ytcfg=ytcfg,
                            identity_token=self._extract_identity_token(webpage, item_id=video_id),
                            account_syncid=self._extract_account_syncid(ytcfg))
                        break
        comments = []
        known_entry_comment_renderers = ('itemSectionRenderer',)
        estimated_total = 0
        # max_comments caps how many comments are kept (inf = unlimited)
        max_comments = int_or_none(self._configuration_arg('max_comments', [''])[0]) or float('inf')

        try:
            for comment in _real_comment_extract(contents):
                if len(comments) >= max_comments:
                    break
                # An int yielded by the generator is the estimated total
                # from the section header, not a comment
                if isinstance(comment, int):
                    estimated_total = comment
                    continue
                comments.append(comment)
        except KeyboardInterrupt:
            # Keep whatever was downloaded so far instead of aborting
            self.to_screen('Interrupted by user')
        self.to_screen('Downloaded %d/%d comments' % (len(comments), estimated_total))
        return {
            'comments': comments,
            'comment_count': len(comments),
        }
2236
2237 @staticmethod
2238 def _generate_player_context(sts=None):
2239 context = {
2240 'html5Preference': 'HTML5_PREF_WANTS',
2241 }
2242 if sts is not None:
2243 context['signatureTimestamp'] = sts
2244 return {
2245 'playbackContext': {
2246 'contentPlaybackContext': context
2247 }
2248 }
2249
2250 @staticmethod
2251 def _get_video_info_params(video_id):
2252 return {
2253 'video_id': video_id,
2254 'eurl': 'https://youtube.googleapis.com/v/' + video_id,
2255 'html5': '1',
2256 'c': 'TVHTML5',
2257 'cver': '6.20180913',
2258 }
2259
2260 def _real_extract(self, url):
2261 url, smuggled_data = unsmuggle_url(url, {})
2262 video_id = self._match_id(url)
2263
2264 is_music_url = smuggled_data.get('is_music_url') or self.is_music_url(url)
2265
2266 base_url = self.http_scheme() + '//www.youtube.com/'
2267 webpage_url = base_url + 'watch?v=' + video_id
2268 webpage = self._download_webpage(
2269 webpage_url + '&bpctr=9999999999&has_verified=1', video_id, fatal=False)
2270
2271 ytcfg = self._extract_ytcfg(video_id, webpage) or self._get_default_ytcfg()
2272 identity_token = self._extract_identity_token(webpage, video_id)
2273 syncid = self._extract_account_syncid(ytcfg)
2274 headers = self._generate_api_headers(ytcfg, identity_token, syncid)
2275
2276 player_url = self._extract_player_url(ytcfg, webpage)
2277
2278 player_client = self._configuration_arg('player_client', [''])[0]
2279 if player_client not in ('web', 'android', ''):
2280 self.report_warning(f'Invalid player_client {player_client} given. Falling back to WEB')
2281 force_mobile_client = player_client == 'android'
2282 player_skip = self._configuration_arg('player_skip')
2283
2284 def get_text(x):
2285 if not x:
2286 return
2287 text = x.get('simpleText')
2288 if text and isinstance(text, compat_str):
2289 return text
2290 runs = x.get('runs')
2291 if not isinstance(runs, list):
2292 return
2293 return ''.join([r['text'] for r in runs if isinstance(r.get('text'), compat_str)])
2294
2295 ytm_streaming_data = {}
2296 if is_music_url:
2297 ytm_webpage = None
2298 sts = self._extract_signature_timestamp(video_id, player_url, ytcfg, fatal=False)
2299 if sts and not force_mobile_client and 'configs' not in player_skip:
2300 ytm_webpage = self._download_webpage(
2301 'https://music.youtube.com',
2302 video_id, fatal=False, note='Downloading remix client config')
2303
2304 ytm_cfg = self._extract_ytcfg(video_id, ytm_webpage) or {}
2305 ytm_client = 'WEB_REMIX'
2306 if not sts or force_mobile_client:
2307 # Android client already has signature descrambled
2308 # See: https://github.com/TeamNewPipe/NewPipeExtractor/issues/562
2309 if not sts:
2310 self.report_warning('Falling back to mobile remix client for player API.')
2311 ytm_client = 'ANDROID_MUSIC'
2312 ytm_cfg = {}
2313
2314 ytm_headers = self._generate_api_headers(
2315 ytm_cfg, identity_token, syncid,
2316 client=ytm_client)
2317 ytm_query = {'videoId': video_id}
2318 ytm_query.update(self._generate_player_context(sts))
2319
2320 ytm_player_response = self._extract_response(
2321 item_id=video_id, ep='player', query=ytm_query,
2322 ytcfg=ytm_cfg, headers=ytm_headers, fatal=False,
2323 default_client=ytm_client,
2324 note='Downloading %sremix player API JSON' % ('mobile ' if force_mobile_client else ''))
2325 ytm_streaming_data = try_get(ytm_player_response, lambda x: x['streamingData'], dict) or {}
2326
2327 player_response = None
2328 if webpage:
2329 player_response = self._extract_yt_initial_variable(
2330 webpage, self._YT_INITIAL_PLAYER_RESPONSE_RE,
2331 video_id, 'initial player response')
2332
2333 if not player_response or force_mobile_client:
2334 sts = self._extract_signature_timestamp(video_id, player_url, ytcfg, fatal=False)
2335 yt_client = 'WEB'
2336 ytpcfg = ytcfg
2337 ytp_headers = headers
2338 if not sts or force_mobile_client:
2339 # Android client already has signature descrambled
2340 # See: https://github.com/TeamNewPipe/NewPipeExtractor/issues/562
2341 if not sts:
2342 self.report_warning('Falling back to mobile client for player API.')
2343 yt_client = 'ANDROID'
2344 ytpcfg = {}
2345 ytp_headers = self._generate_api_headers(ytpcfg, identity_token, syncid, yt_client)
2346
2347 yt_query = {'videoId': video_id}
2348 yt_query.update(self._generate_player_context(sts))
2349 player_response = self._extract_response(
2350 item_id=video_id, ep='player', query=yt_query,
2351 ytcfg=ytpcfg, headers=ytp_headers, fatal=False,
2352 default_client=yt_client,
2353 note='Downloading %splayer API JSON' % ('mobile ' if force_mobile_client else '')
2354 )
2355
2356 # Age-gate workarounds
2357 playability_status = player_response.get('playabilityStatus') or {}
2358 if playability_status.get('reason') in self._AGE_GATE_REASONS:
2359 pr = self._parse_json(try_get(compat_parse_qs(
2360 self._download_webpage(
2361 base_url + 'get_video_info', video_id,
2362 'Refetching age-gated info webpage', 'unable to download video info webpage',
2363 query=self._get_video_info_params(video_id), fatal=False)),
2364 lambda x: x['player_response'][0],
2365 compat_str) or '{}', video_id)
2366 if not pr:
2367 self.report_warning('Falling back to embedded-only age-gate workaround.')
2368 embed_webpage = None
2369 sts = self._extract_signature_timestamp(video_id, player_url, ytcfg, fatal=False)
2370 if sts and not force_mobile_client and 'configs' not in player_skip:
2371 embed_webpage = self._download_webpage(
2372 'https://www.youtube.com/embed/%s?html5=1' % video_id,
2373 video_id=video_id, note='Downloading age-gated embed config')
2374
2375 ytcfg_age = self._extract_ytcfg(video_id, embed_webpage) or {}
2376 # If we extracted the embed webpage, it'll tell us if we can view the video
2377 embedded_pr = self._parse_json(
2378 try_get(ytcfg_age, lambda x: x['PLAYER_VARS']['embedded_player_response'], str) or '{}',
2379 video_id=video_id)
2380 embedded_ps_reason = try_get(embedded_pr, lambda x: x['playabilityStatus']['reason'], str) or ''
2381 if embedded_ps_reason not in self._AGE_GATE_REASONS:
2382 yt_client = 'WEB_EMBEDDED_PLAYER'
2383 if not sts or force_mobile_client:
2384 # Android client already has signature descrambled
2385 # See: https://github.com/TeamNewPipe/NewPipeExtractor/issues/562
2386 if not sts:
2387 self.report_warning(
2388 'Falling back to mobile embedded client for player API (note: some formats may be missing).')
2389 yt_client = 'ANDROID_EMBEDDED_PLAYER'
2390 ytcfg_age = {}
2391
2392 ytage_headers = self._generate_api_headers(
2393 ytcfg_age, identity_token, syncid, client=yt_client)
2394 yt_age_query = {'videoId': video_id}
2395 yt_age_query.update(self._generate_player_context(sts))
2396 pr = self._extract_response(
2397 item_id=video_id, ep='player', query=yt_age_query,
2398 ytcfg=ytcfg_age, headers=ytage_headers, fatal=False,
2399 default_client=yt_client,
2400 note='Downloading %sage-gated player API JSON' % ('mobile ' if force_mobile_client else '')
2401 ) or {}
2402
2403 if pr:
2404 player_response = pr
2405
2406 trailer_video_id = try_get(
2407 playability_status,
2408 lambda x: x['errorScreen']['playerLegacyDesktopYpcTrailerRenderer']['trailerVideoId'],
2409 compat_str)
2410 if trailer_video_id:
2411 return self.url_result(
2412 trailer_video_id, self.ie_key(), trailer_video_id)
2413
2414 search_meta = (
2415 lambda x: self._html_search_meta(x, webpage, default=None)) \
2416 if webpage else lambda x: None
2417
2418 video_details = player_response.get('videoDetails') or {}
2419 microformat = try_get(
2420 player_response,
2421 lambda x: x['microformat']['playerMicroformatRenderer'],
2422 dict) or {}
2423 video_title = video_details.get('title') \
2424 or get_text(microformat.get('title')) \
2425 or search_meta(['og:title', 'twitter:title', 'title'])
2426 video_description = video_details.get('shortDescription')
2427
2428 if not smuggled_data.get('force_singlefeed', False):
2429 if not self.get_param('noplaylist'):
2430 multifeed_metadata_list = try_get(
2431 player_response,
2432 lambda x: x['multicamera']['playerLegacyMulticameraRenderer']['metadataList'],
2433 compat_str)
2434 if multifeed_metadata_list:
2435 entries = []
2436 feed_ids = []
2437 for feed in multifeed_metadata_list.split(','):
2438 # Unquote should take place before split on comma (,) since textual
2439 # fields may contain comma as well (see
2440 # https://github.com/ytdl-org/youtube-dl/issues/8536)
2441 feed_data = compat_parse_qs(
2442 compat_urllib_parse_unquote_plus(feed))
2443
2444 def feed_entry(name):
2445 return try_get(
2446 feed_data, lambda x: x[name][0], compat_str)
2447
2448 feed_id = feed_entry('id')
2449 if not feed_id:
2450 continue
2451 feed_title = feed_entry('title')
2452 title = video_title
2453 if feed_title:
2454 title += ' (%s)' % feed_title
2455 entries.append({
2456 '_type': 'url_transparent',
2457 'ie_key': 'Youtube',
2458 'url': smuggle_url(
2459 base_url + 'watch?v=' + feed_data['id'][0],
2460 {'force_singlefeed': True}),
2461 'title': title,
2462 })
2463 feed_ids.append(feed_id)
2464 self.to_screen(
2465 'Downloading multifeed video (%s) - add --no-playlist to just download video %s'
2466 % (', '.join(feed_ids), video_id))
2467 return self.playlist_result(
2468 entries, video_id, video_title, video_description)
2469 else:
2470 self.to_screen('Downloading just video %s because of --no-playlist' % video_id)
2471
2472 formats, itags, stream_ids = [], [], []
2473 itag_qualities = {}
2474 q = qualities([
2475 # "tiny" is the smallest video-only format. But some audio-only formats
2476 # was also labeled "tiny". It is not clear if such formats still exist
2477 'tiny', 'audio_quality_low', 'audio_quality_medium', 'audio_quality_high', # Audio only formats
2478 'small', 'medium', 'large', 'hd720', 'hd1080', 'hd1440', 'hd2160', 'hd2880', 'highres'
2479 ])
2480
2481 streaming_data = player_response.get('streamingData') or {}
2482 streaming_formats = streaming_data.get('formats') or []
2483 streaming_formats.extend(streaming_data.get('adaptiveFormats') or [])
2484 streaming_formats.extend(ytm_streaming_data.get('formats') or [])
2485 streaming_formats.extend(ytm_streaming_data.get('adaptiveFormats') or [])
2486
2487 for fmt in streaming_formats:
2488 if fmt.get('targetDurationSec') or fmt.get('drmFamilies'):
2489 continue
2490
2491 itag = str_or_none(fmt.get('itag'))
2492 audio_track = fmt.get('audioTrack') or {}
2493 stream_id = '%s.%s' % (itag or '', audio_track.get('id', ''))
2494 if stream_id in stream_ids:
2495 continue
2496
2497 quality = fmt.get('quality')
2498 if quality == 'tiny' or not quality:
2499 quality = fmt.get('audioQuality', '').lower() or quality
2500 if itag and quality:
2501 itag_qualities[itag] = quality
2502 # FORMAT_STREAM_TYPE_OTF(otf=1) requires downloading the init fragment
2503 # (adding `&sq=0` to the URL) and parsing emsg box to determine the
2504 # number of fragment that would subsequently requested with (`&sq=N`)
2505 if fmt.get('type') == 'FORMAT_STREAM_TYPE_OTF':
2506 continue
2507
2508 fmt_url = fmt.get('url')
2509 if not fmt_url:
2510 sc = compat_parse_qs(fmt.get('signatureCipher'))
2511 fmt_url = url_or_none(try_get(sc, lambda x: x['url'][0]))
2512 encrypted_sig = try_get(sc, lambda x: x['s'][0])
2513 if not (sc and fmt_url and encrypted_sig):
2514 continue
2515 if not player_url:
2516 continue
2517 signature = self._decrypt_signature(sc['s'][0], video_id, player_url)
2518 sp = try_get(sc, lambda x: x['sp'][0]) or 'signature'
2519 fmt_url += '&' + sp + '=' + signature
2520
2521 if itag:
2522 itags.append(itag)
2523 stream_ids.append(stream_id)
2524
2525 tbr = float_or_none(
2526 fmt.get('averageBitrate') or fmt.get('bitrate'), 1000)
2527 dct = {
2528 'asr': int_or_none(fmt.get('audioSampleRate')),
2529 'filesize': int_or_none(fmt.get('contentLength')),
2530 'format_id': itag,
2531 'format_note': audio_track.get('displayName') or fmt.get('qualityLabel') or quality,
2532 'fps': int_or_none(fmt.get('fps')),
2533 'height': int_or_none(fmt.get('height')),
2534 'quality': q(quality),
2535 'tbr': tbr,
2536 'url': fmt_url,
2537 'width': fmt.get('width'),
2538 'language': audio_track.get('id', '').split('.')[0],
2539 }
2540 mime_mobj = re.match(
2541 r'((?:[^/]+)/(?:[^;]+))(?:;\s*codecs="([^"]+)")?', fmt.get('mimeType') or '')
2542 if mime_mobj:
2543 dct['ext'] = mimetype2ext(mime_mobj.group(1))
2544 dct.update(parse_codecs(mime_mobj.group(2)))
2545 # The 3gp format in android client has a quality of "small",
2546 # but is actually worse than all other formats
2547 if dct['ext'] == '3gp':
2548 dct['quality'] = q('tiny')
2549 no_audio = dct.get('acodec') == 'none'
2550 no_video = dct.get('vcodec') == 'none'
2551 if no_audio:
2552 dct['vbr'] = tbr
2553 if no_video:
2554 dct['abr'] = tbr
2555 if no_audio or no_video:
2556 dct['downloader_options'] = {
2557 # Youtube throttles chunks >~10M
2558 'http_chunk_size': 10485760,
2559 }
2560 if dct.get('ext'):
2561 dct['container'] = dct['ext'] + '_dash'
2562 formats.append(dct)
2563
2564 skip_manifests = self._configuration_arg('skip')
2565 get_dash = 'dash' not in skip_manifests and self.get_param('youtube_include_dash_manifest', True)
2566 get_hls = 'hls' not in skip_manifests and self.get_param('youtube_include_hls_manifest', True)
2567
2568 for sd in (streaming_data, ytm_streaming_data):
2569 hls_manifest_url = get_hls and sd.get('hlsManifestUrl')
2570 if hls_manifest_url:
2571 for f in self._extract_m3u8_formats(
2572 hls_manifest_url, video_id, 'mp4', fatal=False):
2573 itag = self._search_regex(
2574 r'/itag/(\d+)', f['url'], 'itag', default=None)
2575 if itag:
2576 f['format_id'] = itag
2577 formats.append(f)
2578
2579 dash_manifest_url = get_dash and sd.get('dashManifestUrl')
2580 if dash_manifest_url:
2581 for f in self._extract_mpd_formats(
2582 dash_manifest_url, video_id, fatal=False):
2583 itag = f['format_id']
2584 if itag in itags:
2585 continue
2586 if itag in itag_qualities:
2587 f['quality'] = q(itag_qualities[itag])
2588 filesize = int_or_none(self._search_regex(
2589 r'/clen/(\d+)', f.get('fragment_base_url')
2590 or f['url'], 'file size', default=None))
2591 if filesize:
2592 f['filesize'] = filesize
2593 formats.append(f)
2594
2595 if not formats:
2596 if not self.get_param('allow_unplayable_formats') and streaming_data.get('licenseInfos'):
2597 self.raise_no_formats(
2598 'This video is DRM protected.', expected=True)
2599 pemr = try_get(
2600 playability_status,
2601 lambda x: x['errorScreen']['playerErrorMessageRenderer'],
2602 dict) or {}
2603 reason = get_text(pemr.get('reason')) or playability_status.get('reason')
2604 subreason = pemr.get('subreason')
2605 if subreason:
2606 subreason = clean_html(get_text(subreason))
2607 if subreason == 'The uploader has not made this video available in your country.':
2608 countries = microformat.get('availableCountries')
2609 if not countries:
2610 regions_allowed = search_meta('regionsAllowed')
2611 countries = regions_allowed.split(',') if regions_allowed else None
2612 self.raise_geo_restricted(subreason, countries, metadata_available=True)
2613 reason += '\n' + subreason
2614 if reason:
2615 self.raise_no_formats(reason, expected=True)
2616
2617 self._sort_formats(formats)
2618
2619 keywords = video_details.get('keywords') or []
2620 if not keywords and webpage:
2621 keywords = [
2622 unescapeHTML(m.group('content'))
2623 for m in re.finditer(self._meta_regex('og:video:tag'), webpage)]
2624 for keyword in keywords:
2625 if keyword.startswith('yt:stretch='):
2626 mobj = re.search(r'(\d+)\s*:\s*(\d+)', keyword)
2627 if mobj:
2628 # NB: float is intentional for forcing float division
2629 w, h = (float(v) for v in mobj.groups())
2630 if w > 0 and h > 0:
2631 ratio = w / h
2632 for f in formats:
2633 if f.get('vcodec') != 'none':
2634 f['stretched_ratio'] = ratio
2635 break
2636
2637 thumbnails = []
2638 for container in (video_details, microformat):
2639 for thumbnail in (try_get(
2640 container,
2641 lambda x: x['thumbnail']['thumbnails'], list) or []):
2642 thumbnail_url = thumbnail.get('url')
2643 if not thumbnail_url:
2644 continue
2645 # Sometimes youtube gives a wrong thumbnail URL. See:
2646 # https://github.com/yt-dlp/yt-dlp/issues/233
2647 # https://github.com/ytdl-org/youtube-dl/issues/28023
2648 if 'maxresdefault' in thumbnail_url:
2649 thumbnail_url = thumbnail_url.split('?')[0]
2650 thumbnails.append({
2651 'url': thumbnail_url,
2652 'height': int_or_none(thumbnail.get('height')),
2653 'width': int_or_none(thumbnail.get('width')),
2654 'preference': 1 if 'maxresdefault' in thumbnail_url else -1
2655 })
2656 thumbnail_url = search_meta(['og:image', 'twitter:image'])
2657 if thumbnail_url:
2658 thumbnails.append({
2659 'url': thumbnail_url,
2660 'preference': 1 if 'maxresdefault' in thumbnail_url else -1
2661 })
2662 # All videos have a maxresdefault thumbnail, but sometimes it does not appear in the webpage
2663 # See: https://github.com/ytdl-org/youtube-dl/issues/29049
2664 thumbnails.append({
2665 'url': 'https://i.ytimg.com/vi/%s/maxresdefault.jpg' % video_id,
2666 'preference': 1,
2667 })
2668 self._remove_duplicate_formats(thumbnails)
2669
2670 category = microformat.get('category') or search_meta('genre')
2671 channel_id = video_details.get('channelId') \
2672 or microformat.get('externalChannelId') \
2673 or search_meta('channelId')
2674 duration = int_or_none(
2675 video_details.get('lengthSeconds')
2676 or microformat.get('lengthSeconds')) \
2677 or parse_duration(search_meta('duration'))
2678 is_live = video_details.get('isLive')
2679 is_upcoming = video_details.get('isUpcoming')
2680 owner_profile_url = microformat.get('ownerProfileUrl')
2681
2682 info = {
2683 'id': video_id,
2684 'title': self._live_title(video_title) if is_live else video_title,
2685 'formats': formats,
2686 'thumbnails': thumbnails,
2687 'description': video_description,
2688 'upload_date': unified_strdate(
2689 microformat.get('uploadDate')
2690 or search_meta('uploadDate')),
2691 'uploader': video_details['author'],
2692 'uploader_id': self._search_regex(r'/(?:channel|user)/([^/?&#]+)', owner_profile_url, 'uploader id') if owner_profile_url else None,
2693 'uploader_url': owner_profile_url,
2694 'channel_id': channel_id,
2695 'channel_url': 'https://www.youtube.com/channel/' + channel_id if channel_id else None,
2696 'duration': duration,
2697 'view_count': int_or_none(
2698 video_details.get('viewCount')
2699 or microformat.get('viewCount')
2700 or search_meta('interactionCount')),
2701 'average_rating': float_or_none(video_details.get('averageRating')),
2702 'age_limit': 18 if (
2703 microformat.get('isFamilySafe') is False
2704 or search_meta('isFamilyFriendly') == 'false'
2705 or search_meta('og:restrictions:age') == '18+') else 0,
2706 'webpage_url': webpage_url,
2707 'categories': [category] if category else None,
2708 'tags': keywords,
2709 'is_live': is_live,
2710 'playable_in_embed': playability_status.get('playableInEmbed'),
2711 'was_live': video_details.get('isLiveContent'),
2712 }
2713
2714 pctr = try_get(
2715 player_response,
2716 lambda x: x['captions']['playerCaptionsTracklistRenderer'], dict)
2717 subtitles = {}
2718 if pctr:
2719 def process_language(container, base_url, lang_code, sub_name, query):
2720 lang_subs = container.setdefault(lang_code, [])
2721 for fmt in self._SUBTITLE_FORMATS:
2722 query.update({
2723 'fmt': fmt,
2724 })
2725 lang_subs.append({
2726 'ext': fmt,
2727 'url': update_url_query(base_url, query),
2728 'name': sub_name,
2729 })
2730
2731 for caption_track in (pctr.get('captionTracks') or []):
2732 base_url = caption_track.get('baseUrl')
2733 if not base_url:
2734 continue
2735 if caption_track.get('kind') != 'asr':
2736 lang_code = (
2737 remove_start(caption_track.get('vssId') or '', '.').replace('.', '-')
2738 or caption_track.get('languageCode'))
2739 if not lang_code:
2740 continue
2741 process_language(
2742 subtitles, base_url, lang_code,
2743 try_get(caption_track, lambda x: x['name']['simpleText']),
2744 {})
2745 continue
2746 automatic_captions = {}
2747 for translation_language in (pctr.get('translationLanguages') or []):
2748 translation_language_code = translation_language.get('languageCode')
2749 if not translation_language_code:
2750 continue
2751 process_language(
2752 automatic_captions, base_url, translation_language_code,
2753 try_get(translation_language, (
2754 lambda x: x['languageName']['simpleText'],
2755 lambda x: x['languageName']['runs'][0]['text'])),
2756 {'tlang': translation_language_code})
2757 info['automatic_captions'] = automatic_captions
2758 info['subtitles'] = subtitles
2759
2760 parsed_url = compat_urllib_parse_urlparse(url)
2761 for component in [parsed_url.fragment, parsed_url.query]:
2762 query = compat_parse_qs(component)
2763 for k, v in query.items():
2764 for d_k, s_ks in [('start', ('start', 't')), ('end', ('end',))]:
2765 d_k += '_time'
2766 if d_k not in info and k in s_ks:
2767 info[d_k] = parse_duration(query[k][0])
2768
2769 # Youtube Music Auto-generated description
2770 if video_description:
2771 mobj = re.search(r'(?s)(?P<track>[^·\n]+)·(?P<artist>[^\n]+)\n+(?P<album>[^\n]+)(?:.+?℗\s*(?P<release_year>\d{4})(?!\d))?(?:.+?Released on\s*:\s*(?P<release_date>\d{4}-\d{2}-\d{2}))?(.+?\nArtist\s*:\s*(?P<clean_artist>[^\n]+))?.+\nAuto-generated by YouTube\.\s*$', video_description)
2772 if mobj:
2773 release_year = mobj.group('release_year')
2774 release_date = mobj.group('release_date')
2775 if release_date:
2776 release_date = release_date.replace('-', '')
2777 if not release_year:
2778 release_year = release_date[:4]
2779 info.update({
2780 'album': mobj.group('album'.strip()),
2781 'artist': mobj.group('clean_artist') or ', '.join(a.strip() for a in mobj.group('artist').split('·')),
2782 'track': mobj.group('track').strip(),
2783 'release_date': release_date,
2784 'release_year': int_or_none(release_year),
2785 })
2786
2787 initial_data = None
2788 if webpage:
2789 initial_data = self._extract_yt_initial_variable(
2790 webpage, self._YT_INITIAL_DATA_RE, video_id,
2791 'yt initial data')
2792 if not initial_data:
2793 initial_data = self._extract_response(
2794 item_id=video_id, ep='next', fatal=False,
2795 ytcfg=ytcfg, headers=headers, query={'videoId': video_id},
2796 note='Downloading initial data API JSON')
2797
2798 try:
2799 # This will error if there is no livechat
2800 initial_data['contents']['twoColumnWatchNextResults']['conversationBar']['liveChatRenderer']['continuations'][0]['reloadContinuationData']['continuation']
2801 info['subtitles']['live_chat'] = [{
2802 'url': 'https://www.youtube.com/watch?v=%s' % video_id, # url is needed to set cookies
2803 'video_id': video_id,
2804 'ext': 'json',
2805 'protocol': 'youtube_live_chat' if is_live or is_upcoming else 'youtube_live_chat_replay',
2806 }]
2807 except (KeyError, IndexError, TypeError):
2808 pass
2809
2810 if initial_data:
2811 chapters = self._extract_chapters_from_json(
2812 initial_data, video_id, duration)
2813 if not chapters:
2814 for engagment_pannel in (initial_data.get('engagementPanels') or []):
2815 contents = try_get(
2816 engagment_pannel, lambda x: x['engagementPanelSectionListRenderer']['content']['macroMarkersListRenderer']['contents'],
2817 list)
2818 if not contents:
2819 continue
2820
2821 def chapter_time(mmlir):
2822 return parse_duration(
2823 get_text(mmlir.get('timeDescription')))
2824
2825 chapters = []
2826 for next_num, content in enumerate(contents, start=1):
2827 mmlir = content.get('macroMarkersListItemRenderer') or {}
2828 start_time = chapter_time(mmlir)
2829 end_time = chapter_time(try_get(
2830 contents, lambda x: x[next_num]['macroMarkersListItemRenderer'])) \
2831 if next_num < len(contents) else duration
2832 if start_time is None or end_time is None:
2833 continue
2834 chapters.append({
2835 'start_time': start_time,
2836 'end_time': end_time,
2837 'title': get_text(mmlir.get('title')),
2838 })
2839 if chapters:
2840 break
2841 if chapters:
2842 info['chapters'] = chapters
2843
2844 contents = try_get(
2845 initial_data,
2846 lambda x: x['contents']['twoColumnWatchNextResults']['results']['results']['contents'],
2847 list) or []
2848 for content in contents:
2849 vpir = content.get('videoPrimaryInfoRenderer')
2850 if vpir:
2851 stl = vpir.get('superTitleLink')
2852 if stl:
2853 stl = get_text(stl)
2854 if try_get(
2855 vpir,
2856 lambda x: x['superTitleIcon']['iconType']) == 'LOCATION_PIN':
2857 info['location'] = stl
2858 else:
2859 mobj = re.search(r'(.+?)\s*S(\d+)\s*•\s*E(\d+)', stl)
2860 if mobj:
2861 info.update({
2862 'series': mobj.group(1),
2863 'season_number': int(mobj.group(2)),
2864 'episode_number': int(mobj.group(3)),
2865 })
2866 for tlb in (try_get(
2867 vpir,
2868 lambda x: x['videoActions']['menuRenderer']['topLevelButtons'],
2869 list) or []):
2870 tbr = tlb.get('toggleButtonRenderer') or {}
2871 for getter, regex in [(
2872 lambda x: x['defaultText']['accessibility']['accessibilityData'],
2873 r'(?P<count>[\d,]+)\s*(?P<type>(?:dis)?like)'), ([
2874 lambda x: x['accessibility'],
2875 lambda x: x['accessibilityData']['accessibilityData'],
2876 ], r'(?P<type>(?:dis)?like) this video along with (?P<count>[\d,]+) other people')]:
2877 label = (try_get(tbr, getter, dict) or {}).get('label')
2878 if label:
2879 mobj = re.match(regex, label)
2880 if mobj:
2881 info[mobj.group('type') + '_count'] = str_to_int(mobj.group('count'))
2882 break
2883 sbr_tooltip = try_get(
2884 vpir, lambda x: x['sentimentBar']['sentimentBarRenderer']['tooltip'])
2885 if sbr_tooltip:
2886 like_count, dislike_count = sbr_tooltip.split(' / ')
2887 info.update({
2888 'like_count': str_to_int(like_count),
2889 'dislike_count': str_to_int(dislike_count),
2890 })
2891 vsir = content.get('videoSecondaryInfoRenderer')
2892 if vsir:
2893 info['channel'] = get_text(try_get(
2894 vsir,
2895 lambda x: x['owner']['videoOwnerRenderer']['title'],
2896 dict))
2897 rows = try_get(
2898 vsir,
2899 lambda x: x['metadataRowContainer']['metadataRowContainerRenderer']['rows'],
2900 list) or []
2901 multiple_songs = False
2902 for row in rows:
2903 if try_get(row, lambda x: x['metadataRowRenderer']['hasDividerLine']) is True:
2904 multiple_songs = True
2905 break
2906 for row in rows:
2907 mrr = row.get('metadataRowRenderer') or {}
2908 mrr_title = mrr.get('title')
2909 if not mrr_title:
2910 continue
2911 mrr_title = get_text(mrr['title'])
2912 mrr_contents_text = get_text(mrr['contents'][0])
2913 if mrr_title == 'License':
2914 info['license'] = mrr_contents_text
2915 elif not multiple_songs:
2916 if mrr_title == 'Album':
2917 info['album'] = mrr_contents_text
2918 elif mrr_title == 'Artist':
2919 info['artist'] = mrr_contents_text
2920 elif mrr_title == 'Song':
2921 info['track'] = mrr_contents_text
2922
2923 fallbacks = {
2924 'channel': 'uploader',
2925 'channel_id': 'uploader_id',
2926 'channel_url': 'uploader_url',
2927 }
2928 for to, frm in fallbacks.items():
2929 if not info.get(to):
2930 info[to] = info.get(frm)
2931
2932 for s_k, d_k in [('artist', 'creator'), ('track', 'alt_title')]:
2933 v = info.get(s_k)
2934 if v:
2935 info[d_k] = v
2936
2937 is_private = bool_or_none(video_details.get('isPrivate'))
2938 is_unlisted = bool_or_none(microformat.get('isUnlisted'))
2939 is_membersonly = None
2940 is_premium = None
2941 if initial_data and is_private is not None:
2942 is_membersonly = False
2943 is_premium = False
2944 contents = try_get(initial_data, lambda x: x['contents']['twoColumnWatchNextResults']['results']['results']['contents'], list)
2945 for content in contents or []:
2946 badges = try_get(content, lambda x: x['videoPrimaryInfoRenderer']['badges'], list)
2947 for badge in badges or []:
2948 label = try_get(badge, lambda x: x['metadataBadgeRenderer']['label']) or ''
2949 if label.lower() == 'members only':
2950 is_membersonly = True
2951 break
2952 elif label.lower() == 'premium':
2953 is_premium = True
2954 break
2955 if is_membersonly or is_premium:
2956 break
2957
2958 # TODO: Add this for playlists
2959 info['availability'] = self._availability(
2960 is_private=is_private,
2961 needs_premium=is_premium,
2962 needs_subscription=is_membersonly,
2963 needs_auth=info['age_limit'] >= 18,
2964 is_unlisted=None if is_private is None else is_unlisted)
2965
2966 # get xsrf for annotations or comments
2967 get_annotations = self.get_param('writeannotations', False)
2968 get_comments = self.get_param('getcomments', False)
2969 if get_annotations or get_comments:
2970 xsrf_token = None
2971 ytcfg = self._extract_ytcfg(video_id, webpage)
2972 if ytcfg:
2973 xsrf_token = try_get(ytcfg, lambda x: x['XSRF_TOKEN'], compat_str)
2974 if not xsrf_token:
2975 xsrf_token = self._search_regex(
2976 r'([\'"])XSRF_TOKEN\1\s*:\s*([\'"])(?P<xsrf_token>(?:(?!\2).)+)\2',
2977 webpage, 'xsrf token', group='xsrf_token', fatal=False)
2978
2979 # annotations
2980 if get_annotations:
2981 invideo_url = try_get(
2982 player_response, lambda x: x['annotations'][0]['playerAnnotationsUrlsRenderer']['invideoUrl'], compat_str)
2983 if xsrf_token and invideo_url:
2984 xsrf_field_name = None
2985 if ytcfg:
2986 xsrf_field_name = try_get(ytcfg, lambda x: x['XSRF_FIELD_NAME'], compat_str)
2987 if not xsrf_field_name:
2988 xsrf_field_name = self._search_regex(
2989 r'([\'"])XSRF_FIELD_NAME\1\s*:\s*([\'"])(?P<xsrf_field_name>\w+)\2',
2990 webpage, 'xsrf field name',
2991 group='xsrf_field_name', default='session_token')
2992 info['annotations'] = self._download_webpage(
2993 self._proto_relative_url(invideo_url),
2994 video_id, note='Downloading annotations',
2995 errnote='Unable to download video annotations', fatal=False,
2996 data=urlencode_postdata({xsrf_field_name: xsrf_token}))
2997
2998 if get_comments:
2999 info['__post_extractor'] = lambda: self._extract_comments(ytcfg, video_id, contents, webpage)
3000
3001 self.mark_watched(video_id, player_response)
3002
3003 return info
3004
3005
3006 class YoutubeTabIE(YoutubeBaseInfoExtractor):
3007 IE_DESC = 'YouTube.com tab'
3008 _VALID_URL = r'''(?x)
3009 https?://
3010 (?:\w+\.)?
3011 (?:
3012 youtube(?:kids)?\.com|
3013 invidio\.us
3014 )/
3015 (?:
3016 (?P<channel_type>channel|c|user|browse)/|
3017 (?P<not_channel>
3018 feed/|hashtag/|
3019 (?:playlist|watch)\?.*?\blist=
3020 )|
3021 (?!(?:%s)\b) # Direct URLs
3022 )
3023 (?P<id>[^/?\#&]+)
3024 ''' % YoutubeBaseInfoExtractor._RESERVED_NAMES
3025 IE_NAME = 'youtube:tab'
3026
3027 _TESTS = [{
3028 'note': 'playlists, multipage',
3029 'url': 'https://www.youtube.com/c/ИгорьКлейнер/playlists?view=1&flow=grid',
3030 'playlist_mincount': 94,
3031 'info_dict': {
3032 'id': 'UCqj7Cz7revf5maW9g5pgNcg',
3033 'title': 'Игорь Клейнер - Playlists',
3034 'description': 'md5:be97ee0f14ee314f1f002cf187166ee2',
3035 'uploader': 'Игорь Клейнер',
3036 'uploader_id': 'UCqj7Cz7revf5maW9g5pgNcg',
3037 },
3038 }, {
3039 'note': 'playlists, multipage, different order',
3040 'url': 'https://www.youtube.com/user/igorkle1/playlists?view=1&sort=dd',
3041 'playlist_mincount': 94,
3042 'info_dict': {
3043 'id': 'UCqj7Cz7revf5maW9g5pgNcg',
3044 'title': 'Игорь Клейнер - Playlists',
3045 'description': 'md5:be97ee0f14ee314f1f002cf187166ee2',
3046 'uploader_id': 'UCqj7Cz7revf5maW9g5pgNcg',
3047 'uploader': 'Игорь Клейнер',
3048 },
3049 }, {
3050 'note': 'playlists, series',
3051 'url': 'https://www.youtube.com/c/3blue1brown/playlists?view=50&sort=dd&shelf_id=3',
3052 'playlist_mincount': 5,
3053 'info_dict': {
3054 'id': 'UCYO_jab_esuFRV4b17AJtAw',
3055 'title': '3Blue1Brown - Playlists',
3056 'description': 'md5:e1384e8a133307dd10edee76e875d62f',
3057 'uploader_id': 'UCYO_jab_esuFRV4b17AJtAw',
3058 'uploader': '3Blue1Brown',
3059 },
3060 }, {
3061 'note': 'playlists, singlepage',
3062 'url': 'https://www.youtube.com/user/ThirstForScience/playlists',
3063 'playlist_mincount': 4,
3064 'info_dict': {
3065 'id': 'UCAEtajcuhQ6an9WEzY9LEMQ',
3066 'title': 'ThirstForScience - Playlists',
3067 'description': 'md5:609399d937ea957b0f53cbffb747a14c',
3068 'uploader': 'ThirstForScience',
3069 'uploader_id': 'UCAEtajcuhQ6an9WEzY9LEMQ',
3070 }
3071 }, {
3072 'url': 'https://www.youtube.com/c/ChristophLaimer/playlists',
3073 'only_matching': True,
3074 }, {
3075 'note': 'basic, single video playlist',
3076 'url': 'https://www.youtube.com/playlist?list=PL4lCao7KL_QFVb7Iudeipvc2BCavECqzc',
3077 'info_dict': {
3078 'uploader_id': 'UCmlqkdCBesrv2Lak1mF_MxA',
3079 'uploader': 'Sergey M.',
3080 'id': 'PL4lCao7KL_QFVb7Iudeipvc2BCavECqzc',
3081 'title': 'youtube-dl public playlist',
3082 },
3083 'playlist_count': 1,
3084 }, {
3085 'note': 'empty playlist',
3086 'url': 'https://www.youtube.com/playlist?list=PL4lCao7KL_QFodcLWhDpGCYnngnHtQ-Xf',
3087 'info_dict': {
3088 'uploader_id': 'UCmlqkdCBesrv2Lak1mF_MxA',
3089 'uploader': 'Sergey M.',
3090 'id': 'PL4lCao7KL_QFodcLWhDpGCYnngnHtQ-Xf',
3091 'title': 'youtube-dl empty playlist',
3092 },
3093 'playlist_count': 0,
3094 }, {
3095 'note': 'Home tab',
3096 'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w/featured',
3097 'info_dict': {
3098 'id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
3099 'title': 'lex will - Home',
3100 'description': 'md5:2163c5d0ff54ed5f598d6a7e6211e488',
3101 'uploader': 'lex will',
3102 'uploader_id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
3103 },
3104 'playlist_mincount': 2,
3105 }, {
3106 'note': 'Videos tab',
3107 'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w/videos',
3108 'info_dict': {
3109 'id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
3110 'title': 'lex will - Videos',
3111 'description': 'md5:2163c5d0ff54ed5f598d6a7e6211e488',
3112 'uploader': 'lex will',
3113 'uploader_id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
3114 },
3115 'playlist_mincount': 975,
3116 }, {
3117 'note': 'Videos tab, sorted by popular',
3118 'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w/videos?view=0&sort=p&flow=grid',
3119 'info_dict': {
3120 'id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
3121 'title': 'lex will - Videos',
3122 'description': 'md5:2163c5d0ff54ed5f598d6a7e6211e488',
3123 'uploader': 'lex will',
3124 'uploader_id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
3125 },
3126 'playlist_mincount': 199,
3127 }, {
3128 'note': 'Playlists tab',
3129 'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w/playlists',
3130 'info_dict': {
3131 'id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
3132 'title': 'lex will - Playlists',
3133 'description': 'md5:2163c5d0ff54ed5f598d6a7e6211e488',
3134 'uploader': 'lex will',
3135 'uploader_id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
3136 },
3137 'playlist_mincount': 17,
3138 }, {
3139 'note': 'Community tab',
3140 'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w/community',
3141 'info_dict': {
3142 'id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
3143 'title': 'lex will - Community',
3144 'description': 'md5:2163c5d0ff54ed5f598d6a7e6211e488',
3145 'uploader': 'lex will',
3146 'uploader_id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
3147 },
3148 'playlist_mincount': 18,
3149 }, {
3150 'note': 'Channels tab',
3151 'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w/channels',
3152 'info_dict': {
3153 'id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
3154 'title': 'lex will - Channels',
3155 'description': 'md5:2163c5d0ff54ed5f598d6a7e6211e488',
3156 'uploader': 'lex will',
3157 'uploader_id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
3158 },
3159 'playlist_mincount': 12,
3160 }, {
3161 'note': 'Search tab',
3162 'url': 'https://www.youtube.com/c/3blue1brown/search?query=linear%20algebra',
3163 'playlist_mincount': 40,
3164 'info_dict': {
3165 'id': 'UCYO_jab_esuFRV4b17AJtAw',
3166 'title': '3Blue1Brown - Search - linear algebra',
3167 'description': 'md5:e1384e8a133307dd10edee76e875d62f',
3168 'uploader': '3Blue1Brown',
3169 'uploader_id': 'UCYO_jab_esuFRV4b17AJtAw',
3170 },
3171 }, {
3172 'url': 'https://invidio.us/channel/UCmlqkdCBesrv2Lak1mF_MxA',
3173 'only_matching': True,
3174 }, {
3175 'url': 'https://www.youtubekids.com/channel/UCmlqkdCBesrv2Lak1mF_MxA',
3176 'only_matching': True,
3177 }, {
3178 'url': 'https://music.youtube.com/channel/UCmlqkdCBesrv2Lak1mF_MxA',
3179 'only_matching': True,
3180 }, {
3181 'note': 'Playlist with deleted videos (#651). As a bonus, the video #51 is also twice in this list.',
3182 'url': 'https://www.youtube.com/playlist?list=PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC',
3183 'info_dict': {
3184 'title': '29C3: Not my department',
3185 'id': 'PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC',
3186 'uploader': 'Christiaan008',
3187 'uploader_id': 'UCEPzS1rYsrkqzSLNp76nrcg',
3188 'description': 'md5:a14dc1a8ef8307a9807fe136a0660268',
3189 },
3190 'playlist_count': 96,
3191 }, {
3192 'note': 'Large playlist',
3193 'url': 'https://www.youtube.com/playlist?list=UUBABnxM4Ar9ten8Mdjj1j0Q',
3194 'info_dict': {
3195 'title': 'Uploads from Cauchemar',
3196 'id': 'UUBABnxM4Ar9ten8Mdjj1j0Q',
3197 'uploader': 'Cauchemar',
3198 'uploader_id': 'UCBABnxM4Ar9ten8Mdjj1j0Q',
3199 },
3200 'playlist_mincount': 1123,
3201 }, {
3202 'note': 'even larger playlist, 8832 videos',
3203 'url': 'http://www.youtube.com/user/NASAgovVideo/videos',
3204 'only_matching': True,
3205 }, {
3206 'note': 'Buggy playlist: the webpage has a "Load more" button but it doesn\'t have more videos',
3207 'url': 'https://www.youtube.com/playlist?list=UUXw-G3eDE9trcvY2sBMM_aA',
3208 'info_dict': {
3209 'title': 'Uploads from Interstellar Movie',
3210 'id': 'UUXw-G3eDE9trcvY2sBMM_aA',
3211 'uploader': 'Interstellar Movie',
3212 'uploader_id': 'UCXw-G3eDE9trcvY2sBMM_aA',
3213 },
3214 'playlist_mincount': 21,
3215 }, {
3216 'note': 'Playlist with "show unavailable videos" button',
3217 'url': 'https://www.youtube.com/playlist?list=UUTYLiWFZy8xtPwxFwX9rV7Q',
3218 'info_dict': {
3219 'title': 'Uploads from Phim Siêu Nhân Nhật Bản',
3220 'id': 'UUTYLiWFZy8xtPwxFwX9rV7Q',
3221 'uploader': 'Phim Siêu Nhân Nhật Bản',
3222 'uploader_id': 'UCTYLiWFZy8xtPwxFwX9rV7Q',
3223 },
3224 'playlist_mincount': 200,
3225 }, {
3226 'note': 'Playlist with unavailable videos in page 7',
3227 'url': 'https://www.youtube.com/playlist?list=UU8l9frL61Yl5KFOl87nIm2w',
3228 'info_dict': {
3229 'title': 'Uploads from BlankTV',
3230 'id': 'UU8l9frL61Yl5KFOl87nIm2w',
3231 'uploader': 'BlankTV',
3232 'uploader_id': 'UC8l9frL61Yl5KFOl87nIm2w',
3233 },
3234 'playlist_mincount': 1000,
3235 }, {
3236 'note': 'https://github.com/ytdl-org/youtube-dl/issues/21844',
3237 'url': 'https://www.youtube.com/playlist?list=PLzH6n4zXuckpfMu_4Ff8E7Z1behQks5ba',
3238 'info_dict': {
3239 'title': 'Data Analysis with Dr Mike Pound',
3240 'id': 'PLzH6n4zXuckpfMu_4Ff8E7Z1behQks5ba',
3241 'uploader_id': 'UC9-y-6csu5WGm29I7JiwpnA',
3242 'uploader': 'Computerphile',
3243 'description': 'md5:7f567c574d13d3f8c0954d9ffee4e487',
3244 },
3245 'playlist_mincount': 11,
3246 }, {
3247 'url': 'https://invidio.us/playlist?list=PL4lCao7KL_QFVb7Iudeipvc2BCavECqzc',
3248 'only_matching': True,
3249 }, {
3250 'note': 'Playlist URL that does not actually serve a playlist',
3251 'url': 'https://www.youtube.com/watch?v=FqZTN594JQw&list=PLMYEtVRpaqY00V9W81Cwmzp6N6vZqfUKD4',
3252 'info_dict': {
3253 'id': 'FqZTN594JQw',
3254 'ext': 'webm',
3255 'title': "Smiley's People 01 detective, Adventure Series, Action",
3256 'uploader': 'STREEM',
3257 'uploader_id': 'UCyPhqAZgwYWZfxElWVbVJng',
3258 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UCyPhqAZgwYWZfxElWVbVJng',
3259 'upload_date': '20150526',
3260 'license': 'Standard YouTube License',
3261 'description': 'md5:507cdcb5a49ac0da37a920ece610be80',
3262 'categories': ['People & Blogs'],
3263 'tags': list,
3264 'view_count': int,
3265 'like_count': int,
3266 'dislike_count': int,
3267 },
3268 'params': {
3269 'skip_download': True,
3270 },
3271 'skip': 'This video is not available.',
3272 'add_ie': [YoutubeIE.ie_key()],
3273 }, {
3274 'url': 'https://www.youtubekids.com/watch?v=Agk7R8I8o5U&list=PUZ6jURNr1WQZCNHF0ao-c0g',
3275 'only_matching': True,
3276 }, {
3277 'url': 'https://www.youtube.com/watch?v=MuAGGZNfUkU&list=RDMM',
3278 'only_matching': True,
3279 }, {
3280 'url': 'https://www.youtube.com/channel/UCoMdktPbSTixAyNGwb-UYkQ/live',
3281 'info_dict': {
3282 'id': 'X1whbWASnNQ', # This will keep changing
3283 'ext': 'mp4',
3284 'title': compat_str,
3285 'uploader': 'Sky News',
3286 'uploader_id': 'skynews',
3287 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/skynews',
3288 'upload_date': r're:\d{8}',
3289 'description': compat_str,
3290 'categories': ['News & Politics'],
3291 'tags': list,
3292 'like_count': int,
3293 'dislike_count': int,
3294 },
3295 'params': {
3296 'skip_download': True,
3297 },
3298 'expected_warnings': ['Downloading just video ', 'Ignoring subtitle tracks found in '],
3299 }, {
3300 'url': 'https://www.youtube.com/user/TheYoungTurks/live',
3301 'info_dict': {
3302 'id': 'a48o2S1cPoo',
3303 'ext': 'mp4',
3304 'title': 'The Young Turks - Live Main Show',
3305 'uploader': 'The Young Turks',
3306 'uploader_id': 'TheYoungTurks',
3307 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/TheYoungTurks',
3308 'upload_date': '20150715',
3309 'license': 'Standard YouTube License',
3310 'description': 'md5:438179573adcdff3c97ebb1ee632b891',
3311 'categories': ['News & Politics'],
3312 'tags': ['Cenk Uygur (TV Program Creator)', 'The Young Turks (Award-Winning Work)', 'Talk Show (TV Genre)'],
3313 'like_count': int,
3314 'dislike_count': int,
3315 },
3316 'params': {
3317 'skip_download': True,
3318 },
3319 'only_matching': True,
3320 }, {
3321 'url': 'https://www.youtube.com/channel/UC1yBKRuGpC1tSM73A0ZjYjQ/live',
3322 'only_matching': True,
3323 }, {
3324 'url': 'https://www.youtube.com/c/CommanderVideoHq/live',
3325 'only_matching': True,
3326 }, {
3327 'note': 'A channel that is not live. Should raise error',
3328 'url': 'https://www.youtube.com/user/numberphile/live',
3329 'only_matching': True,
3330 }, {
3331 'url': 'https://www.youtube.com/feed/trending',
3332 'only_matching': True,
3333 }, {
3334 'url': 'https://www.youtube.com/feed/library',
3335 'only_matching': True,
3336 }, {
3337 'url': 'https://www.youtube.com/feed/history',
3338 'only_matching': True,
3339 }, {
3340 'url': 'https://www.youtube.com/feed/subscriptions',
3341 'only_matching': True,
3342 }, {
3343 'url': 'https://www.youtube.com/feed/watch_later',
3344 'only_matching': True,
3345 }, {
3346 'note': 'Recommended - redirects to home page',
3347 'url': 'https://www.youtube.com/feed/recommended',
3348 'only_matching': True,
3349 }, {
3350 'note': 'inline playlist with not always working continuations',
3351 'url': 'https://www.youtube.com/watch?v=UC6u0Tct-Fo&list=PL36D642111D65BE7C',
3352 'only_matching': True,
3353 }, {
3354 'url': 'https://www.youtube.com/course?list=ECUl4u3cNGP61MdtwGTqZA0MreSaDybji8',
3355 'only_matching': True,
3356 }, {
3357 'url': 'https://www.youtube.com/course',
3358 'only_matching': True,
3359 }, {
3360 'url': 'https://www.youtube.com/zsecurity',
3361 'only_matching': True,
3362 }, {
3363 'url': 'http://www.youtube.com/NASAgovVideo/videos',
3364 'only_matching': True,
3365 }, {
3366 'url': 'https://www.youtube.com/TheYoungTurks/live',
3367 'only_matching': True,
3368 }, {
3369 'url': 'https://www.youtube.com/hashtag/cctv9',
3370 'info_dict': {
3371 'id': 'cctv9',
3372 'title': '#cctv9',
3373 },
3374 'playlist_mincount': 350,
3375 }, {
3376 'url': 'https://www.youtube.com/watch?list=PLW4dVinRY435CBE_JD3t-0SRXKfnZHS1P&feature=youtu.be&v=M9cJMXmQ_ZU',
3377 'only_matching': True,
3378 }, {
3379 'note': 'Requires Premium: should request additional YTM-info webpage (and have format 141) for videos in playlist',
3380 'url': 'https://music.youtube.com/playlist?list=PLRBp0Fe2GpgmgoscNFLxNyBVSFVdYmFkq',
3381 'only_matching': True
3382 }, {
3383 'note': '/browse/ should redirect to /channel/',
3384 'url': 'https://music.youtube.com/browse/UC1a8OFewdjuLq6KlF8M_8Ng',
3385 'only_matching': True
3386 }, {
3387 'note': 'VLPL, should redirect to playlist?list=PL...',
3388 'url': 'https://music.youtube.com/browse/VLPLRBp0Fe2GpgmgoscNFLxNyBVSFVdYmFkq',
3389 'info_dict': {
3390 'id': 'PLRBp0Fe2GpgmgoscNFLxNyBVSFVdYmFkq',
3391 'uploader': 'NoCopyrightSounds',
3392 'description': 'Providing you with copyright free / safe music for gaming, live streaming, studying and more!',
3393 'uploader_id': 'UC_aEa8K-EOJ3D6gOs7HcyNg',
3394 'title': 'NCS Releases',
3395 },
3396 'playlist_mincount': 166,
3397 }, {
3398 'note': 'Topic, should redirect to playlist?list=UU...',
3399 'url': 'https://music.youtube.com/browse/UC9ALqqC4aIeG5iDs7i90Bfw',
3400 'info_dict': {
3401 'id': 'UU9ALqqC4aIeG5iDs7i90Bfw',
3402 'uploader_id': 'UC9ALqqC4aIeG5iDs7i90Bfw',
3403 'title': 'Uploads from Royalty Free Music - Topic',
3404 'uploader': 'Royalty Free Music - Topic',
3405 },
3406 'expected_warnings': [
3407 'A channel/user page was given',
3408 'The URL does not have a videos tab',
3409 ],
3410 'playlist_mincount': 101,
3411 }, {
3412 'note': 'Topic without a UU playlist',
3413 'url': 'https://www.youtube.com/channel/UCtFRv9O2AHqOZjjynzrv-xg',
3414 'info_dict': {
3415 'id': 'UCtFRv9O2AHqOZjjynzrv-xg',
3416 'title': 'UCtFRv9O2AHqOZjjynzrv-xg',
3417 },
3418 'expected_warnings': [
3419 'A channel/user page was given',
3420 'The URL does not have a videos tab',
3421 'Falling back to channel URL',
3422 ],
3423 'playlist_mincount': 9,
3424 }, {
3425 'note': 'Youtube music Album',
3426 'url': 'https://music.youtube.com/browse/MPREb_gTAcphH99wE',
3427 'info_dict': {
3428 'id': 'OLAK5uy_l1m0thk3g31NmIIz_vMIbWtyv7eZixlH0',
3429 'title': 'Album - Royalty Free Music Library V2 (50 Songs)',
3430 },
3431 'playlist_count': 50,
3432 }]
3433
3434 @classmethod
3435 def suitable(cls, url):
3436 return False if YoutubeIE.suitable(url) else super(
3437 YoutubeTabIE, cls).suitable(url)
3438
3439 def _extract_channel_id(self, webpage):
3440 channel_id = self._html_search_meta(
3441 'channelId', webpage, 'channel id', default=None)
3442 if channel_id:
3443 return channel_id
3444 channel_url = self._html_search_meta(
3445 ('og:url', 'al:ios:url', 'al:android:url', 'al:web:url',
3446 'twitter:url', 'twitter:app:url:iphone', 'twitter:app:url:ipad',
3447 'twitter:app:url:googleplay'), webpage, 'channel url')
3448 return self._search_regex(
3449 r'https?://(?:www\.)?youtube\.com/channel/([^/?#&])+',
3450 channel_url, 'channel id')
3451
3452 @staticmethod
3453 def _extract_basic_item_renderer(item):
3454 # Modified from _extract_grid_item_renderer
3455 known_basic_renderers = (
3456 'playlistRenderer', 'videoRenderer', 'channelRenderer', 'showRenderer'
3457 )
3458 for key, renderer in item.items():
3459 if not isinstance(renderer, dict):
3460 continue
3461 elif key in known_basic_renderers:
3462 return renderer
3463 elif key.startswith('grid') and key.endswith('Renderer'):
3464 return renderer
3465
    def _grid_entries(self, grid_renderer):
        """Yield entries (playlists, videos, channels or generic URLs) from a grid renderer."""
        for item in grid_renderer['items']:
            if not isinstance(item, dict):
                continue
            renderer = self._extract_basic_item_renderer(item)
            if not isinstance(renderer, dict):
                continue
            title = try_get(
                renderer, (lambda x: x['title']['runs'][0]['text'],
                           lambda x: x['title']['simpleText']), compat_str)
            # playlist
            playlist_id = renderer.get('playlistId')
            if playlist_id:
                yield self.url_result(
                    'https://www.youtube.com/playlist?list=%s' % playlist_id,
                    ie=YoutubeTabIE.ie_key(), video_id=playlist_id,
                    video_title=title)
                continue
            # video
            video_id = renderer.get('videoId')
            if video_id:
                yield self._extract_video(renderer)
                continue
            # channel
            channel_id = renderer.get('channelId')
            if channel_id:
                title = try_get(
                    renderer, lambda x: x['title']['simpleText'], compat_str)
                yield self.url_result(
                    'https://www.youtube.com/channel/%s' % channel_id,
                    ie=YoutubeTabIE.ie_key(), video_title=title)
                continue
            # generic endpoint URL support
            ep_url = urljoin('https://www.youtube.com/', try_get(
                renderer, lambda x: x['navigationEndpoint']['commandMetadata']['webCommandMetadata']['url'],
                compat_str))
            if ep_url:
                # delegate to the most specific extractor that accepts the URL
                for ie in (YoutubeTabIE, YoutubePlaylistIE, YoutubeIE):
                    if ie.suitable(ep_url):
                        yield self.url_result(
                            ep_url, ie=ie.ie_key(), video_id=ie._match_id(ep_url), video_title=title)
                        break
3508
3509 def _shelf_entries_from_content(self, shelf_renderer):
3510 content = shelf_renderer.get('content')
3511 if not isinstance(content, dict):
3512 return
3513 renderer = content.get('gridRenderer') or content.get('expandedShelfContentsRenderer')
3514 if renderer:
3515 # TODO: add support for nested playlists so each shelf is processed
3516 # as separate playlist
3517 # TODO: this includes only first N items
3518 for entry in self._grid_entries(renderer):
3519 yield entry
3520 renderer = content.get('horizontalListRenderer')
3521 if renderer:
3522 # TODO
3523 pass
3524
3525 def _shelf_entries(self, shelf_renderer, skip_channels=False):
3526 ep = try_get(
3527 shelf_renderer, lambda x: x['endpoint']['commandMetadata']['webCommandMetadata']['url'],
3528 compat_str)
3529 shelf_url = urljoin('https://www.youtube.com', ep)
3530 if shelf_url:
3531 # Skipping links to another channels, note that checking for
3532 # endpoint.commandMetadata.webCommandMetadata.webPageTypwebPageType == WEB_PAGE_TYPE_CHANNEL
3533 # will not work
3534 if skip_channels and '/channels?' in shelf_url:
3535 return
3536 title = try_get(
3537 shelf_renderer, lambda x: x['title']['runs'][0]['text'], compat_str)
3538 yield self.url_result(shelf_url, video_title=title)
3539 # Shelf may not contain shelf URL, fallback to extraction from content
3540 for entry in self._shelf_entries_from_content(shelf_renderer):
3541 yield entry
3542
3543 def _playlist_entries(self, video_list_renderer):
3544 for content in video_list_renderer['contents']:
3545 if not isinstance(content, dict):
3546 continue
3547 renderer = content.get('playlistVideoRenderer') or content.get('playlistPanelVideoRenderer')
3548 if not isinstance(renderer, dict):
3549 continue
3550 video_id = renderer.get('videoId')
3551 if not video_id:
3552 continue
3553 yield self._extract_video(renderer)
3554
3555 def _rich_entries(self, rich_grid_renderer):
3556 renderer = try_get(
3557 rich_grid_renderer, lambda x: x['content']['videoRenderer'], dict) or {}
3558 video_id = renderer.get('videoId')
3559 if not video_id:
3560 return
3561 yield self._extract_video(renderer)
3562
3563 def _video_entry(self, video_renderer):
3564 video_id = video_renderer.get('videoId')
3565 if video_id:
3566 return self._extract_video(video_renderer)
3567
    def _post_thread_entries(self, post_thread_renderer):
        """Yield entries of a community post: attached video, attached playlist,
        then any inline video links in the post text."""
        post_renderer = try_get(
            post_thread_renderer, lambda x: x['post']['backstagePostRenderer'], dict)
        if not post_renderer:
            return
        # video attachment
        video_renderer = try_get(
            post_renderer, lambda x: x['backstageAttachment']['videoRenderer'], dict) or {}
        video_id = video_renderer.get('videoId')
        if video_id:
            entry = self._extract_video(video_renderer)
            if entry:
                yield entry
        # playlist attachment
        playlist_id = try_get(
            post_renderer, lambda x: x['backstageAttachment']['playlistRenderer']['playlistId'], compat_str)
        if playlist_id:
            yield self.url_result(
                'https://www.youtube.com/playlist?list=%s' % playlist_id,
                ie=YoutubeTabIE.ie_key(), video_id=playlist_id)
        # inline video links
        runs = try_get(post_renderer, lambda x: x['contentText']['runs'], list) or []
        for run in runs:
            if not isinstance(run, dict):
                continue
            ep_url = try_get(
                run, lambda x: x['navigationEndpoint']['urlEndpoint']['url'], compat_str)
            if not ep_url:
                continue
            if not YoutubeIE.suitable(ep_url):
                continue
            ep_video_id = YoutubeIE._match_id(ep_url)
            # skip links that merely point at the attached video itself
            if video_id == ep_video_id:
                continue
            yield self.url_result(ep_url, ie=YoutubeIE.ie_key(), video_id=ep_video_id)
3603
3604 def _post_thread_continuation_entries(self, post_thread_continuation):
3605 contents = post_thread_continuation.get('contents')
3606 if not isinstance(contents, list):
3607 return
3608 for content in contents:
3609 renderer = content.get('backstagePostThreadRenderer')
3610 if not isinstance(renderer, dict):
3611 continue
3612 for entry in self._post_thread_entries(renderer):
3613 yield entry
3614
    r''' # unused
    def _rich_grid_entries(self, contents):
        for content in contents:
            video_renderer = try_get(content, lambda x: x['richItemRenderer']['content']['videoRenderer'], dict)
            if video_renderer:
                entry = self._video_entry(video_renderer)
                if entry:
                    yield entry
    '''
    def _entries(self, tab, item_id, identity_token, account_syncid, ytcfg):
        """Yield every entry of the given tab, transparently following API
        continuations until exhausted.

        identity_token/account_syncid/ytcfg are forwarded to the API request
        headers for continuation requests.
        """

        def extract_entries(parent_renderer):  # this needs to be called again for continuation to work with feeds
            contents = try_get(parent_renderer, lambda x: x['contents'], list) or []
            for content in contents:
                if not isinstance(content, dict):
                    continue
                is_renderer = try_get(content, lambda x: x['itemSectionRenderer'], dict)
                if not is_renderer:
                    renderer = content.get('richItemRenderer')
                    if renderer:
                        for entry in self._rich_entries(renderer):
                            yield entry
                        continuation_list[0] = self._extract_continuation(parent_renderer)
                    continue
                isr_contents = try_get(is_renderer, lambda x: x['contents'], list) or []
                for isr_content in isr_contents:
                    if not isinstance(isr_content, dict):
                        continue

                    known_renderers = {
                        'playlistVideoListRenderer': self._playlist_entries,
                        'gridRenderer': self._grid_entries,
                        'shelfRenderer': lambda x: self._shelf_entries(x, tab.get('title') != 'Channels'),
                        'backstagePostThreadRenderer': self._post_thread_entries,
                        'videoRenderer': lambda x: [self._video_entry(x)],
                    }
                    for key, renderer in isr_content.items():
                        if key not in known_renderers:
                            continue
                        for entry in known_renderers[key](renderer):
                            if entry:
                                yield entry
                        continuation_list[0] = self._extract_continuation(renderer)
                        break

                    if not continuation_list[0]:
                        continuation_list[0] = self._extract_continuation(is_renderer)

                if not continuation_list[0]:
                    continuation_list[0] = self._extract_continuation(parent_renderer)

        continuation_list = [None]  # Python 2 does not support nonlocal
        tab_content = try_get(tab, lambda x: x['content'], dict)
        if not tab_content:
            return
        parent_renderer = (
            try_get(tab_content, lambda x: x['sectionListRenderer'], dict)
            or try_get(tab_content, lambda x: x['richGridRenderer'], dict) or {})
        for entry in extract_entries(parent_renderer):
            yield entry
        continuation = continuation_list[0]
        context = self._extract_context(ytcfg)
        visitor_data = try_get(context, lambda x: x['client']['visitorData'], compat_str)

        for page_num in itertools.count(1):
            if not continuation:
                break
            query = {
                'continuation': continuation['continuation'],
                'clickTracking': {'clickTrackingParams': continuation['itct']}
            }
            headers = self._generate_api_headers(ytcfg, identity_token, account_syncid, visitor_data)
            response = self._extract_response(
                item_id='%s page %s' % (item_id, page_num),
                query=query, headers=headers, ytcfg=ytcfg,
                check_get_keys=('continuationContents', 'onResponseReceivedActions', 'onResponseReceivedEndpoints'))

            if not response:
                break
            # carry visitor data across requests so pagination stays consistent
            visitor_data = try_get(
                response, lambda x: x['responseContext']['visitorData'], compat_str) or visitor_data

            known_continuation_renderers = {
                'playlistVideoListContinuation': self._playlist_entries,
                'gridContinuation': self._grid_entries,
                'itemSectionContinuation': self._post_thread_continuation_entries,
                'sectionListContinuation': extract_entries,  # for feeds
            }
            continuation_contents = try_get(
                response, lambda x: x['continuationContents'], dict) or {}
            continuation_renderer = None
            for key, value in continuation_contents.items():
                if key not in known_continuation_renderers:
                    continue
                continuation_renderer = value
                continuation_list = [None]
                for entry in known_continuation_renderers[key](continuation_renderer):
                    yield entry
                continuation = continuation_list[0] or self._extract_continuation(continuation_renderer)
                break
            if continuation_renderer:
                continue

            known_renderers = {
                'gridPlaylistRenderer': (self._grid_entries, 'items'),
                'gridVideoRenderer': (self._grid_entries, 'items'),
                'playlistVideoRenderer': (self._playlist_entries, 'contents'),
                'itemSectionRenderer': (extract_entries, 'contents'),  # for feeds
                'richItemRenderer': (extract_entries, 'contents'),  # for hashtag
                'backstagePostThreadRenderer': (self._post_thread_continuation_entries, 'contents')
            }
            on_response_received = dict_get(response, ('onResponseReceivedActions', 'onResponseReceivedEndpoints'))
            continuation_items = try_get(
                on_response_received, lambda x: x[0]['appendContinuationItemsAction']['continuationItems'], list)
            continuation_item = try_get(continuation_items, lambda x: x[0], dict) or {}
            video_items_renderer = None
            for key, value in continuation_item.items():
                if key not in known_renderers:
                    continue
                # wrap the flat item list so the regular renderer parsers accept it
                video_items_renderer = {known_renderers[key][1]: continuation_items}
                continuation_list = [None]
                for entry in known_renderers[key][0](video_items_renderer):
                    yield entry
                continuation = continuation_list[0] or self._extract_continuation(video_items_renderer)
                break
            if video_items_renderer:
                continue
            break
3743
3744 @staticmethod
3745 def _extract_selected_tab(tabs):
3746 for tab in tabs:
3747 renderer = dict_get(tab, ('tabRenderer', 'expandableTabRenderer')) or {}
3748 if renderer.get('selected') is True:
3749 return renderer
3750 else:
3751 raise ExtractorError('Unable to find selected tab')
3752
3753 @staticmethod
3754 def _extract_uploader(data):
3755 uploader = {}
3756 sidebar_renderer = try_get(
3757 data, lambda x: x['sidebar']['playlistSidebarRenderer']['items'], list)
3758 if sidebar_renderer:
3759 for item in sidebar_renderer:
3760 if not isinstance(item, dict):
3761 continue
3762 renderer = item.get('playlistSidebarSecondaryInfoRenderer')
3763 if not isinstance(renderer, dict):
3764 continue
3765 owner = try_get(
3766 renderer, lambda x: x['videoOwner']['videoOwnerRenderer']['title']['runs'][0], dict)
3767 if owner:
3768 uploader['uploader'] = owner.get('text')
3769 uploader['uploader_id'] = try_get(
3770 owner, lambda x: x['navigationEndpoint']['browseEndpoint']['browseId'], compat_str)
3771 uploader['uploader_url'] = urljoin(
3772 'https://www.youtube.com/',
3773 try_get(owner, lambda x: x['navigationEndpoint']['browseEndpoint']['canonicalBaseUrl'], compat_str))
3774 return {k: v for k, v in uploader.items() if v is not None}
3775
    def _extract_from_tabs(self, item_id, webpage, data, tabs):
        """Build a playlist result from a tab page (channel, playlist or hashtag)."""
        playlist_id = title = description = channel_url = channel_name = channel_id = None
        thumbnails_list = tags = []

        selected_tab = self._extract_selected_tab(tabs)
        renderer = try_get(
            data, lambda x: x['metadata']['channelMetadataRenderer'], dict)
        if renderer:
            channel_name = renderer.get('title')
            channel_url = renderer.get('channelUrl')
            channel_id = renderer.get('externalId')
        else:
            renderer = try_get(
                data, lambda x: x['metadata']['playlistMetadataRenderer'], dict)

        if renderer:
            title = renderer.get('title')
            description = renderer.get('description', '')
            playlist_id = channel_id
            tags = renderer.get('keywords', '').split()
            thumbnails_list = (
                try_get(renderer, lambda x: x['avatar']['thumbnails'], list)
                or try_get(
                    data,
                    lambda x: x['sidebar']['playlistSidebarRenderer']['items'][0]['playlistSidebarPrimaryInfoRenderer']['thumbnailRenderer']['playlistVideoThumbnailRenderer']['thumbnail']['thumbnails'],
                    list)
                or [])

        thumbnails = []
        for t in thumbnails_list:
            if not isinstance(t, dict):
                continue
            thumbnail_url = url_or_none(t.get('url'))
            if not thumbnail_url:
                continue
            thumbnails.append({
                'url': thumbnail_url,
                'width': int_or_none(t.get('width')),
                'height': int_or_none(t.get('height')),
            })
        if playlist_id is None:
            playlist_id = item_id
        if title is None:
            title = (
                try_get(data, lambda x: x['header']['hashtagHeaderRenderer']['hashtag']['simpleText'])
                or playlist_id)
        # qualify the title with the selected tab's name, e.g. "Channel - Videos"
        title += format_field(selected_tab, 'title', ' - %s')
        title += format_field(selected_tab, 'expandedText', ' - %s')

        metadata = {
            'playlist_id': playlist_id,
            'playlist_title': title,
            'playlist_description': description,
            'uploader': channel_name,
            'uploader_id': channel_id,
            'uploader_url': channel_url,
            'thumbnails': thumbnails,
            'tags': tags,
        }
        if not channel_id:
            # fall back to the sidebar owner info (playlist pages)
            metadata.update(self._extract_uploader(data))
        metadata.update({
            'channel': metadata['uploader'],
            'channel_id': metadata['uploader_id'],
            'channel_url': metadata['uploader_url']})
        return self.playlist_result(
            self._entries(
                selected_tab, playlist_id,
                self._extract_identity_token(webpage, item_id),
                self._extract_account_syncid(data),
                self._extract_ytcfg(item_id, webpage)),
            **metadata)
3848
    def _extract_mix_playlist(self, playlist, playlist_id, data, webpage):
        """Yield videos of an 'infinite' Mix playlist, requesting further
        watch-next pages until the panel repeats or runs out."""
        first_id = last_id = None
        ytcfg = self._extract_ytcfg(playlist_id, webpage)
        headers = self._generate_api_headers(
            ytcfg, account_syncid=self._extract_account_syncid(data),
            identity_token=self._extract_identity_token(webpage, item_id=playlist_id),
            visitor_data=try_get(self._extract_context(ytcfg), lambda x: x['client']['visitorData'], compat_str))
        for page_num in itertools.count(1):
            videos = list(self._playlist_entries(playlist))
            if not videos:
                return
            # resume right after the last video yielded from the previous page
            start = next((i for i, v in enumerate(videos) if v['id'] == last_id), -1) + 1
            if start >= len(videos):
                return
            for video in videos[start:]:
                if video['id'] == first_id:
                    self.to_screen('First video %s found again; Assuming end of Mix' % first_id)
                    return
                yield video
            first_id = first_id or videos[0]['id']
            last_id = videos[-1]['id']
            watch_endpoint = try_get(
                playlist, lambda x: x['contents'][-1]['playlistPanelVideoRenderer']['navigationEndpoint']['watchEndpoint'])
            query = {
                'playlistId': playlist_id,
                'videoId': watch_endpoint.get('videoId') or last_id,
                'index': watch_endpoint.get('index') or len(videos),
                'params': watch_endpoint.get('params') or 'OAE%3D'
            }
            response = self._extract_response(
                item_id='%s page %d' % (playlist_id, page_num),
                query=query,
                ep='next',
                headers=headers,
                check_get_keys='contents'
            )
            playlist = try_get(
                response, lambda x: x['contents']['twoColumnWatchNextResults']['playlist']['playlist'], dict)
3887
3888 def _extract_from_playlist(self, item_id, url, data, playlist, webpage):
3889 title = playlist.get('title') or try_get(
3890 data, lambda x: x['titleText']['simpleText'], compat_str)
3891 playlist_id = playlist.get('playlistId') or item_id
3892
3893 # Delegating everything except mix playlists to regular tab-based playlist URL
3894 playlist_url = urljoin(url, try_get(
3895 playlist, lambda x: x['endpoint']['commandMetadata']['webCommandMetadata']['url'],
3896 compat_str))
3897 if playlist_url and playlist_url != url:
3898 return self.url_result(
3899 playlist_url, ie=YoutubeTabIE.ie_key(), video_id=playlist_id,
3900 video_title=title)
3901
3902 return self.playlist_result(
3903 self._extract_mix_playlist(playlist, playlist_id, data, webpage),
3904 playlist_id=playlist_id, playlist_title=title)
3905
    def _reload_with_unavailable_videos(self, item_id, data, webpage):
        """
        Get playlist with unavailable videos if the 'show unavailable videos' button exists.
        """
        sidebar_renderer = try_get(
            data, lambda x: x['sidebar']['playlistSidebarRenderer']['items'], list)
        if not sidebar_renderer:
            return
        browse_id = params = None
        for item in sidebar_renderer:
            if not isinstance(item, dict):
                continue
            renderer = item.get('playlistSidebarPrimaryInfoRenderer')
            menu_renderer = try_get(
                renderer, lambda x: x['menu']['menuRenderer']['items'], list) or []
            for menu_item in menu_renderer:
                if not isinstance(menu_item, dict):
                    continue
                nav_item_renderer = menu_item.get('menuNavigationItemRenderer')
                text = try_get(
                    nav_item_renderer, lambda x: x['text']['simpleText'], compat_str)
                # the button is identified only by its visible (English) label
                if not text or text.lower() != 'show unavailable videos':
                    continue
                browse_endpoint = try_get(
                    nav_item_renderer, lambda x: x['navigationEndpoint']['browseEndpoint'], dict) or {}
                browse_id = browse_endpoint.get('browseId')
                params = browse_endpoint.get('params')
                break

        ytcfg = self._extract_ytcfg(item_id, webpage)
        headers = self._generate_api_headers(
            ytcfg, account_syncid=self._extract_account_syncid(ytcfg),
            identity_token=self._extract_identity_token(webpage, item_id=item_id),
            visitor_data=try_get(
                self._extract_context(ytcfg), lambda x: x['client']['visitorData'], compat_str))
        query = {
            # NOTE(review): the request is made with default params/browseId
            # even when the button was not found — confirm this is intended
            'params': params or 'wgYCCAA=',
            'browseId': browse_id or 'VL%s' % item_id
        }
        return self._extract_response(
            item_id=item_id, headers=headers, query=query,
            check_get_keys='contents', fatal=False,
            note='Downloading API JSON with unavailable videos')
3949
3950 def _extract_webpage(self, url, item_id):
3951 retries = self.get_param('extractor_retries', 3)
3952 count = -1
3953 last_error = 'Incomplete yt initial data recieved'
3954 while count < retries:
3955 count += 1
3956 # Sometimes youtube returns a webpage with incomplete ytInitialData
3957 # See: https://github.com/yt-dlp/yt-dlp/issues/116
3958 if count:
3959 self.report_warning('%s. Retrying ...' % last_error)
3960 webpage = self._download_webpage(
3961 url, item_id,
3962 'Downloading webpage%s' % (' (retry #%d)' % count if count else ''))
3963 data = self._extract_yt_initial_data(item_id, webpage)
3964 if data.get('contents') or data.get('currentVideoEndpoint'):
3965 break
3966 # Extract alerts here only when there is error
3967 self._extract_and_report_alerts(data)
3968 if count >= retries:
3969 raise ExtractorError(last_error)
3970 return webpage, data
3971
3972 @staticmethod
3973 def _smuggle_data(entries, data):
3974 for entry in entries:
3975 if data:
3976 entry['url'] = smuggle_url(entry['url'], data)
3977 yield entry
3978
    def _real_extract(self, url):
        """Unsmuggle the URL, run the real extraction and re-smuggle child entries."""
        url, smuggled_data = unsmuggle_url(url, {})
        if self.is_music_url(url):
            smuggled_data['is_music_url'] = True
        info_dict = self.__real_extract(url, smuggled_data)
        if info_dict.get('entries'):
            # propagate the smuggled flags onto every child entry URL
            info_dict['entries'] = self._smuggle_data(info_dict['entries'], smuggled_data)
        return info_dict

    # Splits a URL into pre/tab/post; the (?(channel_type)...) conditional
    # only allows a /tab component for channel-style URLs
    _url_re = re.compile(r'(?P<pre>%s)(?(channel_type)(?P<tab>/\w+))?(?P<post>.*)$' % _VALID_URL)
3989
    def __real_extract(self, url, smuggled_data):
        """Core extraction: normalize the URL, then dispatch to tab-, playlist-
        or single-video handling depending on what the page data contains."""
        item_id = self._match_id(url)
        url = compat_urlparse.urlunparse(
            compat_urlparse.urlparse(url)._replace(netloc='www.youtube.com'))
        compat_opts = self.get_param('compat_opts', [])

        def get_mobj(url):
            # missing optional groups become '' so string ops below are safe
            mobj = self._url_re.match(url).groupdict()
            mobj.update((k, '') for k, v in mobj.items() if v is None)
            return mobj

        mobj = get_mobj(url)
        # Youtube returns incomplete data if tabname is not lower case
        pre, tab, post, is_channel = mobj['pre'], mobj['tab'].lower(), mobj['post'], not mobj['not_channel']

        if is_channel:
            if smuggled_data.get('is_music_url'):
                if item_id[:2] == 'VL':
                    # Youtube music VL channels have an equivalent playlist
                    item_id = item_id[2:]
                    pre, tab, post, is_channel = 'https://www.youtube.com/playlist?list=%s' % item_id, '', '', False
                elif item_id[:2] == 'MP':
                    # Youtube music albums (/channel/MP...) have a OLAK playlist that can be extracted from the webpage
                    item_id = self._search_regex(
                        r'\\x22audioPlaylistId\\x22:\\x22([0-9A-Za-z_-]+)\\x22',
                        self._download_webpage('https://music.youtube.com/channel/%s' % item_id, item_id),
                        'playlist id')
                    pre, tab, post, is_channel = 'https://www.youtube.com/playlist?list=%s' % item_id, '', '', False
                elif mobj['channel_type'] == 'browse':
                    # Youtube music /browse/ should be changed to /channel/
                    pre = 'https://www.youtube.com/channel/%s' % item_id
        if is_channel and not tab and 'no-youtube-channel-redirect' not in compat_opts:
            # Home URLs should redirect to /videos/
            self.report_warning(
                'A channel/user page was given. All the channel\'s videos will be downloaded. '
                'To download only the videos in the home page, add a "/featured" to the URL')
            tab = '/videos'

        url = ''.join((pre, tab, post))
        mobj = get_mobj(url)

        # Handle both video/playlist URLs
        qs = parse_qs(url)
        video_id = qs.get('v', [None])[0]
        playlist_id = qs.get('list', [None])[0]

        if not video_id and mobj['not_channel'].startswith('watch'):
            if not playlist_id:
                # If there is neither video or playlist ids, youtube redirects to home page, which is undesirable
                raise ExtractorError('Unable to recognize tab page')
            # Common mistake: https://www.youtube.com/watch?list=playlist_id
            self.report_warning('A video URL was given without video ID. Trying to download playlist %s' % playlist_id)
            url = 'https://www.youtube.com/playlist?list=%s' % playlist_id
            mobj = get_mobj(url)

        if video_id and playlist_id:
            if self.get_param('noplaylist'):
                self.to_screen('Downloading just video %s because of --no-playlist' % video_id)
                return self.url_result(video_id, ie=YoutubeIE.ie_key(), video_id=video_id)
            self.to_screen('Downloading playlist %s; add --no-playlist to just download video %s' % (playlist_id, video_id))

        webpage, data = self._extract_webpage(url, item_id)

        tabs = try_get(
            data, lambda x: x['contents']['twoColumnBrowseResultsRenderer']['tabs'], list)
        if tabs:
            selected_tab = self._extract_selected_tab(tabs)
            tab_name = selected_tab.get('title', '')
            if 'no-youtube-channel-redirect' not in compat_opts:
                if mobj['tab'] == '/live':
                    # Live tab should have redirected to the video
                    raise ExtractorError('The channel is not currently live', expected=True)
                if mobj['tab'] == '/videos' and tab_name.lower() != mobj['tab'][1:]:
                    if not mobj['not_channel'] and item_id[:2] == 'UC':
                        # Topic channels don't have /videos. Use the equivalent playlist instead
                        self.report_warning('The URL does not have a %s tab. Trying to redirect to playlist UU%s instead' % (mobj['tab'][1:], item_id[2:]))
                        pl_id = 'UU%s' % item_id[2:]
                        pl_url = 'https://www.youtube.com/playlist?list=%s%s' % (pl_id, mobj['post'])
                        try:
                            pl_webpage, pl_data = self._extract_webpage(pl_url, pl_id)
                            for alert_type, alert_message in self._extract_alerts(pl_data):
                                if alert_type == 'error':
                                    raise ExtractorError('Youtube said: %s' % alert_message)
                            item_id, url, webpage, data = pl_id, pl_url, pl_webpage, pl_data
                        except ExtractorError:
                            self.report_warning('The playlist gave error. Falling back to channel URL')
                    else:
                        self.report_warning('The URL does not have a %s tab. %s is being downloaded instead' % (mobj['tab'][1:], tab_name))

        self.write_debug('Final URL: %s' % url)

        # YouTube sometimes provides a button to reload playlist with unavailable videos.
        if 'no-youtube-unavailable-videos' not in compat_opts:
            data = self._reload_with_unavailable_videos(item_id, data, webpage) or data
        self._extract_and_report_alerts(data)

        tabs = try_get(
            data, lambda x: x['contents']['twoColumnBrowseResultsRenderer']['tabs'], list)
        if tabs:
            return self._extract_from_tabs(item_id, webpage, data, tabs)

        playlist = try_get(
            data, lambda x: x['contents']['twoColumnWatchNextResults']['playlist']['playlist'], dict)
        if playlist:
            return self._extract_from_playlist(item_id, url, data, playlist, webpage)

        video_id = try_get(
            data, lambda x: x['currentVideoEndpoint']['watchEndpoint']['videoId'],
            compat_str) or video_id
        if video_id:
            if mobj['tab'] != '/live':  # live tab is expected to redirect to video
                self.report_warning('Unable to recognize playlist. Downloading just video %s' % video_id)
            return self.url_result(video_id, ie=YoutubeIE.ie_key(), video_id=video_id)

        raise ExtractorError('Unable to recognize tab page')
4105
4106
class YoutubePlaylistIE(InfoExtractor):
    """Matches bare playlist IDs and legacy playlist URLs; delegates to YoutubeTabIE."""
    IE_DESC = 'YouTube.com playlists'
    _VALID_URL = r'''(?x)(?:
                        (?:https?://)?
                        (?:\w+\.)?
                        (?:
                            (?:
                                youtube(?:kids)?\.com|
                                invidio\.us
                            )
                            /.*?\?.*?\blist=
                        )?
                        (?P<id>%(playlist_id)s)
                     )''' % {'playlist_id': YoutubeBaseInfoExtractor._PLAYLIST_ID_RE}
    IE_NAME = 'youtube:playlist'
    _TESTS = [{
        'note': 'issue #673',
        'url': 'PLBB231211A4F62143',
        'info_dict': {
            'title': '[OLD]Team Fortress 2 (Class-based LP)',
            'id': 'PLBB231211A4F62143',
            'uploader': 'Wickydoo',
            'uploader_id': 'UCKSpbfbl5kRQpTdL7kMc-1Q',
        },
        'playlist_mincount': 29,
    }, {
        'url': 'PLtPgu7CB4gbY9oDN3drwC3cMbJggS7dKl',
        'info_dict': {
            'title': 'YDL_safe_search',
            'id': 'PLtPgu7CB4gbY9oDN3drwC3cMbJggS7dKl',
        },
        'playlist_count': 2,
        'skip': 'This playlist is private',
    }, {
        'note': 'embedded',
        'url': 'https://www.youtube.com/embed/videoseries?list=PL6IaIsEjSbf96XFRuNccS_RuEXwNdsoEu',
        'playlist_count': 4,
        'info_dict': {
            'title': 'JODA15',
            'id': 'PL6IaIsEjSbf96XFRuNccS_RuEXwNdsoEu',
            'uploader': 'milan',
            'uploader_id': 'UCEI1-PVPcYXjB73Hfelbmaw',
        }
    }, {
        'url': 'http://www.youtube.com/embed/_xDOZElKyNU?list=PLsyOSbh5bs16vubvKePAQ1x3PhKavfBIl',
        'playlist_mincount': 982,
        'info_dict': {
            'title': '2018 Chinese New Singles (11/6 updated)',
            'id': 'PLsyOSbh5bs16vubvKePAQ1x3PhKavfBIl',
            'uploader': 'LBK',
            'uploader_id': 'UC21nz3_MesPLqtDqwdvnoxA',
        }
    }, {
        'url': 'TLGGrESM50VT6acwMjAyMjAxNw',
        'only_matching': True,
    }, {
        # music album playlist
        'url': 'OLAK5uy_m4xAFdmMC5rX3Ji3g93pQe3hqLZw_9LhM',
        'only_matching': True,
    }]

    @classmethod
    def suitable(cls, url):
        # Anything YoutubeTabIE accepts takes precedence over this extractor
        if YoutubeTabIE.suitable(url):
            return False
        # Hack for lazy extractors until more generic solution is implemented
        # (see #28780)
        from .youtube import parse_qs
        qs = parse_qs(url)
        if qs.get('v', [None])[0]:
            return False
        return super(YoutubePlaylistIE, cls).suitable(url)

    def _real_extract(self, url):
        """Normalize to a canonical playlist URL and delegate to YoutubeTabIE."""
        playlist_id = self._match_id(url)
        is_music_url = YoutubeBaseInfoExtractor.is_music_url(url)
        url = update_url_query(
            'https://www.youtube.com/playlist',
            parse_qs(url) or {'list': playlist_id})
        if is_music_url:
            url = smuggle_url(url, {'is_music_url': True})
        return self.url_result(url, ie=YoutubeTabIE.ie_key(), video_id=playlist_id)
4189
4190
class YoutubeYtBeIE(InfoExtractor):
    """Extractor for youtu.be short links that also carry a playlist id."""
    IE_DESC = 'youtu.be'
    _VALID_URL = r'https?://youtu\.be/(?P<id>[0-9A-Za-z_-]{11})/*?.*?\blist=(?P<playlist_id>%(playlist_id)s)' % {'playlist_id': YoutubeBaseInfoExtractor._PLAYLIST_ID_RE}
    _TESTS = [{
        'url': 'https://youtu.be/yeWKywCrFtk?list=PL2qgrgXsNUG5ig9cat4ohreBjYLAPC0J5',
        'info_dict': {
            'id': 'yeWKywCrFtk',
            'ext': 'mp4',
            'title': 'Small Scale Baler and Braiding Rugs',
            'uploader': 'Backus-Page House Museum',
            'uploader_id': 'backuspagemuseum',
            'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/backuspagemuseum',
            'upload_date': '20161008',
            'description': 'md5:800c0c78d5eb128500bffd4f0b4f2e8a',
            'categories': ['Nonprofits & Activism'],
            'tags': list,
            'like_count': int,
            'dislike_count': int,
        },
        'params': {
            'noplaylist': True,
            'skip_download': True,
        },
    }, {
        'url': 'https://youtu.be/uWyaPkt-VOI?list=PL9D9FC436B881BA21',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        # Pull both capture groups in one call
        video_id, playlist_id = re.match(self._VALID_URL, url).group('id', 'playlist_id')
        # Delegate to the tab extractor via an equivalent full watch URL
        watch_url = update_url_query('https://www.youtube.com/watch', {
            'v': video_id,
            'list': playlist_id,
            'feature': 'youtu.be',
        })
        return self.url_result(watch_url, ie=YoutubeTabIE.ie_key(), video_id=playlist_id)
4229
4230
class YoutubeYtUserIE(InfoExtractor):
    """Extractor for the "ytuser:<name>" shorthand; forwards to the user's channel tab."""
    IE_DESC = 'YouTube.com user videos, URL or "ytuser" keyword'
    _VALID_URL = r'ytuser:(?P<id>.+)'
    _TESTS = [{
        'url': 'ytuser:phihag',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        user_id = self._match_id(url)
        user_url = 'https://www.youtube.com/user/%s' % user_id
        return self.url_result(user_url, ie=YoutubeTabIE.ie_key(), video_id=user_id)
4244
4245
class YoutubeFavouritesIE(YoutubeBaseInfoExtractor):
    """Extractor for the logged-in user's liked videos (':ytfav' shorthand)."""
    IE_NAME = 'youtube:favorites'
    IE_DESC = 'YouTube.com liked videos, ":ytfav" for short (requires authentication)'
    _VALID_URL = r':ytfav(?:ou?rite)?s?'
    _LOGIN_REQUIRED = True
    _TESTS = [{
        'url': ':ytfav',
        'only_matching': True,
    }, {
        'url': ':ytfavorites',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        # Liked videos live in the auto-generated "LL" playlist
        liked_url = 'https://www.youtube.com/playlist?list=LL'
        return self.url_result(liked_url, ie=YoutubeTabIE.ie_key())
4263
4264
class YoutubeSearchIE(SearchInfoExtractor, YoutubeTabIE):
    """Extractor for the "ytsearch" keyword; pages through the InnerTube search API."""
    IE_DESC = 'YouTube.com searches, "ytsearch" keyword'
    # there doesn't appear to be a real limit, for example if you search for
    # 'python' you get more than 8.000.000 results
    _MAX_RESULTS = float('inf')
    IE_NAME = 'youtube:search'
    _SEARCH_KEY = 'ytsearch'
    # extra 'params' value sent with the search request (None = no filter);
    # subclasses override this to change result ordering/filtering
    _SEARCH_PARAMS = None
    _TESTS = []

    def _entries(self, query, n):
        """Yield up to n video results for query, fetching pages as needed.

        Stops early when the API returns no usable contents or no
        continuation token is found for the next page.
        """
        data = {'query': query}
        if self._SEARCH_PARAMS:
            data['params'] = self._SEARCH_PARAMS
        total = 0  # number of videos yielded so far, compared against n
        for page_num in itertools.count(1):
            search = self._extract_response(
                item_id='query "%s" page %s' % (query, page_num), ep='search', query=data,
                check_get_keys=('contents', 'onResponseReceivedCommands')
            )
            if not search:
                break
            # The first page answers under 'contents'; continuation pages
            # answer under 'onResponseReceivedCommands' — try both paths.
            slr_contents = try_get(
                search,
                (lambda x: x['contents']['twoColumnSearchResultsRenderer']['primaryContents']['sectionListRenderer']['contents'],
                 lambda x: x['onResponseReceivedCommands'][0]['appendContinuationItemsAction']['continuationItems']),
                list)
            if not slr_contents:
                break

            # Youtube sometimes adds promoted content to searches,
            # changing the index location of videos and token.
            # So we search through all entries till we find them.
            continuation_token = None
            for slr_content in slr_contents:
                # Remember the first continuation token encountered
                if continuation_token is None:
                    continuation_token = try_get(
                        slr_content,
                        lambda x: x['continuationItemRenderer']['continuationEndpoint']['continuationCommand']['token'],
                        compat_str)

                isr_contents = try_get(
                    slr_content,
                    lambda x: x['itemSectionRenderer']['contents'],
                    list)
                if not isr_contents:
                    continue
                for content in isr_contents:
                    if not isinstance(content, dict):
                        continue
                    # Skip non-video renderers (channels, playlists, ads, ...)
                    video = content.get('videoRenderer')
                    if not isinstance(video, dict):
                        continue
                    video_id = video.get('videoId')
                    if not video_id:
                        continue

                    yield self._extract_video(video)
                    total += 1
                    if total == n:
                        return

            if not continuation_token:
                break
            # Request the next page on the following iteration
            data['continuation'] = continuation_token

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""
        return self.playlist_result(self._entries(query, n), query)
4334
4335
class YoutubeSearchDateIE(YoutubeSearchIE):
    """Search extractor variant that orders results newest-first ("ytsearchdate")."""
    IE_NAME = YoutubeSearchIE.IE_NAME + ':date'
    _SEARCH_KEY = 'ytsearchdate'
    IE_DESC = 'YouTube.com searches, newest videos first, "ytsearchdate" keyword'
    # URL-encoded search filter ('CAI=') — presumably selects upload-date
    # ordering, matching IE_DESC; sent as 'params' by YoutubeSearchIE._entries
    _SEARCH_PARAMS = 'CAI%3D'
4341
4342
class YoutubeSearchURLIE(YoutubeSearchIE):
    """Extractor for youtube.com/results search URLs.

    Translates the URL's query string into a keyword search handled by
    YoutubeSearchIE._get_n_results.
    """
    IE_DESC = 'YouTube.com search URLs'
    IE_NAME = YoutubeSearchIE.IE_NAME + '_url'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/results\?(.*?&)?(?:search_query|q)=(?:[^&]+)(?:[&]|$)'
    # _MAX_RESULTS = 100
    _TESTS = [{
        'url': 'https://www.youtube.com/results?baz=bar&search_query=youtube-dl+test+video&filters=video&lclk=video',
        'playlist_mincount': 5,
        'info_dict': {
            'title': 'youtube-dl test video',
        }
    }, {
        'url': 'https://www.youtube.com/results?q=test&sp=EgQIBBgB',
        'only_matching': True,
    }]

    @classmethod
    def _make_valid_url(cls):
        return cls._VALID_URL

    def _real_extract(self, url):
        # Use the module-level parse_qs helper for consistency with the rest
        # of this file instead of re-spelling compat_parse_qs + urlparse here.
        qs = parse_qs(url)
        # _VALID_URL guarantees at least one of search_query/q is present
        query = (qs.get('search_query') or qs.get('q'))[0]
        # Optional 'sp' carries the search filter params; default to no filter
        self._SEARCH_PARAMS = qs.get('sp', ('',))[0]
        return self._get_n_results(query, self._MAX_RESULTS)
4368
4369
class YoutubeFeedsInfoExtractor(YoutubeTabIE):
    """
    Base class for feed extractors
    Subclasses must define the _FEED_NAME property.
    """
    _LOGIN_REQUIRED = True
    _TESTS = []

    @property
    def IE_NAME(self):
        # Derive the extractor name from the subclass's feed name
        return 'youtube:%s' % self._FEED_NAME

    def _real_extract(self, url):
        feed_url = 'https://www.youtube.com/feed/%s' % self._FEED_NAME
        return self.url_result(feed_url, ie=YoutubeTabIE.ie_key())
4386
4387
class YoutubeWatchLaterIE(InfoExtractor):
    """Extractor for the ':ytwatchlater' shorthand (authenticated watch-later list)."""
    IE_NAME = 'youtube:watchlater'
    IE_DESC = 'Youtube watch later list, ":ytwatchlater" for short (requires authentication)'
    _VALID_URL = r':ytwatchlater'
    _TESTS = [{
        'url': ':ytwatchlater',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        # Watch-later videos live in the auto-generated "WL" playlist
        watch_later_url = 'https://www.youtube.com/playlist?list=WL'
        return self.url_result(watch_later_url, ie=YoutubeTabIE.ie_key())
4400
4401
class YoutubeRecommendedIE(YoutubeFeedsInfoExtractor):
    """Feed extractor for the recommended-videos feed (':ytrec' / youtube.com homepage)."""
    IE_DESC = 'YouTube.com recommended videos, ":ytrec" for short (requires authentication)'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/?(?:[?#]|$)|:ytrec(?:ommended)?'
    _FEED_NAME = 'recommended'
    # Unlike other feeds, recommendations also work without logging in
    _LOGIN_REQUIRED = False
    _TESTS = [{
        'url': ':ytrec',
        'only_matching': True,
    }, {
        'url': ':ytrecommended',
        'only_matching': True,
    }, {
        'url': 'https://youtube.com',
        'only_matching': True,
    }]
4417
4418
class YoutubeSubscriptionsIE(YoutubeFeedsInfoExtractor):
    """Feed extractor for the subscriptions feed (':ytsubs' shorthand)."""
    IE_DESC = 'YouTube.com subscriptions feed, ":ytsubs" for short (requires authentication)'
    _VALID_URL = r':ytsub(?:scription)?s?'
    _FEED_NAME = 'subscriptions'
    _TESTS = [{
        'url': ':ytsubs',
        'only_matching': True,
    }, {
        'url': ':ytsubscriptions',
        'only_matching': True,
    }]
4430
4431
class YoutubeHistoryIE(YoutubeFeedsInfoExtractor):
    """Feed extractor for the watch-history feed (':ythis' shorthand)."""
    IE_DESC = 'Youtube watch history, ":ythis" for short (requires authentication)'
    _VALID_URL = r':ythis(?:tory)?'
    _FEED_NAME = 'history'
    _TESTS = [{
        'url': ':ythistory',
        'only_matching': True,
    }]
4440
4441
class YoutubeTruncatedURLIE(InfoExtractor):
    """Catches watch URLs whose 'v=' parameter was lost (usually an unquoted
    '&' in the shell) and fails with an explanatory message."""
    IE_NAME = 'youtube:truncated_url'
    IE_DESC = False  # Do not list
    _VALID_URL = r'''(?x)
        (?:https?://)?
        (?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie)?\.com/
        (?:watch\?(?:
            feature=[a-z_]+|
            annotation_id=annotation_[^&]+|
            x-yt-cl=[0-9]+|
            hl=[^&]*|
            t=[0-9]+
        )?
        |
        attribution_link\?a=[^&]+
        )
        $
    '''

    _TESTS = [{
        'url': 'https://www.youtube.com/watch?annotation_id=annotation_3951667041',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/watch?',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/watch?x-yt-cl=84503534',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/watch?feature=foo',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/watch?hl=en-GB',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/watch?t=2372',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        # Nothing extractable here; always fail with a shell-quoting hint
        hint = (
            'Did you forget to quote the URL? Remember that & is a meta '
            'character in most shells, so you want to put the URL in quotes, '
            'like youtube-dl '
            '"https://www.youtube.com/watch?feature=foo&v=BaW_jenozKc" '
            ' or simply youtube-dl BaW_jenozKc .')
        raise ExtractorError(hint, expected=True)
4489
4490
class YoutubeTruncatedIDIE(InfoExtractor):
    """Catches watch URLs whose video id is shorter than the required 11
    characters and fails with an explanatory message."""
    IE_NAME = 'youtube:truncated_id'
    IE_DESC = False  # Do not list
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/watch\?v=(?P<id>[0-9A-Za-z_-]{1,10})$'

    _TESTS = [{
        'url': 'https://www.youtube.com/watch?v=N_708QY7Ob',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        video_id = self._match_id(url)
        message = 'Incomplete YouTube ID %s. URL %s looks truncated.' % (video_id, url)
        raise ExtractorError(message, expected=True)