]> jfr.im git - yt-dlp.git/blame - yt_dlp/extractor/youtube.py
Update to ytdl-commit-4fb25ff
[yt-dlp.git] / yt_dlp / extractor / youtube.py
CommitLineData
c5e8d7af 1# coding: utf-8
c5e8d7af 2
78caa52a
PH
3from __future__ import unicode_literals
4
d92f5d5a 5import calendar
a5c56234 6import hashlib
0ca96d48 7import itertools
c5e8d7af 8import json
c4417ddb 9import os.path
d77ab8e2 10import random
c5e8d7af 11import re
8a784c74 12import time
e0df6211 13import traceback
c5e8d7af 14
b05654f0 15from .common import InfoExtractor, SearchInfoExtractor
4bb4a188 16from ..compat import (
edf3e38e 17 compat_chr,
29f7c58a 18 compat_HTTPError,
c5e8d7af 19 compat_parse_qs,
545cc85d 20 compat_str,
7fd002c0 21 compat_urllib_parse_unquote_plus,
15707c7e 22 compat_urllib_parse_urlencode,
7c80519c 23 compat_urllib_parse_urlparse,
7c61bd36 24 compat_urlparse,
4bb4a188 25)
545cc85d 26from ..jsinterp import JSInterpreter
4bb4a188 27from ..utils import (
c224251a 28 bool_or_none,
c5e8d7af 29 clean_html,
26fe8ffe 30 dict_get,
d92f5d5a 31 datetime_from_str,
c5e8d7af 32 ExtractorError,
b60419c5 33 format_field,
2d30521a 34 float_or_none,
dd27fd17 35 int_or_none,
94278f72 36 mimetype2ext,
6310acf5 37 parse_codecs,
7c80519c 38 parse_duration,
dca3ff4a 39 qualities,
3995d37d 40 remove_start,
cf7e015f 41 smuggle_url,
dbdaaa23 42 str_or_none,
c93d53f5 43 str_to_int,
556dbe7f 44 try_get,
c5e8d7af
PH
45 unescapeHTML,
46 unified_strdate,
cf7e015f 47 unsmuggle_url,
8bdd16b4 48 update_url_query,
21c340b8 49 url_or_none,
6e6bc8da 50 urlencode_postdata,
d92f5d5a 51 urljoin
c5e8d7af
PH
52)
53
5f6a1245 54
de7f3446 55class YoutubeBaseInfoExtractor(InfoExtractor):
b2e8bc1b
JMF
56 """Provide base functions for Youtube extractors"""
# Google account endpoints used by the sign-in flow in _login()
_LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
_TWOFACTOR_URL = 'https://accounts.google.com/signin/challenge'
_LOOKUP_URL = 'https://accounts.google.com/_/signin/sl/lookup'
_CHALLENGE_URL = 'https://accounts.google.com/_/signin/sl/challenge'
# {0} is filled in via str.format() with the TL value taken from the challenge response
_TFA_URL = 'https://accounts.google.com/_/signin/challenge?hl=en&TL={0}'

# Regex alternation of reserved names
# NOTE(review): presumably first path components that are not channel names - confirm against users
_RESERVED_NAMES = (
    r'channel|c|user|playlist|watch|w|v|embed|e|watch_popup|movies|results|'
    r'shared|hashtag|trending|feed|feeds|storefront|oops|index|account|'
    r'reporthistory|t/terms|about|upload|signin|logout'
)

# Machine name looked up in .netrc for credentials
_NETRC_MACHINE = 'youtube'

# If True it will raise an error if no login info is provided
_LOGIN_REQUIRED = False

# Matches playlist IDs: a known prefix followed by 10+ ID characters, or one of the fixed short IDs
_PLAYLIST_ID_RE = r'(?:(?:PL|LL|EC|UU|FL|RD|UL|TL|PU|OLAK5uy_)[0-9A-Za-z-_]{10,}|RDMM|WL|LL|LM)'
d0ba5587 74
25f14e9f
S
def _ids_to_results(self, ids):
    """Return a list with one url_result entry (extractor key 'Youtube') per video ID in ids."""
    results = []
    for video_id in ids:
        results.append(self.url_result(video_id, 'Youtube', video_id=video_id))
    return results
79
def _login(self):
    """
    Attempt to log in to YouTube.

    Returns True if the login succeeded or was skipped because no
    credentials were supplied, and False if any step of the flow failed.

    Raises ExtractorError if _LOGIN_REQUIRED is set and neither
    credentials nor a cookie file were provided.
    """
    username, password = self._get_login_info()
    # No authentication to be performed
    if username is None:
        if self._LOGIN_REQUIRED and self._downloader.params.get('cookiefile') is None:
            raise ExtractorError('No login info available, needed for using %s.' % self.IE_NAME, expected=True)
        # if self._downloader.params.get('cookiefile'):  # TODO remove 'and False' later - too many people using outdated cookies and open issues, remind them.
        #     self.to_screen('[Cookies] Reminder - Make sure to always use up to date cookies!')
        return True

    login_page = self._download_webpage(
        self._LOGIN_URL, None,
        note='Downloading login page',
        errnote='unable to fetch login page', fatal=False)
    if login_page is False:
        # Explicit False (not None) so every failure path agrees with the docstring
        return False

    login_form = self._hidden_inputs(login_page)

    # Continuation URL embedded in both the lookup and the challenge request
    service_login_url = 'https://accounts.google.com/ServiceLogin?passive=true&continue=https%3A%2F%2Fwww.youtube.com%2Fsignin%3Fnext%3D%252F%26action_handle_signin%3Dtrue%26hl%3Den%26app%3Ddesktop%26feature%3Dsign_in_button&hl=en&service=youtube&uilel=3&requestPath=%2FServiceLogin&Page=PasswordSeparationSignIn'

    def req(url, f_req, note, errnote):
        """POST the hidden login form fields plus f_req (JSON-encoded) to url and return the parsed JSON."""
        data = login_form.copy()
        data.update({
            'pstMsg': 1,
            'checkConnection': 'youtube',
            'checkedDomains': 'youtube',
            'hl': 'en',
            'deviceinfo': '[null,null,null,[],null,"US",null,null,[],"GlifWebSignIn",null,[null,null,[]]]',
            'f.req': json.dumps(f_req),
            'flowName': 'GlifWebSignIn',
            'flowEntry': 'ServiceLogin',
            # TODO: reverse actual botguard identifier generation algo
            'bgRequest': '["identifier",""]',
        })
        return self._download_json(
            url, None, note=note, errnote=errnote,
            # Drop everything that precedes the first '[' of the response
            transform_source=lambda s: re.sub(r'^[^[]*', '', s),
            fatal=False,
            data=urlencode_postdata(data), headers={
                'Content-Type': 'application/x-www-form-urlencoded;charset=utf-8',
                'Google-Accounts-XSRF': 1,
            })

    def warn(message):
        self._downloader.report_warning(message)

    lookup_req = [
        username,
        None, [], None, 'US', None, None, 2, False, True,
        [
            None, None,
            [2, 1, None, 1,
             service_login_url,
             None, [], 4],
            1, [None, None, []], None, None, None, True
        ],
        username,
    ]

    lookup_results = req(
        self._LOOKUP_URL, lookup_req,
        'Looking up account info', 'Unable to look up account info')

    if lookup_results is False:
        return False

    user_hash = try_get(lookup_results, lambda x: x[0][2], compat_str)
    if not user_hash:
        warn('Unable to extract user hash')
        return False

    challenge_req = [
        user_hash,
        None, 1, None, [1, None, None, None, [password, None, True]],
        [
            None, None, [2, 1, None, 1, service_login_url, None, [], 4],
            1, [None, None, []], None, None, None, True
        ]]

    challenge_results = req(
        self._CHALLENGE_URL, challenge_req,
        'Logging in', 'Unable to log in')

    if challenge_results is False:
        return False

    login_res = try_get(challenge_results, lambda x: x[0][5], list)
    if login_res:
        login_msg = try_get(login_res, lambda x: x[5], compat_str)
        # Parenthesised so the prefix is kept for every message, not only
        # for the INCORRECT_ANSWER_ENTERED case
        warn('Unable to login: %s' % (
            'Invalid password'
            if login_msg == 'INCORRECT_ANSWER_ENTERED' else login_msg))
        return False

    res = try_get(challenge_results, lambda x: x[0][-1], list)
    if not res:
        warn('Unable to extract result entry')
        return False

    login_challenge = try_get(res, lambda x: x[0][0], list)
    if login_challenge:
        challenge_str = try_get(login_challenge, lambda x: x[2], compat_str)
        if challenge_str == 'TWO_STEP_VERIFICATION':
            # SEND_SUCCESS - TFA code has been successfully sent to phone
            # QUOTA_EXCEEDED - reached the limit of TFA codes
            status = try_get(login_challenge, lambda x: x[5], compat_str)
            if status == 'QUOTA_EXCEEDED':
                warn('Exceeded the limit of TFA codes, try later')
                return False

            tl = try_get(challenge_results, lambda x: x[1][2], compat_str)
            if not tl:
                warn('Unable to extract TL')
                return False

            tfa_code = self._get_tfa_info('2-step verification code')

            if not tfa_code:
                warn(
                    'Two-factor authentication required. Provide it either interactively or with --twofactor <code> '
                    '(Note that only TOTP (Google Authenticator App) codes work at this time.)')
                return False

            tfa_code = remove_start(tfa_code, 'G-')

            tfa_req = [
                user_hash, None, 2, None,
                [
                    9, None, None, None, None, None, None, None,
                    [None, tfa_code, True, 2]
                ]]

            tfa_results = req(
                self._TFA_URL.format(tl), tfa_req,
                'Submitting TFA code', 'Unable to submit TFA code')

            if tfa_results is False:
                return False

            tfa_res = try_get(tfa_results, lambda x: x[0][5], list)
            if tfa_res:
                tfa_msg = try_get(tfa_res, lambda x: x[5], compat_str)
                # Same precedence fix as for the password warning above
                warn('Unable to finish TFA: %s' % (
                    'Invalid TFA code'
                    if tfa_msg == 'INCORRECT_ANSWER_ENTERED' else tfa_msg))
                return False

            check_cookie_url = try_get(
                tfa_results, lambda x: x[0][-1][2], compat_str)
        else:
            CHALLENGES = {
                'LOGIN_CHALLENGE': "This device isn't recognized. For your security, Google wants to make sure it's really you.",
                'USERNAME_RECOVERY': 'Please provide additional information to aid in the recovery process.',
                'REAUTH': "There is something unusual about your activity. For your security, Google wants to make sure it's really you.",
            }
            challenge = CHALLENGES.get(
                challenge_str,
                '%s returned error %s.' % (self.IE_NAME, challenge_str))
            warn('%s\nGo to https://accounts.google.com/, login and solve a challenge.' % challenge)
            return False
    else:
        check_cookie_url = try_get(res, lambda x: x[2], compat_str)

    if not check_cookie_url:
        warn('Unable to extract CheckCookie URL')
        return False

    check_cookie_results = self._download_webpage(
        check_cookie_url, None, 'Checking cookie', fatal=False)

    if check_cookie_results is False:
        return False

    # The login is only considered successful when this URL shows up in the CheckCookie page
    if 'https://myaccount.google.com/' not in check_cookie_results:
        warn('Unable to log in')
        return False

    return True
264
def _initialize_consent(self):
    """
    Set an accepting CONSENT cookie for .youtube.com unless one is unnecessary.

    Nothing is done when a '__Secure-3PSID' cookie is present or when the
    existing CONSENT cookie already contains 'YES'. The numeric ID of a
    'PENDING+<id>' consent cookie is reused; otherwise a random three-digit
    ID is generated.
    """
    jar = self._get_cookies('https://www.youtube.com/')
    if jar.get('__Secure-3PSID'):
        return

    pending_id = None
    consent_cookie = jar.get('CONSENT')
    if consent_cookie:
        if 'YES' in consent_cookie.value:
            return
        pending_id = self._search_regex(
            r'PENDING\+(\d+)', consent_cookie.value, 'consent', default=None)

    # random.randint is only reached when no pending ID could be reused
    consent_id = pending_id or random.randint(100, 999)
    self._set_cookie('.youtube.com', 'CONSENT', 'YES+cb.20210328-17-p0.en+FX+%s' % consent_id)
8d81f3e3 279
def _real_initialize(self):
    """Set up the consent cookie, then attempt a login when a downloader is attached."""
    self._initialize_consent()
    if self._downloader is not None:
        # The outcome of the login attempt is not acted upon here
        self._login()
c5e8d7af 286
# Web client version reported in both the API request body and the basic API headers
_YT_WEB_CLIENT_VERSION = '2.20210301.08.00'

# Base JSON payload that _call_api() copies and extends with the query
_DEFAULT_API_DATA = {'context': {'client': {
    'clientName': 'WEB',
    'clientVersion': _YT_WEB_CLIENT_VERSION,
}}}

_DEFAULT_BASIC_API_HEADERS = {
    'X-YouTube-Client-Name': '1',
    'X-YouTube-Client-Version': _YT_WEB_CLIENT_VERSION,
}

# Capture the JSON object assigned to ytInitialData / ytInitialPlayerResponse in a page
_YT_INITIAL_DATA_RE = r'(?:window\s*\[\s*["\']ytInitialData["\']\s*\]|ytInitialData)\s*=\s*({.+?})\s*;'
_YT_INITIAL_PLAYER_RESPONSE_RE = r'ytInitialPlayerResponse\s*=\s*({.+?})\s*;'
# Text expected right after such an assignment; appended to the patterns above to anchor the match
_YT_INITIAL_BOUNDARY_RE = r'(?:var\s+meta|</script|\n)'
a0566bbf 305
a5c56234
M
def _generate_sapisidhash_header(self):
    """
    Build the 'SAPISIDHASH <timestamp>_<sha1>' authorization value.

    The SHA-1 is taken over '<timestamp> <SAPISID cookie value> <origin>'.
    Returns None when the youtube.com cookies contain no SAPISID entry.
    """
    origin = "https://www.youtube.com"
    cookie = self._get_cookies('https://www.youtube.com').get('SAPISID')
    if cookie is None:
        return None
    time_now = round(time.time())
    payload = " ".join((str(time_now), cookie.value, origin))
    digest = hashlib.sha1(payload.encode("utf-8")).hexdigest()
    return "SAPISIDHASH %s_%s" % (time_now, digest)
313
def _call_api(self, ep, query, video_id, fatal=True, headers=None,
              note='Downloading API JSON', errnote='Unable to download API page'):
    """
    POST a JSON request to the youtubei/v1 endpoint ep and return the parsed reply.

    The request body is a copy of _DEFAULT_API_DATA updated with query.
    headers, when given, are sent along with a JSON content-type and, if a
    SAPISID cookie is available, the Authorization and X-Origin headers.
    The dict passed as headers is left unmodified.

    video_id, fatal, note and errnote are forwarded to _download_json.
    """
    data = self._DEFAULT_API_DATA.copy()
    data.update(query)
    # Build a fresh dict so the caller's headers are never mutated
    # (otherwise the auth headers would leak into the caller's object)
    request_headers = dict(headers or {})
    request_headers['content-type'] = 'application/json'
    auth = self._generate_sapisidhash_header()
    if auth is not None:
        request_headers.update({'Authorization': auth, 'X-Origin': 'https://www.youtube.com'})
    return self._download_json(
        'https://www.youtube.com/youtubei/v1/%s' % ep,
        video_id=video_id, fatal=fatal, note=note, errnote=errnote,
        data=json.dumps(data).encode('utf8'), headers=request_headers,
        query={'key': 'AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8'})
c54f4aad 328
def _extract_yt_initial_data(self, video_id, webpage):
    """
    Locate the ytInitialData JSON object in webpage and return it parsed.

    The pattern anchored by _YT_INITIAL_BOUNDARY_RE is listed first, with the
    bare _YT_INITIAL_DATA_RE as the second candidate.
    """
    anchored_re = r'%s\s*%s' % (self._YT_INITIAL_DATA_RE, self._YT_INITIAL_BOUNDARY_RE)
    candidates = (anchored_re, self._YT_INITIAL_DATA_RE)
    raw_json = self._search_regex(candidates, webpage, 'yt initial data')
    return self._parse_json(raw_json, video_id)
0c148415 335
a1c5d2ca
M
def _extract_identity_token(self, webpage, item_id):
    """
    Return the ID_TOKEN string for the page, or None when it cannot be found.

    The parsed ytcfg is consulted first; a regex over the raw webpage is the
    fallback.
    """
    ytcfg = self._extract_ytcfg(item_id, webpage)
    token = try_get(ytcfg, lambda cfg: cfg['ID_TOKEN'], compat_str) if ytcfg else None
    if token:
        return token
    return self._search_regex(
        r'\bID_TOKEN["\']\s*:\s*["\'](.+?)["\']', webpage,
        'identity token', default=None)
345
@staticmethod
def _extract_account_syncid(data):
    """Extract syncId required to download private playlists of secondary channels"""
    datasync_id = try_get(
        data,
        lambda x: x['responseContext']['mainAppWebResponseContext']['datasyncId'],
        compat_str) or ''
    parts = datasync_id.split("||")
    # datasyncid is of the form "channel_syncid||user_syncid" for secondary channel
    # and just "user_syncid||" for primary channel. We only want the channel_syncid
    if len(parts) < 2 or not parts[1]:
        return None
    return parts[0]
356
def _extract_ytcfg(self, video_id, webpage):
    """
    Parse the object passed to the first ytcfg.set(...) call found in webpage.

    When no such call is found, the string '{}' is parsed instead. Parsing
    is non-fatal (fatal=False is passed to _parse_json).
    """
    raw_cfg = self._search_regex(
        r'ytcfg\.set\s*\(\s*({.+?})\s*\)\s*;', webpage, 'ytcfg',
        default='{}')
    return self._parse_json(raw_cfg, video_id, fatal=False)
362
def _extract_video(self, renderer):
    """
    Convert a video renderer dict into a 'url'-type result for YoutubeIE.

    Title, description, duration, view count and uploader are read from the
    renderer; each is None when the corresponding field is absent.
    """
    def text_at(*getters):
        # Getters are tried via try_get; only compat_str values are accepted
        return try_get(renderer, getters, compat_str)

    video_id = renderer.get('videoId')

    # Strip all whitespace, then keep only the leading run of digits and commas
    raw_views = text_at(lambda r: r['viewCountText']['simpleText']) or ''
    leading_count = self._search_regex(
        r'^([\d,]+)', re.sub(r'\s', '', raw_views),
        'view count', default=None)

    length_text = text_at(lambda r: r['lengthText']['simpleText'])

    return {
        '_type': 'url',
        'ie_key': YoutubeIE.ie_key(),
        'id': video_id,
        'url': video_id,
        'title': text_at(
            lambda r: r['title']['runs'][0]['text'],
            lambda r: r['title']['simpleText']),
        'description': text_at(
            lambda r: r['descriptionSnippet']['runs'][0]['text']),
        'duration': parse_duration(length_text),
        'view_count': str_to_int(leading_count),
        'uploader': text_at(
            lambda r: r['ownerText']['runs'][0]['text'],
            lambda r: r['shortBylineText']['runs'][0]['text']),
    }
394
0c148415 395
360e1ca5 396class YoutubeIE(YoutubeBaseInfoExtractor):
78caa52a 397 IE_DESC = 'YouTube.com'
bc2ca1bb 398 _INVIDIOUS_SITES = (
399 # invidious-redirect websites
400 r'(?:www\.)?redirect\.invidious\.io',
401 r'(?:(?:www|dev)\.)?invidio\.us',
402 # Invidious instances taken from https://github.com/iv-org/documentation/blob/master/Invidious-Instances.md
403 r'(?:www\.)?invidious\.pussthecat\.org',
404 r'(?:www\.)?invidious\.048596\.xyz',
405 r'(?:www\.)?invidious\.zee\.li',
406 r'(?:www\.)?vid\.puffyan\.us',
407 r'(?:(?:www|au)\.)?ytprivate\.com',
408 r'(?:www\.)?invidious\.namazso\.eu',
409 r'(?:www\.)?invidious\.ethibox\.fr',
410 r'(?:www\.)?inv\.skyn3t\.in',
411 r'(?:www\.)?invidious\.himiko\.cloud',
412 r'(?:www\.)?w6ijuptxiku4xpnnaetxvnkc5vqcdu7mgns2u77qefoixi63vbvnpnqd\.onion',
413 r'(?:www\.)?kbjggqkzv65ivcqj6bumvp337z6264huv5kpkwuv6gu5yjiskvan7fad\.onion',
414 r'(?:www\.)?invidious\.3o7z6yfxhbw7n3za4rss6l434kmv55cgw2vuziwuigpwegswvwzqipyd\.onion',
415 r'(?:www\.)?grwp24hodrefzvjjuccrkw3mjq4tzhaaq32amf33dzpmuxe7ilepcmad\.onion',
416 # youtube-dl invidious instances list
417 r'(?:(?:www|no)\.)?invidiou\.sh',
418 r'(?:(?:www|fi)\.)?invidious\.snopyta\.org',
419 r'(?:www\.)?invidious\.kabi\.tk',
420 r'(?:www\.)?invidious\.13ad\.de',
421 r'(?:www\.)?invidious\.mastodon\.host',
422 r'(?:www\.)?invidious\.zapashcanon\.fr',
423 r'(?:www\.)?invidious\.kavin\.rocks',
424 r'(?:www\.)?invidious\.tube',
425 r'(?:www\.)?invidiou\.site',
426 r'(?:www\.)?invidious\.site',
427 r'(?:www\.)?invidious\.xyz',
428 r'(?:www\.)?invidious\.nixnet\.xyz',
429 r'(?:www\.)?invidious\.drycat\.fr',
430 r'(?:www\.)?tube\.poal\.co',
431 r'(?:www\.)?tube\.connect\.cafe',
432 r'(?:www\.)?vid\.wxzm\.sx',
433 r'(?:www\.)?vid\.mint\.lgbt',
434 r'(?:www\.)?yewtu\.be',
435 r'(?:www\.)?yt\.elukerio\.org',
436 r'(?:www\.)?yt\.lelux\.fi',
437 r'(?:www\.)?invidious\.ggc-project\.de',
438 r'(?:www\.)?yt\.maisputain\.ovh',
439 r'(?:www\.)?invidious\.toot\.koeln',
440 r'(?:www\.)?invidious\.fdn\.fr',
441 r'(?:www\.)?watch\.nettohikari\.com',
442 r'(?:www\.)?kgg2m7yk5aybusll\.onion',
443 r'(?:www\.)?qklhadlycap4cnod\.onion',
444 r'(?:www\.)?axqzx4s6s54s32yentfqojs3x5i7faxza6xo3ehd4bzzsg2ii4fv2iid\.onion',
445 r'(?:www\.)?c7hqkpkpemu6e7emz5b4vyz7idjgdvgaaa3dyimmeojqbgpea3xqjoid\.onion',
446 r'(?:www\.)?fz253lmuao3strwbfbmx46yu7acac2jz27iwtorgmbqlkurlclmancad\.onion',
447 r'(?:www\.)?invidious\.l4qlywnpwqsluw65ts7md3khrivpirse744un3x7mlskqauz5pyuzgqd\.onion',
448 r'(?:www\.)?owxfohz4kjyv25fvlqilyxast7inivgiktls3th44jhk3ej3i7ya\.b32\.i2p',
449 r'(?:www\.)?4l2dgddgsrkf2ous66i6seeyi6etzfgrue332grh2n7madpwopotugyd\.onion',
450 )
cb7dfeea 451 _VALID_URL = r"""(?x)^
c5e8d7af 452 (
edb53e2d 453 (?:https?://|//) # http(s):// or protocol-independent URL
bc2ca1bb 454 (?:(?:(?:(?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie|kids)?\.com|
455 (?:www\.)?deturl\.com/www\.youtube\.com|
456 (?:www\.)?pwnyoutube\.com|
457 (?:www\.)?hooktube\.com|
458 (?:www\.)?yourepeat\.com|
459 tube\.majestyc\.net|
460 %(invidious)s|
461 youtube\.googleapis\.com)/ # the various hostnames, with wildcard subdomains
c5e8d7af
PH
462 (?:.*?\#/)? # handle anchor (#/) redirect urls
463 (?: # the various things that can precede the ID:
ac7553d0 464 (?:(?:v|embed|e)/(?!videoseries)) # v/ or embed/ or e/
c5e8d7af 465 |(?: # or the v= param in all its forms
f7000f3a 466 (?:(?:watch|movie)(?:_popup)?(?:\.php)?/?)? # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
c5e8d7af 467 (?:\?|\#!?) # the params delimiter ? or # or #!
040ac686 468 (?:.*?[&;])?? # any other preceding param (like /?s=tuff&v=xxxx or ?s=tuff&amp;v=V36LpHqtcDY)
c5e8d7af
PH
469 v=
470 )
f4b05232 471 ))
cbaed4bb
S
472 |(?:
473 youtu\.be| # just youtu.be/xxxx
6d4fc66b
S
474 vid\.plus| # or vid.plus/xxxx
475 zwearz\.com/watch| # or zwearz.com/watch/xxxx
bc2ca1bb 476 %(invidious)s
cbaed4bb 477 )/
edb53e2d 478 |(?:www\.)?cleanvideosearch\.com/media/action/yt/watch\?videoId=
f4b05232 479 )
c5e8d7af 480 )? # all until now is optional -> you can pass the naked ID
8bdd16b4 481 (?P<id>[0-9A-Za-z_-]{11}) # here is it! the YouTube video ID
d0ba5587
S
482 (?!.*?\blist=
483 (?:
484 %(playlist_id)s| # combined list/video URLs are handled by the playlist IE
485 WL # WL are handled by the watch later IE
486 )
487 )
c5e8d7af 488 (?(1).+)? # if we found the ID, everything can follow
bc2ca1bb 489 $""" % {
490 'playlist_id': YoutubeBaseInfoExtractor._PLAYLIST_ID_RE,
491 'invidious': '|'.join(_INVIDIOUS_SITES),
492 }
e40c758c 493 _PLAYER_INFO_RE = (
cc2db878 494 r'/s/player/(?P<id>[a-zA-Z0-9_-]{8,})/player',
495 r'/(?P<id>[a-zA-Z0-9_-]{8,})/player(?:_ias\.vflset(?:/[a-zA-Z]{2,3}_[a-zA-Z]{2,3})?|-plasma-ias-(?:phone|tablet)-[a-z]{2}_[A-Z]{2}\.vflset)/base\.js$',
545cc85d 496 r'\b(?P<id>vfl[a-zA-Z0-9_-]+)\b.*?\.js$',
e40c758c 497 )
2c62dc26 498 _formats = {
c2d3cb4c 499 '5': {'ext': 'flv', 'width': 400, 'height': 240, 'acodec': 'mp3', 'abr': 64, 'vcodec': 'h263'},
500 '6': {'ext': 'flv', 'width': 450, 'height': 270, 'acodec': 'mp3', 'abr': 64, 'vcodec': 'h263'},
501 '13': {'ext': '3gp', 'acodec': 'aac', 'vcodec': 'mp4v'},
502 '17': {'ext': '3gp', 'width': 176, 'height': 144, 'acodec': 'aac', 'abr': 24, 'vcodec': 'mp4v'},
503 '18': {'ext': 'mp4', 'width': 640, 'height': 360, 'acodec': 'aac', 'abr': 96, 'vcodec': 'h264'},
504 '22': {'ext': 'mp4', 'width': 1280, 'height': 720, 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264'},
505 '34': {'ext': 'flv', 'width': 640, 'height': 360, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'},
506 '35': {'ext': 'flv', 'width': 854, 'height': 480, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'},
3834d3e3 507 # itag 36 videos are either 320x180 (BaW_jenozKc) or 320x240 (__2ABJjxzNo), abr varies as well
c2d3cb4c 508 '36': {'ext': '3gp', 'width': 320, 'acodec': 'aac', 'vcodec': 'mp4v'},
509 '37': {'ext': 'mp4', 'width': 1920, 'height': 1080, 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264'},
510 '38': {'ext': 'mp4', 'width': 4096, 'height': 3072, 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264'},
511 '43': {'ext': 'webm', 'width': 640, 'height': 360, 'acodec': 'vorbis', 'abr': 128, 'vcodec': 'vp8'},
512 '44': {'ext': 'webm', 'width': 854, 'height': 480, 'acodec': 'vorbis', 'abr': 128, 'vcodec': 'vp8'},
513 '45': {'ext': 'webm', 'width': 1280, 'height': 720, 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8'},
e1a0bfdf 514 '46': {'ext': 'webm', 'width': 1920, 'height': 1080, 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8'},
c2d3cb4c 515 '59': {'ext': 'mp4', 'width': 854, 'height': 480, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'},
516 '78': {'ext': 'mp4', 'width': 854, 'height': 480, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'},
e1a0bfdf 517
518
519 # 3D videos
c2d3cb4c 520 '82': {'ext': 'mp4', 'height': 360, 'format_note': '3D', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -20},
521 '83': {'ext': 'mp4', 'height': 480, 'format_note': '3D', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -20},
522 '84': {'ext': 'mp4', 'height': 720, 'format_note': '3D', 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264', 'preference': -20},
523 '85': {'ext': 'mp4', 'height': 1080, 'format_note': '3D', 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264', 'preference': -20},
e1a0bfdf 524 '100': {'ext': 'webm', 'height': 360, 'format_note': '3D', 'acodec': 'vorbis', 'abr': 128, 'vcodec': 'vp8', 'preference': -20},
525 '101': {'ext': 'webm', 'height': 480, 'format_note': '3D', 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8', 'preference': -20},
526 '102': {'ext': 'webm', 'height': 720, 'format_note': '3D', 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8', 'preference': -20},
836a086c 527
96fb5605 528 # Apple HTTP Live Streaming
11f12195 529 '91': {'ext': 'mp4', 'height': 144, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 48, 'vcodec': 'h264', 'preference': -10},
c2d3cb4c 530 '92': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 48, 'vcodec': 'h264', 'preference': -10},
531 '93': {'ext': 'mp4', 'height': 360, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -10},
532 '94': {'ext': 'mp4', 'height': 480, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -10},
533 '95': {'ext': 'mp4', 'height': 720, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 256, 'vcodec': 'h264', 'preference': -10},
534 '96': {'ext': 'mp4', 'height': 1080, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 256, 'vcodec': 'h264', 'preference': -10},
e1a0bfdf 535 '132': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 48, 'vcodec': 'h264', 'preference': -10},
536 '151': {'ext': 'mp4', 'height': 72, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 24, 'vcodec': 'h264', 'preference': -10},
2c62dc26
PH
537
538 # DASH mp4 video
d23028a8
S
539 '133': {'ext': 'mp4', 'height': 240, 'format_note': 'DASH video', 'vcodec': 'h264'},
540 '134': {'ext': 'mp4', 'height': 360, 'format_note': 'DASH video', 'vcodec': 'h264'},
541 '135': {'ext': 'mp4', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'h264'},
542 '136': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'h264'},
543 '137': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'h264'},
067aa17e 544 '138': {'ext': 'mp4', 'format_note': 'DASH video', 'vcodec': 'h264'}, # Height can vary (https://github.com/ytdl-org/youtube-dl/issues/4559)
d23028a8
S
545 '160': {'ext': 'mp4', 'height': 144, 'format_note': 'DASH video', 'vcodec': 'h264'},
546 '212': {'ext': 'mp4', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'h264'},
547 '264': {'ext': 'mp4', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'h264'},
548 '298': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'h264', 'fps': 60},
549 '299': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'h264', 'fps': 60},
550 '266': {'ext': 'mp4', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'h264'},
836a086c 551
f6f1fc92 552 # Dash mp4 audio
d23028a8
S
553 '139': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 48, 'container': 'm4a_dash'},
554 '140': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 128, 'container': 'm4a_dash'},
555 '141': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 256, 'container': 'm4a_dash'},
556 '256': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'container': 'm4a_dash'},
557 '258': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'container': 'm4a_dash'},
558 '325': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'dtse', 'container': 'm4a_dash'},
559 '328': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'ec-3', 'container': 'm4a_dash'},
836a086c
AZ
560
561 # Dash webm
d23028a8
S
562 '167': {'ext': 'webm', 'height': 360, 'width': 640, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
563 '168': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
564 '169': {'ext': 'webm', 'height': 720, 'width': 1280, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
565 '170': {'ext': 'webm', 'height': 1080, 'width': 1920, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
566 '218': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
567 '219': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
568 '278': {'ext': 'webm', 'height': 144, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp9'},
569 '242': {'ext': 'webm', 'height': 240, 'format_note': 'DASH video', 'vcodec': 'vp9'},
570 '243': {'ext': 'webm', 'height': 360, 'format_note': 'DASH video', 'vcodec': 'vp9'},
571 '244': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'vp9'},
572 '245': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'vp9'},
573 '246': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'vp9'},
574 '247': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'vp9'},
575 '248': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'vp9'},
576 '271': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'vp9'},
4c6b4764 577 # itag 272 videos are either 3840x2160 (e.g. RtoitU2A-3E) or 7680x4320 (sLprVF6d7Ug)
d23028a8
S
578 '272': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'vp9'},
579 '302': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60},
580 '303': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60},
581 '308': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60},
582 '313': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'vp9'},
583 '315': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60},
2c62dc26
PH
584
585 # Dash webm audio
d23028a8
S
586 '171': {'ext': 'webm', 'acodec': 'vorbis', 'format_note': 'DASH audio', 'abr': 128},
587 '172': {'ext': 'webm', 'acodec': 'vorbis', 'format_note': 'DASH audio', 'abr': 256},
ce6b9a2d 588
0857baad 589 # Dash webm audio with opus inside
d23028a8
S
590 '249': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 50},
591 '250': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 70},
592 '251': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 160},
0857baad 593
ce6b9a2d
PH
594 # RTMP (unnamed)
595 '_rtmp': {'protocol': 'rtmp'},
b85eae0f
S
596
597 # av01 video only formats sometimes served with "unknown" codecs
598 '394': {'acodec': 'none', 'vcodec': 'av01.0.05M.08'},
599 '395': {'acodec': 'none', 'vcodec': 'av01.0.05M.08'},
600 '396': {'acodec': 'none', 'vcodec': 'av01.0.05M.08'},
601 '397': {'acodec': 'none', 'vcodec': 'av01.0.05M.08'},
c5e8d7af 602 }
29f7c58a 603 _SUBTITLE_FORMATS = ('json3', 'srv1', 'srv2', 'srv3', 'ttml', 'vtt')
836a086c 604
fd5c4aab
S
605 _GEO_BYPASS = False
606
78caa52a 607 IE_NAME = 'youtube'
2eb88d95
PH
608 _TESTS = [
609 {
2d3d2997 610 'url': 'https://www.youtube.com/watch?v=BaW_jenozKc&t=1s&end=9',
4bc3a23e
PH
611 'info_dict': {
612 'id': 'BaW_jenozKc',
613 'ext': 'mp4',
3867038a 614 'title': 'youtube-dl test video "\'/\\ä↭𝕐',
4bc3a23e
PH
615 'uploader': 'Philipp Hagemeister',
616 'uploader_id': 'phihag',
ec85ded8 617 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/phihag',
dd4c4492
S
618 'channel_id': 'UCLqxVugv74EIW3VWh2NOa3Q',
619 'channel_url': r're:https?://(?:www\.)?youtube\.com/channel/UCLqxVugv74EIW3VWh2NOa3Q',
4bc3a23e 620 'upload_date': '20121002',
3867038a 621 'description': 'test chars: "\'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de .',
4bc3a23e 622 'categories': ['Science & Technology'],
3867038a 623 'tags': ['youtube-dl'],
556dbe7f 624 'duration': 10,
dbdaaa23 625 'view_count': int,
3e7c1224
PH
626 'like_count': int,
627 'dislike_count': int,
7c80519c 628 'start_time': 1,
297a564b 629 'end_time': 9,
2eb88d95 630 }
0e853ca4 631 },
fccd3771 632 {
4bc3a23e
PH
633 'url': '//www.YouTube.com/watch?v=yZIXLfi8CZQ',
634 'note': 'Embed-only video (#1746)',
635 'info_dict': {
636 'id': 'yZIXLfi8CZQ',
637 'ext': 'mp4',
638 'upload_date': '20120608',
639 'title': 'Principal Sexually Assaults A Teacher - Episode 117 - 8th June 2012',
640 'description': 'md5:09b78bd971f1e3e289601dfba15ca4f7',
641 'uploader': 'SET India',
94bfcd23 642 'uploader_id': 'setindia',
ec85ded8 643 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/setindia',
94bfcd23 644 'age_limit': 18,
545cc85d 645 },
646 'skip': 'Private video',
fccd3771 647 },
11b56058 648 {
8bdd16b4 649 'url': 'https://www.youtube.com/watch?v=BaW_jenozKc&v=yZIXLfi8CZQ',
11b56058
PM
650 'note': 'Use the first video ID in the URL',
651 'info_dict': {
652 'id': 'BaW_jenozKc',
653 'ext': 'mp4',
3867038a 654 'title': 'youtube-dl test video "\'/\\ä↭𝕐',
11b56058
PM
655 'uploader': 'Philipp Hagemeister',
656 'uploader_id': 'phihag',
ec85ded8 657 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/phihag',
11b56058 658 'upload_date': '20121002',
3867038a 659 'description': 'test chars: "\'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de .',
11b56058 660 'categories': ['Science & Technology'],
3867038a 661 'tags': ['youtube-dl'],
556dbe7f 662 'duration': 10,
dbdaaa23 663 'view_count': int,
11b56058
PM
664 'like_count': int,
665 'dislike_count': int,
34a7de29
S
666 },
667 'params': {
668 'skip_download': True,
669 },
11b56058 670 },
dd27fd17 671 {
2d3d2997 672 'url': 'https://www.youtube.com/watch?v=a9LDPn-MO4I',
4bc3a23e
PH
673 'note': '256k DASH audio (format 141) via DASH manifest',
674 'info_dict': {
675 'id': 'a9LDPn-MO4I',
676 'ext': 'm4a',
677 'upload_date': '20121002',
678 'uploader_id': '8KVIDEO',
ec85ded8 679 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/8KVIDEO',
4bc3a23e
PH
680 'description': '',
681 'uploader': '8KVIDEO',
682 'title': 'UHDTV TEST 8K VIDEO.mp4'
4919603f 683 },
4bc3a23e
PH
684 'params': {
685 'youtube_include_dash_manifest': True,
686 'format': '141',
4919603f 687 },
de3c7fe0 688 'skip': 'format 141 not served anymore',
dd27fd17 689 },
8bdd16b4 690 # DASH manifest with encrypted signature
691 {
692 'url': 'https://www.youtube.com/watch?v=IB3lcPjvWLA',
693 'info_dict': {
694 'id': 'IB3lcPjvWLA',
695 'ext': 'm4a',
696 'title': 'Afrojack, Spree Wilson - The Spark (Official Music Video) ft. Spree Wilson',
697 'description': 'md5:8f5e2b82460520b619ccac1f509d43bf',
698 'duration': 244,
699 'uploader': 'AfrojackVEVO',
700 'uploader_id': 'AfrojackVEVO',
701 'upload_date': '20131011',
cc2db878 702 'abr': 129.495,
8bdd16b4 703 },
704 'params': {
705 'youtube_include_dash_manifest': True,
706 'format': '141/bestaudio[ext=m4a]',
707 },
708 },
aa79ac0c
PH
709 # Controversy video
710 {
711 'url': 'https://www.youtube.com/watch?v=T4XJQO3qol8',
712 'info_dict': {
713 'id': 'T4XJQO3qol8',
714 'ext': 'mp4',
556dbe7f 715 'duration': 219,
aa79ac0c 716 'upload_date': '20100909',
4fe54c12 717 'uploader': 'Amazing Atheist',
aa79ac0c 718 'uploader_id': 'TheAmazingAtheist',
ec85ded8 719 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/TheAmazingAtheist',
aa79ac0c 720 'title': 'Burning Everyone\'s Koran',
545cc85d 721 'description': 'SUBSCRIBE: http://www.youtube.com/saturninefilms \r\n\r\nEven Obama has taken a stand against freedom on this issue: http://www.huffingtonpost.com/2010/09/09/obama-gma-interview-quran_n_710282.html',
aa79ac0c 722 }
c522adb1 723 },
dd2d55f1 724 # Normal age-gate video (embed allowed)
c522adb1 725 {
2d3d2997 726 'url': 'https://youtube.com/watch?v=HtVdAasjOgU',
c522adb1
JMF
727 'info_dict': {
728 'id': 'HtVdAasjOgU',
729 'ext': 'mp4',
730 'title': 'The Witcher 3: Wild Hunt - The Sword Of Destiny Trailer',
ec85ded8 731 'description': r're:(?s).{100,}About the Game\n.*?The Witcher 3: Wild Hunt.{100,}',
556dbe7f 732 'duration': 142,
c522adb1
JMF
733 'uploader': 'The Witcher',
734 'uploader_id': 'WitcherGame',
ec85ded8 735 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/WitcherGame',
c522adb1 736 'upload_date': '20140605',
34952f09 737 'age_limit': 18,
c522adb1
JMF
738 },
739 },
8bdd16b4 740 # video_info is None (https://github.com/ytdl-org/youtube-dl/issues/4421)
741 # YouTube Red ad is not captured for creator
742 {
743 'url': '__2ABJjxzNo',
744 'info_dict': {
745 'id': '__2ABJjxzNo',
746 'ext': 'mp4',
747 'duration': 266,
748 'upload_date': '20100430',
749 'uploader_id': 'deadmau5',
750 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/deadmau5',
545cc85d 751 'creator': 'deadmau5',
752 'description': 'md5:6cbcd3a92ce1bc676fc4d6ab4ace2336',
8bdd16b4 753 'uploader': 'deadmau5',
754 'title': 'Deadmau5 - Some Chords (HD)',
545cc85d 755 'alt_title': 'Some Chords',
8bdd16b4 756 },
757 'expected_warnings': [
758 'DASH manifest missing',
759 ]
760 },
067aa17e 761 # Olympics (https://github.com/ytdl-org/youtube-dl/issues/4431)
e52a40ab
PH
762 {
763 'url': 'lqQg6PlCWgI',
764 'info_dict': {
765 'id': 'lqQg6PlCWgI',
766 'ext': 'mp4',
556dbe7f 767 'duration': 6085,
90227264 768 'upload_date': '20150827',
cbe2bd91 769 'uploader_id': 'olympic',
ec85ded8 770 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/olympic',
cbe2bd91 771 'description': 'HO09 - Women - GER-AUS - Hockey - 31 July 2012 - London 2012 Olympic Games',
be49068d 772 'uploader': 'Olympic',
cbe2bd91
PH
773 'title': 'Hockey - Women - GER-AUS - London 2012 Olympic Games',
774 },
775 'params': {
776 'skip_download': 'requires avconv',
e52a40ab 777 }
cbe2bd91 778 },
6271f1ca
PH
779 # Non-square pixels
780 {
781 'url': 'https://www.youtube.com/watch?v=_b-2C3KPAM0',
782 'info_dict': {
783 'id': '_b-2C3KPAM0',
784 'ext': 'mp4',
785 'stretched_ratio': 16 / 9.,
556dbe7f 786 'duration': 85,
6271f1ca
PH
787 'upload_date': '20110310',
788 'uploader_id': 'AllenMeow',
ec85ded8 789 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/AllenMeow',
6271f1ca 790 'description': 'made by Wacom from Korea | 字幕&加油添醋 by TY\'s Allen | 感謝heylisa00cavey1001同學熱情提供梗及翻譯',
eb6793ba 791 'uploader': '孫ᄋᄅ',
6271f1ca
PH
792 'title': '[A-made] 變態妍字幕版 太妍 我就是這樣的人',
793 },
06b491eb
S
794 },
795 # url_encoded_fmt_stream_map is empty string
796 {
797 'url': 'qEJwOuvDf7I',
798 'info_dict': {
799 'id': 'qEJwOuvDf7I',
f57b7835 800 'ext': 'webm',
06b491eb
S
801 'title': 'Обсуждение судебной практики по выборам 14 сентября 2014 года в Санкт-Петербурге',
802 'description': '',
803 'upload_date': '20150404',
804 'uploader_id': 'spbelect',
805 'uploader': 'Наблюдатели Петербурга',
806 },
807 'params': {
808 'skip_download': 'requires avconv',
e323cf3f
S
809 },
810 'skip': 'This live event has ended.',
06b491eb 811 },
067aa17e 812 # Extraction from multiple DASH manifests (https://github.com/ytdl-org/youtube-dl/pull/6097)
da77d856
S
813 {
814 'url': 'https://www.youtube.com/watch?v=FIl7x6_3R5Y',
815 'info_dict': {
816 'id': 'FIl7x6_3R5Y',
eb6793ba 817 'ext': 'webm',
da77d856
S
818 'title': 'md5:7b81415841e02ecd4313668cde88737a',
819 'description': 'md5:116377fd2963b81ec4ce64b542173306',
556dbe7f 820 'duration': 220,
da77d856
S
821 'upload_date': '20150625',
822 'uploader_id': 'dorappi2000',
ec85ded8 823 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/dorappi2000',
da77d856 824 'uploader': 'dorappi2000',
eb6793ba 825 'formats': 'mincount:31',
da77d856 826 },
eb6793ba 827 'skip': 'not actual anymore',
2ee8f5d8 828 },
8a1a26ce
YCH
829 # DASH manifest with segment_list
830 {
831 'url': 'https://www.youtube.com/embed/CsmdDsKjzN8',
832 'md5': '8ce563a1d667b599d21064e982ab9e31',
833 'info_dict': {
834 'id': 'CsmdDsKjzN8',
835 'ext': 'mp4',
17ee98e1 836 'upload_date': '20150501', # According to '<meta itemprop="datePublished"', but in other places it's 20150510
8a1a26ce
YCH
837 'uploader': 'Airtek',
838 'description': 'Retransmisión en directo de la XVIII media maratón de Zaragoza.',
839 'uploader_id': 'UCzTzUmjXxxacNnL8I3m4LnQ',
840 'title': 'Retransmisión XVIII Media maratón Zaragoza 2015',
841 },
842 'params': {
843 'youtube_include_dash_manifest': True,
844 'format': '135', # bestvideo
be49068d
S
845 },
846 'skip': 'This live event has ended.',
2ee8f5d8 847 },
cf7e015f
S
848 {
849 # Multifeed videos (multiple cameras), URL is for Main Camera
545cc85d 850 'url': 'https://www.youtube.com/watch?v=jvGDaLqkpTg',
cf7e015f 851 'info_dict': {
545cc85d 852 'id': 'jvGDaLqkpTg',
853 'title': 'Tom Clancy Free Weekend Rainbow Whatever',
854 'description': 'md5:e03b909557865076822aa169218d6a5d',
cf7e015f
S
855 },
856 'playlist': [{
857 'info_dict': {
545cc85d 858 'id': 'jvGDaLqkpTg',
cf7e015f 859 'ext': 'mp4',
545cc85d 860 'title': 'Tom Clancy Free Weekend Rainbow Whatever (Main Camera)',
861 'description': 'md5:e03b909557865076822aa169218d6a5d',
862 'duration': 10643,
863 'upload_date': '20161111',
864 'uploader': 'Team PGP',
865 'uploader_id': 'UChORY56LMMETTuGjXaJXvLg',
866 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UChORY56LMMETTuGjXaJXvLg',
cf7e015f
S
867 },
868 }, {
869 'info_dict': {
545cc85d 870 'id': '3AKt1R1aDnw',
cf7e015f 871 'ext': 'mp4',
545cc85d 872 'title': 'Tom Clancy Free Weekend Rainbow Whatever (Camera 2)',
873 'description': 'md5:e03b909557865076822aa169218d6a5d',
874 'duration': 10991,
875 'upload_date': '20161111',
876 'uploader': 'Team PGP',
877 'uploader_id': 'UChORY56LMMETTuGjXaJXvLg',
878 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UChORY56LMMETTuGjXaJXvLg',
cf7e015f
S
879 },
880 }, {
881 'info_dict': {
545cc85d 882 'id': 'RtAMM00gpVc',
cf7e015f 883 'ext': 'mp4',
545cc85d 884 'title': 'Tom Clancy Free Weekend Rainbow Whatever (Camera 3)',
885 'description': 'md5:e03b909557865076822aa169218d6a5d',
886 'duration': 10995,
887 'upload_date': '20161111',
888 'uploader': 'Team PGP',
889 'uploader_id': 'UChORY56LMMETTuGjXaJXvLg',
890 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UChORY56LMMETTuGjXaJXvLg',
cf7e015f
S
891 },
892 }, {
893 'info_dict': {
545cc85d 894 'id': '6N2fdlP3C5U',
cf7e015f 895 'ext': 'mp4',
545cc85d 896 'title': 'Tom Clancy Free Weekend Rainbow Whatever (Camera 4)',
897 'description': 'md5:e03b909557865076822aa169218d6a5d',
898 'duration': 10990,
899 'upload_date': '20161111',
900 'uploader': 'Team PGP',
901 'uploader_id': 'UChORY56LMMETTuGjXaJXvLg',
902 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UChORY56LMMETTuGjXaJXvLg',
cf7e015f
S
903 },
904 }],
905 'params': {
906 'skip_download': True,
907 },
cbaed4bb 908 },
f9f49d87 909 {
067aa17e 910 # Multifeed video with comma in title (see https://github.com/ytdl-org/youtube-dl/issues/8536)
f9f49d87
S
911 'url': 'https://www.youtube.com/watch?v=gVfLd0zydlo',
912 'info_dict': {
913 'id': 'gVfLd0zydlo',
914 'title': 'DevConf.cz 2016 Day 2 Workshops 1 14:00 - 15:30',
915 },
916 'playlist_count': 2,
be49068d 917 'skip': 'Not multifeed anymore',
f9f49d87 918 },
cbaed4bb 919 {
2d3d2997 920 'url': 'https://vid.plus/FlRa-iH7PGw',
cbaed4bb 921 'only_matching': True,
0e49d9a6 922 },
6d4fc66b 923 {
2d3d2997 924 'url': 'https://zwearz.com/watch/9lWxNJF-ufM/electra-woman-dyna-girl-official-trailer-grace-helbig.html',
6d4fc66b
S
925 'only_matching': True,
926 },
0e49d9a6 927 {
067aa17e 928 # Title with JS-like syntax "};" (see https://github.com/ytdl-org/youtube-dl/issues/7468)
a8776b10 929 # Also tests cut-off URL expansion in video description (see
067aa17e
S
930 # https://github.com/ytdl-org/youtube-dl/issues/1892,
931 # https://github.com/ytdl-org/youtube-dl/issues/8164)
0e49d9a6
LL
932 'url': 'https://www.youtube.com/watch?v=lsguqyKfVQg',
933 'info_dict': {
934 'id': 'lsguqyKfVQg',
935 'ext': 'mp4',
936 'title': '{dark walk}; Loki/AC/Dishonored; collab w/Elflover21',
eb6793ba 937 'alt_title': 'Dark Walk - Position Music',
0e49d9a6 938 'description': 'md5:8085699c11dc3f597ce0410b0dcbb34a',
556dbe7f 939 'duration': 133,
0e49d9a6
LL
940 'upload_date': '20151119',
941 'uploader_id': 'IronSoulElf',
ec85ded8 942 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/IronSoulElf',
0e49d9a6 943 'uploader': 'IronSoulElf',
eb6793ba
S
944 'creator': 'Todd Haberman, Daniel Law Heath and Aaron Kaplan',
945 'track': 'Dark Walk - Position Music',
946 'artist': 'Todd Haberman, Daniel Law Heath and Aaron Kaplan',
92bc97d3 947 'album': 'Position Music - Production Music Vol. 143 - Dark Walk',
0e49d9a6
LL
948 },
949 'params': {
950 'skip_download': True,
951 },
952 },
61f92af1 953 {
067aa17e 954 # Tags with '};' (see https://github.com/ytdl-org/youtube-dl/issues/7468)
61f92af1
S
955 'url': 'https://www.youtube.com/watch?v=Ms7iBXnlUO8',
956 'only_matching': True,
957 },
313dfc45
LL
958 {
959 # Video with yt:stretch=17:0
960 'url': 'https://www.youtube.com/watch?v=Q39EVAstoRM',
961 'info_dict': {
962 'id': 'Q39EVAstoRM',
963 'ext': 'mp4',
964 'title': 'Clash Of Clans#14 Dicas De Ataque Para CV 4',
965 'description': 'md5:ee18a25c350637c8faff806845bddee9',
966 'upload_date': '20151107',
967 'uploader_id': 'UCCr7TALkRbo3EtFzETQF1LA',
968 'uploader': 'CH GAMER DROID',
969 },
970 'params': {
971 'skip_download': True,
972 },
be49068d 973 'skip': 'This video does not exist.',
313dfc45 974 },
7caf9830
S
975 {
976 # Video licensed under Creative Commons
977 'url': 'https://www.youtube.com/watch?v=M4gD1WSo5mA',
978 'info_dict': {
979 'id': 'M4gD1WSo5mA',
980 'ext': 'mp4',
981 'title': 'md5:e41008789470fc2533a3252216f1c1d1',
982 'description': 'md5:a677553cf0840649b731a3024aeff4cc',
556dbe7f 983 'duration': 721,
7caf9830
S
984 'upload_date': '20150127',
985 'uploader_id': 'BerkmanCenter',
ec85ded8 986 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/BerkmanCenter',
556dbe7f 987 'uploader': 'The Berkman Klein Center for Internet & Society',
7caf9830
S
988 'license': 'Creative Commons Attribution license (reuse allowed)',
989 },
990 'params': {
991 'skip_download': True,
992 },
993 },
fd050249
S
994 {
995 # Channel-like uploader_url
996 'url': 'https://www.youtube.com/watch?v=eQcmzGIKrzg',
997 'info_dict': {
998 'id': 'eQcmzGIKrzg',
999 'ext': 'mp4',
1000 'title': 'Democratic Socialism and Foreign Policy | Bernie Sanders',
545cc85d 1001 'description': 'md5:13a2503d7b5904ef4b223aa101628f39',
556dbe7f 1002 'duration': 4060,
fd050249 1003 'upload_date': '20151119',
eb6793ba 1004 'uploader': 'Bernie Sanders',
fd050249 1005 'uploader_id': 'UCH1dpzjCEiGAt8CXkryhkZg',
ec85ded8 1006 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UCH1dpzjCEiGAt8CXkryhkZg',
fd050249
S
1007 'license': 'Creative Commons Attribution license (reuse allowed)',
1008 },
1009 'params': {
1010 'skip_download': True,
1011 },
1012 },
040ac686
S
1013 {
1014 'url': 'https://www.youtube.com/watch?feature=player_embedded&amp;amp;v=V36LpHqtcDY',
1015 'only_matching': True,
7f29cf54
S
1016 },
1017 {
067aa17e 1018 # YouTube Red paid video (https://github.com/ytdl-org/youtube-dl/issues/10059)
7f29cf54
S
1019 'url': 'https://www.youtube.com/watch?v=i1Ko8UG-Tdo',
1020 'only_matching': True,
6496ccb4
S
1021 },
1022 {
1023 # Rental video preview
1024 'url': 'https://www.youtube.com/watch?v=yYr8q0y5Jfg',
1025 'info_dict': {
1026 'id': 'uGpuVWrhIzE',
1027 'ext': 'mp4',
1028 'title': 'Piku - Trailer',
1029 'description': 'md5:c36bd60c3fd6f1954086c083c72092eb',
1030 'upload_date': '20150811',
1031 'uploader': 'FlixMatrix',
1032 'uploader_id': 'FlixMatrixKaravan',
ec85ded8 1033 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/FlixMatrixKaravan',
6496ccb4
S
1034 'license': 'Standard YouTube License',
1035 },
1036 'params': {
1037 'skip_download': True,
1038 },
eb6793ba 1039 'skip': 'This video is not available.',
022a5d66 1040 },
12afdc2a
S
1041 {
1042 # YouTube Red video with episode data
1043 'url': 'https://www.youtube.com/watch?v=iqKdEhx-dD4',
1044 'info_dict': {
1045 'id': 'iqKdEhx-dD4',
1046 'ext': 'mp4',
1047 'title': 'Isolation - Mind Field (Ep 1)',
545cc85d 1048 'description': 'md5:f540112edec5d09fc8cc752d3d4ba3cd',
556dbe7f 1049 'duration': 2085,
12afdc2a
S
1050 'upload_date': '20170118',
1051 'uploader': 'Vsauce',
1052 'uploader_id': 'Vsauce',
1053 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/Vsauce',
12afdc2a
S
1054 'series': 'Mind Field',
1055 'season_number': 1,
1056 'episode_number': 1,
1057 },
1058 'params': {
1059 'skip_download': True,
1060 },
1061 'expected_warnings': [
1062 'Skipping DASH manifest',
1063 ],
1064 },
c7121fa7
S
1065 {
1066 # The following content has been identified by the YouTube community
1067 # as inappropriate or offensive to some audiences.
1068 'url': 'https://www.youtube.com/watch?v=6SJNVb0GnPI',
1069 'info_dict': {
1070 'id': '6SJNVb0GnPI',
1071 'ext': 'mp4',
1072 'title': 'Race Differences in Intelligence',
1073 'description': 'md5:5d161533167390427a1f8ee89a1fc6f1',
1074 'duration': 965,
1075 'upload_date': '20140124',
1076 'uploader': 'New Century Foundation',
1077 'uploader_id': 'UCEJYpZGqgUob0zVVEaLhvVg',
1078 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UCEJYpZGqgUob0zVVEaLhvVg',
c7121fa7
S
1079 },
1080 'params': {
1081 'skip_download': True,
1082 },
545cc85d 1083 'skip': 'This video has been removed for violating YouTube\'s policy on hate speech.',
c7121fa7 1084 },
022a5d66
S
1085 {
1086 # itag 212
1087 'url': '1t24XAntNCY',
1088 'only_matching': True,
fd5c4aab
S
1089 },
1090 {
1091 # geo restricted to JP
1092 'url': 'sJL6WA-aGkQ',
1093 'only_matching': True,
1094 },
cd5a74a2
S
1095 {
1096 'url': 'https://invidio.us/watch?v=BaW_jenozKc',
1097 'only_matching': True,
1098 },
bc2ca1bb 1099 {
1100 'url': 'https://redirect.invidious.io/watch?v=BaW_jenozKc',
1101 'only_matching': True,
1102 },
1103 {
1104 # from https://nitter.pussthecat.org/YouTube/status/1360363141947944964#m
1105 'url': 'https://redirect.invidious.io/Yh0AhrY9GjA',
1106 'only_matching': True,
1107 },
825cd268
RA
1108 {
1109 # DRM protected
1110 'url': 'https://www.youtube.com/watch?v=s7_qI6_mIXc',
1111 'only_matching': True,
4fe54c12
S
1112 },
1113 {
1114 # Video with unsupported adaptive stream type formats
1115 'url': 'https://www.youtube.com/watch?v=Z4Vy8R84T1U',
1116 'info_dict': {
1117 'id': 'Z4Vy8R84T1U',
1118 'ext': 'mp4',
1119 'title': 'saman SMAN 53 Jakarta(Sancety) opening COFFEE4th at SMAN 53 Jakarta',
1120 'description': 'md5:d41d8cd98f00b204e9800998ecf8427e',
1121 'duration': 433,
1122 'upload_date': '20130923',
1123 'uploader': 'Amelia Putri Harwita',
1124 'uploader_id': 'UCpOxM49HJxmC1qCalXyB3_Q',
1125 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UCpOxM49HJxmC1qCalXyB3_Q',
1126 'formats': 'maxcount:10',
1127 },
1128 'params': {
1129 'skip_download': True,
1130 'youtube_include_dash_manifest': False,
1131 },
5429d6a9 1132 'skip': 'not actual anymore',
5caabd3c 1133 },
1134 {
822b9d9c 1135 # Youtube Music Auto-generated description
5caabd3c 1136 'url': 'https://music.youtube.com/watch?v=MgNrAu2pzNs',
1137 'info_dict': {
1138 'id': 'MgNrAu2pzNs',
1139 'ext': 'mp4',
1140 'title': 'Voyeur Girl',
1141 'description': 'md5:7ae382a65843d6df2685993e90a8628f',
1142 'upload_date': '20190312',
5429d6a9
S
1143 'uploader': 'Stephen - Topic',
1144 'uploader_id': 'UC-pWHpBjdGG69N9mM2auIAA',
5caabd3c 1145 'artist': 'Stephen',
1146 'track': 'Voyeur Girl',
1147 'album': 'it\'s too much love to know my dear',
1148 'release_date': '20190313',
1149 'release_year': 2019,
1150 },
1151 'params': {
1152 'skip_download': True,
1153 },
1154 },
66b48727
RA
1155 {
1156 'url': 'https://www.youtubekids.com/watch?v=3b8nCWDgZ6Q',
1157 'only_matching': True,
1158 },
011e75e6
S
1159 {
1160 # invalid -> valid video id redirection
1161 'url': 'DJztXj2GPfl',
1162 'info_dict': {
1163 'id': 'DJztXj2GPfk',
1164 'ext': 'mp4',
1165 'title': 'Panjabi MC - Mundian To Bach Ke (The Dictator Soundtrack)',
1166 'description': 'md5:bf577a41da97918e94fa9798d9228825',
1167 'upload_date': '20090125',
1168 'uploader': 'Prochorowka',
1169 'uploader_id': 'Prochorowka',
1170 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/Prochorowka',
1171 'artist': 'Panjabi MC',
1172 'track': 'Beware of the Boys (Mundian to Bach Ke) - Motivo Hi-Lectro Remix',
1173 'album': 'Beware of the Boys (Mundian To Bach Ke)',
1174 },
1175 'params': {
1176 'skip_download': True,
1177 },
545cc85d 1178 'skip': 'Video unavailable',
ea74e00b
DP
1179 },
1180 {
1181 # empty description results in an empty string
1182 'url': 'https://www.youtube.com/watch?v=x41yOUIvK2k',
1183 'info_dict': {
1184 'id': 'x41yOUIvK2k',
1185 'ext': 'mp4',
1186 'title': 'IMG 3456',
1187 'description': '',
1188 'upload_date': '20170613',
1189 'uploader_id': 'ElevageOrVert',
1190 'uploader': 'ElevageOrVert',
1191 },
1192 'params': {
1193 'skip_download': True,
1194 },
1195 },
a0566bbf 1196 {
29f7c58a 1197 # with '};' inside yt initial data (see [1])
1198 # see [2] for an example with '};' inside ytInitialPlayerResponse
1199 # 1. https://github.com/ytdl-org/youtube-dl/issues/27093
1200 # 2. https://github.com/ytdl-org/youtube-dl/issues/27216
a0566bbf 1201 'url': 'https://www.youtube.com/watch?v=CHqg6qOn4no',
1202 'info_dict': {
1203 'id': 'CHqg6qOn4no',
1204 'ext': 'mp4',
1205 'title': 'Part 77 Sort a list of simple types in c#',
1206 'description': 'md5:b8746fa52e10cdbf47997903f13b20dc',
1207 'upload_date': '20130831',
1208 'uploader_id': 'kudvenkat',
1209 'uploader': 'kudvenkat',
1210 },
1211 'params': {
1212 'skip_download': True,
1213 },
1214 },
29f7c58a 1215 {
1216 # another example of '};' in ytInitialData
1217 'url': 'https://www.youtube.com/watch?v=gVfgbahppCY',
1218 'only_matching': True,
1219 },
1220 {
1221 'url': 'https://www.youtube.com/watch_popup?v=63RmMXCd_bQ',
1222 'only_matching': True,
1223 },
545cc85d 1224 {
cc2db878 1225 # https://github.com/ytdl-org/youtube-dl/pull/28094
1226 'url': 'OtqTfy26tG0',
1227 'info_dict': {
1228 'id': 'OtqTfy26tG0',
1229 'ext': 'mp4',
1230 'title': 'Burn Out',
1231 'description': 'md5:8d07b84dcbcbfb34bc12a56d968b6131',
1232 'upload_date': '20141120',
1233 'uploader': 'The Cinematic Orchestra - Topic',
1234 'uploader_id': 'UCIzsJBIyo8hhpFm1NK0uLgw',
1235 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UCIzsJBIyo8hhpFm1NK0uLgw',
1236 'artist': 'The Cinematic Orchestra',
1237 'track': 'Burn Out',
1238 'album': 'Every Day',
1239 'release_data': None,
1240 'release_year': None,
1241 },
1242 'params': {
1243 'skip_download': True,
1244 },
545cc85d 1245 },
bc2ca1bb 1246 {
1247 # controversial video, only works with bpctr when authenticated with cookies
1248 'url': 'https://www.youtube.com/watch?v=nGC3D_FkCmg',
1249 'only_matching': True,
1250 },
f7ad7160 1251 {
1252 # restricted location, https://github.com/ytdl-org/youtube-dl/issues/28685
1253 'url': 'cBvYw8_A0vQ',
1254 'info_dict': {
1255 'id': 'cBvYw8_A0vQ',
1256 'ext': 'mp4',
1257 'title': '4K Ueno Okachimachi Street Scenes 上野御徒町歩き',
1258 'description': 'md5:ea770e474b7cd6722b4c95b833c03630',
1259 'upload_date': '20201120',
1260 'uploader': 'Walk around Japan',
1261 'uploader_id': 'UC3o_t8PzBmXf5S9b7GLx1Mw',
1262 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UC3o_t8PzBmXf5S9b7GLx1Mw',
1263 },
1264 'params': {
1265 'skip_download': True,
1266 },
1267 },
2eb88d95
PH
1268 ]
1269
e0df6211
PH
def __init__(self, *args, **kwargs):
    """Initialise the extractor and create its empty per-instance caches."""
    super(YoutubeIE, self).__init__(*args, **kwargs)
    # Signature functions that have already been extracted
    self._player_cache = {}
    # Player JavaScript that has already been downloaded
    self._code_cache = {}
e0df6211 1274
60064c53
PH
def _signature_cache_id(self, example_sig):
    """Return the lengths of the dot-separated signature parts, joined by dots"""
    part_lengths = [compat_str(len(chunk)) for chunk in example_sig.split('.')]
    return '.'.join(part_lengths)
60064c53 1278
e40c758c
S
@classmethod
def _extract_player_info(cls, player_url):
    """Return the player id found in player_url.

    Each pattern in _PLAYER_INFO_RE is tried in order and the 'id' group of
    the first one that matches is returned.  ExtractorError is raised when
    none of them matches.
    """
    for pattern in cls._PLAYER_INFO_RE:
        found = re.search(pattern, player_url)
        if found:
            return found.group('id')
    raise ExtractorError('Cannot identify player %r' % player_url)
e40c758c
S
1288
1289 def _extract_signature_function(self, video_id, player_url, example_sig):
545cc85d 1290 player_id = self._extract_player_info(player_url)
e0df6211 1291
c4417ddb 1292 # Read from filesystem cache
545cc85d 1293 func_id = 'js_%s_%s' % (
1294 player_id, self._signature_cache_id(example_sig))
c4417ddb 1295 assert os.path.basename(func_id) == func_id
a0e07d31 1296
69ea8ca4 1297 cache_spec = self._downloader.cache.load('youtube-sigfuncs', func_id)
a0e07d31 1298 if cache_spec is not None:
78caa52a 1299 return lambda s: ''.join(s[i] for i in cache_spec)
83799698 1300
545cc85d 1301 if player_id not in self._code_cache:
1302 self._code_cache[player_id] = self._download_webpage(
e0df6211 1303 player_url, video_id,
545cc85d 1304 note='Downloading player ' + player_id,
69ea8ca4 1305 errnote='Download of %s failed' % player_url)
545cc85d 1306 code = self._code_cache[player_id]
1307 res = self._parse_sig_js(code)
e0df6211 1308
785521bf
PH
1309 test_string = ''.join(map(compat_chr, range(len(example_sig))))
1310 cache_res = res(test_string)
1311 cache_spec = [ord(c) for c in cache_res]
83799698 1312
69ea8ca4 1313 self._downloader.cache.store('youtube-sigfuncs', func_id, cache_spec)
83799698
PH
1314 return res
1315
def _print_sig_code(self, func, example_sig):
    """Print Python code equivalent to func for signatures shaped like example_sig.

    func is applied to a probe string of distinct characters; the resulting
    index sequence is rendered as a sum of indexing and slicing expressions
    on ``s`` and written out with to_screen().
    """
    def _render_slice(first, last, stride):
        head = '' if first == 0 else str(first)
        stop = last + stride
        tail = (':%d' % stop) if stop >= 0 else ':'
        stride_part = '' if stride == 1 else (':%d' % stride)
        return 's[%s%s%s]' % (head, tail, stride_part)

    def _render_indices(indices):
        stride = None
        # Quelch pyflakes warnings - run_start will be set when stride is set
        run_start = '(Never used)'
        for previous, current in zip(indices[:-1], indices[1:]):
            delta = current - previous
            if stride is not None:
                if delta == stride:
                    # Still inside the current run
                    continue
                yield _render_slice(run_start, previous, stride)
                stride = None
                continue
            if delta in [-1, 1]:
                # A new ascending or descending run starts at `previous`
                stride = delta
                run_start = previous
                continue
            yield 's[%d]' % previous
        if stride is None:
            yield 's[%d]' % current
        else:
            yield _render_slice(run_start, current, stride)

    probe = ''.join(map(compat_chr, range(len(example_sig))))
    permutation = [ord(ch) for ch in func(probe)]
    expression = ' + '.join(_render_indices(permutation))
    part_lengths = ', '.join(
        compat_str(len(part)) for part in example_sig.split('.'))
    signature_id_tuple = '(%s)' % part_lengths
    code = ('if tuple(len(p) for p in s.split(\'.\')) == %s:\n'
            ' return %s\n') % (signature_id_tuple, expression)
    self.to_screen('Extracted signature function:\n' + code)
edf3e38e 1354
e0df6211
PH
def _parse_sig_js(self, jscode):
    """Return a callable wrapping the signature function found in jscode.

    The name of the function is located with the patterns below (tried in
    order); the function itself is then extracted with JSInterpreter.  The
    returned callable takes the signature string as its only argument.
    """
    name_patterns = (
        r'\b[cs]\s*&&\s*[adf]\.set\([^,]+\s*,\s*encodeURIComponent\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\(',
        r'\b[a-zA-Z0-9]+\s*&&\s*[a-zA-Z0-9]+\.set\([^,]+\s*,\s*encodeURIComponent\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\(',
        r'\bm=(?P<sig>[a-zA-Z0-9$]{2})\(decodeURIComponent\(h\.s\)\)',
        r'\bc&&\(c=(?P<sig>[a-zA-Z0-9$]{2})\(decodeURIComponent\(c\)\)',
        r'(?:\b|[^a-zA-Z0-9$])(?P<sig>[a-zA-Z0-9$]{2})\s*=\s*function\(\s*a\s*\)\s*{\s*a\s*=\s*a\.split\(\s*""\s*\);[a-zA-Z0-9$]{2}\.[a-zA-Z0-9$]{2}\(a,\d+\)',
        r'(?:\b|[^a-zA-Z0-9$])(?P<sig>[a-zA-Z0-9$]{2})\s*=\s*function\(\s*a\s*\)\s*{\s*a\s*=\s*a\.split\(\s*""\s*\)',
        r'(?P<sig>[a-zA-Z0-9$]+)\s*=\s*function\(\s*a\s*\)\s*{\s*a\s*=\s*a\.split\(\s*""\s*\)',
        # Obsolete patterns
        r'(["\'])signature\1\s*,\s*(?P<sig>[a-zA-Z0-9$]+)\(',
        r'\.sig\|\|(?P<sig>[a-zA-Z0-9$]+)\(',
        r'yt\.akamaized\.net/\)\s*\|\|\s*.*?\s*[cs]\s*&&\s*[adf]\.set\([^,]+\s*,\s*(?:encodeURIComponent\s*\()?\s*(?P<sig>[a-zA-Z0-9$]+)\(',
        r'\b[cs]\s*&&\s*[adf]\.set\([^,]+\s*,\s*(?P<sig>[a-zA-Z0-9$]+)\(',
        r'\b[a-zA-Z0-9]+\s*&&\s*[a-zA-Z0-9]+\.set\([^,]+\s*,\s*(?P<sig>[a-zA-Z0-9$]+)\(',
        r'\bc\s*&&\s*a\.set\([^,]+\s*,\s*\([^)]*\)\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\(',
        r'\bc\s*&&\s*[a-zA-Z0-9]+\.set\([^,]+\s*,\s*\([^)]*\)\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\(',
        r'\bc\s*&&\s*[a-zA-Z0-9]+\.set\([^,]+\s*,\s*\([^)]*\)\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\(')
    funcname = self._search_regex(
        name_patterns, jscode,
        'Initial JS player signature function name', group='sig')

    interpreter = JSInterpreter(jscode)
    initial_function = interpreter.extract_function(funcname)
    # The extracted function is called with a list of arguments
    return lambda s: initial_function([s])
1378
545cc85d 1379 def _decrypt_signature(self, s, video_id, player_url):
257a2501 1380 """Turn the encrypted s field into a working signature"""
6b37f0be 1381
c8bf86d5 1382 if player_url is None:
69ea8ca4 1383 raise ExtractorError('Cannot decrypt signature without player_url')
920de7a2 1384
69ea8ca4 1385 if player_url.startswith('//'):
78caa52a 1386 player_url = 'https:' + player_url
3c90cc8b
S
1387 elif not re.match(r'https?://', player_url):
1388 player_url = compat_urlparse.urljoin(
1389 'https://www.youtube.com', player_url)
c8bf86d5 1390 try:
62af3a0e 1391 player_id = (player_url, self._signature_cache_id(s))
c8bf86d5
PH
1392 if player_id not in self._player_cache:
1393 func = self._extract_signature_function(
60064c53 1394 video_id, player_url, s
c8bf86d5
PH
1395 )
1396 self._player_cache[player_id] = func
1397 func = self._player_cache[player_id]
1398 if self._downloader.params.get('youtube_print_sig_code'):
60064c53 1399 self._print_sig_code(func, s)
c8bf86d5
PH
1400 return func(s)
1401 except Exception as e:
1402 tb = traceback.format_exc()
1403 raise ExtractorError(
78caa52a 1404 'Signature extraction failed: ' + tb, cause=e)
e0df6211 1405
def _mark_watched(self, video_id, player_response):
    """Request the playback tracking URL so the video is marked as watched.

    The URL is read from
    player_response['playbackTracking']['videostatsPlaybackUrl']['baseUrl'];
    nothing is done when it is missing or not a valid URL.  The request is
    non-fatal: a failure does not raise.
    """
    playback_url = url_or_none(try_get(
        player_response,
        lambda x: x['playbackTracking']['videostatsPlaybackUrl']['baseUrl']))
    if not playback_url:
        return
    parsed_playback_url = compat_urlparse.urlparse(playback_url)
    qs = compat_urlparse.parse_qs(parsed_playback_url.query)

    # cpn generation algorithm is reverse engineered from base.js.
    # In fact it works even with dummy cpn.
    CPN_ALPHABET = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789-_'
    # Pick each of the 16 characters uniformly from the 64-character
    # alphabet.  (random.randint(0, 256) & 63 includes 256 and therefore
    # favoured the first character.)
    cpn = ''.join(random.choice(CPN_ALPHABET) for _ in range(16))

    qs.update({
        'ver': ['2'],
        'cpn': [cpn],
    })
    playback_url = compat_urlparse.urlunparse(
        parsed_playback_url._replace(query=compat_urllib_parse_urlencode(qs, True)))

    self._download_webpage(
        playback_url, video_id, 'Marking watched',
        'Unable to mark watched', fatal=False)
1430
66c9fa36
S
@staticmethod
def _extract_urls(webpage):
    """Return a list of YouTube embeds (URLs or video ids) found in webpage.

    Three kinds of embeds are recognised: generic player embeds (iframe,
    embed, object, data-video-url, SWFObject calls), lazyYT elements and
    the Wordpress "YouTube Video Importer" plugin markup.
    """
    # Embedded YouTube player
    # NOTE: the embedSWF alternative used to read `embedSWF\(?:\s*`, which
    # demanded a literal ':' and so never matched an `embedSWF("...")` call.
    entries = [
        unescapeHTML(mobj.group('url'))
        for mobj in re.finditer(r'''(?x)
        (?:
            <iframe[^>]+?src=|
            data-video-url=|
            <embed[^>]+?src=|
            embedSWF\(\s*|
            <object[^>]+data=|
            new\s+SWFObject\(
        )
        (["\'])
        (?P<url>(?:https?:)?//(?:www\.)?youtube(?:-nocookie)?\.com/
        (?:embed|v|p)/[0-9A-Za-z_-]{11}.*?)
        \1''', webpage)]

    # lazyYT YouTube embed
    entries.extend(
        unescapeHTML(video_id)
        for video_id in re.findall(
            r'class="lazyYT" data-youtube-id="([^"]+)"', webpage))

    # Wordpress "YouTube Video Importer" plugin
    matches = re.findall(r'''(?x)<div[^>]+
        class=(?P<q1>[\'"])[^\'"]*\byvii_single_video_player\b[^\'"]*(?P=q1)[^>]+
        data-video_id=(?P<q2>[\'"])([^\'"]+)(?P=q2)''', webpage)
    # findall() returns one tuple per match; the video id is the last group
    entries.extend(m[-1] for m in matches)

    return entries
1462
@staticmethod
def _extract_url(webpage):
    """Return the first YouTube embed found in webpage, or None if there is none."""
    candidates = YoutubeIE._extract_urls(webpage)
    if not candidates:
        return None
    return candidates[0]
1467
97665381
PH
@classmethod
def extract_id(cls, url):
    """Return the video id contained in url.

    Raises ExtractorError when url does not match _VALID_URL.
    """
    matched = re.match(cls._VALID_URL, url, re.VERBOSE)
    if matched is not None:
        # The video id is the second capture group of _VALID_URL
        return matched.group(2)
    raise ExtractorError('Invalid URL: %s' % url)
1475
def _extract_chapters_from_json(self, data, video_id, duration):
    """Build a list of chapter dicts from the chaptered player bar in data.

    Each chapter has 'start_time', 'end_time' and 'title'.  A chapter ends
    where the next one starts; the last chapter ends at duration.  Chapters
    whose start or end time cannot be determined are left out.  Returns
    None when data contains no chapter list.
    """
    raw_chapters = try_get(
        data,
        lambda x: x['playerOverlays']
                   ['playerOverlayRenderer']
                   ['decoratedPlayerBarRenderer']
                   ['decoratedPlayerBarRenderer']
                   ['playerBar']
                   ['chapteredPlayerBarRenderer']
                   ['chapters'],
        list)
    if not raw_chapters:
        return

    def _start_seconds(entry):
        # The start offset is given in milliseconds
        millis = try_get(
            entry,
            lambda x: x['chapterRenderer']['timeRangeStartMillis'],
            int)
        return float_or_none(millis, scale=1000)

    total = len(raw_chapters)
    result = []
    for index, entry in enumerate(raw_chapters):
        begin = _start_seconds(entry)
        if begin is None:
            continue
        if index + 1 < total:
            finish = _start_seconds(raw_chapters[index + 1])
        else:
            finish = duration
        if finish is None:
            continue
        result.append({
            'start_time': begin,
            'end_time': finish,
            'title': try_get(
                entry, lambda x: x['chapterRenderer']['title']['simpleText'],
                compat_str),
        })
    return result
1515
def _extract_yt_initial_variable(self, webpage, regex, video_id, name):
    """Find a JSON blob in webpage with regex and return it parsed.

    regex followed by _YT_INITIAL_BOUNDARY_RE is tried first, then regex on
    its own.  When neither matches, '{}' is parsed instead.  JSON parsing
    is non-fatal.
    """
    bounded_regex = r'%s\s*%s' % (regex, self._YT_INITIAL_BOUNDARY_RE)
    json_text = self._search_regex(
        (bounded_regex, regex), webpage, name, default='{}')
    return self._parse_json(json_text, video_id, fatal=False)
84213ea8 1520
d92f5d5a 1521 @staticmethod
1522 def parse_time_text(time_text):
1523 """
1524 Parse the comment time text
1525 time_text is in the format 'X units ago (edited)'
1526 """
1527 time_text_split = time_text.split(' ')
1528 if len(time_text_split) >= 3:
1529 return datetime_from_str('now-%s%s' % (time_text_split[0], time_text_split[1]), precision='auto')
1530
a1c5d2ca
M
@staticmethod
def _join_text_entries(runs):
    """Concatenate the 'text' strings of the dict entries in *runs*.

    Entries that are not dicts, or whose 'text' is missing, empty or not a
    string, are ignored.  Returns None when no text was collected.
    """
    pieces = []
    for run in runs:
        if isinstance(run, dict):
            piece = try_get(run, lambda x: x['text'], compat_str)
            if piece:
                pieces.append(piece)
    return ''.join(pieces) if pieces else None
1544
def _extract_comment(self, comment_renderer, parent=None):
    """Convert a comment renderer dict into a comment info dict.

    Returns None when the renderer has no 'commentId'.  'timestamp' is None
    when the published-time text is missing or cannot be split into at least
    three parts; 'parent' falls back to 'root' for top-level comments.
    """
    comment_id = comment_renderer.get('commentId')
    if not comment_id:
        return
    comment_text_runs = try_get(comment_renderer, lambda x: x['contentText']['runs']) or []
    text = self._join_text_entries(comment_text_runs) or ''
    comment_time_text = try_get(comment_renderer, lambda x: x['publishedTimeText']['runs']) or []
    time_text = self._join_text_entries(comment_time_text)
    # Both the joined text and the parsed time may be None; do not let a
    # missing date abort extraction of the whole comment.
    parsed_time = self.parse_time_text(time_text) if time_text else None
    timestamp = calendar.timegm(parsed_time.timetuple()) if parsed_time else None
    author = try_get(comment_renderer, lambda x: x['authorText']['simpleText'], compat_str)
    author_id = try_get(comment_renderer,
                        lambda x: x['authorEndpoint']['browseEndpoint']['browseId'], compat_str)
    votes = str_to_int(try_get(comment_renderer, (lambda x: x['voteCount']['simpleText'],
                                                  lambda x: x['likeCount']), compat_str)) or 0
    author_thumbnail = try_get(comment_renderer,
                               lambda x: x['authorThumbnail']['thumbnails'][-1]['url'], compat_str)

    author_is_uploader = try_get(comment_renderer, lambda x: x['authorIsChannelOwner'], bool)
    is_liked = try_get(comment_renderer, lambda x: x['isLiked'], bool)
    return {
        'id': comment_id,
        'text': text,
        'timestamp': timestamp,
        'time_text': time_text,
        'like_count': votes,
        'is_favorited': is_liked,
        'author': author,
        'author_id': author_id,
        'author_thumbnail': author_thumbnail,
        'author_is_uploader': author_is_uploader,
        'parent': parent or 'root'
    }
1577
def _comment_entries(self, root_continuation_data, identity_token, account_syncid,
                     session_token_list, parent=None, comment_counts=None):
    """Generate comments (and their replies) by following continuation pages.

    Yields comment dicts as produced by _extract_comment.  For a top-level
    call (parent is None) the estimated total number of comments may also be
    yielded once, as an int, when the first page reports a numeric count.

    session_token_list is a one-element list; its item is replaced in place
    whenever a response carries a new 'xsrf_token'.  comment_counts is a
    shared, mutable [comments so far, estimated total, current thread #]
    list that is passed down to the recursive calls made for replies.
    """

    def extract_thread(parent_renderer):
        contents = try_get(parent_renderer, lambda x: x['contents'], list) or []
        if not parent:
            comment_counts[2] = 0
        for content in contents:
            comment_thread_renderer = try_get(content, lambda x: x['commentThreadRenderer'])
            # The expected type must be passed as try_get's third argument;
            # inside the getter tuple it would be called as a getter itself.
            comment_renderer = try_get(
                comment_thread_renderer, lambda x: x['comment']['commentRenderer'], dict) or try_get(
                content, lambda x: x['commentRenderer'], dict)

            if not comment_renderer:
                continue
            comment = self._extract_comment(comment_renderer, parent)
            if not comment:
                continue
            comment_counts[0] += 1
            yield comment
            # Attempt to get the replies
            comment_replies_renderer = try_get(
                comment_thread_renderer, lambda x: x['replies']['commentRepliesRenderer'], dict)

            if comment_replies_renderer:
                comment_counts[2] += 1
                comment_entries_iter = self._comment_entries(
                    comment_replies_renderer, identity_token, account_syncid,
                    parent=comment.get('id'), session_token_list=session_token_list,
                    comment_counts=comment_counts)

                for reply_comment in comment_entries_iter:
                    yield reply_comment

    if not comment_counts:
        # comment so far, est. total comments, current comment thread #
        comment_counts = [0, 0, 0]
    headers = self._DEFAULT_BASIC_API_HEADERS.copy()

    # TODO: Generalize the download code with TabIE
    if identity_token:
        headers['x-youtube-identity-token'] = identity_token

    if account_syncid:
        headers['X-Goog-PageId'] = account_syncid
        headers['X-Goog-AuthUser'] = 0

    continuation = YoutubeTabIE._extract_continuation(root_continuation_data)  # TODO
    first_continuation = False
    if parent is None:
        first_continuation = True

    for page_num in itertools.count(0):
        if not continuation:
            break
        retries = self._downloader.params.get('extractor_retries', 3)
        count = -1
        last_error = None

        while count < retries:
            count += 1
            if last_error:
                self.report_warning('%s. Retrying ...' % last_error)
            try:
                query = {
                    'ctoken': continuation['ctoken'],
                    'pbj': 1,
                    'type': 'next',
                }
                if parent:
                    query['action_get_comment_replies'] = 1
                else:
                    query['action_get_comments'] = 1

                comment_prog_str = '(%d/%d)' % (comment_counts[0], comment_counts[1])
                if page_num == 0:
                    if first_continuation:
                        note_prefix = 'Downloading initial comment continuation page'
                    else:
                        note_prefix = ' Downloading comment reply thread %d %s' % (comment_counts[2], comment_prog_str)
                else:
                    note_prefix = '%sDownloading comment%s page %d %s' % (
                        ' ' if parent else '',
                        ' replies' if parent else '',
                        page_num,
                        comment_prog_str)

                browse = self._download_json(
                    'https://www.youtube.com/comment_service_ajax', None,
                    '%s %s' % (note_prefix, '(retry #%d)' % count if count else ''),
                    headers=headers, query=query,
                    data=urlencode_postdata({
                        'session_token': session_token_list[0]
                    }))
            except ExtractorError as e:
                if isinstance(e.cause, compat_HTTPError) and e.cause.code in (500, 503, 404, 413):
                    if e.cause.code == 413:
                        self.report_warning('Assumed end of comments (received HTTP Error 413)')
                        return
                    # Downloading page may result in intermittent 5xx HTTP error
                    # Sometimes a 404 is also received. See: https://github.com/ytdl-org/youtube-dl/issues/28289
                    last_error = 'HTTP Error %s' % e.cause.code
                    if e.cause.code == 404:
                        last_error = last_error + ' (this API is probably deprecated)'
                    if count < retries:
                        continue
                raise
            else:
                session_token = try_get(browse, lambda x: x['xsrf_token'], compat_str)
                if session_token:
                    session_token_list[0] = session_token

                response = try_get(browse,
                                   (lambda x: x['response'],
                                    lambda x: x[1]['response'])) or {}

                if response.get('continuationContents'):
                    break

                # The getters above allow for a list-shaped payload, which has
                # no .get(); only consult the error fields on a dict.
                browse_info = browse if isinstance(browse, dict) else {}

                # YouTube sometimes gives reload: now json if something went wrong (e.g. bad auth)
                if browse_info.get('reload'):
                    raise ExtractorError('Invalid or missing params in continuation request', expected=False)

                # TODO: not tested, merged from old extractor
                err_msg = browse_info.get('externalErrorMessage')
                if err_msg:
                    raise ExtractorError('YouTube said: %s' % err_msg, expected=False)

                # Youtube sometimes sends incomplete data
                # See: https://github.com/ytdl-org/youtube-dl/issues/28194
                last_error = 'Incomplete data received'
                if count >= retries:
                    self._downloader.report_error(last_error)

        if not response:
            break

        known_continuation_renderers = {
            'itemSectionContinuation': extract_thread,
            'commentRepliesContinuation': extract_thread
        }

        # extract next root continuation from the results
        continuation_contents = try_get(
            response, lambda x: x['continuationContents'], dict) or {}

        for key, value in continuation_contents.items():
            if key not in known_continuation_renderers:
                continue
            continuation_renderer = value

            if first_continuation:
                first_continuation = False
                expected_comment_count = try_get(
                    continuation_renderer,
                    (lambda x: x['header']['commentsHeaderRenderer']['countText']['runs'][0]['text'],
                     lambda x: x['header']['commentsHeaderRenderer']['commentsCount']['runs'][0]['text']),
                    compat_str)

                # Only use the count when it is numeric; a None would break
                # the '%d' formatting here and in the progress notes.
                expected_total = str_to_int(expected_comment_count) if expected_comment_count else None
                if expected_total is not None:
                    comment_counts[1] = expected_total
                    self.to_screen('Downloading ~%d comments' % expected_total)
                    yield comment_counts[1]

                # TODO: cli arg.
                # 1/True for newest, 0/False for popular (default)
                comment_sort_index = int(True)
                sort_continuation_renderer = try_get(
                    continuation_renderer,
                    lambda x: x['header']['commentsHeaderRenderer']['sortMenu']['sortFilterSubMenuRenderer']['subMenuItems']
                    [comment_sort_index]['continuation']['reloadContinuationData'], dict)
                # If this fails, the initial continuation page
                # starts off with popular anyways.
                if sort_continuation_renderer:
                    continuation = YoutubeTabIE._build_continuation_query(
                        continuation=sort_continuation_renderer.get('continuation'),
                        ctp=sort_continuation_renderer.get('clickTrackingParams'))
                    self.to_screen('Sorting comments by %s' % ('popular' if comment_sort_index == 0 else 'newest'))
                    break

            for entry in known_continuation_renderers[key](continuation_renderer):
                yield entry

            continuation = YoutubeTabIE._extract_continuation(continuation_renderer)  # TODO
            break
        else:
            # No known continuation renderer in this response: the
            # continuation was not advanced, so stop instead of requesting
            # the very same page again and again.
            break
1763
def _extract_comments(self, ytcfg, video_id, contents, webpage, xsrf_token):
    """Entry for comment extraction

    Walks the renderer entries in *contents*, feeds every
    'itemSectionRenderer' to _comment_entries and collects the yielded
    comments.  An int yielded by _comment_entries is taken as the estimated
    total and is only used for the progress message.

    Returns a dict with 'comments' (list) and 'comment_count' (its length).
    """
    comments = []
    known_entry_comment_renderers = (
        'itemSectionRenderer',
    )
    estimated_total = 0
    # contents may be None when the watch page data has no result list.
    for entry in contents or []:
        if not isinstance(entry, dict):
            continue
        for key, renderer in entry.items():
            if key not in known_entry_comment_renderers:
                continue

            comment_iter = self._comment_entries(
                renderer,
                identity_token=self._extract_identity_token(webpage, item_id=video_id),
                account_syncid=self._extract_account_syncid(ytcfg),
                session_token_list=[xsrf_token])

            for comment in comment_iter:
                if isinstance(comment, int):
                    estimated_total = comment
                    continue
                comments.append(comment)
            # Only the first known renderer of an entry is processed.
            break
    self.to_screen('Downloaded %d/%d comments' % (len(comments), estimated_total))
    return {
        'comments': comments,
        'comment_count': len(comments),
    }
1793
def _real_extract(self, url):
    """Extract the info dict for a single YouTube video URL.

    Rough order of work: download the watch page and obtain the player
    response (falling back to the 'player' API), handle trailers and
    multifeed videos, build the format list (direct, HLS, DASH), collect
    metadata, subtitles, chapters and engagement data from the initial
    data, compute availability, and optionally fetch annotations and
    register a post-extractor for comments.

    Returns a url/playlist result for trailers and multifeed videos,
    otherwise the video info dict.  Raises ExtractorError when no formats
    are found and YouTube gave a reason (or the video is DRM protected).
    """
    url, smuggled_data = unsmuggle_url(url, {})
    video_id = self._match_id(url)
    base_url = self.http_scheme() + '//www.youtube.com/'
    webpage_url = base_url + 'watch?v=' + video_id
    webpage = self._download_webpage(
        webpage_url + '&bpctr=9999999999&has_verified=1', video_id, fatal=False)

    player_response = None
    if webpage:
        player_response = self._extract_yt_initial_variable(
            webpage, self._YT_INITIAL_PLAYER_RESPONSE_RE,
            video_id, 'initial player response')
    if not player_response:
        player_response = self._call_api(
            'player', {'videoId': video_id}, video_id)

    playability_status = player_response.get('playabilityStatus') or {}
    if playability_status.get('reason') == 'Sign in to confirm your age':
        pr = self._parse_json(try_get(compat_parse_qs(
            self._download_webpage(
                base_url + 'get_video_info', video_id,
                'Refetching age-gated info webpage',
                'unable to download video info webpage', query={
                    'video_id': video_id,
                    'eurl': 'https://youtube.googleapis.com/v/' + video_id,
                }, fatal=False)),
            lambda x: x['player_response'][0],
            compat_str) or '{}', video_id)
        if pr:
            player_response = pr

    trailer_video_id = try_get(
        playability_status,
        lambda x: x['errorScreen']['playerLegacyDesktopYpcTrailerRenderer']['trailerVideoId'],
        compat_str)
    if trailer_video_id:
        return self.url_result(
            trailer_video_id, self.ie_key(), trailer_video_id)

    def get_text(x):
        # Return the plain text of a {'simpleText': ...} / {'runs': [...]} object.
        if not x:
            return
        text = x.get('simpleText')
        if text and isinstance(text, compat_str):
            return text
        runs = x.get('runs')
        if not isinstance(runs, list):
            return
        return ''.join([
            r['text'] for r in runs
            if isinstance(r, dict) and isinstance(r.get('text'), compat_str)])

    search_meta = (
        lambda x: self._html_search_meta(x, webpage, default=None)) \
        if webpage else lambda x: None

    video_details = player_response.get('videoDetails') or {}
    microformat = try_get(
        player_response,
        lambda x: x['microformat']['playerMicroformatRenderer'],
        dict) or {}
    video_title = video_details.get('title') \
        or get_text(microformat.get('title')) \
        or search_meta(['og:title', 'twitter:title', 'title'])
    video_description = video_details.get('shortDescription')

    if not smuggled_data.get('force_singlefeed', False):
        if not self._downloader.params.get('noplaylist'):
            multifeed_metadata_list = try_get(
                player_response,
                lambda x: x['multicamera']['playerLegacyMulticameraRenderer']['metadataList'],
                compat_str)
            if multifeed_metadata_list:
                entries = []
                feed_ids = []
                for feed in multifeed_metadata_list.split(','):
                    # Unquote should take place before split on comma (,) since textual
                    # fields may contain comma as well (see
                    # https://github.com/ytdl-org/youtube-dl/issues/8536)
                    feed_data = compat_parse_qs(
                        compat_urllib_parse_unquote_plus(feed))

                    def feed_entry(name):
                        return try_get(
                            feed_data, lambda x: x[name][0], compat_str)

                    feed_id = feed_entry('id')
                    if not feed_id:
                        continue
                    feed_title = feed_entry('title')
                    title = video_title
                    if feed_title:
                        title += ' (%s)' % feed_title
                    entries.append({
                        '_type': 'url_transparent',
                        'ie_key': 'Youtube',
                        'url': smuggle_url(
                            base_url + 'watch?v=' + feed_data['id'][0],
                            {'force_singlefeed': True}),
                        'title': title,
                    })
                    feed_ids.append(feed_id)
                self.to_screen(
                    'Downloading multifeed video (%s) - add --no-playlist to just download video %s'
                    % (', '.join(feed_ids), video_id))
                return self.playlist_result(
                    entries, video_id, video_title, video_description)
        else:
            self.to_screen('Downloading just video %s because of --no-playlist' % video_id)

    formats = []
    itags = []
    itag_qualities = {}
    player_url = None
    q = qualities(['tiny', 'small', 'medium', 'large', 'hd720', 'hd1080', 'hd1440', 'hd2160', 'hd2880', 'highres'])
    streaming_data = player_response.get('streamingData') or {}
    # Build a new list so that the list stored in player_response is not
    # extended in place.
    streaming_formats = (
        (streaming_data.get('formats') or [])
        + (streaming_data.get('adaptiveFormats') or []))
    for fmt in streaming_formats:
        if fmt.get('targetDurationSec') or fmt.get('drmFamilies'):
            continue

        itag = str_or_none(fmt.get('itag'))
        quality = fmt.get('quality')
        if itag and quality:
            itag_qualities[itag] = quality
        # FORMAT_STREAM_TYPE_OTF(otf=1) requires downloading the init fragment
        # (adding `&sq=0` to the URL) and parsing emsg box to determine the
        # number of fragment that would subsequently requested with (`&sq=N`)
        if fmt.get('type') == 'FORMAT_STREAM_TYPE_OTF':
            continue

        fmt_url = fmt.get('url')
        if not fmt_url:
            # No plain URL: the format carries a ciphered signature that has
            # to be decrypted with the player JS.
            sc = compat_parse_qs(fmt.get('signatureCipher'))
            fmt_url = url_or_none(try_get(sc, lambda x: x['url'][0]))
            encrypted_sig = try_get(sc, lambda x: x['s'][0])
            if not (sc and fmt_url and encrypted_sig):
                continue
            if not player_url:
                if not webpage:
                    continue
                player_url = self._search_regex(
                    r'"(?:PLAYER_JS_URL|jsUrl)"\s*:\s*"([^"]+)"',
                    webpage, 'player URL', fatal=False)
            if not player_url:
                continue
            signature = self._decrypt_signature(sc['s'][0], video_id, player_url)
            sp = try_get(sc, lambda x: x['sp'][0]) or 'signature'
            fmt_url += '&' + sp + '=' + signature

        if itag:
            itags.append(itag)
        tbr = float_or_none(
            fmt.get('averageBitrate') or fmt.get('bitrate'), 1000)
        dct = {
            'asr': int_or_none(fmt.get('audioSampleRate')),
            'filesize': int_or_none(fmt.get('contentLength')),
            'format_id': itag,
            'format_note': fmt.get('qualityLabel') or quality,
            'fps': int_or_none(fmt.get('fps')),
            'height': int_or_none(fmt.get('height')),
            'quality': q(quality),
            'tbr': tbr,
            'url': fmt_url,
            'width': fmt.get('width'),
        }
        mimetype = fmt.get('mimeType')
        if mimetype:
            mobj = re.match(
                r'((?:[^/]+)/(?:[^;]+))(?:;\s*codecs="([^"]+)")?', mimetype)
            if mobj:
                dct['ext'] = mimetype2ext(mobj.group(1))
                dct.update(parse_codecs(mobj.group(2)))
        no_audio = dct.get('acodec') == 'none'
        no_video = dct.get('vcodec') == 'none'
        if no_audio:
            dct['vbr'] = tbr
        if no_video:
            dct['abr'] = tbr
        if no_audio or no_video:
            dct['downloader_options'] = {
                # Youtube throttles chunks >~10M
                'http_chunk_size': 10485760,
            }
            if dct.get('ext'):
                dct['container'] = dct['ext'] + '_dash'
        formats.append(dct)

    hls_manifest_url = streaming_data.get('hlsManifestUrl')
    if hls_manifest_url:
        for f in self._extract_m3u8_formats(
                hls_manifest_url, video_id, 'mp4', fatal=False):
            itag = self._search_regex(
                r'/itag/(\d+)', f['url'], 'itag', default=None)
            if itag:
                f['format_id'] = itag
            formats.append(f)

    if self._downloader.params.get('youtube_include_dash_manifest', True):
        dash_manifest_url = streaming_data.get('dashManifestUrl')
        if dash_manifest_url:
            for f in self._extract_mpd_formats(
                    dash_manifest_url, video_id, fatal=False):
                itag = f['format_id']
                if itag in itags:
                    continue
                if itag in itag_qualities:
                    # Not actually useful since the sorting is already done with "quality,res,fps,codec"
                    # but kept to maintain feature parity (and code similarity) with youtube-dl
                    # Remove if this causes any issues with sorting in future
                    f['quality'] = q(itag_qualities[itag])
                filesize = int_or_none(self._search_regex(
                    r'/clen/(\d+)', f.get('fragment_base_url')
                    or f['url'], 'file size', default=None))
                if filesize:
                    f['filesize'] = filesize
                formats.append(f)

    if not formats:
        if not self._downloader.params.get('allow_unplayable_formats') and streaming_data.get('licenseInfos'):
            raise ExtractorError(
                'This video is DRM protected.', expected=True)
        pemr = try_get(
            playability_status,
            lambda x: x['errorScreen']['playerErrorMessageRenderer'],
            dict) or {}
        reason = get_text(pemr.get('reason')) or playability_status.get('reason')
        subreason = pemr.get('subreason')
        if subreason:
            subreason = clean_html(get_text(subreason))
            if subreason == 'The uploader has not made this video available in your country.':
                countries = microformat.get('availableCountries')
                if not countries:
                    regions_allowed = search_meta('regionsAllowed')
                    countries = regions_allowed.split(',') if regions_allowed else None
                self.raise_geo_restricted(
                    subreason, countries)
            # Either part may be missing; never concatenate with None.
            if subreason:
                reason = (reason + '\n' + subreason) if reason else subreason
        if reason:
            raise ExtractorError(reason, expected=True)

    self._sort_formats(formats)

    keywords = video_details.get('keywords') or []
    if not keywords and webpage:
        keywords = [
            unescapeHTML(m.group('content'))
            for m in re.finditer(self._meta_regex('og:video:tag'), webpage)]
    for keyword in keywords:
        if keyword.startswith('yt:stretch='):
            try:
                w, h = (int(v) for v in keyword.split('=')[1].split(':'))
            except ValueError:
                # Malformed stretch hint (not exactly two integers); ignore it.
                continue
            if w > 0 and h > 0:
                ratio = w / h
                for f in formats:
                    if f.get('vcodec') != 'none':
                        f['stretched_ratio'] = ratio

    thumbnails = []
    for container in (video_details, microformat):
        for thumbnail in (try_get(
                container,
                lambda x: x['thumbnail']['thumbnails'], list) or []):
            thumbnail_url = thumbnail.get('url')
            if not thumbnail_url:
                continue
            thumbnails.append({
                'height': int_or_none(thumbnail.get('height')),
                'url': thumbnail_url,
                'width': int_or_none(thumbnail.get('width')),
            })
        if thumbnails:
            break
    else:
        # Neither container provided thumbnails; fall back to page meta tags.
        thumbnail = search_meta(['og:image', 'twitter:image'])
        if thumbnail:
            thumbnails = [{'url': thumbnail}]

    category = microformat.get('category') or search_meta('genre')
    channel_id = video_details.get('channelId') \
        or microformat.get('externalChannelId') \
        or search_meta('channelId')
    duration = int_or_none(
        video_details.get('lengthSeconds')
        or microformat.get('lengthSeconds')) \
        or parse_duration(search_meta('duration'))
    is_live = video_details.get('isLive')
    owner_profile_url = microformat.get('ownerProfileUrl')

    info = {
        'id': video_id,
        'title': self._live_title(video_title) if is_live else video_title,
        'formats': formats,
        'thumbnails': thumbnails,
        'description': video_description,
        'upload_date': unified_strdate(
            microformat.get('uploadDate')
            or search_meta('uploadDate')),
        # videoDetails may lack the author; do not fail the extraction for it.
        'uploader': video_details.get('author'),
        'uploader_id': self._search_regex(r'/(?:channel|user)/([^/?&#]+)', owner_profile_url, 'uploader id') if owner_profile_url else None,
        'uploader_url': owner_profile_url,
        'channel_id': channel_id,
        'channel_url': 'https://www.youtube.com/channel/' + channel_id if channel_id else None,
        'duration': duration,
        'view_count': int_or_none(
            video_details.get('viewCount')
            or microformat.get('viewCount')
            or search_meta('interactionCount')),
        'average_rating': float_or_none(video_details.get('averageRating')),
        'age_limit': 18 if (
            microformat.get('isFamilySafe') is False
            or search_meta('isFamilyFriendly') == 'false'
            or search_meta('og:restrictions:age') == '18+') else 0,
        'webpage_url': webpage_url,
        'categories': [category] if category else None,
        'tags': keywords,
        'is_live': is_live,
        'playable_in_embed': playability_status.get('playableInEmbed'),
        'was_live': video_details.get('isLiveContent'),
    }

    pctr = try_get(
        player_response,
        lambda x: x['captions']['playerCaptionsTracklistRenderer'], dict)
    subtitles = {}
    if pctr:
        def process_language(container, base_url, lang_code, query):
            # One entry per supported subtitle format for this language.
            lang_subs = []
            for fmt in self._SUBTITLE_FORMATS:
                query.update({
                    'fmt': fmt,
                })
                lang_subs.append({
                    'ext': fmt,
                    'url': update_url_query(base_url, query),
                })
            container[lang_code] = lang_subs

        for caption_track in (pctr.get('captionTracks') or []):
            base_url = caption_track.get('baseUrl')
            if not base_url:
                continue
            if caption_track.get('kind') != 'asr':
                lang_code = caption_track.get('languageCode')
                if not lang_code:
                    continue
                process_language(
                    subtitles, base_url, lang_code, {})
                continue
            # 'asr' track: offer it in every translation language.
            automatic_captions = {}
            for translation_language in (pctr.get('translationLanguages') or []):
                translation_language_code = translation_language.get('languageCode')
                if not translation_language_code:
                    continue
                process_language(
                    automatic_captions, base_url, translation_language_code,
                    {'tlang': translation_language_code})
            info['automatic_captions'] = automatic_captions
    info['subtitles'] = subtitles

    parsed_url = compat_urllib_parse_urlparse(url)
    for component in [parsed_url.fragment, parsed_url.query]:
        query = compat_parse_qs(component)
        for k, v in query.items():
            for d_k, s_ks in [('start', ('start', 't')), ('end', ('end',))]:
                d_k += '_time'
                if d_k not in info and k in s_ks:
                    info[d_k] = parse_duration(query[k][0])

    # Youtube Music Auto-generated description
    if video_description:
        mobj = re.search(r'(?s)(?P<track>[^·\n]+)·(?P<artist>[^\n]+)\n+(?P<album>[^\n]+)(?:.+?℗\s*(?P<release_year>\d{4})(?!\d))?(?:.+?Released on\s*:\s*(?P<release_date>\d{4}-\d{2}-\d{2}))?(.+?\nArtist\s*:\s*(?P<clean_artist>[^\n]+))?.+\nAuto-generated by YouTube\.\s*$', video_description)
        if mobj:
            release_year = mobj.group('release_year')
            release_date = mobj.group('release_date')
            if release_date:
                release_date = release_date.replace('-', '')
                if not release_year:
                    release_year = release_date[:4]
            info.update({
                # strip() belongs on the matched text, not on the group name
                'album': mobj.group('album').strip(),
                'artist': mobj.group('clean_artist') or ', '.join(a.strip() for a in mobj.group('artist').split('·')),
                'track': mobj.group('track').strip(),
                'release_date': release_date,
                'release_year': int_or_none(release_year),
            })

    initial_data = None
    if webpage:
        initial_data = self._extract_yt_initial_variable(
            webpage, self._YT_INITIAL_DATA_RE, video_id,
            'yt initial data')
    if not initial_data:
        initial_data = self._call_api(
            'next', {'videoId': video_id}, video_id, fatal=False)

    if not is_live:
        try:
            # This will error if there is no livechat
            initial_data['contents']['twoColumnWatchNextResults']['conversationBar']['liveChatRenderer']['continuations'][0]['reloadContinuationData']['continuation']
            info['subtitles']['live_chat'] = [{
                'url': 'https://www.youtube.com/watch?v=%s' % video_id,  # url is needed to set cookies
                'video_id': video_id,
                'ext': 'json',
                'protocol': 'youtube_live_chat_replay',
            }]
        except (KeyError, IndexError, TypeError):
            pass

    if initial_data:
        chapters = self._extract_chapters_from_json(
            initial_data, video_id, duration)
        if not chapters:
            def chapter_time(mmlir):
                return parse_duration(
                    get_text(mmlir.get('timeDescription')))

            for engagment_pannel in (initial_data.get('engagementPanels') or []):
                contents = try_get(
                    engagment_pannel, lambda x: x['engagementPanelSectionListRenderer']['content']['macroMarkersListRenderer']['contents'],
                    list)
                if not contents:
                    continue

                chapters = []
                for next_num, content in enumerate(contents, start=1):
                    mmlir = content.get('macroMarkersListItemRenderer') or {}
                    start_time = chapter_time(mmlir)
                    # The next item may lack the renderer; use {} so that
                    # chapter_time yields None instead of failing on None.
                    end_time = chapter_time(try_get(
                        contents, lambda x: x[next_num]['macroMarkersListItemRenderer']) or {}) \
                        if next_num < len(contents) else duration
                    if start_time is None or end_time is None:
                        continue
                    chapters.append({
                        'start_time': start_time,
                        'end_time': end_time,
                        'title': get_text(mmlir.get('title')),
                    })
                if chapters:
                    break
        if chapters:
            info['chapters'] = chapters

        contents = try_get(
            initial_data,
            lambda x: x['contents']['twoColumnWatchNextResults']['results']['results']['contents'],
            list) or []
        for content in contents:
            vpir = content.get('videoPrimaryInfoRenderer')
            if vpir:
                stl = vpir.get('superTitleLink')
                if stl:
                    stl = get_text(stl)
                    if try_get(
                            vpir,
                            lambda x: x['superTitleIcon']['iconType']) == 'LOCATION_PIN':
                        info['location'] = stl
                    else:
                        # get_text may have returned None
                        mobj = re.search(r'(.+?)\s*S(\d+)\s*•\s*E(\d+)', stl) if stl else None
                        if mobj:
                            info.update({
                                'series': mobj.group(1),
                                'season_number': int(mobj.group(2)),
                                'episode_number': int(mobj.group(3)),
                            })
                for tlb in (try_get(
                        vpir,
                        lambda x: x['videoActions']['menuRenderer']['topLevelButtons'],
                        list) or []):
                    tbr = tlb.get('toggleButtonRenderer') or {}
                    for getter, regex in [(
                            lambda x: x['defaultText']['accessibility']['accessibilityData'],
                            r'(?P<count>[\d,]+)\s*(?P<type>(?:dis)?like)'), ([
                                lambda x: x['accessibility'],
                                lambda x: x['accessibilityData']['accessibilityData'],
                            ], r'(?P<type>(?:dis)?like) this video along with (?P<count>[\d,]+) other people')]:
                        label = (try_get(tbr, getter, dict) or {}).get('label')
                        if label:
                            mobj = re.match(regex, label)
                            if mobj:
                                info[mobj.group('type') + '_count'] = str_to_int(mobj.group('count'))
                                break
                sbr_tooltip = try_get(
                    vpir, lambda x: x['sentimentBar']['sentimentBarRenderer']['tooltip'],
                    compat_str)
                if sbr_tooltip:
                    # Expected shape is '<likes> / <dislikes>'; skip anything else.
                    sentiment_counts = sbr_tooltip.split(' / ')
                    if len(sentiment_counts) == 2:
                        info.update({
                            'like_count': str_to_int(sentiment_counts[0]),
                            'dislike_count': str_to_int(sentiment_counts[1]),
                        })
            vsir = content.get('videoSecondaryInfoRenderer')
            if vsir:
                info['channel'] = get_text(try_get(
                    vsir,
                    lambda x: x['owner']['videoOwnerRenderer']['title'],
                    dict))
                rows = try_get(
                    vsir,
                    lambda x: x['metadataRowContainer']['metadataRowContainerRenderer']['rows'],
                    list) or []
                multiple_songs = False
                for row in rows:
                    if try_get(row, lambda x: x['metadataRowRenderer']['hasDividerLine']) is True:
                        multiple_songs = True
                        break
                for row in rows:
                    mrr = row.get('metadataRowRenderer') or {}
                    mrr_title = mrr.get('title')
                    if not mrr_title:
                        continue
                    mrr_title = get_text(mrr['title'])
                    mrr_contents_text = get_text(mrr['contents'][0])
                    if mrr_title == 'License':
                        info['license'] = mrr_contents_text
                    elif not multiple_songs:
                        if mrr_title == 'Album':
                            info['album'] = mrr_contents_text
                        elif mrr_title == 'Artist':
                            info['artist'] = mrr_contents_text
                        elif mrr_title == 'Song':
                            info['track'] = mrr_contents_text

    fallbacks = {
        'channel': 'uploader',
        'channel_id': 'uploader_id',
        'channel_url': 'uploader_url',
    }
    for to, frm in fallbacks.items():
        if not info.get(to):
            info[to] = info.get(frm)

    for s_k, d_k in [('artist', 'creator'), ('track', 'alt_title')]:
        v = info.get(s_k)
        if v:
            info[d_k] = v

    is_private = bool_or_none(video_details.get('isPrivate'))
    is_unlisted = bool_or_none(microformat.get('isUnlisted'))
    is_membersonly = None
    is_premium = None
    if initial_data and is_private is not None:
        is_membersonly = False
        is_premium = False
        contents = try_get(initial_data, lambda x: x['contents']['twoColumnWatchNextResults']['results']['results']['contents'], list)
        for content in contents or []:
            badges = try_get(content, lambda x: x['videoPrimaryInfoRenderer']['badges'], list)
            for badge in badges or []:
                label = try_get(badge, lambda x: x['metadataBadgeRenderer']['label']) or ''
                if label.lower() == 'members only':
                    is_membersonly = True
                    break
                elif label.lower() == 'premium':
                    is_premium = True
                    break
            if is_membersonly or is_premium:
                break

    # TODO: Add this for playlists
    info['availability'] = self._availability(
        is_private=is_private,
        needs_premium=is_premium,
        needs_subscription=is_membersonly,
        needs_auth=info['age_limit'] >= 18,
        is_unlisted=None if is_private is None else is_unlisted)

    # get xsrf for annotations or comments
    get_annotations = self._downloader.params.get('writeannotations', False)
    get_comments = self._downloader.params.get('getcomments', False)
    if get_annotations or get_comments:
        xsrf_token = None
        ytcfg = self._extract_ytcfg(video_id, webpage)
        if ytcfg:
            xsrf_token = try_get(ytcfg, lambda x: x['XSRF_TOKEN'], compat_str)
        if not xsrf_token:
            xsrf_token = self._search_regex(
                r'([\'"])XSRF_TOKEN\1\s*:\s*([\'"])(?P<xsrf_token>(?:(?!\2).)+)\2',
                webpage, 'xsrf token', group='xsrf_token', fatal=False)

    # annotations
    if get_annotations:
        invideo_url = try_get(
            player_response, lambda x: x['annotations'][0]['playerAnnotationsUrlsRenderer']['invideoUrl'], compat_str)
        if xsrf_token and invideo_url:
            xsrf_field_name = None
            if ytcfg:
                xsrf_field_name = try_get(ytcfg, lambda x: x['XSRF_FIELD_NAME'], compat_str)
            if not xsrf_field_name:
                xsrf_field_name = self._search_regex(
                    r'([\'"])XSRF_FIELD_NAME\1\s*:\s*([\'"])(?P<xsrf_field_name>\w+)\2',
                    webpage, 'xsrf field name',
                    group='xsrf_field_name', default='session_token')
            info['annotations'] = self._download_webpage(
                self._proto_relative_url(invideo_url),
                video_id, note='Downloading annotations',
                errnote='Unable to download video annotations', fatal=False,
                data=urlencode_postdata({xsrf_field_name: xsrf_token}))

    if get_comments:
        # Resolve the renderer list explicitly: the 'contents' local is
        # unbound when there is no initial data and may be None after the
        # availability check above.
        comment_contents = try_get(
            initial_data,
            lambda x: x['contents']['twoColumnWatchNextResults']['results']['results']['contents'],
            list) or []
        info['__post_extractor'] = lambda: self._extract_comments(ytcfg, video_id, comment_contents, webpage, xsrf_token)

    self.mark_watched(video_id, player_response)

    return info
c5e8d7af 2397
5f6a1245 2398
8bdd16b4 2399class YoutubeTabIE(YoutubeBaseInfoExtractor):
2400 IE_DESC = 'YouTube.com tab'
70d5c17b 2401 _VALID_URL = r'''(?x)
2402 https?://
2403 (?:\w+\.)?
2404 (?:
2405 youtube(?:kids)?\.com|
2406 invidio\.us
2407 )/
2408 (?:
2409 (?:channel|c|user)/|
2410 (?P<not_channel>
9ba5705a 2411 feed/|hashtag/|
70d5c17b 2412 (?:playlist|watch)\?.*?\blist=
2413 )|
29f7c58a 2414 (?!(?:%s)\b) # Direct URLs
70d5c17b 2415 )
2416 (?P<id>[^/?\#&]+)
2417 ''' % YoutubeBaseInfoExtractor._RESERVED_NAMES
8bdd16b4 2418 IE_NAME = 'youtube:tab'
2419
81127aa5 2420 _TESTS = [{
8bdd16b4 2421 # playlists, multipage
2422 'url': 'https://www.youtube.com/c/ИгорьКлейнер/playlists?view=1&flow=grid',
2423 'playlist_mincount': 94,
2424 'info_dict': {
2425 'id': 'UCqj7Cz7revf5maW9g5pgNcg',
2426 'title': 'Игорь Клейнер - Playlists',
2427 'description': 'md5:be97ee0f14ee314f1f002cf187166ee2',
deaec5af 2428 'uploader': 'Игорь Клейнер',
2429 'uploader_id': 'UCqj7Cz7revf5maW9g5pgNcg',
8bdd16b4 2430 },
2431 }, {
2432 # playlists, multipage, different order
2433 'url': 'https://www.youtube.com/user/igorkle1/playlists?view=1&sort=dd',
2434 'playlist_mincount': 94,
2435 'info_dict': {
2436 'id': 'UCqj7Cz7revf5maW9g5pgNcg',
2437 'title': 'Игорь Клейнер - Playlists',
2438 'description': 'md5:be97ee0f14ee314f1f002cf187166ee2',
deaec5af 2439 'uploader_id': 'UCqj7Cz7revf5maW9g5pgNcg',
2440 'uploader': 'Игорь Клейнер',
8bdd16b4 2441 },
2442 }, {
2443 # playlists, singlepage
2444 'url': 'https://www.youtube.com/user/ThirstForScience/playlists',
2445 'playlist_mincount': 4,
2446 'info_dict': {
2447 'id': 'UCAEtajcuhQ6an9WEzY9LEMQ',
2448 'title': 'ThirstForScience - Playlists',
2449 'description': 'md5:609399d937ea957b0f53cbffb747a14c',
deaec5af 2450 'uploader': 'ThirstForScience',
2451 'uploader_id': 'UCAEtajcuhQ6an9WEzY9LEMQ',
8bdd16b4 2452 }
2453 }, {
2454 'url': 'https://www.youtube.com/c/ChristophLaimer/playlists',
2455 'only_matching': True,
2456 }, {
2457 # basic, single video playlist
0e30a7b9 2458 'url': 'https://www.youtube.com/playlist?list=PL4lCao7KL_QFVb7Iudeipvc2BCavECqzc',
81127aa5 2459 'info_dict': {
0e30a7b9 2460 'uploader_id': 'UCmlqkdCBesrv2Lak1mF_MxA',
2461 'uploader': 'Sergey M.',
2462 'id': 'PL4lCao7KL_QFVb7Iudeipvc2BCavECqzc',
3867038a 2463 'title': 'youtube-dl public playlist',
81127aa5 2464 },
0e30a7b9 2465 'playlist_count': 1,
9291475f 2466 }, {
8bdd16b4 2467 # empty playlist
0e30a7b9 2468 'url': 'https://www.youtube.com/playlist?list=PL4lCao7KL_QFodcLWhDpGCYnngnHtQ-Xf',
9291475f 2469 'info_dict': {
0e30a7b9 2470 'uploader_id': 'UCmlqkdCBesrv2Lak1mF_MxA',
2471 'uploader': 'Sergey M.',
2472 'id': 'PL4lCao7KL_QFodcLWhDpGCYnngnHtQ-Xf',
3867038a 2473 'title': 'youtube-dl empty playlist',
9291475f
PH
2474 },
2475 'playlist_count': 0,
2476 }, {
8bdd16b4 2477 # Home tab
2478 'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w/featured',
9291475f 2479 'info_dict': {
8bdd16b4 2480 'id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
2481 'title': 'lex will - Home',
2482 'description': 'md5:2163c5d0ff54ed5f598d6a7e6211e488',
deaec5af 2483 'uploader': 'lex will',
2484 'uploader_id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
9291475f 2485 },
8bdd16b4 2486 'playlist_mincount': 2,
9291475f 2487 }, {
8bdd16b4 2488 # Videos tab
2489 'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w/videos',
9291475f 2490 'info_dict': {
8bdd16b4 2491 'id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
2492 'title': 'lex will - Videos',
2493 'description': 'md5:2163c5d0ff54ed5f598d6a7e6211e488',
deaec5af 2494 'uploader': 'lex will',
2495 'uploader_id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
9291475f 2496 },
8bdd16b4 2497 'playlist_mincount': 975,
9291475f 2498 }, {
8bdd16b4 2499 # Videos tab, sorted by popular
2500 'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w/videos?view=0&sort=p&flow=grid',
9291475f 2501 'info_dict': {
8bdd16b4 2502 'id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
2503 'title': 'lex will - Videos',
2504 'description': 'md5:2163c5d0ff54ed5f598d6a7e6211e488',
deaec5af 2505 'uploader': 'lex will',
2506 'uploader_id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
9291475f 2507 },
8bdd16b4 2508 'playlist_mincount': 199,
9291475f 2509 }, {
8bdd16b4 2510 # Playlists tab
2511 'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w/playlists',
9291475f 2512 'info_dict': {
8bdd16b4 2513 'id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
2514 'title': 'lex will - Playlists',
2515 'description': 'md5:2163c5d0ff54ed5f598d6a7e6211e488',
deaec5af 2516 'uploader': 'lex will',
2517 'uploader_id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
9291475f 2518 },
8bdd16b4 2519 'playlist_mincount': 17,
ac7553d0 2520 }, {
8bdd16b4 2521 # Community tab
2522 'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w/community',
ac7553d0 2523 'info_dict': {
8bdd16b4 2524 'id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
2525 'title': 'lex will - Community',
2526 'description': 'md5:2163c5d0ff54ed5f598d6a7e6211e488',
deaec5af 2527 'uploader': 'lex will',
2528 'uploader_id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
8bdd16b4 2529 },
2530 'playlist_mincount': 18,
87dadd45 2531 }, {
8bdd16b4 2532 # Channels tab
2533 'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w/channels',
87dadd45 2534 'info_dict': {
8bdd16b4 2535 'id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
2536 'title': 'lex will - Channels',
2537 'description': 'md5:2163c5d0ff54ed5f598d6a7e6211e488',
deaec5af 2538 'uploader': 'lex will',
2539 'uploader_id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
8bdd16b4 2540 },
deaec5af 2541 'playlist_mincount': 12,
6b08cdf6 2542 }, {
a0566bbf 2543 'url': 'https://invidio.us/channel/UCmlqkdCBesrv2Lak1mF_MxA',
8bdd16b4 2544 'only_matching': True,
2545 }, {
a0566bbf 2546 'url': 'https://www.youtubekids.com/channel/UCmlqkdCBesrv2Lak1mF_MxA',
8bdd16b4 2547 'only_matching': True,
2548 }, {
a0566bbf 2549 'url': 'https://music.youtube.com/channel/UCmlqkdCBesrv2Lak1mF_MxA',
8bdd16b4 2550 'only_matching': True,
2551 }, {
2552 'note': 'Playlist with deleted videos (#651). As a bonus, the video #51 is also twice in this list.',
2553 'url': 'https://www.youtube.com/playlist?list=PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC',
2554 'info_dict': {
2555 'title': '29C3: Not my department',
2556 'id': 'PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC',
2557 'uploader': 'Christiaan008',
2558 'uploader_id': 'UCEPzS1rYsrkqzSLNp76nrcg',
deaec5af 2559 'description': 'md5:a14dc1a8ef8307a9807fe136a0660268',
8bdd16b4 2560 },
2561 'playlist_count': 96,
2562 }, {
2563 'note': 'Large playlist',
2564 'url': 'https://www.youtube.com/playlist?list=UUBABnxM4Ar9ten8Mdjj1j0Q',
6b08cdf6 2565 'info_dict': {
8bdd16b4 2566 'title': 'Uploads from Cauchemar',
2567 'id': 'UUBABnxM4Ar9ten8Mdjj1j0Q',
2568 'uploader': 'Cauchemar',
2569 'uploader_id': 'UCBABnxM4Ar9ten8Mdjj1j0Q',
13a75688 2570 },
8bdd16b4 2571 'playlist_mincount': 1123,
2572 }, {
2573 # even larger playlist, 8832 videos
2574 'url': 'http://www.youtube.com/user/NASAgovVideo/videos',
2575 'only_matching': True,
4b7df0d3
JMF
2576 }, {
2577 'note': 'Buggy playlist: the webpage has a "Load more" button but it doesn\'t have more videos',
2578 'url': 'https://www.youtube.com/playlist?list=UUXw-G3eDE9trcvY2sBMM_aA',
2579 'info_dict': {
acf757f4
PH
2580 'title': 'Uploads from Interstellar Movie',
2581 'id': 'UUXw-G3eDE9trcvY2sBMM_aA',
13a75688 2582 'uploader': 'Interstellar Movie',
8bdd16b4 2583 'uploader_id': 'UCXw-G3eDE9trcvY2sBMM_aA',
4b7df0d3 2584 },
481cc733 2585 'playlist_mincount': 21,
8bdd16b4 2586 }, {
2587 # https://github.com/ytdl-org/youtube-dl/issues/21844
2588 'url': 'https://www.youtube.com/playlist?list=PLzH6n4zXuckpfMu_4Ff8E7Z1behQks5ba',
2589 'info_dict': {
2590 'title': 'Data Analysis with Dr Mike Pound',
2591 'id': 'PLzH6n4zXuckpfMu_4Ff8E7Z1behQks5ba',
2592 'uploader_id': 'UC9-y-6csu5WGm29I7JiwpnA',
2593 'uploader': 'Computerphile',
deaec5af 2594 'description': 'md5:7f567c574d13d3f8c0954d9ffee4e487',
8bdd16b4 2595 },
2596 'playlist_mincount': 11,
2597 }, {
a0566bbf 2598 'url': 'https://invidio.us/playlist?list=PL4lCao7KL_QFVb7Iudeipvc2BCavECqzc',
8bdd16b4 2599 'only_matching': True,
dacb3a86
S
2600 }, {
2601 # Playlist URL that does not actually serve a playlist
2602 'url': 'https://www.youtube.com/watch?v=FqZTN594JQw&list=PLMYEtVRpaqY00V9W81Cwmzp6N6vZqfUKD4',
2603 'info_dict': {
2604 'id': 'FqZTN594JQw',
2605 'ext': 'webm',
2606 'title': "Smiley's People 01 detective, Adventure Series, Action",
2607 'uploader': 'STREEM',
2608 'uploader_id': 'UCyPhqAZgwYWZfxElWVbVJng',
ec85ded8 2609 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UCyPhqAZgwYWZfxElWVbVJng',
dacb3a86
S
2610 'upload_date': '20150526',
2611 'license': 'Standard YouTube License',
2612 'description': 'md5:507cdcb5a49ac0da37a920ece610be80',
2613 'categories': ['People & Blogs'],
2614 'tags': list,
dbdaaa23 2615 'view_count': int,
dacb3a86
S
2616 'like_count': int,
2617 'dislike_count': int,
2618 },
2619 'params': {
2620 'skip_download': True,
2621 },
13a75688 2622 'skip': 'This video is not available.',
dacb3a86 2623 'add_ie': [YoutubeIE.ie_key()],
481cc733 2624 }, {
8bdd16b4 2625 'url': 'https://www.youtubekids.com/watch?v=Agk7R8I8o5U&list=PUZ6jURNr1WQZCNHF0ao-c0g',
c0345b82 2626 'only_matching': True,
66b48727 2627 }, {
8bdd16b4 2628 'url': 'https://www.youtube.com/watch?v=MuAGGZNfUkU&list=RDMM',
66b48727 2629 'only_matching': True,
a0566bbf 2630 }, {
2631 'url': 'https://www.youtube.com/channel/UCoMdktPbSTixAyNGwb-UYkQ/live',
2632 'info_dict': {
2633 'id': '9Auq9mYxFEE',
2634 'ext': 'mp4',
deaec5af 2635 'title': compat_str,
a0566bbf 2636 'uploader': 'Sky News',
2637 'uploader_id': 'skynews',
2638 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/skynews',
2639 'upload_date': '20191102',
deaec5af 2640 'description': 'md5:85ddd75d888674631aaf9599a9a0b0ae',
a0566bbf 2641 'categories': ['News & Politics'],
2642 'tags': list,
2643 'like_count': int,
2644 'dislike_count': int,
2645 },
2646 'params': {
2647 'skip_download': True,
2648 },
2649 }, {
2650 'url': 'https://www.youtube.com/user/TheYoungTurks/live',
2651 'info_dict': {
2652 'id': 'a48o2S1cPoo',
2653 'ext': 'mp4',
2654 'title': 'The Young Turks - Live Main Show',
2655 'uploader': 'The Young Turks',
2656 'uploader_id': 'TheYoungTurks',
2657 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/TheYoungTurks',
2658 'upload_date': '20150715',
2659 'license': 'Standard YouTube License',
2660 'description': 'md5:438179573adcdff3c97ebb1ee632b891',
2661 'categories': ['News & Politics'],
2662 'tags': ['Cenk Uygur (TV Program Creator)', 'The Young Turks (Award-Winning Work)', 'Talk Show (TV Genre)'],
2663 'like_count': int,
2664 'dislike_count': int,
2665 },
2666 'params': {
2667 'skip_download': True,
2668 },
2669 'only_matching': True,
2670 }, {
2671 'url': 'https://www.youtube.com/channel/UC1yBKRuGpC1tSM73A0ZjYjQ/live',
2672 'only_matching': True,
2673 }, {
2674 'url': 'https://www.youtube.com/c/CommanderVideoHq/live',
2675 'only_matching': True,
3d3dddc9 2676 }, {
2677 'url': 'https://www.youtube.com/feed/trending',
2678 'only_matching': True,
2679 }, {
2680 # needs auth
2681 'url': 'https://www.youtube.com/feed/library',
2682 'only_matching': True,
2683 }, {
2684 # needs auth
2685 'url': 'https://www.youtube.com/feed/history',
2686 'only_matching': True,
2687 }, {
2688 # needs auth
2689 'url': 'https://www.youtube.com/feed/subscriptions',
2690 'only_matching': True,
2691 }, {
2692 # needs auth
2693 'url': 'https://www.youtube.com/feed/watch_later',
2694 'only_matching': True,
2695 }, {
2696 # no longer available?
2697 'url': 'https://www.youtube.com/feed/recommended',
2698 'only_matching': True,
29f7c58a 2699 }, {
2700 # inline playlist with not always working continuations
2701 'url': 'https://www.youtube.com/watch?v=UC6u0Tct-Fo&list=PL36D642111D65BE7C',
2702 'only_matching': True,
2703 }, {
2704 'url': 'https://www.youtube.com/course?list=ECUl4u3cNGP61MdtwGTqZA0MreSaDybji8',
2705 'only_matching': True,
2706 }, {
2707 'url': 'https://www.youtube.com/course',
2708 'only_matching': True,
2709 }, {
2710 'url': 'https://www.youtube.com/zsecurity',
2711 'only_matching': True,
2712 }, {
2713 'url': 'http://www.youtube.com/NASAgovVideo/videos',
2714 'only_matching': True,
2715 }, {
2716 'url': 'https://www.youtube.com/TheYoungTurks/live',
2717 'only_matching': True,
39ed931e 2718 }, {
2719 'url': 'https://www.youtube.com/hashtag/cctv9',
2720 'info_dict': {
2721 'id': 'cctv9',
2722 'title': '#cctv9',
2723 },
2724 'playlist_mincount': 350,
29f7c58a 2725 }]
2726
@classmethod
def suitable(cls, url):
    """Return False for URLs that YoutubeIE accepts; otherwise defer to the inherited check."""
    if YoutubeIE.suitable(url):
        return False
    return super(YoutubeTabIE, cls).suitable(url)
8bdd16b4 2731
def _extract_channel_id(self, webpage):
    """Return the channel ID found in a channel webpage.

    The ``channelId`` meta tag is used when present. Otherwise the ID is
    parsed out of a channel URL taken from one of the canonical / app-link
    URL meta tags listed below.
    """
    channel_id = self._html_search_meta(
        'channelId', webpage, 'channel id', default=None)
    if channel_id:
        return channel_id
    channel_url = self._html_search_meta(
        ('og:url', 'al:ios:url', 'al:android:url', 'al:web:url',
         'twitter:url', 'twitter:app:url:iphone', 'twitter:app:url:ipad',
         'twitter:app:url:googleplay'), webpage, 'channel url')
    # The quantifier has to be inside the capture group: with `(...)+` a
    # repeated group only keeps its last repetition, i.e. a single character.
    return self._search_regex(
        r'https?://(?:www\.)?youtube\.com/channel/([^/?#&]+)',
        channel_url, 'channel id')
15f6397c 2744
@staticmethod
def _extract_basic_item_renderer(item):
    """Return the first value of *item* stored under a known renderer key, or None."""
    # Modified from _extract_grid_item_renderer
    recognised_keys = (
        'playlistRenderer', 'videoRenderer', 'channelRenderer',
        'gridPlaylistRenderer', 'gridVideoRenderer', 'gridChannelRenderer'
    )
    return next(
        (payload for name, payload in item.items() if name in recognised_keys),
        None)
8bdd16b4 2756
def _grid_entries(self, grid_renderer):
    """Yield results for the playlists, videos and channels listed in ``grid_renderer['items']``.

    Items that are not dicts, or that hold no known renderer, are skipped.
    """
    for grid_item in grid_renderer['items']:
        if not isinstance(grid_item, dict):
            continue
        renderer = self._extract_basic_item_renderer(grid_item)
        if not isinstance(renderer, dict):
            continue

        # playlist
        playlist_id = renderer.get('playlistId')
        if playlist_id:
            playlist_title = try_get(
                renderer, lambda x: x['title']['runs'][0]['text'], compat_str)
            yield self.url_result(
                'https://www.youtube.com/playlist?list=%s' % playlist_id,
                ie=YoutubeTabIE.ie_key(), video_id=playlist_id,
                video_title=playlist_title)

        # video
        if renderer.get('videoId'):
            yield self._extract_video(renderer)

        # channel
        channel_id = renderer.get('channelId')
        if channel_id:
            channel_title = try_get(
                renderer, lambda x: x['title']['simpleText'], compat_str)
            yield self.url_result(
                'https://www.youtube.com/channel/%s' % channel_id,
                ie=YoutubeTabIE.ie_key(), video_title=channel_title)
2785
def _shelf_entries_from_content(self, shelf_renderer):
    """Yield the entries embedded in a shelf's ``content`` dict.

    Only ``gridRenderer`` / ``expandedShelfContentsRenderer`` content is
    processed; ``horizontalListRenderer`` content is not handled yet (TODO).
    """
    shelf_content = shelf_renderer.get('content')
    if not isinstance(shelf_content, dict):
        return
    grid = (
        shelf_content.get('gridRenderer')
        or shelf_content.get('expandedShelfContentsRenderer'))
    if not grid:
        return
    # TODO: add support for nested playlists so each shelf is processed
    # as separate playlist
    # TODO: this includes only first N items
    for entry in self._grid_entries(grid):
        yield entry
8bdd16b4 2801
def _shelf_entries(self, shelf_renderer, skip_channels=False):
    """Yield a link to the shelf's own page (if it has one), then its inline entries.

    With ``skip_channels`` set, a shelf whose URL contains ``/channels?``
    produces nothing at all.
    """
    endpoint_path = try_get(
        shelf_renderer,
        lambda x: x['endpoint']['commandMetadata']['webCommandMetadata']['url'],
        compat_str)
    shelf_url = urljoin('https://www.youtube.com', endpoint_path)
    # Skipping links to other channels; note that checking for
    # endpoint.commandMetadata.webCommandMetadata.webPageType == WEB_PAGE_TYPE_CHANNEL
    # will not work
    if shelf_url and skip_channels and '/channels?' in shelf_url:
        return
    if shelf_url:
        shelf_title = try_get(
            shelf_renderer, lambda x: x['title']['runs'][0]['text'], compat_str)
        yield self.url_result(shelf_url, video_title=shelf_title)
    # Shelf may not contain shelf URL, fallback to extraction from content
    for entry in self._shelf_entries_from_content(shelf_renderer):
        yield entry
c5e8d7af 2819
def _playlist_entries(self, video_list_renderer):
    """Yield one extracted video per playlist item in ``video_list_renderer['contents']``.

    Items without a ``playlistVideoRenderer`` / ``playlistPanelVideoRenderer``
    dict, or without a ``videoId``, are skipped.
    """
    for item in video_list_renderer['contents']:
        if not isinstance(item, dict):
            continue
        video_renderer = (
            item.get('playlistVideoRenderer')
            or item.get('playlistPanelVideoRenderer'))
        if isinstance(video_renderer, dict) and video_renderer.get('videoId'):
            yield self._extract_video(video_renderer)
07aeced6 2831
def _rich_entries(self, rich_grid_renderer):
    """Yield the video found at ``content.videoRenderer``, if it has a ``videoId``."""
    video_renderer = try_get(
        rich_grid_renderer, lambda x: x['content']['videoRenderer'], dict)
    if video_renderer and video_renderer.get('videoId'):
        yield self._extract_video(video_renderer)
2839
def _video_entry(self, video_renderer):
    """Return the extracted video for *video_renderer*, or None when it has no ``videoId``."""
    if not video_renderer.get('videoId'):
        return None
    return self._extract_video(video_renderer)
dacb3a86 2844
def _post_thread_entries(self, post_thread_renderer):
    """Yield the videos referenced by a backstage post thread.

    The attached video (if any) is yielded first, followed by every inline
    link in the post text that YoutubeIE can handle. An inline link that
    points at the attached video is skipped so it is not emitted twice.
    """
    post_renderer = try_get(
        post_thread_renderer, lambda x: x['post']['backstagePostRenderer'], dict)
    if not post_renderer:
        return
    # video attachment
    video_renderer = try_get(
        post_renderer, lambda x: x['backstageAttachment']['videoRenderer'], dict) or {}
    # Remember the attachment's ID; the duplicate check below depends on it
    video_id = video_renderer.get('videoId')
    if video_id:
        entry = self._video_entry(video_renderer)
        if entry:
            yield entry
    # inline video links
    runs = try_get(post_renderer, lambda x: x['contentText']['runs'], list) or []
    for run in runs:
        if not isinstance(run, dict):
            continue
        ep_url = try_get(
            run, lambda x: x['navigationEndpoint']['urlEndpoint']['url'], compat_str)
        if not ep_url:
            continue
        if not YoutubeIE.suitable(ep_url):
            continue
        ep_video_id = YoutubeIE._match_id(ep_url)
        if video_id == ep_video_id:
            continue
        # Tag the result with the linked video's own ID, not the attachment's
        yield self.url_result(ep_url, ie=YoutubeIE.ie_key(), video_id=ep_video_id)
dacb3a86 2873
def _post_thread_continuation_entries(self, post_thread_continuation):
    """Yield the entries of every post thread in a continuation's ``contents`` list.

    Nothing is yielded when ``contents`` is missing or is not a list.
    """
    contents = post_thread_continuation.get('contents')
    if not isinstance(contents, list):
        return
    for content in contents:
        # Skip malformed items rather than failing on `.get`
        if not isinstance(content, dict):
            continue
        renderer = content.get('backstagePostThreadRenderer')
        if not isinstance(renderer, dict):
            continue
        for entry in self._post_thread_entries(renderer):
            yield entry
07aeced6 2884
39ed931e 2885 r''' # unused
2886 def _rich_grid_entries(self, contents):
2887 for content in contents:
2888 video_renderer = try_get(content, lambda x: x['richItemRenderer']['content']['videoRenderer'], dict)
2889 if video_renderer:
2890 entry = self._video_entry(video_renderer)
2891 if entry:
2892 yield entry
2893 '''
2894
@staticmethod
def _build_continuation_query(continuation, ctp=None):
    """Build the query dict for a continuation token.

    The token is stored under both 'ctoken' and 'continuation'; 'itct' is
    added only when *ctp* is truthy.
    """
    query = dict.fromkeys(('ctoken', 'continuation'), continuation)
    if ctp:
        query['itct'] = ctp
    return query
2904
@staticmethod
def _extract_next_continuation_data(renderer):
    """Return a continuation query built from ``continuations[0].nextContinuationData``.

    Returns None when that object or its 'continuation' token is missing.
    """
    next_data = try_get(
        renderer, lambda x: x['continuations'][0]['nextContinuationData'], dict) or {}
    token = next_data.get('continuation')
    if not token:
        return None
    return YoutubeTabIE._build_continuation_query(
        token, next_data.get('clickTrackingParams'))
c5e8d7af 2916
@classmethod
def _extract_continuation(cls, renderer):
    """Return the continuation query for *renderer*, or None if there is none.

    ``nextContinuationData`` is tried first; failing that, the renderer's
    'contents' and 'items' lists are scanned for a ``continuationItemRenderer``
    carrying a continuation token.
    """
    query = cls._extract_next_continuation_data(renderer)
    if query:
        return query
    candidates = (
        (try_get(renderer, lambda x: x['contents'], list) or [])
        + (try_get(renderer, lambda x: x['items'], list) or []))
    for candidate in candidates:
        if not isinstance(candidate, dict):
            continue
        endpoint = try_get(
            candidate, lambda x: x['continuationItemRenderer']['continuationEndpoint'],
            dict)
        if not endpoint:
            continue
        token = try_get(
            endpoint, lambda x: x['continuationCommand']['token'], compat_str)
        if token:
            return YoutubeTabIE._build_continuation_query(
                token, endpoint.get('clickTrackingParams'))
    return None
448830ce 2939
def _entries(self, tab, item_id, identity_token, account_syncid):
    """Yield every entry of a tab, following continuation pages.

    ``tab`` is the selected tab renderer: its 'content' is expected to hold
    a ``sectionListRenderer`` or a ``richGridRenderer``, and its 'title' is
    consulted to decide whether channel shelves are skipped. ``item_id`` is
    only used to label the continuation requests. ``identity_token`` and
    ``account_syncid``, when truthy, are sent as request headers.

    Each continuation request is retried up to the 'extractor_retries'
    parameter (default 3) on HTTP 500/503/404 and on incomplete responses.
    """

    def extract_entries(parent_renderer):  # this needs to be called again for continuation to work with feeds
        # Besides yielding entries, this stores the next continuation query
        # (or None) in continuation_list[0] for the outer generator to read.
        contents = try_get(parent_renderer, lambda x: x['contents'], list) or []
        for content in contents:
            if not isinstance(content, dict):
                continue
            is_renderer = try_get(content, lambda x: x['itemSectionRenderer'], dict)
            if not is_renderer:
                renderer = content.get('richItemRenderer')
                if renderer:
                    for entry in self._rich_entries(renderer):
                        yield entry
                    continuation_list[0] = self._extract_continuation(parent_renderer)
                continue
            isr_contents = try_get(is_renderer, lambda x: x['contents'], list) or []
            for isr_content in isr_contents:
                if not isinstance(isr_content, dict):
                    continue

                known_renderers = {
                    'playlistVideoListRenderer': self._playlist_entries,
                    'gridRenderer': self._grid_entries,
                    'shelfRenderer': lambda x: self._shelf_entries(x, tab.get('title') != 'Channels'),
                    'backstagePostThreadRenderer': self._post_thread_entries,
                    'videoRenderer': lambda x: [self._video_entry(x)],
                }
                for key, renderer in isr_content.items():
                    if key not in known_renderers:
                        continue
                    for entry in known_renderers[key](renderer):
                        if entry:
                            yield entry
                    continuation_list[0] = self._extract_continuation(renderer)
                    break

            if not continuation_list[0]:
                continuation_list[0] = self._extract_continuation(is_renderer)

        if not continuation_list[0]:
            continuation_list[0] = self._extract_continuation(parent_renderer)

    continuation_list = [None]  # Python 2 does not support nonlocal
    tab_content = try_get(tab, lambda x: x['content'], dict)
    if not tab_content:
        return
    parent_renderer = (
        try_get(tab_content, lambda x: x['sectionListRenderer'], dict)
        or try_get(tab_content, lambda x: x['richGridRenderer'], dict) or {})
    for entry in extract_entries(parent_renderer):
        yield entry
    continuation = continuation_list[0]

    headers = {
        'x-youtube-client-name': '1',
        'x-youtube-client-version': '2.20201112.04.01',
    }
    if identity_token:
        headers['x-youtube-identity-token'] = identity_token

    if account_syncid:
        headers['X-Goog-PageId'] = account_syncid
        headers['X-Goog-AuthUser'] = 0

    for page_num in itertools.count(1):
        if not continuation:
            break
        # 'itct' is only present in the continuation query when click
        # tracking params were found, so it must not be indexed blindly
        query = {'continuation': continuation['continuation']}
        click_tracking_params = continuation.get('itct')
        if click_tracking_params:
            query['clickTracking'] = {'clickTrackingParams': click_tracking_params}

        retries = self._downloader.params.get('extractor_retries', 3)
        count = -1
        last_error = None
        while count < retries:
            count += 1
            if last_error:
                self.report_warning('%s. Retrying ...' % last_error)
            try:
                response = self._call_api(
                    ep='browse', fatal=True, headers=headers,
                    video_id='%s page %s' % (item_id, page_num),
                    query=query,
                    note='Downloading API JSON%s' % (' (retry #%d)' % count if count else ''))
            except ExtractorError as e:
                if isinstance(e.cause, compat_HTTPError) and e.cause.code in (500, 503, 404):
                    # Downloading page may result in intermittent 5xx HTTP error
                    # Sometimes a 404 is also received. See: https://github.com/ytdl-org/youtube-dl/issues/28289
                    last_error = 'HTTP Error %s' % e.cause.code
                    if count < retries:
                        continue
                raise
            else:
                # Youtube sometimes sends incomplete data
                # See: https://github.com/ytdl-org/youtube-dl/issues/28194
                if dict_get(response,
                            ('continuationContents', 'onResponseReceivedActions', 'onResponseReceivedEndpoints')):
                    break

                # Youtube may send alerts if there was an issue with the continuation page
                self._extract_alerts(response, expected=False)

                last_error = 'Incomplete data received'
                if count >= retries:
                    self._downloader.report_error(last_error)

        if not response:
            break

        # First response shape: entries under 'continuationContents'
        known_continuation_renderers = {
            'playlistVideoListContinuation': self._playlist_entries,
            'gridContinuation': self._grid_entries,
            'itemSectionContinuation': self._post_thread_continuation_entries,
            'sectionListContinuation': extract_entries,  # for feeds
        }
        continuation_contents = try_get(
            response, lambda x: x['continuationContents'], dict) or {}
        continuation_renderer = None
        for key, value in continuation_contents.items():
            if key not in known_continuation_renderers:
                continue
            continuation_renderer = value
            # Fresh cell so extract_entries (if used) reports this page's continuation
            continuation_list = [None]
            for entry in known_continuation_renderers[key](continuation_renderer):
                yield entry
            continuation = continuation_list[0] or self._extract_continuation(continuation_renderer)
            break
        if continuation_renderer:
            continue

        # Second response shape: items appended via 'appendContinuationItemsAction'.
        # Each value is (entry generator, key under which it expects the item list).
        known_renderers = {
            'gridPlaylistRenderer': (self._grid_entries, 'items'),
            'gridVideoRenderer': (self._grid_entries, 'items'),
            'playlistVideoRenderer': (self._playlist_entries, 'contents'),
            'itemSectionRenderer': (extract_entries, 'contents'),  # for feeds
            'richItemRenderer': (extract_entries, 'contents'),  # for hashtag
            'backstagePostThreadRenderer': (self._post_thread_continuation_entries, 'contents')
        }
        on_response_received = dict_get(response, ('onResponseReceivedActions', 'onResponseReceivedEndpoints'))
        continuation_items = try_get(
            on_response_received, lambda x: x[0]['appendContinuationItemsAction']['continuationItems'], list)
        continuation_item = try_get(continuation_items, lambda x: x[0], dict) or {}
        video_items_renderer = None
        for key, value in continuation_item.items():
            if key not in known_renderers:
                continue
            video_items_renderer = {known_renderers[key][1]: continuation_items}
            continuation_list = [None]
            for entry in known_renderers[key][0](video_items_renderer):
                yield entry
            continuation = continuation_list[0] or self._extract_continuation(video_items_renderer)
            break
        if video_items_renderer:
            continue
        break
9558dcec 3094
@staticmethod
def _extract_selected_tab(tabs):
    """Return the ``tabRenderer`` of the first tab flagged as selected.

    Raises ExtractorError when no tab is selected.
    """
    for candidate in tabs:
        is_selected = try_get(
            candidate, lambda x: x['tabRenderer']['selected'], bool)
        if is_selected:
            return candidate['tabRenderer']
    raise ExtractorError('Unable to find selected tab')
b82f815f 3102
@staticmethod
def _extract_uploader(data):
    """Collect uploader name, ID and URL from the playlist sidebar's video owner.

    Returns a dict holding whichever of 'uploader', 'uploader_id' and
    'uploader_url' were found; keys whose value is None are left out.
    """
    found = {}
    sidebar_items = try_get(
        data, lambda x: x['sidebar']['playlistSidebarRenderer']['items'], list) or []
    for sidebar_item in sidebar_items:
        if not isinstance(sidebar_item, dict):
            continue
        secondary_info = sidebar_item.get('playlistSidebarSecondaryInfoRenderer')
        if not isinstance(secondary_info, dict):
            continue
        owner = try_get(
            secondary_info,
            lambda x: x['videoOwner']['videoOwnerRenderer']['title']['runs'][0], dict)
        if not owner:
            continue
        found['uploader'] = owner.get('text')
        found['uploader_id'] = try_get(
            owner, lambda x: x['navigationEndpoint']['browseEndpoint']['browseId'], compat_str)
        base_url = try_get(
            owner, lambda x: x['navigationEndpoint']['browseEndpoint']['canonicalBaseUrl'], compat_str)
        found['uploader_url'] = urljoin('https://www.youtube.com/', base_url)
    return dict((key, value) for key, value in found.items() if value is not None)
8bdd16b4 3125
def _extract_from_tabs(self, item_id, webpage, data, tabs):
    """Build the playlist result for a tabbed page.

    Metadata comes from ``channelMetadataRenderer`` when present, otherwise
    from ``playlistMetadataRenderer``. The playlist ID falls back to
    *item_id*, and the title to the hashtag header text or the playlist ID;
    the selected tab's title is then appended via ``format_field``. When no
    channel ID is known, uploader fields are filled from the playlist sidebar.
    """
    selected_tab = self._extract_selected_tab(tabs)

    channel_name = channel_url = channel_id = None
    metadata_renderer = try_get(
        data, lambda x: x['metadata']['channelMetadataRenderer'], dict)
    if metadata_renderer:
        channel_name = metadata_renderer.get('title')
        channel_url = metadata_renderer.get('channelUrl')
        channel_id = metadata_renderer.get('externalId')
    else:
        metadata_renderer = try_get(
            data, lambda x: x['metadata']['playlistMetadataRenderer'], dict)

    playlist_id = title = description = None
    tags = []
    raw_thumbnails = []
    if metadata_renderer:
        title = metadata_renderer.get('title')
        description = metadata_renderer.get('description', '')
        playlist_id = channel_id
        tags = metadata_renderer.get('keywords', '').split()
        raw_thumbnails = (
            try_get(metadata_renderer, lambda x: x['avatar']['thumbnails'], list)
            or try_get(
                data,
                lambda x: x['sidebar']['playlistSidebarRenderer']['items'][0]['playlistSidebarPrimaryInfoRenderer']['thumbnailRenderer']['playlistVideoThumbnailRenderer']['thumbnail']['thumbnails'],
                list)
            or [])

    # Keep only dict thumbnails that carry a usable URL
    thumbnails = []
    for raw in raw_thumbnails:
        thumbnail_url = url_or_none(raw.get('url')) if isinstance(raw, dict) else None
        if thumbnail_url:
            thumbnails.append({
                'url': thumbnail_url,
                'width': int_or_none(raw.get('width')),
                'height': int_or_none(raw.get('height')),
            })

    if playlist_id is None:
        playlist_id = item_id
    if title is None:
        hashtag = try_get(
            data, lambda x: x['header']['hashtagHeaderRenderer']['hashtag']['simpleText'])
        title = hashtag or playlist_id
    title += format_field(selected_tab, 'title', ' - %s')

    metadata = {
        'playlist_id': playlist_id,
        'playlist_title': title,
        'playlist_description': description,
        'uploader': channel_name,
        'uploader_id': channel_id,
        'uploader_url': channel_url,
        'thumbnails': thumbnails,
        'tags': tags,
    }
    if not channel_id:
        metadata.update(self._extract_uploader(data))
    # Mirror the uploader fields under their channel_* names
    for alias, source in (
            ('channel', 'uploader'),
            ('channel_id', 'uploader_id'),
            ('channel_url', 'uploader_url')):
        metadata[alias] = metadata[source]

    identity_token = self._extract_identity_token(webpage, item_id)
    account_syncid = self._extract_account_syncid(data)
    return self.playlist_result(
        self._entries(selected_tab, playlist_id, identity_token, account_syncid),
        **metadata)
73c4ac2c 3196
def _extract_mix_playlist(self, playlist, playlist_id):
    """Yield the entries of a Mix playlist, fetching follow-up pages as needed.

    A page only exposes a window of the Mix, so after each page the watch
    page of the last seen video is downloaded to obtain the next window.

    Iteration stops when:
      * a page contains no videos,
      * a page contains nothing after the previously seen last video,
      * the very first video shows up again (the Mix has wrapped around), or
      * the follow-up page does not contain a playlist at all.

    @param playlist     playlist renderer dict of the first page
    @param playlist_id  ID of the Mix, used to build the follow-up URLs
    """
    first_id = last_id = None
    for page_num in itertools.count(1):
        videos = list(self._playlist_entries(playlist))
        if not videos:
            return
        # Resume right after the last video that was already yielded;
        # when it is not on this page, start from the top (index 0)
        start = next((i for i, v in enumerate(videos) if v['id'] == last_id), -1) + 1
        if start >= len(videos):
            return
        for video in videos[start:]:
            if video['id'] == first_id:
                self.to_screen('First video %s found again; Assuming end of Mix' % first_id)
                return
            yield video
        first_id = first_id or videos[0]['id']
        last_id = videos[-1]['id']

        _, data = self._extract_webpage(
            'https://www.youtube.com/watch?list=%s&v=%s' % (playlist_id, last_id),
            '%s page %d' % (playlist_id, page_num))
        playlist = try_get(
            data, lambda x: x['contents']['twoColumnWatchNextResults']['playlist']['playlist'], dict)
        if not playlist:
            # The follow-up page carries no playlist; there is nothing more
            # to iterate, so end cleanly instead of passing None along
            return
3219
def _extract_from_playlist(self, item_id, url, data, playlist):
    """Build the result for a watch page that embeds a playlist.

    When the playlist advertises its own URL and that URL differs from
    the one being extracted, the work is handed over to YoutubeTabIE.
    Otherwise the playlist is iterated as a Mix.
    """
    playlist_id = playlist.get('playlistId') or item_id
    title = playlist.get('title')
    if not title:
        title = try_get(data, lambda x: x['titleText']['simpleText'], compat_str)

    # Delegating everything except mix playlists to regular tab-based playlist URL
    endpoint_url = try_get(
        playlist,
        lambda x: x['endpoint']['commandMetadata']['webCommandMetadata']['url'],
        compat_str)
    playlist_url = urljoin(url, endpoint_url)

    if not playlist_url or playlist_url == url:
        return self.playlist_result(
            self._extract_mix_playlist(playlist, playlist_id),
            playlist_id=playlist_id, playlist_title=title)

    return self.url_result(
        playlist_url, ie=YoutubeTabIE.ie_key(), video_id=playlist_id,
        video_title=title)
c5e8d7af 3237
def _extract_alerts(self, data, expected=False):
    """Report the alerts contained in a YouTube response.

    Every alert whose type is not "error" (case-insensitive), as well as
    every error alert except the last one, is reported as a warning.
    The last error alert, if any, is raised.

    @param data      parsed response dict; alerts are read from data['alerts']
    @param expected  forwarded to ExtractorError for the raised alert
    @raises ExtractorError  when at least one error alert is present
    """

    def _real_extract_alerts():
        # Yields one (type, message) pair per alert that has any text
        for alert_dict in try_get(data, lambda x: x['alerts'], list) or []:
            if not isinstance(alert_dict, dict):
                continue
            for alert in alert_dict.values():
                if not isinstance(alert, dict):
                    continue
                alert_type = alert.get('type')
                if not alert_type:
                    continue
                message = try_get(alert, lambda x: x['text']['simpleText'], compat_str) or ''
                for run in try_get(alert, lambda x: x['text']['runs'], list) or []:
                    # A run without usable text contributes nothing
                    message += try_get(run, lambda x: x['text'], compat_str) or ''
                # Yield exactly once per alert; yielding the simpleText
                # early as well would report the same alert twice
                if message:
                    yield alert_type, message

    errors = []
    warnings = []
    for alert_type, alert_message in _real_extract_alerts():
        if alert_type.lower() == 'error':
            errors.append([alert_type, alert_message])
        else:
            warnings.append([alert_type, alert_message])

    # Only the last error is raised; everything else is shown as a warning
    for alert_type, alert_message in (warnings + errors[:-1]):
        self._downloader.report_warning('YouTube said: %s - %s' % (alert_type, alert_message))
    if errors:
        raise ExtractorError('YouTube said: %s' % errors[-1][1], expected=expected)
02ced43c 3268
def _extract_webpage(self, url, item_id):
    """Download a page together with its initial data, retrying if incomplete.

    The data counts as complete once it has a 'contents' or a
    'currentVideoEndpoint' entry. Otherwise the download is repeated, up
    to the number of retries given by the 'extractor_retries' downloader
    param (3 when unset). Alerts found in the data are processed on every
    attempt via _extract_alerts(..., expected=True).

    @returns  (webpage, data) of the last attempt
    """
    retries = self._downloader.params.get('extractor_retries', 3)
    count = -1
    last_error = 'Incomplete yt initial data received'
    while count < retries:
        count += 1
        # Sometimes youtube returns a webpage with incomplete ytInitialData
        # See: https://github.com/yt-dlp/yt-dlp/issues/116
        if count:
            self.report_warning('%s. Retrying ...' % last_error)
        webpage = self._download_webpage(
            url, item_id,
            'Downloading webpage%s' % (' (retry #%d)' % count if count else ''))
        data = self._extract_yt_initial_data(item_id, webpage)
        self._extract_alerts(data, expected=True)
        if data.get('contents') or data.get('currentVideoEndpoint'):
            break
        # Out of retries: report the failure through the downloader
        if count >= retries:
            self._downloader.report_error(last_error)
    return webpage, data
3289
def _real_extract(self, url):
    """Extract a tab, playlist or single video from a YouTube URL.

    The URL is first normalised to the www.youtube.com host. Bare
    channel/user URLs are redirected to their "videos" tab, and watch URLs
    lacking a video ID are redirected to the playlist they name. The
    downloaded page is then dispatched on what its data contains: tabs, an
    embedded playlist, or just a video.
    """
    item_id = self._match_id(url)
    parsed_url = compat_urlparse.urlparse(url)
    url = compat_urlparse.urlunparse(parsed_url._replace(netloc='www.youtube.com'))

    # This is not matched in a channel page with a tab selected
    tabless_match = re.match(r'(?P<pre>%s)(?P<post>/?(?![^#?]).*$)' % self._VALID_URL, url)
    groups = tabless_match.groupdict() if tabless_match else {}
    not_channel = groups.get('not_channel')
    if groups and not not_channel:
        self._downloader.report_warning(
            'A channel/user page was given. All the channel\'s videos will be downloaded. '
            'To download only the videos in the home page, add a "/featured" to the URL')
        url = '%s/videos%s' % (groups.get('pre'), groups.get('post') or '')

    # Handle both video/playlist URLs
    query = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query)
    video_id = query.get('v', [None])[0]
    playlist_id = query.get('list', [None])[0]

    if not video_id and (not_channel or '').startswith('watch'):
        if not playlist_id:
            # If there is neither video or playlist ids,
            # youtube redirects to home page, which is undesirable
            raise ExtractorError('Unable to recognize tab page')
        self._downloader.report_warning('A video URL was given without video ID. Trying to download playlist %s' % playlist_id)
        url = 'https://www.youtube.com/playlist?list=%s' % playlist_id

    if video_id and playlist_id:
        if self._downloader.params.get('noplaylist'):
            self.to_screen('Downloading just video %s because of --no-playlist' % video_id)
            return self.url_result(video_id, ie=YoutubeIE.ie_key(), video_id=video_id)
        self.to_screen('Downloading playlist %s; add --no-playlist to just download video %s' % (playlist_id, video_id))

    webpage, data = self._extract_webpage(url, item_id)

    tabs = try_get(
        data, lambda x: x['contents']['twoColumnBrowseResultsRenderer']['tabs'], list)
    if tabs:
        return self._extract_from_tabs(item_id, webpage, data, tabs)

    playlist = try_get(
        data, lambda x: x['contents']['twoColumnWatchNextResults']['playlist']['playlist'], dict)
    if playlist:
        return self._extract_from_playlist(item_id, url, data, playlist)

    # Prefer the video ID reported by the page over the one from the URL
    video_id = try_get(
        data, lambda x: x['currentVideoEndpoint']['watchEndpoint']['videoId'],
        compat_str) or video_id
    if not video_id:
        raise ExtractorError('Unable to recognize tab page')

    self._downloader.report_warning('Unable to recognize playlist. Downloading just video %s' % video_id)
    return self.url_result(video_id, ie=YoutubeIE.ie_key(), video_id=video_id)
c5e8d7af 3343
c5e8d7af 3344
class YoutubePlaylistIE(InfoExtractor):
    """Accept playlist IDs and playlist-carrying URLs and hand them to YoutubeTabIE.

    URLs that YoutubeTabIE can handle by itself are left to it (see suitable()).
    """
    IE_NAME = 'youtube:playlist'
    IE_DESC = 'YouTube.com playlists'
    _VALID_URL = r'''(?x)(?:
                        (?:https?://)?
                        (?:\w+\.)?
                        (?:
                            (?:
                                youtube(?:kids)?\.com|
                                invidio\.us
                            )
                            /.*?\?.*?\blist=
                        )?
                        (?P<id>%(playlist_id)s)
                     )''' % {'playlist_id': YoutubeBaseInfoExtractor._PLAYLIST_ID_RE}
    _TESTS = [{
        'note': 'issue #673',
        'url': 'PLBB231211A4F62143',
        'info_dict': {
            'title': '[OLD]Team Fortress 2 (Class-based LP)',
            'id': 'PLBB231211A4F62143',
            'uploader': 'Wickydoo',
            'uploader_id': 'UCKSpbfbl5kRQpTdL7kMc-1Q',
        },
        'playlist_mincount': 29,
    }, {
        'url': 'PLtPgu7CB4gbY9oDN3drwC3cMbJggS7dKl',
        'info_dict': {
            'title': 'YDL_safe_search',
            'id': 'PLtPgu7CB4gbY9oDN3drwC3cMbJggS7dKl',
        },
        'playlist_count': 2,
        'skip': 'This playlist is private',
    }, {
        'note': 'embedded',
        'url': 'https://www.youtube.com/embed/videoseries?list=PL6IaIsEjSbf96XFRuNccS_RuEXwNdsoEu',
        'playlist_count': 4,
        'info_dict': {
            'title': 'JODA15',
            'id': 'PL6IaIsEjSbf96XFRuNccS_RuEXwNdsoEu',
            'uploader': 'milan',
            'uploader_id': 'UCEI1-PVPcYXjB73Hfelbmaw',
        }
    }, {
        'url': 'http://www.youtube.com/embed/_xDOZElKyNU?list=PLsyOSbh5bs16vubvKePAQ1x3PhKavfBIl',
        'playlist_mincount': 982,
        'info_dict': {
            'title': '2018 Chinese New Singles (11/6 updated)',
            'id': 'PLsyOSbh5bs16vubvKePAQ1x3PhKavfBIl',
            'uploader': 'LBK',
            'uploader_id': 'UC21nz3_MesPLqtDqwdvnoxA',
        }
    }, {
        'url': 'TLGGrESM50VT6acwMjAyMjAxNw',
        'only_matching': True,
    }, {
        # music album playlist
        'url': 'OLAK5uy_m4xAFdmMC5rX3Ji3g93pQe3hqLZw_9LhM',
        'only_matching': True,
    }]

    @classmethod
    def suitable(cls, url):
        # YoutubeTabIE takes precedence for every URL it matches
        if YoutubeTabIE.suitable(url):
            return False
        return super(YoutubePlaylistIE, cls).suitable(url)

    def _real_extract(self, url):
        playlist_id = self._match_id(url)
        # Keep the original query string; a bare ID has none, so build one
        query = (
            compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query)
            or {'list': playlist_id})
        canonical_url = update_url_query('https://www.youtube.com/playlist', query)
        return self.url_result(
            canonical_url, ie=YoutubeTabIE.ie_key(), video_id=playlist_id)
3419
3420
class YoutubeYtBeIE(InfoExtractor):
    """Rewrite youtu.be short links that carry a playlist into a watch URL for YoutubeTabIE."""
    IE_DESC = 'youtu.be'
    _VALID_URL = r'https?://youtu\.be/(?P<id>[0-9A-Za-z_-]{11})/*?.*?\blist=(?P<playlist_id>%(playlist_id)s)' % {'playlist_id': YoutubeBaseInfoExtractor._PLAYLIST_ID_RE}
    _TESTS = [{
        'url': 'https://youtu.be/yeWKywCrFtk?list=PL2qgrgXsNUG5ig9cat4ohreBjYLAPC0J5',
        'info_dict': {
            'id': 'yeWKywCrFtk',
            'ext': 'mp4',
            'title': 'Small Scale Baler and Braiding Rugs',
            'uploader': 'Backus-Page House Museum',
            'uploader_id': 'backuspagemuseum',
            'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/backuspagemuseum',
            'upload_date': '20161008',
            'description': 'md5:800c0c78d5eb128500bffd4f0b4f2e8a',
            'categories': ['Nonprofits & Activism'],
            'tags': list,
            'like_count': int,
            'dislike_count': int,
        },
        'params': {
            'noplaylist': True,
            'skip_download': True,
        },
    }, {
        'url': 'https://youtu.be/uWyaPkt-VOI?list=PL9D9FC436B881BA21',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        video_id, playlist_id = re.match(
            self._VALID_URL, url).group('id', 'playlist_id')
        watch_url = update_url_query('https://www.youtube.com/watch', {
            'v': video_id,
            'list': playlist_id,
            'feature': 'youtu.be',
        })
        return self.url_result(
            watch_url, ie=YoutubeTabIE.ie_key(), video_id=playlist_id)
8bdd16b4 3459
3460
class YoutubeYtUserIE(InfoExtractor):
    """Expand the "ytuser:<name>" shorthand into the user's page URL for YoutubeTabIE."""
    IE_DESC = 'YouTube.com user videos, URL or "ytuser" keyword'
    _VALID_URL = r'ytuser:(?P<id>.+)'
    _TESTS = [{
        'url': 'ytuser:phihag',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        user_id = self._match_id(url)
        user_url = 'https://www.youtube.com/user/%s' % user_id
        return self.url_result(
            user_url, ie=YoutubeTabIE.ie_key(), video_id=user_id)
9558dcec 3474
b05654f0 3475
class YoutubeFavouritesIE(YoutubeBaseInfoExtractor):
    """Map the ":ytfav" family of keywords onto the "LL" playlist URL."""
    IE_NAME = 'youtube:favorites'
    IE_DESC = 'YouTube.com liked videos, ":ytfav" for short (requires authentication)'
    _LOGIN_REQUIRED = True
    _VALID_URL = r':ytfav(?:ou?rite)?s?'
    _TESTS = [
        {'url': ':ytfav', 'only_matching': True},
        {'url': ':ytfavorites', 'only_matching': True},
    ]

    def _real_extract(self, url):
        liked_videos_url = 'https://www.youtube.com/playlist?list=LL'
        return self.url_result(liked_videos_url, ie=YoutubeTabIE.ie_key())
3493
3494
class YoutubeSearchIE(SearchInfoExtractor, YoutubeBaseInfoExtractor):
    """Search extractor behind the "ytsearch" keyword.

    Subclasses may set _SEARCH_PARAMS to pass an extra 'params' value
    along with every search request.
    """
    IE_NAME = 'youtube:search'
    IE_DESC = 'YouTube.com searches, "ytsearch" keyword'
    # there doesn't appear to be a real limit, for example if you search for
    # 'python' you get more than 8.000.000 results
    _MAX_RESULTS = float('inf')
    _SEARCH_KEY = 'ytsearch'
    _SEARCH_PARAMS = None
    _TESTS = []

    def _entries(self, query, n):
        """Yield up to n search results for query, following continuations."""
        request = {'query': query}
        if self._SEARCH_PARAMS:
            request['params'] = self._SEARCH_PARAMS

        yielded = 0
        for page_num in itertools.count(1):
            response = self._call_api(
                ep='search', video_id='query "%s"' % query, fatal=False,
                note='Downloading page %s' % page_num, query=request)
            if not response:
                return

            # The first page and the continuation pages keep the section
            # list at different locations
            sections = try_get(
                response,
                (lambda x: x['contents']['twoColumnSearchResultsRenderer']['primaryContents']['sectionListRenderer']['contents'],
                 lambda x: x['onResponseReceivedCommands'][0]['appendContinuationItemsAction']['continuationItems']),
                list)
            if not sections:
                return

            # Youtube sometimes adds promoted content to searches,
            # changing the index location of videos and token.
            # So we search through all entries till we find them.
            next_token = None
            for section in sections:
                if next_token is None:
                    next_token = try_get(
                        section,
                        lambda x: x['continuationItemRenderer']['continuationEndpoint']['continuationCommand']['token'],
                        compat_str)

                items = try_get(
                    section, lambda x: x['itemSectionRenderer']['contents'], list)
                for item in items or []:
                    if not isinstance(item, dict):
                        continue
                    renderer = item.get('videoRenderer')
                    if not isinstance(renderer, dict) or not renderer.get('videoId'):
                        continue

                    yield self._extract_video(renderer)
                    yielded += 1
                    if yielded == n:
                        return

            if not next_token:
                return
            request['continuation'] = next_token

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""
        return self.playlist_result(self._entries(query, n), query)
75dff0ee 3563
c9ae7b95 3564
class YoutubeSearchDateIE(YoutubeSearchIE):
    """Variant of YoutubeSearchIE behind the "ytsearchdate" keyword."""
    IE_NAME = YoutubeSearchIE.IE_NAME + ':date'
    IE_DESC = 'YouTube.com searches, newest videos first, "ytsearchdate" keyword'
    _SEARCH_KEY = 'ytsearchdate'
    # Sent as the 'params' value of every search request
    _SEARCH_PARAMS = 'CAI%3D'
75dff0ee 3570
c9ae7b95 3571
class YoutubeSearchURLIE(YoutubeSearchIE):
    """Run a search described by a youtube.com/results URL.

    The search terms come from the 'search_query' (or 'q') parameter and
    the optional 'sp' parameter is used as the search params.
    """
    IE_NAME = YoutubeSearchIE.IE_NAME + '_url'
    IE_DESC = 'YouTube.com search URLs'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/results\?(.*?&)?(?:search_query|q)=(?:[^&]+)(?:[&]|$)'
    _TESTS = [{
        'url': 'https://www.youtube.com/results?baz=bar&search_query=youtube-dl+test+video&filters=video&lclk=video',
        'playlist_mincount': 5,
        'info_dict': {
            'title': 'youtube-dl test video',
        }
    }, {
        'url': 'https://www.youtube.com/results?q=test&sp=EgQIBBgB',
        'only_matching': True,
    }]

    @classmethod
    def _make_valid_url(cls):
        # Use the URL pattern above as is
        return cls._VALID_URL

    def _real_extract(self, url):
        url_params = compat_parse_qs(compat_urllib_parse_urlparse(url).query)
        search_terms = url_params.get('search_query') or url_params.get('q')
        query = search_terms[0]
        self._SEARCH_PARAMS = url_params.get('sp', ('',))[0]
        return self._get_n_results(query, self._MAX_RESULTS)
3462ffa8 3597
3598
class YoutubeFeedsInfoExtractor(YoutubeTabIE):
    """
    Common base of the feed extractors.

    A subclass only has to provide _FEED_NAME; it determines both the
    extractor name and the feed URL that is passed on to YoutubeTabIE.
    """
    _LOGIN_REQUIRED = True
    _TESTS = []

    @property
    def IE_NAME(self):
        feed_name = self._FEED_NAME
        return 'youtube:%s' % feed_name

    def _real_initialize(self):
        self._login()

    def _real_extract(self, url):
        feed_url = 'https://www.youtube.com/feed/%s' % self._FEED_NAME
        return self.url_result(feed_url, ie=YoutubeTabIE.ie_key())
25f14e9f
S
3618
3619
class YoutubeWatchLaterIE(InfoExtractor):
    """Map the ":ytwatchlater" keyword onto the "WL" playlist URL."""
    IE_NAME = 'youtube:watchlater'
    IE_DESC = 'Youtube watch later list, ":ytwatchlater" for short (requires authentication)'
    _VALID_URL = r':ytwatchlater'
    _TESTS = [
        {'url': ':ytwatchlater', 'only_matching': True},
    ]

    def _real_extract(self, url):
        watch_later_url = 'https://www.youtube.com/playlist?list=WL'
        return self.url_result(watch_later_url, ie=YoutubeTabIE.ie_key())
3462ffa8 3632
3633
class YoutubeRecommendedIE(YoutubeFeedsInfoExtractor):
    """Feed extractor for ":ytrec" / ":ytrecommended" and the bare youtube.com URL."""
    IE_DESC = 'YouTube.com recommended videos, ":ytrec" for short (requires authentication)'
    _FEED_NAME = 'recommended'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/?(?:[?#]|$)|:ytrec(?:ommended)?'
    _TESTS = [
        {'url': ':ytrec', 'only_matching': True},
        {'url': ':ytrecommended', 'only_matching': True},
        {'url': 'https://youtube.com', 'only_matching': True},
    ]
1ed5b5c9 3648
1ed5b5c9 3649
class YoutubeSubscriptionsIE(YoutubeFeedsInfoExtractor):
    """Feed extractor for the ":ytsubs" family of keywords."""
    IE_DESC = 'YouTube.com subscriptions feed, ":ytsubs" for short (requires authentication)'
    _FEED_NAME = 'subscriptions'
    _VALID_URL = r':ytsub(?:scription)?s?'
    _TESTS = [
        {'url': ':ytsubs', 'only_matching': True},
        {'url': ':ytsubscriptions', 'only_matching': True},
    ]
1ed5b5c9 3661
1ed5b5c9 3662
class YoutubeHistoryIE(YoutubeFeedsInfoExtractor):
    """Feed extractor for the ":ythis" / ":ythistory" keywords."""
    IE_DESC = 'Youtube watch history, ":ythis" for short (requires authentication)'
    _FEED_NAME = 'history'
    _VALID_URL = r':ythis(?:tory)?'
    _TESTS = [
        {'url': ':ythistory', 'only_matching': True},
    ]
1ed5b5c9
JMF
3671
3672
class YoutubeTruncatedURLIE(InfoExtractor):
    """Match watch/attribution URLs that end without a video ID and explain why.

    Extraction always fails with a hint about quoting the URL in the shell.
    """
    IE_NAME = 'youtube:truncated_url'
    IE_DESC = False  # Do not list
    _VALID_URL = r'''(?x)
        (?:https?://)?
        (?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie)?\.com/
        (?:watch\?(?:
            feature=[a-z_]+|
            annotation_id=annotation_[^&]+|
            x-yt-cl=[0-9]+|
            hl=[^&]*|
            t=[0-9]+
        )?
        |
            attribution_link\?a=[^&]+
        )
        $
    '''

    _TESTS = [
        {'url': 'https://www.youtube.com/watch?annotation_id=annotation_3951667041', 'only_matching': True},
        {'url': 'https://www.youtube.com/watch?', 'only_matching': True},
        {'url': 'https://www.youtube.com/watch?x-yt-cl=84503534', 'only_matching': True},
        {'url': 'https://www.youtube.com/watch?feature=foo', 'only_matching': True},
        {'url': 'https://www.youtube.com/watch?hl=en-GB', 'only_matching': True},
        {'url': 'https://www.youtube.com/watch?t=2372', 'only_matching': True},
    ]

    def _real_extract(self, url):
        hint = (
            'Did you forget to quote the URL? Remember that & is a meta '
            'character in most shells, so you want to put the URL in quotes, '
            'like  youtube-dl '
            '"https://www.youtube.com/watch?feature=foo&v=BaW_jenozKc" '
            ' or simply  youtube-dl BaW_jenozKc  .')
        raise ExtractorError(hint, expected=True)
772fd5cc
PH
3720
3721
class YoutubeTruncatedIDIE(InfoExtractor):
    """Match watch URLs whose video ID is 1 to 10 characters long and reject them."""
    IE_NAME = 'youtube:truncated_id'
    IE_DESC = False  # Do not list
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/watch\?v=(?P<id>[0-9A-Za-z_-]{1,10})$'

    _TESTS = [
        {'url': 'https://www.youtube.com/watch?v=N_708QY7Ob', 'only_matching': True},
    ]

    def _real_extract(self, url):
        video_id = self._match_id(url)
        message = 'Incomplete YouTube ID %s. URL %s looks truncated.' % (video_id, url)
        raise ExtractorError(message, expected=True)