]> jfr.im git - yt-dlp.git/blame - yt_dlp/extractor/youtube.py
[documentation] Improve `--parse-metadata` documentation
[yt-dlp.git] / yt_dlp / extractor / youtube.py
CommitLineData
c5e8d7af 1# coding: utf-8
c5e8d7af 2
78caa52a
PH
3from __future__ import unicode_literals
4
a5c56234 5import hashlib
0ca96d48 6import itertools
c5e8d7af 7import json
c4417ddb 8import os.path
d77ab8e2 9import random
c5e8d7af 10import re
8a784c74 11import time
e0df6211 12import traceback
c5e8d7af 13
b05654f0 14from .common import InfoExtractor, SearchInfoExtractor
4bb4a188 15from ..compat import (
edf3e38e 16 compat_chr,
29f7c58a 17 compat_HTTPError,
c5e8d7af 18 compat_parse_qs,
545cc85d 19 compat_str,
7fd002c0 20 compat_urllib_parse_unquote_plus,
15707c7e 21 compat_urllib_parse_urlencode,
7c80519c 22 compat_urllib_parse_urlparse,
7c61bd36 23 compat_urlparse,
4bb4a188 24)
545cc85d 25from ..jsinterp import JSInterpreter
4bb4a188 26from ..utils import (
c224251a 27 bool_or_none,
c5e8d7af 28 clean_html,
26fe8ffe 29 dict_get,
c5e8d7af 30 ExtractorError,
b60419c5 31 format_field,
2d30521a 32 float_or_none,
dd27fd17 33 int_or_none,
94278f72 34 mimetype2ext,
6310acf5 35 parse_codecs,
7c80519c 36 parse_duration,
dca3ff4a 37 qualities,
3995d37d 38 remove_start,
cf7e015f 39 smuggle_url,
dbdaaa23 40 str_or_none,
c93d53f5 41 str_to_int,
556dbe7f 42 try_get,
c5e8d7af
PH
43 unescapeHTML,
44 unified_strdate,
cf7e015f 45 unsmuggle_url,
8bdd16b4 46 update_url_query,
21c340b8 47 url_or_none,
6e6bc8da 48 urlencode_postdata,
8bdd16b4 49 urljoin,
c5e8d7af
PH
50)
51
5f6a1245 52
class YoutubeBaseInfoExtractor(InfoExtractor):
    """Provide base functions for Youtube extractors"""

    # Google account endpoints used by _login()
    _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
    _TWOFACTOR_URL = 'https://accounts.google.com/signin/challenge'

    _LOOKUP_URL = 'https://accounts.google.com/_/signin/sl/lookup'
    _CHALLENGE_URL = 'https://accounts.google.com/_/signin/sl/challenge'
    # {0} is filled with the "TL" token taken from the challenge response
    _TFA_URL = 'https://accounts.google.com/_/signin/challenge?hl=en&TL={0}'

    # Regex alternation of reserved path names
    _RESERVED_NAMES = (
        r'channel|c|user|playlist|watch|w|v|embed|e|watch_popup|'
        r'movies|results|shared|hashtag|trending|feed|feeds|'
        r'storefront|oops|index|account|reporthistory|t/terms|about|upload|signin|logout')

    _NETRC_MACHINE = 'youtube'
    # If True it will raise an error if no login info is provided
    _LOGIN_REQUIRED = False

    _PLAYLIST_ID_RE = r'(?:(?:PL|LL|EC|UU|FL|RD|UL|TL|PU|OLAK5uy_)[0-9A-Za-z-_]{10,}|RDMM|WL|LL|LM)'

    def _ids_to_results(self, ids):
        """Return a list with one 'Youtube' url_result entry per video id in `ids`."""
        return [
            self.url_result(vid_id, 'Youtube', video_id=vid_id)
            for vid_id in ids]

    def _login(self):
        """
        Attempt to log in to YouTube.
        True is returned if successful or skipped.
        False is returned if login failed.

        If _LOGIN_REQUIRED is set and no authentication was provided, an error is raised.
        """
        username, password = self._get_login_info()
        # No authentication to be performed
        if username is None:
            if self._LOGIN_REQUIRED and self._downloader.params.get('cookiefile') is None:
                raise ExtractorError('No login info available, needed for using %s.' % self.IE_NAME, expected=True)
            # if self._downloader.params.get('cookiefile'):  # TODO remove 'and False' later - too many people using outdated cookies and open issues, remind them.
            #     self.to_screen('[Cookies] Reminder - Make sure to always use up to date cookies!')
            return True

        login_page = self._download_webpage(
            self._LOGIN_URL, None,
            note='Downloading login page',
            errnote='unable to fetch login page', fatal=False)
        if login_page is False:
            # Return an explicit False, as the docstring promises
            return False

        login_form = self._hidden_inputs(login_page)

        def req(url, f_req, note, errnote):
            """POST the hidden login form fields plus `f_req` (JSON-encoded) to `url`.

            Returns the value of _download_json (non-fatal, so callers
            compare the result against False to detect a failed request).
            """
            data = login_form.copy()
            data.update({
                'pstMsg': 1,
                'checkConnection': 'youtube',
                'checkedDomains': 'youtube',
                'hl': 'en',
                'deviceinfo': '[null,null,null,[],null,"US",null,null,[],"GlifWebSignIn",null,[null,null,[]]]',
                'f.req': json.dumps(f_req),
                'flowName': 'GlifWebSignIn',
                'flowEntry': 'ServiceLogin',
                # TODO: reverse actual botguard identifier generation algo
                'bgRequest': '["identifier",""]',
            })
            return self._download_json(
                url, None, note=note, errnote=errnote,
                # Drop everything that precedes the first '[' of the response
                transform_source=lambda s: re.sub(r'^[^[]*', '', s),
                fatal=False,
                data=urlencode_postdata(data), headers={
                    'Content-Type': 'application/x-www-form-urlencoded;charset=utf-8',
                    'Google-Accounts-XSRF': 1,
                })

        def warn(message):
            self._downloader.report_warning(message)

        # The same ServiceLogin URL is embedded in both the lookup and the
        # challenge request payloads
        service_login_url = 'https://accounts.google.com/ServiceLogin?passive=true&continue=https%3A%2F%2Fwww.youtube.com%2Fsignin%3Fnext%3D%252F%26action_handle_signin%3Dtrue%26hl%3Den%26app%3Ddesktop%26feature%3Dsign_in_button&hl=en&service=youtube&uilel=3&requestPath=%2FServiceLogin&Page=PasswordSeparationSignIn'

        lookup_req = [
            username,
            None, [], None, 'US', None, None, 2, False, True,
            [
                None, None,
                [2, 1, None, 1,
                 service_login_url,
                 None, [], 4],
                1, [None, None, []], None, None, None, True
            ],
            username,
        ]

        lookup_results = req(
            self._LOOKUP_URL, lookup_req,
            'Looking up account info', 'Unable to look up account info')

        if lookup_results is False:
            return False

        user_hash = try_get(lookup_results, lambda x: x[0][2], compat_str)
        if not user_hash:
            warn('Unable to extract user hash')
            return False

        challenge_req = [
            user_hash,
            None, 1, None, [1, None, None, None, [password, None, True]],
            [
                None, None, [2, 1, None, 1, service_login_url, None, [], 4],
                1, [None, None, []], None, None, None, True
            ]]

        challenge_results = req(
            self._CHALLENGE_URL, challenge_req,
            'Logging in', 'Unable to log in')

        if challenge_results is False:
            # Return an explicit False, as the docstring promises
            return False

        login_res = try_get(challenge_results, lambda x: x[0][5], list)
        if login_res:
            login_msg = try_get(login_res, lambda x: x[5], compat_str)
            # The conditional must be parenthesised: '%' binds tighter than
            # 'if ... else', so without parentheses the prefix was dropped
            # for every message other than INCORRECT_ANSWER_ENTERED
            warn('Unable to login: %s' % (
                'Invalid password'
                if login_msg == 'INCORRECT_ANSWER_ENTERED' else login_msg))
            return False

        res = try_get(challenge_results, lambda x: x[0][-1], list)
        if not res:
            warn('Unable to extract result entry')
            return False

        login_challenge = try_get(res, lambda x: x[0][0], list)
        if login_challenge:
            challenge_str = try_get(login_challenge, lambda x: x[2], compat_str)
            if challenge_str == 'TWO_STEP_VERIFICATION':
                # SEND_SUCCESS - TFA code has been successfully sent to phone
                # QUOTA_EXCEEDED - reached the limit of TFA codes
                status = try_get(login_challenge, lambda x: x[5], compat_str)
                if status == 'QUOTA_EXCEEDED':
                    warn('Exceeded the limit of TFA codes, try later')
                    return False

                tl = try_get(challenge_results, lambda x: x[1][2], compat_str)
                if not tl:
                    warn('Unable to extract TL')
                    return False

                tfa_code = self._get_tfa_info('2-step verification code')

                if not tfa_code:
                    warn(
                        'Two-factor authentication required. Provide it either interactively or with --twofactor <code> '
                        '(Note that only TOTP (Google Authenticator App) codes work at this time.)')
                    return False

                tfa_code = remove_start(tfa_code, 'G-')

                tfa_req = [
                    user_hash, None, 2, None,
                    [
                        9, None, None, None, None, None, None, None,
                        [None, tfa_code, True, 2]
                    ]]

                tfa_results = req(
                    self._TFA_URL.format(tl), tfa_req,
                    'Submitting TFA code', 'Unable to submit TFA code')

                if tfa_results is False:
                    return False

                tfa_res = try_get(tfa_results, lambda x: x[0][5], list)
                if tfa_res:
                    tfa_msg = try_get(tfa_res, lambda x: x[5], compat_str)
                    # Parenthesised for the same precedence reason as above
                    warn('Unable to finish TFA: %s' % (
                        'Invalid TFA code'
                        if tfa_msg == 'INCORRECT_ANSWER_ENTERED' else tfa_msg))
                    return False

                check_cookie_url = try_get(
                    tfa_results, lambda x: x[0][-1][2], compat_str)
            else:
                CHALLENGES = {
                    'LOGIN_CHALLENGE': "This device isn't recognized. For your security, Google wants to make sure it's really you.",
                    'USERNAME_RECOVERY': 'Please provide additional information to aid in the recovery process.',
                    'REAUTH': "There is something unusual about your activity. For your security, Google wants to make sure it's really you.",
                }
                challenge = CHALLENGES.get(
                    challenge_str,
                    '%s returned error %s.' % (self.IE_NAME, challenge_str))
                warn('%s\nGo to https://accounts.google.com/, login and solve a challenge.' % challenge)
                return False
        else:
            check_cookie_url = try_get(res, lambda x: x[2], compat_str)

        if not check_cookie_url:
            warn('Unable to extract CheckCookie URL')
            return False

        check_cookie_results = self._download_webpage(
            check_cookie_url, None, 'Checking cookie', fatal=False)

        if check_cookie_results is False:
            return False

        if 'https://myaccount.google.com/' not in check_cookie_results:
            warn('Unable to log in')
            return False

        return True

    def _initialize_consent(self):
        """Set a 'YES' CONSENT cookie for .youtube.com unless one is not needed.

        Nothing is done when a '__Secure-3PSID' cookie exists or when the
        current CONSENT cookie already contains 'YES'. The numeric id of a
        'PENDING+<id>' CONSENT cookie is reused; otherwise a random
        three-digit id is generated.
        """
        cookies = self._get_cookies('https://www.youtube.com/')
        if cookies.get('__Secure-3PSID'):
            return
        consent_id = None
        consent = cookies.get('CONSENT')
        if consent:
            if 'YES' in consent.value:
                return
            consent_id = self._search_regex(
                r'PENDING\+(\d+)', consent.value, 'consent', default=None)
        if not consent_id:
            consent_id = random.randint(100, 999)
        self._set_cookie('.youtube.com', 'CONSENT', 'YES+cb.20210328-17-p0.en+FX+%s' % consent_id)

    def _real_initialize(self):
        """Initialize the consent cookie and, if a downloader is set, try to log in.

        The return value of _login() is deliberately ignored here: a failed
        login has already been reported through warnings.
        """
        self._initialize_consent()
        if self._downloader is None:
            return
        self._login()

    _YT_WEB_CLIENT_VERSION = '2.20210301.08.00'
    # Default request body for _call_api(); top-level keys may be replaced by the query
    _DEFAULT_API_DATA = {
        'context': {
            'client': {
                'clientName': 'WEB',
                'clientVersion': _YT_WEB_CLIENT_VERSION,
            }
        },
    }

    _DEFAULT_BASIC_API_HEADERS = {
        'X-YouTube-Client-Name': '1',
        'X-YouTube-Client-Version': _YT_WEB_CLIENT_VERSION
    }

    _YT_INITIAL_DATA_RE = r'(?:window\s*\[\s*["\']ytInitialData["\']\s*\]|ytInitialData)\s*=\s*({.+?})\s*;'
    _YT_INITIAL_PLAYER_RESPONSE_RE = r'ytInitialPlayerResponse\s*=\s*({.+?})\s*;'
    _YT_INITIAL_BOUNDARY_RE = r'(?:var\s+meta|</script|\n)'

    def _generate_sapisidhash_header(self):
        """Build the 'SAPISIDHASH <time>_<sha1>' authorization value.

        The SHA-1 is computed over '<time> <SAPISID cookie value> https://www.youtube.com'.
        Returns None when there is no SAPISID cookie for https://www.youtube.com.
        """
        sapisid_cookie = self._get_cookies('https://www.youtube.com').get('SAPISID')
        if sapisid_cookie is None:
            return
        time_now = round(time.time())
        sapisidhash = hashlib.sha1((str(time_now) + " " + sapisid_cookie.value + " " + "https://www.youtube.com").encode("utf-8")).hexdigest()
        return "SAPISIDHASH %s_%s" % (time_now, sapisidhash)

    def _call_api(self, ep, query, video_id, fatal=True, headers=None,
                  note='Downloading API JSON', errnote='Unable to download API page'):
        """POST a JSON request to the youtubei/v1 endpoint `ep` and return the parsed JSON.

        @param ep        endpoint name appended to https://www.youtube.com/youtubei/v1/
        @param query     dict merged over _DEFAULT_API_DATA to form the request body
        @param video_id  id passed to _download_json
        @param fatal     passed to _download_json
        @param headers   optional extra HTTP headers; the passed dict is not modified
        @param note      passed to _download_json
        @param errnote   passed to _download_json
        """
        # Shallow copy: keys from `query` replace top-level keys only, so the
        # class-level default is never written to
        data = self._DEFAULT_API_DATA.copy()
        data.update(query)
        # Work on a copy so the caller's dict is not mutated
        headers = dict(headers or {})
        headers.update({'content-type': 'application/json'})
        auth = self._generate_sapisidhash_header()
        if auth is not None:
            headers.update({'Authorization': auth, 'X-Origin': 'https://www.youtube.com'})
        return self._download_json(
            'https://www.youtube.com/youtubei/v1/%s' % ep,
            video_id=video_id, fatal=fatal, note=note, errnote=errnote,
            data=json.dumps(data).encode('utf8'), headers=headers,
            query={'key': 'AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8'})

    def _extract_yt_initial_data(self, video_id, webpage):
        """Find the ytInitialData object in `webpage` and return it parsed as JSON.

        Two patterns are given to _search_regex: the data regex followed by
        a boundary, then the bare data regex.
        """
        return self._parse_json(
            self._search_regex(
                (r'%s\s*%s' % (self._YT_INITIAL_DATA_RE, self._YT_INITIAL_BOUNDARY_RE),
                 self._YT_INITIAL_DATA_RE), webpage, 'yt initial data'),
            video_id)

    def _extract_identity_token(self, webpage, item_id):
        """Return the ID_TOKEN string from the page's ytcfg, or None.

        Falls back to a regex search over the raw `webpage` when the parsed
        ytcfg does not provide a non-empty string token.
        """
        ytcfg = self._extract_ytcfg(item_id, webpage)
        if ytcfg:
            token = try_get(ytcfg, lambda x: x['ID_TOKEN'], compat_str)
            if token:
                return token
        return self._search_regex(
            r'\bID_TOKEN["\']\s*:\s*["\'](.+?)["\']', webpage,
            'identity token', default=None)

    @staticmethod
    def _extract_account_syncid(data):
        """Extract syncId required to download private playlists of secondary channels"""
        sync_ids = (
            try_get(data, lambda x: x['responseContext']['mainAppWebResponseContext']['datasyncId'], compat_str)
            or '').split("||")
        if len(sync_ids) >= 2 and sync_ids[1]:
            # datasyncid is of the form "channel_syncid||user_syncid" for secondary channel
            # and just "user_syncid||" for primary channel. We only want the channel_syncid
            return sync_ids[0]

    def _extract_ytcfg(self, video_id, webpage):
        """Return the JSON object passed to the first `ytcfg.set({...});` call in `webpage`.

        An empty JSON object ('{}') is parsed when the call is not found;
        parsing is non-fatal.
        """
        return self._parse_json(
            self._search_regex(
                r'ytcfg\.set\s*\(\s*({.+?})\s*\)\s*;', webpage, 'ytcfg',
                default='{}'), video_id, fatal=False)

    def _extract_video(self, renderer):
        """Build a 'url_transparent' result dict for YoutubeIE from a video renderer dict.

        Fields that cannot be extracted from `renderer` are left as whatever
        the helper used for them returns (no error is raised here).
        """
        video_id = renderer.get('videoId')
        title = try_get(
            renderer,
            (lambda x: x['title']['runs'][0]['text'],
             lambda x: x['title']['simpleText']), compat_str)
        description = try_get(
            renderer, lambda x: x['descriptionSnippet']['runs'][0]['text'],
            compat_str)
        duration = parse_duration(try_get(
            renderer, lambda x: x['lengthText']['simpleText'], compat_str))
        view_count_text = try_get(
            renderer, lambda x: x['viewCountText']['simpleText'], compat_str) or ''
        # Whitespace is removed first; the count is the leading run of digits and commas
        view_count = str_to_int(self._search_regex(
            r'^([\d,]+)', re.sub(r'\s', '', view_count_text),
            'view count', default=None))
        uploader = try_get(
            renderer,
            (lambda x: x['ownerText']['runs'][0]['text'],
             lambda x: x['shortBylineText']['runs'][0]['text']), compat_str)
        return {
            '_type': 'url_transparent',
            'ie_key': YoutubeIE.ie_key(),
            'id': video_id,
            'url': video_id,
            'title': title,
            'description': description,
            'duration': duration,
            'view_count': view_count,
            'uploader': uploader,
        }
392
0c148415 393
360e1ca5 394class YoutubeIE(YoutubeBaseInfoExtractor):
78caa52a 395 IE_DESC = 'YouTube.com'
bc2ca1bb 396 _INVIDIOUS_SITES = (
397 # invidious-redirect websites
398 r'(?:www\.)?redirect\.invidious\.io',
399 r'(?:(?:www|dev)\.)?invidio\.us',
400 # Invidious instances taken from https://github.com/iv-org/documentation/blob/master/Invidious-Instances.md
401 r'(?:www\.)?invidious\.pussthecat\.org',
402 r'(?:www\.)?invidious\.048596\.xyz',
403 r'(?:www\.)?invidious\.zee\.li',
404 r'(?:www\.)?vid\.puffyan\.us',
405 r'(?:(?:www|au)\.)?ytprivate\.com',
406 r'(?:www\.)?invidious\.namazso\.eu',
407 r'(?:www\.)?invidious\.ethibox\.fr',
408 r'(?:www\.)?inv\.skyn3t\.in',
409 r'(?:www\.)?invidious\.himiko\.cloud',
410 r'(?:www\.)?w6ijuptxiku4xpnnaetxvnkc5vqcdu7mgns2u77qefoixi63vbvnpnqd\.onion',
411 r'(?:www\.)?kbjggqkzv65ivcqj6bumvp337z6264huv5kpkwuv6gu5yjiskvan7fad\.onion',
412 r'(?:www\.)?invidious\.3o7z6yfxhbw7n3za4rss6l434kmv55cgw2vuziwuigpwegswvwzqipyd\.onion',
413 r'(?:www\.)?grwp24hodrefzvjjuccrkw3mjq4tzhaaq32amf33dzpmuxe7ilepcmad\.onion',
414 # youtube-dl invidious instances list
415 r'(?:(?:www|no)\.)?invidiou\.sh',
416 r'(?:(?:www|fi)\.)?invidious\.snopyta\.org',
417 r'(?:www\.)?invidious\.kabi\.tk',
418 r'(?:www\.)?invidious\.13ad\.de',
419 r'(?:www\.)?invidious\.mastodon\.host',
420 r'(?:www\.)?invidious\.zapashcanon\.fr',
421 r'(?:www\.)?invidious\.kavin\.rocks',
422 r'(?:www\.)?invidious\.tube',
423 r'(?:www\.)?invidiou\.site',
424 r'(?:www\.)?invidious\.site',
425 r'(?:www\.)?invidious\.xyz',
426 r'(?:www\.)?invidious\.nixnet\.xyz',
427 r'(?:www\.)?invidious\.drycat\.fr',
428 r'(?:www\.)?tube\.poal\.co',
429 r'(?:www\.)?tube\.connect\.cafe',
430 r'(?:www\.)?vid\.wxzm\.sx',
431 r'(?:www\.)?vid\.mint\.lgbt',
432 r'(?:www\.)?yewtu\.be',
433 r'(?:www\.)?yt\.elukerio\.org',
434 r'(?:www\.)?yt\.lelux\.fi',
435 r'(?:www\.)?invidious\.ggc-project\.de',
436 r'(?:www\.)?yt\.maisputain\.ovh',
437 r'(?:www\.)?invidious\.toot\.koeln',
438 r'(?:www\.)?invidious\.fdn\.fr',
439 r'(?:www\.)?watch\.nettohikari\.com',
440 r'(?:www\.)?kgg2m7yk5aybusll\.onion',
441 r'(?:www\.)?qklhadlycap4cnod\.onion',
442 r'(?:www\.)?axqzx4s6s54s32yentfqojs3x5i7faxza6xo3ehd4bzzsg2ii4fv2iid\.onion',
443 r'(?:www\.)?c7hqkpkpemu6e7emz5b4vyz7idjgdvgaaa3dyimmeojqbgpea3xqjoid\.onion',
444 r'(?:www\.)?fz253lmuao3strwbfbmx46yu7acac2jz27iwtorgmbqlkurlclmancad\.onion',
445 r'(?:www\.)?invidious\.l4qlywnpwqsluw65ts7md3khrivpirse744un3x7mlskqauz5pyuzgqd\.onion',
446 r'(?:www\.)?owxfohz4kjyv25fvlqilyxast7inivgiktls3th44jhk3ej3i7ya\.b32\.i2p',
447 r'(?:www\.)?4l2dgddgsrkf2ous66i6seeyi6etzfgrue332grh2n7madpwopotugyd\.onion',
448 )
cb7dfeea 449 _VALID_URL = r"""(?x)^
c5e8d7af 450 (
edb53e2d 451 (?:https?://|//) # http(s):// or protocol-independent URL
bc2ca1bb 452 (?:(?:(?:(?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie|kids)?\.com|
453 (?:www\.)?deturl\.com/www\.youtube\.com|
454 (?:www\.)?pwnyoutube\.com|
455 (?:www\.)?hooktube\.com|
456 (?:www\.)?yourepeat\.com|
457 tube\.majestyc\.net|
458 %(invidious)s|
459 youtube\.googleapis\.com)/ # the various hostnames, with wildcard subdomains
c5e8d7af
PH
460 (?:.*?\#/)? # handle anchor (#/) redirect urls
461 (?: # the various things that can precede the ID:
ac7553d0 462 (?:(?:v|embed|e)/(?!videoseries)) # v/ or embed/ or e/
c5e8d7af 463 |(?: # or the v= param in all its forms
f7000f3a 464 (?:(?:watch|movie)(?:_popup)?(?:\.php)?/?)? # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
c5e8d7af 465 (?:\?|\#!?) # the params delimiter ? or # or #!
040ac686 466 (?:.*?[&;])?? # any other preceding param (like /?s=tuff&v=xxxx or ?s=tuff&amp;v=V36LpHqtcDY)
c5e8d7af
PH
467 v=
468 )
f4b05232 469 ))
cbaed4bb
S
470 |(?:
471 youtu\.be| # just youtu.be/xxxx
6d4fc66b
S
472 vid\.plus| # or vid.plus/xxxx
473 zwearz\.com/watch| # or zwearz.com/watch/xxxx
bc2ca1bb 474 %(invidious)s
cbaed4bb 475 )/
edb53e2d 476 |(?:www\.)?cleanvideosearch\.com/media/action/yt/watch\?videoId=
f4b05232 477 )
c5e8d7af 478 )? # all until now is optional -> you can pass the naked ID
8bdd16b4 479 (?P<id>[0-9A-Za-z_-]{11}) # here is it! the YouTube video ID
d0ba5587
S
480 (?!.*?\blist=
481 (?:
482 %(playlist_id)s| # combined list/video URLs are handled by the playlist IE
483 WL # WL are handled by the watch later IE
484 )
485 )
c5e8d7af 486 (?(1).+)? # if we found the ID, everything can follow
bc2ca1bb 487 $""" % {
488 'playlist_id': YoutubeBaseInfoExtractor._PLAYLIST_ID_RE,
489 'invidious': '|'.join(_INVIDIOUS_SITES),
490 }
e40c758c 491 _PLAYER_INFO_RE = (
cc2db878 492 r'/s/player/(?P<id>[a-zA-Z0-9_-]{8,})/player',
493 r'/(?P<id>[a-zA-Z0-9_-]{8,})/player(?:_ias\.vflset(?:/[a-zA-Z]{2,3}_[a-zA-Z]{2,3})?|-plasma-ias-(?:phone|tablet)-[a-z]{2}_[A-Z]{2}\.vflset)/base\.js$',
545cc85d 494 r'\b(?P<id>vfl[a-zA-Z0-9_-]+)\b.*?\.js$',
e40c758c 495 )
2c62dc26 496 _formats = {
c2d3cb4c 497 '5': {'ext': 'flv', 'width': 400, 'height': 240, 'acodec': 'mp3', 'abr': 64, 'vcodec': 'h263'},
498 '6': {'ext': 'flv', 'width': 450, 'height': 270, 'acodec': 'mp3', 'abr': 64, 'vcodec': 'h263'},
499 '13': {'ext': '3gp', 'acodec': 'aac', 'vcodec': 'mp4v'},
500 '17': {'ext': '3gp', 'width': 176, 'height': 144, 'acodec': 'aac', 'abr': 24, 'vcodec': 'mp4v'},
501 '18': {'ext': 'mp4', 'width': 640, 'height': 360, 'acodec': 'aac', 'abr': 96, 'vcodec': 'h264'},
502 '22': {'ext': 'mp4', 'width': 1280, 'height': 720, 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264'},
503 '34': {'ext': 'flv', 'width': 640, 'height': 360, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'},
504 '35': {'ext': 'flv', 'width': 854, 'height': 480, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'},
3834d3e3 505 # itag 36 videos are either 320x180 (BaW_jenozKc) or 320x240 (__2ABJjxzNo), abr varies as well
c2d3cb4c 506 '36': {'ext': '3gp', 'width': 320, 'acodec': 'aac', 'vcodec': 'mp4v'},
507 '37': {'ext': 'mp4', 'width': 1920, 'height': 1080, 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264'},
508 '38': {'ext': 'mp4', 'width': 4096, 'height': 3072, 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264'},
509 '43': {'ext': 'webm', 'width': 640, 'height': 360, 'acodec': 'vorbis', 'abr': 128, 'vcodec': 'vp8'},
510 '44': {'ext': 'webm', 'width': 854, 'height': 480, 'acodec': 'vorbis', 'abr': 128, 'vcodec': 'vp8'},
511 '45': {'ext': 'webm', 'width': 1280, 'height': 720, 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8'},
e1a0bfdf 512 '46': {'ext': 'webm', 'width': 1920, 'height': 1080, 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8'},
c2d3cb4c 513 '59': {'ext': 'mp4', 'width': 854, 'height': 480, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'},
514 '78': {'ext': 'mp4', 'width': 854, 'height': 480, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'},
e1a0bfdf 515
516
517 # 3D videos
c2d3cb4c 518 '82': {'ext': 'mp4', 'height': 360, 'format_note': '3D', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -20},
519 '83': {'ext': 'mp4', 'height': 480, 'format_note': '3D', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -20},
520 '84': {'ext': 'mp4', 'height': 720, 'format_note': '3D', 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264', 'preference': -20},
521 '85': {'ext': 'mp4', 'height': 1080, 'format_note': '3D', 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264', 'preference': -20},
e1a0bfdf 522 '100': {'ext': 'webm', 'height': 360, 'format_note': '3D', 'acodec': 'vorbis', 'abr': 128, 'vcodec': 'vp8', 'preference': -20},
523 '101': {'ext': 'webm', 'height': 480, 'format_note': '3D', 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8', 'preference': -20},
524 '102': {'ext': 'webm', 'height': 720, 'format_note': '3D', 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8', 'preference': -20},
836a086c 525
96fb5605 526 # Apple HTTP Live Streaming
11f12195 527 '91': {'ext': 'mp4', 'height': 144, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 48, 'vcodec': 'h264', 'preference': -10},
c2d3cb4c 528 '92': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 48, 'vcodec': 'h264', 'preference': -10},
529 '93': {'ext': 'mp4', 'height': 360, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -10},
530 '94': {'ext': 'mp4', 'height': 480, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -10},
531 '95': {'ext': 'mp4', 'height': 720, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 256, 'vcodec': 'h264', 'preference': -10},
532 '96': {'ext': 'mp4', 'height': 1080, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 256, 'vcodec': 'h264', 'preference': -10},
e1a0bfdf 533 '132': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 48, 'vcodec': 'h264', 'preference': -10},
534 '151': {'ext': 'mp4', 'height': 72, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 24, 'vcodec': 'h264', 'preference': -10},
2c62dc26
PH
535
536 # DASH mp4 video
d23028a8
S
537 '133': {'ext': 'mp4', 'height': 240, 'format_note': 'DASH video', 'vcodec': 'h264'},
538 '134': {'ext': 'mp4', 'height': 360, 'format_note': 'DASH video', 'vcodec': 'h264'},
539 '135': {'ext': 'mp4', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'h264'},
540 '136': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'h264'},
541 '137': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'h264'},
067aa17e 542 '138': {'ext': 'mp4', 'format_note': 'DASH video', 'vcodec': 'h264'}, # Height can vary (https://github.com/ytdl-org/youtube-dl/issues/4559)
d23028a8
S
543 '160': {'ext': 'mp4', 'height': 144, 'format_note': 'DASH video', 'vcodec': 'h264'},
544 '212': {'ext': 'mp4', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'h264'},
545 '264': {'ext': 'mp4', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'h264'},
546 '298': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'h264', 'fps': 60},
547 '299': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'h264', 'fps': 60},
548 '266': {'ext': 'mp4', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'h264'},
836a086c 549
f6f1fc92 550 # Dash mp4 audio
d23028a8
S
551 '139': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 48, 'container': 'm4a_dash'},
552 '140': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 128, 'container': 'm4a_dash'},
553 '141': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 256, 'container': 'm4a_dash'},
554 '256': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'container': 'm4a_dash'},
555 '258': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'container': 'm4a_dash'},
556 '325': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'dtse', 'container': 'm4a_dash'},
557 '328': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'ec-3', 'container': 'm4a_dash'},
836a086c
AZ
558
559 # Dash webm
d23028a8
S
560 '167': {'ext': 'webm', 'height': 360, 'width': 640, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
561 '168': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
562 '169': {'ext': 'webm', 'height': 720, 'width': 1280, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
563 '170': {'ext': 'webm', 'height': 1080, 'width': 1920, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
564 '218': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
565 '219': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
566 '278': {'ext': 'webm', 'height': 144, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp9'},
567 '242': {'ext': 'webm', 'height': 240, 'format_note': 'DASH video', 'vcodec': 'vp9'},
568 '243': {'ext': 'webm', 'height': 360, 'format_note': 'DASH video', 'vcodec': 'vp9'},
569 '244': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'vp9'},
570 '245': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'vp9'},
571 '246': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'vp9'},
572 '247': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'vp9'},
573 '248': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'vp9'},
574 '271': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'vp9'},
4c6b4764 575 # itag 272 videos are either 3840x2160 (e.g. RtoitU2A-3E) or 7680x4320 (sLprVF6d7Ug)
d23028a8
S
576 '272': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'vp9'},
577 '302': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60},
578 '303': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60},
579 '308': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60},
580 '313': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'vp9'},
581 '315': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60},
2c62dc26
PH
582
583 # Dash webm audio
d23028a8
S
584 '171': {'ext': 'webm', 'acodec': 'vorbis', 'format_note': 'DASH audio', 'abr': 128},
585 '172': {'ext': 'webm', 'acodec': 'vorbis', 'format_note': 'DASH audio', 'abr': 256},
ce6b9a2d 586
0857baad 587 # Dash webm audio with opus inside
d23028a8
S
588 '249': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 50},
589 '250': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 70},
590 '251': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 160},
0857baad 591
ce6b9a2d
PH
592 # RTMP (unnamed)
593 '_rtmp': {'protocol': 'rtmp'},
b85eae0f
S
594
595 # av01 video only formats sometimes served with "unknown" codecs
596 '394': {'acodec': 'none', 'vcodec': 'av01.0.05M.08'},
597 '395': {'acodec': 'none', 'vcodec': 'av01.0.05M.08'},
598 '396': {'acodec': 'none', 'vcodec': 'av01.0.05M.08'},
599 '397': {'acodec': 'none', 'vcodec': 'av01.0.05M.08'},
c5e8d7af 600 }
29f7c58a 601 _SUBTITLE_FORMATS = ('json3', 'srv1', 'srv2', 'srv3', 'ttml', 'vtt')
836a086c 602
fd5c4aab
S
603 _GEO_BYPASS = False
604
78caa52a 605 IE_NAME = 'youtube'
2eb88d95
PH
606 _TESTS = [
607 {
2d3d2997 608 'url': 'https://www.youtube.com/watch?v=BaW_jenozKc&t=1s&end=9',
4bc3a23e
PH
609 'info_dict': {
610 'id': 'BaW_jenozKc',
611 'ext': 'mp4',
3867038a 612 'title': 'youtube-dl test video "\'/\\ä↭𝕐',
4bc3a23e
PH
613 'uploader': 'Philipp Hagemeister',
614 'uploader_id': 'phihag',
ec85ded8 615 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/phihag',
dd4c4492
S
616 'channel_id': 'UCLqxVugv74EIW3VWh2NOa3Q',
617 'channel_url': r're:https?://(?:www\.)?youtube\.com/channel/UCLqxVugv74EIW3VWh2NOa3Q',
4bc3a23e 618 'upload_date': '20121002',
3867038a 619 'description': 'test chars: "\'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de .',
4bc3a23e 620 'categories': ['Science & Technology'],
3867038a 621 'tags': ['youtube-dl'],
556dbe7f 622 'duration': 10,
dbdaaa23 623 'view_count': int,
3e7c1224
PH
624 'like_count': int,
625 'dislike_count': int,
7c80519c 626 'start_time': 1,
297a564b 627 'end_time': 9,
2eb88d95 628 }
0e853ca4 629 },
fccd3771 630 {
4bc3a23e
PH
631 'url': '//www.YouTube.com/watch?v=yZIXLfi8CZQ',
632 'note': 'Embed-only video (#1746)',
633 'info_dict': {
634 'id': 'yZIXLfi8CZQ',
635 'ext': 'mp4',
636 'upload_date': '20120608',
637 'title': 'Principal Sexually Assaults A Teacher - Episode 117 - 8th June 2012',
638 'description': 'md5:09b78bd971f1e3e289601dfba15ca4f7',
639 'uploader': 'SET India',
94bfcd23 640 'uploader_id': 'setindia',
ec85ded8 641 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/setindia',
94bfcd23 642 'age_limit': 18,
545cc85d 643 },
644 'skip': 'Private video',
fccd3771 645 },
11b56058 646 {
8bdd16b4 647 'url': 'https://www.youtube.com/watch?v=BaW_jenozKc&v=yZIXLfi8CZQ',
11b56058
PM
648 'note': 'Use the first video ID in the URL',
649 'info_dict': {
650 'id': 'BaW_jenozKc',
651 'ext': 'mp4',
3867038a 652 'title': 'youtube-dl test video "\'/\\ä↭𝕐',
11b56058
PM
653 'uploader': 'Philipp Hagemeister',
654 'uploader_id': 'phihag',
ec85ded8 655 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/phihag',
11b56058 656 'upload_date': '20121002',
3867038a 657 'description': 'test chars: "\'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de .',
11b56058 658 'categories': ['Science & Technology'],
3867038a 659 'tags': ['youtube-dl'],
556dbe7f 660 'duration': 10,
dbdaaa23 661 'view_count': int,
11b56058
PM
662 'like_count': int,
663 'dislike_count': int,
34a7de29
S
664 },
665 'params': {
666 'skip_download': True,
667 },
11b56058 668 },
dd27fd17 669 {
2d3d2997 670 'url': 'https://www.youtube.com/watch?v=a9LDPn-MO4I',
4bc3a23e
PH
671 'note': '256k DASH audio (format 141) via DASH manifest',
672 'info_dict': {
673 'id': 'a9LDPn-MO4I',
674 'ext': 'm4a',
675 'upload_date': '20121002',
676 'uploader_id': '8KVIDEO',
ec85ded8 677 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/8KVIDEO',
4bc3a23e
PH
678 'description': '',
679 'uploader': '8KVIDEO',
680 'title': 'UHDTV TEST 8K VIDEO.mp4'
4919603f 681 },
4bc3a23e
PH
682 'params': {
683 'youtube_include_dash_manifest': True,
684 'format': '141',
4919603f 685 },
de3c7fe0 686 'skip': 'format 141 not served anymore',
dd27fd17 687 },
8bdd16b4 688 # DASH manifest with encrypted signature
689 {
690 'url': 'https://www.youtube.com/watch?v=IB3lcPjvWLA',
691 'info_dict': {
692 'id': 'IB3lcPjvWLA',
693 'ext': 'm4a',
694 'title': 'Afrojack, Spree Wilson - The Spark (Official Music Video) ft. Spree Wilson',
695 'description': 'md5:8f5e2b82460520b619ccac1f509d43bf',
696 'duration': 244,
697 'uploader': 'AfrojackVEVO',
698 'uploader_id': 'AfrojackVEVO',
699 'upload_date': '20131011',
cc2db878 700 'abr': 129.495,
8bdd16b4 701 },
702 'params': {
703 'youtube_include_dash_manifest': True,
704 'format': '141/bestaudio[ext=m4a]',
705 },
706 },
aa79ac0c
PH
707 # Controversy video
708 {
709 'url': 'https://www.youtube.com/watch?v=T4XJQO3qol8',
710 'info_dict': {
711 'id': 'T4XJQO3qol8',
712 'ext': 'mp4',
556dbe7f 713 'duration': 219,
aa79ac0c 714 'upload_date': '20100909',
4fe54c12 715 'uploader': 'Amazing Atheist',
aa79ac0c 716 'uploader_id': 'TheAmazingAtheist',
ec85ded8 717 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/TheAmazingAtheist',
aa79ac0c 718 'title': 'Burning Everyone\'s Koran',
545cc85d 719 'description': 'SUBSCRIBE: http://www.youtube.com/saturninefilms \r\n\r\nEven Obama has taken a stand against freedom on this issue: http://www.huffingtonpost.com/2010/09/09/obama-gma-interview-quran_n_710282.html',
aa79ac0c 720 }
c522adb1 721 },
dd2d55f1 722 # Normal age-gate video (embed allowed)
c522adb1 723 {
2d3d2997 724 'url': 'https://youtube.com/watch?v=HtVdAasjOgU',
c522adb1
JMF
725 'info_dict': {
726 'id': 'HtVdAasjOgU',
727 'ext': 'mp4',
728 'title': 'The Witcher 3: Wild Hunt - The Sword Of Destiny Trailer',
ec85ded8 729 'description': r're:(?s).{100,}About the Game\n.*?The Witcher 3: Wild Hunt.{100,}',
556dbe7f 730 'duration': 142,
c522adb1
JMF
731 'uploader': 'The Witcher',
732 'uploader_id': 'WitcherGame',
ec85ded8 733 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/WitcherGame',
c522adb1 734 'upload_date': '20140605',
34952f09 735 'age_limit': 18,
c522adb1
JMF
736 },
737 },
8bdd16b4 738 # video_info is None (https://github.com/ytdl-org/youtube-dl/issues/4421)
739 # YouTube Red ad is not captured for creator
740 {
741 'url': '__2ABJjxzNo',
742 'info_dict': {
743 'id': '__2ABJjxzNo',
744 'ext': 'mp4',
745 'duration': 266,
746 'upload_date': '20100430',
747 'uploader_id': 'deadmau5',
748 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/deadmau5',
545cc85d 749 'creator': 'deadmau5',
750 'description': 'md5:6cbcd3a92ce1bc676fc4d6ab4ace2336',
8bdd16b4 751 'uploader': 'deadmau5',
752 'title': 'Deadmau5 - Some Chords (HD)',
545cc85d 753 'alt_title': 'Some Chords',
8bdd16b4 754 },
755 'expected_warnings': [
756 'DASH manifest missing',
757 ]
758 },
067aa17e 759 # Olympics (https://github.com/ytdl-org/youtube-dl/issues/4431)
e52a40ab
PH
760 {
761 'url': 'lqQg6PlCWgI',
762 'info_dict': {
763 'id': 'lqQg6PlCWgI',
764 'ext': 'mp4',
556dbe7f 765 'duration': 6085,
90227264 766 'upload_date': '20150827',
cbe2bd91 767 'uploader_id': 'olympic',
ec85ded8 768 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/olympic',
cbe2bd91 769 'description': 'HO09 - Women - GER-AUS - Hockey - 31 July 2012 - London 2012 Olympic Games',
be49068d 770 'uploader': 'Olympic',
cbe2bd91
PH
771 'title': 'Hockey - Women - GER-AUS - London 2012 Olympic Games',
772 },
773 'params': {
774 'skip_download': 'requires avconv',
e52a40ab 775 }
cbe2bd91 776 },
6271f1ca
PH
777 # Non-square pixels
778 {
779 'url': 'https://www.youtube.com/watch?v=_b-2C3KPAM0',
780 'info_dict': {
781 'id': '_b-2C3KPAM0',
782 'ext': 'mp4',
783 'stretched_ratio': 16 / 9.,
556dbe7f 784 'duration': 85,
6271f1ca
PH
785 'upload_date': '20110310',
786 'uploader_id': 'AllenMeow',
ec85ded8 787 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/AllenMeow',
6271f1ca 788 'description': 'made by Wacom from Korea | 字幕&加油添醋 by TY\'s Allen | 感謝heylisa00cavey1001同學熱情提供梗及翻譯',
eb6793ba 789 'uploader': '孫ᄋᄅ',
6271f1ca
PH
790 'title': '[A-made] 變態妍字幕版 太妍 我就是這樣的人',
791 },
06b491eb
S
792 },
793 # url_encoded_fmt_stream_map is empty string
794 {
795 'url': 'qEJwOuvDf7I',
796 'info_dict': {
797 'id': 'qEJwOuvDf7I',
f57b7835 798 'ext': 'webm',
06b491eb
S
799 'title': 'Обсуждение судебной практики по выборам 14 сентября 2014 года в Санкт-Петербурге',
800 'description': '',
801 'upload_date': '20150404',
802 'uploader_id': 'spbelect',
803 'uploader': 'Наблюдатели Петербурга',
804 },
805 'params': {
806 'skip_download': 'requires avconv',
e323cf3f
S
807 },
808 'skip': 'This live event has ended.',
06b491eb 809 },
067aa17e 810 # Extraction from multiple DASH manifests (https://github.com/ytdl-org/youtube-dl/pull/6097)
da77d856
S
811 {
812 'url': 'https://www.youtube.com/watch?v=FIl7x6_3R5Y',
813 'info_dict': {
814 'id': 'FIl7x6_3R5Y',
eb6793ba 815 'ext': 'webm',
da77d856
S
816 'title': 'md5:7b81415841e02ecd4313668cde88737a',
817 'description': 'md5:116377fd2963b81ec4ce64b542173306',
556dbe7f 818 'duration': 220,
da77d856
S
819 'upload_date': '20150625',
820 'uploader_id': 'dorappi2000',
ec85ded8 821 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/dorappi2000',
da77d856 822 'uploader': 'dorappi2000',
eb6793ba 823 'formats': 'mincount:31',
da77d856 824 },
eb6793ba 825 'skip': 'not actual anymore',
2ee8f5d8 826 },
8a1a26ce
YCH
827 # DASH manifest with segment_list
828 {
829 'url': 'https://www.youtube.com/embed/CsmdDsKjzN8',
830 'md5': '8ce563a1d667b599d21064e982ab9e31',
831 'info_dict': {
832 'id': 'CsmdDsKjzN8',
833 'ext': 'mp4',
17ee98e1 834 'upload_date': '20150501', # According to '<meta itemprop="datePublished"', but in other places it's 20150510
8a1a26ce
YCH
835 'uploader': 'Airtek',
836 'description': 'Retransmisión en directo de la XVIII media maratón de Zaragoza.',
837 'uploader_id': 'UCzTzUmjXxxacNnL8I3m4LnQ',
838 'title': 'Retransmisión XVIII Media maratón Zaragoza 2015',
839 },
840 'params': {
841 'youtube_include_dash_manifest': True,
842 'format': '135', # bestvideo
be49068d
S
843 },
844 'skip': 'This live event has ended.',
2ee8f5d8 845 },
cf7e015f
S
846 {
847 # Multifeed videos (multiple cameras), URL is for Main Camera
545cc85d 848 'url': 'https://www.youtube.com/watch?v=jvGDaLqkpTg',
cf7e015f 849 'info_dict': {
545cc85d 850 'id': 'jvGDaLqkpTg',
851 'title': 'Tom Clancy Free Weekend Rainbow Whatever',
852 'description': 'md5:e03b909557865076822aa169218d6a5d',
cf7e015f
S
853 },
854 'playlist': [{
855 'info_dict': {
545cc85d 856 'id': 'jvGDaLqkpTg',
cf7e015f 857 'ext': 'mp4',
545cc85d 858 'title': 'Tom Clancy Free Weekend Rainbow Whatever (Main Camera)',
859 'description': 'md5:e03b909557865076822aa169218d6a5d',
860 'duration': 10643,
861 'upload_date': '20161111',
862 'uploader': 'Team PGP',
863 'uploader_id': 'UChORY56LMMETTuGjXaJXvLg',
864 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UChORY56LMMETTuGjXaJXvLg',
cf7e015f
S
865 },
866 }, {
867 'info_dict': {
545cc85d 868 'id': '3AKt1R1aDnw',
cf7e015f 869 'ext': 'mp4',
545cc85d 870 'title': 'Tom Clancy Free Weekend Rainbow Whatever (Camera 2)',
871 'description': 'md5:e03b909557865076822aa169218d6a5d',
872 'duration': 10991,
873 'upload_date': '20161111',
874 'uploader': 'Team PGP',
875 'uploader_id': 'UChORY56LMMETTuGjXaJXvLg',
876 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UChORY56LMMETTuGjXaJXvLg',
cf7e015f
S
877 },
878 }, {
879 'info_dict': {
545cc85d 880 'id': 'RtAMM00gpVc',
cf7e015f 881 'ext': 'mp4',
545cc85d 882 'title': 'Tom Clancy Free Weekend Rainbow Whatever (Camera 3)',
883 'description': 'md5:e03b909557865076822aa169218d6a5d',
884 'duration': 10995,
885 'upload_date': '20161111',
886 'uploader': 'Team PGP',
887 'uploader_id': 'UChORY56LMMETTuGjXaJXvLg',
888 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UChORY56LMMETTuGjXaJXvLg',
cf7e015f
S
889 },
890 }, {
891 'info_dict': {
545cc85d 892 'id': '6N2fdlP3C5U',
cf7e015f 893 'ext': 'mp4',
545cc85d 894 'title': 'Tom Clancy Free Weekend Rainbow Whatever (Camera 4)',
895 'description': 'md5:e03b909557865076822aa169218d6a5d',
896 'duration': 10990,
897 'upload_date': '20161111',
898 'uploader': 'Team PGP',
899 'uploader_id': 'UChORY56LMMETTuGjXaJXvLg',
900 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UChORY56LMMETTuGjXaJXvLg',
cf7e015f
S
901 },
902 }],
903 'params': {
904 'skip_download': True,
905 },
cbaed4bb 906 },
f9f49d87 907 {
067aa17e 908 # Multifeed video with comma in title (see https://github.com/ytdl-org/youtube-dl/issues/8536)
f9f49d87
S
909 'url': 'https://www.youtube.com/watch?v=gVfLd0zydlo',
910 'info_dict': {
911 'id': 'gVfLd0zydlo',
912 'title': 'DevConf.cz 2016 Day 2 Workshops 1 14:00 - 15:30',
913 },
914 'playlist_count': 2,
be49068d 915 'skip': 'Not multifeed anymore',
f9f49d87 916 },
cbaed4bb 917 {
2d3d2997 918 'url': 'https://vid.plus/FlRa-iH7PGw',
cbaed4bb 919 'only_matching': True,
0e49d9a6 920 },
6d4fc66b 921 {
2d3d2997 922 'url': 'https://zwearz.com/watch/9lWxNJF-ufM/electra-woman-dyna-girl-official-trailer-grace-helbig.html',
6d4fc66b
S
923 'only_matching': True,
924 },
0e49d9a6 925 {
067aa17e 926 # Title with JS-like syntax "};" (see https://github.com/ytdl-org/youtube-dl/issues/7468)
a8776b10 927 # Also tests cut-off URL expansion in video description (see
067aa17e
S
928 # https://github.com/ytdl-org/youtube-dl/issues/1892,
929 # https://github.com/ytdl-org/youtube-dl/issues/8164)
0e49d9a6
LL
930 'url': 'https://www.youtube.com/watch?v=lsguqyKfVQg',
931 'info_dict': {
932 'id': 'lsguqyKfVQg',
933 'ext': 'mp4',
934 'title': '{dark walk}; Loki/AC/Dishonored; collab w/Elflover21',
eb6793ba 935 'alt_title': 'Dark Walk - Position Music',
0e49d9a6 936 'description': 'md5:8085699c11dc3f597ce0410b0dcbb34a',
556dbe7f 937 'duration': 133,
0e49d9a6
LL
938 'upload_date': '20151119',
939 'uploader_id': 'IronSoulElf',
ec85ded8 940 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/IronSoulElf',
0e49d9a6 941 'uploader': 'IronSoulElf',
eb6793ba
S
942 'creator': 'Todd Haberman, Daniel Law Heath and Aaron Kaplan',
943 'track': 'Dark Walk - Position Music',
944 'artist': 'Todd Haberman, Daniel Law Heath and Aaron Kaplan',
92bc97d3 945 'album': 'Position Music - Production Music Vol. 143 - Dark Walk',
0e49d9a6
LL
946 },
947 'params': {
948 'skip_download': True,
949 },
950 },
61f92af1 951 {
067aa17e 952 # Tags with '};' (see https://github.com/ytdl-org/youtube-dl/issues/7468)
61f92af1
S
953 'url': 'https://www.youtube.com/watch?v=Ms7iBXnlUO8',
954 'only_matching': True,
955 },
313dfc45
LL
956 {
957 # Video with yt:stretch=17:0
958 'url': 'https://www.youtube.com/watch?v=Q39EVAstoRM',
959 'info_dict': {
960 'id': 'Q39EVAstoRM',
961 'ext': 'mp4',
962 'title': 'Clash Of Clans#14 Dicas De Ataque Para CV 4',
963 'description': 'md5:ee18a25c350637c8faff806845bddee9',
964 'upload_date': '20151107',
965 'uploader_id': 'UCCr7TALkRbo3EtFzETQF1LA',
966 'uploader': 'CH GAMER DROID',
967 },
968 'params': {
969 'skip_download': True,
970 },
be49068d 971 'skip': 'This video does not exist.',
313dfc45 972 },
7caf9830
S
973 {
974 # Video licensed under Creative Commons
975 'url': 'https://www.youtube.com/watch?v=M4gD1WSo5mA',
976 'info_dict': {
977 'id': 'M4gD1WSo5mA',
978 'ext': 'mp4',
979 'title': 'md5:e41008789470fc2533a3252216f1c1d1',
980 'description': 'md5:a677553cf0840649b731a3024aeff4cc',
556dbe7f 981 'duration': 721,
7caf9830
S
982 'upload_date': '20150127',
983 'uploader_id': 'BerkmanCenter',
ec85ded8 984 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/BerkmanCenter',
556dbe7f 985 'uploader': 'The Berkman Klein Center for Internet & Society',
7caf9830
S
986 'license': 'Creative Commons Attribution license (reuse allowed)',
987 },
988 'params': {
989 'skip_download': True,
990 },
991 },
fd050249
S
992 {
993 # Channel-like uploader_url
994 'url': 'https://www.youtube.com/watch?v=eQcmzGIKrzg',
995 'info_dict': {
996 'id': 'eQcmzGIKrzg',
997 'ext': 'mp4',
998 'title': 'Democratic Socialism and Foreign Policy | Bernie Sanders',
545cc85d 999 'description': 'md5:13a2503d7b5904ef4b223aa101628f39',
556dbe7f 1000 'duration': 4060,
fd050249 1001 'upload_date': '20151119',
eb6793ba 1002 'uploader': 'Bernie Sanders',
fd050249 1003 'uploader_id': 'UCH1dpzjCEiGAt8CXkryhkZg',
ec85ded8 1004 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UCH1dpzjCEiGAt8CXkryhkZg',
fd050249
S
1005 'license': 'Creative Commons Attribution license (reuse allowed)',
1006 },
1007 'params': {
1008 'skip_download': True,
1009 },
1010 },
040ac686
S
1011 {
1012 'url': 'https://www.youtube.com/watch?feature=player_embedded&amp;amp;v=V36LpHqtcDY',
1013 'only_matching': True,
7f29cf54
S
1014 },
1015 {
067aa17e 1016 # YouTube Red paid video (https://github.com/ytdl-org/youtube-dl/issues/10059)
7f29cf54
S
1017 'url': 'https://www.youtube.com/watch?v=i1Ko8UG-Tdo',
1018 'only_matching': True,
6496ccb4
S
1019 },
1020 {
1021 # Rental video preview
1022 'url': 'https://www.youtube.com/watch?v=yYr8q0y5Jfg',
1023 'info_dict': {
1024 'id': 'uGpuVWrhIzE',
1025 'ext': 'mp4',
1026 'title': 'Piku - Trailer',
1027 'description': 'md5:c36bd60c3fd6f1954086c083c72092eb',
1028 'upload_date': '20150811',
1029 'uploader': 'FlixMatrix',
1030 'uploader_id': 'FlixMatrixKaravan',
ec85ded8 1031 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/FlixMatrixKaravan',
6496ccb4
S
1032 'license': 'Standard YouTube License',
1033 },
1034 'params': {
1035 'skip_download': True,
1036 },
eb6793ba 1037 'skip': 'This video is not available.',
022a5d66 1038 },
12afdc2a
S
1039 {
1040 # YouTube Red video with episode data
1041 'url': 'https://www.youtube.com/watch?v=iqKdEhx-dD4',
1042 'info_dict': {
1043 'id': 'iqKdEhx-dD4',
1044 'ext': 'mp4',
1045 'title': 'Isolation - Mind Field (Ep 1)',
545cc85d 1046 'description': 'md5:f540112edec5d09fc8cc752d3d4ba3cd',
556dbe7f 1047 'duration': 2085,
12afdc2a
S
1048 'upload_date': '20170118',
1049 'uploader': 'Vsauce',
1050 'uploader_id': 'Vsauce',
1051 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/Vsauce',
12afdc2a
S
1052 'series': 'Mind Field',
1053 'season_number': 1,
1054 'episode_number': 1,
1055 },
1056 'params': {
1057 'skip_download': True,
1058 },
1059 'expected_warnings': [
1060 'Skipping DASH manifest',
1061 ],
1062 },
c7121fa7
S
1063 {
1064 # The following content has been identified by the YouTube community
1065 # as inappropriate or offensive to some audiences.
1066 'url': 'https://www.youtube.com/watch?v=6SJNVb0GnPI',
1067 'info_dict': {
1068 'id': '6SJNVb0GnPI',
1069 'ext': 'mp4',
1070 'title': 'Race Differences in Intelligence',
1071 'description': 'md5:5d161533167390427a1f8ee89a1fc6f1',
1072 'duration': 965,
1073 'upload_date': '20140124',
1074 'uploader': 'New Century Foundation',
1075 'uploader_id': 'UCEJYpZGqgUob0zVVEaLhvVg',
1076 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UCEJYpZGqgUob0zVVEaLhvVg',
c7121fa7
S
1077 },
1078 'params': {
1079 'skip_download': True,
1080 },
545cc85d 1081 'skip': 'This video has been removed for violating YouTube\'s policy on hate speech.',
c7121fa7 1082 },
022a5d66
S
1083 {
1084 # itag 212
1085 'url': '1t24XAntNCY',
1086 'only_matching': True,
fd5c4aab
S
1087 },
1088 {
1089 # geo restricted to JP
1090 'url': 'sJL6WA-aGkQ',
1091 'only_matching': True,
1092 },
cd5a74a2
S
1093 {
1094 'url': 'https://invidio.us/watch?v=BaW_jenozKc',
1095 'only_matching': True,
1096 },
bc2ca1bb 1097 {
1098 'url': 'https://redirect.invidious.io/watch?v=BaW_jenozKc',
1099 'only_matching': True,
1100 },
1101 {
1102 # from https://nitter.pussthecat.org/YouTube/status/1360363141947944964#m
1103 'url': 'https://redirect.invidious.io/Yh0AhrY9GjA',
1104 'only_matching': True,
1105 },
825cd268
RA
1106 {
1107 # DRM protected
1108 'url': 'https://www.youtube.com/watch?v=s7_qI6_mIXc',
1109 'only_matching': True,
4fe54c12
S
1110 },
1111 {
1112 # Video with unsupported adaptive stream type formats
1113 'url': 'https://www.youtube.com/watch?v=Z4Vy8R84T1U',
1114 'info_dict': {
1115 'id': 'Z4Vy8R84T1U',
1116 'ext': 'mp4',
1117 'title': 'saman SMAN 53 Jakarta(Sancety) opening COFFEE4th at SMAN 53 Jakarta',
1118 'description': 'md5:d41d8cd98f00b204e9800998ecf8427e',
1119 'duration': 433,
1120 'upload_date': '20130923',
1121 'uploader': 'Amelia Putri Harwita',
1122 'uploader_id': 'UCpOxM49HJxmC1qCalXyB3_Q',
1123 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UCpOxM49HJxmC1qCalXyB3_Q',
1124 'formats': 'maxcount:10',
1125 },
1126 'params': {
1127 'skip_download': True,
1128 'youtube_include_dash_manifest': False,
1129 },
5429d6a9 1130 'skip': 'not actual anymore',
5caabd3c 1131 },
1132 {
822b9d9c 1133 # Youtube Music Auto-generated description
5caabd3c 1134 'url': 'https://music.youtube.com/watch?v=MgNrAu2pzNs',
1135 'info_dict': {
1136 'id': 'MgNrAu2pzNs',
1137 'ext': 'mp4',
1138 'title': 'Voyeur Girl',
1139 'description': 'md5:7ae382a65843d6df2685993e90a8628f',
1140 'upload_date': '20190312',
5429d6a9
S
1141 'uploader': 'Stephen - Topic',
1142 'uploader_id': 'UC-pWHpBjdGG69N9mM2auIAA',
5caabd3c 1143 'artist': 'Stephen',
1144 'track': 'Voyeur Girl',
1145 'album': 'it\'s too much love to know my dear',
1146 'release_date': '20190313',
1147 'release_year': 2019,
1148 },
1149 'params': {
1150 'skip_download': True,
1151 },
1152 },
66b48727
RA
1153 {
1154 'url': 'https://www.youtubekids.com/watch?v=3b8nCWDgZ6Q',
1155 'only_matching': True,
1156 },
011e75e6
S
1157 {
1158 # invalid -> valid video id redirection
1159 'url': 'DJztXj2GPfl',
1160 'info_dict': {
1161 'id': 'DJztXj2GPfk',
1162 'ext': 'mp4',
1163 'title': 'Panjabi MC - Mundian To Bach Ke (The Dictator Soundtrack)',
1164 'description': 'md5:bf577a41da97918e94fa9798d9228825',
1165 'upload_date': '20090125',
1166 'uploader': 'Prochorowka',
1167 'uploader_id': 'Prochorowka',
1168 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/Prochorowka',
1169 'artist': 'Panjabi MC',
1170 'track': 'Beware of the Boys (Mundian to Bach Ke) - Motivo Hi-Lectro Remix',
1171 'album': 'Beware of the Boys (Mundian To Bach Ke)',
1172 },
1173 'params': {
1174 'skip_download': True,
1175 },
545cc85d 1176 'skip': 'Video unavailable',
ea74e00b
DP
1177 },
1178 {
1179 # empty description results in an empty string
1180 'url': 'https://www.youtube.com/watch?v=x41yOUIvK2k',
1181 'info_dict': {
1182 'id': 'x41yOUIvK2k',
1183 'ext': 'mp4',
1184 'title': 'IMG 3456',
1185 'description': '',
1186 'upload_date': '20170613',
1187 'uploader_id': 'ElevageOrVert',
1188 'uploader': 'ElevageOrVert',
1189 },
1190 'params': {
1191 'skip_download': True,
1192 },
1193 },
a0566bbf 1194 {
29f7c58a 1195 # with '};' inside yt initial data (see [1])
1196 # see [2] for an example with '};' inside ytInitialPlayerResponse
1197 # 1. https://github.com/ytdl-org/youtube-dl/issues/27093
1198 # 2. https://github.com/ytdl-org/youtube-dl/issues/27216
a0566bbf 1199 'url': 'https://www.youtube.com/watch?v=CHqg6qOn4no',
1200 'info_dict': {
1201 'id': 'CHqg6qOn4no',
1202 'ext': 'mp4',
1203 'title': 'Part 77 Sort a list of simple types in c#',
1204 'description': 'md5:b8746fa52e10cdbf47997903f13b20dc',
1205 'upload_date': '20130831',
1206 'uploader_id': 'kudvenkat',
1207 'uploader': 'kudvenkat',
1208 },
1209 'params': {
1210 'skip_download': True,
1211 },
1212 },
29f7c58a 1213 {
1214 # another example of '};' in ytInitialData
1215 'url': 'https://www.youtube.com/watch?v=gVfgbahppCY',
1216 'only_matching': True,
1217 },
1218 {
1219 'url': 'https://www.youtube.com/watch_popup?v=63RmMXCd_bQ',
1220 'only_matching': True,
1221 },
545cc85d 1222 {
cc2db878 1223 # https://github.com/ytdl-org/youtube-dl/pull/28094
1224 'url': 'OtqTfy26tG0',
1225 'info_dict': {
1226 'id': 'OtqTfy26tG0',
1227 'ext': 'mp4',
1228 'title': 'Burn Out',
1229 'description': 'md5:8d07b84dcbcbfb34bc12a56d968b6131',
1230 'upload_date': '20141120',
1231 'uploader': 'The Cinematic Orchestra - Topic',
1232 'uploader_id': 'UCIzsJBIyo8hhpFm1NK0uLgw',
1233 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UCIzsJBIyo8hhpFm1NK0uLgw',
1234 'artist': 'The Cinematic Orchestra',
1235 'track': 'Burn Out',
1236 'album': 'Every Day',
1237 'release_data': None,
1238 'release_year': None,
1239 },
1240 'params': {
1241 'skip_download': True,
1242 },
545cc85d 1243 },
bc2ca1bb 1244 {
1245 # controversial video, only works with bpctr when authenticated with cookies
1246 'url': 'https://www.youtube.com/watch?v=nGC3D_FkCmg',
1247 'only_matching': True,
1248 },
2eb88d95
PH
1249 ]
1250
e0df6211
PH
def __init__(self, *args, **kwargs):
    """Initialise the extractor and create its empty per-instance caches."""
    super(YoutubeIE, self).__init__(*args, **kwargs)
    # Downloaded player source code, keyed by player id.
    self._code_cache = dict()
    # Signature functions, keyed by (player_url, signature cache id).
    self._player_cache = dict()
e0df6211 1255
60064c53
PH
def _signature_cache_id(self, example_sig):
    """Return the dot-joined lengths of the dot-separated parts of a signature.

    For example a signature shaped like 'abc.de' yields '3.2'.
    """
    part_lengths = [compat_str(len(chunk)) for chunk in example_sig.split('.')]
    return '.'.join(part_lengths)
60064c53 1259
e40c758c
S
@classmethod
def _extract_player_info(cls, player_url):
    """Return the player id found in *player_url*.

    Each pattern in cls._PLAYER_INFO_RE is tried in order and the 'id'
    group of the first one that matches is returned.

    Raises ExtractorError if none of the patterns matches.
    """
    for pattern in cls._PLAYER_INFO_RE:
        match = re.search(pattern, player_url)
        if match:
            return match.group('id')
    raise ExtractorError('Cannot identify player %r' % player_url)
e40c758c
S
1269
def _extract_signature_function(self, video_id, player_url, example_sig):
    """Return a callable that transforms an encrypted signature string.

    The result is looked up in the downloader's 'youtube-sigfuncs' cache
    first. On a miss the player code is downloaded (once per player id),
    parsed, and the resulting character permutation is stored in the cache.
    """
    player_id = self._extract_player_info(player_url)

    # Cache key; it is used as a file name, so it must not contain a path.
    sig_shape = self._signature_cache_id(example_sig)
    func_id = 'js_%s_%s' % (player_id, sig_shape)
    assert os.path.basename(func_id) == func_id

    cached_spec = self._downloader.cache.load('youtube-sigfuncs', func_id)
    if cached_spec is not None:
        # The cached spec is a list of source indices, one per output char.
        return lambda s: ''.join(s[i] for i in cached_spec)

    if player_id not in self._code_cache:
        self._code_cache[player_id] = self._download_webpage(
            player_url, video_id,
            note='Downloading player ' + player_id,
            errnote='Download of %s failed' % player_url)
    sig_func = self._parse_sig_js(self._code_cache[player_id])

    # Feed a string whose i-th character has code point i, so the output
    # reveals which input position each output character came from.
    probe = ''.join(map(compat_chr, range(len(example_sig))))
    permutation = [ord(ch) for ch in sig_func(probe)]

    self._downloader.cache.store('youtube-sigfuncs', func_id, permutation)
    return sig_func
1296
def _print_sig_code(self, func, example_sig):
    """Print Python code equivalent to the signature function *func*.

    The function is probed with a string whose i-th character has code
    point i; the resulting index list is rendered as a sum of index and
    slice expressions on ``s`` and written out with self.to_screen.
    """
    def gen_sig_code(idxs):
        """Yield 's[...]' expressions that together reproduce *idxs*."""
        def _genslice(start, end, step):
            first = '' if start == 0 else str(start)
            stop = end + step
            last = (':%d' % stop) if stop >= 0 else ':'
            stride = '' if step == 1 else (':%d' % step)
            return 's[%s%s%s]' % (first, last, stride)

        run_step = None
        # Placeholder only; run_start is always set together with run_step
        run_start = '(Never used)'
        for cur, prev in zip(idxs[1:], idxs[:-1]):
            delta = cur - prev
            if run_step is None:
                if delta in [-1, 1]:
                    # Start of a consecutive ascending/descending run
                    run_step = delta
                    run_start = prev
                else:
                    yield 's[%d]' % prev
            elif delta != run_step:
                # The run ended at prev; emit it as a slice
                yield _genslice(run_start, prev, run_step)
                run_step = None
        if run_step is None:
            yield 's[%d]' % cur
        else:
            yield _genslice(run_start, cur, run_step)

    probe = ''.join(map(compat_chr, range(len(example_sig))))
    permutation = [ord(ch) for ch in func(probe)]
    expr_code = ' + '.join(gen_sig_code(permutation))
    part_lengths = ', '.join(
        compat_str(len(p)) for p in example_sig.split('.'))
    signature_id_tuple = '(%s)' % part_lengths
    code = ('if tuple(len(p) for p in s.split(\'.\')) == %s:\n'
            ' return %s\n') % (signature_id_tuple, expr_code)
    self.to_screen('Extracted signature function:\n' + code)
edf3e38e 1335
e0df6211
PH
def _parse_sig_js(self, jscode):
    """Return a callable running the signature function found in *jscode*.

    The name of the signature function is located with the first matching
    pattern below (tried in order), the function is extracted with
    JSInterpreter, and a wrapper taking the signature string is returned.
    """
    funcname = self._search_regex(
        (r'\b[cs]\s*&&\s*[adf]\.set\([^,]+\s*,\s*encodeURIComponent\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\(',
         r'\b[a-zA-Z0-9]+\s*&&\s*[a-zA-Z0-9]+\.set\([^,]+\s*,\s*encodeURIComponent\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\(',
         r'\bm=(?P<sig>[a-zA-Z0-9$]{2})\(decodeURIComponent\(h\.s\)\)',
         r'\bc&&\(c=(?P<sig>[a-zA-Z0-9$]{2})\(decodeURIComponent\(c\)\)',
         r'(?:\b|[^a-zA-Z0-9$])(?P<sig>[a-zA-Z0-9$]{2})\s*=\s*function\(\s*a\s*\)\s*{\s*a\s*=\s*a\.split\(\s*""\s*\);[a-zA-Z0-9$]{2}\.[a-zA-Z0-9$]{2}\(a,\d+\)',
         r'(?:\b|[^a-zA-Z0-9$])(?P<sig>[a-zA-Z0-9$]{2})\s*=\s*function\(\s*a\s*\)\s*{\s*a\s*=\s*a\.split\(\s*""\s*\)',
         r'(?P<sig>[a-zA-Z0-9$]+)\s*=\s*function\(\s*a\s*\)\s*{\s*a\s*=\s*a\.split\(\s*""\s*\)',
         # Obsolete patterns
         r'(["\'])signature\1\s*,\s*(?P<sig>[a-zA-Z0-9$]+)\(',
         r'\.sig\|\|(?P<sig>[a-zA-Z0-9$]+)\(',
         r'yt\.akamaized\.net/\)\s*\|\|\s*.*?\s*[cs]\s*&&\s*[adf]\.set\([^,]+\s*,\s*(?:encodeURIComponent\s*\()?\s*(?P<sig>[a-zA-Z0-9$]+)\(',
         r'\b[cs]\s*&&\s*[adf]\.set\([^,]+\s*,\s*(?P<sig>[a-zA-Z0-9$]+)\(',
         r'\b[a-zA-Z0-9]+\s*&&\s*[a-zA-Z0-9]+\.set\([^,]+\s*,\s*(?P<sig>[a-zA-Z0-9$]+)\(',
         r'\bc\s*&&\s*a\.set\([^,]+\s*,\s*\([^)]*\)\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\(',
         # This pattern used to be listed twice in a row; the exact
         # duplicate could never match after the first failed.
         r'\bc\s*&&\s*[a-zA-Z0-9]+\.set\([^,]+\s*,\s*\([^)]*\)\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\('),
        jscode, 'Initial JS player signature function name', group='sig')

    jsi = JSInterpreter(jscode)
    initial_function = jsi.extract_function(funcname)
    # The extracted function takes its arguments as a list.
    return lambda s: initial_function([s])
1359
def _decrypt_signature(self, s, video_id, player_url):
    """Turn the encrypted s field into a working signature.

    Raises ExtractorError if *player_url* is None or if anything goes wrong
    while extracting or running the signature function.
    """
    if player_url is None:
        raise ExtractorError('Cannot decrypt signature without player_url')

    # Normalise protocol-relative and site-relative player URLs.
    if player_url.startswith('//'):
        player_url = 'https:' + player_url
    elif not re.match(r'https?://', player_url):
        player_url = compat_urlparse.urljoin(
            'https://www.youtube.com', player_url)

    try:
        cache_key = (player_url, self._signature_cache_id(s))
        if cache_key not in self._player_cache:
            self._player_cache[cache_key] = self._extract_signature_function(
                video_id, player_url, s)
        sig_func = self._player_cache[cache_key]
        if self._downloader.params.get('youtube_print_sig_code'):
            self._print_sig_code(sig_func, s)
        return sig_func(s)
    except Exception as e:
        # Include the full traceback in the message reported to the user.
        raise ExtractorError(
            'Signature extraction failed: ' + traceback.format_exc(), cause=e)
e0df6211 1386
def _mark_watched(self, video_id, player_response):
    """Request the playback-tracking URL so the video is marked as watched.

    Does nothing when *player_response* carries no valid
    playbackTracking/videostatsPlaybackUrl/baseUrl. The request itself is
    non-fatal.
    """
    playback_url = url_or_none(try_get(
        player_response,
        lambda x: x['playbackTracking']['videostatsPlaybackUrl']['baseUrl']))
    if not playback_url:
        return
    parsed_playback_url = compat_urlparse.urlparse(playback_url)
    qs = compat_urlparse.parse_qs(parsed_playback_url.query)

    # cpn generation algorithm is reverse engineered from base.js.
    # In fact it works even with dummy cpn.
    CPN_ALPHABET = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789-_'
    # random.randint(0, 256) is inclusive of 256, which made index 0
    # slightly more likely after masking with 63; pick uniformly instead.
    cpn = ''.join(random.choice(CPN_ALPHABET) for _ in range(16))

    qs.update({
        'ver': ['2'],
        'cpn': [cpn],
    })
    playback_url = compat_urlparse.urlunparse(
        parsed_playback_url._replace(query=compat_urllib_parse_urlencode(qs, True)))

    self._download_webpage(
        playback_url, video_id, 'Marking watched',
        'Unable to mark watched', fatal=False)
1411
66c9fa36
S
@staticmethod
def _extract_urls(webpage):
    """Return a list of YouTube embeds found in *webpage*.

    Entries are embed URLs for regular players and the raw attribute
    values (video ids) for the lazyYT and "YouTube Video Importer" markup.
    """
    # Embedded YouTube player
    # NOTE: the embedSWF alternative used to be only `embedSWF\(?:\s*`,
    # i.e. an optional "(" followed by a literal ":", which cannot match a
    # call such as embedSWF("//www.youtube.com/v/..."). The intended
    # `\(\s*` form is added; the old form is kept so that everything
    # matched before still matches.
    entries = [
        unescapeHTML(mobj.group('url'))
        for mobj in re.finditer(r'''(?x)
        (?:
            <iframe[^>]+?src=|
            data-video-url=|
            <embed[^>]+?src=|
            embedSWF(?:\(\s*|\(?:\s*)|
            <object[^>]+data=|
            new\s+SWFObject\(
        )
        (["\'])
            (?P<url>(?:https?:)?//(?:www\.)?youtube(?:-nocookie)?\.com/
            (?:embed|v|p)/[0-9A-Za-z_-]{11}.*?)
        \1''', webpage)]

    # lazyYT YouTube embed
    entries.extend(
        unescapeHTML(video_id)
        for video_id in re.findall(
            r'class="lazyYT" data-youtube-id="([^"]+)"', webpage))

    # Wordpress "YouTube Video Importer" plugin
    matches = re.findall(r'''(?x)<div[^>]+
        class=(?P<q1>[\'"])[^\'"]*\byvii_single_video_player\b[^\'"]*(?P=q1)[^>]+
        data-video_id=(?P<q2>[\'"])([^\'"]+)(?P=q2)''', webpage)
    # findall yields one tuple per match; the video id is the last group.
    entries.extend(m[-1] for m in matches)

    return entries
1443
@staticmethod
def _extract_url(webpage):
    """Return the first entry found by _extract_urls, or None if there is none."""
    found = YoutubeIE._extract_urls(webpage)
    if not found:
        return None
    return found[0]
1448
97665381
PH
@classmethod
def extract_id(cls, url):
    """Return the video id, i.e. group 2 of cls._VALID_URL matched on *url*.

    Raises ExtractorError if *url* does not match.
    """
    match = re.match(cls._VALID_URL, url, re.VERBOSE)
    if match is not None:
        return match.group(2)
    raise ExtractorError('Invalid URL: %s' % url)
1456
def _extract_chapters_from_json(self, data, video_id, duration):
    """Build a list of chapter dicts from the chaptered player bar in *data*.

    Each chapter ends where the next one starts; the last one ends at
    *duration*. Chapters whose start or end time cannot be determined are
    skipped. Returns None when *data* holds no chapter list.
    """
    raw_chapters = try_get(
        data,
        lambda x: x['playerOverlays']
                   ['playerOverlayRenderer']
                   ['decoratedPlayerBarRenderer']
                   ['decoratedPlayerBarRenderer']
                   ['playerBar']
                   ['chapteredPlayerBarRenderer']
                   ['chapters'],
        list)
    if not raw_chapters:
        return

    def chapter_time(chapter):
        # timeRangeStartMillis is in milliseconds; convert to seconds
        millis = try_get(
            chapter,
            lambda x: x['chapterRenderer']['timeRangeStartMillis'],
            int)
        return float_or_none(millis, scale=1000)

    total = len(raw_chapters)
    chapters = []
    for idx, chapter in enumerate(raw_chapters):
        start_time = chapter_time(chapter)
        if start_time is None:
            continue
        following = idx + 1
        if following < total:
            end_time = chapter_time(raw_chapters[following])
        else:
            end_time = duration
        if end_time is None:
            continue
        chapters.append({
            'start_time': start_time,
            'end_time': end_time,
            'title': try_get(
                chapter,
                lambda x: x['chapterRenderer']['title']['simpleText'],
                compat_str),
        })
    return chapters
1496
def _extract_yt_initial_variable(self, webpage, regex, video_id, name):
    """Find the JSON matched by *regex* in *webpage* and parse it.

    The regex is first tried followed by self._YT_INITIAL_BOUNDARY_RE, then
    on its own. '{}' is parsed when neither matches; parsing is non-fatal.
    """
    patterns = (
        r'%s\s*%s' % (regex, self._YT_INITIAL_BOUNDARY_RE),
        regex)
    raw_json = self._search_regex(patterns, webpage, name, default='{}')
    return self._parse_json(raw_json, video_id, fatal=False)
84213ea8 1501
a1c5d2ca
M
@staticmethod
def _join_text_entries(runs):
    """Concatenate the non-empty 'text' strings of the dicts in *runs*.

    Entries that are not dicts are ignored. Returns None when no text was
    found.
    """
    pieces = []
    for run in runs:
        if not isinstance(run, dict):
            continue
        piece = try_get(run, lambda x: x['text'], compat_str)
        if piece:
            pieces.append(piece)
    return ''.join(pieces) if pieces else None
1515
def _extract_comment(self, comment_renderer, parent=None):
    """Convert a comment renderer dict into a comment info dict.

    Returns None when the renderer has no 'commentId'. *parent* is stored
    under 'parent', falling back to 'root' when it is empty.
    """
    comment_id = comment_renderer.get('commentId')
    if not comment_id:
        return

    def joined_runs(key):
        # Join the text runs stored under comment_renderer[key]['runs']
        runs = try_get(comment_renderer, lambda x: x[key]['runs']) or []
        return self._join_text_entries(runs)

    vote_text = try_get(
        comment_renderer,
        (lambda x: x['voteCount']['simpleText'],
         lambda x: x['likeCount']),
        compat_str)

    return {
        'id': comment_id,
        'text': joined_runs('contentText') or '',
        # TODO: This should be parsed to timestamp
        'time_text': joined_runs('publishedTimeText'),
        'like_count': str_to_int(vote_text) or 0,
        'is_favorited': try_get(
            comment_renderer, lambda x: x['isLiked'], bool),
        'author': try_get(
            comment_renderer, lambda x: x['authorText']['simpleText'], compat_str),
        'author_id': try_get(
            comment_renderer,
            lambda x: x['authorEndpoint']['browseEndpoint']['browseId'], compat_str),
        'author_thumbnail': try_get(
            comment_renderer,
            lambda x: x['authorThumbnail']['thumbnails'][-1]['url'], compat_str),
        'author_is_uploader': try_get(
            comment_renderer, lambda x: x['authorIsChannelOwner'], bool),
        'parent': parent or 'root'
    }
1549
def _comment_entries(self, root_continuation_data, identity_token, account_syncid,
                     session_token_list, parent=None, comment_counts=None):
    """Generate the comments (and, recursively, their replies) of a thread.

    Yields comment dicts as produced by `_extract_comment`. On the first
    page of a top-level listing it may additionally yield a single int:
    the estimated total number of comments.

    `session_token_list` is a one-element list so that a refreshed XSRF
    token can be shared with the recursive calls made for reply threads.
    `comment_counts` is the shared progress state
    [comments so far, estimated total, current reply thread number].
    """

    def extract_thread(parent_renderer):
        contents = try_get(parent_renderer, lambda x: x['contents'], list) or []
        if not parent:
            comment_counts[2] = 0
        for content in contents:
            comment_thread_renderer = try_get(content, lambda x: x['commentThreadRenderer'])
            # `dict` must be passed as the expected type. Passing it inside
            # the getter tuple made it act as a second getter, which
            # returned the container itself or raised ValueError.
            comment_renderer = try_get(
                comment_thread_renderer,
                lambda x: x['comment']['commentRenderer'], dict) or try_get(
                content, lambda x: x['commentRenderer'], dict)

            if not comment_renderer:
                continue
            comment = self._extract_comment(comment_renderer, parent)
            if not comment:
                continue
            comment_counts[0] += 1
            yield comment
            # Attempt to get the replies
            comment_replies_renderer = try_get(
                comment_thread_renderer, lambda x: x['replies']['commentRepliesRenderer'], dict)

            if comment_replies_renderer:
                comment_counts[2] += 1
                comment_entries_iter = self._comment_entries(
                    comment_replies_renderer, identity_token, account_syncid,
                    parent=comment.get('id'), session_token_list=session_token_list,
                    comment_counts=comment_counts)

                for reply_comment in comment_entries_iter:
                    yield reply_comment

    if not comment_counts:
        # comment so far, est. total comments, current comment thread #
        comment_counts = [0, 0, 0]
    headers = self._DEFAULT_BASIC_API_HEADERS.copy()

    # TODO: Generalize the download code with TabIE
    if identity_token:
        headers['x-youtube-identity-token'] = identity_token

    if account_syncid:
        headers['X-Goog-PageId'] = account_syncid
        headers['X-Goog-AuthUser'] = 0

    continuation = YoutubeTabIE._extract_continuation(root_continuation_data)  # TODO
    # Only the top-level listing carries the header with count/sort menu
    first_continuation = parent is None

    for page_num in itertools.count(0):
        if not continuation:
            break
        retries = self._downloader.params.get('extractor_retries', 3)
        count = -1
        last_error = None

        while count < retries:
            count += 1
            if last_error:
                self.report_warning('%s. Retrying ...' % last_error)
            try:
                query = {
                    'ctoken': continuation['ctoken'],
                    'pbj': 1,
                    'type': 'next',
                }
                if parent:
                    query['action_get_comment_replies'] = 1
                else:
                    query['action_get_comments'] = 1

                comment_prog_str = '(%d/%d)' % (comment_counts[0], comment_counts[1])
                if page_num == 0:
                    if first_continuation:
                        note_prefix = "Downloading initial comment continuation page"
                    else:
                        note_prefix = " Downloading comment reply thread %d %s" % (comment_counts[2], comment_prog_str)
                else:
                    note_prefix = "%sDownloading comment%s page %d %s" % (
                        " " if parent else "",
                        ' replies' if parent else '',
                        page_num,
                        comment_prog_str)

                browse = self._download_json(
                    'https://www.youtube.com/comment_service_ajax', None,
                    '%s %s' % (note_prefix, '(retry #%d)' % count if count else ''),
                    headers=headers, query=query,
                    data=urlencode_postdata({
                        'session_token': session_token_list[0]
                    }))
            except ExtractorError as e:
                if isinstance(e.cause, compat_HTTPError) and e.cause.code in (500, 503, 404, 413):
                    if e.cause.code == 413:
                        self.report_warning("Assumed end of comments (received HTTP Error 413)")
                        return
                    # Downloading page may result in intermittent 5xx HTTP error
                    # Sometimes a 404 is also received. See: https://github.com/ytdl-org/youtube-dl/issues/28289
                    last_error = 'HTTP Error %s' % e.cause.code
                    if e.cause.code == 404:
                        last_error = last_error + " (this API is probably deprecated)"
                    if count < retries:
                        continue
                # Not retriable, or out of retries
                raise
            else:
                session_token = try_get(browse, lambda x: x['xsrf_token'], compat_str)
                if session_token:
                    session_token_list[0] = session_token

                response = try_get(browse,
                                   (lambda x: x['response'],
                                    lambda x: x[1]['response'])) or {}

                if response.get('continuationContents'):
                    break

                # YouTube sometimes gives reload: now json if something went wrong (e.g. bad auth)
                if browse.get('reload'):
                    raise ExtractorError("Invalid or missing params in continuation request", expected=False)

                # TODO: not tested, merged from old extractor
                err_msg = browse.get('externalErrorMessage')
                if err_msg:
                    raise ExtractorError('YouTube said: %s' % err_msg, expected=False)

                # Youtube sometimes sends incomplete data
                # See: https://github.com/ytdl-org/youtube-dl/issues/28194
                last_error = 'Incomplete data received'
                if count >= retries:
                    self._downloader.report_error(last_error)

        if not response:
            break

        known_continuation_renderers = {
            'itemSectionContinuation': extract_thread,
            'commentRepliesContinuation': extract_thread
        }

        # extract next root continuation from the results
        continuation_contents = try_get(
            response, lambda x: x['continuationContents'], dict) or {}

        for key, value in continuation_contents.items():
            if key not in known_continuation_renderers:
                continue
            continuation_renderer = value

            if first_continuation:
                first_continuation = False
                expected_comment_count = try_get(
                    continuation_renderer,
                    (lambda x: x['header']['commentsHeaderRenderer']['countText']['runs'][0]['text'],
                     lambda x: x['header']['commentsHeaderRenderer']['commentsCount']['runs'][0]['text']),
                    compat_str)

                if expected_comment_count:
                    # The count text is not guaranteed to be numeric; a None
                    # here would break the '%d' formatting of progress notes
                    # and be mistaken for a comment by the consumer.
                    estimated_count = str_to_int(expected_comment_count)
                    if estimated_count is not None:
                        comment_counts[1] = estimated_count
                        self.to_screen("Downloading ~%d comments" % estimated_count)
                        yield comment_counts[1]

                # TODO: cli arg.
                # 1/True for newest, 0/False for popular (default)
                comment_sort_index = int(True)
                sort_continuation_renderer = try_get(
                    continuation_renderer,
                    lambda x: x['header']['commentsHeaderRenderer']['sortMenu']['sortFilterSubMenuRenderer']['subMenuItems']
                    [comment_sort_index]['continuation']['reloadContinuationData'], dict)
                # If this fails, the initial continuation page
                # starts off with popular anyways.
                if sort_continuation_renderer:
                    continuation = YoutubeTabIE._build_continuation_query(
                        continuation=sort_continuation_renderer.get('continuation'),
                        ctp=sort_continuation_renderer.get('clickTrackingParams'))
                    self.to_screen("Sorting comments by %s" % ('popular' if comment_sort_index == 0 else 'newest'))
                    # Restart from the re-sorted listing instead of this page
                    break

            for entry in known_continuation_renderers[key](continuation_renderer):
                yield entry

            continuation = YoutubeTabIE._extract_continuation(continuation_renderer)  # TODO
            break
        else:
            # No known renderer was found, so `continuation` was not
            # advanced; stop instead of requesting the same page forever.
            break
1735
def _extract_comments(self, ytcfg, video_id, contents, webpage, xsrf_token):
    """Entry for comment extraction

    Walks `contents` (the watch page result sections), feeds every
    'itemSectionRenderer' to `_comment_entries` and collects the yielded
    comments. Returns a dict with 'comments' and 'comment_count'.

    `contents` may be None or empty, in which case no comments are returned.
    """
    comments = []
    known_entry_comment_renderers = (
        'itemSectionRenderer',
    )
    estimated_total = 0
    # The caller may hand over None when the watch page had no result list
    for entry in contents or []:
        if not isinstance(entry, dict):
            continue
        for key, renderer in entry.items():
            if key not in known_entry_comment_renderers:
                continue

            comment_iter = self._comment_entries(
                renderer,
                identity_token=self._extract_identity_token(webpage, item_id=video_id),
                account_syncid=self._extract_account_syncid(ytcfg),
                session_token_list=[xsrf_token])

            for comment in comment_iter:
                # A bare int is the estimated total, not a comment
                if isinstance(comment, int):
                    estimated_total = comment
                    continue
                comments.append(comment)
            break
    self.to_screen("Downloaded %d/%d comments" % (len(comments), estimated_total))
    return {
        'comments': comments,
        'comment_count': len(comments),
    }
1765
def _real_extract(self, url):
    """Extract formats and metadata for a single YouTube video URL.

    Downloads the watch page and reads the embedded player response and
    initial data, falling back to the 'player' / 'next' API calls when the
    embedded JSON is unavailable. Builds the format list from
    'streamingData' plus the HLS and DASH manifests, then collects
    thumbnails, subtitles, chapters, like/dislike counts, music metadata
    and availability.

    Returns an info dict, or a url_result for a trailer, or a
    playlist_result for multifeed videos.

    Raises ExtractorError when no formats were found and the player
    response gives a reason (or DRM license info is present).
    """
    url, smuggled_data = unsmuggle_url(url, {})
    video_id = self._match_id(url)
    base_url = self.http_scheme() + '//www.youtube.com/'
    webpage_url = base_url + 'watch?v=' + video_id
    webpage = self._download_webpage(
        webpage_url + '&bpctr=9999999999&has_verified=1', video_id, fatal=False)

    player_response = None
    if webpage:
        player_response = self._extract_yt_initial_variable(
            webpage, self._YT_INITIAL_PLAYER_RESPONSE_RE,
            video_id, 'initial player response')
    if not player_response:
        player_response = self._call_api(
            'player', {'videoId': video_id}, video_id)

    playability_status = player_response.get('playabilityStatus') or {}
    if playability_status.get('reason') == 'Sign in to confirm your age':
        pr = self._parse_json(try_get(compat_parse_qs(
            self._download_webpage(
                base_url + 'get_video_info', video_id,
                'Refetching age-gated info webpage',
                'unable to download video info webpage', query={
                    'video_id': video_id,
                    'eurl': 'https://youtube.googleapis.com/v/' + video_id,
                }, fatal=False)),
            lambda x: x['player_response'][0],
            compat_str) or '{}', video_id)
        if pr:
            player_response = pr

    trailer_video_id = try_get(
        playability_status,
        lambda x: x['errorScreen']['playerLegacyDesktopYpcTrailerRenderer']['trailerVideoId'],
        compat_str)
    if trailer_video_id:
        return self.url_result(
            trailer_video_id, self.ie_key(), trailer_video_id)

    def get_text(x):
        # Text objects carry either 'simpleText' or a list of 'runs'
        if not x:
            return
        return x.get('simpleText') or ''.join([r['text'] for r in x['runs']])

    search_meta = (
        lambda x: self._html_search_meta(x, webpage, default=None)) \
        if webpage else lambda x: None

    video_details = player_response.get('videoDetails') or {}
    microformat = try_get(
        player_response,
        lambda x: x['microformat']['playerMicroformatRenderer'],
        dict) or {}
    video_title = video_details.get('title') \
        or get_text(microformat.get('title')) \
        or search_meta(['og:title', 'twitter:title', 'title'])
    video_description = video_details.get('shortDescription')

    if not smuggled_data.get('force_singlefeed', False):
        if not self._downloader.params.get('noplaylist'):
            multifeed_metadata_list = try_get(
                player_response,
                lambda x: x['multicamera']['playerLegacyMulticameraRenderer']['metadataList'],
                compat_str)
            if multifeed_metadata_list:
                entries = []
                feed_ids = []
                for feed in multifeed_metadata_list.split(','):
                    # Unquote should take place before split on comma (,) since textual
                    # fields may contain comma as well (see
                    # https://github.com/ytdl-org/youtube-dl/issues/8536)
                    feed_data = compat_parse_qs(
                        compat_urllib_parse_unquote_plus(feed))

                    def feed_entry(name):
                        return try_get(
                            feed_data, lambda x: x[name][0], compat_str)

                    feed_id = feed_entry('id')
                    if not feed_id:
                        continue
                    feed_title = feed_entry('title')
                    title = video_title
                    if feed_title:
                        title += ' (%s)' % feed_title
                    entries.append({
                        '_type': 'url_transparent',
                        'ie_key': 'Youtube',
                        'url': smuggle_url(
                            base_url + 'watch?v=' + feed_data['id'][0],
                            {'force_singlefeed': True}),
                        'title': title,
                    })
                    feed_ids.append(feed_id)
                self.to_screen(
                    'Downloading multifeed video (%s) - add --no-playlist to just download video %s'
                    % (', '.join(feed_ids), video_id))
                return self.playlist_result(
                    entries, video_id, video_title, video_description)
        else:
            self.to_screen('Downloading just video %s because of --no-playlist' % video_id)

    formats = []
    itags = []
    itag_qualities = {}
    player_url = None
    q = qualities(['tiny', 'small', 'medium', 'large', 'hd720', 'hd1080', 'hd1440', 'hd2160', 'hd2880', 'highres'])
    streaming_data = player_response.get('streamingData') or {}
    # Copy before extending so the parsed player response is not mutated
    streaming_formats = list(streaming_data.get('formats') or [])
    streaming_formats.extend(streaming_data.get('adaptiveFormats') or [])
    for fmt in streaming_formats:
        if fmt.get('targetDurationSec') or fmt.get('drmFamilies'):
            continue

        itag = str_or_none(fmt.get('itag'))
        quality = fmt.get('quality')
        if itag and quality:
            itag_qualities[itag] = quality
        # FORMAT_STREAM_TYPE_OTF(otf=1) requires downloading the init fragment
        # (adding `&sq=0` to the URL) and parsing emsg box to determine the
        # number of fragment that would subsequently requested with (`&sq=N`)
        if fmt.get('type') == 'FORMAT_STREAM_TYPE_OTF':
            continue

        fmt_url = fmt.get('url')
        if not fmt_url:
            # No plain URL: it is wrapped in a signatureCipher query string
            sc = compat_parse_qs(fmt.get('signatureCipher'))
            fmt_url = url_or_none(try_get(sc, lambda x: x['url'][0]))
            encrypted_sig = try_get(sc, lambda x: x['s'][0])
            if not (sc and fmt_url and encrypted_sig):
                continue
            if not player_url:
                if not webpage:
                    continue
                player_url = self._search_regex(
                    r'"(?:PLAYER_JS_URL|jsUrl)"\s*:\s*"([^"]+)"',
                    webpage, 'player URL', fatal=False)
            if not player_url:
                continue
            signature = self._decrypt_signature(sc['s'][0], video_id, player_url)
            sp = try_get(sc, lambda x: x['sp'][0]) or 'signature'
            fmt_url += '&' + sp + '=' + signature

        if itag:
            itags.append(itag)
        tbr = float_or_none(
            fmt.get('averageBitrate') or fmt.get('bitrate'), 1000)
        dct = {
            'asr': int_or_none(fmt.get('audioSampleRate')),
            'filesize': int_or_none(fmt.get('contentLength')),
            'format_id': itag,
            'format_note': fmt.get('qualityLabel') or quality,
            'fps': int_or_none(fmt.get('fps')),
            'height': int_or_none(fmt.get('height')),
            'quality': q(quality),
            'tbr': tbr,
            'url': fmt_url,
            'width': fmt.get('width'),
        }
        mimetype = fmt.get('mimeType')
        if mimetype:
            mobj = re.match(
                r'((?:[^/]+)/(?:[^;]+))(?:;\s*codecs="([^"]+)")?', mimetype)
            if mobj:
                dct['ext'] = mimetype2ext(mobj.group(1))
                dct.update(parse_codecs(mobj.group(2)))
        no_audio = dct.get('acodec') == 'none'
        no_video = dct.get('vcodec') == 'none'
        if no_audio:
            dct['vbr'] = tbr
        if no_video:
            dct['abr'] = tbr
        if no_audio or no_video:
            dct['downloader_options'] = {
                # Youtube throttles chunks >~10M
                'http_chunk_size': 10485760,
            }
            if dct.get('ext'):
                dct['container'] = dct['ext'] + '_dash'
        formats.append(dct)

    hls_manifest_url = streaming_data.get('hlsManifestUrl')
    if hls_manifest_url:
        for f in self._extract_m3u8_formats(
                hls_manifest_url, video_id, 'mp4', fatal=False):
            itag = self._search_regex(
                r'/itag/(\d+)', f['url'], 'itag', default=None)
            if itag:
                f['format_id'] = itag
            formats.append(f)

    if self._downloader.params.get('youtube_include_dash_manifest', True):
        dash_manifest_url = streaming_data.get('dashManifestUrl')
        if dash_manifest_url:
            for f in self._extract_mpd_formats(
                    dash_manifest_url, video_id, fatal=False):
                itag = f['format_id']
                # Already obtained from streamingData
                if itag in itags:
                    continue
                if itag in itag_qualities:
                    # Not actually useful since the sorting is already done with "quality,res,fps,codec"
                    # but kept to maintain feature parity (and code similarity) with youtube-dl
                    # Remove if this causes any issues with sorting in future
                    f['quality'] = q(itag_qualities[itag])
                filesize = int_or_none(self._search_regex(
                    r'/clen/(\d+)', f.get('fragment_base_url')
                    or f['url'], 'file size', default=None))
                if filesize:
                    f['filesize'] = filesize
                formats.append(f)

    if not formats:
        if not self._downloader.params.get('allow_unplayable_formats') and streaming_data.get('licenseInfos'):
            raise ExtractorError(
                'This video is DRM protected.', expected=True)
        pemr = try_get(
            playability_status,
            lambda x: x['errorScreen']['playerErrorMessageRenderer'],
            dict) or {}
        reason = get_text(pemr.get('reason')) or playability_status.get('reason')
        subreason = pemr.get('subreason')
        if subreason:
            subreason = clean_html(get_text(subreason))
            if subreason == 'The uploader has not made this video available in your country.':
                countries = microformat.get('availableCountries')
                if not countries:
                    regions_allowed = search_meta('regionsAllowed')
                    countries = regions_allowed.split(',') if regions_allowed else None
                self.raise_geo_restricted(
                    subreason, countries)
            # `reason` may be None when only a subreason is reported
            reason = reason + '\n' + subreason if reason else subreason
        if reason:
            raise ExtractorError(reason, expected=True)

    self._sort_formats(formats)

    keywords = video_details.get('keywords') or []
    if not keywords and webpage:
        keywords = [
            unescapeHTML(m.group('content'))
            for m in re.finditer(self._meta_regex('og:video:tag'), webpage)]
    for keyword in keywords:
        if keyword.startswith('yt:stretch='):
            # Validate the "W:H" payload instead of crashing on malformed tags
            stretch = re.match(r'yt:stretch=\s*(\d+)\s*:\s*(\d+)\s*$', keyword)
            if not stretch:
                continue
            # float() forces true division on every Python version
            w, h = (float(v) for v in stretch.groups())
            if w > 0 and h > 0:
                ratio = w / h
                for f in formats:
                    if f.get('vcodec') != 'none':
                        f['stretched_ratio'] = ratio

    thumbnails = []
    for container in (video_details, microformat):
        for thumbnail in (try_get(
                container,
                lambda x: x['thumbnail']['thumbnails'], list) or []):
            thumbnail_url = thumbnail.get('url')
            if not thumbnail_url:
                continue
            thumbnails.append({
                'height': int_or_none(thumbnail.get('height')),
                'url': thumbnail_url,
                'width': int_or_none(thumbnail.get('width')),
            })
        if thumbnails:
            break
    else:
        # Neither container provided thumbnails; fall back to <meta> tags
        thumbnail = search_meta(['og:image', 'twitter:image'])
        if thumbnail:
            thumbnails = [{'url': thumbnail}]

    category = microformat.get('category') or search_meta('genre')
    channel_id = video_details.get('channelId') \
        or microformat.get('externalChannelId') \
        or search_meta('channelId')
    duration = int_or_none(
        video_details.get('lengthSeconds')
        or microformat.get('lengthSeconds')) \
        or parse_duration(search_meta('duration'))
    is_live = video_details.get('isLive')
    owner_profile_url = microformat.get('ownerProfileUrl')

    info = {
        'id': video_id,
        'title': self._live_title(video_title) if is_live else video_title,
        'formats': formats,
        'thumbnails': thumbnails,
        'description': video_description,
        'upload_date': unified_strdate(
            microformat.get('uploadDate')
            or search_meta('uploadDate')),
        # video_details may be empty, so do not index it directly
        'uploader': video_details.get('author'),
        'uploader_id': self._search_regex(r'/(?:channel|user)/([^/?&#]+)', owner_profile_url, 'uploader id') if owner_profile_url else None,
        'uploader_url': owner_profile_url,
        'channel_id': channel_id,
        'channel_url': 'https://www.youtube.com/channel/' + channel_id if channel_id else None,
        'duration': duration,
        'view_count': int_or_none(
            video_details.get('viewCount')
            or microformat.get('viewCount')
            or search_meta('interactionCount')),
        'average_rating': float_or_none(video_details.get('averageRating')),
        'age_limit': 18 if (
            microformat.get('isFamilySafe') is False
            or search_meta('isFamilyFriendly') == 'false'
            or search_meta('og:restrictions:age') == '18+') else 0,
        'webpage_url': webpage_url,
        'categories': [category] if category else None,
        'tags': keywords,
        'is_live': is_live,
        'playable_in_embed': playability_status.get('playableInEmbed'),
        'was_live': video_details.get('isLiveContent'),
    }

    pctr = try_get(
        player_response,
        lambda x: x['captions']['playerCaptionsTracklistRenderer'], dict)
    subtitles = {}
    if pctr:
        def process_language(container, base_url, lang_code, query):
            # One entry per supported subtitle format for this language
            lang_subs = []
            for fmt in self._SUBTITLE_FORMATS:
                query.update({
                    'fmt': fmt,
                })
                lang_subs.append({
                    'ext': fmt,
                    'url': update_url_query(base_url, query),
                })
            container[lang_code] = lang_subs

        for caption_track in (pctr.get('captionTracks') or []):
            track_url = caption_track.get('baseUrl')
            if not track_url:
                continue
            if caption_track.get('kind') != 'asr':
                lang_code = caption_track.get('languageCode')
                if not lang_code:
                    continue
                process_language(
                    subtitles, track_url, lang_code, {})
                continue
            # 'asr' track: offer it in every translation language
            automatic_captions = {}
            for translation_language in (pctr.get('translationLanguages') or []):
                translation_language_code = translation_language.get('languageCode')
                if not translation_language_code:
                    continue
                process_language(
                    automatic_captions, track_url, translation_language_code,
                    {'tlang': translation_language_code})
            info['automatic_captions'] = automatic_captions
    info['subtitles'] = subtitles

    # start/end times given in the URL fragment or query (?t=, ?start=, ?end=)
    parsed_url = compat_urllib_parse_urlparse(url)
    for component in [parsed_url.fragment, parsed_url.query]:
        query = compat_parse_qs(component)
        for k, v in query.items():
            for d_k, s_ks in [('start', ('start', 't')), ('end', ('end',))]:
                d_k += '_time'
                if d_k not in info and k in s_ks:
                    info[d_k] = parse_duration(query[k][0])

    # Youtube Music Auto-generated description
    if video_description:
        mobj = re.search(r'(?s)(?P<track>[^·\n]+)·(?P<artist>[^\n]+)\n+(?P<album>[^\n]+)(?:.+?℗\s*(?P<release_year>\d{4})(?!\d))?(?:.+?Released on\s*:\s*(?P<release_date>\d{4}-\d{2}-\d{2}))?(.+?\nArtist\s*:\s*(?P<clean_artist>[^\n]+))?.+\nAuto-generated by YouTube\.\s*$', video_description)
        if mobj:
            release_year = mobj.group('release_year')
            release_date = mobj.group('release_date')
            if release_date:
                release_date = release_date.replace('-', '')
                if not release_year:
                    release_year = release_date[:4]
            info.update({
                # strip the matched album text, not the group name
                'album': mobj.group('album').strip(),
                'artist': mobj.group('clean_artist') or ', '.join(a.strip() for a in mobj.group('artist').split('·')),
                'track': mobj.group('track').strip(),
                'release_date': release_date,
                'release_year': int_or_none(release_year),
            })

    initial_data = None
    if webpage:
        initial_data = self._extract_yt_initial_variable(
            webpage, self._YT_INITIAL_DATA_RE, video_id,
            'yt initial data')
    if not initial_data:
        initial_data = self._call_api(
            'next', {'videoId': video_id}, video_id, fatal=False)

    if not is_live:
        try:
            # This will error if there is no livechat
            initial_data['contents']['twoColumnWatchNextResults']['conversationBar']['liveChatRenderer']['continuations'][0]['reloadContinuationData']['continuation']
            info['subtitles']['live_chat'] = [{
                'url': 'https://www.youtube.com/watch?v=%s' % video_id,  # url is needed to set cookies
                'video_id': video_id,
                'ext': 'json',
                'protocol': 'youtube_live_chat_replay',
            }]
        except (KeyError, IndexError, TypeError):
            pass

    # Sections of the watch page results column. Always a list, so the
    # later consumers (including the lazy comment extractor) can rely on it.
    results_contents = []
    if initial_data:
        chapters = self._extract_chapters_from_json(
            initial_data, video_id, duration)
        if not chapters:
            for engagement_panel in (initial_data.get('engagementPanels') or []):
                markers = try_get(
                    engagement_panel, lambda x: x['engagementPanelSectionListRenderer']['content']['macroMarkersListRenderer']['contents'],
                    list)
                if not markers:
                    continue

                def chapter_time(mmlir):
                    # Tolerate a missing renderer for the following item
                    return parse_duration(
                        get_text((mmlir or {}).get('timeDescription')))

                chapters = []
                for next_num, marker in enumerate(markers, start=1):
                    mmlir = marker.get('macroMarkersListItemRenderer') or {}
                    start_time = chapter_time(mmlir)
                    # A chapter ends where the next begins; the last one
                    # ends with the video
                    end_time = chapter_time(try_get(
                        markers, lambda x: x[next_num]['macroMarkersListItemRenderer'])) \
                        if next_num < len(markers) else duration
                    if start_time is None or end_time is None:
                        continue
                    chapters.append({
                        'start_time': start_time,
                        'end_time': end_time,
                        'title': get_text(mmlir.get('title')),
                    })
                if chapters:
                    break
        if chapters:
            info['chapters'] = chapters

        results_contents = try_get(
            initial_data,
            lambda x: x['contents']['twoColumnWatchNextResults']['results']['results']['contents'],
            list) or []

        # (getter(s) for the accessibility label, regex pulling type + count)
        like_label_patterns = [(
            lambda x: x['defaultText']['accessibility']['accessibilityData'],
            r'(?P<count>[\d,]+)\s*(?P<type>(?:dis)?like)'), ([
                lambda x: x['accessibility'],
                lambda x: x['accessibilityData']['accessibilityData'],
            ], r'(?P<type>(?:dis)?like) this video along with (?P<count>[\d,]+) other people')]

        for content in results_contents:
            vpir = content.get('videoPrimaryInfoRenderer')
            if vpir:
                stl = vpir.get('superTitleLink')
                if stl:
                    stl = get_text(stl)
                    if try_get(
                            vpir,
                            lambda x: x['superTitleIcon']['iconType']) == 'LOCATION_PIN':
                        info['location'] = stl
                    else:
                        mobj = re.search(r'(.+?)\s*S(\d+)\s*•\s*E(\d+)', stl)
                        if mobj:
                            info.update({
                                'series': mobj.group(1),
                                'season_number': int(mobj.group(2)),
                                'episode_number': int(mobj.group(3)),
                            })
                for tlb in (try_get(
                        vpir,
                        lambda x: x['videoActions']['menuRenderer']['topLevelButtons'],
                        list) or []):
                    toggle_renderer = tlb.get('toggleButtonRenderer') or {}
                    for getter, regex in like_label_patterns:
                        label = (try_get(toggle_renderer, getter, dict) or {}).get('label')
                        if label:
                            mobj = re.match(regex, label)
                            if mobj:
                                info[mobj.group('type') + '_count'] = str_to_int(mobj.group('count'))
                                break
                sbr_tooltip = try_get(
                    vpir, lambda x: x['sentimentBar']['sentimentBarRenderer']['tooltip'],
                    compat_str)
                if sbr_tooltip:
                    # Expected form is "<likes> / <dislikes>"; ignore anything else
                    sentiment_counts = sbr_tooltip.split(' / ')
                    if len(sentiment_counts) == 2:
                        info.update({
                            'like_count': str_to_int(sentiment_counts[0]),
                            'dislike_count': str_to_int(sentiment_counts[1]),
                        })
            vsir = content.get('videoSecondaryInfoRenderer')
            if vsir:
                info['channel'] = get_text(try_get(
                    vsir,
                    lambda x: x['owner']['videoOwnerRenderer']['title'],
                    dict))
                rows = try_get(
                    vsir,
                    lambda x: x['metadataRowContainer']['metadataRowContainerRenderer']['rows'],
                    list) or []
                # A divider line means several songs are listed, in which
                # case no single album/artist/track is recorded
                multiple_songs = False
                for row in rows:
                    if try_get(row, lambda x: x['metadataRowRenderer']['hasDividerLine']) is True:
                        multiple_songs = True
                        break
                for row in rows:
                    mrr = row.get('metadataRowRenderer') or {}
                    mrr_title = mrr.get('title')
                    if not mrr_title:
                        continue
                    mrr_title = get_text(mrr['title'])
                    mrr_contents_text = get_text(mrr['contents'][0])
                    if mrr_title == 'License':
                        info['license'] = mrr_contents_text
                    elif not multiple_songs:
                        if mrr_title == 'Album':
                            info['album'] = mrr_contents_text
                        elif mrr_title == 'Artist':
                            info['artist'] = mrr_contents_text
                        elif mrr_title == 'Song':
                            info['track'] = mrr_contents_text

    fallbacks = {
        'channel': 'uploader',
        'channel_id': 'uploader_id',
        'channel_url': 'uploader_url',
    }
    for to, frm in fallbacks.items():
        if not info.get(to):
            info[to] = info.get(frm)

    for s_k, d_k in [('artist', 'creator'), ('track', 'alt_title')]:
        v = info.get(s_k)
        if v:
            info[d_k] = v

    is_private = bool_or_none(video_details.get('isPrivate'))
    is_unlisted = bool_or_none(microformat.get('isUnlisted'))
    is_membersonly = None
    if initial_data and is_private is not None:
        is_membersonly = False
        for content in results_contents:
            badges = try_get(content, lambda x: x['videoPrimaryInfoRenderer']['badges'], list)
            for badge in badges or []:
                label = try_get(badge, lambda x: x['metadataBadgeRenderer']['label']) or ''
                if label.lower() == 'members only':
                    is_membersonly = True
                    break
            if is_membersonly:
                break

    # TODO: Add this for playlists
    info['availability'] = self._availability(
        is_private=is_private,
        needs_premium=False,  # Youtube no longer have premium-only videos?
        needs_subscription=is_membersonly,
        needs_auth=info['age_limit'] >= 18,
        is_unlisted=None if is_private is None else is_unlisted)

    # get xsrf for annotations or comments
    get_annotations = self._downloader.params.get('writeannotations', False)
    get_comments = self._downloader.params.get('getcomments', False)
    if get_annotations or get_comments:
        xsrf_token = None
        ytcfg = self._extract_ytcfg(video_id, webpage)
        if ytcfg:
            xsrf_token = try_get(ytcfg, lambda x: x['XSRF_TOKEN'], compat_str)
        if not xsrf_token:
            xsrf_token = self._search_regex(
                r'([\'"])XSRF_TOKEN\1\s*:\s*([\'"])(?P<xsrf_token>(?:(?!\2).)+)\2',
                webpage, 'xsrf token', group='xsrf_token', fatal=False)

    # annotations
    if get_annotations:
        invideo_url = try_get(
            player_response, lambda x: x['annotations'][0]['playerAnnotationsUrlsRenderer']['invideoUrl'], compat_str)
        if xsrf_token and invideo_url:
            xsrf_field_name = None
            if ytcfg:
                xsrf_field_name = try_get(ytcfg, lambda x: x['XSRF_FIELD_NAME'], compat_str)
            if not xsrf_field_name:
                xsrf_field_name = self._search_regex(
                    r'([\'"])XSRF_FIELD_NAME\1\s*:\s*([\'"])(?P<xsrf_field_name>\w+)\2',
                    webpage, 'xsrf field name',
                    group='xsrf_field_name', default='session_token')
            info['annotations'] = self._download_webpage(
                self._proto_relative_url(invideo_url),
                video_id, note='Downloading annotations',
                errnote='Unable to download video annotations', fatal=False,
                data=urlencode_postdata({xsrf_field_name: xsrf_token}))

    if get_comments:
        # Deferred so comments are only downloaded when actually needed.
        # results_contents is always bound and always a list.
        info['__post_extractor'] = lambda: self._extract_comments(ytcfg, video_id, results_contents, webpage, xsrf_token)

    self.mark_watched(video_id, player_response)

    return info
c5e8d7af 2358
5f6a1245 2359
8bdd16b4 2360class YoutubeTabIE(YoutubeBaseInfoExtractor):
2361 IE_DESC = 'YouTube.com tab'
70d5c17b 2362 _VALID_URL = r'''(?x)
2363 https?://
2364 (?:\w+\.)?
2365 (?:
2366 youtube(?:kids)?\.com|
2367 invidio\.us
2368 )/
2369 (?:
2370 (?:channel|c|user)/|
2371 (?P<not_channel>
9ba5705a 2372 feed/|hashtag/|
70d5c17b 2373 (?:playlist|watch)\?.*?\blist=
2374 )|
29f7c58a 2375 (?!(?:%s)\b) # Direct URLs
70d5c17b 2376 )
2377 (?P<id>[^/?\#&]+)
2378 ''' % YoutubeBaseInfoExtractor._RESERVED_NAMES
8bdd16b4 2379 IE_NAME = 'youtube:tab'
2380
81127aa5 2381 _TESTS = [{
8bdd16b4 2382 # playlists, multipage
2383 'url': 'https://www.youtube.com/c/ИгорьКлейнер/playlists?view=1&flow=grid',
2384 'playlist_mincount': 94,
2385 'info_dict': {
2386 'id': 'UCqj7Cz7revf5maW9g5pgNcg',
2387 'title': 'Игорь Клейнер - Playlists',
2388 'description': 'md5:be97ee0f14ee314f1f002cf187166ee2',
deaec5af 2389 'uploader': 'Игорь Клейнер',
2390 'uploader_id': 'UCqj7Cz7revf5maW9g5pgNcg',
8bdd16b4 2391 },
2392 }, {
2393 # playlists, multipage, different order
2394 'url': 'https://www.youtube.com/user/igorkle1/playlists?view=1&sort=dd',
2395 'playlist_mincount': 94,
2396 'info_dict': {
2397 'id': 'UCqj7Cz7revf5maW9g5pgNcg',
2398 'title': 'Игорь Клейнер - Playlists',
2399 'description': 'md5:be97ee0f14ee314f1f002cf187166ee2',
deaec5af 2400 'uploader_id': 'UCqj7Cz7revf5maW9g5pgNcg',
2401 'uploader': 'Игорь Клейнер',
8bdd16b4 2402 },
2403 }, {
2404 # playlists, singlepage
2405 'url': 'https://www.youtube.com/user/ThirstForScience/playlists',
2406 'playlist_mincount': 4,
2407 'info_dict': {
2408 'id': 'UCAEtajcuhQ6an9WEzY9LEMQ',
2409 'title': 'ThirstForScience - Playlists',
2410 'description': 'md5:609399d937ea957b0f53cbffb747a14c',
deaec5af 2411 'uploader': 'ThirstForScience',
2412 'uploader_id': 'UCAEtajcuhQ6an9WEzY9LEMQ',
8bdd16b4 2413 }
2414 }, {
2415 'url': 'https://www.youtube.com/c/ChristophLaimer/playlists',
2416 'only_matching': True,
2417 }, {
2418 # basic, single video playlist
0e30a7b9 2419 'url': 'https://www.youtube.com/playlist?list=PL4lCao7KL_QFVb7Iudeipvc2BCavECqzc',
81127aa5 2420 'info_dict': {
0e30a7b9 2421 'uploader_id': 'UCmlqkdCBesrv2Lak1mF_MxA',
2422 'uploader': 'Sergey M.',
2423 'id': 'PL4lCao7KL_QFVb7Iudeipvc2BCavECqzc',
3867038a 2424 'title': 'youtube-dl public playlist',
81127aa5 2425 },
0e30a7b9 2426 'playlist_count': 1,
9291475f 2427 }, {
8bdd16b4 2428 # empty playlist
0e30a7b9 2429 'url': 'https://www.youtube.com/playlist?list=PL4lCao7KL_QFodcLWhDpGCYnngnHtQ-Xf',
9291475f 2430 'info_dict': {
0e30a7b9 2431 'uploader_id': 'UCmlqkdCBesrv2Lak1mF_MxA',
2432 'uploader': 'Sergey M.',
2433 'id': 'PL4lCao7KL_QFodcLWhDpGCYnngnHtQ-Xf',
3867038a 2434 'title': 'youtube-dl empty playlist',
9291475f
PH
2435 },
2436 'playlist_count': 0,
2437 }, {
8bdd16b4 2438 # Home tab
2439 'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w/featured',
9291475f 2440 'info_dict': {
8bdd16b4 2441 'id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
2442 'title': 'lex will - Home',
2443 'description': 'md5:2163c5d0ff54ed5f598d6a7e6211e488',
deaec5af 2444 'uploader': 'lex will',
2445 'uploader_id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
9291475f 2446 },
8bdd16b4 2447 'playlist_mincount': 2,
9291475f 2448 }, {
8bdd16b4 2449 # Videos tab
2450 'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w/videos',
9291475f 2451 'info_dict': {
8bdd16b4 2452 'id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
2453 'title': 'lex will - Videos',
2454 'description': 'md5:2163c5d0ff54ed5f598d6a7e6211e488',
deaec5af 2455 'uploader': 'lex will',
2456 'uploader_id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
9291475f 2457 },
8bdd16b4 2458 'playlist_mincount': 975,
9291475f 2459 }, {
8bdd16b4 2460 # Videos tab, sorted by popular
2461 'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w/videos?view=0&sort=p&flow=grid',
9291475f 2462 'info_dict': {
8bdd16b4 2463 'id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
2464 'title': 'lex will - Videos',
2465 'description': 'md5:2163c5d0ff54ed5f598d6a7e6211e488',
deaec5af 2466 'uploader': 'lex will',
2467 'uploader_id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
9291475f 2468 },
8bdd16b4 2469 'playlist_mincount': 199,
9291475f 2470 }, {
8bdd16b4 2471 # Playlists tab
2472 'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w/playlists',
9291475f 2473 'info_dict': {
8bdd16b4 2474 'id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
2475 'title': 'lex will - Playlists',
2476 'description': 'md5:2163c5d0ff54ed5f598d6a7e6211e488',
deaec5af 2477 'uploader': 'lex will',
2478 'uploader_id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
9291475f 2479 },
8bdd16b4 2480 'playlist_mincount': 17,
ac7553d0 2481 }, {
8bdd16b4 2482 # Community tab
2483 'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w/community',
ac7553d0 2484 'info_dict': {
8bdd16b4 2485 'id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
2486 'title': 'lex will - Community',
2487 'description': 'md5:2163c5d0ff54ed5f598d6a7e6211e488',
deaec5af 2488 'uploader': 'lex will',
2489 'uploader_id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
8bdd16b4 2490 },
2491 'playlist_mincount': 18,
87dadd45 2492 }, {
8bdd16b4 2493 # Channels tab
2494 'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w/channels',
87dadd45 2495 'info_dict': {
8bdd16b4 2496 'id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
2497 'title': 'lex will - Channels',
2498 'description': 'md5:2163c5d0ff54ed5f598d6a7e6211e488',
deaec5af 2499 'uploader': 'lex will',
2500 'uploader_id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
8bdd16b4 2501 },
deaec5af 2502 'playlist_mincount': 12,
6b08cdf6 2503 }, {
a0566bbf 2504 'url': 'https://invidio.us/channel/UCmlqkdCBesrv2Lak1mF_MxA',
8bdd16b4 2505 'only_matching': True,
2506 }, {
a0566bbf 2507 'url': 'https://www.youtubekids.com/channel/UCmlqkdCBesrv2Lak1mF_MxA',
8bdd16b4 2508 'only_matching': True,
2509 }, {
a0566bbf 2510 'url': 'https://music.youtube.com/channel/UCmlqkdCBesrv2Lak1mF_MxA',
8bdd16b4 2511 'only_matching': True,
2512 }, {
2513 'note': 'Playlist with deleted videos (#651). As a bonus, the video #51 is also twice in this list.',
2514 'url': 'https://www.youtube.com/playlist?list=PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC',
2515 'info_dict': {
2516 'title': '29C3: Not my department',
2517 'id': 'PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC',
2518 'uploader': 'Christiaan008',
2519 'uploader_id': 'UCEPzS1rYsrkqzSLNp76nrcg',
deaec5af 2520 'description': 'md5:a14dc1a8ef8307a9807fe136a0660268',
8bdd16b4 2521 },
2522 'playlist_count': 96,
2523 }, {
2524 'note': 'Large playlist',
2525 'url': 'https://www.youtube.com/playlist?list=UUBABnxM4Ar9ten8Mdjj1j0Q',
6b08cdf6 2526 'info_dict': {
8bdd16b4 2527 'title': 'Uploads from Cauchemar',
2528 'id': 'UUBABnxM4Ar9ten8Mdjj1j0Q',
2529 'uploader': 'Cauchemar',
2530 'uploader_id': 'UCBABnxM4Ar9ten8Mdjj1j0Q',
13a75688 2531 },
8bdd16b4 2532 'playlist_mincount': 1123,
2533 }, {
2534 # even larger playlist, 8832 videos
2535 'url': 'http://www.youtube.com/user/NASAgovVideo/videos',
2536 'only_matching': True,
4b7df0d3
JMF
2537 }, {
2538 'note': 'Buggy playlist: the webpage has a "Load more" button but it doesn\'t have more videos',
2539 'url': 'https://www.youtube.com/playlist?list=UUXw-G3eDE9trcvY2sBMM_aA',
2540 'info_dict': {
acf757f4
PH
2541 'title': 'Uploads from Interstellar Movie',
2542 'id': 'UUXw-G3eDE9trcvY2sBMM_aA',
13a75688 2543 'uploader': 'Interstellar Movie',
8bdd16b4 2544 'uploader_id': 'UCXw-G3eDE9trcvY2sBMM_aA',
4b7df0d3 2545 },
481cc733 2546 'playlist_mincount': 21,
8bdd16b4 2547 }, {
2548 # https://github.com/ytdl-org/youtube-dl/issues/21844
2549 'url': 'https://www.youtube.com/playlist?list=PLzH6n4zXuckpfMu_4Ff8E7Z1behQks5ba',
2550 'info_dict': {
2551 'title': 'Data Analysis with Dr Mike Pound',
2552 'id': 'PLzH6n4zXuckpfMu_4Ff8E7Z1behQks5ba',
2553 'uploader_id': 'UC9-y-6csu5WGm29I7JiwpnA',
2554 'uploader': 'Computerphile',
deaec5af 2555 'description': 'md5:7f567c574d13d3f8c0954d9ffee4e487',
8bdd16b4 2556 },
2557 'playlist_mincount': 11,
2558 }, {
a0566bbf 2559 'url': 'https://invidio.us/playlist?list=PL4lCao7KL_QFVb7Iudeipvc2BCavECqzc',
8bdd16b4 2560 'only_matching': True,
dacb3a86
S
2561 }, {
2562 # Playlist URL that does not actually serve a playlist
2563 'url': 'https://www.youtube.com/watch?v=FqZTN594JQw&list=PLMYEtVRpaqY00V9W81Cwmzp6N6vZqfUKD4',
2564 'info_dict': {
2565 'id': 'FqZTN594JQw',
2566 'ext': 'webm',
2567 'title': "Smiley's People 01 detective, Adventure Series, Action",
2568 'uploader': 'STREEM',
2569 'uploader_id': 'UCyPhqAZgwYWZfxElWVbVJng',
ec85ded8 2570 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UCyPhqAZgwYWZfxElWVbVJng',
dacb3a86
S
2571 'upload_date': '20150526',
2572 'license': 'Standard YouTube License',
2573 'description': 'md5:507cdcb5a49ac0da37a920ece610be80',
2574 'categories': ['People & Blogs'],
2575 'tags': list,
dbdaaa23 2576 'view_count': int,
dacb3a86
S
2577 'like_count': int,
2578 'dislike_count': int,
2579 },
2580 'params': {
2581 'skip_download': True,
2582 },
13a75688 2583 'skip': 'This video is not available.',
dacb3a86 2584 'add_ie': [YoutubeIE.ie_key()],
481cc733 2585 }, {
8bdd16b4 2586 'url': 'https://www.youtubekids.com/watch?v=Agk7R8I8o5U&list=PUZ6jURNr1WQZCNHF0ao-c0g',
c0345b82 2587 'only_matching': True,
66b48727 2588 }, {
8bdd16b4 2589 'url': 'https://www.youtube.com/watch?v=MuAGGZNfUkU&list=RDMM',
66b48727 2590 'only_matching': True,
a0566bbf 2591 }, {
2592 'url': 'https://www.youtube.com/channel/UCoMdktPbSTixAyNGwb-UYkQ/live',
2593 'info_dict': {
2594 'id': '9Auq9mYxFEE',
2595 'ext': 'mp4',
deaec5af 2596 'title': compat_str,
a0566bbf 2597 'uploader': 'Sky News',
2598 'uploader_id': 'skynews',
2599 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/skynews',
2600 'upload_date': '20191102',
deaec5af 2601 'description': 'md5:85ddd75d888674631aaf9599a9a0b0ae',
a0566bbf 2602 'categories': ['News & Politics'],
2603 'tags': list,
2604 'like_count': int,
2605 'dislike_count': int,
2606 },
2607 'params': {
2608 'skip_download': True,
2609 },
2610 }, {
2611 'url': 'https://www.youtube.com/user/TheYoungTurks/live',
2612 'info_dict': {
2613 'id': 'a48o2S1cPoo',
2614 'ext': 'mp4',
2615 'title': 'The Young Turks - Live Main Show',
2616 'uploader': 'The Young Turks',
2617 'uploader_id': 'TheYoungTurks',
2618 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/TheYoungTurks',
2619 'upload_date': '20150715',
2620 'license': 'Standard YouTube License',
2621 'description': 'md5:438179573adcdff3c97ebb1ee632b891',
2622 'categories': ['News & Politics'],
2623 'tags': ['Cenk Uygur (TV Program Creator)', 'The Young Turks (Award-Winning Work)', 'Talk Show (TV Genre)'],
2624 'like_count': int,
2625 'dislike_count': int,
2626 },
2627 'params': {
2628 'skip_download': True,
2629 },
2630 'only_matching': True,
2631 }, {
2632 'url': 'https://www.youtube.com/channel/UC1yBKRuGpC1tSM73A0ZjYjQ/live',
2633 'only_matching': True,
2634 }, {
2635 'url': 'https://www.youtube.com/c/CommanderVideoHq/live',
2636 'only_matching': True,
3d3dddc9 2637 }, {
2638 'url': 'https://www.youtube.com/feed/trending',
2639 'only_matching': True,
2640 }, {
2641 # needs auth
2642 'url': 'https://www.youtube.com/feed/library',
2643 'only_matching': True,
2644 }, {
2645 # needs auth
2646 'url': 'https://www.youtube.com/feed/history',
2647 'only_matching': True,
2648 }, {
2649 # needs auth
2650 'url': 'https://www.youtube.com/feed/subscriptions',
2651 'only_matching': True,
2652 }, {
2653 # needs auth
2654 'url': 'https://www.youtube.com/feed/watch_later',
2655 'only_matching': True,
2656 }, {
2657 # no longer available?
2658 'url': 'https://www.youtube.com/feed/recommended',
2659 'only_matching': True,
29f7c58a 2660 }, {
2661 # inline playlist with not always working continuations
2662 'url': 'https://www.youtube.com/watch?v=UC6u0Tct-Fo&list=PL36D642111D65BE7C',
2663 'only_matching': True,
2664 }, {
2665 'url': 'https://www.youtube.com/course?list=ECUl4u3cNGP61MdtwGTqZA0MreSaDybji8',
2666 'only_matching': True,
2667 }, {
2668 'url': 'https://www.youtube.com/course',
2669 'only_matching': True,
2670 }, {
2671 'url': 'https://www.youtube.com/zsecurity',
2672 'only_matching': True,
2673 }, {
2674 'url': 'http://www.youtube.com/NASAgovVideo/videos',
2675 'only_matching': True,
2676 }, {
2677 'url': 'https://www.youtube.com/TheYoungTurks/live',
2678 'only_matching': True,
2679 }]
2680
@classmethod
def suitable(cls, url):
    """Return whether this extractor should handle *url*.

    Any URL that YoutubeIE accepts is rejected here; every other URL is
    decided by the inherited ``suitable`` check.
    """
    if YoutubeIE.suitable(url):
        return False
    return super(YoutubeTabIE, cls).suitable(url)
8bdd16b4 2685
2686 def _extract_channel_id(self, webpage):
2687 channel_id = self._html_search_meta(
2688 'channelId', webpage, 'channel id', default=None)
2689 if channel_id:
2690 return channel_id
2691 channel_url = self._html_search_meta(
2692 ('og:url', 'al:ios:url', 'al:android:url', 'al:web:url',
2693 'twitter:url', 'twitter:app:url:iphone', 'twitter:app:url:ipad',
2694 'twitter:app:url:googleplay'), webpage, 'channel url')
2695 return self._search_regex(
2696 r'https?://(?:www\.)?youtube\.com/channel/([^/?#&])+',
2697 channel_url, 'channel id')
15f6397c 2698
8bdd16b4 2699 @staticmethod
cd7c66cf 2700 def _extract_basic_item_renderer(item):
2701 # Modified from _extract_grid_item_renderer
2702 known_renderers = (
e3c07697 2703 'playlistRenderer', 'videoRenderer', 'channelRenderer',
cd7c66cf 2704 'gridPlaylistRenderer', 'gridVideoRenderer', 'gridChannelRenderer'
2705 )
2706 for key, renderer in item.items():
2707 if key not in known_renderers:
2708 continue
2709 return renderer
8bdd16b4 2710
8bdd16b4 2711 def _grid_entries(self, grid_renderer):
2712 for item in grid_renderer['items']:
2713 if not isinstance(item, dict):
39b62db1 2714 continue
cd7c66cf 2715 renderer = self._extract_basic_item_renderer(item)
8bdd16b4 2716 if not isinstance(renderer, dict):
2717 continue
2718 title = try_get(
2719 renderer, lambda x: x['title']['runs'][0]['text'], compat_str)
2720 # playlist
2721 playlist_id = renderer.get('playlistId')
2722 if playlist_id:
2723 yield self.url_result(
2724 'https://www.youtube.com/playlist?list=%s' % playlist_id,
2725 ie=YoutubeTabIE.ie_key(), video_id=playlist_id,
2726 video_title=title)
2727 # video
2728 video_id = renderer.get('videoId')
2729 if video_id:
2730 yield self._extract_video(renderer)
2731 # channel
2732 channel_id = renderer.get('channelId')
2733 if channel_id:
2734 title = try_get(
2735 renderer, lambda x: x['title']['simpleText'], compat_str)
2736 yield self.url_result(
2737 'https://www.youtube.com/channel/%s' % channel_id,
2738 ie=YoutubeTabIE.ie_key(), video_title=title)
2739
3d3dddc9 2740 def _shelf_entries_from_content(self, shelf_renderer):
2741 content = shelf_renderer.get('content')
2742 if not isinstance(content, dict):
8bdd16b4 2743 return
cd7c66cf 2744 renderer = content.get('gridRenderer') or content.get('expandedShelfContentsRenderer')
3d3dddc9 2745 if renderer:
2746 # TODO: add support for nested playlists so each shelf is processed
2747 # as separate playlist
2748 # TODO: this includes only first N items
2749 for entry in self._grid_entries(renderer):
2750 yield entry
2751 renderer = content.get('horizontalListRenderer')
2752 if renderer:
2753 # TODO
2754 pass
8bdd16b4 2755
def _shelf_entries(self, shelf_renderer, skip_channels=False):
    """Yield entries for a shelf: its own URL, then its inline content.

    When *skip_channels* is true and the shelf URL contains '/channels?',
    nothing is yielded for this shelf at all.
    """
    endpoint_path = try_get(
        shelf_renderer, lambda x: x['endpoint']['commandMetadata']['webCommandMetadata']['url'],
        compat_str)
    shelf_url = urljoin('https://www.youtube.com', endpoint_path)
    # Skipping links to another channels, note that checking for
    # endpoint.commandMetadata.webCommandMetadata.webPageType == WEB_PAGE_TYPE_CHANNEL
    # will not work
    if shelf_url and skip_channels and '/channels?' in shelf_url:
        return
    if shelf_url:
        shelf_title = try_get(
            shelf_renderer, lambda x: x['title']['runs'][0]['text'], compat_str)
        yield self.url_result(shelf_url, video_title=shelf_title)
    # Shelf may not contain shelf URL, fallback to extraction from content
    for entry in self._shelf_entries_from_content(shelf_renderer):
        yield entry
c5e8d7af 2773
8bdd16b4 2774 def _playlist_entries(self, video_list_renderer):
2775 for content in video_list_renderer['contents']:
2776 if not isinstance(content, dict):
2777 continue
2778 renderer = content.get('playlistVideoRenderer') or content.get('playlistPanelVideoRenderer')
2779 if not isinstance(renderer, dict):
2780 continue
2781 video_id = renderer.get('videoId')
2782 if not video_id:
2783 continue
2784 yield self._extract_video(renderer)
07aeced6 2785
def _rich_entries(self, rich_grid_renderer):
    """Yield the video held at ``content.videoRenderer``, if it has an ID."""
    video_renderer = try_get(
        rich_grid_renderer, lambda x: x['content']['videoRenderer'], dict) or {}
    if video_renderer.get('videoId'):
        yield self._extract_video(video_renderer)
2793
8bdd16b4 2794 def _video_entry(self, video_renderer):
2795 video_id = video_renderer.get('videoId')
2796 if video_id:
2797 return self._extract_video(video_renderer)
dacb3a86 2798
def _post_thread_entries(self, post_thread_renderer):
    """Yield videos referenced by a community post.

    The post's video attachment (if any) is yielded first, followed by a
    URL result for every inline link in the post text that YoutubeIE can
    handle. An inline link pointing at the attached video is skipped so the
    same video is not yielded twice.
    """
    post_renderer = try_get(
        post_thread_renderer, lambda x: x['post']['backstagePostRenderer'], dict)
    if not post_renderer:
        return
    # video attachment
    video_renderer = try_get(
        post_renderer, lambda x: x['backstageAttachment']['videoRenderer'], dict) or {}
    # Remember the attachment's ID so inline links to it can be skipped below
    video_id = video_renderer.get('videoId')
    if video_id:
        entry = self._video_entry(video_renderer)
        if entry:
            yield entry
    # inline video links
    runs = try_get(post_renderer, lambda x: x['contentText']['runs'], list) or []
    for run in runs:
        if not isinstance(run, dict):
            continue
        ep_url = try_get(
            run, lambda x: x['navigationEndpoint']['urlEndpoint']['url'], compat_str)
        if not ep_url:
            continue
        if not YoutubeIE.suitable(ep_url):
            continue
        ep_video_id = YoutubeIE._match_id(ep_url)
        if video_id == ep_video_id:
            continue
        # Pass the linked video's own ID, not the attachment's
        yield self.url_result(ep_url, ie=YoutubeIE.ie_key(), video_id=ep_video_id)
dacb3a86 2827
8bdd16b4 2828 def _post_thread_continuation_entries(self, post_thread_continuation):
2829 contents = post_thread_continuation.get('contents')
2830 if not isinstance(contents, list):
2831 return
2832 for content in contents:
2833 renderer = content.get('backstagePostThreadRenderer')
2834 if not isinstance(renderer, dict):
2835 continue
2836 for entry in self._post_thread_entries(renderer):
2837 yield entry
07aeced6 2838
29f7c58a 2839 @staticmethod
2840 def _build_continuation_query(continuation, ctp=None):
2841 query = {
2842 'ctoken': continuation,
2843 'continuation': continuation,
2844 }
2845 if ctp:
2846 query['itct'] = ctp
2847 return query
2848
@staticmethod
def _extract_next_continuation_data(renderer):
    """Return a continuation query built from ``nextContinuationData``.

    Returns None when the first entry of ``renderer['continuations']`` has
    no ``nextContinuationData`` dict or that dict has no continuation token.
    """
    next_data = try_get(
        renderer, lambda x: x['continuations'][0]['nextContinuationData'], dict)
    token = next_data.get('continuation') if next_data else None
    if not token:
        return
    return YoutubeTabIE._build_continuation_query(
        token, next_data.get('clickTrackingParams'))
c5e8d7af 2860
@classmethod
def _extract_continuation(cls, renderer):
    """Return a continuation query for *renderer*, or None if there is none.

    ``nextContinuationData`` is preferred. Otherwise the renderer's
    'contents' and 'items' lists are scanned for the first
    ``continuationItemRenderer`` that carries a continuation token.
    """
    query = cls._extract_next_continuation_data(renderer)
    if query:
        return query

    candidates = []
    for list_key in ('contents', 'items'):
        candidates += try_get(renderer, lambda x: x[list_key], list) or []

    for candidate in candidates:
        if not isinstance(candidate, dict):
            continue
        endpoint = try_get(
            candidate, lambda x: x['continuationItemRenderer']['continuationEndpoint'],
            dict)
        if not endpoint:
            continue
        token = try_get(
            endpoint, lambda x: x['continuationCommand']['token'], compat_str)
        if not token:
            continue
        return YoutubeTabIE._build_continuation_query(
            token, endpoint.get('clickTrackingParams'))
448830ce 2883
def _entries(self, tab, item_id, identity_token, account_syncid):
    """Yield every entry of *tab*, following continuation pages.

    Entries embedded in the tab content are yielded first. After that, the
    "browse" API endpoint is called repeatedly while a continuation is
    available. *item_id* is only used to label the page downloads;
    *identity_token* and *account_syncid* are sent as request headers when
    they are truthy.
    """

    def extract_entries(parent_renderer):  # this needs to be called again for continuation to work with feeds
        # Yields the entries under `parent_renderer` and, as a side effect,
        # stores the continuation it finds in `continuation_list[0]`.
        contents = try_get(parent_renderer, lambda x: x['contents'], list) or []
        for content in contents:
            if not isinstance(content, dict):
                continue
            is_renderer = try_get(content, lambda x: x['itemSectionRenderer'], dict)
            if not is_renderer:
                renderer = content.get('richItemRenderer')
                if renderer:
                    for entry in self._rich_entries(renderer):
                        yield entry
                    continuation_list[0] = self._extract_continuation(parent_renderer)
                continue
            isr_contents = try_get(is_renderer, lambda x: x['contents'], list) or []
            for isr_content in isr_contents:
                if not isinstance(isr_content, dict):
                    continue

                known_renderers = {
                    'playlistVideoListRenderer': self._playlist_entries,
                    'gridRenderer': self._grid_entries,
                    'shelfRenderer': lambda x: self._shelf_entries(x, tab.get('title') != 'Channels'),
                    'backstagePostThreadRenderer': self._post_thread_entries,
                    'videoRenderer': lambda x: [self._video_entry(x)],
                }
                for key, renderer in isr_content.items():
                    if key not in known_renderers:
                        continue
                    for entry in known_renderers[key](renderer):
                        if entry:
                            yield entry
                    continuation_list[0] = self._extract_continuation(renderer)
                    break

            if not continuation_list[0]:
                continuation_list[0] = self._extract_continuation(is_renderer)

        if not continuation_list[0]:
            continuation_list[0] = self._extract_continuation(parent_renderer)

    continuation_list = [None]  # Python 2 does not support nonlocal
    tab_content = try_get(tab, lambda x: x['content'], dict)
    if not tab_content:
        return
    parent_renderer = (
        try_get(tab_content, lambda x: x['sectionListRenderer'], dict)
        or try_get(tab_content, lambda x: x['richGridRenderer'], dict) or {})
    for entry in extract_entries(parent_renderer):
        yield entry
    continuation = continuation_list[0]

    headers = {
        'x-youtube-client-name': '1',
        'x-youtube-client-version': '2.20201112.04.01',
    }
    if identity_token:
        headers['x-youtube-identity-token'] = identity_token

    if account_syncid:
        headers['X-Goog-PageId'] = account_syncid
        headers['X-Goog-AuthUser'] = 0

    for page_num in itertools.count(1):
        if not continuation:
            break

        # The continuation query only carries 'itct' when click-tracking
        # params were found, so it must not be indexed unconditionally.
        query = {'continuation': continuation['continuation']}
        click_tracking_params = continuation.get('itct')
        if click_tracking_params:
            query['clickTracking'] = {'clickTrackingParams': click_tracking_params}

        retries = self._downloader.params.get('extractor_retries', 3)
        count = -1
        last_error = None
        # Keeps `response` bound even if the retry loop body never runs
        response = None
        while count < retries:
            count += 1
            if last_error:
                self.report_warning('%s. Retrying ...' % last_error)
            try:
                response = self._call_api(
                    ep="browse", fatal=True, headers=headers,
                    video_id='%s page %s' % (item_id, page_num),
                    query=query,
                    note='Downloading API JSON%s' % (' (retry #%d)' % count if count else ''))
            except ExtractorError as e:
                if isinstance(e.cause, compat_HTTPError) and e.cause.code in (500, 503, 404):
                    # Downloading page may result in intermittent 5xx HTTP error
                    # Sometimes a 404 is also received. See: https://github.com/ytdl-org/youtube-dl/issues/28289
                    last_error = 'HTTP Error %s' % e.cause.code
                    if count < retries:
                        continue
                raise
            else:
                # Youtube sometimes sends incomplete data
                # See: https://github.com/ytdl-org/youtube-dl/issues/28194
                if dict_get(response,
                            ('continuationContents', 'onResponseReceivedActions', 'onResponseReceivedEndpoints')):
                    break

                # Youtube may send alerts if there was an issue with the continuation page
                self._extract_alerts(response, expected=False)

                last_error = 'Incomplete data received'
                if count >= retries:
                    self._downloader.report_error(last_error)

        if not response:
            break

        known_continuation_renderers = {
            'playlistVideoListContinuation': self._playlist_entries,
            'gridContinuation': self._grid_entries,
            'itemSectionContinuation': self._post_thread_continuation_entries,
            'sectionListContinuation': extract_entries,  # for feeds
        }
        continuation_contents = try_get(
            response, lambda x: x['continuationContents'], dict) or {}
        continuation_renderer = None
        for key, value in continuation_contents.items():
            if key not in known_continuation_renderers:
                continue
            continuation_renderer = value
            # Fresh slot so extract_entries can report this page's continuation
            continuation_list = [None]
            for entry in known_continuation_renderers[key](continuation_renderer):
                yield entry
            continuation = continuation_list[0] or self._extract_continuation(continuation_renderer)
            break
        if continuation_renderer:
            continue

        known_renderers = {
            'gridPlaylistRenderer': (self._grid_entries, 'items'),
            'gridVideoRenderer': (self._grid_entries, 'items'),
            'playlistVideoRenderer': (self._playlist_entries, 'contents'),
            'itemSectionRenderer': (extract_entries, 'contents'),  # for feeds
            'richItemRenderer': (extract_entries, 'contents'),  # for hashtag
            'backstagePostThreadRenderer': (self._post_thread_continuation_entries, 'contents')
        }
        on_response_received = dict_get(response, ('onResponseReceivedActions', 'onResponseReceivedEndpoints'))
        continuation_items = try_get(
            on_response_received, lambda x: x[0]['appendContinuationItemsAction']['continuationItems'], list)
        continuation_item = try_get(continuation_items, lambda x: x[0], dict) or {}
        video_items_renderer = None
        for key, value in continuation_item.items():
            if key not in known_renderers:
                continue
            # Wrap the raw item list under the key the chosen handler reads
            video_items_renderer = {known_renderers[key][1]: continuation_items}
            continuation_list = [None]
            for entry in known_renderers[key][0](video_items_renderer):
                yield entry
            continuation = continuation_list[0] or self._extract_continuation(video_items_renderer)
            break
        if video_items_renderer:
            continue
        break
9558dcec 3038
@staticmethod
def _extract_selected_tab(tabs):
    """Return the ``tabRenderer`` of the first tab flagged as selected.

    Raises ExtractorError when no tab in *tabs* is selected.
    """
    for candidate in tabs:
        is_selected = try_get(
            candidate, lambda x: x['tabRenderer']['selected'], bool)
        if is_selected:
            return candidate['tabRenderer']
    raise ExtractorError('Unable to find selected tab')
b82f815f 3046
@staticmethod
def _extract_uploader(data):
    """Collect uploader name, ID and URL from the playlist sidebar.

    Returns a dict holding whichever of 'uploader', 'uploader_id' and
    'uploader_url' were found with a non-None value.
    """
    found = {}
    sidebar_items = try_get(
        data, lambda x: x['sidebar']['playlistSidebarRenderer']['items'], list)
    for sidebar_item in sidebar_items or []:
        if not isinstance(sidebar_item, dict):
            continue
        secondary_info = sidebar_item.get('playlistSidebarSecondaryInfoRenderer')
        if not isinstance(secondary_info, dict):
            continue
        owner = try_get(
            secondary_info, lambda x: x['videoOwner']['videoOwnerRenderer']['title']['runs'][0], dict)
        if not owner:
            continue
        found['uploader'] = owner.get('text')
        found['uploader_id'] = try_get(
            owner, lambda x: x['navigationEndpoint']['browseEndpoint']['browseId'], compat_str)
        canonical_base_url = try_get(
            owner, lambda x: x['navigationEndpoint']['browseEndpoint']['canonicalBaseUrl'], compat_str)
        found['uploader_url'] = urljoin('https://www.youtube.com/', canonical_base_url)
    # Drop fields that could not be determined
    return {field: value for field, value in found.items() if value is not None}
8bdd16b4 3069
def _extract_from_tabs(self, item_id, webpage, data, tabs):
    """Build a playlist result for the selected tab of a tabbed page.

    Metadata comes from ``channelMetadataRenderer`` when present, otherwise
    from ``playlistMetadataRenderer``. Uploader details fall back to the
    playlist sidebar when no channel ID was found.
    """
    playlist_id = title = description = None
    channel_name = channel_url = channel_id = None
    thumbnails_list = []
    tags = []

    selected_tab = self._extract_selected_tab(tabs)
    metadata_renderer = try_get(
        data, lambda x: x['metadata']['channelMetadataRenderer'], dict)
    if metadata_renderer:
        channel_name = metadata_renderer.get('title')
        channel_url = metadata_renderer.get('channelUrl')
        channel_id = metadata_renderer.get('externalId')
    else:
        metadata_renderer = try_get(
            data, lambda x: x['metadata']['playlistMetadataRenderer'], dict)

    if metadata_renderer:
        title = metadata_renderer.get('title')
        description = metadata_renderer.get('description', '')
        playlist_id = channel_id
        tags = metadata_renderer.get('keywords', '').split()
        thumbnails_list = (
            try_get(metadata_renderer, lambda x: x['avatar']['thumbnails'], list)
            or try_get(
                data,
                lambda x: x['sidebar']['playlistSidebarRenderer']['items'][0]['playlistSidebarPrimaryInfoRenderer']['thumbnailRenderer']['playlistVideoThumbnailRenderer']['thumbnail']['thumbnails'],
                list)
            or [])

    thumbnails = []
    for thumb in thumbnails_list:
        if not isinstance(thumb, dict):
            continue
        thumbnail_url = url_or_none(thumb.get('url'))
        if thumbnail_url:
            thumbnails.append({
                'url': thumbnail_url,
                'width': int_or_none(thumb.get('width')),
                'height': int_or_none(thumb.get('height')),
            })

    # Fall back to the requested item ID, and to the ID as the title
    if playlist_id is None:
        playlist_id = item_id
    if title is None:
        title = playlist_id
    title += format_field(selected_tab, 'title', ' - %s')

    metadata = {
        'playlist_id': playlist_id,
        'playlist_title': title,
        'playlist_description': description,
        'uploader': channel_name,
        'uploader_id': channel_id,
        'uploader_url': channel_url,
        'thumbnails': thumbnails,
        'tags': tags,
    }
    if not channel_id:
        metadata.update(self._extract_uploader(data))
    # Mirror the uploader fields under the channel_* names
    metadata.update({
        'channel': metadata['uploader'],
        'channel_id': metadata['uploader_id'],
        'channel_url': metadata['uploader_url'],
    })

    identity_token = self._extract_identity_token(webpage, item_id)
    account_syncid = self._extract_account_syncid(data)
    entries = self._entries(
        selected_tab, playlist_id, identity_token, account_syncid)
    return self.playlist_result(entries, **metadata)
73c4ac2c 3139
cd7c66cf 3140 def _extract_mix_playlist(self, playlist, playlist_id):
2be71994 3141 first_id = last_id = None
3142 for page_num in itertools.count(1):
cd7c66cf 3143 videos = list(self._playlist_entries(playlist))
3144 if not videos:
3145 return
2be71994 3146 start = next((i for i, v in enumerate(videos) if v['id'] == last_id), -1) + 1
3147 if start >= len(videos):
3148 return
3149 for video in videos[start:]:
3150 if video['id'] == first_id:
3151 self.to_screen('First video %s found again; Assuming end of Mix' % first_id)
3152 return
3153 yield video
3154 first_id = first_id or videos[0]['id']
3155 last_id = videos[-1]['id']
cd7c66cf 3156
cd7c66cf 3157 _, data = self._extract_webpage(
2be71994 3158 'https://www.youtube.com/watch?list=%s&v=%s' % (playlist_id, last_id),
cd7c66cf 3159 '%s page %d' % (playlist_id, page_num))
3160 playlist = try_get(
3161 data, lambda x: x['contents']['twoColumnWatchNextResults']['playlist']['playlist'], dict)
3162
def _extract_from_playlist(self, item_id, url, data, playlist):
    """Return a result for a playlist embedded in a watch page.

    If the playlist exposes its own URL and that URL differs from *url*,
    a URL result pointing at it is returned. Otherwise the playlist is
    extracted here as a Mix.
    """
    playlist_title = playlist.get('title') or try_get(
        data, lambda x: x['titleText']['simpleText'], compat_str)
    playlist_id = playlist.get('playlistId') or item_id

    # Delegating everything except mix playlists to regular tab-based playlist URL
    endpoint_url = try_get(
        playlist, lambda x: x['endpoint']['commandMetadata']['webCommandMetadata']['url'],
        compat_str)
    playlist_url = urljoin(url, endpoint_url)
    if playlist_url and playlist_url != url:
        return self.url_result(
            playlist_url, ie=YoutubeTabIE.ie_key(), video_id=playlist_id,
            video_title=playlist_title)

    mix_entries = self._extract_mix_playlist(playlist, playlist_id)
    return self.playlist_result(
        mix_entries, playlist_id=playlist_id, playlist_title=playlist_title)
c5e8d7af 3180
f3eaa8dd
M
def _extract_alerts(self, data, expected=False):
    """Report the alerts contained in *data* and raise on error alerts.

    Every alert whose type is not ``error`` (case-insensitive) is reported
    as a warning. If several error alerts are present, all but the last one
    are reported as warnings and the last one is raised.

    @param data      parsed initial-data dict; alerts are read from data['alerts']
    @param expected  forwarded to ExtractorError as its ``expected`` flag
    @raises ExtractorError  when at least one error alert is present
    """

    def _real_extract_alerts():
        # Yields (alert_type, message) pairs: one for the alert's simpleText
        # (if any) and one per text run (if any).
        for alert_dict in try_get(data, lambda x: x['alerts'], list) or []:
            if not isinstance(alert_dict, dict):
                continue
            for alert in alert_dict.values():
                # A non-dict renderer has no .get(); skip it rather than
                # aborting the whole extraction with an AttributeError.
                if not isinstance(alert, dict):
                    continue
                alert_type = alert.get('type')
                if not alert_type:
                    continue
                message = try_get(alert, lambda x: x['text']['simpleText'], compat_str)
                if message:
                    yield alert_type, message
                for run in try_get(alert, lambda x: x['text']['runs'], list) or []:
                    message = try_get(run, lambda x: x['text'], compat_str)
                    if message:
                        yield alert_type, message

    err_msg = None
    for alert_type, alert_message in _real_extract_alerts():
        if alert_type.lower() == 'error':
            if err_msg:
                # An earlier error is superseded by this one; demote it to
                # a warning so that it is not lost.
                self._downloader.report_warning('YouTube said: %s - %s' % ('ERROR', err_msg))
            err_msg = alert_message
        else:
            self._downloader.report_warning('YouTube said: %s - %s' % (alert_type, alert_message))

    if err_msg:
        raise ExtractorError('YouTube said: %s' % err_msg, expected=expected)
02ced43c 3210
def _extract_webpage(self, url, item_id):
    """Download *url* and parse its initial data, retrying if it is incomplete.

    The page is considered complete when the parsed data has a truthy
    'contents' or 'currentVideoEndpoint' entry. Otherwise the download is
    repeated up to the 'extractor_retries' downloader parameter (default 3)
    additional times. Alerts found in the data are processed after every
    download via ``self._extract_alerts(data, expected=True)``.

    If the data is still incomplete after the last attempt, the problem is
    passed to the downloader's report_error and the last result is returned.

    @param url      page URL to download
    @param item_id  ID passed on to the download and parsing helpers
    @returns        (webpage, data) tuple of the last attempt
    """
    # Clamp to zero so that at least one download is always attempted; a
    # negative value would otherwise skip the loop and leave webpage/data
    # unbound at the return statement.
    retries = max(0, self._downloader.params.get('extractor_retries', 3))
    count = -1
    last_error = 'Incomplete yt initial data received'
    # NOTE: kept as a while loop (not range()) so that non-integer retry
    # values still work
    while count < retries:
        count += 1
        # Sometimes youtube returns a webpage with incomplete ytInitialData
        # See: https://github.com/yt-dlp/yt-dlp/issues/116
        if count:
            self.report_warning('%s. Retrying ...' % last_error)
        webpage = self._download_webpage(
            url, item_id,
            'Downloading webpage%s' % (' (retry #%d)' % count if count else ''))
        data = self._extract_yt_initial_data(item_id, webpage)
        self._extract_alerts(data, expected=True)
        if data.get('contents') or data.get('currentVideoEndpoint'):
            break
        if count >= retries:
            self._downloader.report_error(last_error)
    return webpage, data
3231
def _real_extract(self, url):
    """Dispatch a YouTube tab/playlist/watch URL to the matching extraction path.

    The URL is first normalised to the www.youtube.com host. Bare
    channel/user URLs are redirected to their "/videos" tab, and watch URLs
    without a video ID are redirected to the playlist page. The downloaded
    page is then handled as tabs, as a watch-page playlist, or - as a last
    resort - as a single video.

    @raises ExtractorError  when nothing usable is recognised
    """
    item_id = self._match_id(url)
    parsed_url = compat_urlparse.urlparse(url)
    url = compat_urlparse.urlunparse(parsed_url._replace(netloc='www.youtube.com'))

    # This is not matched in a channel page with a tab selected
    channel_match = re.match(r'(?P<pre>%s)(?P<post>/?(?![^#?]).*$)' % self._VALID_URL, url)
    groups = channel_match.groupdict() if channel_match else {}
    if groups and not groups.get('not_channel'):
        self._downloader.report_warning(
            'A channel/user page was given. All the channel\'s videos will be downloaded. '
            'To download only the videos in the home page, add a "/featured" to the URL')
        url = '%s/videos%s' % (groups.get('pre'), groups.get('post') or '')

    # Handle both video/playlist URLs
    query = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query)
    video_id = query.get('v', [None])[0]
    playlist_id = query.get('list', [None])[0]

    is_watch_url = (groups.get('not_channel') or '').startswith('watch')
    if is_watch_url and not video_id:
        if not playlist_id:
            # If there is neither video or playlist ids,
            # youtube redirects to home page, which is undesirable
            raise ExtractorError('Unable to recognize tab page')
        self._downloader.report_warning('A video URL was given without video ID. Trying to download playlist %s' % playlist_id)
        url = 'https://www.youtube.com/playlist?list=%s' % playlist_id

    if video_id and playlist_id:
        if self._downloader.params.get('noplaylist'):
            self.to_screen('Downloading just video %s because of --no-playlist' % video_id)
            return self.url_result(video_id, ie=YoutubeIE.ie_key(), video_id=video_id)
        self.to_screen('Downloading playlist %s; add --no-playlist to just download video %s' % (playlist_id, video_id))

    webpage, data = self._extract_webpage(url, item_id)

    # 1) Channel / playlist pages rendered as tabs
    tabs = try_get(
        data, lambda x: x['contents']['twoColumnBrowseResultsRenderer']['tabs'], list)
    if tabs:
        return self._extract_from_tabs(item_id, webpage, data, tabs)

    # 2) Playlist panel of a watch page
    playlist = try_get(
        data, lambda x: x['contents']['twoColumnWatchNextResults']['playlist']['playlist'], dict)
    if playlist:
        return self._extract_from_playlist(item_id, url, data, playlist)

    # 3) Fall back to the single video, preferring the ID reported by the page
    page_video_id = try_get(
        data, lambda x: x['currentVideoEndpoint']['watchEndpoint']['videoId'],
        compat_str)
    video_id = page_video_id or video_id
    if not video_id:
        raise ExtractorError('Unable to recognize tab page')

    self._downloader.report_warning('Unable to recognize playlist. Downloading just video %s' % video_id)
    return self.url_result(video_id, ie=YoutubeIE.ie_key(), video_id=video_id)
c5e8d7af 3285
c5e8d7af 3286
class YoutubePlaylistIE(InfoExtractor):
    """Resolve bare playlist IDs and "?list=" URLs to the YoutubeTabIE playlist page."""

    IE_NAME = 'youtube:playlist'
    IE_DESC = 'YouTube.com playlists'
    _VALID_URL = r'''(?x)(?:
                        (?:https?://)?
                        (?:\w+\.)?
                        (?:
                            (?:
                                youtube(?:kids)?\.com|
                                invidio\.us
                            )
                            /.*?\?.*?\blist=
                        )?
                        (?P<id>%(playlist_id)s)
                     )''' % {'playlist_id': YoutubeBaseInfoExtractor._PLAYLIST_ID_RE}
    _TESTS = [{
        'note': 'issue #673',
        'url': 'PLBB231211A4F62143',
        'info_dict': {
            'title': '[OLD]Team Fortress 2 (Class-based LP)',
            'id': 'PLBB231211A4F62143',
            'uploader': 'Wickydoo',
            'uploader_id': 'UCKSpbfbl5kRQpTdL7kMc-1Q',
        },
        'playlist_mincount': 29,
    }, {
        'url': 'PLtPgu7CB4gbY9oDN3drwC3cMbJggS7dKl',
        'info_dict': {
            'title': 'YDL_safe_search',
            'id': 'PLtPgu7CB4gbY9oDN3drwC3cMbJggS7dKl',
        },
        'playlist_count': 2,
        'skip': 'This playlist is private',
    }, {
        'note': 'embedded',
        'url': 'https://www.youtube.com/embed/videoseries?list=PL6IaIsEjSbf96XFRuNccS_RuEXwNdsoEu',
        'playlist_count': 4,
        'info_dict': {
            'title': 'JODA15',
            'id': 'PL6IaIsEjSbf96XFRuNccS_RuEXwNdsoEu',
            'uploader': 'milan',
            'uploader_id': 'UCEI1-PVPcYXjB73Hfelbmaw',
        }
    }, {
        'url': 'http://www.youtube.com/embed/_xDOZElKyNU?list=PLsyOSbh5bs16vubvKePAQ1x3PhKavfBIl',
        'playlist_mincount': 982,
        'info_dict': {
            'title': '2018 Chinese New Singles (11/6 updated)',
            'id': 'PLsyOSbh5bs16vubvKePAQ1x3PhKavfBIl',
            'uploader': 'LBK',
            'uploader_id': 'UC21nz3_MesPLqtDqwdvnoxA',
        }
    }, {
        'url': 'TLGGrESM50VT6acwMjAyMjAxNw',
        'only_matching': True,
    }, {
        # music album playlist
        'url': 'OLAK5uy_m4xAFdmMC5rX3Ji3g93pQe3hqLZw_9LhM',
        'only_matching': True,
    }]

    @classmethod
    def suitable(cls, url):
        """Reject every URL that YoutubeTabIE accepts; otherwise use the default check."""
        if YoutubeTabIE.suitable(url):
            return False
        return super(YoutubePlaylistIE, cls).suitable(url)

    def _real_extract(self, url):
        """Return a url_result pointing at the /playlist page for the matched ID.

        The query string of the input URL is carried over as-is; when there
        is none (e.g. a bare playlist ID) only "list" is set.
        """
        playlist_id = self._match_id(url)
        query = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query)
        if not query:
            query = {'list': playlist_id}
        target_url = update_url_query('https://www.youtube.com/playlist', query)
        return self.url_result(
            target_url, ie=YoutubeTabIE.ie_key(), video_id=playlist_id)
3361
3362
class YoutubeYtBeIE(InfoExtractor):
    """Handle youtu.be short links that carry a "list" parameter."""

    IE_DESC = 'youtu.be'
    _VALID_URL = r'https?://youtu\.be/(?P<id>[0-9A-Za-z_-]{11})/*?.*?\blist=(?P<playlist_id>%(playlist_id)s)' % {'playlist_id': YoutubeBaseInfoExtractor._PLAYLIST_ID_RE}
    _TESTS = [{
        'url': 'https://youtu.be/yeWKywCrFtk?list=PL2qgrgXsNUG5ig9cat4ohreBjYLAPC0J5',
        'info_dict': {
            'id': 'yeWKywCrFtk',
            'ext': 'mp4',
            'title': 'Small Scale Baler and Braiding Rugs',
            'uploader': 'Backus-Page House Museum',
            'uploader_id': 'backuspagemuseum',
            'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/backuspagemuseum',
            'upload_date': '20161008',
            'description': 'md5:800c0c78d5eb128500bffd4f0b4f2e8a',
            'categories': ['Nonprofits & Activism'],
            'tags': list,
            'like_count': int,
            'dislike_count': int,
        },
        'params': {
            'noplaylist': True,
            'skip_download': True,
        },
    }, {
        'url': 'https://youtu.be/uWyaPkt-VOI?list=PL9D9FC436B881BA21',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Rewrite the short link as a full watch URL and delegate to YoutubeTabIE."""
        video_id, playlist_id = re.match(self._VALID_URL, url).group('id', 'playlist_id')
        watch_url = update_url_query('https://www.youtube.com/watch', {
            'v': video_id,
            'list': playlist_id,
            'feature': 'youtu.be',
        })
        return self.url_result(
            watch_url, ie=YoutubeTabIE.ie_key(), video_id=playlist_id)
8bdd16b4 3401
3402
class YoutubeYtUserIE(InfoExtractor):
    """Expand the "ytuser:<name>" shorthand to the user's YouTube page."""

    IE_DESC = 'YouTube.com user videos, URL or "ytuser" keyword'
    _VALID_URL = r'ytuser:(?P<id>.+)'
    _TESTS = [{
        'url': 'ytuser:phihag',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Return a url_result for /user/<name>, to be handled by YoutubeTabIE."""
        user_id = self._match_id(url)
        user_url = 'https://www.youtube.com/user/%s' % user_id
        return self.url_result(
            user_url, ie=YoutubeTabIE.ie_key(), video_id=user_id)
9558dcec 3416
b05654f0 3417
class YoutubeFavouritesIE(YoutubeBaseInfoExtractor):
    """Expand the ":ytfav" shorthand to the "LL" playlist URL."""

    IE_NAME = 'youtube:favorites'
    IE_DESC = 'YouTube.com liked videos, ":ytfav" for short (requires authentication)'
    _LOGIN_REQUIRED = True
    _VALID_URL = r':ytfav(?:ou?rite)?s?'
    _TESTS = [{
        'url': ':ytfav',
        'only_matching': True,
    }, {
        'url': ':ytfavorites',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Delegate to YoutubeTabIE with the fixed "list=LL" playlist URL."""
        liked_url = 'https://www.youtube.com/playlist?list=LL'
        return self.url_result(liked_url, ie=YoutubeTabIE.ie_key())
3435
3436
class YoutubeSearchIE(SearchInfoExtractor, YoutubeBaseInfoExtractor):
    """Search extractor behind the "ytsearch" keyword."""

    IE_NAME = 'youtube:search'
    IE_DESC = 'YouTube.com searches, "ytsearch" keyword'
    # there doesn't appear to be a real limit, for example if you search for
    # 'python' you get more than 8.000.000 results
    _MAX_RESULTS = float('inf')
    _SEARCH_KEY = 'ytsearch'
    _SEARCH_PARAMS = None
    _TESTS = []

    def _entries(self, query, n):
        """Yield up to *n* videos for *query*, following continuation tokens.

        Paging ends when the API call returns nothing, when a response has
        no result sections, or when a response has no continuation token.
        """
        request = {'query': query}
        if self._SEARCH_PARAMS:
            request['params'] = self._SEARCH_PARAMS

        found = 0
        for page_num in itertools.count(1):
            response = self._call_api(
                ep='search', video_id='query "%s"' % query, fatal=False,
                note='Downloading page %s' % page_num, query=request)
            if not response:
                return

            # The first getter matches the initial response, the second one
            # the continuation responses
            sections = try_get(
                response,
                (lambda x: x['contents']['twoColumnSearchResultsRenderer']['primaryContents']['sectionListRenderer']['contents'],
                 lambda x: x['onResponseReceivedCommands'][0]['appendContinuationItemsAction']['continuationItems']),
                list)
            if not sections:
                return

            # Youtube sometimes adds promoted content to searches,
            # changing the index location of videos and token.
            # So we search through all entries till we find them.
            next_token = None
            for section in sections:
                if next_token is None:
                    next_token = try_get(
                        section,
                        lambda x: x['continuationItemRenderer']['continuationEndpoint']['continuationCommand']['token'],
                        compat_str)

                items = try_get(
                    section, lambda x: x['itemSectionRenderer']['contents'], list)
                for item in items or []:
                    renderer = item.get('videoRenderer') if isinstance(item, dict) else None
                    if not isinstance(renderer, dict) or not renderer.get('videoId'):
                        continue

                    yield self._extract_video(renderer)
                    found += 1
                    if found == n:
                        return

            if not next_token:
                return
            request['continuation'] = next_token

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""
        entries = self._entries(query, n)
        return self.playlist_result(entries, query)
75dff0ee 3505
c9ae7b95 3506
class YoutubeSearchDateIE(YoutubeSearchIE):
    """Variant of YoutubeSearchIE behind the "ytsearchdate" keyword."""

    IE_NAME = '%s:date' % YoutubeSearchIE.IE_NAME
    IE_DESC = 'YouTube.com searches, newest videos first, "ytsearchdate" keyword'
    _SEARCH_KEY = 'ytsearchdate'
    # Sent as the "params" field of every search request
    _SEARCH_PARAMS = 'CAI%3D'
75dff0ee 3512
c9ae7b95 3513
class YoutubeSearchURLIE(YoutubeSearchIE):
    """Extract the results of a youtube.com/results search URL."""

    IE_NAME = '%s_url' % YoutubeSearchIE.IE_NAME
    IE_DESC = 'YouTube.com search URLs'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/results\?(.*?&)?(?:search_query|q)=(?:[^&]+)(?:[&]|$)'
    # _MAX_RESULTS = 100
    _TESTS = [{
        'url': 'https://www.youtube.com/results?baz=bar&search_query=youtube-dl+test+video&filters=video&lclk=video',
        'playlist_mincount': 5,
        'info_dict': {
            'title': 'youtube-dl test video',
        }
    }, {
        'url': 'https://www.youtube.com/results?q=test&sp=EgQIBBgB',
        'only_matching': True,
    }]

    @classmethod
    def _make_valid_url(cls):
        """Return the class's own _VALID_URL pattern unchanged."""
        return cls._VALID_URL

    def _real_extract(self, url):
        """Run the search described by the URL's query string.

        The search terms come from "search_query" (or "q"); the "sp"
        parameter, if any, is stored as this instance's _SEARCH_PARAMS.
        """
        params = compat_parse_qs(compat_urllib_parse_urlparse(url).query)
        terms = params.get('search_query') or params.get('q')
        query = terms[0]
        self._SEARCH_PARAMS = params.get('sp', ('',))[0]
        return self._get_n_results(query, self._MAX_RESULTS)
3462ffa8 3539
3540
class YoutubeFeedsInfoExtractor(YoutubeTabIE):
    """
    Common base of the feed extractors.
    Each subclass has to provide the _FEED_NAME property.
    """
    _TESTS = []
    _LOGIN_REQUIRED = True

    @property
    def IE_NAME(self):
        """Extractor name derived from the feed name ("youtube:<feed>")."""
        return 'youtube:%s' % self._FEED_NAME

    def _real_initialize(self):
        """Log in during initialization."""
        self._login()

    def _real_extract(self, url):
        """Delegate to YoutubeTabIE with the /feed/<feed> URL."""
        feed_url = 'https://www.youtube.com/feed/%s' % self._FEED_NAME
        return self.url_result(feed_url, ie=YoutubeTabIE.ie_key())
25f14e9f
S
3560
3561
class YoutubeWatchLaterIE(InfoExtractor):
    """Expand the ":ytwatchlater" shorthand to the "WL" playlist URL."""

    IE_NAME = 'youtube:watchlater'
    IE_DESC = 'Youtube watch later list, ":ytwatchlater" for short (requires authentication)'
    _VALID_URL = r':ytwatchlater'
    _TESTS = [{
        'url': ':ytwatchlater',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Delegate to YoutubeTabIE with the fixed "list=WL" playlist URL."""
        watch_later_url = 'https://www.youtube.com/playlist?list=WL'
        return self.url_result(watch_later_url, ie=YoutubeTabIE.ie_key())
3462ffa8 3574
3575
25f14e9f
S
class YoutubeRecommendedIE(YoutubeFeedsInfoExtractor):
    """Feed extractor for ":ytrec" and the bare youtube.com home page URL."""

    _FEED_NAME = 'recommended'
    IE_DESC = 'YouTube.com recommended videos, ":ytrec" for short (requires authentication)'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/?(?:[?#]|$)|:ytrec(?:ommended)?'
    _TESTS = [{
        'url': ':ytrec',
        'only_matching': True,
    }, {
        'url': ':ytrecommended',
        'only_matching': True,
    }, {
        'url': 'https://youtube.com',
        'only_matching': True,
    }]
1ed5b5c9 3590
1ed5b5c9 3591
class YoutubeSubscriptionsIE(YoutubeFeedsInfoExtractor):
    """Feed extractor for the ":ytsubs" shorthand."""

    _FEED_NAME = 'subscriptions'
    IE_DESC = 'YouTube.com subscriptions feed, ":ytsubs" for short (requires authentication)'
    _VALID_URL = r':ytsub(?:scription)?s?'
    _TESTS = [{
        'url': ':ytsubs',
        'only_matching': True,
    }, {
        'url': ':ytsubscriptions',
        'only_matching': True,
    }]
1ed5b5c9 3603
1ed5b5c9 3604
class YoutubeHistoryIE(YoutubeFeedsInfoExtractor):
    """Feed extractor for the ":ythis" shorthand."""

    _FEED_NAME = 'history'
    IE_DESC = 'Youtube watch history, ":ythis" for short (requires authentication)'
    _VALID_URL = r':ythis(?:tory)?'
    _TESTS = [{
        'url': ':ythistory',
        'only_matching': True,
    }]
1ed5b5c9
JMF
3613
3614
15870e90
PH
class YoutubeTruncatedURLIE(InfoExtractor):
    """Catch watch/attribution URLs that lost their video ID and explain why.

    The pattern only matches URLs that end right after "watch?" (optionally
    followed by a single one of the listed parameters) or after an
    attribution_link "a" parameter. Extraction always fails with a hint
    about quoting the URL.
    """
    IE_NAME = 'youtube:truncated_url'
    IE_DESC = False  # Do not list
    _VALID_URL = r'''(?x)
        (?:https?://)?
        (?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie)?\.com/
        (?:watch\?(?:
            feature=[a-z_]+|
            annotation_id=annotation_[^&]+|
            x-yt-cl=[0-9]+|
            hl=[^&]*|
            t=[0-9]+
        )?
        |
            attribution_link\?a=[^&]+
        )
        $
    '''

    _TESTS = [{
        'url': 'https://www.youtube.com/watch?annotation_id=annotation_3951667041',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/watch?',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/watch?x-yt-cl=84503534',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/watch?feature=foo',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/watch?hl=en-GB',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/watch?t=2372',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Always raise an expected ExtractorError with a quoting hint."""
        # The example command names yt-dlp (this package) rather than
        # youtube-dl, so that the suggested command matches the tool in use.
        raise ExtractorError(
            'Did you forget to quote the URL? Remember that & is a meta '
            'character in most shells, so you want to put the URL in quotes, '
            'like yt-dlp '
            '"https://www.youtube.com/watch?feature=foo&v=BaW_jenozKc" '
            ' or simply yt-dlp BaW_jenozKc .',
            expected=True)
772fd5cc
PH
3662
3663
class YoutubeTruncatedIDIE(InfoExtractor):
    """Catch watch URLs whose "v" value has only 1-10 characters and report them."""

    IE_NAME = 'youtube:truncated_id'
    IE_DESC = False  # Do not list
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/watch\?v=(?P<id>[0-9A-Za-z_-]{1,10})$'

    _TESTS = [{
        'url': 'https://www.youtube.com/watch?v=N_708QY7Ob',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Always raise an expected ExtractorError naming the incomplete ID."""
        video_id = self._match_id(url)
        message = 'Incomplete YouTube ID %s. URL %s looks truncated.' % (video_id, url)
        raise ExtractorError(message, expected=True)