]> jfr.im git - yt-dlp.git/blame - youtube_dlc/extractor/youtube.py
[youtube] Update to ytdl-2021.02.04.1
[yt-dlp.git] / youtube_dlc / extractor / youtube.py
CommitLineData
c5e8d7af 1# coding: utf-8
c5e8d7af 2
78caa52a
PH
3from __future__ import unicode_literals
4
0ca96d48 5import itertools
c5e8d7af 6import json
c4417ddb 7import os.path
d77ab8e2 8import random
c5e8d7af 9import re
e0df6211 10import traceback
c5e8d7af 11
b05654f0 12from .common import InfoExtractor, SearchInfoExtractor
4bb4a188 13from ..compat import (
edf3e38e 14 compat_chr,
29f7c58a 15 compat_HTTPError,
8d81f3e3 16 compat_kwargs,
c5e8d7af 17 compat_parse_qs,
545cc85d 18 compat_str,
7fd002c0 19 compat_urllib_parse_unquote_plus,
15707c7e 20 compat_urllib_parse_urlencode,
7c80519c 21 compat_urllib_parse_urlparse,
7c61bd36 22 compat_urlparse,
4bb4a188 23)
545cc85d 24from ..jsinterp import JSInterpreter
4bb4a188 25from ..utils import (
c5e8d7af 26 clean_html,
c5e8d7af 27 ExtractorError,
b60419c5 28 format_field,
2d30521a 29 float_or_none,
dd27fd17 30 int_or_none,
94278f72 31 mimetype2ext,
6310acf5 32 parse_codecs,
b84071c0 33 parse_count,
7c80519c 34 parse_duration,
545cc85d 35 qualities,
3995d37d 36 remove_start,
cf7e015f 37 smuggle_url,
dbdaaa23 38 str_or_none,
c93d53f5 39 str_to_int,
556dbe7f 40 try_get,
c5e8d7af
PH
41 unescapeHTML,
42 unified_strdate,
cf7e015f 43 unsmuggle_url,
8bdd16b4 44 update_url_query,
21c340b8 45 url_or_none,
6e6bc8da 46 urlencode_postdata,
8bdd16b4 47 urljoin,
c5e8d7af
PH
48)
49
5f6a1245 50
class YoutubeBaseInfoExtractor(InfoExtractor):
    """Provide base functions for Youtube extractors"""
    _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
    _TWOFACTOR_URL = 'https://accounts.google.com/signin/challenge'

    _LOOKUP_URL = 'https://accounts.google.com/_/signin/sl/lookup'
    _CHALLENGE_URL = 'https://accounts.google.com/_/signin/sl/challenge'
    # '{0}' is filled with the "TL" token taken from the challenge response
    _TFA_URL = 'https://accounts.google.com/_/signin/challenge?hl=en&TL={0}'

    _RESERVED_NAMES = (
        r'embed|e|watch_popup|channel|c|user|playlist|watch|w|v|movies|results|shared|'
        r'storefront|oops|index|account|reporthistory|t/terms|about|upload|signin|logout|'
        r'feed/(?:watch_later|history|subscriptions|library|trending|recommended)')

    _NETRC_MACHINE = 'youtube'
    # If True it will raise an error if no login info is provided
    _LOGIN_REQUIRED = False

    _PLAYLIST_ID_RE = r'(?:(?:PL|LL|EC|UU|FL|RD|UL|TL|PU|OLAK5uy_)[0-9A-Za-z-_]{10,}|RDMM|WL|LL|LM)'

    def _ids_to_results(self, ids):
        """Return a list with one 'Youtube' url_result per video id in ids."""
        return [
            self.url_result(vid_id, 'Youtube', video_id=vid_id)
            for vid_id in ids]

    def _login(self):
        """
        Attempt to log in to YouTube.

        True is returned if successful or skipped.
        False is returned if login failed.

        If _LOGIN_REQUIRED is set and neither login info nor a cookie file
        was provided, an ExtractorError is raised.

        The flow is: fetch the login page, look up the account, answer the
        password challenge, optionally submit a two-factor code, and finally
        fetch the CheckCookie URL returned by the server.
        """
        username, password = self._get_login_info()
        # No authentication to be performed
        if username is None:
            if self._LOGIN_REQUIRED and self._downloader.params.get('cookiefile') is None:
                raise ExtractorError('No login info available, needed for using %s.' % self.IE_NAME, expected=True)
            # if self._downloader.params.get('cookiefile'):  # TODO remove 'and False' later - too many people using outdated cookies and open issues, remind them.
            #     self.to_screen('[Cookies] Reminder - Make sure to always use up to date cookies!')
            return True

        login_page = self._download_webpage(
            self._LOGIN_URL, None,
            note='Downloading login page',
            errnote='unable to fetch login page', fatal=False)
        if login_page is False:
            # Explicit False (not a bare return) to honour the documented contract
            return False

        login_form = self._hidden_inputs(login_page)

        def req(url, f_req, note, errnote):
            """POST the login form plus the JSON-encoded f_req to url and return the parsed JSON."""
            data = login_form.copy()
            data.update({
                'pstMsg': 1,
                'checkConnection': 'youtube',
                'checkedDomains': 'youtube',
                'hl': 'en',
                'deviceinfo': '[null,null,null,[],null,"US",null,null,[],"GlifWebSignIn",null,[null,null,[]]]',
                'f.req': json.dumps(f_req),
                'flowName': 'GlifWebSignIn',
                'flowEntry': 'ServiceLogin',
                # TODO: reverse actual botguard identifier generation algo
                'bgRequest': '["identifier",""]',
            })
            return self._download_json(
                url, None, note=note, errnote=errnote,
                # Drop everything that precedes the first '[' of the response
                transform_source=lambda s: re.sub(r'^[^[]*', '', s),
                fatal=False,
                data=urlencode_postdata(data), headers={
                    'Content-Type': 'application/x-www-form-urlencoded;charset=utf-8',
                    'Google-Accounts-XSRF': 1,
                })

        def warn(message):
            self._downloader.report_warning(message)

        # Shared by the lookup and the challenge request payloads
        service_login_url = 'https://accounts.google.com/ServiceLogin?passive=true&continue=https%3A%2F%2Fwww.youtube.com%2Fsignin%3Fnext%3D%252F%26action_handle_signin%3Dtrue%26hl%3Den%26app%3Ddesktop%26feature%3Dsign_in_button&hl=en&service=youtube&uilel=3&requestPath=%2FServiceLogin&Page=PasswordSeparationSignIn'

        lookup_req = [
            username,
            None, [], None, 'US', None, None, 2, False, True,
            [
                None, None,
                [2, 1, None, 1,
                 service_login_url,
                 None, [], 4],
                1, [None, None, []], None, None, None, True
            ],
            username,
        ]

        lookup_results = req(
            self._LOOKUP_URL, lookup_req,
            'Looking up account info', 'Unable to look up account info')

        if lookup_results is False:
            return False

        user_hash = try_get(lookup_results, lambda x: x[0][2], compat_str)
        if not user_hash:
            warn('Unable to extract user hash')
            return False

        challenge_req = [
            user_hash,
            None, 1, None, [1, None, None, None, [password, None, True]],
            [
                None, None, [2, 1, None, 1, service_login_url, None, [], 4],
                1, [None, None, []], None, None, None, True
            ]]

        challenge_results = req(
            self._CHALLENGE_URL, challenge_req,
            'Logging in', 'Unable to log in')

        if challenge_results is False:
            # Explicit False (not a bare return) to honour the documented contract
            return False

        login_res = try_get(challenge_results, lambda x: x[0][5], list)
        if login_res:
            login_msg = try_get(login_res, lambda x: x[5], compat_str)
            # Pick the reason first: '%' binds tighter than the conditional
            # expression, so formatting inline would drop the prefix.
            reason = 'Invalid password' if login_msg == 'INCORRECT_ANSWER_ENTERED' else login_msg
            warn('Unable to login: %s' % reason)
            return False

        res = try_get(challenge_results, lambda x: x[0][-1], list)
        if not res:
            warn('Unable to extract result entry')
            return False

        login_challenge = try_get(res, lambda x: x[0][0], list)
        if login_challenge:
            challenge_str = try_get(login_challenge, lambda x: x[2], compat_str)
            if challenge_str == 'TWO_STEP_VERIFICATION':
                # SEND_SUCCESS - TFA code has been successfully sent to phone
                # QUOTA_EXCEEDED - reached the limit of TFA codes
                status = try_get(login_challenge, lambda x: x[5], compat_str)
                if status == 'QUOTA_EXCEEDED':
                    warn('Exceeded the limit of TFA codes, try later')
                    return False

                tl = try_get(challenge_results, lambda x: x[1][2], compat_str)
                if not tl:
                    warn('Unable to extract TL')
                    return False

                tfa_code = self._get_tfa_info('2-step verification code')

                if not tfa_code:
                    warn(
                        'Two-factor authentication required. Provide it either interactively or with --twofactor <code> '
                        '(Note that only TOTP (Google Authenticator App) codes work at this time.)')
                    return False

                tfa_code = remove_start(tfa_code, 'G-')

                tfa_req = [
                    user_hash, None, 2, None,
                    [
                        9, None, None, None, None, None, None, None,
                        [None, tfa_code, True, 2]
                    ]]

                tfa_results = req(
                    self._TFA_URL.format(tl), tfa_req,
                    'Submitting TFA code', 'Unable to submit TFA code')

                if tfa_results is False:
                    return False

                tfa_res = try_get(tfa_results, lambda x: x[0][5], list)
                if tfa_res:
                    tfa_msg = try_get(tfa_res, lambda x: x[5], compat_str)
                    # Same precedence pitfall as for the password challenge above
                    reason = 'Invalid TFA code' if tfa_msg == 'INCORRECT_ANSWER_ENTERED' else tfa_msg
                    warn('Unable to finish TFA: %s' % reason)
                    return False

                check_cookie_url = try_get(
                    tfa_results, lambda x: x[0][-1][2], compat_str)
            else:
                CHALLENGES = {
                    'LOGIN_CHALLENGE': "This device isn't recognized. For your security, Google wants to make sure it's really you.",
                    'USERNAME_RECOVERY': 'Please provide additional information to aid in the recovery process.',
                    'REAUTH': "There is something unusual about your activity. For your security, Google wants to make sure it's really you.",
                }
                challenge = CHALLENGES.get(
                    challenge_str,
                    '%s returned error %s.' % (self.IE_NAME, challenge_str))
                warn('%s\nGo to https://accounts.google.com/, login and solve a challenge.' % challenge)
                return False
        else:
            check_cookie_url = try_get(res, lambda x: x[2], compat_str)

        if not check_cookie_url:
            warn('Unable to extract CheckCookie URL')
            return False

        check_cookie_results = self._download_webpage(
            check_cookie_url, None, 'Checking cookie', fatal=False)

        if check_cookie_results is False:
            return False

        # The login is considered successful only if the CheckCookie page
        # mentions the account home URL
        if 'https://myaccount.google.com/' not in check_cookie_results:
            warn('Unable to log in')
            return False

        return True

    def _download_webpage_handle(self, *args, **kwargs):
        """Call the parent implementation with a private copy of the 'query' keyword (an empty dict if absent)."""
        query = kwargs.get('query', {}).copy()
        kwargs['query'] = query
        return super(YoutubeBaseInfoExtractor, self)._download_webpage_handle(
            *args, **compat_kwargs(kwargs))

    def _real_initialize(self):
        """Try to log in, unless no downloader is attached to this extractor."""
        if self._downloader is None:
            return
        if not self._login():
            return

    # Request body sent with every call made through _call_api
    _DEFAULT_API_DATA = {
        'context': {
            'client': {
                'clientName': 'WEB',
                'clientVersion': '2.20201021.03.00',
            }
        },
    }

    _YT_INITIAL_DATA_RE = r'(?:window\s*\[\s*["\']ytInitialData["\']\s*\]|ytInitialData)\s*=\s*({.+?})\s*;'
    _YT_INITIAL_PLAYER_RESPONSE_RE = r'ytInitialPlayerResponse\s*=\s*({.+?})\s*;'
    _YT_INITIAL_BOUNDARY_RE = r'(?:var\s+meta|</script|\n)'

    def _call_api(self, ep, query, video_id, fatal=True):
        """
        Send a JSON request to the youtubei/v1 endpoint ep and return the parsed JSON.

        The request body is _DEFAULT_API_DATA updated with the top-level keys
        of query. fatal is forwarded to _download_json.
        """
        # A shallow copy is enough: update() only rebinds top-level keys
        data = self._DEFAULT_API_DATA.copy()
        data.update(query)

        return self._download_json(
            'https://www.youtube.com/youtubei/v1/%s' % ep, video_id=video_id,
            note='Downloading API JSON', errnote='Unable to download API page',
            data=json.dumps(data).encode('utf8'), fatal=fatal,
            headers={'content-type': 'application/json'},
            query={'key': 'AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8'})

    def _extract_yt_initial_data(self, video_id, webpage):
        """Find the ytInitialData object in webpage and return it parsed as JSON."""
        return self._parse_json(
            self._search_regex(
                # Boundary-anchored pattern listed first, the bare pattern second
                (r'%s\s*%s' % (self._YT_INITIAL_DATA_RE, self._YT_INITIAL_BOUNDARY_RE),
                 self._YT_INITIAL_DATA_RE), webpage, 'yt initial data'),
            video_id)

    def _extract_ytcfg(self, video_id, webpage):
        """Parse the argument of ytcfg.set(...) found in webpage; '{}' is parsed when it is not found."""
        return self._parse_json(
            self._search_regex(
                r'ytcfg\.set\s*\(\s*({.+?})\s*\)\s*;', webpage, 'ytcfg',
                default='{}'), video_id, fatal=False)

    def _extract_video(self, renderer):
        """Build a url_transparent result for YoutubeIE from a video renderer dict."""
        video_id = renderer.get('videoId')
        title = try_get(
            renderer,
            (lambda x: x['title']['runs'][0]['text'],
             lambda x: x['title']['simpleText']), compat_str)
        description = try_get(
            renderer, lambda x: x['descriptionSnippet']['runs'][0]['text'],
            compat_str)
        duration = parse_duration(try_get(
            renderer, lambda x: x['lengthText']['simpleText'], compat_str))
        view_count_text = try_get(
            renderer, lambda x: x['viewCountText']['simpleText'], compat_str) or ''
        # Whitespace is removed first; only a leading run of digits/commas counts
        view_count = str_to_int(self._search_regex(
            r'^([\d,]+)', re.sub(r'\s', '', view_count_text),
            'view count', default=None))
        uploader = try_get(
            renderer, lambda x: x['ownerText']['runs'][0]['text'], compat_str)
        return {
            '_type': 'url_transparent',
            'ie_key': YoutubeIE.ie_key(),
            'id': video_id,
            'url': video_id,
            'title': title,
            'description': description,
            'duration': duration,
            'view_count': view_count,
            'uploader': uploader,
        }
339
0c148415 340
360e1ca5 341class YoutubeIE(YoutubeBaseInfoExtractor):
78caa52a 342 IE_DESC = 'YouTube.com'
cb7dfeea 343 _VALID_URL = r"""(?x)^
c5e8d7af 344 (
edb53e2d 345 (?:https?://|//) # http(s):// or protocol-independent URL
66b48727 346 (?:(?:(?:(?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie|kids)?\.com/|
484aaeb2 347 (?:www\.)?deturl\.com/www\.youtube\.com/|
e70dc1d1 348 (?:www\.)?pwnyoutube\.com/|
8b561bfc 349 (?:www\.)?hooktube\.com/|
f7000f3a 350 (?:www\.)?yourepeat\.com/|
e69ae5b9 351 tube\.majestyc\.net/|
ba036333 352 # Invidious instances taken from https://github.com/omarroth/invidious/wiki/Invidious-Instances
77d95677 353 (?:(?:www|dev)\.)?invidio\.us/|
ba036333 354 (?:(?:www|no)\.)?invidiou\.sh/|
29f7c58a 355 (?:(?:www|fi)\.)?invidious\.snopyta\.org/|
8ae113ca 356 (?:www\.)?invidious\.kabi\.tk/|
ba036333 357 (?:www\.)?invidious\.13ad\.de/|
791d2e81 358 (?:www\.)?invidious\.mastodon\.host/|
29f7c58a 359 (?:www\.)?invidious\.zapashcanon\.fr/|
360 (?:www\.)?invidious\.kavin\.rocks/|
361 (?:www\.)?invidious\.tube/|
362 (?:www\.)?invidiou\.site/|
363 (?:www\.)?invidious\.site/|
364 (?:www\.)?invidious\.xyz/|
494d664e 365 (?:www\.)?invidious\.nixnet\.xyz/|
666d808e 366 (?:www\.)?invidious\.drycat\.fr/|
ba036333 367 (?:www\.)?tube\.poal\.co/|
29f7c58a 368 (?:www\.)?tube\.connect\.cafe/|
8ae113ca 369 (?:www\.)?vid\.wxzm\.sx/|
29f7c58a 370 (?:www\.)?vid\.mint\.lgbt/|
384bf91f 371 (?:www\.)?yewtu\.be/|
494d664e 372 (?:www\.)?yt\.elukerio\.org/|
894b3826 373 (?:www\.)?yt\.lelux\.fi/|
1db5ab6b 374 (?:www\.)?invidious\.ggc-project\.de/|
375 (?:www\.)?yt\.maisputain\.ovh/|
376 (?:www\.)?invidious\.13ad\.de/|
377 (?:www\.)?invidious\.toot\.koeln/|
378 (?:www\.)?invidious\.fdn\.fr/|
379 (?:www\.)?watch\.nettohikari\.com/|
bff90fc5 380 (?:www\.)?kgg2m7yk5aybusll\.onion/|
381 (?:www\.)?qklhadlycap4cnod\.onion/|
382 (?:www\.)?axqzx4s6s54s32yentfqojs3x5i7faxza6xo3ehd4bzzsg2ii4fv2iid\.onion/|
383 (?:www\.)?c7hqkpkpemu6e7emz5b4vyz7idjgdvgaaa3dyimmeojqbgpea3xqjoid\.onion/|
384 (?:www\.)?fz253lmuao3strwbfbmx46yu7acac2jz27iwtorgmbqlkurlclmancad\.onion/|
385 (?:www\.)?invidious\.l4qlywnpwqsluw65ts7md3khrivpirse744un3x7mlskqauz5pyuzgqd\.onion/|
33c1c7d8 386 (?:www\.)?owxfohz4kjyv25fvlqilyxast7inivgiktls3th44jhk3ej3i7ya\.b32\.i2p/|
1db5ab6b 387 (?:www\.)?4l2dgddgsrkf2ous66i6seeyi6etzfgrue332grh2n7madpwopotugyd\.onion/|
e69ae5b9 388 youtube\.googleapis\.com/) # the various hostnames, with wildcard subdomains
c5e8d7af
PH
389 (?:.*?\#/)? # handle anchor (#/) redirect urls
390 (?: # the various things that can precede the ID:
ac7553d0 391 (?:(?:v|embed|e)/(?!videoseries)) # v/ or embed/ or e/
c5e8d7af 392 |(?: # or the v= param in all its forms
f7000f3a 393 (?:(?:watch|movie)(?:_popup)?(?:\.php)?/?)? # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
c5e8d7af 394 (?:\?|\#!?) # the params delimiter ? or # or #!
040ac686 395 (?:.*?[&;])?? # any other preceding param (like /?s=tuff&v=xxxx or ?s=tuff&amp;v=V36LpHqtcDY)
c5e8d7af
PH
396 v=
397 )
f4b05232 398 ))
cbaed4bb
S
399 |(?:
400 youtu\.be| # just youtu.be/xxxx
6d4fc66b
S
401 vid\.plus| # or vid.plus/xxxx
402 zwearz\.com/watch| # or zwearz.com/watch/xxxx
cbaed4bb 403 )/
edb53e2d 404 |(?:www\.)?cleanvideosearch\.com/media/action/yt/watch\?videoId=
f4b05232 405 )
c5e8d7af 406 )? # all until now is optional -> you can pass the naked ID
8bdd16b4 407 (?P<id>[0-9A-Za-z_-]{11}) # here is it! the YouTube video ID
d0ba5587
S
408 (?!.*?\blist=
409 (?:
410 %(playlist_id)s| # combined list/video URLs are handled by the playlist IE
411 WL # WL are handled by the watch later IE
412 )
413 )
c5e8d7af 414 (?(1).+)? # if we found the ID, everything can follow
d0ba5587 415 $""" % {'playlist_id': YoutubeBaseInfoExtractor._PLAYLIST_ID_RE}
e40c758c 416 _PLAYER_INFO_RE = (
545cc85d 417 r'/(?P<id>[a-zA-Z0-9_-]{8,})/player_ias\.vflset(?:/[a-zA-Z]{2,3}_[a-zA-Z]{2,3})?/base\.js$',
418 r'\b(?P<id>vfl[a-zA-Z0-9_-]+)\b.*?\.js$',
e40c758c 419 )
2c62dc26 420 _formats = {
c2d3cb4c 421 '5': {'ext': 'flv', 'width': 400, 'height': 240, 'acodec': 'mp3', 'abr': 64, 'vcodec': 'h263'},
422 '6': {'ext': 'flv', 'width': 450, 'height': 270, 'acodec': 'mp3', 'abr': 64, 'vcodec': 'h263'},
423 '13': {'ext': '3gp', 'acodec': 'aac', 'vcodec': 'mp4v'},
424 '17': {'ext': '3gp', 'width': 176, 'height': 144, 'acodec': 'aac', 'abr': 24, 'vcodec': 'mp4v'},
425 '18': {'ext': 'mp4', 'width': 640, 'height': 360, 'acodec': 'aac', 'abr': 96, 'vcodec': 'h264'},
426 '22': {'ext': 'mp4', 'width': 1280, 'height': 720, 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264'},
427 '34': {'ext': 'flv', 'width': 640, 'height': 360, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'},
428 '35': {'ext': 'flv', 'width': 854, 'height': 480, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'},
3834d3e3 429 # itag 36 videos are either 320x180 (BaW_jenozKc) or 320x240 (__2ABJjxzNo), abr varies as well
c2d3cb4c 430 '36': {'ext': '3gp', 'width': 320, 'acodec': 'aac', 'vcodec': 'mp4v'},
431 '37': {'ext': 'mp4', 'width': 1920, 'height': 1080, 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264'},
432 '38': {'ext': 'mp4', 'width': 4096, 'height': 3072, 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264'},
433 '43': {'ext': 'webm', 'width': 640, 'height': 360, 'acodec': 'vorbis', 'abr': 128, 'vcodec': 'vp8'},
434 '44': {'ext': 'webm', 'width': 854, 'height': 480, 'acodec': 'vorbis', 'abr': 128, 'vcodec': 'vp8'},
435 '45': {'ext': 'webm', 'width': 1280, 'height': 720, 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8'},
e1a0bfdf 436 '46': {'ext': 'webm', 'width': 1920, 'height': 1080, 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8'},
c2d3cb4c 437 '59': {'ext': 'mp4', 'width': 854, 'height': 480, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'},
438 '78': {'ext': 'mp4', 'width': 854, 'height': 480, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'},
e1a0bfdf 439
440
441 # 3D videos
c2d3cb4c 442 '82': {'ext': 'mp4', 'height': 360, 'format_note': '3D', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -20},
443 '83': {'ext': 'mp4', 'height': 480, 'format_note': '3D', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -20},
444 '84': {'ext': 'mp4', 'height': 720, 'format_note': '3D', 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264', 'preference': -20},
445 '85': {'ext': 'mp4', 'height': 1080, 'format_note': '3D', 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264', 'preference': -20},
e1a0bfdf 446 '100': {'ext': 'webm', 'height': 360, 'format_note': '3D', 'acodec': 'vorbis', 'abr': 128, 'vcodec': 'vp8', 'preference': -20},
447 '101': {'ext': 'webm', 'height': 480, 'format_note': '3D', 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8', 'preference': -20},
448 '102': {'ext': 'webm', 'height': 720, 'format_note': '3D', 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8', 'preference': -20},
836a086c 449
96fb5605 450 # Apple HTTP Live Streaming
11f12195 451 '91': {'ext': 'mp4', 'height': 144, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 48, 'vcodec': 'h264', 'preference': -10},
c2d3cb4c 452 '92': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 48, 'vcodec': 'h264', 'preference': -10},
453 '93': {'ext': 'mp4', 'height': 360, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -10},
454 '94': {'ext': 'mp4', 'height': 480, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -10},
455 '95': {'ext': 'mp4', 'height': 720, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 256, 'vcodec': 'h264', 'preference': -10},
456 '96': {'ext': 'mp4', 'height': 1080, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 256, 'vcodec': 'h264', 'preference': -10},
e1a0bfdf 457 '132': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 48, 'vcodec': 'h264', 'preference': -10},
458 '151': {'ext': 'mp4', 'height': 72, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 24, 'vcodec': 'h264', 'preference': -10},
2c62dc26
PH
459
460 # DASH mp4 video
d23028a8
S
461 '133': {'ext': 'mp4', 'height': 240, 'format_note': 'DASH video', 'vcodec': 'h264'},
462 '134': {'ext': 'mp4', 'height': 360, 'format_note': 'DASH video', 'vcodec': 'h264'},
463 '135': {'ext': 'mp4', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'h264'},
464 '136': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'h264'},
465 '137': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'h264'},
067aa17e 466 '138': {'ext': 'mp4', 'format_note': 'DASH video', 'vcodec': 'h264'}, # Height can vary (https://github.com/ytdl-org/youtube-dl/issues/4559)
d23028a8
S
467 '160': {'ext': 'mp4', 'height': 144, 'format_note': 'DASH video', 'vcodec': 'h264'},
468 '212': {'ext': 'mp4', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'h264'},
469 '264': {'ext': 'mp4', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'h264'},
470 '298': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'h264', 'fps': 60},
471 '299': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'h264', 'fps': 60},
472 '266': {'ext': 'mp4', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'h264'},
836a086c 473
f6f1fc92 474 # Dash mp4 audio
d23028a8
S
475 '139': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 48, 'container': 'm4a_dash'},
476 '140': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 128, 'container': 'm4a_dash'},
477 '141': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 256, 'container': 'm4a_dash'},
478 '256': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'container': 'm4a_dash'},
479 '258': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'container': 'm4a_dash'},
480 '325': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'dtse', 'container': 'm4a_dash'},
481 '328': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'ec-3', 'container': 'm4a_dash'},
836a086c
AZ
482
483 # Dash webm
d23028a8
S
484 '167': {'ext': 'webm', 'height': 360, 'width': 640, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
485 '168': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
486 '169': {'ext': 'webm', 'height': 720, 'width': 1280, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
487 '170': {'ext': 'webm', 'height': 1080, 'width': 1920, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
488 '218': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
489 '219': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
490 '278': {'ext': 'webm', 'height': 144, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp9'},
491 '242': {'ext': 'webm', 'height': 240, 'format_note': 'DASH video', 'vcodec': 'vp9'},
492 '243': {'ext': 'webm', 'height': 360, 'format_note': 'DASH video', 'vcodec': 'vp9'},
493 '244': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'vp9'},
494 '245': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'vp9'},
495 '246': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'vp9'},
496 '247': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'vp9'},
497 '248': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'vp9'},
498 '271': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'vp9'},
4c6b4764 499 # itag 272 videos are either 3840x2160 (e.g. RtoitU2A-3E) or 7680x4320 (sLprVF6d7Ug)
d23028a8
S
500 '272': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'vp9'},
501 '302': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60},
502 '303': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60},
503 '308': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60},
504 '313': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'vp9'},
505 '315': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60},
2c62dc26
PH
506
507 # Dash webm audio
d23028a8
S
508 '171': {'ext': 'webm', 'acodec': 'vorbis', 'format_note': 'DASH audio', 'abr': 128},
509 '172': {'ext': 'webm', 'acodec': 'vorbis', 'format_note': 'DASH audio', 'abr': 256},
ce6b9a2d 510
0857baad 511 # Dash webm audio with opus inside
d23028a8
S
512 '249': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 50},
513 '250': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 70},
514 '251': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 160},
0857baad 515
ce6b9a2d
PH
516 # RTMP (unnamed)
517 '_rtmp': {'protocol': 'rtmp'},
b85eae0f
S
518
519 # av01 video only formats sometimes served with "unknown" codecs
520 '394': {'acodec': 'none', 'vcodec': 'av01.0.05M.08'},
521 '395': {'acodec': 'none', 'vcodec': 'av01.0.05M.08'},
522 '396': {'acodec': 'none', 'vcodec': 'av01.0.05M.08'},
523 '397': {'acodec': 'none', 'vcodec': 'av01.0.05M.08'},
c5e8d7af 524 }
29f7c58a 525 _SUBTITLE_FORMATS = ('json3', 'srv1', 'srv2', 'srv3', 'ttml', 'vtt')
836a086c 526
fd5c4aab
S
527 _GEO_BYPASS = False
528
78caa52a 529 IE_NAME = 'youtube'
2eb88d95
PH
530 _TESTS = [
531 {
2d3d2997 532 'url': 'https://www.youtube.com/watch?v=BaW_jenozKc&t=1s&end=9',
4bc3a23e
PH
533 'info_dict': {
534 'id': 'BaW_jenozKc',
535 'ext': 'mp4',
3867038a 536 'title': 'youtube-dl test video "\'/\\ä↭𝕐',
4bc3a23e
PH
537 'uploader': 'Philipp Hagemeister',
538 'uploader_id': 'phihag',
ec85ded8 539 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/phihag',
dd4c4492
S
540 'channel_id': 'UCLqxVugv74EIW3VWh2NOa3Q',
541 'channel_url': r're:https?://(?:www\.)?youtube\.com/channel/UCLqxVugv74EIW3VWh2NOa3Q',
4bc3a23e 542 'upload_date': '20121002',
3867038a 543 'description': 'test chars: "\'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de .',
4bc3a23e 544 'categories': ['Science & Technology'],
3867038a 545 'tags': ['youtube-dl'],
556dbe7f 546 'duration': 10,
dbdaaa23 547 'view_count': int,
3e7c1224
PH
548 'like_count': int,
549 'dislike_count': int,
7c80519c 550 'start_time': 1,
297a564b 551 'end_time': 9,
2eb88d95 552 }
0e853ca4 553 },
fccd3771 554 {
4bc3a23e
PH
555 'url': '//www.YouTube.com/watch?v=yZIXLfi8CZQ',
556 'note': 'Embed-only video (#1746)',
557 'info_dict': {
558 'id': 'yZIXLfi8CZQ',
559 'ext': 'mp4',
560 'upload_date': '20120608',
561 'title': 'Principal Sexually Assaults A Teacher - Episode 117 - 8th June 2012',
562 'description': 'md5:09b78bd971f1e3e289601dfba15ca4f7',
563 'uploader': 'SET India',
94bfcd23 564 'uploader_id': 'setindia',
ec85ded8 565 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/setindia',
94bfcd23 566 'age_limit': 18,
545cc85d 567 },
568 'skip': 'Private video',
fccd3771 569 },
11b56058 570 {
8bdd16b4 571 'url': 'https://www.youtube.com/watch?v=BaW_jenozKc&v=yZIXLfi8CZQ',
11b56058
PM
572 'note': 'Use the first video ID in the URL',
573 'info_dict': {
574 'id': 'BaW_jenozKc',
575 'ext': 'mp4',
3867038a 576 'title': 'youtube-dl test video "\'/\\ä↭𝕐',
11b56058
PM
577 'uploader': 'Philipp Hagemeister',
578 'uploader_id': 'phihag',
ec85ded8 579 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/phihag',
11b56058 580 'upload_date': '20121002',
3867038a 581 'description': 'test chars: "\'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de .',
11b56058 582 'categories': ['Science & Technology'],
3867038a 583 'tags': ['youtube-dl'],
556dbe7f 584 'duration': 10,
dbdaaa23 585 'view_count': int,
11b56058
PM
586 'like_count': int,
587 'dislike_count': int,
34a7de29
S
588 },
589 'params': {
590 'skip_download': True,
591 },
11b56058 592 },
dd27fd17 593 {
2d3d2997 594 'url': 'https://www.youtube.com/watch?v=a9LDPn-MO4I',
4bc3a23e
PH
595 'note': '256k DASH audio (format 141) via DASH manifest',
596 'info_dict': {
597 'id': 'a9LDPn-MO4I',
598 'ext': 'm4a',
599 'upload_date': '20121002',
600 'uploader_id': '8KVIDEO',
ec85ded8 601 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/8KVIDEO',
4bc3a23e
PH
602 'description': '',
603 'uploader': '8KVIDEO',
604 'title': 'UHDTV TEST 8K VIDEO.mp4'
4919603f 605 },
4bc3a23e
PH
606 'params': {
607 'youtube_include_dash_manifest': True,
608 'format': '141',
4919603f 609 },
de3c7fe0 610 'skip': 'format 141 not served anymore',
dd27fd17 611 },
8bdd16b4 612 # DASH manifest with encrypted signature
613 {
614 'url': 'https://www.youtube.com/watch?v=IB3lcPjvWLA',
615 'info_dict': {
616 'id': 'IB3lcPjvWLA',
617 'ext': 'm4a',
618 'title': 'Afrojack, Spree Wilson - The Spark (Official Music Video) ft. Spree Wilson',
619 'description': 'md5:8f5e2b82460520b619ccac1f509d43bf',
620 'duration': 244,
621 'uploader': 'AfrojackVEVO',
622 'uploader_id': 'AfrojackVEVO',
623 'upload_date': '20131011',
624 },
625 'params': {
626 'youtube_include_dash_manifest': True,
627 'format': '141/bestaudio[ext=m4a]',
628 },
629 },
aa79ac0c
PH
630 # Controversy video
631 {
632 'url': 'https://www.youtube.com/watch?v=T4XJQO3qol8',
633 'info_dict': {
634 'id': 'T4XJQO3qol8',
635 'ext': 'mp4',
556dbe7f 636 'duration': 219,
aa79ac0c 637 'upload_date': '20100909',
4fe54c12 638 'uploader': 'Amazing Atheist',
aa79ac0c 639 'uploader_id': 'TheAmazingAtheist',
ec85ded8 640 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/TheAmazingAtheist',
aa79ac0c 641 'title': 'Burning Everyone\'s Koran',
545cc85d 642 'description': 'SUBSCRIBE: http://www.youtube.com/saturninefilms \r\n\r\nEven Obama has taken a stand against freedom on this issue: http://www.huffingtonpost.com/2010/09/09/obama-gma-interview-quran_n_710282.html',
aa79ac0c 643 }
c522adb1 644 },
dd2d55f1 645 # Normal age-gate video (embed allowed)
c522adb1 646 {
2d3d2997 647 'url': 'https://youtube.com/watch?v=HtVdAasjOgU',
c522adb1
JMF
648 'info_dict': {
649 'id': 'HtVdAasjOgU',
650 'ext': 'mp4',
651 'title': 'The Witcher 3: Wild Hunt - The Sword Of Destiny Trailer',
ec85ded8 652 'description': r're:(?s).{100,}About the Game\n.*?The Witcher 3: Wild Hunt.{100,}',
556dbe7f 653 'duration': 142,
c522adb1
JMF
654 'uploader': 'The Witcher',
655 'uploader_id': 'WitcherGame',
ec85ded8 656 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/WitcherGame',
c522adb1 657 'upload_date': '20140605',
34952f09 658 'age_limit': 18,
c522adb1
JMF
659 },
660 },
8bdd16b4 661 # video_info is None (https://github.com/ytdl-org/youtube-dl/issues/4421)
662 # YouTube Red ad is not captured for creator
663 {
664 'url': '__2ABJjxzNo',
665 'info_dict': {
666 'id': '__2ABJjxzNo',
667 'ext': 'mp4',
668 'duration': 266,
669 'upload_date': '20100430',
670 'uploader_id': 'deadmau5',
671 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/deadmau5',
545cc85d 672 'creator': 'deadmau5',
673 'description': 'md5:6cbcd3a92ce1bc676fc4d6ab4ace2336',
8bdd16b4 674 'uploader': 'deadmau5',
675 'title': 'Deadmau5 - Some Chords (HD)',
545cc85d 676 'alt_title': 'Some Chords',
8bdd16b4 677 },
678 'expected_warnings': [
679 'DASH manifest missing',
680 ]
681 },
067aa17e 682 # Olympics (https://github.com/ytdl-org/youtube-dl/issues/4431)
e52a40ab
PH
683 {
684 'url': 'lqQg6PlCWgI',
685 'info_dict': {
686 'id': 'lqQg6PlCWgI',
687 'ext': 'mp4',
556dbe7f 688 'duration': 6085,
90227264 689 'upload_date': '20150827',
cbe2bd91 690 'uploader_id': 'olympic',
ec85ded8 691 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/olympic',
cbe2bd91 692 'description': 'HO09 - Women - GER-AUS - Hockey - 31 July 2012 - London 2012 Olympic Games',
be49068d 693 'uploader': 'Olympic',
cbe2bd91
PH
694 'title': 'Hockey - Women - GER-AUS - London 2012 Olympic Games',
695 },
696 'params': {
697 'skip_download': 'requires avconv',
e52a40ab 698 }
cbe2bd91 699 },
6271f1ca
PH
700 # Non-square pixels
701 {
702 'url': 'https://www.youtube.com/watch?v=_b-2C3KPAM0',
703 'info_dict': {
704 'id': '_b-2C3KPAM0',
705 'ext': 'mp4',
706 'stretched_ratio': 16 / 9.,
556dbe7f 707 'duration': 85,
6271f1ca
PH
708 'upload_date': '20110310',
709 'uploader_id': 'AllenMeow',
ec85ded8 710 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/AllenMeow',
6271f1ca 711 'description': 'made by Wacom from Korea | 字幕&加油添醋 by TY\'s Allen | 感謝heylisa00cavey1001同學熱情提供梗及翻譯',
eb6793ba 712 'uploader': '孫ᄋᄅ',
6271f1ca
PH
713 'title': '[A-made] 變態妍字幕版 太妍 我就是這樣的人',
714 },
06b491eb
S
715 },
716 # url_encoded_fmt_stream_map is empty string
717 {
718 'url': 'qEJwOuvDf7I',
719 'info_dict': {
720 'id': 'qEJwOuvDf7I',
f57b7835 721 'ext': 'webm',
06b491eb
S
722 'title': 'Обсуждение судебной практики по выборам 14 сентября 2014 года в Санкт-Петербурге',
723 'description': '',
724 'upload_date': '20150404',
725 'uploader_id': 'spbelect',
726 'uploader': 'Наблюдатели Петербурга',
727 },
728 'params': {
729 'skip_download': 'requires avconv',
e323cf3f
S
730 },
731 'skip': 'This live event has ended.',
06b491eb 732 },
067aa17e 733 # Extraction from multiple DASH manifests (https://github.com/ytdl-org/youtube-dl/pull/6097)
da77d856
S
734 {
735 'url': 'https://www.youtube.com/watch?v=FIl7x6_3R5Y',
736 'info_dict': {
737 'id': 'FIl7x6_3R5Y',
eb6793ba 738 'ext': 'webm',
da77d856
S
739 'title': 'md5:7b81415841e02ecd4313668cde88737a',
740 'description': 'md5:116377fd2963b81ec4ce64b542173306',
556dbe7f 741 'duration': 220,
da77d856
S
742 'upload_date': '20150625',
743 'uploader_id': 'dorappi2000',
ec85ded8 744 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/dorappi2000',
da77d856 745 'uploader': 'dorappi2000',
eb6793ba 746 'formats': 'mincount:31',
da77d856 747 },
eb6793ba 748 'skip': 'not actual anymore',
2ee8f5d8 749 },
8a1a26ce
YCH
750 # DASH manifest with segment_list
751 {
752 'url': 'https://www.youtube.com/embed/CsmdDsKjzN8',
753 'md5': '8ce563a1d667b599d21064e982ab9e31',
754 'info_dict': {
755 'id': 'CsmdDsKjzN8',
756 'ext': 'mp4',
17ee98e1 757 'upload_date': '20150501', # According to '<meta itemprop="datePublished"', but in other places it's 20150510
8a1a26ce
YCH
758 'uploader': 'Airtek',
759 'description': 'Retransmisión en directo de la XVIII media maratón de Zaragoza.',
760 'uploader_id': 'UCzTzUmjXxxacNnL8I3m4LnQ',
761 'title': 'Retransmisión XVIII Media maratón Zaragoza 2015',
762 },
763 'params': {
764 'youtube_include_dash_manifest': True,
765 'format': '135', # bestvideo
be49068d
S
766 },
767 'skip': 'This live event has ended.',
2ee8f5d8 768 },
cf7e015f
S
769 {
770 # Multifeed videos (multiple cameras), URL is for Main Camera
545cc85d 771 'url': 'https://www.youtube.com/watch?v=jvGDaLqkpTg',
cf7e015f 772 'info_dict': {
545cc85d 773 'id': 'jvGDaLqkpTg',
774 'title': 'Tom Clancy Free Weekend Rainbow Whatever',
775 'description': 'md5:e03b909557865076822aa169218d6a5d',
cf7e015f
S
776 },
777 'playlist': [{
778 'info_dict': {
545cc85d 779 'id': 'jvGDaLqkpTg',
cf7e015f 780 'ext': 'mp4',
545cc85d 781 'title': 'Tom Clancy Free Weekend Rainbow Whatever (Main Camera)',
782 'description': 'md5:e03b909557865076822aa169218d6a5d',
783 'duration': 10643,
784 'upload_date': '20161111',
785 'uploader': 'Team PGP',
786 'uploader_id': 'UChORY56LMMETTuGjXaJXvLg',
787 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UChORY56LMMETTuGjXaJXvLg',
cf7e015f
S
788 },
789 }, {
790 'info_dict': {
545cc85d 791 'id': '3AKt1R1aDnw',
cf7e015f 792 'ext': 'mp4',
545cc85d 793 'title': 'Tom Clancy Free Weekend Rainbow Whatever (Camera 2)',
794 'description': 'md5:e03b909557865076822aa169218d6a5d',
795 'duration': 10991,
796 'upload_date': '20161111',
797 'uploader': 'Team PGP',
798 'uploader_id': 'UChORY56LMMETTuGjXaJXvLg',
799 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UChORY56LMMETTuGjXaJXvLg',
cf7e015f
S
800 },
801 }, {
802 'info_dict': {
545cc85d 803 'id': 'RtAMM00gpVc',
cf7e015f 804 'ext': 'mp4',
545cc85d 805 'title': 'Tom Clancy Free Weekend Rainbow Whatever (Camera 3)',
806 'description': 'md5:e03b909557865076822aa169218d6a5d',
807 'duration': 10995,
808 'upload_date': '20161111',
809 'uploader': 'Team PGP',
810 'uploader_id': 'UChORY56LMMETTuGjXaJXvLg',
811 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UChORY56LMMETTuGjXaJXvLg',
cf7e015f
S
812 },
813 }, {
814 'info_dict': {
545cc85d 815 'id': '6N2fdlP3C5U',
cf7e015f 816 'ext': 'mp4',
545cc85d 817 'title': 'Tom Clancy Free Weekend Rainbow Whatever (Camera 4)',
818 'description': 'md5:e03b909557865076822aa169218d6a5d',
819 'duration': 10990,
820 'upload_date': '20161111',
821 'uploader': 'Team PGP',
822 'uploader_id': 'UChORY56LMMETTuGjXaJXvLg',
823 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UChORY56LMMETTuGjXaJXvLg',
cf7e015f
S
824 },
825 }],
826 'params': {
827 'skip_download': True,
828 },
cbaed4bb 829 },
f9f49d87 830 {
067aa17e 831 # Multifeed video with comma in title (see https://github.com/ytdl-org/youtube-dl/issues/8536)
f9f49d87
S
832 'url': 'https://www.youtube.com/watch?v=gVfLd0zydlo',
833 'info_dict': {
834 'id': 'gVfLd0zydlo',
835 'title': 'DevConf.cz 2016 Day 2 Workshops 1 14:00 - 15:30',
836 },
837 'playlist_count': 2,
be49068d 838 'skip': 'Not multifeed anymore',
f9f49d87 839 },
cbaed4bb 840 {
2d3d2997 841 'url': 'https://vid.plus/FlRa-iH7PGw',
cbaed4bb 842 'only_matching': True,
0e49d9a6 843 },
6d4fc66b 844 {
2d3d2997 845 'url': 'https://zwearz.com/watch/9lWxNJF-ufM/electra-woman-dyna-girl-official-trailer-grace-helbig.html',
6d4fc66b
S
846 'only_matching': True,
847 },
0e49d9a6 848 {
067aa17e 849 # Title with JS-like syntax "};" (see https://github.com/ytdl-org/youtube-dl/issues/7468)
a8776b10 850 # Also tests cut-off URL expansion in video description (see
067aa17e
S
851 # https://github.com/ytdl-org/youtube-dl/issues/1892,
852 # https://github.com/ytdl-org/youtube-dl/issues/8164)
0e49d9a6
LL
853 'url': 'https://www.youtube.com/watch?v=lsguqyKfVQg',
854 'info_dict': {
855 'id': 'lsguqyKfVQg',
856 'ext': 'mp4',
857 'title': '{dark walk}; Loki/AC/Dishonored; collab w/Elflover21',
eb6793ba 858 'alt_title': 'Dark Walk - Position Music',
0e49d9a6 859 'description': 'md5:8085699c11dc3f597ce0410b0dcbb34a',
556dbe7f 860 'duration': 133,
0e49d9a6
LL
861 'upload_date': '20151119',
862 'uploader_id': 'IronSoulElf',
ec85ded8 863 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/IronSoulElf',
0e49d9a6 864 'uploader': 'IronSoulElf',
eb6793ba
S
865 'creator': 'Todd Haberman, Daniel Law Heath and Aaron Kaplan',
866 'track': 'Dark Walk - Position Music',
867 'artist': 'Todd Haberman, Daniel Law Heath and Aaron Kaplan',
92bc97d3 868 'album': 'Position Music - Production Music Vol. 143 - Dark Walk',
0e49d9a6
LL
869 },
870 'params': {
871 'skip_download': True,
872 },
873 },
61f92af1 874 {
067aa17e 875 # Tags with '};' (see https://github.com/ytdl-org/youtube-dl/issues/7468)
61f92af1
S
876 'url': 'https://www.youtube.com/watch?v=Ms7iBXnlUO8',
877 'only_matching': True,
878 },
313dfc45
LL
879 {
880 # Video with yt:stretch=17:0
881 'url': 'https://www.youtube.com/watch?v=Q39EVAstoRM',
882 'info_dict': {
883 'id': 'Q39EVAstoRM',
884 'ext': 'mp4',
885 'title': 'Clash Of Clans#14 Dicas De Ataque Para CV 4',
886 'description': 'md5:ee18a25c350637c8faff806845bddee9',
887 'upload_date': '20151107',
888 'uploader_id': 'UCCr7TALkRbo3EtFzETQF1LA',
889 'uploader': 'CH GAMER DROID',
890 },
891 'params': {
892 'skip_download': True,
893 },
be49068d 894 'skip': 'This video does not exist.',
313dfc45 895 },
7caf9830
S
896 {
897 # Video licensed under Creative Commons
898 'url': 'https://www.youtube.com/watch?v=M4gD1WSo5mA',
899 'info_dict': {
900 'id': 'M4gD1WSo5mA',
901 'ext': 'mp4',
902 'title': 'md5:e41008789470fc2533a3252216f1c1d1',
903 'description': 'md5:a677553cf0840649b731a3024aeff4cc',
556dbe7f 904 'duration': 721,
7caf9830
S
905 'upload_date': '20150127',
906 'uploader_id': 'BerkmanCenter',
ec85ded8 907 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/BerkmanCenter',
556dbe7f 908 'uploader': 'The Berkman Klein Center for Internet & Society',
7caf9830
S
909 'license': 'Creative Commons Attribution license (reuse allowed)',
910 },
911 'params': {
912 'skip_download': True,
913 },
914 },
fd050249
S
915 {
916 # Channel-like uploader_url
917 'url': 'https://www.youtube.com/watch?v=eQcmzGIKrzg',
918 'info_dict': {
919 'id': 'eQcmzGIKrzg',
920 'ext': 'mp4',
921 'title': 'Democratic Socialism and Foreign Policy | Bernie Sanders',
545cc85d 922 'description': 'md5:13a2503d7b5904ef4b223aa101628f39',
556dbe7f 923 'duration': 4060,
fd050249 924 'upload_date': '20151119',
eb6793ba 925 'uploader': 'Bernie Sanders',
fd050249 926 'uploader_id': 'UCH1dpzjCEiGAt8CXkryhkZg',
ec85ded8 927 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UCH1dpzjCEiGAt8CXkryhkZg',
fd050249
S
928 'license': 'Creative Commons Attribution license (reuse allowed)',
929 },
930 'params': {
931 'skip_download': True,
932 },
933 },
040ac686
S
934 {
935 'url': 'https://www.youtube.com/watch?feature=player_embedded&amp;amp;v=V36LpHqtcDY',
936 'only_matching': True,
7f29cf54
S
937 },
938 {
067aa17e 939 # YouTube Red paid video (https://github.com/ytdl-org/youtube-dl/issues/10059)
7f29cf54
S
940 'url': 'https://www.youtube.com/watch?v=i1Ko8UG-Tdo',
941 'only_matching': True,
6496ccb4
S
942 },
943 {
944 # Rental video preview
945 'url': 'https://www.youtube.com/watch?v=yYr8q0y5Jfg',
946 'info_dict': {
947 'id': 'uGpuVWrhIzE',
948 'ext': 'mp4',
949 'title': 'Piku - Trailer',
950 'description': 'md5:c36bd60c3fd6f1954086c083c72092eb',
951 'upload_date': '20150811',
952 'uploader': 'FlixMatrix',
953 'uploader_id': 'FlixMatrixKaravan',
ec85ded8 954 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/FlixMatrixKaravan',
6496ccb4
S
955 'license': 'Standard YouTube License',
956 },
957 'params': {
958 'skip_download': True,
959 },
eb6793ba 960 'skip': 'This video is not available.',
022a5d66 961 },
12afdc2a
S
962 {
963 # YouTube Red video with episode data
964 'url': 'https://www.youtube.com/watch?v=iqKdEhx-dD4',
965 'info_dict': {
966 'id': 'iqKdEhx-dD4',
967 'ext': 'mp4',
968 'title': 'Isolation - Mind Field (Ep 1)',
545cc85d 969 'description': 'md5:f540112edec5d09fc8cc752d3d4ba3cd',
556dbe7f 970 'duration': 2085,
12afdc2a
S
971 'upload_date': '20170118',
972 'uploader': 'Vsauce',
973 'uploader_id': 'Vsauce',
974 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/Vsauce',
12afdc2a
S
975 'series': 'Mind Field',
976 'season_number': 1,
977 'episode_number': 1,
978 },
979 'params': {
980 'skip_download': True,
981 },
982 'expected_warnings': [
983 'Skipping DASH manifest',
984 ],
985 },
c7121fa7
S
986 {
987 # The following content has been identified by the YouTube community
988 # as inappropriate or offensive to some audiences.
989 'url': 'https://www.youtube.com/watch?v=6SJNVb0GnPI',
990 'info_dict': {
991 'id': '6SJNVb0GnPI',
992 'ext': 'mp4',
993 'title': 'Race Differences in Intelligence',
994 'description': 'md5:5d161533167390427a1f8ee89a1fc6f1',
995 'duration': 965,
996 'upload_date': '20140124',
997 'uploader': 'New Century Foundation',
998 'uploader_id': 'UCEJYpZGqgUob0zVVEaLhvVg',
999 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UCEJYpZGqgUob0zVVEaLhvVg',
c7121fa7
S
1000 },
1001 'params': {
1002 'skip_download': True,
1003 },
545cc85d 1004 'skip': 'This video has been removed for violating YouTube\'s policy on hate speech.',
c7121fa7 1005 },
022a5d66
S
1006 {
1007 # itag 212
1008 'url': '1t24XAntNCY',
1009 'only_matching': True,
fd5c4aab
S
1010 },
1011 {
1012 # geo restricted to JP
1013 'url': 'sJL6WA-aGkQ',
1014 'only_matching': True,
1015 },
cd5a74a2
S
1016 {
1017 'url': 'https://invidio.us/watch?v=BaW_jenozKc',
1018 'only_matching': True,
1019 },
825cd268
RA
1020 {
1021 # DRM protected
1022 'url': 'https://www.youtube.com/watch?v=s7_qI6_mIXc',
1023 'only_matching': True,
4fe54c12
S
1024 },
1025 {
1026 # Video with unsupported adaptive stream type formats
1027 'url': 'https://www.youtube.com/watch?v=Z4Vy8R84T1U',
1028 'info_dict': {
1029 'id': 'Z4Vy8R84T1U',
1030 'ext': 'mp4',
1031 'title': 'saman SMAN 53 Jakarta(Sancety) opening COFFEE4th at SMAN 53 Jakarta',
1032 'description': 'md5:d41d8cd98f00b204e9800998ecf8427e',
1033 'duration': 433,
1034 'upload_date': '20130923',
1035 'uploader': 'Amelia Putri Harwita',
1036 'uploader_id': 'UCpOxM49HJxmC1qCalXyB3_Q',
1037 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UCpOxM49HJxmC1qCalXyB3_Q',
1038 'formats': 'maxcount:10',
1039 },
1040 'params': {
1041 'skip_download': True,
1042 'youtube_include_dash_manifest': False,
1043 },
5429d6a9 1044 'skip': 'not actual anymore',
5caabd3c 1045 },
1046 {
822b9d9c 1047 # Youtube Music Auto-generated description
5caabd3c 1048 'url': 'https://music.youtube.com/watch?v=MgNrAu2pzNs',
1049 'info_dict': {
1050 'id': 'MgNrAu2pzNs',
1051 'ext': 'mp4',
1052 'title': 'Voyeur Girl',
1053 'description': 'md5:7ae382a65843d6df2685993e90a8628f',
1054 'upload_date': '20190312',
5429d6a9
S
1055 'uploader': 'Stephen - Topic',
1056 'uploader_id': 'UC-pWHpBjdGG69N9mM2auIAA',
5caabd3c 1057 'artist': 'Stephen',
1058 'track': 'Voyeur Girl',
1059 'album': 'it\'s too much love to know my dear',
1060 'release_date': '20190313',
1061 'release_year': 2019,
1062 },
1063 'params': {
1064 'skip_download': True,
1065 },
1066 },
66b48727
RA
1067 {
1068 'url': 'https://www.youtubekids.com/watch?v=3b8nCWDgZ6Q',
1069 'only_matching': True,
1070 },
011e75e6
S
1071 {
1072 # invalid -> valid video id redirection
1073 'url': 'DJztXj2GPfl',
1074 'info_dict': {
1075 'id': 'DJztXj2GPfk',
1076 'ext': 'mp4',
1077 'title': 'Panjabi MC - Mundian To Bach Ke (The Dictator Soundtrack)',
1078 'description': 'md5:bf577a41da97918e94fa9798d9228825',
1079 'upload_date': '20090125',
1080 'uploader': 'Prochorowka',
1081 'uploader_id': 'Prochorowka',
1082 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/Prochorowka',
1083 'artist': 'Panjabi MC',
1084 'track': 'Beware of the Boys (Mundian to Bach Ke) - Motivo Hi-Lectro Remix',
1085 'album': 'Beware of the Boys (Mundian To Bach Ke)',
1086 },
1087 'params': {
1088 'skip_download': True,
1089 },
545cc85d 1090 'skip': 'Video unavailable',
ea74e00b
DP
1091 },
1092 {
1093 # empty description results in an empty string
1094 'url': 'https://www.youtube.com/watch?v=x41yOUIvK2k',
1095 'info_dict': {
1096 'id': 'x41yOUIvK2k',
1097 'ext': 'mp4',
1098 'title': 'IMG 3456',
1099 'description': '',
1100 'upload_date': '20170613',
1101 'uploader_id': 'ElevageOrVert',
1102 'uploader': 'ElevageOrVert',
1103 },
1104 'params': {
1105 'skip_download': True,
1106 },
1107 },
a0566bbf 1108 {
29f7c58a 1109 # with '};' inside yt initial data (see [1])
1110 # see [2] for an example with '};' inside ytInitialPlayerResponse
1111 # 1. https://github.com/ytdl-org/youtube-dl/issues/27093
1112 # 2. https://github.com/ytdl-org/youtube-dl/issues/27216
a0566bbf 1113 'url': 'https://www.youtube.com/watch?v=CHqg6qOn4no',
1114 'info_dict': {
1115 'id': 'CHqg6qOn4no',
1116 'ext': 'mp4',
1117 'title': 'Part 77 Sort a list of simple types in c#',
1118 'description': 'md5:b8746fa52e10cdbf47997903f13b20dc',
1119 'upload_date': '20130831',
1120 'uploader_id': 'kudvenkat',
1121 'uploader': 'kudvenkat',
1122 },
1123 'params': {
1124 'skip_download': True,
1125 },
1126 },
29f7c58a 1127 {
1128 # another example of '};' in ytInitialData
1129 'url': 'https://www.youtube.com/watch?v=gVfgbahppCY',
1130 'only_matching': True,
1131 },
1132 {
1133 'url': 'https://www.youtube.com/watch_popup?v=63RmMXCd_bQ',
1134 'only_matching': True,
1135 },
545cc85d 1136 {
1137 # Age-gated video only available with authentication (unavailable
1138 # via embed page workaround)
1139 'url': 'XgnwCQzjau8',
1140 'only_matching': True,
1141 },
2eb88d95
PH
1142 ]
1143
e0df6211
PH
def __init__(self, *args, **kwargs):
    """Set up the extractor with empty per-instance caches."""
    super(YoutubeIE, self).__init__(*args, **kwargs)
    # Signature functions that were already built, keyed by
    # (player URL, signature shape)
    self._player_cache = {}
    # Downloaded player source code, keyed by player id
    self._code_cache = {}
e0df6211 1148
60064c53
PH
1149 def _signature_cache_id(self, example_sig):
1150 """ Return a string representation of a signature """
78caa52a 1151 return '.'.join(compat_str(len(part)) for part in example_sig.split('.'))
60064c53 1152
e40c758c
S
1153 @classmethod
1154 def _extract_player_info(cls, player_url):
1155 for player_re in cls._PLAYER_INFO_RE:
1156 id_m = re.search(player_re, player_url)
1157 if id_m:
1158 break
1159 else:
c081b35c 1160 raise ExtractorError('Cannot identify player %r' % player_url)
545cc85d 1161 return id_m.group('id')
e40c758c
S
1162
1163 def _extract_signature_function(self, video_id, player_url, example_sig):
545cc85d 1164 player_id = self._extract_player_info(player_url)
e0df6211 1165
c4417ddb 1166 # Read from filesystem cache
545cc85d 1167 func_id = 'js_%s_%s' % (
1168 player_id, self._signature_cache_id(example_sig))
c4417ddb 1169 assert os.path.basename(func_id) == func_id
a0e07d31 1170
69ea8ca4 1171 cache_spec = self._downloader.cache.load('youtube-sigfuncs', func_id)
a0e07d31 1172 if cache_spec is not None:
78caa52a 1173 return lambda s: ''.join(s[i] for i in cache_spec)
83799698 1174
545cc85d 1175 if player_id not in self._code_cache:
1176 self._code_cache[player_id] = self._download_webpage(
e0df6211 1177 player_url, video_id,
545cc85d 1178 note='Downloading player ' + player_id,
69ea8ca4 1179 errnote='Download of %s failed' % player_url)
545cc85d 1180 code = self._code_cache[player_id]
1181 res = self._parse_sig_js(code)
e0df6211 1182
785521bf
PH
1183 test_string = ''.join(map(compat_chr, range(len(example_sig))))
1184 cache_res = res(test_string)
1185 cache_spec = [ord(c) for c in cache_res]
83799698 1186
69ea8ca4 1187 self._downloader.cache.store('youtube-sigfuncs', func_id, cache_spec)
83799698
PH
1188 return res
1189
60064c53 1190 def _print_sig_code(self, func, example_sig):
edf3e38e
PH
1191 def gen_sig_code(idxs):
1192 def _genslice(start, end, step):
78caa52a 1193 starts = '' if start == 0 else str(start)
8bcc8756 1194 ends = (':%d' % (end + step)) if end + step >= 0 else ':'
69ea8ca4 1195 steps = '' if step == 1 else (':%d' % step)
78caa52a 1196 return 's[%s%s%s]' % (starts, ends, steps)
edf3e38e
PH
1197
1198 step = None
7af808a5
PH
1199 # Quelch pyflakes warnings - start will be set when step is set
1200 start = '(Never used)'
edf3e38e
PH
1201 for i, prev in zip(idxs[1:], idxs[:-1]):
1202 if step is not None:
1203 if i - prev == step:
1204 continue
1205 yield _genslice(start, prev, step)
1206 step = None
1207 continue
1208 if i - prev in [-1, 1]:
1209 step = i - prev
1210 start = prev
1211 continue
1212 else:
78caa52a 1213 yield 's[%d]' % prev
edf3e38e 1214 if step is None:
78caa52a 1215 yield 's[%d]' % i
edf3e38e
PH
1216 else:
1217 yield _genslice(start, i, step)
1218
78caa52a 1219 test_string = ''.join(map(compat_chr, range(len(example_sig))))
c705320f 1220 cache_res = func(test_string)
edf3e38e 1221 cache_spec = [ord(c) for c in cache_res]
78caa52a 1222 expr_code = ' + '.join(gen_sig_code(cache_spec))
60064c53
PH
1223 signature_id_tuple = '(%s)' % (
1224 ', '.join(compat_str(len(p)) for p in example_sig.split('.')))
69ea8ca4 1225 code = ('if tuple(len(p) for p in s.split(\'.\')) == %s:\n'
78caa52a 1226 ' return %s\n') % (signature_id_tuple, expr_code)
69ea8ca4 1227 self.to_screen('Extracted signature function:\n' + code)
edf3e38e 1228
e0df6211
PH
def _parse_sig_js(self, jscode):
    """Build a Python callable from the signature function in jscode.

    The name of the signature function is located with a list of regular
    expressions tried in order; the function is then extracted with
    JSInterpreter. The returned callable takes the scrambled signature
    string and passes it to the extracted function as its only argument.
    """
    name_patterns = (
        r'\b[cs]\s*&&\s*[adf]\.set\([^,]+\s*,\s*encodeURIComponent\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\(',
        r'\b[a-zA-Z0-9]+\s*&&\s*[a-zA-Z0-9]+\.set\([^,]+\s*,\s*encodeURIComponent\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\(',
        r'(?:\b|[^a-zA-Z0-9$])(?P<sig>[a-zA-Z0-9$]{2})\s*=\s*function\(\s*a\s*\)\s*{\s*a\s*=\s*a\.split\(\s*""\s*\)',
        r'(?P<sig>[a-zA-Z0-9$]+)\s*=\s*function\(\s*a\s*\)\s*{\s*a\s*=\s*a\.split\(\s*""\s*\)',
        # Obsolete patterns
        r'(["\'])signature\1\s*,\s*(?P<sig>[a-zA-Z0-9$]+)\(',
        r'\.sig\|\|(?P<sig>[a-zA-Z0-9$]+)\(',
        r'yt\.akamaized\.net/\)\s*\|\|\s*.*?\s*[cs]\s*&&\s*[adf]\.set\([^,]+\s*,\s*(?:encodeURIComponent\s*\()?\s*(?P<sig>[a-zA-Z0-9$]+)\(',
        r'\b[cs]\s*&&\s*[adf]\.set\([^,]+\s*,\s*(?P<sig>[a-zA-Z0-9$]+)\(',
        r'\b[a-zA-Z0-9]+\s*&&\s*[a-zA-Z0-9]+\.set\([^,]+\s*,\s*(?P<sig>[a-zA-Z0-9$]+)\(',
        r'\bc\s*&&\s*a\.set\([^,]+\s*,\s*\([^)]*\)\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\(',
        r'\bc\s*&&\s*[a-zA-Z0-9]+\.set\([^,]+\s*,\s*\([^)]*\)\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\(',
        # Same pattern as the previous entry
        r'\bc\s*&&\s*[a-zA-Z0-9]+\.set\([^,]+\s*,\s*\([^)]*\)\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\(',
    )
    sig_func_name = self._search_regex(
        name_patterns, jscode,
        'Initial JS player signature function name', group='sig')

    interpreter = JSInterpreter(jscode)
    js_function = interpreter.extract_function(sig_func_name)

    def run_signature_function(s):
        # The extracted function takes its arguments as a list
        return js_function([s])

    return run_signature_function
1249
545cc85d 1250 def _decrypt_signature(self, s, video_id, player_url):
257a2501 1251 """Turn the encrypted s field into a working signature"""
6b37f0be 1252
c8bf86d5 1253 if player_url is None:
69ea8ca4 1254 raise ExtractorError('Cannot decrypt signature without player_url')
920de7a2 1255
69ea8ca4 1256 if player_url.startswith('//'):
78caa52a 1257 player_url = 'https:' + player_url
3c90cc8b
S
1258 elif not re.match(r'https?://', player_url):
1259 player_url = compat_urlparse.urljoin(
1260 'https://www.youtube.com', player_url)
c8bf86d5 1261 try:
62af3a0e 1262 player_id = (player_url, self._signature_cache_id(s))
c8bf86d5
PH
1263 if player_id not in self._player_cache:
1264 func = self._extract_signature_function(
60064c53 1265 video_id, player_url, s
c8bf86d5
PH
1266 )
1267 self._player_cache[player_id] = func
1268 func = self._player_cache[player_id]
1269 if self._downloader.params.get('youtube_print_sig_code'):
60064c53 1270 self._print_sig_code(func, s)
c8bf86d5
PH
1271 return func(s)
1272 except Exception as e:
1273 tb = traceback.format_exc()
1274 raise ExtractorError(
78caa52a 1275 'Signature extraction failed: ' + tb, cause=e)
e0df6211 1276
def _mark_watched(self, video_id, player_response):
    """Request the playback tracking URL found in player_response.

    The URL is read from playbackTracking.videostatsPlaybackUrl.baseUrl.
    Nothing happens when it is missing or invalid. Otherwise the query is
    extended with 'ver' and a random 'cpn', and the URL is fetched
    non-fatally.
    """
    playback_url = url_or_none(try_get(
        player_response,
        lambda x: x['playbackTracking']['videostatsPlaybackUrl']['baseUrl']))
    if not playback_url:
        return

    url_parts = compat_urlparse.urlparse(playback_url)
    query = compat_urlparse.parse_qs(url_parts.query)

    # cpn generation algorithm is reverse engineered from base.js.
    # In fact it works even with dummy cpn.
    CPN_ALPHABET = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789-_'
    cpn_chars = []
    for _ in range(0, 16):
        cpn_chars.append(CPN_ALPHABET[random.randint(0, 256) & 63])
    cpn = ''.join(cpn_chars)

    query['ver'] = ['2']
    query['cpn'] = [cpn]
    # doseq=True: every query value is a list of strings
    new_query = compat_urllib_parse_urlencode(query, True)
    playback_url = compat_urlparse.urlunparse(
        url_parts._replace(query=new_query))

    self._download_webpage(
        playback_url, video_id, 'Marking watched',
        'Unable to mark watched', fatal=False)
1301
66c9fa36
S
1302 @staticmethod
1303 def _extract_urls(webpage):
1304 # Embedded YouTube player
1305 entries = [
1306 unescapeHTML(mobj.group('url'))
1307 for mobj in re.finditer(r'''(?x)
1308 (?:
1309 <iframe[^>]+?src=|
1310 data-video-url=|
1311 <embed[^>]+?src=|
1312 embedSWF\(?:\s*|
1313 <object[^>]+data=|
1314 new\s+SWFObject\(
1315 )
1316 (["\'])
1317 (?P<url>(?:https?:)?//(?:www\.)?youtube(?:-nocookie)?\.com/
f2332f18 1318 (?:embed|v|p)/[0-9A-Za-z_-]{11}.*?)
66c9fa36
S
1319 \1''', webpage)]
1320
1321 # lazyYT YouTube embed
1322 entries.extend(list(map(
1323 unescapeHTML,
1324 re.findall(r'class="lazyYT" data-youtube-id="([^"]+)"', webpage))))
1325
1326 # Wordpress "YouTube Video Importer" plugin
1327 matches = re.findall(r'''(?x)<div[^>]+
1328 class=(?P<q1>[\'"])[^\'"]*\byvii_single_video_player\b[^\'"]*(?P=q1)[^>]+
1329 data-video_id=(?P<q2>[\'"])([^\'"]+)(?P=q2)''', webpage)
1330 entries.extend(m[-1] for m in matches)
1331
1332 return entries
1333
@staticmethod
def _extract_url(webpage):
    """Return the first YouTube embed found in webpage, or None."""
    found = YoutubeIE._extract_urls(webpage)
    if not found:
        return None
    return found[0]
1338
97665381
PH
@classmethod
def extract_id(cls, url):
    """Return the video id, i.e. group 2 of cls._VALID_URL matched on url.

    The pattern is applied in verbose mode.
    Raises ExtractorError when url does not match.
    """
    match = re.match(cls._VALID_URL, url, re.VERBOSE)
    if match is not None:
        return match.group(2)
    raise ExtractorError('Invalid URL: %s' % url)
1346
def _extract_chapters_from_json(self, data, video_id, duration):
    """Build a chapter list from the player bar section of data.

    Returns None when data has no non-empty chapter list. Otherwise
    returns a list of dicts with 'start_time', 'end_time' and 'title'.
    Start times are converted from milliseconds by dividing by 1000. A
    chapter ends where the next one starts; the last chapter ends at
    duration. Chapters whose start or end cannot be determined are left
    out.

    video_id is not used.
    """
    def player_bar_chapters(x):
        return (x['playerOverlays']
                ['playerOverlayRenderer']
                ['decoratedPlayerBarRenderer']
                ['decoratedPlayerBarRenderer']
                ['playerBar']
                ['chapteredPlayerBarRenderer']
                ['chapters'])

    raw_chapters = try_get(data, player_bar_chapters, list)
    if not raw_chapters:
        return

    def start_seconds(entry):
        millis = try_get(
            entry,
            lambda x: x['chapterRenderer']['timeRangeStartMillis'],
            int)
        return float_or_none(millis, scale=1000)

    total = len(raw_chapters)
    result = []
    for index, entry in enumerate(raw_chapters):
        begin = start_seconds(entry)
        if begin is None:
            continue
        following = index + 1
        if following < total:
            end = start_seconds(raw_chapters[following])
        else:
            end = duration
        if end is None:
            continue
        result.append({
            'start_time': begin,
            'end_time': end,
            'title': try_get(
                entry,
                lambda x: x['chapterRenderer']['title']['simpleText'],
                compat_str),
        })
    return result
1386
545cc85d 1387 def _extract_yt_initial_variable(self, webpage, regex, video_id, name):
1388 return self._parse_json(self._search_regex(
1389 (r'%s\s*%s' % (regex, self._YT_INITIAL_BOUNDARY_RE),
1390 regex), webpage, name, default='{}'), video_id, fatal=False)
84213ea8 1391
c5e8d7af 1392 def _real_extract(self, url):
cf7e015f 1393 url, smuggled_data = unsmuggle_url(url, {})
545cc85d 1394 video_id = self._match_id(url)
1395 base_url = self.http_scheme() + '//www.youtube.com/'
1396 webpage_url = base_url + 'watch?v=' + video_id
1397 webpage = self._download_webpage(webpage_url, video_id, fatal=False)
1398
1399 player_response = None
1400 if webpage:
1401 player_response = self._extract_yt_initial_variable(
1402 webpage, self._YT_INITIAL_PLAYER_RESPONSE_RE,
1403 video_id, 'initial player response')
1404 if not player_response:
1405 player_response = self._call_api(
1406 'player', {'videoId': video_id}, video_id)
1407
1408 playability_status = player_response.get('playabilityStatus') or {}
1409 if playability_status.get('reason') == 'Sign in to confirm your age':
1410 pr = self._parse_json(try_get(compat_parse_qs(
1411 self._download_webpage(
1412 base_url + 'get_video_info', video_id,
1413 'Refetching age-gated info webpage',
1414 'unable to download video info webpage', query={
1415 'video_id': video_id,
1416 'eurl': 'https://www.youtube.com/embed/' + video_id,
1417 }, fatal=False)),
1418 lambda x: x['player_response'][0],
1419 compat_str) or '{}', video_id)
1420 if pr:
1421 player_response = pr
1422
1423 trailer_video_id = try_get(
1424 playability_status,
1425 lambda x: x['errorScreen']['playerLegacyDesktopYpcTrailerRenderer']['trailerVideoId'],
1426 compat_str)
1427 if trailer_video_id:
1428 return self.url_result(
1429 trailer_video_id, self.ie_key(), trailer_video_id)
cf7e015f 1430
545cc85d 1431 def get_text(x):
1432 if not x:
c2d125d9 1433 return
545cc85d 1434 return x.get('simpleText') or ''.join([r['text'] for r in x['runs']])
15be3eb5 1435
545cc85d 1436 search_meta = (
1437 lambda x: self._html_search_meta(x, webpage, default=None)) \
1438 if webpage else lambda x: None
dbdaaa23 1439
545cc85d 1440 video_details = player_response.get('videoDetails') or {}
37357d21 1441 microformat = try_get(
545cc85d 1442 player_response,
1443 lambda x: x['microformat']['playerMicroformatRenderer'],
1444 dict) or {}
1445 video_title = video_details.get('title') \
1446 or get_text(microformat.get('title')) \
1447 or search_meta(['og:title', 'twitter:title', 'title'])
1448 video_description = video_details.get('shortDescription')
cf7e015f 1449
8fe10494 1450 if not smuggled_data.get('force_singlefeed', False):
5e1eddb9 1451 if not self._downloader.params.get('noplaylist'):
8fe10494
S
1452 multifeed_metadata_list = try_get(
1453 player_response,
1454 lambda x: x['multicamera']['playerLegacyMulticameraRenderer']['metadataList'],
545cc85d 1455 compat_str)
8fe10494
S
1456 if multifeed_metadata_list:
1457 entries = []
1458 feed_ids = []
1459 for feed in multifeed_metadata_list.split(','):
1460 # Unquote should take place before split on comma (,) since textual
1461 # fields may contain comma as well (see
067aa17e 1462 # https://github.com/ytdl-org/youtube-dl/issues/8536)
545cc85d 1463 feed_data = compat_parse_qs(
1464 compat_urllib_parse_unquote_plus(feed))
6b09401b
S
1465
1466 def feed_entry(name):
545cc85d 1467 return try_get(
1468 feed_data, lambda x: x[name][0], compat_str)
6b09401b
S
1469
1470 feed_id = feed_entry('id')
1471 if not feed_id:
1472 continue
1473 feed_title = feed_entry('title')
1474 title = video_title
1475 if feed_title:
1476 title += ' (%s)' % feed_title
8fe10494
S
1477 entries.append({
1478 '_type': 'url_transparent',
1479 'ie_key': 'Youtube',
1480 'url': smuggle_url(
545cc85d 1481 base_url + 'watch?v=' + feed_data['id'][0],
8fe10494 1482 {'force_singlefeed': True}),
6b09401b 1483 'title': title,
8fe10494 1484 })
6b09401b 1485 feed_ids.append(feed_id)
8fe10494
S
1486 self.to_screen(
1487 'Downloading multifeed video (%s) - add --no-playlist to just download video %s'
1488 % (', '.join(feed_ids), video_id))
545cc85d 1489 return self.playlist_result(
1490 entries, video_id, video_title, video_description)
8fe10494
S
1491 else:
1492 self.to_screen('Downloading just video %s because of --no-playlist' % video_id)
cf7e015f 1493
545cc85d 1494 formats = []
1495 itags = []
1496 player_url = None
1497 q = qualities(['tiny', 'small', 'medium', 'large', 'hd720', 'hd1080', 'hd1440', 'hd2160', 'hd2880', 'highres'])
1498 streaming_data = player_response.get('streamingData') or {}
1499 streaming_formats = streaming_data.get('formats') or []
1500 streaming_formats.extend(streaming_data.get('adaptiveFormats') or [])
1501 for fmt in streaming_formats:
1502 if fmt.get('targetDurationSec') or fmt.get('drmFamilies'):
1503 continue
321bf820 1504
545cc85d 1505 fmt_url = fmt.get('url')
1506 if not fmt_url:
1507 sc = compat_parse_qs(fmt.get('signatureCipher'))
1508 fmt_url = url_or_none(try_get(sc, lambda x: x['url'][0]))
1509 encrypted_sig = try_get(sc, lambda x: x['s'][0])
1510 if not (sc and fmt_url and encrypted_sig):
1511 continue
1512 if not player_url:
1513 if not webpage:
1514 continue
1515 player_url = self._search_regex(
1516 r'"(?:PLAYER_JS_URL|jsUrl)"\s*:\s*"([^"]+)"',
1517 webpage, 'player URL', fatal=False)
1518 if not player_url:
201e9eaa 1519 continue
545cc85d 1520 signature = self._decrypt_signature(sc['s'][0], video_id, player_url)
1521 sp = try_get(sc, lambda x: x['sp'][0]) or 'signature'
1522 fmt_url += '&' + sp + '=' + signature
1523
1524 itag = str_or_none(fmt.get('itag'))
1525 if itag:
1526 itags.append(itag)
1527 quality = fmt.get('quality')
1528 dct = {
1529 'asr': int_or_none(fmt.get('audioSampleRate')),
1530 'filesize': int_or_none(fmt.get('contentLength')),
1531 'format_id': itag,
1532 'format_note': fmt.get('qualityLabel') or quality,
1533 'fps': int_or_none(fmt.get('fps')),
1534 'height': int_or_none(fmt.get('height')),
1535 # 'quality': q(quality), # This does not correctly reflect the overall quality of the format
1536 'tbr': float_or_none(fmt.get(
1537 'averageBitrate') or fmt.get('bitrate'), 1000),
1538 'url': fmt_url,
1539 'width': fmt.get('width'),
1540 }
1541 mimetype = fmt.get('mimeType')
1542 if mimetype:
1543 mobj = re.match(
1544 r'((?:[^/]+)/(?:[^;]+))(?:;\s*codecs="([^"]+)")?', mimetype)
1545 if mobj:
1546 dct['ext'] = mimetype2ext(mobj.group(1))
1547 dct.update(parse_codecs(mobj.group(2)))
1548 if dct.get('acodec') == 'none' or dct.get('vcodec') == 'none':
1549 dct['downloader_options'] = {
1550 # Youtube throttles chunks >~10M
1551 'http_chunk_size': 10485760,
bf1317d2 1552 }
545cc85d 1553 formats.append(dct)
1554
1555 hls_manifest_url = streaming_data.get('hlsManifestUrl')
1556 if hls_manifest_url:
1557 for f in self._extract_m3u8_formats(
1558 hls_manifest_url, video_id, 'mp4', fatal=False):
1559 itag = self._search_regex(
1560 r'/itag/(\d+)', f['url'], 'itag', default=None)
1561 if itag:
1562 f['format_id'] = itag
1563 formats.append(f)
1564
1565 if self._downloader.params.get('youtube_include_dash_manifest'):
1566 dash_manifest_url = streaming_data.get('dashManifestUrl')
1567 if dash_manifest_url:
1568 dash_formats = []
1569 for f in self._extract_mpd_formats(
1570 dash_manifest_url, video_id, fatal=False):
1571 filesize = int_or_none(self._search_regex(
1572 r'/clen/(\d+)', f.get('fragment_base_url')
1573 or f['url'], 'file size', default=None))
1574 if filesize:
1575 f['filesize'] = filesize
1576 dash_formats.append(f)
1577 # Until further investigation prefer DASH formats as non-DASH
1578 # may not be available (see [1])
1579 # 1. https://github.com/ytdl-org/youtube-dl/issues/28070
1580 if dash_formats:
1581 dash_formats_keys = [f['format_id'] for f in dash_formats]
1582 formats = [f for f in formats if f['format_id'] not in dash_formats_keys]
1583 formats.extend(dash_formats)
bf1317d2 1584
545cc85d 1585 if not formats:
1586 if streaming_data.get('licenseInfos'):
1587 raise ExtractorError(
1588 'This video is DRM protected.', expected=True)
1589 pemr = try_get(
1590 playability_status,
1591 lambda x: x['errorScreen']['playerErrorMessageRenderer'],
1592 dict) or {}
1593 reason = get_text(pemr.get('reason')) or playability_status.get('reason')
1594 subreason = pemr.get('subreason')
1595 if subreason:
1596 subreason = clean_html(get_text(subreason))
1597 if subreason == 'The uploader has not made this video available in your country.':
1598 countries = microformat.get('availableCountries')
1599 if not countries:
1600 regions_allowed = search_meta('regionsAllowed')
1601 countries = regions_allowed.split(',') if regions_allowed else None
1602 self.raise_geo_restricted(
1603 subreason, countries)
1604 reason += '\n' + subreason
1605 if reason:
1606 raise ExtractorError(reason, expected=True)
bf1317d2 1607
545cc85d 1608 self._sort_formats(formats)
bf1317d2 1609
545cc85d 1610 keywords = video_details.get('keywords') or []
1611 if not keywords and webpage:
1612 keywords = [
1613 unescapeHTML(m.group('content'))
1614 for m in re.finditer(self._meta_regex('og:video:tag'), webpage)]
1615 for keyword in keywords:
1616 if keyword.startswith('yt:stretch='):
1617 w, h = keyword.split('=')[1].split(':')
1618 w, h = int(w), int(h)
1619 if w > 0 and h > 0:
1620 ratio = w / h
1621 for f in formats:
1622 if f.get('vcodec') != 'none':
1623 f['stretched_ratio'] = ratio
6449cd80 1624
545cc85d 1625 thumbnails = []
1626 for container in (video_details, microformat):
1627 for thumbnail in (try_get(
1628 container,
1629 lambda x: x['thumbnail']['thumbnails'], list) or []):
1630 thumbnail_url = thumbnail.get('url')
1631 if not thumbnail_url:
bf1317d2 1632 continue
545cc85d 1633 thumbnails.append({
1634 'height': int_or_none(thumbnail.get('height')),
1635 'url': thumbnail_url,
1636 'width': int_or_none(thumbnail.get('width')),
1637 })
1638 if thumbnails:
1639 break
a6211d23 1640 else:
545cc85d 1641 thumbnail = search_meta(['og:image', 'twitter:image'])
1642 if thumbnail:
1643 thumbnails = [{'url': thumbnail}]
1644
1645 category = microformat.get('category') or search_meta('genre')
1646 channel_id = video_details.get('channelId') \
1647 or microformat.get('externalChannelId') \
1648 or search_meta('channelId')
1649 duration = int_or_none(
1650 video_details.get('lengthSeconds')
1651 or microformat.get('lengthSeconds')) \
1652 or parse_duration(search_meta('duration'))
1653 is_live = video_details.get('isLive')
1654 owner_profile_url = microformat.get('ownerProfileUrl')
1655
1656 info = {
1657 'id': video_id,
1658 'title': self._live_title(video_title) if is_live else video_title,
1659 'formats': formats,
1660 'thumbnails': thumbnails,
1661 'description': video_description,
1662 'upload_date': unified_strdate(
1663 microformat.get('uploadDate')
1664 or search_meta('uploadDate')),
1665 'uploader': video_details['author'],
1666 'uploader_id': self._search_regex(r'/(?:channel|user)/([^/?&#]+)', owner_profile_url, 'uploader id') if owner_profile_url else None,
1667 'uploader_url': owner_profile_url,
1668 'channel_id': channel_id,
1669 'channel_url': 'https://www.youtube.com/channel/' + channel_id if channel_id else None,
1670 'duration': duration,
1671 'view_count': int_or_none(
1672 video_details.get('viewCount')
1673 or microformat.get('viewCount')
1674 or search_meta('interactionCount')),
1675 'average_rating': float_or_none(video_details.get('averageRating')),
1676 'age_limit': 18 if (
1677 microformat.get('isFamilySafe') is False
1678 or search_meta('isFamilyFriendly') == 'false'
1679 or search_meta('og:restrictions:age') == '18+') else 0,
1680 'webpage_url': webpage_url,
1681 'categories': [category] if category else None,
1682 'tags': keywords,
1683 'is_live': is_live,
1684 'playable_in_embed': playability_status.get('playableInEmbed'),
1685 }
b477fc13 1686
545cc85d 1687 pctr = try_get(
1688 player_response,
1689 lambda x: x['captions']['playerCaptionsTracklistRenderer'], dict)
1690 subtitles = {}
1691 if pctr:
1692 def process_language(container, base_url, lang_code, query):
1693 lang_subs = []
1694 for fmt in self._SUBTITLE_FORMATS:
1695 query.update({
1696 'fmt': fmt,
1697 })
1698 lang_subs.append({
1699 'ext': fmt,
1700 'url': update_url_query(base_url, query),
1701 })
1702 container[lang_code] = lang_subs
7e72694b 1703
545cc85d 1704 for caption_track in (pctr.get('captionTracks') or []):
1705 base_url = caption_track.get('baseUrl')
1706 if not base_url:
1707 continue
1708 if caption_track.get('kind') != 'asr':
1709 lang_code = caption_track.get('languageCode')
1710 if not lang_code:
1711 continue
1712 process_language(
1713 subtitles, base_url, lang_code, {})
1714 continue
1715 automatic_captions = {}
1716 for translation_language in (pctr.get('translationLanguages') or []):
1717 translation_language_code = translation_language.get('languageCode')
1718 if not translation_language_code:
1719 continue
1720 process_language(
1721 automatic_captions, base_url, translation_language_code,
1722 {'tlang': translation_language_code})
1723 info['automatic_captions'] = automatic_captions
1724 info['subtitles'] = subtitles
7e72694b 1725
545cc85d 1726 parsed_url = compat_urllib_parse_urlparse(url)
1727 for component in [parsed_url.fragment, parsed_url.query]:
1728 query = compat_parse_qs(component)
1729 for k, v in query.items():
1730 for d_k, s_ks in [('start', ('start', 't')), ('end', ('end',))]:
1731 d_k += '_time'
1732 if d_k not in info and k in s_ks:
1733 info[d_k] = parse_duration(query[k][0])
822b9d9c
RA
1734
1735 # Youtube Music Auto-generated description
822b9d9c 1736 if video_description:
38d70284 1737 mobj = re.search(r'(?s)(?P<track>[^·\n]+)·(?P<artist>[^\n]+)\n+(?P<album>[^\n]+)(?:.+?℗\s*(?P<release_year>\d{4})(?!\d))?(?:.+?Released on\s*:\s*(?P<release_date>\d{4}-\d{2}-\d{2}))?(.+?\nArtist\s*:\s*(?P<clean_artist>[^\n]+))?.+\nAuto-generated by YouTube\.\s*$', video_description)
822b9d9c 1738 if mobj:
822b9d9c
RA
1739 release_year = mobj.group('release_year')
1740 release_date = mobj.group('release_date')
1741 if release_date:
1742 release_date = release_date.replace('-', '')
1743 if not release_year:
545cc85d 1744 release_year = release_date[:4]
1745 info.update({
1746 'album': mobj.group('album'.strip()),
1747 'artist': mobj.group('clean_artist') or ', '.join(a.strip() for a in mobj.group('artist').split('·')),
1748 'track': mobj.group('track').strip(),
1749 'release_date': release_date,
1750 'release_year': int(release_year),
1751 })
7e72694b 1752
545cc85d 1753 initial_data = None
1754 if webpage:
1755 initial_data = self._extract_yt_initial_variable(
1756 webpage, self._YT_INITIAL_DATA_RE, video_id,
1757 'yt initial data')
1758 if not initial_data:
1759 initial_data = self._call_api(
1760 'next', {'videoId': video_id}, video_id, fatal=False)
1761
1762 if not is_live:
1763 try:
1764 # This will error if there is no livechat
1765 initial_data['contents']['twoColumnWatchNextResults']['conversationBar']['liveChatRenderer']['continuations'][0]['reloadContinuationData']['continuation']
1766 info['subtitles']['live_chat'] = [{
1767 'video_id': video_id,
1768 'ext': 'json',
1769 'protocol': 'youtube_live_chat_replay',
1770 }]
1771 except (KeyError, IndexError, TypeError):
1772 pass
1773
1774 if initial_data:
1775 chapters = self._extract_chapters_from_json(
1776 initial_data, video_id, duration)
1777 if not chapters:
1778 for engagment_pannel in (initial_data.get('engagementPanels') or []):
1779 contents = try_get(
1780 engagment_pannel, lambda x: x['engagementPanelSectionListRenderer']['content']['macroMarkersListRenderer']['contents'],
1781 list)
1782 if not contents:
1783 continue
1784
1785 def chapter_time(mmlir):
1786 return parse_duration(
1787 get_text(mmlir.get('timeDescription')))
1788
1789 chapters = []
1790 for next_num, content in enumerate(contents, start=1):
1791 mmlir = content.get('macroMarkersListItemRenderer') or {}
1792 start_time = chapter_time(mmlir)
1793 end_time = chapter_time(try_get(
1794 contents, lambda x: x[next_num]['macroMarkersListItemRenderer'])) \
1795 if next_num < len(contents) else duration
1796 if start_time is None or end_time is None:
1797 continue
1798 chapters.append({
1799 'start_time': start_time,
1800 'end_time': end_time,
1801 'title': get_text(mmlir.get('title')),
1802 })
1803 if chapters:
1804 break
1805 if chapters:
1806 info['chapters'] = chapters
1807
1808 contents = try_get(
1809 initial_data,
1810 lambda x: x['contents']['twoColumnWatchNextResults']['results']['results']['contents'],
1811 list) or []
1812 for content in contents:
1813 vpir = content.get('videoPrimaryInfoRenderer')
1814 if vpir:
1815 stl = vpir.get('superTitleLink')
1816 if stl:
1817 stl = get_text(stl)
1818 if try_get(
1819 vpir,
1820 lambda x: x['superTitleIcon']['iconType']) == 'LOCATION_PIN':
1821 info['location'] = stl
1822 else:
1823 mobj = re.search(r'(.+?)\s*S(\d+)\s*•\s*E(\d+)', stl)
1824 if mobj:
1825 info.update({
1826 'series': mobj.group(1),
1827 'season_number': int(mobj.group(2)),
1828 'episode_number': int(mobj.group(3)),
1829 })
1830 for tlb in (try_get(
1831 vpir,
1832 lambda x: x['videoActions']['menuRenderer']['topLevelButtons'],
1833 list) or []):
1834 tbr = tlb.get('toggleButtonRenderer') or {}
1835 for getter, regex in [(
1836 lambda x: x['defaultText']['accessibility']['accessibilityData'],
1837 r'(?P<count>[\d,]+)\s*(?P<type>(?:dis)?like)'), ([
1838 lambda x: x['accessibility'],
1839 lambda x: x['accessibilityData']['accessibilityData'],
1840 ], r'(?P<type>(?:dis)?like) this video along with (?P<count>[\d,]+) other people')]:
1841 label = (try_get(tbr, getter, dict) or {}).get('label')
1842 if label:
1843 mobj = re.match(regex, label)
1844 if mobj:
1845 info[mobj.group('type') + '_count'] = str_to_int(mobj.group('count'))
1846 break
1847 sbr_tooltip = try_get(
1848 vpir, lambda x: x['sentimentBar']['sentimentBarRenderer']['tooltip'])
1849 if sbr_tooltip:
1850 like_count, dislike_count = sbr_tooltip.split(' / ')
1851 info.update({
1852 'like_count': str_to_int(like_count),
1853 'dislike_count': str_to_int(dislike_count),
1854 })
1855 vsir = content.get('videoSecondaryInfoRenderer')
1856 if vsir:
1857 info['channel'] = get_text(try_get(
1858 vsir,
1859 lambda x: x['owner']['videoOwnerRenderer']['title'],
1860 compat_str))
1861 rows = try_get(
1862 vsir,
1863 lambda x: x['metadataRowContainer']['metadataRowContainerRenderer']['rows'],
1864 list) or []
1865 multiple_songs = False
1866 for row in rows:
1867 if try_get(row, lambda x: x['metadataRowRenderer']['hasDividerLine']) is True:
1868 multiple_songs = True
1869 break
1870 for row in rows:
1871 mrr = row.get('metadataRowRenderer') or {}
1872 mrr_title = mrr.get('title')
1873 if not mrr_title:
1874 continue
1875 mrr_title = get_text(mrr['title'])
1876 mrr_contents_text = get_text(mrr['contents'][0])
1877 if mrr_title == 'License':
1878 info['license'] = mrr_contents_text
1879 elif not multiple_songs:
1880 if mrr_title == 'Album':
1881 info['album'] = mrr_contents_text
1882 elif mrr_title == 'Artist':
1883 info['artist'] = mrr_contents_text
1884 elif mrr_title == 'Song':
1885 info['track'] = mrr_contents_text
1886
1887 fallbacks = {
1888 'channel': 'uploader',
1889 'channel_id': 'uploader_id',
1890 'channel_url': 'uploader_url',
1891 }
1892 for to, frm in fallbacks.items():
1893 if not info.get(to):
1894 info[to] = info.get(frm)
1895
1896 for s_k, d_k in [('artist', 'creator'), ('track', 'alt_title')]:
1897 v = info.get(s_k)
1898 if v:
1899 info[d_k] = v
b84071c0 1900
06167fbb 1901 # get xsrf for annotations or comments
1902 get_annotations = self._downloader.params.get('writeannotations', False)
1903 get_comments = self._downloader.params.get('getcomments', False)
1904 if get_annotations or get_comments:
29f7c58a 1905 xsrf_token = None
545cc85d 1906 ytcfg = self._extract_ytcfg(video_id, webpage)
29f7c58a 1907 if ytcfg:
1908 xsrf_token = try_get(ytcfg, lambda x: x['XSRF_TOKEN'], compat_str)
1909 if not xsrf_token:
1910 xsrf_token = self._search_regex(
1911 r'([\'"])XSRF_TOKEN\1\s*:\s*([\'"])(?P<xsrf_token>(?:(?!\2).)+)\2',
1912 video_webpage, 'xsrf token', group='xsrf_token', fatal=False)
06167fbb 1913
1914 # annotations
1915 video_annotations = None
1916 if get_annotations:
64b6a4e9
RA
1917 invideo_url = try_get(
1918 player_response, lambda x: x['annotations'][0]['playerAnnotationsUrlsRenderer']['invideoUrl'], compat_str)
1919 if xsrf_token and invideo_url:
29f7c58a 1920 xsrf_field_name = None
1921 if ytcfg:
1922 xsrf_field_name = try_get(ytcfg, lambda x: x['XSRF_FIELD_NAME'], compat_str)
1923 if not xsrf_field_name:
1924 xsrf_field_name = self._search_regex(
1925 r'([\'"])XSRF_FIELD_NAME\1\s*:\s*([\'"])(?P<xsrf_field_name>\w+)\2',
1926 video_webpage, 'xsrf field name',
1927 group='xsrf_field_name', default='session_token')
64b6a4e9
RA
1928 video_annotations = self._download_webpage(
1929 self._proto_relative_url(invideo_url),
1930 video_id, note='Downloading annotations',
1931 errnote='Unable to download video annotations', fatal=False,
1932 data=urlencode_postdata({xsrf_field_name: xsrf_token}))
7e72694b 1933
06167fbb 1934 # Get comments
1935 # TODO: Refactor and move to separate function
1936 if get_comments:
1937 expected_video_comment_count = 0
1938 video_comments = []
1939
1940 def find_value(html, key, num_chars=2, separator='"'):
1941 pos_begin = html.find(key) + len(key) + num_chars
1942 pos_end = html.find(separator, pos_begin)
1943 return html[pos_begin: pos_end]
1944
1945 def search_dict(partial, key):
1946 if isinstance(partial, dict):
1947 for k, v in partial.items():
1948 if k == key:
1949 yield v
1950 else:
1951 for o in search_dict(v, key):
1952 yield o
1953 elif isinstance(partial, list):
1954 for i in partial:
1955 for o in search_dict(i, key):
1956 yield o
1957
1958 try:
1959 ncd = next(search_dict(yt_initial_data, 'nextContinuationData'))
8d0ea5f9 1960 continuations = [ncd['continuation']]
06167fbb 1961 # Handle videos where comments have been disabled entirely
1962 except StopIteration:
1963 continuations = []
1964
8d0ea5f9 1965 def get_continuation(continuation, session_token, replies=False):
06167fbb 1966 query = {
66c935fb 1967 'pbj': 1,
1968 'ctoken': continuation,
06167fbb 1969 }
1970 if replies:
1971 query['action_get_comment_replies'] = 1
1972 else:
1973 query['action_get_comments'] = 1
1974
1975 while True:
1976 content, handle = self._download_webpage_handle(
1977 'https://www.youtube.com/comment_service_ajax',
1978 video_id,
1979 note=False,
1980 expected_status=[413],
1981 data=urlencode_postdata({
1982 'session_token': session_token
1983 }),
1984 query=query,
1985 headers={
1986 'Accept': '*/*',
1987 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:76.0) Gecko/20100101 Firefox/76.0',
1988 'X-YouTube-Client-Name': '1',
1989 'X-YouTube-Client-Version': '2.20201202.06.01'
1990 }
1991 )
1992
1993 response_code = handle.getcode()
1994 if (response_code == 200):
1995 return self._parse_json(content, video_id)
8d0ea5f9 1996 if (response_code == 413):
06167fbb 1997 return None
1998 raise ExtractorError('Unexpected HTTP error code: %s' % response_code)
1999
2000 first_continuation = True
2001 while continuations:
2002 continuation, itct = continuations.pop()
8d0ea5f9 2003 comment_response = get_continuation(continuation, xsrf_token)
06167fbb 2004 if not comment_response:
2005 continue
2006 if list(search_dict(comment_response, 'externalErrorMessage')):
2007 raise ExtractorError('Error returned from server: ' + next(search_dict(comment_response, 'externalErrorMessage')))
2008
8d0ea5f9
B
2009 if 'continuationContents' not in comment_response['response']:
2010 # Something is wrong here. Youtube won't accept this continuation token for some reason and responds with a user satisfaction dialog (error?)
2011 continue
2012 # not sure if this actually helps
2013 if 'xsrf_token' in comment_response:
2014 xsrf_token = comment_response['xsrf_token']
2015
06167fbb 2016 item_section = comment_response['response']['continuationContents']['itemSectionContinuation']
2017 if first_continuation:
2018 expected_video_comment_count = int(item_section['header']['commentsHeaderRenderer']['countText']['runs'][0]['text'].replace(' Comments', '').replace('1 Comment', '1').replace(',', ''))
2019 first_continuation = False
2020 if 'contents' not in item_section:
2021 # continuation returned no comments?
2022 # set an empty array as to not break the for loop
2023 item_section['contents'] = []
2024
2025 for meta_comment in item_section['contents']:
2026 comment = meta_comment['commentThreadRenderer']['comment']['commentRenderer']
2027 video_comments.append({
2028 'id': comment['commentId'],
2029 'text': ''.join([c['text'] for c in comment['contentText']['runs']]),
8d0ea5f9 2030 'time_text': ''.join([c['text'] for c in comment['publishedTimeText']['runs']]),
06167fbb 2031 'author': comment.get('authorText', {}).get('simpleText', ''),
2032 'votes': comment.get('voteCount', {}).get('simpleText', '0'),
2033 'author_thumbnail': comment['authorThumbnail']['thumbnails'][-1]['url'],
2034 'parent': 'root'
2035 })
2036 if 'replies' not in meta_comment['commentThreadRenderer']:
545cc85d 2037
06167fbb 2038 continue
2039
545cc85d 2040
8d0ea5f9
B
2041 reply_continuations = [rcn['nextContinuationData']['continuation'] for rcn in meta_comment['commentThreadRenderer']['replies']['commentRepliesRenderer']['continuations']]
2042 while reply_continuations:
06167fbb 2043 time.sleep(1)
8d0ea5f9
B
2044 continuation = reply_continuations.pop()
2045 replies_data = get_continuation(continuation, xsrf_token, True)
06167fbb 2046 if not replies_data or 'continuationContents' not in replies_data[1]['response']:
8d0ea5f9 2047 continue
06167fbb 2048
2049 if self._downloader.params.get('verbose', False):
2050 self.to_screen('[debug] Comments downloaded (chain %s) %s of ~%s' % (comment['commentId'], len(video_comments), expected_video_comment_count))
2051 reply_comment_meta = replies_data[1]['response']['continuationContents']['commentRepliesContinuation']
2052 for reply_meta in replies_data[1]['response']['continuationContents']['commentRepliesContinuation']['contents']:
2053 reply_comment = reply_meta['commentRenderer']
2054 video_comments.append({
2055 'id': reply_comment['commentId'],
2056 'text': ''.join([c['text'] for c in reply_comment['contentText']['runs']]),
8d0ea5f9 2057 'time_text': ''.join([c['text'] for c in reply_comment['publishedTimeText']['runs']]),
06167fbb 2058 'author': reply_comment.get('authorText', {}).get('simpleText', ''),
2059 'votes': reply_comment.get('voteCount', {}).get('simpleText', '0'),
2060 'author_thumbnail': reply_comment['authorThumbnail']['thumbnails'][-1]['url'],
2061 'parent': comment['commentId']
2062 })
2063 if 'continuations' not in reply_comment_meta or len(reply_comment_meta['continuations']) == 0:
545cc85d 2064
8d0ea5f9 2065 continue
06167fbb 2066
8d0ea5f9 2067 reply_continuations += [rcn['nextContinuationData']['continuation'] for rcn in reply_comment_meta['continuations']]
06167fbb 2068
2069 self.to_screen('Comments downloaded %s of ~%s' % (len(video_comments), expected_video_comment_count))
2070
2071 if 'continuations' in item_section:
8d0ea5f9 2072 continuations += [ncd['nextContinuationData']['continuation'] for ncd in item_section['continuations']]
06167fbb 2073 time.sleep(1)
2074
2075 self.to_screen('Total comments downloaded %s of ~%s' % (len(video_comments), expected_video_comment_count))
545cc85d 2076 info.update({
2077 'comments': video_comments,
2078 'comment_count': expected_video_comment_count
2079 })
4ea3be0a 2080
545cc85d 2081 self.mark_watched(video_id, player_response)
d77ab8e2 2082
545cc85d 2083 return info
c5e8d7af 2084
5f6a1245 2085
8bdd16b4 2086class YoutubeTabIE(YoutubeBaseInfoExtractor):
2087 IE_DESC = 'YouTube.com tab'
70d5c17b 2088 _VALID_URL = r'''(?x)
2089 https?://
2090 (?:\w+\.)?
2091 (?:
2092 youtube(?:kids)?\.com|
2093 invidio\.us
2094 )/
2095 (?:
2096 (?:channel|c|user)/|
2097 (?P<not_channel>
3d3dddc9 2098 feed/|
70d5c17b 2099 (?:playlist|watch)\?.*?\blist=
2100 )|
29f7c58a 2101 (?!(?:%s)\b) # Direct URLs
70d5c17b 2102 )
2103 (?P<id>[^/?\#&]+)
2104 ''' % YoutubeBaseInfoExtractor._RESERVED_NAMES
8bdd16b4 2105 IE_NAME = 'youtube:tab'
2106
81127aa5 2107 _TESTS = [{
8bdd16b4 2108 # playlists, multipage
2109 'url': 'https://www.youtube.com/c/ИгорьКлейнер/playlists?view=1&flow=grid',
2110 'playlist_mincount': 94,
2111 'info_dict': {
2112 'id': 'UCqj7Cz7revf5maW9g5pgNcg',
2113 'title': 'Игорь Клейнер - Playlists',
2114 'description': 'md5:be97ee0f14ee314f1f002cf187166ee2',
2115 },
2116 }, {
2117 # playlists, multipage, different order
2118 'url': 'https://www.youtube.com/user/igorkle1/playlists?view=1&sort=dd',
2119 'playlist_mincount': 94,
2120 'info_dict': {
2121 'id': 'UCqj7Cz7revf5maW9g5pgNcg',
2122 'title': 'Игорь Клейнер - Playlists',
2123 'description': 'md5:be97ee0f14ee314f1f002cf187166ee2',
2124 },
2125 }, {
2126 # playlists, singlepage
2127 'url': 'https://www.youtube.com/user/ThirstForScience/playlists',
2128 'playlist_mincount': 4,
2129 'info_dict': {
2130 'id': 'UCAEtajcuhQ6an9WEzY9LEMQ',
2131 'title': 'ThirstForScience - Playlists',
2132 'description': 'md5:609399d937ea957b0f53cbffb747a14c',
2133 }
2134 }, {
2135 'url': 'https://www.youtube.com/c/ChristophLaimer/playlists',
2136 'only_matching': True,
2137 }, {
2138 # basic, single video playlist
0e30a7b9 2139 'url': 'https://www.youtube.com/playlist?list=PL4lCao7KL_QFVb7Iudeipvc2BCavECqzc',
81127aa5 2140 'info_dict': {
0e30a7b9 2141 'uploader_id': 'UCmlqkdCBesrv2Lak1mF_MxA',
2142 'uploader': 'Sergey M.',
2143 'id': 'PL4lCao7KL_QFVb7Iudeipvc2BCavECqzc',
3867038a 2144 'title': 'youtube-dl public playlist',
81127aa5 2145 },
0e30a7b9 2146 'playlist_count': 1,
9291475f 2147 }, {
8bdd16b4 2148 # empty playlist
0e30a7b9 2149 'url': 'https://www.youtube.com/playlist?list=PL4lCao7KL_QFodcLWhDpGCYnngnHtQ-Xf',
9291475f 2150 'info_dict': {
0e30a7b9 2151 'uploader_id': 'UCmlqkdCBesrv2Lak1mF_MxA',
2152 'uploader': 'Sergey M.',
2153 'id': 'PL4lCao7KL_QFodcLWhDpGCYnngnHtQ-Xf',
3867038a 2154 'title': 'youtube-dl empty playlist',
9291475f
PH
2155 },
2156 'playlist_count': 0,
2157 }, {
8bdd16b4 2158 # Home tab
2159 'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w/featured',
9291475f 2160 'info_dict': {
8bdd16b4 2161 'id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
2162 'title': 'lex will - Home',
2163 'description': 'md5:2163c5d0ff54ed5f598d6a7e6211e488',
9291475f 2164 },
8bdd16b4 2165 'playlist_mincount': 2,
9291475f 2166 }, {
8bdd16b4 2167 # Videos tab
2168 'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w/videos',
9291475f 2169 'info_dict': {
8bdd16b4 2170 'id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
2171 'title': 'lex will - Videos',
2172 'description': 'md5:2163c5d0ff54ed5f598d6a7e6211e488',
9291475f 2173 },
8bdd16b4 2174 'playlist_mincount': 975,
9291475f 2175 }, {
8bdd16b4 2176 # Videos tab, sorted by popular
2177 'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w/videos?view=0&sort=p&flow=grid',
9291475f 2178 'info_dict': {
8bdd16b4 2179 'id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
2180 'title': 'lex will - Videos',
2181 'description': 'md5:2163c5d0ff54ed5f598d6a7e6211e488',
9291475f 2182 },
8bdd16b4 2183 'playlist_mincount': 199,
9291475f 2184 }, {
8bdd16b4 2185 # Playlists tab
2186 'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w/playlists',
9291475f 2187 'info_dict': {
8bdd16b4 2188 'id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
2189 'title': 'lex will - Playlists',
2190 'description': 'md5:2163c5d0ff54ed5f598d6a7e6211e488',
9291475f 2191 },
8bdd16b4 2192 'playlist_mincount': 17,
ac7553d0 2193 }, {
8bdd16b4 2194 # Community tab
2195 'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w/community',
ac7553d0 2196 'info_dict': {
8bdd16b4 2197 'id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
2198 'title': 'lex will - Community',
2199 'description': 'md5:2163c5d0ff54ed5f598d6a7e6211e488',
2200 },
2201 'playlist_mincount': 18,
87dadd45 2202 }, {
8bdd16b4 2203 # Channels tab
2204 'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w/channels',
87dadd45 2205 'info_dict': {
8bdd16b4 2206 'id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
2207 'title': 'lex will - Channels',
2208 'description': 'md5:2163c5d0ff54ed5f598d6a7e6211e488',
2209 },
2210 'playlist_mincount': 138,
6b08cdf6 2211 }, {
a0566bbf 2212 'url': 'https://invidio.us/channel/UCmlqkdCBesrv2Lak1mF_MxA',
8bdd16b4 2213 'only_matching': True,
2214 }, {
a0566bbf 2215 'url': 'https://www.youtubekids.com/channel/UCmlqkdCBesrv2Lak1mF_MxA',
8bdd16b4 2216 'only_matching': True,
2217 }, {
a0566bbf 2218 'url': 'https://music.youtube.com/channel/UCmlqkdCBesrv2Lak1mF_MxA',
8bdd16b4 2219 'only_matching': True,
2220 }, {
2221 'note': 'Playlist with deleted videos (#651). As a bonus, the video #51 is also twice in this list.',
2222 'url': 'https://www.youtube.com/playlist?list=PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC',
2223 'info_dict': {
2224 'title': '29C3: Not my department',
2225 'id': 'PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC',
2226 'uploader': 'Christiaan008',
2227 'uploader_id': 'UCEPzS1rYsrkqzSLNp76nrcg',
2228 },
2229 'playlist_count': 96,
2230 }, {
2231 'note': 'Large playlist',
2232 'url': 'https://www.youtube.com/playlist?list=UUBABnxM4Ar9ten8Mdjj1j0Q',
6b08cdf6 2233 'info_dict': {
8bdd16b4 2234 'title': 'Uploads from Cauchemar',
2235 'id': 'UUBABnxM4Ar9ten8Mdjj1j0Q',
2236 'uploader': 'Cauchemar',
2237 'uploader_id': 'UCBABnxM4Ar9ten8Mdjj1j0Q',
13a75688 2238 },
8bdd16b4 2239 'playlist_mincount': 1123,
2240 }, {
2241 # even larger playlist, 8832 videos
2242 'url': 'http://www.youtube.com/user/NASAgovVideo/videos',
2243 'only_matching': True,
4b7df0d3
JMF
2244 }, {
2245 'note': 'Buggy playlist: the webpage has a "Load more" button but it doesn\'t have more videos',
2246 'url': 'https://www.youtube.com/playlist?list=UUXw-G3eDE9trcvY2sBMM_aA',
2247 'info_dict': {
acf757f4
PH
2248 'title': 'Uploads from Interstellar Movie',
2249 'id': 'UUXw-G3eDE9trcvY2sBMM_aA',
13a75688 2250 'uploader': 'Interstellar Movie',
8bdd16b4 2251 'uploader_id': 'UCXw-G3eDE9trcvY2sBMM_aA',
4b7df0d3 2252 },
481cc733 2253 'playlist_mincount': 21,
8bdd16b4 2254 }, {
2255 # https://github.com/ytdl-org/youtube-dl/issues/21844
2256 'url': 'https://www.youtube.com/playlist?list=PLzH6n4zXuckpfMu_4Ff8E7Z1behQks5ba',
2257 'info_dict': {
2258 'title': 'Data Analysis with Dr Mike Pound',
2259 'id': 'PLzH6n4zXuckpfMu_4Ff8E7Z1behQks5ba',
2260 'uploader_id': 'UC9-y-6csu5WGm29I7JiwpnA',
2261 'uploader': 'Computerphile',
2262 },
2263 'playlist_mincount': 11,
2264 }, {
a0566bbf 2265 'url': 'https://invidio.us/playlist?list=PL4lCao7KL_QFVb7Iudeipvc2BCavECqzc',
8bdd16b4 2266 'only_matching': True,
dacb3a86
S
2267 }, {
2268 # Playlist URL that does not actually serve a playlist
2269 'url': 'https://www.youtube.com/watch?v=FqZTN594JQw&list=PLMYEtVRpaqY00V9W81Cwmzp6N6vZqfUKD4',
2270 'info_dict': {
2271 'id': 'FqZTN594JQw',
2272 'ext': 'webm',
2273 'title': "Smiley's People 01 detective, Adventure Series, Action",
2274 'uploader': 'STREEM',
2275 'uploader_id': 'UCyPhqAZgwYWZfxElWVbVJng',
ec85ded8 2276 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UCyPhqAZgwYWZfxElWVbVJng',
dacb3a86
S
2277 'upload_date': '20150526',
2278 'license': 'Standard YouTube License',
2279 'description': 'md5:507cdcb5a49ac0da37a920ece610be80',
2280 'categories': ['People & Blogs'],
2281 'tags': list,
dbdaaa23 2282 'view_count': int,
dacb3a86
S
2283 'like_count': int,
2284 'dislike_count': int,
2285 },
2286 'params': {
2287 'skip_download': True,
2288 },
13a75688 2289 'skip': 'This video is not available.',
dacb3a86 2290 'add_ie': [YoutubeIE.ie_key()],
481cc733 2291 }, {
8bdd16b4 2292 'url': 'https://www.youtubekids.com/watch?v=Agk7R8I8o5U&list=PUZ6jURNr1WQZCNHF0ao-c0g',
c0345b82 2293 'only_matching': True,
66b48727 2294 }, {
8bdd16b4 2295 'url': 'https://www.youtube.com/watch?v=MuAGGZNfUkU&list=RDMM',
66b48727 2296 'only_matching': True,
a0566bbf 2297 }, {
2298 'url': 'https://www.youtube.com/channel/UCoMdktPbSTixAyNGwb-UYkQ/live',
2299 'info_dict': {
2300 'id': '9Auq9mYxFEE',
2301 'ext': 'mp4',
2302 'title': 'Watch Sky News live',
2303 'uploader': 'Sky News',
2304 'uploader_id': 'skynews',
2305 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/skynews',
2306 'upload_date': '20191102',
2307 'description': 'md5:78de4e1c2359d0ea3ed829678e38b662',
2308 'categories': ['News & Politics'],
2309 'tags': list,
2310 'like_count': int,
2311 'dislike_count': int,
2312 },
2313 'params': {
2314 'skip_download': True,
2315 },
2316 }, {
2317 'url': 'https://www.youtube.com/user/TheYoungTurks/live',
2318 'info_dict': {
2319 'id': 'a48o2S1cPoo',
2320 'ext': 'mp4',
2321 'title': 'The Young Turks - Live Main Show',
2322 'uploader': 'The Young Turks',
2323 'uploader_id': 'TheYoungTurks',
2324 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/TheYoungTurks',
2325 'upload_date': '20150715',
2326 'license': 'Standard YouTube License',
2327 'description': 'md5:438179573adcdff3c97ebb1ee632b891',
2328 'categories': ['News & Politics'],
2329 'tags': ['Cenk Uygur (TV Program Creator)', 'The Young Turks (Award-Winning Work)', 'Talk Show (TV Genre)'],
2330 'like_count': int,
2331 'dislike_count': int,
2332 },
2333 'params': {
2334 'skip_download': True,
2335 },
2336 'only_matching': True,
2337 }, {
2338 'url': 'https://www.youtube.com/channel/UC1yBKRuGpC1tSM73A0ZjYjQ/live',
2339 'only_matching': True,
2340 }, {
2341 'url': 'https://www.youtube.com/c/CommanderVideoHq/live',
2342 'only_matching': True,
3d3dddc9 2343 }, {
2344 'url': 'https://www.youtube.com/feed/trending',
2345 'only_matching': True,
2346 }, {
2347 # needs auth
2348 'url': 'https://www.youtube.com/feed/library',
2349 'only_matching': True,
2350 }, {
2351 # needs auth
2352 'url': 'https://www.youtube.com/feed/history',
2353 'only_matching': True,
2354 }, {
2355 # needs auth
2356 'url': 'https://www.youtube.com/feed/subscriptions',
2357 'only_matching': True,
2358 }, {
2359 # needs auth
2360 'url': 'https://www.youtube.com/feed/watch_later',
2361 'only_matching': True,
2362 }, {
2363 # no longer available?
2364 'url': 'https://www.youtube.com/feed/recommended',
2365 'only_matching': True,
29f7c58a 2366 }, {
2367 # inline playlist with not always working continuations
2368 'url': 'https://www.youtube.com/watch?v=UC6u0Tct-Fo&list=PL36D642111D65BE7C',
2369 'only_matching': True,
2370 }, {
2371 'url': 'https://www.youtube.com/course?list=ECUl4u3cNGP61MdtwGTqZA0MreSaDybji8',
2372 'only_matching': True,
2373 }, {
2374 'url': 'https://www.youtube.com/course',
2375 'only_matching': True,
2376 }, {
2377 'url': 'https://www.youtube.com/zsecurity',
2378 'only_matching': True,
2379 }, {
2380 'url': 'http://www.youtube.com/NASAgovVideo/videos',
2381 'only_matching': True,
2382 }, {
2383 'url': 'https://www.youtube.com/TheYoungTurks/live',
2384 'only_matching': True,
2385 }]
2386
2387 @classmethod
def suitable(cls, url):
    """Tell whether this extractor should handle *url*.

    Any URL accepted by YoutubeIE is declined here; otherwise the decision
    is delegated to the parent class implementation.
    """
    if YoutubeIE.suitable(url):
        return False
    return super(YoutubeTabIE, cls).suitable(url)
8bdd16b4 2391
2392 def _extract_channel_id(self, webpage):
2393 channel_id = self._html_search_meta(
2394 'channelId', webpage, 'channel id', default=None)
2395 if channel_id:
2396 return channel_id
2397 channel_url = self._html_search_meta(
2398 ('og:url', 'al:ios:url', 'al:android:url', 'al:web:url',
2399 'twitter:url', 'twitter:app:url:iphone', 'twitter:app:url:ipad',
2400 'twitter:app:url:googleplay'), webpage, 'channel url')
2401 return self._search_regex(
2402 r'https?://(?:www\.)?youtube\.com/channel/([^/?#&])+',
2403 channel_url, 'channel id')
15f6397c 2404
8bdd16b4 2405 @staticmethod
2406 def _extract_grid_item_renderer(item):
2407 for item_kind in ('Playlist', 'Video', 'Channel'):
2408 renderer = item.get('grid%sRenderer' % item_kind)
2409 if renderer:
2410 return renderer
2411
def _grid_entries(self, grid_renderer):
    """Yield one result per playlist, video or channel found in a grid.

    Items of ``grid_renderer['items']`` that are not dicts, or that carry no
    dict renderer, are skipped.  A single renderer may produce several
    results when it exposes more than one of the id keys.
    """
    for grid_item in grid_renderer['items']:
        if not isinstance(grid_item, dict):
            continue
        item_renderer = self._extract_grid_item_renderer(grid_item)
        if not isinstance(item_renderer, dict):
            continue
        runs_title = try_get(
            item_renderer, lambda x: x['title']['runs'][0]['text'], compat_str)

        # Playlist item
        list_id = item_renderer.get('playlistId')
        if list_id:
            yield self.url_result(
                'https://www.youtube.com/playlist?list=%s' % list_id,
                ie=YoutubeTabIE.ie_key(), video_id=list_id,
                video_title=runs_title)

        # Video item
        if item_renderer.get('videoId'):
            yield self._extract_video(item_renderer)

        # Channel item; its title is stored as simpleText rather than runs
        owner_id = item_renderer.get('channelId')
        if owner_id:
            simple_title = try_get(
                item_renderer, lambda x: x['title']['simpleText'], compat_str)
            yield self.url_result(
                'https://www.youtube.com/channel/%s' % owner_id,
                ie=YoutubeTabIE.ie_key(), video_title=simple_title)
2440
3d3dddc9 2441 def _shelf_entries_from_content(self, shelf_renderer):
2442 content = shelf_renderer.get('content')
2443 if not isinstance(content, dict):
8bdd16b4 2444 return
3d3dddc9 2445 renderer = content.get('gridRenderer')
2446 if renderer:
2447 # TODO: add support for nested playlists so each shelf is processed
2448 # as separate playlist
2449 # TODO: this includes only first N items
2450 for entry in self._grid_entries(renderer):
2451 yield entry
2452 renderer = content.get('horizontalListRenderer')
2453 if renderer:
2454 # TODO
2455 pass
8bdd16b4 2456
def _shelf_entries(self, shelf_renderer, skip_channels=False):
    """Yield entries for a shelf: its own URL first, then its inline content.

    When *skip_channels* is true and the shelf URL contains ``/channels?``
    the shelf is dropped entirely (nothing is yielded).
    """
    endpoint_path = try_get(
        shelf_renderer, lambda x: x['endpoint']['commandMetadata']['webCommandMetadata']['url'],
        compat_str)
    target = urljoin('https://www.youtube.com', endpoint_path)
    if target:
        # Links to other channels are skipped by looking at the URL itself;
        # checking endpoint.commandMetadata.webCommandMetadata.webPageType
        # against WEB_PAGE_TYPE_CHANNEL will not work.
        if skip_channels and '/channels?' in target:
            return
        shelf_title = try_get(
            shelf_renderer, lambda x: x['title']['runs'][0]['text'], compat_str)
        yield self.url_result(target, video_title=shelf_title)
    # A shelf may lack a URL, so the embedded content is always walked as well
    for inline_entry in self._shelf_entries_from_content(shelf_renderer):
        yield inline_entry
c5e8d7af 2474
8bdd16b4 2475 def _playlist_entries(self, video_list_renderer):
2476 for content in video_list_renderer['contents']:
2477 if not isinstance(content, dict):
2478 continue
2479 renderer = content.get('playlistVideoRenderer') or content.get('playlistPanelVideoRenderer')
2480 if not isinstance(renderer, dict):
2481 continue
2482 video_id = renderer.get('videoId')
2483 if not video_id:
2484 continue
2485 yield self._extract_video(renderer)
07aeced6 2486
3d3dddc9 2487 r""" # Not needed in the new implementation
3462ffa8 2488 def _itemSection_entries(self, item_sect_renderer):
2489 for content in item_sect_renderer['contents']:
2490 if not isinstance(content, dict):
2491 continue
2492 renderer = content.get('videoRenderer', {})
2493 if not isinstance(renderer, dict):
2494 continue
2495 video_id = renderer.get('videoId')
2496 if not video_id:
2497 continue
2498 yield self._extract_video(renderer)
3d3dddc9 2499 """
3462ffa8 2500
def _rich_entries(self, rich_grid_renderer):
    """Yield the single video wrapped by a rich item renderer, if any.

    The video renderer is read from ``['content']['videoRenderer']``;
    nothing is yielded when it is missing or has no ``videoId``.
    """
    video = try_get(
        rich_grid_renderer, lambda x: x['content']['videoRenderer'], dict) or {}
    if video.get('videoId'):
        yield self._extract_video(video)
2508
8bdd16b4 2509 def _video_entry(self, video_renderer):
2510 video_id = video_renderer.get('videoId')
2511 if video_id:
2512 return self._extract_video(video_renderer)
dacb3a86 2513
def _post_thread_entries(self, post_thread_renderer):
    """Yield the videos referenced by a community post.

    Two sources are considered: the video attached to the post
    (``backstageAttachment.videoRenderer``) and any inline links in the post
    text that YoutubeIE can handle.  An inline link pointing at the attached
    video is skipped so the same video is not yielded twice.
    """
    post_renderer = try_get(
        post_thread_renderer, lambda x: x['post']['backstagePostRenderer'], dict)
    if not post_renderer:
        return
    # video attachment
    video_renderer = try_get(
        post_renderer, lambda x: x['backstageAttachment']['videoRenderer'], dict) or {}
    # Remember the attached video's id; it drives the duplicate check below
    video_id = video_renderer.get('videoId')
    if video_id:
        entry = self._video_entry(video_renderer)
        if entry:
            yield entry
    # inline video links
    runs = try_get(post_renderer, lambda x: x['contentText']['runs'], list) or []
    for run in runs:
        if not isinstance(run, dict):
            continue
        ep_url = try_get(
            run, lambda x: x['navigationEndpoint']['urlEndpoint']['url'], compat_str)
        if not ep_url:
            continue
        if not YoutubeIE.suitable(ep_url):
            continue
        ep_video_id = YoutubeIE._match_id(ep_url)
        if video_id == ep_video_id:
            continue
        # Report the id of the linked video, not that of the attachment
        yield self.url_result(ep_url, ie=YoutubeIE.ie_key(), video_id=ep_video_id)
dacb3a86 2542
8bdd16b4 2543 def _post_thread_continuation_entries(self, post_thread_continuation):
2544 contents = post_thread_continuation.get('contents')
2545 if not isinstance(contents, list):
2546 return
2547 for content in contents:
2548 renderer = content.get('backstagePostThreadRenderer')
2549 if not isinstance(renderer, dict):
2550 continue
2551 for entry in self._post_thread_entries(renderer):
2552 yield entry
07aeced6 2553
29f7c58a 2554 @staticmethod
2555 def _build_continuation_query(continuation, ctp=None):
2556 query = {
2557 'ctoken': continuation,
2558 'continuation': continuation,
2559 }
2560 if ctp:
2561 query['itct'] = ctp
2562 return query
2563
8bdd16b4 2564 @staticmethod
def _extract_next_continuation_data(renderer):
    """Return a continuation query built from ``nextContinuationData``.

    The data is looked up at ``renderer['continuations'][0]``.  ``None`` is
    returned when it is missing or carries no ``continuation`` token.
    """
    next_data = try_get(
        renderer, lambda x: x['continuations'][0]['nextContinuationData'], dict) or {}
    token = next_data.get('continuation')
    if not token:
        return None
    return YoutubeTabIE._build_continuation_query(
        token, next_data.get('clickTrackingParams'))
c5e8d7af 2575
8bdd16b4 2576 @classmethod
def _extract_continuation(cls, renderer):
    """Return the query for the next page of *renderer*, or ``None``.

    ``nextContinuationData`` is tried first.  Failing that, the renderer's
    ``contents`` list is scanned for the first ``continuationItemRenderer``
    whose endpoint carries a continuation command token.
    """
    query = cls._extract_next_continuation_data(renderer)
    if query:
        return query
    items = renderer.get('contents')
    if not isinstance(items, list):
        return None
    for item in items:
        if not isinstance(item, dict):
            continue
        endpoint = try_get(
            item, lambda x: x['continuationItemRenderer']['continuationEndpoint'],
            dict)
        if not endpoint:
            continue
        token = try_get(
            endpoint, lambda x: x['continuationCommand']['token'], compat_str)
        if token:
            return YoutubeTabIE._build_continuation_query(
                token, endpoint.get('clickTrackingParams'))
    return None
448830ce 2598
def _entries(self, tab, identity_token):
    """Yield every entry of *tab*, following continuation pages.

    The initial entries come from the tab's ``sectionListRenderer`` or
    ``richGridRenderer``.  Further pages are then fetched from
    ``browse_ajax`` for as long as a continuation query can be extracted.

    *identity_token*, when truthy, is sent as the
    ``x-youtube-identity-token`` header on continuation requests.
    """

    def extract_entries(parent_renderer):  # this needs to called again for continuation to work with feeds
        # Walks one section-list style renderer, yielding its entries and
        # recording the most relevant continuation in continuation_list[0].
        contents = try_get(parent_renderer, lambda x: x['contents'], list) or []
        for content in contents:
            if not isinstance(content, dict):
                continue
            is_renderer = try_get(content, lambda x: x['itemSectionRenderer'], dict)
            if not is_renderer:
                renderer = content.get('richItemRenderer')
                if renderer:
                    for entry in self._rich_entries(renderer):
                        yield entry
                    continuation_list[0] = self._extract_continuation(parent_renderer)
                continue
            isr_contents = try_get(is_renderer, lambda x: x['contents'], list) or []
            for isr_content in isr_contents:
                if not isinstance(isr_content, dict):
                    continue
                renderer = isr_content.get('playlistVideoListRenderer')
                if renderer:
                    for entry in self._playlist_entries(renderer):
                        yield entry
                    continuation_list[0] = self._extract_continuation(renderer)
                    continue
                renderer = isr_content.get('gridRenderer')
                if renderer:
                    for entry in self._grid_entries(renderer):
                        yield entry
                    continuation_list[0] = self._extract_continuation(renderer)
                    continue
                renderer = isr_content.get('shelfRenderer')
                if renderer:
                    # Shelves linking to other channels are only kept on
                    # the "Channels" tab
                    is_channels_tab = tab.get('title') == 'Channels'
                    for entry in self._shelf_entries(renderer, not is_channels_tab):
                        yield entry
                    continue
                renderer = isr_content.get('backstagePostThreadRenderer')
                if renderer:
                    for entry in self._post_thread_entries(renderer):
                        yield entry
                    continuation_list[0] = self._extract_continuation(renderer)
                    continue
                renderer = isr_content.get('videoRenderer')
                if renderer:
                    entry = self._video_entry(renderer)
                    if entry:
                        yield entry

            # Fall back to the continuation of the item section itself
            if not continuation_list[0]:
                continuation_list[0] = self._extract_continuation(is_renderer)

        # Last resort: the continuation of the parent renderer
        if not continuation_list[0]:
            continuation_list[0] = self._extract_continuation(parent_renderer)

    # One-element list used as a mutable cell shared with extract_entries
    continuation_list = [None]  # Python 2 does not support nonlocal
    tab_content = try_get(tab, lambda x: x['content'], dict)
    if not tab_content:
        return
    parent_renderer = (
        try_get(tab_content, lambda x: x['sectionListRenderer'], dict)
        or try_get(tab_content, lambda x: x['richGridRenderer'], dict) or {})
    for entry in extract_entries(parent_renderer):
        yield entry
    continuation = continuation_list[0]

    headers = {
        'x-youtube-client-name': '1',
        'x-youtube-client-version': '2.20201112.04.01',
    }
    if identity_token:
        headers['x-youtube-identity-token'] = identity_token

    for page_num in itertools.count(1):
        if not continuation:
            break
        count = 0
        retries = 3
        while count <= retries:
            try:
                # Downloading page may result in intermittent 5xx HTTP error
                # that is usually worked around with a retry
                browse = self._download_json(
                    'https://www.youtube.com/browse_ajax', None,
                    'Downloading page %d%s'
                    % (page_num, ' (retry #%d)' % count if count else ''),
                    headers=headers, query=continuation)
                break
            except ExtractorError as e:
                if isinstance(e.cause, compat_HTTPError) and e.cause.code in (500, 503):
                    count += 1
                    if count <= retries:
                        continue
                raise
        if not browse:
            break
        response = try_get(browse, lambda x: x[1]['response'], dict)
        if not response:
            break

        continuation_contents = try_get(
            response, lambda x: x['continuationContents'], dict)
        if continuation_contents:
            continuation_renderer = continuation_contents.get('playlistVideoListContinuation')
            if continuation_renderer:
                for entry in self._playlist_entries(continuation_renderer):
                    yield entry
                continuation = self._extract_continuation(continuation_renderer)
                continue
            continuation_renderer = continuation_contents.get('gridContinuation')
            if continuation_renderer:
                for entry in self._grid_entries(continuation_renderer):
                    yield entry
                continuation = self._extract_continuation(continuation_renderer)
                continue
            continuation_renderer = continuation_contents.get('itemSectionContinuation')
            if continuation_renderer:
                for entry in self._post_thread_continuation_entries(continuation_renderer):
                    yield entry
                continuation = self._extract_continuation(continuation_renderer)
                continue
            continuation_renderer = continuation_contents.get('sectionListContinuation')  # for feeds
            if continuation_renderer:
                continuation_list[0] = None
                for entry in extract_entries(continuation_renderer):
                    yield entry
                continuation = continuation_list[0]
                continue

        continuation_items = try_get(
            response, lambda x: x['onResponseReceivedActions'][0]['appendContinuationItemsAction']['continuationItems'], list)
        if continuation_items:
            continuation_item = continuation_items[0]
            if not isinstance(continuation_item, dict):
                # Stop here: the continuation query has not changed, so
                # going round again would request this same page forever.
                break
            renderer = continuation_item.get('playlistVideoRenderer') or continuation_item.get('itemSectionRenderer')
            if renderer:
                video_list_renderer = {'contents': continuation_items}
                for entry in self._playlist_entries(video_list_renderer):
                    yield entry
                continuation = self._extract_continuation(video_list_renderer)
                continue
        # Unrecognised response layout: nothing more can be extracted
        break
9558dcec 2742
8bdd16b4 2743 @staticmethod
def _extract_selected_tab(tabs):
    """Return the ``tabRenderer`` of the first tab flagged as selected.

    Raises ExtractorError when no tab has ``tabRenderer.selected`` set to
    a true boolean.
    """
    selected = next(
        (candidate for candidate in tabs
         if try_get(candidate, lambda x: x['tabRenderer']['selected'], bool)),
        None)
    if selected is None:
        raise ExtractorError('Unable to find selected tab')
    return selected['tabRenderer']
b82f815f 2750
8bdd16b4 2751 @staticmethod
def _extract_uploader(data):
    """Collect uploader name, id and URL from the playlist sidebar.

    The owner is read from each ``playlistSidebarSecondaryInfoRenderer``
    item; when several items provide one, the last wins.  Keys whose value
    is ``None`` are left out of the returned dict.
    """
    found = {}
    sidebar_items = try_get(
        data, lambda x: x['sidebar']['playlistSidebarRenderer']['items'], list)
    for sidebar_item in sidebar_items or []:
        if not isinstance(sidebar_item, dict):
            continue
        secondary = sidebar_item.get('playlistSidebarSecondaryInfoRenderer')
        if not isinstance(secondary, dict):
            continue
        owner = try_get(
            secondary, lambda x: x['videoOwner']['videoOwnerRenderer']['title']['runs'][0], dict)
        if not owner:
            continue
        base_url = try_get(
            owner, lambda x: x['navigationEndpoint']['browseEndpoint']['canonicalBaseUrl'], compat_str)
        found['uploader'] = owner.get('text')
        found['uploader_id'] = try_get(
            owner, lambda x: x['navigationEndpoint']['browseEndpoint']['browseId'], compat_str)
        found['uploader_url'] = urljoin('https://www.youtube.com/', base_url)
    return dict((key, value) for key, value in found.items() if value is not None)
8bdd16b4 2773
def _extract_from_tabs(self, item_id, webpage, data, tabs, identity_token):
    """Build the playlist result for a tabbed (channel / playlist) page.

    Metadata is taken from ``channelMetadataRenderer`` when present, else
    from ``playlistMetadataRenderer``.  When no channel id is found this
    way, uploader fields are filled in from the playlist sidebar.  Entries
    are produced lazily from the selected tab.

    *webpage* is accepted for interface compatibility and is not used.
    """
    playlist_id = title = description = channel_url = channel_name = channel_id = None
    # Separate objects: a shared list would silently couple the two values
    thumbnails_list = []
    tags = []

    selected_tab = self._extract_selected_tab(tabs)
    renderer = try_get(
        data, lambda x: x['metadata']['channelMetadataRenderer'], dict)
    if renderer:
        channel_name = renderer.get('title')
        channel_url = renderer.get('channelUrl')
        channel_id = renderer.get('externalId')

    if not renderer:
        renderer = try_get(
            data, lambda x: x['metadata']['playlistMetadataRenderer'], dict)
    if renderer:
        title = renderer.get('title')
        description = renderer.get('description')
        playlist_id = channel_id
        # 'keywords' may be missing or null
        tags = (renderer.get('keywords') or '').split()
        # The sidebar is optional, so the fallback lookup must not raise
        thumbnails_list = (
            try_get(renderer, lambda x: x['avatar']['thumbnails'], list)
            or try_get(
                data,
                lambda x: x['sidebar']['playlistSidebarRenderer']['items'][0]['playlistSidebarPrimaryInfoRenderer']['thumbnailRenderer']['playlistVideoThumbnailRenderer']['thumbnail']['thumbnails'],
                list)
            or [])

    thumbnails = []
    for t in thumbnails_list:
        if not isinstance(t, dict):
            continue
        thumbnail_url = url_or_none(t.get('url'))
        if not thumbnail_url:
            continue
        thumbnails.append({
            'url': thumbnail_url,
            'width': int_or_none(t.get('width')),
            'height': int_or_none(t.get('height')),
        })

    if playlist_id is None:
        playlist_id = item_id
    if title is None:
        title = playlist_id
    # Suffix the title with the selected tab's title when it has one
    title += format_field(selected_tab, 'title', ' - %s')

    metadata = {
        'playlist_id': playlist_id,
        'playlist_title': title,
        'playlist_description': description,
        'uploader': channel_name,
        'uploader_id': channel_id,
        'uploader_url': channel_url,
        'thumbnails': thumbnails,
        'tags': tags,
    }
    if not channel_id:
        metadata.update(self._extract_uploader(data))
    # Mirror the uploader fields under the channel_* keys
    metadata.update({
        'channel': metadata['uploader'],
        'channel_id': metadata['uploader_id'],
        'channel_url': metadata['uploader_url']})
    return self.playlist_result(
        self._entries(selected_tab, identity_token),
        **metadata)
73c4ac2c 2837
def _extract_from_playlist(self, item_id, url, data, playlist):
    """Return a result for an inline (watch page) playlist.

    If the playlist exposes its own URL and that URL differs from *url*,
    extraction is delegated to it; otherwise the inline videos are used.
    """
    list_title = playlist.get('title') or try_get(
        data, lambda x: x['titleText']['simpleText'], compat_str)
    list_id = playlist.get('playlistId') or item_id
    # Inline playlist rendition continuation does not always work
    # at Youtube side, so the regular tab-based playlist URL is preferred
    # whenever one is available.
    canonical_url = urljoin(url, try_get(
        playlist, lambda x: x['endpoint']['commandMetadata']['webCommandMetadata']['url'],
        compat_str))
    if not canonical_url or canonical_url == url:
        return self.playlist_result(
            self._playlist_entries(playlist), playlist_id=list_id,
            playlist_title=list_title)
    return self.url_result(
        canonical_url, ie=YoutubeTabIE.ie_key(), video_id=list_id,
        video_title=list_title)
c5e8d7af 2855
29f7c58a 2856 @staticmethod
def _extract_alerts(data):
    """Yield ``(alert_type, message)`` pairs found in ``data['alerts']``.

    Each alert may carry its text as ``simpleText`` and/or as a list of
    ``runs``; one pair is yielded per non-empty message.  Entries that are
    not dicts, and alerts without a ``type``, are skipped.
    """
    for alert_dict in try_get(data, lambda x: x['alerts'], list) or []:
        if not isinstance(alert_dict, dict):
            continue
        for alert in alert_dict.values():
            # The renderer value itself may be malformed
            if not isinstance(alert, dict):
                continue
            alert_type = alert.get('type')
            if not alert_type:
                continue
            message = try_get(alert, lambda x: x['text']['simpleText'], compat_str)
            if message:
                yield alert_type, message
            for run in try_get(alert, lambda x: x['text']['runs'], list) or []:
                message = try_get(run, lambda x: x['text'], compat_str)
                if message:
                    yield alert_type, message
2873
def _extract_identity_token(self, webpage, item_id):
    """Return the ``ID_TOKEN`` value for the page, or ``None``.

    The parsed ytcfg is consulted first; the raw *webpage* is searched with
    a regular expression as a fallback.
    """
    ytcfg = self._extract_ytcfg(item_id, webpage)
    cfg_token = try_get(ytcfg, lambda x: x['ID_TOKEN'], compat_str) if ytcfg else None
    if cfg_token:
        return cfg_token
    return self._search_regex(
        r'\bID_TOKEN["\']\s*:\s*["\'](.+?)["\']', webpage,
        'identity token', default=None)
2883
def _real_extract(self, url):
    """Extract a tab, playlist or (as a fallback) single video from *url*.

    The host is normalised to www.youtube.com.  A bare channel/user URL is
    redirected to its ``/videos`` tab, and a ``watch`` URL without a video
    id falls back to its playlist.  Alerts reported by the page are shown
    as warnings; an error alert aborts the extraction.
    """
    item_id = self._match_id(url)
    url = compat_urlparse.urlunparse(
        compat_urlparse.urlparse(url)._replace(netloc='www.youtube.com'))
    is_home = re.match(r'(?P<pre>%s)(?P<post>/?(?![^#?]).*$)' % self._VALID_URL, url)
    matched = is_home is not None
    not_channel = is_home.group('not_channel') if matched else None
    if matched and not_channel is None and item_id != 'feed':
        self._downloader.report_warning(
            'A channel/user page was given. All the channel\'s videos will be downloaded. '
            'To download only the videos in the home page, add a "/featured" to the URL')
        url = '%s/videos%s' % (is_home.group('pre'), is_home.group('post') or '')

    # Both a video id and a playlist id may be present in the query
    query = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query)
    video_id = query.get('v', [None])[0]
    playlist_id = query.get('list', [None])[0]

    is_watch_page = matched and not_channel is not None and not_channel.startswith('watch')
    if is_watch_page and not video_id:
        if not playlist_id:
            raise ExtractorError('Unable to recognize tab page')
        self._downloader.report_warning('%s is not a valid Youtube URL. Trying to download playlist %s' % (url, playlist_id))
        url = 'https://www.youtube.com/playlist?list=%s' % playlist_id
        # return self.url_result(playlist_id, ie=YoutubePlaylistIE.ie_key())
    if video_id and playlist_id:
        if self._downloader.params.get('noplaylist'):
            self.to_screen('Downloading just video %s because of --no-playlist' % video_id)
            return self.url_result(video_id, ie=YoutubeIE.ie_key(), video_id=video_id)
        self.to_screen('Downloading playlist %s - add --no-playlist to just download video %s' % (playlist_id, video_id))

    webpage = self._download_webpage(url, item_id)
    identity_token = self._extract_identity_token(webpage, item_id)
    data = self._extract_yt_initial_data(item_id, webpage)

    # Only the last error alert is raised; earlier ones become warnings
    pending_error = None
    for kind, text in self._extract_alerts(data):
        if kind.lower() != 'error':
            self._downloader.report_warning('YouTube said: %s - %s' % (kind, text))
            continue
        if pending_error:
            self._downloader.report_warning('YouTube said: %s - %s' % ('ERROR', pending_error))
        pending_error = text
    if pending_error:
        raise ExtractorError('YouTube said: %s' % pending_error, expected=True)

    tabs = try_get(
        data, lambda x: x['contents']['twoColumnBrowseResultsRenderer']['tabs'], list)
    if tabs:
        return self._extract_from_tabs(item_id, webpage, data, tabs, identity_token)
    playlist = try_get(
        data, lambda x: x['contents']['twoColumnWatchNextResults']['playlist']['playlist'], dict)
    if playlist:
        return self._extract_from_playlist(item_id, url, data, playlist)
    # Fallback to video extraction if no playlist alike page is recognized.
    # The current video is checked first, then the v attribute of URL query.
    video_id = try_get(
        data, lambda x: x['currentVideoEndpoint']['watchEndpoint']['videoId'],
        compat_str) or video_id
    if not video_id:
        # Failed to recognize
        raise ExtractorError('Unable to recognize tab page')
    return self.url_result(video_id, ie=YoutubeIE.ie_key(), video_id=video_id)
c5e8d7af 2943
c5e8d7af 2944
class YoutubePlaylistIE(InfoExtractor):
    """Resolve playlist ids / playlist URLs and hand them to YoutubeTabIE."""

    IE_NAME = 'youtube:playlist'
    IE_DESC = 'YouTube.com playlists'
    _VALID_URL = r'''(?x)(?:
                        (?:https?://)?
                        (?:\w+\.)?
                        (?:
                            (?:
                                youtube(?:kids)?\.com|
                                invidio\.us
                            )
                            /.*?\?.*?\blist=
                        )?
                        (?P<id>%(playlist_id)s)
                     )''' % {'playlist_id': YoutubeBaseInfoExtractor._PLAYLIST_ID_RE}
    _TESTS = [{
        'note': 'issue #673',
        'url': 'PLBB231211A4F62143',
        'info_dict': {
            'title': '[OLD]Team Fortress 2 (Class-based LP)',
            'id': 'PLBB231211A4F62143',
            'uploader': 'Wickydoo',
            'uploader_id': 'UCKSpbfbl5kRQpTdL7kMc-1Q',
        },
        'playlist_mincount': 29,
    }, {
        'url': 'PLtPgu7CB4gbY9oDN3drwC3cMbJggS7dKl',
        'info_dict': {
            'title': 'YDL_safe_search',
            'id': 'PLtPgu7CB4gbY9oDN3drwC3cMbJggS7dKl',
        },
        'playlist_count': 2,
        'skip': 'This playlist is private',
    }, {
        'note': 'embedded',
        'url': 'https://www.youtube.com/embed/videoseries?list=PL6IaIsEjSbf96XFRuNccS_RuEXwNdsoEu',
        'playlist_count': 4,
        'info_dict': {
            'title': 'JODA15',
            'id': 'PL6IaIsEjSbf96XFRuNccS_RuEXwNdsoEu',
            'uploader': 'milan',
            'uploader_id': 'UCEI1-PVPcYXjB73Hfelbmaw',
        }
    }, {
        'url': 'http://www.youtube.com/embed/_xDOZElKyNU?list=PLsyOSbh5bs16vubvKePAQ1x3PhKavfBIl',
        'playlist_mincount': 982,
        'info_dict': {
            'title': '2018 Chinese New Singles (11/6 updated)',
            'id': 'PLsyOSbh5bs16vubvKePAQ1x3PhKavfBIl',
            'uploader': 'LBK',
            'uploader_id': 'UC21nz3_MesPLqtDqwdvnoxA',
        }
    }, {
        'url': 'TLGGrESM50VT6acwMjAyMjAxNw',
        'only_matching': True,
    }, {
        # music album playlist
        'url': 'OLAK5uy_m4xAFdmMC5rX3Ji3g93pQe3hqLZw_9LhM',
        'only_matching': True,
    }]

    @classmethod
    def suitable(cls, url):
        """Decline any URL YoutubeTabIE accepts; otherwise use the default check."""
        if YoutubeTabIE.suitable(url):
            return False
        return super(YoutubePlaylistIE, cls).suitable(url)

    def _real_extract(self, url):
        """Redirect to the canonical playlist page, keeping the URL's query.

        When *url* has no query string (e.g. a bare playlist id) the query
        is built from the matched playlist id.
        """
        playlist_id = self._match_id(url)
        query = (
            compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query)
            or {'list': playlist_id})
        target = update_url_query('https://www.youtube.com/playlist', query)
        return self.url_result(
            target, ie=YoutubeTabIE.ie_key(), video_id=playlist_id)
3019
3020
class YoutubeYtBeIE(InfoExtractor):
    """Handle youtu.be short links that also carry a playlist id."""

    IE_DESC = 'youtu.be'
    _VALID_URL = r'https?://youtu\.be/(?P<id>[0-9A-Za-z_-]{11})/*?.*?\blist=(?P<playlist_id>%(playlist_id)s)' % {'playlist_id': YoutubeBaseInfoExtractor._PLAYLIST_ID_RE}
    _TESTS = [{
        'url': 'https://youtu.be/yeWKywCrFtk?list=PL2qgrgXsNUG5ig9cat4ohreBjYLAPC0J5',
        'info_dict': {
            'id': 'yeWKywCrFtk',
            'ext': 'mp4',
            'title': 'Small Scale Baler and Braiding Rugs',
            'uploader': 'Backus-Page House Museum',
            'uploader_id': 'backuspagemuseum',
            'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/backuspagemuseum',
            'upload_date': '20161008',
            'description': 'md5:800c0c78d5eb128500bffd4f0b4f2e8a',
            'categories': ['Nonprofits & Activism'],
            'tags': list,
            'like_count': int,
            'dislike_count': int,
        },
        'params': {
            'noplaylist': True,
            'skip_download': True,
        },
    }, {
        'url': 'https://youtu.be/uWyaPkt-VOI?list=PL9D9FC436B881BA21',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Rewrite the short link as a full watch URL and delegate to YoutubeTabIE."""
        match = re.match(self._VALID_URL, url)
        playlist_id = match.group('playlist_id')
        watch_query = {
            'v': match.group('id'),
            'list': playlist_id,
            'feature': 'youtu.be',
        }
        watch_url = update_url_query('https://www.youtube.com/watch', watch_query)
        return self.url_result(
            watch_url, ie=YoutubeTabIE.ie_key(), video_id=playlist_id)
8bdd16b4 3059
3060
class YoutubeYtUserIE(InfoExtractor):
    """Map the ``ytuser:<name>`` shorthand onto the user's channel page."""

    IE_DESC = 'YouTube.com user videos, URL or "ytuser" keyword'
    _VALID_URL = r'ytuser:(?P<id>.+)'
    _TESTS = [{
        'url': 'ytuser:phihag',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Delegate to YoutubeTabIE with the ``/user/<name>`` URL."""
        user_id = self._match_id(url)
        user_url = 'https://www.youtube.com/user/%s' % user_id
        return self.url_result(
            user_url, ie=YoutubeTabIE.ie_key(), video_id=user_id)
9558dcec 3074
b05654f0 3075
class YoutubeFavouritesIE(YoutubeBaseInfoExtractor):
    """Map the ``:ytfav`` family of keywords onto the "LL" playlist."""

    IE_NAME = 'youtube:favorites'
    IE_DESC = 'YouTube.com liked videos, ":ytfav" for short (requires authentication)'
    _VALID_URL = r':ytfav(?:ou?rite)?s?'
    _LOGIN_REQUIRED = True
    _TESTS = [{
        'url': ':ytfav',
        'only_matching': True,
    }, {
        'url': ':ytfavorites',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Delegate to YoutubeTabIE with the playlist URL for list ``LL``."""
        liked_url = 'https://www.youtube.com/playlist?list=LL'
        return self.url_result(liked_url, ie=YoutubeTabIE.ie_key())
3093
3094
class YoutubeSearchIE(SearchInfoExtractor, YoutubeBaseInfoExtractor):
    """Search extractor backed by the ``youtubei/v1/search`` JSON endpoint."""

    IE_DESC = 'YouTube.com searches'
    # there doesn't appear to be a real limit, for example if you search for
    # 'python' you get more than 8.000.000 results
    _MAX_RESULTS = float('inf')
    IE_NAME = 'youtube:search'
    _SEARCH_KEY = 'ytsearch'
    # Optional value sent as the request's 'params' field (set by subclasses)
    _SEARCH_PARAMS = None
    _TESTS = []

    def _entries(self, query, n):
        """Yield up to *n* video results for *query*, paging as needed.

        Paging stops when a page cannot be downloaded, contains no section
        list, or provides no continuation token.
        """
        data = {
            'context': {
                'client': {
                    'clientName': 'WEB',
                    'clientVersion': '2.20201021.03.00',
                }
            },
            'query': query,
        }
        if self._SEARCH_PARAMS:
            data['params'] = self._SEARCH_PARAMS
        total = 0
        for page_num in itertools.count(1):
            search = self._download_json(
                'https://www.youtube.com/youtubei/v1/search?key=AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8',
                video_id='query "%s"' % query,
                note='Downloading page %s' % page_num,
                errnote='Unable to download API page', fatal=False,
                data=json.dumps(data).encode('utf8'),
                headers={'content-type': 'application/json'})
            if not search:
                break
            # First page and continuation pages use different layouts
            slr_contents = try_get(
                search,
                (lambda x: x['contents']['twoColumnSearchResultsRenderer']['primaryContents']['sectionListRenderer']['contents'],
                 lambda x: x['onResponseReceivedCommands'][0]['appendContinuationItemsAction']['continuationItems']),
                list)
            if not slr_contents:
                break

            # Youtube sometimes adds promoted content to searches,
            # changing the index location of videos and token.
            # So we search through all entries till we find them.
            continuation_token = None
            for slr_content in slr_contents:
                # Look for the token before anything else: the item that
                # carries it has no itemSectionRenderer and would otherwise
                # be skipped by the check below.
                if continuation_token is None:
                    continuation_token = try_get(
                        slr_content,
                        lambda x: x['continuationItemRenderer']['continuationEndpoint']['continuationCommand']['token'],
                        compat_str)

                isr_contents = try_get(
                    slr_content,
                    lambda x: x['itemSectionRenderer']['contents'],
                    list)
                if not isr_contents:
                    continue
                for content in isr_contents:
                    if not isinstance(content, dict):
                        continue
                    video = content.get('videoRenderer')
                    if not isinstance(video, dict):
                        continue
                    video_id = video.get('videoId')
                    if not video_id:
                        continue

                    yield self._extract_video(video)
                    total += 1
                    if total == n:
                        return

            if not continuation_token:
                break
            data['continuation'] = continuation_token

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""
        return self.playlist_result(self._entries(query, n), query)
75dff0ee 3175
c9ae7b95 3176
class YoutubeSearchDateIE(YoutubeSearchIE):
    """Variant of YoutubeSearchIE that uses the ``ytsearchdate`` keyword."""

    IE_DESC = 'YouTube.com searches, newest videos first, "ytsearchdate" keyword'
    IE_NAME = YoutubeSearchIE.IE_NAME + ':date'
    _SEARCH_KEY = 'ytsearchdate'
    # Sent as the 'params' field of the search request
    _SEARCH_PARAMS = 'CAI%3D'
75dff0ee 3182
c9ae7b95 3183
class YoutubeSearchURLIE(YoutubeSearchIE):
    """Run a search described by a ``youtube.com/results`` URL."""

    IE_NAME = YoutubeSearchIE.IE_NAME + '_url'
    IE_DESC = 'YouTube.com searches, "ytsearch" keyword'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/results\?(.*?&)?(?:search_query|q)=(?:[^&]+)(?:[&]|$)'
    # _MAX_RESULTS = 100
    _TESTS = [{
        'url': 'https://www.youtube.com/results?baz=bar&search_query=youtube-dl+test+video&filters=video&lclk=video',
        'playlist_mincount': 5,
        'info_dict': {
            'title': 'youtube-dl test video',
        }
    }, {
        'url': 'https://www.youtube.com/results?q=test&sp=EgQIBBgB',
        'only_matching': True,
    }]

    @classmethod
    def _make_valid_url(cls):
        """Return ``_VALID_URL`` unchanged."""
        return cls._VALID_URL

    def _real_extract(self, url):
        """Read the query and ``sp`` parameter from *url* and run the search.

        The search terms come from ``search_query`` or, failing that, ``q``.
        The first ``sp`` value (empty string when absent) is stored in
        ``_SEARCH_PARAMS`` on this instance.
        """
        params = compat_parse_qs(compat_urllib_parse_urlparse(url).query)
        terms = params.get('search_query') or params.get('q')
        query = terms[0]
        self._SEARCH_PARAMS = params.get('sp', ('',))[0]
        return self._get_n_results(query, self._MAX_RESULTS)
3462ffa8 3209
3210
class YoutubeFeedsInfoExtractor(YoutubeTabIE):
    """
    Common base for the feed extractors.

    A subclass only has to provide _FEED_NAME; both the extractor name and
    the feed URL handed over to YoutubeTabIE are derived from it.
    """
    _TESTS = []
    _LOGIN_REQUIRED = True

    def _real_initialize(self):
        self._login()

    @property
    def IE_NAME(self):
        feed = self._FEED_NAME
        return 'youtube:%s' % feed

    def _real_extract(self, url):
        # The incoming url is not used: the target only depends on the feed
        # name of the concrete subclass.
        feed_url = 'https://www.youtube.com/feed/%s' % self._FEED_NAME
        return self.url_result(feed_url, ie=YoutubeTabIE.ie_key())
25f14e9f
S
3231
3232
class YoutubeWatchLaterIE(InfoExtractor):
    # Shortcut extractor: ":ytwatchlater" is redirected to the "WL" playlist
    # URL, which is then handled by YoutubeTabIE.
    _VALID_URL = r':ytwatchlater'
    IE_NAME = 'youtube:watchlater'
    IE_DESC = 'Youtube watch later list, ":ytwatchlater" for short (requires authentication)'
    _TESTS = [{
        'url': ':ytwatchlater',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        # The matched url carries no information; the target is fixed.
        playlist_url = 'https://www.youtube.com/playlist?list=WL'
        return self.url_result(playlist_url, ie=YoutubeTabIE.ie_key())
3462ffa8 3245
3246
25f14e9f
S
class YoutubeRecommendedIE(YoutubeFeedsInfoExtractor):
    # Feed name used by the base class to build the extractor name and URL.
    _FEED_NAME = 'recommended'
    # Accepts the bare youtube.com front page (optionally followed by "?" or
    # "#") as well as the ":ytrec" / ":ytrecommended" shortcuts.
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/?(?:[?#]|$)|:ytrec(?:ommended)?'
    IE_DESC = 'YouTube.com recommended videos, ":ytrec" for short (requires authentication)'
    _TESTS = [{
        'url': ':ytrec',
        'only_matching': True,
    }, {
        'url': ':ytrecommended',
        'only_matching': True,
    }, {
        'url': 'https://youtube.com',
        'only_matching': True,
    }]
1ed5b5c9 3261
1ed5b5c9 3262
class YoutubeSubscriptionsIE(YoutubeFeedsInfoExtractor):
    # Feed name used by the base class to build the extractor name and URL.
    _FEED_NAME = 'subscriptions'
    # Accepts ":ytsub", ":ytsubs", ":ytsubscription" and ":ytsubscriptions".
    _VALID_URL = r':ytsub(?:scription)?s?'
    IE_DESC = 'YouTube.com subscriptions feed, ":ytsubs" for short (requires authentication)'
    _TESTS = [{
        'url': ':ytsubs',
        'only_matching': True,
    }, {
        'url': ':ytsubscriptions',
        'only_matching': True,
    }]
1ed5b5c9 3274
1ed5b5c9 3275
25f14e9f
S
class YoutubeHistoryIE(YoutubeFeedsInfoExtractor):
    # Feed name used by the base class to build the extractor name and URL.
    _FEED_NAME = 'history'
    _VALID_URL = r':ythistory'
    IE_DESC = 'Youtube watch history, ":ythistory" for short (requires authentication)'
    _TESTS = [{
        'url': ':ythistory',
        'only_matching': True,
    }]
1ed5b5c9
JMF
3284
3285
15870e90
PH
class YoutubeTruncatedURLIE(InfoExtractor):
    # Catches watch/attribution_link URLs that end right after a single
    # non-video parameter (or with no parameter at all) and answers with a
    # hint about shell quoting instead of attempting an extraction.
    IE_NAME = 'youtube:truncated_url'
    IE_DESC = False  # Do not list
    _VALID_URL = r'''(?x)
        (?:https?://)?
        (?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie)?\.com/
        (?:watch\?(?:
            feature=[a-z_]+|
            annotation_id=annotation_[^&]+|
            x-yt-cl=[0-9]+|
            hl=[^&]*|
            t=[0-9]+
        )?
        |
            attribution_link\?a=[^&]+
        )
        $
    '''

    _TESTS = [{
        'url': 'https://www.youtube.com/watch?annotation_id=annotation_3951667041',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/watch?',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/watch?x-yt-cl=84503534',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/watch?feature=foo',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/watch?hl=en-GB',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/watch?t=2372',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        # Always fails: the URL matched the "cut off at &" pattern above.
        quoting_hint = (
            'Did you forget to quote the URL? Remember that & is a meta '
            'character in most shells, so you want to put the URL in quotes, '
            'like youtube-dl '
            '"https://www.youtube.com/watch?feature=foo&v=BaW_jenozKc" '
            ' or simply youtube-dl BaW_jenozKc .')
        raise ExtractorError(quoting_hint, expected=True)
772fd5cc
PH
3333
3334
class YoutubeTruncatedIDIE(InfoExtractor):
    # Catches watch URLs whose "v" value is only 1 to 10 characters long and
    # ends the URL, and reports them as truncated.
    IE_NAME = 'youtube:truncated_id'
    IE_DESC = False  # Do not list
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/watch\?v=(?P<id>[0-9A-Za-z_-]{1,10})$'

    _TESTS = [{
        'url': 'https://www.youtube.com/watch?v=N_708QY7Ob',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        # Always fails; presumably _match_id returns the "id" group of
        # _VALID_URL (helper not visible here).
        short_id = self._match_id(url)
        message = 'Incomplete YouTube ID %s. URL %s looks truncated.' % (short_id, url)
        raise ExtractorError(message, expected=True)