]> jfr.im git - yt-dlp.git/blame - youtube_dlc/extractor/youtube.py
#72 Fix issue with unicode filenames in aria2c (Closes #71)
[yt-dlp.git] / youtube_dlc / extractor / youtube.py
CommitLineData
c5e8d7af 1# coding: utf-8
c5e8d7af 2
78caa52a
PH
3from __future__ import unicode_literals
4
0ca96d48 5import itertools
c5e8d7af 6import json
c4417ddb 7import os.path
d77ab8e2 8import random
c5e8d7af 9import re
8a784c74 10import time
e0df6211 11import traceback
c5e8d7af 12
b05654f0 13from .common import InfoExtractor, SearchInfoExtractor
4bb4a188 14from ..compat import (
edf3e38e 15 compat_chr,
29f7c58a 16 compat_HTTPError,
8d81f3e3 17 compat_kwargs,
c5e8d7af 18 compat_parse_qs,
545cc85d 19 compat_str,
7fd002c0 20 compat_urllib_parse_unquote_plus,
15707c7e 21 compat_urllib_parse_urlencode,
7c80519c 22 compat_urllib_parse_urlparse,
7c61bd36 23 compat_urlparse,
4bb4a188 24)
545cc85d 25from ..jsinterp import JSInterpreter
4bb4a188 26from ..utils import (
c5e8d7af 27 clean_html,
c5e8d7af 28 ExtractorError,
b60419c5 29 format_field,
2d30521a 30 float_or_none,
dd27fd17 31 int_or_none,
94278f72 32 mimetype2ext,
6310acf5 33 parse_codecs,
7c80519c 34 parse_duration,
8a784c74 35 # qualities,
3995d37d 36 remove_start,
cf7e015f 37 smuggle_url,
dbdaaa23 38 str_or_none,
c93d53f5 39 str_to_int,
556dbe7f 40 try_get,
c5e8d7af
PH
41 unescapeHTML,
42 unified_strdate,
cf7e015f 43 unsmuggle_url,
8bdd16b4 44 update_url_query,
21c340b8 45 url_or_none,
6e6bc8da 46 urlencode_postdata,
8bdd16b4 47 urljoin,
c5e8d7af
PH
48)
49
5f6a1245 50
class YoutubeBaseInfoExtractor(InfoExtractor):
    """Provide base functions for Youtube extractors"""

    # Google account endpoints used by _login().
    _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
    _TWOFACTOR_URL = 'https://accounts.google.com/signin/challenge'

    _LOOKUP_URL = 'https://accounts.google.com/_/signin/sl/lookup'
    _CHALLENGE_URL = 'https://accounts.google.com/_/signin/sl/challenge'
    # '{0}' is filled with the "TL" token taken from the challenge response.
    _TFA_URL = 'https://accounts.google.com/_/signin/challenge?hl=en&TL={0}'

    # Regex alternation of path names (not used inside this class itself).
    _RESERVED_NAMES = (
        r'embed|e|watch_popup|channel|c|user|playlist|watch|w|v|movies|results|shared|'
        r'storefront|oops|index|account|reporthistory|t/terms|about|upload|signin|logout|'
        r'feed/(?:watch_later|history|subscriptions|library|trending|recommended)')

    _NETRC_MACHINE = 'youtube'
    # If True it will raise an error if no login info is provided
    _LOGIN_REQUIRED = False

    # Regex fragment matching a playlist ID.
    _PLAYLIST_ID_RE = r'(?:(?:PL|LL|EC|UU|FL|RD|UL|TL|PU|OLAK5uy_)[0-9A-Za-z-_]{10,}|RDMM|WL|LL|LM)'

    def _ids_to_results(self, ids):
        """Return a list with one 'Youtube' url_result entry per video ID in *ids*."""
        return [
            self.url_result(vid_id, 'Youtube', video_id=vid_id)
            for vid_id in ids]

    def _login(self):
        """
        Attempt to log in to YouTube.
        True is returned if successful or skipped.
        False is returned if login failed.

        If _LOGIN_REQUIRED is set and no authentication was provided, an error is raised.
        """
        username, password = self._get_login_info()
        # No authentication to be performed
        if username is None:
            if self._LOGIN_REQUIRED and self._downloader.params.get('cookiefile') is None:
                raise ExtractorError('No login info available, needed for using %s.' % self.IE_NAME, expected=True)
            # if self._downloader.params.get('cookiefile'):  # TODO remove 'and False' later - too many people using outdated cookies and open issues, remind them.
            #     self.to_screen('[Cookies] Reminder - Make sure to always use up to date cookies!')
            return True

        login_page = self._download_webpage(
            self._LOGIN_URL, None,
            note='Downloading login page',
            errnote='unable to fetch login page', fatal=False)
        if login_page is False:
            # Explicit False (not None) so every failure path matches the docstring.
            return False

        login_form = self._hidden_inputs(login_page)

        def req(url, f_req, note, errnote):
            """POST the login form plus the JSON-encoded *f_req* to *url*.

            Returns whatever _download_json returns; with fatal=False the
            callers below treat a result of False as a failed request.
            """
            data = login_form.copy()
            data.update({
                'pstMsg': 1,
                'checkConnection': 'youtube',
                'checkedDomains': 'youtube',
                'hl': 'en',
                'deviceinfo': '[null,null,null,[],null,"US",null,null,[],"GlifWebSignIn",null,[null,null,[]]]',
                'f.req': json.dumps(f_req),
                'flowName': 'GlifWebSignIn',
                'flowEntry': 'ServiceLogin',
                # TODO: reverse actual botguard identifier generation algo
                'bgRequest': '["identifier",""]',
            })
            return self._download_json(
                url, None, note=note, errnote=errnote,
                # Drop everything that precedes the first '[' of the response
                # body before it is parsed as JSON.
                transform_source=lambda s: re.sub(r'^[^[]*', '', s),
                fatal=False,
                data=urlencode_postdata(data), headers={
                    'Content-Type': 'application/x-www-form-urlencoded;charset=utf-8',
                    'Google-Accounts-XSRF': 1,
                })

        def warn(message):
            self._downloader.report_warning(message)

        # The same URL is embedded in both the lookup and the challenge request.
        service_login_url = 'https://accounts.google.com/ServiceLogin?passive=true&continue=https%3A%2F%2Fwww.youtube.com%2Fsignin%3Fnext%3D%252F%26action_handle_signin%3Dtrue%26hl%3Den%26app%3Ddesktop%26feature%3Dsign_in_button&hl=en&service=youtube&uilel=3&requestPath=%2FServiceLogin&Page=PasswordSeparationSignIn'

        # Step 1: look the account up to obtain the user hash.
        lookup_req = [
            username,
            None, [], None, 'US', None, None, 2, False, True,
            [
                None, None,
                [2, 1, None, 1,
                 service_login_url,
                 None, [], 4],
                1, [None, None, []], None, None, None, True
            ],
            username,
        ]

        lookup_results = req(
            self._LOOKUP_URL, lookup_req,
            'Looking up account info', 'Unable to look up account info')

        if lookup_results is False:
            return False

        user_hash = try_get(lookup_results, lambda x: x[0][2], compat_str)
        if not user_hash:
            warn('Unable to extract user hash')
            return False

        # Step 2: submit the password for that user hash.
        challenge_req = [
            user_hash,
            None, 1, None, [1, None, None, None, [password, None, True]],
            [
                None, None, [2, 1, None, 1, service_login_url, None, [], 4],
                1, [None, None, []], None, None, None, True
            ]]

        challenge_results = req(
            self._CHALLENGE_URL, challenge_req,
            'Logging in', 'Unable to log in')

        if challenge_results is False:
            # Explicit False (not None) so every failure path matches the docstring.
            return False

        login_res = try_get(challenge_results, lambda x: x[0][5], list)
        if login_res:
            login_msg = try_get(login_res, lambda x: x[5], compat_str)
            # The conditional must be parenthesised: '%' binds tighter than
            # 'if ... else', so without the parentheses any message other
            # than INCORRECT_ANSWER_ENTERED was reported without its prefix.
            warn('Unable to login: %s' % (
                'Invalid password'
                if login_msg == 'INCORRECT_ANSWER_ENTERED' else login_msg))
            return False

        res = try_get(challenge_results, lambda x: x[0][-1], list)
        if not res:
            warn('Unable to extract result entry')
            return False

        login_challenge = try_get(res, lambda x: x[0][0], list)
        if not login_challenge:
            # No further challenge: the CheckCookie URL is in the result entry.
            check_cookie_url = try_get(res, lambda x: x[2], compat_str)
        else:
            challenge_str = try_get(login_challenge, lambda x: x[2], compat_str)
            if challenge_str != 'TWO_STEP_VERIFICATION':
                # Any other challenge type cannot be solved from here.
                CHALLENGES = {
                    'LOGIN_CHALLENGE': "This device isn't recognized. For your security, Google wants to make sure it's really you.",
                    'USERNAME_RECOVERY': 'Please provide additional information to aid in the recovery process.',
                    'REAUTH': "There is something unusual about your activity. For your security, Google wants to make sure it's really you.",
                }
                challenge = CHALLENGES.get(
                    challenge_str,
                    '%s returned error %s.' % (self.IE_NAME, challenge_str))
                warn('%s\nGo to https://accounts.google.com/, login and solve a challenge.' % challenge)
                return False

            # Step 3 (two-step verification only): submit the TFA code.
            # SEND_SUCCESS - TFA code has been successfully sent to phone
            # QUOTA_EXCEEDED - reached the limit of TFA codes
            status = try_get(login_challenge, lambda x: x[5], compat_str)
            if status == 'QUOTA_EXCEEDED':
                warn('Exceeded the limit of TFA codes, try later')
                return False

            tl = try_get(challenge_results, lambda x: x[1][2], compat_str)
            if not tl:
                warn('Unable to extract TL')
                return False

            tfa_code = self._get_tfa_info('2-step verification code')

            if not tfa_code:
                warn(
                    'Two-factor authentication required. Provide it either interactively or with --twofactor <code>'
                    '(Note that only TOTP (Google Authenticator App) codes work at this time.)')
                return False

            # A leading 'G-' is removed from the code before it is submitted.
            tfa_code = remove_start(tfa_code, 'G-')

            tfa_req = [
                user_hash, None, 2, None,
                [
                    9, None, None, None, None, None, None, None,
                    [None, tfa_code, True, 2]
                ]]

            tfa_results = req(
                self._TFA_URL.format(tl), tfa_req,
                'Submitting TFA code', 'Unable to submit TFA code')

            if tfa_results is False:
                return False

            tfa_res = try_get(tfa_results, lambda x: x[0][5], list)
            if tfa_res:
                tfa_msg = try_get(tfa_res, lambda x: x[5], compat_str)
                # Parenthesised for the same precedence reason as above.
                warn('Unable to finish TFA: %s' % (
                    'Invalid TFA code'
                    if tfa_msg == 'INCORRECT_ANSWER_ENTERED' else tfa_msg))
                return False

            check_cookie_url = try_get(
                tfa_results, lambda x: x[0][-1][2], compat_str)

        if not check_cookie_url:
            warn('Unable to extract CheckCookie URL')
            return False

        # Final step: fetch the CheckCookie URL and verify the result.
        check_cookie_results = self._download_webpage(
            check_cookie_url, None, 'Checking cookie', fatal=False)

        if check_cookie_results is False:
            return False

        # The login counts as successful only if this URL appears in the page.
        if 'https://myaccount.google.com/' not in check_cookie_results:
            warn('Unable to log in')
            return False

        return True

    def _download_webpage_handle(self, *args, **kwargs):
        """Forward to the parent implementation with a private copy of 'query'.

        A missing or None 'query' keyword becomes an empty dict, so the
        caller's mapping is never the object that gets passed on.
        """
        kwargs['query'] = (kwargs.get('query') or {}).copy()
        return super(YoutubeBaseInfoExtractor, self)._download_webpage_handle(
            *args, **compat_kwargs(kwargs))

    def _real_initialize(self):
        """Attempt to log in, provided a downloader is attached."""
        if self._downloader is None:
            return
        # The outcome of _login() is not acted upon here.
        self._login()

    # Default request body for _call_api().
    _DEFAULT_API_DATA = {
        'context': {
            'client': {
                'clientName': 'WEB',
                'clientVersion': '2.20201021.03.00',
            }
        },
    }

    # Patterns used to locate JSON blobs embedded in a webpage.
    _YT_INITIAL_DATA_RE = r'(?:window\s*\[\s*["\']ytInitialData["\']\s*\]|ytInitialData)\s*=\s*({.+?})\s*;'
    _YT_INITIAL_PLAYER_RESPONSE_RE = r'ytInitialPlayerResponse\s*=\s*({.+?})\s*;'
    _YT_INITIAL_BOUNDARY_RE = r'(?:var\s+meta|</script|\n)'

    def _call_api(self, ep, query, video_id, fatal=True):
        """POST *query* to the youtubei/v1 endpoint *ep* and return the JSON reply.

        The request body is a shallow copy of _DEFAULT_API_DATA updated with
        *query*, so a top-level key in *query* replaces the default wholesale.
        *video_id* and *fatal* are passed through to _download_json.
        """
        data = self._DEFAULT_API_DATA.copy()
        data.update(query)

        return self._download_json(
            'https://www.youtube.com/youtubei/v1/%s' % ep, video_id=video_id,
            note='Downloading API JSON', errnote='Unable to download API page',
            data=json.dumps(data).encode('utf8'), fatal=fatal,
            headers={'content-type': 'application/json'},
            query={'key': 'AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8'})

    def _extract_yt_initial_data(self, video_id, webpage):
        """Find the ytInitialData object in *webpage* and return it parsed.

        Two patterns are handed to _search_regex: first the data pattern
        followed by a boundary marker, then the data pattern on its own.
        """
        return self._parse_json(
            self._search_regex(
                (r'%s\s*%s' % (self._YT_INITIAL_DATA_RE, self._YT_INITIAL_BOUNDARY_RE),
                 self._YT_INITIAL_DATA_RE), webpage, 'yt initial data'),
            video_id)

    def _extract_ytcfg(self, video_id, webpage):
        """Parse the object passed to ytcfg.set(...) in *webpage*.

        '{}' is the regex default, so a page without a match is parsed as an
        empty object; the JSON parse itself is non-fatal.
        """
        return self._parse_json(
            self._search_regex(
                r'ytcfg\.set\s*\(\s*({.+?})\s*\)\s*;', webpage, 'ytcfg',
                default='{}'), video_id, fatal=False)

    def _extract_video(self, renderer):
        """Build a 'url_transparent' result dict for YoutubeIE from *renderer*.

        Fields are read with try_get/.get, so absent keys do not raise.
        """
        video_id = renderer.get('videoId')
        # The title is taken from 'runs' first, then from 'simpleText'.
        title = try_get(
            renderer,
            (lambda x: x['title']['runs'][0]['text'],
             lambda x: x['title']['simpleText']), compat_str)
        description = try_get(
            renderer, lambda x: x['descriptionSnippet']['runs'][0]['text'],
            compat_str)
        duration = parse_duration(try_get(
            renderer, lambda x: x['lengthText']['simpleText'], compat_str))
        view_count_text = try_get(
            renderer, lambda x: x['viewCountText']['simpleText'], compat_str) or ''
        # Whitespace is removed first; the leading run of digits and commas
        # is then converted to a number.
        view_count = str_to_int(self._search_regex(
            r'^([\d,]+)', re.sub(r'\s', '', view_count_text),
            'view count', default=None))
        uploader = try_get(
            renderer, lambda x: x['ownerText']['runs'][0]['text'], compat_str)
        return {
            '_type': 'url_transparent',
            'ie_key': YoutubeIE.ie_key(),
            'id': video_id,
            'url': video_id,
            'title': title,
            'description': description,
            'duration': duration,
            'view_count': view_count,
            'uploader': uploader,
        }
339
0c148415 340
360e1ca5 341class YoutubeIE(YoutubeBaseInfoExtractor):
78caa52a 342 IE_DESC = 'YouTube.com'
cb7dfeea 343 _VALID_URL = r"""(?x)^
c5e8d7af 344 (
edb53e2d 345 (?:https?://|//) # http(s):// or protocol-independent URL
66b48727 346 (?:(?:(?:(?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie|kids)?\.com/|
484aaeb2 347 (?:www\.)?deturl\.com/www\.youtube\.com/|
e70dc1d1 348 (?:www\.)?pwnyoutube\.com/|
8b561bfc 349 (?:www\.)?hooktube\.com/|
f7000f3a 350 (?:www\.)?yourepeat\.com/|
e69ae5b9 351 tube\.majestyc\.net/|
ba036333 352 # Invidious instances taken from https://github.com/omarroth/invidious/wiki/Invidious-Instances
77d95677 353 (?:(?:www|dev)\.)?invidio\.us/|
ba036333 354 (?:(?:www|no)\.)?invidiou\.sh/|
29f7c58a 355 (?:(?:www|fi)\.)?invidious\.snopyta\.org/|
8ae113ca 356 (?:www\.)?invidious\.kabi\.tk/|
ba036333 357 (?:www\.)?invidious\.13ad\.de/|
791d2e81 358 (?:www\.)?invidious\.mastodon\.host/|
29f7c58a 359 (?:www\.)?invidious\.zapashcanon\.fr/|
360 (?:www\.)?invidious\.kavin\.rocks/|
361 (?:www\.)?invidious\.tube/|
362 (?:www\.)?invidiou\.site/|
363 (?:www\.)?invidious\.site/|
364 (?:www\.)?invidious\.xyz/|
494d664e 365 (?:www\.)?invidious\.nixnet\.xyz/|
666d808e 366 (?:www\.)?invidious\.drycat\.fr/|
ba036333 367 (?:www\.)?tube\.poal\.co/|
29f7c58a 368 (?:www\.)?tube\.connect\.cafe/|
8ae113ca 369 (?:www\.)?vid\.wxzm\.sx/|
29f7c58a 370 (?:www\.)?vid\.mint\.lgbt/|
384bf91f 371 (?:www\.)?yewtu\.be/|
494d664e 372 (?:www\.)?yt\.elukerio\.org/|
894b3826 373 (?:www\.)?yt\.lelux\.fi/|
1db5ab6b 374 (?:www\.)?invidious\.ggc-project\.de/|
375 (?:www\.)?yt\.maisputain\.ovh/|
376 (?:www\.)?invidious\.13ad\.de/|
377 (?:www\.)?invidious\.toot\.koeln/|
378 (?:www\.)?invidious\.fdn\.fr/|
379 (?:www\.)?watch\.nettohikari\.com/|
bff90fc5 380 (?:www\.)?kgg2m7yk5aybusll\.onion/|
381 (?:www\.)?qklhadlycap4cnod\.onion/|
382 (?:www\.)?axqzx4s6s54s32yentfqojs3x5i7faxza6xo3ehd4bzzsg2ii4fv2iid\.onion/|
383 (?:www\.)?c7hqkpkpemu6e7emz5b4vyz7idjgdvgaaa3dyimmeojqbgpea3xqjoid\.onion/|
384 (?:www\.)?fz253lmuao3strwbfbmx46yu7acac2jz27iwtorgmbqlkurlclmancad\.onion/|
385 (?:www\.)?invidious\.l4qlywnpwqsluw65ts7md3khrivpirse744un3x7mlskqauz5pyuzgqd\.onion/|
33c1c7d8 386 (?:www\.)?owxfohz4kjyv25fvlqilyxast7inivgiktls3th44jhk3ej3i7ya\.b32\.i2p/|
1db5ab6b 387 (?:www\.)?4l2dgddgsrkf2ous66i6seeyi6etzfgrue332grh2n7madpwopotugyd\.onion/|
e69ae5b9 388 youtube\.googleapis\.com/) # the various hostnames, with wildcard subdomains
c5e8d7af
PH
389 (?:.*?\#/)? # handle anchor (#/) redirect urls
390 (?: # the various things that can precede the ID:
ac7553d0 391 (?:(?:v|embed|e)/(?!videoseries)) # v/ or embed/ or e/
c5e8d7af 392 |(?: # or the v= param in all its forms
f7000f3a 393 (?:(?:watch|movie)(?:_popup)?(?:\.php)?/?)? # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
c5e8d7af 394 (?:\?|\#!?) # the params delimiter ? or # or #!
040ac686 395 (?:.*?[&;])?? # any other preceding param (like /?s=tuff&v=xxxx or ?s=tuff&amp;v=V36LpHqtcDY)
c5e8d7af
PH
396 v=
397 )
f4b05232 398 ))
cbaed4bb
S
399 |(?:
400 youtu\.be| # just youtu.be/xxxx
6d4fc66b
S
401 vid\.plus| # or vid.plus/xxxx
402 zwearz\.com/watch| # or zwearz.com/watch/xxxx
cbaed4bb 403 )/
edb53e2d 404 |(?:www\.)?cleanvideosearch\.com/media/action/yt/watch\?videoId=
f4b05232 405 )
c5e8d7af 406 )? # all until now is optional -> you can pass the naked ID
8bdd16b4 407 (?P<id>[0-9A-Za-z_-]{11}) # here is it! the YouTube video ID
d0ba5587
S
408 (?!.*?\blist=
409 (?:
410 %(playlist_id)s| # combined list/video URLs are handled by the playlist IE
411 WL # WL are handled by the watch later IE
412 )
413 )
c5e8d7af 414 (?(1).+)? # if we found the ID, everything can follow
d0ba5587 415 $""" % {'playlist_id': YoutubeBaseInfoExtractor._PLAYLIST_ID_RE}
e40c758c 416 _PLAYER_INFO_RE = (
545cc85d 417 r'/(?P<id>[a-zA-Z0-9_-]{8,})/player_ias\.vflset(?:/[a-zA-Z]{2,3}_[a-zA-Z]{2,3})?/base\.js$',
418 r'\b(?P<id>vfl[a-zA-Z0-9_-]+)\b.*?\.js$',
e40c758c 419 )
2c62dc26 420 _formats = {
c2d3cb4c 421 '5': {'ext': 'flv', 'width': 400, 'height': 240, 'acodec': 'mp3', 'abr': 64, 'vcodec': 'h263'},
422 '6': {'ext': 'flv', 'width': 450, 'height': 270, 'acodec': 'mp3', 'abr': 64, 'vcodec': 'h263'},
423 '13': {'ext': '3gp', 'acodec': 'aac', 'vcodec': 'mp4v'},
424 '17': {'ext': '3gp', 'width': 176, 'height': 144, 'acodec': 'aac', 'abr': 24, 'vcodec': 'mp4v'},
425 '18': {'ext': 'mp4', 'width': 640, 'height': 360, 'acodec': 'aac', 'abr': 96, 'vcodec': 'h264'},
426 '22': {'ext': 'mp4', 'width': 1280, 'height': 720, 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264'},
427 '34': {'ext': 'flv', 'width': 640, 'height': 360, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'},
428 '35': {'ext': 'flv', 'width': 854, 'height': 480, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'},
3834d3e3 429 # itag 36 videos are either 320x180 (BaW_jenozKc) or 320x240 (__2ABJjxzNo), abr varies as well
c2d3cb4c 430 '36': {'ext': '3gp', 'width': 320, 'acodec': 'aac', 'vcodec': 'mp4v'},
431 '37': {'ext': 'mp4', 'width': 1920, 'height': 1080, 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264'},
432 '38': {'ext': 'mp4', 'width': 4096, 'height': 3072, 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264'},
433 '43': {'ext': 'webm', 'width': 640, 'height': 360, 'acodec': 'vorbis', 'abr': 128, 'vcodec': 'vp8'},
434 '44': {'ext': 'webm', 'width': 854, 'height': 480, 'acodec': 'vorbis', 'abr': 128, 'vcodec': 'vp8'},
435 '45': {'ext': 'webm', 'width': 1280, 'height': 720, 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8'},
e1a0bfdf 436 '46': {'ext': 'webm', 'width': 1920, 'height': 1080, 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8'},
c2d3cb4c 437 '59': {'ext': 'mp4', 'width': 854, 'height': 480, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'},
438 '78': {'ext': 'mp4', 'width': 854, 'height': 480, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'},
e1a0bfdf 439
440
441 # 3D videos
c2d3cb4c 442 '82': {'ext': 'mp4', 'height': 360, 'format_note': '3D', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -20},
443 '83': {'ext': 'mp4', 'height': 480, 'format_note': '3D', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -20},
444 '84': {'ext': 'mp4', 'height': 720, 'format_note': '3D', 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264', 'preference': -20},
445 '85': {'ext': 'mp4', 'height': 1080, 'format_note': '3D', 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264', 'preference': -20},
e1a0bfdf 446 '100': {'ext': 'webm', 'height': 360, 'format_note': '3D', 'acodec': 'vorbis', 'abr': 128, 'vcodec': 'vp8', 'preference': -20},
447 '101': {'ext': 'webm', 'height': 480, 'format_note': '3D', 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8', 'preference': -20},
448 '102': {'ext': 'webm', 'height': 720, 'format_note': '3D', 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8', 'preference': -20},
836a086c 449
96fb5605 450 # Apple HTTP Live Streaming
11f12195 451 '91': {'ext': 'mp4', 'height': 144, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 48, 'vcodec': 'h264', 'preference': -10},
c2d3cb4c 452 '92': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 48, 'vcodec': 'h264', 'preference': -10},
453 '93': {'ext': 'mp4', 'height': 360, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -10},
454 '94': {'ext': 'mp4', 'height': 480, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -10},
455 '95': {'ext': 'mp4', 'height': 720, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 256, 'vcodec': 'h264', 'preference': -10},
456 '96': {'ext': 'mp4', 'height': 1080, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 256, 'vcodec': 'h264', 'preference': -10},
e1a0bfdf 457 '132': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 48, 'vcodec': 'h264', 'preference': -10},
458 '151': {'ext': 'mp4', 'height': 72, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 24, 'vcodec': 'h264', 'preference': -10},
2c62dc26
PH
459
460 # DASH mp4 video
d23028a8
S
461 '133': {'ext': 'mp4', 'height': 240, 'format_note': 'DASH video', 'vcodec': 'h264'},
462 '134': {'ext': 'mp4', 'height': 360, 'format_note': 'DASH video', 'vcodec': 'h264'},
463 '135': {'ext': 'mp4', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'h264'},
464 '136': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'h264'},
465 '137': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'h264'},
067aa17e 466 '138': {'ext': 'mp4', 'format_note': 'DASH video', 'vcodec': 'h264'}, # Height can vary (https://github.com/ytdl-org/youtube-dl/issues/4559)
d23028a8
S
467 '160': {'ext': 'mp4', 'height': 144, 'format_note': 'DASH video', 'vcodec': 'h264'},
468 '212': {'ext': 'mp4', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'h264'},
469 '264': {'ext': 'mp4', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'h264'},
470 '298': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'h264', 'fps': 60},
471 '299': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'h264', 'fps': 60},
472 '266': {'ext': 'mp4', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'h264'},
836a086c 473
f6f1fc92 474 # Dash mp4 audio
d23028a8
S
475 '139': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 48, 'container': 'm4a_dash'},
476 '140': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 128, 'container': 'm4a_dash'},
477 '141': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 256, 'container': 'm4a_dash'},
478 '256': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'container': 'm4a_dash'},
479 '258': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'container': 'm4a_dash'},
480 '325': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'dtse', 'container': 'm4a_dash'},
481 '328': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'ec-3', 'container': 'm4a_dash'},
836a086c
AZ
482
483 # Dash webm
d23028a8
S
484 '167': {'ext': 'webm', 'height': 360, 'width': 640, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
485 '168': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
486 '169': {'ext': 'webm', 'height': 720, 'width': 1280, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
487 '170': {'ext': 'webm', 'height': 1080, 'width': 1920, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
488 '218': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
489 '219': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
490 '278': {'ext': 'webm', 'height': 144, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp9'},
491 '242': {'ext': 'webm', 'height': 240, 'format_note': 'DASH video', 'vcodec': 'vp9'},
492 '243': {'ext': 'webm', 'height': 360, 'format_note': 'DASH video', 'vcodec': 'vp9'},
493 '244': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'vp9'},
494 '245': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'vp9'},
495 '246': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'vp9'},
496 '247': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'vp9'},
497 '248': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'vp9'},
498 '271': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'vp9'},
4c6b4764 499 # itag 272 videos are either 3840x2160 (e.g. RtoitU2A-3E) or 7680x4320 (sLprVF6d7Ug)
d23028a8
S
500 '272': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'vp9'},
501 '302': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60},
502 '303': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60},
503 '308': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60},
504 '313': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'vp9'},
505 '315': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60},
2c62dc26
PH
506
507 # Dash webm audio
d23028a8
S
508 '171': {'ext': 'webm', 'acodec': 'vorbis', 'format_note': 'DASH audio', 'abr': 128},
509 '172': {'ext': 'webm', 'acodec': 'vorbis', 'format_note': 'DASH audio', 'abr': 256},
ce6b9a2d 510
0857baad 511 # Dash webm audio with opus inside
d23028a8
S
512 '249': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 50},
513 '250': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 70},
514 '251': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 160},
0857baad 515
ce6b9a2d
PH
516 # RTMP (unnamed)
517 '_rtmp': {'protocol': 'rtmp'},
b85eae0f
S
518
519 # av01 video only formats sometimes served with "unknown" codecs
520 '394': {'acodec': 'none', 'vcodec': 'av01.0.05M.08'},
521 '395': {'acodec': 'none', 'vcodec': 'av01.0.05M.08'},
522 '396': {'acodec': 'none', 'vcodec': 'av01.0.05M.08'},
523 '397': {'acodec': 'none', 'vcodec': 'av01.0.05M.08'},
c5e8d7af 524 }
29f7c58a 525 _SUBTITLE_FORMATS = ('json3', 'srv1', 'srv2', 'srv3', 'ttml', 'vtt')
836a086c 526
fd5c4aab
S
527 _GEO_BYPASS = False
528
78caa52a 529 IE_NAME = 'youtube'
2eb88d95
PH
530 _TESTS = [
531 {
2d3d2997 532 'url': 'https://www.youtube.com/watch?v=BaW_jenozKc&t=1s&end=9',
4bc3a23e
PH
533 'info_dict': {
534 'id': 'BaW_jenozKc',
535 'ext': 'mp4',
3867038a 536 'title': 'youtube-dl test video "\'/\\ä↭𝕐',
4bc3a23e
PH
537 'uploader': 'Philipp Hagemeister',
538 'uploader_id': 'phihag',
ec85ded8 539 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/phihag',
dd4c4492
S
540 'channel_id': 'UCLqxVugv74EIW3VWh2NOa3Q',
541 'channel_url': r're:https?://(?:www\.)?youtube\.com/channel/UCLqxVugv74EIW3VWh2NOa3Q',
4bc3a23e 542 'upload_date': '20121002',
3867038a 543 'description': 'test chars: "\'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de .',
4bc3a23e 544 'categories': ['Science & Technology'],
3867038a 545 'tags': ['youtube-dl'],
556dbe7f 546 'duration': 10,
dbdaaa23 547 'view_count': int,
3e7c1224
PH
548 'like_count': int,
549 'dislike_count': int,
7c80519c 550 'start_time': 1,
297a564b 551 'end_time': 9,
2eb88d95 552 }
0e853ca4 553 },
fccd3771 554 {
4bc3a23e
PH
555 'url': '//www.YouTube.com/watch?v=yZIXLfi8CZQ',
556 'note': 'Embed-only video (#1746)',
557 'info_dict': {
558 'id': 'yZIXLfi8CZQ',
559 'ext': 'mp4',
560 'upload_date': '20120608',
561 'title': 'Principal Sexually Assaults A Teacher - Episode 117 - 8th June 2012',
562 'description': 'md5:09b78bd971f1e3e289601dfba15ca4f7',
563 'uploader': 'SET India',
94bfcd23 564 'uploader_id': 'setindia',
ec85ded8 565 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/setindia',
94bfcd23 566 'age_limit': 18,
545cc85d 567 },
568 'skip': 'Private video',
fccd3771 569 },
11b56058 570 {
8bdd16b4 571 'url': 'https://www.youtube.com/watch?v=BaW_jenozKc&v=yZIXLfi8CZQ',
11b56058
PM
572 'note': 'Use the first video ID in the URL',
573 'info_dict': {
574 'id': 'BaW_jenozKc',
575 'ext': 'mp4',
3867038a 576 'title': 'youtube-dl test video "\'/\\ä↭𝕐',
11b56058
PM
577 'uploader': 'Philipp Hagemeister',
578 'uploader_id': 'phihag',
ec85ded8 579 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/phihag',
11b56058 580 'upload_date': '20121002',
3867038a 581 'description': 'test chars: "\'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de .',
11b56058 582 'categories': ['Science & Technology'],
3867038a 583 'tags': ['youtube-dl'],
556dbe7f 584 'duration': 10,
dbdaaa23 585 'view_count': int,
11b56058
PM
586 'like_count': int,
587 'dislike_count': int,
34a7de29
S
588 },
589 'params': {
590 'skip_download': True,
591 },
11b56058 592 },
dd27fd17 593 {
2d3d2997 594 'url': 'https://www.youtube.com/watch?v=a9LDPn-MO4I',
4bc3a23e
PH
595 'note': '256k DASH audio (format 141) via DASH manifest',
596 'info_dict': {
597 'id': 'a9LDPn-MO4I',
598 'ext': 'm4a',
599 'upload_date': '20121002',
600 'uploader_id': '8KVIDEO',
ec85ded8 601 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/8KVIDEO',
4bc3a23e
PH
602 'description': '',
603 'uploader': '8KVIDEO',
604 'title': 'UHDTV TEST 8K VIDEO.mp4'
4919603f 605 },
4bc3a23e
PH
606 'params': {
607 'youtube_include_dash_manifest': True,
608 'format': '141',
4919603f 609 },
de3c7fe0 610 'skip': 'format 141 not served anymore',
dd27fd17 611 },
8bdd16b4 612 # DASH manifest with encrypted signature
613 {
614 'url': 'https://www.youtube.com/watch?v=IB3lcPjvWLA',
615 'info_dict': {
616 'id': 'IB3lcPjvWLA',
617 'ext': 'm4a',
618 'title': 'Afrojack, Spree Wilson - The Spark (Official Music Video) ft. Spree Wilson',
619 'description': 'md5:8f5e2b82460520b619ccac1f509d43bf',
620 'duration': 244,
621 'uploader': 'AfrojackVEVO',
622 'uploader_id': 'AfrojackVEVO',
623 'upload_date': '20131011',
624 },
625 'params': {
626 'youtube_include_dash_manifest': True,
627 'format': '141/bestaudio[ext=m4a]',
628 },
629 },
aa79ac0c
PH
630 # Controversy video
631 {
632 'url': 'https://www.youtube.com/watch?v=T4XJQO3qol8',
633 'info_dict': {
634 'id': 'T4XJQO3qol8',
635 'ext': 'mp4',
556dbe7f 636 'duration': 219,
aa79ac0c 637 'upload_date': '20100909',
4fe54c12 638 'uploader': 'Amazing Atheist',
aa79ac0c 639 'uploader_id': 'TheAmazingAtheist',
ec85ded8 640 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/TheAmazingAtheist',
aa79ac0c 641 'title': 'Burning Everyone\'s Koran',
545cc85d 642 'description': 'SUBSCRIBE: http://www.youtube.com/saturninefilms \r\n\r\nEven Obama has taken a stand against freedom on this issue: http://www.huffingtonpost.com/2010/09/09/obama-gma-interview-quran_n_710282.html',
aa79ac0c 643 }
c522adb1 644 },
dd2d55f1 645 # Normal age-gate video (embed allowed)
c522adb1 646 {
2d3d2997 647 'url': 'https://youtube.com/watch?v=HtVdAasjOgU',
c522adb1
JMF
648 'info_dict': {
649 'id': 'HtVdAasjOgU',
650 'ext': 'mp4',
651 'title': 'The Witcher 3: Wild Hunt - The Sword Of Destiny Trailer',
ec85ded8 652 'description': r're:(?s).{100,}About the Game\n.*?The Witcher 3: Wild Hunt.{100,}',
556dbe7f 653 'duration': 142,
c522adb1
JMF
654 'uploader': 'The Witcher',
655 'uploader_id': 'WitcherGame',
ec85ded8 656 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/WitcherGame',
c522adb1 657 'upload_date': '20140605',
34952f09 658 'age_limit': 18,
c522adb1
JMF
659 },
660 },
8bdd16b4 661 # video_info is None (https://github.com/ytdl-org/youtube-dl/issues/4421)
662 # YouTube Red ad is not captured for creator
663 {
664 'url': '__2ABJjxzNo',
665 'info_dict': {
666 'id': '__2ABJjxzNo',
667 'ext': 'mp4',
668 'duration': 266,
669 'upload_date': '20100430',
670 'uploader_id': 'deadmau5',
671 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/deadmau5',
545cc85d 672 'creator': 'deadmau5',
673 'description': 'md5:6cbcd3a92ce1bc676fc4d6ab4ace2336',
8bdd16b4 674 'uploader': 'deadmau5',
675 'title': 'Deadmau5 - Some Chords (HD)',
545cc85d 676 'alt_title': 'Some Chords',
8bdd16b4 677 },
678 'expected_warnings': [
679 'DASH manifest missing',
680 ]
681 },
067aa17e 682 # Olympics (https://github.com/ytdl-org/youtube-dl/issues/4431)
e52a40ab
PH
683 {
684 'url': 'lqQg6PlCWgI',
685 'info_dict': {
686 'id': 'lqQg6PlCWgI',
687 'ext': 'mp4',
556dbe7f 688 'duration': 6085,
90227264 689 'upload_date': '20150827',
cbe2bd91 690 'uploader_id': 'olympic',
ec85ded8 691 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/olympic',
cbe2bd91 692 'description': 'HO09 - Women - GER-AUS - Hockey - 31 July 2012 - London 2012 Olympic Games',
be49068d 693 'uploader': 'Olympic',
cbe2bd91
PH
694 'title': 'Hockey - Women - GER-AUS - London 2012 Olympic Games',
695 },
696 'params': {
697 'skip_download': 'requires avconv',
e52a40ab 698 }
cbe2bd91 699 },
6271f1ca
PH
700 # Non-square pixels
701 {
702 'url': 'https://www.youtube.com/watch?v=_b-2C3KPAM0',
703 'info_dict': {
704 'id': '_b-2C3KPAM0',
705 'ext': 'mp4',
706 'stretched_ratio': 16 / 9.,
556dbe7f 707 'duration': 85,
6271f1ca
PH
708 'upload_date': '20110310',
709 'uploader_id': 'AllenMeow',
ec85ded8 710 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/AllenMeow',
6271f1ca 711 'description': 'made by Wacom from Korea | 字幕&加油添醋 by TY\'s Allen | 感謝heylisa00cavey1001同學熱情提供梗及翻譯',
eb6793ba 712 'uploader': '孫ᄋᄅ',
6271f1ca
PH
713 'title': '[A-made] 變態妍字幕版 太妍 我就是這樣的人',
714 },
06b491eb
S
715 },
716 # url_encoded_fmt_stream_map is empty string
717 {
718 'url': 'qEJwOuvDf7I',
719 'info_dict': {
720 'id': 'qEJwOuvDf7I',
f57b7835 721 'ext': 'webm',
06b491eb
S
722 'title': 'Обсуждение судебной практики по выборам 14 сентября 2014 года в Санкт-Петербурге',
723 'description': '',
724 'upload_date': '20150404',
725 'uploader_id': 'spbelect',
726 'uploader': 'Наблюдатели Петербурга',
727 },
728 'params': {
729 'skip_download': 'requires avconv',
e323cf3f
S
730 },
731 'skip': 'This live event has ended.',
06b491eb 732 },
067aa17e 733 # Extraction from multiple DASH manifests (https://github.com/ytdl-org/youtube-dl/pull/6097)
da77d856
S
734 {
735 'url': 'https://www.youtube.com/watch?v=FIl7x6_3R5Y',
736 'info_dict': {
737 'id': 'FIl7x6_3R5Y',
eb6793ba 738 'ext': 'webm',
da77d856
S
739 'title': 'md5:7b81415841e02ecd4313668cde88737a',
740 'description': 'md5:116377fd2963b81ec4ce64b542173306',
556dbe7f 741 'duration': 220,
da77d856
S
742 'upload_date': '20150625',
743 'uploader_id': 'dorappi2000',
ec85ded8 744 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/dorappi2000',
da77d856 745 'uploader': 'dorappi2000',
eb6793ba 746 'formats': 'mincount:31',
da77d856 747 },
eb6793ba 748 'skip': 'not actual anymore',
2ee8f5d8 749 },
8a1a26ce
YCH
750 # DASH manifest with segment_list
751 {
752 'url': 'https://www.youtube.com/embed/CsmdDsKjzN8',
753 'md5': '8ce563a1d667b599d21064e982ab9e31',
754 'info_dict': {
755 'id': 'CsmdDsKjzN8',
756 'ext': 'mp4',
17ee98e1 757 'upload_date': '20150501', # According to '<meta itemprop="datePublished"', but in other places it's 20150510
8a1a26ce
YCH
758 'uploader': 'Airtek',
759 'description': 'Retransmisión en directo de la XVIII media maratón de Zaragoza.',
760 'uploader_id': 'UCzTzUmjXxxacNnL8I3m4LnQ',
761 'title': 'Retransmisión XVIII Media maratón Zaragoza 2015',
762 },
763 'params': {
764 'youtube_include_dash_manifest': True,
765 'format': '135', # bestvideo
be49068d
S
766 },
767 'skip': 'This live event has ended.',
2ee8f5d8 768 },
cf7e015f
S
769 {
770 # Multifeed videos (multiple cameras), URL is for Main Camera
545cc85d 771 'url': 'https://www.youtube.com/watch?v=jvGDaLqkpTg',
cf7e015f 772 'info_dict': {
545cc85d 773 'id': 'jvGDaLqkpTg',
774 'title': 'Tom Clancy Free Weekend Rainbow Whatever',
775 'description': 'md5:e03b909557865076822aa169218d6a5d',
cf7e015f
S
776 },
777 'playlist': [{
778 'info_dict': {
545cc85d 779 'id': 'jvGDaLqkpTg',
cf7e015f 780 'ext': 'mp4',
545cc85d 781 'title': 'Tom Clancy Free Weekend Rainbow Whatever (Main Camera)',
782 'description': 'md5:e03b909557865076822aa169218d6a5d',
783 'duration': 10643,
784 'upload_date': '20161111',
785 'uploader': 'Team PGP',
786 'uploader_id': 'UChORY56LMMETTuGjXaJXvLg',
787 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UChORY56LMMETTuGjXaJXvLg',
cf7e015f
S
788 },
789 }, {
790 'info_dict': {
545cc85d 791 'id': '3AKt1R1aDnw',
cf7e015f 792 'ext': 'mp4',
545cc85d 793 'title': 'Tom Clancy Free Weekend Rainbow Whatever (Camera 2)',
794 'description': 'md5:e03b909557865076822aa169218d6a5d',
795 'duration': 10991,
796 'upload_date': '20161111',
797 'uploader': 'Team PGP',
798 'uploader_id': 'UChORY56LMMETTuGjXaJXvLg',
799 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UChORY56LMMETTuGjXaJXvLg',
cf7e015f
S
800 },
801 }, {
802 'info_dict': {
545cc85d 803 'id': 'RtAMM00gpVc',
cf7e015f 804 'ext': 'mp4',
545cc85d 805 'title': 'Tom Clancy Free Weekend Rainbow Whatever (Camera 3)',
806 'description': 'md5:e03b909557865076822aa169218d6a5d',
807 'duration': 10995,
808 'upload_date': '20161111',
809 'uploader': 'Team PGP',
810 'uploader_id': 'UChORY56LMMETTuGjXaJXvLg',
811 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UChORY56LMMETTuGjXaJXvLg',
cf7e015f
S
812 },
813 }, {
814 'info_dict': {
545cc85d 815 'id': '6N2fdlP3C5U',
cf7e015f 816 'ext': 'mp4',
545cc85d 817 'title': 'Tom Clancy Free Weekend Rainbow Whatever (Camera 4)',
818 'description': 'md5:e03b909557865076822aa169218d6a5d',
819 'duration': 10990,
820 'upload_date': '20161111',
821 'uploader': 'Team PGP',
822 'uploader_id': 'UChORY56LMMETTuGjXaJXvLg',
823 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UChORY56LMMETTuGjXaJXvLg',
cf7e015f
S
824 },
825 }],
826 'params': {
827 'skip_download': True,
828 },
cbaed4bb 829 },
f9f49d87 830 {
067aa17e 831 # Multifeed video with comma in title (see https://github.com/ytdl-org/youtube-dl/issues/8536)
f9f49d87
S
832 'url': 'https://www.youtube.com/watch?v=gVfLd0zydlo',
833 'info_dict': {
834 'id': 'gVfLd0zydlo',
835 'title': 'DevConf.cz 2016 Day 2 Workshops 1 14:00 - 15:30',
836 },
837 'playlist_count': 2,
be49068d 838 'skip': 'Not multifeed anymore',
f9f49d87 839 },
cbaed4bb 840 {
2d3d2997 841 'url': 'https://vid.plus/FlRa-iH7PGw',
cbaed4bb 842 'only_matching': True,
0e49d9a6 843 },
6d4fc66b 844 {
2d3d2997 845 'url': 'https://zwearz.com/watch/9lWxNJF-ufM/electra-woman-dyna-girl-official-trailer-grace-helbig.html',
6d4fc66b
S
846 'only_matching': True,
847 },
0e49d9a6 848 {
067aa17e 849 # Title with JS-like syntax "};" (see https://github.com/ytdl-org/youtube-dl/issues/7468)
a8776b10 850 # Also tests cut-off URL expansion in video description (see
067aa17e
S
851 # https://github.com/ytdl-org/youtube-dl/issues/1892,
852 # https://github.com/ytdl-org/youtube-dl/issues/8164)
0e49d9a6
LL
853 'url': 'https://www.youtube.com/watch?v=lsguqyKfVQg',
854 'info_dict': {
855 'id': 'lsguqyKfVQg',
856 'ext': 'mp4',
857 'title': '{dark walk}; Loki/AC/Dishonored; collab w/Elflover21',
eb6793ba 858 'alt_title': 'Dark Walk - Position Music',
0e49d9a6 859 'description': 'md5:8085699c11dc3f597ce0410b0dcbb34a',
556dbe7f 860 'duration': 133,
0e49d9a6
LL
861 'upload_date': '20151119',
862 'uploader_id': 'IronSoulElf',
ec85ded8 863 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/IronSoulElf',
0e49d9a6 864 'uploader': 'IronSoulElf',
eb6793ba
S
865 'creator': 'Todd Haberman, Daniel Law Heath and Aaron Kaplan',
866 'track': 'Dark Walk - Position Music',
867 'artist': 'Todd Haberman, Daniel Law Heath and Aaron Kaplan',
92bc97d3 868 'album': 'Position Music - Production Music Vol. 143 - Dark Walk',
0e49d9a6
LL
869 },
870 'params': {
871 'skip_download': True,
872 },
873 },
61f92af1 874 {
067aa17e 875 # Tags with '};' (see https://github.com/ytdl-org/youtube-dl/issues/7468)
61f92af1
S
876 'url': 'https://www.youtube.com/watch?v=Ms7iBXnlUO8',
877 'only_matching': True,
878 },
313dfc45
LL
879 {
880 # Video with yt:stretch=17:0
881 'url': 'https://www.youtube.com/watch?v=Q39EVAstoRM',
882 'info_dict': {
883 'id': 'Q39EVAstoRM',
884 'ext': 'mp4',
885 'title': 'Clash Of Clans#14 Dicas De Ataque Para CV 4',
886 'description': 'md5:ee18a25c350637c8faff806845bddee9',
887 'upload_date': '20151107',
888 'uploader_id': 'UCCr7TALkRbo3EtFzETQF1LA',
889 'uploader': 'CH GAMER DROID',
890 },
891 'params': {
892 'skip_download': True,
893 },
be49068d 894 'skip': 'This video does not exist.',
313dfc45 895 },
7caf9830
S
896 {
897 # Video licensed under Creative Commons
898 'url': 'https://www.youtube.com/watch?v=M4gD1WSo5mA',
899 'info_dict': {
900 'id': 'M4gD1WSo5mA',
901 'ext': 'mp4',
902 'title': 'md5:e41008789470fc2533a3252216f1c1d1',
903 'description': 'md5:a677553cf0840649b731a3024aeff4cc',
556dbe7f 904 'duration': 721,
7caf9830
S
905 'upload_date': '20150127',
906 'uploader_id': 'BerkmanCenter',
ec85ded8 907 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/BerkmanCenter',
556dbe7f 908 'uploader': 'The Berkman Klein Center for Internet & Society',
7caf9830
S
909 'license': 'Creative Commons Attribution license (reuse allowed)',
910 },
911 'params': {
912 'skip_download': True,
913 },
914 },
fd050249
S
915 {
916 # Channel-like uploader_url
917 'url': 'https://www.youtube.com/watch?v=eQcmzGIKrzg',
918 'info_dict': {
919 'id': 'eQcmzGIKrzg',
920 'ext': 'mp4',
921 'title': 'Democratic Socialism and Foreign Policy | Bernie Sanders',
545cc85d 922 'description': 'md5:13a2503d7b5904ef4b223aa101628f39',
556dbe7f 923 'duration': 4060,
fd050249 924 'upload_date': '20151119',
eb6793ba 925 'uploader': 'Bernie Sanders',
fd050249 926 'uploader_id': 'UCH1dpzjCEiGAt8CXkryhkZg',
ec85ded8 927 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UCH1dpzjCEiGAt8CXkryhkZg',
fd050249
S
928 'license': 'Creative Commons Attribution license (reuse allowed)',
929 },
930 'params': {
931 'skip_download': True,
932 },
933 },
040ac686
S
934 {
935 'url': 'https://www.youtube.com/watch?feature=player_embedded&amp;amp;v=V36LpHqtcDY',
936 'only_matching': True,
7f29cf54
S
937 },
938 {
067aa17e 939 # YouTube Red paid video (https://github.com/ytdl-org/youtube-dl/issues/10059)
7f29cf54
S
940 'url': 'https://www.youtube.com/watch?v=i1Ko8UG-Tdo',
941 'only_matching': True,
6496ccb4
S
942 },
943 {
944 # Rental video preview
945 'url': 'https://www.youtube.com/watch?v=yYr8q0y5Jfg',
946 'info_dict': {
947 'id': 'uGpuVWrhIzE',
948 'ext': 'mp4',
949 'title': 'Piku - Trailer',
950 'description': 'md5:c36bd60c3fd6f1954086c083c72092eb',
951 'upload_date': '20150811',
952 'uploader': 'FlixMatrix',
953 'uploader_id': 'FlixMatrixKaravan',
ec85ded8 954 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/FlixMatrixKaravan',
6496ccb4
S
955 'license': 'Standard YouTube License',
956 },
957 'params': {
958 'skip_download': True,
959 },
eb6793ba 960 'skip': 'This video is not available.',
022a5d66 961 },
12afdc2a
S
962 {
963 # YouTube Red video with episode data
964 'url': 'https://www.youtube.com/watch?v=iqKdEhx-dD4',
965 'info_dict': {
966 'id': 'iqKdEhx-dD4',
967 'ext': 'mp4',
968 'title': 'Isolation - Mind Field (Ep 1)',
545cc85d 969 'description': 'md5:f540112edec5d09fc8cc752d3d4ba3cd',
556dbe7f 970 'duration': 2085,
12afdc2a
S
971 'upload_date': '20170118',
972 'uploader': 'Vsauce',
973 'uploader_id': 'Vsauce',
974 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/Vsauce',
12afdc2a
S
975 'series': 'Mind Field',
976 'season_number': 1,
977 'episode_number': 1,
978 },
979 'params': {
980 'skip_download': True,
981 },
982 'expected_warnings': [
983 'Skipping DASH manifest',
984 ],
985 },
c7121fa7
S
986 {
987 # The following content has been identified by the YouTube community
988 # as inappropriate or offensive to some audiences.
989 'url': 'https://www.youtube.com/watch?v=6SJNVb0GnPI',
990 'info_dict': {
991 'id': '6SJNVb0GnPI',
992 'ext': 'mp4',
993 'title': 'Race Differences in Intelligence',
994 'description': 'md5:5d161533167390427a1f8ee89a1fc6f1',
995 'duration': 965,
996 'upload_date': '20140124',
997 'uploader': 'New Century Foundation',
998 'uploader_id': 'UCEJYpZGqgUob0zVVEaLhvVg',
999 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UCEJYpZGqgUob0zVVEaLhvVg',
c7121fa7
S
1000 },
1001 'params': {
1002 'skip_download': True,
1003 },
545cc85d 1004 'skip': 'This video has been removed for violating YouTube\'s policy on hate speech.',
c7121fa7 1005 },
022a5d66
S
1006 {
1007 # itag 212
1008 'url': '1t24XAntNCY',
1009 'only_matching': True,
fd5c4aab
S
1010 },
1011 {
1012 # geo restricted to JP
1013 'url': 'sJL6WA-aGkQ',
1014 'only_matching': True,
1015 },
cd5a74a2
S
1016 {
1017 'url': 'https://invidio.us/watch?v=BaW_jenozKc',
1018 'only_matching': True,
1019 },
825cd268
RA
1020 {
1021 # DRM protected
1022 'url': 'https://www.youtube.com/watch?v=s7_qI6_mIXc',
1023 'only_matching': True,
4fe54c12
S
1024 },
1025 {
1026 # Video with unsupported adaptive stream type formats
1027 'url': 'https://www.youtube.com/watch?v=Z4Vy8R84T1U',
1028 'info_dict': {
1029 'id': 'Z4Vy8R84T1U',
1030 'ext': 'mp4',
1031 'title': 'saman SMAN 53 Jakarta(Sancety) opening COFFEE4th at SMAN 53 Jakarta',
1032 'description': 'md5:d41d8cd98f00b204e9800998ecf8427e',
1033 'duration': 433,
1034 'upload_date': '20130923',
1035 'uploader': 'Amelia Putri Harwita',
1036 'uploader_id': 'UCpOxM49HJxmC1qCalXyB3_Q',
1037 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UCpOxM49HJxmC1qCalXyB3_Q',
1038 'formats': 'maxcount:10',
1039 },
1040 'params': {
1041 'skip_download': True,
1042 'youtube_include_dash_manifest': False,
1043 },
5429d6a9 1044 'skip': 'not actual anymore',
5caabd3c 1045 },
1046 {
822b9d9c 1047 # Youtube Music Auto-generated description
5caabd3c 1048 'url': 'https://music.youtube.com/watch?v=MgNrAu2pzNs',
1049 'info_dict': {
1050 'id': 'MgNrAu2pzNs',
1051 'ext': 'mp4',
1052 'title': 'Voyeur Girl',
1053 'description': 'md5:7ae382a65843d6df2685993e90a8628f',
1054 'upload_date': '20190312',
5429d6a9
S
1055 'uploader': 'Stephen - Topic',
1056 'uploader_id': 'UC-pWHpBjdGG69N9mM2auIAA',
5caabd3c 1057 'artist': 'Stephen',
1058 'track': 'Voyeur Girl',
1059 'album': 'it\'s too much love to know my dear',
1060 'release_date': '20190313',
1061 'release_year': 2019,
1062 },
1063 'params': {
1064 'skip_download': True,
1065 },
1066 },
66b48727
RA
1067 {
1068 'url': 'https://www.youtubekids.com/watch?v=3b8nCWDgZ6Q',
1069 'only_matching': True,
1070 },
011e75e6
S
1071 {
1072 # invalid -> valid video id redirection
1073 'url': 'DJztXj2GPfl',
1074 'info_dict': {
1075 'id': 'DJztXj2GPfk',
1076 'ext': 'mp4',
1077 'title': 'Panjabi MC - Mundian To Bach Ke (The Dictator Soundtrack)',
1078 'description': 'md5:bf577a41da97918e94fa9798d9228825',
1079 'upload_date': '20090125',
1080 'uploader': 'Prochorowka',
1081 'uploader_id': 'Prochorowka',
1082 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/Prochorowka',
1083 'artist': 'Panjabi MC',
1084 'track': 'Beware of the Boys (Mundian to Bach Ke) - Motivo Hi-Lectro Remix',
1085 'album': 'Beware of the Boys (Mundian To Bach Ke)',
1086 },
1087 'params': {
1088 'skip_download': True,
1089 },
545cc85d 1090 'skip': 'Video unavailable',
ea74e00b
DP
1091 },
1092 {
1093 # empty description results in an empty string
1094 'url': 'https://www.youtube.com/watch?v=x41yOUIvK2k',
1095 'info_dict': {
1096 'id': 'x41yOUIvK2k',
1097 'ext': 'mp4',
1098 'title': 'IMG 3456',
1099 'description': '',
1100 'upload_date': '20170613',
1101 'uploader_id': 'ElevageOrVert',
1102 'uploader': 'ElevageOrVert',
1103 },
1104 'params': {
1105 'skip_download': True,
1106 },
1107 },
a0566bbf 1108 {
29f7c58a 1109 # with '};' inside yt initial data (see [1])
1110 # see [2] for an example with '};' inside ytInitialPlayerResponse
1111 # 1. https://github.com/ytdl-org/youtube-dl/issues/27093
1112 # 2. https://github.com/ytdl-org/youtube-dl/issues/27216
a0566bbf 1113 'url': 'https://www.youtube.com/watch?v=CHqg6qOn4no',
1114 'info_dict': {
1115 'id': 'CHqg6qOn4no',
1116 'ext': 'mp4',
1117 'title': 'Part 77 Sort a list of simple types in c#',
1118 'description': 'md5:b8746fa52e10cdbf47997903f13b20dc',
1119 'upload_date': '20130831',
1120 'uploader_id': 'kudvenkat',
1121 'uploader': 'kudvenkat',
1122 },
1123 'params': {
1124 'skip_download': True,
1125 },
1126 },
29f7c58a 1127 {
1128 # another example of '};' in ytInitialData
1129 'url': 'https://www.youtube.com/watch?v=gVfgbahppCY',
1130 'only_matching': True,
1131 },
1132 {
1133 'url': 'https://www.youtube.com/watch_popup?v=63RmMXCd_bQ',
1134 'only_matching': True,
1135 },
545cc85d 1136 {
1137 # Age-gated video only available with authentication (unavailable
1138 # via embed page workaround)
1139 'url': 'XgnwCQzjau8',
1140 'only_matching': True,
1141 },
2eb88d95
PH
1142 ]
1143
e0df6211
PH
def __init__(self, *args, **kwargs):
    """Initialise the extractor and create its empty per-instance caches."""
    super(YoutubeIE, self).__init__(*args, **kwargs)
    # Player JS source text, keyed by player id
    # (populated in _extract_signature_function).
    self._code_cache = dict()
    # Signature functions, keyed by (player_url, signature cache id)
    # (populated in _decrypt_signature).
    self._player_cache = dict()
e0df6211 1148
60064c53
PH
def _signature_cache_id(self, example_sig):
    """Return the lengths of the dot-separated parts of example_sig, joined by dots.

    For example, a signature made of a 3-character part and a 5-character
    part yields '3.5'.
    """
    part_lengths = [
        compat_str(len(chunk)) for chunk in example_sig.split('.')]
    return '.'.join(part_lengths)
60064c53 1152
e40c758c
S
@classmethod
def _extract_player_info(cls, player_url):
    """Return the 'id' group of the first _PLAYER_INFO_RE pattern found in player_url.

    Raises ExtractorError when none of the patterns matches.
    """
    for pattern in cls._PLAYER_INFO_RE:
        match = re.search(pattern, player_url)
        if match:
            return match.group('id')
    raise ExtractorError('Cannot identify player %r' % player_url)
e40c758c
S
1162
def _extract_signature_function(self, video_id, player_url, example_sig):
    """Return a callable that transforms a signature shaped like example_sig.

    The transformation is looked up in the downloader's 'youtube-sigfuncs'
    cache first.  On a miss the player JS is downloaded (at most once per
    player id per extractor instance), parsed with _parse_sig_js, and the
    resulting index permutation is stored back into the cache.
    """
    player_id = self._extract_player_info(player_url)

    # Cache entry name; it must be a plain file name without any path part.
    func_id = 'js_%s_%s' % (
        player_id, self._signature_cache_id(example_sig))
    assert os.path.basename(func_id) == func_id

    stored_spec = self._downloader.cache.load('youtube-sigfuncs', func_id)
    if stored_spec is not None:
        # A cached spec is a list of source indices: output[k] = s[spec[k]].
        def apply_stored_spec(s):
            return ''.join(s[i] for i in stored_spec)
        return apply_stored_spec

    if player_id not in self._code_cache:
        self._code_cache[player_id] = self._download_webpage(
            player_url, video_id,
            note='Downloading player ' + player_id,
            errnote='Download of %s failed' % player_url)
    sig_func = self._parse_sig_js(self._code_cache[player_id])

    # Run the function on a string whose k-th character has code point k,
    # so the code points of the output reveal which input index went where.
    probe = ''.join(map(compat_chr, range(len(example_sig))))
    new_spec = [ord(ch) for ch in sig_func(probe)]

    self._downloader.cache.store('youtube-sigfuncs', func_id, new_spec)
    return sig_func
1189
def _print_sig_code(self, func, example_sig):
    """Print Python code equivalent to the signature function func.

    func is applied to a probe string whose k-th character has code point
    k; the resulting index permutation is rendered as a sum of 's[...]'
    index and slice expressions and written out with to_screen.
    """
    def gen_sig_code(idxs):
        """Yield 's[...]' expressions that together reproduce idxs."""
        def _genslice(start, end, step):
            # Render the run start..end (inclusive) with stride +1 or -1
            # as a Python slice expression.
            starts = '' if start == 0 else str(start)
            ends = (':%d' % (end + step)) if end + step >= 0 else ':'
            steps = '' if step == 1 else (':%d' % step)
            return 's[%s%s%s]' % (starts, ends, steps)

        # Nothing to emit for an empty permutation.
        if not idxs:
            return

        step = None
        # Squelch pyflakes warnings - start will be set when step is set
        start = '(Never used)'
        for i, prev in zip(idxs[1:], idxs[:-1]):
            if step is not None:
                if i - prev == step:
                    continue
                yield _genslice(start, prev, step)
                step = None
                continue
            if i - prev in [-1, 1]:
                step = i - prev
                start = prev
                continue
            else:
                yield 's[%d]' % prev

        # Use idxs[-1] rather than the loop variable: the loop body never
        # runs for a single-element permutation, which previously left the
        # loop variable unbound here.
        last = idxs[-1]
        if step is None:
            yield 's[%d]' % last
        else:
            yield _genslice(start, last, step)

    test_string = ''.join(map(compat_chr, range(len(example_sig))))
    cache_res = func(test_string)
    cache_spec = [ord(c) for c in cache_res]
    expr_code = ' + '.join(gen_sig_code(cache_spec))
    signature_id_tuple = '(%s)' % (
        ', '.join(compat_str(len(p)) for p in example_sig.split('.')))
    code = ('if tuple(len(p) for p in s.split(\'.\')) == %s:\n'
            ' return %s\n') % (signature_id_tuple, expr_code)
    self.to_screen('Extracted signature function:\n' + code)
edf3e38e 1228
e0df6211
PH
def _parse_sig_js(self, jscode):
    """Locate the signature function in the player JS and wrap it as a callable.

    The function name is found with the patterns below (captured in the
    'sig' group); the function itself is extracted with JSInterpreter.
    The returned callable takes the signature string and passes it to the
    JS function as its only argument.
    """
    sig_name_patterns = (
        r'\b[cs]\s*&&\s*[adf]\.set\([^,]+\s*,\s*encodeURIComponent\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\(',
        r'\b[a-zA-Z0-9]+\s*&&\s*[a-zA-Z0-9]+\.set\([^,]+\s*,\s*encodeURIComponent\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\(',
        r'(?:\b|[^a-zA-Z0-9$])(?P<sig>[a-zA-Z0-9$]{2})\s*=\s*function\(\s*a\s*\)\s*{\s*a\s*=\s*a\.split\(\s*""\s*\)',
        r'(?P<sig>[a-zA-Z0-9$]+)\s*=\s*function\(\s*a\s*\)\s*{\s*a\s*=\s*a\.split\(\s*""\s*\)',
        # Obsolete patterns
        r'(["\'])signature\1\s*,\s*(?P<sig>[a-zA-Z0-9$]+)\(',
        r'\.sig\|\|(?P<sig>[a-zA-Z0-9$]+)\(',
        r'yt\.akamaized\.net/\)\s*\|\|\s*.*?\s*[cs]\s*&&\s*[adf]\.set\([^,]+\s*,\s*(?:encodeURIComponent\s*\()?\s*(?P<sig>[a-zA-Z0-9$]+)\(',
        r'\b[cs]\s*&&\s*[adf]\.set\([^,]+\s*,\s*(?P<sig>[a-zA-Z0-9$]+)\(',
        r'\b[a-zA-Z0-9]+\s*&&\s*[a-zA-Z0-9]+\.set\([^,]+\s*,\s*(?P<sig>[a-zA-Z0-9$]+)\(',
        r'\bc\s*&&\s*a\.set\([^,]+\s*,\s*\([^)]*\)\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\(',
        r'\bc\s*&&\s*[a-zA-Z0-9]+\.set\([^,]+\s*,\s*\([^)]*\)\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\(',
        r'\bc\s*&&\s*[a-zA-Z0-9]+\.set\([^,]+\s*,\s*\([^)]*\)\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\(')
    funcname = self._search_regex(
        sig_name_patterns, jscode,
        'Initial JS player signature function name', group='sig')

    interpreter = JSInterpreter(jscode)
    js_sig_function = interpreter.extract_function(funcname)

    def run_sig_function(s):
        # The extracted function receives its arguments as a list.
        return js_sig_function([s])

    return run_sig_function
1249
def _decrypt_signature(self, s, video_id, player_url):
    """Turn the encrypted s field into a working signature

    player_url may be protocol-relative ('//...') or a path relative to
    https://www.youtube.com; it is made absolute before use.  Signature
    functions are memoised in self._player_cache per (player URL,
    signature cache id).  Any failure while obtaining or running the
    function is re-raised as ExtractorError carrying the traceback text.
    """
    if player_url is None:
        raise ExtractorError('Cannot decrypt signature without player_url')

    if player_url.startswith('//'):
        player_url = 'https:' + player_url
    elif re.match(r'https?://', player_url) is None:
        player_url = compat_urlparse.urljoin(
            'https://www.youtube.com', player_url)

    try:
        cache_key = (player_url, self._signature_cache_id(s))
        if cache_key in self._player_cache:
            sig_func = self._player_cache[cache_key]
        else:
            sig_func = self._extract_signature_function(
                video_id, player_url, s)
            self._player_cache[cache_key] = sig_func
        # Optional debugging aid controlled by a downloader parameter.
        if self._downloader.params.get('youtube_print_sig_code'):
            self._print_sig_code(sig_func, s)
        return sig_func(s)
    except Exception as e:
        raise ExtractorError(
            'Signature extraction failed: ' + traceback.format_exc(),
            cause=e)
e0df6211 1276
def _mark_watched(self, video_id, player_response):
    """Request the playback-tracking URL from player_response, if there is one.

    The URL is read from playbackTracking.videostatsPlaybackUrl.baseUrl;
    'ver' and a random 'cpn' are set in its query string before the
    (non-fatal) request is made.  Does nothing when the URL is missing.
    """
    playback_url = url_or_none(try_get(
        player_response,
        lambda x: x['playbackTracking']['videostatsPlaybackUrl']['baseUrl']))
    if not playback_url:
        return

    url_parts = compat_urlparse.urlparse(playback_url)
    qs = compat_urlparse.parse_qs(url_parts.query)

    # cpn generation algorithm is reverse engineered from base.js.
    # In fact it works even with dummy cpn.
    CPN_ALPHABET = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789-_'
    cpn_chars = []
    for _ in range(16):
        cpn_chars.append(CPN_ALPHABET[random.randint(0, 256) & 63])
    cpn = ''.join(cpn_chars)

    qs['ver'] = ['2']
    qs['cpn'] = [cpn]

    new_query = compat_urllib_parse_urlencode(qs, True)
    playback_url = compat_urlparse.urlunparse(
        url_parts._replace(query=new_query))

    self._download_webpage(
        playback_url, video_id, 'Marking watched',
        'Unable to mark watched', fatal=False)
1301
66c9fa36
S
@staticmethod
def _extract_urls(webpage):
    """Return a list of YouTube embeds found in the HTML text webpage.

    Three sources are scanned, in this order:
      * embedded-player URLs in iframe/embed/object tags, data-video-url
        attributes, and embedSWF(...) / new SWFObject(...) calls;
      * lazyYT containers (the data-youtube-id attribute value);
      * the Wordpress "YouTube Video Importer" plugin (the data-video_id
        attribute value).
    The first source yields URLs; the other two yield the raw attribute
    values.  An empty list is returned when nothing is found.
    """
    # Embedded YouTube player
    # NOTE: the embedSWF alternative used to read 'embedSWF\(?:\s*', which
    # demands a literal ':' after an optional '(' and therefore could not
    # match an actual embedSWF("...") call.
    entries = [
        unescapeHTML(mobj.group('url'))
        for mobj in re.finditer(r'''(?x)
        (?:
            <iframe[^>]+?src=|
            data-video-url=|
            <embed[^>]+?src=|
            embedSWF\(\s*|
            <object[^>]+data=|
            new\s+SWFObject\(
        )
        (["\'])
        (?P<url>(?:https?:)?//(?:www\.)?youtube(?:-nocookie)?\.com/
            (?:embed|v|p)/[0-9A-Za-z_-]{11}.*?)
        \1''', webpage)]

    # lazyYT YouTube embed
    entries.extend(
        unescapeHTML(lazy_id) for lazy_id in
        re.findall(r'class="lazyYT" data-youtube-id="([^"]+)"', webpage))

    # Wordpress "YouTube Video Importer" plugin
    matches = re.findall(r'''(?x)<div[^>]+
        class=(?P<q1>[\'"])[^\'"]*\byvii_single_video_player\b[^\'"]*(?P=q1)[^>]+
        data-video_id=(?P<q2>[\'"])([^\'"]+)(?P=q2)''', webpage)
    # Each match is a (q1, q2, video id) tuple; only the id is wanted.
    entries.extend(m[-1] for m in matches)

    return entries
1333
@staticmethod
def _extract_url(webpage):
    """Return the first entry of YoutubeIE._extract_urls(webpage), or None if it is empty."""
    found = YoutubeIE._extract_urls(webpage)
    if not found:
        return None
    return found[0]
1338
97665381
PH
@classmethod
def extract_id(cls, url):
    """Return group 2 of _VALID_URL matched (in verbose mode) against url.

    Raises ExtractorError when url does not match _VALID_URL.
    """
    match = re.match(cls._VALID_URL, url, re.VERBOSE)
    if match is not None:
        return match.group(2)
    raise ExtractorError('Invalid URL: %s' % url)
1346
def _extract_chapters_from_json(self, data, video_id, duration):
    """Build a list of chapter dicts from the chaptered player bar in data.

    Each returned dict has 'start_time', 'end_time' (both in seconds) and
    'title'.  A chapter ends where the next one starts; the last chapter
    ends at duration.  Chapters without a usable start or end time are
    skipped.  Returns None when data contains no chapter list.
    """
    raw_chapters = try_get(
        data,
        lambda x: x['playerOverlays']
                   ['playerOverlayRenderer']
                   ['decoratedPlayerBarRenderer']
                   ['decoratedPlayerBarRenderer']
                   ['playerBar']
                   ['chapteredPlayerBarRenderer']
                   ['chapters'],
        list)
    if not raw_chapters:
        return

    def chapter_time(chapter):
        # timeRangeStartMillis is in milliseconds; convert to seconds.
        start_millis = try_get(
            chapter,
            lambda x: x['chapterRenderer']['timeRangeStartMillis'],
            int)
        return float_or_none(start_millis, scale=1000)

    total = len(raw_chapters)
    chapters = []
    for index, chapter in enumerate(raw_chapters):
        start_time = chapter_time(chapter)
        if start_time is None:
            continue
        following = index + 1
        if following < total:
            end_time = chapter_time(raw_chapters[following])
        else:
            end_time = duration
        if end_time is None:
            continue
        title = try_get(
            chapter, lambda x: x['chapterRenderer']['title']['simpleText'],
            compat_str)
        chapters.append({
            'start_time': start_time,
            'end_time': end_time,
            'title': title,
        })
    return chapters
1386
def _extract_yt_initial_variable(self, webpage, regex, video_id, name):
    """Search webpage for a JSON blob described by regex and return it parsed.

    Two patterns are handed to _search_regex: regex followed by
    _YT_INITIAL_BOUNDARY_RE, then regex on its own.  When neither is found
    the default '{}' is parsed instead; JSON parsing is non-fatal.
    """
    bounded_regex = r'%s\s*%s' % (regex, self._YT_INITIAL_BOUNDARY_RE)
    raw_json = self._search_regex(
        (bounded_regex, regex), webpage, name, default='{}')
    return self._parse_json(raw_json, video_id, fatal=False)
84213ea8 1391
c5e8d7af 1392 def _real_extract(self, url):
cf7e015f 1393 url, smuggled_data = unsmuggle_url(url, {})
545cc85d 1394 video_id = self._match_id(url)
1395 base_url = self.http_scheme() + '//www.youtube.com/'
1396 webpage_url = base_url + 'watch?v=' + video_id
1397 webpage = self._download_webpage(webpage_url, video_id, fatal=False)
1398
1399 player_response = None
1400 if webpage:
1401 player_response = self._extract_yt_initial_variable(
1402 webpage, self._YT_INITIAL_PLAYER_RESPONSE_RE,
1403 video_id, 'initial player response')
1404 if not player_response:
1405 player_response = self._call_api(
1406 'player', {'videoId': video_id}, video_id)
1407
1408 playability_status = player_response.get('playabilityStatus') or {}
1409 if playability_status.get('reason') == 'Sign in to confirm your age':
1410 pr = self._parse_json(try_get(compat_parse_qs(
1411 self._download_webpage(
1412 base_url + 'get_video_info', video_id,
1413 'Refetching age-gated info webpage',
1414 'unable to download video info webpage', query={
1415 'video_id': video_id,
1416 'eurl': 'https://www.youtube.com/embed/' + video_id,
1417 }, fatal=False)),
1418 lambda x: x['player_response'][0],
1419 compat_str) or '{}', video_id)
1420 if pr:
1421 player_response = pr
1422
1423 trailer_video_id = try_get(
1424 playability_status,
1425 lambda x: x['errorScreen']['playerLegacyDesktopYpcTrailerRenderer']['trailerVideoId'],
1426 compat_str)
1427 if trailer_video_id:
1428 return self.url_result(
1429 trailer_video_id, self.ie_key(), trailer_video_id)
cf7e015f 1430
545cc85d 1431 def get_text(x):
1432 if not x:
c2d125d9 1433 return
545cc85d 1434 return x.get('simpleText') or ''.join([r['text'] for r in x['runs']])
15be3eb5 1435
545cc85d 1436 search_meta = (
1437 lambda x: self._html_search_meta(x, webpage, default=None)) \
1438 if webpage else lambda x: None
dbdaaa23 1439
545cc85d 1440 video_details = player_response.get('videoDetails') or {}
37357d21 1441 microformat = try_get(
545cc85d 1442 player_response,
1443 lambda x: x['microformat']['playerMicroformatRenderer'],
1444 dict) or {}
1445 video_title = video_details.get('title') \
1446 or get_text(microformat.get('title')) \
1447 or search_meta(['og:title', 'twitter:title', 'title'])
1448 video_description = video_details.get('shortDescription')
cf7e015f 1449
8fe10494 1450 if not smuggled_data.get('force_singlefeed', False):
5e1eddb9 1451 if not self._downloader.params.get('noplaylist'):
8fe10494
S
1452 multifeed_metadata_list = try_get(
1453 player_response,
1454 lambda x: x['multicamera']['playerLegacyMulticameraRenderer']['metadataList'],
545cc85d 1455 compat_str)
8fe10494
S
1456 if multifeed_metadata_list:
1457 entries = []
1458 feed_ids = []
1459 for feed in multifeed_metadata_list.split(','):
1460 # Unquote should take place before split on comma (,) since textual
1461 # fields may contain comma as well (see
067aa17e 1462 # https://github.com/ytdl-org/youtube-dl/issues/8536)
545cc85d 1463 feed_data = compat_parse_qs(
1464 compat_urllib_parse_unquote_plus(feed))
6b09401b
S
1465
1466 def feed_entry(name):
545cc85d 1467 return try_get(
1468 feed_data, lambda x: x[name][0], compat_str)
6b09401b
S
1469
1470 feed_id = feed_entry('id')
1471 if not feed_id:
1472 continue
1473 feed_title = feed_entry('title')
1474 title = video_title
1475 if feed_title:
1476 title += ' (%s)' % feed_title
8fe10494
S
1477 entries.append({
1478 '_type': 'url_transparent',
1479 'ie_key': 'Youtube',
1480 'url': smuggle_url(
545cc85d 1481 base_url + 'watch?v=' + feed_data['id'][0],
8fe10494 1482 {'force_singlefeed': True}),
6b09401b 1483 'title': title,
8fe10494 1484 })
6b09401b 1485 feed_ids.append(feed_id)
8fe10494
S
1486 self.to_screen(
1487 'Downloading multifeed video (%s) - add --no-playlist to just download video %s'
1488 % (', '.join(feed_ids), video_id))
545cc85d 1489 return self.playlist_result(
1490 entries, video_id, video_title, video_description)
8fe10494
S
1491 else:
1492 self.to_screen('Downloading just video %s because of --no-playlist' % video_id)
cf7e015f 1493
545cc85d 1494 formats = []
1495 itags = []
1496 player_url = None
8a784c74 1497 # q = qualities(['tiny', 'small', 'medium', 'large', 'hd720', 'hd1080', 'hd1440', 'hd2160', 'hd2880', 'highres'])
545cc85d 1498 streaming_data = player_response.get('streamingData') or {}
1499 streaming_formats = streaming_data.get('formats') or []
1500 streaming_formats.extend(streaming_data.get('adaptiveFormats') or [])
1501 for fmt in streaming_formats:
1502 if fmt.get('targetDurationSec') or fmt.get('drmFamilies'):
1503 continue
321bf820 1504
545cc85d 1505 fmt_url = fmt.get('url')
1506 if not fmt_url:
1507 sc = compat_parse_qs(fmt.get('signatureCipher'))
1508 fmt_url = url_or_none(try_get(sc, lambda x: x['url'][0]))
1509 encrypted_sig = try_get(sc, lambda x: x['s'][0])
1510 if not (sc and fmt_url and encrypted_sig):
1511 continue
1512 if not player_url:
1513 if not webpage:
1514 continue
1515 player_url = self._search_regex(
1516 r'"(?:PLAYER_JS_URL|jsUrl)"\s*:\s*"([^"]+)"',
1517 webpage, 'player URL', fatal=False)
1518 if not player_url:
201e9eaa 1519 continue
545cc85d 1520 signature = self._decrypt_signature(sc['s'][0], video_id, player_url)
1521 sp = try_get(sc, lambda x: x['sp'][0]) or 'signature'
1522 fmt_url += '&' + sp + '=' + signature
1523
1524 itag = str_or_none(fmt.get('itag'))
1525 if itag:
1526 itags.append(itag)
1527 quality = fmt.get('quality')
1528 dct = {
1529 'asr': int_or_none(fmt.get('audioSampleRate')),
1530 'filesize': int_or_none(fmt.get('contentLength')),
1531 'format_id': itag,
1532 'format_note': fmt.get('qualityLabel') or quality,
1533 'fps': int_or_none(fmt.get('fps')),
1534 'height': int_or_none(fmt.get('height')),
1535 # 'quality': q(quality), # This does not correctly reflect the overall quality of the format
1536 'tbr': float_or_none(fmt.get(
1537 'averageBitrate') or fmt.get('bitrate'), 1000),
1538 'url': fmt_url,
1539 'width': fmt.get('width'),
1540 }
1541 mimetype = fmt.get('mimeType')
1542 if mimetype:
1543 mobj = re.match(
1544 r'((?:[^/]+)/(?:[^;]+))(?:;\s*codecs="([^"]+)")?', mimetype)
1545 if mobj:
1546 dct['ext'] = mimetype2ext(mobj.group(1))
1547 dct.update(parse_codecs(mobj.group(2)))
1548 if dct.get('acodec') == 'none' or dct.get('vcodec') == 'none':
1549 dct['downloader_options'] = {
1550 # Youtube throttles chunks >~10M
1551 'http_chunk_size': 10485760,
bf1317d2 1552 }
545cc85d 1553 formats.append(dct)
1554
1555 hls_manifest_url = streaming_data.get('hlsManifestUrl')
1556 if hls_manifest_url:
1557 for f in self._extract_m3u8_formats(
1558 hls_manifest_url, video_id, 'mp4', fatal=False):
1559 itag = self._search_regex(
1560 r'/itag/(\d+)', f['url'], 'itag', default=None)
1561 if itag:
1562 f['format_id'] = itag
1563 formats.append(f)
1564
1565 if self._downloader.params.get('youtube_include_dash_manifest'):
1566 dash_manifest_url = streaming_data.get('dashManifestUrl')
1567 if dash_manifest_url:
1568 dash_formats = []
1569 for f in self._extract_mpd_formats(
1570 dash_manifest_url, video_id, fatal=False):
1571 filesize = int_or_none(self._search_regex(
1572 r'/clen/(\d+)', f.get('fragment_base_url')
1573 or f['url'], 'file size', default=None))
1574 if filesize:
1575 f['filesize'] = filesize
1576 dash_formats.append(f)
1577 # Until further investigation prefer DASH formats as non-DASH
1578 # may not be available (see [1])
1579 # 1. https://github.com/ytdl-org/youtube-dl/issues/28070
1580 if dash_formats:
1581 dash_formats_keys = [f['format_id'] for f in dash_formats]
1582 formats = [f for f in formats if f['format_id'] not in dash_formats_keys]
1583 formats.extend(dash_formats)
bf1317d2 1584
545cc85d 1585 if not formats:
1586 if streaming_data.get('licenseInfos'):
1587 raise ExtractorError(
1588 'This video is DRM protected.', expected=True)
1589 pemr = try_get(
1590 playability_status,
1591 lambda x: x['errorScreen']['playerErrorMessageRenderer'],
1592 dict) or {}
1593 reason = get_text(pemr.get('reason')) or playability_status.get('reason')
1594 subreason = pemr.get('subreason')
1595 if subreason:
1596 subreason = clean_html(get_text(subreason))
1597 if subreason == 'The uploader has not made this video available in your country.':
1598 countries = microformat.get('availableCountries')
1599 if not countries:
1600 regions_allowed = search_meta('regionsAllowed')
1601 countries = regions_allowed.split(',') if regions_allowed else None
1602 self.raise_geo_restricted(
1603 subreason, countries)
1604 reason += '\n' + subreason
1605 if reason:
1606 raise ExtractorError(reason, expected=True)
bf1317d2 1607
545cc85d 1608 self._sort_formats(formats)
bf1317d2 1609
545cc85d 1610 keywords = video_details.get('keywords') or []
1611 if not keywords and webpage:
1612 keywords = [
1613 unescapeHTML(m.group('content'))
1614 for m in re.finditer(self._meta_regex('og:video:tag'), webpage)]
1615 for keyword in keywords:
1616 if keyword.startswith('yt:stretch='):
1617 w, h = keyword.split('=')[1].split(':')
1618 w, h = int(w), int(h)
1619 if w > 0 and h > 0:
1620 ratio = w / h
1621 for f in formats:
1622 if f.get('vcodec') != 'none':
1623 f['stretched_ratio'] = ratio
6449cd80 1624
545cc85d 1625 thumbnails = []
1626 for container in (video_details, microformat):
1627 for thumbnail in (try_get(
1628 container,
1629 lambda x: x['thumbnail']['thumbnails'], list) or []):
1630 thumbnail_url = thumbnail.get('url')
1631 if not thumbnail_url:
bf1317d2 1632 continue
545cc85d 1633 thumbnails.append({
1634 'height': int_or_none(thumbnail.get('height')),
1635 'url': thumbnail_url,
1636 'width': int_or_none(thumbnail.get('width')),
1637 })
1638 if thumbnails:
1639 break
a6211d23 1640 else:
545cc85d 1641 thumbnail = search_meta(['og:image', 'twitter:image'])
1642 if thumbnail:
1643 thumbnails = [{'url': thumbnail}]
1644
1645 category = microformat.get('category') or search_meta('genre')
1646 channel_id = video_details.get('channelId') \
1647 or microformat.get('externalChannelId') \
1648 or search_meta('channelId')
1649 duration = int_or_none(
1650 video_details.get('lengthSeconds')
1651 or microformat.get('lengthSeconds')) \
1652 or parse_duration(search_meta('duration'))
1653 is_live = video_details.get('isLive')
1654 owner_profile_url = microformat.get('ownerProfileUrl')
1655
1656 info = {
1657 'id': video_id,
1658 'title': self._live_title(video_title) if is_live else video_title,
1659 'formats': formats,
1660 'thumbnails': thumbnails,
1661 'description': video_description,
1662 'upload_date': unified_strdate(
1663 microformat.get('uploadDate')
1664 or search_meta('uploadDate')),
1665 'uploader': video_details['author'],
1666 'uploader_id': self._search_regex(r'/(?:channel|user)/([^/?&#]+)', owner_profile_url, 'uploader id') if owner_profile_url else None,
1667 'uploader_url': owner_profile_url,
1668 'channel_id': channel_id,
1669 'channel_url': 'https://www.youtube.com/channel/' + channel_id if channel_id else None,
1670 'duration': duration,
1671 'view_count': int_or_none(
1672 video_details.get('viewCount')
1673 or microformat.get('viewCount')
1674 or search_meta('interactionCount')),
1675 'average_rating': float_or_none(video_details.get('averageRating')),
1676 'age_limit': 18 if (
1677 microformat.get('isFamilySafe') is False
1678 or search_meta('isFamilyFriendly') == 'false'
1679 or search_meta('og:restrictions:age') == '18+') else 0,
1680 'webpage_url': webpage_url,
1681 'categories': [category] if category else None,
1682 'tags': keywords,
1683 'is_live': is_live,
1684 'playable_in_embed': playability_status.get('playableInEmbed'),
1685 }
b477fc13 1686
545cc85d 1687 pctr = try_get(
1688 player_response,
1689 lambda x: x['captions']['playerCaptionsTracklistRenderer'], dict)
1690 subtitles = {}
1691 if pctr:
def process_language(container, base_url, lang_code, query):
    """Store one subtitle entry per supported format under ``lang_code``.

    For every format in ``self._SUBTITLE_FORMATS`` an entry with the keys
    'ext' and 'url' is built; the URL is ``base_url`` updated with ``query``
    plus a 'fmt' parameter naming the format.  The resulting list is
    assigned to ``container[lang_code]``, replacing any previous value.

    Note: ``query`` is modified in place (its 'fmt' key is overwritten on
    each iteration), so callers should pass a dict they do not reuse.
    """
    tracks = []
    for sub_format in self._SUBTITLE_FORMATS:
        # Same dict on every pass; only the 'fmt' entry changes.
        query['fmt'] = sub_format
        tracks.append({
            'ext': sub_format,
            'url': update_url_query(base_url, query),
        })
    container[lang_code] = tracks
7e72694b 1703
545cc85d 1704 for caption_track in (pctr.get('captionTracks') or []):
1705 base_url = caption_track.get('baseUrl')
1706 if not base_url:
1707 continue
1708 if caption_track.get('kind') != 'asr':
1709 lang_code = caption_track.get('languageCode')
1710 if not lang_code:
1711 continue
1712 process_language(
1713 subtitles, base_url, lang_code, {})
1714 continue
1715 automatic_captions = {}
1716 for translation_language in (pctr.get('translationLanguages') or []):
1717 translation_language_code = translation_language.get('languageCode')
1718 if not translation_language_code:
1719 continue
1720 process_language(
1721 automatic_captions, base_url, translation_language_code,
1722 {'tlang': translation_language_code})
1723 info['automatic_captions'] = automatic_captions
1724 info['subtitles'] = subtitles
7e72694b 1725
545cc85d 1726 parsed_url = compat_urllib_parse_urlparse(url)
1727 for component in [parsed_url.fragment, parsed_url.query]:
1728 query = compat_parse_qs(component)
1729 for k, v in query.items():
1730 for d_k, s_ks in [('start', ('start', 't')), ('end', ('end',))]:
1731 d_k += '_time'
1732 if d_k not in info and k in s_ks:
1733 info[d_k] = parse_duration(query[k][0])
822b9d9c
RA
1734
1735 # Youtube Music Auto-generated description
822b9d9c 1736 if video_description:
38d70284 1737 mobj = re.search(r'(?s)(?P<track>[^·\n]+)·(?P<artist>[^\n]+)\n+(?P<album>[^\n]+)(?:.+?℗\s*(?P<release_year>\d{4})(?!\d))?(?:.+?Released on\s*:\s*(?P<release_date>\d{4}-\d{2}-\d{2}))?(.+?\nArtist\s*:\s*(?P<clean_artist>[^\n]+))?.+\nAuto-generated by YouTube\.\s*$', video_description)
822b9d9c 1738 if mobj:
822b9d9c
RA
1739 release_year = mobj.group('release_year')
1740 release_date = mobj.group('release_date')
1741 if release_date:
1742 release_date = release_date.replace('-', '')
1743 if not release_year:
545cc85d 1744 release_year = release_date[:4]
1745 info.update({
1746 'album': mobj.group('album'.strip()),
1747 'artist': mobj.group('clean_artist') or ', '.join(a.strip() for a in mobj.group('artist').split('·')),
1748 'track': mobj.group('track').strip(),
1749 'release_date': release_date,
1750 'release_year': int(release_year),
1751 })
7e72694b 1752
545cc85d 1753 initial_data = None
1754 if webpage:
1755 initial_data = self._extract_yt_initial_variable(
1756 webpage, self._YT_INITIAL_DATA_RE, video_id,
1757 'yt initial data')
1758 if not initial_data:
1759 initial_data = self._call_api(
1760 'next', {'videoId': video_id}, video_id, fatal=False)
1761
1762 if not is_live:
1763 try:
1764 # This will error if there is no livechat
1765 initial_data['contents']['twoColumnWatchNextResults']['conversationBar']['liveChatRenderer']['continuations'][0]['reloadContinuationData']['continuation']
1766 info['subtitles']['live_chat'] = [{
1767 'video_id': video_id,
1768 'ext': 'json',
1769 'protocol': 'youtube_live_chat_replay',
1770 }]
1771 except (KeyError, IndexError, TypeError):
1772 pass
1773
1774 if initial_data:
1775 chapters = self._extract_chapters_from_json(
1776 initial_data, video_id, duration)
1777 if not chapters:
1778 for engagment_pannel in (initial_data.get('engagementPanels') or []):
1779 contents = try_get(
1780 engagment_pannel, lambda x: x['engagementPanelSectionListRenderer']['content']['macroMarkersListRenderer']['contents'],
1781 list)
1782 if not contents:
1783 continue
1784
def chapter_time(mmlir):
    """Return the time parsed from a chapter marker's 'timeDescription'.

    ``mmlir`` is a macroMarkersListItemRenderer dict.  It may also be
    None or empty: the caller looks the *next* marker up with try_get,
    which yields None when that item has no renderer.  In that case the
    lookup falls back to an empty dict instead of raising
    AttributeError, and the text helper receives None exactly as it
    does for a renderer that lacks 'timeDescription'.

    The result is whatever parse_duration returns for the extracted
    text; the caller treats None as "no usable time" and skips the
    chapter.
    """
    return parse_duration(
        get_text((mmlir or {}).get('timeDescription')))
1788
1789 chapters = []
1790 for next_num, content in enumerate(contents, start=1):
1791 mmlir = content.get('macroMarkersListItemRenderer') or {}
1792 start_time = chapter_time(mmlir)
1793 end_time = chapter_time(try_get(
1794 contents, lambda x: x[next_num]['macroMarkersListItemRenderer'])) \
1795 if next_num < len(contents) else duration
1796 if start_time is None or end_time is None:
1797 continue
1798 chapters.append({
1799 'start_time': start_time,
1800 'end_time': end_time,
1801 'title': get_text(mmlir.get('title')),
1802 })
1803 if chapters:
1804 break
1805 if chapters:
1806 info['chapters'] = chapters
1807
1808 contents = try_get(
1809 initial_data,
1810 lambda x: x['contents']['twoColumnWatchNextResults']['results']['results']['contents'],
1811 list) or []
1812 for content in contents:
1813 vpir = content.get('videoPrimaryInfoRenderer')
1814 if vpir:
1815 stl = vpir.get('superTitleLink')
1816 if stl:
1817 stl = get_text(stl)
1818 if try_get(
1819 vpir,
1820 lambda x: x['superTitleIcon']['iconType']) == 'LOCATION_PIN':
1821 info['location'] = stl
1822 else:
1823 mobj = re.search(r'(.+?)\s*S(\d+)\s*•\s*E(\d+)', stl)
1824 if mobj:
1825 info.update({
1826 'series': mobj.group(1),
1827 'season_number': int(mobj.group(2)),
1828 'episode_number': int(mobj.group(3)),
1829 })
1830 for tlb in (try_get(
1831 vpir,
1832 lambda x: x['videoActions']['menuRenderer']['topLevelButtons'],
1833 list) or []):
1834 tbr = tlb.get('toggleButtonRenderer') or {}
1835 for getter, regex in [(
1836 lambda x: x['defaultText']['accessibility']['accessibilityData'],
1837 r'(?P<count>[\d,]+)\s*(?P<type>(?:dis)?like)'), ([
1838 lambda x: x['accessibility'],
1839 lambda x: x['accessibilityData']['accessibilityData'],
1840 ], r'(?P<type>(?:dis)?like) this video along with (?P<count>[\d,]+) other people')]:
1841 label = (try_get(tbr, getter, dict) or {}).get('label')
1842 if label:
1843 mobj = re.match(regex, label)
1844 if mobj:
1845 info[mobj.group('type') + '_count'] = str_to_int(mobj.group('count'))
1846 break
1847 sbr_tooltip = try_get(
1848 vpir, lambda x: x['sentimentBar']['sentimentBarRenderer']['tooltip'])
1849 if sbr_tooltip:
1850 like_count, dislike_count = sbr_tooltip.split(' / ')
1851 info.update({
1852 'like_count': str_to_int(like_count),
1853 'dislike_count': str_to_int(dislike_count),
1854 })
1855 vsir = content.get('videoSecondaryInfoRenderer')
1856 if vsir:
1857 info['channel'] = get_text(try_get(
1858 vsir,
1859 lambda x: x['owner']['videoOwnerRenderer']['title'],
1860 compat_str))
1861 rows = try_get(
1862 vsir,
1863 lambda x: x['metadataRowContainer']['metadataRowContainerRenderer']['rows'],
1864 list) or []
1865 multiple_songs = False
1866 for row in rows:
1867 if try_get(row, lambda x: x['metadataRowRenderer']['hasDividerLine']) is True:
1868 multiple_songs = True
1869 break
1870 for row in rows:
1871 mrr = row.get('metadataRowRenderer') or {}
1872 mrr_title = mrr.get('title')
1873 if not mrr_title:
1874 continue
1875 mrr_title = get_text(mrr['title'])
1876 mrr_contents_text = get_text(mrr['contents'][0])
1877 if mrr_title == 'License':
1878 info['license'] = mrr_contents_text
1879 elif not multiple_songs:
1880 if mrr_title == 'Album':
1881 info['album'] = mrr_contents_text
1882 elif mrr_title == 'Artist':
1883 info['artist'] = mrr_contents_text
1884 elif mrr_title == 'Song':
1885 info['track'] = mrr_contents_text
1886
1887 fallbacks = {
1888 'channel': 'uploader',
1889 'channel_id': 'uploader_id',
1890 'channel_url': 'uploader_url',
1891 }
1892 for to, frm in fallbacks.items():
1893 if not info.get(to):
1894 info[to] = info.get(frm)
1895
1896 for s_k, d_k in [('artist', 'creator'), ('track', 'alt_title')]:
1897 v = info.get(s_k)
1898 if v:
1899 info[d_k] = v
b84071c0 1900
06167fbb 1901 # get xsrf for annotations or comments
1902 get_annotations = self._downloader.params.get('writeannotations', False)
1903 get_comments = self._downloader.params.get('getcomments', False)
1904 if get_annotations or get_comments:
29f7c58a 1905 xsrf_token = None
545cc85d 1906 ytcfg = self._extract_ytcfg(video_id, webpage)
29f7c58a 1907 if ytcfg:
1908 xsrf_token = try_get(ytcfg, lambda x: x['XSRF_TOKEN'], compat_str)
1909 if not xsrf_token:
1910 xsrf_token = self._search_regex(
1911 r'([\'"])XSRF_TOKEN\1\s*:\s*([\'"])(?P<xsrf_token>(?:(?!\2).)+)\2',
8a784c74 1912 webpage, 'xsrf token', group='xsrf_token', fatal=False)
06167fbb 1913
1914 # annotations
06167fbb 1915 if get_annotations:
64b6a4e9
RA
1916 invideo_url = try_get(
1917 player_response, lambda x: x['annotations'][0]['playerAnnotationsUrlsRenderer']['invideoUrl'], compat_str)
1918 if xsrf_token and invideo_url:
29f7c58a 1919 xsrf_field_name = None
1920 if ytcfg:
1921 xsrf_field_name = try_get(ytcfg, lambda x: x['XSRF_FIELD_NAME'], compat_str)
1922 if not xsrf_field_name:
1923 xsrf_field_name = self._search_regex(
1924 r'([\'"])XSRF_FIELD_NAME\1\s*:\s*([\'"])(?P<xsrf_field_name>\w+)\2',
8a784c74 1925 webpage, 'xsrf field name',
29f7c58a 1926 group='xsrf_field_name', default='session_token')
8a784c74 1927 info['annotations'] = self._download_webpage(
64b6a4e9
RA
1928 self._proto_relative_url(invideo_url),
1929 video_id, note='Downloading annotations',
1930 errnote='Unable to download video annotations', fatal=False,
1931 data=urlencode_postdata({xsrf_field_name: xsrf_token}))
7e72694b 1932
06167fbb 1933 # Get comments
1934 # TODO: Refactor and move to separate function
1935 if get_comments:
1936 expected_video_comment_count = 0
1937 video_comments = []
1938
def find_value(html, key, num_chars=2, separator='"'):
    """Return the text that follows ``key`` in ``html`` up to ``separator``.

    The value is taken to start ``num_chars`` characters after the end of
    the first occurrence of ``key`` (by default skipping two characters,
    e.g. a ':' and an opening quote) and to end just before the next
    occurrence of ``separator``.

    Returns an empty string when ``key`` does not occur in ``html``.  When
    ``separator`` does not occur after the value start, the rest of
    ``html`` is returned.
    """
    key_pos = html.find(key)
    if key_pos == -1:
        # str.find signals a miss with -1; using it in the arithmetic
        # below would produce an unrelated slice of the document.
        return ''
    pos_begin = key_pos + len(key) + num_chars
    pos_end = html.find(separator, pos_begin)
    if pos_end == -1:
        # No terminator: slicing with -1 would silently drop the last
        # character, so return everything that is left instead.
        return html[pos_begin:]
    return html[pos_begin:pos_end]
1943
def search_dict(partial, key):
    """Lazily yield every value stored under ``key`` in a nested structure.

    ``partial`` is walked depth-first through dicts and lists; anything
    else is ignored.  When a dict entry's name equals ``key`` its value is
    yielded as-is and is NOT searched any further, so matches nested
    inside a matching value are not reported separately.
    """
    if isinstance(partial, dict):
        for name, value in partial.items():
            if name != key:
                # Not a match: descend into the value instead.
                for found in search_dict(value, key):
                    yield found
                continue
            yield value
        return
    if not isinstance(partial, list):
        # Scalars (and any other type) cannot contain further matches.
        return
    for element in partial:
        for found in search_dict(element, key):
            yield found
1956
8a784c74 1957 continuations = []
1958 if initial_data:
1959 try:
1960 ncd = next(search_dict(initial_data, 'nextContinuationData'))
1961 continuations = [ncd['continuation']]
1962 # Handle videos where comments have been disabled entirely
1963 except StopIteration:
1964 pass
06167fbb 1965
def get_continuation(continuation, session_token, replies=False):
    """Request one page of comments (or comment replies) and decode it.

    Posts ``session_token`` to the comment_service_ajax endpoint with
    ``continuation`` as the 'ctoken' query parameter.  ``replies`` selects
    the 'action_get_comment_replies' action instead of
    'action_get_comments'.

    Returns the parsed JSON response for HTTP 200 and None for HTTP 413
    (413 is passed as an expected status so that it reaches this code).
    Raises ExtractorError for any other status code that is returned.
    """
    action = 'action_get_comment_replies' if replies else 'action_get_comments'
    query = {
        'pbj': 1,
        'ctoken': continuation,
        action: 1,
    }
    request_headers = {
        'Accept': '*/*',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:76.0) Gecko/20100101 Firefox/76.0',
        'X-YouTube-Client-Name': '1',
        'X-YouTube-Client-Version': '2.20201202.06.01'
    }
    post_data = urlencode_postdata({
        'session_token': session_token
    })

    # A single request suffices: every outcome below returns or raises.
    content, handle = self._download_webpage_handle(
        'https://www.youtube.com/comment_service_ajax',
        video_id,
        note=False,
        expected_status=[413],
        data=post_data,
        query=query,
        headers=request_headers)

    response_code = handle.getcode()
    if response_code == 413:
        return None
    if response_code != 200:
        raise ExtractorError('Unexpected HTTP error code: %s' % response_code)
    return self._parse_json(content, video_id)
2000
2001 first_continuation = True
885d36d4 2002 chain_msg = ''
2003 self.to_screen('Downloading comments')
06167fbb 2004 while continuations:
885d36d4 2005 continuation = continuations.pop()
8d0ea5f9 2006 comment_response = get_continuation(continuation, xsrf_token)
06167fbb 2007 if not comment_response:
2008 continue
2009 if list(search_dict(comment_response, 'externalErrorMessage')):
2010 raise ExtractorError('Error returned from server: ' + next(search_dict(comment_response, 'externalErrorMessage')))
2011
8d0ea5f9
B
2012 if 'continuationContents' not in comment_response['response']:
2013 # Something is wrong here. Youtube won't accept this continuation token for some reason and responds with a user satisfaction dialog (error?)
2014 continue
2015 # not sure if this actually helps
2016 if 'xsrf_token' in comment_response:
2017 xsrf_token = comment_response['xsrf_token']
2018
06167fbb 2019 item_section = comment_response['response']['continuationContents']['itemSectionContinuation']
2020 if first_continuation:
2021 expected_video_comment_count = int(item_section['header']['commentsHeaderRenderer']['countText']['runs'][0]['text'].replace(' Comments', '').replace('1 Comment', '1').replace(',', ''))
2022 first_continuation = False
2023 if 'contents' not in item_section:
2024 # continuation returned no comments?
2025 # set an empty array as to not break the for loop
2026 item_section['contents'] = []
2027
2028 for meta_comment in item_section['contents']:
2029 comment = meta_comment['commentThreadRenderer']['comment']['commentRenderer']
2030 video_comments.append({
2031 'id': comment['commentId'],
2032 'text': ''.join([c['text'] for c in comment['contentText']['runs']]),
8d0ea5f9 2033 'time_text': ''.join([c['text'] for c in comment['publishedTimeText']['runs']]),
06167fbb 2034 'author': comment.get('authorText', {}).get('simpleText', ''),
2035 'votes': comment.get('voteCount', {}).get('simpleText', '0'),
2036 'author_thumbnail': comment['authorThumbnail']['thumbnails'][-1]['url'],
2037 'parent': 'root'
2038 })
2039 if 'replies' not in meta_comment['commentThreadRenderer']:
2040 continue
2041
8d0ea5f9
B
2042 reply_continuations = [rcn['nextContinuationData']['continuation'] for rcn in meta_comment['commentThreadRenderer']['replies']['commentRepliesRenderer']['continuations']]
2043 while reply_continuations:
06167fbb 2044 time.sleep(1)
8d0ea5f9
B
2045 continuation = reply_continuations.pop()
2046 replies_data = get_continuation(continuation, xsrf_token, True)
06167fbb 2047 if not replies_data or 'continuationContents' not in replies_data[1]['response']:
8d0ea5f9 2048 continue
06167fbb 2049
2050 if self._downloader.params.get('verbose', False):
885d36d4 2051 chain_msg = ' (chain %s)' % comment['commentId']
2052 self.to_screen('Comments downloaded: %d of ~%d%s' % (len(video_comments), expected_video_comment_count, chain_msg))
06167fbb 2053 reply_comment_meta = replies_data[1]['response']['continuationContents']['commentRepliesContinuation']
885d36d4 2054 for reply_meta in reply_comment_meta.get('contents', {}):
06167fbb 2055 reply_comment = reply_meta['commentRenderer']
2056 video_comments.append({
2057 'id': reply_comment['commentId'],
2058 'text': ''.join([c['text'] for c in reply_comment['contentText']['runs']]),
8d0ea5f9 2059 'time_text': ''.join([c['text'] for c in reply_comment['publishedTimeText']['runs']]),
06167fbb 2060 'author': reply_comment.get('authorText', {}).get('simpleText', ''),
2061 'votes': reply_comment.get('voteCount', {}).get('simpleText', '0'),
2062 'author_thumbnail': reply_comment['authorThumbnail']['thumbnails'][-1]['url'],
2063 'parent': comment['commentId']
2064 })
2065 if 'continuations' not in reply_comment_meta or len(reply_comment_meta['continuations']) == 0:
8d0ea5f9 2066 continue
8d0ea5f9 2067 reply_continuations += [rcn['nextContinuationData']['continuation'] for rcn in reply_comment_meta['continuations']]
06167fbb 2068
885d36d4 2069 self.to_screen('Comments downloaded: %d of ~%d' % (len(video_comments), expected_video_comment_count))
06167fbb 2070 if 'continuations' in item_section:
8d0ea5f9 2071 continuations += [ncd['nextContinuationData']['continuation'] for ncd in item_section['continuations']]
06167fbb 2072 time.sleep(1)
2073
885d36d4 2074 self.to_screen('Total comments downloaded: %d of ~%d' % (len(video_comments), expected_video_comment_count))
545cc85d 2075 info.update({
2076 'comments': video_comments,
2077 'comment_count': expected_video_comment_count
2078 })
4ea3be0a 2079
545cc85d 2080 self.mark_watched(video_id, player_response)
d77ab8e2 2081
545cc85d 2082 return info
c5e8d7af 2083
5f6a1245 2084
8bdd16b4 2085class YoutubeTabIE(YoutubeBaseInfoExtractor):
2086 IE_DESC = 'YouTube.com tab'
70d5c17b 2087 _VALID_URL = r'''(?x)
2088 https?://
2089 (?:\w+\.)?
2090 (?:
2091 youtube(?:kids)?\.com|
2092 invidio\.us
2093 )/
2094 (?:
2095 (?:channel|c|user)/|
2096 (?P<not_channel>
3d3dddc9 2097 feed/|
70d5c17b 2098 (?:playlist|watch)\?.*?\blist=
2099 )|
29f7c58a 2100 (?!(?:%s)\b) # Direct URLs
70d5c17b 2101 )
2102 (?P<id>[^/?\#&]+)
2103 ''' % YoutubeBaseInfoExtractor._RESERVED_NAMES
8bdd16b4 2104 IE_NAME = 'youtube:tab'
2105
81127aa5 2106 _TESTS = [{
8bdd16b4 2107 # playlists, multipage
2108 'url': 'https://www.youtube.com/c/ИгорьКлейнер/playlists?view=1&flow=grid',
2109 'playlist_mincount': 94,
2110 'info_dict': {
2111 'id': 'UCqj7Cz7revf5maW9g5pgNcg',
2112 'title': 'Игорь Клейнер - Playlists',
2113 'description': 'md5:be97ee0f14ee314f1f002cf187166ee2',
deaec5af 2114 'uploader': 'Игорь Клейнер',
2115 'uploader_id': 'UCqj7Cz7revf5maW9g5pgNcg',
8bdd16b4 2116 },
2117 }, {
2118 # playlists, multipage, different order
2119 'url': 'https://www.youtube.com/user/igorkle1/playlists?view=1&sort=dd',
2120 'playlist_mincount': 94,
2121 'info_dict': {
2122 'id': 'UCqj7Cz7revf5maW9g5pgNcg',
2123 'title': 'Игорь Клейнер - Playlists',
2124 'description': 'md5:be97ee0f14ee314f1f002cf187166ee2',
deaec5af 2125 'uploader_id': 'UCqj7Cz7revf5maW9g5pgNcg',
2126 'uploader': 'Игорь Клейнер',
8bdd16b4 2127 },
2128 }, {
2129 # playlists, singlepage
2130 'url': 'https://www.youtube.com/user/ThirstForScience/playlists',
2131 'playlist_mincount': 4,
2132 'info_dict': {
2133 'id': 'UCAEtajcuhQ6an9WEzY9LEMQ',
2134 'title': 'ThirstForScience - Playlists',
2135 'description': 'md5:609399d937ea957b0f53cbffb747a14c',
deaec5af 2136 'uploader': 'ThirstForScience',
2137 'uploader_id': 'UCAEtajcuhQ6an9WEzY9LEMQ',
8bdd16b4 2138 }
2139 }, {
2140 'url': 'https://www.youtube.com/c/ChristophLaimer/playlists',
2141 'only_matching': True,
2142 }, {
2143 # basic, single video playlist
0e30a7b9 2144 'url': 'https://www.youtube.com/playlist?list=PL4lCao7KL_QFVb7Iudeipvc2BCavECqzc',
81127aa5 2145 'info_dict': {
0e30a7b9 2146 'uploader_id': 'UCmlqkdCBesrv2Lak1mF_MxA',
2147 'uploader': 'Sergey M.',
2148 'id': 'PL4lCao7KL_QFVb7Iudeipvc2BCavECqzc',
3867038a 2149 'title': 'youtube-dl public playlist',
81127aa5 2150 },
0e30a7b9 2151 'playlist_count': 1,
9291475f 2152 }, {
8bdd16b4 2153 # empty playlist
0e30a7b9 2154 'url': 'https://www.youtube.com/playlist?list=PL4lCao7KL_QFodcLWhDpGCYnngnHtQ-Xf',
9291475f 2155 'info_dict': {
0e30a7b9 2156 'uploader_id': 'UCmlqkdCBesrv2Lak1mF_MxA',
2157 'uploader': 'Sergey M.',
2158 'id': 'PL4lCao7KL_QFodcLWhDpGCYnngnHtQ-Xf',
3867038a 2159 'title': 'youtube-dl empty playlist',
9291475f
PH
2160 },
2161 'playlist_count': 0,
2162 }, {
8bdd16b4 2163 # Home tab
2164 'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w/featured',
9291475f 2165 'info_dict': {
8bdd16b4 2166 'id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
2167 'title': 'lex will - Home',
2168 'description': 'md5:2163c5d0ff54ed5f598d6a7e6211e488',
deaec5af 2169 'uploader': 'lex will',
2170 'uploader_id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
9291475f 2171 },
8bdd16b4 2172 'playlist_mincount': 2,
9291475f 2173 }, {
8bdd16b4 2174 # Videos tab
2175 'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w/videos',
9291475f 2176 'info_dict': {
8bdd16b4 2177 'id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
2178 'title': 'lex will - Videos',
2179 'description': 'md5:2163c5d0ff54ed5f598d6a7e6211e488',
deaec5af 2180 'uploader': 'lex will',
2181 'uploader_id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
9291475f 2182 },
8bdd16b4 2183 'playlist_mincount': 975,
9291475f 2184 }, {
8bdd16b4 2185 # Videos tab, sorted by popular
2186 'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w/videos?view=0&sort=p&flow=grid',
9291475f 2187 'info_dict': {
8bdd16b4 2188 'id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
2189 'title': 'lex will - Videos',
2190 'description': 'md5:2163c5d0ff54ed5f598d6a7e6211e488',
deaec5af 2191 'uploader': 'lex will',
2192 'uploader_id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
9291475f 2193 },
8bdd16b4 2194 'playlist_mincount': 199,
9291475f 2195 }, {
8bdd16b4 2196 # Playlists tab
2197 'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w/playlists',
9291475f 2198 'info_dict': {
8bdd16b4 2199 'id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
2200 'title': 'lex will - Playlists',
2201 'description': 'md5:2163c5d0ff54ed5f598d6a7e6211e488',
deaec5af 2202 'uploader': 'lex will',
2203 'uploader_id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
9291475f 2204 },
8bdd16b4 2205 'playlist_mincount': 17,
ac7553d0 2206 }, {
8bdd16b4 2207 # Community tab
2208 'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w/community',
ac7553d0 2209 'info_dict': {
8bdd16b4 2210 'id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
2211 'title': 'lex will - Community',
2212 'description': 'md5:2163c5d0ff54ed5f598d6a7e6211e488',
deaec5af 2213 'uploader': 'lex will',
2214 'uploader_id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
8bdd16b4 2215 },
2216 'playlist_mincount': 18,
87dadd45 2217 }, {
8bdd16b4 2218 # Channels tab
2219 'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w/channels',
87dadd45 2220 'info_dict': {
8bdd16b4 2221 'id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
2222 'title': 'lex will - Channels',
2223 'description': 'md5:2163c5d0ff54ed5f598d6a7e6211e488',
deaec5af 2224 'uploader': 'lex will',
2225 'uploader_id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
8bdd16b4 2226 },
deaec5af 2227 'playlist_mincount': 12,
6b08cdf6 2228 }, {
a0566bbf 2229 'url': 'https://invidio.us/channel/UCmlqkdCBesrv2Lak1mF_MxA',
8bdd16b4 2230 'only_matching': True,
2231 }, {
a0566bbf 2232 'url': 'https://www.youtubekids.com/channel/UCmlqkdCBesrv2Lak1mF_MxA',
8bdd16b4 2233 'only_matching': True,
2234 }, {
a0566bbf 2235 'url': 'https://music.youtube.com/channel/UCmlqkdCBesrv2Lak1mF_MxA',
8bdd16b4 2236 'only_matching': True,
2237 }, {
2238 'note': 'Playlist with deleted videos (#651). As a bonus, the video #51 is also twice in this list.',
2239 'url': 'https://www.youtube.com/playlist?list=PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC',
2240 'info_dict': {
2241 'title': '29C3: Not my department',
2242 'id': 'PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC',
2243 'uploader': 'Christiaan008',
2244 'uploader_id': 'UCEPzS1rYsrkqzSLNp76nrcg',
deaec5af 2245 'description': 'md5:a14dc1a8ef8307a9807fe136a0660268',
8bdd16b4 2246 },
2247 'playlist_count': 96,
2248 }, {
2249 'note': 'Large playlist',
2250 'url': 'https://www.youtube.com/playlist?list=UUBABnxM4Ar9ten8Mdjj1j0Q',
6b08cdf6 2251 'info_dict': {
8bdd16b4 2252 'title': 'Uploads from Cauchemar',
2253 'id': 'UUBABnxM4Ar9ten8Mdjj1j0Q',
2254 'uploader': 'Cauchemar',
2255 'uploader_id': 'UCBABnxM4Ar9ten8Mdjj1j0Q',
13a75688 2256 },
8bdd16b4 2257 'playlist_mincount': 1123,
2258 }, {
2259 # even larger playlist, 8832 videos
2260 'url': 'http://www.youtube.com/user/NASAgovVideo/videos',
2261 'only_matching': True,
4b7df0d3
JMF
2262 }, {
2263 'note': 'Buggy playlist: the webpage has a "Load more" button but it doesn\'t have more videos',
2264 'url': 'https://www.youtube.com/playlist?list=UUXw-G3eDE9trcvY2sBMM_aA',
2265 'info_dict': {
acf757f4
PH
2266 'title': 'Uploads from Interstellar Movie',
2267 'id': 'UUXw-G3eDE9trcvY2sBMM_aA',
13a75688 2268 'uploader': 'Interstellar Movie',
8bdd16b4 2269 'uploader_id': 'UCXw-G3eDE9trcvY2sBMM_aA',
4b7df0d3 2270 },
481cc733 2271 'playlist_mincount': 21,
8bdd16b4 2272 }, {
2273 # https://github.com/ytdl-org/youtube-dl/issues/21844
2274 'url': 'https://www.youtube.com/playlist?list=PLzH6n4zXuckpfMu_4Ff8E7Z1behQks5ba',
2275 'info_dict': {
2276 'title': 'Data Analysis with Dr Mike Pound',
2277 'id': 'PLzH6n4zXuckpfMu_4Ff8E7Z1behQks5ba',
2278 'uploader_id': 'UC9-y-6csu5WGm29I7JiwpnA',
2279 'uploader': 'Computerphile',
deaec5af 2280 'description': 'md5:7f567c574d13d3f8c0954d9ffee4e487',
8bdd16b4 2281 },
2282 'playlist_mincount': 11,
2283 }, {
a0566bbf 2284 'url': 'https://invidio.us/playlist?list=PL4lCao7KL_QFVb7Iudeipvc2BCavECqzc',
8bdd16b4 2285 'only_matching': True,
dacb3a86
S
2286 }, {
2287 # Playlist URL that does not actually serve a playlist
2288 'url': 'https://www.youtube.com/watch?v=FqZTN594JQw&list=PLMYEtVRpaqY00V9W81Cwmzp6N6vZqfUKD4',
2289 'info_dict': {
2290 'id': 'FqZTN594JQw',
2291 'ext': 'webm',
2292 'title': "Smiley's People 01 detective, Adventure Series, Action",
2293 'uploader': 'STREEM',
2294 'uploader_id': 'UCyPhqAZgwYWZfxElWVbVJng',
ec85ded8 2295 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UCyPhqAZgwYWZfxElWVbVJng',
dacb3a86
S
2296 'upload_date': '20150526',
2297 'license': 'Standard YouTube License',
2298 'description': 'md5:507cdcb5a49ac0da37a920ece610be80',
2299 'categories': ['People & Blogs'],
2300 'tags': list,
dbdaaa23 2301 'view_count': int,
dacb3a86
S
2302 'like_count': int,
2303 'dislike_count': int,
2304 },
2305 'params': {
2306 'skip_download': True,
2307 },
13a75688 2308 'skip': 'This video is not available.',
dacb3a86 2309 'add_ie': [YoutubeIE.ie_key()],
481cc733 2310 }, {
8bdd16b4 2311 'url': 'https://www.youtubekids.com/watch?v=Agk7R8I8o5U&list=PUZ6jURNr1WQZCNHF0ao-c0g',
c0345b82 2312 'only_matching': True,
66b48727 2313 }, {
8bdd16b4 2314 'url': 'https://www.youtube.com/watch?v=MuAGGZNfUkU&list=RDMM',
66b48727 2315 'only_matching': True,
a0566bbf 2316 }, {
2317 'url': 'https://www.youtube.com/channel/UCoMdktPbSTixAyNGwb-UYkQ/live',
2318 'info_dict': {
2319 'id': '9Auq9mYxFEE',
2320 'ext': 'mp4',
deaec5af 2321 'title': compat_str,
a0566bbf 2322 'uploader': 'Sky News',
2323 'uploader_id': 'skynews',
2324 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/skynews',
2325 'upload_date': '20191102',
deaec5af 2326 'description': 'md5:85ddd75d888674631aaf9599a9a0b0ae',
a0566bbf 2327 'categories': ['News & Politics'],
2328 'tags': list,
2329 'like_count': int,
2330 'dislike_count': int,
2331 },
2332 'params': {
2333 'skip_download': True,
2334 },
2335 }, {
2336 'url': 'https://www.youtube.com/user/TheYoungTurks/live',
2337 'info_dict': {
2338 'id': 'a48o2S1cPoo',
2339 'ext': 'mp4',
2340 'title': 'The Young Turks - Live Main Show',
2341 'uploader': 'The Young Turks',
2342 'uploader_id': 'TheYoungTurks',
2343 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/TheYoungTurks',
2344 'upload_date': '20150715',
2345 'license': 'Standard YouTube License',
2346 'description': 'md5:438179573adcdff3c97ebb1ee632b891',
2347 'categories': ['News & Politics'],
2348 'tags': ['Cenk Uygur (TV Program Creator)', 'The Young Turks (Award-Winning Work)', 'Talk Show (TV Genre)'],
2349 'like_count': int,
2350 'dislike_count': int,
2351 },
2352 'params': {
2353 'skip_download': True,
2354 },
2355 'only_matching': True,
2356 }, {
2357 'url': 'https://www.youtube.com/channel/UC1yBKRuGpC1tSM73A0ZjYjQ/live',
2358 'only_matching': True,
2359 }, {
2360 'url': 'https://www.youtube.com/c/CommanderVideoHq/live',
2361 'only_matching': True,
3d3dddc9 2362 }, {
2363 'url': 'https://www.youtube.com/feed/trending',
2364 'only_matching': True,
2365 }, {
2366 # needs auth
2367 'url': 'https://www.youtube.com/feed/library',
2368 'only_matching': True,
2369 }, {
2370 # needs auth
2371 'url': 'https://www.youtube.com/feed/history',
2372 'only_matching': True,
2373 }, {
2374 # needs auth
2375 'url': 'https://www.youtube.com/feed/subscriptions',
2376 'only_matching': True,
2377 }, {
2378 # needs auth
2379 'url': 'https://www.youtube.com/feed/watch_later',
2380 'only_matching': True,
2381 }, {
2382 # no longer available?
2383 'url': 'https://www.youtube.com/feed/recommended',
2384 'only_matching': True,
29f7c58a 2385 }, {
2386 # inline playlist with not always working continuations
2387 'url': 'https://www.youtube.com/watch?v=UC6u0Tct-Fo&list=PL36D642111D65BE7C',
2388 'only_matching': True,
2389 }, {
2390 'url': 'https://www.youtube.com/course?list=ECUl4u3cNGP61MdtwGTqZA0MreSaDybji8',
2391 'only_matching': True,
2392 }, {
2393 'url': 'https://www.youtube.com/course',
2394 'only_matching': True,
2395 }, {
2396 'url': 'https://www.youtube.com/zsecurity',
2397 'only_matching': True,
2398 }, {
2399 'url': 'http://www.youtube.com/NASAgovVideo/videos',
2400 'only_matching': True,
2401 }, {
2402 'url': 'https://www.youtube.com/TheYoungTurks/live',
2403 'only_matching': True,
2404 }]
2405
@classmethod
def suitable(cls, url):
    """Tell whether this extractor should handle *url*.

    Any URL that YoutubeIE accepts is rejected here; every other URL is
    decided by the inherited ``suitable`` implementation.
    """
    if YoutubeIE.suitable(url):
        return False
    return super(YoutubeTabIE, cls).suitable(url)
8bdd16b4 2410
2411 def _extract_channel_id(self, webpage):
2412 channel_id = self._html_search_meta(
2413 'channelId', webpage, 'channel id', default=None)
2414 if channel_id:
2415 return channel_id
2416 channel_url = self._html_search_meta(
2417 ('og:url', 'al:ios:url', 'al:android:url', 'al:web:url',
2418 'twitter:url', 'twitter:app:url:iphone', 'twitter:app:url:ipad',
2419 'twitter:app:url:googleplay'), webpage, 'channel url')
2420 return self._search_regex(
2421 r'https?://(?:www\.)?youtube\.com/channel/([^/?#&])+',
2422 channel_url, 'channel id')
15f6397c 2423
8bdd16b4 2424 @staticmethod
2425 def _extract_grid_item_renderer(item):
2426 for item_kind in ('Playlist', 'Video', 'Channel'):
2427 renderer = item.get('grid%sRenderer' % item_kind)
2428 if renderer:
2429 return renderer
2430
def _grid_entries(self, grid_renderer):
    """Yield one result per playlist, video or channel in a grid renderer.

    Items that are not dicts, or that carry no recognised grid renderer,
    are skipped.  A single item may produce several results when it
    exposes more than one of ``playlistId``, ``videoId`` and ``channelId``.
    """
    for grid_item in grid_renderer['items']:
        if not isinstance(grid_item, dict):
            continue
        details = self._extract_grid_item_renderer(grid_item)
        if not isinstance(details, dict):
            continue
        runs_title = try_get(
            details, lambda x: x['title']['runs'][0]['text'], compat_str)

        # Playlist: delegate to the tab extractor via its playlist URL.
        list_id = details.get('playlistId')
        if list_id:
            yield self.url_result(
                'https://www.youtube.com/playlist?list=%s' % list_id,
                ie=YoutubeTabIE.ie_key(), video_id=list_id,
                video_title=runs_title)

        # Video: build the entry directly from the renderer.
        if details.get('videoId'):
            yield self._extract_video(details)

        # Channel: its title lives under ``simpleText`` rather than ``runs``.
        owner_id = details.get('channelId')
        if owner_id:
            simple_title = try_get(
                details, lambda x: x['title']['simpleText'], compat_str)
            yield self.url_result(
                'https://www.youtube.com/channel/%s' % owner_id,
                ie=YoutubeTabIE.ie_key(), video_title=simple_title)
2459
3d3dddc9 2460 def _shelf_entries_from_content(self, shelf_renderer):
2461 content = shelf_renderer.get('content')
2462 if not isinstance(content, dict):
8bdd16b4 2463 return
3d3dddc9 2464 renderer = content.get('gridRenderer')
2465 if renderer:
2466 # TODO: add support for nested playlists so each shelf is processed
2467 # as separate playlist
2468 # TODO: this includes only first N items
2469 for entry in self._grid_entries(renderer):
2470 yield entry
2471 renderer = content.get('horizontalListRenderer')
2472 if renderer:
2473 # TODO
2474 pass
8bdd16b4 2475
def _shelf_entries(self, shelf_renderer, skip_channels=False):
    """Yield results for a shelf: its own URL first, then inline content.

    When the shelf links to a ``/channels?`` page and *skip_channels* is
    true, nothing at all is yielded for this shelf.
    """
    shelf_url = urljoin(
        'https://www.youtube.com',
        try_get(
            shelf_renderer,
            lambda x: x['endpoint']['commandMetadata']['webCommandMetadata']['url'],
            compat_str))
    if shelf_url:
        # Skipping links to other channels. Note that checking for
        # endpoint.commandMetadata.webCommandMetadata.webPageType == WEB_PAGE_TYPE_CHANNEL
        # will not work
        links_to_channels = '/channels?' in shelf_url
        if skip_channels and links_to_channels:
            return
        shelf_title = try_get(
            shelf_renderer, lambda x: x['title']['runs'][0]['text'], compat_str)
        yield self.url_result(shelf_url, video_title=shelf_title)
    # Shelf may not contain shelf URL, fallback to extraction from content
    for shelf_entry in self._shelf_entries_from_content(shelf_renderer):
        yield shelf_entry
c5e8d7af 2493
8bdd16b4 2494 def _playlist_entries(self, video_list_renderer):
2495 for content in video_list_renderer['contents']:
2496 if not isinstance(content, dict):
2497 continue
2498 renderer = content.get('playlistVideoRenderer') or content.get('playlistPanelVideoRenderer')
2499 if not isinstance(renderer, dict):
2500 continue
2501 video_id = renderer.get('videoId')
2502 if not video_id:
2503 continue
2504 yield self._extract_video(renderer)
07aeced6 2505
3d3dddc9 2506 r""" # Not needed in the new implementation
3462ffa8 2507 def _itemSection_entries(self, item_sect_renderer):
2508 for content in item_sect_renderer['contents']:
2509 if not isinstance(content, dict):
2510 continue
2511 renderer = content.get('videoRenderer', {})
2512 if not isinstance(renderer, dict):
2513 continue
2514 video_id = renderer.get('videoId')
2515 if not video_id:
2516 continue
2517 yield self._extract_video(renderer)
3d3dddc9 2518 """
3462ffa8 2519
def _rich_entries(self, rich_grid_renderer):
    """Yield the video entry wrapped in a rich item renderer, if there is one.

    Nothing is yielded when ``content.videoRenderer`` is absent, is not a
    dict, or has no ``videoId``.
    """
    video_renderer = try_get(
        rich_grid_renderer, lambda x: x['content']['videoRenderer'], dict) or {}
    if video_renderer.get('videoId'):
        yield self._extract_video(video_renderer)
2527
8bdd16b4 2528 def _video_entry(self, video_renderer):
2529 video_id = video_renderer.get('videoId')
2530 if video_id:
2531 return self._extract_video(video_renderer)
dacb3a86 2532
def _post_thread_entries(self, post_thread_renderer):
    """Yield the videos referenced by a community post.

    The attached video (``backstageAttachment.videoRenderer``) is yielded
    first, followed by every link in the post text that YoutubeIE can
    handle.  A link pointing at the already-yielded attachment is skipped
    so the same video is not emitted twice.
    """
    post_renderer = try_get(
        post_thread_renderer, lambda x: x['post']['backstagePostRenderer'], dict)
    if not post_renderer:
        return
    # video attachment
    video_renderer = try_get(
        post_renderer, lambda x: x['backstageAttachment']['videoRenderer'], dict)
    # Id of the attachment that was actually yielded.  It was previously
    # left at None, which disabled the duplicate check below.
    attached_video_id = None
    if video_renderer:
        entry = self._video_entry(video_renderer)
        if entry:
            attached_video_id = video_renderer.get('videoId')
            yield entry
    # inline video links
    runs = try_get(post_renderer, lambda x: x['contentText']['runs'], list) or []
    for run in runs:
        if not isinstance(run, dict):
            continue
        ep_url = try_get(
            run, lambda x: x['navigationEndpoint']['urlEndpoint']['url'], compat_str)
        if not ep_url:
            continue
        if not YoutubeIE.suitable(ep_url):
            continue
        ep_video_id = YoutubeIE._match_id(ep_url)
        if attached_video_id == ep_video_id:
            continue
        # Tag the result with the id of the linked video itself rather than
        # with the (possibly missing) attachment id.
        yield self.url_result(ep_url, ie=YoutubeIE.ie_key(), video_id=ep_video_id)
dacb3a86 2561
8bdd16b4 2562 def _post_thread_continuation_entries(self, post_thread_continuation):
2563 contents = post_thread_continuation.get('contents')
2564 if not isinstance(contents, list):
2565 return
2566 for content in contents:
2567 renderer = content.get('backstagePostThreadRenderer')
2568 if not isinstance(renderer, dict):
2569 continue
2570 for entry in self._post_thread_entries(renderer):
2571 yield entry
07aeced6 2572
29f7c58a 2573 @staticmethod
2574 def _build_continuation_query(continuation, ctp=None):
2575 query = {
2576 'ctoken': continuation,
2577 'continuation': continuation,
2578 }
2579 if ctp:
2580 query['itct'] = ctp
2581 return query
2582
@staticmethod
def _extract_next_continuation_data(renderer):
    """Return the continuation query described by
    ``continuations[0].nextContinuationData`` of *renderer*, or ``None``
    when that structure or its ``continuation`` token is missing."""
    next_data = try_get(
        renderer, lambda x: x['continuations'][0]['nextContinuationData'], dict)
    token = next_data.get('continuation') if next_data else None
    if not token:
        return None
    return YoutubeTabIE._build_continuation_query(
        token, next_data.get('clickTrackingParams'))
c5e8d7af 2594
@classmethod
def _extract_continuation(cls, renderer):
    """Return the continuation query for *renderer*, or ``None``.

    The ``nextContinuationData`` form is tried first.  Otherwise the
    renderer's ``contents`` (or ``items``) list is scanned for the first
    ``continuationItemRenderer`` that carries a continuation token.
    """
    query = cls._extract_next_continuation_data(renderer)
    if query:
        return query
    children = renderer.get('contents') or renderer.get('items')
    if not isinstance(children, list):
        return None
    for child in children:
        if not isinstance(child, dict):
            continue
        endpoint = try_get(
            child, lambda x: x['continuationItemRenderer']['continuationEndpoint'],
            dict)
        if not endpoint:
            continue
        token = try_get(
            endpoint, lambda x: x['continuationCommand']['token'], compat_str)
        if token:
            return YoutubeTabIE._build_continuation_query(
                token, endpoint.get('clickTrackingParams'))
    return None
448830ce 2617
def _entries(self, tab, identity_token):
    """Yield every entry of *tab*, following continuations page by page.

    The entries embedded in the tab's initial content are yielded first.
    While a continuation query is available, further pages are fetched
    from ``browse_ajax`` and their entries yielded.  *identity_token*, when
    truthy, is sent as the ``x-youtube-identity-token`` header.
    """

    def extract_entries(parent_renderer):  # this needs to be called again for continuation to work with feeds
        # Walks parent_renderer['contents'] and records the continuation it
        # finds in continuation_list[0], a one-element list shared with the
        # enclosing function.
        contents = try_get(parent_renderer, lambda x: x['contents'], list) or []
        for content in contents:
            if not isinstance(content, dict):
                continue
            is_renderer = try_get(content, lambda x: x['itemSectionRenderer'], dict)
            if not is_renderer:
                # Not an item section: the only other shape handled here is
                # a rich item.
                renderer = content.get('richItemRenderer')
                if renderer:
                    for entry in self._rich_entries(renderer):
                        yield entry
                    continuation_list[0] = self._extract_continuation(parent_renderer)
                continue
            isr_contents = try_get(is_renderer, lambda x: x['contents'], list) or []
            for isr_content in isr_contents:
                if not isinstance(isr_content, dict):
                    continue

                # Maps a renderer key to the callable producing its entries.
                known_renderers = {
                    'playlistVideoListRenderer': self._playlist_entries,
                    'gridRenderer': self._grid_entries,
                    'shelfRenderer': lambda x: self._shelf_entries(x, tab.get('title') != 'Channels'),
                    'backstagePostThreadRenderer': self._post_thread_entries,
                    'videoRenderer': lambda x: [self._video_entry(x)],
                }
                # Only the first recognised key of each element is processed.
                for key, renderer in isr_content.items():
                    if key not in known_renderers:
                        continue
                    for entry in known_renderers[key](renderer):
                        if entry:
                            yield entry
                    continuation_list[0] = self._extract_continuation(renderer)
                    break

            # Fall back to a continuation attached to the item section itself.
            if not continuation_list[0]:
                continuation_list[0] = self._extract_continuation(is_renderer)

        # Last resort: a continuation attached to the parent renderer.
        if not continuation_list[0]:
            continuation_list[0] = self._extract_continuation(parent_renderer)

    continuation_list = [None]  # Python 2 does not support nonlocal
    tab_content = try_get(tab, lambda x: x['content'], dict)
    if not tab_content:
        return
    parent_renderer = (
        try_get(tab_content, lambda x: x['sectionListRenderer'], dict)
        or try_get(tab_content, lambda x: x['richGridRenderer'], dict) or {})
    for entry in extract_entries(parent_renderer):
        yield entry
    continuation = continuation_list[0]

    headers = {
        'x-youtube-client-name': '1',
        'x-youtube-client-version': '2.20201112.04.01',
    }
    if identity_token:
        headers['x-youtube-identity-token'] = identity_token

    for page_num in itertools.count(1):
        if not continuation:
            break
        count = 0
        retries = 3
        while count <= retries:
            try:
                # Downloading page may result in intermittent 5xx HTTP error
                # that is usually worked around with a retry
                browse = self._download_json(
                    'https://www.youtube.com/browse_ajax', None,
                    'Downloading page %d%s'
                    % (page_num, ' (retry #%d)' % count if count else ''),
                    headers=headers, query=continuation)
                break
            except ExtractorError as e:
                # Only HTTP 500/503 are retried; anything else, or running
                # out of retries, re-raises the error.
                if isinstance(e.cause, compat_HTTPError) and e.cause.code in (500, 503):
                    count += 1
                    if count <= retries:
                        continue
                raise
        if not browse:
            break
        response = try_get(browse, lambda x: x[1]['response'], dict)
        if not response:
            break

        # First response shape: a 'continuationContents' mapping.
        known_continuation_renderers = {
            'playlistVideoListContinuation': self._playlist_entries,
            'gridContinuation': self._grid_entries,
            'itemSectionContinuation': self._post_thread_continuation_entries,
            'sectionListContinuation': extract_entries,  # for feeds
        }
        continuation_contents = try_get(
            response, lambda x: x['continuationContents'], dict) or {}
        continuation_renderer = None
        for key, value in continuation_contents.items():
            if key not in known_continuation_renderers:
                continue
            continuation_renderer = value
            # Reset the shared cell so extract_entries (if it is the handler)
            # reports the continuation of this page only.
            continuation_list = [None]
            for entry in known_continuation_renderers[key](continuation_renderer):
                yield entry
            continuation = continuation_list[0] or self._extract_continuation(continuation_renderer)
            break
        if continuation_renderer:
            continue

        # Second response shape: 'appendContinuationItemsAction' items.  Each
        # handler is paired with the key under which it expects the item list.
        known_renderers = {
            'gridPlaylistRenderer': (self._grid_entries, 'items'),
            'gridVideoRenderer': (self._grid_entries, 'items'),
            'playlistVideoRenderer': (self._playlist_entries, 'contents'),
            'itemSectionRenderer': (self._playlist_entries, 'contents'),
        }
        continuation_items = try_get(
            response, lambda x: x['onResponseReceivedActions'][0]['appendContinuationItemsAction']['continuationItems'], list)
        continuation_item = try_get(continuation_items, lambda x: x[0], dict) or {}
        video_items_renderer = None
        for key, value in continuation_item.items():
            if key not in known_renderers:
                continue
            # Wrap the raw item list so it looks like a regular renderer.
            video_items_renderer = {known_renderers[key][1]: continuation_items}
            for entry in known_renderers[key][0](video_items_renderer):
                yield entry
            continuation = self._extract_continuation(video_items_renderer)
            break
        if video_items_renderer:
            continue
        # Neither response shape was recognised: stop paging.
        break
9558dcec 2747
@staticmethod
def _extract_selected_tab(tabs):
    """Return the ``tabRenderer`` of the first tab flagged as selected.

    Raises ExtractorError when no tab in *tabs* is selected.
    """
    for candidate in tabs:
        is_selected = try_get(
            candidate, lambda x: x['tabRenderer']['selected'], bool)
        if is_selected:
            return candidate['tabRenderer']
    raise ExtractorError('Unable to find selected tab')
b82f815f 2755
@staticmethod
def _extract_uploader(data):
    """Collect uploader fields from the playlist sidebar of *data*.

    Returns a dict containing whichever of ``uploader``, ``uploader_id``
    and ``uploader_url`` could be determined; keys whose value would be
    ``None`` are left out.
    """
    info = {}
    sidebar_items = try_get(
        data, lambda x: x['sidebar']['playlistSidebarRenderer']['items'], list) or []
    for sidebar_item in sidebar_items:
        if not isinstance(sidebar_item, dict):
            continue
        secondary = sidebar_item.get('playlistSidebarSecondaryInfoRenderer')
        if not isinstance(secondary, dict):
            continue
        owner = try_get(
            secondary, lambda x: x['videoOwner']['videoOwnerRenderer']['title']['runs'][0], dict)
        if not owner:
            continue
        base_url = try_get(
            owner, lambda x: x['navigationEndpoint']['browseEndpoint']['canonicalBaseUrl'], compat_str)
        info['uploader'] = owner.get('text')
        info['uploader_id'] = try_get(
            owner, lambda x: x['navigationEndpoint']['browseEndpoint']['browseId'], compat_str)
        info['uploader_url'] = urljoin('https://www.youtube.com/', base_url)
    return dict((k, v) for k, v in info.items() if v is not None)
8bdd16b4 2778
def _extract_from_tabs(self, item_id, webpage, data, tabs, identity_token):
    """Build the playlist result for a tabbed (channel or playlist) page.

    Metadata is read from ``metadata.channelMetadataRenderer`` and, when
    that is absent, from ``metadata.playlistMetadataRenderer``.  Entries
    come from the selected tab.  *webpage* is not used by this method.
    """
    playlist_id = title = description = channel_url = channel_name = channel_id = None
    # Both names start out bound to the same empty list; each is only ever
    # rebound below, never mutated.
    thumbnails_list = tags = []

    selected_tab = self._extract_selected_tab(tabs)
    renderer = try_get(
        data, lambda x: x['metadata']['channelMetadataRenderer'], dict)
    if renderer:
        channel_name = renderer.get('title')
        channel_url = renderer.get('channelUrl')
        channel_id = renderer.get('externalId')

    if not renderer:
        renderer = try_get(
            data, lambda x: x['metadata']['playlistMetadataRenderer'], dict)
    if renderer:
        title = renderer.get('title')
        description = renderer.get('description', '')
        # None unless channel metadata was found above.
        playlist_id = channel_id
        tags = renderer.get('keywords', '').split()
        # Prefer the renderer's avatar; otherwise use the playlist sidebar
        # thumbnail.
        thumbnails_list = (
            try_get(renderer, lambda x: x['avatar']['thumbnails'], list)
            or try_get(
                data,
                lambda x: x['sidebar']['playlistSidebarRenderer']['items'][0]['playlistSidebarPrimaryInfoRenderer']['thumbnailRenderer']['playlistVideoThumbnailRenderer']['thumbnail']['thumbnails'],
                list)
            or [])

    # Keep only thumbnails that are dicts with a valid URL.
    thumbnails = []
    for t in thumbnails_list:
        if not isinstance(t, dict):
            continue
        thumbnail_url = url_or_none(t.get('url'))
        if not thumbnail_url:
            continue
        thumbnails.append({
            'url': thumbnail_url,
            'width': int_or_none(t.get('width')),
            'height': int_or_none(t.get('height')),
        })

    if playlist_id is None:
        playlist_id = item_id
    if title is None:
        title = playlist_id
    # presumably appends ' - <tab title>' when the selected tab has a title;
    # format_field is defined elsewhere - confirm
    title += format_field(selected_tab, 'title', ' - %s')

    metadata = {
        'playlist_id': playlist_id,
        'playlist_title': title,
        'playlist_description': description,
        'uploader': channel_name,
        'uploader_id': channel_id,
        'uploader_url': channel_url,
        'thumbnails': thumbnails,
        'tags': tags,
    }
    # Without a channel id, fall back to the uploader info in the sidebar.
    if not channel_id:
        metadata.update(self._extract_uploader(data))
    # Mirror the uploader fields under the channel_* keys.
    metadata.update({
        'channel': metadata['uploader'],
        'channel_id': metadata['uploader_id'],
        'channel_url': metadata['uploader_url']})
    return self.playlist_result(
        self._entries(selected_tab, identity_token),
        **metadata)
73c4ac2c 2845
def _extract_from_playlist(self, item_id, url, data, playlist):
    """Return a result for a playlist rendered inline on a watch page.

    When the playlist advertises its own URL and that URL differs from
    *url*, extraction is delegated to that URL; otherwise the inline
    items are returned as a playlist.
    """
    playlist_title = playlist.get('title') or try_get(
        data, lambda x: x['titleText']['simpleText'], compat_str)
    list_id = playlist.get('playlistId') or item_id
    # Inline playlist rendition continuation does not always work
    # at Youtube side, so delegating regular tab-based playlist URL
    # processing whenever possible.
    relative_url = try_get(
        playlist, lambda x: x['endpoint']['commandMetadata']['webCommandMetadata']['url'],
        compat_str)
    dedicated_url = urljoin(url, relative_url)
    if not dedicated_url or dedicated_url == url:
        return self.playlist_result(
            self._playlist_entries(playlist), playlist_id=list_id,
            playlist_title=playlist_title)
    return self.url_result(
        dedicated_url, ie=YoutubeTabIE.ie_key(), video_id=list_id,
        video_title=playlist_title)
c5e8d7af 2863
@staticmethod
def _extract_alerts(data):
    """Yield ``(alert_type, message)`` pairs for the alerts in *data*.

    An alert without a ``type`` is ignored.  For the others, the
    ``text.simpleText`` message (if any) is yielded first, followed by the
    text of each ``text.runs`` element.
    """
    alert_dicts = try_get(data, lambda x: x['alerts'], list) or []
    for alert_dict in alert_dicts:
        if not isinstance(alert_dict, dict):
            continue
        for alert in alert_dict.values():
            alert_type = alert.get('type')
            if not alert_type:
                continue
            simple_text = try_get(alert, lambda x: x['text']['simpleText'], compat_str)
            if simple_text:
                yield alert_type, simple_text
            text_runs = try_get(alert, lambda x: x['text']['runs'], list) or []
            for text_run in text_runs:
                run_text = try_get(text_run, lambda x: x['text'], compat_str)
                if run_text:
                    yield alert_type, run_text
2881
def _extract_identity_token(self, webpage, item_id):
    """Return the ``ID_TOKEN`` value for *webpage*, or ``None``.

    The value from the extracted ytcfg is preferred; otherwise the raw
    page is searched with a regular expression.
    """
    ytcfg = self._extract_ytcfg(item_id, webpage)
    cfg_token = None
    if ytcfg:
        cfg_token = try_get(ytcfg, lambda x: x['ID_TOKEN'], compat_str)
    if cfg_token:
        return cfg_token
    return self._search_regex(
        r'\bID_TOKEN["\']\s*:\s*["\'](.+?)["\']', webpage,
        'identity token', default=None)
2891
def _real_extract(self, url):
    """Extract a tab page, a playlist, or fall back to a single video.

    Raises ExtractorError when YouTube reports an error alert or when the
    page cannot be recognised as a tab page, playlist or video.
    """
    item_id = self._match_id(url)
    # Force the host to www.youtube.com whatever host was given.
    url = compat_urlparse.urlunparse(
        compat_urlparse.urlparse(url)._replace(netloc='www.youtube.com'))
    # Matches when nothing but an optional '/', query or fragment follows
    # the part of the URL covered by _VALID_URL.
    is_home = re.match(r'(?P<pre>%s)(?P<post>/?(?![^#?]).*$)' % self._VALID_URL, url)
    # NOTE(review): 'not_channel' is a group of _VALID_URL, which is defined
    # outside this block - confirm its meaning there.
    if is_home is not None and is_home.group('not_channel') is None and item_id != 'feed':
        self._downloader.report_warning(
            'A channel/user page was given. All the channel\'s videos will be downloaded. '
            'To download only the videos in the home page, add a "/featured" to the URL')
        url = '%s/videos%s' % (is_home.group('pre'), is_home.group('post') or '')

    # Handle both video/playlist URLs
    qs = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query)
    video_id = qs.get('v', [None])[0]
    playlist_id = qs.get('list', [None])[0]

    # A watch URL without a video id is only usable through its playlist id.
    if is_home is not None and is_home.group('not_channel') is not None and is_home.group('not_channel').startswith('watch') and not video_id:
        if playlist_id:
            self._downloader.report_warning('%s is not a valid Youtube URL. Trying to download playlist %s' % (url, playlist_id))
            url = 'https://www.youtube.com/playlist?list=%s' % playlist_id
            # return self.url_result(playlist_id, ie=YoutubePlaylistIE.ie_key())
        else:
            raise ExtractorError('Unable to recognize tab page')
    if video_id and playlist_id:
        if self._downloader.params.get('noplaylist'):
            self.to_screen('Downloading just video %s because of --no-playlist' % video_id)
            return self.url_result(video_id, ie=YoutubeIE.ie_key(), video_id=video_id)
        self.to_screen('Downloading playlist %s - add --no-playlist to just download video %s' % (playlist_id, video_id))

    webpage = self._download_webpage(url, item_id)
    identity_token = self._extract_identity_token(webpage, item_id)
    data = self._extract_yt_initial_data(item_id, webpage)
    # Only the last error alert is raised; earlier error alerts and all
    # non-error alerts are reported as warnings.
    err_msg = None
    for alert_type, alert_message in self._extract_alerts(data):
        if alert_type.lower() == 'error':
            if err_msg:
                self._downloader.report_warning('YouTube said: %s - %s' % ('ERROR', err_msg))
            err_msg = alert_message
        else:
            self._downloader.report_warning('YouTube said: %s - %s' % (alert_type, alert_message))
    if err_msg:
        raise ExtractorError('YouTube said: %s' % err_msg, expected=True)
    tabs = try_get(
        data, lambda x: x['contents']['twoColumnBrowseResultsRenderer']['tabs'], list)
    if tabs:
        return self._extract_from_tabs(item_id, webpage, data, tabs, identity_token)
    playlist = try_get(
        data, lambda x: x['contents']['twoColumnWatchNextResults']['playlist']['playlist'], dict)
    if playlist:
        return self._extract_from_playlist(item_id, url, data, playlist)
    # Fallback to video extraction if no playlist alike page is recognized.
    # First check for the current video then try the v attribute of URL query.
    video_id = try_get(
        data, lambda x: x['currentVideoEndpoint']['watchEndpoint']['videoId'],
        compat_str) or video_id
    if video_id:
        return self.url_result(video_id, ie=YoutubeIE.ie_key(), video_id=video_id)
    # Failed to recognize
    raise ExtractorError('Unable to recognize tab page')
c5e8d7af 2951
c5e8d7af 2952
class YoutubePlaylistIE(InfoExtractor):
    """Resolve bare playlist ids and ``list=`` URLs to a playlist page URL
    that is handed over to YoutubeTabIE."""

    IE_NAME = 'youtube:playlist'
    IE_DESC = 'YouTube.com playlists'
    _VALID_URL = r'''(?x)(?:
                        (?:https?://)?
                        (?:\w+\.)?
                        (?:
                            (?:
                                youtube(?:kids)?\.com|
                                invidio\.us
                            )
                            /.*?\?.*?\blist=
                        )?
                        (?P<id>%(playlist_id)s)
                     )''' % {'playlist_id': YoutubeBaseInfoExtractor._PLAYLIST_ID_RE}
    _TESTS = [{
        'note': 'issue #673',
        'url': 'PLBB231211A4F62143',
        'info_dict': {
            'title': '[OLD]Team Fortress 2 (Class-based LP)',
            'id': 'PLBB231211A4F62143',
            'uploader': 'Wickydoo',
            'uploader_id': 'UCKSpbfbl5kRQpTdL7kMc-1Q',
        },
        'playlist_mincount': 29,
    }, {
        'url': 'PLtPgu7CB4gbY9oDN3drwC3cMbJggS7dKl',
        'info_dict': {
            'title': 'YDL_safe_search',
            'id': 'PLtPgu7CB4gbY9oDN3drwC3cMbJggS7dKl',
        },
        'playlist_count': 2,
        'skip': 'This playlist is private',
    }, {
        'note': 'embedded',
        'url': 'https://www.youtube.com/embed/videoseries?list=PL6IaIsEjSbf96XFRuNccS_RuEXwNdsoEu',
        'playlist_count': 4,
        'info_dict': {
            'title': 'JODA15',
            'id': 'PL6IaIsEjSbf96XFRuNccS_RuEXwNdsoEu',
            'uploader': 'milan',
            'uploader_id': 'UCEI1-PVPcYXjB73Hfelbmaw',
        }
    }, {
        'url': 'http://www.youtube.com/embed/_xDOZElKyNU?list=PLsyOSbh5bs16vubvKePAQ1x3PhKavfBIl',
        'playlist_mincount': 982,
        'info_dict': {
            'title': '2018 Chinese New Singles (11/6 updated)',
            'id': 'PLsyOSbh5bs16vubvKePAQ1x3PhKavfBIl',
            'uploader': 'LBK',
            'uploader_id': 'UC21nz3_MesPLqtDqwdvnoxA',
        }
    }, {
        'url': 'TLGGrESM50VT6acwMjAyMjAxNw',
        'only_matching': True,
    }, {
        # music album playlist
        'url': 'OLAK5uy_m4xAFdmMC5rX3Ji3g93pQe3hqLZw_9LhM',
        'only_matching': True,
    }]

    @classmethod
    def suitable(cls, url):
        """Reject any URL YoutubeTabIE accepts; otherwise defer to the
        inherited check."""
        if YoutubeTabIE.suitable(url):
            return False
        return super(YoutubePlaylistIE, cls).suitable(url)

    def _real_extract(self, url):
        """Redirect to ``/playlist`` with the original query string, or
        with ``list=<id>`` when the input had no query string."""
        playlist_id = self._match_id(url)
        query = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query)
        if not query:
            query = {'list': playlist_id}
        target_url = update_url_query('https://www.youtube.com/playlist', query)
        return self.url_result(
            target_url, ie=YoutubeTabIE.ie_key(), video_id=playlist_id)
3027
3028
class YoutubeYtBeIE(InfoExtractor):
    """Turn ``youtu.be/<video>?list=<playlist>`` short links into regular
    watch URLs that are handed over to YoutubeTabIE."""

    IE_DESC = 'youtu.be'
    _VALID_URL = r'https?://youtu\.be/(?P<id>[0-9A-Za-z_-]{11})/*?.*?\blist=(?P<playlist_id>%(playlist_id)s)' % {'playlist_id': YoutubeBaseInfoExtractor._PLAYLIST_ID_RE}
    _TESTS = [{
        'url': 'https://youtu.be/yeWKywCrFtk?list=PL2qgrgXsNUG5ig9cat4ohreBjYLAPC0J5',
        'info_dict': {
            'id': 'yeWKywCrFtk',
            'ext': 'mp4',
            'title': 'Small Scale Baler and Braiding Rugs',
            'uploader': 'Backus-Page House Museum',
            'uploader_id': 'backuspagemuseum',
            'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/backuspagemuseum',
            'upload_date': '20161008',
            'description': 'md5:800c0c78d5eb128500bffd4f0b4f2e8a',
            'categories': ['Nonprofits & Activism'],
            'tags': list,
            'like_count': int,
            'dislike_count': int,
        },
        'params': {
            'noplaylist': True,
            'skip_download': True,
        },
    }, {
        'url': 'https://youtu.be/uWyaPkt-VOI?list=PL9D9FC436B881BA21',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Rebuild the link as a ``/watch`` URL carrying both the video id
        and the playlist id."""
        video_id, playlist_id = re.match(
            self._VALID_URL, url).group('id', 'playlist_id')
        watch_query = {
            'v': video_id,
            'list': playlist_id,
            'feature': 'youtu.be',
        }
        watch_url = update_url_query(
            'https://www.youtube.com/watch', watch_query)
        return self.url_result(
            watch_url, ie=YoutubeTabIE.ie_key(), video_id=playlist_id)
8bdd16b4 3067
3068
class YoutubeYtUserIE(InfoExtractor):
    """Expand the ``ytuser:<name>`` shorthand to the user's page URL."""

    IE_DESC = 'YouTube.com user videos, URL or "ytuser" keyword'
    _VALID_URL = r'ytuser:(?P<id>.+)'
    _TESTS = [{
        'url': 'ytuser:phihag',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Hand the user's page URL over to YoutubeTabIE."""
        user_id = self._match_id(url)
        user_url = 'https://www.youtube.com/user/%s' % user_id
        return self.url_result(
            user_url, ie=YoutubeTabIE.ie_key(), video_id=user_id)
9558dcec 3082
b05654f0 3083
class YoutubeFavouritesIE(YoutubeBaseInfoExtractor):
    """Map the ``:ytfav`` shorthand (and its spelling variants) to the
    ``LL`` playlist URL."""

    IE_NAME = 'youtube:favorites'
    IE_DESC = 'YouTube.com liked videos, ":ytfav" for short (requires authentication)'
    _VALID_URL = r':ytfav(?:ou?rite)?s?'
    _LOGIN_REQUIRED = True
    _TESTS = [{
        'url': ':ytfav',
        'only_matching': True,
    }, {
        'url': ':ytfavorites',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Hand the ``LL`` playlist URL over to YoutubeTabIE."""
        liked_playlist_url = 'https://www.youtube.com/playlist?list=LL'
        return self.url_result(liked_playlist_url, ie=YoutubeTabIE.ie_key())
3101
3102
8bdd16b4 3103class YoutubeSearchIE(SearchInfoExtractor, YoutubeBaseInfoExtractor):
69184e41 3104 IE_DESC = 'YouTube.com searches, "ytsearch" keyword'
b4c08069
JMF
3105 # there doesn't appear to be a real limit, for example if you search for
3106 # 'python' you get more than 8.000.000 results
3107 _MAX_RESULTS = float('inf')
78caa52a 3108 IE_NAME = 'youtube:search'
b05654f0 3109 _SEARCH_KEY = 'ytsearch'
6c894ea1 3110 _SEARCH_PARAMS = None
9dd8e46a 3111 _TESTS = []
b05654f0 3112
6c894ea1
U
3113 def _entries(self, query, n):
3114 data = {
3115 'context': {
3116 'client': {
3117 'clientName': 'WEB',
3118 'clientVersion': '2.20201021.03.00',
3119 }
3120 },
3121 'query': query,
a22b2fd1 3122 }
6c894ea1
U
3123 if self._SEARCH_PARAMS:
3124 data['params'] = self._SEARCH_PARAMS
3125 total = 0
3126 for page_num in itertools.count(1):
3127 search = self._download_json(
3128 'https://www.youtube.com/youtubei/v1/search?key=AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8',
3129 video_id='query "%s"' % query,
3130 note='Downloading page %s' % page_num,
3131 errnote='Unable to download API page', fatal=False,
3132 data=json.dumps(data).encode('utf8'),
3133 headers={'content-type': 'application/json'})
3134 if not search:
b4c08069 3135 break
6c894ea1
U
3136 slr_contents = try_get(
3137 search,
3138 (lambda x: x['contents']['twoColumnSearchResultsRenderer']['primaryContents']['sectionListRenderer']['contents'],
3139 lambda x: x['onResponseReceivedCommands'][0]['appendContinuationItemsAction']['continuationItems']),
3140 list)
3141 if not slr_contents:
a22b2fd1 3142 break
0366ae87 3143
0366ae87
M
3144 # Youtube sometimes adds promoted content to searches,
3145 # changing the index location of videos and token.
3146 # So we search through all entries till we find them.
30a074c2 3147 continuation_token = None
3148 for slr_content in slr_contents:
3149 isr_contents = try_get(
3150 slr_content,
3151 lambda x: x['itemSectionRenderer']['contents'],
3152 list)
9da76d30 3153 if not isr_contents:
30a074c2 3154 continue
3155 for content in isr_contents:
3156 if not isinstance(content, dict):
3157 continue
3158 video = content.get('videoRenderer')
3159 if not isinstance(video, dict):
3160 continue
3161 video_id = video.get('videoId')
3162 if not video_id:
3163 continue
3164
3165 yield self._extract_video(video)
3166 total += 1
3167 if total == n:
3168 return
0366ae87
M
3169
3170 if continuation_token is None:
3171 continuation_token = try_get(
30a074c2 3172 slr_content,
3173 lambda x: x['continuationItemRenderer']['continuationEndpoint']['continuationCommand']['token'],
0366ae87 3174 compat_str)
0366ae87 3175
0366ae87 3176 if not continuation_token:
6c894ea1 3177 break
0366ae87 3178 data['continuation'] = continuation_token
b05654f0 3179
6c894ea1
U
def _get_n_results(self, query, n):
    """Return a playlist result built from up to ``n`` entries for ``query``.

    The entries are produced by ``self._entries(query, n)``; ``query`` is
    also handed to ``playlist_result`` as its second positional argument.
    """
    entries = self._entries(query, n)
    return self.playlist_result(entries, query)
75dff0ee 3183
c9ae7b95 3184
class YoutubeSearchDateIE(YoutubeSearchIE):
    """Search extractor variant keyed on ``ytsearchdate`` (newest videos first)."""

    _SEARCH_KEY = 'ytsearchdate'
    IE_NAME = YoutubeSearchIE.IE_NAME + ':date'
    IE_DESC = 'YouTube.com searches, newest videos first, "ytsearchdate" keyword'
    # Forwarded as the 'params' field of the search request when set.
    _SEARCH_PARAMS = 'CAI%3D'
75dff0ee 3190
c9ae7b95 3191
class YoutubeSearchURLIE(YoutubeSearchIE):
    """Extractor for ``youtube.com/results`` search page URLs.

    The search terms are read from the ``search_query`` (or ``q``) query
    parameter, and an optional ``sp`` parameter is used as the search params.
    """

    IE_NAME = YoutubeSearchIE.IE_NAME + '_url'
    IE_DESC = 'YouTube.com search URLs'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/results\?(.*?&)?(?:search_query|q)=(?:[^&]+)(?:[&]|$)'
    # _MAX_RESULTS = 100
    # (left disabled: _real_extract reads self._MAX_RESULTS, so the value
    # must come from a base class)
    _TESTS = [
        {
            'url': 'https://www.youtube.com/results?baz=bar&search_query=youtube-dl+test+video&filters=video&lclk=video',
            'playlist_mincount': 5,
            'info_dict': {
                'title': 'youtube-dl test video',
            },
        },
        {
            'url': 'https://www.youtube.com/results?q=test&sp=EgQIBBgB',
            'only_matching': True,
        },
    ]

    @classmethod
    def _make_valid_url(cls):
        """Return this class's ``_VALID_URL`` pattern unchanged."""
        # NOTE(review): presumably overrides a base implementation that
        # derives the pattern from the search key -- confirm.
        return cls._VALID_URL

    def _real_extract(self, url):
        """Run the search described by the query string of ``url``."""
        params = compat_parse_qs(compat_urllib_parse_urlparse(url).query)
        # 'search_query' takes precedence; 'q' is the fallback spelling.
        terms = params.get('search_query') or params.get('q')
        query = terms[0]
        # An absent 'sp' parameter yields an empty string.
        self._SEARCH_PARAMS = params.get('sp', ('',))[0]
        return self._get_n_results(query, self._MAX_RESULTS)
3462ffa8 3217
3218
class YoutubeFeedsInfoExtractor(YoutubeTabIE):
    """
    Common base for the feed extractors.

    Each subclass has to provide ``_FEED_NAME``; it is used both for the
    extractor name and for the feed URL that extraction is delegated to.
    """
    _TESTS = []
    # _MAX_PAGES = 5
    _LOGIN_REQUIRED = True

    @property
    def IE_NAME(self):
        """Extractor name in the form ``youtube:<_FEED_NAME>``."""
        feed = self._FEED_NAME
        return 'youtube:%s' % feed

    def _real_initialize(self):
        """Log in before any extraction takes place."""
        self._login()

    def _real_extract(self, url):
        """Hand the feed page URL over to ``YoutubeTabIE``."""
        feed_url = 'https://www.youtube.com/feed/%s' % self._FEED_NAME
        return self.url_result(feed_url, ie=YoutubeTabIE.ie_key())
25f14e9f
S
3239
3240
class YoutubeWatchLaterIE(InfoExtractor):
    """Shortcut extractor mapping ``:ytwatchlater`` to the ``WL`` playlist URL."""

    IE_NAME = 'youtube:watchlater'
    IE_DESC = 'Youtube watch later list, ":ytwatchlater" for short (requires authentication)'
    _VALID_URL = r':ytwatchlater'
    _TESTS = [
        {'url': ':ytwatchlater', 'only_matching': True},
    ]

    def _real_extract(self, url):
        """Delegate to ``YoutubeTabIE`` using the watch-later playlist URL."""
        playlist_url = 'https://www.youtube.com/playlist?list=WL'
        return self.url_result(playlist_url, ie=YoutubeTabIE.ie_key())
3462ffa8 3253
3254
25f14e9f
S
class YoutubeRecommendedIE(YoutubeFeedsInfoExtractor):
    """Feed extractor for the ``recommended`` feed.

    Matches the bare youtube.com front page as well as the ``:ytrec`` and
    ``:ytrecommended`` shortcuts.
    """

    _FEED_NAME = 'recommended'
    IE_DESC = 'YouTube.com recommended videos, ":ytrec" for short (requires authentication)'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/?(?:[?#]|$)|:ytrec(?:ommended)?'
    _TESTS = [
        {'url': ':ytrec', 'only_matching': True},
        {'url': ':ytrecommended', 'only_matching': True},
        {'url': 'https://youtube.com', 'only_matching': True},
    ]
1ed5b5c9 3269
1ed5b5c9 3270
class YoutubeSubscriptionsIE(YoutubeFeedsInfoExtractor):
    """Feed extractor for the ``subscriptions`` feed (``:ytsubs`` and variants)."""

    _FEED_NAME = 'subscriptions'
    IE_DESC = 'YouTube.com subscriptions feed, ":ytsubs" for short (requires authentication)'
    _VALID_URL = r':ytsub(?:scription)?s?'
    _TESTS = [
        {'url': ':ytsubs', 'only_matching': True},
        {'url': ':ytsubscriptions', 'only_matching': True},
    ]
1ed5b5c9 3282
1ed5b5c9 3283
25f14e9f
S
class YoutubeHistoryIE(YoutubeFeedsInfoExtractor):
    """Feed extractor for the ``history`` feed, reached via ``:ythistory``."""

    _FEED_NAME = 'history'
    IE_DESC = 'Youtube watch history, ":ythistory" for short (requires authentication)'
    _VALID_URL = r':ythistory'
    _TESTS = [
        {'url': ':ythistory', 'only_matching': True},
    ]
1ed5b5c9
JMF
3292
3293
15870e90
PH
class YoutubeTruncatedURLIE(InfoExtractor):
    """Catch YouTube URLs that were cut off before the video id.

    The pattern matches ``watch?`` URLs that end right after at most one of
    a few known parameters (or ``attribution_link?a=...``); extraction
    always fails with a hint about quoting the URL in the shell.
    """

    IE_NAME = 'youtube:truncated_url'
    IE_DESC = False  # Do not list
    _VALID_URL = r'''(?x)
        (?:https?://)?
        (?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie)?\.com/
        (?:watch\?(?:
            feature=[a-z_]+|
            annotation_id=annotation_[^&]+|
            x-yt-cl=[0-9]+|
            hl=[^&]*|
            t=[0-9]+
        )?
        |
            attribution_link\?a=[^&]+
        )
        $
    '''

    _TESTS = [
        {
            'url': 'https://www.youtube.com/watch?annotation_id=annotation_3951667041',
            'only_matching': True,
        },
        {
            'url': 'https://www.youtube.com/watch?',
            'only_matching': True,
        },
        {
            'url': 'https://www.youtube.com/watch?x-yt-cl=84503534',
            'only_matching': True,
        },
        {
            'url': 'https://www.youtube.com/watch?feature=foo',
            'only_matching': True,
        },
        {
            'url': 'https://www.youtube.com/watch?hl=en-GB',
            'only_matching': True,
        },
        {
            'url': 'https://www.youtube.com/watch?t=2372',
            'only_matching': True,
        },
    ]

    def _real_extract(self, url):
        """Always raise an expected ``ExtractorError`` with a quoting hint."""
        hint = (
            'Did you forget to quote the URL? Remember that & is a meta '
            'character in most shells, so you want to put the URL in quotes, '
            'like youtube-dl '
            '"https://www.youtube.com/watch?feature=foo&v=BaW_jenozKc" '
            ' or simply youtube-dl BaW_jenozKc .')
        raise ExtractorError(hint, expected=True)
772fd5cc
PH
3341
3342
class YoutubeTruncatedIDIE(InfoExtractor):
    """Catch watch URLs whose ``v=`` id is only 1-10 characters long.

    Extraction always fails with an error naming the partial id and the URL.
    """

    IE_NAME = 'youtube:truncated_id'
    IE_DESC = False  # Do not list
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/watch\?v=(?P<id>[0-9A-Za-z_-]{1,10})$'

    _TESTS = [
        {
            'url': 'https://www.youtube.com/watch?v=N_708QY7Ob',
            'only_matching': True,
        },
    ]

    def _real_extract(self, url):
        """Always raise an expected ``ExtractorError`` for the short id."""
        partial_id = self._match_id(url)
        message = 'Incomplete YouTube ID %s. URL %s looks truncated.' % (partial_id, url)
        raise ExtractorError(message, expected=True)