jfr.im git - yt-dlp.git/blame - youtube_dlc/extractor/youtube.py
Print youtube's warning message (Closes #256)
[yt-dlp.git] / youtube_dlc / extractor / youtube.py
CommitLineData
c5e8d7af 1# coding: utf-8
c5e8d7af 2
78caa52a
PH
3from __future__ import unicode_literals
4
5
0ca96d48 6import itertools
c5e8d7af 7import json
c4417ddb 8import os.path
d77ab8e2 9import random
c5e8d7af 10import re
42939b61 11import time
e0df6211 12import traceback
c5e8d7af 13
b05654f0 14from .common import InfoExtractor, SearchInfoExtractor
2b25cb5d 15from ..jsinterp import JSInterpreter
54256267 16from ..swfinterp import SWFInterpreter
4bb4a188 17from ..compat import (
edf3e38e 18 compat_chr,
8d81f3e3 19 compat_kwargs,
c5e8d7af 20 compat_parse_qs,
7fd002c0
S
21 compat_urllib_parse_unquote,
22 compat_urllib_parse_unquote_plus,
15707c7e 23 compat_urllib_parse_urlencode,
7c80519c 24 compat_urllib_parse_urlparse,
7c61bd36 25 compat_urlparse,
c5e8d7af 26 compat_str,
4bb4a188
PH
27)
28from ..utils import (
27019dbb 29 bool_or_none,
c5e8d7af 30 clean_html,
9b9c5355 31 error_to_compat_str,
c5e8d7af 32 ExtractorError,
2d30521a 33 float_or_none,
4bb4a188 34 get_element_by_id,
dd27fd17 35 int_or_none,
94278f72 36 mimetype2ext,
6310acf5 37 parse_codecs,
b84071c0 38 parse_count,
7c80519c 39 parse_duration,
0cb58b02 40 remove_quotes,
3995d37d 41 remove_start,
cf7e015f 42 smuggle_url,
dbdaaa23 43 str_or_none,
c93d53f5 44 str_to_int,
556dbe7f 45 try_get,
c5e8d7af
PH
46 unescapeHTML,
47 unified_strdate,
cf7e015f 48 unsmuggle_url,
8bdd16b4 49 update_url_query,
81c2f20b 50 uppercase_escape,
21c340b8 51 url_or_none,
6e6bc8da 52 urlencode_postdata,
8bdd16b4 53 urljoin,
c5e8d7af
PH
54)
55
5f6a1245 56
class YoutubeBaseInfoExtractor(InfoExtractor):
    """Provide base functions for Youtube extractors"""
    _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
    _TWOFACTOR_URL = 'https://accounts.google.com/signin/challenge'

    _LOOKUP_URL = 'https://accounts.google.com/_/signin/sl/lookup'
    _CHALLENGE_URL = 'https://accounts.google.com/_/signin/sl/challenge'
    _TFA_URL = 'https://accounts.google.com/_/signin/challenge?hl=en&TL={0}'

    # Regex alternation of path components (not wrapped in a group here)
    _RESERVED_NAMES = (
        r'course|embed|channel|c|user|playlist|watch|w|results|storefront|'
        r'shared|index|account|reporthistory|t/terms|about|upload|signin|logout|'
        r'feed/(watch_later|history|subscriptions|library|trending|recommended)')

    _NETRC_MACHINE = 'youtube'
    # If True it will raise an error if no login info is provided
    _LOGIN_REQUIRED = False

    _PLAYLIST_ID_RE = r'(?:(?:PL|LL|EC|UU|FL|RD|UL|TL|PU|OLAK5uy_)[0-9A-Za-z-_]{10,}|RDMM|WL|LL|LM)'

    _YOUTUBE_CLIENT_HEADERS = {
        'x-youtube-client-name': '1',
        'x-youtube-client-version': '1.20200609.04.02',
    }

    def _set_language(self):
        """Set the PREF cookie on .youtube.com (includes hl=en)."""
        self._set_cookie(
            '.youtube.com', 'PREF', 'f1=50000000&f6=8&hl=en',
            # YouTube sets the expire time to about two months
            expire_time=time.time() + 2 * 30 * 24 * 3600)

    def _ids_to_results(self, ids):
        """Return a list with one 'Youtube' url_result per video ID in ids."""
        return [
            self.url_result(vid_id, 'Youtube', video_id=vid_id)
            for vid_id in ids]

    def _login(self):
        """
        Attempt to log in to YouTube.
        True is returned if successful or skipped.
        False is returned if login failed.

        If _LOGIN_REQUIRED is set and no authentication was provided, an error is raised.
        """
        username, password = self._get_login_info()
        # No authentication to be performed
        if username is None:
            if self._LOGIN_REQUIRED and self._downloader.params.get('cookiefile') is None:
                raise ExtractorError('No login info available, needed for using %s.' % self.IE_NAME, expected=True)
            if self._downloader.params.get('cookiefile') and False:  # TODO remove 'and False' later - too many people using outdated cookies and open issues, remind them.
                self.to_screen('[Cookies] Reminder - Make sure to always use up to date cookies!')
            return True

        login_page = self._download_webpage(
            self._LOGIN_URL, None,
            note='Downloading login page',
            errnote='unable to fetch login page', fatal=False)
        if login_page is False:
            # Return False explicitly, as documented and as every other
            # failure path below does
            return False

        login_form = self._hidden_inputs(login_page)

        def req(url, f_req, note, errnote):
            """POST the hidden login form fields plus the JSON-encoded f_req.

            The request is non-fatal; the callers below treat a False result
            as a failed request.
            """
            data = login_form.copy()
            data.update({
                'pstMsg': 1,
                'checkConnection': 'youtube',
                'checkedDomains': 'youtube',
                'hl': 'en',
                'deviceinfo': '[null,null,null,[],null,"US",null,null,[],"GlifWebSignIn",null,[null,null,[]]]',
                'f.req': json.dumps(f_req),
                'flowName': 'GlifWebSignIn',
                'flowEntry': 'ServiceLogin',
                # TODO: reverse actual botguard identifier generation algo
                'bgRequest': '["identifier",""]',
            })
            return self._download_json(
                url, None, note=note, errnote=errnote,
                # Drop everything preceding the first '[' before JSON parsing
                transform_source=lambda s: re.sub(r'^[^[]*', '', s),
                fatal=False,
                data=urlencode_postdata(data), headers={
                    'Content-Type': 'application/x-www-form-urlencoded;charset=utf-8',
                    'Google-Accounts-XSRF': 1,
                })

        def warn(message):
            self._downloader.report_warning(message)

        lookup_req = [
            username,
            None, [], None, 'US', None, None, 2, False, True,
            [
                None, None,
                [2, 1, None, 1,
                 'https://accounts.google.com/ServiceLogin?passive=true&continue=https%3A%2F%2Fwww.youtube.com%2Fsignin%3Fnext%3D%252F%26action_handle_signin%3Dtrue%26hl%3Den%26app%3Ddesktop%26feature%3Dsign_in_button&hl=en&service=youtube&uilel=3&requestPath=%2FServiceLogin&Page=PasswordSeparationSignIn',
                 None, [], 4],
                1, [None, None, []], None, None, None, True
            ],
            username,
        ]

        lookup_results = req(
            self._LOOKUP_URL, lookup_req,
            'Looking up account info', 'Unable to look up account info')

        if lookup_results is False:
            return False

        user_hash = try_get(lookup_results, lambda x: x[0][2], compat_str)
        if not user_hash:
            warn('Unable to extract user hash')
            return False

        challenge_req = [
            user_hash,
            None, 1, None, [1, None, None, None, [password, None, True]],
            [
                None, None, [2, 1, None, 1, 'https://accounts.google.com/ServiceLogin?passive=true&continue=https%3A%2F%2Fwww.youtube.com%2Fsignin%3Fnext%3D%252F%26action_handle_signin%3Dtrue%26hl%3Den%26app%3Ddesktop%26feature%3Dsign_in_button&hl=en&service=youtube&uilel=3&requestPath=%2FServiceLogin&Page=PasswordSeparationSignIn', None, [], 4],
                1, [None, None, []], None, None, None, True
            ]]

        challenge_results = req(
            self._CHALLENGE_URL, challenge_req,
            'Logging in', 'Unable to log in')

        if challenge_results is False:
            # Return False explicitly, as documented
            return False

        login_res = try_get(challenge_results, lambda x: x[0][5], list)
        if login_res:
            login_msg = try_get(login_res, lambda x: x[5], compat_str)
            # The conditional must be parenthesised: '%' binds tighter than
            # 'if ... else', so without the parentheses any message other than
            # INCORRECT_ANSWER_ENTERED was reported without the prefix.
            warn('Unable to login: %s' % (
                'Invalid password'
                if login_msg == 'INCORRECT_ANSWER_ENTERED' else login_msg))
            return False

        res = try_get(challenge_results, lambda x: x[0][-1], list)
        if not res:
            warn('Unable to extract result entry')
            return False

        login_challenge = try_get(res, lambda x: x[0][0], list)
        if login_challenge:
            challenge_str = try_get(login_challenge, lambda x: x[2], compat_str)
            if challenge_str == 'TWO_STEP_VERIFICATION':
                # SEND_SUCCESS - TFA code has been successfully sent to phone
                # QUOTA_EXCEEDED - reached the limit of TFA codes
                status = try_get(login_challenge, lambda x: x[5], compat_str)
                if status == 'QUOTA_EXCEEDED':
                    warn('Exceeded the limit of TFA codes, try later')
                    return False

                tl = try_get(challenge_results, lambda x: x[1][2], compat_str)
                if not tl:
                    warn('Unable to extract TL')
                    return False

                tfa_code = self._get_tfa_info('2-step verification code')

                if not tfa_code:
                    warn(
                        'Two-factor authentication required. Provide it either interactively or with --twofactor <code> '
                        '(Note that only TOTP (Google Authenticator App) codes work at this time.)')
                    return False

                tfa_code = remove_start(tfa_code, 'G-')

                tfa_req = [
                    user_hash, None, 2, None,
                    [
                        9, None, None, None, None, None, None, None,
                        [None, tfa_code, True, 2]
                    ]]

                tfa_results = req(
                    self._TFA_URL.format(tl), tfa_req,
                    'Submitting TFA code', 'Unable to submit TFA code')

                if tfa_results is False:
                    return False

                tfa_res = try_get(tfa_results, lambda x: x[0][5], list)
                if tfa_res:
                    tfa_msg = try_get(tfa_res, lambda x: x[5], compat_str)
                    # Parenthesised for the same precedence reason as above
                    warn('Unable to finish TFA: %s' % (
                        'Invalid TFA code'
                        if tfa_msg == 'INCORRECT_ANSWER_ENTERED' else tfa_msg))
                    return False

                check_cookie_url = try_get(
                    tfa_results, lambda x: x[0][-1][2], compat_str)
            else:
                CHALLENGES = {
                    'LOGIN_CHALLENGE': "This device isn't recognized. For your security, Google wants to make sure it's really you.",
                    'USERNAME_RECOVERY': 'Please provide additional information to aid in the recovery process.',
                    'REAUTH': "There is something unusual about your activity. For your security, Google wants to make sure it's really you.",
                }
                challenge = CHALLENGES.get(
                    challenge_str,
                    '%s returned error %s.' % (self.IE_NAME, challenge_str))
                warn('%s\nGo to https://accounts.google.com/, login and solve a challenge.' % challenge)
                return False
        else:
            check_cookie_url = try_get(res, lambda x: x[2], compat_str)

        if not check_cookie_url:
            warn('Unable to extract CheckCookie URL')
            return False

        check_cookie_results = self._download_webpage(
            check_cookie_url, None, 'Checking cookie', fatal=False)

        if check_cookie_results is False:
            return False

        if 'https://myaccount.google.com/' not in check_cookie_results:
            warn('Unable to log in')
            return False

        return True

    def _download_webpage_handle(self, *args, **kwargs):
        """Call the parent implementation with a copy of the 'query' kwarg."""
        # Copy so the dict handed to the parent is never the caller's own
        query = kwargs.get('query', {}).copy()
        kwargs['query'] = query
        return super(YoutubeBaseInfoExtractor, self)._download_webpage_handle(
            *args, **compat_kwargs(kwargs))

    def _get_yt_initial_data(self, video_id, webpage):
        """Search webpage for the ytInitialData assignment and parse it as JSON.

        Both the regex search and the JSON parsing are non-fatal; None is
        returned implicitly when no assignment is found.
        """
        config = self._search_regex(
            (r'window\["ytInitialData"\]\s*=\s*(.*?)(?<=});',
             r'var\s+ytInitialData\s*=\s*(.*?)(?<=});'),
            webpage, 'ytInitialData', default=None)
        if config:
            return self._parse_json(
                uppercase_escape(config), video_id, fatal=False)

    def _real_initialize(self):
        """Set the language cookie and attempt to log in (needs a downloader)."""
        if self._downloader is None:
            return
        self._set_language()
        if not self._login():
            return

    _DEFAULT_API_DATA = {
        'context': {
            'client': {
                'clientName': 'WEB',
                'clientVersion': '2.20201021.03.00',
            }
        },
    }

    _YT_INITIAL_DATA_RE = r'(?:window\s*\[\s*["\']ytInitialData["\']\s*\]|ytInitialData)\s*=\s*({.+?})\s*;'

    def _call_api(self, ep, query, video_id):
        """POST a JSON body to the youtubei/v1 endpoint ep and return the parsed JSON.

        The body is _DEFAULT_API_DATA with the top-level keys of query merged
        on top of it.
        """
        # Shallow copy: update() only rebinds top-level keys, so the shared
        # class-level default is not mutated
        data = self._DEFAULT_API_DATA.copy()
        data.update(query)

        response = self._download_json(
            'https://www.youtube.com/youtubei/v1/%s' % ep, video_id=video_id,
            note='Downloading API JSON', errnote='Unable to download API page',
            data=json.dumps(data).encode('utf8'),
            headers={'content-type': 'application/json'},
            query={'key': 'AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8'})

        return response

    def _extract_yt_initial_data(self, video_id, webpage):
        """Extract and parse the ytInitialData JSON object from webpage.

        The pattern variant that requires a trailing newline is tried first,
        then the bare _YT_INITIAL_DATA_RE.
        """
        return self._parse_json(
            self._search_regex(
                (r'%s\s*\n' % self._YT_INITIAL_DATA_RE,
                 self._YT_INITIAL_DATA_RE), webpage, 'yt initial data'),
            video_id)
0c148415
S
330
331
360e1ca5 332class YoutubeIE(YoutubeBaseInfoExtractor):
78caa52a 333 IE_DESC = 'YouTube.com'
cb7dfeea 334 _VALID_URL = r"""(?x)^
c5e8d7af 335 (
edb53e2d 336 (?:https?://|//) # http(s):// or protocol-independent URL
66b48727 337 (?:(?:(?:(?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie|kids)?\.com/|
484aaeb2 338 (?:www\.)?deturl\.com/www\.youtube\.com/|
e70dc1d1 339 (?:www\.)?pwnyoutube\.com/|
8b561bfc 340 (?:www\.)?hooktube\.com/|
f7000f3a 341 (?:www\.)?yourepeat\.com/|
e69ae5b9 342 tube\.majestyc\.net/|
ba036333 343 # Invidious instances taken from https://github.com/omarroth/invidious/wiki/Invidious-Instances
77d95677 344 (?:(?:www|dev)\.)?invidio\.us/|
ba036333 345 (?:(?:www|no)\.)?invidiou\.sh/|
346 (?:(?:www|fi|de)\.)?invidious\.snopyta\.org/|
8ae113ca 347 (?:www\.)?invidious\.kabi\.tk/|
ba036333 348 (?:www\.)?invidious\.13ad\.de/|
791d2e81 349 (?:www\.)?invidious\.mastodon\.host/|
494d664e 350 (?:www\.)?invidious\.nixnet\.xyz/|
666d808e 351 (?:www\.)?invidious\.drycat\.fr/|
ba036333 352 (?:www\.)?tube\.poal\.co/|
8ae113ca 353 (?:www\.)?vid\.wxzm\.sx/|
384bf91f 354 (?:www\.)?yewtu\.be/|
494d664e 355 (?:www\.)?yt\.elukerio\.org/|
894b3826 356 (?:www\.)?yt\.lelux\.fi/|
1db5ab6b 357 (?:www\.)?invidious\.ggc-project\.de/|
358 (?:www\.)?yt\.maisputain\.ovh/|
359 (?:www\.)?invidious\.13ad\.de/|
360 (?:www\.)?invidious\.toot\.koeln/|
361 (?:www\.)?invidious\.fdn\.fr/|
362 (?:www\.)?watch\.nettohikari\.com/|
bff90fc5 363 (?:www\.)?kgg2m7yk5aybusll\.onion/|
364 (?:www\.)?qklhadlycap4cnod\.onion/|
365 (?:www\.)?axqzx4s6s54s32yentfqojs3x5i7faxza6xo3ehd4bzzsg2ii4fv2iid\.onion/|
366 (?:www\.)?c7hqkpkpemu6e7emz5b4vyz7idjgdvgaaa3dyimmeojqbgpea3xqjoid\.onion/|
367 (?:www\.)?fz253lmuao3strwbfbmx46yu7acac2jz27iwtorgmbqlkurlclmancad\.onion/|
368 (?:www\.)?invidious\.l4qlywnpwqsluw65ts7md3khrivpirse744un3x7mlskqauz5pyuzgqd\.onion/|
33c1c7d8 369 (?:www\.)?owxfohz4kjyv25fvlqilyxast7inivgiktls3th44jhk3ej3i7ya\.b32\.i2p/|
1db5ab6b 370 (?:www\.)?4l2dgddgsrkf2ous66i6seeyi6etzfgrue332grh2n7madpwopotugyd\.onion/|
e69ae5b9 371 youtube\.googleapis\.com/) # the various hostnames, with wildcard subdomains
c5e8d7af
PH
372 (?:.*?\#/)? # handle anchor (#/) redirect urls
373 (?: # the various things that can precede the ID:
ac7553d0 374 (?:(?:v|embed|e)/(?!videoseries)) # v/ or embed/ or e/
c5e8d7af 375 |(?: # or the v= param in all its forms
f7000f3a 376 (?:(?:watch|movie)(?:_popup)?(?:\.php)?/?)? # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
c5e8d7af 377 (?:\?|\#!?) # the params delimiter ? or # or #!
040ac686 378 (?:.*?[&;])?? # any other preceding param (like /?s=tuff&v=xxxx or ?s=tuff&amp;v=V36LpHqtcDY)
c5e8d7af
PH
379 v=
380 )
f4b05232 381 ))
cbaed4bb
S
382 |(?:
383 youtu\.be| # just youtu.be/xxxx
6d4fc66b
S
384 vid\.plus| # or vid.plus/xxxx
385 zwearz\.com/watch| # or zwearz.com/watch/xxxx
cbaed4bb 386 )/
edb53e2d 387 |(?:www\.)?cleanvideosearch\.com/media/action/yt/watch\?videoId=
f4b05232 388 )
c5e8d7af 389 )? # all until now is optional -> you can pass the naked ID
8bdd16b4 390 (?P<id>[0-9A-Za-z_-]{11}) # here is it! the YouTube video ID
d0ba5587
S
391 (?!.*?\blist=
392 (?:
393 %(playlist_id)s| # combined list/video URLs are handled by the playlist IE
394 WL # WL are handled by the watch later IE
395 )
396 )
c5e8d7af 397 (?(1).+)? # if we found the ID, everything can follow
d0ba5587 398 $""" % {'playlist_id': YoutubeBaseInfoExtractor._PLAYLIST_ID_RE}
c5e8d7af 399 _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
e40c758c
S
400 _PLAYER_INFO_RE = (
401 r'/(?P<id>[a-zA-Z0-9_-]{8,})/player_ias\.vflset(?:/[a-zA-Z]{2,3}_[a-zA-Z]{2,3})?/base\.(?P<ext>[a-z]+)$',
402 r'\b(?P<id>vfl[a-zA-Z0-9_-]+)\b.*?\.(?P<ext>[a-z]+)$',
403 )
2c62dc26 404 _formats = {
c2d3cb4c 405 '5': {'ext': 'flv', 'width': 400, 'height': 240, 'acodec': 'mp3', 'abr': 64, 'vcodec': 'h263'},
406 '6': {'ext': 'flv', 'width': 450, 'height': 270, 'acodec': 'mp3', 'abr': 64, 'vcodec': 'h263'},
407 '13': {'ext': '3gp', 'acodec': 'aac', 'vcodec': 'mp4v'},
408 '17': {'ext': '3gp', 'width': 176, 'height': 144, 'acodec': 'aac', 'abr': 24, 'vcodec': 'mp4v'},
409 '18': {'ext': 'mp4', 'width': 640, 'height': 360, 'acodec': 'aac', 'abr': 96, 'vcodec': 'h264'},
410 '22': {'ext': 'mp4', 'width': 1280, 'height': 720, 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264'},
411 '34': {'ext': 'flv', 'width': 640, 'height': 360, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'},
412 '35': {'ext': 'flv', 'width': 854, 'height': 480, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'},
3834d3e3 413 # itag 36 videos are either 320x180 (BaW_jenozKc) or 320x240 (__2ABJjxzNo), abr varies as well
c2d3cb4c 414 '36': {'ext': '3gp', 'width': 320, 'acodec': 'aac', 'vcodec': 'mp4v'},
415 '37': {'ext': 'mp4', 'width': 1920, 'height': 1080, 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264'},
416 '38': {'ext': 'mp4', 'width': 4096, 'height': 3072, 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264'},
417 '43': {'ext': 'webm', 'width': 640, 'height': 360, 'acodec': 'vorbis', 'abr': 128, 'vcodec': 'vp8'},
418 '44': {'ext': 'webm', 'width': 854, 'height': 480, 'acodec': 'vorbis', 'abr': 128, 'vcodec': 'vp8'},
419 '45': {'ext': 'webm', 'width': 1280, 'height': 720, 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8'},
e1a0bfdf 420 '46': {'ext': 'webm', 'width': 1920, 'height': 1080, 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8'},
c2d3cb4c 421 '59': {'ext': 'mp4', 'width': 854, 'height': 480, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'},
422 '78': {'ext': 'mp4', 'width': 854, 'height': 480, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'},
e1a0bfdf 423
424
425 # 3D videos
c2d3cb4c 426 '82': {'ext': 'mp4', 'height': 360, 'format_note': '3D', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -20},
427 '83': {'ext': 'mp4', 'height': 480, 'format_note': '3D', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -20},
428 '84': {'ext': 'mp4', 'height': 720, 'format_note': '3D', 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264', 'preference': -20},
429 '85': {'ext': 'mp4', 'height': 1080, 'format_note': '3D', 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264', 'preference': -20},
e1a0bfdf 430 '100': {'ext': 'webm', 'height': 360, 'format_note': '3D', 'acodec': 'vorbis', 'abr': 128, 'vcodec': 'vp8', 'preference': -20},
431 '101': {'ext': 'webm', 'height': 480, 'format_note': '3D', 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8', 'preference': -20},
432 '102': {'ext': 'webm', 'height': 720, 'format_note': '3D', 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8', 'preference': -20},
836a086c 433
96fb5605 434 # Apple HTTP Live Streaming
11f12195 435 '91': {'ext': 'mp4', 'height': 144, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 48, 'vcodec': 'h264', 'preference': -10},
c2d3cb4c 436 '92': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 48, 'vcodec': 'h264', 'preference': -10},
437 '93': {'ext': 'mp4', 'height': 360, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -10},
438 '94': {'ext': 'mp4', 'height': 480, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -10},
439 '95': {'ext': 'mp4', 'height': 720, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 256, 'vcodec': 'h264', 'preference': -10},
440 '96': {'ext': 'mp4', 'height': 1080, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 256, 'vcodec': 'h264', 'preference': -10},
e1a0bfdf 441 '132': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 48, 'vcodec': 'h264', 'preference': -10},
442 '151': {'ext': 'mp4', 'height': 72, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 24, 'vcodec': 'h264', 'preference': -10},
2c62dc26
PH
443
444 # DASH mp4 video
d23028a8
S
445 '133': {'ext': 'mp4', 'height': 240, 'format_note': 'DASH video', 'vcodec': 'h264'},
446 '134': {'ext': 'mp4', 'height': 360, 'format_note': 'DASH video', 'vcodec': 'h264'},
447 '135': {'ext': 'mp4', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'h264'},
448 '136': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'h264'},
449 '137': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'h264'},
067aa17e 450 '138': {'ext': 'mp4', 'format_note': 'DASH video', 'vcodec': 'h264'}, # Height can vary (https://github.com/ytdl-org/youtube-dl/issues/4559)
d23028a8
S
451 '160': {'ext': 'mp4', 'height': 144, 'format_note': 'DASH video', 'vcodec': 'h264'},
452 '212': {'ext': 'mp4', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'h264'},
453 '264': {'ext': 'mp4', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'h264'},
454 '298': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'h264', 'fps': 60},
455 '299': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'h264', 'fps': 60},
456 '266': {'ext': 'mp4', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'h264'},
836a086c 457
f6f1fc92 458 # Dash mp4 audio
d23028a8
S
459 '139': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 48, 'container': 'm4a_dash'},
460 '140': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 128, 'container': 'm4a_dash'},
461 '141': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 256, 'container': 'm4a_dash'},
462 '256': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'container': 'm4a_dash'},
463 '258': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'container': 'm4a_dash'},
464 '325': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'dtse', 'container': 'm4a_dash'},
465 '328': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'ec-3', 'container': 'm4a_dash'},
836a086c
AZ
466
467 # Dash webm
d23028a8
S
468 '167': {'ext': 'webm', 'height': 360, 'width': 640, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
469 '168': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
470 '169': {'ext': 'webm', 'height': 720, 'width': 1280, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
471 '170': {'ext': 'webm', 'height': 1080, 'width': 1920, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
472 '218': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
473 '219': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
474 '278': {'ext': 'webm', 'height': 144, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp9'},
475 '242': {'ext': 'webm', 'height': 240, 'format_note': 'DASH video', 'vcodec': 'vp9'},
476 '243': {'ext': 'webm', 'height': 360, 'format_note': 'DASH video', 'vcodec': 'vp9'},
477 '244': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'vp9'},
478 '245': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'vp9'},
479 '246': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'vp9'},
480 '247': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'vp9'},
481 '248': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'vp9'},
482 '271': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'vp9'},
4c6b4764 483 # itag 272 videos are either 3840x2160 (e.g. RtoitU2A-3E) or 7680x4320 (sLprVF6d7Ug)
d23028a8
S
484 '272': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'vp9'},
485 '302': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60},
486 '303': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60},
487 '308': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60},
488 '313': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'vp9'},
489 '315': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60},
2c62dc26
PH
490
491 # Dash webm audio
d23028a8
S
492 '171': {'ext': 'webm', 'acodec': 'vorbis', 'format_note': 'DASH audio', 'abr': 128},
493 '172': {'ext': 'webm', 'acodec': 'vorbis', 'format_note': 'DASH audio', 'abr': 256},
ce6b9a2d 494
0857baad 495 # Dash webm audio with opus inside
d23028a8
S
496 '249': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 50},
497 '250': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 70},
498 '251': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 160},
0857baad 499
ce6b9a2d
PH
500 # RTMP (unnamed)
501 '_rtmp': {'protocol': 'rtmp'},
b85eae0f
S
502
503 # av01 video only formats sometimes served with "unknown" codecs
504 '394': {'acodec': 'none', 'vcodec': 'av01.0.05M.08'},
505 '395': {'acodec': 'none', 'vcodec': 'av01.0.05M.08'},
506 '396': {'acodec': 'none', 'vcodec': 'av01.0.05M.08'},
507 '397': {'acodec': 'none', 'vcodec': 'av01.0.05M.08'},
c5e8d7af 508 }
84da5d84 509 _SUBTITLE_FORMATS = ('json3', 'srv1', 'srv2', 'srv3', 'ttml', 'vtt')
836a086c 510
fd5c4aab
S
511 _GEO_BYPASS = False
512
78caa52a 513 IE_NAME = 'youtube'
2eb88d95
PH
514 _TESTS = [
515 {
2d3d2997 516 'url': 'https://www.youtube.com/watch?v=BaW_jenozKc&t=1s&end=9',
4bc3a23e
PH
517 'info_dict': {
518 'id': 'BaW_jenozKc',
519 'ext': 'mp4',
3867038a 520 'title': 'youtube-dl test video "\'/\\ä↭𝕐',
4bc3a23e
PH
521 'uploader': 'Philipp Hagemeister',
522 'uploader_id': 'phihag',
ec85ded8 523 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/phihag',
dd4c4492
S
524 'channel_id': 'UCLqxVugv74EIW3VWh2NOa3Q',
525 'channel_url': r're:https?://(?:www\.)?youtube\.com/channel/UCLqxVugv74EIW3VWh2NOa3Q',
4bc3a23e 526 'upload_date': '20121002',
3867038a 527 'description': 'test chars: "\'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de .',
4bc3a23e 528 'categories': ['Science & Technology'],
3867038a 529 'tags': ['youtube-dl'],
556dbe7f 530 'duration': 10,
dbdaaa23 531 'view_count': int,
3e7c1224
PH
532 'like_count': int,
533 'dislike_count': int,
7c80519c 534 'start_time': 1,
297a564b 535 'end_time': 9,
2eb88d95 536 }
0e853ca4 537 },
fccd3771 538 {
4bc3a23e
PH
539 'url': '//www.YouTube.com/watch?v=yZIXLfi8CZQ',
540 'note': 'Embed-only video (#1746)',
541 'info_dict': {
542 'id': 'yZIXLfi8CZQ',
543 'ext': 'mp4',
544 'upload_date': '20120608',
545 'title': 'Principal Sexually Assaults A Teacher - Episode 117 - 8th June 2012',
546 'description': 'md5:09b78bd971f1e3e289601dfba15ca4f7',
547 'uploader': 'SET India',
94bfcd23 548 'uploader_id': 'setindia',
ec85ded8 549 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/setindia',
94bfcd23 550 'age_limit': 18,
fccd3771
PH
551 }
552 },
11b56058 553 {
8bdd16b4 554 'url': 'https://www.youtube.com/watch?v=BaW_jenozKc&v=yZIXLfi8CZQ',
11b56058
PM
555 'note': 'Use the first video ID in the URL',
556 'info_dict': {
557 'id': 'BaW_jenozKc',
558 'ext': 'mp4',
3867038a 559 'title': 'youtube-dl test video "\'/\\ä↭𝕐',
11b56058
PM
560 'uploader': 'Philipp Hagemeister',
561 'uploader_id': 'phihag',
ec85ded8 562 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/phihag',
11b56058 563 'upload_date': '20121002',
3867038a 564 'description': 'test chars: "\'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de .',
11b56058 565 'categories': ['Science & Technology'],
3867038a 566 'tags': ['youtube-dl'],
556dbe7f 567 'duration': 10,
dbdaaa23 568 'view_count': int,
11b56058
PM
569 'like_count': int,
570 'dislike_count': int,
34a7de29
S
571 },
572 'params': {
573 'skip_download': True,
574 },
11b56058 575 },
dd27fd17 576 {
2d3d2997 577 'url': 'https://www.youtube.com/watch?v=a9LDPn-MO4I',
4bc3a23e
PH
578 'note': '256k DASH audio (format 141) via DASH manifest',
579 'info_dict': {
580 'id': 'a9LDPn-MO4I',
581 'ext': 'm4a',
582 'upload_date': '20121002',
583 'uploader_id': '8KVIDEO',
ec85ded8 584 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/8KVIDEO',
4bc3a23e
PH
585 'description': '',
586 'uploader': '8KVIDEO',
587 'title': 'UHDTV TEST 8K VIDEO.mp4'
4919603f 588 },
4bc3a23e
PH
589 'params': {
590 'youtube_include_dash_manifest': True,
591 'format': '141',
4919603f 592 },
de3c7fe0 593 'skip': 'format 141 not served anymore',
dd27fd17 594 },
8bdd16b4 595 # DASH manifest with encrypted signature
596 {
597 'url': 'https://www.youtube.com/watch?v=IB3lcPjvWLA',
598 'info_dict': {
599 'id': 'IB3lcPjvWLA',
600 'ext': 'm4a',
601 'title': 'Afrojack, Spree Wilson - The Spark (Official Music Video) ft. Spree Wilson',
602 'description': 'md5:8f5e2b82460520b619ccac1f509d43bf',
603 'duration': 244,
604 'uploader': 'AfrojackVEVO',
605 'uploader_id': 'AfrojackVEVO',
606 'upload_date': '20131011',
607 },
608 'params': {
609 'youtube_include_dash_manifest': True,
610 'format': '141/bestaudio[ext=m4a]',
611 },
612 },
aa79ac0c
PH
613 # Controversy video
614 {
615 'url': 'https://www.youtube.com/watch?v=T4XJQO3qol8',
616 'info_dict': {
617 'id': 'T4XJQO3qol8',
618 'ext': 'mp4',
556dbe7f 619 'duration': 219,
aa79ac0c 620 'upload_date': '20100909',
4fe54c12 621 'uploader': 'Amazing Atheist',
aa79ac0c 622 'uploader_id': 'TheAmazingAtheist',
ec85ded8 623 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/TheAmazingAtheist',
aa79ac0c
PH
624 'title': 'Burning Everyone\'s Koran',
625 'description': 'SUBSCRIBE: http://www.youtube.com/saturninefilms\n\nEven Obama has taken a stand against freedom on this issue: http://www.huffingtonpost.com/2010/09/09/obama-gma-interview-quran_n_710282.html',
626 }
c522adb1 627 },
dd2d55f1 628 # Normal age-gate video (embed allowed)
c522adb1 629 {
2d3d2997 630 'url': 'https://youtube.com/watch?v=HtVdAasjOgU',
c522adb1
JMF
631 'info_dict': {
632 'id': 'HtVdAasjOgU',
633 'ext': 'mp4',
634 'title': 'The Witcher 3: Wild Hunt - The Sword Of Destiny Trailer',
ec85ded8 635 'description': r're:(?s).{100,}About the Game\n.*?The Witcher 3: Wild Hunt.{100,}',
556dbe7f 636 'duration': 142,
c522adb1
JMF
637 'uploader': 'The Witcher',
638 'uploader_id': 'WitcherGame',
ec85ded8 639 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/WitcherGame',
c522adb1 640 'upload_date': '20140605',
34952f09 641 'age_limit': 18,
c522adb1
JMF
642 },
643 },
8bdd16b4 644 # video_info is None (https://github.com/ytdl-org/youtube-dl/issues/4421)
645 # YouTube Red ad is not captured for creator
646 {
647 'url': '__2ABJjxzNo',
648 'info_dict': {
649 'id': '__2ABJjxzNo',
650 'ext': 'mp4',
651 'duration': 266,
652 'upload_date': '20100430',
653 'uploader_id': 'deadmau5',
654 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/deadmau5',
655 'creator': 'Dada Life, deadmau5',
656 'description': 'md5:12c56784b8032162bb936a5f76d55360',
657 'uploader': 'deadmau5',
658 'title': 'Deadmau5 - Some Chords (HD)',
659 'alt_title': 'This Machine Kills Some Chords',
660 },
661 'expected_warnings': [
662 'DASH manifest missing',
663 ]
664 },
067aa17e 665 # Olympics (https://github.com/ytdl-org/youtube-dl/issues/4431)
e52a40ab
PH
666 {
667 'url': 'lqQg6PlCWgI',
668 'info_dict': {
669 'id': 'lqQg6PlCWgI',
670 'ext': 'mp4',
556dbe7f 671 'duration': 6085,
90227264 672 'upload_date': '20150827',
cbe2bd91 673 'uploader_id': 'olympic',
ec85ded8 674 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/olympic',
cbe2bd91 675 'description': 'HO09 - Women - GER-AUS - Hockey - 31 July 2012 - London 2012 Olympic Games',
be49068d 676 'uploader': 'Olympic',
cbe2bd91
PH
677 'title': 'Hockey - Women - GER-AUS - London 2012 Olympic Games',
678 },
679 'params': {
680 'skip_download': 'requires avconv',
e52a40ab 681 }
cbe2bd91 682 },
6271f1ca
PH
683 # Non-square pixels
684 {
685 'url': 'https://www.youtube.com/watch?v=_b-2C3KPAM0',
686 'info_dict': {
687 'id': '_b-2C3KPAM0',
688 'ext': 'mp4',
689 'stretched_ratio': 16 / 9.,
556dbe7f 690 'duration': 85,
6271f1ca
PH
691 'upload_date': '20110310',
692 'uploader_id': 'AllenMeow',
ec85ded8 693 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/AllenMeow',
6271f1ca 694 'description': 'made by Wacom from Korea | 字幕&加油添醋 by TY\'s Allen | 感謝heylisa00cavey1001同學熱情提供梗及翻譯',
eb6793ba 695 'uploader': '孫ᄋᄅ',
6271f1ca
PH
696 'title': '[A-made] 變態妍字幕版 太妍 我就是這樣的人',
697 },
06b491eb
S
698 },
699 # url_encoded_fmt_stream_map is empty string
700 {
701 'url': 'qEJwOuvDf7I',
702 'info_dict': {
703 'id': 'qEJwOuvDf7I',
f57b7835 704 'ext': 'webm',
06b491eb
S
705 'title': 'Обсуждение судебной практики по выборам 14 сентября 2014 года в Санкт-Петербурге',
706 'description': '',
707 'upload_date': '20150404',
708 'uploader_id': 'spbelect',
709 'uploader': 'Наблюдатели Петербурга',
710 },
711 'params': {
712 'skip_download': 'requires avconv',
e323cf3f
S
713 },
714 'skip': 'This live event has ended.',
06b491eb 715 },
067aa17e 716 # Extraction from multiple DASH manifests (https://github.com/ytdl-org/youtube-dl/pull/6097)
da77d856
S
717 {
718 'url': 'https://www.youtube.com/watch?v=FIl7x6_3R5Y',
719 'info_dict': {
720 'id': 'FIl7x6_3R5Y',
eb6793ba 721 'ext': 'webm',
da77d856
S
722 'title': 'md5:7b81415841e02ecd4313668cde88737a',
723 'description': 'md5:116377fd2963b81ec4ce64b542173306',
556dbe7f 724 'duration': 220,
da77d856
S
725 'upload_date': '20150625',
726 'uploader_id': 'dorappi2000',
ec85ded8 727 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/dorappi2000',
da77d856 728 'uploader': 'dorappi2000',
eb6793ba 729 'formats': 'mincount:31',
da77d856 730 },
eb6793ba 731 'skip': 'not actual anymore',
2ee8f5d8 732 },
8a1a26ce
YCH
733 # DASH manifest with segment_list
734 {
735 'url': 'https://www.youtube.com/embed/CsmdDsKjzN8',
736 'md5': '8ce563a1d667b599d21064e982ab9e31',
737 'info_dict': {
738 'id': 'CsmdDsKjzN8',
739 'ext': 'mp4',
17ee98e1 740 'upload_date': '20150501', # According to '<meta itemprop="datePublished"', but in other places it's 20150510
8a1a26ce
YCH
741 'uploader': 'Airtek',
742 'description': 'Retransmisión en directo de la XVIII media maratón de Zaragoza.',
743 'uploader_id': 'UCzTzUmjXxxacNnL8I3m4LnQ',
744 'title': 'Retransmisión XVIII Media maratón Zaragoza 2015',
745 },
746 'params': {
747 'youtube_include_dash_manifest': True,
748 'format': '135', # bestvideo
be49068d
S
749 },
750 'skip': 'This live event has ended.',
2ee8f5d8 751 },
cf7e015f
S
752 {
753 # Multifeed videos (multiple cameras), URL is for Main Camera
754 'url': 'https://www.youtube.com/watch?v=jqWvoWXjCVs',
755 'info_dict': {
756 'id': 'jqWvoWXjCVs',
757 'title': 'teamPGP: Rocket League Noob Stream',
758 'description': 'md5:dc7872fb300e143831327f1bae3af010',
759 },
760 'playlist': [{
761 'info_dict': {
762 'id': 'jqWvoWXjCVs',
763 'ext': 'mp4',
764 'title': 'teamPGP: Rocket League Noob Stream (Main Camera)',
765 'description': 'md5:dc7872fb300e143831327f1bae3af010',
556dbe7f 766 'duration': 7335,
cf7e015f
S
767 'upload_date': '20150721',
768 'uploader': 'Beer Games Beer',
769 'uploader_id': 'beergamesbeer',
ec85ded8 770 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/beergamesbeer',
7caf9830 771 'license': 'Standard YouTube License',
cf7e015f
S
772 },
773 }, {
774 'info_dict': {
775 'id': '6h8e8xoXJzg',
776 'ext': 'mp4',
777 'title': 'teamPGP: Rocket League Noob Stream (kreestuh)',
778 'description': 'md5:dc7872fb300e143831327f1bae3af010',
556dbe7f 779 'duration': 7337,
cf7e015f
S
780 'upload_date': '20150721',
781 'uploader': 'Beer Games Beer',
782 'uploader_id': 'beergamesbeer',
ec85ded8 783 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/beergamesbeer',
7caf9830 784 'license': 'Standard YouTube License',
cf7e015f
S
785 },
786 }, {
787 'info_dict': {
788 'id': 'PUOgX5z9xZw',
789 'ext': 'mp4',
790 'title': 'teamPGP: Rocket League Noob Stream (grizzle)',
791 'description': 'md5:dc7872fb300e143831327f1bae3af010',
556dbe7f 792 'duration': 7337,
cf7e015f
S
793 'upload_date': '20150721',
794 'uploader': 'Beer Games Beer',
795 'uploader_id': 'beergamesbeer',
ec85ded8 796 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/beergamesbeer',
7caf9830 797 'license': 'Standard YouTube License',
cf7e015f
S
798 },
799 }, {
800 'info_dict': {
801 'id': 'teuwxikvS5k',
802 'ext': 'mp4',
803 'title': 'teamPGP: Rocket League Noob Stream (zim)',
804 'description': 'md5:dc7872fb300e143831327f1bae3af010',
556dbe7f 805 'duration': 7334,
cf7e015f
S
806 'upload_date': '20150721',
807 'uploader': 'Beer Games Beer',
808 'uploader_id': 'beergamesbeer',
ec85ded8 809 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/beergamesbeer',
7caf9830 810 'license': 'Standard YouTube License',
cf7e015f
S
811 },
812 }],
813 'params': {
814 'skip_download': True,
815 },
4fe54c12 816 'skip': 'This video is not available.',
cbaed4bb 817 },
f9f49d87 818 {
067aa17e 819 # Multifeed video with comma in title (see https://github.com/ytdl-org/youtube-dl/issues/8536)
f9f49d87
S
820 'url': 'https://www.youtube.com/watch?v=gVfLd0zydlo',
821 'info_dict': {
822 'id': 'gVfLd0zydlo',
823 'title': 'DevConf.cz 2016 Day 2 Workshops 1 14:00 - 15:30',
824 },
825 'playlist_count': 2,
be49068d 826 'skip': 'Not multifeed anymore',
f9f49d87 827 },
cbaed4bb 828 {
2d3d2997 829 'url': 'https://vid.plus/FlRa-iH7PGw',
cbaed4bb 830 'only_matching': True,
0e49d9a6 831 },
6d4fc66b 832 {
2d3d2997 833 'url': 'https://zwearz.com/watch/9lWxNJF-ufM/electra-woman-dyna-girl-official-trailer-grace-helbig.html',
6d4fc66b
S
834 'only_matching': True,
835 },
0e49d9a6 836 {
067aa17e 837 # Title with JS-like syntax "};" (see https://github.com/ytdl-org/youtube-dl/issues/7468)
a8776b10 838 # Also tests cut-off URL expansion in video description (see
067aa17e
S
839 # https://github.com/ytdl-org/youtube-dl/issues/1892,
840 # https://github.com/ytdl-org/youtube-dl/issues/8164)
0e49d9a6
LL
841 'url': 'https://www.youtube.com/watch?v=lsguqyKfVQg',
842 'info_dict': {
843 'id': 'lsguqyKfVQg',
844 'ext': 'mp4',
845 'title': '{dark walk}; Loki/AC/Dishonored; collab w/Elflover21',
eb6793ba 846 'alt_title': 'Dark Walk - Position Music',
0e49d9a6 847 'description': 'md5:8085699c11dc3f597ce0410b0dcbb34a',
556dbe7f 848 'duration': 133,
0e49d9a6
LL
849 'upload_date': '20151119',
850 'uploader_id': 'IronSoulElf',
ec85ded8 851 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/IronSoulElf',
0e49d9a6 852 'uploader': 'IronSoulElf',
eb6793ba
S
853 'creator': 'Todd Haberman, Daniel Law Heath and Aaron Kaplan',
854 'track': 'Dark Walk - Position Music',
855 'artist': 'Todd Haberman, Daniel Law Heath and Aaron Kaplan',
92bc97d3 856 'album': 'Position Music - Production Music Vol. 143 - Dark Walk',
0e49d9a6
LL
857 },
858 'params': {
859 'skip_download': True,
860 },
861 },
61f92af1 862 {
067aa17e 863 # Tags with '};' (see https://github.com/ytdl-org/youtube-dl/issues/7468)
61f92af1
S
864 'url': 'https://www.youtube.com/watch?v=Ms7iBXnlUO8',
865 'only_matching': True,
866 },
313dfc45
LL
867 {
868 # Video with yt:stretch=17:0
869 'url': 'https://www.youtube.com/watch?v=Q39EVAstoRM',
870 'info_dict': {
871 'id': 'Q39EVAstoRM',
872 'ext': 'mp4',
873 'title': 'Clash Of Clans#14 Dicas De Ataque Para CV 4',
874 'description': 'md5:ee18a25c350637c8faff806845bddee9',
875 'upload_date': '20151107',
876 'uploader_id': 'UCCr7TALkRbo3EtFzETQF1LA',
877 'uploader': 'CH GAMER DROID',
878 },
879 'params': {
880 'skip_download': True,
881 },
be49068d 882 'skip': 'This video does not exist.',
313dfc45 883 },
7caf9830
S
884 {
885 # Video licensed under Creative Commons
886 'url': 'https://www.youtube.com/watch?v=M4gD1WSo5mA',
887 'info_dict': {
888 'id': 'M4gD1WSo5mA',
889 'ext': 'mp4',
890 'title': 'md5:e41008789470fc2533a3252216f1c1d1',
891 'description': 'md5:a677553cf0840649b731a3024aeff4cc',
556dbe7f 892 'duration': 721,
7caf9830
S
893 'upload_date': '20150127',
894 'uploader_id': 'BerkmanCenter',
ec85ded8 895 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/BerkmanCenter',
556dbe7f 896 'uploader': 'The Berkman Klein Center for Internet & Society',
7caf9830
S
897 'license': 'Creative Commons Attribution license (reuse allowed)',
898 },
899 'params': {
900 'skip_download': True,
901 },
902 },
fd050249
S
903 {
904 # Channel-like uploader_url
905 'url': 'https://www.youtube.com/watch?v=eQcmzGIKrzg',
906 'info_dict': {
907 'id': 'eQcmzGIKrzg',
908 'ext': 'mp4',
909 'title': 'Democratic Socialism and Foreign Policy | Bernie Sanders',
910 'description': 'md5:dda0d780d5a6e120758d1711d062a867',
556dbe7f 911 'duration': 4060,
fd050249 912 'upload_date': '20151119',
eb6793ba 913 'uploader': 'Bernie Sanders',
fd050249 914 'uploader_id': 'UCH1dpzjCEiGAt8CXkryhkZg',
ec85ded8 915 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UCH1dpzjCEiGAt8CXkryhkZg',
fd050249
S
916 'license': 'Creative Commons Attribution license (reuse allowed)',
917 },
918 'params': {
919 'skip_download': True,
920 },
921 },
040ac686
S
922 {
923 'url': 'https://www.youtube.com/watch?feature=player_embedded&amp;amp;v=V36LpHqtcDY',
924 'only_matching': True,
7f29cf54
S
925 },
926 {
067aa17e 927 # YouTube Red paid video (https://github.com/ytdl-org/youtube-dl/issues/10059)
7f29cf54
S
928 'url': 'https://www.youtube.com/watch?v=i1Ko8UG-Tdo',
929 'only_matching': True,
6496ccb4
S
930 },
931 {
932 # Rental video preview
933 'url': 'https://www.youtube.com/watch?v=yYr8q0y5Jfg',
934 'info_dict': {
935 'id': 'uGpuVWrhIzE',
936 'ext': 'mp4',
937 'title': 'Piku - Trailer',
938 'description': 'md5:c36bd60c3fd6f1954086c083c72092eb',
939 'upload_date': '20150811',
940 'uploader': 'FlixMatrix',
941 'uploader_id': 'FlixMatrixKaravan',
ec85ded8 942 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/FlixMatrixKaravan',
6496ccb4
S
943 'license': 'Standard YouTube License',
944 },
945 'params': {
946 'skip_download': True,
947 },
eb6793ba 948 'skip': 'This video is not available.',
022a5d66 949 },
12afdc2a
S
950 {
951 # YouTube Red video with episode data
952 'url': 'https://www.youtube.com/watch?v=iqKdEhx-dD4',
953 'info_dict': {
954 'id': 'iqKdEhx-dD4',
955 'ext': 'mp4',
956 'title': 'Isolation - Mind Field (Ep 1)',
4fe54c12 957 'description': 'md5:46a29be4ceffa65b92d277b93f463c0f',
556dbe7f 958 'duration': 2085,
12afdc2a
S
959 'upload_date': '20170118',
960 'uploader': 'Vsauce',
961 'uploader_id': 'Vsauce',
962 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/Vsauce',
12afdc2a
S
963 'series': 'Mind Field',
964 'season_number': 1,
965 'episode_number': 1,
966 },
967 'params': {
968 'skip_download': True,
969 },
970 'expected_warnings': [
971 'Skipping DASH manifest',
972 ],
973 },
c7121fa7
S
974 {
975 # The following content has been identified by the YouTube community
976 # as inappropriate or offensive to some audiences.
977 'url': 'https://www.youtube.com/watch?v=6SJNVb0GnPI',
978 'info_dict': {
979 'id': '6SJNVb0GnPI',
980 'ext': 'mp4',
981 'title': 'Race Differences in Intelligence',
982 'description': 'md5:5d161533167390427a1f8ee89a1fc6f1',
983 'duration': 965,
984 'upload_date': '20140124',
985 'uploader': 'New Century Foundation',
986 'uploader_id': 'UCEJYpZGqgUob0zVVEaLhvVg',
987 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UCEJYpZGqgUob0zVVEaLhvVg',
c7121fa7
S
988 },
989 'params': {
990 'skip_download': True,
991 },
992 },
022a5d66
S
993 {
994 # itag 212
995 'url': '1t24XAntNCY',
996 'only_matching': True,
fd5c4aab
S
997 },
998 {
999 # geo restricted to JP
1000 'url': 'sJL6WA-aGkQ',
1001 'only_matching': True,
1002 },
cd5a74a2
S
1003 {
1004 'url': 'https://invidio.us/watch?v=BaW_jenozKc',
1005 'only_matching': True,
1006 },
825cd268
RA
1007 {
1008 # DRM protected
1009 'url': 'https://www.youtube.com/watch?v=s7_qI6_mIXc',
1010 'only_matching': True,
4fe54c12
S
1011 },
1012 {
1013 # Video with unsupported adaptive stream type formats
1014 'url': 'https://www.youtube.com/watch?v=Z4Vy8R84T1U',
1015 'info_dict': {
1016 'id': 'Z4Vy8R84T1U',
1017 'ext': 'mp4',
1018 'title': 'saman SMAN 53 Jakarta(Sancety) opening COFFEE4th at SMAN 53 Jakarta',
1019 'description': 'md5:d41d8cd98f00b204e9800998ecf8427e',
1020 'duration': 433,
1021 'upload_date': '20130923',
1022 'uploader': 'Amelia Putri Harwita',
1023 'uploader_id': 'UCpOxM49HJxmC1qCalXyB3_Q',
1024 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UCpOxM49HJxmC1qCalXyB3_Q',
1025 'formats': 'maxcount:10',
1026 },
1027 'params': {
1028 'skip_download': True,
1029 'youtube_include_dash_manifest': False,
1030 },
5429d6a9 1031 'skip': 'not actual anymore',
5caabd3c 1032 },
1033 {
822b9d9c 1034 # Youtube Music Auto-generated description
5caabd3c 1035 'url': 'https://music.youtube.com/watch?v=MgNrAu2pzNs',
1036 'info_dict': {
1037 'id': 'MgNrAu2pzNs',
1038 'ext': 'mp4',
1039 'title': 'Voyeur Girl',
1040 'description': 'md5:7ae382a65843d6df2685993e90a8628f',
1041 'upload_date': '20190312',
5429d6a9
S
1042 'uploader': 'Stephen - Topic',
1043 'uploader_id': 'UC-pWHpBjdGG69N9mM2auIAA',
5caabd3c 1044 'artist': 'Stephen',
1045 'track': 'Voyeur Girl',
1046 'album': 'it\'s too much love to know my dear',
1047 'release_date': '20190313',
1048 'release_year': 2019,
1049 },
1050 'params': {
1051 'skip_download': True,
1052 },
1053 },
66b48727
RA
1054 {
1055 'url': 'https://www.youtubekids.com/watch?v=3b8nCWDgZ6Q',
1056 'only_matching': True,
1057 },
011e75e6
S
1058 {
1059 # invalid -> valid video id redirection
1060 'url': 'DJztXj2GPfl',
1061 'info_dict': {
1062 'id': 'DJztXj2GPfk',
1063 'ext': 'mp4',
1064 'title': 'Panjabi MC - Mundian To Bach Ke (The Dictator Soundtrack)',
1065 'description': 'md5:bf577a41da97918e94fa9798d9228825',
1066 'upload_date': '20090125',
1067 'uploader': 'Prochorowka',
1068 'uploader_id': 'Prochorowka',
1069 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/Prochorowka',
1070 'artist': 'Panjabi MC',
1071 'track': 'Beware of the Boys (Mundian to Bach Ke) - Motivo Hi-Lectro Remix',
1072 'album': 'Beware of the Boys (Mundian To Bach Ke)',
1073 },
1074 'params': {
1075 'skip_download': True,
1076 },
ea74e00b
DP
1077 },
1078 {
1079 # empty description results in an empty string
1080 'url': 'https://www.youtube.com/watch?v=x41yOUIvK2k',
1081 'info_dict': {
1082 'id': 'x41yOUIvK2k',
1083 'ext': 'mp4',
1084 'title': 'IMG 3456',
1085 'description': '',
1086 'upload_date': '20170613',
1087 'uploader_id': 'ElevageOrVert',
1088 'uploader': 'ElevageOrVert',
1089 },
1090 'params': {
1091 'skip_download': True,
1092 },
1093 },
a0566bbf 1094 {
1095 # with '};' inside yt initial data (see https://github.com/ytdl-org/youtube-dl/issues/27093)
1096 'url': 'https://www.youtube.com/watch?v=CHqg6qOn4no',
1097 'info_dict': {
1098 'id': 'CHqg6qOn4no',
1099 'ext': 'mp4',
1100 'title': 'Part 77 Sort a list of simple types in c#',
1101 'description': 'md5:b8746fa52e10cdbf47997903f13b20dc',
1102 'upload_date': '20130831',
1103 'uploader_id': 'kudvenkat',
1104 'uploader': 'kudvenkat',
1105 },
1106 'params': {
1107 'skip_download': True,
1108 },
1109 },
2eb88d95
PH
1110 ]
1111
e0df6211
PH
def __init__(self, *args, **kwargs):
    """Initialise the extractor with an empty signature-function cache."""
    super(YoutubeIE, self).__init__(*args, **kwargs)
    # Filled lazily by _decrypt_signature; keys are
    # (player URL, signature layout id) tuples.
    self._player_cache = dict()
e0df6211 1115
c5e8d7af
PH
def report_video_info_webpage_download(self, video_id):
    """Tell the user that the video info webpage is being downloaded."""
    message = '%s: Downloading video info webpage' % video_id
    self.to_screen(message)
c5e8d7af 1119
c5e8d7af
PH
def report_information_extraction(self, video_id):
    """Tell the user that video information is being extracted."""
    message = '%s: Extracting video information' % video_id
    self.to_screen(message)
c5e8d7af
PH
1123
def report_unavailable_format(self, video_id, format):
    """Tell the user that the requested format is not available for the video."""
    details = (video_id, format)
    self.to_screen('%s: Format %s not available' % details)
c5e8d7af
PH
1127
def report_rtmp_download(self):
    """Tell the user that the download will go over the RTMP protocol."""
    message = 'RTMP download detected'
    self.to_screen(message)
c5e8d7af 1131
60064c53
PH
def _signature_cache_id(self, example_sig):
    """Describe a signature by the lengths of its dot-separated parts.

    The part lengths are joined with '.', so 'abc.de' yields '3.2'.
    """
    part_lengths = [compat_str(len(chunk)) for chunk in example_sig.split('.')]
    return '.'.join(part_lengths)
60064c53 1135
e40c758c
S
@classmethod
def _extract_player_info(cls, player_url):
    """Return the (ext, id) pair identifying the player at player_url.

    The patterns in cls._PLAYER_INFO_RE are tried in order and the first
    one that matches wins.  Raises ExtractorError when none matches.
    """
    for pattern in cls._PLAYER_INFO_RE:
        found = re.search(pattern, player_url)
        if found:
            return found.group('ext'), found.group('id')
    raise ExtractorError('Cannot identify player %r' % player_url)
1145
def _extract_signature_function(self, video_id, player_url, example_sig):
    """Return a callable that deciphers signatures shaped like example_sig.

    The downloader cache (section 'youtube-sigfuncs') is consulted first.
    A cached entry is a list of indices: the deciphered signature is built
    by picking those positions out of the scrambled one.  On a cache miss
    the player is downloaded and parsed (JS or SWF), the resulting function
    is probed to derive that index list, and the list is stored.

    Raises ExtractorError if the cache id is not a plain file name or the
    player type is neither 'js' nor 'swf'.
    """
    player_type, player_id = self._extract_player_info(player_url)

    # Read from filesystem cache
    func_id = '%s_%s_%s' % (
        player_type, player_id, self._signature_cache_id(example_sig))
    # Explicit check rather than assert: asserts are stripped under -O and
    # this value is handed to the cache as a key.
    if os.path.basename(func_id) != func_id:
        raise ExtractorError(
            'Invalid signature function cache id %r' % func_id)

    cache_spec = self._downloader.cache.load('youtube-sigfuncs', func_id)
    if cache_spec is not None:
        return lambda s: ''.join(s[i] for i in cache_spec)

    download_note = (
        'Downloading player %s' % player_url
        if self._downloader.params.get('verbose') else
        'Downloading %s player %s' % (player_type, player_id)
    )
    if player_type == 'js':
        code = self._download_webpage(
            player_url, video_id,
            note=download_note,
            errnote='Download of %s failed' % player_url)
        res = self._parse_sig_js(code)
    elif player_type == 'swf':
        urlh = self._request_webpage(
            player_url, video_id,
            note=download_note,
            errnote='Download of %s failed' % player_url)
        code = urlh.read()
        res = self._parse_sig_swf(code)
    else:
        # Previously `assert False`; with -O that fell through to an
        # unbound `res` below.
        raise ExtractorError('Invalid player type %r' % player_type)

    # Feed a string of distinct characters (code points 0..n-1) through the
    # function; the code points of the output are the index permutation.
    test_string = ''.join(map(compat_chr, range(len(example_sig))))
    cache_res = res(test_string)
    cache_spec = [ord(c) for c in cache_res]

    self._downloader.cache.store('youtube-sigfuncs', func_id, cache_spec)
    return res
1185
def _print_sig_code(self, func, example_sig):
    """Print Python source equivalent to the signature function func.

    func is probed with a string of distinct characters as long as
    example_sig; the resulting index permutation is rendered as a sum of
    's[...]' index and slice expressions and shown via to_screen.
    """
    def gen_sig_code(idxs):
        def _genslice(start, end, step):
            if start == 0:
                starts = ''
            else:
                starts = str(start)
            stop = end + step
            ends = (':%d' % stop) if stop >= 0 else ':'
            steps = '' if step == 1 else (':%d' % step)
            return 's[%s%s%s]' % (starts, ends, steps)

        step = None
        # Placeholder only to keep pyflakes quiet; start is always assigned
        # together with step before it is read.
        start = '(Never used)'
        for i, prev in zip(idxs[1:], idxs[:-1]):
            delta = i - prev
            if step is not None:
                # Inside a run: keep going while the stride is unchanged,
                # otherwise emit the finished slice.
                if delta != step:
                    yield _genslice(start, prev, step)
                    step = None
                continue
            if delta in (-1, 1):
                step, start = delta, prev
            else:
                yield 's[%d]' % prev
        if step is None:
            yield 's[%d]' % i
        else:
            yield _genslice(start, i, step)

    probe = ''.join(map(compat_chr, range(len(example_sig))))
    permutation = [ord(ch) for ch in func(probe)]
    expr_code = ' + '.join(gen_sig_code(permutation))
    part_lengths = ', '.join(
        compat_str(len(p)) for p in example_sig.split('.'))
    signature_id_tuple = '(%s)' % part_lengths
    code = ('if tuple(len(p) for p in s.split(\'.\')) == %s:\n'
            ' return %s\n') % (signature_id_tuple, expr_code)
    self.to_screen('Extracted signature function:\n' + code)
edf3e38e 1224
e0df6211
PH
def _parse_sig_js(self, jscode):
    """Return a callable that deciphers a signature using the JS player code.

    The name of the signature function is located in jscode with the
    patterns below (tried in order), the function is extracted with
    JSInterpreter, and the returned callable passes it the signature as
    its single argument.
    """
    funcname = self._search_regex(
        (r'\b[cs]\s*&&\s*[adf]\.set\([^,]+\s*,\s*encodeURIComponent\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\(',
         r'\b[a-zA-Z0-9]+\s*&&\s*[a-zA-Z0-9]+\.set\([^,]+\s*,\s*encodeURIComponent\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\(',
         r'(?:\b|[^a-zA-Z0-9$])(?P<sig>[a-zA-Z0-9$]{2})\s*=\s*function\(\s*a\s*\)\s*{\s*a\s*=\s*a\.split\(\s*""\s*\)',
         r'(?P<sig>[a-zA-Z0-9$]+)\s*=\s*function\(\s*a\s*\)\s*{\s*a\s*=\s*a\.split\(\s*""\s*\)',
         # Obsolete patterns
         r'(["\'])signature\1\s*,\s*(?P<sig>[a-zA-Z0-9$]+)\(',
         r'\.sig\|\|(?P<sig>[a-zA-Z0-9$]+)\(',
         r'yt\.akamaized\.net/\)\s*\|\|\s*.*?\s*[cs]\s*&&\s*[adf]\.set\([^,]+\s*,\s*(?:encodeURIComponent\s*\()?\s*(?P<sig>[a-zA-Z0-9$]+)\(',
         r'\b[cs]\s*&&\s*[adf]\.set\([^,]+\s*,\s*(?P<sig>[a-zA-Z0-9$]+)\(',
         r'\b[a-zA-Z0-9]+\s*&&\s*[a-zA-Z0-9]+\.set\([^,]+\s*,\s*(?P<sig>[a-zA-Z0-9$]+)\(',
         r'\bc\s*&&\s*a\.set\([^,]+\s*,\s*\([^)]*\)\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\(',
         # This pattern used to be listed twice in a row; the second copy
         # could never match when the first had not, so it was dropped.
         r'\bc\s*&&\s*[a-zA-Z0-9]+\.set\([^,]+\s*,\s*\([^)]*\)\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\('),
        jscode, 'Initial JS player signature function name', group='sig')

    jsi = JSInterpreter(jscode)
    initial_function = jsi.extract_function(funcname)
    return lambda s: initial_function([s])
1245
def _parse_sig_swf(self, file_contents):
    """Return a callable that deciphers a signature using the SWF player.

    The 'decipher' function of the 'SignatureDecipher' class is extracted
    with SWFInterpreter; the returned callable passes it the signature as
    its single argument.
    """
    interpreter = SWFInterpreter(file_contents)
    decipher_class = interpreter.extract_class('SignatureDecipher')
    decipher = interpreter.extract_function(decipher_class, 'decipher')

    def run_decipher(s):
        return decipher([s])

    return run_decipher
1252
def _decrypt_signature(self, s, video_id, player_url, age_gate=False):
    """Turn the encrypted s field into a working signature.

    player_url may be protocol-relative or site-relative; it is made
    absolute first.  The deciphering function is cached per
    (player URL, signature layout) in self._player_cache.  Any failure is
    re-raised as ExtractorError carrying the traceback text.
    age_gate is accepted but not used here.
    """
    if player_url is None:
        raise ExtractorError('Cannot decrypt signature without player_url')

    if player_url.startswith('//'):
        player_url = 'https:' + player_url
    elif not re.match(r'https?://', player_url):
        player_url = compat_urlparse.urljoin(
            'https://www.youtube.com', player_url)

    try:
        cache_key = (player_url, self._signature_cache_id(s))
        if cache_key not in self._player_cache:
            self._player_cache[cache_key] = self._extract_signature_function(
                video_id, player_url, s)
        decipher = self._player_cache[cache_key]
        if self._downloader.params.get('youtube_print_sig_code'):
            self._print_sig_code(decipher, s)
        return decipher(s)
    except Exception as e:
        raise ExtractorError(
            'Signature extraction failed: ' + traceback.format_exc(), cause=e)
e0df6211 1279
def _get_subtitles(self, video_id, webpage, has_live_chat_replay):
    """Return a dict mapping language code to a list of subtitle formats.

    The track list is fetched from the timedtext listing endpoint; only
    the first track seen for each language is kept, with one entry per
    format in self._SUBTITLE_FORMATS.  When has_live_chat_replay is
    truthy a 'live_chat' pseudo-language entry is added.  Returns {} (after
    a warning) if the listing cannot be downloaded or nothing was found.
    webpage is not used here.
    """
    try:
        subs_doc = self._download_xml(
            'https://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id,
            video_id, note=False)
    except ExtractorError as err:
        self._downloader.report_warning('unable to download video subtitles: %s' % error_to_compat_str(err))
        return {}

    subtitles = {}
    for track in subs_doc.findall('track'):
        lang = track.attrib['lang_code']
        if lang in subtitles:
            continue
        subtitles[lang] = [{
            'url': 'https://www.youtube.com/api/timedtext?' + compat_urllib_parse_urlencode({
                'lang': lang,
                'v': video_id,
                'fmt': ext,
                'name': track.attrib['name'].encode('utf-8'),
            }),
            'ext': ext,
        } for ext in self._SUBTITLE_FORMATS]

    if has_live_chat_replay:
        live_chat_entry = {
            'video_id': video_id,
            'ext': 'json',
            'protocol': 'youtube_live_chat_replay',
        }
        subtitles['live_chat'] = [live_chat_entry]

    if subtitles:
        return subtitles
    self._downloader.report_warning('video doesn\'t have subtitles')
    return {}
1319
a72778d3
S
def _get_ytplayer_config(self, video_id, webpage):
    """Return the parsed ytplayer.config object from webpage, or None.

    None is returned when no config assignment is found; JSON parsing is
    non-fatal.
    """
    # User-supplied data inside the config can contain sequences such as
    # '};' that cut the looser (second) regex short, so the more specific
    # pattern is tried first.  This is a workaround until proper
    # quoted-string handling is implemented (see
    # https://github.com/ytdl-org/youtube-dl/issues/7468,
    # https://github.com/ytdl-org/youtube-dl/pull/7599)
    raw_config = self._search_regex(
        (r';ytplayer\.config\s*=\s*({.+?});ytplayer',
         r';ytplayer\.config\s*=\s*({.+?});'),
        webpage, 'ytplayer.config', default=None)
    if not raw_config:
        return None
    return self._parse_json(
        uppercase_escape(raw_config), video_id, fatal=False)
0e49d9a6 1337
def _get_music_metadata_from_yt_initial(self, yt_initial):
    """Collect music track metadata from the yt initial data.

    Walks the metadata rows of every videoSecondaryInfoRenderer under
    contents.twoColumnWatchNextResults.results.results.contents and maps
    the row titles 'Album', 'Artist' and 'Song' to the keys 'album',
    'artist' and 'track'.  A row whose key is already present in the
    current track starts a new track.

    Returns a list of dicts (possibly empty), one per track.
    """
    key_map = {
        'Album': 'album',
        'Artist': 'artist',
        'Song': 'track'
    }
    music_metadata = []
    contents = try_get(
        yt_initial,
        lambda x: x['contents']['twoColumnWatchNextResults']['results']['results']['contents'],
        list) or []
    for content in contents:
        # try_get returns None for any non-dict level along the path, which
        # covers the per-level type checks.
        rows = try_get(
            content,
            lambda x: x['videoSecondaryInfoRenderer']['metadataRowContainer']['metadataRowContainerRenderer']['rows'],
            list)
        if not rows:
            continue
        music_track = {}
        for row in rows:
            renderer = try_get(row, lambda x: x['metadataRowRenderer'], dict)
            if not renderer:
                continue
            key = try_get(renderer, lambda x: x['title']['simpleText'])
            value = (
                try_get(renderer, lambda x: x['contents'][0]['simpleText'])
                or try_get(renderer, lambda x: x['contents'][0]['runs'][0]['text']))
            # compat_str rather than str: on Python 2 parsed JSON strings are
            # unicode, so an exact `str` type check rejected every row.
            if not isinstance(key, compat_str) or not isinstance(value, compat_str):
                continue
            field = key_map.get(key)
            if field is None:
                continue
            if field in music_track:
                # we've started on a new track
                music_metadata.append(music_track)
                music_track = {}
            music_track[field] = value
        if music_track:
            music_metadata.append(music_track)
    return music_metadata
1375
def _get_automatic_captions(self, video_id, webpage):
    """Return a dict mapping language code to automatic caption formats.

    The webpage is passed in so the player config can be read from it
    without another download.  Three sources are tried in order: the
    'ttsurl' player argument, the 'player_response' JSON, and the
    'caption_tracks' / 'caption_translation_languages' arguments.
    Returns {} after a warning when nothing can be found.
    """
    self.to_screen('%s: Looking for automatic captions' % video_id)
    player_config = self._get_ytplayer_config(video_id, webpage)
    err_msg = 'Couldn\'t find automatic captions for %s' % video_id
    if not player_config:
        self._downloader.report_warning(err_msg)
        return {}

    def make_captions(sub_url, sub_langs):
        # Rewrite the query of sub_url once per (language, format) pair.
        parsed_sub_url = compat_urllib_parse_urlparse(sub_url)
        caption_qs = compat_parse_qs(parsed_sub_url.query)
        captions = {}
        for sub_lang in sub_langs:
            formats = []
            for ext in self._SUBTITLE_FORMATS:
                caption_qs.update({
                    'tlang': [sub_lang],
                    'fmt': [ext],
                })
                new_query = compat_urllib_parse_urlencode(caption_qs, True)
                formats.append({
                    'url': compat_urlparse.urlunparse(
                        parsed_sub_url._replace(query=new_query)),
                    'ext': ext,
                })
            captions[sub_lang] = formats
        return captions

    try:
        args = player_config['args']
        caption_url = args.get('ttsurl')
        if caption_url:
            timestamp = args['timestamp']
            # We get the available subtitles
            list_url = caption_url + '&' + compat_urllib_parse_urlencode({
                'type': 'list',
                'tlangs': 1,
                'asrs': 1,
            })
            caption_list = self._download_xml(list_url, video_id)
            original_lang_node = caption_list.find('track')
            if original_lang_node is None:
                self._downloader.report_warning('Video doesn\'t have automatic captions')
                return {}
            original_lang = original_lang_node.attrib['lang_code']
            caption_kind = original_lang_node.attrib.get('kind', '')

            translated = {}
            for target_node in caption_list.findall('target'):
                target_lang = target_node.attrib['lang_code']
                target_formats = []
                for ext in self._SUBTITLE_FORMATS:
                    query = compat_urllib_parse_urlencode({
                        'lang': original_lang,
                        'tlang': target_lang,
                        'fmt': ext,
                        'ts': timestamp,
                        'kind': caption_kind,
                    })
                    target_formats.append({
                        'url': caption_url + '&' + query,
                        'ext': ext,
                    })
                translated[target_lang] = target_formats
            return translated

        # New captions format as of 22.06.2017
        player_response = args.get('player_response')
        if player_response and isinstance(player_response, compat_str):
            player_response = self._parse_json(
                player_response, video_id, fatal=False)
            if player_response:
                renderer = player_response['captions']['playerCaptionsTracklistRenderer']
                base_url = renderer['captionTracks'][0]['baseUrl']
                lang_codes = []
                for lang in renderer['translationLanguages']:
                    code = lang.get('languageCode')
                    if code:
                        lang_codes.append(code)
                return make_captions(base_url, lang_codes)

        # Some videos don't provide ttsurl but rather caption_tracks and
        # caption_translation_languages (e.g. 20LmZk1hakA)
        # Not used anymore as of 22.06.2017
        caption_tracks = args['caption_tracks']
        caption_translation_languages = args['caption_translation_languages']
        caption_url = compat_parse_qs(caption_tracks.split(',')[0])['u'][0]
        lang_codes = []
        for lang in caption_translation_languages.split(','):
            lang_qs = compat_parse_qs(compat_urllib_parse_unquote_plus(lang))
            code = lang_qs.get('lc', [None])[0]
            if code:
                lang_codes.append(code)
        return make_captions(caption_url, lang_codes)
    # An extractor error can be raised by the download process if there are
    # no automatic captions but there are subtitles
    except (KeyError, IndexError, ExtractorError):
        self._downloader.report_warning(err_msg)
        return {}
1477
21c340b8
S
def _mark_watched(self, video_id, video_info, player_response):
    """Request the playback-tracking URL so the video gets marked as watched.

    The URL is taken from player_response, falling back to video_info;
    nothing happens when neither provides a valid one.  The request is
    non-fatal.
    """
    tracking_url = try_get(
        player_response,
        lambda x: x['playbackTracking']['videostatsPlaybackUrl']['baseUrl'])
    if not tracking_url:
        tracking_url = try_get(
            video_info, lambda x: x['videostats_playback_base_url'][0])
    playback_url = url_or_none(tracking_url)
    if not playback_url:
        return

    parsed_playback_url = compat_urlparse.urlparse(playback_url)
    qs = compat_urlparse.parse_qs(parsed_playback_url.query)

    # cpn generation algorithm is reverse engineered from base.js.
    # In fact it works even with dummy cpn.
    CPN_ALPHABET = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789-_'
    cpn_chars = [
        CPN_ALPHABET[random.randint(0, 256) & 63] for _ in range(0, 16)]
    cpn = ''.join(cpn_chars)

    qs['ver'] = ['2']
    qs['cpn'] = [cpn]
    new_query = compat_urllib_parse_urlencode(qs, True)
    playback_url = compat_urlparse.urlunparse(
        parsed_playback_url._replace(query=new_query))

    self._download_webpage(
        playback_url, video_id, 'Marking watched',
        'Unable to mark watched', fatal=False)
1503
66c9fa36
S
@staticmethod
def _extract_urls(webpage):
    """Return the YouTube embeds found in webpage, in discovery order.

    Three kinds of embed are recognised: player URLs in iframe / embed /
    object / SWFObject markup (HTML-unescaped URLs), lazyYT elements and
    the Wordpress "YouTube Video Importer" plugin (both yield the value of
    their video id attribute).  Returns a list, empty when nothing matches.
    """
    # Embedded YouTube player
    # The embedSWF branch used to read `embedSWF\(?:\s*`: the escaped paren
    # made `?:` mean "optional '(' then a literal ':'", so the ordinary
    # call form `embedSWF("url"` never matched.  It now accepts `(`, while
    # still accepting the `:` / `(:` forms the old pattern matched.
    entries = [
        unescapeHTML(mobj.group('url'))
        for mobj in re.finditer(r'''(?x)
        (?:
            <iframe[^>]+?src=|
            data-video-url=|
            <embed[^>]+?src=|
            embedSWF(?:\(:?|:)\s*|
            <object[^>]+data=|
            new\s+SWFObject\(
        )
        (["\'])
            (?P<url>(?:https?:)?//(?:www\.)?youtube(?:-nocookie)?\.com/
            (?:embed|v|p)/[0-9A-Za-z_-]{11}.*?)
        \1''', webpage)]

    # lazyYT YouTube embed
    entries.extend(
        unescapeHTML(youtube_id)
        for youtube_id in re.findall(
            r'class="lazyYT" data-youtube-id="([^"]+)"', webpage))

    # Wordpress "YouTube Video Importer" plugin
    matches = re.findall(r'''(?x)<div[^>]+
        class=(?P<q1>[\'"])[^\'"]*\byvii_single_video_player\b[^\'"]*(?P=q1)[^>]+
        data-video_id=(?P<q2>[\'"])([^\'"]+)(?P=q2)''', webpage)
    # Each match is a (q1, q2, video_id) tuple; only the id is wanted.
    entries.extend(m[-1] for m in matches)

    return entries
1535
@staticmethod
def _extract_url(webpage):
    """Return the first entry of YoutubeIE._extract_urls(webpage), or None."""
    found = YoutubeIE._extract_urls(webpage)
    if not found:
        return None
    return found[0]
1540
97665381
PH
@classmethod
def extract_id(cls, url):
    """Return group 2 of ``cls._VALID_URL`` matched (verbose mode) against url.

    Raises:
        ExtractorError: if ``url`` does not match ``cls._VALID_URL``.
    """
    match = re.match(cls._VALID_URL, url, re.VERBOSE)
    if match is not None:
        return match.group(2)
    raise ExtractorError('Invalid URL: %s' % url)
1548
84213ea8
S
def _extract_chapters_from_json(self, webpage, video_id, duration):
    """Build a chapter list from the initial-data JSON found in ``webpage``.

    Returns None when there is no webpage, no dict-shaped initial data or
    no chapter list in it. Otherwise returns a list of dicts with
    ``start_time``, ``end_time`` (seconds) and ``title``. A chapter ends
    where the next one starts; the last one ends at ``duration``. Entries
    whose start or end cannot be determined are left out.
    """
    if not webpage:
        return
    data = self._extract_yt_initial_data(video_id, webpage)
    if not (data and isinstance(data, dict)):
        return
    chapters_list = try_get(
        data,
        lambda x: x['playerOverlays']
                   ['playerOverlayRenderer']
                   ['decoratedPlayerBarRenderer']
                   ['decoratedPlayerBarRenderer']
                   ['playerBar']
                   ['chapteredPlayerBarRenderer']
                   ['chapters'],
        list)
    if not chapters_list:
        return

    def start_seconds(entry):
        # timeRangeStartMillis is in milliseconds; scale it down to seconds.
        millis = try_get(
            entry,
            lambda x: x['chapterRenderer']['timeRangeStartMillis'],
            int)
        return float_or_none(millis, scale=1000)

    last_index = len(chapters_list) - 1
    chapters = []
    for index, entry in enumerate(chapters_list):
        start_time = start_seconds(entry)
        if start_time is None:
            continue
        if index < last_index:
            end_time = start_seconds(chapters_list[index + 1])
        else:
            end_time = duration
        if end_time is None:
            continue
        chapters.append({
            'start_time': start_time,
            'end_time': end_time,
            'title': try_get(
                entry, lambda x: x['chapterRenderer']['title']['simpleText'],
                compat_str),
        })
    return chapters
1593
@staticmethod
def _extract_chapters_from_description(description, duration):
    """Derive chapters from seekTo timestamp links in an HTML description.

    Args:
        description: description HTML; lines are separated by ``<br />``
            and a chapter line contains a ``seekTo`` anchor whose text is
            a ``m:ss`` / ``h:mm:ss`` style timestamp.
        duration: video length in seconds, or None when unknown.

    Returns:
        None when there is no description or no chapter line, otherwise a
        list of dicts with ``start_time``, ``end_time`` and ``title``.
        Each chapter ends where the next one starts and the last one ends
        at ``duration``. When ``duration`` is None no clamping is done and
        the last chapter (whose end is then unknown) is omitted.
    """
    if not description:
        return None
    chapter_lines = re.findall(
        r'(?:^|<br\s*/>)([^<]*<a[^>]+onclick=["\']yt\.www\.watch\.player\.seekTo[^>]+>(\d{1,2}:\d{1,2}(?::\d{1,2})?)</a>[^>]*)(?=$|<br\s*/>)',
        description)
    if not chapter_lines:
        return None
    chapters = []
    for next_num, (chapter_line, time_point) in enumerate(
            chapter_lines, start=1):
        start_time = parse_duration(time_point)
        if start_time is None:
            continue
        # Comparing a number with None raises TypeError on Python 3, so
        # only compare against the duration when it is actually known.
        if duration is not None and start_time > duration:
            break
        end_time = (duration if next_num == len(chapter_lines)
                    else parse_duration(chapter_lines[next_num][1]))
        if end_time is None:
            continue
        if duration is not None and end_time > duration:
            end_time = duration
        # Timestamps going backwards mean this is not a chapter listing.
        if start_time > end_time:
            break
        # Drop the timestamp link, then tidy separators and whitespace.
        chapter_title = re.sub(
            r'<a[^>]+>[^<]+</a>', '', chapter_line).strip(' \t-')
        chapter_title = re.sub(r'\s+', ' ', chapter_title)
        chapters.append({
            'start_time': start_time,
            'end_time': end_time,
            'title': chapter_title,
        })
    return chapters
1628
84213ea8
S
1629 def _extract_chapters(self, webpage, description, video_id, duration):
1630 return (self._extract_chapters_from_json(webpage, video_id, duration)
1631 or self._extract_chapters_from_description(description, duration))
1632
c5e8d7af 1633 def _real_extract(self, url):
cf7e015f
S
1634 url, smuggled_data = unsmuggle_url(url, {})
1635
7e8c0af0 1636 proto = (
78caa52a
PH
1637 'http' if self._downloader.params.get('prefer_insecure', False)
1638 else 'https')
7e8c0af0 1639
7c80519c 1640 start_time = None
297a564b 1641 end_time = None
7c80519c
JMF
1642 parsed_url = compat_urllib_parse_urlparse(url)
1643 for component in [parsed_url.fragment, parsed_url.query]:
1644 query = compat_parse_qs(component)
297a564b 1645 if start_time is None and 't' in query:
7c80519c 1646 start_time = parse_duration(query['t'][0])
2929fa0e
JMF
1647 if start_time is None and 'start' in query:
1648 start_time = parse_duration(query['start'][0])
297a564b
JMF
1649 if end_time is None and 'end' in query:
1650 end_time = parse_duration(query['end'][0])
7c80519c 1651
c5e8d7af
PH
1652 # Extract original video URL from URL with redirection, like age verification, using next_url parameter
1653 mobj = re.search(self._NEXT_URL_RE, url)
1654 if mobj:
7fd002c0 1655 url = proto + '://www.youtube.com/' + compat_urllib_parse_unquote(mobj.group(1)).lstrip('/')
97665381 1656 video_id = self.extract_id(url)
c5e8d7af
PH
1657
1658 # Get video webpage
aa79ac0c 1659 url = proto + '://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1&bpctr=9999999999' % video_id
011e75e6
S
1660 video_webpage, urlh = self._download_webpage_handle(url, video_id)
1661
1662 qs = compat_parse_qs(compat_urllib_parse_urlparse(urlh.geturl()).query)
1663 video_id = qs.get('v', [None])[0] or video_id
c5e8d7af
PH
1664
1665 # Attempt to extract SWF player URL
e0df6211 1666 mobj = re.search(r'swfConfig.*?"(https?:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
c5e8d7af
PH
1667 if mobj is not None:
1668 player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
1669 else:
1670 player_url = None
1671
d8d24a92
S
1672 dash_mpds = []
1673
1674 def add_dash_mpd(video_info):
1675 dash_mpd = video_info.get('dashmpd')
1676 if dash_mpd and dash_mpd[0] not in dash_mpds:
1677 dash_mpds.append(dash_mpd[0])
1678
561b456e
S
1679 def add_dash_mpd_pr(pl_response):
1680 dash_mpd = url_or_none(try_get(
1681 pl_response, lambda x: x['streamingData']['dashManifestUrl'],
1682 compat_str))
1683 if dash_mpd and dash_mpd not in dash_mpds:
1684 dash_mpds.append(dash_mpd)
1685
c7121fa7
S
1686 is_live = None
1687 view_count = None
1688
1689 def extract_view_count(v_info):
1690 return int_or_none(try_get(v_info, lambda x: x['view_count'][0]))
1691
c2d125d9
S
1692 def extract_player_response(player_response, video_id):
1693 pl_response = str_or_none(player_response)
1694 if not pl_response:
1695 return
1696 pl_response = self._parse_json(pl_response, video_id, fatal=False)
1697 if isinstance(pl_response, dict):
1698 add_dash_mpd_pr(pl_response)
1699 return pl_response
1700
fb2c9277
U
1701 def extract_embedded_config(embed_webpage, video_id):
1702 embedded_config = self._search_regex(
1703 r'setConfig\(({.*})\);',
1704 embed_webpage, 'ytInitialData', default=None)
1705 if embedded_config:
1706 return embedded_config
1707
dbdaaa23
S
1708 player_response = {}
1709
c5e8d7af 1710 # Get video info
43ebf77d 1711 video_info = {}
6449cd80 1712 embed_webpage = None
39e7107d
U
1713 if (self._og_search_property('restrictions:age', video_webpage, default=None) == '18+'
1714 or re.search(r'player-age-gate-content">', video_webpage) is not None):
9d9314cb 1715 cookie_keys = self._get_cookies('https://www.youtube.com').keys()
c108eb73
JMF
1716 age_gate = True
1717 # We simulate the access to the video from www.youtube.com/v/{video_id}
1718 # this can be viewed without login into Youtube
beb95e77
CL
1719 url = proto + '://www.youtube.com/embed/%s' % video_id
1720 embed_webpage = self._download_webpage(url, video_id, 'Downloading embed webpage')
fb2c9277
U
1721 ext = extract_embedded_config(embed_webpage, video_id)
1722 # playabilityStatus = re.search(r'{\\\"status\\\":\\\"(?P<playabilityStatus>[^\"]+)\\\"', ext)
1723 playable_in_embed = re.search(r'{\\\"playableInEmbed\\\":(?P<playableinEmbed>[^\,]+)', ext)
1724 if not playable_in_embed:
1725 self.to_screen('Could not determine whether playabale in embed for video %s' % video_id)
1726 playable_in_embed = ''
1727 else:
1728 playable_in_embed = playable_in_embed.group('playableinEmbed')
1729 # check if video is only playable on youtube in other words not playable in embed - if so it requires auth (cookies)
1730 # if re.search(r'player-unavailable">', embed_webpage) is not None:
1731 if playable_in_embed == 'false':
c73baf23
U
1732 '''
1733 # TODO apply this patch when Support for Python 2.6(!) and above drops
9d9314cb 1734 if ({'VISITOR_INFO1_LIVE', 'HSID', 'SSID', 'SID'} <= cookie_keys
4bb9c880 1735 or {'VISITOR_INFO1_LIVE', '__Secure-3PSID', 'LOGIN_INFO'} <= cookie_keys):
c73baf23
U
1736 '''
1737 if (set(('VISITOR_INFO1_LIVE', 'HSID', 'SSID', 'SID')) <= set(cookie_keys)
1738 or set(('VISITOR_INFO1_LIVE', '__Secure-3PSID', 'LOGIN_INFO')) <= set(cookie_keys)):
4bb9c880
U
1739 age_gate = False
1740 # Try looking directly into the video webpage
1741 ytplayer_config = self._get_ytplayer_config(video_id, video_webpage)
1742 if ytplayer_config:
59c5fa91
PO
1743 args = ytplayer_config.get("args")
1744 if args is not None:
1745 if args.get('url_encoded_fmt_stream_map') or args.get('hlsvp'):
1746 # Convert to the same format returned by compat_parse_qs
1747 video_info = dict((k, [v]) for k, v in args.items())
1748 add_dash_mpd(video_info)
1749 # Rental video is not rented but preview is available (e.g.
1750 # https://www.youtube.com/watch?v=yYr8q0y5Jfg,
1751 # https://github.com/ytdl-org/youtube-dl/issues/10532)
1752 if not video_info and args.get('ypc_vid'):
1753 return self.url_result(
1754 args['ypc_vid'], YoutubeIE.ie_key(), video_id=args['ypc_vid'])
1755 if args.get('livestream') == '1' or args.get('live_playback') == 1:
1756 is_live = True
1757 if not player_response:
1758 player_response = extract_player_response(args.get('player_response'), video_id)
1759 elif not player_response:
1760 player_response = ytplayer_config
4bb9c880
U
1761 if not video_info or self._downloader.params.get('youtube_include_dash_manifest', True):
1762 add_dash_mpd_pr(player_response)
9d9314cb
U
1763 else:
1764 raise ExtractorError('Video is age restricted and only playable on Youtube. Requires cookies!', expected=True)
1765 else:
1766 data = compat_urllib_parse_urlencode({
1767 'video_id': video_id,
1768 'eurl': 'https://youtube.googleapis.com/v/' + video_id,
1769 'sts': self._search_regex(
1770 r'"sts"\s*:\s*(\d+)', embed_webpage, 'sts', default=''),
1771 })
1772 video_info_url = proto + '://www.youtube.com/get_video_info?' + data
1773 try:
1774 video_info_webpage = self._download_webpage(
1775 video_info_url, video_id,
1776 note='Refetching age-gated info webpage',
1777 errnote='unable to download video info webpage')
1778 except ExtractorError:
1779 video_info_webpage = None
1780 if video_info_webpage:
1781 video_info = compat_parse_qs(video_info_webpage)
1782 pl_response = video_info.get('player_response', [None])[0]
1783 player_response = extract_player_response(pl_response, video_id)
1784 add_dash_mpd(video_info)
1785 view_count = extract_view_count(video_info)
c108eb73
JMF
1786 else:
1787 age_gate = False
d8d24a92 1788 # Try looking directly into the video webpage
a72778d3 1789 ytplayer_config = self._get_ytplayer_config(video_id, video_webpage)
8bdd16b4 1790 if ytplayer_config:
1791 args = ytplayer_config.get('args', {})
4c76aa06 1792 if args.get('url_encoded_fmt_stream_map') or args.get('hlsvp'):
d8d24a92
S
1793 # Convert to the same format returned by compat_parse_qs
1794 video_info = dict((k, [v]) for k, v in args.items())
1795 add_dash_mpd(video_info)
6496ccb4
S
1796 # Rental video is not rented but preview is available (e.g.
1797 # https://www.youtube.com/watch?v=yYr8q0y5Jfg,
067aa17e 1798 # https://github.com/ytdl-org/youtube-dl/issues/10532)
6496ccb4
S
1799 if not video_info and args.get('ypc_vid'):
1800 return self.url_result(
1801 args['ypc_vid'], YoutubeIE.ie_key(), video_id=args['ypc_vid'])
2fe1ff85
JMF
1802 if args.get('livestream') == '1' or args.get('live_playback') == 1:
1803 is_live = True
dbdaaa23 1804 if not player_response:
c2d125d9 1805 player_response = extract_player_response(args.get('player_response'), video_id)
0a3cf9ad 1806 if not video_info or self._downloader.params.get('youtube_include_dash_manifest', True):
561b456e 1807 add_dash_mpd_pr(player_response)
bbb7c3f7 1808
8bdd16b4 1809 if not video_info and not player_response:
1810 player_response = extract_player_response(
1811 self._search_regex(
1812 r'ytInitialPlayerResponse\s*=\s*({.+?})\s*;', video_webpage,
1813 'initial player response', default='{}'),
1814 video_id)
1815
bbb7c3f7 1816 def extract_unavailable_message():
0add33ab
S
1817 messages = []
1818 for tag, kind in (('h1', 'message'), ('div', 'submessage')):
1819 msg = self._html_search_regex(
1820 r'(?s)<{tag}[^>]+id=["\']unavailable-{kind}["\'][^>]*>(.+?)</{tag}>'.format(tag=tag, kind=kind),
1821 video_webpage, 'unavailable %s' % kind, default=None)
1822 if msg:
1823 messages.append(msg)
1824 if messages:
1825 return '\n'.join(messages)
bbb7c3f7 1826
f93abcf1 1827 if not video_info and not player_response:
15be3eb5
RA
1828 unavailable_message = extract_unavailable_message()
1829 if not unavailable_message:
1830 unavailable_message = 'Unable to extract video data'
1831 raise ExtractorError(
1832 'YouTube said: %s' % unavailable_message, expected=True, video_id=video_id)
1833
f93abcf1
S
1834 if not isinstance(video_info, dict):
1835 video_info = {}
1836
dbdaaa23
S
1837 video_details = try_get(
1838 player_response, lambda x: x['videoDetails'], dict) or {}
1839
37357d21
S
1840 microformat = try_get(
1841 player_response, lambda x: x['microformat']['playerMicroformatRenderer'], dict) or {}
1842
8dbf751a
RA
1843 video_title = video_info.get('title', [None])[0] or video_details.get('title')
1844 if not video_title:
cf7e015f
S
1845 self._downloader.report_warning('Unable to extract video title')
1846 video_title = '_'
1847
9cafc3fd 1848 description_original = video_description = get_element_by_id("eow-description", video_webpage)
cf7e015f 1849 if video_description:
fa4bc6e7
RA
1850
1851 def replace_url(m):
1852 redir_url = compat_urlparse.urljoin(url, m.group(1))
1853 parsed_redir_url = compat_urllib_parse_urlparse(redir_url)
1854 if re.search(r'^(?:www\.)?(?:youtube(?:-nocookie)?\.com|youtu\.be)$', parsed_redir_url.netloc) and parsed_redir_url.path == '/redirect':
1855 qs = compat_parse_qs(parsed_redir_url.query)
1856 q = qs.get('q')
1857 if q and q[0]:
1858 return q[0]
1859 return redir_url
1860
9cafc3fd 1861 description_original = video_description = re.sub(r'''(?x)
cf7e015f 1862 <a\s+
25cb7a0e 1863 (?:[a-zA-Z-]+="[^"]*"\s+)*?
23f13e97 1864 (?:title|href)="([^"]+)"\s+
25cb7a0e 1865 (?:[a-zA-Z-]+="[^"]*"\s+)*?
525cedb9 1866 class="[^"]*"[^>]*>
23f13e97 1867 [^<]+\.{3}\s*
cf7e015f 1868 </a>
fa4bc6e7 1869 ''', replace_url, video_description)
cf7e015f
S
1870 video_description = clean_html(video_description)
1871 else:
ea74e00b
DP
1872 video_description = video_details.get('shortDescription')
1873 if video_description is None:
1874 video_description = self._html_search_meta('description', video_webpage)
cf7e015f 1875
8fe10494 1876 if not smuggled_data.get('force_singlefeed', False):
5e1eddb9 1877 if not self._downloader.params.get('noplaylist'):
8fe10494
S
1878 multifeed_metadata_list = try_get(
1879 player_response,
1880 lambda x: x['multicamera']['playerLegacyMulticameraRenderer']['metadataList'],
1881 compat_str) or try_get(
1882 video_info, lambda x: x['multifeed_metadata_list'][0], compat_str)
1883 if multifeed_metadata_list:
1884 entries = []
1885 feed_ids = []
1886 for feed in multifeed_metadata_list.split(','):
1887 # Unquote should take place before split on comma (,) since textual
1888 # fields may contain comma as well (see
067aa17e 1889 # https://github.com/ytdl-org/youtube-dl/issues/8536)
8fe10494 1890 feed_data = compat_parse_qs(compat_urllib_parse_unquote_plus(feed))
6b09401b
S
1891
1892 def feed_entry(name):
1893 return try_get(feed_data, lambda x: x[name][0], compat_str)
1894
1895 feed_id = feed_entry('id')
1896 if not feed_id:
1897 continue
1898 feed_title = feed_entry('title')
1899 title = video_title
1900 if feed_title:
1901 title += ' (%s)' % feed_title
8fe10494
S
1902 entries.append({
1903 '_type': 'url_transparent',
1904 'ie_key': 'Youtube',
1905 'url': smuggle_url(
1906 '%s://www.youtube.com/watch?v=%s' % (proto, feed_data['id'][0]),
1907 {'force_singlefeed': True}),
6b09401b 1908 'title': title,
8fe10494 1909 })
6b09401b 1910 feed_ids.append(feed_id)
8fe10494
S
1911 self.to_screen(
1912 'Downloading multifeed video (%s) - add --no-playlist to just download video %s'
1913 % (', '.join(feed_ids), video_id))
1914 return self.playlist_result(entries, video_id, video_title, video_description)
1915 else:
1916 self.to_screen('Downloading just video %s because of --no-playlist' % video_id)
cf7e015f 1917
c7121fa7 1918 if view_count is None:
1c9c8de2 1919 view_count = extract_view_count(video_info)
dbdaaa23
S
1920 if view_count is None and video_details:
1921 view_count = int_or_none(video_details.get('viewCount'))
7b16239a
S
1922 if view_count is None and microformat:
1923 view_count = int_or_none(microformat.get('viewCount'))
1d699755 1924
27019dbb 1925 if is_live is None:
898238e9 1926 is_live = bool_or_none(video_details.get('isLive'))
27019dbb 1927
321bf820 1928 has_live_chat_replay = False
f0f76a33 1929 if not is_live:
321bf820 1930 yt_initial_data = self._get_yt_initial_data(video_id, video_webpage)
1931 try:
1932 yt_initial_data['contents']['twoColumnWatchNextResults']['conversationBar']['liveChatRenderer']['continuations'][0]['reloadContinuationData']['continuation']
1933 has_live_chat_replay = True
f0f76a33 1934 except (KeyError, IndexError, TypeError):
321bf820 1935 pass
1936
c5e8d7af
PH
1937 # Check for "rental" videos
1938 if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
067aa17e 1939 raise ExtractorError('"rental" videos not supported. See https://github.com/ytdl-org/youtube-dl/issues/359 for more information.', expected=True)
c5e8d7af 1940
c63ca0ee
S
1941 def _extract_filesize(media_url):
1942 return int_or_none(self._search_regex(
1943 r'\bclen[=/](\d+)', media_url, 'filesize', default=None))
1944
bf1317d2
S
1945 streaming_formats = try_get(player_response, lambda x: x['streamingData']['formats'], list) or []
1946 streaming_formats.extend(try_get(player_response, lambda x: x['streamingData']['adaptiveFormats'], list) or [])
1947
c5e8d7af
PH
1948 if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
1949 self.report_rtmp_download()
dd27fd17
PH
1950 formats = [{
1951 'format_id': '_rtmp',
1952 'protocol': 'rtmp',
1953 'url': video_info['conn'][0],
1954 'player_url': player_url,
1955 }]
bf1317d2 1956 elif not is_live and (streaming_formats or len(video_info.get('url_encoded_fmt_stream_map', [''])[0]) >= 1 or len(video_info.get('adaptive_fmts', [''])[0]) >= 1):
5f6a1245 1957 encoded_url_map = video_info.get('url_encoded_fmt_stream_map', [''])[0] + ',' + video_info.get('adaptive_fmts', [''])[0]
00fe14fc 1958 if 'rtmpe%3Dyes' in encoded_url_map:
067aa17e 1959 raise ExtractorError('rtmpe downloads are not supported, see https://github.com/ytdl-org/youtube-dl/issues/343 for more information.', expected=True)
bf1317d2 1960 formats = []
3318832e 1961 formats_spec = {}
82156fdb 1962 fmt_list = video_info.get('fmt_list', [''])[0]
1963 if fmt_list:
1964 for fmt in fmt_list.split(','):
1965 spec = fmt.split('/')
3318832e 1966 if len(spec) > 1:
1967 width_height = spec[1].split('x')
1968 if len(width_height) == 2:
1969 formats_spec[spec[0]] = {
1970 'resolution': spec[1],
1971 'width': int_or_none(width_height[0]),
1972 'height': int_or_none(width_height[1]),
1973 }
bf1317d2
S
1974 for fmt in streaming_formats:
1975 itag = str_or_none(fmt.get('itag'))
1976 if not itag:
201e9eaa 1977 continue
bf1317d2
S
1978 quality = fmt.get('quality')
1979 quality_label = fmt.get('qualityLabel') or quality
1980 formats_spec[itag] = {
1981 'asr': int_or_none(fmt.get('audioSampleRate')),
1982 'filesize': int_or_none(fmt.get('contentLength')),
1983 'format_note': quality_label,
1984 'fps': int_or_none(fmt.get('fps')),
1985 'height': int_or_none(fmt.get('height')),
bf1317d2
S
1986 # bitrate for itag 43 is always 2147483647
1987 'tbr': float_or_none(fmt.get('averageBitrate') or fmt.get('bitrate'), 1000) if itag != '43' else None,
1988 'width': int_or_none(fmt.get('width')),
1989 }
1990
1991 for fmt in streaming_formats:
00eb865b 1992 if fmt.get('drmFamilies') or fmt.get('drm_families'):
bf1317d2
S
1993 continue
1994 url = url_or_none(fmt.get('url'))
1995
1996 if not url:
fa3db383 1997 cipher = fmt.get('cipher') or fmt.get('signatureCipher')
bf1317d2
S
1998 if not cipher:
1999 continue
2000 url_data = compat_parse_qs(cipher)
2001 url = url_or_none(try_get(url_data, lambda x: x['url'][0], compat_str))
2002 if not url:
2003 continue
2004 else:
2005 cipher = None
2006 url_data = compat_parse_qs(compat_urllib_parse_urlparse(url).query)
2007
2f483bc1
S
2008 stream_type = int_or_none(try_get(url_data, lambda x: x['stream_type'][0]))
2009 # Unsupported FORMAT_STREAM_TYPE_OTF
2010 if stream_type == 3:
2011 continue
6449cd80 2012
bf1317d2
S
2013 format_id = fmt.get('itag') or url_data['itag'][0]
2014 if not format_id:
2015 continue
2016 format_id = compat_str(format_id)
a49eccdf 2017
bf1317d2
S
2018 if cipher:
2019 if 's' in url_data or self._downloader.params.get('youtube_include_dash_manifest', True):
8bdd16b4 2020 ASSETS_RE = (
2021 r'<script[^>]+\bsrc=("[^"]+")[^>]+\bname=["\']player_ias/base',
2022 r'"jsUrl"\s*:\s*("[^"]+")',
2023 r'"assets":.+?"js":\s*("[^"]+")')
bf1317d2
S
2024 jsplayer_url_json = self._search_regex(
2025 ASSETS_RE,
2026 embed_webpage if age_gate else video_webpage,
2027 'JS player URL (1)', default=None)
2028 if not jsplayer_url_json and not age_gate:
2029 # We need the embed website after all
2030 if embed_webpage is None:
2031 embed_url = proto + '://www.youtube.com/embed/%s' % video_id
2032 embed_webpage = self._download_webpage(
2033 embed_url, video_id, 'Downloading embed webpage')
2034 jsplayer_url_json = self._search_regex(
2035 ASSETS_RE, embed_webpage, 'JS player URL')
2036
2037 player_url = json.loads(jsplayer_url_json)
cf010131 2038 if player_url is None:
bf1317d2
S
2039 player_url_json = self._search_regex(
2040 r'ytplayer\.config.*?"url"\s*:\s*("[^"]+")',
2041 video_webpage, 'age gate player URL')
2042 player_url = json.loads(player_url_json)
2043
2044 if 'sig' in url_data:
2045 url += '&signature=' + url_data['sig'][0]
2046 elif 's' in url_data:
2047 encrypted_sig = url_data['s'][0]
2048
2049 if self._downloader.params.get('verbose'):
2050 if player_url is None:
bf1317d2 2051 player_desc = 'unknown'
cf010131 2052 else:
e40c758c
S
2053 player_type, player_version = self._extract_player_info(player_url)
2054 player_desc = '%s player %s' % ('flash' if player_type == 'swf' else 'html5', player_version)
bf1317d2
S
2055 parts_sizes = self._signature_cache_id(encrypted_sig)
2056 self.to_screen('{%s} signature length %s, %s' %
2057 (format_id, parts_sizes, player_desc))
2058
2059 signature = self._decrypt_signature(
2060 encrypted_sig, video_id, player_url, age_gate)
2061 sp = try_get(url_data, lambda x: x['sp'][0], compat_str) or 'signature'
2062 url += '&%s=%s' % (sp, signature)
201e9eaa
PH
2063 if 'ratebypass' not in url:
2064 url += '&ratebypass=yes'
c9afb51c 2065
94278f72
YCH
2066 dct = {
2067 'format_id': format_id,
2068 'url': url,
2069 'player_url': player_url,
2070 }
2071 if format_id in self._formats:
2072 dct.update(self._formats[format_id])
3318832e 2073 if format_id in formats_spec:
2074 dct.update(formats_spec[format_id])
94278f72 2075
aabc2be6 2076 # Some itags are not included in DASH manifest thus corresponding formats will
067aa17e 2077 # lack metadata (see https://github.com/ytdl-org/youtube-dl/pull/5993).
aabc2be6
S
2078 # Trying to extract metadata from url_encoded_fmt_stream_map entry.
2079 mobj = re.search(r'^(?P<width>\d+)[xX](?P<height>\d+)$', url_data.get('size', [''])[0])
2080 width, height = (int(mobj.group('width')), int(mobj.group('height'))) if mobj else (None, None)
94278f72 2081
bf1317d2
S
2082 if width is None:
2083 width = int_or_none(fmt.get('width'))
2084 if height is None:
2085 height = int_or_none(fmt.get('height'))
2086
c63ca0ee
S
2087 filesize = int_or_none(url_data.get(
2088 'clen', [None])[0]) or _extract_filesize(url)
2089
bf1317d2
S
2090 quality = url_data.get('quality', [None])[0] or fmt.get('quality')
2091 quality_label = url_data.get('quality_label', [None])[0] or fmt.get('qualityLabel')
2092
4878759f
S
2093 tbr = (float_or_none(url_data.get('bitrate', [None])[0], 1000)
2094 or float_or_none(fmt.get('bitrate'), 1000)) if format_id != '43' else None
bf1317d2 2095 fps = int_or_none(url_data.get('fps', [None])[0]) or int_or_none(fmt.get('fps'))
54fc90aa 2096
94278f72 2097 more_fields = {
c63ca0ee 2098 'filesize': filesize,
bf1317d2 2099 'tbr': tbr,
c9afb51c
AH
2100 'width': width,
2101 'height': height,
bf1317d2
S
2102 'fps': fps,
2103 'format_note': quality_label or quality,
c9afb51c 2104 }
94278f72
YCH
2105 for key, value in more_fields.items():
2106 if value:
2107 dct[key] = value
bf1317d2 2108 type_ = url_data.get('type', [None])[0] or fmt.get('mimeType')
aabc2be6
S
2109 if type_:
2110 type_split = type_.split(';')
2111 kind_ext = type_split[0].split('/')
2112 if len(kind_ext) == 2:
94278f72
YCH
2113 kind, _ = kind_ext
2114 dct['ext'] = mimetype2ext(type_split[0])
aabc2be6
S
2115 if kind in ('audio', 'video'):
2116 codecs = None
2117 for mobj in re.finditer(
2118 r'(?P<key>[a-zA-Z_-]+)=(?P<quote>["\']?)(?P<val>.+?)(?P=quote)(?:;|$)', type_):
2119 if mobj.group('key') == 'codecs':
2120 codecs = mobj.group('val')
2121 break
2122 if codecs:
6310acf5 2123 dct.update(parse_codecs(codecs))
e4a60912
S
2124 if dct.get('acodec') == 'none' or dct.get('vcodec') == 'none':
2125 dct['downloader_options'] = {
2126 # Youtube throttles chunks >~10M
2127 'http_chunk_size': 10485760,
2128 }
aabc2be6 2129 formats.append(dct)
c5e8d7af 2130 else:
c3e54389
S
2131 manifest_url = (
2132 url_or_none(try_get(
2133 player_response,
2134 lambda x: x['streamingData']['hlsManifestUrl'],
3089bc74
S
2135 compat_str))
2136 or url_or_none(try_get(
c3e54389
S
2137 video_info, lambda x: x['hlsvp'][0], compat_str)))
2138 if manifest_url:
2139 formats = []
2140 m3u8_formats = self._extract_m3u8_formats(
2141 manifest_url, video_id, 'mp4', fatal=False)
2142 for a_format in m3u8_formats:
2143 itag = self._search_regex(
2144 r'/itag/(\d+)/', a_format['url'], 'itag', default=None)
2145 if itag:
2146 a_format['format_id'] = itag
2147 if itag in self._formats:
2148 dct = self._formats[itag].copy()
2149 dct.update(a_format)
2150 a_format = dct
2151 a_format['player_url'] = player_url
2152 # Accept-Encoding header causes failures in live streams on Youtube and Youtube Gaming
2153 a_format.setdefault('http_headers', {})['Youtubedl-no-compression'] = 'True'
78895bd3
U
2154 if self._downloader.params.get('youtube_include_hls_manifest', True):
2155 formats.append(a_format)
c3e54389 2156 else:
13577349 2157 error_message = extract_unavailable_message()
a0566bbf 2158 if not error_message:
2159 reason_list = try_get(
2160 player_response,
2161 lambda x: x['playabilityStatus']['errorScreen']['playerErrorMessageRenderer']['subreason']['runs'],
2162 list) or []
2163 for reason in reason_list:
2164 if not isinstance(reason, dict):
2165 continue
2166 reason_text = try_get(reason, lambda x: x['text'], compat_str)
2167 if reason_text:
2168 if not error_message:
2169 error_message = ''
2170 error_message += reason_text
2171 if error_message:
2172 error_message = clean_html(error_message)
c3e54389 2173 if not error_message:
13577349
S
2174 error_message = clean_html(try_get(
2175 player_response, lambda x: x['playabilityStatus']['reason'],
2176 compat_str))
2177 if not error_message:
2178 error_message = clean_html(
2179 try_get(video_info, lambda x: x['reason'][0], compat_str))
c3e54389
S
2180 if error_message:
2181 raise ExtractorError(error_message, expected=True)
2182 raise ExtractorError('no conn, hlsvp, hlsManifestUrl or url_encoded_fmt_stream_map information found in video info')
c5e8d7af 2183
7e72694b 2184 # uploader
dbdaaa23
S
2185 video_uploader = try_get(
2186 video_info, lambda x: x['author'][0],
2187 compat_str) or str_or_none(video_details.get('author'))
7e72694b
S
2188 if video_uploader:
2189 video_uploader = compat_urllib_parse_unquote_plus(video_uploader)
2190 else:
2191 self._downloader.report_warning('unable to extract uploader name')
2192
2193 # uploader_id
2194 video_uploader_id = None
2195 video_uploader_url = None
2196 mobj = re.search(
2197 r'<link itemprop="url" href="(?P<uploader_url>https?://www\.youtube\.com/(?:user|channel)/(?P<uploader_id>[^"]+))">',
2198 video_webpage)
2199 if mobj is not None:
2200 video_uploader_id = mobj.group('uploader_id')
2201 video_uploader_url = mobj.group('uploader_url')
a6211d23
S
2202 else:
2203 owner_profile_url = url_or_none(microformat.get('ownerProfileUrl'))
2204 if owner_profile_url:
2205 video_uploader_id = self._search_regex(
2206 r'(?:user|channel)/([^/]+)', owner_profile_url, 'uploader id',
2207 default=None)
2208 video_uploader_url = owner_profile_url
7e72694b 2209
b45a9e69 2210 channel_id = (
3089bc74
S
2211 str_or_none(video_details.get('channelId'))
2212 or self._html_search_meta(
2213 'channelId', video_webpage, 'channel id', default=None)
2214 or self._search_regex(
b45a9e69 2215 r'data-channel-external-id=(["\'])(?P<id>(?:(?!\1).)+)\1',
2216 video_webpage, 'channel id', default=None, group='id'))
dd4c4492
S
2217 channel_url = 'http://www.youtube.com/channel/%s' % channel_id if channel_id else None
2218
b477fc13
S
2219 thumbnails = []
2220 thumbnails_list = try_get(
2221 video_details, lambda x: x['thumbnail']['thumbnails'], list) or []
2222 for t in thumbnails_list:
2223 if not isinstance(t, dict):
2224 continue
2225 thumbnail_url = url_or_none(t.get('url'))
2226 if not thumbnail_url:
2227 continue
2228 thumbnails.append({
2229 'url': thumbnail_url,
2230 'width': int_or_none(t.get('width')),
2231 'height': int_or_none(t.get('height')),
2232 })
2233
2234 if not thumbnails:
7e72694b 2235 video_thumbnail = None
b477fc13
S
2236 # We try first to get a high quality image:
2237 m_thumb = re.search(r'<span itemprop="thumbnail".*?href="(.*?)">',
2238 video_webpage, re.DOTALL)
2239 if m_thumb is not None:
2240 video_thumbnail = m_thumb.group(1)
2241 thumbnail_url = try_get(video_info, lambda x: x['thumbnail_url'][0], compat_str)
2242 if thumbnail_url:
2243 video_thumbnail = compat_urllib_parse_unquote_plus(thumbnail_url)
2244 if video_thumbnail:
2245 thumbnails.append({'url': video_thumbnail})
7e72694b
S
2246
2247 # upload date
2248 upload_date = self._html_search_meta(
2249 'datePublished', video_webpage, 'upload date', default=None)
2250 if not upload_date:
2251 upload_date = self._search_regex(
2252 [r'(?s)id="eow-date.*?>(.*?)</span>',
2253 r'(?:id="watch-uploader-info".*?>.*?|["\']simpleText["\']\s*:\s*["\'])(?:Published|Uploaded|Streamed live|Started) on (.+?)[<"\']'],
2254 video_webpage, 'upload date', default=None)
37357d21
S
2255 if not upload_date:
2256 upload_date = microformat.get('publishDate') or microformat.get('uploadDate')
7e72694b
S
2257 upload_date = unified_strdate(upload_date)
2258
2259 video_license = self._html_search_regex(
2260 r'<h4[^>]+class="title"[^>]*>\s*License\s*</h4>\s*<ul[^>]*>\s*<li>(.+?)</li',
2261 video_webpage, 'license', default=None)
2262
2263 m_music = re.search(
2264 r'''(?x)
2265 <h4[^>]+class="title"[^>]*>\s*Music\s*</h4>\s*
2266 <ul[^>]*>\s*
2267 <li>(?P<title>.+?)
2268 by (?P<creator>.+?)
2269 (?:
2270 \(.+?\)|
2271 <a[^>]*
2272 (?:
2273 \bhref=["\']/red[^>]*>| # drop possible
2274 >\s*Listen ad-free with YouTube Red # YouTube Red ad
2275 )
2276 .*?
2277 )?</li
2278 ''',
2279 video_webpage)
2280 if m_music:
2281 video_alt_title = remove_quotes(unescapeHTML(m_music.group('title')))
2282 video_creator = clean_html(m_music.group('creator'))
2283 else:
2284 video_alt_title = video_creator = None
2285
2286 def extract_meta(field):
2287 return self._html_search_regex(
2288 r'<h4[^>]+class="title"[^>]*>\s*%s\s*</h4>\s*<ul[^>]*>\s*<li>(.+?)</li>\s*' % field,
2289 video_webpage, field, default=None)
2290
2291 track = extract_meta('Song')
2292 artist = extract_meta('Artist')
92bc97d3 2293 album = extract_meta('Album')
822b9d9c
RA
2294
2295 # Youtube Music Auto-generated description
92bc97d3 2296 release_date = release_year = None
822b9d9c
RA
2297 if video_description:
2298 mobj = re.search(r'(?s)Provided to YouTube by [^\n]+\n+(?P<track>[^·]+)·(?P<artist>[^\n]+)\n+(?P<album>[^\n]+)(?:.+?℗\s*(?P<release_year>\d{4})(?!\d))?(?:.+?Released on\s*:\s*(?P<release_date>\d{4}-\d{2}-\d{2}))?(.+?\nArtist\s*:\s*(?P<clean_artist>[^\n]+))?', video_description)
2299 if mobj:
2300 if not track:
2301 track = mobj.group('track').strip()
2302 if not artist:
2303 artist = mobj.group('clean_artist') or ', '.join(a.strip() for a in mobj.group('artist').split('·'))
92bc97d3
RA
2304 if not album:
2305 album = mobj.group('album'.strip())
822b9d9c
RA
2306 release_year = mobj.group('release_year')
2307 release_date = mobj.group('release_date')
2308 if release_date:
2309 release_date = release_date.replace('-', '')
2310 if not release_year:
2311 release_year = int(release_date[:4])
2312 if release_year:
2313 release_year = int(release_year)
7e72694b 2314
9322f116 2315 yt_initial = self._get_yt_initial_data(video_id, video_webpage)
2316 if yt_initial:
2317 music_metadata = self._get_music_metadata_from_yt_initial(yt_initial)
2318 if len(music_metadata):
2319 album = music_metadata[0].get('album')
2320 artist = music_metadata[0].get('artist')
2321 track = music_metadata[0].get('track')
2322
7e72694b
S
2323 m_episode = re.search(
2324 r'<div[^>]+id="watch7-headline"[^>]*>\s*<span[^>]*>.*?>(?P<series>[^<]+)</a></b>\s*S(?P<season>\d+)\s*•\s*E(?P<episode>\d+)</span>',
2325 video_webpage)
2326 if m_episode:
c2dd2dc0 2327 series = unescapeHTML(m_episode.group('series'))
7e72694b
S
2328 season_number = int(m_episode.group('season'))
2329 episode_number = int(m_episode.group('episode'))
2330 else:
2331 series = season_number = episode_number = None
2332
2333 m_cat_container = self._search_regex(
2334 r'(?s)<h4[^>]*>\s*Category\s*</h4>\s*<ul[^>]*>(.*?)</ul>',
2335 video_webpage, 'categories', default=None)
dbeafce5 2336 category = None
7e72694b
S
2337 if m_cat_container:
2338 category = self._html_search_regex(
2339 r'(?s)<a[^<]+>(.*?)</a>', m_cat_container, 'category',
2340 default=None)
dbeafce5
S
2341 if not category:
2342 category = try_get(
2343 microformat, lambda x: x['category'], compat_str)
2344 video_categories = None if category is None else [category]
7e72694b
S
2345
2346 video_tags = [
2347 unescapeHTML(m.group('content'))
2348 for m in re.finditer(self._meta_regex('og:video:tag'), video_webpage)]
dbeafce5
S
2349 if not video_tags:
2350 video_tags = try_get(video_details, lambda x: x['keywords'], list)
7e72694b
S
2351
2352 def _extract_count(count_name):
2353 return str_to_int(self._search_regex(
a0566bbf 2354 (r'-%s-button[^>]+><span[^>]+class="yt-uix-button-content"[^>]*>([\d,]+)</span>' % re.escape(count_name),
2355 r'["\']label["\']\s*:\s*["\']([\d,.]+)\s+%ss["\']' % re.escape(count_name)),
7e72694b
S
2356 video_webpage, count_name, default=None))
2357
2358 like_count = _extract_count('like')
2359 dislike_count = _extract_count('dislike')
2360
dbdaaa23
S
2361 if view_count is None:
2362 view_count = str_to_int(self._search_regex(
2363 r'<[^>]+class=["\']watch-view-count[^>]+>\s*([\d,\s]+)', video_webpage,
2364 'view count', default=None))
2365
bf3c9326
S
2366 average_rating = (
2367 float_or_none(video_details.get('averageRating'))
2368 or try_get(video_info, lambda x: float_or_none(x['avg_rating'][0])))
2369
7e72694b 2370 # subtitles
321bf820 2371 video_subtitles = self.extract_subtitles(
2372 video_id, video_webpage, has_live_chat_replay)
7e72694b
S
2373 automatic_captions = self.extract_automatic_captions(video_id, video_webpage)
2374
2375 video_duration = try_get(
2376 video_info, lambda x: int_or_none(x['length_seconds'][0]))
dbdaaa23
S
2377 if not video_duration:
2378 video_duration = int_or_none(video_details.get('lengthSeconds'))
7e72694b
S
2379 if not video_duration:
2380 video_duration = parse_duration(self._html_search_meta(
2381 'duration', video_webpage, 'video duration'))
2382
b84071c0
JP
2383 # Get Subscriber Count of channel
2384 subscriber_count = parse_count(self._search_regex(
2385 r'"text":"([\d\.]+\w?) subscribers"',
2386 video_webpage,
2387 'subscriber count',
2388 default=None
2389 ))
2390
7e72694b
S
2391 # annotations
2392 video_annotations = None
2393 if self._downloader.params.get('writeannotations', False):
64b6a4e9
RA
2394 xsrf_token = self._search_regex(
2395 r'([\'"])XSRF_TOKEN\1\s*:\s*([\'"])(?P<xsrf_token>[A-Za-z0-9+/=]+)\2',
2396 video_webpage, 'xsrf token', group='xsrf_token', fatal=False)
2397 invideo_url = try_get(
2398 player_response, lambda x: x['annotations'][0]['playerAnnotationsUrlsRenderer']['invideoUrl'], compat_str)
2399 if xsrf_token and invideo_url:
2400 xsrf_field_name = self._search_regex(
2401 r'([\'"])XSRF_FIELD_NAME\1\s*:\s*([\'"])(?P<xsrf_field_name>\w+)\2',
2402 video_webpage, 'xsrf field name',
2403 group='xsrf_field_name', default='session_token')
2404 video_annotations = self._download_webpage(
2405 self._proto_relative_url(invideo_url),
2406 video_id, note='Downloading annotations',
2407 errnote='Unable to download video annotations', fatal=False,
2408 data=urlencode_postdata({xsrf_field_name: xsrf_token}))
7e72694b 2409
84213ea8 2410 chapters = self._extract_chapters(video_webpage, description_original, video_id, video_duration)
7e72694b 2411
dd27fd17 2412 # Look for the DASH manifest
203fb43f 2413 if self._downloader.params.get('youtube_include_dash_manifest', True):
77c6fb5b 2414 dash_mpd_fatal = True
8ff648e4 2415 for mpd_url in dash_mpds:
d8d24a92 2416 dash_formats = {}
774e208f 2417 try:
05d0d131
YCH
2418 def decrypt_sig(mobj):
2419 s = mobj.group(1)
2420 dec_s = self._decrypt_signature(s, video_id, player_url, age_gate)
2421 return '/signature/%s' % dec_s
2422
8ff648e4 2423 mpd_url = re.sub(r'/s/([a-fA-F0-9\.]+)', decrypt_sig, mpd_url)
2d2fa82d 2424
8ff648e4 2425 for df in self._extract_mpd_formats(
2426 mpd_url, video_id, fatal=dash_mpd_fatal,
2427 formats_dict=self._formats):
c63ca0ee
S
2428 if not df.get('filesize'):
2429 df['filesize'] = _extract_filesize(df['url'])
d8d24a92
S
2430 # Do not overwrite DASH format found in some previous DASH manifest
2431 if df['format_id'] not in dash_formats:
2432 dash_formats[df['format_id']] = df
77c6fb5b
S
2433 # Additional DASH manifests may end up in HTTP Error 403 therefore
2434 # allow them to fail without bug report message if we already have
2435 # some DASH manifest succeeded. This is temporary workaround to reduce
2436 # burst of bug reports until we figure out the reason and whether it
2437 # can be fixed at all.
2438 dash_mpd_fatal = False
774e208f
PH
2439 except (ExtractorError, KeyError) as e:
2440 self.report_warning(
2441 'Skipping DASH manifest: %r' % e, video_id)
d8d24a92 2442 if dash_formats:
04b3b3df
JMF
2443 # Remove the formats we found through non-DASH, they
2444 # contain less info and it can be wrong, because we use
2445 # fixed values (for example the resolution). See
067aa17e 2446 # https://github.com/ytdl-org/youtube-dl/issues/5774 for an
04b3b3df 2447 # example.
d80265cc 2448 formats = [f for f in formats if f['format_id'] not in dash_formats.keys()]
d8d24a92 2449 formats.extend(dash_formats.values())
d80044c2 2450
6271f1ca
PH
2451 # Check for malformed aspect ratio
2452 stretched_m = re.search(
2453 r'<meta\s+property="og:video:tag".*?content="yt:stretch=(?P<w>[0-9]+):(?P<h>[0-9]+)">',
2454 video_webpage)
2455 if stretched_m:
313dfc45
LL
2456 w = float(stretched_m.group('w'))
2457 h = float(stretched_m.group('h'))
5faf9fed
S
2458 # yt:stretch may hold invalid ratio data (e.g. for Q39EVAstoRM ratio is 17:0).
2459 # We will only process correct ratios.
313dfc45 2460 if w > 0 and h > 0:
41f24c32 2461 ratio = w / h
313dfc45
LL
2462 for f in formats:
2463 if f.get('vcodec') != 'none':
2464 f['stretched_ratio'] = ratio
6271f1ca 2465
026fbedc 2466 if not formats:
43ebf77d
S
2467 if 'reason' in video_info:
2468 if 'The uploader has not made this video available in your country.' in video_info['reason']:
2469 regions_allowed = self._html_search_meta(
2470 'regionsAllowed', video_webpage, default=None)
2471 countries = regions_allowed.split(',') if regions_allowed else None
2472 self.raise_geo_restricted(
2473 msg=video_info['reason'][0], countries=countries)
2474 reason = video_info['reason'][0]
2475 if 'Invalid parameters' in reason:
2476 unavailable_message = extract_unavailable_message()
2477 if unavailable_message:
2478 reason = unavailable_message
2479 raise ExtractorError(
2480 'YouTube said: %s' % reason,
2481 expected=True, video_id=video_id)
2482 if video_info.get('license_info') or try_get(player_response, lambda x: x['streamingData']['licenseInfos']):
2483 raise ExtractorError('This video is DRM protected.', expected=True)
0d297518 2484
4bcc7bd1 2485 self._sort_formats(formats)
4ea3be0a 2486
21c340b8 2487 self.mark_watched(video_id, video_info, player_response)
d77ab8e2 2488
4ea3be0a 2489 return {
8bcc8756
JW
2490 'id': video_id,
2491 'uploader': video_uploader,
2492 'uploader_id': video_uploader_id,
fd050249 2493 'uploader_url': video_uploader_url,
dd4c4492
S
2494 'channel_id': channel_id,
2495 'channel_url': channel_url,
8bcc8756 2496 'upload_date': upload_date,
7caf9830 2497 'license': video_license,
936784b2 2498 'creator': video_creator or artist,
8bcc8756 2499 'title': video_title,
936784b2 2500 'alt_title': video_alt_title or track,
b477fc13 2501 'thumbnails': thumbnails,
8bcc8756
JW
2502 'description': video_description,
2503 'categories': video_categories,
000b6b5a 2504 'tags': video_tags,
8bcc8756 2505 'subtitles': video_subtitles,
360e1ca5 2506 'automatic_captions': automatic_captions,
8bcc8756
JW
2507 'duration': video_duration,
2508 'age_limit': 18 if age_gate else 0,
2509 'annotations': video_annotations,
9cafc3fd 2510 'chapters': chapters,
7e8c0af0 2511 'webpage_url': proto + '://www.youtube.com/watch?v=%s' % video_id,
8bcc8756 2512 'view_count': view_count,
4ea3be0a 2513 'like_count': like_count,
2514 'dislike_count': dislike_count,
bf3c9326 2515 'average_rating': average_rating,
8bcc8756 2516 'formats': formats,
2fe1ff85 2517 'is_live': is_live,
7c80519c 2518 'start_time': start_time,
297a564b 2519 'end_time': end_time,
12afdc2a
S
2520 'series': series,
2521 'season_number': season_number,
2522 'episode_number': episode_number,
936784b2
S
2523 'track': track,
2524 'artist': artist,
5caabd3c 2525 'album': album,
2526 'release_date': release_date,
2527 'release_year': release_year,
b84071c0 2528 'subscriber_count': subscriber_count,
4ea3be0a 2529 }
c5e8d7af 2530
5f6a1245 2531
8bdd16b4 2532class YoutubeTabIE(YoutubeBaseInfoExtractor):
2533 IE_DESC = 'YouTube.com tab'
70d5c17b 2534 _VALID_URL = r'''(?x)
2535 https?://
2536 (?:\w+\.)?
2537 (?:
2538 youtube(?:kids)?\.com|
2539 invidio\.us
2540 )/
2541 (?:
2542 (?:channel|c|user)/|
2543 (?P<not_channel>
3d3dddc9 2544 feed/|
70d5c17b 2545 (?:playlist|watch)\?.*?\blist=
2546 )|
2547 (?!(%s)([/#?]|$)) # Direct URLs
2548 )
2549 (?P<id>[^/?\#&]+)
2550 ''' % YoutubeBaseInfoExtractor._RESERVED_NAMES
8bdd16b4 2551 IE_NAME = 'youtube:tab'
2552
81127aa5 2553 _TESTS = [{
8bdd16b4 2554 # playlists, multipage
2555 'url': 'https://www.youtube.com/c/ИгорьКлейнер/playlists?view=1&flow=grid',
2556 'playlist_mincount': 94,
2557 'info_dict': {
2558 'id': 'UCqj7Cz7revf5maW9g5pgNcg',
2559 'title': 'Игорь Клейнер - Playlists',
2560 'description': 'md5:be97ee0f14ee314f1f002cf187166ee2',
2561 },
2562 }, {
2563 # playlists, multipage, different order
2564 'url': 'https://www.youtube.com/user/igorkle1/playlists?view=1&sort=dd',
2565 'playlist_mincount': 94,
2566 'info_dict': {
2567 'id': 'UCqj7Cz7revf5maW9g5pgNcg',
2568 'title': 'Игорь Клейнер - Playlists',
2569 'description': 'md5:be97ee0f14ee314f1f002cf187166ee2',
2570 },
2571 }, {
2572 # playlists, singlepage
2573 'url': 'https://www.youtube.com/user/ThirstForScience/playlists',
2574 'playlist_mincount': 4,
2575 'info_dict': {
2576 'id': 'UCAEtajcuhQ6an9WEzY9LEMQ',
2577 'title': 'ThirstForScience - Playlists',
2578 'description': 'md5:609399d937ea957b0f53cbffb747a14c',
2579 }
2580 }, {
2581 'url': 'https://www.youtube.com/c/ChristophLaimer/playlists',
2582 'only_matching': True,
2583 }, {
2584 # basic, single video playlist
0e30a7b9 2585 'url': 'https://www.youtube.com/playlist?list=PL4lCao7KL_QFVb7Iudeipvc2BCavECqzc',
81127aa5 2586 'info_dict': {
0e30a7b9 2587 'uploader_id': 'UCmlqkdCBesrv2Lak1mF_MxA',
2588 'uploader': 'Sergey M.',
2589 'id': 'PL4lCao7KL_QFVb7Iudeipvc2BCavECqzc',
3867038a 2590 'title': 'youtube-dl public playlist',
81127aa5 2591 },
0e30a7b9 2592 'playlist_count': 1,
9291475f 2593 }, {
8bdd16b4 2594 # empty playlist
0e30a7b9 2595 'url': 'https://www.youtube.com/playlist?list=PL4lCao7KL_QFodcLWhDpGCYnngnHtQ-Xf',
9291475f 2596 'info_dict': {
0e30a7b9 2597 'uploader_id': 'UCmlqkdCBesrv2Lak1mF_MxA',
2598 'uploader': 'Sergey M.',
2599 'id': 'PL4lCao7KL_QFodcLWhDpGCYnngnHtQ-Xf',
3867038a 2600 'title': 'youtube-dl empty playlist',
9291475f
PH
2601 },
2602 'playlist_count': 0,
2603 }, {
8bdd16b4 2604 # Home tab
2605 'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w/featured',
9291475f 2606 'info_dict': {
8bdd16b4 2607 'id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
2608 'title': 'lex will - Home',
2609 'description': 'md5:2163c5d0ff54ed5f598d6a7e6211e488',
9291475f 2610 },
8bdd16b4 2611 'playlist_mincount': 2,
9291475f 2612 }, {
8bdd16b4 2613 # Videos tab
2614 'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w/videos',
9291475f 2615 'info_dict': {
8bdd16b4 2616 'id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
2617 'title': 'lex will - Videos',
2618 'description': 'md5:2163c5d0ff54ed5f598d6a7e6211e488',
9291475f 2619 },
8bdd16b4 2620 'playlist_mincount': 975,
9291475f 2621 }, {
8bdd16b4 2622 # Videos tab, sorted by popular
2623 'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w/videos?view=0&sort=p&flow=grid',
9291475f 2624 'info_dict': {
8bdd16b4 2625 'id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
2626 'title': 'lex will - Videos',
2627 'description': 'md5:2163c5d0ff54ed5f598d6a7e6211e488',
9291475f 2628 },
8bdd16b4 2629 'playlist_mincount': 199,
9291475f 2630 }, {
8bdd16b4 2631 # Playlists tab
2632 'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w/playlists',
9291475f 2633 'info_dict': {
8bdd16b4 2634 'id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
2635 'title': 'lex will - Playlists',
2636 'description': 'md5:2163c5d0ff54ed5f598d6a7e6211e488',
9291475f 2637 },
8bdd16b4 2638 'playlist_mincount': 17,
ac7553d0 2639 }, {
8bdd16b4 2640 # Community tab
2641 'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w/community',
ac7553d0 2642 'info_dict': {
8bdd16b4 2643 'id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
2644 'title': 'lex will - Community',
2645 'description': 'md5:2163c5d0ff54ed5f598d6a7e6211e488',
2646 },
2647 'playlist_mincount': 18,
87dadd45 2648 }, {
8bdd16b4 2649 # Channels tab
2650 'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w/channels',
87dadd45 2651 'info_dict': {
8bdd16b4 2652 'id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
2653 'title': 'lex will - Channels',
2654 'description': 'md5:2163c5d0ff54ed5f598d6a7e6211e488',
2655 },
2656 'playlist_mincount': 138,
6b08cdf6 2657 }, {
a0566bbf 2658 'url': 'https://invidio.us/channel/UCmlqkdCBesrv2Lak1mF_MxA',
8bdd16b4 2659 'only_matching': True,
2660 }, {
a0566bbf 2661 'url': 'https://www.youtubekids.com/channel/UCmlqkdCBesrv2Lak1mF_MxA',
8bdd16b4 2662 'only_matching': True,
2663 }, {
a0566bbf 2664 'url': 'https://music.youtube.com/channel/UCmlqkdCBesrv2Lak1mF_MxA',
8bdd16b4 2665 'only_matching': True,
2666 }, {
2667 'note': 'Playlist with deleted videos (#651). As a bonus, the video #51 is also twice in this list.',
2668 'url': 'https://www.youtube.com/playlist?list=PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC',
2669 'info_dict': {
2670 'title': '29C3: Not my department',
2671 'id': 'PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC',
2672 'uploader': 'Christiaan008',
2673 'uploader_id': 'UCEPzS1rYsrkqzSLNp76nrcg',
2674 },
2675 'playlist_count': 96,
2676 }, {
2677 'note': 'Large playlist',
2678 'url': 'https://www.youtube.com/playlist?list=UUBABnxM4Ar9ten8Mdjj1j0Q',
6b08cdf6 2679 'info_dict': {
8bdd16b4 2680 'title': 'Uploads from Cauchemar',
2681 'id': 'UUBABnxM4Ar9ten8Mdjj1j0Q',
2682 'uploader': 'Cauchemar',
2683 'uploader_id': 'UCBABnxM4Ar9ten8Mdjj1j0Q',
13a75688 2684 },
8bdd16b4 2685 'playlist_mincount': 1123,
2686 }, {
2687 # even larger playlist, 8832 videos
2688 'url': 'http://www.youtube.com/user/NASAgovVideo/videos',
2689 'only_matching': True,
4b7df0d3
JMF
2690 }, {
2691 'note': 'Buggy playlist: the webpage has a "Load more" button but it doesn\'t have more videos',
2692 'url': 'https://www.youtube.com/playlist?list=UUXw-G3eDE9trcvY2sBMM_aA',
2693 'info_dict': {
acf757f4
PH
2694 'title': 'Uploads from Interstellar Movie',
2695 'id': 'UUXw-G3eDE9trcvY2sBMM_aA',
13a75688 2696 'uploader': 'Interstellar Movie',
8bdd16b4 2697 'uploader_id': 'UCXw-G3eDE9trcvY2sBMM_aA',
4b7df0d3 2698 },
481cc733 2699 'playlist_mincount': 21,
8bdd16b4 2700 }, {
2701 # https://github.com/ytdl-org/youtube-dl/issues/21844
2702 'url': 'https://www.youtube.com/playlist?list=PLzH6n4zXuckpfMu_4Ff8E7Z1behQks5ba',
2703 'info_dict': {
2704 'title': 'Data Analysis with Dr Mike Pound',
2705 'id': 'PLzH6n4zXuckpfMu_4Ff8E7Z1behQks5ba',
2706 'uploader_id': 'UC9-y-6csu5WGm29I7JiwpnA',
2707 'uploader': 'Computerphile',
2708 },
2709 'playlist_mincount': 11,
2710 }, {
a0566bbf 2711 'url': 'https://invidio.us/playlist?list=PL4lCao7KL_QFVb7Iudeipvc2BCavECqzc',
8bdd16b4 2712 'only_matching': True,
dacb3a86
S
2713 }, {
2714 # Playlist URL that does not actually serve a playlist
2715 'url': 'https://www.youtube.com/watch?v=FqZTN594JQw&list=PLMYEtVRpaqY00V9W81Cwmzp6N6vZqfUKD4',
2716 'info_dict': {
2717 'id': 'FqZTN594JQw',
2718 'ext': 'webm',
2719 'title': "Smiley's People 01 detective, Adventure Series, Action",
2720 'uploader': 'STREEM',
2721 'uploader_id': 'UCyPhqAZgwYWZfxElWVbVJng',
ec85ded8 2722 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UCyPhqAZgwYWZfxElWVbVJng',
dacb3a86
S
2723 'upload_date': '20150526',
2724 'license': 'Standard YouTube License',
2725 'description': 'md5:507cdcb5a49ac0da37a920ece610be80',
2726 'categories': ['People & Blogs'],
2727 'tags': list,
dbdaaa23 2728 'view_count': int,
dacb3a86
S
2729 'like_count': int,
2730 'dislike_count': int,
2731 },
2732 'params': {
2733 'skip_download': True,
2734 },
13a75688 2735 'skip': 'This video is not available.',
dacb3a86 2736 'add_ie': [YoutubeIE.ie_key()],
481cc733 2737 }, {
8bdd16b4 2738 'url': 'https://www.youtubekids.com/watch?v=Agk7R8I8o5U&list=PUZ6jURNr1WQZCNHF0ao-c0g',
c0345b82 2739 'only_matching': True,
66b48727 2740 }, {
8bdd16b4 2741 'url': 'https://www.youtube.com/watch?v=MuAGGZNfUkU&list=RDMM',
66b48727 2742 'only_matching': True,
a0566bbf 2743 }, {
2744 'url': 'https://www.youtube.com/channel/UCoMdktPbSTixAyNGwb-UYkQ/live',
2745 'info_dict': {
2746 'id': '9Auq9mYxFEE',
2747 'ext': 'mp4',
2748 'title': 'Watch Sky News live',
2749 'uploader': 'Sky News',
2750 'uploader_id': 'skynews',
2751 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/skynews',
2752 'upload_date': '20191102',
2753 'description': 'md5:78de4e1c2359d0ea3ed829678e38b662',
2754 'categories': ['News & Politics'],
2755 'tags': list,
2756 'like_count': int,
2757 'dislike_count': int,
2758 },
2759 'params': {
2760 'skip_download': True,
2761 },
2762 }, {
2763 'url': 'https://www.youtube.com/user/TheYoungTurks/live',
2764 'info_dict': {
2765 'id': 'a48o2S1cPoo',
2766 'ext': 'mp4',
2767 'title': 'The Young Turks - Live Main Show',
2768 'uploader': 'The Young Turks',
2769 'uploader_id': 'TheYoungTurks',
2770 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/TheYoungTurks',
2771 'upload_date': '20150715',
2772 'license': 'Standard YouTube License',
2773 'description': 'md5:438179573adcdff3c97ebb1ee632b891',
2774 'categories': ['News & Politics'],
2775 'tags': ['Cenk Uygur (TV Program Creator)', 'The Young Turks (Award-Winning Work)', 'Talk Show (TV Genre)'],
2776 'like_count': int,
2777 'dislike_count': int,
2778 },
2779 'params': {
2780 'skip_download': True,
2781 },
2782 'only_matching': True,
2783 }, {
2784 'url': 'https://www.youtube.com/channel/UC1yBKRuGpC1tSM73A0ZjYjQ/live',
2785 'only_matching': True,
2786 }, {
2787 'url': 'https://www.youtube.com/c/CommanderVideoHq/live',
2788 'only_matching': True,
3d3dddc9 2789 }, {
2790 'url': 'https://www.youtube.com/feed/trending',
2791 'only_matching': True,
2792 }, {
2793 # needs auth
2794 'url': 'https://www.youtube.com/feed/library',
2795 'only_matching': True,
2796 }, {
2797 # needs auth
2798 'url': 'https://www.youtube.com/feed/history',
2799 'only_matching': True,
2800 }, {
2801 # needs auth
2802 'url': 'https://www.youtube.com/feed/subscriptions',
2803 'only_matching': True,
2804 }, {
2805 # needs auth
2806 'url': 'https://www.youtube.com/feed/watch_later',
2807 'only_matching': True,
2808 }, {
2809 # no longer available?
2810 'url': 'https://www.youtube.com/feed/recommended',
2811 'only_matching': True,
2812 }
ef2f3c7f 2813 # TODO
2814 # {
2815 # 'url': 'https://www.youtube.com/TheYoungTurks/live',
2816 # 'only_matching': True,
2817 # }
a0566bbf 2818 ]
8bdd16b4 2819
2820 def _extract_channel_id(self, webpage):
2821 channel_id = self._html_search_meta(
2822 'channelId', webpage, 'channel id', default=None)
2823 if channel_id:
2824 return channel_id
2825 channel_url = self._html_search_meta(
2826 ('og:url', 'al:ios:url', 'al:android:url', 'al:web:url',
2827 'twitter:url', 'twitter:app:url:iphone', 'twitter:app:url:ipad',
2828 'twitter:app:url:googleplay'), webpage, 'channel url')
2829 return self._search_regex(
2830 r'https?://(?:www\.)?youtube\.com/channel/([^/?#&])+',
2831 channel_url, 'channel id')
15f6397c 2832
8bdd16b4 2833 @staticmethod
2834 def _extract_grid_item_renderer(item):
2835 for item_kind in ('Playlist', 'Video', 'Channel'):
2836 renderer = item.get('grid%sRenderer' % item_kind)
2837 if renderer:
2838 return renderer
2839
def _extract_video(self, renderer):
    """Turn a video renderer dict into a ``url_transparent`` result.

    Only metadata present in the renderer itself (title, description
    snippet, duration, view count, uploader) is filled in; the result
    points at the video ID and names ``YoutubeIE`` as the extractor key.
    """
    def text_at(getter):
        # Every textual field is accepted only if it is a compat_str.
        return try_get(renderer, getter, compat_str)

    video_id = renderer.get('videoId')

    # Whitespace is stripped from the view count text before the leading
    # run of digits and commas is picked out.
    raw_view_count = text_at(lambda x: x['viewCountText']['simpleText']) or ''
    leading_number = self._search_regex(
        r'^([\d,]+)', re.sub(r'\s', '', raw_view_count),
        'view count', default=None)

    return {
        '_type': 'url_transparent',
        'ie_key': YoutubeIE.ie_key(),
        'id': video_id,
        'url': video_id,
        'title': try_get(
            renderer,
            (lambda x: x['title']['runs'][0]['text'],
             lambda x: x['title']['simpleText']), compat_str),
        'description': text_at(
            lambda x: x['descriptionSnippet']['runs'][0]['text']),
        'duration': parse_duration(
            text_at(lambda x: x['lengthText']['simpleText'])),
        'view_count': str_to_int(leading_number),
        'uploader': text_at(lambda x: x['ownerText']['runs'][0]['text']),
    }
652cdaa2 2869
def _grid_entries(self, grid_renderer):
    """Yield results for every usable item of a grid renderer.

    Items that are not dicts, or that carry no grid renderer dict, are
    ignored.  For each remaining renderer a playlist result, a video
    result and a channel result are yielded, in that order, for
    whichever of ``playlistId`` / ``videoId`` / ``channelId`` is set.
    """
    for grid_item in grid_renderer['items']:
        if not isinstance(grid_item, dict):
            continue
        item_renderer = self._extract_grid_item_renderer(grid_item)
        if not isinstance(item_renderer, dict):
            continue

        # playlist (title is taken from the first text run)
        playlist_id = item_renderer.get('playlistId')
        if playlist_id:
            playlist_title = try_get(
                item_renderer, lambda x: x['title']['runs'][0]['text'],
                compat_str)
            yield self.url_result(
                'https://www.youtube.com/playlist?list=%s' % playlist_id,
                ie=YoutubeTabIE.ie_key(), video_id=playlist_id,
                video_title=playlist_title)

        # video
        if item_renderer.get('videoId'):
            yield self._extract_video(item_renderer)

        # channel (title is taken from the simple text form)
        channel_id = item_renderer.get('channelId')
        if channel_id:
            channel_title = try_get(
                item_renderer, lambda x: x['title']['simpleText'],
                compat_str)
            yield self.url_result(
                'https://www.youtube.com/channel/%s' % channel_id,
                ie=YoutubeTabIE.ie_key(), video_title=channel_title)
2898
3d3dddc9 2899 def _shelf_entries_from_content(self, shelf_renderer):
2900 content = shelf_renderer.get('content')
2901 if not isinstance(content, dict):
8bdd16b4 2902 return
3d3dddc9 2903 renderer = content.get('gridRenderer')
2904 if renderer:
2905 # TODO: add support for nested playlists so each shelf is processed
2906 # as separate playlist
2907 # TODO: this includes only first N items
2908 for entry in self._grid_entries(renderer):
2909 yield entry
2910 renderer = content.get('horizontalListRenderer')
2911 if renderer:
2912 # TODO
2913 pass
8bdd16b4 2914
def _shelf_entries(self, shelf_renderer):
    """Yield results for a shelf renderer.

    When the shelf endpoint carries a web URL, a result pointing at that
    URL (titled with the shelf's first title run) is yielded first.  The
    entries embedded in the shelf content are yielded afterwards.
    """
    endpoint_path = try_get(
        shelf_renderer,
        lambda x: x['endpoint']['commandMetadata']['webCommandMetadata']['url'],
        compat_str)
    shelf_url = urljoin('https://www.youtube.com', endpoint_path)
    if shelf_url:
        shelf_title = try_get(
            shelf_renderer, lambda x: x['title']['runs'][0]['text'],
            compat_str)
        yield self.url_result(shelf_url, video_title=shelf_title)
    # Shelf may not contain shelf URL, fallback to extraction from content
    for content_entry in self._shelf_entries_from_content(shelf_renderer):
        yield content_entry
c5e8d7af 2927
8bdd16b4 2928 def _playlist_entries(self, video_list_renderer):
2929 for content in video_list_renderer['contents']:
2930 if not isinstance(content, dict):
2931 continue
2932 renderer = content.get('playlistVideoRenderer') or content.get('playlistPanelVideoRenderer')
2933 if not isinstance(renderer, dict):
2934 continue
2935 video_id = renderer.get('videoId')
2936 if not video_id:
2937 continue
2938 yield self._extract_video(renderer)
07aeced6 2939
3d3dddc9 2940 r""" # Not needed in the new implementation
3462ffa8 2941 def _itemSection_entries(self, item_sect_renderer):
2942 for content in item_sect_renderer['contents']:
2943 if not isinstance(content, dict):
2944 continue
2945 renderer = content.get('videoRenderer', {})
2946 if not isinstance(renderer, dict):
2947 continue
2948 video_id = renderer.get('videoId')
2949 if not video_id:
2950 continue
2951 yield self._extract_video(renderer)
3d3dddc9 2952 """
3462ffa8 2953
def _rich_entries(self, rich_grid_renderer):
    """Yield the single video held by a rich item renderer, if any.

    Nothing is yielded when ``content.videoRenderer`` is missing, is not
    a dict, or has no ``videoId``.
    """
    video_renderer = try_get(
        rich_grid_renderer, lambda x: x['content']['videoRenderer'],
        dict) or {}
    if video_renderer.get('videoId'):
        yield self._extract_video(video_renderer)
2961
8bdd16b4 2962 def _video_entry(self, video_renderer):
2963 video_id = video_renderer.get('videoId')
2964 if video_id:
2965 return self._extract_video(video_renderer)
dacb3a86 2966
def _post_thread_entries(self, post_thread_renderer):
    """Yield the videos referenced by a community post.

    The video attached to the post (if any) comes first, followed by one
    result per inline link in the post text that ``YoutubeIE`` can
    handle.  An inline link to the attached video itself is skipped so
    the same video is not yielded twice.
    """
    post_renderer = try_get(
        post_thread_renderer, lambda x: x['post']['backstagePostRenderer'], dict)
    if not post_renderer:
        return
    # video attachment
    video_renderer = try_get(
        post_renderer, lambda x: x['backstageAttachment']['videoRenderer'], dict) or {}
    # Keep the attached video's ID: it was previously left as None, which
    # made the duplicate check below a no-op.
    video_id = video_renderer.get('videoId')
    if video_id:
        entry = self._video_entry(video_renderer)
        if entry:
            yield entry
    # inline video links
    runs = try_get(post_renderer, lambda x: x['contentText']['runs'], list) or []
    for run in runs:
        if not isinstance(run, dict):
            continue
        ep_url = try_get(
            run, lambda x: x['navigationEndpoint']['urlEndpoint']['url'], compat_str)
        if not ep_url:
            continue
        if not YoutubeIE.suitable(ep_url):
            continue
        ep_video_id = YoutubeIE._match_id(ep_url)
        if video_id == ep_video_id:
            continue
        # Tag the result with the linked video's own ID rather than the
        # attachment's (which used to be always None here).
        yield self.url_result(ep_url, ie=YoutubeIE.ie_key(), video_id=ep_video_id)
dacb3a86 2995
8bdd16b4 2996 def _post_thread_continuation_entries(self, post_thread_continuation):
2997 contents = post_thread_continuation.get('contents')
2998 if not isinstance(contents, list):
2999 return
3000 for content in contents:
3001 renderer = content.get('backstagePostThreadRenderer')
3002 if not isinstance(renderer, dict):
3003 continue
3004 for entry in self._post_thread_entries(renderer):
3005 yield entry
07aeced6 3006
@staticmethod
def _extract_next_continuation_data(renderer):
    """Build continuation query values from ``nextContinuationData``.

    Reads ``continuations[0].nextContinuationData`` of ``renderer`` and
    returns a dict with ``ctoken``, ``continuation`` and ``itct`` keys,
    or ``None`` when that data or its continuation token is missing.
    """
    next_data = try_get(
        renderer, lambda x: x['continuations'][0]['nextContinuationData'],
        dict)
    token = next_data.get('continuation') if next_data else None
    if not token:
        return None
    return {
        'ctoken': token,
        'continuation': token,
        'itct': next_data.get('clickTrackingParams'),
    }
c5e8d7af 3022
@classmethod
def _extract_continuation(cls, renderer):
    """Return continuation query values for ``renderer``, if it has any.

    ``nextContinuationData`` is preferred.  Otherwise the renderer's
    ``contents`` list is scanned for the first ``continuationItemRenderer``
    whose endpoint provides both a continuation token and click tracking
    params.  ``None`` is returned when nothing suitable is found.
    """
    query = cls._extract_next_continuation_data(renderer)
    if query:
        return query
    items = renderer.get('contents')
    if not isinstance(items, list):
        return None
    for item in items:
        if not isinstance(item, dict):
            continue
        endpoint = try_get(
            item,
            lambda x: x['continuationItemRenderer']['continuationEndpoint'],
            dict)
        if not endpoint:
            continue
        token = try_get(
            endpoint, lambda x: x['continuationCommand']['token'],
            compat_str)
        click_params = endpoint.get('clickTrackingParams')
        # Both values are required; otherwise keep looking.
        if token and click_params:
            return {
                'ctoken': token,
                'continuation': token,
                'itct': click_params,
            }
    return None
448830ce 3051
def _entries(self, tab, identity_token):
    """Yield every entry of a tab, following continuations page by page.

    ``tab`` is the content dict of the selected tab; its
    ``sectionListRenderer`` or ``richGridRenderer`` member is walked first.
    Further pages are then requested from ``browse_ajax`` for as long as a
    continuation can be extracted.  ``identity_token``, when truthy, is
    sent as the ``x-youtube-identity-token`` header of those requests.
    """

    def extract_entries(parent_renderer):
        # This needs to be called again on continuation responses for
        # continuation to work with feeds.
        contents = try_get(parent_renderer, lambda x: x['contents'], list) or []
        for content in contents:
            if not isinstance(content, dict):
                continue
            is_renderer = try_get(content, lambda x: x['itemSectionRenderer'], dict)
            if not is_renderer:
                renderer = content.get('richItemRenderer')
                if renderer:
                    for entry in self._rich_entries(renderer):
                        yield entry
                    continuation_list[0] = self._extract_continuation(parent_renderer)
                continue
            isr_contents = try_get(is_renderer, lambda x: x['contents'], list) or []
            for isr_content in isr_contents:
                if not isinstance(isr_content, dict):
                    continue
                renderer = isr_content.get('playlistVideoListRenderer')
                if renderer:
                    for entry in self._playlist_entries(renderer):
                        yield entry
                    continuation_list[0] = self._extract_continuation(renderer)
                    continue
                renderer = isr_content.get('gridRenderer')
                if renderer:
                    for entry in self._grid_entries(renderer):
                        yield entry
                    continuation_list[0] = self._extract_continuation(renderer)
                    continue
                renderer = isr_content.get('shelfRenderer')
                if renderer:
                    for entry in self._shelf_entries(renderer):
                        yield entry
                    continue
                renderer = isr_content.get('backstagePostThreadRenderer')
                if renderer:
                    for entry in self._post_thread_entries(renderer):
                        yield entry
                    continuation_list[0] = self._extract_continuation(renderer)
                    continue
                renderer = isr_content.get('videoRenderer')
                if renderer:
                    entry = self._video_entry(renderer)
                    if entry:
                        yield entry

            # Fall back to the section itself when none of its children
            # supplied a continuation.
            if not continuation_list[0]:
                continuation_list[0] = self._extract_continuation(is_renderer)

        # Last resort: the continuation attached to the parent renderer.
        if not continuation_list[0]:
            continuation_list[0] = self._extract_continuation(parent_renderer)

    # One-element list used as a writable cell shared with extract_entries
    # (Python 2 does not support nonlocal).
    continuation_list = [None]
    parent_renderer = (
        try_get(tab, lambda x: x['sectionListRenderer'], dict)
        or try_get(tab, lambda x: x['richGridRenderer'], dict) or {})
    for entry in extract_entries(parent_renderer):
        yield entry
    continuation = continuation_list[0]

    headers = {
        'x-youtube-client-name': '1',
        'x-youtube-client-version': '2.20201112.04.01',
    }
    if identity_token:
        headers['x-youtube-identity-token'] = identity_token

    for page_num in itertools.count(1):
        if not continuation:
            break
        browse = self._download_json(
            'https://www.youtube.com/browse_ajax', None,
            'Downloading page %d' % page_num,
            headers=headers, query=continuation, fatal=False)
        if not browse:
            break
        response = try_get(browse, lambda x: x[1]['response'], dict)
        if not response:
            break

        continuation_contents = try_get(
            response, lambda x: x['continuationContents'], dict)
        if continuation_contents:
            continuation_renderer = continuation_contents.get('playlistVideoListContinuation')
            if continuation_renderer:
                for entry in self._playlist_entries(continuation_renderer):
                    yield entry
                continuation = self._extract_continuation(continuation_renderer)
                continue
            continuation_renderer = continuation_contents.get('gridContinuation')
            if continuation_renderer:
                for entry in self._grid_entries(continuation_renderer):
                    yield entry
                continuation = self._extract_continuation(continuation_renderer)
                continue
            continuation_renderer = continuation_contents.get('itemSectionContinuation')
            if continuation_renderer:
                for entry in self._post_thread_continuation_entries(continuation_renderer):
                    yield entry
                continuation = self._extract_continuation(continuation_renderer)
                continue
            continuation_renderer = continuation_contents.get('sectionListContinuation')  # for feeds
            if continuation_renderer:
                continuation_list = [None]
                for entry in extract_entries(continuation_renderer):
                    yield entry
                continuation = continuation_list[0]
                continue

        continuation_items = try_get(
            response, lambda x: x['onResponseReceivedActions'][0]['appendContinuationItemsAction']['continuationItems'], list)
        if continuation_items:
            continuation_item = continuation_items[0]
            if not isinstance(continuation_item, dict):
                # ``continuation`` is unchanged at this point, so looping
                # again would request the very same page forever.
                break
            renderer = continuation_item.get('playlistVideoRenderer') or continuation_item.get('itemSectionRenderer')
            if renderer:
                video_list_renderer = {'contents': continuation_items}
                for entry in self._playlist_entries(video_list_renderer):
                    yield entry
                continuation = self._extract_continuation(video_list_renderer)
                continue
        break
9558dcec 3177
@staticmethod
def _extract_selected_tab(tabs):
    """Return the ``tabRenderer`` dict of the first tab flagged as selected.

    Raises ExtractorError when no tab in ``tabs`` has a boolean true
    ``tabRenderer.selected`` value.
    """
    for candidate in tabs:
        is_selected = try_get(
            candidate, lambda x: x['tabRenderer']['selected'], bool)
        if is_selected:
            return candidate['tabRenderer']
    raise ExtractorError('Unable to find selected tab')
b82f815f 3185
@staticmethod
def _extract_uploader(data):
    """Collect uploader metadata from the playlist sidebar of ``data``.

    Returns a dict that is empty when no owner is found; otherwise it holds
    ``uploader``, ``uploader_id`` and ``uploader_url``.  If several sidebar
    items carry an owner, the values from the last one are kept.
    """
    info = {}
    sidebar_items = try_get(
        data, lambda x: x['sidebar']['playlistSidebarRenderer']['items'], list)
    for sidebar_item in sidebar_items or []:
        if not isinstance(sidebar_item, dict):
            continue
        secondary_info = sidebar_item.get('playlistSidebarSecondaryInfoRenderer')
        if not isinstance(secondary_info, dict):
            continue
        owner = try_get(
            secondary_info,
            lambda x: x['videoOwner']['videoOwnerRenderer']['title']['runs'][0], dict)
        if not owner:
            continue
        info['uploader'] = owner.get('text')
        info['uploader_id'] = try_get(
            owner, lambda x: x['navigationEndpoint']['browseEndpoint']['browseId'], compat_str)
        canonical_base_url = try_get(
            owner, lambda x: x['navigationEndpoint']['browseEndpoint']['canonicalBaseUrl'], compat_str)
        info['uploader_url'] = urljoin('https://www.youtube.com/', canonical_base_url)
    return info
3208
def _extract_from_tabs(self, item_id, webpage, data, tabs, identity_token):
    """Build a playlist result from the selected tab of a tabbed page.

    Title, description and id come from ``channelMetadataRenderer`` and are
    then overridden by ``playlistMetadataRenderer`` when that is present.
    Missing values fall back to ``item_id`` (id) and a title derived from
    the id.  Uploader fields from the sidebar are merged into the result.
    ``webpage`` is accepted but not used here.
    """
    selected_tab = self._extract_selected_tab(tabs)
    playlist_id = title = description = None

    channel_meta = try_get(
        data, lambda x: x['metadata']['channelMetadataRenderer'], dict)
    if channel_meta:
        title = channel_meta.get('title') or item_id
        tab_title = selected_tab.get('title')
        if tab_title:
            title += ' - %s' % tab_title
        description = channel_meta.get('description')
        playlist_id = channel_meta.get('externalId')

    playlist_meta = try_get(
        data, lambda x: x['metadata']['playlistMetadataRenderer'], dict)
    if playlist_meta:
        # Playlist metadata takes precedence over channel metadata.
        title = playlist_meta.get('title')
        description = None
        playlist_id = item_id

    if playlist_id is None:
        playlist_id = item_id
    if title is None:
        title = "Youtube " + playlist_id.title()

    result = self.playlist_result(
        self._entries(selected_tab['content'], identity_token),
        playlist_id=playlist_id, playlist_title=title,
        playlist_description=description)
    result.update(self._extract_uploader(data))
    return result
73c4ac2c 3238
def _extract_from_playlist(self, item_id, data, playlist):
    """Build a playlist result from a watch-page ``playlist`` dict.

    The title is taken from ``playlist['title']``, falling back to
    ``data['titleText']['simpleText']``; the id is ``playlist['playlistId']``
    or, failing that, ``item_id``.
    """
    title = playlist.get('title')
    if not title:
        title = try_get(
            data, lambda x: x['titleText']['simpleText'], compat_str)
    playlist_id = playlist.get('playlistId') or item_id
    entries = self._playlist_entries(playlist)
    return self.playlist_result(
        entries, playlist_id=playlist_id, playlist_title=title)
c5e8d7af 3246
def _extract_alerts(self, data):
    """Yield ``(alert_type, message)`` pairs for the alerts in ``data``.

    Each element of ``data['alerts']`` is expected to be a dict mapping a
    renderer name to an alert dict.  An alert without a ``type`` is
    skipped.  Its ``text.simpleText`` is yielded when present, followed by
    the text of every entry in ``text.runs``.  Elements that are not dicts
    are ignored rather than raising, since the structure comes from the
    page's JSON.
    """
    for alert_dict in try_get(data, lambda x: x['alerts'], list) or []:
        if not isinstance(alert_dict, dict):
            continue
        for alert in alert_dict.values():
            if not isinstance(alert, dict):
                continue
            alert_type = alert.get('type')
            if not alert_type:
                continue
            message = try_get(alert, lambda x: x['text']['simpleText'], compat_str)
            if message:
                yield alert_type, message
            for run in try_get(alert, lambda x: x['text']['runs'], list) or []:
                message = try_get(run, lambda x: x['text'], compat_str)
                if message:
                    yield alert_type, message
3261
def _real_extract(self, url):
    """Extract a tab, playlist or single video from a YouTube page URL.

    The host is normalised to www.youtube.com.  A bare channel/user URL is
    redirected to its ``/videos`` tab with a warning.  The page is then
    downloaded and dispatched, in order, to tab extraction, watch-page
    playlist extraction, or a plain video result.  Raises ExtractorError
    when none of these is recognised.
    """
    item_id = self._match_id(url)
    parsed_url = compat_urlparse.urlparse(url)
    url = compat_urlparse.urlunparse(parsed_url._replace(netloc='www.youtube.com'))

    is_home = re.match(r'(?P<pre>%s)(?P<post>/?(?![^#?]).*$)' % self._VALID_URL, url)
    if is_home is not None and is_home.group('not_channel') is None and item_id != 'feed':
        self._downloader.report_warning(
            'A channel/user page was given. All the channel\'s videos will be downloaded. '
            'To download only the videos in the home page, add a "/home" to the URL')
        url = '%s/videos%s' % (is_home.group('pre'), is_home.group('post') or '')

    # Handle both video/playlist URLs
    query = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query)
    video_id = query.get('v', [None])[0]
    playlist_id = query.get('list', [None])[0]
    if video_id and playlist_id:
        if self._downloader.params.get('noplaylist'):
            self.to_screen('Downloading just video %s because of --no-playlist' % video_id)
            return self.url_result(video_id, ie=YoutubeIE.ie_key(), video_id=video_id)
        self.to_screen('Downloading playlist %s - add --no-playlist to just download video %s' % (playlist_id, video_id))

    webpage = self._download_webpage(url, item_id)
    identity_token = self._search_regex(
        r'\bID_TOKEN["\']\s*:\s*["\'](.+?)["\']', webpage,
        'identity token', default=None)
    data = self._extract_yt_initial_data(item_id, webpage)

    # Relay any alert YouTube attached to the page as a warning.
    for alert_type, alert_message in self._extract_alerts(data):
        self._downloader.report_warning('YouTube said: %s - %s' % (alert_type, alert_message))

    tabs = try_get(
        data, lambda x: x['contents']['twoColumnBrowseResultsRenderer']['tabs'], list)
    if tabs:
        return self._extract_from_tabs(item_id, webpage, data, tabs, identity_token)

    playlist = try_get(
        data, lambda x: x['contents']['twoColumnWatchNextResults']['playlist']['playlist'], dict)
    if playlist:
        return self._extract_from_playlist(item_id, data, playlist)

    # Fallback to video extraction if no playlist alike page is recognized.
    # First check for the current video then try the v attribute of URL query.
    current_video_id = try_get(
        data, lambda x: x['currentVideoEndpoint']['watchEndpoint']['videoId'],
        compat_str)
    video_id = current_video_id or video_id
    if video_id:
        return self.url_result(video_id, ie=YoutubeIE.ie_key(), video_id=video_id)

    # Failed to recognize
    raise ExtractorError('Unable to recognize tab page')
c5e8d7af 3306
c5e8d7af 3307
class YoutubePlaylistIE(InfoExtractor):
    """Resolve bare playlist ids and ``list=`` URLs to a /playlist page.

    The actual extraction is delegated to YoutubeTabIE.
    """
    IE_NAME = 'youtube:playlist'
    IE_DESC = 'YouTube.com playlists'
    _VALID_URL = r'''(?x)(?:
                        (?:https?://)?
                        (?:\w+\.)?
                        (?:
                            (?:
                                youtube(?:kids)?\.com|
                                invidio\.us|
                                youtu\.be
                            )
                            /.*?\?.*?\blist=
                        )?
                        (?P<id>%(playlist_id)s)
                     )''' % {'playlist_id': YoutubeBaseInfoExtractor._PLAYLIST_ID_RE}
    _TESTS = [{
        'note': 'issue #673',
        'url': 'PLBB231211A4F62143',
        'info_dict': {
            'title': '[OLD]Team Fortress 2 (Class-based LP)',
            'id': 'PLBB231211A4F62143',
            'uploader': 'Wickydoo',
            'uploader_id': 'UCKSpbfbl5kRQpTdL7kMc-1Q',
        },
        'playlist_mincount': 29,
    }, {
        'url': 'PLtPgu7CB4gbY9oDN3drwC3cMbJggS7dKl',
        'info_dict': {
            'title': 'YDL_safe_search',
            'id': 'PLtPgu7CB4gbY9oDN3drwC3cMbJggS7dKl',
        },
        'playlist_count': 2,
        'skip': 'This playlist is private',
    }, {
        'note': 'embedded',
        'url': 'https://www.youtube.com/embed/videoseries?list=PL6IaIsEjSbf96XFRuNccS_RuEXwNdsoEu',
        'playlist_count': 4,
        'info_dict': {
            'title': 'JODA15',
            'id': 'PL6IaIsEjSbf96XFRuNccS_RuEXwNdsoEu',
            'uploader': 'milan',
            'uploader_id': 'UCEI1-PVPcYXjB73Hfelbmaw',
        }
    }, {
        'url': 'http://www.youtube.com/embed/_xDOZElKyNU?list=PLsyOSbh5bs16vubvKePAQ1x3PhKavfBIl',
        'playlist_mincount': 982,
        'info_dict': {
            'title': '2018 Chinese New Singles (11/6 updated)',
            'id': 'PLsyOSbh5bs16vubvKePAQ1x3PhKavfBIl',
            'uploader': 'LBK',
            'uploader_id': 'UC21nz3_MesPLqtDqwdvnoxA',
        }
    }, {
        'url': 'https://youtu.be/yeWKywCrFtk?list=PL2qgrgXsNUG5ig9cat4ohreBjYLAPC0J5',
        'info_dict': {
            'id': 'yeWKywCrFtk',
            'ext': 'mp4',
            'title': 'Small Scale Baler and Braiding Rugs',
            'uploader': 'Backus-Page House Museum',
            'uploader_id': 'backuspagemuseum',
            'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/backuspagemuseum',
            'upload_date': '20161008',
            'description': 'md5:800c0c78d5eb128500bffd4f0b4f2e8a',
            'categories': ['Nonprofits & Activism'],
            'tags': list,
            'like_count': int,
            'dislike_count': int,
        },
        'params': {
            'noplaylist': True,
            'skip_download': True,
        },
    }, {
        'url': 'https://youtu.be/uWyaPkt-VOI?list=PL9D9FC436B881BA21',
        'only_matching': True,
    }, {
        'url': 'TLGGrESM50VT6acwMjAyMjAxNw',
        'only_matching': True,
    }, {
        # music album playlist
        'url': 'OLAK5uy_m4xAFdmMC5rX3Ji3g93pQe3hqLZw_9LhM',
        'only_matching': True,
    }]

    @classmethod
    def suitable(cls, url):
        """Decline any URL that YoutubeTabIE already accepts."""
        if YoutubeTabIE.suitable(url):
            return False
        return super(YoutubePlaylistIE, cls).suitable(url)

    def _real_extract(self, url):
        """Hand the playlist over to YoutubeTabIE as a /playlist URL.

        The query string of ``url`` is forwarded unchanged; when there is
        none (bare id), ``list=<playlist id>`` is used instead.
        """
        playlist_id = self._match_id(url)
        query = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query)
        if not query:
            query = {'list': playlist_id}
        playlist_url = update_url_query('https://www.youtube.com/playlist', query)
        return self.url_result(
            playlist_url, ie=YoutubeTabIE.ie_key(), video_id=playlist_id)
3406
3407
class YoutubeYtUserIE(InfoExtractor):
    """Map the ``ytuser:<name>`` shorthand to the user's channel URL."""
    _VALID_URL = r'ytuser:(?P<id>.+)'
    _TESTS = [{
        'url': 'ytuser:phihag',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Delegate to YoutubeTabIE with the /user/<name> URL."""
        user_id = self._match_id(url)
        user_url = 'https://www.youtube.com/user/%s' % user_id
        return self.url_result(
            user_url, ie=YoutubeTabIE.ie_key(), video_id=user_id)
9558dcec 3420
b05654f0 3421
class YoutubeFavouritesIE(YoutubeBaseInfoExtractor):
    """Map the ``:ytfav`` family of shortcuts to the ``LL`` playlist."""
    IE_NAME = 'youtube:favorites'
    IE_DESC = 'YouTube.com liked videos, ":ytfav" for short (requires authentication)'
    _LOGIN_REQUIRED = True
    _VALID_URL = r':ytfav(?:ou?rite)?s?'
    _TESTS = [{
        'url': ':ytfav',
        'only_matching': True,
    }, {
        'url': ':ytfavorites',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Delegate to YoutubeTabIE with the playlist URL for ``list=LL``."""
        liked_videos_url = 'https://www.youtube.com/playlist?list=LL'
        return self.url_result(liked_videos_url, ie=YoutubeTabIE.ie_key())
3439
3440
class YoutubeSearchIE(SearchInfoExtractor, YoutubeBaseInfoExtractor):
    """Search extractor backed by the ``youtubei/v1/search`` endpoint."""
    IE_NAME = 'youtube:search'
    IE_DESC = 'YouTube.com searches'
    # there doesn't appear to be a real limit, for example if you search for
    # 'python' you get more than 8.000.000 results
    _MAX_RESULTS = float('inf')
    _SEARCH_KEY = 'ytsearch'
    _SEARCH_PARAMS = None
    _TESTS = []

    def _entries(self, query, n):
        """Yield up to ``n`` url_transparent video results for ``query``.

        Pages are requested until ``n`` results have been yielded, a
        request fails, a page lacks the expected result section, or no
        continuation token is found.  Only ``videoRenderer`` items with a
        ``videoId`` are reported.
        """
        payload = {
            'context': {
                'client': {
                    'clientName': 'WEB',
                    'clientVersion': '2.20201021.03.00',
                }
            },
            'query': query,
        }
        if self._SEARCH_PARAMS:
            payload['params'] = self._SEARCH_PARAMS

        yielded = 0
        for page_num in itertools.count(1):
            search = self._download_json(
                'https://www.youtube.com/youtubei/v1/search?key=AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8',
                video_id='query "%s"' % query,
                note='Downloading page %s' % page_num,
                errnote='Unable to download API page', fatal=False,
                data=json.dumps(payload).encode('utf8'),
                headers={'content-type': 'application/json'})
            if not search:
                break

            # The first page nests results under `contents`; continuation
            # pages deliver them through `onResponseReceivedCommands`.
            slr_contents = try_get(
                search,
                (lambda x: x['contents']['twoColumnSearchResultsRenderer']['primaryContents']['sectionListRenderer']['contents'],
                 lambda x: x['onResponseReceivedCommands'][0]['appendContinuationItemsAction']['continuationItems']),
                list)
            if not slr_contents:
                break
            isr_contents = try_get(
                slr_contents,
                lambda x: x[0]['itemSectionRenderer']['contents'],
                list)
            if not isr_contents:
                break

            for content in isr_contents:
                video = content.get('videoRenderer') if isinstance(content, dict) else None
                if not isinstance(video, dict):
                    continue
                video_id = video.get('videoId')
                if not video_id:
                    continue
                view_count_text = try_get(
                    video, lambda x: x['viewCountText']['simpleText'], compat_str) or ''
                # Whitespace is stripped before the leading digits are read.
                leading_digits = self._search_regex(
                    r'^(\d+)', re.sub(r'\s', '', view_count_text),
                    'view count', default=None)
                result = {
                    '_type': 'url_transparent',
                    'ie_key': YoutubeIE.ie_key(),
                    'id': video_id,
                    'url': video_id,
                    'title': try_get(video, lambda x: x['title']['runs'][0]['text'], compat_str),
                    'description': try_get(video, lambda x: x['descriptionSnippet']['runs'][0]['text'], compat_str),
                    'duration': parse_duration(try_get(video, lambda x: x['lengthText']['simpleText'], compat_str)),
                    'view_count': int_or_none(leading_digits),
                    'uploader': try_get(video, lambda x: x['ownerText']['runs'][0]['text'], compat_str),
                }
                yielded += 1
                yield result
                if yielded == n:
                    return

            token = try_get(
                slr_contents,
                lambda x: x[1]['continuationItemRenderer']['continuationEndpoint']['continuationCommand']['token'],
                compat_str)
            if not token:
                break
            payload['continuation'] = token

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""
        entries = self._entries(query, n)
        return self.playlist_result(entries, query)
75dff0ee 3529
c9ae7b95 3530
class YoutubeSearchDateIE(YoutubeSearchIE):
    """Variant of YoutubeSearchIE that sends a fixed ``params`` value."""
    IE_NAME = '%s:date' % YoutubeSearchIE.IE_NAME
    IE_DESC = 'YouTube.com searches, newest videos first'
    _SEARCH_KEY = 'ytsearchdate'
    _SEARCH_PARAMS = 'CAI%3D'
75dff0ee 3536
c9ae7b95 3537
class YoutubeSearchURLIE(YoutubeSearchIE):
    """Run a search described by a youtube.com/results URL."""
    IE_NAME = '%s_url' % YoutubeSearchIE.IE_NAME
    IE_DESC = 'YouTube.com search URLs'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/results\?(.*?&)?(?:search_query|q)=(?:[^&]+)(?:[&]|$)'
    # _MAX_RESULTS = 100
    _TESTS = [{
        'url': 'https://www.youtube.com/results?baz=bar&search_query=youtube-dl+test+video&filters=video&lclk=video',
        'playlist_mincount': 5,
        'info_dict': {
            'title': 'youtube-dl test video',
        }
    }, {
        'url': 'https://www.youtube.com/results?q=test&sp=EgQIBBgB',
        'only_matching': True,
    }]

    @classmethod
    def _make_valid_url(cls):
        """Return this class's own ``_VALID_URL`` pattern."""
        return cls._VALID_URL

    def _real_extract(self, url):
        """Search for the ``search_query``/``q`` value of ``url``.

        The ``sp`` query parameter (empty string when absent) is stored on
        the instance as ``_SEARCH_PARAMS`` before the search is run.
        """
        params = compat_parse_qs(compat_urllib_parse_urlparse(url).query)
        search_terms = params.get('search_query') or params.get('q')
        query = search_terms[0]
        self._SEARCH_PARAMS = params.get('sp', ('',))[0]
        return self._get_n_results(query, self._MAX_RESULTS)
3462ffa8 3563
3564
class YoutubeFeedsInfoExtractor(YoutubeTabIE):
    """
    Common base of the feed extractors.
    Each subclass has to provide the _FEED_NAME property.
    """
    _LOGIN_REQUIRED = True
    # _MAX_PAGES = 5
    _TESTS = []

    @property
    def IE_NAME(self):
        """Extractor name derived from the feed name."""
        return 'youtube:%s' % self._FEED_NAME

    def _real_initialize(self):
        """Log in before any extraction is attempted."""
        self._login()

    def _real_extract(self, url):
        """Delegate to YoutubeTabIE with this feed's /feed/<name> URL."""
        feed_url = 'https://www.youtube.com/feed/%s' % self._FEED_NAME
        return self.url_result(feed_url, ie=YoutubeTabIE.ie_key())
25f14e9f
S
3585
3586
class YoutubeWatchLaterIE(InfoExtractor):
    """Map the ``:ytwatchlater`` shortcut to the ``WL`` playlist."""
    IE_NAME = 'youtube:watchlater'
    IE_DESC = 'Youtube watch later list, ":ytwatchlater" for short (requires authentication)'
    _VALID_URL = r':ytwatchlater'
    _TESTS = [{
        'url': ':ytwatchlater',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Delegate to YoutubeTabIE with the playlist URL for ``list=WL``."""
        watch_later_url = 'https://www.youtube.com/playlist?list=WL'
        return self.url_result(watch_later_url, ie=YoutubeTabIE.ie_key())
3462ffa8 3599
3600
25f14e9f
S
class YoutubeRecommendedIE(YoutubeFeedsInfoExtractor):
    """Feed extractor for ``recommended``.

    Matches the bare youtube.com front page as well as the ``:ytrec`` and
    ``:ytrecommended`` shortcuts.
    """
    _FEED_NAME = 'recommended'
    IE_DESC = 'YouTube.com recommended videos, ":ytrec" for short (requires authentication)'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/?(?:[?#]|$)|:ytrec(?:ommended)?'
    _TESTS = [{
        'url': ':ytrec',
        'only_matching': True,
    }, {
        'url': ':ytrecommended',
        'only_matching': True,
    }, {
        'url': 'https://youtube.com',
        'only_matching': True,
    }]
1ed5b5c9 3615
1ed5b5c9 3616
class YoutubeSubscriptionsIE(YoutubeFeedsInfoExtractor):
    """Feed extractor for ``subscriptions`` (the ``:ytsubs`` shortcuts)."""
    _FEED_NAME = 'subscriptions'
    IE_DESC = 'YouTube.com subscriptions feed, ":ytsubs" for short (requires authentication)'
    _VALID_URL = r':ytsub(?:scription)?s?'
    _TESTS = [{
        'url': ':ytsubs',
        'only_matching': True,
    }, {
        'url': ':ytsubscriptions',
        'only_matching': True,
    }]
1ed5b5c9 3628
1ed5b5c9 3629
25f14e9f
S
class YoutubeHistoryIE(YoutubeFeedsInfoExtractor):
    """Feed extractor for ``history`` (the ``:ythistory`` shortcut)."""
    _FEED_NAME = 'history'
    IE_DESC = 'Youtube watch history, ":ythistory" for short (requires authentication)'
    _VALID_URL = r':ythistory'
    _TESTS = [{
        'url': ':ythistory',
        'only_matching': True,
    }]
1ed5b5c9
JMF
3638
3639
15870e90
PH
class YoutubeTruncatedURLIE(InfoExtractor):
    """Catch watch/attribution URLs that carry no video id and explain why.

    The pattern only accepts URLs that end right after one of the listed
    query parameters (or after a bare ``watch?``).
    """
    IE_NAME = 'youtube:truncated_url'
    IE_DESC = False  # Do not list
    _VALID_URL = r'''(?x)
        (?:https?://)?
        (?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie)?\.com/
        (?:watch\?(?:
            feature=[a-z_]+|
            annotation_id=annotation_[^&]+|
            x-yt-cl=[0-9]+|
            hl=[^&]*|
            t=[0-9]+
        )?
        |
            attribution_link\?a=[^&]+
        )
        $
    '''

    _TESTS = [{
        'url': 'https://www.youtube.com/watch?annotation_id=annotation_3951667041',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/watch?',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/watch?x-yt-cl=84503534',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/watch?feature=foo',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/watch?hl=en-GB',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/watch?t=2372',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Always raise an expected ExtractorError with quoting advice."""
        advice = (
            'Did you forget to quote the URL? Remember that & is a meta '
            'character in most shells, so you want to put the URL in quotes, '
            'like  youtube-dl '
            '"https://www.youtube.com/watch?feature=foo&v=BaW_jenozKc" '
            ' or simply  youtube-dl BaW_jenozKc  .')
        raise ExtractorError(advice, expected=True)
772fd5cc
PH
3687
3688
class YoutubeTruncatedIDIE(InfoExtractor):
    """Catch watch URLs whose ``v`` id is only 1 to 10 characters long."""
    IE_NAME = 'youtube:truncated_id'
    IE_DESC = False  # Do not list
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/watch\?v=(?P<id>[0-9A-Za-z_-]{1,10})$'

    _TESTS = [{
        'url': 'https://www.youtube.com/watch?v=N_708QY7Ob',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Always raise an expected ExtractorError naming the short id."""
        video_id = self._match_id(url)
        message = 'Incomplete YouTube ID %s. URL %s looks truncated.' % (video_id, url)
        raise ExtractorError(message, expected=True)
8bdd16b4 3704
3705
# Do YouTube show URLs even exist anymore? I couldn't find any.
3707r'''
3708class YoutubeShowIE(YoutubeTabIE):
8bdd16b4 3709 IE_DESC = 'YouTube.com (multi-season) shows'
3710 _VALID_URL = r'https?://(?:www\.)?youtube\.com/show/(?P<id>[^?#]*)'
3711 IE_NAME = 'youtube:show'
3712 _TESTS = [{
3713 'url': 'https://www.youtube.com/show/airdisasters',
3714 'playlist_mincount': 5,
3715 'info_dict': {
3716 'id': 'airdisasters',
3717 'title': 'Air Disasters',
3718 }
3719 }]
3720
3721 def _real_extract(self, url):
3722 playlist_id = self._match_id(url)
3723 return super(YoutubeShowIE, self)._real_extract(
3724 'https://www.youtube.com/show/%s/playlists' % playlist_id)
3462ffa8 3725'''