]> jfr.im git - yt-dlp.git/blame - youtube_dlc/extractor/youtube.py
Readme changes
[yt-dlp.git] / youtube_dlc / extractor / youtube.py
CommitLineData
c5e8d7af 1# coding: utf-8
c5e8d7af 2
78caa52a
PH
3from __future__ import unicode_literals
4
5
0ca96d48 6import itertools
c5e8d7af 7import json
c4417ddb 8import os.path
d77ab8e2 9import random
c5e8d7af 10import re
42939b61 11import time
e0df6211 12import traceback
c5e8d7af 13
b05654f0 14from .common import InfoExtractor, SearchInfoExtractor
2b25cb5d 15from ..jsinterp import JSInterpreter
54256267 16from ..swfinterp import SWFInterpreter
4bb4a188 17from ..compat import (
edf3e38e 18 compat_chr,
8d81f3e3 19 compat_kwargs,
c5e8d7af 20 compat_parse_qs,
7fd002c0
S
21 compat_urllib_parse_unquote,
22 compat_urllib_parse_unquote_plus,
15707c7e 23 compat_urllib_parse_urlencode,
7c80519c 24 compat_urllib_parse_urlparse,
7c61bd36 25 compat_urlparse,
c5e8d7af 26 compat_str,
4bb4a188
PH
27)
28from ..utils import (
27019dbb 29 bool_or_none,
c5e8d7af 30 clean_html,
9b9c5355 31 error_to_compat_str,
c5e8d7af 32 ExtractorError,
2d30521a 33 float_or_none,
4bb4a188 34 get_element_by_id,
dd27fd17 35 int_or_none,
94278f72 36 mimetype2ext,
6310acf5 37 parse_codecs,
b84071c0 38 parse_count,
7c80519c 39 parse_duration,
0cb58b02 40 remove_quotes,
3995d37d 41 remove_start,
cf7e015f 42 smuggle_url,
dbdaaa23 43 str_or_none,
c93d53f5 44 str_to_int,
556dbe7f 45 try_get,
c5e8d7af
PH
46 unescapeHTML,
47 unified_strdate,
cf7e015f 48 unsmuggle_url,
8bdd16b4 49 update_url_query,
81c2f20b 50 uppercase_escape,
21c340b8 51 url_or_none,
6e6bc8da 52 urlencode_postdata,
8bdd16b4 53 urljoin,
c5e8d7af
PH
54)
55
5f6a1245 56
class YoutubeBaseInfoExtractor(InfoExtractor):
    """Provide base functions for Youtube extractors"""
    _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
    _TWOFACTOR_URL = 'https://accounts.google.com/signin/challenge'

    _LOOKUP_URL = 'https://accounts.google.com/_/signin/sl/lookup'
    _CHALLENGE_URL = 'https://accounts.google.com/_/signin/sl/challenge'
    _TFA_URL = 'https://accounts.google.com/_/signin/challenge?hl=en&TL={0}'

    _RESERVED_NAMES = (
        r'course|embed|channel|c|user|playlist|watch|w|results|storefront|oops|'
        r'shared|index|account|reporthistory|t/terms|about|upload|signin|logout|'
        r'feed/(watch_later|history|subscriptions|library|trending|recommended)')

    _NETRC_MACHINE = 'youtube'
    # If True it will raise an error if no login info is provided
    _LOGIN_REQUIRED = False

    _PLAYLIST_ID_RE = r'(?:(?:PL|LL|EC|UU|FL|RD|UL|TL|PU|OLAK5uy_)[0-9A-Za-z-_]{10,}|RDMM|WL|LL|LM)'

    _YOUTUBE_CLIENT_HEADERS = {
        'x-youtube-client-name': '1',
        'x-youtube-client-version': '1.20200609.04.02',
    }

    def _set_language(self):
        """Set the ``PREF`` cookie on ``.youtube.com`` requesting ``hl=en``."""
        self._set_cookie(
            '.youtube.com', 'PREF', 'f1=50000000&f6=8&hl=en',
            # YouTube sets the expire time to about two months
            expire_time=time.time() + 2 * 30 * 24 * 3600)

    def _ids_to_results(self, ids):
        """Return a list of ``url_result`` entries, one per video id in *ids*."""
        return [
            self.url_result(vid_id, 'Youtube', video_id=vid_id)
            for vid_id in ids]

    def _login(self):
        """
        Attempt to log in to YouTube.
        True is returned if successful or skipped.
        False is returned if login failed.

        If _LOGIN_REQUIRED is set and no authentication was provided, an error is raised.
        """
        username, password = self._get_login_info()
        # No authentication to be performed
        if username is None:
            if self._LOGIN_REQUIRED and self._downloader.params.get('cookiefile') is None:
                raise ExtractorError('No login info available, needed for using %s.' % self.IE_NAME, expected=True)
            if self._downloader.params.get('cookiefile') and False:  # TODO remove 'and False' later - too many people using outdated cookies and open issues, remind them.
                self.to_screen('[Cookies] Reminder - Make sure to always use up to date cookies!')
            return True

        login_page = self._download_webpage(
            self._LOGIN_URL, None,
            note='Downloading login page',
            errnote='unable to fetch login page', fatal=False)
        if login_page is False:
            # Explicit False (not a bare return) to honour the documented contract
            return False

        login_form = self._hidden_inputs(login_page)

        def req(url, f_req, note, errnote):
            # POST the hidden login form fields plus the JSON-encoded f.req payload
            data = login_form.copy()
            data.update({
                'pstMsg': 1,
                'checkConnection': 'youtube',
                'checkedDomains': 'youtube',
                'hl': 'en',
                'deviceinfo': '[null,null,null,[],null,"US",null,null,[],"GlifWebSignIn",null,[null,null,[]]]',
                'f.req': json.dumps(f_req),
                'flowName': 'GlifWebSignIn',
                'flowEntry': 'ServiceLogin',
                # TODO: reverse actual botguard identifier generation algo
                'bgRequest': '["identifier",""]',
            })
            return self._download_json(
                url, None, note=note, errnote=errnote,
                # Drop everything preceding the first '[' before JSON parsing
                transform_source=lambda s: re.sub(r'^[^[]*', '', s),
                fatal=False,
                data=urlencode_postdata(data), headers={
                    'Content-Type': 'application/x-www-form-urlencoded;charset=utf-8',
                    'Google-Accounts-XSRF': 1,
                })

        def warn(message):
            self._downloader.report_warning(message)

        # Shared by the lookup and the challenge payloads below
        service_login_url = 'https://accounts.google.com/ServiceLogin?passive=true&continue=https%3A%2F%2Fwww.youtube.com%2Fsignin%3Fnext%3D%252F%26action_handle_signin%3Dtrue%26hl%3Den%26app%3Ddesktop%26feature%3Dsign_in_button&hl=en&service=youtube&uilel=3&requestPath=%2FServiceLogin&Page=PasswordSeparationSignIn'

        lookup_req = [
            username,
            None, [], None, 'US', None, None, 2, False, True,
            [
                None, None,
                [2, 1, None, 1,
                 service_login_url,
                 None, [], 4],
                1, [None, None, []], None, None, None, True
            ],
            username,
        ]

        lookup_results = req(
            self._LOOKUP_URL, lookup_req,
            'Looking up account info', 'Unable to look up account info')

        if lookup_results is False:
            return False

        user_hash = try_get(lookup_results, lambda x: x[0][2], compat_str)
        if not user_hash:
            warn('Unable to extract user hash')
            return False

        challenge_req = [
            user_hash,
            None, 1, None, [1, None, None, None, [password, None, True]],
            [
                None, None, [2, 1, None, 1, service_login_url, None, [], 4],
                1, [None, None, []], None, None, None, True
            ]]

        challenge_results = req(
            self._CHALLENGE_URL, challenge_req,
            'Logging in', 'Unable to log in')

        if challenge_results is False:
            # Explicit False (not a bare return) to honour the documented contract
            return False

        login_res = try_get(challenge_results, lambda x: x[0][5], list)
        if login_res:
            login_msg = try_get(login_res, lambda x: x[5], compat_str)
            # The parentheses matter: '%' binds tighter than the conditional
            # expression, so without them the prefix is lost for every
            # message other than INCORRECT_ANSWER_ENTERED.
            warn('Unable to login: %s' % (
                'Invalid password'
                if login_msg == 'INCORRECT_ANSWER_ENTERED' else login_msg))
            return False

        res = try_get(challenge_results, lambda x: x[0][-1], list)
        if not res:
            warn('Unable to extract result entry')
            return False

        login_challenge = try_get(res, lambda x: x[0][0], list)
        if login_challenge:
            challenge_str = try_get(login_challenge, lambda x: x[2], compat_str)
            if challenge_str == 'TWO_STEP_VERIFICATION':
                # SEND_SUCCESS - TFA code has been successfully sent to phone
                # QUOTA_EXCEEDED - reached the limit of TFA codes
                status = try_get(login_challenge, lambda x: x[5], compat_str)
                if status == 'QUOTA_EXCEEDED':
                    warn('Exceeded the limit of TFA codes, try later')
                    return False

                tl = try_get(challenge_results, lambda x: x[1][2], compat_str)
                if not tl:
                    warn('Unable to extract TL')
                    return False

                tfa_code = self._get_tfa_info('2-step verification code')

                if not tfa_code:
                    warn(
                        'Two-factor authentication required. Provide it either interactively or with --twofactor <code> '
                        '(Note that only TOTP (Google Authenticator App) codes work at this time.)')
                    return False

                tfa_code = remove_start(tfa_code, 'G-')

                tfa_req = [
                    user_hash, None, 2, None,
                    [
                        9, None, None, None, None, None, None, None,
                        [None, tfa_code, True, 2]
                    ]]

                tfa_results = req(
                    self._TFA_URL.format(tl), tfa_req,
                    'Submitting TFA code', 'Unable to submit TFA code')

                if tfa_results is False:
                    return False

                tfa_res = try_get(tfa_results, lambda x: x[0][5], list)
                if tfa_res:
                    tfa_msg = try_get(tfa_res, lambda x: x[5], compat_str)
                    # Parenthesised for the same precedence reason as above
                    warn('Unable to finish TFA: %s' % (
                        'Invalid TFA code'
                        if tfa_msg == 'INCORRECT_ANSWER_ENTERED' else tfa_msg))
                    return False

                check_cookie_url = try_get(
                    tfa_results, lambda x: x[0][-1][2], compat_str)
            else:
                CHALLENGES = {
                    'LOGIN_CHALLENGE': "This device isn't recognized. For your security, Google wants to make sure it's really you.",
                    'USERNAME_RECOVERY': 'Please provide additional information to aid in the recovery process.',
                    'REAUTH': "There is something unusual about your activity. For your security, Google wants to make sure it's really you.",
                }
                challenge = CHALLENGES.get(
                    challenge_str,
                    '%s returned error %s.' % (self.IE_NAME, challenge_str))
                warn('%s\nGo to https://accounts.google.com/, login and solve a challenge.' % challenge)
                return False
        else:
            check_cookie_url = try_get(res, lambda x: x[2], compat_str)

        if not check_cookie_url:
            warn('Unable to extract CheckCookie URL')
            return False

        check_cookie_results = self._download_webpage(
            check_cookie_url, None, 'Checking cookie', fatal=False)

        if check_cookie_results is False:
            return False

        if 'https://myaccount.google.com/' not in check_cookie_results:
            warn('Unable to log in')
            return False

        return True

    def _download_webpage_handle(self, *args, **kwargs):
        """Delegate to the parent implementation with a private copy of ``query``.

        A missing or ``None`` ``query`` keyword is replaced by an empty dict.
        """
        kwargs['query'] = dict(kwargs.get('query') or {})
        return super(YoutubeBaseInfoExtractor, self)._download_webpage_handle(
            *args, **compat_kwargs(kwargs))

    def _get_yt_initial_data(self, video_id, webpage):
        """Return the parsed ``ytInitialData`` object found in *webpage*.

        Returns None when no ``ytInitialData`` assignment is found; JSON
        parsing is requested with ``fatal=False``.
        """
        config = self._search_regex(
            (r'window\["ytInitialData"\]\s*=\s*(.*?)(?<=});',
             r'var\s+ytInitialData\s*=\s*(.*?)(?<=});'),
            webpage, 'ytInitialData', default=None)
        if config:
            return self._parse_json(
                uppercase_escape(config), video_id, fatal=False)

    def _real_initialize(self):
        """Set the language cookie and attempt to log in, if a downloader is attached."""
        if self._downloader is None:
            return
        self._set_language()
        if not self._login():
            return

    _DEFAULT_API_DATA = {
        'context': {
            'client': {
                'clientName': 'WEB',
                'clientVersion': '2.20201021.03.00',
            }
        },
    }

    _YT_INITIAL_DATA_RE = r'(?:window\s*\[\s*["\']ytInitialData["\']\s*\]|ytInitialData)\s*=\s*({.+?})\s*;'

    def _call_api(self, ep, query, video_id):
        """Call the ``youtubei/v1`` endpoint *ep* and return the decoded JSON.

        The request body is ``_DEFAULT_API_DATA`` with the top-level keys of
        *query* merged over it, serialized as JSON.
        """
        # Shallow copy is sufficient: update() only rebinds top-level keys
        data = self._DEFAULT_API_DATA.copy()
        data.update(query)

        response = self._download_json(
            'https://www.youtube.com/youtubei/v1/%s' % ep, video_id=video_id,
            note='Downloading API JSON', errnote='Unable to download API page',
            data=json.dumps(data).encode('utf8'),
            headers={'content-type': 'application/json'},
            query={'key': 'AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8'})

        return response

    def _extract_yt_initial_data(self, video_id, webpage):
        """Return the parsed ``ytInitialData`` object matched by ``_YT_INITIAL_DATA_RE``.

        The pattern followed by a newline is tried first, then the bare
        pattern. No default is supplied to the search.
        """
        return self._parse_json(
            self._search_regex(
                (r'%s\s*\n' % self._YT_INITIAL_DATA_RE,
                 self._YT_INITIAL_DATA_RE), webpage, 'yt initial data'),
            video_id)
0c148415
S
330
331
360e1ca5 332class YoutubeIE(YoutubeBaseInfoExtractor):
78caa52a 333 IE_DESC = 'YouTube.com'
cb7dfeea 334 _VALID_URL = r"""(?x)^
c5e8d7af 335 (
edb53e2d 336 (?:https?://|//) # http(s):// or protocol-independent URL
66b48727 337 (?:(?:(?:(?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie|kids)?\.com/|
484aaeb2 338 (?:www\.)?deturl\.com/www\.youtube\.com/|
e70dc1d1 339 (?:www\.)?pwnyoutube\.com/|
8b561bfc 340 (?:www\.)?hooktube\.com/|
f7000f3a 341 (?:www\.)?yourepeat\.com/|
e69ae5b9 342 tube\.majestyc\.net/|
ba036333 343 # Invidious instances taken from https://github.com/omarroth/invidious/wiki/Invidious-Instances
77d95677 344 (?:(?:www|dev)\.)?invidio\.us/|
ba036333 345 (?:(?:www|no)\.)?invidiou\.sh/|
346 (?:(?:www|fi|de)\.)?invidious\.snopyta\.org/|
8ae113ca 347 (?:www\.)?invidious\.kabi\.tk/|
ba036333 348 (?:www\.)?invidious\.13ad\.de/|
791d2e81 349 (?:www\.)?invidious\.mastodon\.host/|
494d664e 350 (?:www\.)?invidious\.nixnet\.xyz/|
666d808e 351 (?:www\.)?invidious\.drycat\.fr/|
ba036333 352 (?:www\.)?tube\.poal\.co/|
8ae113ca 353 (?:www\.)?vid\.wxzm\.sx/|
384bf91f 354 (?:www\.)?yewtu\.be/|
494d664e 355 (?:www\.)?yt\.elukerio\.org/|
894b3826 356 (?:www\.)?yt\.lelux\.fi/|
1db5ab6b 357 (?:www\.)?invidious\.ggc-project\.de/|
358 (?:www\.)?yt\.maisputain\.ovh/|
359 (?:www\.)?invidious\.13ad\.de/|
360 (?:www\.)?invidious\.toot\.koeln/|
361 (?:www\.)?invidious\.fdn\.fr/|
362 (?:www\.)?watch\.nettohikari\.com/|
bff90fc5 363 (?:www\.)?kgg2m7yk5aybusll\.onion/|
364 (?:www\.)?qklhadlycap4cnod\.onion/|
365 (?:www\.)?axqzx4s6s54s32yentfqojs3x5i7faxza6xo3ehd4bzzsg2ii4fv2iid\.onion/|
366 (?:www\.)?c7hqkpkpemu6e7emz5b4vyz7idjgdvgaaa3dyimmeojqbgpea3xqjoid\.onion/|
367 (?:www\.)?fz253lmuao3strwbfbmx46yu7acac2jz27iwtorgmbqlkurlclmancad\.onion/|
368 (?:www\.)?invidious\.l4qlywnpwqsluw65ts7md3khrivpirse744un3x7mlskqauz5pyuzgqd\.onion/|
33c1c7d8 369 (?:www\.)?owxfohz4kjyv25fvlqilyxast7inivgiktls3th44jhk3ej3i7ya\.b32\.i2p/|
1db5ab6b 370 (?:www\.)?4l2dgddgsrkf2ous66i6seeyi6etzfgrue332grh2n7madpwopotugyd\.onion/|
e69ae5b9 371 youtube\.googleapis\.com/) # the various hostnames, with wildcard subdomains
c5e8d7af
PH
372 (?:.*?\#/)? # handle anchor (#/) redirect urls
373 (?: # the various things that can precede the ID:
ac7553d0 374 (?:(?:v|embed|e)/(?!videoseries)) # v/ or embed/ or e/
c5e8d7af 375 |(?: # or the v= param in all its forms
f7000f3a 376 (?:(?:watch|movie)(?:_popup)?(?:\.php)?/?)? # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
c5e8d7af 377 (?:\?|\#!?) # the params delimiter ? or # or #!
040ac686 378 (?:.*?[&;])?? # any other preceding param (like /?s=tuff&v=xxxx or ?s=tuff&amp;v=V36LpHqtcDY)
c5e8d7af
PH
379 v=
380 )
f4b05232 381 ))
cbaed4bb
S
382 |(?:
383 youtu\.be| # just youtu.be/xxxx
6d4fc66b
S
384 vid\.plus| # or vid.plus/xxxx
385 zwearz\.com/watch| # or zwearz.com/watch/xxxx
cbaed4bb 386 )/
edb53e2d 387 |(?:www\.)?cleanvideosearch\.com/media/action/yt/watch\?videoId=
f4b05232 388 )
c5e8d7af 389 )? # all until now is optional -> you can pass the naked ID
8bdd16b4 390 (?P<id>[0-9A-Za-z_-]{11}) # here is it! the YouTube video ID
d0ba5587
S
391 (?!.*?\blist=
392 (?:
393 %(playlist_id)s| # combined list/video URLs are handled by the playlist IE
394 WL # WL are handled by the watch later IE
395 )
396 )
c5e8d7af 397 (?(1).+)? # if we found the ID, everything can follow
d0ba5587 398 $""" % {'playlist_id': YoutubeBaseInfoExtractor._PLAYLIST_ID_RE}
c5e8d7af 399 _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
e40c758c
S
400 _PLAYER_INFO_RE = (
401 r'/(?P<id>[a-zA-Z0-9_-]{8,})/player_ias\.vflset(?:/[a-zA-Z]{2,3}_[a-zA-Z]{2,3})?/base\.(?P<ext>[a-z]+)$',
402 r'\b(?P<id>vfl[a-zA-Z0-9_-]+)\b.*?\.(?P<ext>[a-z]+)$',
403 )
2c62dc26 404 _formats = {
c2d3cb4c 405 '5': {'ext': 'flv', 'width': 400, 'height': 240, 'acodec': 'mp3', 'abr': 64, 'vcodec': 'h263'},
406 '6': {'ext': 'flv', 'width': 450, 'height': 270, 'acodec': 'mp3', 'abr': 64, 'vcodec': 'h263'},
407 '13': {'ext': '3gp', 'acodec': 'aac', 'vcodec': 'mp4v'},
408 '17': {'ext': '3gp', 'width': 176, 'height': 144, 'acodec': 'aac', 'abr': 24, 'vcodec': 'mp4v'},
409 '18': {'ext': 'mp4', 'width': 640, 'height': 360, 'acodec': 'aac', 'abr': 96, 'vcodec': 'h264'},
410 '22': {'ext': 'mp4', 'width': 1280, 'height': 720, 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264'},
411 '34': {'ext': 'flv', 'width': 640, 'height': 360, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'},
412 '35': {'ext': 'flv', 'width': 854, 'height': 480, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'},
3834d3e3 413 # itag 36 videos are either 320x180 (BaW_jenozKc) or 320x240 (__2ABJjxzNo), abr varies as well
c2d3cb4c 414 '36': {'ext': '3gp', 'width': 320, 'acodec': 'aac', 'vcodec': 'mp4v'},
415 '37': {'ext': 'mp4', 'width': 1920, 'height': 1080, 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264'},
416 '38': {'ext': 'mp4', 'width': 4096, 'height': 3072, 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264'},
417 '43': {'ext': 'webm', 'width': 640, 'height': 360, 'acodec': 'vorbis', 'abr': 128, 'vcodec': 'vp8'},
418 '44': {'ext': 'webm', 'width': 854, 'height': 480, 'acodec': 'vorbis', 'abr': 128, 'vcodec': 'vp8'},
419 '45': {'ext': 'webm', 'width': 1280, 'height': 720, 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8'},
e1a0bfdf 420 '46': {'ext': 'webm', 'width': 1920, 'height': 1080, 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8'},
c2d3cb4c 421 '59': {'ext': 'mp4', 'width': 854, 'height': 480, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'},
422 '78': {'ext': 'mp4', 'width': 854, 'height': 480, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'},
e1a0bfdf 423
424
425 # 3D videos
c2d3cb4c 426 '82': {'ext': 'mp4', 'height': 360, 'format_note': '3D', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -20},
427 '83': {'ext': 'mp4', 'height': 480, 'format_note': '3D', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -20},
428 '84': {'ext': 'mp4', 'height': 720, 'format_note': '3D', 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264', 'preference': -20},
429 '85': {'ext': 'mp4', 'height': 1080, 'format_note': '3D', 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264', 'preference': -20},
e1a0bfdf 430 '100': {'ext': 'webm', 'height': 360, 'format_note': '3D', 'acodec': 'vorbis', 'abr': 128, 'vcodec': 'vp8', 'preference': -20},
431 '101': {'ext': 'webm', 'height': 480, 'format_note': '3D', 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8', 'preference': -20},
432 '102': {'ext': 'webm', 'height': 720, 'format_note': '3D', 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8', 'preference': -20},
836a086c 433
96fb5605 434 # Apple HTTP Live Streaming
11f12195 435 '91': {'ext': 'mp4', 'height': 144, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 48, 'vcodec': 'h264', 'preference': -10},
c2d3cb4c 436 '92': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 48, 'vcodec': 'h264', 'preference': -10},
437 '93': {'ext': 'mp4', 'height': 360, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -10},
438 '94': {'ext': 'mp4', 'height': 480, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -10},
439 '95': {'ext': 'mp4', 'height': 720, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 256, 'vcodec': 'h264', 'preference': -10},
440 '96': {'ext': 'mp4', 'height': 1080, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 256, 'vcodec': 'h264', 'preference': -10},
e1a0bfdf 441 '132': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 48, 'vcodec': 'h264', 'preference': -10},
442 '151': {'ext': 'mp4', 'height': 72, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 24, 'vcodec': 'h264', 'preference': -10},
2c62dc26
PH
443
444 # DASH mp4 video
d23028a8
S
445 '133': {'ext': 'mp4', 'height': 240, 'format_note': 'DASH video', 'vcodec': 'h264'},
446 '134': {'ext': 'mp4', 'height': 360, 'format_note': 'DASH video', 'vcodec': 'h264'},
447 '135': {'ext': 'mp4', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'h264'},
448 '136': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'h264'},
449 '137': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'h264'},
067aa17e 450 '138': {'ext': 'mp4', 'format_note': 'DASH video', 'vcodec': 'h264'}, # Height can vary (https://github.com/ytdl-org/youtube-dl/issues/4559)
d23028a8
S
451 '160': {'ext': 'mp4', 'height': 144, 'format_note': 'DASH video', 'vcodec': 'h264'},
452 '212': {'ext': 'mp4', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'h264'},
453 '264': {'ext': 'mp4', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'h264'},
454 '298': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'h264', 'fps': 60},
455 '299': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'h264', 'fps': 60},
456 '266': {'ext': 'mp4', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'h264'},
836a086c 457
f6f1fc92 458 # Dash mp4 audio
d23028a8
S
459 '139': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 48, 'container': 'm4a_dash'},
460 '140': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 128, 'container': 'm4a_dash'},
461 '141': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 256, 'container': 'm4a_dash'},
462 '256': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'container': 'm4a_dash'},
463 '258': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'container': 'm4a_dash'},
464 '325': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'dtse', 'container': 'm4a_dash'},
465 '328': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'ec-3', 'container': 'm4a_dash'},
836a086c
AZ
466
467 # Dash webm
d23028a8
S
468 '167': {'ext': 'webm', 'height': 360, 'width': 640, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
469 '168': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
470 '169': {'ext': 'webm', 'height': 720, 'width': 1280, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
471 '170': {'ext': 'webm', 'height': 1080, 'width': 1920, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
472 '218': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
473 '219': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
474 '278': {'ext': 'webm', 'height': 144, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp9'},
475 '242': {'ext': 'webm', 'height': 240, 'format_note': 'DASH video', 'vcodec': 'vp9'},
476 '243': {'ext': 'webm', 'height': 360, 'format_note': 'DASH video', 'vcodec': 'vp9'},
477 '244': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'vp9'},
478 '245': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'vp9'},
479 '246': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'vp9'},
480 '247': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'vp9'},
481 '248': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'vp9'},
482 '271': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'vp9'},
4c6b4764 483 # itag 272 videos are either 3840x2160 (e.g. RtoitU2A-3E) or 7680x4320 (sLprVF6d7Ug)
d23028a8
S
484 '272': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'vp9'},
485 '302': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60},
486 '303': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60},
487 '308': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60},
488 '313': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'vp9'},
489 '315': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60},
2c62dc26
PH
490
491 # Dash webm audio
d23028a8
S
492 '171': {'ext': 'webm', 'acodec': 'vorbis', 'format_note': 'DASH audio', 'abr': 128},
493 '172': {'ext': 'webm', 'acodec': 'vorbis', 'format_note': 'DASH audio', 'abr': 256},
ce6b9a2d 494
0857baad 495 # Dash webm audio with opus inside
d23028a8
S
496 '249': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 50},
497 '250': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 70},
498 '251': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 160},
0857baad 499
ce6b9a2d
PH
500 # RTMP (unnamed)
501 '_rtmp': {'protocol': 'rtmp'},
b85eae0f
S
502
503 # av01 video only formats sometimes served with "unknown" codecs
504 '394': {'acodec': 'none', 'vcodec': 'av01.0.05M.08'},
505 '395': {'acodec': 'none', 'vcodec': 'av01.0.05M.08'},
506 '396': {'acodec': 'none', 'vcodec': 'av01.0.05M.08'},
507 '397': {'acodec': 'none', 'vcodec': 'av01.0.05M.08'},
c5e8d7af 508 }
40ec740f 509 _SUBTITLE_FORMATS = ('srv1', 'srv2', 'srv3', 'ttml', 'vtt') # TODO 'json3' raising issues with automatic captions
836a086c 510
fd5c4aab
S
511 _GEO_BYPASS = False
512
78caa52a 513 IE_NAME = 'youtube'
2eb88d95
PH
514 _TESTS = [
515 {
2d3d2997 516 'url': 'https://www.youtube.com/watch?v=BaW_jenozKc&t=1s&end=9',
4bc3a23e
PH
517 'info_dict': {
518 'id': 'BaW_jenozKc',
519 'ext': 'mp4',
3867038a 520 'title': 'youtube-dl test video "\'/\\ä↭𝕐',
4bc3a23e
PH
521 'uploader': 'Philipp Hagemeister',
522 'uploader_id': 'phihag',
ec85ded8 523 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/phihag',
dd4c4492
S
524 'channel_id': 'UCLqxVugv74EIW3VWh2NOa3Q',
525 'channel_url': r're:https?://(?:www\.)?youtube\.com/channel/UCLqxVugv74EIW3VWh2NOa3Q',
4bc3a23e 526 'upload_date': '20121002',
3867038a 527 'description': 'test chars: "\'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de .',
4bc3a23e 528 'categories': ['Science & Technology'],
3867038a 529 'tags': ['youtube-dl'],
556dbe7f 530 'duration': 10,
dbdaaa23 531 'view_count': int,
3e7c1224
PH
532 'like_count': int,
533 'dislike_count': int,
7c80519c 534 'start_time': 1,
297a564b 535 'end_time': 9,
2eb88d95 536 }
0e853ca4 537 },
fccd3771 538 {
4bc3a23e
PH
539 'url': '//www.YouTube.com/watch?v=yZIXLfi8CZQ',
540 'note': 'Embed-only video (#1746)',
541 'info_dict': {
542 'id': 'yZIXLfi8CZQ',
543 'ext': 'mp4',
544 'upload_date': '20120608',
545 'title': 'Principal Sexually Assaults A Teacher - Episode 117 - 8th June 2012',
546 'description': 'md5:09b78bd971f1e3e289601dfba15ca4f7',
547 'uploader': 'SET India',
94bfcd23 548 'uploader_id': 'setindia',
ec85ded8 549 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/setindia',
94bfcd23 550 'age_limit': 18,
fccd3771
PH
551 }
552 },
11b56058 553 {
8bdd16b4 554 'url': 'https://www.youtube.com/watch?v=BaW_jenozKc&v=yZIXLfi8CZQ',
11b56058
PM
555 'note': 'Use the first video ID in the URL',
556 'info_dict': {
557 'id': 'BaW_jenozKc',
558 'ext': 'mp4',
3867038a 559 'title': 'youtube-dl test video "\'/\\ä↭𝕐',
11b56058
PM
560 'uploader': 'Philipp Hagemeister',
561 'uploader_id': 'phihag',
ec85ded8 562 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/phihag',
11b56058 563 'upload_date': '20121002',
3867038a 564 'description': 'test chars: "\'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de .',
11b56058 565 'categories': ['Science & Technology'],
3867038a 566 'tags': ['youtube-dl'],
556dbe7f 567 'duration': 10,
dbdaaa23 568 'view_count': int,
11b56058
PM
569 'like_count': int,
570 'dislike_count': int,
34a7de29
S
571 },
572 'params': {
573 'skip_download': True,
574 },
11b56058 575 },
dd27fd17 576 {
2d3d2997 577 'url': 'https://www.youtube.com/watch?v=a9LDPn-MO4I',
4bc3a23e
PH
578 'note': '256k DASH audio (format 141) via DASH manifest',
579 'info_dict': {
580 'id': 'a9LDPn-MO4I',
581 'ext': 'm4a',
582 'upload_date': '20121002',
583 'uploader_id': '8KVIDEO',
ec85ded8 584 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/8KVIDEO',
4bc3a23e
PH
585 'description': '',
586 'uploader': '8KVIDEO',
587 'title': 'UHDTV TEST 8K VIDEO.mp4'
4919603f 588 },
4bc3a23e
PH
589 'params': {
590 'youtube_include_dash_manifest': True,
591 'format': '141',
4919603f 592 },
de3c7fe0 593 'skip': 'format 141 not served anymore',
dd27fd17 594 },
8bdd16b4 595 # DASH manifest with encrypted signature
596 {
597 'url': 'https://www.youtube.com/watch?v=IB3lcPjvWLA',
598 'info_dict': {
599 'id': 'IB3lcPjvWLA',
600 'ext': 'm4a',
601 'title': 'Afrojack, Spree Wilson - The Spark (Official Music Video) ft. Spree Wilson',
602 'description': 'md5:8f5e2b82460520b619ccac1f509d43bf',
603 'duration': 244,
604 'uploader': 'AfrojackVEVO',
605 'uploader_id': 'AfrojackVEVO',
606 'upload_date': '20131011',
607 },
608 'params': {
609 'youtube_include_dash_manifest': True,
610 'format': '141/bestaudio[ext=m4a]',
611 },
612 },
aa79ac0c
PH
613 # Controversy video
614 {
615 'url': 'https://www.youtube.com/watch?v=T4XJQO3qol8',
616 'info_dict': {
617 'id': 'T4XJQO3qol8',
618 'ext': 'mp4',
556dbe7f 619 'duration': 219,
aa79ac0c 620 'upload_date': '20100909',
4fe54c12 621 'uploader': 'Amazing Atheist',
aa79ac0c 622 'uploader_id': 'TheAmazingAtheist',
ec85ded8 623 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/TheAmazingAtheist',
aa79ac0c
PH
624 'title': 'Burning Everyone\'s Koran',
625 'description': 'SUBSCRIBE: http://www.youtube.com/saturninefilms\n\nEven Obama has taken a stand against freedom on this issue: http://www.huffingtonpost.com/2010/09/09/obama-gma-interview-quran_n_710282.html',
626 }
c522adb1 627 },
dd2d55f1 628 # Normal age-gate video (embed allowed)
c522adb1 629 {
2d3d2997 630 'url': 'https://youtube.com/watch?v=HtVdAasjOgU',
c522adb1
JMF
631 'info_dict': {
632 'id': 'HtVdAasjOgU',
633 'ext': 'mp4',
634 'title': 'The Witcher 3: Wild Hunt - The Sword Of Destiny Trailer',
ec85ded8 635 'description': r're:(?s).{100,}About the Game\n.*?The Witcher 3: Wild Hunt.{100,}',
556dbe7f 636 'duration': 142,
c522adb1
JMF
637 'uploader': 'The Witcher',
638 'uploader_id': 'WitcherGame',
ec85ded8 639 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/WitcherGame',
c522adb1 640 'upload_date': '20140605',
34952f09 641 'age_limit': 18,
c522adb1
JMF
642 },
643 },
8bdd16b4 644 # video_info is None (https://github.com/ytdl-org/youtube-dl/issues/4421)
645 # YouTube Red ad is not captured for creator
646 {
647 'url': '__2ABJjxzNo',
648 'info_dict': {
649 'id': '__2ABJjxzNo',
650 'ext': 'mp4',
651 'duration': 266,
652 'upload_date': '20100430',
653 'uploader_id': 'deadmau5',
654 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/deadmau5',
655 'creator': 'Dada Life, deadmau5',
656 'description': 'md5:12c56784b8032162bb936a5f76d55360',
657 'uploader': 'deadmau5',
658 'title': 'Deadmau5 - Some Chords (HD)',
659 'alt_title': 'This Machine Kills Some Chords',
660 },
661 'expected_warnings': [
662 'DASH manifest missing',
663 ]
664 },
067aa17e 665 # Olympics (https://github.com/ytdl-org/youtube-dl/issues/4431)
e52a40ab
PH
666 {
667 'url': 'lqQg6PlCWgI',
668 'info_dict': {
669 'id': 'lqQg6PlCWgI',
670 'ext': 'mp4',
556dbe7f 671 'duration': 6085,
90227264 672 'upload_date': '20150827',
cbe2bd91 673 'uploader_id': 'olympic',
ec85ded8 674 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/olympic',
cbe2bd91 675 'description': 'HO09 - Women - GER-AUS - Hockey - 31 July 2012 - London 2012 Olympic Games',
be49068d 676 'uploader': 'Olympic',
cbe2bd91
PH
677 'title': 'Hockey - Women - GER-AUS - London 2012 Olympic Games',
678 },
679 'params': {
680 'skip_download': 'requires avconv',
e52a40ab 681 }
cbe2bd91 682 },
6271f1ca
PH
683 # Non-square pixels
684 {
685 'url': 'https://www.youtube.com/watch?v=_b-2C3KPAM0',
686 'info_dict': {
687 'id': '_b-2C3KPAM0',
688 'ext': 'mp4',
689 'stretched_ratio': 16 / 9.,
556dbe7f 690 'duration': 85,
6271f1ca
PH
691 'upload_date': '20110310',
692 'uploader_id': 'AllenMeow',
ec85ded8 693 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/AllenMeow',
6271f1ca 694 'description': 'made by Wacom from Korea | 字幕&加油添醋 by TY\'s Allen | 感謝heylisa00cavey1001同學熱情提供梗及翻譯',
eb6793ba 695 'uploader': '孫ᄋᄅ',
6271f1ca
PH
696 'title': '[A-made] 變態妍字幕版 太妍 我就是這樣的人',
697 },
06b491eb
S
698 },
699 # url_encoded_fmt_stream_map is empty string
700 {
701 'url': 'qEJwOuvDf7I',
702 'info_dict': {
703 'id': 'qEJwOuvDf7I',
f57b7835 704 'ext': 'webm',
06b491eb
S
705 'title': 'Обсуждение судебной практики по выборам 14 сентября 2014 года в Санкт-Петербурге',
706 'description': '',
707 'upload_date': '20150404',
708 'uploader_id': 'spbelect',
709 'uploader': 'Наблюдатели Петербурга',
710 },
711 'params': {
712 'skip_download': 'requires avconv',
e323cf3f
S
713 },
714 'skip': 'This live event has ended.',
06b491eb 715 },
067aa17e 716 # Extraction from multiple DASH manifests (https://github.com/ytdl-org/youtube-dl/pull/6097)
da77d856
S
717 {
718 'url': 'https://www.youtube.com/watch?v=FIl7x6_3R5Y',
719 'info_dict': {
720 'id': 'FIl7x6_3R5Y',
eb6793ba 721 'ext': 'webm',
da77d856
S
722 'title': 'md5:7b81415841e02ecd4313668cde88737a',
723 'description': 'md5:116377fd2963b81ec4ce64b542173306',
556dbe7f 724 'duration': 220,
da77d856
S
725 'upload_date': '20150625',
726 'uploader_id': 'dorappi2000',
ec85ded8 727 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/dorappi2000',
da77d856 728 'uploader': 'dorappi2000',
eb6793ba 729 'formats': 'mincount:31',
da77d856 730 },
eb6793ba 731 'skip': 'not actual anymore',
2ee8f5d8 732 },
8a1a26ce
YCH
733 # DASH manifest with segment_list
734 {
735 'url': 'https://www.youtube.com/embed/CsmdDsKjzN8',
736 'md5': '8ce563a1d667b599d21064e982ab9e31',
737 'info_dict': {
738 'id': 'CsmdDsKjzN8',
739 'ext': 'mp4',
17ee98e1 740 'upload_date': '20150501', # According to '<meta itemprop="datePublished"', but in other places it's 20150510
8a1a26ce
YCH
741 'uploader': 'Airtek',
742 'description': 'Retransmisión en directo de la XVIII media maratón de Zaragoza.',
743 'uploader_id': 'UCzTzUmjXxxacNnL8I3m4LnQ',
744 'title': 'Retransmisión XVIII Media maratón Zaragoza 2015',
745 },
746 'params': {
747 'youtube_include_dash_manifest': True,
748 'format': '135', # bestvideo
be49068d
S
749 },
750 'skip': 'This live event has ended.',
2ee8f5d8 751 },
cf7e015f
S
752 {
753 # Multifeed videos (multiple cameras), URL is for Main Camera
754 'url': 'https://www.youtube.com/watch?v=jqWvoWXjCVs',
755 'info_dict': {
756 'id': 'jqWvoWXjCVs',
757 'title': 'teamPGP: Rocket League Noob Stream',
758 'description': 'md5:dc7872fb300e143831327f1bae3af010',
759 },
760 'playlist': [{
761 'info_dict': {
762 'id': 'jqWvoWXjCVs',
763 'ext': 'mp4',
764 'title': 'teamPGP: Rocket League Noob Stream (Main Camera)',
765 'description': 'md5:dc7872fb300e143831327f1bae3af010',
556dbe7f 766 'duration': 7335,
cf7e015f
S
767 'upload_date': '20150721',
768 'uploader': 'Beer Games Beer',
769 'uploader_id': 'beergamesbeer',
ec85ded8 770 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/beergamesbeer',
7caf9830 771 'license': 'Standard YouTube License',
cf7e015f
S
772 },
773 }, {
774 'info_dict': {
775 'id': '6h8e8xoXJzg',
776 'ext': 'mp4',
777 'title': 'teamPGP: Rocket League Noob Stream (kreestuh)',
778 'description': 'md5:dc7872fb300e143831327f1bae3af010',
556dbe7f 779 'duration': 7337,
cf7e015f
S
780 'upload_date': '20150721',
781 'uploader': 'Beer Games Beer',
782 'uploader_id': 'beergamesbeer',
ec85ded8 783 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/beergamesbeer',
7caf9830 784 'license': 'Standard YouTube License',
cf7e015f
S
785 },
786 }, {
787 'info_dict': {
788 'id': 'PUOgX5z9xZw',
789 'ext': 'mp4',
790 'title': 'teamPGP: Rocket League Noob Stream (grizzle)',
791 'description': 'md5:dc7872fb300e143831327f1bae3af010',
556dbe7f 792 'duration': 7337,
cf7e015f
S
793 'upload_date': '20150721',
794 'uploader': 'Beer Games Beer',
795 'uploader_id': 'beergamesbeer',
ec85ded8 796 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/beergamesbeer',
7caf9830 797 'license': 'Standard YouTube License',
cf7e015f
S
798 },
799 }, {
800 'info_dict': {
801 'id': 'teuwxikvS5k',
802 'ext': 'mp4',
803 'title': 'teamPGP: Rocket League Noob Stream (zim)',
804 'description': 'md5:dc7872fb300e143831327f1bae3af010',
556dbe7f 805 'duration': 7334,
cf7e015f
S
806 'upload_date': '20150721',
807 'uploader': 'Beer Games Beer',
808 'uploader_id': 'beergamesbeer',
ec85ded8 809 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/beergamesbeer',
7caf9830 810 'license': 'Standard YouTube License',
cf7e015f
S
811 },
812 }],
813 'params': {
814 'skip_download': True,
815 },
4fe54c12 816 'skip': 'This video is not available.',
cbaed4bb 817 },
f9f49d87 818 {
067aa17e 819 # Multifeed video with comma in title (see https://github.com/ytdl-org/youtube-dl/issues/8536)
f9f49d87
S
820 'url': 'https://www.youtube.com/watch?v=gVfLd0zydlo',
821 'info_dict': {
822 'id': 'gVfLd0zydlo',
823 'title': 'DevConf.cz 2016 Day 2 Workshops 1 14:00 - 15:30',
824 },
825 'playlist_count': 2,
be49068d 826 'skip': 'Not multifeed anymore',
f9f49d87 827 },
cbaed4bb 828 {
2d3d2997 829 'url': 'https://vid.plus/FlRa-iH7PGw',
cbaed4bb 830 'only_matching': True,
0e49d9a6 831 },
6d4fc66b 832 {
2d3d2997 833 'url': 'https://zwearz.com/watch/9lWxNJF-ufM/electra-woman-dyna-girl-official-trailer-grace-helbig.html',
6d4fc66b
S
834 'only_matching': True,
835 },
0e49d9a6 836 {
067aa17e 837 # Title with JS-like syntax "};" (see https://github.com/ytdl-org/youtube-dl/issues/7468)
a8776b10 838 # Also tests cut-off URL expansion in video description (see
067aa17e
S
839 # https://github.com/ytdl-org/youtube-dl/issues/1892,
840 # https://github.com/ytdl-org/youtube-dl/issues/8164)
0e49d9a6
LL
841 'url': 'https://www.youtube.com/watch?v=lsguqyKfVQg',
842 'info_dict': {
843 'id': 'lsguqyKfVQg',
844 'ext': 'mp4',
845 'title': '{dark walk}; Loki/AC/Dishonored; collab w/Elflover21',
eb6793ba 846 'alt_title': 'Dark Walk - Position Music',
0e49d9a6 847 'description': 'md5:8085699c11dc3f597ce0410b0dcbb34a',
556dbe7f 848 'duration': 133,
0e49d9a6
LL
849 'upload_date': '20151119',
850 'uploader_id': 'IronSoulElf',
ec85ded8 851 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/IronSoulElf',
0e49d9a6 852 'uploader': 'IronSoulElf',
eb6793ba
S
853 'creator': 'Todd Haberman, Daniel Law Heath and Aaron Kaplan',
854 'track': 'Dark Walk - Position Music',
855 'artist': 'Todd Haberman, Daniel Law Heath and Aaron Kaplan',
92bc97d3 856 'album': 'Position Music - Production Music Vol. 143 - Dark Walk',
0e49d9a6
LL
857 },
858 'params': {
859 'skip_download': True,
860 },
861 },
61f92af1 862 {
067aa17e 863 # Tags with '};' (see https://github.com/ytdl-org/youtube-dl/issues/7468)
61f92af1
S
864 'url': 'https://www.youtube.com/watch?v=Ms7iBXnlUO8',
865 'only_matching': True,
866 },
313dfc45
LL
867 {
868 # Video with yt:stretch=17:0
869 'url': 'https://www.youtube.com/watch?v=Q39EVAstoRM',
870 'info_dict': {
871 'id': 'Q39EVAstoRM',
872 'ext': 'mp4',
873 'title': 'Clash Of Clans#14 Dicas De Ataque Para CV 4',
874 'description': 'md5:ee18a25c350637c8faff806845bddee9',
875 'upload_date': '20151107',
876 'uploader_id': 'UCCr7TALkRbo3EtFzETQF1LA',
877 'uploader': 'CH GAMER DROID',
878 },
879 'params': {
880 'skip_download': True,
881 },
be49068d 882 'skip': 'This video does not exist.',
313dfc45 883 },
7caf9830
S
884 {
885 # Video licensed under Creative Commons
886 'url': 'https://www.youtube.com/watch?v=M4gD1WSo5mA',
887 'info_dict': {
888 'id': 'M4gD1WSo5mA',
889 'ext': 'mp4',
890 'title': 'md5:e41008789470fc2533a3252216f1c1d1',
891 'description': 'md5:a677553cf0840649b731a3024aeff4cc',
556dbe7f 892 'duration': 721,
7caf9830
S
893 'upload_date': '20150127',
894 'uploader_id': 'BerkmanCenter',
ec85ded8 895 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/BerkmanCenter',
556dbe7f 896 'uploader': 'The Berkman Klein Center for Internet & Society',
7caf9830
S
897 'license': 'Creative Commons Attribution license (reuse allowed)',
898 },
899 'params': {
900 'skip_download': True,
901 },
902 },
fd050249
S
903 {
904 # Channel-like uploader_url
905 'url': 'https://www.youtube.com/watch?v=eQcmzGIKrzg',
906 'info_dict': {
907 'id': 'eQcmzGIKrzg',
908 'ext': 'mp4',
909 'title': 'Democratic Socialism and Foreign Policy | Bernie Sanders',
910 'description': 'md5:dda0d780d5a6e120758d1711d062a867',
556dbe7f 911 'duration': 4060,
fd050249 912 'upload_date': '20151119',
eb6793ba 913 'uploader': 'Bernie Sanders',
fd050249 914 'uploader_id': 'UCH1dpzjCEiGAt8CXkryhkZg',
ec85ded8 915 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UCH1dpzjCEiGAt8CXkryhkZg',
fd050249
S
916 'license': 'Creative Commons Attribution license (reuse allowed)',
917 },
918 'params': {
919 'skip_download': True,
920 },
921 },
040ac686
S
922 {
923 'url': 'https://www.youtube.com/watch?feature=player_embedded&amp;amp;v=V36LpHqtcDY',
924 'only_matching': True,
7f29cf54
S
925 },
926 {
067aa17e 927 # YouTube Red paid video (https://github.com/ytdl-org/youtube-dl/issues/10059)
7f29cf54
S
928 'url': 'https://www.youtube.com/watch?v=i1Ko8UG-Tdo',
929 'only_matching': True,
6496ccb4
S
930 },
931 {
932 # Rental video preview
933 'url': 'https://www.youtube.com/watch?v=yYr8q0y5Jfg',
934 'info_dict': {
935 'id': 'uGpuVWrhIzE',
936 'ext': 'mp4',
937 'title': 'Piku - Trailer',
938 'description': 'md5:c36bd60c3fd6f1954086c083c72092eb',
939 'upload_date': '20150811',
940 'uploader': 'FlixMatrix',
941 'uploader_id': 'FlixMatrixKaravan',
ec85ded8 942 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/FlixMatrixKaravan',
6496ccb4
S
943 'license': 'Standard YouTube License',
944 },
945 'params': {
946 'skip_download': True,
947 },
eb6793ba 948 'skip': 'This video is not available.',
022a5d66 949 },
12afdc2a
S
950 {
951 # YouTube Red video with episode data
952 'url': 'https://www.youtube.com/watch?v=iqKdEhx-dD4',
953 'info_dict': {
954 'id': 'iqKdEhx-dD4',
955 'ext': 'mp4',
956 'title': 'Isolation - Mind Field (Ep 1)',
4fe54c12 957 'description': 'md5:46a29be4ceffa65b92d277b93f463c0f',
556dbe7f 958 'duration': 2085,
12afdc2a
S
959 'upload_date': '20170118',
960 'uploader': 'Vsauce',
961 'uploader_id': 'Vsauce',
962 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/Vsauce',
12afdc2a
S
963 'series': 'Mind Field',
964 'season_number': 1,
965 'episode_number': 1,
966 },
967 'params': {
968 'skip_download': True,
969 },
970 'expected_warnings': [
971 'Skipping DASH manifest',
972 ],
973 },
c7121fa7
S
974 {
975 # The following content has been identified by the YouTube community
976 # as inappropriate or offensive to some audiences.
977 'url': 'https://www.youtube.com/watch?v=6SJNVb0GnPI',
978 'info_dict': {
979 'id': '6SJNVb0GnPI',
980 'ext': 'mp4',
981 'title': 'Race Differences in Intelligence',
982 'description': 'md5:5d161533167390427a1f8ee89a1fc6f1',
983 'duration': 965,
984 'upload_date': '20140124',
985 'uploader': 'New Century Foundation',
986 'uploader_id': 'UCEJYpZGqgUob0zVVEaLhvVg',
987 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UCEJYpZGqgUob0zVVEaLhvVg',
c7121fa7
S
988 },
989 'params': {
990 'skip_download': True,
991 },
992 },
022a5d66
S
993 {
994 # itag 212
995 'url': '1t24XAntNCY',
996 'only_matching': True,
fd5c4aab
S
997 },
998 {
999 # geo restricted to JP
1000 'url': 'sJL6WA-aGkQ',
1001 'only_matching': True,
1002 },
cd5a74a2
S
1003 {
1004 'url': 'https://invidio.us/watch?v=BaW_jenozKc',
1005 'only_matching': True,
1006 },
825cd268
RA
1007 {
1008 # DRM protected
1009 'url': 'https://www.youtube.com/watch?v=s7_qI6_mIXc',
1010 'only_matching': True,
4fe54c12
S
1011 },
1012 {
1013 # Video with unsupported adaptive stream type formats
1014 'url': 'https://www.youtube.com/watch?v=Z4Vy8R84T1U',
1015 'info_dict': {
1016 'id': 'Z4Vy8R84T1U',
1017 'ext': 'mp4',
1018 'title': 'saman SMAN 53 Jakarta(Sancety) opening COFFEE4th at SMAN 53 Jakarta',
1019 'description': 'md5:d41d8cd98f00b204e9800998ecf8427e',
1020 'duration': 433,
1021 'upload_date': '20130923',
1022 'uploader': 'Amelia Putri Harwita',
1023 'uploader_id': 'UCpOxM49HJxmC1qCalXyB3_Q',
1024 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UCpOxM49HJxmC1qCalXyB3_Q',
1025 'formats': 'maxcount:10',
1026 },
1027 'params': {
1028 'skip_download': True,
1029 'youtube_include_dash_manifest': False,
1030 },
5429d6a9 1031 'skip': 'not actual anymore',
5caabd3c 1032 },
1033 {
822b9d9c 1034 # Youtube Music Auto-generated description
5caabd3c 1035 'url': 'https://music.youtube.com/watch?v=MgNrAu2pzNs',
1036 'info_dict': {
1037 'id': 'MgNrAu2pzNs',
1038 'ext': 'mp4',
1039 'title': 'Voyeur Girl',
1040 'description': 'md5:7ae382a65843d6df2685993e90a8628f',
1041 'upload_date': '20190312',
5429d6a9
S
1042 'uploader': 'Stephen - Topic',
1043 'uploader_id': 'UC-pWHpBjdGG69N9mM2auIAA',
5caabd3c 1044 'artist': 'Stephen',
1045 'track': 'Voyeur Girl',
1046 'album': 'it\'s too much love to know my dear',
1047 'release_date': '20190313',
1048 'release_year': 2019,
1049 },
1050 'params': {
1051 'skip_download': True,
1052 },
1053 },
66b48727
RA
1054 {
1055 'url': 'https://www.youtubekids.com/watch?v=3b8nCWDgZ6Q',
1056 'only_matching': True,
1057 },
011e75e6
S
1058 {
1059 # invalid -> valid video id redirection
1060 'url': 'DJztXj2GPfl',
1061 'info_dict': {
1062 'id': 'DJztXj2GPfk',
1063 'ext': 'mp4',
1064 'title': 'Panjabi MC - Mundian To Bach Ke (The Dictator Soundtrack)',
1065 'description': 'md5:bf577a41da97918e94fa9798d9228825',
1066 'upload_date': '20090125',
1067 'uploader': 'Prochorowka',
1068 'uploader_id': 'Prochorowka',
1069 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/Prochorowka',
1070 'artist': 'Panjabi MC',
1071 'track': 'Beware of the Boys (Mundian to Bach Ke) - Motivo Hi-Lectro Remix',
1072 'album': 'Beware of the Boys (Mundian To Bach Ke)',
1073 },
1074 'params': {
1075 'skip_download': True,
1076 },
ea74e00b
DP
1077 },
1078 {
1079 # empty description results in an empty string
1080 'url': 'https://www.youtube.com/watch?v=x41yOUIvK2k',
1081 'info_dict': {
1082 'id': 'x41yOUIvK2k',
1083 'ext': 'mp4',
1084 'title': 'IMG 3456',
1085 'description': '',
1086 'upload_date': '20170613',
1087 'uploader_id': 'ElevageOrVert',
1088 'uploader': 'ElevageOrVert',
1089 },
1090 'params': {
1091 'skip_download': True,
1092 },
1093 },
a0566bbf 1094 {
1095 # with '};' inside yt initial data (see https://github.com/ytdl-org/youtube-dl/issues/27093)
1096 'url': 'https://www.youtube.com/watch?v=CHqg6qOn4no',
1097 'info_dict': {
1098 'id': 'CHqg6qOn4no',
1099 'ext': 'mp4',
1100 'title': 'Part 77 Sort a list of simple types in c#',
1101 'description': 'md5:b8746fa52e10cdbf47997903f13b20dc',
1102 'upload_date': '20130831',
1103 'uploader_id': 'kudvenkat',
1104 'uploader': 'kudvenkat',
1105 },
1106 'params': {
1107 'skip_download': True,
1108 },
1109 },
2eb88d95
PH
1110 ]
1111
e0df6211
PH
def __init__(self, *args, **kwargs):
    """Set up the extractor with an empty per-instance player cache."""
    super(YoutubeIE, self).__init__(*args, **kwargs)
    # Signature functions, stored by _decrypt_signature under
    # (player_url, signature cache id) keys
    self._player_cache = dict()
e0df6211 1115
c5e8d7af
PH
def report_video_info_webpage_download(self, video_id):
    """Announce that the video info webpage of *video_id* is being downloaded."""
    message = '%s: Downloading video info webpage' % video_id
    self.to_screen(message)
c5e8d7af 1119
c5e8d7af
PH
def report_information_extraction(self, video_id):
    """Announce that information is being extracted for *video_id*."""
    message = '%s: Extracting video information' % video_id
    self.to_screen(message)
c5e8d7af
PH
1123
def report_unavailable_format(self, video_id, format):
    """Announce that *format* is not available for *video_id*."""
    details = (video_id, format)
    self.to_screen('%s: Format %s not available' % details)
c5e8d7af
PH
1127
def report_rtmp_download(self):
    """Announce that an RTMP download was detected."""
    notice = 'RTMP download detected'
    self.to_screen(notice)
c5e8d7af 1131
60064c53
PH
def _signature_cache_id(self, example_sig):
    """Return the lengths of the dot-separated parts of *example_sig*, dot-joined."""
    part_lengths = [compat_str(len(chunk)) for chunk in example_sig.split('.')]
    return '.'.join(part_lengths)
60064c53 1135
e40c758c
S
@classmethod
def _extract_player_info(cls, player_url):
    """Return ``(ext, id)`` taken from the first _PLAYER_INFO_RE pattern
    that matches *player_url*.

    Raises ExtractorError when none of the patterns matches.
    """
    for pattern in cls._PLAYER_INFO_RE:
        match = re.search(pattern, player_url)
        if match is None:
            continue
        return match.group('ext'), match.group('id')
    raise ExtractorError('Cannot identify player %r' % player_url)
1145
def _extract_signature_function(self, video_id, player_url, example_sig):
    """Return a callable that deciphers signatures shaped like *example_sig*.

    When the 'youtube-sigfuncs' cache already holds an entry for this player
    and signature shape, the callable is rebuilt from the cached index list.
    Otherwise the player at *player_url* is downloaded and parsed, and the
    index permutation it produces is stored in the cache.

    Raises ExtractorError when the player cannot be identified, when the
    cache key is not a plain file name, or when the player type is neither
    'js' nor 'swf'.
    """
    player_type, player_id = self._extract_player_info(player_url)

    # Read from filesystem cache
    func_id = '%s_%s_%s' % (
        player_type, player_id, self._signature_cache_id(example_sig))
    # func_id is used as a cache key, so it must not carry a path component.
    # This is a real check, not an assert: asserts disappear under python -O.
    if os.path.basename(func_id) != func_id:
        raise ExtractorError('Invalid signature function id %r' % func_id)

    cache_spec = self._downloader.cache.load('youtube-sigfuncs', func_id)
    if cache_spec is not None:
        return lambda s: ''.join(s[i] for i in cache_spec)

    download_note = (
        'Downloading player %s' % player_url
        if self._downloader.params.get('verbose') else
        'Downloading %s player %s' % (player_type, player_id)
    )
    if player_type == 'js':
        code = self._download_webpage(
            player_url, video_id,
            note=download_note,
            errnote='Download of %s failed' % player_url)
        res = self._parse_sig_js(code)
    elif player_type == 'swf':
        urlh = self._request_webpage(
            player_url, video_id,
            note=download_note,
            errnote='Download of %s failed' % player_url)
        code = urlh.read()
        res = self._parse_sig_swf(code)
    else:
        # Raise explicitly so `res` can never be left unbound under -O
        raise ExtractorError('Invalid player type %r' % player_type)

    # Feed a string whose characters encode their own positions through the
    # function; the output then spells out the index permutation to cache.
    test_string = ''.join(map(compat_chr, range(len(example_sig))))
    cache_res = res(test_string)
    cache_spec = [ord(c) for c in cache_res]

    self._downloader.cache.store('youtube-sigfuncs', func_id, cache_spec)
    return res
1185
def _print_sig_code(self, func, example_sig):
    """Print Python code that reproduces *func* for signatures shaped like
    *example_sig*.

    *func* is run on a string whose characters encode their own positions;
    the resulting index sequence is rendered as a sum of ``s[...]`` index
    and slice expressions and written out with to_screen.
    """
    def _slice_expr(first, last, stride):
        # Render the run first..last (inclusive) as a slice of s
        lower = str(first) if first != 0 else ''
        stop = last + stride
        upper = ':' if stop < 0 else ':%d' % stop
        stride_part = ':%d' % stride if stride != 1 else ''
        return 's[%s%s%s]' % (lower, upper, stride_part)

    def _index_exprs(indices):
        stride = None
        # Placeholder that keeps pyflakes quiet; it is overwritten whenever
        # stride gets a value
        first = '(Never used)'
        for current, previous in zip(indices[1:], indices[:-1]):
            delta = current - previous
            if stride is not None:
                if delta != stride:
                    # The running slice ended at the previous index
                    yield _slice_expr(first, previous, stride)
                    stride = None
                continue
            if delta in (-1, 1):
                stride = delta
                first = previous
            else:
                yield 's[%d]' % previous
        if stride is None:
            yield 's[%d]' % current
        else:
            yield _slice_expr(first, current, stride)

    probe = ''.join(map(compat_chr, range(len(example_sig))))
    permutation = [ord(c) for c in func(probe)]
    expr_code = ' + '.join(_index_exprs(permutation))
    part_lengths = ', '.join(
        compat_str(len(p)) for p in example_sig.split('.'))
    signature_id_tuple = '(%s)' % part_lengths
    template = ('if tuple(len(p) for p in s.split(\'.\')) == %s:\n'
                ' return %s\n')
    code = template % (signature_id_tuple, expr_code)
    self.to_screen('Extracted signature function:\n' + code)
edf3e38e 1224
e0df6211
PH
def _parse_sig_js(self, jscode):
    """Find the signature function in the JS player code *jscode* and
    return a callable that applies it to a single string."""
    name_patterns = (
        r'\b[cs]\s*&&\s*[adf]\.set\([^,]+\s*,\s*encodeURIComponent\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\(',
        r'\b[a-zA-Z0-9]+\s*&&\s*[a-zA-Z0-9]+\.set\([^,]+\s*,\s*encodeURIComponent\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\(',
        r'(?:\b|[^a-zA-Z0-9$])(?P<sig>[a-zA-Z0-9$]{2})\s*=\s*function\(\s*a\s*\)\s*{\s*a\s*=\s*a\.split\(\s*""\s*\)',
        r'(?P<sig>[a-zA-Z0-9$]+)\s*=\s*function\(\s*a\s*\)\s*{\s*a\s*=\s*a\.split\(\s*""\s*\)',
        # Obsolete patterns
        r'(["\'])signature\1\s*,\s*(?P<sig>[a-zA-Z0-9$]+)\(',
        r'\.sig\|\|(?P<sig>[a-zA-Z0-9$]+)\(',
        r'yt\.akamaized\.net/\)\s*\|\|\s*.*?\s*[cs]\s*&&\s*[adf]\.set\([^,]+\s*,\s*(?:encodeURIComponent\s*\()?\s*(?P<sig>[a-zA-Z0-9$]+)\(',
        r'\b[cs]\s*&&\s*[adf]\.set\([^,]+\s*,\s*(?P<sig>[a-zA-Z0-9$]+)\(',
        r'\b[a-zA-Z0-9]+\s*&&\s*[a-zA-Z0-9]+\.set\([^,]+\s*,\s*(?P<sig>[a-zA-Z0-9$]+)\(',
        r'\bc\s*&&\s*a\.set\([^,]+\s*,\s*\([^)]*\)\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\(',
        r'\bc\s*&&\s*[a-zA-Z0-9]+\.set\([^,]+\s*,\s*\([^)]*\)\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\(',
        r'\bc\s*&&\s*[a-zA-Z0-9]+\.set\([^,]+\s*,\s*\([^)]*\)\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\(',
    )
    funcname = self._search_regex(
        name_patterns, jscode,
        'Initial JS player signature function name', group='sig')

    interpreter = JSInterpreter(jscode)
    signature_function = interpreter.extract_function(funcname)

    def decipher(s):
        # The extracted function takes its arguments as a list
        return signature_function([s])

    return decipher
1245
def _parse_sig_swf(self, file_contents):
    """Return a callable wrapping the ``decipher`` function of the
    ``SignatureDecipher`` class found in the SWF player *file_contents*."""
    TARGET_CLASSNAME = 'SignatureDecipher'
    interpreter = SWFInterpreter(file_contents)
    decipher_class = interpreter.extract_class(TARGET_CLASSNAME)
    signature_function = interpreter.extract_function(decipher_class, 'decipher')

    def decipher(s):
        # The extracted function takes its arguments as a list
        return signature_function([s])

    return decipher
1252
def _decrypt_signature(self, s, video_id, player_url, age_gate=False):
    """Turn the encrypted s field into a working signature

    *player_url* may be protocol-relative or relative to www.youtube.com.
    Signature functions are kept in self._player_cache under
    (player_url, signature cache id) keys.  Any failure is re-raised as an
    ExtractorError that carries the formatted traceback.
    """
    if player_url is None:
        raise ExtractorError('Cannot decrypt signature without player_url')

    if player_url.startswith('//'):
        player_url = 'https:' + player_url
    elif re.match(r'https?://', player_url) is None:
        player_url = compat_urlparse.urljoin(
            'https://www.youtube.com', player_url)

    try:
        cache_key = (player_url, self._signature_cache_id(s))
        if cache_key not in self._player_cache:
            self._player_cache[cache_key] = self._extract_signature_function(
                video_id, player_url, s)
        decipher = self._player_cache[cache_key]
        if self._downloader.params.get('youtube_print_sig_code'):
            self._print_sig_code(decipher, s)
        return decipher(s)
    except Exception as e:
        details = traceback.format_exc()
        raise ExtractorError(
            'Signature extraction failed: ' + details, cause=e)
e0df6211 1279
def _get_subtitles(self, video_id, webpage, has_live_chat_replay):
    """Return the subtitle table for *video_id*.

    The track list is fetched from the timedtext endpoint.  Each language
    maps to one ``{'url', 'ext'}`` dict per entry of _SUBTITLE_FORMATS; only
    the first track seen for a language is used.  When
    *has_live_chat_replay* is true a ``live_chat`` entry is added.  An empty
    dict is returned, after a warning, if the list cannot be downloaded or
    nothing was collected.
    """
    list_url = 'https://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id
    try:
        subs_doc = self._download_xml(list_url, video_id, note=False)
    except ExtractorError as err:
        self._downloader.report_warning(
            'unable to download video subtitles: %s' % error_to_compat_str(err))
        return {}

    subtitles = {}
    for track in subs_doc.findall('track'):
        lang = track.attrib['lang_code']
        if lang in subtitles:
            continue
        formats = []
        for ext in self._SUBTITLE_FORMATS:
            query = compat_urllib_parse_urlencode({
                'lang': lang,
                'v': video_id,
                'fmt': ext,
                'name': track.attrib['name'].encode('utf-8'),
            })
            formats.append({
                'url': 'https://www.youtube.com/api/timedtext?' + query,
                'ext': ext,
            })
        subtitles[lang] = formats

    if has_live_chat_replay:
        live_chat_entry = {
            'video_id': video_id,
            'ext': 'json',
            'protocol': 'youtube_live_chat_replay',
        }
        subtitles['live_chat'] = [live_chat_entry]

    if subtitles:
        return subtitles
    self._downloader.report_warning('video doesn\'t have subtitles')
    return {}
1319
a72778d3
S
def _get_ytplayer_config(self, video_id, webpage):
    """Return the parsed ``ytplayer.config`` object found in *webpage*, or
    None when no config assignment is found."""
    # User data may contain arbitrary character sequences that can break
    # regex-based JSON extraction: when '};' occurs inside the data, the
    # looser second pattern stops short of the whole JSON. As a workaround
    # the stricter pattern is tried first, until proper quoted-string
    # handling replaces this (see
    # https://github.com/ytdl-org/youtube-dl/issues/7468,
    # https://github.com/ytdl-org/youtube-dl/pull/7599)
    strict_pattern = r';ytplayer\.config\s*=\s*({.+?});ytplayer'
    loose_pattern = r';ytplayer\.config\s*=\s*({.+?});'
    config = self._search_regex(
        (strict_pattern, loose_pattern), webpage, 'ytplayer.config',
        default=None)
    if not config:
        return None
    return self._parse_json(
        uppercase_escape(config), video_id, fatal=False)
0e49d9a6 1337
def _get_automatic_captions(self, video_id, webpage):
    """Build the automatic caption table for *video_id* from *webpage*.

    The webpage is passed in so the captions URL can be read from it without
    another download.  The ytplayer config is consulted in this order: the
    ``ttsurl`` argument, the ``player_response`` JSON string, and finally
    the ``caption_tracks`` / ``caption_translation_languages`` arguments.

    Returns a dict mapping language codes to lists of ``{'url', 'ext'}``
    dicts, or an empty dict (after a warning) when no captions are found.
    """
    self.to_screen('%s: Looking for automatic captions' % video_id)
    player_config = self._get_ytplayer_config(video_id, webpage)
    err_msg = 'Couldn\'t find automatic captions for %s' % video_id
    if not player_config:
        self._downloader.report_warning(err_msg)
        return {}

    def make_captions(sub_url, sub_langs):
        # Re-encode the query of sub_url once per (language, format) pair
        url_parts = compat_urllib_parse_urlparse(sub_url)
        query = compat_parse_qs(url_parts.query)
        captions = {}
        for sub_lang in sub_langs:
            formats = []
            for ext in self._SUBTITLE_FORMATS:
                query['tlang'] = [sub_lang]
                query['fmt'] = [ext]
                formats.append({
                    'url': compat_urlparse.urlunparse(url_parts._replace(
                        query=compat_urllib_parse_urlencode(query, True))),
                    'ext': ext,
                })
            captions[sub_lang] = formats
        return captions

    try:
        args = player_config['args']
        caption_url = args.get('ttsurl')
        if caption_url:
            timestamp = args['timestamp']
            # Fetch the list of available tracks and translation targets
            list_query = compat_urllib_parse_urlencode({
                'type': 'list',
                'tlangs': 1,
                'asrs': 1,
            })
            caption_list = self._download_xml(
                caption_url + '&' + list_query, video_id)
            source_track = caption_list.find('track')
            if source_track is None:
                self._downloader.report_warning('Video doesn\'t have automatic captions')
                return {}
            source_lang = source_track.attrib['lang_code']
            caption_kind = source_track.attrib.get('kind', '')

            translated = {}
            for target in caption_list.findall('target'):
                target_lang = target.attrib['lang_code']
                translated[target_lang] = [{
                    'url': caption_url + '&' + compat_urllib_parse_urlencode({
                        'lang': source_lang,
                        'tlang': target_lang,
                        'fmt': ext,
                        'ts': timestamp,
                        'kind': caption_kind,
                    }),
                    'ext': ext,
                } for ext in self._SUBTITLE_FORMATS]
            return translated

        # New captions format as of 22.06.2017
        player_response = args.get('player_response')
        if player_response and isinstance(player_response, compat_str):
            player_response = self._parse_json(
                player_response, video_id, fatal=False)
            if player_response:
                renderer = player_response['captions']['playerCaptionsTracklistRenderer']
                base_url = renderer['captionTracks'][0]['baseUrl']
                lang_codes = [
                    code for code in (
                        lang.get('languageCode')
                        for lang in renderer['translationLanguages'])
                    if code]
                return make_captions(base_url, lang_codes)

        # Some videos don't provide ttsurl but rather caption_tracks and
        # caption_translation_languages (e.g. 20LmZk1hakA)
        # Not used anymore as of 22.06.2017
        caption_tracks = args['caption_tracks']
        caption_translation_languages = args['caption_translation_languages']
        caption_url = compat_parse_qs(caption_tracks.split(',')[0])['u'][0]
        lang_codes = []
        for encoded_lang in caption_translation_languages.split(','):
            lang_qs = compat_parse_qs(compat_urllib_parse_unquote_plus(encoded_lang))
            code = lang_qs.get('lc', [None])[0]
            if code:
                lang_codes.append(code)
        return make_captions(caption_url, lang_codes)
    # An extractor error can be raised by the download process if there are
    # no automatic captions but there are subtitles
    except (KeyError, IndexError, ExtractorError):
        self._downloader.report_warning(err_msg)
        return {}
1439
21c340b8
S
def _mark_watched(self, video_id, video_info, player_response):
    """Request the playback tracking URL for *video_id*, if one is available.

    The URL comes from *player_response*, falling back to *video_info*;
    nothing happens when neither yields a URL.  The request is non-fatal.
    """
    tracking_url = try_get(
        player_response,
        lambda x: x['playbackTracking']['videostatsPlaybackUrl']['baseUrl'])
    if not tracking_url:
        tracking_url = try_get(
            video_info, lambda x: x['videostats_playback_base_url'][0])
    playback_url = url_or_none(tracking_url)
    if not playback_url:
        return
    url_parts = compat_urlparse.urlparse(playback_url)
    qs = compat_urlparse.parse_qs(url_parts.query)

    # cpn generation algorithm is reverse engineered from base.js.
    # In fact it works even with dummy cpn.
    CPN_ALPHABET = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789-_'
    cpn_chars = [
        CPN_ALPHABET[random.randint(0, 256) & 63] for _ in range(0, 16)]
    cpn = ''.join(cpn_chars)

    qs['ver'] = ['2']
    qs['cpn'] = [cpn]
    new_query = compat_urllib_parse_urlencode(qs, True)
    playback_url = compat_urlparse.urlunparse(
        url_parts._replace(query=new_query))

    self._download_webpage(
        playback_url, video_id, 'Marking watched',
        'Unable to mark watched', fatal=False)
1465
66c9fa36
S
@staticmethod
def _extract_urls(webpage):
    """Return all YouTube embed references found in *webpage*.

    Three sources are scanned, in this order: embedded player URLs
    (iframe / embed / object / SWFObject markup), ``lazyYT`` video ids and
    video ids of the Wordpress "YouTube Video Importer" plugin.  The first
    kind yields URLs, the other two yield bare ids as found in the markup.
    """
    # Embedded YouTube player
    # NOTE: the embedSWF alternative used to read `embedSWF\(?:\s*`, which
    # parses as an optional "(" followed by a literal ":" and therefore
    # never matched a real `embedSWF("...")` call.
    entries = [
        unescapeHTML(mobj.group('url'))
        for mobj in re.finditer(r'''(?x)
        (?:
            <iframe[^>]+?src=|
            data-video-url=|
            <embed[^>]+?src=|
            embedSWF\(\s*|
            <object[^>]+data=|
            new\s+SWFObject\(
        )
        (["\'])
        (?P<url>(?:https?:)?//(?:www\.)?youtube(?:-nocookie)?\.com/
            (?:embed|v|p)/[0-9A-Za-z_-]{11}.*?)
        \1''', webpage)]

    # lazyYT YouTube embed
    entries.extend(
        unescapeHTML(video_id)
        for video_id in re.findall(
            r'class="lazyYT" data-youtube-id="([^"]+)"', webpage))

    # Wordpress "YouTube Video Importer" plugin
    matches = re.findall(r'''(?x)<div[^>]+
        class=(?P<q1>[\'"])[^\'"]*\byvii_single_video_player\b[^\'"]*(?P=q1)[^>]+
        data-video_id=(?P<q2>[\'"])([^\'"]+)(?P=q2)''', webpage)
    # findall returns one tuple per match; the video id is the last group
    entries.extend(m[-1] for m in matches)

    return entries
1497
@staticmethod
def _extract_url(webpage):
    """Return the first entry of YoutubeIE._extract_urls(webpage), or None
    when there is none."""
    found = YoutubeIE._extract_urls(webpage)
    if not found:
        return None
    return found[0]
1502
97665381
PH
@classmethod
def extract_id(cls, url):
    """Return the video id, i.e. group 2 of _VALID_URL matched against *url*.

    Raises ExtractorError when *url* does not match _VALID_URL.
    """
    match = re.match(cls._VALID_URL, url, re.VERBOSE)
    if match is not None:
        return match.group(2)
    raise ExtractorError('Invalid URL: %s' % url)
1510
84213ea8
S
def _extract_chapters_from_json(self, webpage, video_id, duration):
    """Collect chapters from the initial data embedded in *webpage*.

    Returns a list of ``{'start_time', 'end_time', 'title'}`` dicts, or None
    when the page, its initial data or the chapter list is missing.  A
    chapter ends where the next one starts and the last one ends at
    *duration*; chapters without a usable start or end time are skipped.
    """
    if not webpage:
        return
    data = self._extract_yt_initial_data(video_id, webpage)
    if not (data and isinstance(data, dict)):
        return
    chapters_list = try_get(
        data,
        lambda x: x['playerOverlays']
                   ['playerOverlayRenderer']
                   ['decoratedPlayerBarRenderer']
                   ['decoratedPlayerBarRenderer']
                   ['playerBar']
                   ['chapteredPlayerBarRenderer']
                   ['chapters'],
        list)
    if not chapters_list:
        return

    def start_of(chapter):
        # Start offset in seconds, or None if the field is missing
        millis = try_get(
            chapter,
            lambda x: x['chapterRenderer']['timeRangeStartMillis'],
            int)
        return float_or_none(millis, scale=1000)

    total = len(chapters_list)
    chapters = []
    for index, chapter in enumerate(chapters_list):
        start_time = start_of(chapter)
        if start_time is None:
            continue
        following = index + 1
        if following < total:
            end_time = start_of(chapters_list[following])
        else:
            end_time = duration
        if end_time is None:
            continue
        title = try_get(
            chapter, lambda x: x['chapterRenderer']['title']['simpleText'],
            compat_str)
        chapters.append({
            'start_time': start_time,
            'end_time': end_time,
            'title': title,
        })
    return chapters
1555
@staticmethod
def _extract_chapters_from_description(description, duration):
    """Derive chapters from seek-to timestamp links in an HTML description.

    A chapter line is a ``<br />``-delimited line containing an ``<a>`` whose
    ``onclick`` calls ``yt.www.watch.player.seekTo`` and whose text is a
    ``[H:]M:S`` timestamp. Each chapter ends where the next one starts; the
    last one ends at *duration*.

    *duration* may be None. In that case no clamping against the video
    length is performed and the last chapter, whose end time is then
    unknown, is left out. Previously an unknown duration raised TypeError on
    Python 3 and discarded every chapter on Python 2.

    Returns a list of dicts with 'start_time', 'end_time' and 'title', or
    None when there is no description or no chapter line.
    """
    if not description:
        return None
    chapter_lines = re.findall(
        r'(?:^|<br\s*/>)([^<]*<a[^>]+onclick=["\']yt\.www\.watch\.player\.seekTo[^>]+>(\d{1,2}:\d{1,2}(?::\d{1,2})?)</a>[^>]*)(?=$|<br\s*/>)',
        description)
    if not chapter_lines:
        return None
    duration_known = duration is not None
    chapters = []
    for next_num, (chapter_line, time_point) in enumerate(
            chapter_lines, start=1):
        start_time = parse_duration(time_point)
        if start_time is None:
            continue
        # A timestamp past the end of the video means the remaining lines
        # are not chapters of this video.
        if duration_known and start_time > duration:
            break
        end_time = (duration if next_num == len(chapter_lines)
                    else parse_duration(chapter_lines[next_num][1]))
        if end_time is None:
            continue
        if duration_known and end_time > duration:
            end_time = duration
        # Timestamps going backwards: stop rather than emit a bogus range.
        if start_time > end_time:
            break
        # Drop the timestamp link, then surrounding separators and
        # redundant whitespace, to leave only the chapter title.
        chapter_title = re.sub(
            r'<a[^>]+>[^<]+</a>', '', chapter_line).strip(' \t-')
        chapter_title = re.sub(r'\s+', ' ', chapter_title)
        chapters.append({
            'start_time': start_time,
            'end_time': end_time,
            'title': chapter_title,
        })
    return chapters
1590
84213ea8
S
def _extract_chapters(self, webpage, description, video_id, duration):
    """Return chapters from the webpage JSON, falling back to the description.

    The description-based extraction is only attempted when the JSON-based
    one yields nothing (None or an empty list).
    """
    json_chapters = self._extract_chapters_from_json(webpage, video_id, duration)
    if json_chapters:
        return json_chapters
    return self._extract_chapters_from_description(description, duration)
1594
c5e8d7af 1595 def _real_extract(self, url):
cf7e015f
S
1596 url, smuggled_data = unsmuggle_url(url, {})
1597
7e8c0af0 1598 proto = (
78caa52a
PH
1599 'http' if self._downloader.params.get('prefer_insecure', False)
1600 else 'https')
7e8c0af0 1601
7c80519c 1602 start_time = None
297a564b 1603 end_time = None
7c80519c
JMF
1604 parsed_url = compat_urllib_parse_urlparse(url)
1605 for component in [parsed_url.fragment, parsed_url.query]:
1606 query = compat_parse_qs(component)
297a564b 1607 if start_time is None and 't' in query:
7c80519c 1608 start_time = parse_duration(query['t'][0])
2929fa0e
JMF
1609 if start_time is None and 'start' in query:
1610 start_time = parse_duration(query['start'][0])
297a564b
JMF
1611 if end_time is None and 'end' in query:
1612 end_time = parse_duration(query['end'][0])
7c80519c 1613
c5e8d7af
PH
1614 # Extract original video URL from URL with redirection, like age verification, using next_url parameter
1615 mobj = re.search(self._NEXT_URL_RE, url)
1616 if mobj:
7fd002c0 1617 url = proto + '://www.youtube.com/' + compat_urllib_parse_unquote(mobj.group(1)).lstrip('/')
97665381 1618 video_id = self.extract_id(url)
c5e8d7af
PH
1619
1620 # Get video webpage
aa79ac0c 1621 url = proto + '://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1&bpctr=9999999999' % video_id
011e75e6
S
1622 video_webpage, urlh = self._download_webpage_handle(url, video_id)
1623
1624 qs = compat_parse_qs(compat_urllib_parse_urlparse(urlh.geturl()).query)
1625 video_id = qs.get('v', [None])[0] or video_id
c5e8d7af
PH
1626
1627 # Attempt to extract SWF player URL
e0df6211 1628 mobj = re.search(r'swfConfig.*?"(https?:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
c5e8d7af
PH
1629 if mobj is not None:
1630 player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
1631 else:
1632 player_url = None
1633
d8d24a92
S
1634 dash_mpds = []
1635
1636 def add_dash_mpd(video_info):
1637 dash_mpd = video_info.get('dashmpd')
1638 if dash_mpd and dash_mpd[0] not in dash_mpds:
1639 dash_mpds.append(dash_mpd[0])
1640
561b456e
S
1641 def add_dash_mpd_pr(pl_response):
1642 dash_mpd = url_or_none(try_get(
1643 pl_response, lambda x: x['streamingData']['dashManifestUrl'],
1644 compat_str))
1645 if dash_mpd and dash_mpd not in dash_mpds:
1646 dash_mpds.append(dash_mpd)
1647
c7121fa7
S
1648 is_live = None
1649 view_count = None
1650
1651 def extract_view_count(v_info):
1652 return int_or_none(try_get(v_info, lambda x: x['view_count'][0]))
1653
c2d125d9
S
1654 def extract_player_response(player_response, video_id):
1655 pl_response = str_or_none(player_response)
1656 if not pl_response:
1657 return
1658 pl_response = self._parse_json(pl_response, video_id, fatal=False)
1659 if isinstance(pl_response, dict):
1660 add_dash_mpd_pr(pl_response)
1661 return pl_response
1662
fb2c9277
U
1663 def extract_embedded_config(embed_webpage, video_id):
1664 embedded_config = self._search_regex(
1665 r'setConfig\(({.*})\);',
1666 embed_webpage, 'ytInitialData', default=None)
1667 if embedded_config:
1668 return embedded_config
1669
dbdaaa23
S
1670 player_response = {}
1671
c5e8d7af 1672 # Get video info
43ebf77d 1673 video_info = {}
6449cd80 1674 embed_webpage = None
39e7107d
U
1675 if (self._og_search_property('restrictions:age', video_webpage, default=None) == '18+'
1676 or re.search(r'player-age-gate-content">', video_webpage) is not None):
9d9314cb 1677 cookie_keys = self._get_cookies('https://www.youtube.com').keys()
c108eb73
JMF
1678 age_gate = True
1679 # We simulate the access to the video from www.youtube.com/v/{video_id}
1680 # this can be viewed without login into Youtube
beb95e77
CL
1681 url = proto + '://www.youtube.com/embed/%s' % video_id
1682 embed_webpage = self._download_webpage(url, video_id, 'Downloading embed webpage')
fb2c9277
U
1683 ext = extract_embedded_config(embed_webpage, video_id)
1684 # playabilityStatus = re.search(r'{\\\"status\\\":\\\"(?P<playabilityStatus>[^\"]+)\\\"', ext)
1685 playable_in_embed = re.search(r'{\\\"playableInEmbed\\\":(?P<playableinEmbed>[^\,]+)', ext)
1686 if not playable_in_embed:
1687 self.to_screen('Could not determine whether playabale in embed for video %s' % video_id)
1688 playable_in_embed = ''
1689 else:
1690 playable_in_embed = playable_in_embed.group('playableinEmbed')
1691 # check if video is only playable on youtube in other words not playable in embed - if so it requires auth (cookies)
1692 # if re.search(r'player-unavailable">', embed_webpage) is not None:
1693 if playable_in_embed == 'false':
c73baf23
U
1694 '''
1695 # TODO apply this patch when Support for Python 2.6(!) and above drops
9d9314cb 1696 if ({'VISITOR_INFO1_LIVE', 'HSID', 'SSID', 'SID'} <= cookie_keys
4bb9c880 1697 or {'VISITOR_INFO1_LIVE', '__Secure-3PSID', 'LOGIN_INFO'} <= cookie_keys):
c73baf23
U
1698 '''
1699 if (set(('VISITOR_INFO1_LIVE', 'HSID', 'SSID', 'SID')) <= set(cookie_keys)
1700 or set(('VISITOR_INFO1_LIVE', '__Secure-3PSID', 'LOGIN_INFO')) <= set(cookie_keys)):
4bb9c880
U
1701 age_gate = False
1702 # Try looking directly into the video webpage
1703 ytplayer_config = self._get_ytplayer_config(video_id, video_webpage)
1704 if ytplayer_config:
59c5fa91
PO
1705 args = ytplayer_config.get("args")
1706 if args is not None:
1707 if args.get('url_encoded_fmt_stream_map') or args.get('hlsvp'):
1708 # Convert to the same format returned by compat_parse_qs
1709 video_info = dict((k, [v]) for k, v in args.items())
1710 add_dash_mpd(video_info)
1711 # Rental video is not rented but preview is available (e.g.
1712 # https://www.youtube.com/watch?v=yYr8q0y5Jfg,
1713 # https://github.com/ytdl-org/youtube-dl/issues/10532)
1714 if not video_info and args.get('ypc_vid'):
1715 return self.url_result(
1716 args['ypc_vid'], YoutubeIE.ie_key(), video_id=args['ypc_vid'])
1717 if args.get('livestream') == '1' or args.get('live_playback') == 1:
1718 is_live = True
1719 if not player_response:
1720 player_response = extract_player_response(args.get('player_response'), video_id)
1721 elif not player_response:
1722 player_response = ytplayer_config
4bb9c880
U
1723 if not video_info or self._downloader.params.get('youtube_include_dash_manifest', True):
1724 add_dash_mpd_pr(player_response)
9d9314cb
U
1725 else:
1726 raise ExtractorError('Video is age restricted and only playable on Youtube. Requires cookies!', expected=True)
1727 else:
1728 data = compat_urllib_parse_urlencode({
1729 'video_id': video_id,
1730 'eurl': 'https://youtube.googleapis.com/v/' + video_id,
1731 'sts': self._search_regex(
1732 r'"sts"\s*:\s*(\d+)', embed_webpage, 'sts', default=''),
1733 })
1734 video_info_url = proto + '://www.youtube.com/get_video_info?' + data
1735 try:
1736 video_info_webpage = self._download_webpage(
1737 video_info_url, video_id,
1738 note='Refetching age-gated info webpage',
1739 errnote='unable to download video info webpage')
1740 except ExtractorError:
1741 video_info_webpage = None
1742 if video_info_webpage:
1743 video_info = compat_parse_qs(video_info_webpage)
1744 pl_response = video_info.get('player_response', [None])[0]
1745 player_response = extract_player_response(pl_response, video_id)
1746 add_dash_mpd(video_info)
1747 view_count = extract_view_count(video_info)
c108eb73
JMF
1748 else:
1749 age_gate = False
d8d24a92 1750 # Try looking directly into the video webpage
a72778d3 1751 ytplayer_config = self._get_ytplayer_config(video_id, video_webpage)
8bdd16b4 1752 if ytplayer_config:
1753 args = ytplayer_config.get('args', {})
4c76aa06 1754 if args.get('url_encoded_fmt_stream_map') or args.get('hlsvp'):
d8d24a92
S
1755 # Convert to the same format returned by compat_parse_qs
1756 video_info = dict((k, [v]) for k, v in args.items())
1757 add_dash_mpd(video_info)
6496ccb4
S
1758 # Rental video is not rented but preview is available (e.g.
1759 # https://www.youtube.com/watch?v=yYr8q0y5Jfg,
067aa17e 1760 # https://github.com/ytdl-org/youtube-dl/issues/10532)
6496ccb4
S
1761 if not video_info and args.get('ypc_vid'):
1762 return self.url_result(
1763 args['ypc_vid'], YoutubeIE.ie_key(), video_id=args['ypc_vid'])
2fe1ff85
JMF
1764 if args.get('livestream') == '1' or args.get('live_playback') == 1:
1765 is_live = True
dbdaaa23 1766 if not player_response:
c2d125d9 1767 player_response = extract_player_response(args.get('player_response'), video_id)
0a3cf9ad 1768 if not video_info or self._downloader.params.get('youtube_include_dash_manifest', True):
561b456e 1769 add_dash_mpd_pr(player_response)
bbb7c3f7 1770
8bdd16b4 1771 if not video_info and not player_response:
1772 player_response = extract_player_response(
1773 self._search_regex(
1774 r'ytInitialPlayerResponse\s*=\s*({.+?})\s*;', video_webpage,
1775 'initial player response', default='{}'),
1776 video_id)
1777
bbb7c3f7 1778 def extract_unavailable_message():
0add33ab
S
1779 messages = []
1780 for tag, kind in (('h1', 'message'), ('div', 'submessage')):
1781 msg = self._html_search_regex(
1782 r'(?s)<{tag}[^>]+id=["\']unavailable-{kind}["\'][^>]*>(.+?)</{tag}>'.format(tag=tag, kind=kind),
1783 video_webpage, 'unavailable %s' % kind, default=None)
1784 if msg:
1785 messages.append(msg)
1786 if messages:
1787 return '\n'.join(messages)
bbb7c3f7 1788
f93abcf1 1789 if not video_info and not player_response:
15be3eb5
RA
1790 unavailable_message = extract_unavailable_message()
1791 if not unavailable_message:
1792 unavailable_message = 'Unable to extract video data'
1793 raise ExtractorError(
1794 'YouTube said: %s' % unavailable_message, expected=True, video_id=video_id)
1795
f93abcf1
S
1796 if not isinstance(video_info, dict):
1797 video_info = {}
1798
dbdaaa23
S
1799 video_details = try_get(
1800 player_response, lambda x: x['videoDetails'], dict) or {}
1801
37357d21
S
1802 microformat = try_get(
1803 player_response, lambda x: x['microformat']['playerMicroformatRenderer'], dict) or {}
1804
8dbf751a
RA
1805 video_title = video_info.get('title', [None])[0] or video_details.get('title')
1806 if not video_title:
cf7e015f
S
1807 self._downloader.report_warning('Unable to extract video title')
1808 video_title = '_'
1809
9cafc3fd 1810 description_original = video_description = get_element_by_id("eow-description", video_webpage)
cf7e015f 1811 if video_description:
fa4bc6e7
RA
1812
1813 def replace_url(m):
1814 redir_url = compat_urlparse.urljoin(url, m.group(1))
1815 parsed_redir_url = compat_urllib_parse_urlparse(redir_url)
1816 if re.search(r'^(?:www\.)?(?:youtube(?:-nocookie)?\.com|youtu\.be)$', parsed_redir_url.netloc) and parsed_redir_url.path == '/redirect':
1817 qs = compat_parse_qs(parsed_redir_url.query)
1818 q = qs.get('q')
1819 if q and q[0]:
1820 return q[0]
1821 return redir_url
1822
9cafc3fd 1823 description_original = video_description = re.sub(r'''(?x)
cf7e015f 1824 <a\s+
25cb7a0e 1825 (?:[a-zA-Z-]+="[^"]*"\s+)*?
23f13e97 1826 (?:title|href)="([^"]+)"\s+
25cb7a0e 1827 (?:[a-zA-Z-]+="[^"]*"\s+)*?
525cedb9 1828 class="[^"]*"[^>]*>
23f13e97 1829 [^<]+\.{3}\s*
cf7e015f 1830 </a>
fa4bc6e7 1831 ''', replace_url, video_description)
cf7e015f
S
1832 video_description = clean_html(video_description)
1833 else:
ea74e00b
DP
1834 video_description = video_details.get('shortDescription')
1835 if video_description is None:
1836 video_description = self._html_search_meta('description', video_webpage)
cf7e015f 1837
8fe10494 1838 if not smuggled_data.get('force_singlefeed', False):
5e1eddb9 1839 if not self._downloader.params.get('noplaylist'):
8fe10494
S
1840 multifeed_metadata_list = try_get(
1841 player_response,
1842 lambda x: x['multicamera']['playerLegacyMulticameraRenderer']['metadataList'],
1843 compat_str) or try_get(
1844 video_info, lambda x: x['multifeed_metadata_list'][0], compat_str)
1845 if multifeed_metadata_list:
1846 entries = []
1847 feed_ids = []
1848 for feed in multifeed_metadata_list.split(','):
1849 # Unquote should take place before split on comma (,) since textual
1850 # fields may contain comma as well (see
067aa17e 1851 # https://github.com/ytdl-org/youtube-dl/issues/8536)
8fe10494 1852 feed_data = compat_parse_qs(compat_urllib_parse_unquote_plus(feed))
6b09401b
S
1853
1854 def feed_entry(name):
1855 return try_get(feed_data, lambda x: x[name][0], compat_str)
1856
1857 feed_id = feed_entry('id')
1858 if not feed_id:
1859 continue
1860 feed_title = feed_entry('title')
1861 title = video_title
1862 if feed_title:
1863 title += ' (%s)' % feed_title
8fe10494
S
1864 entries.append({
1865 '_type': 'url_transparent',
1866 'ie_key': 'Youtube',
1867 'url': smuggle_url(
1868 '%s://www.youtube.com/watch?v=%s' % (proto, feed_data['id'][0]),
1869 {'force_singlefeed': True}),
6b09401b 1870 'title': title,
8fe10494 1871 })
6b09401b 1872 feed_ids.append(feed_id)
8fe10494
S
1873 self.to_screen(
1874 'Downloading multifeed video (%s) - add --no-playlist to just download video %s'
1875 % (', '.join(feed_ids), video_id))
1876 return self.playlist_result(entries, video_id, video_title, video_description)
1877 else:
1878 self.to_screen('Downloading just video %s because of --no-playlist' % video_id)
cf7e015f 1879
c7121fa7 1880 if view_count is None:
1c9c8de2 1881 view_count = extract_view_count(video_info)
dbdaaa23
S
1882 if view_count is None and video_details:
1883 view_count = int_or_none(video_details.get('viewCount'))
7b16239a
S
1884 if view_count is None and microformat:
1885 view_count = int_or_none(microformat.get('viewCount'))
1d699755 1886
27019dbb 1887 if is_live is None:
898238e9 1888 is_live = bool_or_none(video_details.get('isLive'))
27019dbb 1889
321bf820 1890 has_live_chat_replay = False
f0f76a33 1891 if not is_live:
321bf820 1892 yt_initial_data = self._get_yt_initial_data(video_id, video_webpage)
1893 try:
1894 yt_initial_data['contents']['twoColumnWatchNextResults']['conversationBar']['liveChatRenderer']['continuations'][0]['reloadContinuationData']['continuation']
1895 has_live_chat_replay = True
f0f76a33 1896 except (KeyError, IndexError, TypeError):
321bf820 1897 pass
1898
c5e8d7af
PH
1899 # Check for "rental" videos
1900 if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
067aa17e 1901 raise ExtractorError('"rental" videos not supported. See https://github.com/ytdl-org/youtube-dl/issues/359 for more information.', expected=True)
c5e8d7af 1902
c63ca0ee
S
1903 def _extract_filesize(media_url):
1904 return int_or_none(self._search_regex(
1905 r'\bclen[=/](\d+)', media_url, 'filesize', default=None))
1906
bf1317d2
S
1907 streaming_formats = try_get(player_response, lambda x: x['streamingData']['formats'], list) or []
1908 streaming_formats.extend(try_get(player_response, lambda x: x['streamingData']['adaptiveFormats'], list) or [])
1909
c5e8d7af
PH
1910 if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
1911 self.report_rtmp_download()
dd27fd17
PH
1912 formats = [{
1913 'format_id': '_rtmp',
1914 'protocol': 'rtmp',
1915 'url': video_info['conn'][0],
1916 'player_url': player_url,
1917 }]
bf1317d2 1918 elif not is_live and (streaming_formats or len(video_info.get('url_encoded_fmt_stream_map', [''])[0]) >= 1 or len(video_info.get('adaptive_fmts', [''])[0]) >= 1):
5f6a1245 1919 encoded_url_map = video_info.get('url_encoded_fmt_stream_map', [''])[0] + ',' + video_info.get('adaptive_fmts', [''])[0]
00fe14fc 1920 if 'rtmpe%3Dyes' in encoded_url_map:
067aa17e 1921 raise ExtractorError('rtmpe downloads are not supported, see https://github.com/ytdl-org/youtube-dl/issues/343 for more information.', expected=True)
bf1317d2 1922 formats = []
3318832e 1923 formats_spec = {}
82156fdb 1924 fmt_list = video_info.get('fmt_list', [''])[0]
1925 if fmt_list:
1926 for fmt in fmt_list.split(','):
1927 spec = fmt.split('/')
3318832e 1928 if len(spec) > 1:
1929 width_height = spec[1].split('x')
1930 if len(width_height) == 2:
1931 formats_spec[spec[0]] = {
1932 'resolution': spec[1],
1933 'width': int_or_none(width_height[0]),
1934 'height': int_or_none(width_height[1]),
1935 }
bf1317d2
S
1936 for fmt in streaming_formats:
1937 itag = str_or_none(fmt.get('itag'))
1938 if not itag:
201e9eaa 1939 continue
bf1317d2
S
1940 quality = fmt.get('quality')
1941 quality_label = fmt.get('qualityLabel') or quality
1942 formats_spec[itag] = {
1943 'asr': int_or_none(fmt.get('audioSampleRate')),
1944 'filesize': int_or_none(fmt.get('contentLength')),
1945 'format_note': quality_label,
1946 'fps': int_or_none(fmt.get('fps')),
1947 'height': int_or_none(fmt.get('height')),
bf1317d2
S
1948 # bitrate for itag 43 is always 2147483647
1949 'tbr': float_or_none(fmt.get('averageBitrate') or fmt.get('bitrate'), 1000) if itag != '43' else None,
1950 'width': int_or_none(fmt.get('width')),
1951 }
1952
1953 for fmt in streaming_formats:
00eb865b 1954 if fmt.get('drmFamilies') or fmt.get('drm_families'):
bf1317d2
S
1955 continue
1956 url = url_or_none(fmt.get('url'))
1957
1958 if not url:
fa3db383 1959 cipher = fmt.get('cipher') or fmt.get('signatureCipher')
bf1317d2
S
1960 if not cipher:
1961 continue
1962 url_data = compat_parse_qs(cipher)
1963 url = url_or_none(try_get(url_data, lambda x: x['url'][0], compat_str))
1964 if not url:
1965 continue
1966 else:
1967 cipher = None
1968 url_data = compat_parse_qs(compat_urllib_parse_urlparse(url).query)
1969
2f483bc1
S
1970 stream_type = int_or_none(try_get(url_data, lambda x: x['stream_type'][0]))
1971 # Unsupported FORMAT_STREAM_TYPE_OTF
1972 if stream_type == 3:
1973 continue
6449cd80 1974
bf1317d2
S
1975 format_id = fmt.get('itag') or url_data['itag'][0]
1976 if not format_id:
1977 continue
1978 format_id = compat_str(format_id)
a49eccdf 1979
bf1317d2
S
1980 if cipher:
1981 if 's' in url_data or self._downloader.params.get('youtube_include_dash_manifest', True):
8bdd16b4 1982 ASSETS_RE = (
1983 r'<script[^>]+\bsrc=("[^"]+")[^>]+\bname=["\']player_ias/base',
1984 r'"jsUrl"\s*:\s*("[^"]+")',
1985 r'"assets":.+?"js":\s*("[^"]+")')
bf1317d2
S
1986 jsplayer_url_json = self._search_regex(
1987 ASSETS_RE,
1988 embed_webpage if age_gate else video_webpage,
1989 'JS player URL (1)', default=None)
1990 if not jsplayer_url_json and not age_gate:
1991 # We need the embed website after all
1992 if embed_webpage is None:
1993 embed_url = proto + '://www.youtube.com/embed/%s' % video_id
1994 embed_webpage = self._download_webpage(
1995 embed_url, video_id, 'Downloading embed webpage')
1996 jsplayer_url_json = self._search_regex(
1997 ASSETS_RE, embed_webpage, 'JS player URL')
1998
1999 player_url = json.loads(jsplayer_url_json)
cf010131 2000 if player_url is None:
bf1317d2
S
2001 player_url_json = self._search_regex(
2002 r'ytplayer\.config.*?"url"\s*:\s*("[^"]+")',
2003 video_webpage, 'age gate player URL')
2004 player_url = json.loads(player_url_json)
2005
2006 if 'sig' in url_data:
2007 url += '&signature=' + url_data['sig'][0]
2008 elif 's' in url_data:
2009 encrypted_sig = url_data['s'][0]
2010
2011 if self._downloader.params.get('verbose'):
2012 if player_url is None:
bf1317d2 2013 player_desc = 'unknown'
cf010131 2014 else:
e40c758c
S
2015 player_type, player_version = self._extract_player_info(player_url)
2016 player_desc = '%s player %s' % ('flash' if player_type == 'swf' else 'html5', player_version)
bf1317d2
S
2017 parts_sizes = self._signature_cache_id(encrypted_sig)
2018 self.to_screen('{%s} signature length %s, %s' %
2019 (format_id, parts_sizes, player_desc))
2020
2021 signature = self._decrypt_signature(
2022 encrypted_sig, video_id, player_url, age_gate)
2023 sp = try_get(url_data, lambda x: x['sp'][0], compat_str) or 'signature'
2024 url += '&%s=%s' % (sp, signature)
201e9eaa
PH
2025 if 'ratebypass' not in url:
2026 url += '&ratebypass=yes'
c9afb51c 2027
94278f72
YCH
2028 dct = {
2029 'format_id': format_id,
2030 'url': url,
2031 'player_url': player_url,
2032 }
2033 if format_id in self._formats:
2034 dct.update(self._formats[format_id])
3318832e 2035 if format_id in formats_spec:
2036 dct.update(formats_spec[format_id])
94278f72 2037
aabc2be6 2038 # Some itags are not included in DASH manifest thus corresponding formats will
067aa17e 2039 # lack metadata (see https://github.com/ytdl-org/youtube-dl/pull/5993).
aabc2be6
S
2040 # Trying to extract metadata from url_encoded_fmt_stream_map entry.
2041 mobj = re.search(r'^(?P<width>\d+)[xX](?P<height>\d+)$', url_data.get('size', [''])[0])
2042 width, height = (int(mobj.group('width')), int(mobj.group('height'))) if mobj else (None, None)
94278f72 2043
bf1317d2
S
2044 if width is None:
2045 width = int_or_none(fmt.get('width'))
2046 if height is None:
2047 height = int_or_none(fmt.get('height'))
2048
c63ca0ee
S
2049 filesize = int_or_none(url_data.get(
2050 'clen', [None])[0]) or _extract_filesize(url)
2051
bf1317d2
S
2052 quality = url_data.get('quality', [None])[0] or fmt.get('quality')
2053 quality_label = url_data.get('quality_label', [None])[0] or fmt.get('qualityLabel')
2054
4878759f
S
2055 tbr = (float_or_none(url_data.get('bitrate', [None])[0], 1000)
2056 or float_or_none(fmt.get('bitrate'), 1000)) if format_id != '43' else None
bf1317d2 2057 fps = int_or_none(url_data.get('fps', [None])[0]) or int_or_none(fmt.get('fps'))
54fc90aa 2058
94278f72 2059 more_fields = {
c63ca0ee 2060 'filesize': filesize,
bf1317d2 2061 'tbr': tbr,
c9afb51c
AH
2062 'width': width,
2063 'height': height,
bf1317d2
S
2064 'fps': fps,
2065 'format_note': quality_label or quality,
c9afb51c 2066 }
94278f72
YCH
2067 for key, value in more_fields.items():
2068 if value:
2069 dct[key] = value
bf1317d2 2070 type_ = url_data.get('type', [None])[0] or fmt.get('mimeType')
aabc2be6
S
2071 if type_:
2072 type_split = type_.split(';')
2073 kind_ext = type_split[0].split('/')
2074 if len(kind_ext) == 2:
94278f72
YCH
2075 kind, _ = kind_ext
2076 dct['ext'] = mimetype2ext(type_split[0])
aabc2be6
S
2077 if kind in ('audio', 'video'):
2078 codecs = None
2079 for mobj in re.finditer(
2080 r'(?P<key>[a-zA-Z_-]+)=(?P<quote>["\']?)(?P<val>.+?)(?P=quote)(?:;|$)', type_):
2081 if mobj.group('key') == 'codecs':
2082 codecs = mobj.group('val')
2083 break
2084 if codecs:
6310acf5 2085 dct.update(parse_codecs(codecs))
e4a60912
S
2086 if dct.get('acodec') == 'none' or dct.get('vcodec') == 'none':
2087 dct['downloader_options'] = {
2088 # Youtube throttles chunks >~10M
2089 'http_chunk_size': 10485760,
2090 }
aabc2be6 2091 formats.append(dct)
c5e8d7af 2092 else:
c3e54389
S
2093 manifest_url = (
2094 url_or_none(try_get(
2095 player_response,
2096 lambda x: x['streamingData']['hlsManifestUrl'],
3089bc74
S
2097 compat_str))
2098 or url_or_none(try_get(
c3e54389
S
2099 video_info, lambda x: x['hlsvp'][0], compat_str)))
2100 if manifest_url:
2101 formats = []
2102 m3u8_formats = self._extract_m3u8_formats(
2103 manifest_url, video_id, 'mp4', fatal=False)
2104 for a_format in m3u8_formats:
2105 itag = self._search_regex(
2106 r'/itag/(\d+)/', a_format['url'], 'itag', default=None)
2107 if itag:
2108 a_format['format_id'] = itag
2109 if itag in self._formats:
2110 dct = self._formats[itag].copy()
2111 dct.update(a_format)
2112 a_format = dct
2113 a_format['player_url'] = player_url
2114 # Accept-Encoding header causes failures in live streams on Youtube and Youtube Gaming
2115 a_format.setdefault('http_headers', {})['Youtubedl-no-compression'] = 'True'
78895bd3
U
2116 if self._downloader.params.get('youtube_include_hls_manifest', True):
2117 formats.append(a_format)
c3e54389 2118 else:
13577349 2119 error_message = extract_unavailable_message()
a0566bbf 2120 if not error_message:
2121 reason_list = try_get(
2122 player_response,
2123 lambda x: x['playabilityStatus']['errorScreen']['playerErrorMessageRenderer']['subreason']['runs'],
2124 list) or []
2125 for reason in reason_list:
2126 if not isinstance(reason, dict):
2127 continue
2128 reason_text = try_get(reason, lambda x: x['text'], compat_str)
2129 if reason_text:
2130 if not error_message:
2131 error_message = ''
2132 error_message += reason_text
2133 if error_message:
2134 error_message = clean_html(error_message)
c3e54389 2135 if not error_message:
13577349
S
2136 error_message = clean_html(try_get(
2137 player_response, lambda x: x['playabilityStatus']['reason'],
2138 compat_str))
2139 if not error_message:
2140 error_message = clean_html(
2141 try_get(video_info, lambda x: x['reason'][0], compat_str))
c3e54389
S
2142 if error_message:
2143 raise ExtractorError(error_message, expected=True)
2144 raise ExtractorError('no conn, hlsvp, hlsManifestUrl or url_encoded_fmt_stream_map information found in video info')
c5e8d7af 2145
7e72694b 2146 # uploader
dbdaaa23
S
2147 video_uploader = try_get(
2148 video_info, lambda x: x['author'][0],
2149 compat_str) or str_or_none(video_details.get('author'))
7e72694b
S
2150 if video_uploader:
2151 video_uploader = compat_urllib_parse_unquote_plus(video_uploader)
2152 else:
2153 self._downloader.report_warning('unable to extract uploader name')
2154
2155 # uploader_id
2156 video_uploader_id = None
2157 video_uploader_url = None
2158 mobj = re.search(
2159 r'<link itemprop="url" href="(?P<uploader_url>https?://www\.youtube\.com/(?:user|channel)/(?P<uploader_id>[^"]+))">',
2160 video_webpage)
2161 if mobj is not None:
2162 video_uploader_id = mobj.group('uploader_id')
2163 video_uploader_url = mobj.group('uploader_url')
a6211d23
S
2164 else:
2165 owner_profile_url = url_or_none(microformat.get('ownerProfileUrl'))
2166 if owner_profile_url:
2167 video_uploader_id = self._search_regex(
2168 r'(?:user|channel)/([^/]+)', owner_profile_url, 'uploader id',
2169 default=None)
2170 video_uploader_url = owner_profile_url
7e72694b 2171
b45a9e69 2172 channel_id = (
3089bc74
S
2173 str_or_none(video_details.get('channelId'))
2174 or self._html_search_meta(
2175 'channelId', video_webpage, 'channel id', default=None)
2176 or self._search_regex(
b45a9e69 2177 r'data-channel-external-id=(["\'])(?P<id>(?:(?!\1).)+)\1',
2178 video_webpage, 'channel id', default=None, group='id'))
dd4c4492
S
2179 channel_url = 'http://www.youtube.com/channel/%s' % channel_id if channel_id else None
2180
b477fc13
S
2181 thumbnails = []
2182 thumbnails_list = try_get(
2183 video_details, lambda x: x['thumbnail']['thumbnails'], list) or []
2184 for t in thumbnails_list:
2185 if not isinstance(t, dict):
2186 continue
2187 thumbnail_url = url_or_none(t.get('url'))
2188 if not thumbnail_url:
2189 continue
2190 thumbnails.append({
2191 'url': thumbnail_url,
2192 'width': int_or_none(t.get('width')),
2193 'height': int_or_none(t.get('height')),
2194 })
2195
2196 if not thumbnails:
7e72694b 2197 video_thumbnail = None
b477fc13
S
2198 # We try first to get a high quality image:
2199 m_thumb = re.search(r'<span itemprop="thumbnail".*?href="(.*?)">',
2200 video_webpage, re.DOTALL)
2201 if m_thumb is not None:
2202 video_thumbnail = m_thumb.group(1)
2203 thumbnail_url = try_get(video_info, lambda x: x['thumbnail_url'][0], compat_str)
2204 if thumbnail_url:
2205 video_thumbnail = compat_urllib_parse_unquote_plus(thumbnail_url)
2206 if video_thumbnail:
2207 thumbnails.append({'url': video_thumbnail})
7e72694b
S
2208
2209 # upload date
2210 upload_date = self._html_search_meta(
2211 'datePublished', video_webpage, 'upload date', default=None)
2212 if not upload_date:
2213 upload_date = self._search_regex(
2214 [r'(?s)id="eow-date.*?>(.*?)</span>',
2215 r'(?:id="watch-uploader-info".*?>.*?|["\']simpleText["\']\s*:\s*["\'])(?:Published|Uploaded|Streamed live|Started) on (.+?)[<"\']'],
2216 video_webpage, 'upload date', default=None)
37357d21
S
2217 if not upload_date:
2218 upload_date = microformat.get('publishDate') or microformat.get('uploadDate')
7e72694b
S
2219 upload_date = unified_strdate(upload_date)
2220
2221 video_license = self._html_search_regex(
2222 r'<h4[^>]+class="title"[^>]*>\s*License\s*</h4>\s*<ul[^>]*>\s*<li>(.+?)</li',
2223 video_webpage, 'license', default=None)
2224
2225 m_music = re.search(
2226 r'''(?x)
2227 <h4[^>]+class="title"[^>]*>\s*Music\s*</h4>\s*
2228 <ul[^>]*>\s*
2229 <li>(?P<title>.+?)
2230 by (?P<creator>.+?)
2231 (?:
2232 \(.+?\)|
2233 <a[^>]*
2234 (?:
2235 \bhref=["\']/red[^>]*>| # drop possible
2236 >\s*Listen ad-free with YouTube Red # YouTube Red ad
2237 )
2238 .*?
2239 )?</li
2240 ''',
2241 video_webpage)
2242 if m_music:
2243 video_alt_title = remove_quotes(unescapeHTML(m_music.group('title')))
2244 video_creator = clean_html(m_music.group('creator'))
2245 else:
2246 video_alt_title = video_creator = None
2247
2248 def extract_meta(field):
2249 return self._html_search_regex(
2250 r'<h4[^>]+class="title"[^>]*>\s*%s\s*</h4>\s*<ul[^>]*>\s*<li>(.+?)</li>\s*' % field,
2251 video_webpage, field, default=None)
2252
2253 track = extract_meta('Song')
2254 artist = extract_meta('Artist')
92bc97d3 2255 album = extract_meta('Album')
822b9d9c
RA
2256
2257 # Youtube Music Auto-generated description
92bc97d3 2258 release_date = release_year = None
822b9d9c 2259 if video_description:
38d70284 2260 mobj = re.search(r'(?s)(?P<track>[^·\n]+)·(?P<artist>[^\n]+)\n+(?P<album>[^\n]+)(?:.+?℗\s*(?P<release_year>\d{4})(?!\d))?(?:.+?Released on\s*:\s*(?P<release_date>\d{4}-\d{2}-\d{2}))?(.+?\nArtist\s*:\s*(?P<clean_artist>[^\n]+))?.+\nAuto-generated by YouTube\.\s*$', video_description)
822b9d9c
RA
2261 if mobj:
2262 if not track:
2263 track = mobj.group('track').strip()
2264 if not artist:
2265 artist = mobj.group('clean_artist') or ', '.join(a.strip() for a in mobj.group('artist').split('·'))
92bc97d3
RA
2266 if not album:
2267 album = mobj.group('album'.strip())
822b9d9c
RA
2268 release_year = mobj.group('release_year')
2269 release_date = mobj.group('release_date')
2270 if release_date:
2271 release_date = release_date.replace('-', '')
2272 if not release_year:
2273 release_year = int(release_date[:4])
2274 if release_year:
2275 release_year = int(release_year)
7e72694b 2276
38d70284 2277 yt_initial_data = self._extract_yt_initial_data(video_id, video_webpage)
2278 contents = try_get(yt_initial_data, lambda x: x['contents']['twoColumnWatchNextResults']['results']['results']['contents'], list) or []
2279 for content in contents:
2280 rows = try_get(content, lambda x: x['videoSecondaryInfoRenderer']['metadataRowContainer']['metadataRowContainerRenderer']['rows'], list) or []
2281 multiple_songs = False
2282 for row in rows:
2283 if try_get(row, lambda x: x['metadataRowRenderer']['hasDividerLine']) is True:
2284 multiple_songs = True
2285 break
2286 for row in rows:
2287 mrr = row.get('metadataRowRenderer') or {}
2288 mrr_title = try_get(
2289 mrr, lambda x: x['title']['simpleText'], compat_str)
2290 mrr_contents = try_get(
2291 mrr, lambda x: x['contents'][0], dict) or {}
2292 mrr_contents_text = try_get(mrr_contents, [lambda x: x['simpleText'], lambda x: x['runs'][0]['text']], compat_str)
2293 if not (mrr_title and mrr_contents_text):
2294 continue
2295 if mrr_title == 'License':
2296 video_license = mrr_contents_text
2297 elif not multiple_songs:
2298 if mrr_title == 'Album':
2299 album = mrr_contents_text
2300 elif mrr_title == 'Artist':
2301 artist = mrr_contents_text
2302 elif mrr_title == 'Song':
2303 track = mrr_contents_text
9322f116 2304
7e72694b
S
2305 m_episode = re.search(
2306 r'<div[^>]+id="watch7-headline"[^>]*>\s*<span[^>]*>.*?>(?P<series>[^<]+)</a></b>\s*S(?P<season>\d+)\s*•\s*E(?P<episode>\d+)</span>',
2307 video_webpage)
2308 if m_episode:
c2dd2dc0 2309 series = unescapeHTML(m_episode.group('series'))
7e72694b
S
2310 season_number = int(m_episode.group('season'))
2311 episode_number = int(m_episode.group('episode'))
2312 else:
2313 series = season_number = episode_number = None
2314
2315 m_cat_container = self._search_regex(
2316 r'(?s)<h4[^>]*>\s*Category\s*</h4>\s*<ul[^>]*>(.*?)</ul>',
2317 video_webpage, 'categories', default=None)
dbeafce5 2318 category = None
7e72694b
S
2319 if m_cat_container:
2320 category = self._html_search_regex(
2321 r'(?s)<a[^<]+>(.*?)</a>', m_cat_container, 'category',
2322 default=None)
dbeafce5
S
2323 if not category:
2324 category = try_get(
2325 microformat, lambda x: x['category'], compat_str)
2326 video_categories = None if category is None else [category]
7e72694b
S
2327
2328 video_tags = [
2329 unescapeHTML(m.group('content'))
2330 for m in re.finditer(self._meta_regex('og:video:tag'), video_webpage)]
dbeafce5
S
2331 if not video_tags:
2332 video_tags = try_get(video_details, lambda x: x['keywords'], list)
7e72694b
S
2333
2334 def _extract_count(count_name):
2335 return str_to_int(self._search_regex(
a0566bbf 2336 (r'-%s-button[^>]+><span[^>]+class="yt-uix-button-content"[^>]*>([\d,]+)</span>' % re.escape(count_name),
2337 r'["\']label["\']\s*:\s*["\']([\d,.]+)\s+%ss["\']' % re.escape(count_name)),
7e72694b
S
2338 video_webpage, count_name, default=None))
2339
2340 like_count = _extract_count('like')
2341 dislike_count = _extract_count('dislike')
2342
dbdaaa23
S
2343 if view_count is None:
2344 view_count = str_to_int(self._search_regex(
2345 r'<[^>]+class=["\']watch-view-count[^>]+>\s*([\d,\s]+)', video_webpage,
2346 'view count', default=None))
2347
bf3c9326
S
2348 average_rating = (
2349 float_or_none(video_details.get('averageRating'))
2350 or try_get(video_info, lambda x: float_or_none(x['avg_rating'][0])))
2351
7e72694b 2352 # subtitles
321bf820 2353 video_subtitles = self.extract_subtitles(
2354 video_id, video_webpage, has_live_chat_replay)
7e72694b
S
2355 automatic_captions = self.extract_automatic_captions(video_id, video_webpage)
2356
2357 video_duration = try_get(
2358 video_info, lambda x: int_or_none(x['length_seconds'][0]))
dbdaaa23
S
2359 if not video_duration:
2360 video_duration = int_or_none(video_details.get('lengthSeconds'))
7e72694b
S
2361 if not video_duration:
2362 video_duration = parse_duration(self._html_search_meta(
2363 'duration', video_webpage, 'video duration'))
2364
b84071c0
JP
2365 # Get Subscriber Count of channel
2366 subscriber_count = parse_count(self._search_regex(
2367 r'"text":"([\d\.]+\w?) subscribers"',
2368 video_webpage,
2369 'subscriber count',
2370 default=None
2371 ))
2372
7e72694b
S
2373 # annotations
2374 video_annotations = None
2375 if self._downloader.params.get('writeannotations', False):
64b6a4e9
RA
2376 xsrf_token = self._search_regex(
2377 r'([\'"])XSRF_TOKEN\1\s*:\s*([\'"])(?P<xsrf_token>[A-Za-z0-9+/=]+)\2',
2378 video_webpage, 'xsrf token', group='xsrf_token', fatal=False)
2379 invideo_url = try_get(
2380 player_response, lambda x: x['annotations'][0]['playerAnnotationsUrlsRenderer']['invideoUrl'], compat_str)
2381 if xsrf_token and invideo_url:
2382 xsrf_field_name = self._search_regex(
2383 r'([\'"])XSRF_FIELD_NAME\1\s*:\s*([\'"])(?P<xsrf_field_name>\w+)\2',
2384 video_webpage, 'xsrf field name',
2385 group='xsrf_field_name', default='session_token')
2386 video_annotations = self._download_webpage(
2387 self._proto_relative_url(invideo_url),
2388 video_id, note='Downloading annotations',
2389 errnote='Unable to download video annotations', fatal=False,
2390 data=urlencode_postdata({xsrf_field_name: xsrf_token}))
7e72694b 2391
84213ea8 2392 chapters = self._extract_chapters(video_webpage, description_original, video_id, video_duration)
7e72694b 2393
dd27fd17 2394 # Look for the DASH manifest
203fb43f 2395 if self._downloader.params.get('youtube_include_dash_manifest', True):
77c6fb5b 2396 dash_mpd_fatal = True
8ff648e4 2397 for mpd_url in dash_mpds:
d8d24a92 2398 dash_formats = {}
774e208f 2399 try:
05d0d131
YCH
2400 def decrypt_sig(mobj):
2401 s = mobj.group(1)
2402 dec_s = self._decrypt_signature(s, video_id, player_url, age_gate)
2403 return '/signature/%s' % dec_s
2404
8ff648e4 2405 mpd_url = re.sub(r'/s/([a-fA-F0-9\.]+)', decrypt_sig, mpd_url)
2d2fa82d 2406
8ff648e4 2407 for df in self._extract_mpd_formats(
2408 mpd_url, video_id, fatal=dash_mpd_fatal,
2409 formats_dict=self._formats):
c63ca0ee
S
2410 if not df.get('filesize'):
2411 df['filesize'] = _extract_filesize(df['url'])
d8d24a92
S
2412 # Do not overwrite DASH format found in some previous DASH manifest
2413 if df['format_id'] not in dash_formats:
2414 dash_formats[df['format_id']] = df
77c6fb5b
S
2415 # Additional DASH manifests may end up in HTTP Error 403 therefore
2416 # allow them to fail without bug report message if we already have
2417 # some DASH manifest succeeded. This is temporary workaround to reduce
2418 # burst of bug reports until we figure out the reason and whether it
2419 # can be fixed at all.
2420 dash_mpd_fatal = False
774e208f
PH
2421 except (ExtractorError, KeyError) as e:
2422 self.report_warning(
2423 'Skipping DASH manifest: %r' % e, video_id)
d8d24a92 2424 if dash_formats:
04b3b3df
JMF
2425 # Remove the formats we found through non-DASH, they
2426 # contain less info and it can be wrong, because we use
2427 # fixed values (for example the resolution). See
067aa17e 2428 # https://github.com/ytdl-org/youtube-dl/issues/5774 for an
04b3b3df 2429 # example.
d80265cc 2430 formats = [f for f in formats if f['format_id'] not in dash_formats.keys()]
d8d24a92 2431 formats.extend(dash_formats.values())
d80044c2 2432
6271f1ca
PH
2433 # Check for malformed aspect ratio
2434 stretched_m = re.search(
2435 r'<meta\s+property="og:video:tag".*?content="yt:stretch=(?P<w>[0-9]+):(?P<h>[0-9]+)">',
2436 video_webpage)
2437 if stretched_m:
313dfc45
LL
2438 w = float(stretched_m.group('w'))
2439 h = float(stretched_m.group('h'))
5faf9fed
S
2440 # yt:stretch may hold invalid ratio data (e.g. for Q39EVAstoRM ratio is 17:0).
2441 # We will only process correct ratios.
313dfc45 2442 if w > 0 and h > 0:
41f24c32 2443 ratio = w / h
313dfc45
LL
2444 for f in formats:
2445 if f.get('vcodec') != 'none':
2446 f['stretched_ratio'] = ratio
6271f1ca 2447
026fbedc 2448 if not formats:
43ebf77d
S
2449 if 'reason' in video_info:
2450 if 'The uploader has not made this video available in your country.' in video_info['reason']:
2451 regions_allowed = self._html_search_meta(
2452 'regionsAllowed', video_webpage, default=None)
2453 countries = regions_allowed.split(',') if regions_allowed else None
2454 self.raise_geo_restricted(
2455 msg=video_info['reason'][0], countries=countries)
2456 reason = video_info['reason'][0]
2457 if 'Invalid parameters' in reason:
2458 unavailable_message = extract_unavailable_message()
2459 if unavailable_message:
2460 reason = unavailable_message
2461 raise ExtractorError(
2462 'YouTube said: %s' % reason,
2463 expected=True, video_id=video_id)
2464 if video_info.get('license_info') or try_get(player_response, lambda x: x['streamingData']['licenseInfos']):
2465 raise ExtractorError('This video is DRM protected.', expected=True)
0d297518 2466
4bcc7bd1 2467 self._sort_formats(formats)
4ea3be0a 2468
21c340b8 2469 self.mark_watched(video_id, video_info, player_response)
d77ab8e2 2470
4ea3be0a 2471 return {
8bcc8756
JW
2472 'id': video_id,
2473 'uploader': video_uploader,
2474 'uploader_id': video_uploader_id,
fd050249 2475 'uploader_url': video_uploader_url,
dd4c4492
S
2476 'channel_id': channel_id,
2477 'channel_url': channel_url,
8bcc8756 2478 'upload_date': upload_date,
7caf9830 2479 'license': video_license,
936784b2 2480 'creator': video_creator or artist,
8bcc8756 2481 'title': video_title,
936784b2 2482 'alt_title': video_alt_title or track,
b477fc13 2483 'thumbnails': thumbnails,
8bcc8756
JW
2484 'description': video_description,
2485 'categories': video_categories,
000b6b5a 2486 'tags': video_tags,
8bcc8756 2487 'subtitles': video_subtitles,
360e1ca5 2488 'automatic_captions': automatic_captions,
8bcc8756
JW
2489 'duration': video_duration,
2490 'age_limit': 18 if age_gate else 0,
2491 'annotations': video_annotations,
9cafc3fd 2492 'chapters': chapters,
7e8c0af0 2493 'webpage_url': proto + '://www.youtube.com/watch?v=%s' % video_id,
8bcc8756 2494 'view_count': view_count,
4ea3be0a 2495 'like_count': like_count,
2496 'dislike_count': dislike_count,
bf3c9326 2497 'average_rating': average_rating,
8bcc8756 2498 'formats': formats,
2fe1ff85 2499 'is_live': is_live,
7c80519c 2500 'start_time': start_time,
297a564b 2501 'end_time': end_time,
12afdc2a
S
2502 'series': series,
2503 'season_number': season_number,
2504 'episode_number': episode_number,
936784b2
S
2505 'track': track,
2506 'artist': artist,
5caabd3c 2507 'album': album,
2508 'release_date': release_date,
2509 'release_year': release_year,
b84071c0 2510 'subscriber_count': subscriber_count,
4ea3be0a 2511 }
c5e8d7af 2512
5f6a1245 2513
8bdd16b4 2514class YoutubeTabIE(YoutubeBaseInfoExtractor):
2515 IE_DESC = 'YouTube.com tab'
70d5c17b 2516 _VALID_URL = r'''(?x)
2517 https?://
2518 (?:\w+\.)?
2519 (?:
2520 youtube(?:kids)?\.com|
2521 invidio\.us
2522 )/
2523 (?:
2524 (?:channel|c|user)/|
2525 (?P<not_channel>
3d3dddc9 2526 feed/|
70d5c17b 2527 (?:playlist|watch)\?.*?\blist=
2528 )|
2529 (?!(%s)([/#?]|$)) # Direct URLs
2530 )
2531 (?P<id>[^/?\#&]+)
2532 ''' % YoutubeBaseInfoExtractor._RESERVED_NAMES
8bdd16b4 2533 IE_NAME = 'youtube:tab'
2534
81127aa5 2535 _TESTS = [{
8bdd16b4 2536 # playlists, multipage
2537 'url': 'https://www.youtube.com/c/ИгорьКлейнер/playlists?view=1&flow=grid',
2538 'playlist_mincount': 94,
2539 'info_dict': {
2540 'id': 'UCqj7Cz7revf5maW9g5pgNcg',
2541 'title': 'Игорь Клейнер - Playlists',
2542 'description': 'md5:be97ee0f14ee314f1f002cf187166ee2',
2543 },
2544 }, {
2545 # playlists, multipage, different order
2546 'url': 'https://www.youtube.com/user/igorkle1/playlists?view=1&sort=dd',
2547 'playlist_mincount': 94,
2548 'info_dict': {
2549 'id': 'UCqj7Cz7revf5maW9g5pgNcg',
2550 'title': 'Игорь Клейнер - Playlists',
2551 'description': 'md5:be97ee0f14ee314f1f002cf187166ee2',
2552 },
2553 }, {
2554 # playlists, singlepage
2555 'url': 'https://www.youtube.com/user/ThirstForScience/playlists',
2556 'playlist_mincount': 4,
2557 'info_dict': {
2558 'id': 'UCAEtajcuhQ6an9WEzY9LEMQ',
2559 'title': 'ThirstForScience - Playlists',
2560 'description': 'md5:609399d937ea957b0f53cbffb747a14c',
2561 }
2562 }, {
2563 'url': 'https://www.youtube.com/c/ChristophLaimer/playlists',
2564 'only_matching': True,
2565 }, {
2566 # basic, single video playlist
0e30a7b9 2567 'url': 'https://www.youtube.com/playlist?list=PL4lCao7KL_QFVb7Iudeipvc2BCavECqzc',
81127aa5 2568 'info_dict': {
0e30a7b9 2569 'uploader_id': 'UCmlqkdCBesrv2Lak1mF_MxA',
2570 'uploader': 'Sergey M.',
2571 'id': 'PL4lCao7KL_QFVb7Iudeipvc2BCavECqzc',
3867038a 2572 'title': 'youtube-dl public playlist',
81127aa5 2573 },
0e30a7b9 2574 'playlist_count': 1,
9291475f 2575 }, {
8bdd16b4 2576 # empty playlist
0e30a7b9 2577 'url': 'https://www.youtube.com/playlist?list=PL4lCao7KL_QFodcLWhDpGCYnngnHtQ-Xf',
9291475f 2578 'info_dict': {
0e30a7b9 2579 'uploader_id': 'UCmlqkdCBesrv2Lak1mF_MxA',
2580 'uploader': 'Sergey M.',
2581 'id': 'PL4lCao7KL_QFodcLWhDpGCYnngnHtQ-Xf',
3867038a 2582 'title': 'youtube-dl empty playlist',
9291475f
PH
2583 },
2584 'playlist_count': 0,
2585 }, {
8bdd16b4 2586 # Home tab
2587 'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w/featured',
9291475f 2588 'info_dict': {
8bdd16b4 2589 'id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
2590 'title': 'lex will - Home',
2591 'description': 'md5:2163c5d0ff54ed5f598d6a7e6211e488',
9291475f 2592 },
8bdd16b4 2593 'playlist_mincount': 2,
9291475f 2594 }, {
8bdd16b4 2595 # Videos tab
2596 'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w/videos',
9291475f 2597 'info_dict': {
8bdd16b4 2598 'id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
2599 'title': 'lex will - Videos',
2600 'description': 'md5:2163c5d0ff54ed5f598d6a7e6211e488',
9291475f 2601 },
8bdd16b4 2602 'playlist_mincount': 975,
9291475f 2603 }, {
8bdd16b4 2604 # Videos tab, sorted by popular
2605 'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w/videos?view=0&sort=p&flow=grid',
9291475f 2606 'info_dict': {
8bdd16b4 2607 'id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
2608 'title': 'lex will - Videos',
2609 'description': 'md5:2163c5d0ff54ed5f598d6a7e6211e488',
9291475f 2610 },
8bdd16b4 2611 'playlist_mincount': 199,
9291475f 2612 }, {
8bdd16b4 2613 # Playlists tab
2614 'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w/playlists',
9291475f 2615 'info_dict': {
8bdd16b4 2616 'id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
2617 'title': 'lex will - Playlists',
2618 'description': 'md5:2163c5d0ff54ed5f598d6a7e6211e488',
9291475f 2619 },
8bdd16b4 2620 'playlist_mincount': 17,
ac7553d0 2621 }, {
8bdd16b4 2622 # Community tab
2623 'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w/community',
ac7553d0 2624 'info_dict': {
8bdd16b4 2625 'id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
2626 'title': 'lex will - Community',
2627 'description': 'md5:2163c5d0ff54ed5f598d6a7e6211e488',
2628 },
2629 'playlist_mincount': 18,
87dadd45 2630 }, {
8bdd16b4 2631 # Channels tab
2632 'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w/channels',
87dadd45 2633 'info_dict': {
8bdd16b4 2634 'id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
2635 'title': 'lex will - Channels',
2636 'description': 'md5:2163c5d0ff54ed5f598d6a7e6211e488',
2637 },
2638 'playlist_mincount': 138,
6b08cdf6 2639 }, {
a0566bbf 2640 'url': 'https://invidio.us/channel/UCmlqkdCBesrv2Lak1mF_MxA',
8bdd16b4 2641 'only_matching': True,
2642 }, {
a0566bbf 2643 'url': 'https://www.youtubekids.com/channel/UCmlqkdCBesrv2Lak1mF_MxA',
8bdd16b4 2644 'only_matching': True,
2645 }, {
a0566bbf 2646 'url': 'https://music.youtube.com/channel/UCmlqkdCBesrv2Lak1mF_MxA',
8bdd16b4 2647 'only_matching': True,
2648 }, {
2649 'note': 'Playlist with deleted videos (#651). As a bonus, the video #51 is also twice in this list.',
2650 'url': 'https://www.youtube.com/playlist?list=PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC',
2651 'info_dict': {
2652 'title': '29C3: Not my department',
2653 'id': 'PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC',
2654 'uploader': 'Christiaan008',
2655 'uploader_id': 'UCEPzS1rYsrkqzSLNp76nrcg',
2656 },
2657 'playlist_count': 96,
2658 }, {
2659 'note': 'Large playlist',
2660 'url': 'https://www.youtube.com/playlist?list=UUBABnxM4Ar9ten8Mdjj1j0Q',
6b08cdf6 2661 'info_dict': {
8bdd16b4 2662 'title': 'Uploads from Cauchemar',
2663 'id': 'UUBABnxM4Ar9ten8Mdjj1j0Q',
2664 'uploader': 'Cauchemar',
2665 'uploader_id': 'UCBABnxM4Ar9ten8Mdjj1j0Q',
13a75688 2666 },
8bdd16b4 2667 'playlist_mincount': 1123,
2668 }, {
2669 # even larger playlist, 8832 videos
2670 'url': 'http://www.youtube.com/user/NASAgovVideo/videos',
2671 'only_matching': True,
4b7df0d3
JMF
2672 }, {
2673 'note': 'Buggy playlist: the webpage has a "Load more" button but it doesn\'t have more videos',
2674 'url': 'https://www.youtube.com/playlist?list=UUXw-G3eDE9trcvY2sBMM_aA',
2675 'info_dict': {
acf757f4
PH
2676 'title': 'Uploads from Interstellar Movie',
2677 'id': 'UUXw-G3eDE9trcvY2sBMM_aA',
13a75688 2678 'uploader': 'Interstellar Movie',
8bdd16b4 2679 'uploader_id': 'UCXw-G3eDE9trcvY2sBMM_aA',
4b7df0d3 2680 },
481cc733 2681 'playlist_mincount': 21,
8bdd16b4 2682 }, {
2683 # https://github.com/ytdl-org/youtube-dl/issues/21844
2684 'url': 'https://www.youtube.com/playlist?list=PLzH6n4zXuckpfMu_4Ff8E7Z1behQks5ba',
2685 'info_dict': {
2686 'title': 'Data Analysis with Dr Mike Pound',
2687 'id': 'PLzH6n4zXuckpfMu_4Ff8E7Z1behQks5ba',
2688 'uploader_id': 'UC9-y-6csu5WGm29I7JiwpnA',
2689 'uploader': 'Computerphile',
2690 },
2691 'playlist_mincount': 11,
2692 }, {
a0566bbf 2693 'url': 'https://invidio.us/playlist?list=PL4lCao7KL_QFVb7Iudeipvc2BCavECqzc',
8bdd16b4 2694 'only_matching': True,
dacb3a86
S
2695 }, {
2696 # Playlist URL that does not actually serve a playlist
2697 'url': 'https://www.youtube.com/watch?v=FqZTN594JQw&list=PLMYEtVRpaqY00V9W81Cwmzp6N6vZqfUKD4',
2698 'info_dict': {
2699 'id': 'FqZTN594JQw',
2700 'ext': 'webm',
2701 'title': "Smiley's People 01 detective, Adventure Series, Action",
2702 'uploader': 'STREEM',
2703 'uploader_id': 'UCyPhqAZgwYWZfxElWVbVJng',
ec85ded8 2704 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UCyPhqAZgwYWZfxElWVbVJng',
dacb3a86
S
2705 'upload_date': '20150526',
2706 'license': 'Standard YouTube License',
2707 'description': 'md5:507cdcb5a49ac0da37a920ece610be80',
2708 'categories': ['People & Blogs'],
2709 'tags': list,
dbdaaa23 2710 'view_count': int,
dacb3a86
S
2711 'like_count': int,
2712 'dislike_count': int,
2713 },
2714 'params': {
2715 'skip_download': True,
2716 },
13a75688 2717 'skip': 'This video is not available.',
dacb3a86 2718 'add_ie': [YoutubeIE.ie_key()],
481cc733 2719 }, {
8bdd16b4 2720 'url': 'https://www.youtubekids.com/watch?v=Agk7R8I8o5U&list=PUZ6jURNr1WQZCNHF0ao-c0g',
c0345b82 2721 'only_matching': True,
66b48727 2722 }, {
8bdd16b4 2723 'url': 'https://www.youtube.com/watch?v=MuAGGZNfUkU&list=RDMM',
66b48727 2724 'only_matching': True,
a0566bbf 2725 }, {
2726 'url': 'https://www.youtube.com/channel/UCoMdktPbSTixAyNGwb-UYkQ/live',
2727 'info_dict': {
2728 'id': '9Auq9mYxFEE',
2729 'ext': 'mp4',
2730 'title': 'Watch Sky News live',
2731 'uploader': 'Sky News',
2732 'uploader_id': 'skynews',
2733 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/skynews',
2734 'upload_date': '20191102',
2735 'description': 'md5:78de4e1c2359d0ea3ed829678e38b662',
2736 'categories': ['News & Politics'],
2737 'tags': list,
2738 'like_count': int,
2739 'dislike_count': int,
2740 },
2741 'params': {
2742 'skip_download': True,
2743 },
2744 }, {
2745 'url': 'https://www.youtube.com/user/TheYoungTurks/live',
2746 'info_dict': {
2747 'id': 'a48o2S1cPoo',
2748 'ext': 'mp4',
2749 'title': 'The Young Turks - Live Main Show',
2750 'uploader': 'The Young Turks',
2751 'uploader_id': 'TheYoungTurks',
2752 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/TheYoungTurks',
2753 'upload_date': '20150715',
2754 'license': 'Standard YouTube License',
2755 'description': 'md5:438179573adcdff3c97ebb1ee632b891',
2756 'categories': ['News & Politics'],
2757 'tags': ['Cenk Uygur (TV Program Creator)', 'The Young Turks (Award-Winning Work)', 'Talk Show (TV Genre)'],
2758 'like_count': int,
2759 'dislike_count': int,
2760 },
2761 'params': {
2762 'skip_download': True,
2763 },
2764 'only_matching': True,
2765 }, {
2766 'url': 'https://www.youtube.com/channel/UC1yBKRuGpC1tSM73A0ZjYjQ/live',
2767 'only_matching': True,
2768 }, {
2769 'url': 'https://www.youtube.com/c/CommanderVideoHq/live',
2770 'only_matching': True,
3d3dddc9 2771 }, {
2772 'url': 'https://www.youtube.com/feed/trending',
2773 'only_matching': True,
2774 }, {
2775 # needs auth
2776 'url': 'https://www.youtube.com/feed/library',
2777 'only_matching': True,
2778 }, {
2779 # needs auth
2780 'url': 'https://www.youtube.com/feed/history',
2781 'only_matching': True,
2782 }, {
2783 # needs auth
2784 'url': 'https://www.youtube.com/feed/subscriptions',
2785 'only_matching': True,
2786 }, {
2787 # needs auth
2788 'url': 'https://www.youtube.com/feed/watch_later',
2789 'only_matching': True,
2790 }, {
2791 # no longer available?
2792 'url': 'https://www.youtube.com/feed/recommended',
2793 'only_matching': True,
2794 }
ef2f3c7f 2795 # TODO
2796 # {
2797 # 'url': 'https://www.youtube.com/TheYoungTurks/live',
2798 # 'only_matching': True,
2799 # }
a0566bbf 2800 ]
8bdd16b4 2801
2802 def _extract_channel_id(self, webpage):
2803 channel_id = self._html_search_meta(
2804 'channelId', webpage, 'channel id', default=None)
2805 if channel_id:
2806 return channel_id
2807 channel_url = self._html_search_meta(
2808 ('og:url', 'al:ios:url', 'al:android:url', 'al:web:url',
2809 'twitter:url', 'twitter:app:url:iphone', 'twitter:app:url:ipad',
2810 'twitter:app:url:googleplay'), webpage, 'channel url')
2811 return self._search_regex(
2812 r'https?://(?:www\.)?youtube\.com/channel/([^/?#&])+',
2813 channel_url, 'channel id')
15f6397c 2814
8bdd16b4 2815 @staticmethod
2816 def _extract_grid_item_renderer(item):
2817 for item_kind in ('Playlist', 'Video', 'Channel'):
2818 renderer = item.get('grid%sRenderer' % item_kind)
2819 if renderer:
2820 return renderer
2821
def _extract_video(self, renderer):
    """Build a ``url_transparent`` result for a video renderer dict.

    The video ID doubles as the URL handed to the YoutubeIE extractor;
    title, description snippet, duration, view count and uploader are
    read from the renderer when present and are ``None`` otherwise.
    """
    raw_views = try_get(
        renderer, lambda r: r['viewCountText']['simpleText'], compat_str) or ''
    # Whitespace is removed first so that the leading digit/comma run can
    # be matched from the very start of the text.
    compact_views = re.sub(r'\s', '', raw_views)
    leading_number = self._search_regex(
        r'^([\d,]+)', compact_views, 'view count', default=None)
    length_text = try_get(
        renderer, lambda r: r['lengthText']['simpleText'], compat_str)
    video_id = renderer.get('videoId')

    info = {
        '_type': 'url_transparent',
        'ie_key': YoutubeIE.ie_key(),
        'id': video_id,
        'url': video_id,
    }
    # The title can be given either as text runs or as plain simpleText.
    info['title'] = try_get(
        renderer,
        (lambda r: r['title']['runs'][0]['text'],
         lambda r: r['title']['simpleText']), compat_str)
    info['description'] = try_get(
        renderer, lambda r: r['descriptionSnippet']['runs'][0]['text'],
        compat_str)
    info['duration'] = parse_duration(length_text)
    info['view_count'] = str_to_int(leading_number)
    info['uploader'] = try_get(
        renderer, lambda r: r['ownerText']['runs'][0]['text'], compat_str)
    return info
652cdaa2 2851
8bdd16b4 2852 def _grid_entries(self, grid_renderer):
2853 for item in grid_renderer['items']:
2854 if not isinstance(item, dict):
39b62db1 2855 continue
8bdd16b4 2856 renderer = self._extract_grid_item_renderer(item)
2857 if not isinstance(renderer, dict):
2858 continue
2859 title = try_get(
2860 renderer, lambda x: x['title']['runs'][0]['text'], compat_str)
2861 # playlist
2862 playlist_id = renderer.get('playlistId')
2863 if playlist_id:
2864 yield self.url_result(
2865 'https://www.youtube.com/playlist?list=%s' % playlist_id,
2866 ie=YoutubeTabIE.ie_key(), video_id=playlist_id,
2867 video_title=title)
2868 # video
2869 video_id = renderer.get('videoId')
2870 if video_id:
2871 yield self._extract_video(renderer)
2872 # channel
2873 channel_id = renderer.get('channelId')
2874 if channel_id:
2875 title = try_get(
2876 renderer, lambda x: x['title']['simpleText'], compat_str)
2877 yield self.url_result(
2878 'https://www.youtube.com/channel/%s' % channel_id,
2879 ie=YoutubeTabIE.ie_key(), video_title=title)
2880
3d3dddc9 2881 def _shelf_entries_from_content(self, shelf_renderer):
2882 content = shelf_renderer.get('content')
2883 if not isinstance(content, dict):
8bdd16b4 2884 return
3d3dddc9 2885 renderer = content.get('gridRenderer')
2886 if renderer:
2887 # TODO: add support for nested playlists so each shelf is processed
2888 # as separate playlist
2889 # TODO: this includes only first N items
2890 for entry in self._grid_entries(renderer):
2891 yield entry
2892 renderer = content.get('horizontalListRenderer')
2893 if renderer:
2894 # TODO
2895 pass
8bdd16b4 2896
2897 def _shelf_entries(self, shelf_renderer):
2898 ep = try_get(
2899 shelf_renderer, lambda x: x['endpoint']['commandMetadata']['webCommandMetadata']['url'],
2900 compat_str)
2901 shelf_url = urljoin('https://www.youtube.com', ep)
3d3dddc9 2902 if shelf_url:
2903 title = try_get(
2904 shelf_renderer, lambda x: x['title']['runs'][0]['text'], compat_str)
2905 yield self.url_result(shelf_url, video_title=title)
2906 # Shelf may not contain shelf URL, fallback to extraction from content
2907 for entry in self._shelf_entries_from_content(shelf_renderer):
2908 yield entry
c5e8d7af 2909
8bdd16b4 2910 def _playlist_entries(self, video_list_renderer):
2911 for content in video_list_renderer['contents']:
2912 if not isinstance(content, dict):
2913 continue
2914 renderer = content.get('playlistVideoRenderer') or content.get('playlistPanelVideoRenderer')
2915 if not isinstance(renderer, dict):
2916 continue
2917 video_id = renderer.get('videoId')
2918 if not video_id:
2919 continue
2920 yield self._extract_video(renderer)
07aeced6 2921
3d3dddc9 2922 r""" # Not needed in the new implementation
3462ffa8 2923 def _itemSection_entries(self, item_sect_renderer):
2924 for content in item_sect_renderer['contents']:
2925 if not isinstance(content, dict):
2926 continue
2927 renderer = content.get('videoRenderer', {})
2928 if not isinstance(renderer, dict):
2929 continue
2930 video_id = renderer.get('videoId')
2931 if not video_id:
2932 continue
2933 yield self._extract_video(renderer)
3d3dddc9 2934 """
3462ffa8 2935
def _rich_entries(self, rich_grid_renderer):
    """Yield the video nested under ``content.videoRenderer``, if any.

    Nothing is yielded when that renderer is missing, is not a dict, or
    has no ``videoId``.
    """
    video_renderer = try_get(
        rich_grid_renderer, lambda r: r['content']['videoRenderer'], dict)
    if video_renderer and video_renderer.get('videoId'):
        yield self._extract_video(video_renderer)
2943
8bdd16b4 2944 def _video_entry(self, video_renderer):
2945 video_id = video_renderer.get('videoId')
2946 if video_id:
2947 return self._extract_video(video_renderer)
dacb3a86 2948
def _post_thread_entries(self, post_thread_renderer):
    """Yield the videos referenced by a community post thread.

    Two sources are inspected inside ``post.backstagePostRenderer``:

    * the video attached to the post (``backstageAttachment.videoRenderer``);
    * URLs in the post text runs that YoutubeIE declares suitable.

    An inline link pointing at the attached video is skipped so the same
    video is not yielded twice.
    """
    post_renderer = try_get(
        post_thread_renderer, lambda x: x['post']['backstagePostRenderer'], dict)
    if not post_renderer:
        return
    # video attachment
    video_renderer = try_get(
        post_renderer, lambda x: x['backstageAttachment']['videoRenderer'], dict) or {}
    # Remember the attached video's ID; the duplicate check below depends
    # on it (it used to stay None, so the check could never match).
    video_id = video_renderer.get('videoId')
    if video_id:
        entry = self._video_entry(video_renderer)
        if entry:
            yield entry
    # inline video links
    runs = try_get(post_renderer, lambda x: x['contentText']['runs'], list) or []
    for run in runs:
        if not isinstance(run, dict):
            continue
        ep_url = try_get(
            run, lambda x: x['navigationEndpoint']['urlEndpoint']['url'], compat_str)
        if not ep_url:
            continue
        if not YoutubeIE.suitable(ep_url):
            continue
        ep_video_id = YoutubeIE._match_id(ep_url)
        if video_id == ep_video_id:
            continue
        # Report the ID of the linked video, not of the attachment.
        yield self.url_result(ep_url, ie=YoutubeIE.ie_key(), video_id=ep_video_id)
dacb3a86 2977
8bdd16b4 2978 def _post_thread_continuation_entries(self, post_thread_continuation):
2979 contents = post_thread_continuation.get('contents')
2980 if not isinstance(contents, list):
2981 return
2982 for content in contents:
2983 renderer = content.get('backstagePostThreadRenderer')
2984 if not isinstance(renderer, dict):
2985 continue
2986 for entry in self._post_thread_entries(renderer):
2987 yield entry
07aeced6 2988
@staticmethod
def _extract_next_continuation_data(renderer):
    """Return continuation parameters from ``continuations[0].nextContinuationData``.

    The result is a dict with ``ctoken`` and ``continuation`` (both set to
    the continuation token) plus ``itct`` (the click tracking params,
    possibly None).  ``None`` is returned when the data or the token is
    missing.
    """
    next_data = try_get(
        renderer, lambda r: r['continuations'][0]['nextContinuationData'], dict)
    token = next_data.get('continuation') if next_data else None
    if not token:
        return None
    return {
        'ctoken': token,
        'continuation': token,
        'itct': next_data.get('clickTrackingParams'),
    }
c5e8d7af 3004
@classmethod
def _extract_continuation(cls, renderer):
    """Return the parameters needed to request the next page, or None.

    ``nextContinuationData`` is tried first.  Failing that, the
    renderer's ``contents`` list is scanned for the first
    ``continuationItemRenderer`` whose endpoint provides both a
    continuation token and click tracking params.
    """
    from_next_data = cls._extract_next_continuation_data(renderer)
    if from_next_data:
        return from_next_data

    contents = renderer.get('contents')
    if not isinstance(contents, list):
        return None

    for content in contents:
        if not isinstance(content, dict):
            continue
        endpoint = try_get(
            content,
            lambda c: c['continuationItemRenderer']['continuationEndpoint'],
            dict)
        if not endpoint:
            continue
        token = try_get(
            endpoint, lambda e: e['continuationCommand']['token'], compat_str)
        click_params = endpoint.get('clickTrackingParams')
        # Both values are required; otherwise keep scanning.
        if token and click_params:
            return {
                'ctoken': token,
                'continuation': token,
                'itct': click_params,
            }
    return None
448830ce 3033
8bdd16b4 3034 def _entries(self, tab, identity_token):
3462ffa8 3035
70d5c17b 3036 def extract_entries(parent_renderer): # this needs to called again for continuation to work with feeds
3037 contents = try_get(parent_renderer, lambda x: x['contents'], list) or []
3038 for content in contents:
3039 if not isinstance(content, dict):
8bdd16b4 3040 continue
70d5c17b 3041 is_renderer = try_get(content, lambda x: x['itemSectionRenderer'], dict)
3462ffa8 3042 if not is_renderer:
70d5c17b 3043 renderer = content.get('richItemRenderer')
3462ffa8 3044 if renderer:
3045 for entry in self._rich_entries(renderer):
3046 yield entry
3047 continuation_list[0] = self._extract_continuation(parent_renderer)
8bdd16b4 3048 continue
3462ffa8 3049 isr_contents = try_get(is_renderer, lambda x: x['contents'], list) or []
3050 for isr_content in isr_contents:
3051 if not isinstance(isr_content, dict):
3052 continue
3053 renderer = isr_content.get('playlistVideoListRenderer')
3054 if renderer:
3055 for entry in self._playlist_entries(renderer):
3056 yield entry
3057 continuation_list[0] = self._extract_continuation(renderer)
3058 continue
3059 renderer = isr_content.get('gridRenderer')
3060 if renderer:
3061 for entry in self._grid_entries(renderer):
3062 yield entry
3063 continuation_list[0] = self._extract_continuation(renderer)
3064 continue
3065 renderer = isr_content.get('shelfRenderer')
3066 if renderer:
3067 for entry in self._shelf_entries(renderer):
3068 yield entry
3462ffa8 3069 continue
3070 renderer = isr_content.get('backstagePostThreadRenderer')
3071 if renderer:
3072 for entry in self._post_thread_entries(renderer):
3073 yield entry
3074 continuation_list[0] = self._extract_continuation(renderer)
3075 continue
3076 renderer = isr_content.get('videoRenderer')
3077 if renderer:
3078 entry = self._video_entry(renderer)
3079 if entry:
3080 yield entry
70d5c17b 3081
3462ffa8 3082 if not continuation_list[0]:
3083 continuation_list[0] = self._extract_continuation(is_renderer)
70d5c17b 3084
3085 if not continuation_list[0]:
3086 continuation_list[0] = self._extract_continuation(parent_renderer)
3462ffa8 3087
3088 continuation_list = [None] # Python 2 doesnot support nonlocal
3089 parent_renderer = (
3090 try_get(tab, lambda x: x['sectionListRenderer'], dict)
3091 or try_get(tab, lambda x: x['richGridRenderer'], dict) or {})
70d5c17b 3092 for entry in extract_entries(parent_renderer):
3093 yield entry
3462ffa8 3094 continuation = continuation_list[0]
8bdd16b4 3095
3096 headers = {
3097 'x-youtube-client-name': '1',
3098 'x-youtube-client-version': '2.20201112.04.01',
3099 }
3100 if identity_token:
3101 headers['x-youtube-identity-token'] = identity_token
ebf1b291 3102
8bdd16b4 3103 for page_num in itertools.count(1):
3104 if not continuation:
3105 break
3106 browse = self._download_json(
3107 'https://www.youtube.com/browse_ajax', None,
3108 'Downloading page %d' % page_num,
3109 headers=headers, query=continuation, fatal=False)
3110 if not browse:
3111 break
3112 response = try_get(browse, lambda x: x[1]['response'], dict)
3113 if not response:
3114 break
ebf1b291 3115
8bdd16b4 3116 continuation_contents = try_get(
3117 response, lambda x: x['continuationContents'], dict)
3118 if continuation_contents:
3119 continuation_renderer = continuation_contents.get('playlistVideoListContinuation')
3120 if continuation_renderer:
3121 for entry in self._playlist_entries(continuation_renderer):
3122 yield entry
3123 continuation = self._extract_continuation(continuation_renderer)
3124 continue
3125 continuation_renderer = continuation_contents.get('gridContinuation')
3126 if continuation_renderer:
3127 for entry in self._grid_entries(continuation_renderer):
3128 yield entry
3129 continuation = self._extract_continuation(continuation_renderer)
3130 continue
3131 continuation_renderer = continuation_contents.get('itemSectionContinuation')
3132 if continuation_renderer:
3133 for entry in self._post_thread_continuation_entries(continuation_renderer):
3134 yield entry
3135 continuation = self._extract_continuation(continuation_renderer)
3136 continue
70d5c17b 3137 continuation_renderer = continuation_contents.get('sectionListContinuation') # for feeds
3462ffa8 3138 if continuation_renderer:
3139 continuation_list = [None]
3140 for entry in extract_entries(continuation_renderer):
3141 yield entry
3142 continuation = continuation_list[0]
3143 continue
c5e8d7af 3144
8bdd16b4 3145 continuation_items = try_get(
3146 response, lambda x: x['onResponseReceivedActions'][0]['appendContinuationItemsAction']['continuationItems'], list)
3147 if continuation_items:
3148 continuation_item = continuation_items[0]
3149 if not isinstance(continuation_item, dict):
3150 continue
70d5c17b 3151 renderer = continuation_item.get('playlistVideoRenderer') or continuation_item.get('itemSectionRenderer')
8bdd16b4 3152 if renderer:
3153 video_list_renderer = {'contents': continuation_items}
3154 for entry in self._playlist_entries(video_list_renderer):
3155 yield entry
3156 continuation = self._extract_continuation(video_list_renderer)
3157 continue
8bdd16b4 3158 break
9558dcec 3159
@staticmethod
def _extract_selected_tab(tabs):
    """Return the ``tabRenderer`` dict of the tab marked as selected.

    Raises ExtractorError when no entry of *tabs* carries a true
    ``tabRenderer.selected`` flag.
    """
    chosen = next(
        (candidate for candidate in tabs
         if try_get(candidate, lambda x: x['tabRenderer']['selected'], bool)),
        None)
    if chosen is None:
        raise ExtractorError('Unable to find selected tab')
    return chosen['tabRenderer']
b82f815f 3167
@staticmethod
def _extract_uploader(data):
    """Collect uploader metadata from the playlist sidebar of *data*.

    Returns a dict that is empty when no video owner is present; otherwise
    it holds the keys ``uploader``, ``uploader_id`` and ``uploader_url``.
    When several sidebar items carry an owner, the last one wins.
    """
    info = {}
    sidebar_items = try_get(
        data, lambda x: x['sidebar']['playlistSidebarRenderer']['items'], list)
    for sidebar_item in sidebar_items or []:
        if not isinstance(sidebar_item, dict):
            continue
        secondary_info = sidebar_item.get('playlistSidebarSecondaryInfoRenderer')
        if not isinstance(secondary_info, dict):
            continue
        owner = try_get(
            secondary_info,
            lambda x: x['videoOwner']['videoOwnerRenderer']['title']['runs'][0],
            dict)
        if not owner:
            continue
        info['uploader'] = owner.get('text')
        info['uploader_id'] = try_get(
            owner,
            lambda x: x['navigationEndpoint']['browseEndpoint']['browseId'],
            compat_str)
        canonical_base_url = try_get(
            owner,
            lambda x: x['navigationEndpoint']['browseEndpoint']['canonicalBaseUrl'],
            compat_str)
        info['uploader_url'] = urljoin(
            'https://www.youtube.com/', canonical_base_url)
    return info
3190
def _extract_from_tabs(self, item_id, webpage, data, tabs, identity_token):
    """Build a playlist result from the selected tab of a browse page.

    Title, description and id are taken from the channel metadata when
    present and are then overridden by playlist metadata when that is
    present too.  ``item_id`` is the fallback id, and a generic
    "Youtube <Id>" title is used when no title was found.  Uploader
    fields from the sidebar are merged into the result.
    """
    selected_tab = self._extract_selected_tab(tabs)
    playlist_id = title = description = None

    channel_meta = try_get(
        data, lambda x: x['metadata']['channelMetadataRenderer'], dict)
    if channel_meta:
        title = channel_meta.get('title') or item_id
        tab_title = selected_tab.get('title')
        if tab_title:
            title = title + ' - %s' % tab_title
        description = channel_meta.get('description')
        playlist_id = channel_meta.get('externalId')

    playlist_meta = try_get(
        data, lambda x: x['metadata']['playlistMetadataRenderer'], dict)
    if playlist_meta:
        # Playlist metadata takes precedence over channel metadata
        title = playlist_meta.get('title')
        description = None
        playlist_id = item_id

    if playlist_id is None:
        playlist_id = item_id
    if title is None:
        title = "Youtube " + playlist_id.title()

    result = self.playlist_result(
        self._entries(selected_tab['content'], identity_token),
        playlist_id=playlist_id,
        playlist_title=title,
        playlist_description=description)
    result.update(self._extract_uploader(data))
    return result
73c4ac2c 3220
def _extract_from_playlist(self, item_id, data, playlist):
    """Build a playlist result from a watch-page *playlist* dict.

    The title comes from ``playlist['title']`` or, failing that, from
    ``data['titleText']['simpleText']``; the id falls back to *item_id*.
    """
    playlist_title = playlist.get('title')
    if not playlist_title:
        playlist_title = try_get(
            data, lambda x: x['titleText']['simpleText'], compat_str)
    resolved_id = playlist.get('playlistId') or item_id
    return self.playlist_result(
        self._playlist_entries(playlist),
        playlist_id=resolved_id,
        playlist_title=playlist_title)
c5e8d7af 3228
def _extract_alerts(self, data):
    """Yield ``(alert_type, message)`` pairs found under ``data['alerts']``.

    Each alert entry is expected to map a renderer name to a dict with a
    ``type`` and a ``text`` that holds either ``simpleText`` or a list of
    ``runs``.  Entries of any other shape are skipped rather than raising,
    so that a malformed alert cannot abort the extraction it only
    annotates.
    """
    for alert_dict in try_get(data, lambda x: x['alerts'], list) or []:
        if not isinstance(alert_dict, dict):
            continue
        for alert in alert_dict.values():
            if not isinstance(alert, dict):
                continue
            alert_type = alert.get('type')
            if not alert_type:
                continue
            message = try_get(alert, lambda x: x['text']['simpleText'], compat_str)
            if message:
                yield alert_type, message
            for run in try_get(alert, lambda x: x['text']['runs'], list) or []:
                message = try_get(run, lambda x: x['text'], compat_str)
                if message:
                    yield alert_type, message
3243
def _real_extract(self, url):
    """Extract a tab, playlist or single video from a YouTube page URL.

    The host is normalised to www.youtube.com.  A bare channel/user URL is
    redirected to its ``/videos`` tab, and a ``watch`` URL that has a
    ``list`` but no ``v`` parameter is redirected to the playlist page.
    With both ``v`` and ``list`` present, ``--no-playlist`` selects the
    single video.  After download the page is dispatched, in order, to tab
    extraction, watch-page playlist extraction, or plain video extraction.

    Raises ExtractorError when the page cannot be recognised.
    """
    item_id = self._match_id(url)
    url = compat_urlparse.urlunparse(
        compat_urlparse.urlparse(url)._replace(netloc='www.youtube.com'))
    is_home = re.match(r'(?P<pre>%s)(?P<post>/?(?![^#?]).*$)' % self._VALID_URL, url)
    # re.match() returns None when the pattern does not match; read the
    # group once behind a guard so neither check below dereferences None.
    not_channel = is_home.group('not_channel') if is_home is not None else None
    if is_home is not None and not_channel is None and item_id != 'feed':
        self._downloader.report_warning(
            'A channel/user page was given. All the channel\'s videos will be downloaded. '
            'To download only the videos in the home page, add a "/home" to the URL')
        url = '%s/videos%s' % (is_home.group('pre'), is_home.group('post') or '')

    # Handle both video/playlist URLs
    qs = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query)
    video_id = qs.get('v', [None])[0]
    playlist_id = qs.get('list', [None])[0]

    if not_channel is not None and not_channel.startswith('watch') and not video_id:
        if playlist_id:
            self._downloader.report_warning('%s is not a valid Youtube URL. Trying to download playlist %s' % (url, playlist_id))
            url = 'https://www.youtube.com/playlist?list=%s' % playlist_id
        else:
            raise ExtractorError('Unable to recognize tab page')
    if video_id and playlist_id:
        if self._downloader.params.get('noplaylist'):
            self.to_screen('Downloading just video %s because of --no-playlist' % video_id)
            return self.url_result(video_id, ie=YoutubeIE.ie_key(), video_id=video_id)
        self.to_screen('Downloading playlist %s - add --no-playlist to just download video %s' % (playlist_id, video_id))

    webpage = self._download_webpage(url, item_id)
    identity_token = self._search_regex(
        r'\bID_TOKEN["\']\s*:\s*["\'](.+?)["\']', webpage,
        'identity token', default=None)
    data = self._extract_yt_initial_data(item_id, webpage)
    for alert_type, alert_message in self._extract_alerts(data):
        self._downloader.report_warning('YouTube said: %s - %s' % (alert_type, alert_message))
    tabs = try_get(
        data, lambda x: x['contents']['twoColumnBrowseResultsRenderer']['tabs'], list)
    if tabs:
        return self._extract_from_tabs(item_id, webpage, data, tabs, identity_token)
    playlist = try_get(
        data, lambda x: x['contents']['twoColumnWatchNextResults']['playlist']['playlist'], dict)
    if playlist:
        return self._extract_from_playlist(item_id, data, playlist)
    # Fallback to video extraction if no playlist alike page is recognized.
    # First check for the current video then try the v attribute of URL query.
    video_id = try_get(
        data, lambda x: x['currentVideoEndpoint']['watchEndpoint']['videoId'],
        compat_str) or video_id
    if video_id:
        return self.url_result(video_id, ie=YoutubeIE.ie_key(), video_id=video_id)
    # Failed to recognize
    raise ExtractorError('Unable to recognize tab page')
c5e8d7af 3297
c5e8d7af 3298
class YoutubePlaylistIE(InfoExtractor):
    # Accepts bare playlist ids and URLs carrying a ``list=`` parameter and
    # hands them over to YoutubeTabIE as a canonical /playlist URL.
    IE_NAME = 'youtube:playlist'
    IE_DESC = 'YouTube.com playlists'
    _VALID_URL = r'''(?x)(?:
                        (?:https?://)?
                        (?:\w+\.)?
                        (?:
                            (?:
                                youtube(?:kids)?\.com|
                                invidio\.us|
                                youtu\.be
                            )
                            /.*?\?.*?\blist=
                        )?
                        (?P<id>%(playlist_id)s)
                     )''' % {'playlist_id': YoutubeBaseInfoExtractor._PLAYLIST_ID_RE}
    _TESTS = [{
        'note': 'issue #673',
        'url': 'PLBB231211A4F62143',
        'info_dict': {
            'title': '[OLD]Team Fortress 2 (Class-based LP)',
            'id': 'PLBB231211A4F62143',
            'uploader': 'Wickydoo',
            'uploader_id': 'UCKSpbfbl5kRQpTdL7kMc-1Q',
        },
        'playlist_mincount': 29,
    }, {
        'url': 'PLtPgu7CB4gbY9oDN3drwC3cMbJggS7dKl',
        'info_dict': {
            'title': 'YDL_safe_search',
            'id': 'PLtPgu7CB4gbY9oDN3drwC3cMbJggS7dKl',
        },
        'playlist_count': 2,
        'skip': 'This playlist is private',
    }, {
        'note': 'embedded',
        'url': 'https://www.youtube.com/embed/videoseries?list=PL6IaIsEjSbf96XFRuNccS_RuEXwNdsoEu',
        'playlist_count': 4,
        'info_dict': {
            'title': 'JODA15',
            'id': 'PL6IaIsEjSbf96XFRuNccS_RuEXwNdsoEu',
            'uploader': 'milan',
            'uploader_id': 'UCEI1-PVPcYXjB73Hfelbmaw',
        }
    }, {
        'url': 'http://www.youtube.com/embed/_xDOZElKyNU?list=PLsyOSbh5bs16vubvKePAQ1x3PhKavfBIl',
        'playlist_mincount': 982,
        'info_dict': {
            'title': '2018 Chinese New Singles (11/6 updated)',
            'id': 'PLsyOSbh5bs16vubvKePAQ1x3PhKavfBIl',
            'uploader': 'LBK',
            'uploader_id': 'UC21nz3_MesPLqtDqwdvnoxA',
        }
    }, {
        'url': 'https://youtu.be/yeWKywCrFtk?list=PL2qgrgXsNUG5ig9cat4ohreBjYLAPC0J5',
        'info_dict': {
            'id': 'yeWKywCrFtk',
            'ext': 'mp4',
            'title': 'Small Scale Baler and Braiding Rugs',
            'uploader': 'Backus-Page House Museum',
            'uploader_id': 'backuspagemuseum',
            'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/backuspagemuseum',
            'upload_date': '20161008',
            'description': 'md5:800c0c78d5eb128500bffd4f0b4f2e8a',
            'categories': ['Nonprofits & Activism'],
            'tags': list,
            'like_count': int,
            'dislike_count': int,
        },
        'params': {
            'noplaylist': True,
            'skip_download': True,
        },
    }, {
        'url': 'https://youtu.be/uWyaPkt-VOI?list=PL9D9FC436B881BA21',
        'only_matching': True,
    }, {
        'url': 'TLGGrESM50VT6acwMjAyMjAxNw',
        'only_matching': True,
    }, {
        # music album playlist
        'url': 'OLAK5uy_m4xAFdmMC5rX3Ji3g93pQe3hqLZw_9LhM',
        'only_matching': True,
    }]

    @classmethod
    def suitable(cls, url):
        """Match *url* only when YoutubeTabIE does not already claim it."""
        if YoutubeTabIE.suitable(url):
            return False
        return super(YoutubePlaylistIE, cls).suitable(url)

    def _real_extract(self, url):
        """Redirect to the canonical playlist page, keeping the URL's query."""
        playlist_id = self._match_id(url)
        parsed_query = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query)
        # A bare playlist id has no query string; synthesise the ``list`` one
        query = parsed_query or {'list': playlist_id}
        playlist_url = update_url_query('https://www.youtube.com/playlist', query)
        return self.url_result(
            playlist_url, ie=YoutubeTabIE.ie_key(), video_id=playlist_id)
3397
3398
class YoutubeYtUserIE(InfoExtractor):
    # Shortcut scheme: ``ytuser:<name>`` is forwarded to the /user/<name> page.
    _VALID_URL = r'ytuser:(?P<id>.+)'
    _TESTS = [{
        'url': 'ytuser:phihag',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Delegate to YoutubeTabIE using the user's page URL."""
        user_id = self._match_id(url)
        user_url = 'https://www.youtube.com/user/%s' % user_id
        return self.url_result(
            user_url, ie=YoutubeTabIE.ie_key(), video_id=user_id)
9558dcec 3411
b05654f0 3412
class YoutubeFavouritesIE(YoutubeBaseInfoExtractor):
    # ``:ytfav`` and its spelling variants map to the "LL" (liked videos) playlist.
    IE_NAME = 'youtube:favorites'
    IE_DESC = 'YouTube.com liked videos, ":ytfav" for short (requires authentication)'
    _VALID_URL = r':ytfav(?:ou?rite)?s?'
    _LOGIN_REQUIRED = True
    _TESTS = [{
        'url': ':ytfav',
        'only_matching': True,
    }, {
        'url': ':ytfavorites',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Delegate to YoutubeTabIE with the liked-videos playlist URL."""
        liked_playlist_url = 'https://www.youtube.com/playlist?list=LL'
        return self.url_result(liked_playlist_url, ie=YoutubeTabIE.ie_key())
3430
3431
class YoutubeSearchIE(SearchInfoExtractor, YoutubeBaseInfoExtractor):
    IE_DESC = 'YouTube.com searches'
    # there doesn't appear to be a real limit, for example if you search for
    # 'python' you get more than 8.000.000 results
    _MAX_RESULTS = float('inf')
    IE_NAME = 'youtube:search'
    _SEARCH_KEY = 'ytsearch'
    _SEARCH_PARAMS = None
    _TESTS = []

    def _entries(self, query, n):
        """Yield up to *n* ``url_transparent`` video entries for *query*.

        Pages are requested from the ``youtubei/v1/search`` endpoint until
        *n* results were yielded, a page fails to download, a page has no
        video results, or no continuation token is returned.
        """
        data = {
            'context': {
                'client': {
                    'clientName': 'WEB',
                    'clientVersion': '2.20201021.03.00',
                }
            },
            'query': query,
        }
        if self._SEARCH_PARAMS:
            data['params'] = self._SEARCH_PARAMS
        total = 0
        for page_num in itertools.count(1):
            search = self._download_json(
                'https://www.youtube.com/youtubei/v1/search?key=AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8',
                video_id='query "%s"' % query,
                note='Downloading page %s' % page_num,
                errnote='Unable to download API page', fatal=False,
                data=json.dumps(data).encode('utf8'),
                headers={'content-type': 'application/json'})
            if not search:
                break
            slr_contents = try_get(
                search,
                (lambda x: x['contents']['twoColumnSearchResultsRenderer']['primaryContents']['sectionListRenderer']['contents'],
                 lambda x: x['onResponseReceivedCommands'][0]['appendContinuationItemsAction']['continuationItems']),
                list)
            if not slr_contents:
                break

            isr_contents = []
            continuation_token = None
            # Youtube sometimes adds promoted content to searches,
            # changing the index location of videos and token.
            # So we search through all entries till we find them.
            for isr in slr_contents:
                if not isr_contents:
                    # try_get() yields None for elements that are not item
                    # sections (e.g. the continuation item); fall back to an
                    # empty list so the scan below cannot fail on None.
                    isr_contents = try_get(
                        isr,
                        lambda x: x['itemSectionRenderer']['contents'],
                        list) or []
                    # Keep this section only if it holds at least one video
                    if not any(
                            isinstance(content, dict)
                            and content.get('videoRenderer') is not None
                            for content in isr_contents):
                        isr_contents = []

                if continuation_token is None:
                    continuation_token = try_get(
                        isr,
                        lambda x: x['continuationItemRenderer']['continuationEndpoint']['continuationCommand'][
                            'token'],
                        compat_str)
                if continuation_token is not None and isr_contents:
                    break

            if not isr_contents:
                break
            for content in isr_contents:
                if not isinstance(content, dict):
                    continue
                video = content.get('videoRenderer')
                if not isinstance(video, dict):
                    continue
                video_id = video.get('videoId')
                if not video_id:
                    continue
                title = try_get(video, lambda x: x['title']['runs'][0]['text'], compat_str)
                description = try_get(video, lambda x: x['descriptionSnippet']['runs'][0]['text'], compat_str)
                duration = parse_duration(try_get(video, lambda x: x['lengthText']['simpleText'], compat_str))
                view_count_text = try_get(video, lambda x: x['viewCountText']['simpleText'], compat_str) or ''
                view_count = int_or_none(self._search_regex(
                    r'^(\d+)', re.sub(r'\s', '', view_count_text),
                    'view count', default=None))
                uploader = try_get(video, lambda x: x['ownerText']['runs'][0]['text'], compat_str)
                total += 1
                yield {
                    '_type': 'url_transparent',
                    'ie_key': YoutubeIE.ie_key(),
                    'id': video_id,
                    'url': video_id,
                    'title': title,
                    'description': description,
                    'duration': duration,
                    'view_count': view_count,
                    'uploader': uploader,
                }
                if total == n:
                    return
            if not continuation_token:
                break
            data['continuation'] = continuation_token

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""
        return self.playlist_result(self._entries(query, n), query)
75dff0ee 3539
c9ae7b95 3540
class YoutubeSearchDateIE(YoutubeSearchIE):
    # Same search as the parent class, but with the ``params`` value that
    # requests newest-first ordering.
    IE_NAME = YoutubeSearchIE.IE_NAME + ':date'
    IE_DESC = 'YouTube.com searches, newest videos first'
    _SEARCH_KEY = 'ytsearchdate'
    _SEARCH_PARAMS = 'CAI%3D'
75dff0ee 3546
c9ae7b95 3547
class YoutubeSearchURLIE(YoutubeSearchIE):
    # Handles regular /results?search_query=... URLs through the search API.
    IE_NAME = YoutubeSearchIE.IE_NAME + '_url'
    IE_DESC = 'YouTube.com search URLs'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/results\?(.*?&)?(?:search_query|q)=(?:[^&]+)(?:[&]|$)'
    # _MAX_RESULTS = 100
    _TESTS = [{
        'url': 'https://www.youtube.com/results?baz=bar&search_query=youtube-dl+test+video&filters=video&lclk=video',
        'playlist_mincount': 5,
        'info_dict': {
            'title': 'youtube-dl test video',
        }
    }, {
        'url': 'https://www.youtube.com/results?q=test&sp=EgQIBBgB',
        'only_matching': True,
    }]

    @classmethod
    def _make_valid_url(cls):
        """Return this class's own ``_VALID_URL`` unchanged."""
        return cls._VALID_URL

    def _real_extract(self, url):
        """Run the search named by the URL's ``search_query``/``q`` parameter.

        The optional ``sp`` parameter is stored as ``_SEARCH_PARAMS`` on the
        instance so that it is sent along as the search filter.
        """
        params = compat_parse_qs(compat_urllib_parse_urlparse(url).query)
        terms = params.get('search_query') or params.get('q')
        query = terms[0]
        self._SEARCH_PARAMS = params.get('sp', ('',))[0]
        return self._get_n_results(query, self._MAX_RESULTS)
3462ffa8 3573
3574
class YoutubeFeedsInfoExtractor(YoutubeTabIE):
    """
    Common base for the feed extractors.
    Each subclass has to provide a _FEED_NAME attribute.
    """
    _LOGIN_REQUIRED = True
    # _MAX_PAGES = 5
    _TESTS = []

    @property
    def IE_NAME(self):
        feed_name = self._FEED_NAME
        return 'youtube:%s' % feed_name

    def _real_initialize(self):
        # Log in before any extraction is attempted
        self._login()

    def _real_extract(self, url):
        """Delegate to YoutubeTabIE with the /feed/<_FEED_NAME> URL."""
        feed_url = 'https://www.youtube.com/feed/%s' % self._FEED_NAME
        return self.url_result(feed_url, ie=YoutubeTabIE.ie_key())
25f14e9f
S
3595
3596
class YoutubeWatchLaterIE(InfoExtractor):
    # ``:ytwatchlater`` maps to the "WL" (watch later) playlist.
    IE_NAME = 'youtube:watchlater'
    IE_DESC = 'Youtube watch later list, ":ytwatchlater" for short (requires authentication)'
    _VALID_URL = r':ytwatchlater'
    _TESTS = [{
        'url': ':ytwatchlater',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Delegate to YoutubeTabIE with the watch-later playlist URL."""
        watch_later_url = 'https://www.youtube.com/playlist?list=WL'
        return self.url_result(watch_later_url, ie=YoutubeTabIE.ie_key())
3462ffa8 3609
3610
25f14e9f
S
class YoutubeRecommendedIE(YoutubeFeedsInfoExtractor):
    # Matches the bare youtube.com front page as well as the ``:ytrec`` shortcut.
    IE_DESC = 'YouTube.com recommended videos, ":ytrec" for short (requires authentication)'
    _FEED_NAME = 'recommended'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/?(?:[?#]|$)|:ytrec(?:ommended)?'
    _TESTS = [{
        'url': ':ytrec',
        'only_matching': True,
    }, {
        'url': ':ytrecommended',
        'only_matching': True,
    }, {
        'url': 'https://youtube.com',
        'only_matching': True,
    }]
1ed5b5c9 3625
1ed5b5c9 3626
class YoutubeSubscriptionsIE(YoutubeFeedsInfoExtractor):
    # ``:ytsubs`` and its longer spellings select the subscriptions feed.
    IE_DESC = 'YouTube.com subscriptions feed, ":ytsubs" for short (requires authentication)'
    _FEED_NAME = 'subscriptions'
    _VALID_URL = r':ytsub(?:scription)?s?'
    _TESTS = [{
        'url': ':ytsubs',
        'only_matching': True,
    }, {
        'url': ':ytsubscriptions',
        'only_matching': True,
    }]
1ed5b5c9 3638
1ed5b5c9 3639
25f14e9f
S
class YoutubeHistoryIE(YoutubeFeedsInfoExtractor):
    # ``:ythistory`` selects the watch-history feed.
    IE_DESC = 'Youtube watch history, ":ythistory" for short (requires authentication)'
    _FEED_NAME = 'history'
    _VALID_URL = r':ythistory'
    _TESTS = [{
        'url': ':ythistory',
        'only_matching': True,
    }]
1ed5b5c9
JMF
3648
3649
15870e90
PH
class YoutubeTruncatedURLIE(InfoExtractor):
    # Catches watch URLs that end right after a non-video parameter and
    # answers with a hint about quoting the URL instead of extracting.
    IE_NAME = 'youtube:truncated_url'
    IE_DESC = False  # Do not list
    _VALID_URL = r'''(?x)
        (?:https?://)?
        (?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie)?\.com/
        (?:watch\?(?:
            feature=[a-z_]+|
            annotation_id=annotation_[^&]+|
            x-yt-cl=[0-9]+|
            hl=[^&]*|
            t=[0-9]+
        )?
        |
            attribution_link\?a=[^&]+
        )
        $
    '''

    _TESTS = [{
        'url': 'https://www.youtube.com/watch?annotation_id=annotation_3951667041',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/watch?',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/watch?x-yt-cl=84503534',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/watch?feature=foo',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/watch?hl=en-GB',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/watch?t=2372',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Always raise an expected ExtractorError with a quoting hint."""
        hint = (
            'Did you forget to quote the URL? Remember that & is a meta '
            'character in most shells, so you want to put the URL in quotes, '
            'like youtube-dl '
            '"https://www.youtube.com/watch?feature=foo&v=BaW_jenozKc" '
            ' or simply youtube-dl BaW_jenozKc .')
        raise ExtractorError(hint, expected=True)
772fd5cc
PH
3697
3698
class YoutubeTruncatedIDIE(InfoExtractor):
    # Catches watch URLs whose ``v`` value is 1 to 10 characters long and
    # reports them as truncated.
    IE_NAME = 'youtube:truncated_id'
    IE_DESC = False  # Do not list
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/watch\?v=(?P<id>[0-9A-Za-z_-]{1,10})$'

    _TESTS = [{
        'url': 'https://www.youtube.com/watch?v=N_708QY7Ob',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Always raise an expected ExtractorError naming the short id."""
        video_id = self._match_id(url)
        message = 'Incomplete YouTube ID %s. URL %s looks truncated.' % (video_id, url)
        raise ExtractorError(message, expected=True)
8bdd16b4 3714
3715
3462ffa8 3716# Do Youtube show urls even exist anymore? I couldn't find any
3717r'''
3718class YoutubeShowIE(YoutubeTabIE):
8bdd16b4 3719 IE_DESC = 'YouTube.com (multi-season) shows'
3720 _VALID_URL = r'https?://(?:www\.)?youtube\.com/show/(?P<id>[^?#]*)'
3721 IE_NAME = 'youtube:show'
3722 _TESTS = [{
3723 'url': 'https://www.youtube.com/show/airdisasters',
3724 'playlist_mincount': 5,
3725 'info_dict': {
3726 'id': 'airdisasters',
3727 'title': 'Air Disasters',
3728 }
3729 }]
3730
3731 def _real_extract(self, url):
3732 playlist_id = self._match_id(url)
3733 return super(YoutubeShowIE, self)._real_extract(
3734 'https://www.youtube.com/show/%s/playlists' % playlist_id)
3462ffa8 3735'''