]> jfr.im git - yt-dlp.git/blame - youtube_dlc/extractor/youtube.py
[yt_live_chat] deactivate for now.
[yt-dlp.git] / youtube_dlc / extractor / youtube.py
CommitLineData
c5e8d7af 1# coding: utf-8
c5e8d7af 2
78caa52a
PH
3from __future__ import unicode_literals
4
5
0ca96d48 6import itertools
c5e8d7af 7import json
c4417ddb 8import os.path
d77ab8e2 9import random
c5e8d7af 10import re
42939b61 11import time
e0df6211 12import traceback
c5e8d7af 13
b05654f0 14from .common import InfoExtractor, SearchInfoExtractor
2b25cb5d 15from ..jsinterp import JSInterpreter
54256267 16from ..swfinterp import SWFInterpreter
4bb4a188 17from ..compat import (
edf3e38e 18 compat_chr,
f8c55c66 19 compat_HTTPError,
8d81f3e3 20 compat_kwargs,
c5e8d7af 21 compat_parse_qs,
7fd002c0
S
22 compat_urllib_parse_unquote,
23 compat_urllib_parse_unquote_plus,
15707c7e 24 compat_urllib_parse_urlencode,
7c80519c 25 compat_urllib_parse_urlparse,
7c61bd36 26 compat_urlparse,
c5e8d7af 27 compat_str,
4bb4a188
PH
28)
29from ..utils import (
27019dbb 30 bool_or_none,
c5e8d7af 31 clean_html,
9b9c5355 32 error_to_compat_str,
351f37c0 33 extract_attributes,
c5e8d7af 34 ExtractorError,
2d30521a 35 float_or_none,
4bb4a188
PH
36 get_element_by_attribute,
37 get_element_by_id,
dd27fd17 38 int_or_none,
94278f72 39 mimetype2ext,
4bb4a188 40 orderedSet,
6310acf5 41 parse_codecs,
b84071c0 42 parse_count,
7c80519c 43 parse_duration,
0cb58b02 44 remove_quotes,
3995d37d 45 remove_start,
cf7e015f 46 smuggle_url,
dbdaaa23 47 str_or_none,
c93d53f5 48 str_to_int,
556dbe7f 49 try_get,
c5e8d7af
PH
50 unescapeHTML,
51 unified_strdate,
cf7e015f 52 unsmuggle_url,
81c2f20b 53 uppercase_escape,
21c340b8 54 url_or_none,
6e6bc8da 55 urlencode_postdata,
c5e8d7af
PH
56)
57
5f6a1245 58
class YoutubeBaseInfoExtractor(InfoExtractor):
    """Provide base functions for Youtube extractors"""
    _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
    _TWOFACTOR_URL = 'https://accounts.google.com/signin/challenge'

    _LOOKUP_URL = 'https://accounts.google.com/_/signin/sl/lookup'
    _CHALLENGE_URL = 'https://accounts.google.com/_/signin/sl/challenge'
    _TFA_URL = 'https://accounts.google.com/_/signin/challenge?hl=en&TL={0}'

    _NETRC_MACHINE = 'youtube'
    # If True it will raise an error if no login info is provided
    _LOGIN_REQUIRED = False

    _PLAYLIST_ID_RE = r'(?:PL|LL|EC|UU|FL|RD|UL|TL|PU|OLAK5uy_)[0-9A-Za-z-_]{10,}'

    # Headers sent along with the "load more" JSON requests
    _YOUTUBE_CLIENT_HEADERS = {
        'x-youtube-client-name': '1',
        'x-youtube-client-version': '1.20200609.04.02',
    }

    def _set_language(self):
        """Set the PREF cookie on .youtube.com (includes hl=en)."""
        self._set_cookie(
            '.youtube.com', 'PREF', 'f1=50000000&f6=8&hl=en',
            # YouTube sets the expire time to about two months
            expire_time=time.time() + 2 * 30 * 24 * 3600)

    def _ids_to_results(self, ids):
        """Return a list of 'Youtube' url_result entries, one per video id."""
        return [
            self.url_result(vid_id, 'Youtube', video_id=vid_id)
            for vid_id in ids]

    def _login(self):
        """
        Attempt to log in to YouTube.
        True is returned if successful or skipped.
        False is returned if login failed.

        If _LOGIN_REQUIRED is set and no authentication was provided, an error is raised.
        """
        username, password = self._get_login_info()
        # No authentication to be performed
        if username is None:
            if self._LOGIN_REQUIRED and self._downloader.params.get('cookiefile') is None:
                raise ExtractorError('No login info available, needed for using %s.' % self.IE_NAME, expected=True)
            if self._downloader.params.get('cookiefile') and False:  # TODO remove 'and False' later - too many people using outdated cookies and open issues, remind them.
                self.to_screen('[Cookies] Reminder - Make sure to always use up to date cookies!')
            return True

        login_page = self._download_webpage(
            self._LOGIN_URL, None,
            note='Downloading login page',
            errnote='unable to fetch login page', fatal=False)
        if login_page is False:
            # Explicit False (not None) so every failure path matches the docstring
            return False

        login_form = self._hidden_inputs(login_page)

        def req(url, f_req, note, errnote):
            """POST the login form plus the JSON-encoded f_req; return parsed JSON or False."""
            data = login_form.copy()
            data.update({
                'pstMsg': 1,
                'checkConnection': 'youtube',
                'checkedDomains': 'youtube',
                'hl': 'en',
                'deviceinfo': '[null,null,null,[],null,"US",null,null,[],"GlifWebSignIn",null,[null,null,[]]]',
                'f.req': json.dumps(f_req),
                'flowName': 'GlifWebSignIn',
                'flowEntry': 'ServiceLogin',
                # TODO: reverse actual botguard identifier generation algo
                'bgRequest': '["identifier",""]',
            })
            return self._download_json(
                url, None, note=note, errnote=errnote,
                # Drop everything preceding the first '[' before JSON parsing
                transform_source=lambda s: re.sub(r'^[^[]*', '', s),
                fatal=False,
                data=urlencode_postdata(data), headers={
                    'Content-Type': 'application/x-www-form-urlencoded;charset=utf-8',
                    'Google-Accounts-XSRF': 1,
                })

        def warn(message):
            self._downloader.report_warning(message)

        # Same URL is embedded in both the lookup and the challenge request
        service_login_url = 'https://accounts.google.com/ServiceLogin?passive=true&continue=https%3A%2F%2Fwww.youtube.com%2Fsignin%3Fnext%3D%252F%26action_handle_signin%3Dtrue%26hl%3Den%26app%3Ddesktop%26feature%3Dsign_in_button&hl=en&service=youtube&uilel=3&requestPath=%2FServiceLogin&Page=PasswordSeparationSignIn'

        lookup_req = [
            username,
            None, [], None, 'US', None, None, 2, False, True,
            [
                None, None,
                [2, 1, None, 1,
                 service_login_url,
                 None, [], 4],
                1, [None, None, []], None, None, None, True
            ],
            username,
        ]

        lookup_results = req(
            self._LOOKUP_URL, lookup_req,
            'Looking up account info', 'Unable to look up account info')

        if lookup_results is False:
            return False

        user_hash = try_get(lookup_results, lambda x: x[0][2], compat_str)
        if not user_hash:
            warn('Unable to extract user hash')
            return False

        challenge_req = [
            user_hash,
            None, 1, None, [1, None, None, None, [password, None, True]],
            [
                None, None, [2, 1, None, 1, service_login_url, None, [], 4],
                1, [None, None, []], None, None, None, True
            ]]

        challenge_results = req(
            self._CHALLENGE_URL, challenge_req,
            'Logging in', 'Unable to log in')

        if challenge_results is False:
            # Explicit False (not None) so every failure path matches the docstring
            return False

        login_res = try_get(challenge_results, lambda x: x[0][5], list)
        if login_res:
            login_msg = try_get(login_res, lambda x: x[5], compat_str)
            # Parenthesised: '%' binds tighter than the conditional expression,
            # so without the parentheses the prefix is lost for other messages
            warn(
                'Unable to login: %s' % (
                    'Invalid password'
                    if login_msg == 'INCORRECT_ANSWER_ENTERED' else login_msg))
            return False

        res = try_get(challenge_results, lambda x: x[0][-1], list)
        if not res:
            warn('Unable to extract result entry')
            return False

        login_challenge = try_get(res, lambda x: x[0][0], list)
        if login_challenge:
            challenge_str = try_get(login_challenge, lambda x: x[2], compat_str)
            if challenge_str == 'TWO_STEP_VERIFICATION':
                # SEND_SUCCESS - TFA code has been successfully sent to phone
                # QUOTA_EXCEEDED - reached the limit of TFA codes
                status = try_get(login_challenge, lambda x: x[5], compat_str)
                if status == 'QUOTA_EXCEEDED':
                    warn('Exceeded the limit of TFA codes, try later')
                    return False

                tl = try_get(challenge_results, lambda x: x[1][2], compat_str)
                if not tl:
                    warn('Unable to extract TL')
                    return False

                tfa_code = self._get_tfa_info('2-step verification code')

                if not tfa_code:
                    warn(
                        'Two-factor authentication required. Provide it either interactively or with --twofactor <code> '
                        '(Note that only TOTP (Google Authenticator App) codes work at this time.)')
                    return False

                tfa_code = remove_start(tfa_code, 'G-')

                tfa_req = [
                    user_hash, None, 2, None,
                    [
                        9, None, None, None, None, None, None, None,
                        [None, tfa_code, True, 2]
                    ]]

                tfa_results = req(
                    self._TFA_URL.format(tl), tfa_req,
                    'Submitting TFA code', 'Unable to submit TFA code')

                if tfa_results is False:
                    return False

                tfa_res = try_get(tfa_results, lambda x: x[0][5], list)
                if tfa_res:
                    tfa_msg = try_get(tfa_res, lambda x: x[5], compat_str)
                    # Parenthesised for the same precedence reason as above
                    warn(
                        'Unable to finish TFA: %s' % (
                            'Invalid TFA code'
                            if tfa_msg == 'INCORRECT_ANSWER_ENTERED' else tfa_msg))
                    return False

                check_cookie_url = try_get(
                    tfa_results, lambda x: x[0][-1][2], compat_str)
            else:
                CHALLENGES = {
                    'LOGIN_CHALLENGE': "This device isn't recognized. For your security, Google wants to make sure it's really you.",
                    'USERNAME_RECOVERY': 'Please provide additional information to aid in the recovery process.',
                    'REAUTH': "There is something unusual about your activity. For your security, Google wants to make sure it's really you.",
                }
                challenge = CHALLENGES.get(
                    challenge_str,
                    '%s returned error %s.' % (self.IE_NAME, challenge_str))
                warn('%s\nGo to https://accounts.google.com/, login and solve a challenge.' % challenge)
                return False
        else:
            check_cookie_url = try_get(res, lambda x: x[2], compat_str)

        if not check_cookie_url:
            warn('Unable to extract CheckCookie URL')
            return False

        check_cookie_results = self._download_webpage(
            check_cookie_url, None, 'Checking cookie', fatal=False)

        if check_cookie_results is False:
            return False

        if 'https://myaccount.google.com/' not in check_cookie_results:
            warn('Unable to log in')
            return False

        return True

    def _download_webpage_handle(self, *args, **kwargs):
        """Delegate to the parent implementation with disable_polymer=true added to the query."""
        # Copy so the caller's query dict is not mutated
        query = kwargs.get('query', {}).copy()
        query['disable_polymer'] = 'true'
        kwargs['query'] = query
        return super(YoutubeBaseInfoExtractor, self)._download_webpage_handle(
            *args, **compat_kwargs(kwargs))

    def _real_initialize(self):
        """Set the language cookie and attempt login; does nothing without a downloader."""
        if self._downloader is None:
            return
        self._set_language()
        if not self._login():
            return
c5e8d7af 288
8377574c 289
class YoutubeEntryListBaseInfoExtractor(YoutubeBaseInfoExtractor):
    # Extract entries from page with "Load more" button
    def _entries(self, page, playlist_id):
        """Yield the entries of `page`, then of each following "Load more" page.

        Entries are produced by self._process_page. Iteration stops when no
        load-more link is found or when a continuation has blank content.
        """
        content_html = page
        more_widget_html = page
        max_retries = 3
        for page_num in itertools.count(1):
            for entry in self._process_page(content_html):
                yield entry

            load_more = re.search(
                r'data-uix-load-more-href="/?(?P<more>[^"]+)"', more_widget_html)
            if load_more is None:
                break

            for attempt in range(max_retries + 1):
                retry_note = ' (retry #%d)' % attempt if attempt else ''
                try:
                    # Downloading page may result in intermittent 5xx HTTP error
                    # that is usually worked around with a retry
                    more = self._download_json(
                        'https://www.youtube.com/%s' % load_more.group('more'), playlist_id,
                        'Downloading page #%s%s' % (page_num, retry_note),
                        transform_source=uppercase_escape,
                        headers=self._YOUTUBE_CLIENT_HEADERS)
                except ExtractorError as e:
                    is_transient = (
                        isinstance(e.cause, compat_HTTPError)
                        and e.cause.code in (500, 503))
                    if is_transient and attempt < max_retries:
                        continue
                    raise
                break

            content_html = more['content_html']
            if not content_html.strip():
                # Some webpages show a "Load more" button but they don't
                # have more videos
                break
            more_widget_html = more['load_more_widget_html']
328
061a75ed
S
329
class YoutubePlaylistBaseInfoExtractor(YoutubeEntryListBaseInfoExtractor):
    def _process_page(self, content):
        """Yield a 'Youtube' url_result for every (id, title) pair found in content."""
        for vid, title in self.extract_videos_from_page(content):
            yield self.url_result(vid, 'Youtube', vid, title)

    def extract_videos_from_page_impl(self, video_re, page, ids_in_page, titles_in_page):
        """Collect video ids and titles matched by video_re into the two lists.

        ids_in_page and titles_in_page are extended in place and kept aligned
        by index. A repeated id is not appended again; its title is filled in
        only if the stored one is empty.
        """
        for match in re.finditer(video_re, page):
            named = match.groupdict()
            # The link with index 0 is not the first video of the playlist (not sure if still actual)
            if 'index' in named and match.group('id') == '0':
                continue

            video_id = match.group('id')
            if 'title' in named:
                video_title = unescapeHTML(match.group('title'))
            else:
                video_title = None
            if video_title:
                video_title = video_title.strip()
            if video_title == '► Play all':
                video_title = None

            try:
                position = ids_in_page.index(video_id)
            except ValueError:
                # First occurrence of this id
                ids_in_page.append(video_id)
                titles_in_page.append(video_title)
            else:
                if video_title and not titles_in_page[position]:
                    titles_in_page[position] = video_title

    def extract_videos_from_page(self, page):
        """Return the zipped (video_id, title) pairs matched by self._VIDEO_RE in page."""
        found_ids, found_titles = [], []
        self.extract_videos_from_page_impl(
            self._VIDEO_RE, page, found_ids, found_titles)
        return zip(found_ids, found_titles)
361
362
061a75ed
S
class YoutubePlaylistsBaseInfoExtractor(YoutubeEntryListBaseInfoExtractor):
    def _process_page(self, content):
        """Yield a 'YoutubePlaylist' url_result for each distinct playlist link in content."""
        playlist_ids = re.findall(
            r'<h3[^>]+class="[^"]*yt-lockup-title[^"]*"[^>]*><a[^>]+href="/?playlist\?list=([0-9A-Za-z-_]{10,})"',
            content)
        for playlist_id in orderedSet(playlist_ids):
            playlist_url = 'https://www.youtube.com/playlist?list=%s' % playlist_id
            yield self.url_result(playlist_url, 'YoutubePlaylist')

    def _real_extract(self, url):
        """Download the page at url and return its playlists as a playlist result."""
        playlist_id = self._match_id(url)
        webpage = self._download_webpage(url, playlist_id)
        # The title is optional: a missing og:title is not fatal
        title = self._og_search_title(webpage, fatal=False)
        entries = self._entries(webpage, playlist_id)
        return self.playlist_result(entries, playlist_id, title)
0c148415
S
376
377
360e1ca5 378class YoutubeIE(YoutubeBaseInfoExtractor):
78caa52a 379 IE_DESC = 'YouTube.com'
cb7dfeea 380 _VALID_URL = r"""(?x)^
c5e8d7af 381 (
edb53e2d 382 (?:https?://|//) # http(s):// or protocol-independent URL
66b48727 383 (?:(?:(?:(?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie|kids)?\.com/|
484aaeb2 384 (?:www\.)?deturl\.com/www\.youtube\.com/|
e70dc1d1 385 (?:www\.)?pwnyoutube\.com/|
8b561bfc 386 (?:www\.)?hooktube\.com/|
f7000f3a 387 (?:www\.)?yourepeat\.com/|
e69ae5b9 388 tube\.majestyc\.net/|
ba036333 389 # Invidious instances taken from https://github.com/omarroth/invidious/wiki/Invidious-Instances
77d95677 390 (?:(?:www|dev)\.)?invidio\.us/|
ba036333 391 (?:(?:www|no)\.)?invidiou\.sh/|
392 (?:(?:www|fi|de)\.)?invidious\.snopyta\.org/|
8ae113ca 393 (?:www\.)?invidious\.kabi\.tk/|
ba036333 394 (?:www\.)?invidious\.13ad\.de/|
791d2e81 395 (?:www\.)?invidious\.mastodon\.host/|
494d664e 396 (?:www\.)?invidious\.nixnet\.xyz/|
666d808e 397 (?:www\.)?invidious\.drycat\.fr/|
ba036333 398 (?:www\.)?tube\.poal\.co/|
8ae113ca 399 (?:www\.)?vid\.wxzm\.sx/|
384bf91f 400 (?:www\.)?yewtu\.be/|
494d664e 401 (?:www\.)?yt\.elukerio\.org/|
894b3826 402 (?:www\.)?yt\.lelux\.fi/|
1db5ab6b 403 (?:www\.)?invidious\.ggc-project\.de/|
404 (?:www\.)?yt\.maisputain\.ovh/|
405 (?:www\.)?invidious\.13ad\.de/|
406 (?:www\.)?invidious\.toot\.koeln/|
407 (?:www\.)?invidious\.fdn\.fr/|
408 (?:www\.)?watch\.nettohikari\.com/|
bff90fc5 409 (?:www\.)?kgg2m7yk5aybusll\.onion/|
410 (?:www\.)?qklhadlycap4cnod\.onion/|
411 (?:www\.)?axqzx4s6s54s32yentfqojs3x5i7faxza6xo3ehd4bzzsg2ii4fv2iid\.onion/|
412 (?:www\.)?c7hqkpkpemu6e7emz5b4vyz7idjgdvgaaa3dyimmeojqbgpea3xqjoid\.onion/|
413 (?:www\.)?fz253lmuao3strwbfbmx46yu7acac2jz27iwtorgmbqlkurlclmancad\.onion/|
414 (?:www\.)?invidious\.l4qlywnpwqsluw65ts7md3khrivpirse744un3x7mlskqauz5pyuzgqd\.onion/|
33c1c7d8 415 (?:www\.)?owxfohz4kjyv25fvlqilyxast7inivgiktls3th44jhk3ej3i7ya\.b32\.i2p/|
1db5ab6b 416 (?:www\.)?4l2dgddgsrkf2ous66i6seeyi6etzfgrue332grh2n7madpwopotugyd\.onion/|
e69ae5b9 417 youtube\.googleapis\.com/) # the various hostnames, with wildcard subdomains
c5e8d7af
PH
418 (?:.*?\#/)? # handle anchor (#/) redirect urls
419 (?: # the various things that can precede the ID:
ac7553d0 420 (?:(?:v|embed|e)/(?!videoseries)) # v/ or embed/ or e/
c5e8d7af 421 |(?: # or the v= param in all its forms
f7000f3a 422 (?:(?:watch|movie)(?:_popup)?(?:\.php)?/?)? # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
c5e8d7af 423 (?:\?|\#!?) # the params delimiter ? or # or #!
040ac686 424 (?:.*?[&;])?? # any other preceding param (like /?s=tuff&v=xxxx or ?s=tuff&amp;v=V36LpHqtcDY)
c5e8d7af
PH
425 v=
426 )
f4b05232 427 ))
cbaed4bb
S
428 |(?:
429 youtu\.be| # just youtu.be/xxxx
6d4fc66b
S
430 vid\.plus| # or vid.plus/xxxx
431 zwearz\.com/watch| # or zwearz.com/watch/xxxx
cbaed4bb 432 )/
edb53e2d 433 |(?:www\.)?cleanvideosearch\.com/media/action/yt/watch\?videoId=
f4b05232 434 )
c5e8d7af 435 )? # all until now is optional -> you can pass the naked ID
8963d9c2 436 ([0-9A-Za-z_-]{11}) # here is it! the YouTube video ID
d0ba5587
S
437 (?!.*?\blist=
438 (?:
439 %(playlist_id)s| # combined list/video URLs are handled by the playlist IE
440 WL # WL are handled by the watch later IE
441 )
442 )
c5e8d7af 443 (?(1).+)? # if we found the ID, everything can follow
d0ba5587 444 $""" % {'playlist_id': YoutubeBaseInfoExtractor._PLAYLIST_ID_RE}
c5e8d7af 445 _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
e40c758c
S
446 _PLAYER_INFO_RE = (
447 r'/(?P<id>[a-zA-Z0-9_-]{8,})/player_ias\.vflset(?:/[a-zA-Z]{2,3}_[a-zA-Z]{2,3})?/base\.(?P<ext>[a-z]+)$',
448 r'\b(?P<id>vfl[a-zA-Z0-9_-]+)\b.*?\.(?P<ext>[a-z]+)$',
449 )
2c62dc26 450 _formats = {
c2d3cb4c 451 '5': {'ext': 'flv', 'width': 400, 'height': 240, 'acodec': 'mp3', 'abr': 64, 'vcodec': 'h263'},
452 '6': {'ext': 'flv', 'width': 450, 'height': 270, 'acodec': 'mp3', 'abr': 64, 'vcodec': 'h263'},
453 '13': {'ext': '3gp', 'acodec': 'aac', 'vcodec': 'mp4v'},
454 '17': {'ext': '3gp', 'width': 176, 'height': 144, 'acodec': 'aac', 'abr': 24, 'vcodec': 'mp4v'},
455 '18': {'ext': 'mp4', 'width': 640, 'height': 360, 'acodec': 'aac', 'abr': 96, 'vcodec': 'h264'},
456 '22': {'ext': 'mp4', 'width': 1280, 'height': 720, 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264'},
457 '34': {'ext': 'flv', 'width': 640, 'height': 360, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'},
458 '35': {'ext': 'flv', 'width': 854, 'height': 480, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'},
3834d3e3 459 # itag 36 videos are either 320x180 (BaW_jenozKc) or 320x240 (__2ABJjxzNo), abr varies as well
c2d3cb4c 460 '36': {'ext': '3gp', 'width': 320, 'acodec': 'aac', 'vcodec': 'mp4v'},
461 '37': {'ext': 'mp4', 'width': 1920, 'height': 1080, 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264'},
462 '38': {'ext': 'mp4', 'width': 4096, 'height': 3072, 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264'},
463 '43': {'ext': 'webm', 'width': 640, 'height': 360, 'acodec': 'vorbis', 'abr': 128, 'vcodec': 'vp8'},
464 '44': {'ext': 'webm', 'width': 854, 'height': 480, 'acodec': 'vorbis', 'abr': 128, 'vcodec': 'vp8'},
465 '45': {'ext': 'webm', 'width': 1280, 'height': 720, 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8'},
e1a0bfdf 466 '46': {'ext': 'webm', 'width': 1920, 'height': 1080, 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8'},
c2d3cb4c 467 '59': {'ext': 'mp4', 'width': 854, 'height': 480, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'},
468 '78': {'ext': 'mp4', 'width': 854, 'height': 480, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'},
e1a0bfdf 469
470
471 # 3D videos
c2d3cb4c 472 '82': {'ext': 'mp4', 'height': 360, 'format_note': '3D', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -20},
473 '83': {'ext': 'mp4', 'height': 480, 'format_note': '3D', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -20},
474 '84': {'ext': 'mp4', 'height': 720, 'format_note': '3D', 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264', 'preference': -20},
475 '85': {'ext': 'mp4', 'height': 1080, 'format_note': '3D', 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264', 'preference': -20},
e1a0bfdf 476 '100': {'ext': 'webm', 'height': 360, 'format_note': '3D', 'acodec': 'vorbis', 'abr': 128, 'vcodec': 'vp8', 'preference': -20},
477 '101': {'ext': 'webm', 'height': 480, 'format_note': '3D', 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8', 'preference': -20},
478 '102': {'ext': 'webm', 'height': 720, 'format_note': '3D', 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8', 'preference': -20},
836a086c 479
96fb5605 480 # Apple HTTP Live Streaming
11f12195 481 '91': {'ext': 'mp4', 'height': 144, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 48, 'vcodec': 'h264', 'preference': -10},
c2d3cb4c 482 '92': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 48, 'vcodec': 'h264', 'preference': -10},
483 '93': {'ext': 'mp4', 'height': 360, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -10},
484 '94': {'ext': 'mp4', 'height': 480, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -10},
485 '95': {'ext': 'mp4', 'height': 720, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 256, 'vcodec': 'h264', 'preference': -10},
486 '96': {'ext': 'mp4', 'height': 1080, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 256, 'vcodec': 'h264', 'preference': -10},
e1a0bfdf 487 '132': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 48, 'vcodec': 'h264', 'preference': -10},
488 '151': {'ext': 'mp4', 'height': 72, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 24, 'vcodec': 'h264', 'preference': -10},
2c62dc26
PH
489
490 # DASH mp4 video
d23028a8
S
491 '133': {'ext': 'mp4', 'height': 240, 'format_note': 'DASH video', 'vcodec': 'h264'},
492 '134': {'ext': 'mp4', 'height': 360, 'format_note': 'DASH video', 'vcodec': 'h264'},
493 '135': {'ext': 'mp4', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'h264'},
494 '136': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'h264'},
495 '137': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'h264'},
067aa17e 496 '138': {'ext': 'mp4', 'format_note': 'DASH video', 'vcodec': 'h264'}, # Height can vary (https://github.com/ytdl-org/youtube-dl/issues/4559)
d23028a8
S
497 '160': {'ext': 'mp4', 'height': 144, 'format_note': 'DASH video', 'vcodec': 'h264'},
498 '212': {'ext': 'mp4', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'h264'},
499 '264': {'ext': 'mp4', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'h264'},
500 '298': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'h264', 'fps': 60},
501 '299': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'h264', 'fps': 60},
502 '266': {'ext': 'mp4', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'h264'},
836a086c 503
f6f1fc92 504 # Dash mp4 audio
d23028a8
S
505 '139': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 48, 'container': 'm4a_dash'},
506 '140': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 128, 'container': 'm4a_dash'},
507 '141': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 256, 'container': 'm4a_dash'},
508 '256': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'container': 'm4a_dash'},
509 '258': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'container': 'm4a_dash'},
510 '325': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'dtse', 'container': 'm4a_dash'},
511 '328': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'ec-3', 'container': 'm4a_dash'},
836a086c
AZ
512
513 # Dash webm
d23028a8
S
514 '167': {'ext': 'webm', 'height': 360, 'width': 640, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
515 '168': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
516 '169': {'ext': 'webm', 'height': 720, 'width': 1280, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
517 '170': {'ext': 'webm', 'height': 1080, 'width': 1920, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
518 '218': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
519 '219': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
520 '278': {'ext': 'webm', 'height': 144, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp9'},
521 '242': {'ext': 'webm', 'height': 240, 'format_note': 'DASH video', 'vcodec': 'vp9'},
522 '243': {'ext': 'webm', 'height': 360, 'format_note': 'DASH video', 'vcodec': 'vp9'},
523 '244': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'vp9'},
524 '245': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'vp9'},
525 '246': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'vp9'},
526 '247': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'vp9'},
527 '248': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'vp9'},
528 '271': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'vp9'},
4c6b4764 529 # itag 272 videos are either 3840x2160 (e.g. RtoitU2A-3E) or 7680x4320 (sLprVF6d7Ug)
d23028a8
S
530 '272': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'vp9'},
531 '302': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60},
532 '303': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60},
533 '308': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60},
534 '313': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'vp9'},
535 '315': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60},
2c62dc26
PH
536
537 # Dash webm audio
d23028a8
S
538 '171': {'ext': 'webm', 'acodec': 'vorbis', 'format_note': 'DASH audio', 'abr': 128},
539 '172': {'ext': 'webm', 'acodec': 'vorbis', 'format_note': 'DASH audio', 'abr': 256},
ce6b9a2d 540
0857baad 541 # Dash webm audio with opus inside
d23028a8
S
542 '249': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 50},
543 '250': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 70},
544 '251': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 160},
0857baad 545
ce6b9a2d
PH
546 # RTMP (unnamed)
547 '_rtmp': {'protocol': 'rtmp'},
b85eae0f
S
548
549 # av01 video only formats sometimes served with "unknown" codecs
550 '394': {'acodec': 'none', 'vcodec': 'av01.0.05M.08'},
551 '395': {'acodec': 'none', 'vcodec': 'av01.0.05M.08'},
552 '396': {'acodec': 'none', 'vcodec': 'av01.0.05M.08'},
553 '397': {'acodec': 'none', 'vcodec': 'av01.0.05M.08'},
c5e8d7af 554 }
84da5d84 555 _SUBTITLE_FORMATS = ('json3', 'srv1', 'srv2', 'srv3', 'ttml', 'vtt')
836a086c 556
fd5c4aab
S
557 _GEO_BYPASS = False
558
78caa52a 559 IE_NAME = 'youtube'
2eb88d95
PH
560 _TESTS = [
561 {
2d3d2997 562 'url': 'https://www.youtube.com/watch?v=BaW_jenozKc&t=1s&end=9',
4bc3a23e
PH
563 'info_dict': {
564 'id': 'BaW_jenozKc',
565 'ext': 'mp4',
3867038a 566 'title': 'youtube-dl test video "\'/\\ä↭𝕐',
4bc3a23e
PH
567 'uploader': 'Philipp Hagemeister',
568 'uploader_id': 'phihag',
ec85ded8 569 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/phihag',
dd4c4492
S
570 'channel_id': 'UCLqxVugv74EIW3VWh2NOa3Q',
571 'channel_url': r're:https?://(?:www\.)?youtube\.com/channel/UCLqxVugv74EIW3VWh2NOa3Q',
4bc3a23e 572 'upload_date': '20121002',
3867038a 573 'description': 'test chars: "\'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de .',
4bc3a23e 574 'categories': ['Science & Technology'],
3867038a 575 'tags': ['youtube-dl'],
556dbe7f 576 'duration': 10,
dbdaaa23 577 'view_count': int,
3e7c1224
PH
578 'like_count': int,
579 'dislike_count': int,
7c80519c 580 'start_time': 1,
297a564b 581 'end_time': 9,
2eb88d95 582 }
0e853ca4 583 },
fccd3771 584 {
4bc3a23e
PH
585 'url': '//www.YouTube.com/watch?v=yZIXLfi8CZQ',
586 'note': 'Embed-only video (#1746)',
587 'info_dict': {
588 'id': 'yZIXLfi8CZQ',
589 'ext': 'mp4',
590 'upload_date': '20120608',
591 'title': 'Principal Sexually Assaults A Teacher - Episode 117 - 8th June 2012',
592 'description': 'md5:09b78bd971f1e3e289601dfba15ca4f7',
593 'uploader': 'SET India',
94bfcd23 594 'uploader_id': 'setindia',
ec85ded8 595 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/setindia',
94bfcd23 596 'age_limit': 18,
fccd3771
PH
597 }
598 },
11b56058 599 {
2d3d2997 600 'url': 'https://www.youtube.com/watch?v=BaW_jenozKc&v=UxxajLWwzqY',
11b56058
PM
601 'note': 'Use the first video ID in the URL',
602 'info_dict': {
603 'id': 'BaW_jenozKc',
604 'ext': 'mp4',
3867038a 605 'title': 'youtube-dl test video "\'/\\ä↭𝕐',
11b56058
PM
606 'uploader': 'Philipp Hagemeister',
607 'uploader_id': 'phihag',
ec85ded8 608 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/phihag',
11b56058 609 'upload_date': '20121002',
3867038a 610 'description': 'test chars: "\'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de .',
11b56058 611 'categories': ['Science & Technology'],
3867038a 612 'tags': ['youtube-dl'],
556dbe7f 613 'duration': 10,
dbdaaa23 614 'view_count': int,
11b56058
PM
615 'like_count': int,
616 'dislike_count': int,
34a7de29
S
617 },
618 'params': {
619 'skip_download': True,
620 },
11b56058 621 },
dd27fd17 622 {
2d3d2997 623 'url': 'https://www.youtube.com/watch?v=a9LDPn-MO4I',
4bc3a23e
PH
624 'note': '256k DASH audio (format 141) via DASH manifest',
625 'info_dict': {
626 'id': 'a9LDPn-MO4I',
627 'ext': 'm4a',
628 'upload_date': '20121002',
629 'uploader_id': '8KVIDEO',
ec85ded8 630 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/8KVIDEO',
4bc3a23e
PH
631 'description': '',
632 'uploader': '8KVIDEO',
633 'title': 'UHDTV TEST 8K VIDEO.mp4'
4919603f 634 },
4bc3a23e
PH
635 'params': {
636 'youtube_include_dash_manifest': True,
637 'format': '141',
4919603f 638 },
de3c7fe0 639 'skip': 'format 141 not served anymore',
dd27fd17 640 },
aa79ac0c
PH
641 # Controversy video
642 {
643 'url': 'https://www.youtube.com/watch?v=T4XJQO3qol8',
644 'info_dict': {
645 'id': 'T4XJQO3qol8',
646 'ext': 'mp4',
556dbe7f 647 'duration': 219,
aa79ac0c 648 'upload_date': '20100909',
4fe54c12 649 'uploader': 'Amazing Atheist',
aa79ac0c 650 'uploader_id': 'TheAmazingAtheist',
ec85ded8 651 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/TheAmazingAtheist',
aa79ac0c
PH
652 'title': 'Burning Everyone\'s Koran',
653 'description': 'SUBSCRIBE: http://www.youtube.com/saturninefilms\n\nEven Obama has taken a stand against freedom on this issue: http://www.huffingtonpost.com/2010/09/09/obama-gma-interview-quran_n_710282.html',
654 }
c522adb1 655 },
dd2d55f1 656 # Normal age-gate video (embed allowed)
c522adb1 657 {
2d3d2997 658 'url': 'https://youtube.com/watch?v=HtVdAasjOgU',
c522adb1
JMF
659 'info_dict': {
660 'id': 'HtVdAasjOgU',
661 'ext': 'mp4',
662 'title': 'The Witcher 3: Wild Hunt - The Sword Of Destiny Trailer',
ec85ded8 663 'description': r're:(?s).{100,}About the Game\n.*?The Witcher 3: Wild Hunt.{100,}',
556dbe7f 664 'duration': 142,
c522adb1
JMF
665 'uploader': 'The Witcher',
666 'uploader_id': 'WitcherGame',
ec85ded8 667 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/WitcherGame',
c522adb1 668 'upload_date': '20140605',
34952f09 669 'age_limit': 18,
c522adb1
JMF
670 },
671 },
067aa17e 672 # Olympics (https://github.com/ytdl-org/youtube-dl/issues/4431)
e52a40ab
PH
673 {
674 'url': 'lqQg6PlCWgI',
675 'info_dict': {
676 'id': 'lqQg6PlCWgI',
677 'ext': 'mp4',
556dbe7f 678 'duration': 6085,
90227264 679 'upload_date': '20150827',
cbe2bd91 680 'uploader_id': 'olympic',
ec85ded8 681 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/olympic',
cbe2bd91 682 'description': 'HO09 - Women - GER-AUS - Hockey - 31 July 2012 - London 2012 Olympic Games',
be49068d 683 'uploader': 'Olympic',
cbe2bd91
PH
684 'title': 'Hockey - Women - GER-AUS - London 2012 Olympic Games',
685 },
686 'params': {
687 'skip_download': 'requires avconv',
e52a40ab 688 }
cbe2bd91 689 },
6271f1ca
PH
690 # Non-square pixels
691 {
692 'url': 'https://www.youtube.com/watch?v=_b-2C3KPAM0',
693 'info_dict': {
694 'id': '_b-2C3KPAM0',
695 'ext': 'mp4',
696 'stretched_ratio': 16 / 9.,
556dbe7f 697 'duration': 85,
6271f1ca
PH
698 'upload_date': '20110310',
699 'uploader_id': 'AllenMeow',
ec85ded8 700 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/AllenMeow',
6271f1ca 701 'description': 'made by Wacom from Korea | 字幕&加油添醋 by TY\'s Allen | 感謝heylisa00cavey1001同學熱情提供梗及翻譯',
eb6793ba 702 'uploader': '孫ᄋᄅ',
6271f1ca
PH
703 'title': '[A-made] 變態妍字幕版 太妍 我就是這樣的人',
704 },
06b491eb
S
705 },
706 # url_encoded_fmt_stream_map is empty string
707 {
708 'url': 'qEJwOuvDf7I',
709 'info_dict': {
710 'id': 'qEJwOuvDf7I',
f57b7835 711 'ext': 'webm',
06b491eb
S
712 'title': 'Обсуждение судебной практики по выборам 14 сентября 2014 года в Санкт-Петербурге',
713 'description': '',
714 'upload_date': '20150404',
715 'uploader_id': 'spbelect',
716 'uploader': 'Наблюдатели Петербурга',
717 },
718 'params': {
719 'skip_download': 'requires avconv',
e323cf3f
S
720 },
721 'skip': 'This live event has ended.',
06b491eb 722 },
067aa17e 723 # Extraction from multiple DASH manifests (https://github.com/ytdl-org/youtube-dl/pull/6097)
da77d856
S
724 {
725 'url': 'https://www.youtube.com/watch?v=FIl7x6_3R5Y',
726 'info_dict': {
727 'id': 'FIl7x6_3R5Y',
eb6793ba 728 'ext': 'webm',
da77d856
S
729 'title': 'md5:7b81415841e02ecd4313668cde88737a',
730 'description': 'md5:116377fd2963b81ec4ce64b542173306',
556dbe7f 731 'duration': 220,
da77d856
S
732 'upload_date': '20150625',
733 'uploader_id': 'dorappi2000',
ec85ded8 734 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/dorappi2000',
da77d856 735 'uploader': 'dorappi2000',
eb6793ba 736 'formats': 'mincount:31',
da77d856 737 },
eb6793ba 738 'skip': 'not actual anymore',
2ee8f5d8 739 },
8a1a26ce
YCH
740 # DASH manifest with segment_list
741 {
742 'url': 'https://www.youtube.com/embed/CsmdDsKjzN8',
743 'md5': '8ce563a1d667b599d21064e982ab9e31',
744 'info_dict': {
745 'id': 'CsmdDsKjzN8',
746 'ext': 'mp4',
17ee98e1 747 'upload_date': '20150501', # According to '<meta itemprop="datePublished"', but in other places it's 20150510
8a1a26ce
YCH
748 'uploader': 'Airtek',
749 'description': 'Retransmisión en directo de la XVIII media maratón de Zaragoza.',
750 'uploader_id': 'UCzTzUmjXxxacNnL8I3m4LnQ',
751 'title': 'Retransmisión XVIII Media maratón Zaragoza 2015',
752 },
753 'params': {
754 'youtube_include_dash_manifest': True,
755 'format': '135', # bestvideo
be49068d
S
756 },
757 'skip': 'This live event has ended.',
2ee8f5d8 758 },
cf7e015f
S
759 {
760 # Multifeed videos (multiple cameras), URL is for Main Camera
761 'url': 'https://www.youtube.com/watch?v=jqWvoWXjCVs',
762 'info_dict': {
763 'id': 'jqWvoWXjCVs',
764 'title': 'teamPGP: Rocket League Noob Stream',
765 'description': 'md5:dc7872fb300e143831327f1bae3af010',
766 },
767 'playlist': [{
768 'info_dict': {
769 'id': 'jqWvoWXjCVs',
770 'ext': 'mp4',
771 'title': 'teamPGP: Rocket League Noob Stream (Main Camera)',
772 'description': 'md5:dc7872fb300e143831327f1bae3af010',
556dbe7f 773 'duration': 7335,
cf7e015f
S
774 'upload_date': '20150721',
775 'uploader': 'Beer Games Beer',
776 'uploader_id': 'beergamesbeer',
ec85ded8 777 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/beergamesbeer',
7caf9830 778 'license': 'Standard YouTube License',
cf7e015f
S
779 },
780 }, {
781 'info_dict': {
782 'id': '6h8e8xoXJzg',
783 'ext': 'mp4',
784 'title': 'teamPGP: Rocket League Noob Stream (kreestuh)',
785 'description': 'md5:dc7872fb300e143831327f1bae3af010',
556dbe7f 786 'duration': 7337,
cf7e015f
S
787 'upload_date': '20150721',
788 'uploader': 'Beer Games Beer',
789 'uploader_id': 'beergamesbeer',
ec85ded8 790 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/beergamesbeer',
7caf9830 791 'license': 'Standard YouTube License',
cf7e015f
S
792 },
793 }, {
794 'info_dict': {
795 'id': 'PUOgX5z9xZw',
796 'ext': 'mp4',
797 'title': 'teamPGP: Rocket League Noob Stream (grizzle)',
798 'description': 'md5:dc7872fb300e143831327f1bae3af010',
556dbe7f 799 'duration': 7337,
cf7e015f
S
800 'upload_date': '20150721',
801 'uploader': 'Beer Games Beer',
802 'uploader_id': 'beergamesbeer',
ec85ded8 803 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/beergamesbeer',
7caf9830 804 'license': 'Standard YouTube License',
cf7e015f
S
805 },
806 }, {
807 'info_dict': {
808 'id': 'teuwxikvS5k',
809 'ext': 'mp4',
810 'title': 'teamPGP: Rocket League Noob Stream (zim)',
811 'description': 'md5:dc7872fb300e143831327f1bae3af010',
556dbe7f 812 'duration': 7334,
cf7e015f
S
813 'upload_date': '20150721',
814 'uploader': 'Beer Games Beer',
815 'uploader_id': 'beergamesbeer',
ec85ded8 816 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/beergamesbeer',
7caf9830 817 'license': 'Standard YouTube License',
cf7e015f
S
818 },
819 }],
820 'params': {
821 'skip_download': True,
822 },
4fe54c12 823 'skip': 'This video is not available.',
cbaed4bb 824 },
f9f49d87 825 {
067aa17e 826 # Multifeed video with comma in title (see https://github.com/ytdl-org/youtube-dl/issues/8536)
f9f49d87
S
827 'url': 'https://www.youtube.com/watch?v=gVfLd0zydlo',
828 'info_dict': {
829 'id': 'gVfLd0zydlo',
830 'title': 'DevConf.cz 2016 Day 2 Workshops 1 14:00 - 15:30',
831 },
832 'playlist_count': 2,
be49068d 833 'skip': 'Not multifeed anymore',
f9f49d87 834 },
cbaed4bb 835 {
2d3d2997 836 'url': 'https://vid.plus/FlRa-iH7PGw',
cbaed4bb 837 'only_matching': True,
0e49d9a6 838 },
6d4fc66b 839 {
2d3d2997 840 'url': 'https://zwearz.com/watch/9lWxNJF-ufM/electra-woman-dyna-girl-official-trailer-grace-helbig.html',
6d4fc66b
S
841 'only_matching': True,
842 },
0e49d9a6 843 {
067aa17e 844 # Title with JS-like syntax "};" (see https://github.com/ytdl-org/youtube-dl/issues/7468)
a8776b10 845 # Also tests cut-off URL expansion in video description (see
067aa17e
S
846 # https://github.com/ytdl-org/youtube-dl/issues/1892,
847 # https://github.com/ytdl-org/youtube-dl/issues/8164)
0e49d9a6
LL
848 'url': 'https://www.youtube.com/watch?v=lsguqyKfVQg',
849 'info_dict': {
850 'id': 'lsguqyKfVQg',
851 'ext': 'mp4',
852 'title': '{dark walk}; Loki/AC/Dishonored; collab w/Elflover21',
eb6793ba 853 'alt_title': 'Dark Walk - Position Music',
0e49d9a6 854 'description': 'md5:8085699c11dc3f597ce0410b0dcbb34a',
556dbe7f 855 'duration': 133,
0e49d9a6
LL
856 'upload_date': '20151119',
857 'uploader_id': 'IronSoulElf',
ec85ded8 858 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/IronSoulElf',
0e49d9a6 859 'uploader': 'IronSoulElf',
eb6793ba
S
860 'creator': 'Todd Haberman, Daniel Law Heath and Aaron Kaplan',
861 'track': 'Dark Walk - Position Music',
862 'artist': 'Todd Haberman, Daniel Law Heath and Aaron Kaplan',
92bc97d3 863 'album': 'Position Music - Production Music Vol. 143 - Dark Walk',
0e49d9a6
LL
864 },
865 'params': {
866 'skip_download': True,
867 },
868 },
61f92af1 869 {
067aa17e 870 # Tags with '};' (see https://github.com/ytdl-org/youtube-dl/issues/7468)
61f92af1
S
871 'url': 'https://www.youtube.com/watch?v=Ms7iBXnlUO8',
872 'only_matching': True,
873 },
313dfc45
LL
874 {
875 # Video with yt:stretch=17:0
876 'url': 'https://www.youtube.com/watch?v=Q39EVAstoRM',
877 'info_dict': {
878 'id': 'Q39EVAstoRM',
879 'ext': 'mp4',
880 'title': 'Clash Of Clans#14 Dicas De Ataque Para CV 4',
881 'description': 'md5:ee18a25c350637c8faff806845bddee9',
882 'upload_date': '20151107',
883 'uploader_id': 'UCCr7TALkRbo3EtFzETQF1LA',
884 'uploader': 'CH GAMER DROID',
885 },
886 'params': {
887 'skip_download': True,
888 },
be49068d 889 'skip': 'This video does not exist.',
313dfc45 890 },
7caf9830
S
891 {
892 # Video licensed under Creative Commons
893 'url': 'https://www.youtube.com/watch?v=M4gD1WSo5mA',
894 'info_dict': {
895 'id': 'M4gD1WSo5mA',
896 'ext': 'mp4',
897 'title': 'md5:e41008789470fc2533a3252216f1c1d1',
898 'description': 'md5:a677553cf0840649b731a3024aeff4cc',
556dbe7f 899 'duration': 721,
7caf9830
S
900 'upload_date': '20150127',
901 'uploader_id': 'BerkmanCenter',
ec85ded8 902 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/BerkmanCenter',
556dbe7f 903 'uploader': 'The Berkman Klein Center for Internet & Society',
7caf9830
S
904 'license': 'Creative Commons Attribution license (reuse allowed)',
905 },
906 'params': {
907 'skip_download': True,
908 },
909 },
fd050249
S
910 {
911 # Channel-like uploader_url
912 'url': 'https://www.youtube.com/watch?v=eQcmzGIKrzg',
913 'info_dict': {
914 'id': 'eQcmzGIKrzg',
915 'ext': 'mp4',
916 'title': 'Democratic Socialism and Foreign Policy | Bernie Sanders',
917 'description': 'md5:dda0d780d5a6e120758d1711d062a867',
556dbe7f 918 'duration': 4060,
fd050249 919 'upload_date': '20151119',
eb6793ba 920 'uploader': 'Bernie Sanders',
fd050249 921 'uploader_id': 'UCH1dpzjCEiGAt8CXkryhkZg',
ec85ded8 922 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UCH1dpzjCEiGAt8CXkryhkZg',
fd050249
S
923 'license': 'Creative Commons Attribution license (reuse allowed)',
924 },
925 'params': {
926 'skip_download': True,
927 },
928 },
040ac686
S
929 {
930 'url': 'https://www.youtube.com/watch?feature=player_embedded&amp;amp;v=V36LpHqtcDY',
931 'only_matching': True,
7f29cf54
S
932 },
933 {
067aa17e 934 # YouTube Red paid video (https://github.com/ytdl-org/youtube-dl/issues/10059)
7f29cf54
S
935 'url': 'https://www.youtube.com/watch?v=i1Ko8UG-Tdo',
936 'only_matching': True,
6496ccb4
S
937 },
938 {
939 # Rental video preview
940 'url': 'https://www.youtube.com/watch?v=yYr8q0y5Jfg',
941 'info_dict': {
942 'id': 'uGpuVWrhIzE',
943 'ext': 'mp4',
944 'title': 'Piku - Trailer',
945 'description': 'md5:c36bd60c3fd6f1954086c083c72092eb',
946 'upload_date': '20150811',
947 'uploader': 'FlixMatrix',
948 'uploader_id': 'FlixMatrixKaravan',
ec85ded8 949 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/FlixMatrixKaravan',
6496ccb4
S
950 'license': 'Standard YouTube License',
951 },
952 'params': {
953 'skip_download': True,
954 },
eb6793ba 955 'skip': 'This video is not available.',
022a5d66 956 },
12afdc2a
S
957 {
958 # YouTube Red video with episode data
959 'url': 'https://www.youtube.com/watch?v=iqKdEhx-dD4',
960 'info_dict': {
961 'id': 'iqKdEhx-dD4',
962 'ext': 'mp4',
963 'title': 'Isolation - Mind Field (Ep 1)',
4fe54c12 964 'description': 'md5:46a29be4ceffa65b92d277b93f463c0f',
556dbe7f 965 'duration': 2085,
12afdc2a
S
966 'upload_date': '20170118',
967 'uploader': 'Vsauce',
968 'uploader_id': 'Vsauce',
969 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/Vsauce',
12afdc2a
S
970 'series': 'Mind Field',
971 'season_number': 1,
972 'episode_number': 1,
973 },
974 'params': {
975 'skip_download': True,
976 },
977 'expected_warnings': [
978 'Skipping DASH manifest',
979 ],
980 },
c7121fa7
S
981 {
982 # The following content has been identified by the YouTube community
983 # as inappropriate or offensive to some audiences.
984 'url': 'https://www.youtube.com/watch?v=6SJNVb0GnPI',
985 'info_dict': {
986 'id': '6SJNVb0GnPI',
987 'ext': 'mp4',
988 'title': 'Race Differences in Intelligence',
989 'description': 'md5:5d161533167390427a1f8ee89a1fc6f1',
990 'duration': 965,
991 'upload_date': '20140124',
992 'uploader': 'New Century Foundation',
993 'uploader_id': 'UCEJYpZGqgUob0zVVEaLhvVg',
994 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UCEJYpZGqgUob0zVVEaLhvVg',
c7121fa7
S
995 },
996 'params': {
997 'skip_download': True,
998 },
999 },
022a5d66
S
1000 {
1001 # itag 212
1002 'url': '1t24XAntNCY',
1003 'only_matching': True,
fd5c4aab
S
1004 },
1005 {
1006 # geo restricted to JP
1007 'url': 'sJL6WA-aGkQ',
1008 'only_matching': True,
1009 },
d0ba5587
S
1010 {
1011 'url': 'https://www.youtube.com/watch?v=MuAGGZNfUkU&list=RDMM',
1012 'only_matching': True,
1013 },
cd5a74a2
S
1014 {
1015 'url': 'https://invidio.us/watch?v=BaW_jenozKc',
1016 'only_matching': True,
1017 },
825cd268
RA
1018 {
1019 # DRM protected
1020 'url': 'https://www.youtube.com/watch?v=s7_qI6_mIXc',
1021 'only_matching': True,
4fe54c12
S
1022 },
1023 {
1024 # Video with unsupported adaptive stream type formats
1025 'url': 'https://www.youtube.com/watch?v=Z4Vy8R84T1U',
1026 'info_dict': {
1027 'id': 'Z4Vy8R84T1U',
1028 'ext': 'mp4',
1029 'title': 'saman SMAN 53 Jakarta(Sancety) opening COFFEE4th at SMAN 53 Jakarta',
1030 'description': 'md5:d41d8cd98f00b204e9800998ecf8427e',
1031 'duration': 433,
1032 'upload_date': '20130923',
1033 'uploader': 'Amelia Putri Harwita',
1034 'uploader_id': 'UCpOxM49HJxmC1qCalXyB3_Q',
1035 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UCpOxM49HJxmC1qCalXyB3_Q',
1036 'formats': 'maxcount:10',
1037 },
1038 'params': {
1039 'skip_download': True,
1040 'youtube_include_dash_manifest': False,
1041 },
5429d6a9 1042 'skip': 'not actual anymore',
5caabd3c 1043 },
1044 {
822b9d9c 1045 # Youtube Music Auto-generated description
5caabd3c 1046 'url': 'https://music.youtube.com/watch?v=MgNrAu2pzNs',
1047 'info_dict': {
1048 'id': 'MgNrAu2pzNs',
1049 'ext': 'mp4',
1050 'title': 'Voyeur Girl',
1051 'description': 'md5:7ae382a65843d6df2685993e90a8628f',
1052 'upload_date': '20190312',
5429d6a9
S
1053 'uploader': 'Stephen - Topic',
1054 'uploader_id': 'UC-pWHpBjdGG69N9mM2auIAA',
5caabd3c 1055 'artist': 'Stephen',
1056 'track': 'Voyeur Girl',
1057 'album': 'it\'s too much love to know my dear',
1058 'release_date': '20190313',
1059 'release_year': 2019,
1060 },
1061 'params': {
1062 'skip_download': True,
1063 },
1064 },
1065 {
822b9d9c 1066 # Youtube Music Auto-generated description
5caabd3c 1067 # Retrieve 'artist' field from 'Artist:' in video description
1068 # when it is present on youtube music video
5caabd3c 1069 'url': 'https://www.youtube.com/watch?v=k0jLE7tTwjY',
1070 'info_dict': {
1071 'id': 'k0jLE7tTwjY',
1072 'ext': 'mp4',
1073 'title': 'Latch Feat. Sam Smith',
1074 'description': 'md5:3cb1e8101a7c85fcba9b4fb41b951335',
1075 'upload_date': '20150110',
1076 'uploader': 'Various Artists - Topic',
1077 'uploader_id': 'UCNkEcmYdjrH4RqtNgh7BZ9w',
1078 'artist': 'Disclosure',
1079 'track': 'Latch Feat. Sam Smith',
1080 'album': 'Latch Featuring Sam Smith',
1081 'release_date': '20121008',
1082 'release_year': 2012,
1083 },
1084 'params': {
1085 'skip_download': True,
1086 },
1087 },
1088 {
822b9d9c 1089 # Youtube Music Auto-generated description
5caabd3c 1090 # handle multiple artists on youtube music video
1091 'url': 'https://www.youtube.com/watch?v=74qn0eJSjpA',
1092 'info_dict': {
1093 'id': '74qn0eJSjpA',
1094 'ext': 'mp4',
1095 'title': 'Eastside',
1096 'description': 'md5:290516bb73dcbfab0dcc4efe6c3de5f2',
1097 'upload_date': '20180710',
1098 'uploader': 'Benny Blanco - Topic',
1099 'uploader_id': 'UCzqz_ksRu_WkIzmivMdIS7A',
1100 'artist': 'benny blanco, Halsey, Khalid',
1101 'track': 'Eastside',
1102 'album': 'Eastside',
1103 'release_date': '20180713',
1104 'release_year': 2018,
1105 },
1106 'params': {
1107 'skip_download': True,
1108 },
1109 },
1110 {
822b9d9c 1111 # Youtube Music Auto-generated description
5caabd3c 1112 # handle youtube music video with release_year and no release_date
1113 'url': 'https://www.youtube.com/watch?v=-hcAI0g-f5M',
1114 'info_dict': {
1115 'id': '-hcAI0g-f5M',
1116 'ext': 'mp4',
1117 'title': 'Put It On Me',
5429d6a9 1118 'description': 'md5:f6422397c07c4c907c6638e1fee380a5',
5caabd3c 1119 'upload_date': '20180426',
1120 'uploader': 'Matt Maeson - Topic',
1121 'uploader_id': 'UCnEkIGqtGcQMLk73Kp-Q5LQ',
1122 'artist': 'Matt Maeson',
1123 'track': 'Put It On Me',
1124 'album': 'The Hearse',
1125 'release_date': None,
1126 'release_year': 2018,
1127 },
1128 'params': {
1129 'skip_download': True,
1130 },
1131 },
66b48727
RA
1132 {
1133 'url': 'https://www.youtubekids.com/watch?v=3b8nCWDgZ6Q',
1134 'only_matching': True,
1135 },
011e75e6
S
1136 {
1137 # invalid -> valid video id redirection
1138 'url': 'DJztXj2GPfl',
1139 'info_dict': {
1140 'id': 'DJztXj2GPfk',
1141 'ext': 'mp4',
1142 'title': 'Panjabi MC - Mundian To Bach Ke (The Dictator Soundtrack)',
1143 'description': 'md5:bf577a41da97918e94fa9798d9228825',
1144 'upload_date': '20090125',
1145 'uploader': 'Prochorowka',
1146 'uploader_id': 'Prochorowka',
1147 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/Prochorowka',
1148 'artist': 'Panjabi MC',
1149 'track': 'Beware of the Boys (Mundian to Bach Ke) - Motivo Hi-Lectro Remix',
1150 'album': 'Beware of the Boys (Mundian To Bach Ke)',
1151 },
1152 'params': {
1153 'skip_download': True,
1154 },
ea74e00b
DP
1155 },
1156 {
1157 # empty description results in an empty string
1158 'url': 'https://www.youtube.com/watch?v=x41yOUIvK2k',
1159 'info_dict': {
1160 'id': 'x41yOUIvK2k',
1161 'ext': 'mp4',
1162 'title': 'IMG 3456',
1163 'description': '',
1164 'upload_date': '20170613',
1165 'uploader_id': 'ElevageOrVert',
1166 'uploader': 'ElevageOrVert',
1167 },
1168 'params': {
1169 'skip_download': True,
1170 },
1171 },
2eb88d95
PH
1172 ]
1173
e0df6211
PH
def __init__(self, *args, **kwargs):
    """Initialise the extractor and start with an empty player cache."""
    super(YoutubeIE, self).__init__(*args, **kwargs)
    # Signature functions already extracted during this run; filled in
    # lazily by _decrypt_signature.
    self._player_cache = dict()
e0df6211 1177
c5e8d7af
PH
def report_video_info_webpage_download(self, video_id):
    """Announce that the video info webpage of video_id is being downloaded."""
    message = '%s: Downloading video info webpage' % video_id
    self.to_screen(message)
c5e8d7af 1181
c5e8d7af
PH
def report_information_extraction(self, video_id):
    """Announce that information about video_id is being extracted."""
    message = '%s: Extracting video information' % video_id
    self.to_screen(message)
c5e8d7af
PH
1185
def report_unavailable_format(self, video_id, format):
    """Announce that the given format is not available for video_id."""
    details = (video_id, format)
    self.to_screen('%s: Format %s not available' % details)
c5e8d7af
PH
1189
def report_rtmp_download(self):
    """Announce that the download is going to use the RTMP protocol."""
    notice = 'RTMP download detected'
    self.to_screen(notice)
c5e8d7af 1193
60064c53
PH
1194 def _signature_cache_id(self, example_sig):
1195 """ Return a string representation of a signature """
78caa52a 1196 return '.'.join(compat_str(len(part)) for part in example_sig.split('.'))
60064c53 1197
e40c758c
S
1198 @classmethod
1199 def _extract_player_info(cls, player_url):
1200 for player_re in cls._PLAYER_INFO_RE:
1201 id_m = re.search(player_re, player_url)
1202 if id_m:
1203 break
1204 else:
c081b35c 1205 raise ExtractorError('Cannot identify player %r' % player_url)
e40c758c
S
1206 return id_m.group('ext'), id_m.group('id')
1207
1208 def _extract_signature_function(self, video_id, player_url, example_sig):
1209 player_type, player_id = self._extract_player_info(player_url)
e0df6211 1210
c4417ddb 1211 # Read from filesystem cache
60064c53
PH
1212 func_id = '%s_%s_%s' % (
1213 player_type, player_id, self._signature_cache_id(example_sig))
c4417ddb 1214 assert os.path.basename(func_id) == func_id
a0e07d31 1215
69ea8ca4 1216 cache_spec = self._downloader.cache.load('youtube-sigfuncs', func_id)
a0e07d31 1217 if cache_spec is not None:
78caa52a 1218 return lambda s: ''.join(s[i] for i in cache_spec)
83799698 1219
6d1a55a5
PH
1220 download_note = (
1221 'Downloading player %s' % player_url
1222 if self._downloader.params.get('verbose') else
1223 'Downloading %s player %s' % (player_type, player_id)
1224 )
e0df6211
PH
1225 if player_type == 'js':
1226 code = self._download_webpage(
1227 player_url, video_id,
6d1a55a5 1228 note=download_note,
69ea8ca4 1229 errnote='Download of %s failed' % player_url)
83799698 1230 res = self._parse_sig_js(code)
c4417ddb 1231 elif player_type == 'swf':
e0df6211
PH
1232 urlh = self._request_webpage(
1233 player_url, video_id,
6d1a55a5 1234 note=download_note,
69ea8ca4 1235 errnote='Download of %s failed' % player_url)
e0df6211 1236 code = urlh.read()
83799698 1237 res = self._parse_sig_swf(code)
e0df6211
PH
1238 else:
1239 assert False, 'Invalid player type %r' % player_type
1240
785521bf
PH
1241 test_string = ''.join(map(compat_chr, range(len(example_sig))))
1242 cache_res = res(test_string)
1243 cache_spec = [ord(c) for c in cache_res]
83799698 1244
69ea8ca4 1245 self._downloader.cache.store('youtube-sigfuncs', func_id, cache_spec)
83799698
PH
1246 return res
1247
def _print_sig_code(self, func, example_sig):
    """Print Python code equivalent to the signature function func.

    func is applied to a probe string whose characters have the code
    points 0 .. len(example_sig) - 1; the code points of the result tell
    which input position ends up at each output position.  That
    permutation is rendered as a sum of indexing/slicing expressions on
    ``s`` and shown with to_screen.
    """
    def gen_sig_code(idxs):
        def _genslice(start, end, step):
            # end is the last index that belongs to the run
            starts = '' if start == 0 else str(start)
            ends = (':%d' % (end + step)) if end + step >= 0 else ':'
            steps = '' if step == 1 else (':%d' % step)
            return 's[%s%s%s]' % (starts, ends, steps)

        if len(idxs) < 2:
            # The pairwise walk below never runs for fewer than two
            # indices, which used to leave its loop variable unbound.
            for idx in idxs:
                yield 's[%d]' % idx
            return

        step = None
        # Quelch pyflakes warnings - start will be set when step is set
        start = '(Never used)'
        for i, prev in zip(idxs[1:], idxs[:-1]):
            if step is not None:
                if i - prev == step:
                    # Still inside the current ascending/descending run
                    continue
                yield _genslice(start, prev, step)
                step = None
                continue
            if i - prev in [-1, 1]:
                # A new run starts at prev
                step = i - prev
                start = prev
                continue
            else:
                yield 's[%d]' % prev
        # Emit whatever is still pending after the last pair
        if step is None:
            yield 's[%d]' % i
        else:
            yield _genslice(start, i, step)

    test_string = ''.join(map(compat_chr, range(len(example_sig))))
    cache_res = func(test_string)
    cache_spec = [ord(c) for c in cache_res]
    expr_code = ' + '.join(gen_sig_code(cache_spec))
    signature_id_tuple = '(%s)' % (
        ', '.join(compat_str(len(p)) for p in example_sig.split('.')))
    code = ('if tuple(len(p) for p in s.split(\'.\')) == %s:\n'
            ' return %s\n') % (signature_id_tuple, expr_code)
    self.to_screen('Extracted signature function:\n' + code)
edf3e38e 1286
e0df6211
PH
def _parse_sig_js(self, jscode):
    """Return a callable wrapping the signature function found in jscode.

    The name of the function is located with the patterns below (tried in
    order by _search_regex), the function itself is extracted with
    JSInterpreter, and the returned callable passes its single argument to
    it as a one-element argument list.
    """
    funcname = self._search_regex(
        (r'\b[cs]\s*&&\s*[adf]\.set\([^,]+\s*,\s*encodeURIComponent\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\(',
         r'\b[a-zA-Z0-9]+\s*&&\s*[a-zA-Z0-9]+\.set\([^,]+\s*,\s*encodeURIComponent\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\(',
         r'(?:\b|[^a-zA-Z0-9$])(?P<sig>[a-zA-Z0-9$]{2})\s*=\s*function\(\s*a\s*\)\s*{\s*a\s*=\s*a\.split\(\s*""\s*\)',
         r'(?P<sig>[a-zA-Z0-9$]+)\s*=\s*function\(\s*a\s*\)\s*{\s*a\s*=\s*a\.split\(\s*""\s*\)',
         # Obsolete patterns
         r'(["\'])signature\1\s*,\s*(?P<sig>[a-zA-Z0-9$]+)\(',
         r'\.sig\|\|(?P<sig>[a-zA-Z0-9$]+)\(',
         r'yt\.akamaized\.net/\)\s*\|\|\s*.*?\s*[cs]\s*&&\s*[adf]\.set\([^,]+\s*,\s*(?:encodeURIComponent\s*\()?\s*(?P<sig>[a-zA-Z0-9$]+)\(',
         r'\b[cs]\s*&&\s*[adf]\.set\([^,]+\s*,\s*(?P<sig>[a-zA-Z0-9$]+)\(',
         r'\b[a-zA-Z0-9]+\s*&&\s*[a-zA-Z0-9]+\.set\([^,]+\s*,\s*(?P<sig>[a-zA-Z0-9$]+)\(',
         r'\bc\s*&&\s*a\.set\([^,]+\s*,\s*\([^)]*\)\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\(',
         # This pattern used to be listed twice; the second copy could
         # never match where the first one had not.
         r'\bc\s*&&\s*[a-zA-Z0-9]+\.set\([^,]+\s*,\s*\([^)]*\)\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\('),
        jscode, 'Initial JS player signature function name', group='sig')

    jsi = JSInterpreter(jscode)
    initial_function = jsi.extract_function(funcname)
    return lambda s: initial_function([s])
1307
def _parse_sig_swf(self, file_contents):
    """Return a callable wrapping the SWF player's deciphering routine.

    The 'decipher' function of the 'SignatureDecipher' class is extracted
    with SWFInterpreter; the returned callable passes its single argument
    to it as a one-element argument list.
    """
    interpreter = SWFInterpreter(file_contents)
    decipher_class = interpreter.extract_class('SignatureDecipher')
    decipher = interpreter.extract_function(decipher_class, 'decipher')

    def run_decipher(s):
        return decipher([s])

    return run_decipher
1314
83799698 1315 def _decrypt_signature(self, s, video_id, player_url, age_gate=False):
257a2501 1316 """Turn the encrypted s field into a working signature"""
6b37f0be 1317
c8bf86d5 1318 if player_url is None:
69ea8ca4 1319 raise ExtractorError('Cannot decrypt signature without player_url')
920de7a2 1320
69ea8ca4 1321 if player_url.startswith('//'):
78caa52a 1322 player_url = 'https:' + player_url
3c90cc8b
S
1323 elif not re.match(r'https?://', player_url):
1324 player_url = compat_urlparse.urljoin(
1325 'https://www.youtube.com', player_url)
c8bf86d5 1326 try:
62af3a0e 1327 player_id = (player_url, self._signature_cache_id(s))
c8bf86d5
PH
1328 if player_id not in self._player_cache:
1329 func = self._extract_signature_function(
60064c53 1330 video_id, player_url, s
c8bf86d5
PH
1331 )
1332 self._player_cache[player_id] = func
1333 func = self._player_cache[player_id]
1334 if self._downloader.params.get('youtube_print_sig_code'):
60064c53 1335 self._print_sig_code(func, s)
c8bf86d5
PH
1336 return func(s)
1337 except Exception as e:
1338 tb = traceback.format_exc()
1339 raise ExtractorError(
78caa52a 1340 'Signature extraction failed: ' + tb, cause=e)
e0df6211 1341
f96f5dda 1342 def _get_subtitles(self, video_id, webpage, has_live_chat_replay):
de7f3446 1343 try:
60e47a26 1344 subs_doc = self._download_xml(
38c2e5b8 1345 'https://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id,
7fad1c63
JMF
1346 video_id, note=False)
1347 except ExtractorError as err:
9b9c5355 1348 self._downloader.report_warning('unable to download video subtitles: %s' % error_to_compat_str(err))
de7f3446 1349 return {}
de7f3446
JMF
1350
1351 sub_lang_list = {}
60e47a26
JMF
1352 for track in subs_doc.findall('track'):
1353 lang = track.attrib['lang_code']
7e660ac1
LD
1354 if lang in sub_lang_list:
1355 continue
360e1ca5 1356 sub_formats = []
23d17e4b 1357 for ext in self._SUBTITLE_FORMATS:
15707c7e 1358 params = compat_urllib_parse_urlencode({
360e1ca5
JMF
1359 'lang': lang,
1360 'v': video_id,
1361 'fmt': ext,
1362 'name': track.attrib['name'].encode('utf-8'),
1363 })
1364 sub_formats.append({
1365 'url': 'https://www.youtube.com/api/timedtext?' + params,
1366 'ext': ext,
1367 })
1368 sub_lang_list[lang] = sub_formats
4932ba4a 1369 """ if has_live_chat_replay:
321bf820 1370 sub_lang_list['live_chat'] = [
1371 {
1372 'video_id': video_id,
1373 'ext': 'json',
1374 'protocol': 'youtube_live_chat_replay',
1375 },
4932ba4a 1376 ] """
de7f3446 1377 if not sub_lang_list:
69ea8ca4 1378 self._downloader.report_warning('video doesn\'t have subtitles')
de7f3446
JMF
1379 return {}
1380 return sub_lang_list
1381
a72778d3
S
1382 def _get_ytplayer_config(self, video_id, webpage):
1383 patterns = (
526b3b07
S
1384 # User data may contain arbitrary character sequences that may affect
1385 # JSON extraction with regex, e.g. when '};' is contained the second
1386 # regex won't capture the whole JSON. Yet working around by trying more
1387 # concrete regex first keeping in mind proper quoted string handling
1388 # to be implemented in future that will replace this workaround (see
067aa17e
S
1389 # https://github.com/ytdl-org/youtube-dl/issues/7468,
1390 # https://github.com/ytdl-org/youtube-dl/pull/7599)
a72778d3
S
1391 r';ytplayer\.config\s*=\s*({.+?});ytplayer',
1392 r';ytplayer\.config\s*=\s*({.+?});',
1393 )
1394 config = self._search_regex(
1395 patterns, webpage, 'ytplayer.config', default=None)
1396 if config:
1397 return self._parse_json(
1398 uppercase_escape(config), video_id, fatal=False)
0e49d9a6 1399
321bf820 1400 def _get_yt_initial_data(self, video_id, webpage):
1401 config = self._search_regex(
15eae44d 1402 (r'window\["ytInitialData"\]\s*=\s*(.*?)(?<=});',
1403 r'var\s+ytInitialData\s*=\s*(.*?)(?<=});'),
321bf820 1404 webpage, 'ytInitialData', default=None)
1405 if config:
1406 return self._parse_json(
1407 uppercase_escape(config), video_id, fatal=False)
1408
360e1ca5 1409 def _get_automatic_captions(self, video_id, webpage):
de7f3446
JMF
1410 """We need the webpage for getting the captions url, pass it as an
1411 argument to speed up the process."""
69ea8ca4 1412 self.to_screen('%s: Looking for automatic captions' % video_id)
a72778d3 1413 player_config = self._get_ytplayer_config(video_id, webpage)
78caa52a 1414 err_msg = 'Couldn\'t find automatic captions for %s' % video_id
a72778d3 1415 if not player_config:
de7f3446
JMF
1416 self._downloader.report_warning(err_msg)
1417 return {}
de7f3446 1418 try:
0792d563 1419 args = player_config['args']
b78b292f
S
1420 caption_url = args.get('ttsurl')
1421 if caption_url:
1422 timestamp = args['timestamp']
1423 # We get the available subtitles
15707c7e 1424 list_params = compat_urllib_parse_urlencode({
b78b292f
S
1425 'type': 'list',
1426 'tlangs': 1,
1427 'asrs': 1,
1428 })
1429 list_url = caption_url + '&' + list_params
1430 caption_list = self._download_xml(list_url, video_id)
1431 original_lang_node = caption_list.find('track')
1432 if original_lang_node is None:
1433 self._downloader.report_warning('Video doesn\'t have automatic captions')
1434 return {}
1435 original_lang = original_lang_node.attrib['lang_code']
1436 caption_kind = original_lang_node.attrib.get('kind', '')
1437
1438 sub_lang_list = {}
1439 for lang_node in caption_list.findall('target'):
1440 sub_lang = lang_node.attrib['lang_code']
1441 sub_formats = []
1442 for ext in self._SUBTITLE_FORMATS:
15707c7e 1443 params = compat_urllib_parse_urlencode({
b78b292f
S
1444 'lang': original_lang,
1445 'tlang': sub_lang,
1446 'fmt': ext,
1447 'ts': timestamp,
1448 'kind': caption_kind,
1449 })
1450 sub_formats.append({
1451 'url': caption_url + '&' + params,
1452 'ext': ext,
1453 })
1454 sub_lang_list[sub_lang] = sub_formats
1455 return sub_lang_list
1456
ddbb4c5c
S
1457 def make_captions(sub_url, sub_langs):
1458 parsed_sub_url = compat_urllib_parse_urlparse(sub_url)
1459 caption_qs = compat_parse_qs(parsed_sub_url.query)
1460 captions = {}
1461 for sub_lang in sub_langs:
1462 sub_formats = []
1463 for ext in self._SUBTITLE_FORMATS:
1464 caption_qs.update({
1465 'tlang': [sub_lang],
1466 'fmt': [ext],
1467 })
1468 sub_url = compat_urlparse.urlunparse(parsed_sub_url._replace(
1469 query=compat_urllib_parse_urlencode(caption_qs, True)))
1470 sub_formats.append({
1471 'url': sub_url,
1472 'ext': ext,
1473 })
1474 captions[sub_lang] = sub_formats
1475 return captions
1476
1477 # New captions format as of 22.06.2017
1478 player_response = args.get('player_response')
1479 if player_response and isinstance(player_response, compat_str):
1480 player_response = self._parse_json(
1481 player_response, video_id, fatal=False)
1482 if player_response:
1483 renderer = player_response['captions']['playerCaptionsTracklistRenderer']
7e1cf1a4 1484 caption_tracks = renderer['captionTracks']
1485 for caption_track in caption_tracks:
1486 if 'kind' not in caption_track:
1487 # not an automatic transcription
1488 continue
1489 base_url = caption_track['baseUrl']
1490 sub_lang_list = []
1491 for lang in renderer['translationLanguages']:
1492 lang_code = lang.get('languageCode')
1493 if lang_code:
1494 sub_lang_list.append(lang_code)
1495 return make_captions(base_url, sub_lang_list)
bc842c27 1496
7e1cf1a4 1497 self._downloader.report_warning("Couldn't find automatic captions for %s" % video_id)
1498 return {}
b78b292f
S
1499 # Some videos don't provide ttsurl but rather caption_tracks and
1500 # caption_translation_languages (e.g. 20LmZk1hakA)
ddbb4c5c 1501 # Does not used anymore as of 22.06.2017
b78b292f
S
1502 caption_tracks = args['caption_tracks']
1503 caption_translation_languages = args['caption_translation_languages']
1504 caption_url = compat_parse_qs(caption_tracks.split(',')[0])['u'][0]
ddbb4c5c 1505 sub_lang_list = []
b78b292f
S
1506 for lang in caption_translation_languages.split(','):
1507 lang_qs = compat_parse_qs(compat_urllib_parse_unquote_plus(lang))
1508 sub_lang = lang_qs.get('lc', [None])[0]
ddbb4c5c
S
1509 if sub_lang:
1510 sub_lang_list.append(sub_lang)
1511 return make_captions(caption_url, sub_lang_list)
de7f3446
JMF
1512 # An extractor error can be raised by the download process if there are
1513 # no automatic captions but there are subtitles
ddbb4c5c 1514 except (KeyError, IndexError, ExtractorError):
de7f3446
JMF
1515 self._downloader.report_warning(err_msg)
1516 return {}
1517
21c340b8
S
def _mark_watched(self, video_id, video_info, player_response):
    """Request the video's playback-tracking URL (reported as 'Marking watched').

    The base URL is read from ``player_response`` first and from
    ``video_info`` as a fallback; nothing is requested when neither yields
    a valid URL. The request is non-fatal.
    """
    base_url = try_get(
        player_response,
        lambda x: x['playbackTracking']['videostatsPlaybackUrl']['baseUrl'])
    if not base_url:
        base_url = try_get(
            video_info, lambda x: x['videostats_playback_base_url'][0])
    tracking_url = url_or_none(base_url)
    if not tracking_url:
        return

    url_parts = compat_urlparse.urlparse(tracking_url)
    query = compat_urlparse.parse_qs(url_parts.query)

    # cpn generation algorithm is reverse engineered from base.js.
    # In fact it works even with dummy cpn.
    CPN_ALPHABET = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789-_'
    cpn_chars = []
    for _ in range(0, 16):
        cpn_chars.append(CPN_ALPHABET[random.randint(0, 256) & 63])

    query['ver'] = ['2']
    query['cpn'] = [''.join(cpn_chars)]

    encoded_query = compat_urllib_parse_urlencode(query, True)
    tracking_url = compat_urlparse.urlunparse(
        url_parts._replace(query=encoded_query))

    self._download_webpage(
        tracking_url, video_id, 'Marking watched',
        'Unable to mark watched', fatal=False)
1543
66c9fa36
S
@staticmethod
def _extract_urls(webpage):
    """Return every YouTube embed found in ``webpage``.

    Three kinds of embed are collected, in this order:
      * player URLs referenced from iframe/embed/object tags,
        ``data-video-url`` attributes and SWFObject calls,
      * ``lazyYT`` elements (value of ``data-youtube-id``),
      * Wordpress "YouTube Video Importer" players (value of
        ``data-video_id``).

    Returns a (possibly empty) list of strings.
    """
    # Embedded YouTube player.
    # The embedSWF alternative accepts the regular call form
    # ``embedSWF("...")`` in addition to the ``embedSWF:`` / ``embedSWF(:``
    # forms matched by the previous ``embedSWF\(?:\s*`` pattern, which
    # required a literal colon and therefore missed plain calls.
    entries = [
        unescapeHTML(mobj.group('url'))
        for mobj in re.finditer(r'''(?x)
        (?:
            <iframe[^>]+?src=|
            data-video-url=|
            <embed[^>]+?src=|
            embedSWF(?:\(:?|:)\s*|
            <object[^>]+data=|
            new\s+SWFObject\(
        )
        (["\'])
        (?P<url>(?:https?:)?//(?:www\.)?youtube(?:-nocookie)?\.com/
        (?:embed|v|p)/[0-9A-Za-z_-]{11}.*?)
        \1''', webpage)]

    # lazyYT YouTube embed
    entries.extend(
        unescapeHTML(youtube_id)
        for youtube_id in re.findall(
            r'class="lazyYT" data-youtube-id="([^"]+)"', webpage))

    # Wordpress "YouTube Video Importer" plugin
    matches = re.findall(r'''(?x)<div[^>]+
        class=(?P<q1>[\'"])[^\'"]*\byvii_single_video_player\b[^\'"]*(?P=q1)[^>]+
        data-video_id=(?P<q2>[\'"])([^\'"]+)(?P=q2)''', webpage)
    # Each match is a (q1, q2, video_id) tuple; only the id is wanted.
    entries.extend(m[-1] for m in matches)

    return entries
1575
@staticmethod
def _extract_url(webpage):
    """Return the first YouTube embed found in ``webpage``, or None."""
    found = YoutubeIE._extract_urls(webpage)
    if not found:
        return None
    return found[0]
1580
97665381
PH
@classmethod
def extract_id(cls, url):
    """Return the video id captured by group 2 of ``cls._VALID_URL``.

    The pattern is matched in verbose mode. Raises ExtractorError when
    ``url`` does not match.
    """
    match = re.match(cls._VALID_URL, url, re.VERBOSE)
    if match is not None:
        return match.group(2)
    raise ExtractorError('Invalid URL: %s' % url)
1588
84213ea8
S
def _extract_chapters_from_json(self, webpage, video_id, duration):
    """Build the chapter list from the ``ytInitialData`` JSON in ``webpage``.

    Returns a list of dicts with ``start_time``, ``end_time`` and ``title``
    keys, or None when the page, the JSON or the chapter list is missing.
    A chapter ends where the next one starts; the last one ends at
    ``duration``. Chapters whose start or end cannot be determined are
    skipped.
    """
    if not webpage:
        return
    raw_json = self._search_regex(
        r'window\["ytInitialData"\] = (.+);\n', webpage,
        'player args', default='{}')
    data = self._parse_json(raw_json, video_id, fatal=False)
    if not (data and isinstance(data, dict)):
        return

    raw_chapters = try_get(
        data,
        lambda x: x['playerOverlays']
                   ['playerOverlayRenderer']
                   ['decoratedPlayerBarRenderer']
                   ['decoratedPlayerBarRenderer']
                   ['playerBar']
                   ['chapteredPlayerBarRenderer']
                   ['chapters'],
        list)
    if not raw_chapters:
        return

    def start_seconds(entry):
        # Start offsets are given in milliseconds.
        millis = try_get(
            entry,
            lambda x: x['chapterRenderer']['timeRangeStartMillis'],
            int)
        return float_or_none(millis, scale=1000)

    total = len(raw_chapters)
    result = []
    for index, entry in enumerate(raw_chapters):
        begin = start_seconds(entry)
        if begin is None:
            continue
        following = index + 1
        if following < total:
            finish = start_seconds(raw_chapters[following])
        else:
            finish = duration
        if finish is None:
            continue
        result.append({
            'start_time': begin,
            'end_time': finish,
            'title': try_get(
                entry, lambda x: x['chapterRenderer']['title']['simpleText'],
                compat_str),
        })
    return result
1637
@staticmethod
def _extract_chapters_from_description(description, duration):
    """Build the chapter list from seekTo timestamp links in ``description``.

    ``description`` is the HTML description; every line (delimited by
    ``<br />``) containing a ``yt.www.watch.player.seekTo`` link is a
    chapter candidate. ``duration`` is the video length used to close the
    last chapter and to clamp end times; it may be None, in which case no
    clamping is done and the last chapter (whose end is unknown) is
    skipped.

    Returns a list of dicts with ``start_time``, ``end_time`` and ``title``
    keys, or None when there is no description or no chapter line.
    """
    if not description:
        return None
    chapter_lines = re.findall(
        r'(?:^|<br\s*/>)([^<]*<a[^>]+onclick=["\']yt\.www\.watch\.player\.seekTo[^>]+>(\d{1,2}:\d{1,2}(?::\d{1,2})?)</a>[^>]*)(?=$|<br\s*/>)',
        description)
    if not chapter_lines:
        return None
    # Ordering comparisons against None raise TypeError on Python 3, so
    # duration-based checks only run when the duration is known.
    has_duration = duration is not None
    chapters = []
    for next_num, (chapter_line, time_point) in enumerate(
            chapter_lines, start=1):
        start_time = parse_duration(time_point)
        if start_time is None:
            continue
        if has_duration and start_time > duration:
            break
        end_time = (duration if next_num == len(chapter_lines)
                    else parse_duration(chapter_lines[next_num][1]))
        if end_time is None:
            continue
        if has_duration and end_time > duration:
            end_time = duration
        if start_time > end_time:
            break
        # The title is the line with the timestamp link removed.
        chapter_title = re.sub(
            r'<a[^>]+>[^<]+</a>', '', chapter_line).strip(' \t-')
        chapter_title = re.sub(r'\s+', ' ', chapter_title)
        chapters.append({
            'start_time': start_time,
            'end_time': end_time,
            'title': chapter_title,
        })
    return chapters
1672
84213ea8
S
def _extract_chapters(self, webpage, description, video_id, duration):
    """Return chapters from the page JSON, else from the description."""
    chapters = self._extract_chapters_from_json(webpage, video_id, duration)
    if chapters:
        return chapters
    return self._extract_chapters_from_description(description, duration)
1676
c5e8d7af 1677 def _real_extract(self, url):
cf7e015f
S
1678 url, smuggled_data = unsmuggle_url(url, {})
1679
7e8c0af0 1680 proto = (
78caa52a
PH
1681 'http' if self._downloader.params.get('prefer_insecure', False)
1682 else 'https')
7e8c0af0 1683
7c80519c 1684 start_time = None
297a564b 1685 end_time = None
7c80519c
JMF
1686 parsed_url = compat_urllib_parse_urlparse(url)
1687 for component in [parsed_url.fragment, parsed_url.query]:
1688 query = compat_parse_qs(component)
297a564b 1689 if start_time is None and 't' in query:
7c80519c 1690 start_time = parse_duration(query['t'][0])
2929fa0e
JMF
1691 if start_time is None and 'start' in query:
1692 start_time = parse_duration(query['start'][0])
297a564b
JMF
1693 if end_time is None and 'end' in query:
1694 end_time = parse_duration(query['end'][0])
7c80519c 1695
c5e8d7af
PH
1696 # Extract original video URL from URL with redirection, like age verification, using next_url parameter
1697 mobj = re.search(self._NEXT_URL_RE, url)
1698 if mobj:
7fd002c0 1699 url = proto + '://www.youtube.com/' + compat_urllib_parse_unquote(mobj.group(1)).lstrip('/')
97665381 1700 video_id = self.extract_id(url)
c5e8d7af
PH
1701
1702 # Get video webpage
aa79ac0c 1703 url = proto + '://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1&bpctr=9999999999' % video_id
011e75e6
S
1704 video_webpage, urlh = self._download_webpage_handle(url, video_id)
1705
1706 qs = compat_parse_qs(compat_urllib_parse_urlparse(urlh.geturl()).query)
1707 video_id = qs.get('v', [None])[0] or video_id
c5e8d7af
PH
1708
1709 # Attempt to extract SWF player URL
e0df6211 1710 mobj = re.search(r'swfConfig.*?"(https?:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
c5e8d7af
PH
1711 if mobj is not None:
1712 player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
1713 else:
1714 player_url = None
1715
d8d24a92
S
1716 dash_mpds = []
1717
1718 def add_dash_mpd(video_info):
1719 dash_mpd = video_info.get('dashmpd')
1720 if dash_mpd and dash_mpd[0] not in dash_mpds:
1721 dash_mpds.append(dash_mpd[0])
1722
561b456e
S
1723 def add_dash_mpd_pr(pl_response):
1724 dash_mpd = url_or_none(try_get(
1725 pl_response, lambda x: x['streamingData']['dashManifestUrl'],
1726 compat_str))
1727 if dash_mpd and dash_mpd not in dash_mpds:
1728 dash_mpds.append(dash_mpd)
1729
c7121fa7
S
1730 is_live = None
1731 view_count = None
1732
1733 def extract_view_count(v_info):
1734 return int_or_none(try_get(v_info, lambda x: x['view_count'][0]))
1735
c2d125d9
S
1736 def extract_player_response(player_response, video_id):
1737 pl_response = str_or_none(player_response)
1738 if not pl_response:
1739 return
1740 pl_response = self._parse_json(pl_response, video_id, fatal=False)
1741 if isinstance(pl_response, dict):
1742 add_dash_mpd_pr(pl_response)
1743 return pl_response
1744
fb2c9277
U
1745 def extract_embedded_config(embed_webpage, video_id):
1746 embedded_config = self._search_regex(
1747 r'setConfig\(({.*})\);',
1748 embed_webpage, 'ytInitialData', default=None)
1749 if embedded_config:
1750 return embedded_config
1751
dbdaaa23
S
1752 player_response = {}
1753
c5e8d7af 1754 # Get video info
43ebf77d 1755 video_info = {}
6449cd80 1756 embed_webpage = None
39e7107d
U
1757 if (self._og_search_property('restrictions:age', video_webpage, default=None) == '18+'
1758 or re.search(r'player-age-gate-content">', video_webpage) is not None):
9d9314cb 1759 cookie_keys = self._get_cookies('https://www.youtube.com').keys()
c108eb73
JMF
1760 age_gate = True
1761 # We simulate the access to the video from www.youtube.com/v/{video_id}
1762 # this can be viewed without login into Youtube
beb95e77
CL
1763 url = proto + '://www.youtube.com/embed/%s' % video_id
1764 embed_webpage = self._download_webpage(url, video_id, 'Downloading embed webpage')
fb2c9277
U
1765 ext = extract_embedded_config(embed_webpage, video_id)
1766 # playabilityStatus = re.search(r'{\\\"status\\\":\\\"(?P<playabilityStatus>[^\"]+)\\\"', ext)
1767 playable_in_embed = re.search(r'{\\\"playableInEmbed\\\":(?P<playableinEmbed>[^\,]+)', ext)
1768 if not playable_in_embed:
1769 self.to_screen('Could not determine whether playabale in embed for video %s' % video_id)
1770 playable_in_embed = ''
1771 else:
1772 playable_in_embed = playable_in_embed.group('playableinEmbed')
1773 # check if video is only playable on youtube in other words not playable in embed - if so it requires auth (cookies)
1774 # if re.search(r'player-unavailable">', embed_webpage) is not None:
1775 if playable_in_embed == 'false':
c73baf23
U
1776 '''
1777 # TODO apply this patch when Support for Python 2.6(!) and above drops
9d9314cb 1778 if ({'VISITOR_INFO1_LIVE', 'HSID', 'SSID', 'SID'} <= cookie_keys
4bb9c880 1779 or {'VISITOR_INFO1_LIVE', '__Secure-3PSID', 'LOGIN_INFO'} <= cookie_keys):
c73baf23
U
1780 '''
1781 if (set(('VISITOR_INFO1_LIVE', 'HSID', 'SSID', 'SID')) <= set(cookie_keys)
1782 or set(('VISITOR_INFO1_LIVE', '__Secure-3PSID', 'LOGIN_INFO')) <= set(cookie_keys)):
4bb9c880
U
1783 age_gate = False
1784 # Try looking directly into the video webpage
1785 ytplayer_config = self._get_ytplayer_config(video_id, video_webpage)
1786 if ytplayer_config:
1787 args = ytplayer_config['args']
1788 if args.get('url_encoded_fmt_stream_map') or args.get('hlsvp'):
1789 # Convert to the same format returned by compat_parse_qs
1790 video_info = dict((k, [v]) for k, v in args.items())
1791 add_dash_mpd(video_info)
1792 # Rental video is not rented but preview is available (e.g.
1793 # https://www.youtube.com/watch?v=yYr8q0y5Jfg,
1794 # https://github.com/ytdl-org/youtube-dl/issues/10532)
1795 if not video_info and args.get('ypc_vid'):
1796 return self.url_result(
1797 args['ypc_vid'], YoutubeIE.ie_key(), video_id=args['ypc_vid'])
1798 if args.get('livestream') == '1' or args.get('live_playback') == 1:
1799 is_live = True
1800 if not player_response:
1801 player_response = extract_player_response(args.get('player_response'), video_id)
1802 if not video_info or self._downloader.params.get('youtube_include_dash_manifest', True):
1803 add_dash_mpd_pr(player_response)
9d9314cb
U
1804 else:
1805 raise ExtractorError('Video is age restricted and only playable on Youtube. Requires cookies!', expected=True)
1806 else:
1807 data = compat_urllib_parse_urlencode({
1808 'video_id': video_id,
1809 'eurl': 'https://youtube.googleapis.com/v/' + video_id,
1810 'sts': self._search_regex(
1811 r'"sts"\s*:\s*(\d+)', embed_webpage, 'sts', default=''),
1812 })
1813 video_info_url = proto + '://www.youtube.com/get_video_info?' + data
1814 try:
1815 video_info_webpage = self._download_webpage(
1816 video_info_url, video_id,
1817 note='Refetching age-gated info webpage',
1818 errnote='unable to download video info webpage')
1819 except ExtractorError:
1820 video_info_webpage = None
1821 if video_info_webpage:
1822 video_info = compat_parse_qs(video_info_webpage)
1823 pl_response = video_info.get('player_response', [None])[0]
1824 player_response = extract_player_response(pl_response, video_id)
1825 add_dash_mpd(video_info)
1826 view_count = extract_view_count(video_info)
c108eb73
JMF
1827 else:
1828 age_gate = False
d8d24a92 1829 # Try looking directly into the video webpage
a72778d3
S
1830 ytplayer_config = self._get_ytplayer_config(video_id, video_webpage)
1831 if ytplayer_config:
4e62ebe2 1832 args = ytplayer_config['args']
4c76aa06 1833 if args.get('url_encoded_fmt_stream_map') or args.get('hlsvp'):
d8d24a92
S
1834 # Convert to the same format returned by compat_parse_qs
1835 video_info = dict((k, [v]) for k, v in args.items())
1836 add_dash_mpd(video_info)
6496ccb4
S
1837 # Rental video is not rented but preview is available (e.g.
1838 # https://www.youtube.com/watch?v=yYr8q0y5Jfg,
067aa17e 1839 # https://github.com/ytdl-org/youtube-dl/issues/10532)
6496ccb4
S
1840 if not video_info and args.get('ypc_vid'):
1841 return self.url_result(
1842 args['ypc_vid'], YoutubeIE.ie_key(), video_id=args['ypc_vid'])
2fe1ff85
JMF
1843 if args.get('livestream') == '1' or args.get('live_playback') == 1:
1844 is_live = True
dbdaaa23 1845 if not player_response:
c2d125d9 1846 player_response = extract_player_response(args.get('player_response'), video_id)
0a3cf9ad 1847 if not video_info or self._downloader.params.get('youtube_include_dash_manifest', True):
561b456e 1848 add_dash_mpd_pr(player_response)
bbb7c3f7
YCH
1849
1850 def extract_unavailable_message():
0add33ab
S
1851 messages = []
1852 for tag, kind in (('h1', 'message'), ('div', 'submessage')):
1853 msg = self._html_search_regex(
1854 r'(?s)<{tag}[^>]+id=["\']unavailable-{kind}["\'][^>]*>(.+?)</{tag}>'.format(tag=tag, kind=kind),
1855 video_webpage, 'unavailable %s' % kind, default=None)
1856 if msg:
1857 messages.append(msg)
1858 if messages:
1859 return '\n'.join(messages)
bbb7c3f7 1860
f93abcf1 1861 if not video_info and not player_response:
15be3eb5
RA
1862 unavailable_message = extract_unavailable_message()
1863 if not unavailable_message:
1864 unavailable_message = 'Unable to extract video data'
1865 raise ExtractorError(
1866 'YouTube said: %s' % unavailable_message, expected=True, video_id=video_id)
1867
f93abcf1
S
1868 if not isinstance(video_info, dict):
1869 video_info = {}
1870
dbdaaa23
S
1871 video_details = try_get(
1872 player_response, lambda x: x['videoDetails'], dict) or {}
1873
37357d21
S
1874 microformat = try_get(
1875 player_response, lambda x: x['microformat']['playerMicroformatRenderer'], dict) or {}
1876
8dbf751a
RA
1877 video_title = video_info.get('title', [None])[0] or video_details.get('title')
1878 if not video_title:
cf7e015f
S
1879 self._downloader.report_warning('Unable to extract video title')
1880 video_title = '_'
1881
9cafc3fd 1882 description_original = video_description = get_element_by_id("eow-description", video_webpage)
cf7e015f 1883 if video_description:
fa4bc6e7
RA
1884
1885 def replace_url(m):
1886 redir_url = compat_urlparse.urljoin(url, m.group(1))
1887 parsed_redir_url = compat_urllib_parse_urlparse(redir_url)
1888 if re.search(r'^(?:www\.)?(?:youtube(?:-nocookie)?\.com|youtu\.be)$', parsed_redir_url.netloc) and parsed_redir_url.path == '/redirect':
1889 qs = compat_parse_qs(parsed_redir_url.query)
1890 q = qs.get('q')
1891 if q and q[0]:
1892 return q[0]
1893 return redir_url
1894
9cafc3fd 1895 description_original = video_description = re.sub(r'''(?x)
cf7e015f 1896 <a\s+
25cb7a0e 1897 (?:[a-zA-Z-]+="[^"]*"\s+)*?
23f13e97 1898 (?:title|href)="([^"]+)"\s+
25cb7a0e 1899 (?:[a-zA-Z-]+="[^"]*"\s+)*?
525cedb9 1900 class="[^"]*"[^>]*>
23f13e97 1901 [^<]+\.{3}\s*
cf7e015f 1902 </a>
fa4bc6e7 1903 ''', replace_url, video_description)
cf7e015f
S
1904 video_description = clean_html(video_description)
1905 else:
ea74e00b
DP
1906 video_description = video_details.get('shortDescription')
1907 if video_description is None:
1908 video_description = self._html_search_meta('description', video_webpage)
cf7e015f 1909
8fe10494 1910 if not smuggled_data.get('force_singlefeed', False):
5e1eddb9 1911 if not self._downloader.params.get('noplaylist'):
8fe10494
S
1912 multifeed_metadata_list = try_get(
1913 player_response,
1914 lambda x: x['multicamera']['playerLegacyMulticameraRenderer']['metadataList'],
1915 compat_str) or try_get(
1916 video_info, lambda x: x['multifeed_metadata_list'][0], compat_str)
1917 if multifeed_metadata_list:
1918 entries = []
1919 feed_ids = []
1920 for feed in multifeed_metadata_list.split(','):
1921 # Unquote should take place before split on comma (,) since textual
1922 # fields may contain comma as well (see
067aa17e 1923 # https://github.com/ytdl-org/youtube-dl/issues/8536)
8fe10494 1924 feed_data = compat_parse_qs(compat_urllib_parse_unquote_plus(feed))
6b09401b
S
1925
1926 def feed_entry(name):
1927 return try_get(feed_data, lambda x: x[name][0], compat_str)
1928
1929 feed_id = feed_entry('id')
1930 if not feed_id:
1931 continue
1932 feed_title = feed_entry('title')
1933 title = video_title
1934 if feed_title:
1935 title += ' (%s)' % feed_title
8fe10494
S
1936 entries.append({
1937 '_type': 'url_transparent',
1938 'ie_key': 'Youtube',
1939 'url': smuggle_url(
1940 '%s://www.youtube.com/watch?v=%s' % (proto, feed_data['id'][0]),
1941 {'force_singlefeed': True}),
6b09401b 1942 'title': title,
8fe10494 1943 })
6b09401b 1944 feed_ids.append(feed_id)
8fe10494
S
1945 self.to_screen(
1946 'Downloading multifeed video (%s) - add --no-playlist to just download video %s'
1947 % (', '.join(feed_ids), video_id))
1948 return self.playlist_result(entries, video_id, video_title, video_description)
1949 else:
1950 self.to_screen('Downloading just video %s because of --no-playlist' % video_id)
cf7e015f 1951
c7121fa7 1952 if view_count is None:
1c9c8de2 1953 view_count = extract_view_count(video_info)
dbdaaa23
S
1954 if view_count is None and video_details:
1955 view_count = int_or_none(video_details.get('viewCount'))
7b16239a
S
1956 if view_count is None and microformat:
1957 view_count = int_or_none(microformat.get('viewCount'))
1d699755 1958
27019dbb 1959 if is_live is None:
898238e9 1960 is_live = bool_or_none(video_details.get('isLive'))
27019dbb 1961
321bf820 1962 has_live_chat_replay = False
f0f76a33 1963 if not is_live:
321bf820 1964 yt_initial_data = self._get_yt_initial_data(video_id, video_webpage)
1965 try:
1966 yt_initial_data['contents']['twoColumnWatchNextResults']['conversationBar']['liveChatRenderer']['continuations'][0]['reloadContinuationData']['continuation']
1967 has_live_chat_replay = True
f0f76a33 1968 except (KeyError, IndexError, TypeError):
321bf820 1969 pass
1970
c5e8d7af
PH
1971 # Check for "rental" videos
1972 if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
067aa17e 1973 raise ExtractorError('"rental" videos not supported. See https://github.com/ytdl-org/youtube-dl/issues/359 for more information.', expected=True)
c5e8d7af 1974
c63ca0ee
S
1975 def _extract_filesize(media_url):
1976 return int_or_none(self._search_regex(
1977 r'\bclen[=/](\d+)', media_url, 'filesize', default=None))
1978
bf1317d2
S
1979 streaming_formats = try_get(player_response, lambda x: x['streamingData']['formats'], list) or []
1980 streaming_formats.extend(try_get(player_response, lambda x: x['streamingData']['adaptiveFormats'], list) or [])
1981
c5e8d7af
PH
1982 if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
1983 self.report_rtmp_download()
dd27fd17
PH
1984 formats = [{
1985 'format_id': '_rtmp',
1986 'protocol': 'rtmp',
1987 'url': video_info['conn'][0],
1988 'player_url': player_url,
1989 }]
bf1317d2 1990 elif not is_live and (streaming_formats or len(video_info.get('url_encoded_fmt_stream_map', [''])[0]) >= 1 or len(video_info.get('adaptive_fmts', [''])[0]) >= 1):
5f6a1245 1991 encoded_url_map = video_info.get('url_encoded_fmt_stream_map', [''])[0] + ',' + video_info.get('adaptive_fmts', [''])[0]
00fe14fc 1992 if 'rtmpe%3Dyes' in encoded_url_map:
067aa17e 1993 raise ExtractorError('rtmpe downloads are not supported, see https://github.com/ytdl-org/youtube-dl/issues/343 for more information.', expected=True)
bf1317d2 1994 formats = []
3318832e 1995 formats_spec = {}
82156fdb 1996 fmt_list = video_info.get('fmt_list', [''])[0]
1997 if fmt_list:
1998 for fmt in fmt_list.split(','):
1999 spec = fmt.split('/')
3318832e 2000 if len(spec) > 1:
2001 width_height = spec[1].split('x')
2002 if len(width_height) == 2:
2003 formats_spec[spec[0]] = {
2004 'resolution': spec[1],
2005 'width': int_or_none(width_height[0]),
2006 'height': int_or_none(width_height[1]),
2007 }
bf1317d2
S
2008 for fmt in streaming_formats:
2009 itag = str_or_none(fmt.get('itag'))
2010 if not itag:
201e9eaa 2011 continue
bf1317d2
S
2012 quality = fmt.get('quality')
2013 quality_label = fmt.get('qualityLabel') or quality
2014 formats_spec[itag] = {
2015 'asr': int_or_none(fmt.get('audioSampleRate')),
2016 'filesize': int_or_none(fmt.get('contentLength')),
2017 'format_note': quality_label,
2018 'fps': int_or_none(fmt.get('fps')),
2019 'height': int_or_none(fmt.get('height')),
bf1317d2
S
2020 # bitrate for itag 43 is always 2147483647
2021 'tbr': float_or_none(fmt.get('averageBitrate') or fmt.get('bitrate'), 1000) if itag != '43' else None,
2022 'width': int_or_none(fmt.get('width')),
2023 }
2024
2025 for fmt in streaming_formats:
00eb865b 2026 if fmt.get('drmFamilies') or fmt.get('drm_families'):
bf1317d2
S
2027 continue
2028 url = url_or_none(fmt.get('url'))
2029
2030 if not url:
fa3db383 2031 cipher = fmt.get('cipher') or fmt.get('signatureCipher')
bf1317d2
S
2032 if not cipher:
2033 continue
2034 url_data = compat_parse_qs(cipher)
2035 url = url_or_none(try_get(url_data, lambda x: x['url'][0], compat_str))
2036 if not url:
2037 continue
2038 else:
2039 cipher = None
2040 url_data = compat_parse_qs(compat_urllib_parse_urlparse(url).query)
2041
2f483bc1
S
2042 stream_type = int_or_none(try_get(url_data, lambda x: x['stream_type'][0]))
2043 # Unsupported FORMAT_STREAM_TYPE_OTF
2044 if stream_type == 3:
2045 continue
6449cd80 2046
bf1317d2
S
2047 format_id = fmt.get('itag') or url_data['itag'][0]
2048 if not format_id:
2049 continue
2050 format_id = compat_str(format_id)
a49eccdf 2051
bf1317d2
S
2052 if cipher:
2053 if 's' in url_data or self._downloader.params.get('youtube_include_dash_manifest', True):
67b19799 2054 ASSETS_RE = r'(?:"assets":.+?"js":\s*("[^"]+"))|(?:"jsUrl":\s*("[^"]+"))'
bf1317d2
S
2055 jsplayer_url_json = self._search_regex(
2056 ASSETS_RE,
2057 embed_webpage if age_gate else video_webpage,
2058 'JS player URL (1)', default=None)
2059 if not jsplayer_url_json and not age_gate:
2060 # We need the embed website after all
2061 if embed_webpage is None:
2062 embed_url = proto + '://www.youtube.com/embed/%s' % video_id
2063 embed_webpage = self._download_webpage(
2064 embed_url, video_id, 'Downloading embed webpage')
2065 jsplayer_url_json = self._search_regex(
2066 ASSETS_RE, embed_webpage, 'JS player URL')
2067
2068 player_url = json.loads(jsplayer_url_json)
cf010131 2069 if player_url is None:
bf1317d2
S
2070 player_url_json = self._search_regex(
2071 r'ytplayer\.config.*?"url"\s*:\s*("[^"]+")',
2072 video_webpage, 'age gate player URL')
2073 player_url = json.loads(player_url_json)
2074
2075 if 'sig' in url_data:
2076 url += '&signature=' + url_data['sig'][0]
2077 elif 's' in url_data:
2078 encrypted_sig = url_data['s'][0]
2079
2080 if self._downloader.params.get('verbose'):
2081 if player_url is None:
bf1317d2 2082 player_desc = 'unknown'
cf010131 2083 else:
e40c758c
S
2084 player_type, player_version = self._extract_player_info(player_url)
2085 player_desc = '%s player %s' % ('flash' if player_type == 'swf' else 'html5', player_version)
bf1317d2
S
2086 parts_sizes = self._signature_cache_id(encrypted_sig)
2087 self.to_screen('{%s} signature length %s, %s' %
2088 (format_id, parts_sizes, player_desc))
2089
2090 signature = self._decrypt_signature(
2091 encrypted_sig, video_id, player_url, age_gate)
2092 sp = try_get(url_data, lambda x: x['sp'][0], compat_str) or 'signature'
2093 url += '&%s=%s' % (sp, signature)
201e9eaa
PH
2094 if 'ratebypass' not in url:
2095 url += '&ratebypass=yes'
c9afb51c 2096
94278f72
YCH
2097 dct = {
2098 'format_id': format_id,
2099 'url': url,
2100 'player_url': player_url,
2101 }
2102 if format_id in self._formats:
2103 dct.update(self._formats[format_id])
3318832e 2104 if format_id in formats_spec:
2105 dct.update(formats_spec[format_id])
94278f72 2106
aabc2be6 2107 # Some itags are not included in DASH manifest thus corresponding formats will
067aa17e 2108 # lack metadata (see https://github.com/ytdl-org/youtube-dl/pull/5993).
aabc2be6
S
2109 # Trying to extract metadata from url_encoded_fmt_stream_map entry.
2110 mobj = re.search(r'^(?P<width>\d+)[xX](?P<height>\d+)$', url_data.get('size', [''])[0])
2111 width, height = (int(mobj.group('width')), int(mobj.group('height'))) if mobj else (None, None)
94278f72 2112
bf1317d2
S
2113 if width is None:
2114 width = int_or_none(fmt.get('width'))
2115 if height is None:
2116 height = int_or_none(fmt.get('height'))
2117
c63ca0ee
S
2118 filesize = int_or_none(url_data.get(
2119 'clen', [None])[0]) or _extract_filesize(url)
2120
bf1317d2
S
2121 quality = url_data.get('quality', [None])[0] or fmt.get('quality')
2122 quality_label = url_data.get('quality_label', [None])[0] or fmt.get('qualityLabel')
2123
4878759f
S
2124 tbr = (float_or_none(url_data.get('bitrate', [None])[0], 1000)
2125 or float_or_none(fmt.get('bitrate'), 1000)) if format_id != '43' else None
bf1317d2 2126 fps = int_or_none(url_data.get('fps', [None])[0]) or int_or_none(fmt.get('fps'))
54fc90aa 2127
94278f72 2128 more_fields = {
c63ca0ee 2129 'filesize': filesize,
bf1317d2 2130 'tbr': tbr,
c9afb51c
AH
2131 'width': width,
2132 'height': height,
bf1317d2
S
2133 'fps': fps,
2134 'format_note': quality_label or quality,
c9afb51c 2135 }
94278f72
YCH
2136 for key, value in more_fields.items():
2137 if value:
2138 dct[key] = value
bf1317d2 2139 type_ = url_data.get('type', [None])[0] or fmt.get('mimeType')
aabc2be6
S
2140 if type_:
2141 type_split = type_.split(';')
2142 kind_ext = type_split[0].split('/')
2143 if len(kind_ext) == 2:
94278f72
YCH
2144 kind, _ = kind_ext
2145 dct['ext'] = mimetype2ext(type_split[0])
aabc2be6
S
2146 if kind in ('audio', 'video'):
2147 codecs = None
2148 for mobj in re.finditer(
2149 r'(?P<key>[a-zA-Z_-]+)=(?P<quote>["\']?)(?P<val>.+?)(?P=quote)(?:;|$)', type_):
2150 if mobj.group('key') == 'codecs':
2151 codecs = mobj.group('val')
2152 break
2153 if codecs:
6310acf5 2154 dct.update(parse_codecs(codecs))
e4a60912
S
2155 if dct.get('acodec') == 'none' or dct.get('vcodec') == 'none':
2156 dct['downloader_options'] = {
2157 # Youtube throttles chunks >~10M
2158 'http_chunk_size': 10485760,
2159 }
aabc2be6 2160 formats.append(dct)
c5e8d7af 2161 else:
c3e54389
S
2162 manifest_url = (
2163 url_or_none(try_get(
2164 player_response,
2165 lambda x: x['streamingData']['hlsManifestUrl'],
3089bc74
S
2166 compat_str))
2167 or url_or_none(try_get(
c3e54389
S
2168 video_info, lambda x: x['hlsvp'][0], compat_str)))
2169 if manifest_url:
2170 formats = []
2171 m3u8_formats = self._extract_m3u8_formats(
2172 manifest_url, video_id, 'mp4', fatal=False)
2173 for a_format in m3u8_formats:
2174 itag = self._search_regex(
2175 r'/itag/(\d+)/', a_format['url'], 'itag', default=None)
2176 if itag:
2177 a_format['format_id'] = itag
2178 if itag in self._formats:
2179 dct = self._formats[itag].copy()
2180 dct.update(a_format)
2181 a_format = dct
2182 a_format['player_url'] = player_url
2183 # Accept-Encoding header causes failures in live streams on Youtube and Youtube Gaming
2184 a_format.setdefault('http_headers', {})['Youtubedl-no-compression'] = 'True'
78895bd3
U
2185 if self._downloader.params.get('youtube_include_hls_manifest', True):
2186 formats.append(a_format)
c3e54389 2187 else:
13577349 2188 error_message = extract_unavailable_message()
c3e54389 2189 if not error_message:
13577349
S
2190 error_message = clean_html(try_get(
2191 player_response, lambda x: x['playabilityStatus']['reason'],
2192 compat_str))
2193 if not error_message:
2194 error_message = clean_html(
2195 try_get(video_info, lambda x: x['reason'][0], compat_str))
c3e54389
S
2196 if error_message:
2197 raise ExtractorError(error_message, expected=True)
2198 raise ExtractorError('no conn, hlsvp, hlsManifestUrl or url_encoded_fmt_stream_map information found in video info')
c5e8d7af 2199
7e72694b 2200 # uploader
dbdaaa23
S
2201 video_uploader = try_get(
2202 video_info, lambda x: x['author'][0],
2203 compat_str) or str_or_none(video_details.get('author'))
7e72694b
S
2204 if video_uploader:
2205 video_uploader = compat_urllib_parse_unquote_plus(video_uploader)
2206 else:
2207 self._downloader.report_warning('unable to extract uploader name')
2208
2209 # uploader_id
2210 video_uploader_id = None
2211 video_uploader_url = None
2212 mobj = re.search(
2213 r'<link itemprop="url" href="(?P<uploader_url>https?://www\.youtube\.com/(?:user|channel)/(?P<uploader_id>[^"]+))">',
2214 video_webpage)
2215 if mobj is not None:
2216 video_uploader_id = mobj.group('uploader_id')
2217 video_uploader_url = mobj.group('uploader_url')
a6211d23
S
2218 else:
2219 owner_profile_url = url_or_none(microformat.get('ownerProfileUrl'))
2220 if owner_profile_url:
2221 video_uploader_id = self._search_regex(
2222 r'(?:user|channel)/([^/]+)', owner_profile_url, 'uploader id',
2223 default=None)
2224 video_uploader_url = owner_profile_url
7e72694b 2225
b45a9e69 2226 channel_id = (
3089bc74
S
2227 str_or_none(video_details.get('channelId'))
2228 or self._html_search_meta(
2229 'channelId', video_webpage, 'channel id', default=None)
2230 or self._search_regex(
b45a9e69 2231 r'data-channel-external-id=(["\'])(?P<id>(?:(?!\1).)+)\1',
2232 video_webpage, 'channel id', default=None, group='id'))
dd4c4492
S
2233 channel_url = 'http://www.youtube.com/channel/%s' % channel_id if channel_id else None
2234
b477fc13
S
2235 thumbnails = []
2236 thumbnails_list = try_get(
2237 video_details, lambda x: x['thumbnail']['thumbnails'], list) or []
2238 for t in thumbnails_list:
2239 if not isinstance(t, dict):
2240 continue
2241 thumbnail_url = url_or_none(t.get('url'))
2242 if not thumbnail_url:
2243 continue
2244 thumbnails.append({
2245 'url': thumbnail_url,
2246 'width': int_or_none(t.get('width')),
2247 'height': int_or_none(t.get('height')),
2248 })
2249
2250 if not thumbnails:
7e72694b 2251 video_thumbnail = None
b477fc13
S
2252 # We try first to get a high quality image:
2253 m_thumb = re.search(r'<span itemprop="thumbnail".*?href="(.*?)">',
2254 video_webpage, re.DOTALL)
2255 if m_thumb is not None:
2256 video_thumbnail = m_thumb.group(1)
2257 thumbnail_url = try_get(video_info, lambda x: x['thumbnail_url'][0], compat_str)
2258 if thumbnail_url:
2259 video_thumbnail = compat_urllib_parse_unquote_plus(thumbnail_url)
2260 if video_thumbnail:
2261 thumbnails.append({'url': video_thumbnail})
7e72694b
S
2262
2263 # upload date
2264 upload_date = self._html_search_meta(
2265 'datePublished', video_webpage, 'upload date', default=None)
2266 if not upload_date:
2267 upload_date = self._search_regex(
2268 [r'(?s)id="eow-date.*?>(.*?)</span>',
2269 r'(?:id="watch-uploader-info".*?>.*?|["\']simpleText["\']\s*:\s*["\'])(?:Published|Uploaded|Streamed live|Started) on (.+?)[<"\']'],
2270 video_webpage, 'upload date', default=None)
37357d21
S
2271 if not upload_date:
2272 upload_date = microformat.get('publishDate') or microformat.get('uploadDate')
7e72694b
S
2273 upload_date = unified_strdate(upload_date)
2274
2275 video_license = self._html_search_regex(
2276 r'<h4[^>]+class="title"[^>]*>\s*License\s*</h4>\s*<ul[^>]*>\s*<li>(.+?)</li',
2277 video_webpage, 'license', default=None)
2278
2279 m_music = re.search(
2280 r'''(?x)
2281 <h4[^>]+class="title"[^>]*>\s*Music\s*</h4>\s*
2282 <ul[^>]*>\s*
2283 <li>(?P<title>.+?)
2284 by (?P<creator>.+?)
2285 (?:
2286 \(.+?\)|
2287 <a[^>]*
2288 (?:
2289 \bhref=["\']/red[^>]*>| # drop possible
2290 >\s*Listen ad-free with YouTube Red # YouTube Red ad
2291 )
2292 .*?
2293 )?</li
2294 ''',
2295 video_webpage)
2296 if m_music:
2297 video_alt_title = remove_quotes(unescapeHTML(m_music.group('title')))
2298 video_creator = clean_html(m_music.group('creator'))
2299 else:
2300 video_alt_title = video_creator = None
2301
2302 def extract_meta(field):
2303 return self._html_search_regex(
2304 r'<h4[^>]+class="title"[^>]*>\s*%s\s*</h4>\s*<ul[^>]*>\s*<li>(.+?)</li>\s*' % field,
2305 video_webpage, field, default=None)
2306
2307 track = extract_meta('Song')
2308 artist = extract_meta('Artist')
92bc97d3 2309 album = extract_meta('Album')
822b9d9c
RA
2310
2311 # Youtube Music Auto-generated description
92bc97d3 2312 release_date = release_year = None
822b9d9c
RA
2313 if video_description:
2314 mobj = re.search(r'(?s)Provided to YouTube by [^\n]+\n+(?P<track>[^·]+)·(?P<artist>[^\n]+)\n+(?P<album>[^\n]+)(?:.+?℗\s*(?P<release_year>\d{4})(?!\d))?(?:.+?Released on\s*:\s*(?P<release_date>\d{4}-\d{2}-\d{2}))?(.+?\nArtist\s*:\s*(?P<clean_artist>[^\n]+))?', video_description)
2315 if mobj:
2316 if not track:
2317 track = mobj.group('track').strip()
2318 if not artist:
2319 artist = mobj.group('clean_artist') or ', '.join(a.strip() for a in mobj.group('artist').split('·'))
92bc97d3
RA
2320 if not album:
2321 album = mobj.group('album'.strip())
822b9d9c
RA
2322 release_year = mobj.group('release_year')
2323 release_date = mobj.group('release_date')
2324 if release_date:
2325 release_date = release_date.replace('-', '')
2326 if not release_year:
2327 release_year = int(release_date[:4])
2328 if release_year:
2329 release_year = int(release_year)
7e72694b
S
2330
2331 m_episode = re.search(
2332 r'<div[^>]+id="watch7-headline"[^>]*>\s*<span[^>]*>.*?>(?P<series>[^<]+)</a></b>\s*S(?P<season>\d+)\s*•\s*E(?P<episode>\d+)</span>',
2333 video_webpage)
2334 if m_episode:
c2dd2dc0 2335 series = unescapeHTML(m_episode.group('series'))
7e72694b
S
2336 season_number = int(m_episode.group('season'))
2337 episode_number = int(m_episode.group('episode'))
2338 else:
2339 series = season_number = episode_number = None
2340
2341 m_cat_container = self._search_regex(
2342 r'(?s)<h4[^>]*>\s*Category\s*</h4>\s*<ul[^>]*>(.*?)</ul>',
2343 video_webpage, 'categories', default=None)
dbeafce5 2344 category = None
7e72694b
S
2345 if m_cat_container:
2346 category = self._html_search_regex(
2347 r'(?s)<a[^<]+>(.*?)</a>', m_cat_container, 'category',
2348 default=None)
dbeafce5
S
2349 if not category:
2350 category = try_get(
2351 microformat, lambda x: x['category'], compat_str)
2352 video_categories = None if category is None else [category]
7e72694b
S
2353
2354 video_tags = [
2355 unescapeHTML(m.group('content'))
2356 for m in re.finditer(self._meta_regex('og:video:tag'), video_webpage)]
dbeafce5
S
2357 if not video_tags:
2358 video_tags = try_get(video_details, lambda x: x['keywords'], list)
7e72694b
S
2359
2360 def _extract_count(count_name):
2361 return str_to_int(self._search_regex(
a6c666d0 2362 r'"accessibilityData":\{"label":"([\d,\w]+) %ss"\}'
7e72694b
S
2363 % re.escape(count_name),
2364 video_webpage, count_name, default=None))
2365
2366 like_count = _extract_count('like')
2367 dislike_count = _extract_count('dislike')
2368
dbdaaa23
S
2369 if view_count is None:
2370 view_count = str_to_int(self._search_regex(
2371 r'<[^>]+class=["\']watch-view-count[^>]+>\s*([\d,\s]+)', video_webpage,
2372 'view count', default=None))
2373
bf3c9326
S
2374 average_rating = (
2375 float_or_none(video_details.get('averageRating'))
2376 or try_get(video_info, lambda x: float_or_none(x['avg_rating'][0])))
2377
7e72694b 2378 # subtitles
321bf820 2379 video_subtitles = self.extract_subtitles(
2380 video_id, video_webpage, has_live_chat_replay)
7e72694b
S
2381 automatic_captions = self.extract_automatic_captions(video_id, video_webpage)
2382
2383 video_duration = try_get(
2384 video_info, lambda x: int_or_none(x['length_seconds'][0]))
dbdaaa23
S
2385 if not video_duration:
2386 video_duration = int_or_none(video_details.get('lengthSeconds'))
7e72694b
S
2387 if not video_duration:
2388 video_duration = parse_duration(self._html_search_meta(
2389 'duration', video_webpage, 'video duration'))
2390
b84071c0
JP
2391 # Get Subscriber Count of channel
2392 subscriber_count = parse_count(self._search_regex(
2393 r'"text":"([\d\.]+\w?) subscribers"',
2394 video_webpage,
2395 'subscriber count',
2396 default=None
2397 ))
2398
7e72694b
S
2399 # annotations
2400 video_annotations = None
2401 if self._downloader.params.get('writeannotations', False):
64b6a4e9
RA
2402 xsrf_token = self._search_regex(
2403 r'([\'"])XSRF_TOKEN\1\s*:\s*([\'"])(?P<xsrf_token>[A-Za-z0-9+/=]+)\2',
2404 video_webpage, 'xsrf token', group='xsrf_token', fatal=False)
2405 invideo_url = try_get(
2406 player_response, lambda x: x['annotations'][0]['playerAnnotationsUrlsRenderer']['invideoUrl'], compat_str)
2407 if xsrf_token and invideo_url:
2408 xsrf_field_name = self._search_regex(
2409 r'([\'"])XSRF_FIELD_NAME\1\s*:\s*([\'"])(?P<xsrf_field_name>\w+)\2',
2410 video_webpage, 'xsrf field name',
2411 group='xsrf_field_name', default='session_token')
2412 video_annotations = self._download_webpage(
2413 self._proto_relative_url(invideo_url),
2414 video_id, note='Downloading annotations',
2415 errnote='Unable to download video annotations', fatal=False,
2416 data=urlencode_postdata({xsrf_field_name: xsrf_token}))
7e72694b 2417
84213ea8 2418 chapters = self._extract_chapters(video_webpage, description_original, video_id, video_duration)
7e72694b 2419
dd27fd17 2420 # Look for the DASH manifest
203fb43f 2421 if self._downloader.params.get('youtube_include_dash_manifest', True):
77c6fb5b 2422 dash_mpd_fatal = True
8ff648e4 2423 for mpd_url in dash_mpds:
d8d24a92 2424 dash_formats = {}
774e208f 2425 try:
05d0d131
YCH
2426 def decrypt_sig(mobj):
2427 s = mobj.group(1)
2428 dec_s = self._decrypt_signature(s, video_id, player_url, age_gate)
2429 return '/signature/%s' % dec_s
2430
8ff648e4 2431 mpd_url = re.sub(r'/s/([a-fA-F0-9\.]+)', decrypt_sig, mpd_url)
2d2fa82d 2432
8ff648e4 2433 for df in self._extract_mpd_formats(
2434 mpd_url, video_id, fatal=dash_mpd_fatal,
2435 formats_dict=self._formats):
c63ca0ee
S
2436 if not df.get('filesize'):
2437 df['filesize'] = _extract_filesize(df['url'])
d8d24a92
S
2438 # Do not overwrite DASH format found in some previous DASH manifest
2439 if df['format_id'] not in dash_formats:
2440 dash_formats[df['format_id']] = df
77c6fb5b
S
2441 # Additional DASH manifests may end up in HTTP Error 403 therefore
2442 # allow them to fail without bug report message if we already have
2443 # some DASH manifest succeeded. This is temporary workaround to reduce
2444 # burst of bug reports until we figure out the reason and whether it
2445 # can be fixed at all.
2446 dash_mpd_fatal = False
774e208f
PH
2447 except (ExtractorError, KeyError) as e:
2448 self.report_warning(
2449 'Skipping DASH manifest: %r' % e, video_id)
d8d24a92 2450 if dash_formats:
04b3b3df
JMF
2451 # Remove the formats we found through non-DASH, they
2452 # contain less info and it can be wrong, because we use
2453 # fixed values (for example the resolution). See
067aa17e 2454 # https://github.com/ytdl-org/youtube-dl/issues/5774 for an
04b3b3df 2455 # example.
d80265cc 2456 formats = [f for f in formats if f['format_id'] not in dash_formats.keys()]
d8d24a92 2457 formats.extend(dash_formats.values())
d80044c2 2458
6271f1ca
PH
2459 # Check for malformed aspect ratio
2460 stretched_m = re.search(
2461 r'<meta\s+property="og:video:tag".*?content="yt:stretch=(?P<w>[0-9]+):(?P<h>[0-9]+)">',
2462 video_webpage)
2463 if stretched_m:
313dfc45
LL
2464 w = float(stretched_m.group('w'))
2465 h = float(stretched_m.group('h'))
5faf9fed
S
2466 # yt:stretch may hold invalid ratio data (e.g. for Q39EVAstoRM ratio is 17:0).
2467 # We will only process correct ratios.
313dfc45 2468 if w > 0 and h > 0:
41f24c32 2469 ratio = w / h
313dfc45
LL
2470 for f in formats:
2471 if f.get('vcodec') != 'none':
2472 f['stretched_ratio'] = ratio
6271f1ca 2473
026fbedc 2474 if not formats:
43ebf77d
S
2475 if 'reason' in video_info:
2476 if 'The uploader has not made this video available in your country.' in video_info['reason']:
2477 regions_allowed = self._html_search_meta(
2478 'regionsAllowed', video_webpage, default=None)
2479 countries = regions_allowed.split(',') if regions_allowed else None
2480 self.raise_geo_restricted(
2481 msg=video_info['reason'][0], countries=countries)
2482 reason = video_info['reason'][0]
2483 if 'Invalid parameters' in reason:
2484 unavailable_message = extract_unavailable_message()
2485 if unavailable_message:
2486 reason = unavailable_message
2487 raise ExtractorError(
2488 'YouTube said: %s' % reason,
2489 expected=True, video_id=video_id)
2490 if video_info.get('license_info') or try_get(player_response, lambda x: x['streamingData']['licenseInfos']):
2491 raise ExtractorError('This video is DRM protected.', expected=True)
0d297518 2492
4bcc7bd1 2493 self._sort_formats(formats)
4ea3be0a 2494
21c340b8 2495 self.mark_watched(video_id, video_info, player_response)
d77ab8e2 2496
4ea3be0a 2497 return {
8bcc8756
JW
2498 'id': video_id,
2499 'uploader': video_uploader,
2500 'uploader_id': video_uploader_id,
fd050249 2501 'uploader_url': video_uploader_url,
dd4c4492
S
2502 'channel_id': channel_id,
2503 'channel_url': channel_url,
8bcc8756 2504 'upload_date': upload_date,
7caf9830 2505 'license': video_license,
936784b2 2506 'creator': video_creator or artist,
8bcc8756 2507 'title': video_title,
936784b2 2508 'alt_title': video_alt_title or track,
b477fc13 2509 'thumbnails': thumbnails,
8bcc8756
JW
2510 'description': video_description,
2511 'categories': video_categories,
000b6b5a 2512 'tags': video_tags,
8bcc8756 2513 'subtitles': video_subtitles,
360e1ca5 2514 'automatic_captions': automatic_captions,
8bcc8756
JW
2515 'duration': video_duration,
2516 'age_limit': 18 if age_gate else 0,
2517 'annotations': video_annotations,
9cafc3fd 2518 'chapters': chapters,
7e8c0af0 2519 'webpage_url': proto + '://www.youtube.com/watch?v=%s' % video_id,
8bcc8756 2520 'view_count': view_count,
4ea3be0a 2521 'like_count': like_count,
2522 'dislike_count': dislike_count,
bf3c9326 2523 'average_rating': average_rating,
8bcc8756 2524 'formats': formats,
2fe1ff85 2525 'is_live': is_live,
7c80519c 2526 'start_time': start_time,
297a564b 2527 'end_time': end_time,
12afdc2a
S
2528 'series': series,
2529 'season_number': season_number,
2530 'episode_number': episode_number,
936784b2
S
2531 'track': track,
2532 'artist': artist,
5caabd3c 2533 'album': album,
2534 'release_date': release_date,
2535 'release_year': release_year,
b84071c0 2536 'subscriber_count': subscriber_count,
4ea3be0a 2537 }
c5e8d7af 2538
5f6a1245 2539
8e7aad20 2540class YoutubePlaylistIE(YoutubePlaylistBaseInfoExtractor):
78caa52a 2541 IE_DESC = 'YouTube.com playlists'
d67cc9fa 2542 _VALID_URL = r"""(?x)(?:
c5e8d7af
PH
2543 (?:https?://)?
2544 (?:\w+\.)?
c5e8d7af 2545 (?:
c0345b82 2546 (?:
66b48727 2547 youtube(?:kids)?\.com|
c0345b82
S
2548 invidio\.us
2549 )
2550 /
feaa5ad7 2551 (?:
87dadd45 2552 (?:course|view_play_list|my_playlists|artist|playlist|watch|embed/(?:videoseries|[0-9A-Za-z_-]{11}))
feaa5ad7
S
2553 \? (?:.*?[&;])*? (?:p|a|list)=
2554 | p/
2555 )|
2556 youtu\.be/[0-9A-Za-z_-]{11}\?.*?\blist=
c5e8d7af 2557 )
d67cc9fa 2558 (
66b48727 2559 (?:PL|LL|EC|UU|FL|RD|UL|TL|PU|OLAK5uy_)?[0-9A-Za-z-_]{10,}
5f6a1245 2560 # Top tracks, they can also include dots
d67cc9fa
JMF
2561 |(?:MC)[\w\.]*
2562 )
c5e8d7af
PH
2563 .*
2564 |
d0ba5587
S
2565 (%(playlist_id)s)
2566 )""" % {'playlist_id': YoutubeBaseInfoExtractor._PLAYLIST_ID_RE}
8d81f3e3 2567 _TEMPLATE_URL = 'https://www.youtube.com/playlist?list=%s'
351f37c0
S
2568 _VIDEO_RE_TPL = r'href="\s*/watch\?v=%s(?:&amp;(?:[^"]*?index=(?P<index>\d+))?(?:[^>]+>(?P<title>[^<]+))?)?'
2569 _VIDEO_RE = _VIDEO_RE_TPL % r'(?P<id>[0-9A-Za-z_-]{11})'
78caa52a 2570 IE_NAME = 'youtube:playlist'
81127aa5 2571 _TESTS = [{
0e30a7b9 2572 'url': 'https://www.youtube.com/playlist?list=PL4lCao7KL_QFVb7Iudeipvc2BCavECqzc',
81127aa5 2573 'info_dict': {
0e30a7b9 2574 'uploader_id': 'UCmlqkdCBesrv2Lak1mF_MxA',
2575 'uploader': 'Sergey M.',
2576 'id': 'PL4lCao7KL_QFVb7Iudeipvc2BCavECqzc',
3867038a 2577 'title': 'youtube-dl public playlist',
81127aa5 2578 },
0e30a7b9 2579 'playlist_count': 1,
9291475f 2580 }, {
0e30a7b9 2581 'url': 'https://www.youtube.com/playlist?list=PL4lCao7KL_QFodcLWhDpGCYnngnHtQ-Xf',
9291475f 2582 'info_dict': {
0e30a7b9 2583 'uploader_id': 'UCmlqkdCBesrv2Lak1mF_MxA',
2584 'uploader': 'Sergey M.',
2585 'id': 'PL4lCao7KL_QFodcLWhDpGCYnngnHtQ-Xf',
3867038a 2586 'title': 'youtube-dl empty playlist',
9291475f
PH
2587 },
2588 'playlist_count': 0,
2589 }, {
2590 'note': 'Playlist with deleted videos (#651). As a bonus, the video #51 is also twice in this list.',
2591 'url': 'https://www.youtube.com/playlist?list=PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC',
2592 'info_dict': {
2593 'title': '29C3: Not my department',
acf757f4 2594 'id': 'PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC',
13a75688
S
2595 'uploader': 'Christiaan008',
2596 'uploader_id': 'ChRiStIaAn008',
9291475f 2597 },
0e30a7b9 2598 'playlist_count': 96,
9291475f
PH
2599 }, {
2600 'note': 'issue #673',
2601 'url': 'PLBB231211A4F62143',
2602 'info_dict': {
f46a8702 2603 'title': '[OLD]Team Fortress 2 (Class-based LP)',
acf757f4 2604 'id': 'PLBB231211A4F62143',
13a75688
S
2605 'uploader': 'Wickydoo',
2606 'uploader_id': 'Wickydoo',
9291475f
PH
2607 },
2608 'playlist_mincount': 26,
2609 }, {
2610 'note': 'Large playlist',
2611 'url': 'https://www.youtube.com/playlist?list=UUBABnxM4Ar9ten8Mdjj1j0Q',
2612 'info_dict': {
2613 'title': 'Uploads from Cauchemar',
acf757f4 2614 'id': 'UUBABnxM4Ar9ten8Mdjj1j0Q',
13a75688
S
2615 'uploader': 'Cauchemar',
2616 'uploader_id': 'Cauchemar89',
9291475f
PH
2617 },
2618 'playlist_mincount': 799,
2619 }, {
2620 'url': 'PLtPgu7CB4gbY9oDN3drwC3cMbJggS7dKl',
2621 'info_dict': {
2622 'title': 'YDL_safe_search',
acf757f4 2623 'id': 'PLtPgu7CB4gbY9oDN3drwC3cMbJggS7dKl',
9291475f
PH
2624 },
2625 'playlist_count': 2,
4201ba13 2626 'skip': 'This playlist is private',
ac7553d0
PH
2627 }, {
2628 'note': 'embedded',
2d3d2997 2629 'url': 'https://www.youtube.com/embed/videoseries?list=PL6IaIsEjSbf96XFRuNccS_RuEXwNdsoEu',
ac7553d0
PH
2630 'playlist_count': 4,
2631 'info_dict': {
2632 'title': 'JODA15',
acf757f4 2633 'id': 'PL6IaIsEjSbf96XFRuNccS_RuEXwNdsoEu',
13a75688
S
2634 'uploader': 'milan',
2635 'uploader_id': 'UCEI1-PVPcYXjB73Hfelbmaw',
ac7553d0 2636 }
87dadd45
S
2637 }, {
2638 'url': 'http://www.youtube.com/embed/_xDOZElKyNU?list=PLsyOSbh5bs16vubvKePAQ1x3PhKavfBIl',
2639 'playlist_mincount': 485,
2640 'info_dict': {
13a75688 2641 'title': '2018 Chinese New Singles (11/6 updated)',
87dadd45 2642 'id': 'PLsyOSbh5bs16vubvKePAQ1x3PhKavfBIl',
13a75688
S
2643 'uploader': 'LBK',
2644 'uploader_id': 'sdragonfang',
87dadd45 2645 }
6b08cdf6
PH
2646 }, {
2647 'note': 'Embedded SWF player',
2d3d2997 2648 'url': 'https://www.youtube.com/p/YN5VISEtHet5D4NEvfTd0zcgFk84NqFZ?hl=en_US&fs=1&rel=0',
6b08cdf6
PH
2649 'playlist_count': 4,
2650 'info_dict': {
2651 'title': 'JODA7',
acf757f4 2652 'id': 'YN5VISEtHet5D4NEvfTd0zcgFk84NqFZ',
13a75688
S
2653 },
2654 'skip': 'This playlist does not exist',
4b7df0d3
JMF
2655 }, {
2656 'note': 'Buggy playlist: the webpage has a "Load more" button but it doesn\'t have more videos',
2657 'url': 'https://www.youtube.com/playlist?list=UUXw-G3eDE9trcvY2sBMM_aA',
2658 'info_dict': {
acf757f4
PH
2659 'title': 'Uploads from Interstellar Movie',
2660 'id': 'UUXw-G3eDE9trcvY2sBMM_aA',
13a75688
S
2661 'uploader': 'Interstellar Movie',
2662 'uploader_id': 'InterstellarMovie1',
4b7df0d3 2663 },
481cc733 2664 'playlist_mincount': 21,
dacb3a86
S
2665 }, {
2666 # Playlist URL that does not actually serve a playlist
2667 'url': 'https://www.youtube.com/watch?v=FqZTN594JQw&list=PLMYEtVRpaqY00V9W81Cwmzp6N6vZqfUKD4',
2668 'info_dict': {
2669 'id': 'FqZTN594JQw',
2670 'ext': 'webm',
2671 'title': "Smiley's People 01 detective, Adventure Series, Action",
2672 'uploader': 'STREEM',
2673 'uploader_id': 'UCyPhqAZgwYWZfxElWVbVJng',
ec85ded8 2674 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UCyPhqAZgwYWZfxElWVbVJng',
dacb3a86
S
2675 'upload_date': '20150526',
2676 'license': 'Standard YouTube License',
2677 'description': 'md5:507cdcb5a49ac0da37a920ece610be80',
2678 'categories': ['People & Blogs'],
2679 'tags': list,
dbdaaa23 2680 'view_count': int,
dacb3a86
S
2681 'like_count': int,
2682 'dislike_count': int,
2683 },
2684 'params': {
2685 'skip_download': True,
2686 },
13a75688 2687 'skip': 'This video is not available.',
dacb3a86 2688 'add_ie': [YoutubeIE.ie_key()],
481cc733
S
2689 }, {
2690 'url': 'https://youtu.be/yeWKywCrFtk?list=PL2qgrgXsNUG5ig9cat4ohreBjYLAPC0J5',
2691 'info_dict': {
2692 'id': 'yeWKywCrFtk',
2693 'ext': 'mp4',
2694 'title': 'Small Scale Baler and Braiding Rugs',
2695 'uploader': 'Backus-Page House Museum',
2696 'uploader_id': 'backuspagemuseum',
ec85ded8 2697 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/backuspagemuseum',
481cc733 2698 'upload_date': '20161008',
481cc733
S
2699 'description': 'md5:800c0c78d5eb128500bffd4f0b4f2e8a',
2700 'categories': ['Nonprofits & Activism'],
2701 'tags': list,
2702 'like_count': int,
2703 'dislike_count': int,
2704 },
2705 'params': {
2706 'noplaylist': True,
2707 'skip_download': True,
2708 },
2e18adec
S
2709 }, {
2710 # https://github.com/ytdl-org/youtube-dl/issues/21844
2711 'url': 'https://www.youtube.com/playlist?list=PLzH6n4zXuckpfMu_4Ff8E7Z1behQks5ba',
2712 'info_dict': {
2713 'title': 'Data Analysis with Dr Mike Pound',
2714 'id': 'PLzH6n4zXuckpfMu_4Ff8E7Z1behQks5ba',
2715 'uploader_id': 'Computerphile',
2716 'uploader': 'Computerphile',
2717 },
2718 'playlist_mincount': 11,
feaa5ad7
S
2719 }, {
2720 'url': 'https://youtu.be/uWyaPkt-VOI?list=PL9D9FC436B881BA21',
2721 'only_matching': True,
a6857510
S
2722 }, {
2723 'url': 'TLGGrESM50VT6acwMjAyMjAxNw',
2724 'only_matching': True,
409b9324
S
2725 }, {
2726 # music album playlist
2727 'url': 'OLAK5uy_m4xAFdmMC5rX3Ji3g93pQe3hqLZw_9LhM',
2728 'only_matching': True,
c0345b82
S
2729 }, {
2730 'url': 'https://invidio.us/playlist?list=PLDIoUOhQQPlXr63I_vwF9GD8sAKh77dWU',
2731 'only_matching': True,
66b48727
RA
2732 }, {
2733 'url': 'https://www.youtubekids.com/watch?v=Agk7R8I8o5U&list=PUZ6jURNr1WQZCNHF0ao-c0g',
2734 'only_matching': True,
81127aa5 2735 }]
c5e8d7af 2736
880e1c52
JMF
def _real_initialize(self):
    """Initialisation hook: delegate to ``self._login()``."""
    self._login()
2739
351f37c0
S
def extract_videos_from_page(self, page):
    """Pair up the video ids and titles discovered in *page*.

    Tags carrying a ``data-video-id`` attribute are read first; their
    ``data-title`` attribute (HTML-unescaped and stripped when non-empty)
    supplies the title.  The same two accumulator lists are then handed
    to ``extract_videos_from_page_impl`` once per fallback pattern.

    Returns ``zip(ids, titles)`` over the accumulated lists.
    """
    found_ids, found_titles = [], []

    tag_re = r'(<[^>]*\bdata-video-id\s*=\s*["\'][0-9A-Za-z_-]{11}[^>]+>)'
    for tag in re.findall(tag_re, page):
        tag_attrs = extract_attributes(tag)
        vid = tag_attrs['data-video-id']
        title = unescapeHTML(tag_attrs.get('data-title'))
        found_ids.append(vid)
        # Empty/missing titles are stored as-is, only real text is stripped.
        found_titles.append(title.strip() if title else title)

    fallback_patterns = (
        # Old-style _VIDEO_RE of the class
        self._VIDEO_RE,
        # Relaxed fallbacks
        r'href="\s*/watch\?v\s*=\s*(?P<id>[0-9A-Za-z_-]{11})',
        r'data-video-ids\s*=\s*["\'](?P<id>[0-9A-Za-z_-]{11})',
    )
    for pattern in fallback_patterns:
        self.extract_videos_from_page_impl(
            pattern, page, found_ids, found_titles)

    return zip(found_ids, found_titles)
2767
def _extract_mix(self, playlist_id):
    """Build a playlist result for a mix by walking its watch pages.

    The mixes are generated from a single video; the id of the playlist
    is just 'RD' + video_id.  Starting from the last 11 characters of
    *playlist_id*, the watch page of the most recently collected video is
    downloaded (with ``list=playlist_id``) and scanned for further video
    ids.  The walk ends as soon as a page contributes no id that has not
    been seen already.

    The title is taken from the last downloaded page: the first non-empty
    element among the classes ``playlist-title``, ``title long-title`` and
    ``title``, passed through ``clean_html``.
    """
    # Verbose-mode pattern: whitespace inside it is not significant.
    entry_re = r'''(?xs)data-video-username=".*?".*?
               href="/watch\?v=([0-9A-Za-z_-]{11})&amp;[^"]*?list=%s''' % re.escape(playlist_id)

    collected = []
    seed_id = playlist_id[-11:]
    page_num = 0
    while True:
        page_num += 1
        webpage = self._download_webpage(
            'https://www.youtube.com/watch?v=%s&list=%s' % (seed_id, playlist_id),
            playlist_id,
            'Downloading page {0} of Youtube mix'.format(page_num))
        # Fetch new pages until all the videos are repeated, it seems that
        # there are always 51 unique videos.
        fresh = [
            vid for vid in orderedSet(re.findall(entry_re, webpage))
            if vid not in collected]
        if not fresh:
            break
        collected.extend(fresh)
        seed_id = collected[-1]

    entries = self._ids_to_results(collected)

    # First truthy candidate wins; otherwise the last lookup's value is kept.
    title_html = None
    for css_class in ('playlist-title', 'title long-title', 'title'):
        title_html = get_element_by_attribute('class', css_class, webpage)
        if title_html:
            break

    return self.playlist_result(entries, playlist_id, clean_html(title_html))
2799
def _extract_playlist(self, playlist_id):
    """Download the playlist page and turn it into a playlist result.

    Returns a ``(has_videos, playlist)`` tuple.  ``has_videos`` is False
    only when no playlist title was found and ``self._entries`` yields
    nothing for the page.  ``playlist`` is the ``playlist_result`` dict
    extended with ``uploader``, ``uploader_id`` and ``uploader_url``.

    Raises ExtractorError (expected) when an alert on the page says the
    playlist does not exist / is private, or reports invalid parameters.
    Any other alert, except the language chooser, is reported as a warning.
    """
    url = self._TEMPLATE_URL % playlist_id
    page = self._download_webpage(url, playlist_id)

    # the yt-alert-message now has tabindex attribute (see https://github.com/ytdl-org/youtube-dl/issues/11604)
    alerts = re.findall(r'<div class="yt-alert-message"[^>]*>([^<]+)</div>', page)
    for alert in alerts:
        alert = alert.strip()

        # Check if the playlist exists or is private
        unavailable = re.match(
            r'[^<]*(?:The|This) playlist (?P<reason>does not exist|is private)[^<]*', alert)
        if unavailable:
            reason = unavailable.group('reason')
            hint = ', use --username or --netrc to access it' if 'private' in reason else ''
            raise ExtractorError(
                'This playlist %s' % reason + hint + '.', expected=True)

        if re.match(r'[^<]*Invalid parameters[^<]*', alert):
            raise ExtractorError(
                'Invalid parameters. Maybe URL is incorrect.',
                expected=True)

        if re.match(r'[^<]*Choose your language[^<]*', alert):
            continue

        self.report_warning('Youtube gives an alert message: ' + alert)

    playlist_title = self._html_search_regex(
        r'(?s)<h1 class="pl-header-title[^"]*"[^>]*>\s*(.*?)\s*</h1>',
        page, 'title', default=None)

    # Common prefix of both uploader patterns: the first link of the
    # "pl-header-details" list.
    uploader_base = r'class=["\']pl-header-details[^>]+>\s*<li>\s*<a[^>]+\bhref='
    uploader = self._html_search_regex(
        uploader_base + r'["\']/(?:user|channel)/[^>]+>([^<]+)',
        page, 'uploader', default=None)

    uploader_id = uploader_url = None
    link = re.search(
        uploader_base + r'(["\'])(?P<path>/(?:user|channel)/(?P<uploader_id>.+?))\1',
        page)
    if link:
        uploader_id = link.group('uploader_id')
        uploader_url = compat_urlparse.urljoin(url, link.group('path'))

    has_videos = True
    if not playlist_title:
        # Some playlist URLs don't actually serve a playlist (e.g.
        # https://www.youtube.com/watch?v=FqZTN594JQw&list=PLMYEtVRpaqY00V9W81Cwmzp6N6vZqfUKD4)
        try:
            next(self._entries(page, playlist_id))
        except StopIteration:
            has_videos = False

    playlist = self.playlist_result(
        self._entries(page, playlist_id), playlist_id, playlist_title)
    playlist.update({
        'uploader': uploader,
        'uploader_id': uploader_id,
        'uploader_url': uploader_url,
    })

    return has_videos, playlist
c5e8d7af 2861
def _check_download_just_video(self, url, playlist_id):
    """Decide whether *url* should be handled as a single video.

    The video id is taken from the ``v`` query parameter or, failing
    that, from a ``youtu.be/<id>`` or ``youtube.com/embed/<id>`` URL.

    Returns ``(video_id, result)``:
      * ``(None, None)`` when the URL carries no video id;
      * ``(video_id, url_result)`` when the ``noplaylist`` parameter is set;
      * ``(video_id, None)`` otherwise (the playlist is to be downloaded).
    """
    # Check if it's a video-specific URL
    query = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query)
    video_id = query.get('v', [None])[0]
    if not video_id:
        video_id = self._search_regex(
            r'(?:(?:^|//)youtu\.be/|youtube\.com/embed/(?!videoseries))([0-9A-Za-z_-]{11})', url,
            'video id', default=None)

    if not video_id:
        return None, None

    if not self._downloader.params.get('noplaylist'):
        self.to_screen('Downloading playlist %s - add --no-playlist to just download video %s' % (playlist_id, video_id))
        return video_id, None

    self.to_screen('Downloading just video %s because of --no-playlist' % video_id)
    return video_id, self.url_result(video_id, 'Youtube', video_id=video_id)
448830ce 2876
ebf1b291
S
def _real_extract(self, url):
    """Extract a playlist, a mix, or a single video from *url*.

    Order of decisions:
      1. a single-video result from ``_check_download_just_video`` is
         returned immediately;
      2. playlist ids starting with ``RD``, ``UL`` or ``PU`` go through
         ``_extract_mix``;
      3. otherwise the regular playlist is extracted; when it has no
         videos and the URL also named a video, that video is returned.

    Raises ExtractorError if *url* does not match ``_VALID_URL``.
    """
    # Extract playlist id
    matched = re.match(self._VALID_URL, url)
    if matched is None:
        raise ExtractorError('Invalid URL: %s' % url)
    playlist_id = matched.group(1) or matched.group(2)

    video_id, single_video = self._check_download_just_video(url, playlist_id)
    if single_video:
        return single_video

    # Mixes require a custom extraction process
    if playlist_id.startswith(('RD', 'UL', 'PU')):
        return self._extract_mix(playlist_id)

    has_videos, playlist = self._extract_playlist(playlist_id)
    if not has_videos and video_id:
        # Some playlist URLs don't actually serve a playlist (see
        # https://github.com/ytdl-org/youtube-dl/issues/10537).
        # Fallback to plain video extraction if there is a video id
        # along with playlist id.
        return self.url_result(video_id, 'Youtube', video_id=video_id)

    return playlist
448830ce 2901
c5e8d7af 2902
648e6a1f 2903class YoutubeChannelIE(YoutubePlaylistBaseInfoExtractor):
78caa52a 2904 IE_DESC = 'YouTube.com channels'
66b48727 2905 _VALID_URL = r'https?://(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie|kids)?\.com|(?:www\.)?invidio\.us)/channel/(?P<id>[0-9A-Za-z_-]+)'
eb0f3e7e 2906 _TEMPLATE_URL = 'https://www.youtube.com/channel/%s/videos'
648e6a1f 2907 _VIDEO_RE = r'(?:title="(?P<title>[^"]+)"[^>]+)?href="/watch\?v=(?P<id>[0-9A-Za-z_-]+)&?'
78caa52a 2908 IE_NAME = 'youtube:channel'
cdc628a4
PH
2909 _TESTS = [{
2910 'note': 'paginated channel',
2911 'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w',
2912 'playlist_mincount': 91,
acf757f4 2913 'info_dict': {
9170ca5b
JMF
2914 'id': 'UUKfVa3S1e4PHvxWcwyMMg8w',
2915 'title': 'Uploads from lex will',
13a75688
S
2916 'uploader': 'lex will',
2917 'uploader_id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
acf757f4 2918 }
5c43afd4
JMF
2919 }, {
2920 'note': 'Age restricted channel',
2921 # from https://www.youtube.com/user/DeusExOfficial
2922 'url': 'https://www.youtube.com/channel/UCs0ifCMCm1icqRbqhUINa0w',
2923 'playlist_mincount': 64,
2924 'info_dict': {
2925 'id': 'UUs0ifCMCm1icqRbqhUINa0w',
2926 'title': 'Uploads from Deus Ex',
13a75688
S
2927 'uploader': 'Deus Ex',
2928 'uploader_id': 'DeusExOfficial',
5c43afd4 2929 },
cd5a74a2
S
2930 }, {
2931 'url': 'https://invidio.us/channel/UC23qupoDRn9YOAVzeoxjOQA',
2932 'only_matching': True,
66b48727
RA
2933 }, {
2934 'url': 'https://www.youtubekids.com/channel/UCyu8StPfZWapR6rfW_JgqcA',
2935 'only_matching': True,
cdc628a4 2936 }]
c5e8d7af 2937
e462474e
S
@classmethod
def suitable(cls, url):
    """Return False for URLs that YoutubePlaylistsIE or YoutubeLiveIE
    accept; otherwise defer to the inherited ``suitable`` check."""
    if YoutubePlaylistsIE.suitable(url) or YoutubeLiveIE.suitable(url):
        return False
    return super(YoutubeChannelIE, cls).suitable(url)
e462474e 2942
9558dcec
S
2943 def _build_template_url(self, url, channel_id):
2944 return self._TEMPLATE_URL % channel_id
2945
def _real_extract(self, url):
    """Extract the videos of a channel.

    Strategy:
      1. Download the channel page with ``?view=57`` (non-fatal) and try
         to read a channel id from its ``channelId`` meta tag or, failing
         that, from a ``vnd.youtube://user/<id>`` app URL.  If the id
         starts with ``UC`` the work is handed to the playlist extractor
         using the id with its prefix replaced by ``UU``.
      2. Otherwise download the plain channel page.  Auto-generated
         channels are scraped from that single page; any other channel
         is paged through ``self._entries``.

    Raises ExtractorError (expected) when the channel yields no entries
    and the page carries a ``yt-alert-message``.
    """
    channel_id = self._match_id(url)
    url = self._build_template_url(url, channel_id)

    # Channel by page listing is restricted to 35 pages of 30 items, i.e. 1050 videos total (see #5778)
    # Workaround by extracting as a playlist if managed to obtain channel playlist URL
    # otherwise fallback on channel by page extraction
    listing_page = self._download_webpage(
        url + '?view=57', channel_id,
        'Downloading channel page', fatal=False)

    channel_playlist_id = False
    if listing_page is not False:
        channel_playlist_id = self._html_search_meta(
            'channelId', listing_page, 'channel id', default=None)
        if not channel_playlist_id:
            app_url = self._html_search_meta(
                ('al:ios:url', 'twitter:app:url:iphone', 'twitter:app:url:ipad'),
                listing_page, 'channel url', default=None)
            if app_url:
                channel_playlist_id = self._search_regex(
                    r'vnd\.youtube://user/([0-9A-Za-z_-]+)',
                    app_url, 'channel id', default=None)

    if channel_playlist_id and channel_playlist_id.startswith('UC'):
        uploads_id = 'UU' + channel_playlist_id[2:]
        return self.url_result(
            compat_urlparse.urljoin(url, '/playlist?list=%s' % uploads_id), 'YoutubePlaylist')

    channel_page = self._download_webpage(url, channel_id, 'Downloading page #1')

    autogenerated_label = re.search(r'''(?x)
            class="[^"]*?(?:
                channel-header-autogenerated-label|
                yt-channel-title-autogenerated
            )[^"]*"''', channel_page)
    if autogenerated_label:
        # The videos are contained in a single page
        # the ajax pages can't be used, they are empty
        entries = []
        for video_id, video_title in self.extract_videos_from_page(channel_page):
            entries.append(self.url_result(
                video_id, 'Youtube', video_id=video_id,
                video_title=video_title))
        return self.playlist_result(entries, channel_id)

    # Probe for a first entry; an empty channel may be explained by an
    # alert message on the page.
    try:
        next(self._entries(channel_page, channel_id))
    except StopIteration:
        alert_message = self._html_search_regex(
            r'(?s)<div[^>]+class=(["\']).*?\byt-alert-message\b.*?\1[^>]*>(?P<alert>[^<]+)</div>',
            channel_page, 'alert', default=None, group='alert')
        if alert_message:
            raise ExtractorError('Youtube said: %s' % alert_message, expected=True)

    return self.playlist_result(self._entries(channel_page, channel_id), channel_id)
c5e8d7af
PH
3002
3003
class YoutubeUserIE(YoutubeChannelIE):
    # Matches /user/<name>, /c/<name>, bare /<name> URLs and the "ytuser:" keyword.
    IE_NAME = 'youtube:user'
    IE_DESC = 'YouTube.com user videos (URL or "ytuser" keyword)'
    _VALID_URL = r'(?:(?:https?://(?:\w+\.)?youtube\.com/(?:(?P<user>user|c)/)?(?!(?:attribution_link|watch|results|shared)(?:$|[^a-z_A-Z0-9%-])))|ytuser:)(?!feed/)(?P<id>[A-Za-z0-9_%-]+)'
    # Filled with (<'user' or 'c'>, <id>) by _build_template_url.
    _TEMPLATE_URL = 'https://www.youtube.com/%s/%s/videos'

    _TESTS = [{
        'url': 'https://www.youtube.com/user/TheLinuxFoundation',
        'playlist_mincount': 320,
        'info_dict': {
            'id': 'UUfX55Sx5hEFjoC3cNs6mCUQ',
            'title': 'Uploads from The Linux Foundation',
            'uploader': 'The Linux Foundation',
            'uploader_id': 'TheLinuxFoundation',
        }
    }, {
        # Only available via https://www.youtube.com/c/12minuteathlete/videos
        # but not https://www.youtube.com/user/12minuteathlete/videos
        'url': 'https://www.youtube.com/c/12minuteathlete/videos',
        'playlist_mincount': 249,
        'info_dict': {
            'id': 'UUVjM-zV6_opMDx7WYxnjZiQ',
            'title': 'Uploads from 12 Minute Athlete',
            'uploader': '12 Minute Athlete',
            'uploader_id': 'the12minuteathlete',
        }
    }, {
        'url': 'ytuser:phihag',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/c/gametrailers',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/c/Pawe%C5%82Zadro%C5%BCniak',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/gametrailers',
        'only_matching': True,
    }, {
        # This channel is not available, geo restricted to JP
        'url': 'https://www.youtube.com/user/kananishinoSMEJ/videos',
        'only_matching': True,
    }]

    @classmethod
    def suitable(cls, url):
        """Return True only if no other Youtube*IE in this module accepts url."""
        # _VALID_URL is too permissive and would also match URLs that belong
        # to the other YouTube extractors, so give each of them a chance first.
        for name, klass in globals().items():
            if not (name.startswith('Youtube') and name.endswith('IE')):
                continue
            if klass is cls:
                continue
            if klass.suitable(url):
                return False
        return super(YoutubeUserIE, cls).suitable(url)

    def _build_template_url(self, url, channel_id):
        """Build the "/videos" listing URL for url (channel_id is not used here)."""
        match = re.match(self._VALID_URL, url)
        # The 'user' group is empty for bare /<name> URLs and for "ytuser:",
        # in which case the path prefix falls back to 'user'.
        prefix = match.group('user') or 'user'
        return self._TEMPLATE_URL % (prefix, match.group('id'))
3061
b05654f0 3062
f07e276a
S
class YoutubeLiveIE(YoutubeBaseInfoExtractor):
    # Resolves a channel's ".../live" URL to the video currently shown there.
    IE_NAME = 'youtube:live'
    IE_DESC = 'YouTube.com live streams'
    _VALID_URL = r'(?P<base_url>https?://(?:\w+\.)?youtube\.com/(?:(?:user|channel|c)/)?(?P<id>[^/]+))/live'

    _TESTS = [{
        'url': 'https://www.youtube.com/user/TheYoungTurks/live',
        'info_dict': {
            'id': 'a48o2S1cPoo',
            'ext': 'mp4',
            'title': 'The Young Turks - Live Main Show',
            'uploader': 'The Young Turks',
            'uploader_id': 'TheYoungTurks',
            'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/TheYoungTurks',
            'upload_date': '20150715',
            'license': 'Standard YouTube License',
            'description': 'md5:438179573adcdff3c97ebb1ee632b891',
            'categories': ['News & Politics'],
            'tags': ['Cenk Uygur (TV Program Creator)', 'The Young Turks (Award-Winning Work)', 'Talk Show (TV Genre)'],
            'like_count': int,
            'dislike_count': int,
        },
        'params': {
            'skip_download': True,
        },
    }, {
        'url': 'https://www.youtube.com/channel/UC1yBKRuGpC1tSM73A0ZjYjQ/live',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/c/CommanderVideoHq/live',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/TheYoungTurks/live',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Return a result for the live video, or for the channel URL itself.

        The channel (base) URL is returned when the page cannot be downloaded
        or does not look like a video page carrying a valid 11-character id.
        """
        match = re.match(self._VALID_URL, url)
        channel_id = match.group('id')
        base_url = match.group('base_url')

        # Non-fatal download: a failure simply falls back to the channel URL.
        webpage = self._download_webpage(url, channel_id, fatal=False)
        if not webpage:
            return self.url_result(base_url)

        page_type = self._og_search_property(
            'type', webpage, 'page type', default='')
        video_id = self._html_search_meta(
            'videoId', webpage, 'video id', default=None)

        looks_like_video = page_type.startswith('video')
        if looks_like_video and video_id and re.match(
                r'^[0-9A-Za-z_-]{11}$', video_id):
            return self.url_result(video_id, YoutubeIE.ie_key())
        return self.url_result(base_url)
3113
3114
e462474e
S
class YoutubePlaylistsIE(YoutubePlaylistsBaseInfoExtractor):
    # Declarative extractor: only the URL pattern and tests are defined here;
    # the extraction logic comes from the base class.
    IE_NAME = 'youtube:playlists'
    IE_DESC = 'YouTube.com user/channel playlists'
    _VALID_URL = r'https?://(?:\w+\.)?youtube\.com/(?:user|channel|c)/(?P<id>[^/]+)/playlists'

    _TESTS = [{
        'url': 'https://www.youtube.com/user/ThirstForScience/playlists',
        'playlist_mincount': 4,
        'info_dict': {
            'id': 'ThirstForScience',
            'title': 'ThirstForScience',
        },
    }, {
        # with "Load more" button
        'url': 'https://www.youtube.com/user/igorkle1/playlists?view=1&sort=dd',
        'playlist_mincount': 70,
        'info_dict': {
            'id': 'igorkle1',
            'title': 'Игорь Клейнер',
        },
    }, {
        'url': 'https://www.youtube.com/channel/UCiU1dHvZObB2iP6xkJ__Icw/playlists',
        'playlist_mincount': 17,
        'info_dict': {
            'id': 'UCiU1dHvZObB2iP6xkJ__Icw',
            'title': 'Chem Player',
        },
        'skip': 'Blocked',
    }, {
        'url': 'https://www.youtube.com/c/ChristophLaimer/playlists',
        'only_matching': True,
    }]
0c148415
S
3147
3148
870f3bfc
S
class YoutubeSearchBaseInfoExtractor(YoutubePlaylistBaseInfoExtractor):
    # Matches a /watch?v=<11-char id> link and, when present on the same tag,
    # its title="..." attribute.
    _VIDEO_RE = (
        r'href="\s*/watch\?v=(?P<id>[0-9A-Za-z_-]{11})'
        r'(?:[^"]*"[^>]+\btitle="(?P<title>[^"]+))?')
3151
3152
class YoutubeSearchIE(SearchInfoExtractor, YoutubeSearchBaseInfoExtractor):
    IE_DESC = 'YouTube.com searches'
    # there doesn't appear to be a real limit, for example if you search for
    # 'python' you get more than 8.000.000 results
    _MAX_RESULTS = float('inf')
    IE_NAME = 'youtube:search'
    _SEARCH_KEY = 'ytsearch'
    # Optional value sent as the "params" field of the search request;
    # subclasses override it.
    _SEARCH_PARAMS = None
    _TESTS = []

    def _entries(self, query, n):
        """Yield up to n url_transparent entries for the search query.

        Result pages are requested one at a time from the JSON search
        endpoint. Iteration stops when n entries have been produced, when a
        page cannot be downloaded or has no usable contents, or when no
        continuation token is found for the next page.
        """
        data = {
            'context': {
                'client': {
                    'clientName': 'WEB',
                    'clientVersion': '2.20201021.03.00',
                }
            },
            'query': query,
        }
        if self._SEARCH_PARAMS:
            data['params'] = self._SEARCH_PARAMS
        total = 0
        for page_num in itertools.count(1):
            search = self._download_json(
                'https://www.youtube.com/youtubei/v1/search?key=AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8',
                video_id='query "%s"' % query,
                note='Downloading page %s' % page_num,
                errnote='Unable to download API page', fatal=False,
                data=json.dumps(data).encode('utf8'),
                headers={'content-type': 'application/json'})
            if not search:
                break
            # The first page and the continuation pages keep the section list
            # under different keys; try both.
            slr_contents = try_get(
                search,
                (lambda x: x['contents']['twoColumnSearchResultsRenderer']['primaryContents']['sectionListRenderer']['contents'],
                 lambda x: x['onResponseReceivedCommands'][0]['appendContinuationItemsAction']['continuationItems']),
                list)
            if not slr_contents:
                break
            isr_contents = try_get(
                slr_contents,
                lambda x: x[0]['itemSectionRenderer']['contents'],
                list)
            if not isr_contents:
                break
            for content in isr_contents:
                if not isinstance(content, dict):
                    continue
                video = content.get('videoRenderer')
                if not isinstance(video, dict):
                    continue
                video_id = video.get('videoId')
                if not video_id:
                    continue
                title = try_get(video, lambda x: x['title']['runs'][0]['text'], compat_str)
                description = try_get(video, lambda x: x['descriptionSnippet']['runs'][0]['text'], compat_str)
                duration = parse_duration(try_get(video, lambda x: x['lengthText']['simpleText'], compat_str))
                view_count_text = try_get(video, lambda x: x['viewCountText']['simpleText'], compat_str) or ''
                # Drop whitespace AND the ',' / '.' thousands separators before
                # reading the leading digits; stripping whitespace alone turns
                # "1,234,567 views" into a count of 1.
                view_count = int_or_none(self._search_regex(
                    r'^(\d+)', re.sub(r'[\s,.]', '', view_count_text),
                    'view count', default=None))
                uploader = try_get(video, lambda x: x['ownerText']['runs'][0]['text'], compat_str)
                total += 1
                yield {
                    '_type': 'url_transparent',
                    'ie_key': YoutubeIE.ie_key(),
                    'id': video_id,
                    'url': video_id,
                    'title': title,
                    'description': description,
                    'duration': duration,
                    'view_count': view_count,
                    'uploader': uploader,
                }
                if total == n:
                    return
            token = try_get(
                slr_contents,
                lambda x: x[1]['continuationItemRenderer']['continuationEndpoint']['continuationCommand']['token'],
                compat_str)
            if not token:
                break
            # The next request reuses the same payload plus the token.
            data['continuation'] = token

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""
        return self.playlist_result(self._entries(query, n), query)
75dff0ee 3241
c9ae7b95 3242
class YoutubeSearchDateIE(YoutubeSearchIE):
    # Same search as YoutubeSearchIE, with a "params" value attached to the
    # request; per IE_DESC it orders results newest first.
    IE_NAME = YoutubeSearchIE.IE_NAME + ':date'
    IE_DESC = 'YouTube.com searches, newest videos first'
    _SEARCH_KEY = 'ytsearchdate'
    _SEARCH_PARAMS = 'CAI%3D'
75dff0ee 3248
c9ae7b95 3249
class YoutubeSearchURLIE(YoutubeSearchBaseInfoExtractor):
    IE_DESC = 'YouTube.com search URLs'
    IE_NAME = 'youtube:search_url'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/results\?(.*?&)?(?:search_query|q)=(?P<query>[^&]+)(?:[&]|$)'
    # Captures the JSON object assigned to ytInitialData in the results page.
    _SEARCH_DATA = r'(?:window\["ytInitialData"\]|ytInitialData)\W?=\W?({.*?});'
    _TESTS = [{
        'url': 'https://www.youtube.com/results?baz=bar&search_query=youtube-dl+test+video&filters=video&lclk=video',
        'playlist_mincount': 5,
        'info_dict': {
            'title': 'youtube-dl test video',
        }
    }, {
        'url': 'https://www.youtube.com/results?q=test&sp=EgQIBBgB',
        'only_matching': True,
    }]

    def _find_videos_in_json(self, extracted):
        """Return every dict with a "videoId" key found in the parsed JSON.

        Lists and dicts are walked depth-first in document order. A dict that
        carries "videoId" is collected as a whole and not descended into.
        """
        videos = []

        def _real_find(obj):
            # Only containers can hold results; scalars (strings, numbers,
            # None) are ignored.
            if isinstance(obj, list):
                for elem in obj:
                    _real_find(elem)
            elif isinstance(obj, dict):
                if 'videoId' in obj:
                    videos.append(obj)
                    return
                for value in obj.values():
                    _real_find(value)

        _real_find(extracted)

        return videos

    def extract_videos_from_page_impl(self, page, ids_in_page, titles_in_page):
        """Append the (id, title) pairs found in page to the two given lists.

        ids_in_page and titles_in_page are parallel lists mutated in place.
        An id that is already present is not added again; its title is only
        filled in when the stored one is empty.
        """
        search_response = self._parse_json(self._search_regex(self._SEARCH_DATA, page, 'ytInitialData'), None)

        for renderer in self._find_videos_in_json(search_response):
            # Requiring strings keeps malformed entries from breaking the
            # .strip() call below.
            video_id = try_get(renderer, lambda x: x['videoId'], compat_str)
            video_title = (
                try_get(renderer, lambda x: x['title']['runs'][0]['text'], compat_str)
                or try_get(renderer, lambda x: x['title']['simpleText'], compat_str))

            if video_id is None or video_title is None:
                # we do not have a videoRenderer or title extraction broke
                continue

            video_title = video_title.strip()

            try:
                idx = ids_in_page.index(video_id)
            except ValueError:
                ids_in_page.append(video_id)
                titles_in_page.append(video_title)
            else:
                if video_title and not titles_in_page[idx]:
                    titles_in_page[idx] = video_title

    def extract_videos_from_page(self, page):
        """Return the (video id, title) pairs found in page, in order."""
        ids_in_page = []
        titles_in_page = []
        self.extract_videos_from_page_impl(page, ids_in_page, titles_in_page)
        return zip(ids_in_page, titles_in_page)

    def _real_extract(self, url):
        """Download the search results page and return it as a playlist titled with the query."""
        mobj = re.match(self._VALID_URL, url)
        query = compat_urllib_parse_unquote_plus(mobj.group('query'))
        webpage = self._download_webpage(url, query)
        # _process_page is inherited; it is defined outside this block.
        return self.playlist_result(self._process_page(webpage), playlist_title=query)
c9ae7b95
PH
3323
3324
class YoutubeShowIE(YoutubePlaylistsBaseInfoExtractor):
    # Redirects a show URL to the show's "/playlists" page and lets the base
    # class extract it.
    IE_NAME = 'youtube:show'
    IE_DESC = 'YouTube.com (multi-season) shows'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/show/(?P<id>[^?#]*)'
    _TESTS = [{
        'url': 'https://www.youtube.com/show/airdisasters',
        'playlist_mincount': 5,
        'info_dict': {
            'id': 'airdisasters',
            'title': 'Air Disasters',
        }
    }]

    def _real_extract(self, url):
        """Delegate to the base class using the show's playlists URL."""
        show_id = self._match_id(url)
        playlists_url = 'https://www.youtube.com/show/%s/playlists' % show_id
        return super(YoutubeShowIE, self)._real_extract(playlists_url)
04cc9617
JMF
3342
3343
class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor):
    """
    Base class for feed extractors
    Subclasses must define the _FEED_NAME and _PLAYLIST_TITLE properties.
    """
    _LOGIN_REQUIRED = True
    # Captures the JSON object assigned to ytInitialData in the feed page.
    _FEED_DATA = r'(?:window\["ytInitialData"\]|ytInitialData)\W?=\W?({.*?});'
    # Captures the JSON object passed to ytcfg.set(...).
    _YTCFG_DATA = r"ytcfg.set\(({.*?})\)"

    # ytcfg keys forwarded as request headers when fetching continuation pages.
    _CONTINUATION_HEADERS = (
        ('X-YouTube-Client-Name', 'INNERTUBE_CONTEXT_CLIENT_NAME'),
        ('X-YouTube-Client-Version', 'INNERTUBE_CONTEXT_CLIENT_VERSION'),
        ('X-Youtube-Identity-Token', 'ID_TOKEN'),
        ('X-YouTube-Device', 'DEVICE'),
        ('X-YouTube-Page-CL', 'PAGE_CL'),
        ('X-YouTube-Page-Label', 'PAGE_BUILD_LABEL'),
        ('X-YouTube-Variants-Checksum', 'VARIANTS_CHECKSUM'),
    )

    @property
    def IE_NAME(self):
        return 'youtube:%s' % self._FEED_NAME

    def _real_initialize(self):
        self._login()

    def _find_videos_in_json(self, extracted):
        """Return (videos, continuation) found in the parsed JSON.

        videos lists every dict with a "videoId" key, in document order.
        continuation is the value of the last "nextContinuationData" key
        encountered, or None if there is none. Dicts carrying either key are
        not descended into.
        """
        videos = []
        found = {}

        def _real_find(obj):
            # Only containers can hold results; scalars are ignored.
            if isinstance(obj, list):
                for elem in obj:
                    _real_find(elem)
            elif isinstance(obj, dict):
                if 'videoId' in obj:
                    videos.append(obj)
                    return

                if 'nextContinuationData' in obj:
                    found['continuation'] = obj['nextContinuationData']
                    return

                for value in obj.values():
                    _real_find(value)

        _real_find(extracted)

        return videos, found.get('continuation')

    def _entries(self, page):
        """Yield a url_result for each distinct video of the feed.

        Starts with the data embedded in page and then follows continuation
        pages. Stops when a page contributes no new video, when there is no
        continuation token, or when the ytcfg data needed for the follow-up
        request is missing.
        """
        # Ids already yielded; a set keeps the duplicate check O(1) per video.
        seen_ids = set()

        yt_conf = self._parse_json(self._search_regex(self._YTCFG_DATA, page, 'ytcfg.set', default="null"), None, fatal=False)

        search_response = self._parse_json(self._search_regex(self._FEED_DATA, page, 'ytInitialData'), None)

        for page_num in itertools.count(1):
            video_info, continuation = self._find_videos_in_json(search_response)

            new_videos = []
            for video in video_info:
                v_id = try_get(video, lambda x: x['videoId'], compat_str)
                if not v_id or v_id in seen_ids:
                    continue
                seen_ids.add(v_id)
                new_videos.append((v_id, video))

            if not new_videos:
                break

            for v_id, video in new_videos:
                title = (
                    try_get(video, lambda x: x['title']['runs'][0]['text'])
                    or try_get(video, lambda x: x['title']['simpleText']))
                yield self.url_result(v_id, YoutubeIE.ie_key(), video_title=title)

            if not continuation or not yt_conf:
                break

            token = try_get(continuation, lambda x: x['continuation'])
            if not token:
                # Without a token there is nothing meaningful to request.
                break

            query = {
                'ctoken': token,
                'continuation': token,
            }
            itct = try_get(continuation, lambda x: x['clickTrackingParams'])
            if itct is not None:
                query['itct'] = itct

            # Only send headers whose value is present in ytcfg; a missing
            # key must not end up as a None header value.
            headers = {}
            for header_name, conf_key in self._CONTINUATION_HEADERS:
                value = try_get(yt_conf, lambda x, k=conf_key: x[k])
                if value is not None:
                    headers[header_name] = value

            search_response = self._download_json(
                'https://www.youtube.com/browse_ajax', self._PLAYLIST_TITLE,
                'Downloading page #%s' % page_num,
                transform_source=uppercase_escape,
                query=query,
                headers=headers)

    def _real_extract(self, url):
        """Download the feed page and return its videos as a playlist."""
        page = self._download_webpage(
            'https://www.youtube.com/feed/%s' % self._FEED_NAME,
            self._PLAYLIST_TITLE)
        return self.playlist_result(
            self._entries(page), playlist_title=self._PLAYLIST_TITLE)
25f14e9f
S
3450
3451
class YoutubeWatchLaterIE(YoutubePlaylistIE):
    # Handles the "WL" playlist id, reachable through several URL shapes and
    # the ":ytwatchlater" shortcut.
    IE_DESC = 'Youtube watch later list, ":ytwatchlater" for short (requires authentication)'
    IE_NAME = 'youtube:watchlater'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/(?:feed/watch_later|(?:playlist|watch)\?(?:.+&)?list=WL)|:ytwatchlater'

    _TESTS = [{
        'url': 'https://www.youtube.com/playlist?list=WL',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/watch?v=bCNU9TrbiRk&index=1&list=WL',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Return the single-video result if one is produced, else the WL playlist."""
        # Both helpers return a pair; only the second element is used here.
        _, single_video = self._check_download_just_video(url, 'WL')
        if not single_video:
            _, watch_later = self._extract_playlist('WL')
            return watch_later
        return single_video
f459d170 3471
5f6a1245 3472
class YoutubeFavouritesIE(YoutubeBaseInfoExtractor):
    # Looks up the favourites playlist id on the my_favorites page and hands
    # it to the playlist extractor.
    _LOGIN_REQUIRED = True
    IE_NAME = 'youtube:favorites'
    IE_DESC = 'YouTube.com favourite videos, ":ytfav" for short (requires authentication)'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/my_favorites|:ytfav(?:ou?rites)?'

    def _real_extract(self, url):
        """Return a url_result for the playlist id found on the favourites page."""
        favourites_page = self._download_webpage(
            'https://www.youtube.com/my_favorites', 'Youtube Favourites videos')
        list_id = self._search_regex(
            r'list=(.+?)["&]', favourites_page, 'favourites playlist id')
        return self.url_result(list_id, 'YoutubePlaylist')
15870e90
PH
3483
3484
25f14e9f
S
class YoutubeRecommendedIE(YoutubeFeedsInfoExtractor):
    # Feed extractor for /feed/recommended; all logic lives in the base class.
    _FEED_NAME = 'recommended'
    _PLAYLIST_TITLE = 'Youtube Recommended videos'
    IE_DESC = 'YouTube.com recommended videos, ":ytrec" for short (requires authentication)'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/feed/recommended|:ytrec(?:ommended)?'
1ed5b5c9 3490
1ed5b5c9 3491
25f14e9f
S
class YoutubeSubscriptionsIE(YoutubeFeedsInfoExtractor):
    # Feed extractor for /feed/subscriptions; all logic lives in the base class.
    _FEED_NAME = 'subscriptions'
    _PLAYLIST_TITLE = 'Youtube Subscriptions'
    IE_DESC = 'YouTube.com subscriptions feed, "ytsubs" keyword (requires authentication)'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/feed/subscriptions|:ytsubs(?:criptions)?'
1ed5b5c9 3497
1ed5b5c9 3498
25f14e9f
S
class YoutubeHistoryIE(YoutubeFeedsInfoExtractor):
    # Feed extractor for /feed/history; all logic lives in the base class.
    _FEED_NAME = 'history'
    _PLAYLIST_TITLE = 'Youtube History'
    IE_DESC = 'Youtube watch history, ":ythistory" for short (requires authentication)'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/feed/history|:ythistory'
1ed5b5c9
JMF
3504
3505
15870e90
PH
class YoutubeTruncatedURLIE(InfoExtractor):
    # Catches watch/attribution URLs that carry no video id so that a helpful
    # error can be raised instead of a generic failure.
    IE_NAME = 'youtube:truncated_url'
    IE_DESC = False  # Do not list
    _VALID_URL = r'''(?x)
        (?:https?://)?
        (?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie)?\.com/
        (?:watch\?(?:
            feature=[a-z_]+|
            annotation_id=annotation_[^&]+|
            x-yt-cl=[0-9]+|
            hl=[^&]*|
            t=[0-9]+
        )?
        |
            attribution_link\?a=[^&]+
        )
        $
    '''

    _TESTS = [{
        'url': 'https://www.youtube.com/watch?annotation_id=annotation_3951667041',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/watch?',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/watch?x-yt-cl=84503534',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/watch?feature=foo',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/watch?hl=en-GB',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/watch?t=2372',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Always raise an expected ExtractorError explaining how to quote the URL."""
        hint = (
            'Did you forget to quote the URL? Remember that & is a meta '
            'character in most shells, so you want to put the URL in quotes, '
            'like youtube-dl '
            '"https://www.youtube.com/watch?feature=foo&v=BaW_jenozKc" '
            ' or simply youtube-dl BaW_jenozKc .')
        raise ExtractorError(hint, expected=True)
772fd5cc
PH
3553
3554
class YoutubeTruncatedIDIE(InfoExtractor):
    # Catches watch URLs whose v= value is 1-10 characters long, i.e. shorter
    # than the 11 characters the other patterns in this file expect.
    IE_NAME = 'youtube:truncated_id'
    IE_DESC = False  # Do not list
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/watch\?v=(?P<id>[0-9A-Za-z_-]{1,10})$'

    _TESTS = [{
        'url': 'https://www.youtube.com/watch?v=N_708QY7Ob',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Always raise an expected ExtractorError naming the incomplete id."""
        truncated_id = self._match_id(url)
        message = 'Incomplete YouTube ID %s. URL %s looks truncated.' % (truncated_id, url)
        raise ExtractorError(message, expected=True)