]> jfr.im git - yt-dlp.git/blame - youtube_dlc/extractor/youtube.py
Merge branch 'master' into youtube-mix-fix
[yt-dlp.git] / youtube_dlc / extractor / youtube.py
CommitLineData
c5e8d7af 1# coding: utf-8
c5e8d7af 2
78caa52a
PH
3from __future__ import unicode_literals
4
5
0ca96d48 6import itertools
c5e8d7af 7import json
c4417ddb 8import os.path
d77ab8e2 9import random
c5e8d7af 10import re
42939b61 11import time
e0df6211 12import traceback
c5e8d7af 13
b05654f0 14from .common import InfoExtractor, SearchInfoExtractor
2b25cb5d 15from ..jsinterp import JSInterpreter
54256267 16from ..swfinterp import SWFInterpreter
4bb4a188 17from ..compat import (
edf3e38e 18 compat_chr,
f8c55c66 19 compat_HTTPError,
8d81f3e3 20 compat_kwargs,
c5e8d7af 21 compat_parse_qs,
7fd002c0
S
22 compat_urllib_parse_unquote,
23 compat_urllib_parse_unquote_plus,
15707c7e 24 compat_urllib_parse_urlencode,
7c80519c 25 compat_urllib_parse_urlparse,
7c61bd36 26 compat_urlparse,
c5e8d7af 27 compat_str,
4bb4a188
PH
28)
29from ..utils import (
27019dbb 30 bool_or_none,
c5e8d7af 31 clean_html,
9b9c5355 32 error_to_compat_str,
351f37c0 33 extract_attributes,
c5e8d7af 34 ExtractorError,
2d30521a 35 float_or_none,
4bb4a188
PH
36 get_element_by_attribute,
37 get_element_by_id,
dd27fd17 38 int_or_none,
94278f72 39 mimetype2ext,
4bb4a188 40 orderedSet,
6310acf5 41 parse_codecs,
b84071c0 42 parse_count,
7c80519c 43 parse_duration,
0cb58b02 44 remove_quotes,
3995d37d 45 remove_start,
cf7e015f 46 smuggle_url,
dbdaaa23 47 str_or_none,
c93d53f5 48 str_to_int,
556dbe7f 49 try_get,
c5e8d7af
PH
50 unescapeHTML,
51 unified_strdate,
cf7e015f 52 unsmuggle_url,
81c2f20b 53 uppercase_escape,
21c340b8 54 url_or_none,
6e6bc8da 55 urlencode_postdata,
c5e8d7af
PH
56)
57
5f6a1245 58
class YoutubeBaseInfoExtractor(InfoExtractor):
    """Provide base functions for Youtube extractors"""
    _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
    _TWOFACTOR_URL = 'https://accounts.google.com/signin/challenge'

    # Endpoints used by the multi-step sign-in flow implemented in _login()
    _LOOKUP_URL = 'https://accounts.google.com/_/signin/sl/lookup'
    _CHALLENGE_URL = 'https://accounts.google.com/_/signin/sl/challenge'
    # {0} is filled with the "TL" token taken from the challenge response
    _TFA_URL = 'https://accounts.google.com/_/signin/challenge?hl=en&TL={0}'

    _NETRC_MACHINE = 'youtube'
    # If True it will raise an error if no login info is provided
    _LOGIN_REQUIRED = False

    _PLAYLIST_ID_RE = r'(?:PL|LL|EC|UU|FL|RD|UL|TL|PU|OLAK5uy_)[0-9A-Za-z-_]{10,}'

    # Headers sent along with the "load more" JSON requests
    _YOUTUBE_CLIENT_HEADERS = {
        'x-youtube-client-name': '1',
        'x-youtube-client-version': '1.20200609.04.02',
    }

    def _set_language(self):
        """Set the PREF cookie on .youtube.com so that pages are requested with hl=en."""
        self._set_cookie(
            '.youtube.com', 'PREF', 'f1=50000000&f6=8&hl=en',
            # YouTube sets the expire time to about two months
            expire_time=time.time() + 2 * 30 * 24 * 3600)

    def _ids_to_results(self, ids):
        """Return a list with one 'Youtube' url_result per video id in ids."""
        return [
            self.url_result(vid_id, 'Youtube', video_id=vid_id)
            for vid_id in ids]

    def _login(self):
        """
        Attempt to log in to YouTube.
        True is returned if successful or skipped.
        False is returned if login failed.

        If _LOGIN_REQUIRED is set and no authentication was provided, an error is raised.
        """
        username, password = self._get_login_info()
        # No authentication to be performed
        if username is None:
            if self._LOGIN_REQUIRED and self._downloader.params.get('cookiefile') is None:
                raise ExtractorError('No login info available, needed for using %s.' % self.IE_NAME, expected=True)
            if self._downloader.params.get('cookiefile') and False:  # TODO remove 'and False' later - too many people using outdated cookies and open issues, remind them.
                self.to_screen('[Cookies] Reminder - Make sure to always use up to date cookies!')
            return True

        login_page = self._download_webpage(
            self._LOGIN_URL, None,
            note='Downloading login page',
            errnote='unable to fetch login page', fatal=False)
        if login_page is False:
            # Return an explicit False (not None) as promised by the docstring
            return False

        login_form = self._hidden_inputs(login_page)

        def req(url, f_req, note, errnote):
            """POST f_req (JSON-encoded) together with the login form fields to url.

            Returns the parsed JSON response, or False when the request fails
            (the download is non-fatal).
            """
            data = login_form.copy()
            data.update({
                'pstMsg': 1,
                'checkConnection': 'youtube',
                'checkedDomains': 'youtube',
                'hl': 'en',
                'deviceinfo': '[null,null,null,[],null,"US",null,null,[],"GlifWebSignIn",null,[null,null,[]]]',
                'f.req': json.dumps(f_req),
                'flowName': 'GlifWebSignIn',
                'flowEntry': 'ServiceLogin',
                # TODO: reverse actual botguard identifier generation algo
                'bgRequest': '["identifier",""]',
            })
            return self._download_json(
                url, None, note=note, errnote=errnote,
                # Drop everything preceding the first '[' before JSON parsing
                transform_source=lambda s: re.sub(r'^[^[]*', '', s),
                fatal=False,
                data=urlencode_postdata(data), headers={
                    'Content-Type': 'application/x-www-form-urlencoded;charset=utf-8',
                    'Google-Accounts-XSRF': 1,
                })

        def warn(message):
            self._downloader.report_warning(message)

        # Shared by the lookup and the challenge requests below
        service_login_url = 'https://accounts.google.com/ServiceLogin?passive=true&continue=https%3A%2F%2Fwww.youtube.com%2Fsignin%3Fnext%3D%252F%26action_handle_signin%3Dtrue%26hl%3Den%26app%3Ddesktop%26feature%3Dsign_in_button&hl=en&service=youtube&uilel=3&requestPath=%2FServiceLogin&Page=PasswordSeparationSignIn'

        lookup_req = [
            username,
            None, [], None, 'US', None, None, 2, False, True,
            [
                None, None,
                [2, 1, None, 1,
                 service_login_url,
                 None, [], 4],
                1, [None, None, []], None, None, None, True
            ],
            username,
        ]

        lookup_results = req(
            self._LOOKUP_URL, lookup_req,
            'Looking up account info', 'Unable to look up account info')

        if lookup_results is False:
            return False

        user_hash = try_get(lookup_results, lambda x: x[0][2], compat_str)
        if not user_hash:
            warn('Unable to extract user hash')
            return False

        challenge_req = [
            user_hash,
            None, 1, None, [1, None, None, None, [password, None, True]],
            [
                None, None, [2, 1, None, 1, service_login_url, None, [], 4],
                1, [None, None, []], None, None, None, True
            ]]

        challenge_results = req(
            self._CHALLENGE_URL, challenge_req,
            'Logging in', 'Unable to log in')

        if challenge_results is False:
            # Return an explicit False (not None) as promised by the docstring
            return False

        login_res = try_get(challenge_results, lambda x: x[0][5], list)
        if login_res:
            login_msg = try_get(login_res, lambda x: x[5], compat_str)
            # The conditional is parenthesized so that the 'Unable to login'
            # prefix is kept for every message, not only for the
            # INCORRECT_ANSWER_ENTERED case ('%' binds tighter than if/else)
            warn('Unable to login: %s' % (
                'Invalid password'
                if login_msg == 'INCORRECT_ANSWER_ENTERED' else login_msg))
            return False

        res = try_get(challenge_results, lambda x: x[0][-1], list)
        if not res:
            warn('Unable to extract result entry')
            return False

        login_challenge = try_get(res, lambda x: x[0][0], list)
        if login_challenge:
            challenge_str = try_get(login_challenge, lambda x: x[2], compat_str)
            if challenge_str == 'TWO_STEP_VERIFICATION':
                # SEND_SUCCESS - TFA code has been successfully sent to phone
                # QUOTA_EXCEEDED - reached the limit of TFA codes
                status = try_get(login_challenge, lambda x: x[5], compat_str)
                if status == 'QUOTA_EXCEEDED':
                    warn('Exceeded the limit of TFA codes, try later')
                    return False

                tl = try_get(challenge_results, lambda x: x[1][2], compat_str)
                if not tl:
                    warn('Unable to extract TL')
                    return False

                tfa_code = self._get_tfa_info('2-step verification code')

                if not tfa_code:
                    warn(
                        'Two-factor authentication required. Provide it either interactively or with --twofactor <code>'
                        ' (Note that only TOTP (Google Authenticator App) codes work at this time.)')
                    return False

                tfa_code = remove_start(tfa_code, 'G-')

                tfa_req = [
                    user_hash, None, 2, None,
                    [
                        9, None, None, None, None, None, None, None,
                        [None, tfa_code, True, 2]
                    ]]

                tfa_results = req(
                    self._TFA_URL.format(tl), tfa_req,
                    'Submitting TFA code', 'Unable to submit TFA code')

                if tfa_results is False:
                    return False

                tfa_res = try_get(tfa_results, lambda x: x[0][5], list)
                if tfa_res:
                    tfa_msg = try_get(tfa_res, lambda x: x[5], compat_str)
                    # Parenthesized for the same precedence reason as above
                    warn('Unable to finish TFA: %s' % (
                        'Invalid TFA code'
                        if tfa_msg == 'INCORRECT_ANSWER_ENTERED' else tfa_msg))
                    return False

                check_cookie_url = try_get(
                    tfa_results, lambda x: x[0][-1][2], compat_str)
            else:
                CHALLENGES = {
                    'LOGIN_CHALLENGE': "This device isn't recognized. For your security, Google wants to make sure it's really you.",
                    'USERNAME_RECOVERY': 'Please provide additional information to aid in the recovery process.',
                    'REAUTH': "There is something unusual about your activity. For your security, Google wants to make sure it's really you.",
                }
                challenge = CHALLENGES.get(
                    challenge_str,
                    '%s returned error %s.' % (self.IE_NAME, challenge_str))
                warn('%s\nGo to https://accounts.google.com/, login and solve a challenge.' % challenge)
                return False
        else:
            check_cookie_url = try_get(res, lambda x: x[2], compat_str)

        if not check_cookie_url:
            warn('Unable to extract CheckCookie URL')
            return False

        check_cookie_results = self._download_webpage(
            check_cookie_url, None, 'Checking cookie', fatal=False)

        if check_cookie_results is False:
            return False

        if 'https://myaccount.google.com/' not in check_cookie_results:
            warn('Unable to log in')
            return False

        return True

    def _download_webpage_handle(self, *args, **kwargs):
        """Delegate to the parent implementation with disable_polymer=true added to the query."""
        # Copy so that a dict passed in by the caller is not mutated
        query = kwargs.get('query', {}).copy()
        query['disable_polymer'] = 'true'
        kwargs['query'] = query
        return super(YoutubeBaseInfoExtractor, self)._download_webpage_handle(
            *args, **compat_kwargs(kwargs))

    def _get_yt_initial_data(self, video_id, webpage):
        """Extract and parse the ytInitialData JSON assigned in webpage.

        Returns the parsed object, or None when no ytInitialData assignment
        is found; JSON parsing is non-fatal.
        """
        config = self._search_regex(
            (r'window\["ytInitialData"\]\s*=\s*(.*?)(?<=});',
             r'var\s+ytInitialData\s*=\s*(.*?)(?<=});'),
            webpage, 'ytInitialData', default=None)
        if config:
            return self._parse_json(
                uppercase_escape(config), video_id, fatal=False)

    def _real_initialize(self):
        """Set the language cookie and attempt to log in; does nothing without a downloader."""
        if self._downloader is None:
            return
        self._set_language()
        if not self._login():
            return
c5e8d7af 297
8377574c 298
class YoutubeEntryListBaseInfoExtractor(YoutubeBaseInfoExtractor):
    # Extract entries from page with "Load more" button
    def _entries(self, page, playlist_id):
        """Yield the entries of page and of every follow-up "Load more" page.

        Entries are produced by self._process_page(); follow-up pages are
        fetched as JSON for as long as a load-more link is present and the
        returned content is not blank.
        """
        max_retries = 3
        current_html = page
        widget_html = page
        page_num = 1
        while True:
            for item in self._process_page(current_html):
                yield item

            load_more = re.search(
                r'data-uix-load-more-href="/?(?P<more>[^"]+)"', widget_html)
            if not load_more:
                break

            more_url = 'https://www.youtube.com/%s' % load_more.group('more')
            for attempt in range(max_retries + 1):
                note = 'Downloading page #%s%s' % (
                    page_num, ' (retry #%d)' % attempt if attempt else '')
                try:
                    # Downloading page may result in intermittent 5xx HTTP error
                    # that is usually worked around with a retry
                    more = self._download_json(
                        more_url, playlist_id, note,
                        transform_source=uppercase_escape,
                        headers=self._YOUTUBE_CLIENT_HEADERS)
                except ExtractorError as err:
                    is_server_error = (
                        isinstance(err.cause, compat_HTTPError)
                        and err.cause.code in (500, 503))
                    if is_server_error and attempt < max_retries:
                        continue
                    # Either not a retriable error or retries are exhausted
                    raise
                break

            current_html = more['content_html']
            if not current_html.strip():
                # Some webpages show a "Load more" button but they don't
                # have more videos
                break
            widget_html = more['load_more_widget_html']
            page_num += 1
337
061a75ed
S
338
class YoutubePlaylistBaseInfoExtractor(YoutubeEntryListBaseInfoExtractor):
    def _process_page(self, content):
        """Yield a 'Youtube' url_result for every (id, title) pair found in content."""
        for vid, title in self.extract_videos_from_page(content):
            yield self.url_result(vid, 'Youtube', vid, title)

    def extract_videos_from_page_impl(self, video_re, page, ids_in_page, titles_in_page):
        """Collect video ids and titles matched by video_re in page.

        ids_in_page and titles_in_page are parallel lists updated in place:
        unseen ids are appended together with their title (possibly None),
        while an already known id only gets its missing title filled in.
        """
        for match in re.finditer(video_re, page):
            named = match.groupdict()
            # The link with index 0 is not the first video of the playlist (not sure if still actual)
            if 'index' in named and match.group('id') == '0':
                continue
            vid = match.group('id')

            title = None
            if 'title' in named:
                title = unescapeHTML(match.group('title'))
            if title:
                title = title.strip()
            if title == '► Play all':
                title = None

            if vid in ids_in_page:
                pos = ids_in_page.index(vid)
                if title and not titles_in_page[pos]:
                    titles_in_page[pos] = title
            else:
                ids_in_page.append(vid)
                titles_in_page.append(title)

    def extract_videos_from_page(self, page):
        """Return the zipped (id, title) pairs that self._VIDEO_RE matches in page."""
        found_ids, found_titles = [], []
        self.extract_videos_from_page_impl(
            self._VIDEO_RE, page, found_ids, found_titles)
        return zip(found_ids, found_titles)
370
371
061a75ed
S
class YoutubePlaylistsBaseInfoExtractor(YoutubeEntryListBaseInfoExtractor):
    def _process_page(self, content):
        """Yield a 'YoutubePlaylist' url_result for each distinct playlist linked in content."""
        playlist_ids = re.findall(
            r'<h3[^>]+class="[^"]*yt-lockup-title[^"]*"[^>]*><a[^>]+href="/?playlist\?list=([0-9A-Za-z-_]{10,})"',
            content)
        for list_id in orderedSet(playlist_ids):
            playlist_url = 'https://www.youtube.com/playlist?list=%s' % list_id
            yield self.url_result(playlist_url, 'YoutubePlaylist')

    def _real_extract(self, url):
        """Download the page at url and return its playlists as a playlist result."""
        list_id = self._match_id(url)
        page = self._download_webpage(url, list_id)
        page_title = self._og_search_title(page, fatal=False)
        entries = self._entries(page, list_id)
        return self.playlist_result(entries, list_id, page_title)
0c148415
S
385
386
360e1ca5 387class YoutubeIE(YoutubeBaseInfoExtractor):
78caa52a 388 IE_DESC = 'YouTube.com'
cb7dfeea 389 _VALID_URL = r"""(?x)^
c5e8d7af 390 (
edb53e2d 391 (?:https?://|//) # http(s):// or protocol-independent URL
66b48727 392 (?:(?:(?:(?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie|kids)?\.com/|
484aaeb2 393 (?:www\.)?deturl\.com/www\.youtube\.com/|
e70dc1d1 394 (?:www\.)?pwnyoutube\.com/|
8b561bfc 395 (?:www\.)?hooktube\.com/|
f7000f3a 396 (?:www\.)?yourepeat\.com/|
e69ae5b9 397 tube\.majestyc\.net/|
ba036333 398 # Invidious instances taken from https://github.com/omarroth/invidious/wiki/Invidious-Instances
77d95677 399 (?:(?:www|dev)\.)?invidio\.us/|
ba036333 400 (?:(?:www|no)\.)?invidiou\.sh/|
401 (?:(?:www|fi|de)\.)?invidious\.snopyta\.org/|
8ae113ca 402 (?:www\.)?invidious\.kabi\.tk/|
ba036333 403 (?:www\.)?invidious\.13ad\.de/|
791d2e81 404 (?:www\.)?invidious\.mastodon\.host/|
494d664e 405 (?:www\.)?invidious\.nixnet\.xyz/|
666d808e 406 (?:www\.)?invidious\.drycat\.fr/|
ba036333 407 (?:www\.)?tube\.poal\.co/|
8ae113ca 408 (?:www\.)?vid\.wxzm\.sx/|
384bf91f 409 (?:www\.)?yewtu\.be/|
494d664e 410 (?:www\.)?yt\.elukerio\.org/|
894b3826 411 (?:www\.)?yt\.lelux\.fi/|
1db5ab6b 412 (?:www\.)?invidious\.ggc-project\.de/|
413 (?:www\.)?yt\.maisputain\.ovh/|
414 (?:www\.)?invidious\.13ad\.de/|
415 (?:www\.)?invidious\.toot\.koeln/|
416 (?:www\.)?invidious\.fdn\.fr/|
417 (?:www\.)?watch\.nettohikari\.com/|
bff90fc5 418 (?:www\.)?kgg2m7yk5aybusll\.onion/|
419 (?:www\.)?qklhadlycap4cnod\.onion/|
420 (?:www\.)?axqzx4s6s54s32yentfqojs3x5i7faxza6xo3ehd4bzzsg2ii4fv2iid\.onion/|
421 (?:www\.)?c7hqkpkpemu6e7emz5b4vyz7idjgdvgaaa3dyimmeojqbgpea3xqjoid\.onion/|
422 (?:www\.)?fz253lmuao3strwbfbmx46yu7acac2jz27iwtorgmbqlkurlclmancad\.onion/|
423 (?:www\.)?invidious\.l4qlywnpwqsluw65ts7md3khrivpirse744un3x7mlskqauz5pyuzgqd\.onion/|
33c1c7d8 424 (?:www\.)?owxfohz4kjyv25fvlqilyxast7inivgiktls3th44jhk3ej3i7ya\.b32\.i2p/|
1db5ab6b 425 (?:www\.)?4l2dgddgsrkf2ous66i6seeyi6etzfgrue332grh2n7madpwopotugyd\.onion/|
e69ae5b9 426 youtube\.googleapis\.com/) # the various hostnames, with wildcard subdomains
c5e8d7af
PH
427 (?:.*?\#/)? # handle anchor (#/) redirect urls
428 (?: # the various things that can precede the ID:
ac7553d0 429 (?:(?:v|embed|e)/(?!videoseries)) # v/ or embed/ or e/
c5e8d7af 430 |(?: # or the v= param in all its forms
f7000f3a 431 (?:(?:watch|movie)(?:_popup)?(?:\.php)?/?)? # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
c5e8d7af 432 (?:\?|\#!?) # the params delimiter ? or # or #!
040ac686 433 (?:.*?[&;])?? # any other preceding param (like /?s=tuff&v=xxxx or ?s=tuff&amp;v=V36LpHqtcDY)
c5e8d7af
PH
434 v=
435 )
f4b05232 436 ))
cbaed4bb
S
437 |(?:
438 youtu\.be| # just youtu.be/xxxx
6d4fc66b
S
439 vid\.plus| # or vid.plus/xxxx
440 zwearz\.com/watch| # or zwearz.com/watch/xxxx
cbaed4bb 441 )/
edb53e2d 442 |(?:www\.)?cleanvideosearch\.com/media/action/yt/watch\?videoId=
f4b05232 443 )
c5e8d7af 444 )? # all until now is optional -> you can pass the naked ID
8963d9c2 445 ([0-9A-Za-z_-]{11}) # here is it! the YouTube video ID
d0ba5587
S
446 (?!.*?\blist=
447 (?:
448 %(playlist_id)s| # combined list/video URLs are handled by the playlist IE
449 WL # WL are handled by the watch later IE
450 )
451 )
c5e8d7af 452 (?(1).+)? # if we found the ID, everything can follow
d0ba5587 453 $""" % {'playlist_id': YoutubeBaseInfoExtractor._PLAYLIST_ID_RE}
c5e8d7af 454 _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
e40c758c
S
455 _PLAYER_INFO_RE = (
456 r'/(?P<id>[a-zA-Z0-9_-]{8,})/player_ias\.vflset(?:/[a-zA-Z]{2,3}_[a-zA-Z]{2,3})?/base\.(?P<ext>[a-z]+)$',
457 r'\b(?P<id>vfl[a-zA-Z0-9_-]+)\b.*?\.(?P<ext>[a-z]+)$',
458 )
2c62dc26 459 _formats = {
c2d3cb4c 460 '5': {'ext': 'flv', 'width': 400, 'height': 240, 'acodec': 'mp3', 'abr': 64, 'vcodec': 'h263'},
461 '6': {'ext': 'flv', 'width': 450, 'height': 270, 'acodec': 'mp3', 'abr': 64, 'vcodec': 'h263'},
462 '13': {'ext': '3gp', 'acodec': 'aac', 'vcodec': 'mp4v'},
463 '17': {'ext': '3gp', 'width': 176, 'height': 144, 'acodec': 'aac', 'abr': 24, 'vcodec': 'mp4v'},
464 '18': {'ext': 'mp4', 'width': 640, 'height': 360, 'acodec': 'aac', 'abr': 96, 'vcodec': 'h264'},
465 '22': {'ext': 'mp4', 'width': 1280, 'height': 720, 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264'},
466 '34': {'ext': 'flv', 'width': 640, 'height': 360, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'},
467 '35': {'ext': 'flv', 'width': 854, 'height': 480, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'},
3834d3e3 468 # itag 36 videos are either 320x180 (BaW_jenozKc) or 320x240 (__2ABJjxzNo), abr varies as well
c2d3cb4c 469 '36': {'ext': '3gp', 'width': 320, 'acodec': 'aac', 'vcodec': 'mp4v'},
470 '37': {'ext': 'mp4', 'width': 1920, 'height': 1080, 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264'},
471 '38': {'ext': 'mp4', 'width': 4096, 'height': 3072, 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264'},
472 '43': {'ext': 'webm', 'width': 640, 'height': 360, 'acodec': 'vorbis', 'abr': 128, 'vcodec': 'vp8'},
473 '44': {'ext': 'webm', 'width': 854, 'height': 480, 'acodec': 'vorbis', 'abr': 128, 'vcodec': 'vp8'},
474 '45': {'ext': 'webm', 'width': 1280, 'height': 720, 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8'},
e1a0bfdf 475 '46': {'ext': 'webm', 'width': 1920, 'height': 1080, 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8'},
c2d3cb4c 476 '59': {'ext': 'mp4', 'width': 854, 'height': 480, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'},
477 '78': {'ext': 'mp4', 'width': 854, 'height': 480, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'},
e1a0bfdf 478
479
480 # 3D videos
c2d3cb4c 481 '82': {'ext': 'mp4', 'height': 360, 'format_note': '3D', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -20},
482 '83': {'ext': 'mp4', 'height': 480, 'format_note': '3D', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -20},
483 '84': {'ext': 'mp4', 'height': 720, 'format_note': '3D', 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264', 'preference': -20},
484 '85': {'ext': 'mp4', 'height': 1080, 'format_note': '3D', 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264', 'preference': -20},
e1a0bfdf 485 '100': {'ext': 'webm', 'height': 360, 'format_note': '3D', 'acodec': 'vorbis', 'abr': 128, 'vcodec': 'vp8', 'preference': -20},
486 '101': {'ext': 'webm', 'height': 480, 'format_note': '3D', 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8', 'preference': -20},
487 '102': {'ext': 'webm', 'height': 720, 'format_note': '3D', 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8', 'preference': -20},
836a086c 488
96fb5605 489 # Apple HTTP Live Streaming
11f12195 490 '91': {'ext': 'mp4', 'height': 144, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 48, 'vcodec': 'h264', 'preference': -10},
c2d3cb4c 491 '92': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 48, 'vcodec': 'h264', 'preference': -10},
492 '93': {'ext': 'mp4', 'height': 360, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -10},
493 '94': {'ext': 'mp4', 'height': 480, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -10},
494 '95': {'ext': 'mp4', 'height': 720, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 256, 'vcodec': 'h264', 'preference': -10},
495 '96': {'ext': 'mp4', 'height': 1080, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 256, 'vcodec': 'h264', 'preference': -10},
e1a0bfdf 496 '132': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 48, 'vcodec': 'h264', 'preference': -10},
497 '151': {'ext': 'mp4', 'height': 72, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 24, 'vcodec': 'h264', 'preference': -10},
2c62dc26
PH
498
499 # DASH mp4 video
d23028a8
S
500 '133': {'ext': 'mp4', 'height': 240, 'format_note': 'DASH video', 'vcodec': 'h264'},
501 '134': {'ext': 'mp4', 'height': 360, 'format_note': 'DASH video', 'vcodec': 'h264'},
502 '135': {'ext': 'mp4', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'h264'},
503 '136': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'h264'},
504 '137': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'h264'},
067aa17e 505 '138': {'ext': 'mp4', 'format_note': 'DASH video', 'vcodec': 'h264'}, # Height can vary (https://github.com/ytdl-org/youtube-dl/issues/4559)
d23028a8
S
506 '160': {'ext': 'mp4', 'height': 144, 'format_note': 'DASH video', 'vcodec': 'h264'},
507 '212': {'ext': 'mp4', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'h264'},
508 '264': {'ext': 'mp4', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'h264'},
509 '298': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'h264', 'fps': 60},
510 '299': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'h264', 'fps': 60},
511 '266': {'ext': 'mp4', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'h264'},
836a086c 512
f6f1fc92 513 # Dash mp4 audio
d23028a8
S
514 '139': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 48, 'container': 'm4a_dash'},
515 '140': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 128, 'container': 'm4a_dash'},
516 '141': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 256, 'container': 'm4a_dash'},
517 '256': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'container': 'm4a_dash'},
518 '258': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'container': 'm4a_dash'},
519 '325': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'dtse', 'container': 'm4a_dash'},
520 '328': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'ec-3', 'container': 'm4a_dash'},
836a086c
AZ
521
522 # Dash webm
d23028a8
S
523 '167': {'ext': 'webm', 'height': 360, 'width': 640, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
524 '168': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
525 '169': {'ext': 'webm', 'height': 720, 'width': 1280, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
526 '170': {'ext': 'webm', 'height': 1080, 'width': 1920, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
527 '218': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
528 '219': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
529 '278': {'ext': 'webm', 'height': 144, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp9'},
530 '242': {'ext': 'webm', 'height': 240, 'format_note': 'DASH video', 'vcodec': 'vp9'},
531 '243': {'ext': 'webm', 'height': 360, 'format_note': 'DASH video', 'vcodec': 'vp9'},
532 '244': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'vp9'},
533 '245': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'vp9'},
534 '246': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'vp9'},
535 '247': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'vp9'},
536 '248': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'vp9'},
537 '271': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'vp9'},
4c6b4764 538 # itag 272 videos are either 3840x2160 (e.g. RtoitU2A-3E) or 7680x4320 (sLprVF6d7Ug)
d23028a8
S
539 '272': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'vp9'},
540 '302': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60},
541 '303': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60},
542 '308': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60},
543 '313': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'vp9'},
544 '315': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60},
2c62dc26
PH
545
546 # Dash webm audio
d23028a8
S
547 '171': {'ext': 'webm', 'acodec': 'vorbis', 'format_note': 'DASH audio', 'abr': 128},
548 '172': {'ext': 'webm', 'acodec': 'vorbis', 'format_note': 'DASH audio', 'abr': 256},
ce6b9a2d 549
0857baad 550 # Dash webm audio with opus inside
d23028a8
S
551 '249': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 50},
552 '250': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 70},
553 '251': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 160},
0857baad 554
ce6b9a2d
PH
555 # RTMP (unnamed)
556 '_rtmp': {'protocol': 'rtmp'},
b85eae0f
S
557
558 # av01 video only formats sometimes served with "unknown" codecs
559 '394': {'acodec': 'none', 'vcodec': 'av01.0.05M.08'},
560 '395': {'acodec': 'none', 'vcodec': 'av01.0.05M.08'},
561 '396': {'acodec': 'none', 'vcodec': 'av01.0.05M.08'},
562 '397': {'acodec': 'none', 'vcodec': 'av01.0.05M.08'},
c5e8d7af 563 }
84da5d84 564 _SUBTITLE_FORMATS = ('json3', 'srv1', 'srv2', 'srv3', 'ttml', 'vtt')
836a086c 565
fd5c4aab
S
566 _GEO_BYPASS = False
567
78caa52a 568 IE_NAME = 'youtube'
2eb88d95
PH
569 _TESTS = [
570 {
2d3d2997 571 'url': 'https://www.youtube.com/watch?v=BaW_jenozKc&t=1s&end=9',
4bc3a23e
PH
572 'info_dict': {
573 'id': 'BaW_jenozKc',
574 'ext': 'mp4',
3867038a 575 'title': 'youtube-dl test video "\'/\\ä↭𝕐',
4bc3a23e
PH
576 'uploader': 'Philipp Hagemeister',
577 'uploader_id': 'phihag',
ec85ded8 578 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/phihag',
dd4c4492
S
579 'channel_id': 'UCLqxVugv74EIW3VWh2NOa3Q',
580 'channel_url': r're:https?://(?:www\.)?youtube\.com/channel/UCLqxVugv74EIW3VWh2NOa3Q',
4bc3a23e 581 'upload_date': '20121002',
3867038a 582 'description': 'test chars: "\'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de .',
4bc3a23e 583 'categories': ['Science & Technology'],
3867038a 584 'tags': ['youtube-dl'],
556dbe7f 585 'duration': 10,
dbdaaa23 586 'view_count': int,
3e7c1224
PH
587 'like_count': int,
588 'dislike_count': int,
7c80519c 589 'start_time': 1,
297a564b 590 'end_time': 9,
2eb88d95 591 }
0e853ca4 592 },
fccd3771 593 {
4bc3a23e
PH
594 'url': '//www.YouTube.com/watch?v=yZIXLfi8CZQ',
595 'note': 'Embed-only video (#1746)',
596 'info_dict': {
597 'id': 'yZIXLfi8CZQ',
598 'ext': 'mp4',
599 'upload_date': '20120608',
600 'title': 'Principal Sexually Assaults A Teacher - Episode 117 - 8th June 2012',
601 'description': 'md5:09b78bd971f1e3e289601dfba15ca4f7',
602 'uploader': 'SET India',
94bfcd23 603 'uploader_id': 'setindia',
ec85ded8 604 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/setindia',
94bfcd23 605 'age_limit': 18,
fccd3771
PH
606 }
607 },
11b56058 608 {
2d3d2997 609 'url': 'https://www.youtube.com/watch?v=BaW_jenozKc&v=UxxajLWwzqY',
11b56058
PM
610 'note': 'Use the first video ID in the URL',
611 'info_dict': {
612 'id': 'BaW_jenozKc',
613 'ext': 'mp4',
3867038a 614 'title': 'youtube-dl test video "\'/\\ä↭𝕐',
11b56058
PM
615 'uploader': 'Philipp Hagemeister',
616 'uploader_id': 'phihag',
ec85ded8 617 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/phihag',
11b56058 618 'upload_date': '20121002',
3867038a 619 'description': 'test chars: "\'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de .',
11b56058 620 'categories': ['Science & Technology'],
3867038a 621 'tags': ['youtube-dl'],
556dbe7f 622 'duration': 10,
dbdaaa23 623 'view_count': int,
11b56058
PM
624 'like_count': int,
625 'dislike_count': int,
34a7de29
S
626 },
627 'params': {
628 'skip_download': True,
629 },
11b56058 630 },
dd27fd17 631 {
2d3d2997 632 'url': 'https://www.youtube.com/watch?v=a9LDPn-MO4I',
4bc3a23e
PH
633 'note': '256k DASH audio (format 141) via DASH manifest',
634 'info_dict': {
635 'id': 'a9LDPn-MO4I',
636 'ext': 'm4a',
637 'upload_date': '20121002',
638 'uploader_id': '8KVIDEO',
ec85ded8 639 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/8KVIDEO',
4bc3a23e
PH
640 'description': '',
641 'uploader': '8KVIDEO',
642 'title': 'UHDTV TEST 8K VIDEO.mp4'
4919603f 643 },
4bc3a23e
PH
644 'params': {
645 'youtube_include_dash_manifest': True,
646 'format': '141',
4919603f 647 },
de3c7fe0 648 'skip': 'format 141 not served anymore',
dd27fd17 649 },
aa79ac0c
PH
650 # Controversy video
651 {
652 'url': 'https://www.youtube.com/watch?v=T4XJQO3qol8',
653 'info_dict': {
654 'id': 'T4XJQO3qol8',
655 'ext': 'mp4',
556dbe7f 656 'duration': 219,
aa79ac0c 657 'upload_date': '20100909',
4fe54c12 658 'uploader': 'Amazing Atheist',
aa79ac0c 659 'uploader_id': 'TheAmazingAtheist',
ec85ded8 660 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/TheAmazingAtheist',
aa79ac0c
PH
661 'title': 'Burning Everyone\'s Koran',
662 'description': 'SUBSCRIBE: http://www.youtube.com/saturninefilms\n\nEven Obama has taken a stand against freedom on this issue: http://www.huffingtonpost.com/2010/09/09/obama-gma-interview-quran_n_710282.html',
663 }
c522adb1 664 },
dd2d55f1 665 # Normal age-gate video (embed allowed)
c522adb1 666 {
2d3d2997 667 'url': 'https://youtube.com/watch?v=HtVdAasjOgU',
c522adb1
JMF
668 'info_dict': {
669 'id': 'HtVdAasjOgU',
670 'ext': 'mp4',
671 'title': 'The Witcher 3: Wild Hunt - The Sword Of Destiny Trailer',
ec85ded8 672 'description': r're:(?s).{100,}About the Game\n.*?The Witcher 3: Wild Hunt.{100,}',
556dbe7f 673 'duration': 142,
c522adb1
JMF
674 'uploader': 'The Witcher',
675 'uploader_id': 'WitcherGame',
ec85ded8 676 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/WitcherGame',
c522adb1 677 'upload_date': '20140605',
34952f09 678 'age_limit': 18,
c522adb1
JMF
679 },
680 },
067aa17e 681 # Olympics (https://github.com/ytdl-org/youtube-dl/issues/4431)
e52a40ab
PH
682 {
683 'url': 'lqQg6PlCWgI',
684 'info_dict': {
685 'id': 'lqQg6PlCWgI',
686 'ext': 'mp4',
556dbe7f 687 'duration': 6085,
90227264 688 'upload_date': '20150827',
cbe2bd91 689 'uploader_id': 'olympic',
ec85ded8 690 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/olympic',
cbe2bd91 691 'description': 'HO09 - Women - GER-AUS - Hockey - 31 July 2012 - London 2012 Olympic Games',
be49068d 692 'uploader': 'Olympic',
cbe2bd91
PH
693 'title': 'Hockey - Women - GER-AUS - London 2012 Olympic Games',
694 },
695 'params': {
696 'skip_download': 'requires avconv',
e52a40ab 697 }
cbe2bd91 698 },
6271f1ca
PH
699 # Non-square pixels
700 {
701 'url': 'https://www.youtube.com/watch?v=_b-2C3KPAM0',
702 'info_dict': {
703 'id': '_b-2C3KPAM0',
704 'ext': 'mp4',
705 'stretched_ratio': 16 / 9.,
556dbe7f 706 'duration': 85,
6271f1ca
PH
707 'upload_date': '20110310',
708 'uploader_id': 'AllenMeow',
ec85ded8 709 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/AllenMeow',
6271f1ca 710 'description': 'made by Wacom from Korea | 字幕&加油添醋 by TY\'s Allen | 感謝heylisa00cavey1001同學熱情提供梗及翻譯',
eb6793ba 711 'uploader': '孫ᄋᄅ',
6271f1ca
PH
712 'title': '[A-made] 變態妍字幕版 太妍 我就是這樣的人',
713 },
06b491eb
S
714 },
715 # url_encoded_fmt_stream_map is empty string
716 {
717 'url': 'qEJwOuvDf7I',
718 'info_dict': {
719 'id': 'qEJwOuvDf7I',
f57b7835 720 'ext': 'webm',
06b491eb
S
721 'title': 'Обсуждение судебной практики по выборам 14 сентября 2014 года в Санкт-Петербурге',
722 'description': '',
723 'upload_date': '20150404',
724 'uploader_id': 'spbelect',
725 'uploader': 'Наблюдатели Петербурга',
726 },
727 'params': {
728 'skip_download': 'requires avconv',
e323cf3f
S
729 },
730 'skip': 'This live event has ended.',
06b491eb 731 },
067aa17e 732 # Extraction from multiple DASH manifests (https://github.com/ytdl-org/youtube-dl/pull/6097)
da77d856
S
733 {
734 'url': 'https://www.youtube.com/watch?v=FIl7x6_3R5Y',
735 'info_dict': {
736 'id': 'FIl7x6_3R5Y',
eb6793ba 737 'ext': 'webm',
da77d856
S
738 'title': 'md5:7b81415841e02ecd4313668cde88737a',
739 'description': 'md5:116377fd2963b81ec4ce64b542173306',
556dbe7f 740 'duration': 220,
da77d856
S
741 'upload_date': '20150625',
742 'uploader_id': 'dorappi2000',
ec85ded8 743 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/dorappi2000',
da77d856 744 'uploader': 'dorappi2000',
eb6793ba 745 'formats': 'mincount:31',
da77d856 746 },
eb6793ba 747 'skip': 'not actual anymore',
2ee8f5d8 748 },
8a1a26ce
YCH
749 # DASH manifest with segment_list
750 {
751 'url': 'https://www.youtube.com/embed/CsmdDsKjzN8',
752 'md5': '8ce563a1d667b599d21064e982ab9e31',
753 'info_dict': {
754 'id': 'CsmdDsKjzN8',
755 'ext': 'mp4',
17ee98e1 756 'upload_date': '20150501', # According to '<meta itemprop="datePublished"', but in other places it's 20150510
8a1a26ce
YCH
757 'uploader': 'Airtek',
758 'description': 'Retransmisión en directo de la XVIII media maratón de Zaragoza.',
759 'uploader_id': 'UCzTzUmjXxxacNnL8I3m4LnQ',
760 'title': 'Retransmisión XVIII Media maratón Zaragoza 2015',
761 },
762 'params': {
763 'youtube_include_dash_manifest': True,
764 'format': '135', # bestvideo
be49068d
S
765 },
766 'skip': 'This live event has ended.',
2ee8f5d8 767 },
cf7e015f
S
768 {
769 # Multifeed videos (multiple cameras), URL is for Main Camera
770 'url': 'https://www.youtube.com/watch?v=jqWvoWXjCVs',
771 'info_dict': {
772 'id': 'jqWvoWXjCVs',
773 'title': 'teamPGP: Rocket League Noob Stream',
774 'description': 'md5:dc7872fb300e143831327f1bae3af010',
775 },
776 'playlist': [{
777 'info_dict': {
778 'id': 'jqWvoWXjCVs',
779 'ext': 'mp4',
780 'title': 'teamPGP: Rocket League Noob Stream (Main Camera)',
781 'description': 'md5:dc7872fb300e143831327f1bae3af010',
556dbe7f 782 'duration': 7335,
cf7e015f
S
783 'upload_date': '20150721',
784 'uploader': 'Beer Games Beer',
785 'uploader_id': 'beergamesbeer',
ec85ded8 786 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/beergamesbeer',
7caf9830 787 'license': 'Standard YouTube License',
cf7e015f
S
788 },
789 }, {
790 'info_dict': {
791 'id': '6h8e8xoXJzg',
792 'ext': 'mp4',
793 'title': 'teamPGP: Rocket League Noob Stream (kreestuh)',
794 'description': 'md5:dc7872fb300e143831327f1bae3af010',
556dbe7f 795 'duration': 7337,
cf7e015f
S
796 'upload_date': '20150721',
797 'uploader': 'Beer Games Beer',
798 'uploader_id': 'beergamesbeer',
ec85ded8 799 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/beergamesbeer',
7caf9830 800 'license': 'Standard YouTube License',
cf7e015f
S
801 },
802 }, {
803 'info_dict': {
804 'id': 'PUOgX5z9xZw',
805 'ext': 'mp4',
806 'title': 'teamPGP: Rocket League Noob Stream (grizzle)',
807 'description': 'md5:dc7872fb300e143831327f1bae3af010',
556dbe7f 808 'duration': 7337,
cf7e015f
S
809 'upload_date': '20150721',
810 'uploader': 'Beer Games Beer',
811 'uploader_id': 'beergamesbeer',
ec85ded8 812 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/beergamesbeer',
7caf9830 813 'license': 'Standard YouTube License',
cf7e015f
S
814 },
815 }, {
816 'info_dict': {
817 'id': 'teuwxikvS5k',
818 'ext': 'mp4',
819 'title': 'teamPGP: Rocket League Noob Stream (zim)',
820 'description': 'md5:dc7872fb300e143831327f1bae3af010',
556dbe7f 821 'duration': 7334,
cf7e015f
S
822 'upload_date': '20150721',
823 'uploader': 'Beer Games Beer',
824 'uploader_id': 'beergamesbeer',
ec85ded8 825 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/beergamesbeer',
7caf9830 826 'license': 'Standard YouTube License',
cf7e015f
S
827 },
828 }],
829 'params': {
830 'skip_download': True,
831 },
4fe54c12 832 'skip': 'This video is not available.',
cbaed4bb 833 },
f9f49d87 834 {
067aa17e 835 # Multifeed video with comma in title (see https://github.com/ytdl-org/youtube-dl/issues/8536)
f9f49d87
S
836 'url': 'https://www.youtube.com/watch?v=gVfLd0zydlo',
837 'info_dict': {
838 'id': 'gVfLd0zydlo',
839 'title': 'DevConf.cz 2016 Day 2 Workshops 1 14:00 - 15:30',
840 },
841 'playlist_count': 2,
be49068d 842 'skip': 'Not multifeed anymore',
f9f49d87 843 },
cbaed4bb 844 {
2d3d2997 845 'url': 'https://vid.plus/FlRa-iH7PGw',
cbaed4bb 846 'only_matching': True,
0e49d9a6 847 },
6d4fc66b 848 {
2d3d2997 849 'url': 'https://zwearz.com/watch/9lWxNJF-ufM/electra-woman-dyna-girl-official-trailer-grace-helbig.html',
6d4fc66b
S
850 'only_matching': True,
851 },
0e49d9a6 852 {
067aa17e 853 # Title with JS-like syntax "};" (see https://github.com/ytdl-org/youtube-dl/issues/7468)
a8776b10 854 # Also tests cut-off URL expansion in video description (see
067aa17e
S
855 # https://github.com/ytdl-org/youtube-dl/issues/1892,
856 # https://github.com/ytdl-org/youtube-dl/issues/8164)
0e49d9a6
LL
857 'url': 'https://www.youtube.com/watch?v=lsguqyKfVQg',
858 'info_dict': {
859 'id': 'lsguqyKfVQg',
860 'ext': 'mp4',
861 'title': '{dark walk}; Loki/AC/Dishonored; collab w/Elflover21',
eb6793ba 862 'alt_title': 'Dark Walk - Position Music',
0e49d9a6 863 'description': 'md5:8085699c11dc3f597ce0410b0dcbb34a',
556dbe7f 864 'duration': 133,
0e49d9a6
LL
865 'upload_date': '20151119',
866 'uploader_id': 'IronSoulElf',
ec85ded8 867 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/IronSoulElf',
0e49d9a6 868 'uploader': 'IronSoulElf',
eb6793ba
S
869 'creator': 'Todd Haberman, Daniel Law Heath and Aaron Kaplan',
870 'track': 'Dark Walk - Position Music',
871 'artist': 'Todd Haberman, Daniel Law Heath and Aaron Kaplan',
92bc97d3 872 'album': 'Position Music - Production Music Vol. 143 - Dark Walk',
0e49d9a6
LL
873 },
874 'params': {
875 'skip_download': True,
876 },
877 },
61f92af1 878 {
067aa17e 879 # Tags with '};' (see https://github.com/ytdl-org/youtube-dl/issues/7468)
61f92af1
S
880 'url': 'https://www.youtube.com/watch?v=Ms7iBXnlUO8',
881 'only_matching': True,
882 },
313dfc45
LL
883 {
884 # Video with yt:stretch=17:0
885 'url': 'https://www.youtube.com/watch?v=Q39EVAstoRM',
886 'info_dict': {
887 'id': 'Q39EVAstoRM',
888 'ext': 'mp4',
889 'title': 'Clash Of Clans#14 Dicas De Ataque Para CV 4',
890 'description': 'md5:ee18a25c350637c8faff806845bddee9',
891 'upload_date': '20151107',
892 'uploader_id': 'UCCr7TALkRbo3EtFzETQF1LA',
893 'uploader': 'CH GAMER DROID',
894 },
895 'params': {
896 'skip_download': True,
897 },
be49068d 898 'skip': 'This video does not exist.',
313dfc45 899 },
7caf9830
S
900 {
901 # Video licensed under Creative Commons
902 'url': 'https://www.youtube.com/watch?v=M4gD1WSo5mA',
903 'info_dict': {
904 'id': 'M4gD1WSo5mA',
905 'ext': 'mp4',
906 'title': 'md5:e41008789470fc2533a3252216f1c1d1',
907 'description': 'md5:a677553cf0840649b731a3024aeff4cc',
556dbe7f 908 'duration': 721,
7caf9830
S
909 'upload_date': '20150127',
910 'uploader_id': 'BerkmanCenter',
ec85ded8 911 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/BerkmanCenter',
556dbe7f 912 'uploader': 'The Berkman Klein Center for Internet & Society',
7caf9830
S
913 'license': 'Creative Commons Attribution license (reuse allowed)',
914 },
915 'params': {
916 'skip_download': True,
917 },
918 },
fd050249
S
919 {
920 # Channel-like uploader_url
921 'url': 'https://www.youtube.com/watch?v=eQcmzGIKrzg',
922 'info_dict': {
923 'id': 'eQcmzGIKrzg',
924 'ext': 'mp4',
925 'title': 'Democratic Socialism and Foreign Policy | Bernie Sanders',
926 'description': 'md5:dda0d780d5a6e120758d1711d062a867',
556dbe7f 927 'duration': 4060,
fd050249 928 'upload_date': '20151119',
eb6793ba 929 'uploader': 'Bernie Sanders',
fd050249 930 'uploader_id': 'UCH1dpzjCEiGAt8CXkryhkZg',
ec85ded8 931 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UCH1dpzjCEiGAt8CXkryhkZg',
fd050249
S
932 'license': 'Creative Commons Attribution license (reuse allowed)',
933 },
934 'params': {
935 'skip_download': True,
936 },
937 },
040ac686
S
938 {
939 'url': 'https://www.youtube.com/watch?feature=player_embedded&amp;amp;v=V36LpHqtcDY',
940 'only_matching': True,
7f29cf54
S
941 },
942 {
067aa17e 943 # YouTube Red paid video (https://github.com/ytdl-org/youtube-dl/issues/10059)
7f29cf54
S
944 'url': 'https://www.youtube.com/watch?v=i1Ko8UG-Tdo',
945 'only_matching': True,
6496ccb4
S
946 },
947 {
948 # Rental video preview
949 'url': 'https://www.youtube.com/watch?v=yYr8q0y5Jfg',
950 'info_dict': {
951 'id': 'uGpuVWrhIzE',
952 'ext': 'mp4',
953 'title': 'Piku - Trailer',
954 'description': 'md5:c36bd60c3fd6f1954086c083c72092eb',
955 'upload_date': '20150811',
956 'uploader': 'FlixMatrix',
957 'uploader_id': 'FlixMatrixKaravan',
ec85ded8 958 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/FlixMatrixKaravan',
6496ccb4
S
959 'license': 'Standard YouTube License',
960 },
961 'params': {
962 'skip_download': True,
963 },
eb6793ba 964 'skip': 'This video is not available.',
022a5d66 965 },
12afdc2a
S
966 {
967 # YouTube Red video with episode data
968 'url': 'https://www.youtube.com/watch?v=iqKdEhx-dD4',
969 'info_dict': {
970 'id': 'iqKdEhx-dD4',
971 'ext': 'mp4',
972 'title': 'Isolation - Mind Field (Ep 1)',
4fe54c12 973 'description': 'md5:46a29be4ceffa65b92d277b93f463c0f',
556dbe7f 974 'duration': 2085,
12afdc2a
S
975 'upload_date': '20170118',
976 'uploader': 'Vsauce',
977 'uploader_id': 'Vsauce',
978 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/Vsauce',
12afdc2a
S
979 'series': 'Mind Field',
980 'season_number': 1,
981 'episode_number': 1,
982 },
983 'params': {
984 'skip_download': True,
985 },
986 'expected_warnings': [
987 'Skipping DASH manifest',
988 ],
989 },
c7121fa7
S
990 {
991 # The following content has been identified by the YouTube community
992 # as inappropriate or offensive to some audiences.
993 'url': 'https://www.youtube.com/watch?v=6SJNVb0GnPI',
994 'info_dict': {
995 'id': '6SJNVb0GnPI',
996 'ext': 'mp4',
997 'title': 'Race Differences in Intelligence',
998 'description': 'md5:5d161533167390427a1f8ee89a1fc6f1',
999 'duration': 965,
1000 'upload_date': '20140124',
1001 'uploader': 'New Century Foundation',
1002 'uploader_id': 'UCEJYpZGqgUob0zVVEaLhvVg',
1003 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UCEJYpZGqgUob0zVVEaLhvVg',
c7121fa7
S
1004 },
1005 'params': {
1006 'skip_download': True,
1007 },
1008 },
022a5d66
S
1009 {
1010 # itag 212
1011 'url': '1t24XAntNCY',
1012 'only_matching': True,
fd5c4aab
S
1013 },
1014 {
1015 # geo restricted to JP
1016 'url': 'sJL6WA-aGkQ',
1017 'only_matching': True,
1018 },
d0ba5587
S
1019 {
1020 'url': 'https://www.youtube.com/watch?v=MuAGGZNfUkU&list=RDMM',
1021 'only_matching': True,
1022 },
cd5a74a2
S
1023 {
1024 'url': 'https://invidio.us/watch?v=BaW_jenozKc',
1025 'only_matching': True,
1026 },
825cd268
RA
1027 {
1028 # DRM protected
1029 'url': 'https://www.youtube.com/watch?v=s7_qI6_mIXc',
1030 'only_matching': True,
4fe54c12
S
1031 },
1032 {
1033 # Video with unsupported adaptive stream type formats
1034 'url': 'https://www.youtube.com/watch?v=Z4Vy8R84T1U',
1035 'info_dict': {
1036 'id': 'Z4Vy8R84T1U',
1037 'ext': 'mp4',
1038 'title': 'saman SMAN 53 Jakarta(Sancety) opening COFFEE4th at SMAN 53 Jakarta',
1039 'description': 'md5:d41d8cd98f00b204e9800998ecf8427e',
1040 'duration': 433,
1041 'upload_date': '20130923',
1042 'uploader': 'Amelia Putri Harwita',
1043 'uploader_id': 'UCpOxM49HJxmC1qCalXyB3_Q',
1044 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UCpOxM49HJxmC1qCalXyB3_Q',
1045 'formats': 'maxcount:10',
1046 },
1047 'params': {
1048 'skip_download': True,
1049 'youtube_include_dash_manifest': False,
1050 },
5429d6a9 1051 'skip': 'not actual anymore',
5caabd3c 1052 },
1053 {
822b9d9c 1054 # Youtube Music Auto-generated description
5caabd3c 1055 'url': 'https://music.youtube.com/watch?v=MgNrAu2pzNs',
1056 'info_dict': {
1057 'id': 'MgNrAu2pzNs',
1058 'ext': 'mp4',
1059 'title': 'Voyeur Girl',
1060 'description': 'md5:7ae382a65843d6df2685993e90a8628f',
1061 'upload_date': '20190312',
5429d6a9
S
1062 'uploader': 'Stephen - Topic',
1063 'uploader_id': 'UC-pWHpBjdGG69N9mM2auIAA',
5caabd3c 1064 'artist': 'Stephen',
1065 'track': 'Voyeur Girl',
1066 'album': 'it\'s too much love to know my dear',
1067 'release_date': '20190313',
1068 'release_year': 2019,
1069 },
1070 'params': {
1071 'skip_download': True,
1072 },
1073 },
1074 {
822b9d9c 1075 # Youtube Music Auto-generated description
5caabd3c 1076 # Retrieve 'artist' field from 'Artist:' in video description
1077 # when it is present on youtube music video
5caabd3c 1078 'url': 'https://www.youtube.com/watch?v=k0jLE7tTwjY',
1079 'info_dict': {
1080 'id': 'k0jLE7tTwjY',
1081 'ext': 'mp4',
1082 'title': 'Latch Feat. Sam Smith',
1083 'description': 'md5:3cb1e8101a7c85fcba9b4fb41b951335',
1084 'upload_date': '20150110',
1085 'uploader': 'Various Artists - Topic',
1086 'uploader_id': 'UCNkEcmYdjrH4RqtNgh7BZ9w',
1087 'artist': 'Disclosure',
1088 'track': 'Latch Feat. Sam Smith',
1089 'album': 'Latch Featuring Sam Smith',
1090 'release_date': '20121008',
1091 'release_year': 2012,
1092 },
1093 'params': {
1094 'skip_download': True,
1095 },
1096 },
1097 {
822b9d9c 1098 # Youtube Music Auto-generated description
5caabd3c 1099 # handle multiple artists on youtube music video
1100 'url': 'https://www.youtube.com/watch?v=74qn0eJSjpA',
1101 'info_dict': {
1102 'id': '74qn0eJSjpA',
1103 'ext': 'mp4',
1104 'title': 'Eastside',
1105 'description': 'md5:290516bb73dcbfab0dcc4efe6c3de5f2',
1106 'upload_date': '20180710',
1107 'uploader': 'Benny Blanco - Topic',
1108 'uploader_id': 'UCzqz_ksRu_WkIzmivMdIS7A',
1109 'artist': 'benny blanco, Halsey, Khalid',
1110 'track': 'Eastside',
1111 'album': 'Eastside',
1112 'release_date': '20180713',
1113 'release_year': 2018,
1114 },
1115 'params': {
1116 'skip_download': True,
1117 },
1118 },
1119 {
822b9d9c 1120 # Youtube Music Auto-generated description
5caabd3c 1121 # handle youtube music video with release_year and no release_date
1122 'url': 'https://www.youtube.com/watch?v=-hcAI0g-f5M',
1123 'info_dict': {
1124 'id': '-hcAI0g-f5M',
1125 'ext': 'mp4',
1126 'title': 'Put It On Me',
5429d6a9 1127 'description': 'md5:f6422397c07c4c907c6638e1fee380a5',
5caabd3c 1128 'upload_date': '20180426',
1129 'uploader': 'Matt Maeson - Topic',
1130 'uploader_id': 'UCnEkIGqtGcQMLk73Kp-Q5LQ',
1131 'artist': 'Matt Maeson',
1132 'track': 'Put It On Me',
1133 'album': 'The Hearse',
1134 'release_date': None,
1135 'release_year': 2018,
1136 },
1137 'params': {
1138 'skip_download': True,
1139 },
1140 },
66b48727
RA
1141 {
1142 'url': 'https://www.youtubekids.com/watch?v=3b8nCWDgZ6Q',
1143 'only_matching': True,
1144 },
011e75e6
S
1145 {
1146 # invalid -> valid video id redirection
1147 'url': 'DJztXj2GPfl',
1148 'info_dict': {
1149 'id': 'DJztXj2GPfk',
1150 'ext': 'mp4',
1151 'title': 'Panjabi MC - Mundian To Bach Ke (The Dictator Soundtrack)',
1152 'description': 'md5:bf577a41da97918e94fa9798d9228825',
1153 'upload_date': '20090125',
1154 'uploader': 'Prochorowka',
1155 'uploader_id': 'Prochorowka',
1156 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/Prochorowka',
1157 'artist': 'Panjabi MC',
1158 'track': 'Beware of the Boys (Mundian to Bach Ke) - Motivo Hi-Lectro Remix',
1159 'album': 'Beware of the Boys (Mundian To Bach Ke)',
1160 },
1161 'params': {
1162 'skip_download': True,
1163 },
ea74e00b
DP
1164 },
1165 {
1166 # empty description results in an empty string
1167 'url': 'https://www.youtube.com/watch?v=x41yOUIvK2k',
1168 'info_dict': {
1169 'id': 'x41yOUIvK2k',
1170 'ext': 'mp4',
1171 'title': 'IMG 3456',
1172 'description': '',
1173 'upload_date': '20170613',
1174 'uploader_id': 'ElevageOrVert',
1175 'uploader': 'ElevageOrVert',
1176 },
1177 'params': {
1178 'skip_download': True,
1179 },
1180 },
2eb88d95
PH
1181 ]
1182
e0df6211
PH
def __init__(self, *args, **kwargs):
    """Initialise the extractor with an empty signature-function cache."""
    super(YoutubeIE, self).__init__(*args, **kwargs)
    # Filled lazily by _decrypt_signature; keyed by
    # (player URL, signature shape id)
    self._player_cache = dict()
e0df6211 1186
c5e8d7af
PH
def report_video_info_webpage_download(self, video_id):
    """Tell the user that the video info webpage is being downloaded."""
    message = '%s: Downloading video info webpage' % video_id
    self.to_screen(message)
c5e8d7af 1190
c5e8d7af
PH
def report_information_extraction(self, video_id):
    """Tell the user that video information is being extracted."""
    message = '%s: Extracting video information' % video_id
    self.to_screen(message)
c5e8d7af
PH
1194
def report_unavailable_format(self, video_id, format):
    """Tell the user that the requested format is not available."""
    details = (video_id, format)
    self.to_screen('%s: Format %s not available' % details)
c5e8d7af
PH
1198
def report_rtmp_download(self):
    """Tell the user that the download will go through RTMP."""
    message = 'RTMP download detected'
    self.to_screen(message)
c5e8d7af 1202
60064c53
PH
def _signature_cache_id(self, example_sig):
    """Describe a signature by the lengths of its dot-separated parts.

    For instance 'ab.cde' is described as '2.3'.
    """
    part_lengths = [compat_str(len(chunk)) for chunk in example_sig.split('.')]
    return '.'.join(part_lengths)
60064c53 1206
e40c758c
S
1207 @classmethod
1208 def _extract_player_info(cls, player_url):
1209 for player_re in cls._PLAYER_INFO_RE:
1210 id_m = re.search(player_re, player_url)
1211 if id_m:
1212 break
1213 else:
c081b35c 1214 raise ExtractorError('Cannot identify player %r' % player_url)
e40c758c
S
1215 return id_m.group('ext'), id_m.group('id')
1216
1217 def _extract_signature_function(self, video_id, player_url, example_sig):
1218 player_type, player_id = self._extract_player_info(player_url)
e0df6211 1219
c4417ddb 1220 # Read from filesystem cache
60064c53
PH
1221 func_id = '%s_%s_%s' % (
1222 player_type, player_id, self._signature_cache_id(example_sig))
c4417ddb 1223 assert os.path.basename(func_id) == func_id
a0e07d31 1224
69ea8ca4 1225 cache_spec = self._downloader.cache.load('youtube-sigfuncs', func_id)
a0e07d31 1226 if cache_spec is not None:
78caa52a 1227 return lambda s: ''.join(s[i] for i in cache_spec)
83799698 1228
6d1a55a5
PH
1229 download_note = (
1230 'Downloading player %s' % player_url
1231 if self._downloader.params.get('verbose') else
1232 'Downloading %s player %s' % (player_type, player_id)
1233 )
e0df6211
PH
1234 if player_type == 'js':
1235 code = self._download_webpage(
1236 player_url, video_id,
6d1a55a5 1237 note=download_note,
69ea8ca4 1238 errnote='Download of %s failed' % player_url)
83799698 1239 res = self._parse_sig_js(code)
c4417ddb 1240 elif player_type == 'swf':
e0df6211
PH
1241 urlh = self._request_webpage(
1242 player_url, video_id,
6d1a55a5 1243 note=download_note,
69ea8ca4 1244 errnote='Download of %s failed' % player_url)
e0df6211 1245 code = urlh.read()
83799698 1246 res = self._parse_sig_swf(code)
e0df6211
PH
1247 else:
1248 assert False, 'Invalid player type %r' % player_type
1249
785521bf
PH
1250 test_string = ''.join(map(compat_chr, range(len(example_sig))))
1251 cache_res = res(test_string)
1252 cache_spec = [ord(c) for c in cache_res]
83799698 1253
69ea8ca4 1254 self._downloader.cache.store('youtube-sigfuncs', func_id, cache_spec)
83799698
PH
1255 return res
1256
def _print_sig_code(self, func, example_sig):
    """Print Python source equivalent to the signature function func.

    func is applied to a probe string as long as example_sig whose
    characters are their own positions; the resulting index permutation is
    rendered as a sum of index/slice expressions on ``s`` and written to
    the screen together with a guard on the signature shape.
    """
    def gen_sig_code(idxs):
        """Yield index/slice expressions on ``s`` that reproduce idxs."""
        def _genslice(start, end, step):
            starts = '' if start == 0 else str(start)
            ends = (':%d' % (end + step)) if end + step >= 0 else ':'
            steps = '' if step == 1 else (':%d' % step)
            return 's[%s%s%s]' % (starts, ends, steps)

        if not idxs:
            # Empty signature: there is nothing to describe
            return
        # Seed with the first index so that a single-element list does not
        # leave ``i`` unbound after the (then empty) pairwise loop below
        i = idxs[0]
        step = None
        # Squelch pyflakes warnings - start will be set when step is set
        start = '(Never used)'
        for i, prev in zip(idxs[1:], idxs[:-1]):
            if step is not None:
                if i - prev == step:
                    # Still inside the current run
                    continue
                yield _genslice(start, prev, step)
                step = None
                continue
            if i - prev in [-1, 1]:
                # A new ascending or descending run starts at prev
                step = i - prev
                start = prev
                continue
            else:
                yield 's[%d]' % prev
        if step is None:
            yield 's[%d]' % i
        else:
            yield _genslice(start, i, step)

    test_string = ''.join(map(compat_chr, range(len(example_sig))))
    cache_res = func(test_string)
    cache_spec = [ord(c) for c in cache_res]
    expr_code = ' + '.join(gen_sig_code(cache_spec))
    signature_id_tuple = '(%s)' % (
        ', '.join(compat_str(len(p)) for p in example_sig.split('.')))
    code = ('if tuple(len(p) for p in s.split(\'.\')) == %s:\n'
            ' return %s\n') % (signature_id_tuple, expr_code)
    self.to_screen('Extracted signature function:\n' + code)
edf3e38e 1295
e0df6211
PH
def _parse_sig_js(self, jscode):
    """Build a signature-deciphering callable from JS player source.

    The name of the initial signature function is located with a list of
    regular expressions (tried in order), then the function is extracted
    with JSInterpreter.  The returned callable takes the signature string.
    """
    name_patterns = (
        r'\b[cs]\s*&&\s*[adf]\.set\([^,]+\s*,\s*encodeURIComponent\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\(',
        r'\b[a-zA-Z0-9]+\s*&&\s*[a-zA-Z0-9]+\.set\([^,]+\s*,\s*encodeURIComponent\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\(',
        r'(?:\b|[^a-zA-Z0-9$])(?P<sig>[a-zA-Z0-9$]{2})\s*=\s*function\(\s*a\s*\)\s*{\s*a\s*=\s*a\.split\(\s*""\s*\)',
        r'(?P<sig>[a-zA-Z0-9$]+)\s*=\s*function\(\s*a\s*\)\s*{\s*a\s*=\s*a\.split\(\s*""\s*\)',
        # Obsolete patterns
        r'(["\'])signature\1\s*,\s*(?P<sig>[a-zA-Z0-9$]+)\(',
        r'\.sig\|\|(?P<sig>[a-zA-Z0-9$]+)\(',
        r'yt\.akamaized\.net/\)\s*\|\|\s*.*?\s*[cs]\s*&&\s*[adf]\.set\([^,]+\s*,\s*(?:encodeURIComponent\s*\()?\s*(?P<sig>[a-zA-Z0-9$]+)\(',
        r'\b[cs]\s*&&\s*[adf]\.set\([^,]+\s*,\s*(?P<sig>[a-zA-Z0-9$]+)\(',
        r'\b[a-zA-Z0-9]+\s*&&\s*[a-zA-Z0-9]+\.set\([^,]+\s*,\s*(?P<sig>[a-zA-Z0-9$]+)\(',
        r'\bc\s*&&\s*a\.set\([^,]+\s*,\s*\([^)]*\)\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\(',
        # NOTE: the two patterns below are identical
        r'\bc\s*&&\s*[a-zA-Z0-9]+\.set\([^,]+\s*,\s*\([^)]*\)\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\(',
        r'\bc\s*&&\s*[a-zA-Z0-9]+\.set\([^,]+\s*,\s*\([^)]*\)\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\(',
    )
    funcname = self._search_regex(
        name_patterns, jscode,
        'Initial JS player signature function name', group='sig')

    decipher = JSInterpreter(jscode).extract_function(funcname)

    def run(s):
        # The extracted function takes its arguments as a list
        return decipher([s])
    return run
1316
def _parse_sig_swf(self, file_contents):
    """Build a signature-deciphering callable from SWF player contents.

    The 'decipher' function of the 'SignatureDecipher' class is extracted
    with SWFInterpreter.  The returned callable takes the signature string.
    """
    interpreter = SWFInterpreter(file_contents)
    decipher_class = interpreter.extract_class('SignatureDecipher')
    decipher = interpreter.extract_function(decipher_class, 'decipher')

    def run(s):
        # The extracted function takes its arguments as a list
        return decipher([s])
    return run
1323
83799698 1324 def _decrypt_signature(self, s, video_id, player_url, age_gate=False):
257a2501 1325 """Turn the encrypted s field into a working signature"""
6b37f0be 1326
c8bf86d5 1327 if player_url is None:
69ea8ca4 1328 raise ExtractorError('Cannot decrypt signature without player_url')
920de7a2 1329
69ea8ca4 1330 if player_url.startswith('//'):
78caa52a 1331 player_url = 'https:' + player_url
3c90cc8b
S
1332 elif not re.match(r'https?://', player_url):
1333 player_url = compat_urlparse.urljoin(
1334 'https://www.youtube.com', player_url)
c8bf86d5 1335 try:
62af3a0e 1336 player_id = (player_url, self._signature_cache_id(s))
c8bf86d5
PH
1337 if player_id not in self._player_cache:
1338 func = self._extract_signature_function(
60064c53 1339 video_id, player_url, s
c8bf86d5
PH
1340 )
1341 self._player_cache[player_id] = func
1342 func = self._player_cache[player_id]
1343 if self._downloader.params.get('youtube_print_sig_code'):
60064c53 1344 self._print_sig_code(func, s)
c8bf86d5
PH
1345 return func(s)
1346 except Exception as e:
1347 tb = traceback.format_exc()
1348 raise ExtractorError(
78caa52a 1349 'Signature extraction failed: ' + tb, cause=e)
e0df6211 1350
f96f5dda 1351 def _get_subtitles(self, video_id, webpage, has_live_chat_replay):
de7f3446 1352 try:
60e47a26 1353 subs_doc = self._download_xml(
38c2e5b8 1354 'https://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id,
7fad1c63
JMF
1355 video_id, note=False)
1356 except ExtractorError as err:
9b9c5355 1357 self._downloader.report_warning('unable to download video subtitles: %s' % error_to_compat_str(err))
de7f3446 1358 return {}
de7f3446
JMF
1359
1360 sub_lang_list = {}
60e47a26
JMF
1361 for track in subs_doc.findall('track'):
1362 lang = track.attrib['lang_code']
7e660ac1
LD
1363 if lang in sub_lang_list:
1364 continue
360e1ca5 1365 sub_formats = []
23d17e4b 1366 for ext in self._SUBTITLE_FORMATS:
15707c7e 1367 params = compat_urllib_parse_urlencode({
360e1ca5
JMF
1368 'lang': lang,
1369 'v': video_id,
1370 'fmt': ext,
1371 'name': track.attrib['name'].encode('utf-8'),
1372 })
1373 sub_formats.append({
1374 'url': 'https://www.youtube.com/api/timedtext?' + params,
1375 'ext': ext,
1376 })
1377 sub_lang_list[lang] = sub_formats
9f448fcb 1378 if has_live_chat_replay:
321bf820 1379 sub_lang_list['live_chat'] = [
1380 {
1381 'video_id': video_id,
1382 'ext': 'json',
1383 'protocol': 'youtube_live_chat_replay',
1384 },
9f448fcb 1385 ]
de7f3446 1386 if not sub_lang_list:
69ea8ca4 1387 self._downloader.report_warning('video doesn\'t have subtitles')
de7f3446
JMF
1388 return {}
1389 return sub_lang_list
1390
a72778d3
S
1391 def _get_ytplayer_config(self, video_id, webpage):
1392 patterns = (
526b3b07
S
1393 # User data may contain arbitrary character sequences that may affect
1394 # JSON extraction with regex, e.g. when '};' is contained the second
1395 # regex won't capture the whole JSON. Yet working around by trying more
1396 # concrete regex first keeping in mind proper quoted string handling
1397 # to be implemented in future that will replace this workaround (see
067aa17e
S
1398 # https://github.com/ytdl-org/youtube-dl/issues/7468,
1399 # https://github.com/ytdl-org/youtube-dl/pull/7599)
a72778d3
S
1400 r';ytplayer\.config\s*=\s*({.+?});ytplayer',
1401 r';ytplayer\.config\s*=\s*({.+?});',
1402 )
1403 config = self._search_regex(
1404 patterns, webpage, 'ytplayer.config', default=None)
1405 if config:
1406 return self._parse_json(
1407 uppercase_escape(config), video_id, fatal=False)
0e49d9a6 1408
def _get_music_metadata_from_yt_initial(self, yt_initial):
    """Collect music metadata rows from the initial watch-page data.

    yt_initial is walked down to the metadata rows of each
    videoSecondaryInfoRenderer.  Rows titled 'Album', 'Artist' or 'Song'
    are mapped to 'album', 'artist' and 'track'.  A repeated field within
    one renderer starts a new track.

    Returns a list of dicts, one per track; empty when nothing is found.
    """
    music_metadata = []
    key_map = {
        'Album': 'album',
        'Artist': 'artist',
        'Song': 'track'
    }
    contents = try_get(yt_initial, lambda x: x['contents']['twoColumnWatchNextResults']['results']['results']['contents'])
    if not isinstance(contents, list):
        return music_metadata

    for content in contents:
        if not isinstance(content, dict):
            continue
        secondary_info = try_get(content, lambda x: x['videoSecondaryInfoRenderer'])
        if not isinstance(secondary_info, dict):
            continue
        rows = try_get(secondary_info, lambda x: x['metadataRowContainer']['metadataRowContainerRenderer']['rows'])
        if not isinstance(rows, list):
            continue

        music_track = {}
        for row in rows:
            row_renderer = try_get(row, lambda x: x['metadataRowRenderer'])
            if not isinstance(row_renderer, dict):
                continue
            key = try_get(row_renderer, lambda x: x['title']['simpleText'])
            value = try_get(row_renderer, lambda x: x['contents'][0]['simpleText']) or \
                try_get(row_renderer, lambda x: x['contents'][0]['runs'][0]['text'])
            # compat_str rather than str: on Python 2 decoded JSON text is
            # unicode, which an exact ``type(...) is str`` test rejects
            if not isinstance(key, compat_str) or not isinstance(value, compat_str):
                continue
            field = key_map.get(key)
            if field is None:
                continue
            if field in music_track:
                # we've started on a new track
                music_metadata.append(music_track)
                music_track = {}
            music_track[field] = value
        if music_track:
            music_metadata.append(music_track)
    return music_metadata
1446
def _get_automatic_captions(self, video_id, webpage):
    """Return automatic captions as a dict of language code -> formats.

    We need the webpage for getting the captions url, pass it as an
    argument to speed up the process.

    Three sources are tried in order: the 'ttsurl' player argument, the
    'player_response' player argument, and finally the 'caption_tracks' /
    'caption_translation_languages' player arguments.  An empty dict is
    returned (after a warning) when no captions can be found.
    """
    self.to_screen('%s: Looking for automatic captions' % video_id)
    player_config = self._get_ytplayer_config(video_id, webpage)
    err_msg = 'Couldn\'t find automatic captions for %s' % video_id
    if not player_config:
        self._downloader.report_warning(err_msg)
        return {}
    try:
        args = player_config['args']
        caption_url = args.get('ttsurl')
        if caption_url:
            timestamp = args['timestamp']
            # We get the available subtitles
            list_params = compat_urllib_parse_urlencode({
                'type': 'list',
                'tlangs': 1,
                'asrs': 1,
            })
            list_url = caption_url + '&' + list_params
            caption_list = self._download_xml(list_url, video_id)
            # The first <track> node supplies the source language and kind
            original_lang_node = caption_list.find('track')
            if original_lang_node is None:
                self._downloader.report_warning('Video doesn\'t have automatic captions')
                return {}
            original_lang = original_lang_node.attrib['lang_code']
            caption_kind = original_lang_node.attrib.get('kind', '')

            # One entry per <target> language, in every subtitle format
            sub_lang_list = {}
            for lang_node in caption_list.findall('target'):
                sub_lang = lang_node.attrib['lang_code']
                sub_formats = []
                for ext in self._SUBTITLE_FORMATS:
                    params = compat_urllib_parse_urlencode({
                        'lang': original_lang,
                        'tlang': sub_lang,
                        'fmt': ext,
                        'ts': timestamp,
                        'kind': caption_kind,
                    })
                    sub_formats.append({
                        'url': caption_url + '&' + params,
                        'ext': ext,
                    })
                sub_lang_list[sub_lang] = sub_formats
            return sub_lang_list

        def make_captions(sub_url, sub_langs):
            # Build {lang: [formats]} by rewriting the 'tlang' and 'fmt'
            # query parameters of sub_url for each language/format pair
            parsed_sub_url = compat_urllib_parse_urlparse(sub_url)
            caption_qs = compat_parse_qs(parsed_sub_url.query)
            captions = {}
            for sub_lang in sub_langs:
                sub_formats = []
                for ext in self._SUBTITLE_FORMATS:
                    caption_qs.update({
                        'tlang': [sub_lang],
                        'fmt': [ext],
                    })
                    sub_url = compat_urlparse.urlunparse(parsed_sub_url._replace(
                        query=compat_urllib_parse_urlencode(caption_qs, True)))
                    sub_formats.append({
                        'url': sub_url,
                        'ext': ext,
                    })
                captions[sub_lang] = sub_formats
            return captions

        # New captions format as of 22.06.2017
        player_response = args.get('player_response')
        if player_response and isinstance(player_response, compat_str):
            player_response = self._parse_json(
                player_response, video_id, fatal=False)
            if player_response:
                renderer = player_response['captions']['playerCaptionsTracklistRenderer']
                caption_tracks = renderer['captionTracks']
                for caption_track in caption_tracks:
                    if 'kind' not in caption_track:
                        # not an automatic transcription
                        continue
                    base_url = caption_track['baseUrl']
                    sub_lang_list = []
                    for lang in renderer['translationLanguages']:
                        lang_code = lang.get('languageCode')
                        if lang_code:
                            sub_lang_list.append(lang_code)
                    # The first automatic track found is the one used
                    return make_captions(base_url, sub_lang_list)

                # No track carried a 'kind' key
                self._downloader.report_warning("Couldn't find automatic captions for %s" % video_id)
                return {}
        # Some videos don't provide ttsurl but rather caption_tracks and
        # caption_translation_languages (e.g. 20LmZk1hakA)
        # Not used anymore as of 22.06.2017
        caption_tracks = args['caption_tracks']
        caption_translation_languages = args['caption_translation_languages']
        caption_url = compat_parse_qs(caption_tracks.split(',')[0])['u'][0]
        sub_lang_list = []
        for lang in caption_translation_languages.split(','):
            lang_qs = compat_parse_qs(compat_urllib_parse_unquote_plus(lang))
            sub_lang = lang_qs.get('lc', [None])[0]
            if sub_lang:
                sub_lang_list.append(sub_lang)
        return make_captions(caption_url, sub_lang_list)
    # An extractor error can be raised by the download process if there are
    # no automatic captions but there are subtitles
    except (KeyError, IndexError, ExtractorError):
        self._downloader.report_warning(err_msg)
        return {}
1555
21c340b8
S
def _mark_watched(self, video_id, video_info, player_response):
    """Request the playback-stats URL so the video gets marked as watched.

    The base URL is taken from ``player_response`` (playbackTracking ->
    videostatsPlaybackUrl -> baseUrl) and, failing that, from the first
    ``videostats_playback_base_url`` entry of ``video_info``.  Nothing is
    requested when no valid URL is found.  The request itself is
    non-fatal.
    """
    tracking_url = try_get(
        player_response,
        lambda x: x['playbackTracking']['videostatsPlaybackUrl']['baseUrl'])
    if not tracking_url:
        tracking_url = try_get(
            video_info, lambda x: x['videostats_playback_base_url'][0])
    tracking_url = url_or_none(tracking_url)
    if not tracking_url:
        return

    url_parts = compat_urlparse.urlparse(tracking_url)
    query = compat_urlparse.parse_qs(url_parts.query)

    # cpn generation algorithm is reverse engineered from base.js.
    # In fact it works even with dummy cpn.
    CPN_ALPHABET = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789-_'
    cpn_chars = []
    for _ in range(16):
        # Masking with 63 keeps the index inside the 64-character alphabet
        cpn_chars.append(CPN_ALPHABET[random.randint(0, 256) & 63])
    cpn = ''.join(cpn_chars)

    query['ver'] = ['2']
    query['cpn'] = [cpn]

    encoded_query = compat_urllib_parse_urlencode(query, True)
    tracking_url = compat_urlparse.urlunparse(
        url_parts._replace(query=encoded_query))

    self._download_webpage(
        tracking_url, video_id, 'Marking watched',
        'Unable to mark watched', fatal=False)
1581
66c9fa36
S
1582 @staticmethod
1583 def _extract_urls(webpage):
1584 # Embedded YouTube player
1585 entries = [
1586 unescapeHTML(mobj.group('url'))
1587 for mobj in re.finditer(r'''(?x)
1588 (?:
1589 <iframe[^>]+?src=|
1590 data-video-url=|
1591 <embed[^>]+?src=|
1592 embedSWF\(?:\s*|
1593 <object[^>]+data=|
1594 new\s+SWFObject\(
1595 )
1596 (["\'])
1597 (?P<url>(?:https?:)?//(?:www\.)?youtube(?:-nocookie)?\.com/
f2332f18 1598 (?:embed|v|p)/[0-9A-Za-z_-]{11}.*?)
66c9fa36
S
1599 \1''', webpage)]
1600
1601 # lazyYT YouTube embed
1602 entries.extend(list(map(
1603 unescapeHTML,
1604 re.findall(r'class="lazyYT" data-youtube-id="([^"]+)"', webpage))))
1605
1606 # Wordpress "YouTube Video Importer" plugin
1607 matches = re.findall(r'''(?x)<div[^>]+
1608 class=(?P<q1>[\'"])[^\'"]*\byvii_single_video_player\b[^\'"]*(?P=q1)[^>]+
1609 data-video_id=(?P<q2>[\'"])([^\'"]+)(?P=q2)''', webpage)
1610 entries.extend(m[-1] for m in matches)
1611
1612 return entries
1613
@staticmethod
def _extract_url(webpage):
    """Return the first embed reported by ``YoutubeIE._extract_urls``.

    Returns None when the page contains no recognised embed.
    """
    found = YoutubeIE._extract_urls(webpage)
    if not found:
        return None
    return found[0]
1618
97665381
PH
@classmethod
def extract_id(cls, url):
    """Return the video id contained in *url*.

    The URL is matched against ``cls._VALID_URL`` (compiled in verbose
    mode); the id is the second capture group of that pattern.

    Raises ExtractorError when the URL does not match.
    """
    match = re.match(cls._VALID_URL, url, re.VERBOSE)
    if match is not None:
        return match.group(2)
    raise ExtractorError('Invalid URL: %s' % url)
1626
84213ea8
S
def _extract_chapters_from_json(self, webpage, video_id, duration):
    """Build the chapter list from the ``ytInitialData`` JSON of a watch page.

    Each returned chapter is a dict with ``start_time``, ``end_time`` and
    ``title``.  A chapter ends where the next one starts; the last one
    ends at *duration*.  Chapters whose start or end cannot be determined
    are left out.

    Returns None when there is no webpage, the JSON cannot be used or it
    holds no chapters; otherwise a list of chapter dicts.
    """
    if not webpage:
        return

    raw_json = self._search_regex(
        r'window\["ytInitialData"\] = (.+);\n', webpage,
        'player args', default='{}')
    initial_data = self._parse_json(raw_json, video_id, fatal=False)
    if not isinstance(initial_data, dict) or not initial_data:
        return

    def find_chapters(data):
        overlay = data['playerOverlays']['playerOverlayRenderer']
        player_bar = (
            overlay['decoratedPlayerBarRenderer']
            ['decoratedPlayerBarRenderer']['playerBar'])
        return player_bar['chapteredPlayerBarRenderer']['chapters']

    chapters_list = try_get(initial_data, find_chapters, list)
    if not chapters_list:
        return

    def chapter_time(chapter):
        # Start offset is given in milliseconds; convert it to seconds
        millis = try_get(
            chapter,
            lambda x: x['chapterRenderer']['timeRangeStartMillis'],
            int)
        return float_or_none(millis, scale=1000)

    total = len(chapters_list)
    chapters = []
    for index, chapter in enumerate(chapters_list):
        start_time = chapter_time(chapter)
        if start_time is None:
            continue
        following = index + 1
        if following < total:
            end_time = chapter_time(chapters_list[following])
        else:
            end_time = duration
        if end_time is None:
            continue
        title = try_get(
            chapter, lambda x: x['chapterRenderer']['title']['simpleText'],
            compat_str)
        chapters.append({
            'start_time': start_time,
            'end_time': end_time,
            'title': title,
        })
    return chapters
1675
9cafc3fd 1676 @staticmethod
84213ea8 1677 def _extract_chapters_from_description(description, duration):
9cafc3fd
S
1678 if not description:
1679 return None
1680 chapter_lines = re.findall(
1681 r'(?:^|<br\s*/>)([^<]*<a[^>]+onclick=["\']yt\.www\.watch\.player\.seekTo[^>]+>(\d{1,2}:\d{1,2}(?::\d{1,2})?)</a>[^>]*)(?=$|<br\s*/>)',
1682 description)
1683 if not chapter_lines:
1684 return None
1685 chapters = []
1686 for next_num, (chapter_line, time_point) in enumerate(
1687 chapter_lines, start=1):
1688 start_time = parse_duration(time_point)
1689 if start_time is None:
1690 continue
39d4c1be
S
1691 if start_time > duration:
1692 break
9cafc3fd
S
1693 end_time = (duration if next_num == len(chapter_lines)
1694 else parse_duration(chapter_lines[next_num][1]))
1695 if end_time is None:
1696 continue
39d4c1be
S
1697 if end_time > duration:
1698 end_time = duration
1699 if start_time > end_time:
1700 break
9cafc3fd
S
1701 chapter_title = re.sub(
1702 r'<a[^>]+>[^<]+</a>', '', chapter_line).strip(' \t-')
1703 chapter_title = re.sub(r'\s+', ' ', chapter_title)
1704 chapters.append({
1705 'start_time': start_time,
1706 'end_time': end_time,
1707 'title': chapter_title,
1708 })
1709 return chapters
1710
84213ea8
S
def _extract_chapters(self, webpage, description, video_id, duration):
    """Return the video chapters, preferring the page JSON over the description.

    The description is consulted only when the JSON extraction yields
    nothing (None or an empty list).
    """
    chapters = self._extract_chapters_from_json(webpage, video_id, duration)
    if chapters:
        return chapters
    return self._extract_chapters_from_description(description, duration)
1714
c5e8d7af 1715 def _real_extract(self, url):
cf7e015f
S
1716 url, smuggled_data = unsmuggle_url(url, {})
1717
7e8c0af0 1718 proto = (
78caa52a
PH
1719 'http' if self._downloader.params.get('prefer_insecure', False)
1720 else 'https')
7e8c0af0 1721
7c80519c 1722 start_time = None
297a564b 1723 end_time = None
7c80519c
JMF
1724 parsed_url = compat_urllib_parse_urlparse(url)
1725 for component in [parsed_url.fragment, parsed_url.query]:
1726 query = compat_parse_qs(component)
297a564b 1727 if start_time is None and 't' in query:
7c80519c 1728 start_time = parse_duration(query['t'][0])
2929fa0e
JMF
1729 if start_time is None and 'start' in query:
1730 start_time = parse_duration(query['start'][0])
297a564b
JMF
1731 if end_time is None and 'end' in query:
1732 end_time = parse_duration(query['end'][0])
7c80519c 1733
c5e8d7af
PH
1734 # Extract original video URL from URL with redirection, like age verification, using next_url parameter
1735 mobj = re.search(self._NEXT_URL_RE, url)
1736 if mobj:
7fd002c0 1737 url = proto + '://www.youtube.com/' + compat_urllib_parse_unquote(mobj.group(1)).lstrip('/')
97665381 1738 video_id = self.extract_id(url)
c5e8d7af
PH
1739
1740 # Get video webpage
aa79ac0c 1741 url = proto + '://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1&bpctr=9999999999' % video_id
011e75e6
S
1742 video_webpage, urlh = self._download_webpage_handle(url, video_id)
1743
1744 qs = compat_parse_qs(compat_urllib_parse_urlparse(urlh.geturl()).query)
1745 video_id = qs.get('v', [None])[0] or video_id
c5e8d7af
PH
1746
1747 # Attempt to extract SWF player URL
e0df6211 1748 mobj = re.search(r'swfConfig.*?"(https?:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
c5e8d7af
PH
1749 if mobj is not None:
1750 player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
1751 else:
1752 player_url = None
1753
d8d24a92
S
1754 dash_mpds = []
1755
1756 def add_dash_mpd(video_info):
1757 dash_mpd = video_info.get('dashmpd')
1758 if dash_mpd and dash_mpd[0] not in dash_mpds:
1759 dash_mpds.append(dash_mpd[0])
1760
561b456e
S
1761 def add_dash_mpd_pr(pl_response):
1762 dash_mpd = url_or_none(try_get(
1763 pl_response, lambda x: x['streamingData']['dashManifestUrl'],
1764 compat_str))
1765 if dash_mpd and dash_mpd not in dash_mpds:
1766 dash_mpds.append(dash_mpd)
1767
c7121fa7
S
1768 is_live = None
1769 view_count = None
1770
1771 def extract_view_count(v_info):
1772 return int_or_none(try_get(v_info, lambda x: x['view_count'][0]))
1773
c2d125d9
S
1774 def extract_player_response(player_response, video_id):
1775 pl_response = str_or_none(player_response)
1776 if not pl_response:
1777 return
1778 pl_response = self._parse_json(pl_response, video_id, fatal=False)
1779 if isinstance(pl_response, dict):
1780 add_dash_mpd_pr(pl_response)
1781 return pl_response
1782
fb2c9277
U
1783 def extract_embedded_config(embed_webpage, video_id):
1784 embedded_config = self._search_regex(
1785 r'setConfig\(({.*})\);',
1786 embed_webpage, 'ytInitialData', default=None)
1787 if embedded_config:
1788 return embedded_config
1789
dbdaaa23
S
1790 player_response = {}
1791
c5e8d7af 1792 # Get video info
43ebf77d 1793 video_info = {}
6449cd80 1794 embed_webpage = None
39e7107d
U
1795 if (self._og_search_property('restrictions:age', video_webpage, default=None) == '18+'
1796 or re.search(r'player-age-gate-content">', video_webpage) is not None):
9d9314cb 1797 cookie_keys = self._get_cookies('https://www.youtube.com').keys()
c108eb73
JMF
1798 age_gate = True
1799 # We simulate the access to the video from www.youtube.com/v/{video_id}
1800 # this can be viewed without login into Youtube
beb95e77
CL
1801 url = proto + '://www.youtube.com/embed/%s' % video_id
1802 embed_webpage = self._download_webpage(url, video_id, 'Downloading embed webpage')
fb2c9277
U
1803 ext = extract_embedded_config(embed_webpage, video_id)
1804 # playabilityStatus = re.search(r'{\\\"status\\\":\\\"(?P<playabilityStatus>[^\"]+)\\\"', ext)
1805 playable_in_embed = re.search(r'{\\\"playableInEmbed\\\":(?P<playableinEmbed>[^\,]+)', ext)
1806 if not playable_in_embed:
1807 self.to_screen('Could not determine whether playabale in embed for video %s' % video_id)
1808 playable_in_embed = ''
1809 else:
1810 playable_in_embed = playable_in_embed.group('playableinEmbed')
1811 # check if video is only playable on youtube in other words not playable in embed - if so it requires auth (cookies)
1812 # if re.search(r'player-unavailable">', embed_webpage) is not None:
1813 if playable_in_embed == 'false':
c73baf23
U
1814 '''
1815 # TODO apply this patch when Support for Python 2.6(!) and above drops
9d9314cb 1816 if ({'VISITOR_INFO1_LIVE', 'HSID', 'SSID', 'SID'} <= cookie_keys
4bb9c880 1817 or {'VISITOR_INFO1_LIVE', '__Secure-3PSID', 'LOGIN_INFO'} <= cookie_keys):
c73baf23
U
1818 '''
1819 if (set(('VISITOR_INFO1_LIVE', 'HSID', 'SSID', 'SID')) <= set(cookie_keys)
1820 or set(('VISITOR_INFO1_LIVE', '__Secure-3PSID', 'LOGIN_INFO')) <= set(cookie_keys)):
4bb9c880
U
1821 age_gate = False
1822 # Try looking directly into the video webpage
1823 ytplayer_config = self._get_ytplayer_config(video_id, video_webpage)
1824 if ytplayer_config:
1825 args = ytplayer_config['args']
1826 if args.get('url_encoded_fmt_stream_map') or args.get('hlsvp'):
1827 # Convert to the same format returned by compat_parse_qs
1828 video_info = dict((k, [v]) for k, v in args.items())
1829 add_dash_mpd(video_info)
1830 # Rental video is not rented but preview is available (e.g.
1831 # https://www.youtube.com/watch?v=yYr8q0y5Jfg,
1832 # https://github.com/ytdl-org/youtube-dl/issues/10532)
1833 if not video_info and args.get('ypc_vid'):
1834 return self.url_result(
1835 args['ypc_vid'], YoutubeIE.ie_key(), video_id=args['ypc_vid'])
1836 if args.get('livestream') == '1' or args.get('live_playback') == 1:
1837 is_live = True
1838 if not player_response:
1839 player_response = extract_player_response(args.get('player_response'), video_id)
1840 if not video_info or self._downloader.params.get('youtube_include_dash_manifest', True):
1841 add_dash_mpd_pr(player_response)
9d9314cb
U
1842 else:
1843 raise ExtractorError('Video is age restricted and only playable on Youtube. Requires cookies!', expected=True)
1844 else:
1845 data = compat_urllib_parse_urlencode({
1846 'video_id': video_id,
1847 'eurl': 'https://youtube.googleapis.com/v/' + video_id,
1848 'sts': self._search_regex(
1849 r'"sts"\s*:\s*(\d+)', embed_webpage, 'sts', default=''),
1850 })
1851 video_info_url = proto + '://www.youtube.com/get_video_info?' + data
1852 try:
1853 video_info_webpage = self._download_webpage(
1854 video_info_url, video_id,
1855 note='Refetching age-gated info webpage',
1856 errnote='unable to download video info webpage')
1857 except ExtractorError:
1858 video_info_webpage = None
1859 if video_info_webpage:
1860 video_info = compat_parse_qs(video_info_webpage)
1861 pl_response = video_info.get('player_response', [None])[0]
1862 player_response = extract_player_response(pl_response, video_id)
1863 add_dash_mpd(video_info)
1864 view_count = extract_view_count(video_info)
c108eb73
JMF
1865 else:
1866 age_gate = False
d8d24a92 1867 # Try looking directly into the video webpage
a72778d3
S
1868 ytplayer_config = self._get_ytplayer_config(video_id, video_webpage)
1869 if ytplayer_config:
4e62ebe2 1870 args = ytplayer_config['args']
4c76aa06 1871 if args.get('url_encoded_fmt_stream_map') or args.get('hlsvp'):
d8d24a92
S
1872 # Convert to the same format returned by compat_parse_qs
1873 video_info = dict((k, [v]) for k, v in args.items())
1874 add_dash_mpd(video_info)
6496ccb4
S
1875 # Rental video is not rented but preview is available (e.g.
1876 # https://www.youtube.com/watch?v=yYr8q0y5Jfg,
067aa17e 1877 # https://github.com/ytdl-org/youtube-dl/issues/10532)
6496ccb4
S
1878 if not video_info and args.get('ypc_vid'):
1879 return self.url_result(
1880 args['ypc_vid'], YoutubeIE.ie_key(), video_id=args['ypc_vid'])
2fe1ff85
JMF
1881 if args.get('livestream') == '1' or args.get('live_playback') == 1:
1882 is_live = True
dbdaaa23 1883 if not player_response:
c2d125d9 1884 player_response = extract_player_response(args.get('player_response'), video_id)
0a3cf9ad 1885 if not video_info or self._downloader.params.get('youtube_include_dash_manifest', True):
561b456e 1886 add_dash_mpd_pr(player_response)
bbb7c3f7
YCH
1887
1888 def extract_unavailable_message():
0add33ab
S
1889 messages = []
1890 for tag, kind in (('h1', 'message'), ('div', 'submessage')):
1891 msg = self._html_search_regex(
1892 r'(?s)<{tag}[^>]+id=["\']unavailable-{kind}["\'][^>]*>(.+?)</{tag}>'.format(tag=tag, kind=kind),
1893 video_webpage, 'unavailable %s' % kind, default=None)
1894 if msg:
1895 messages.append(msg)
1896 if messages:
1897 return '\n'.join(messages)
bbb7c3f7 1898
f93abcf1 1899 if not video_info and not player_response:
15be3eb5
RA
1900 unavailable_message = extract_unavailable_message()
1901 if not unavailable_message:
1902 unavailable_message = 'Unable to extract video data'
1903 raise ExtractorError(
1904 'YouTube said: %s' % unavailable_message, expected=True, video_id=video_id)
1905
f93abcf1
S
1906 if not isinstance(video_info, dict):
1907 video_info = {}
1908
dbdaaa23
S
1909 video_details = try_get(
1910 player_response, lambda x: x['videoDetails'], dict) or {}
1911
37357d21
S
1912 microformat = try_get(
1913 player_response, lambda x: x['microformat']['playerMicroformatRenderer'], dict) or {}
1914
8dbf751a
RA
1915 video_title = video_info.get('title', [None])[0] or video_details.get('title')
1916 if not video_title:
cf7e015f
S
1917 self._downloader.report_warning('Unable to extract video title')
1918 video_title = '_'
1919
9cafc3fd 1920 description_original = video_description = get_element_by_id("eow-description", video_webpage)
cf7e015f 1921 if video_description:
fa4bc6e7
RA
1922
1923 def replace_url(m):
1924 redir_url = compat_urlparse.urljoin(url, m.group(1))
1925 parsed_redir_url = compat_urllib_parse_urlparse(redir_url)
1926 if re.search(r'^(?:www\.)?(?:youtube(?:-nocookie)?\.com|youtu\.be)$', parsed_redir_url.netloc) and parsed_redir_url.path == '/redirect':
1927 qs = compat_parse_qs(parsed_redir_url.query)
1928 q = qs.get('q')
1929 if q and q[0]:
1930 return q[0]
1931 return redir_url
1932
9cafc3fd 1933 description_original = video_description = re.sub(r'''(?x)
cf7e015f 1934 <a\s+
25cb7a0e 1935 (?:[a-zA-Z-]+="[^"]*"\s+)*?
23f13e97 1936 (?:title|href)="([^"]+)"\s+
25cb7a0e 1937 (?:[a-zA-Z-]+="[^"]*"\s+)*?
525cedb9 1938 class="[^"]*"[^>]*>
23f13e97 1939 [^<]+\.{3}\s*
cf7e015f 1940 </a>
fa4bc6e7 1941 ''', replace_url, video_description)
cf7e015f
S
1942 video_description = clean_html(video_description)
1943 else:
ea74e00b
DP
1944 video_description = video_details.get('shortDescription')
1945 if video_description is None:
1946 video_description = self._html_search_meta('description', video_webpage)
cf7e015f 1947
8fe10494 1948 if not smuggled_data.get('force_singlefeed', False):
5e1eddb9 1949 if not self._downloader.params.get('noplaylist'):
8fe10494
S
1950 multifeed_metadata_list = try_get(
1951 player_response,
1952 lambda x: x['multicamera']['playerLegacyMulticameraRenderer']['metadataList'],
1953 compat_str) or try_get(
1954 video_info, lambda x: x['multifeed_metadata_list'][0], compat_str)
1955 if multifeed_metadata_list:
1956 entries = []
1957 feed_ids = []
1958 for feed in multifeed_metadata_list.split(','):
1959 # Unquote should take place before split on comma (,) since textual
1960 # fields may contain comma as well (see
067aa17e 1961 # https://github.com/ytdl-org/youtube-dl/issues/8536)
8fe10494 1962 feed_data = compat_parse_qs(compat_urllib_parse_unquote_plus(feed))
6b09401b
S
1963
1964 def feed_entry(name):
1965 return try_get(feed_data, lambda x: x[name][0], compat_str)
1966
1967 feed_id = feed_entry('id')
1968 if not feed_id:
1969 continue
1970 feed_title = feed_entry('title')
1971 title = video_title
1972 if feed_title:
1973 title += ' (%s)' % feed_title
8fe10494
S
1974 entries.append({
1975 '_type': 'url_transparent',
1976 'ie_key': 'Youtube',
1977 'url': smuggle_url(
1978 '%s://www.youtube.com/watch?v=%s' % (proto, feed_data['id'][0]),
1979 {'force_singlefeed': True}),
6b09401b 1980 'title': title,
8fe10494 1981 })
6b09401b 1982 feed_ids.append(feed_id)
8fe10494
S
1983 self.to_screen(
1984 'Downloading multifeed video (%s) - add --no-playlist to just download video %s'
1985 % (', '.join(feed_ids), video_id))
1986 return self.playlist_result(entries, video_id, video_title, video_description)
1987 else:
1988 self.to_screen('Downloading just video %s because of --no-playlist' % video_id)
cf7e015f 1989
c7121fa7 1990 if view_count is None:
1c9c8de2 1991 view_count = extract_view_count(video_info)
dbdaaa23
S
1992 if view_count is None and video_details:
1993 view_count = int_or_none(video_details.get('viewCount'))
7b16239a
S
1994 if view_count is None and microformat:
1995 view_count = int_or_none(microformat.get('viewCount'))
1d699755 1996
27019dbb 1997 if is_live is None:
898238e9 1998 is_live = bool_or_none(video_details.get('isLive'))
27019dbb 1999
321bf820 2000 has_live_chat_replay = False
f0f76a33 2001 if not is_live:
321bf820 2002 yt_initial_data = self._get_yt_initial_data(video_id, video_webpage)
2003 try:
2004 yt_initial_data['contents']['twoColumnWatchNextResults']['conversationBar']['liveChatRenderer']['continuations'][0]['reloadContinuationData']['continuation']
2005 has_live_chat_replay = True
f0f76a33 2006 except (KeyError, IndexError, TypeError):
321bf820 2007 pass
2008
c5e8d7af
PH
2009 # Check for "rental" videos
2010 if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
067aa17e 2011 raise ExtractorError('"rental" videos not supported. See https://github.com/ytdl-org/youtube-dl/issues/359 for more information.', expected=True)
c5e8d7af 2012
c63ca0ee
S
2013 def _extract_filesize(media_url):
2014 return int_or_none(self._search_regex(
2015 r'\bclen[=/](\d+)', media_url, 'filesize', default=None))
2016
bf1317d2
S
2017 streaming_formats = try_get(player_response, lambda x: x['streamingData']['formats'], list) or []
2018 streaming_formats.extend(try_get(player_response, lambda x: x['streamingData']['adaptiveFormats'], list) or [])
2019
c5e8d7af
PH
2020 if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
2021 self.report_rtmp_download()
dd27fd17
PH
2022 formats = [{
2023 'format_id': '_rtmp',
2024 'protocol': 'rtmp',
2025 'url': video_info['conn'][0],
2026 'player_url': player_url,
2027 }]
bf1317d2 2028 elif not is_live and (streaming_formats or len(video_info.get('url_encoded_fmt_stream_map', [''])[0]) >= 1 or len(video_info.get('adaptive_fmts', [''])[0]) >= 1):
5f6a1245 2029 encoded_url_map = video_info.get('url_encoded_fmt_stream_map', [''])[0] + ',' + video_info.get('adaptive_fmts', [''])[0]
00fe14fc 2030 if 'rtmpe%3Dyes' in encoded_url_map:
067aa17e 2031 raise ExtractorError('rtmpe downloads are not supported, see https://github.com/ytdl-org/youtube-dl/issues/343 for more information.', expected=True)
bf1317d2 2032 formats = []
3318832e 2033 formats_spec = {}
82156fdb 2034 fmt_list = video_info.get('fmt_list', [''])[0]
2035 if fmt_list:
2036 for fmt in fmt_list.split(','):
2037 spec = fmt.split('/')
3318832e 2038 if len(spec) > 1:
2039 width_height = spec[1].split('x')
2040 if len(width_height) == 2:
2041 formats_spec[spec[0]] = {
2042 'resolution': spec[1],
2043 'width': int_or_none(width_height[0]),
2044 'height': int_or_none(width_height[1]),
2045 }
bf1317d2
S
2046 for fmt in streaming_formats:
2047 itag = str_or_none(fmt.get('itag'))
2048 if not itag:
201e9eaa 2049 continue
bf1317d2
S
2050 quality = fmt.get('quality')
2051 quality_label = fmt.get('qualityLabel') or quality
2052 formats_spec[itag] = {
2053 'asr': int_or_none(fmt.get('audioSampleRate')),
2054 'filesize': int_or_none(fmt.get('contentLength')),
2055 'format_note': quality_label,
2056 'fps': int_or_none(fmt.get('fps')),
2057 'height': int_or_none(fmt.get('height')),
bf1317d2
S
2058 # bitrate for itag 43 is always 2147483647
2059 'tbr': float_or_none(fmt.get('averageBitrate') or fmt.get('bitrate'), 1000) if itag != '43' else None,
2060 'width': int_or_none(fmt.get('width')),
2061 }
2062
2063 for fmt in streaming_formats:
00eb865b 2064 if fmt.get('drmFamilies') or fmt.get('drm_families'):
bf1317d2
S
2065 continue
2066 url = url_or_none(fmt.get('url'))
2067
2068 if not url:
fa3db383 2069 cipher = fmt.get('cipher') or fmt.get('signatureCipher')
bf1317d2
S
2070 if not cipher:
2071 continue
2072 url_data = compat_parse_qs(cipher)
2073 url = url_or_none(try_get(url_data, lambda x: x['url'][0], compat_str))
2074 if not url:
2075 continue
2076 else:
2077 cipher = None
2078 url_data = compat_parse_qs(compat_urllib_parse_urlparse(url).query)
2079
2f483bc1
S
2080 stream_type = int_or_none(try_get(url_data, lambda x: x['stream_type'][0]))
2081 # Unsupported FORMAT_STREAM_TYPE_OTF
2082 if stream_type == 3:
2083 continue
6449cd80 2084
bf1317d2
S
2085 format_id = fmt.get('itag') or url_data['itag'][0]
2086 if not format_id:
2087 continue
2088 format_id = compat_str(format_id)
a49eccdf 2089
bf1317d2
S
2090 if cipher:
2091 if 's' in url_data or self._downloader.params.get('youtube_include_dash_manifest', True):
67b19799 2092 ASSETS_RE = r'(?:"assets":.+?"js":\s*("[^"]+"))|(?:"jsUrl":\s*("[^"]+"))'
bf1317d2
S
2093 jsplayer_url_json = self._search_regex(
2094 ASSETS_RE,
2095 embed_webpage if age_gate else video_webpage,
2096 'JS player URL (1)', default=None)
2097 if not jsplayer_url_json and not age_gate:
2098 # We need the embed website after all
2099 if embed_webpage is None:
2100 embed_url = proto + '://www.youtube.com/embed/%s' % video_id
2101 embed_webpage = self._download_webpage(
2102 embed_url, video_id, 'Downloading embed webpage')
2103 jsplayer_url_json = self._search_regex(
2104 ASSETS_RE, embed_webpage, 'JS player URL')
2105
2106 player_url = json.loads(jsplayer_url_json)
cf010131 2107 if player_url is None:
bf1317d2
S
2108 player_url_json = self._search_regex(
2109 r'ytplayer\.config.*?"url"\s*:\s*("[^"]+")',
2110 video_webpage, 'age gate player URL')
2111 player_url = json.loads(player_url_json)
2112
2113 if 'sig' in url_data:
2114 url += '&signature=' + url_data['sig'][0]
2115 elif 's' in url_data:
2116 encrypted_sig = url_data['s'][0]
2117
2118 if self._downloader.params.get('verbose'):
2119 if player_url is None:
bf1317d2 2120 player_desc = 'unknown'
cf010131 2121 else:
e40c758c
S
2122 player_type, player_version = self._extract_player_info(player_url)
2123 player_desc = '%s player %s' % ('flash' if player_type == 'swf' else 'html5', player_version)
bf1317d2
S
2124 parts_sizes = self._signature_cache_id(encrypted_sig)
2125 self.to_screen('{%s} signature length %s, %s' %
2126 (format_id, parts_sizes, player_desc))
2127
2128 signature = self._decrypt_signature(
2129 encrypted_sig, video_id, player_url, age_gate)
2130 sp = try_get(url_data, lambda x: x['sp'][0], compat_str) or 'signature'
2131 url += '&%s=%s' % (sp, signature)
201e9eaa
PH
2132 if 'ratebypass' not in url:
2133 url += '&ratebypass=yes'
c9afb51c 2134
94278f72
YCH
2135 dct = {
2136 'format_id': format_id,
2137 'url': url,
2138 'player_url': player_url,
2139 }
2140 if format_id in self._formats:
2141 dct.update(self._formats[format_id])
3318832e 2142 if format_id in formats_spec:
2143 dct.update(formats_spec[format_id])
94278f72 2144
aabc2be6 2145 # Some itags are not included in DASH manifest thus corresponding formats will
067aa17e 2146 # lack metadata (see https://github.com/ytdl-org/youtube-dl/pull/5993).
aabc2be6
S
2147 # Trying to extract metadata from url_encoded_fmt_stream_map entry.
2148 mobj = re.search(r'^(?P<width>\d+)[xX](?P<height>\d+)$', url_data.get('size', [''])[0])
2149 width, height = (int(mobj.group('width')), int(mobj.group('height'))) if mobj else (None, None)
94278f72 2150
bf1317d2
S
2151 if width is None:
2152 width = int_or_none(fmt.get('width'))
2153 if height is None:
2154 height = int_or_none(fmt.get('height'))
2155
c63ca0ee
S
2156 filesize = int_or_none(url_data.get(
2157 'clen', [None])[0]) or _extract_filesize(url)
2158
bf1317d2
S
2159 quality = url_data.get('quality', [None])[0] or fmt.get('quality')
2160 quality_label = url_data.get('quality_label', [None])[0] or fmt.get('qualityLabel')
2161
4878759f
S
2162 tbr = (float_or_none(url_data.get('bitrate', [None])[0], 1000)
2163 or float_or_none(fmt.get('bitrate'), 1000)) if format_id != '43' else None
bf1317d2 2164 fps = int_or_none(url_data.get('fps', [None])[0]) or int_or_none(fmt.get('fps'))
54fc90aa 2165
94278f72 2166 more_fields = {
c63ca0ee 2167 'filesize': filesize,
bf1317d2 2168 'tbr': tbr,
c9afb51c
AH
2169 'width': width,
2170 'height': height,
bf1317d2
S
2171 'fps': fps,
2172 'format_note': quality_label or quality,
c9afb51c 2173 }
94278f72
YCH
2174 for key, value in more_fields.items():
2175 if value:
2176 dct[key] = value
bf1317d2 2177 type_ = url_data.get('type', [None])[0] or fmt.get('mimeType')
aabc2be6
S
2178 if type_:
2179 type_split = type_.split(';')
2180 kind_ext = type_split[0].split('/')
2181 if len(kind_ext) == 2:
94278f72
YCH
2182 kind, _ = kind_ext
2183 dct['ext'] = mimetype2ext(type_split[0])
aabc2be6
S
2184 if kind in ('audio', 'video'):
2185 codecs = None
2186 for mobj in re.finditer(
2187 r'(?P<key>[a-zA-Z_-]+)=(?P<quote>["\']?)(?P<val>.+?)(?P=quote)(?:;|$)', type_):
2188 if mobj.group('key') == 'codecs':
2189 codecs = mobj.group('val')
2190 break
2191 if codecs:
6310acf5 2192 dct.update(parse_codecs(codecs))
e4a60912
S
2193 if dct.get('acodec') == 'none' or dct.get('vcodec') == 'none':
2194 dct['downloader_options'] = {
2195 # Youtube throttles chunks >~10M
2196 'http_chunk_size': 10485760,
2197 }
aabc2be6 2198 formats.append(dct)
c5e8d7af 2199 else:
c3e54389
S
2200 manifest_url = (
2201 url_or_none(try_get(
2202 player_response,
2203 lambda x: x['streamingData']['hlsManifestUrl'],
3089bc74
S
2204 compat_str))
2205 or url_or_none(try_get(
c3e54389
S
2206 video_info, lambda x: x['hlsvp'][0], compat_str)))
2207 if manifest_url:
2208 formats = []
2209 m3u8_formats = self._extract_m3u8_formats(
2210 manifest_url, video_id, 'mp4', fatal=False)
2211 for a_format in m3u8_formats:
2212 itag = self._search_regex(
2213 r'/itag/(\d+)/', a_format['url'], 'itag', default=None)
2214 if itag:
2215 a_format['format_id'] = itag
2216 if itag in self._formats:
2217 dct = self._formats[itag].copy()
2218 dct.update(a_format)
2219 a_format = dct
2220 a_format['player_url'] = player_url
2221 # Accept-Encoding header causes failures in live streams on Youtube and Youtube Gaming
2222 a_format.setdefault('http_headers', {})['Youtubedl-no-compression'] = 'True'
78895bd3
U
2223 if self._downloader.params.get('youtube_include_hls_manifest', True):
2224 formats.append(a_format)
c3e54389 2225 else:
13577349 2226 error_message = extract_unavailable_message()
c3e54389 2227 if not error_message:
13577349
S
2228 error_message = clean_html(try_get(
2229 player_response, lambda x: x['playabilityStatus']['reason'],
2230 compat_str))
2231 if not error_message:
2232 error_message = clean_html(
2233 try_get(video_info, lambda x: x['reason'][0], compat_str))
c3e54389
S
2234 if error_message:
2235 raise ExtractorError(error_message, expected=True)
2236 raise ExtractorError('no conn, hlsvp, hlsManifestUrl or url_encoded_fmt_stream_map information found in video info')
c5e8d7af 2237
7e72694b 2238 # uploader
dbdaaa23
S
2239 video_uploader = try_get(
2240 video_info, lambda x: x['author'][0],
2241 compat_str) or str_or_none(video_details.get('author'))
7e72694b
S
2242 if video_uploader:
2243 video_uploader = compat_urllib_parse_unquote_plus(video_uploader)
2244 else:
2245 self._downloader.report_warning('unable to extract uploader name')
2246
2247 # uploader_id
2248 video_uploader_id = None
2249 video_uploader_url = None
2250 mobj = re.search(
2251 r'<link itemprop="url" href="(?P<uploader_url>https?://www\.youtube\.com/(?:user|channel)/(?P<uploader_id>[^"]+))">',
2252 video_webpage)
2253 if mobj is not None:
2254 video_uploader_id = mobj.group('uploader_id')
2255 video_uploader_url = mobj.group('uploader_url')
a6211d23
S
2256 else:
2257 owner_profile_url = url_or_none(microformat.get('ownerProfileUrl'))
2258 if owner_profile_url:
2259 video_uploader_id = self._search_regex(
2260 r'(?:user|channel)/([^/]+)', owner_profile_url, 'uploader id',
2261 default=None)
2262 video_uploader_url = owner_profile_url
7e72694b 2263
b45a9e69 2264 channel_id = (
3089bc74
S
2265 str_or_none(video_details.get('channelId'))
2266 or self._html_search_meta(
2267 'channelId', video_webpage, 'channel id', default=None)
2268 or self._search_regex(
b45a9e69 2269 r'data-channel-external-id=(["\'])(?P<id>(?:(?!\1).)+)\1',
2270 video_webpage, 'channel id', default=None, group='id'))
dd4c4492
S
2271 channel_url = 'http://www.youtube.com/channel/%s' % channel_id if channel_id else None
2272
b477fc13
S
2273 thumbnails = []
2274 thumbnails_list = try_get(
2275 video_details, lambda x: x['thumbnail']['thumbnails'], list) or []
2276 for t in thumbnails_list:
2277 if not isinstance(t, dict):
2278 continue
2279 thumbnail_url = url_or_none(t.get('url'))
2280 if not thumbnail_url:
2281 continue
2282 thumbnails.append({
2283 'url': thumbnail_url,
2284 'width': int_or_none(t.get('width')),
2285 'height': int_or_none(t.get('height')),
2286 })
2287
2288 if not thumbnails:
7e72694b 2289 video_thumbnail = None
b477fc13
S
2290 # We try first to get a high quality image:
2291 m_thumb = re.search(r'<span itemprop="thumbnail".*?href="(.*?)">',
2292 video_webpage, re.DOTALL)
2293 if m_thumb is not None:
2294 video_thumbnail = m_thumb.group(1)
2295 thumbnail_url = try_get(video_info, lambda x: x['thumbnail_url'][0], compat_str)
2296 if thumbnail_url:
2297 video_thumbnail = compat_urllib_parse_unquote_plus(thumbnail_url)
2298 if video_thumbnail:
2299 thumbnails.append({'url': video_thumbnail})
7e72694b
S
2300
2301 # upload date
2302 upload_date = self._html_search_meta(
2303 'datePublished', video_webpage, 'upload date', default=None)
2304 if not upload_date:
2305 upload_date = self._search_regex(
2306 [r'(?s)id="eow-date.*?>(.*?)</span>',
2307 r'(?:id="watch-uploader-info".*?>.*?|["\']simpleText["\']\s*:\s*["\'])(?:Published|Uploaded|Streamed live|Started) on (.+?)[<"\']'],
2308 video_webpage, 'upload date', default=None)
37357d21
S
2309 if not upload_date:
2310 upload_date = microformat.get('publishDate') or microformat.get('uploadDate')
7e72694b
S
2311 upload_date = unified_strdate(upload_date)
2312
2313 video_license = self._html_search_regex(
2314 r'<h4[^>]+class="title"[^>]*>\s*License\s*</h4>\s*<ul[^>]*>\s*<li>(.+?)</li',
2315 video_webpage, 'license', default=None)
2316
2317 m_music = re.search(
2318 r'''(?x)
2319 <h4[^>]+class="title"[^>]*>\s*Music\s*</h4>\s*
2320 <ul[^>]*>\s*
2321 <li>(?P<title>.+?)
2322 by (?P<creator>.+?)
2323 (?:
2324 \(.+?\)|
2325 <a[^>]*
2326 (?:
2327 \bhref=["\']/red[^>]*>| # drop possible
2328 >\s*Listen ad-free with YouTube Red # YouTube Red ad
2329 )
2330 .*?
2331 )?</li
2332 ''',
2333 video_webpage)
2334 if m_music:
2335 video_alt_title = remove_quotes(unescapeHTML(m_music.group('title')))
2336 video_creator = clean_html(m_music.group('creator'))
2337 else:
2338 video_alt_title = video_creator = None
2339
2340 def extract_meta(field):
2341 return self._html_search_regex(
2342 r'<h4[^>]+class="title"[^>]*>\s*%s\s*</h4>\s*<ul[^>]*>\s*<li>(.+?)</li>\s*' % field,
2343 video_webpage, field, default=None)
2344
2345 track = extract_meta('Song')
2346 artist = extract_meta('Artist')
92bc97d3 2347 album = extract_meta('Album')
822b9d9c
RA
2348
2349 # Youtube Music Auto-generated description
92bc97d3 2350 release_date = release_year = None
822b9d9c
RA
2351 if video_description:
2352 mobj = re.search(r'(?s)Provided to YouTube by [^\n]+\n+(?P<track>[^·]+)·(?P<artist>[^\n]+)\n+(?P<album>[^\n]+)(?:.+?℗\s*(?P<release_year>\d{4})(?!\d))?(?:.+?Released on\s*:\s*(?P<release_date>\d{4}-\d{2}-\d{2}))?(.+?\nArtist\s*:\s*(?P<clean_artist>[^\n]+))?', video_description)
2353 if mobj:
2354 if not track:
2355 track = mobj.group('track').strip()
2356 if not artist:
2357 artist = mobj.group('clean_artist') or ', '.join(a.strip() for a in mobj.group('artist').split('·'))
92bc97d3
RA
2358 if not album:
2359 album = mobj.group('album'.strip())
822b9d9c
RA
2360 release_year = mobj.group('release_year')
2361 release_date = mobj.group('release_date')
2362 if release_date:
2363 release_date = release_date.replace('-', '')
2364 if not release_year:
2365 release_year = int(release_date[:4])
2366 if release_year:
2367 release_year = int(release_year)
7e72694b 2368
9322f116 2369 yt_initial = self._get_yt_initial_data(video_id, video_webpage)
2370 if yt_initial:
2371 music_metadata = self._get_music_metadata_from_yt_initial(yt_initial)
2372 if len(music_metadata):
2373 album = music_metadata[0].get('album')
2374 artist = music_metadata[0].get('artist')
2375 track = music_metadata[0].get('track')
2376
7e72694b
S
2377 m_episode = re.search(
2378 r'<div[^>]+id="watch7-headline"[^>]*>\s*<span[^>]*>.*?>(?P<series>[^<]+)</a></b>\s*S(?P<season>\d+)\s*•\s*E(?P<episode>\d+)</span>',
2379 video_webpage)
2380 if m_episode:
c2dd2dc0 2381 series = unescapeHTML(m_episode.group('series'))
7e72694b
S
2382 season_number = int(m_episode.group('season'))
2383 episode_number = int(m_episode.group('episode'))
2384 else:
2385 series = season_number = episode_number = None
2386
2387 m_cat_container = self._search_regex(
2388 r'(?s)<h4[^>]*>\s*Category\s*</h4>\s*<ul[^>]*>(.*?)</ul>',
2389 video_webpage, 'categories', default=None)
dbeafce5 2390 category = None
7e72694b
S
2391 if m_cat_container:
2392 category = self._html_search_regex(
2393 r'(?s)<a[^<]+>(.*?)</a>', m_cat_container, 'category',
2394 default=None)
dbeafce5
S
2395 if not category:
2396 category = try_get(
2397 microformat, lambda x: x['category'], compat_str)
2398 video_categories = None if category is None else [category]
7e72694b
S
2399
2400 video_tags = [
2401 unescapeHTML(m.group('content'))
2402 for m in re.finditer(self._meta_regex('og:video:tag'), video_webpage)]
dbeafce5
S
2403 if not video_tags:
2404 video_tags = try_get(video_details, lambda x: x['keywords'], list)
7e72694b
S
2405
2406 def _extract_count(count_name):
2407 return str_to_int(self._search_regex(
a6c666d0 2408 r'"accessibilityData":\{"label":"([\d,\w]+) %ss"\}'
7e72694b
S
2409 % re.escape(count_name),
2410 video_webpage, count_name, default=None))
2411
2412 like_count = _extract_count('like')
2413 dislike_count = _extract_count('dislike')
2414
dbdaaa23
S
2415 if view_count is None:
2416 view_count = str_to_int(self._search_regex(
2417 r'<[^>]+class=["\']watch-view-count[^>]+>\s*([\d,\s]+)', video_webpage,
2418 'view count', default=None))
2419
bf3c9326
S
2420 average_rating = (
2421 float_or_none(video_details.get('averageRating'))
2422 or try_get(video_info, lambda x: float_or_none(x['avg_rating'][0])))
2423
7e72694b 2424 # subtitles
321bf820 2425 video_subtitles = self.extract_subtitles(
2426 video_id, video_webpage, has_live_chat_replay)
7e72694b
S
2427 automatic_captions = self.extract_automatic_captions(video_id, video_webpage)
2428
2429 video_duration = try_get(
2430 video_info, lambda x: int_or_none(x['length_seconds'][0]))
dbdaaa23
S
2431 if not video_duration:
2432 video_duration = int_or_none(video_details.get('lengthSeconds'))
7e72694b
S
2433 if not video_duration:
2434 video_duration = parse_duration(self._html_search_meta(
2435 'duration', video_webpage, 'video duration'))
2436
b84071c0
JP
2437 # Get Subscriber Count of channel
2438 subscriber_count = parse_count(self._search_regex(
2439 r'"text":"([\d\.]+\w?) subscribers"',
2440 video_webpage,
2441 'subscriber count',
2442 default=None
2443 ))
2444
7e72694b
S
2445 # annotations
2446 video_annotations = None
2447 if self._downloader.params.get('writeannotations', False):
64b6a4e9
RA
2448 xsrf_token = self._search_regex(
2449 r'([\'"])XSRF_TOKEN\1\s*:\s*([\'"])(?P<xsrf_token>[A-Za-z0-9+/=]+)\2',
2450 video_webpage, 'xsrf token', group='xsrf_token', fatal=False)
2451 invideo_url = try_get(
2452 player_response, lambda x: x['annotations'][0]['playerAnnotationsUrlsRenderer']['invideoUrl'], compat_str)
2453 if xsrf_token and invideo_url:
2454 xsrf_field_name = self._search_regex(
2455 r'([\'"])XSRF_FIELD_NAME\1\s*:\s*([\'"])(?P<xsrf_field_name>\w+)\2',
2456 video_webpage, 'xsrf field name',
2457 group='xsrf_field_name', default='session_token')
2458 video_annotations = self._download_webpage(
2459 self._proto_relative_url(invideo_url),
2460 video_id, note='Downloading annotations',
2461 errnote='Unable to download video annotations', fatal=False,
2462 data=urlencode_postdata({xsrf_field_name: xsrf_token}))
7e72694b 2463
84213ea8 2464 chapters = self._extract_chapters(video_webpage, description_original, video_id, video_duration)
7e72694b 2465
dd27fd17 2466 # Look for the DASH manifest
203fb43f 2467 if self._downloader.params.get('youtube_include_dash_manifest', True):
77c6fb5b 2468 dash_mpd_fatal = True
8ff648e4 2469 for mpd_url in dash_mpds:
d8d24a92 2470 dash_formats = {}
774e208f 2471 try:
05d0d131
YCH
2472 def decrypt_sig(mobj):
2473 s = mobj.group(1)
2474 dec_s = self._decrypt_signature(s, video_id, player_url, age_gate)
2475 return '/signature/%s' % dec_s
2476
8ff648e4 2477 mpd_url = re.sub(r'/s/([a-fA-F0-9\.]+)', decrypt_sig, mpd_url)
2d2fa82d 2478
8ff648e4 2479 for df in self._extract_mpd_formats(
2480 mpd_url, video_id, fatal=dash_mpd_fatal,
2481 formats_dict=self._formats):
c63ca0ee
S
2482 if not df.get('filesize'):
2483 df['filesize'] = _extract_filesize(df['url'])
d8d24a92
S
2484 # Do not overwrite DASH format found in some previous DASH manifest
2485 if df['format_id'] not in dash_formats:
2486 dash_formats[df['format_id']] = df
77c6fb5b
S
2487 # Additional DASH manifests may end up in HTTP Error 403 therefore
2488 # allow them to fail without bug report message if we already have
2489 # some DASH manifest succeeded. This is temporary workaround to reduce
2490 # burst of bug reports until we figure out the reason and whether it
2491 # can be fixed at all.
2492 dash_mpd_fatal = False
774e208f
PH
2493 except (ExtractorError, KeyError) as e:
2494 self.report_warning(
2495 'Skipping DASH manifest: %r' % e, video_id)
d8d24a92 2496 if dash_formats:
04b3b3df
JMF
2497 # Remove the formats we found through non-DASH, they
2498 # contain less info and it can be wrong, because we use
2499 # fixed values (for example the resolution). See
067aa17e 2500 # https://github.com/ytdl-org/youtube-dl/issues/5774 for an
04b3b3df 2501 # example.
d80265cc 2502 formats = [f for f in formats if f['format_id'] not in dash_formats.keys()]
d8d24a92 2503 formats.extend(dash_formats.values())
d80044c2 2504
6271f1ca
PH
2505 # Check for malformed aspect ratio
2506 stretched_m = re.search(
2507 r'<meta\s+property="og:video:tag".*?content="yt:stretch=(?P<w>[0-9]+):(?P<h>[0-9]+)">',
2508 video_webpage)
2509 if stretched_m:
313dfc45
LL
2510 w = float(stretched_m.group('w'))
2511 h = float(stretched_m.group('h'))
5faf9fed
S
2512 # yt:stretch may hold invalid ratio data (e.g. for Q39EVAstoRM ratio is 17:0).
2513 # We will only process correct ratios.
313dfc45 2514 if w > 0 and h > 0:
41f24c32 2515 ratio = w / h
313dfc45
LL
2516 for f in formats:
2517 if f.get('vcodec') != 'none':
2518 f['stretched_ratio'] = ratio
6271f1ca 2519
026fbedc 2520 if not formats:
43ebf77d
S
2521 if 'reason' in video_info:
2522 if 'The uploader has not made this video available in your country.' in video_info['reason']:
2523 regions_allowed = self._html_search_meta(
2524 'regionsAllowed', video_webpage, default=None)
2525 countries = regions_allowed.split(',') if regions_allowed else None
2526 self.raise_geo_restricted(
2527 msg=video_info['reason'][0], countries=countries)
2528 reason = video_info['reason'][0]
2529 if 'Invalid parameters' in reason:
2530 unavailable_message = extract_unavailable_message()
2531 if unavailable_message:
2532 reason = unavailable_message
2533 raise ExtractorError(
2534 'YouTube said: %s' % reason,
2535 expected=True, video_id=video_id)
2536 if video_info.get('license_info') or try_get(player_response, lambda x: x['streamingData']['licenseInfos']):
2537 raise ExtractorError('This video is DRM protected.', expected=True)
0d297518 2538
4bcc7bd1 2539 self._sort_formats(formats)
4ea3be0a 2540
21c340b8 2541 self.mark_watched(video_id, video_info, player_response)
d77ab8e2 2542
4ea3be0a 2543 return {
8bcc8756
JW
2544 'id': video_id,
2545 'uploader': video_uploader,
2546 'uploader_id': video_uploader_id,
fd050249 2547 'uploader_url': video_uploader_url,
dd4c4492
S
2548 'channel_id': channel_id,
2549 'channel_url': channel_url,
8bcc8756 2550 'upload_date': upload_date,
7caf9830 2551 'license': video_license,
936784b2 2552 'creator': video_creator or artist,
8bcc8756 2553 'title': video_title,
936784b2 2554 'alt_title': video_alt_title or track,
b477fc13 2555 'thumbnails': thumbnails,
8bcc8756
JW
2556 'description': video_description,
2557 'categories': video_categories,
000b6b5a 2558 'tags': video_tags,
8bcc8756 2559 'subtitles': video_subtitles,
360e1ca5 2560 'automatic_captions': automatic_captions,
8bcc8756
JW
2561 'duration': video_duration,
2562 'age_limit': 18 if age_gate else 0,
2563 'annotations': video_annotations,
9cafc3fd 2564 'chapters': chapters,
7e8c0af0 2565 'webpage_url': proto + '://www.youtube.com/watch?v=%s' % video_id,
8bcc8756 2566 'view_count': view_count,
4ea3be0a 2567 'like_count': like_count,
2568 'dislike_count': dislike_count,
bf3c9326 2569 'average_rating': average_rating,
8bcc8756 2570 'formats': formats,
2fe1ff85 2571 'is_live': is_live,
7c80519c 2572 'start_time': start_time,
297a564b 2573 'end_time': end_time,
12afdc2a
S
2574 'series': series,
2575 'season_number': season_number,
2576 'episode_number': episode_number,
936784b2
S
2577 'track': track,
2578 'artist': artist,
5caabd3c 2579 'album': album,
2580 'release_date': release_date,
2581 'release_year': release_year,
b84071c0 2582 'subscriber_count': subscriber_count,
4ea3be0a 2583 }
c5e8d7af 2584
5f6a1245 2585
8e7aad20 2586class YoutubePlaylistIE(YoutubePlaylistBaseInfoExtractor):
78caa52a 2587 IE_DESC = 'YouTube.com playlists'
d67cc9fa 2588 _VALID_URL = r"""(?x)(?:
c5e8d7af
PH
2589 (?:https?://)?
2590 (?:\w+\.)?
c5e8d7af 2591 (?:
c0345b82 2592 (?:
66b48727 2593 youtube(?:kids)?\.com|
c0345b82
S
2594 invidio\.us
2595 )
2596 /
feaa5ad7 2597 (?:
87dadd45 2598 (?:course|view_play_list|my_playlists|artist|playlist|watch|embed/(?:videoseries|[0-9A-Za-z_-]{11}))
feaa5ad7
S
2599 \? (?:.*?[&;])*? (?:p|a|list)=
2600 | p/
2601 )|
2602 youtu\.be/[0-9A-Za-z_-]{11}\?.*?\blist=
c5e8d7af 2603 )
d67cc9fa 2604 (
66b48727 2605 (?:PL|LL|EC|UU|FL|RD|UL|TL|PU|OLAK5uy_)?[0-9A-Za-z-_]{10,}
5f6a1245 2606 # Top tracks, they can also include dots
d67cc9fa
JMF
2607 |(?:MC)[\w\.]*
2608 )
c5e8d7af
PH
2609 .*
2610 |
d0ba5587
S
2611 (%(playlist_id)s)
2612 )""" % {'playlist_id': YoutubeBaseInfoExtractor._PLAYLIST_ID_RE}
8d81f3e3 2613 _TEMPLATE_URL = 'https://www.youtube.com/playlist?list=%s'
351f37c0
S
2614 _VIDEO_RE_TPL = r'href="\s*/watch\?v=%s(?:&amp;(?:[^"]*?index=(?P<index>\d+))?(?:[^>]+>(?P<title>[^<]+))?)?'
2615 _VIDEO_RE = _VIDEO_RE_TPL % r'(?P<id>[0-9A-Za-z_-]{11})'
78caa52a 2616 IE_NAME = 'youtube:playlist'
81127aa5 2617 _TESTS = [{
0e30a7b9 2618 'url': 'https://www.youtube.com/playlist?list=PL4lCao7KL_QFVb7Iudeipvc2BCavECqzc',
81127aa5 2619 'info_dict': {
0e30a7b9 2620 'uploader_id': 'UCmlqkdCBesrv2Lak1mF_MxA',
2621 'uploader': 'Sergey M.',
2622 'id': 'PL4lCao7KL_QFVb7Iudeipvc2BCavECqzc',
3867038a 2623 'title': 'youtube-dl public playlist',
81127aa5 2624 },
0e30a7b9 2625 'playlist_count': 1,
9291475f 2626 }, {
0e30a7b9 2627 'url': 'https://www.youtube.com/playlist?list=PL4lCao7KL_QFodcLWhDpGCYnngnHtQ-Xf',
9291475f 2628 'info_dict': {
0e30a7b9 2629 'uploader_id': 'UCmlqkdCBesrv2Lak1mF_MxA',
2630 'uploader': 'Sergey M.',
2631 'id': 'PL4lCao7KL_QFodcLWhDpGCYnngnHtQ-Xf',
3867038a 2632 'title': 'youtube-dl empty playlist',
9291475f
PH
2633 },
2634 'playlist_count': 0,
2635 }, {
2636 'note': 'Playlist with deleted videos (#651). As a bonus, the video #51 is also twice in this list.',
2637 'url': 'https://www.youtube.com/playlist?list=PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC',
2638 'info_dict': {
2639 'title': '29C3: Not my department',
acf757f4 2640 'id': 'PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC',
13a75688
S
2641 'uploader': 'Christiaan008',
2642 'uploader_id': 'ChRiStIaAn008',
9291475f 2643 },
0e30a7b9 2644 'playlist_count': 96,
9291475f
PH
2645 }, {
2646 'note': 'issue #673',
2647 'url': 'PLBB231211A4F62143',
2648 'info_dict': {
f46a8702 2649 'title': '[OLD]Team Fortress 2 (Class-based LP)',
acf757f4 2650 'id': 'PLBB231211A4F62143',
13a75688
S
2651 'uploader': 'Wickydoo',
2652 'uploader_id': 'Wickydoo',
9291475f
PH
2653 },
2654 'playlist_mincount': 26,
2655 }, {
2656 'note': 'Large playlist',
2657 'url': 'https://www.youtube.com/playlist?list=UUBABnxM4Ar9ten8Mdjj1j0Q',
2658 'info_dict': {
2659 'title': 'Uploads from Cauchemar',
acf757f4 2660 'id': 'UUBABnxM4Ar9ten8Mdjj1j0Q',
13a75688
S
2661 'uploader': 'Cauchemar',
2662 'uploader_id': 'Cauchemar89',
9291475f
PH
2663 },
2664 'playlist_mincount': 799,
2665 }, {
2666 'url': 'PLtPgu7CB4gbY9oDN3drwC3cMbJggS7dKl',
2667 'info_dict': {
2668 'title': 'YDL_safe_search',
acf757f4 2669 'id': 'PLtPgu7CB4gbY9oDN3drwC3cMbJggS7dKl',
9291475f
PH
2670 },
2671 'playlist_count': 2,
4201ba13 2672 'skip': 'This playlist is private',
ac7553d0
PH
2673 }, {
2674 'note': 'embedded',
2d3d2997 2675 'url': 'https://www.youtube.com/embed/videoseries?list=PL6IaIsEjSbf96XFRuNccS_RuEXwNdsoEu',
ac7553d0
PH
2676 'playlist_count': 4,
2677 'info_dict': {
2678 'title': 'JODA15',
acf757f4 2679 'id': 'PL6IaIsEjSbf96XFRuNccS_RuEXwNdsoEu',
13a75688
S
2680 'uploader': 'milan',
2681 'uploader_id': 'UCEI1-PVPcYXjB73Hfelbmaw',
ac7553d0 2682 }
87dadd45
S
2683 }, {
2684 'url': 'http://www.youtube.com/embed/_xDOZElKyNU?list=PLsyOSbh5bs16vubvKePAQ1x3PhKavfBIl',
2685 'playlist_mincount': 485,
2686 'info_dict': {
13a75688 2687 'title': '2018 Chinese New Singles (11/6 updated)',
87dadd45 2688 'id': 'PLsyOSbh5bs16vubvKePAQ1x3PhKavfBIl',
13a75688
S
2689 'uploader': 'LBK',
2690 'uploader_id': 'sdragonfang',
87dadd45 2691 }
6b08cdf6
PH
2692 }, {
2693 'note': 'Embedded SWF player',
2d3d2997 2694 'url': 'https://www.youtube.com/p/YN5VISEtHet5D4NEvfTd0zcgFk84NqFZ?hl=en_US&fs=1&rel=0',
6b08cdf6
PH
2695 'playlist_count': 4,
2696 'info_dict': {
2697 'title': 'JODA7',
acf757f4 2698 'id': 'YN5VISEtHet5D4NEvfTd0zcgFk84NqFZ',
13a75688
S
2699 },
2700 'skip': 'This playlist does not exist',
4b7df0d3
JMF
2701 }, {
2702 'note': 'Buggy playlist: the webpage has a "Load more" button but it doesn\'t have more videos',
2703 'url': 'https://www.youtube.com/playlist?list=UUXw-G3eDE9trcvY2sBMM_aA',
2704 'info_dict': {
acf757f4
PH
2705 'title': 'Uploads from Interstellar Movie',
2706 'id': 'UUXw-G3eDE9trcvY2sBMM_aA',
13a75688
S
2707 'uploader': 'Interstellar Movie',
2708 'uploader_id': 'InterstellarMovie1',
4b7df0d3 2709 },
481cc733 2710 'playlist_mincount': 21,
dacb3a86
S
2711 }, {
2712 # Playlist URL that does not actually serve a playlist
2713 'url': 'https://www.youtube.com/watch?v=FqZTN594JQw&list=PLMYEtVRpaqY00V9W81Cwmzp6N6vZqfUKD4',
2714 'info_dict': {
2715 'id': 'FqZTN594JQw',
2716 'ext': 'webm',
2717 'title': "Smiley's People 01 detective, Adventure Series, Action",
2718 'uploader': 'STREEM',
2719 'uploader_id': 'UCyPhqAZgwYWZfxElWVbVJng',
ec85ded8 2720 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UCyPhqAZgwYWZfxElWVbVJng',
dacb3a86
S
2721 'upload_date': '20150526',
2722 'license': 'Standard YouTube License',
2723 'description': 'md5:507cdcb5a49ac0da37a920ece610be80',
2724 'categories': ['People & Blogs'],
2725 'tags': list,
dbdaaa23 2726 'view_count': int,
dacb3a86
S
2727 'like_count': int,
2728 'dislike_count': int,
2729 },
2730 'params': {
2731 'skip_download': True,
2732 },
13a75688 2733 'skip': 'This video is not available.',
dacb3a86 2734 'add_ie': [YoutubeIE.ie_key()],
481cc733
S
2735 }, {
2736 'url': 'https://youtu.be/yeWKywCrFtk?list=PL2qgrgXsNUG5ig9cat4ohreBjYLAPC0J5',
2737 'info_dict': {
2738 'id': 'yeWKywCrFtk',
2739 'ext': 'mp4',
2740 'title': 'Small Scale Baler and Braiding Rugs',
2741 'uploader': 'Backus-Page House Museum',
2742 'uploader_id': 'backuspagemuseum',
ec85ded8 2743 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/backuspagemuseum',
481cc733 2744 'upload_date': '20161008',
481cc733
S
2745 'description': 'md5:800c0c78d5eb128500bffd4f0b4f2e8a',
2746 'categories': ['Nonprofits & Activism'],
2747 'tags': list,
2748 'like_count': int,
2749 'dislike_count': int,
2750 },
2751 'params': {
2752 'noplaylist': True,
2753 'skip_download': True,
2754 },
2e18adec
S
2755 }, {
2756 # https://github.com/ytdl-org/youtube-dl/issues/21844
2757 'url': 'https://www.youtube.com/playlist?list=PLzH6n4zXuckpfMu_4Ff8E7Z1behQks5ba',
2758 'info_dict': {
2759 'title': 'Data Analysis with Dr Mike Pound',
2760 'id': 'PLzH6n4zXuckpfMu_4Ff8E7Z1behQks5ba',
2761 'uploader_id': 'Computerphile',
2762 'uploader': 'Computerphile',
2763 },
2764 'playlist_mincount': 11,
feaa5ad7
S
2765 }, {
2766 'url': 'https://youtu.be/uWyaPkt-VOI?list=PL9D9FC436B881BA21',
2767 'only_matching': True,
a6857510
S
2768 }, {
2769 'url': 'TLGGrESM50VT6acwMjAyMjAxNw',
2770 'only_matching': True,
409b9324
S
2771 }, {
2772 # music album playlist
2773 'url': 'OLAK5uy_m4xAFdmMC5rX3Ji3g93pQe3hqLZw_9LhM',
2774 'only_matching': True,
c0345b82
S
2775 }, {
2776 'url': 'https://invidio.us/playlist?list=PLDIoUOhQQPlXr63I_vwF9GD8sAKh77dWU',
2777 'only_matching': True,
66b48727
RA
2778 }, {
2779 'url': 'https://www.youtubekids.com/watch?v=Agk7R8I8o5U&list=PUZ6jURNr1WQZCNHF0ao-c0g',
2780 'only_matching': True,
81127aa5 2781 }]
c5e8d7af 2782
880e1c52
JMF
def _real_initialize(self):
    """Run the inherited ``_login`` routine when the extractor is initialized."""
    self._login()
2785
351f37c0
S
def extract_videos_from_page(self, page):
    """Pair up the video ids and titles found in a page's HTML.

    Elements carrying a ``data-video-id`` attribute are read first; the
    same two lists are then handed to ``extract_videos_from_page_impl``
    with progressively looser patterns.

    :param page: HTML of the page to scan.
    :return: ``zip`` of (video id, title); a title is ``None`` when the
        element has no ``data-title`` attribute.
    """
    found_ids = []
    found_titles = []

    tagged_elements = re.findall(
        r'(<[^>]*\bdata-video-id\s*=\s*["\'][0-9A-Za-z_-]{11}[^>]+>)', page)
    for element in tagged_elements:
        attributes = extract_attributes(element)
        found_ids.append(attributes['data-video-id'])
        title = unescapeHTML(attributes.get('data-title'))
        found_titles.append(title.strip() if title else title)

    # NOTE(review): extract_videos_from_page_impl is defined outside this
    # block; presumably it appends further matches to both lists in place.
    fallback_patterns = (
        # Fallback with old _VIDEO_RE
        self._VIDEO_RE,
        # Relaxed fallbacks
        r'href="\s*/watch\?v\s*=\s*(?P<id>[0-9A-Za-z_-]{11})',
        r'data-video-ids\s*=\s*["\'](?P<id>[0-9A-Za-z_-]{11})',
    )
    for pattern in fallback_patterns:
        self.extract_videos_from_page_impl(
            pattern, page, found_ids, found_titles)

    return zip(found_ids, found_titles)
2813
def _extract_mix_ids_from_yt_initial(self, yt_initial):
    """Collect the video ids listed in the playlist panel of a watch page.

    :param yt_initial: parsed initial-data JSON of a watch page.
    :return: list of the ``videoId`` values found under
        ``contents.twoColumnWatchNextResults.playlist.playlist.contents``,
        in page order; empty when that path is missing or is not a list.
    """
    playlist_contents = try_get(
        yt_initial,
        lambda x: x['contents']['twoColumnWatchNextResults']['playlist']['playlist']['contents'],
        list) or []

    ids = []
    for item in playlist_contents:
        # compat_str, not str: on Python 2 the JSON decoder yields unicode
        # objects, so an exact ``str`` type check would reject every id.
        video_id = try_get(
            item, lambda x: x['playlistPanelVideoRenderer']['videoId'],
            compat_str)
        # Entries without a usable (non-empty text) id are skipped.
        if video_id:
            ids.append(video_id)
    return ids
2823
def _extract_mix(self, playlist_id):
    """Build the playlist result for a mix.

    Watch pages are fetched repeatedly, each time for the last video
    collected so far, until a page contributes no id that is not already
    known.

    :param playlist_id: id of the mix; its last 11 characters seed the
        first request as the video id.
    :return: the value of ``self.playlist_result`` for the collected
        videos, titled from the last page downloaded.
    """
    # The mixes are generated from a single video
    # the id of the playlist is just 'RD' + video_id
    collected = []
    seed_id = playlist_id[-11:]

    # Same for every page, so built once up front.
    mix_entry_re = r'''(?xs)data-video-username=".*?".*?
               href="/watch\?v=([0-9A-Za-z_-]{11})&amp;[^"]*?list=%s''' % re.escape(playlist_id)

    for page_num in itertools.count(1):
        watch_url = 'https://www.youtube.com/watch?v=%s&list=%s' % (seed_id, playlist_id)
        webpage = self._download_webpage(
            watch_url, playlist_id,
            'Downloading page {0} of Youtube mix'.format(page_num))
        page_ids = orderedSet(re.findall(mix_entry_re, webpage))

        # if no ids in html of page, try using embedded json
        if not len(page_ids):
            yt_initial = self._get_yt_initial_data(playlist_id, webpage)
            if yt_initial:
                page_ids = self._extract_mix_ids_from_yt_initial(yt_initial)

        # Fetch new pages until all the videos are repeated, it seems that
        # there are always 51 unique videos.
        unseen = [candidate for candidate in page_ids if candidate not in collected]
        if not unseen:
            break
        collected.extend(unseen)
        seed_id = collected[-1]

    url_results = self._ids_to_results(collected)

    # Take the first truthy lookup among the candidate class names; if
    # none is truthy the result of the last lookup is kept.
    title_html = None
    for css_class in ('playlist-title', 'title long-title', 'title'):
        title_html = get_element_by_attribute('class', css_class, webpage)
        if title_html:
            break

    return self.playlist_result(
        url_results, playlist_id, clean_html(title_html))
2862
def _extract_playlist(self, playlist_id):
    """Download a playlist page and turn it into a playlist result.

    :param playlist_id: id substituted into ``_TEMPLATE_URL``.
    :return: ``(has_videos, playlist)``. ``has_videos`` is ``False`` only
        when the page has no playlist title and ``self._entries`` yields
        nothing; ``playlist`` is the ``playlist_result`` dict extended
        with ``uploader``, ``uploader_id`` and ``uploader_url``.
    :raises ExtractorError: when an alert on the page says the playlist
        does not exist, is private, or the parameters are invalid.
    """
    url = self._TEMPLATE_URL % playlist_id
    page = self._download_webpage(url, playlist_id)

    # the yt-alert-message now has tabindex attribute (see https://github.com/ytdl-org/youtube-dl/issues/11604)
    for alert in re.findall(r'<div class="yt-alert-message"[^>]*>([^<]+)</div>', page):
        alert = alert.strip()

        # Check if the playlist exists or is private
        status = re.match(r'[^<]*(?:The|This) playlist (?P<reason>does not exist|is private)[^<]*', alert)
        if status:
            reason = status.group('reason')
            pieces = ['This playlist %s' % reason]
            if 'private' in reason:
                pieces.append(', use --username or --netrc to access it')
            pieces.append('.')
            raise ExtractorError(''.join(pieces), expected=True)

        if re.match(r'[^<]*Invalid parameters[^<]*', alert):
            raise ExtractorError(
                'Invalid parameters. Maybe URL is incorrect.',
                expected=True)

        # The language chooser banner is not worth a warning.
        if re.match(r'[^<]*Choose your language[^<]*', alert):
            continue

        self.report_warning('Youtube gives an alert message: ' + alert)

    playlist_title = self._html_search_regex(
        r'(?s)<h1 class="pl-header-title[^"]*"[^>]*>\s*(.*?)\s*</h1>',
        page, 'title', default=None)

    _UPLOADER_BASE = r'class=["\']pl-header-details[^>]+>\s*<li>\s*<a[^>]+\bhref='
    uploader = self._html_search_regex(
        r'%s["\']/(?:user|channel)/[^>]+>([^<]+)' % _UPLOADER_BASE,
        page, 'uploader', default=None)

    uploader_id = None
    uploader_url = None
    uploader_link = re.search(
        r'%s(["\'])(?P<path>/(?:user|channel)/(?P<uploader_id>.+?))\1' % _UPLOADER_BASE,
        page)
    if uploader_link:
        uploader_id = uploader_link.group('uploader_id')
        uploader_url = compat_urlparse.urljoin(url, uploader_link.group('path'))

    has_videos = True
    if not playlist_title:
        # Some playlist URLs don't actually serve a playlist (e.g.
        # https://www.youtube.com/watch?v=FqZTN594JQw&list=PLMYEtVRpaqY00V9W81Cwmzp6N6vZqfUKD4)
        try:
            next(self._entries(page, playlist_id))
        except StopIteration:
            has_videos = False

    # A fresh _entries iterator is created here; the probe above consumed
    # an item from its own separate one.
    playlist = self.playlist_result(
        self._entries(page, playlist_id), playlist_id, playlist_title)
    playlist.update({
        'uploader': uploader,
        'uploader_id': uploader_id,
        'uploader_url': uploader_url,
    })

    return has_videos, playlist
c5e8d7af 2924
def _check_download_just_video(self, url, playlist_id):
    """Decide whether a playlist URL should yield only its single video.

    The video id is taken from the ``v`` query parameter or, failing that,
    from a ``youtu.be/<id>`` or ``youtube.com/embed/<id>`` URL.

    :param url: the URL being extracted.
    :param playlist_id: playlist id, used only in the screen message.
    :return: ``(video_id, result)``: ``(None, None)`` when the URL names
        no video; ``(video_id, url_result)`` when the ``noplaylist``
        option is set; ``(video_id, None)`` otherwise.
    """
    # Check if it's a video-specific URL
    query = compat_urlparse.urlparse(url).query
    params = compat_urlparse.parse_qs(query)
    video_id = params.get('v', [None])[0]
    if not video_id:
        video_id = self._search_regex(
            r'(?:(?:^|//)youtu\.be/|youtube\.com/embed/(?!videoseries))([0-9A-Za-z_-]{11})', url,
            'video id', default=None)

    if not video_id:
        return None, None

    if not self._downloader.params.get('noplaylist'):
        self.to_screen('Downloading playlist %s - add --no-playlist to just download video %s' % (playlist_id, video_id))
        return video_id, None

    self.to_screen('Downloading just video %s because of --no-playlist' % video_id)
    return video_id, self.url_result(video_id, 'Youtube', video_id=video_id)
448830ce 2939
ebf1b291
S
def _real_extract(self, url):
    """Extract a playlist, a mix, or a single video from a playlist URL.

    :param url: URL (or bare playlist id) matching ``_VALID_URL``.
    :return: a single-video result, a mix result, or the playlist result.
    :raises ExtractorError: when ``url`` does not match ``_VALID_URL``.
    """
    # Extract playlist id
    matched = re.match(self._VALID_URL, url)
    if matched is None:
        raise ExtractorError('Invalid URL: %s' % url)
    playlist_id = matched.group(1) or matched.group(2)

    video_id, single_video = self._check_download_just_video(url, playlist_id)
    if single_video:
        return single_video

    # Mixes require a custom extraction process
    if playlist_id.startswith(('RD', 'UL', 'PU')):
        return self._extract_mix(playlist_id)

    has_videos, playlist = self._extract_playlist(playlist_id)

    # Some playlist URLs don't actually serve a playlist (see
    # https://github.com/ytdl-org/youtube-dl/issues/10537).
    # Fallback to plain video extraction if there is a video id
    # along with playlist id.
    if video_id and not has_videos:
        return self.url_result(video_id, 'Youtube', video_id=video_id)

    return playlist
448830ce 2964
c5e8d7af 2965
648e6a1f 2966class YoutubeChannelIE(YoutubePlaylistBaseInfoExtractor):
78caa52a 2967 IE_DESC = 'YouTube.com channels'
66b48727 2968 _VALID_URL = r'https?://(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie|kids)?\.com|(?:www\.)?invidio\.us)/channel/(?P<id>[0-9A-Za-z_-]+)'
eb0f3e7e 2969 _TEMPLATE_URL = 'https://www.youtube.com/channel/%s/videos'
648e6a1f 2970 _VIDEO_RE = r'(?:title="(?P<title>[^"]+)"[^>]+)?href="/watch\?v=(?P<id>[0-9A-Za-z_-]+)&?'
78caa52a 2971 IE_NAME = 'youtube:channel'
cdc628a4
PH
2972 _TESTS = [{
2973 'note': 'paginated channel',
2974 'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w',
2975 'playlist_mincount': 91,
acf757f4 2976 'info_dict': {
9170ca5b
JMF
2977 'id': 'UUKfVa3S1e4PHvxWcwyMMg8w',
2978 'title': 'Uploads from lex will',
13a75688
S
2979 'uploader': 'lex will',
2980 'uploader_id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
acf757f4 2981 }
5c43afd4
JMF
2982 }, {
2983 'note': 'Age restricted channel',
2984 # from https://www.youtube.com/user/DeusExOfficial
2985 'url': 'https://www.youtube.com/channel/UCs0ifCMCm1icqRbqhUINa0w',
2986 'playlist_mincount': 64,
2987 'info_dict': {
2988 'id': 'UUs0ifCMCm1icqRbqhUINa0w',
2989 'title': 'Uploads from Deus Ex',
13a75688
S
2990 'uploader': 'Deus Ex',
2991 'uploader_id': 'DeusExOfficial',
5c43afd4 2992 },
cd5a74a2
S
2993 }, {
2994 'url': 'https://invidio.us/channel/UC23qupoDRn9YOAVzeoxjOQA',
2995 'only_matching': True,
66b48727
RA
2996 }, {
2997 'url': 'https://www.youtubekids.com/channel/UCyu8StPfZWapR6rfW_JgqcA',
2998 'only_matching': True,
cdc628a4 2999 }]
c5e8d7af 3000
e462474e
S
@classmethod
def suitable(cls, url):
    """Tell whether this extractor should handle ``url``.

    URLs accepted by ``YoutubePlaylistsIE`` or ``YoutubeLiveIE`` are
    declined; anything else is decided by the parent class.
    """
    if YoutubePlaylistsIE.suitable(url):
        return False
    if YoutubeLiveIE.suitable(url):
        return False
    return super(YoutubeChannelIE, cls).suitable(url)
e462474e 3005
9558dcec
S
3006 def _build_template_url(self, url, channel_id):
3007 return self._TEMPLATE_URL % channel_id
3008
def _real_extract(self, url):
    """Extract the videos of a channel.

    When the channel page reveals a ``UC...`` channel id, extraction is
    redirected to the matching ``UU...`` playlist. Otherwise the channel's
    videos page is parsed directly.

    :param url: channel URL matching ``_VALID_URL``.
    :return: a ``url_result`` pointing at the playlist, or a playlist
        result built from the channel page.
    :raises ExtractorError: when the page yields no entries and shows an
        alert message.
    """
    channel_id = self._match_id(url)
    url = self._build_template_url(url, channel_id)

    # Channel by page listing is restricted to 35 pages of 30 items, i.e. 1050 videos total (see #5778)
    # Workaround by extracting as a playlist if managed to obtain channel playlist URL
    # otherwise fallback on channel by page extraction
    channel_page = self._download_webpage(
        url + '?view=57', channel_id,
        'Downloading channel page', fatal=False)

    channel_playlist_id = False
    # The non-fatal download is compared against False, so only a real
    # page is searched for the id.
    if channel_page is not False:
        channel_playlist_id = self._html_search_meta(
            'channelId', channel_page, 'channel id', default=None)
        if not channel_playlist_id:
            app_url = self._html_search_meta(
                ('al:ios:url', 'twitter:app:url:iphone', 'twitter:app:url:ipad'),
                channel_page, 'channel url', default=None)
            if app_url:
                channel_playlist_id = self._search_regex(
                    r'vnd\.youtube://user/([0-9A-Za-z_-]+)',
                    app_url, 'channel id', default=None)

    if channel_playlist_id and channel_playlist_id.startswith('UC'):
        # Swap the 'UC' prefix for 'UU' to address the channel as a playlist.
        playlist_id = 'UU' + channel_playlist_id[2:]
        playlist_url = compat_urlparse.urljoin(
            url, '/playlist?list=%s' % playlist_id)
        return self.url_result(playlist_url, 'YoutubePlaylist')

    channel_page = self._download_webpage(url, channel_id, 'Downloading page #1')
    autogenerated_marker = re.search(r'''(?x)
            class="[^"]*?(?:
                channel-header-autogenerated-label|
                yt-channel-title-autogenerated
            )[^"]*"''', channel_page)

    if autogenerated_marker is not None:
        # The videos are contained in a single page
        # the ajax pages can't be used, they are empty
        entries = []
        for video_id, video_title in self.extract_videos_from_page(channel_page):
            entries.append(self.url_result(
                video_id, 'Youtube', video_id=video_id,
                video_title=video_title))
        return self.playlist_result(entries, channel_id)

    # Probe for at least one entry; if there is none, surface the page's
    # alert message (when present) instead of returning an empty playlist.
    try:
        next(self._entries(channel_page, channel_id))
    except StopIteration:
        alert_message = self._html_search_regex(
            r'(?s)<div[^>]+class=(["\']).*?\byt-alert-message\b.*?\1[^>]*>(?P<alert>[^<]+)</div>',
            channel_page, 'alert', default=None, group='alert')
        if alert_message:
            raise ExtractorError('Youtube said: %s' % alert_message, expected=True)

    return self.playlist_result(
        self._entries(channel_page, channel_id), channel_id)
c5e8d7af
PH
3065
3066
class YoutubeUserIE(YoutubeChannelIE):
    """Extractor for user pages: ``/user/NAME``, ``/c/NAME``, bare ``/NAME`` and ``ytuser:NAME``."""
    IE_NAME = 'youtube:user'
    IE_DESC = 'YouTube.com user videos (URL or "ytuser" keyword)'
    _VALID_URL = r'(?:(?:https?://(?:\w+\.)?youtube\.com/(?:(?P<user>user|c)/)?(?!(?:attribution_link|watch|results|shared)(?:$|[^a-z_A-Z0-9%-])))|ytuser:)(?!feed/)(?P<id>[A-Za-z0-9_%-]+)'
    # First placeholder is the path prefix ("user" or "c"), second is the name
    _TEMPLATE_URL = 'https://www.youtube.com/%s/%s/videos'

    _TESTS = [{
        'url': 'https://www.youtube.com/user/TheLinuxFoundation',
        'playlist_mincount': 320,
        'info_dict': {
            'id': 'UUfX55Sx5hEFjoC3cNs6mCUQ',
            'title': 'Uploads from The Linux Foundation',
            'uploader': 'The Linux Foundation',
            'uploader_id': 'TheLinuxFoundation',
        }
    }, {
        # Only available via https://www.youtube.com/c/12minuteathlete/videos
        # but not https://www.youtube.com/user/12minuteathlete/videos
        'url': 'https://www.youtube.com/c/12minuteathlete/videos',
        'playlist_mincount': 249,
        'info_dict': {
            'id': 'UUVjM-zV6_opMDx7WYxnjZiQ',
            'title': 'Uploads from 12 Minute Athlete',
            'uploader': '12 Minute Athlete',
            'uploader_id': 'the12minuteathlete',
        }
    }, {
        'url': 'ytuser:phihag',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/c/gametrailers',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/c/Pawe%C5%82Zadro%C5%BCniak',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/gametrailers',
        'only_matching': True,
    }, {
        # This channel is not available, geo restricted to JP
        'url': 'https://www.youtube.com/user/kananishinoSMEJ/videos',
        'only_matching': True,
    }]

    @classmethod
    def suitable(cls, url):
        """Accept *url* only if no other Youtube extractor in this module claims it."""
        # The regex of this extractor is too permissive and would match URLs
        # that belong to other Youtube extractors, so let those decide first.
        for name, klass in globals().items():
            if not (name.startswith('Youtube') and name.endswith('IE')):
                continue
            if klass is cls:
                continue
            if klass.suitable(url):
                return False
        return super(YoutubeUserIE, cls).suitable(url)

    def _build_template_url(self, url, channel_id):
        """Build the videos page URL, keeping the ``user``/``c`` prefix found in *url*."""
        mobj = re.match(self._VALID_URL, url)
        # URLs without an explicit prefix (and ``ytuser:``) default to "user"
        prefix = mobj.group('user') or 'user'
        return self._TEMPLATE_URL % (prefix, mobj.group('id'))
3124
b05654f0 3125
f07e276a
S
class YoutubeLiveIE(YoutubeBaseInfoExtractor):
    """Extractor for ``.../live`` URLs of a user or channel."""
    IE_NAME = 'youtube:live'
    IE_DESC = 'YouTube.com live streams'
    _VALID_URL = r'(?P<base_url>https?://(?:\w+\.)?youtube\.com/(?:(?:user|channel|c)/)?(?P<id>[^/]+))/live'

    _TESTS = [{
        'url': 'https://www.youtube.com/user/TheYoungTurks/live',
        'info_dict': {
            'id': 'a48o2S1cPoo',
            'ext': 'mp4',
            'title': 'The Young Turks - Live Main Show',
            'uploader': 'The Young Turks',
            'uploader_id': 'TheYoungTurks',
            'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/TheYoungTurks',
            'upload_date': '20150715',
            'license': 'Standard YouTube License',
            'description': 'md5:438179573adcdff3c97ebb1ee632b891',
            'categories': ['News & Politics'],
            'tags': ['Cenk Uygur (TV Program Creator)', 'The Young Turks (Award-Winning Work)', 'Talk Show (TV Genre)'],
            'like_count': int,
            'dislike_count': int,
        },
        'params': {
            'skip_download': True,
        },
    }, {
        'url': 'https://www.youtube.com/channel/UC1yBKRuGpC1tSM73A0ZjYjQ/live',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/c/CommanderVideoHq/live',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/TheYoungTurks/live',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Redirect to the video shown on the live page, or to the base URL if there is none."""
        base_url, channel_id = re.match(self._VALID_URL, url).group('base_url', 'id')
        webpage = self._download_webpage(url, channel_id, fatal=False)
        if not webpage:
            return self.url_result(base_url)

        page_type = self._og_search_property(
            'type', webpage, 'page type', default='')
        video_id = self._html_search_meta(
            'videoId', webpage, 'video id', default=None)
        # Only trust the page when it is a video page with a well-formed 11 character id
        is_video_page = page_type.startswith('video')
        if is_video_page and video_id and re.match(r'^[0-9A-Za-z_-]{11}$', video_id):
            return self.url_result(video_id, YoutubeIE.ie_key())
        return self.url_result(base_url)
3176
3177
e462474e
S
class YoutubePlaylistsIE(YoutubePlaylistsBaseInfoExtractor):
    """Extractor for the ``/playlists`` tab of a user or channel; extraction logic is inherited."""
    IE_NAME = 'youtube:playlists'
    IE_DESC = 'YouTube.com user/channel playlists'
    _VALID_URL = r'https?://(?:\w+\.)?youtube\.com/(?:user|channel|c)/(?P<id>[^/]+)/playlists'

    _TESTS = [
        {
            'url': 'https://www.youtube.com/user/ThirstForScience/playlists',
            'playlist_mincount': 4,
            'info_dict': {
                'id': 'ThirstForScience',
                'title': 'ThirstForScience',
            },
        },
        {
            # with "Load more" button
            'url': 'https://www.youtube.com/user/igorkle1/playlists?view=1&sort=dd',
            'playlist_mincount': 70,
            'info_dict': {
                'id': 'igorkle1',
                'title': 'Игорь Клейнер',
            },
        },
        {
            'url': 'https://www.youtube.com/channel/UCiU1dHvZObB2iP6xkJ__Icw/playlists',
            'playlist_mincount': 17,
            'info_dict': {
                'id': 'UCiU1dHvZObB2iP6xkJ__Icw',
                'title': 'Chem Player',
            },
            'skip': 'Blocked',
        },
        {
            'url': 'https://www.youtube.com/c/ChristophLaimer/playlists',
            'only_matching': True,
        },
    ]
0c148415
S
3210
3211
870f3bfc
S
class YoutubeSearchBaseInfoExtractor(YoutubePlaylistBaseInfoExtractor):
    """Shared base for the search extractors; only overrides the video link pattern."""
    # Matches a /watch link with an 11 character id and, optionally, the title attribute after it
    _VIDEO_RE = r'href="\s*/watch\?v=(?P<id>[0-9A-Za-z_-]{11})(?:[^"]*"[^>]+\btitle="(?P<title>[^"]+))?'
3214
3215
class YoutubeSearchIE(SearchInfoExtractor, YoutubeSearchBaseInfoExtractor):
    """Keyword search extractor (``ytsearch``) backed by the ``youtubei/v1/search`` JSON endpoint."""
    IE_DESC = 'YouTube.com searches'
    # there doesn't appear to be a real limit, for example if you search for
    # 'python' you get more than 8.000.000 results
    _MAX_RESULTS = float('inf')
    IE_NAME = 'youtube:search'
    _SEARCH_KEY = 'ytsearch'
    # Optional value for the request's "params" field; subclasses set it to change ordering
    _SEARCH_PARAMS = None
    _TESTS = []

    def _entries(self, query, n):
        """Yield up to *n* ``url_transparent`` results for *query*.

        Result pages are requested one after another using the continuation
        token of the previous response. Iteration stops when *n* results were
        produced, a page fails to download, a page has no usable content or no
        continuation token is available.
        """
        data = {
            'context': {
                'client': {
                    'clientName': 'WEB',
                    'clientVersion': '2.20201021.03.00',
                }
            },
            'query': query,
        }
        if self._SEARCH_PARAMS:
            data['params'] = self._SEARCH_PARAMS
        total = 0
        for page_num in itertools.count(1):
            search = self._download_json(
                'https://www.youtube.com/youtubei/v1/search?key=AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8',
                video_id='query "%s"' % query,
                note='Downloading page %s' % page_num,
                errnote='Unable to download API page', fatal=False,
                data=json.dumps(data).encode('utf8'),
                headers={'content-type': 'application/json'})
            if not search:
                break
            # The first page and the continuation pages use different layouts
            slr_contents = try_get(
                search,
                (lambda x: x['contents']['twoColumnSearchResultsRenderer']['primaryContents']['sectionListRenderer']['contents'],
                 lambda x: x['onResponseReceivedCommands'][0]['appendContinuationItemsAction']['continuationItems']),
                list)
            if not slr_contents:
                break
            isr_contents = try_get(
                slr_contents,
                lambda x: x[0]['itemSectionRenderer']['contents'],
                list)
            if not isr_contents:
                break
            for content in isr_contents:
                if not isinstance(content, dict):
                    continue
                video = content.get('videoRenderer')
                if not isinstance(video, dict):
                    continue
                video_id = video.get('videoId')
                if not video_id:
                    continue
                title = try_get(video, lambda x: x['title']['runs'][0]['text'], compat_str)
                description = try_get(video, lambda x: x['descriptionSnippet']['runs'][0]['text'], compat_str)
                duration = parse_duration(try_get(video, lambda x: x['lengthText']['simpleText'], compat_str))
                view_count_text = try_get(video, lambda x: x['viewCountText']['simpleText'], compat_str) or ''
                # Counts are grouped ("1,234,567 views"): capture the separators
                # too and let str_to_int strip them. Matching digits only would
                # stop at the first separator and report a count of 1.
                view_count = str_to_int(self._search_regex(
                    r'^([\d,.]+)', re.sub(r'\s', '', view_count_text),
                    'view count', default=None))
                uploader = try_get(video, lambda x: x['ownerText']['runs'][0]['text'], compat_str)
                total += 1
                yield {
                    '_type': 'url_transparent',
                    'ie_key': YoutubeIE.ie_key(),
                    'id': video_id,
                    'url': video_id,
                    'title': title,
                    'description': description,
                    'duration': duration,
                    'view_count': view_count,
                    'uploader': uploader,
                }
                if total == n:
                    return
            token = try_get(
                slr_contents,
                lambda x: x[1]['continuationItemRenderer']['continuationEndpoint']['continuationCommand']['token'],
                compat_str)
            if not token:
                break
            data['continuation'] = token

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""
        return self.playlist_result(self._entries(query, n), query)
75dff0ee 3304
c9ae7b95 3305
class YoutubeSearchDateIE(YoutubeSearchIE):
    """Variant of the keyword search (``ytsearchdate``) that sends its own search params."""
    IE_NAME = '%s:date' % YoutubeSearchIE.IE_NAME
    IE_DESC = 'YouTube.com searches, newest videos first'
    _SEARCH_KEY = 'ytsearchdate'
    _SEARCH_PARAMS = 'CAI%3D'
75dff0ee 3311
c9ae7b95 3312
class YoutubeSearchURLIE(YoutubeSearchBaseInfoExtractor):
    """Extractor for ``/results?search_query=...`` URLs, reading videos from the embedded ytInitialData."""
    IE_NAME = 'youtube:search_url'
    IE_DESC = 'YouTube.com search URLs'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/results\?(.*?&)?(?:search_query|q)=(?P<query>[^&]+)(?:[&]|$)'
    # Captures the JSON object assigned to ytInitialData in the page
    _SEARCH_DATA = r'(?:window\["ytInitialData"\]|ytInitialData)\W?=\W?({.*?});'
    _TESTS = [{
        'url': 'https://www.youtube.com/results?baz=bar&search_query=youtube-dl+test+video&filters=video&lclk=video',
        'playlist_mincount': 5,
        'info_dict': {
            'title': 'youtube-dl test video',
        }
    }, {
        'url': 'https://www.youtube.com/results?q=test&sp=EgQIBBgB',
        'only_matching': True,
    }]

    def _find_videos_in_json(self, extracted):
        """Return every dict holding a "videoId" key found in *extracted*, in traversal order.

        Dicts that carry "videoId" are collected as-is and not searched further.
        """
        found = []

        def _walk(node):
            if node is None or isinstance(node, str):
                return

            if type(node) is list:
                for item in node:
                    _walk(item)
                return

            if type(node) is not dict:
                return

            if "videoId" in node:
                found.append(node)
                return

            for _, value in node.items():
                _walk(value)

        _walk(extracted)

        return found

    def extract_videos_from_page_impl(self, page, ids_in_page, titles_in_page):
        """Fill *ids_in_page* and *titles_in_page* (parallel lists) from the page's ytInitialData.

        An id that is already listed only gets its title filled in when the
        stored title is empty.
        """
        raw_data = self._search_regex(self._SEARCH_DATA, page, 'ytInitialData')
        search_response = self._parse_json(raw_data, None)

        for renderer in self._find_videos_in_json(search_response):
            video_id = try_get(renderer, lambda x: x['videoId'])
            video_title = (
                try_get(renderer, lambda x: x['title']['runs'][0]['text'])
                or try_get(renderer, lambda x: x['title']['simpleText']))

            if video_id is None or video_title is None:
                # we do not have a videoRenderer or title extraction broke
                continue

            video_title = video_title.strip()

            if video_id in ids_in_page:
                position = ids_in_page.index(video_id)
                if video_title and not titles_in_page[position]:
                    titles_in_page[position] = video_title
            else:
                ids_in_page.append(video_id)
                titles_in_page.append(video_title)

    def extract_videos_from_page(self, page):
        """Return the (id, title) pairs found in *page*."""
        video_ids, video_titles = [], []
        self.extract_videos_from_page_impl(page, video_ids, video_titles)
        return zip(video_ids, video_titles)

    def _real_extract(self, url):
        """Download the results page and return its videos as a playlist titled with the query."""
        encoded_query = re.match(self._VALID_URL, url).group('query')
        query = compat_urllib_parse_unquote_plus(encoded_query)
        webpage = self._download_webpage(url, query)
        entries = self._process_page(webpage)
        return self.playlist_result(entries, playlist_title=query)
c9ae7b95
PH
3386
3387
class YoutubeShowIE(YoutubePlaylistsBaseInfoExtractor):
    """Extractor for ``/show/<id>`` URLs, delegating to the show's ``/playlists`` page."""
    IE_NAME = 'youtube:show'
    IE_DESC = 'YouTube.com (multi-season) shows'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/show/(?P<id>[^?#]*)'
    _TESTS = [{
        'url': 'https://www.youtube.com/show/airdisasters',
        'playlist_mincount': 5,
        'info_dict': {
            'id': 'airdisasters',
            'title': 'Air Disasters',
        }
    }]

    def _real_extract(self, url):
        """Run the inherited extraction on the show's playlists URL."""
        show_id = self._match_id(url)
        playlists_url = 'https://www.youtube.com/show/%s/playlists' % show_id
        return super(YoutubeShowIE, self)._real_extract(playlists_url)
04cc9617
JMF
3405
3406
class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor):
    """
    Base class for feed extractors
    Subclasses must define the _FEED_NAME and _PLAYLIST_TITLE properties.
    """
    _LOGIN_REQUIRED = True
    # Captures the JSON object assigned to ytInitialData in the feed page
    _FEED_DATA = r'(?:window\["ytInitialData"\]|ytInitialData)\W?=\W?({.*?});'
    # Captures the JSON object passed to ytcfg.set(...)
    _YTCFG_DATA = r"ytcfg.set\(({.*?})\)"

    # Request header name -> ytcfg key that provides its value
    _YTCFG_HEADERS = (
        ('X-YouTube-Client-Name', 'INNERTUBE_CONTEXT_CLIENT_NAME'),
        ('X-YouTube-Client-Version', 'INNERTUBE_CONTEXT_CLIENT_VERSION'),
        ('X-Youtube-Identity-Token', 'ID_TOKEN'),
        ('X-YouTube-Device', 'DEVICE'),
        ('X-YouTube-Page-CL', 'PAGE_CL'),
        ('X-YouTube-Page-Label', 'PAGE_BUILD_LABEL'),
        ('X-YouTube-Variants-Checksum', 'VARIANTS_CHECKSUM'),
    )

    @property
    def IE_NAME(self):
        """Extractor name derived from the subclass' _FEED_NAME."""
        return 'youtube:%s' % self._FEED_NAME

    def _real_initialize(self):
        self._login()

    def _find_videos_in_json(self, extracted):
        """Walk *extracted* and return ``(videos, continuation)``.

        *videos* lists every dict holding a "videoId" key, in traversal order;
        such dicts are not searched further. *continuation* is the value of the
        last "nextContinuationData" key met during the walk, or None.
        """
        videos = []
        c = {}

        def _real_find(obj):
            if obj is None or isinstance(obj, str):
                return

            if type(obj) is list:
                for elem in obj:
                    _real_find(elem)

            if type(obj) is dict:
                if "videoId" in obj:
                    videos.append(obj)
                    return

                if "nextContinuationData" in obj:
                    c["continuation"] = obj["nextContinuationData"]
                    return

                for _, o in obj.items():
                    _real_find(o)

        _real_find(extracted)

        return videos, try_get(c, lambda x: x["continuation"])

    def _entries(self, page):
        """Yield a url_result for every video of the feed, following continuations.

        Videos already seen on an earlier page are skipped. Iteration stops
        when a page brings no new video, when no continuation data or ytcfg
        is available, or when the continuation carries no token.
        """
        yt_conf = self._parse_json(self._search_regex(self._YTCFG_DATA, page, 'ytcfg.set', default="null"), None, fatal=False)

        search_response = self._parse_json(self._search_regex(self._FEED_DATA, page, 'ytInitialData'), None)

        # Ids yielded on earlier pages; a set keeps the check constant time
        seen_ids = set()

        for page_num in itertools.count(1):
            video_info, continuation = self._find_videos_in_json(search_response)

            new_videos = []
            for video in video_info:
                video_id = try_get(video, lambda x: x['videoId'], compat_str)
                if video_id and video_id not in seen_ids:
                    new_videos.append((video_id, video))

            if not new_videos:
                break

            # Only register the ids once the whole page was scanned, so the
            # check above compares against previous pages only.
            seen_ids.update(video_id for video_id, _ in new_videos)

            for video_id, video in new_videos:
                video_title = (
                    try_get(video, lambda x: x['title']['runs'][0]['text'])
                    or try_get(video, lambda x: x['title']['simpleText']))
                yield self.url_result(video_id, YoutubeIE.ie_key(), video_title=video_title)

            if not continuation or not yt_conf:
                break

            token = try_get(continuation, lambda x: x["continuation"], compat_str)
            if not token:
                # Without a token there is nothing meaningful to request
                break

            query = {
                "ctoken": token,
                "continuation": token,
                "itct": try_get(continuation, lambda x: x["clickTrackingParams"]),
            }
            headers = dict(
                (name, try_get(yt_conf, lambda x, key=key: x[key]))
                for name, key in self._YTCFG_HEADERS)
            # Drop missing values: a None header value is rejected by the HTTP
            # layer and a None query value would be sent as the text "None".
            query = dict((k, v) for k, v in query.items() if v is not None)
            headers = dict((k, v) for k, v in headers.items() if v is not None)

            search_response = self._download_json(
                'https://www.youtube.com/browse_ajax', self._PLAYLIST_TITLE,
                'Downloading page #%s' % page_num,
                transform_source=uppercase_escape,
                query=query,
                headers=headers)

    def _real_extract(self, url):
        """Download the feed page and return its videos as a playlist."""
        page = self._download_webpage(
            'https://www.youtube.com/feed/%s' % self._FEED_NAME,
            self._PLAYLIST_TITLE)
        return self.playlist_result(
            self._entries(page), playlist_title=self._PLAYLIST_TITLE)
25f14e9f
S
3513
3514
class YoutubeWatchLaterIE(YoutubePlaylistIE):
    """Extractor for the "WL" (watch later) playlist."""
    IE_NAME = 'youtube:watchlater'
    IE_DESC = 'Youtube watch later list, ":ytwatchlater" for short (requires authentication)'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/(?:feed/watch_later|(?:playlist|watch)\?(?:.+&)?list=WL)|:ytwatchlater'

    _TESTS = [{
        'url': 'https://www.youtube.com/playlist?list=WL',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/watch?v=bCNU9TrbiRk&index=1&list=WL',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Return the single video when the inherited check selects one, else the WL playlist."""
        single_video = self._check_download_just_video(url, 'WL')[1]
        if single_video:
            return single_video
        playlist = self._extract_playlist('WL')[1]
        return playlist
f459d170 3534
5f6a1245 3535
class YoutubeFavouritesIE(YoutubeBaseInfoExtractor):
    """Extractor that resolves the favourites page to the playlist it links to."""
    IE_NAME = 'youtube:favorites'
    IE_DESC = 'YouTube.com favourite videos, ":ytfav" for short (requires authentication)'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/my_favorites|:ytfav(?:ou?rites)?'
    _LOGIN_REQUIRED = True

    def _real_extract(self, url):
        """Read the playlist id from the favourites page and hand over to the playlist extractor."""
        favourites_page = self._download_webpage(
            'https://www.youtube.com/my_favorites', 'Youtube Favourites videos')
        playlist_id = self._search_regex(
            r'list=(.+?)["&]', favourites_page, 'favourites playlist id')
        return self.url_result(playlist_id, 'YoutubePlaylist')
15870e90
PH
3546
3547
25f14e9f
S
class YoutubeRecommendedIE(YoutubeFeedsInfoExtractor):
    """Feed extractor for the "recommended" feed."""
    _FEED_NAME = 'recommended'
    _PLAYLIST_TITLE = 'Youtube Recommended videos'
    IE_DESC = 'YouTube.com recommended videos, ":ytrec" for short (requires authentication)'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/feed/recommended|:ytrec(?:ommended)?'
1ed5b5c9 3553
1ed5b5c9 3554
25f14e9f
S
class YoutubeSubscriptionsIE(YoutubeFeedsInfoExtractor):
    """Feed extractor for the "subscriptions" feed."""
    _FEED_NAME = 'subscriptions'
    _PLAYLIST_TITLE = 'Youtube Subscriptions'
    IE_DESC = 'YouTube.com subscriptions feed, "ytsubs" keyword (requires authentication)'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/feed/subscriptions|:ytsubs(?:criptions)?'
1ed5b5c9 3560
1ed5b5c9 3561
25f14e9f
S
class YoutubeHistoryIE(YoutubeFeedsInfoExtractor):
    """Feed extractor for the "history" feed."""
    _FEED_NAME = 'history'
    _PLAYLIST_TITLE = 'Youtube History'
    IE_DESC = 'Youtube watch history, ":ythistory" for short (requires authentication)'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/feed/history|:ythistory'
1ed5b5c9
JMF
3567
3568
15870e90
PH
class YoutubeTruncatedURLIE(InfoExtractor):
    """Matches watch/attribution URLs that lack a video id and reports a helpful error."""
    IE_NAME = 'youtube:truncated_url'
    IE_DESC = False  # Do not list
    _VALID_URL = r'''(?x)
        (?:https?://)?
        (?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie)?\.com/
        (?:watch\?(?:
            feature=[a-z_]+|
            annotation_id=annotation_[^&]+|
            x-yt-cl=[0-9]+|
            hl=[^&]*|
            t=[0-9]+
        )?
        |
            attribution_link\?a=[^&]+
        )
        $
    '''

    _TESTS = [
        {
            'url': 'https://www.youtube.com/watch?annotation_id=annotation_3951667041',
            'only_matching': True,
        },
        {
            'url': 'https://www.youtube.com/watch?',
            'only_matching': True,
        },
        {
            'url': 'https://www.youtube.com/watch?x-yt-cl=84503534',
            'only_matching': True,
        },
        {
            'url': 'https://www.youtube.com/watch?feature=foo',
            'only_matching': True,
        },
        {
            'url': 'https://www.youtube.com/watch?hl=en-GB',
            'only_matching': True,
        },
        {
            'url': 'https://www.youtube.com/watch?t=2372',
            'only_matching': True,
        },
    ]

    def _real_extract(self, url):
        """Always raise: the URL carries no video id, most likely due to shell quoting."""
        hint = (
            'Did you forget to quote the URL? Remember that & is a meta '
            'character in most shells, so you want to put the URL in quotes, '
            'like  youtube-dl '
            '"https://www.youtube.com/watch?feature=foo&v=BaW_jenozKc" '
            ' or simply  youtube-dl BaW_jenozKc  .')
        raise ExtractorError(hint, expected=True)
772fd5cc
PH
3616
3617
class YoutubeTruncatedIDIE(InfoExtractor):
    """Matches watch URLs whose video id has 1 to 10 characters and reports them as truncated."""
    IE_NAME = 'youtube:truncated_id'
    IE_DESC = False  # Do not list
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/watch\?v=(?P<id>[0-9A-Za-z_-]{1,10})$'

    _TESTS = [{
        'url': 'https://www.youtube.com/watch?v=N_708QY7Ob',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Always raise, naming the incomplete id and the offending URL."""
        video_id = self._match_id(url)
        message = 'Incomplete YouTube ID %s. URL %s looks truncated.' % (video_id, url)
        raise ExtractorError(message, expected=True)