]> jfr.im git - yt-dlp.git/blame - youtube_dlc/extractor/youtube.py
COMPLAINFREE
[yt-dlp.git] / youtube_dlc / extractor / youtube.py
CommitLineData
c5e8d7af 1# coding: utf-8
c5e8d7af 2
78caa52a
PH
3from __future__ import unicode_literals
4
5
0ca96d48 6import itertools
c5e8d7af 7import json
c4417ddb 8import os.path
d77ab8e2 9import random
c5e8d7af 10import re
42939b61 11import time
e0df6211 12import traceback
c5e8d7af 13
b05654f0 14from .common import InfoExtractor, SearchInfoExtractor
2b25cb5d 15from ..jsinterp import JSInterpreter
54256267 16from ..swfinterp import SWFInterpreter
4bb4a188 17from ..compat import (
edf3e38e 18 compat_chr,
f8c55c66 19 compat_HTTPError,
8d81f3e3 20 compat_kwargs,
c5e8d7af 21 compat_parse_qs,
7fd002c0
S
22 compat_urllib_parse_unquote,
23 compat_urllib_parse_unquote_plus,
15707c7e 24 compat_urllib_parse_urlencode,
7c80519c 25 compat_urllib_parse_urlparse,
7c61bd36 26 compat_urlparse,
c5e8d7af 27 compat_str,
4bb4a188
PH
28)
29from ..utils import (
27019dbb 30 bool_or_none,
c5e8d7af 31 clean_html,
9b9c5355 32 error_to_compat_str,
351f37c0 33 extract_attributes,
c5e8d7af 34 ExtractorError,
2d30521a 35 float_or_none,
4bb4a188
PH
36 get_element_by_attribute,
37 get_element_by_id,
dd27fd17 38 int_or_none,
94278f72 39 mimetype2ext,
4bb4a188 40 orderedSet,
6310acf5 41 parse_codecs,
b84071c0 42 parse_count,
7c80519c 43 parse_duration,
0cb58b02 44 remove_quotes,
3995d37d 45 remove_start,
cf7e015f 46 smuggle_url,
dbdaaa23 47 str_or_none,
c93d53f5 48 str_to_int,
556dbe7f 49 try_get,
c5e8d7af
PH
50 unescapeHTML,
51 unified_strdate,
cf7e015f 52 unsmuggle_url,
81c2f20b 53 uppercase_escape,
21c340b8 54 url_or_none,
6e6bc8da 55 urlencode_postdata,
c5e8d7af
PH
56)
57
5f6a1245 58
class YoutubeBaseInfoExtractor(InfoExtractor):
    """Provide base functions for Youtube extractors"""
    _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
    _TWOFACTOR_URL = 'https://accounts.google.com/signin/challenge'

    _LOOKUP_URL = 'https://accounts.google.com/_/signin/sl/lookup'
    _CHALLENGE_URL = 'https://accounts.google.com/_/signin/sl/challenge'
    _TFA_URL = 'https://accounts.google.com/_/signin/challenge?hl=en&TL={0}'

    _NETRC_MACHINE = 'youtube'
    # If True it will raise an error if no login info is provided
    _LOGIN_REQUIRED = False

    _PLAYLIST_ID_RE = r'(?:PL|LL|EC|UU|FL|RD|UL|TL|PU|OLAK5uy_)[0-9A-Za-z-_]{10,}'

    # Headers sent along with the JSON "load more" requests
    _YOUTUBE_CLIENT_HEADERS = {
        'x-youtube-client-name': '1',
        'x-youtube-client-version': '1.20200609.04.02',
    }

    def _set_language(self):
        """Set the PREF cookie on .youtube.com (includes hl=en)."""
        self._set_cookie(
            '.youtube.com', 'PREF', 'f1=50000000&f6=8&hl=en',
            # YouTube sets the expire time to about two months
            expire_time=time.time() + 2 * 30 * 24 * 3600)

    def _ids_to_results(self, ids):
        """Return a list of url_result entries (ie 'Youtube'), one per video id."""
        return [
            self.url_result(vid_id, 'Youtube', video_id=vid_id)
            for vid_id in ids]

    def _login(self):
        """
        Attempt to log in to YouTube.
        True is returned if successful or skipped.
        False is returned if login failed.

        If _LOGIN_REQUIRED is set and no authentication was provided, an error is raised.
        """
        username, password = self._get_login_info()
        # No authentication to be performed
        if username is None:
            if self._LOGIN_REQUIRED and self._downloader.params.get('cookiefile') is None:
                raise ExtractorError('No login info available, needed for using %s.' % self.IE_NAME, expected=True)
            if self._downloader.params.get('cookiefile') and False:  # TODO remove 'and False' later - too many people using outdated cookies and open issues, remind them.
                self.to_screen('[Cookies] Reminder - Make sure to always use up to date cookies!')
            return True

        login_page = self._download_webpage(
            self._LOGIN_URL, None,
            note='Downloading login page',
            errnote='unable to fetch login page', fatal=False)
        if login_page is False:
            # Non-fatal download failure: report it as a failed login
            return False

        login_form = self._hidden_inputs(login_page)

        def req(url, f_req, note, errnote):
            # POST the hidden login form fields plus the JSON-encoded f_req
            # payload; yields False on failure because fatal=False is passed.
            data = login_form.copy()
            data.update({
                'pstMsg': 1,
                'checkConnection': 'youtube',
                'checkedDomains': 'youtube',
                'hl': 'en',
                'deviceinfo': '[null,null,null,[],null,"US",null,null,[],"GlifWebSignIn",null,[null,null,[]]]',
                'f.req': json.dumps(f_req),
                'flowName': 'GlifWebSignIn',
                'flowEntry': 'ServiceLogin',
                # TODO: reverse actual botguard identifier generation algo
                'bgRequest': '["identifier",""]',
            })
            return self._download_json(
                url, None, note=note, errnote=errnote,
                # Drop everything before the first '[' so the rest parses as JSON
                transform_source=lambda s: re.sub(r'^[^[]*', '', s),
                fatal=False,
                data=urlencode_postdata(data), headers={
                    'Content-Type': 'application/x-www-form-urlencoded;charset=utf-8',
                    'Google-Accounts-XSRF': 1,
                })

        def warn(message):
            self._downloader.report_warning(message)

        # Same sign-in URL is embedded in both the lookup and challenge payloads
        service_login_url = 'https://accounts.google.com/ServiceLogin?passive=true&continue=https%3A%2F%2Fwww.youtube.com%2Fsignin%3Fnext%3D%252F%26action_handle_signin%3Dtrue%26hl%3Den%26app%3Ddesktop%26feature%3Dsign_in_button&hl=en&service=youtube&uilel=3&requestPath=%2FServiceLogin&Page=PasswordSeparationSignIn'

        lookup_req = [
            username,
            None, [], None, 'US', None, None, 2, False, True,
            [
                None, None,
                [2, 1, None, 1,
                 service_login_url,
                 None, [], 4],
                1, [None, None, []], None, None, None, True
            ],
            username,
        ]

        lookup_results = req(
            self._LOOKUP_URL, lookup_req,
            'Looking up account info', 'Unable to look up account info')

        if lookup_results is False:
            return False

        user_hash = try_get(lookup_results, lambda x: x[0][2], compat_str)
        if not user_hash:
            warn('Unable to extract user hash')
            return False

        challenge_req = [
            user_hash,
            None, 1, None, [1, None, None, None, [password, None, True]],
            [
                None, None, [2, 1, None, 1, service_login_url, None, [], 4],
                1, [None, None, []], None, None, None, True
            ]]

        challenge_results = req(
            self._CHALLENGE_URL, challenge_req,
            'Logging in', 'Unable to log in')

        if challenge_results is False:
            return False

        login_res = try_get(challenge_results, lambda x: x[0][5], list)
        if login_res:
            login_msg = try_get(login_res, lambda x: x[5], compat_str)
            # Parenthesised so the prefix is kept for every message, not only
            # for the 'Invalid password' case.
            warn('Unable to login: %s' % (
                'Invalid password'
                if login_msg == 'INCORRECT_ANSWER_ENTERED' else login_msg))
            return False

        res = try_get(challenge_results, lambda x: x[0][-1], list)
        if not res:
            warn('Unable to extract result entry')
            return False

        login_challenge = try_get(res, lambda x: x[0][0], list)
        if login_challenge:
            challenge_str = try_get(login_challenge, lambda x: x[2], compat_str)
            if challenge_str == 'TWO_STEP_VERIFICATION':
                # SEND_SUCCESS - TFA code has been successfully sent to phone
                # QUOTA_EXCEEDED - reached the limit of TFA codes
                status = try_get(login_challenge, lambda x: x[5], compat_str)
                if status == 'QUOTA_EXCEEDED':
                    warn('Exceeded the limit of TFA codes, try later')
                    return False

                tl = try_get(challenge_results, lambda x: x[1][2], compat_str)
                if not tl:
                    warn('Unable to extract TL')
                    return False

                tfa_code = self._get_tfa_info('2-step verification code')

                if not tfa_code:
                    warn(
                        'Two-factor authentication required. Provide it either interactively or with --twofactor <code> '
                        '(Note that only TOTP (Google Authenticator App) codes work at this time.)')
                    return False

                tfa_code = remove_start(tfa_code, 'G-')

                tfa_req = [
                    user_hash, None, 2, None,
                    [
                        9, None, None, None, None, None, None, None,
                        [None, tfa_code, True, 2]
                    ]]

                tfa_results = req(
                    self._TFA_URL.format(tl), tfa_req,
                    'Submitting TFA code', 'Unable to submit TFA code')

                if tfa_results is False:
                    return False

                tfa_res = try_get(tfa_results, lambda x: x[0][5], list)
                if tfa_res:
                    tfa_msg = try_get(tfa_res, lambda x: x[5], compat_str)
                    # Parenthesised for the same reason as the login warning
                    warn('Unable to finish TFA: %s' % (
                        'Invalid TFA code'
                        if tfa_msg == 'INCORRECT_ANSWER_ENTERED' else tfa_msg))
                    return False

                check_cookie_url = try_get(
                    tfa_results, lambda x: x[0][-1][2], compat_str)
            else:
                CHALLENGES = {
                    'LOGIN_CHALLENGE': "This device isn't recognized. For your security, Google wants to make sure it's really you.",
                    'USERNAME_RECOVERY': 'Please provide additional information to aid in the recovery process.',
                    'REAUTH': "There is something unusual about your activity. For your security, Google wants to make sure it's really you.",
                }
                challenge = CHALLENGES.get(
                    challenge_str,
                    '%s returned error %s.' % (self.IE_NAME, challenge_str))
                warn('%s\nGo to https://accounts.google.com/, login and solve a challenge.' % challenge)
                return False
        else:
            check_cookie_url = try_get(res, lambda x: x[2], compat_str)

        if not check_cookie_url:
            warn('Unable to extract CheckCookie URL')
            return False

        check_cookie_results = self._download_webpage(
            check_cookie_url, None, 'Checking cookie', fatal=False)

        if check_cookie_results is False:
            return False

        if 'https://myaccount.google.com/' not in check_cookie_results:
            warn('Unable to log in')
            return False

        return True

    def _download_webpage_handle(self, *args, **kwargs):
        """Delegate to the parent implementation with disable_polymer=true added to the query."""
        # Copy so a caller-supplied query dict is not mutated
        query = kwargs.get('query', {}).copy()
        query['disable_polymer'] = 'true'
        kwargs['query'] = query
        return super(YoutubeBaseInfoExtractor, self)._download_webpage_handle(
            *args, **compat_kwargs(kwargs))

    def _real_initialize(self):
        """Set the language cookie and attempt to log in, if a downloader is attached."""
        if self._downloader is None:
            return
        self._set_language()
        if not self._login():
            return
c5e8d7af 288
8377574c 289
class YoutubeEntryListBaseInfoExtractor(YoutubeBaseInfoExtractor):
    # Extract entries from page with "Load more" button
    def _entries(self, page, playlist_id):
        """Yield the entries of every page reachable through "Load more" links.

        The first page is the given HTML; each following page is fetched as
        JSON from the data-uix-load-more-href link found in the previous
        load-more widget. self._process_page() turns page HTML into entries.
        """
        max_retries = 3
        load_more_re = r'data-uix-load-more-href="/?(?P<more>[^"]+)"'

        widget_html = current_html = page
        for page_num in itertools.count(1):
            for item in self._process_page(current_html):
                yield item

            next_link = re.search(load_more_re, widget_html)
            if next_link is None:
                return

            for attempt in itertools.count():
                retry_note = ' (retry #%d)' % attempt if attempt else ''
                try:
                    # Downloading page may result in intermittent 5xx HTTP error
                    # that is usually worked around with a retry
                    more = self._download_json(
                        'https://www.youtube.com/%s' % next_link.group('more'), playlist_id,
                        'Downloading page #%s%s' % (page_num, retry_note),
                        transform_source=uppercase_escape,
                        headers=self._YOUTUBE_CLIENT_HEADERS)
                except ExtractorError as err:
                    is_transient = (
                        isinstance(err.cause, compat_HTTPError)
                        and err.cause.code in (500, 503))
                    if is_transient and attempt < max_retries:
                        continue
                    # Any other error, or retries exhausted
                    raise
                break

            current_html = more['content_html']
            if not current_html.strip():
                # Some webpages show a "Load more" button but they don't
                # have more videos
                return
            widget_html = more['load_more_widget_html']
328
061a75ed
S
329
class YoutubePlaylistBaseInfoExtractor(YoutubeEntryListBaseInfoExtractor):
    def _process_page(self, content):
        """Yield a 'Youtube' url_result for each (id, title) pair found in content."""
        for vid, title in self.extract_videos_from_page(content):
            yield self.url_result(vid, 'Youtube', vid, title)

    def extract_videos_from_page_impl(self, video_re, page, ids_in_page, titles_in_page):
        """Collect video ids and titles matched by video_re into the two given lists.

        Both lists are extended in place and kept index-aligned. An id that is
        already present is not appended again; its title is only filled in
        when the stored one is empty.
        """
        for match in re.finditer(video_re, page):
            named = match.groupdict()
            video_id = match.group('id')
            # The link with index 0 is not the first video of the playlist (not sure if still actual)
            if 'index' in named and video_id == '0':
                continue

            title = None
            if 'title' in named:
                title = unescapeHTML(match.group('title'))
            if title:
                title = title.strip()
            if title == '► Play all':
                title = None

            if video_id in ids_in_page:
                pos = ids_in_page.index(video_id)
                if title and not titles_in_page[pos]:
                    titles_in_page[pos] = title
            else:
                ids_in_page.append(video_id)
                titles_in_page.append(title)

    def extract_videos_from_page(self, page):
        """Return zipped (video_id, title) pairs matched by self._VIDEO_RE in page."""
        found_ids, found_titles = [], []
        self.extract_videos_from_page_impl(
            self._VIDEO_RE, page, found_ids, found_titles)
        return zip(found_ids, found_titles)
361
362
061a75ed
S
class YoutubePlaylistsBaseInfoExtractor(YoutubeEntryListBaseInfoExtractor):
    def _process_page(self, content):
        """Yield a 'YoutubePlaylist' url_result for each distinct playlist id linked in content."""
        linked_ids = re.findall(
            r'<h3[^>]+class="[^"]*yt-lockup-title[^"]*"[^>]*><a[^>]+href="/?playlist\?list=([0-9A-Za-z-_]{10,})"',
            content)
        for playlist_id in orderedSet(linked_ids):
            playlist_url = 'https://www.youtube.com/playlist?list=%s' % playlist_id
            yield self.url_result(playlist_url, 'YoutubePlaylist')

    def _real_extract(self, url):
        """Download the page at url and return a playlist result of its entries."""
        playlist_id = self._match_id(url)
        webpage = self._download_webpage(url, playlist_id)
        title = self._og_search_title(webpage, fatal=False)
        entries = self._entries(webpage, playlist_id)
        return self.playlist_result(entries, playlist_id, title)
0c148415
S
376
377
360e1ca5 378class YoutubeIE(YoutubeBaseInfoExtractor):
78caa52a 379 IE_DESC = 'YouTube.com'
cb7dfeea 380 _VALID_URL = r"""(?x)^
c5e8d7af 381 (
edb53e2d 382 (?:https?://|//) # http(s):// or protocol-independent URL
66b48727 383 (?:(?:(?:(?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie|kids)?\.com/|
484aaeb2 384 (?:www\.)?deturl\.com/www\.youtube\.com/|
e70dc1d1 385 (?:www\.)?pwnyoutube\.com/|
8b561bfc 386 (?:www\.)?hooktube\.com/|
f7000f3a 387 (?:www\.)?yourepeat\.com/|
e69ae5b9 388 tube\.majestyc\.net/|
ba036333 389 # Invidious instances taken from https://github.com/omarroth/invidious/wiki/Invidious-Instances
77d95677 390 (?:(?:www|dev)\.)?invidio\.us/|
ba036333 391 (?:(?:www|no)\.)?invidiou\.sh/|
392 (?:(?:www|fi|de)\.)?invidious\.snopyta\.org/|
8ae113ca 393 (?:www\.)?invidious\.kabi\.tk/|
ba036333 394 (?:www\.)?invidious\.13ad\.de/|
791d2e81 395 (?:www\.)?invidious\.mastodon\.host/|
494d664e 396 (?:www\.)?invidious\.nixnet\.xyz/|
666d808e 397 (?:www\.)?invidious\.drycat\.fr/|
ba036333 398 (?:www\.)?tube\.poal\.co/|
8ae113ca 399 (?:www\.)?vid\.wxzm\.sx/|
384bf91f 400 (?:www\.)?yewtu\.be/|
494d664e 401 (?:www\.)?yt\.elukerio\.org/|
894b3826 402 (?:www\.)?yt\.lelux\.fi/|
1db5ab6b 403 (?:www\.)?invidious\.ggc-project\.de/|
404 (?:www\.)?yt\.maisputain\.ovh/|
405 (?:www\.)?invidious\.13ad\.de/|
406 (?:www\.)?invidious\.toot\.koeln/|
407 (?:www\.)?invidious\.fdn\.fr/|
408 (?:www\.)?watch\.nettohikari\.com/|
bff90fc5 409 (?:www\.)?kgg2m7yk5aybusll\.onion/|
410 (?:www\.)?qklhadlycap4cnod\.onion/|
411 (?:www\.)?axqzx4s6s54s32yentfqojs3x5i7faxza6xo3ehd4bzzsg2ii4fv2iid\.onion/|
412 (?:www\.)?c7hqkpkpemu6e7emz5b4vyz7idjgdvgaaa3dyimmeojqbgpea3xqjoid\.onion/|
413 (?:www\.)?fz253lmuao3strwbfbmx46yu7acac2jz27iwtorgmbqlkurlclmancad\.onion/|
414 (?:www\.)?invidious\.l4qlywnpwqsluw65ts7md3khrivpirse744un3x7mlskqauz5pyuzgqd\.onion/|
33c1c7d8 415 (?:www\.)?owxfohz4kjyv25fvlqilyxast7inivgiktls3th44jhk3ej3i7ya\.b32\.i2p/|
1db5ab6b 416 (?:www\.)?4l2dgddgsrkf2ous66i6seeyi6etzfgrue332grh2n7madpwopotugyd\.onion/|
e69ae5b9 417 youtube\.googleapis\.com/) # the various hostnames, with wildcard subdomains
c5e8d7af
PH
418 (?:.*?\#/)? # handle anchor (#/) redirect urls
419 (?: # the various things that can precede the ID:
ac7553d0 420 (?:(?:v|embed|e)/(?!videoseries)) # v/ or embed/ or e/
c5e8d7af 421 |(?: # or the v= param in all its forms
f7000f3a 422 (?:(?:watch|movie)(?:_popup)?(?:\.php)?/?)? # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
c5e8d7af 423 (?:\?|\#!?) # the params delimiter ? or # or #!
040ac686 424 (?:.*?[&;])?? # any other preceding param (like /?s=tuff&v=xxxx or ?s=tuff&amp;v=V36LpHqtcDY)
c5e8d7af
PH
425 v=
426 )
f4b05232 427 ))
cbaed4bb
S
428 |(?:
429 youtu\.be| # just youtu.be/xxxx
6d4fc66b
S
430 vid\.plus| # or vid.plus/xxxx
431 zwearz\.com/watch| # or zwearz.com/watch/xxxx
cbaed4bb 432 )/
edb53e2d 433 |(?:www\.)?cleanvideosearch\.com/media/action/yt/watch\?videoId=
f4b05232 434 )
c5e8d7af 435 )? # all until now is optional -> you can pass the naked ID
8963d9c2 436 ([0-9A-Za-z_-]{11}) # here is it! the YouTube video ID
d0ba5587
S
437 (?!.*?\blist=
438 (?:
439 %(playlist_id)s| # combined list/video URLs are handled by the playlist IE
440 WL # WL are handled by the watch later IE
441 )
442 )
c5e8d7af 443 (?(1).+)? # if we found the ID, everything can follow
d0ba5587 444 $""" % {'playlist_id': YoutubeBaseInfoExtractor._PLAYLIST_ID_RE}
c5e8d7af 445 _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
e40c758c
S
446 _PLAYER_INFO_RE = (
447 r'/(?P<id>[a-zA-Z0-9_-]{8,})/player_ias\.vflset(?:/[a-zA-Z]{2,3}_[a-zA-Z]{2,3})?/base\.(?P<ext>[a-z]+)$',
448 r'\b(?P<id>vfl[a-zA-Z0-9_-]+)\b.*?\.(?P<ext>[a-z]+)$',
449 )
2c62dc26 450 _formats = {
c2d3cb4c 451 '5': {'ext': 'flv', 'width': 400, 'height': 240, 'acodec': 'mp3', 'abr': 64, 'vcodec': 'h263'},
452 '6': {'ext': 'flv', 'width': 450, 'height': 270, 'acodec': 'mp3', 'abr': 64, 'vcodec': 'h263'},
453 '13': {'ext': '3gp', 'acodec': 'aac', 'vcodec': 'mp4v'},
454 '17': {'ext': '3gp', 'width': 176, 'height': 144, 'acodec': 'aac', 'abr': 24, 'vcodec': 'mp4v'},
455 '18': {'ext': 'mp4', 'width': 640, 'height': 360, 'acodec': 'aac', 'abr': 96, 'vcodec': 'h264'},
456 '22': {'ext': 'mp4', 'width': 1280, 'height': 720, 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264'},
457 '34': {'ext': 'flv', 'width': 640, 'height': 360, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'},
458 '35': {'ext': 'flv', 'width': 854, 'height': 480, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'},
3834d3e3 459 # itag 36 videos are either 320x180 (BaW_jenozKc) or 320x240 (__2ABJjxzNo), abr varies as well
c2d3cb4c 460 '36': {'ext': '3gp', 'width': 320, 'acodec': 'aac', 'vcodec': 'mp4v'},
461 '37': {'ext': 'mp4', 'width': 1920, 'height': 1080, 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264'},
462 '38': {'ext': 'mp4', 'width': 4096, 'height': 3072, 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264'},
463 '43': {'ext': 'webm', 'width': 640, 'height': 360, 'acodec': 'vorbis', 'abr': 128, 'vcodec': 'vp8'},
464 '44': {'ext': 'webm', 'width': 854, 'height': 480, 'acodec': 'vorbis', 'abr': 128, 'vcodec': 'vp8'},
465 '45': {'ext': 'webm', 'width': 1280, 'height': 720, 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8'},
e1a0bfdf 466 '46': {'ext': 'webm', 'width': 1920, 'height': 1080, 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8'},
c2d3cb4c 467 '59': {'ext': 'mp4', 'width': 854, 'height': 480, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'},
468 '78': {'ext': 'mp4', 'width': 854, 'height': 480, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'},
e1a0bfdf 469
470
471 # 3D videos
c2d3cb4c 472 '82': {'ext': 'mp4', 'height': 360, 'format_note': '3D', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -20},
473 '83': {'ext': 'mp4', 'height': 480, 'format_note': '3D', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -20},
474 '84': {'ext': 'mp4', 'height': 720, 'format_note': '3D', 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264', 'preference': -20},
475 '85': {'ext': 'mp4', 'height': 1080, 'format_note': '3D', 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264', 'preference': -20},
e1a0bfdf 476 '100': {'ext': 'webm', 'height': 360, 'format_note': '3D', 'acodec': 'vorbis', 'abr': 128, 'vcodec': 'vp8', 'preference': -20},
477 '101': {'ext': 'webm', 'height': 480, 'format_note': '3D', 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8', 'preference': -20},
478 '102': {'ext': 'webm', 'height': 720, 'format_note': '3D', 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8', 'preference': -20},
836a086c 479
96fb5605 480 # Apple HTTP Live Streaming
11f12195 481 '91': {'ext': 'mp4', 'height': 144, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 48, 'vcodec': 'h264', 'preference': -10},
c2d3cb4c 482 '92': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 48, 'vcodec': 'h264', 'preference': -10},
483 '93': {'ext': 'mp4', 'height': 360, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -10},
484 '94': {'ext': 'mp4', 'height': 480, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -10},
485 '95': {'ext': 'mp4', 'height': 720, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 256, 'vcodec': 'h264', 'preference': -10},
486 '96': {'ext': 'mp4', 'height': 1080, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 256, 'vcodec': 'h264', 'preference': -10},
e1a0bfdf 487 '132': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 48, 'vcodec': 'h264', 'preference': -10},
488 '151': {'ext': 'mp4', 'height': 72, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 24, 'vcodec': 'h264', 'preference': -10},
2c62dc26
PH
489
490 # DASH mp4 video
d23028a8
S
491 '133': {'ext': 'mp4', 'height': 240, 'format_note': 'DASH video', 'vcodec': 'h264'},
492 '134': {'ext': 'mp4', 'height': 360, 'format_note': 'DASH video', 'vcodec': 'h264'},
493 '135': {'ext': 'mp4', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'h264'},
494 '136': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'h264'},
495 '137': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'h264'},
067aa17e 496 '138': {'ext': 'mp4', 'format_note': 'DASH video', 'vcodec': 'h264'}, # Height can vary (https://github.com/ytdl-org/youtube-dl/issues/4559)
d23028a8
S
497 '160': {'ext': 'mp4', 'height': 144, 'format_note': 'DASH video', 'vcodec': 'h264'},
498 '212': {'ext': 'mp4', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'h264'},
499 '264': {'ext': 'mp4', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'h264'},
500 '298': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'h264', 'fps': 60},
501 '299': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'h264', 'fps': 60},
502 '266': {'ext': 'mp4', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'h264'},
836a086c 503
f6f1fc92 504 # Dash mp4 audio
d23028a8
S
505 '139': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 48, 'container': 'm4a_dash'},
506 '140': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 128, 'container': 'm4a_dash'},
507 '141': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 256, 'container': 'm4a_dash'},
508 '256': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'container': 'm4a_dash'},
509 '258': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'container': 'm4a_dash'},
510 '325': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'dtse', 'container': 'm4a_dash'},
511 '328': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'ec-3', 'container': 'm4a_dash'},
836a086c
AZ
512
513 # Dash webm
d23028a8
S
514 '167': {'ext': 'webm', 'height': 360, 'width': 640, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
515 '168': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
516 '169': {'ext': 'webm', 'height': 720, 'width': 1280, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
517 '170': {'ext': 'webm', 'height': 1080, 'width': 1920, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
518 '218': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
519 '219': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
520 '278': {'ext': 'webm', 'height': 144, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp9'},
521 '242': {'ext': 'webm', 'height': 240, 'format_note': 'DASH video', 'vcodec': 'vp9'},
522 '243': {'ext': 'webm', 'height': 360, 'format_note': 'DASH video', 'vcodec': 'vp9'},
523 '244': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'vp9'},
524 '245': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'vp9'},
525 '246': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'vp9'},
526 '247': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'vp9'},
527 '248': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'vp9'},
528 '271': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'vp9'},
4c6b4764 529 # itag 272 videos are either 3840x2160 (e.g. RtoitU2A-3E) or 7680x4320 (sLprVF6d7Ug)
d23028a8
S
530 '272': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'vp9'},
531 '302': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60},
532 '303': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60},
533 '308': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60},
534 '313': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'vp9'},
535 '315': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60},
2c62dc26
PH
536
537 # Dash webm audio
d23028a8
S
538 '171': {'ext': 'webm', 'acodec': 'vorbis', 'format_note': 'DASH audio', 'abr': 128},
539 '172': {'ext': 'webm', 'acodec': 'vorbis', 'format_note': 'DASH audio', 'abr': 256},
ce6b9a2d 540
0857baad 541 # Dash webm audio with opus inside
d23028a8
S
542 '249': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 50},
543 '250': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 70},
544 '251': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 160},
0857baad 545
ce6b9a2d
PH
546 # RTMP (unnamed)
547 '_rtmp': {'protocol': 'rtmp'},
b85eae0f
S
548
549 # av01 video only formats sometimes served with "unknown" codecs
550 '394': {'acodec': 'none', 'vcodec': 'av01.0.05M.08'},
551 '395': {'acodec': 'none', 'vcodec': 'av01.0.05M.08'},
552 '396': {'acodec': 'none', 'vcodec': 'av01.0.05M.08'},
553 '397': {'acodec': 'none', 'vcodec': 'av01.0.05M.08'},
c5e8d7af 554 }
84da5d84 555 _SUBTITLE_FORMATS = ('json3', 'srv1', 'srv2', 'srv3', 'ttml', 'vtt')
836a086c 556
fd5c4aab
S
557 _GEO_BYPASS = False
558
78caa52a 559 IE_NAME = 'youtube'
2eb88d95
PH
560 _TESTS = [
561 {
2d3d2997 562 'url': 'https://www.youtube.com/watch?v=BaW_jenozKc&t=1s&end=9',
4bc3a23e
PH
563 'info_dict': {
564 'id': 'BaW_jenozKc',
565 'ext': 'mp4',
3867038a 566 'title': 'youtube-dl test video "\'/\\ä↭𝕐',
4bc3a23e
PH
567 'uploader': 'Philipp Hagemeister',
568 'uploader_id': 'phihag',
ec85ded8 569 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/phihag',
dd4c4492
S
570 'channel_id': 'UCLqxVugv74EIW3VWh2NOa3Q',
571 'channel_url': r're:https?://(?:www\.)?youtube\.com/channel/UCLqxVugv74EIW3VWh2NOa3Q',
4bc3a23e 572 'upload_date': '20121002',
3867038a 573 'description': 'test chars: "\'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de .',
4bc3a23e 574 'categories': ['Science & Technology'],
3867038a 575 'tags': ['youtube-dl'],
556dbe7f 576 'duration': 10,
dbdaaa23 577 'view_count': int,
3e7c1224
PH
578 'like_count': int,
579 'dislike_count': int,
7c80519c 580 'start_time': 1,
297a564b 581 'end_time': 9,
2eb88d95 582 }
0e853ca4 583 },
fccd3771 584 {
4bc3a23e
PH
585 'url': '//www.YouTube.com/watch?v=yZIXLfi8CZQ',
586 'note': 'Embed-only video (#1746)',
587 'info_dict': {
588 'id': 'yZIXLfi8CZQ',
589 'ext': 'mp4',
590 'upload_date': '20120608',
591 'title': 'Principal Sexually Assaults A Teacher - Episode 117 - 8th June 2012',
592 'description': 'md5:09b78bd971f1e3e289601dfba15ca4f7',
593 'uploader': 'SET India',
94bfcd23 594 'uploader_id': 'setindia',
ec85ded8 595 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/setindia',
94bfcd23 596 'age_limit': 18,
fccd3771
PH
597 }
598 },
11b56058 599 {
2d3d2997 600 'url': 'https://www.youtube.com/watch?v=BaW_jenozKc&v=UxxajLWwzqY',
11b56058
PM
601 'note': 'Use the first video ID in the URL',
602 'info_dict': {
603 'id': 'BaW_jenozKc',
604 'ext': 'mp4',
3867038a 605 'title': 'youtube-dl test video "\'/\\ä↭𝕐',
11b56058
PM
606 'uploader': 'Philipp Hagemeister',
607 'uploader_id': 'phihag',
ec85ded8 608 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/phihag',
11b56058 609 'upload_date': '20121002',
3867038a 610 'description': 'test chars: "\'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de .',
11b56058 611 'categories': ['Science & Technology'],
3867038a 612 'tags': ['youtube-dl'],
556dbe7f 613 'duration': 10,
dbdaaa23 614 'view_count': int,
11b56058
PM
615 'like_count': int,
616 'dislike_count': int,
34a7de29
S
617 },
618 'params': {
619 'skip_download': True,
620 },
11b56058 621 },
dd27fd17 622 {
2d3d2997 623 'url': 'https://www.youtube.com/watch?v=a9LDPn-MO4I',
4bc3a23e
PH
624 'note': '256k DASH audio (format 141) via DASH manifest',
625 'info_dict': {
626 'id': 'a9LDPn-MO4I',
627 'ext': 'm4a',
628 'upload_date': '20121002',
629 'uploader_id': '8KVIDEO',
ec85ded8 630 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/8KVIDEO',
4bc3a23e
PH
631 'description': '',
632 'uploader': '8KVIDEO',
633 'title': 'UHDTV TEST 8K VIDEO.mp4'
4919603f 634 },
4bc3a23e
PH
635 'params': {
636 'youtube_include_dash_manifest': True,
637 'format': '141',
4919603f 638 },
de3c7fe0 639 'skip': 'format 141 not served anymore',
dd27fd17 640 },
aa79ac0c
PH
641 # Controversy video
642 {
643 'url': 'https://www.youtube.com/watch?v=T4XJQO3qol8',
644 'info_dict': {
645 'id': 'T4XJQO3qol8',
646 'ext': 'mp4',
556dbe7f 647 'duration': 219,
aa79ac0c 648 'upload_date': '20100909',
4fe54c12 649 'uploader': 'Amazing Atheist',
aa79ac0c 650 'uploader_id': 'TheAmazingAtheist',
ec85ded8 651 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/TheAmazingAtheist',
aa79ac0c
PH
652 'title': 'Burning Everyone\'s Koran',
653 'description': 'SUBSCRIBE: http://www.youtube.com/saturninefilms\n\nEven Obama has taken a stand against freedom on this issue: http://www.huffingtonpost.com/2010/09/09/obama-gma-interview-quran_n_710282.html',
654 }
c522adb1 655 },
dd2d55f1 656 # Normal age-gate video (embed allowed)
c522adb1 657 {
2d3d2997 658 'url': 'https://youtube.com/watch?v=HtVdAasjOgU',
c522adb1
JMF
659 'info_dict': {
660 'id': 'HtVdAasjOgU',
661 'ext': 'mp4',
662 'title': 'The Witcher 3: Wild Hunt - The Sword Of Destiny Trailer',
ec85ded8 663 'description': r're:(?s).{100,}About the Game\n.*?The Witcher 3: Wild Hunt.{100,}',
556dbe7f 664 'duration': 142,
c522adb1
JMF
665 'uploader': 'The Witcher',
666 'uploader_id': 'WitcherGame',
ec85ded8 667 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/WitcherGame',
c522adb1 668 'upload_date': '20140605',
34952f09 669 'age_limit': 18,
c522adb1
JMF
670 },
671 },
067aa17e 672 # Olympics (https://github.com/ytdl-org/youtube-dl/issues/4431)
e52a40ab
PH
673 {
674 'url': 'lqQg6PlCWgI',
675 'info_dict': {
676 'id': 'lqQg6PlCWgI',
677 'ext': 'mp4',
556dbe7f 678 'duration': 6085,
90227264 679 'upload_date': '20150827',
cbe2bd91 680 'uploader_id': 'olympic',
ec85ded8 681 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/olympic',
cbe2bd91 682 'description': 'HO09 - Women - GER-AUS - Hockey - 31 July 2012 - London 2012 Olympic Games',
be49068d 683 'uploader': 'Olympic',
cbe2bd91
PH
684 'title': 'Hockey - Women - GER-AUS - London 2012 Olympic Games',
685 },
686 'params': {
687 'skip_download': 'requires avconv',
e52a40ab 688 }
cbe2bd91 689 },
6271f1ca
PH
690 # Non-square pixels
691 {
692 'url': 'https://www.youtube.com/watch?v=_b-2C3KPAM0',
693 'info_dict': {
694 'id': '_b-2C3KPAM0',
695 'ext': 'mp4',
696 'stretched_ratio': 16 / 9.,
556dbe7f 697 'duration': 85,
6271f1ca
PH
698 'upload_date': '20110310',
699 'uploader_id': 'AllenMeow',
ec85ded8 700 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/AllenMeow',
6271f1ca 701 'description': 'made by Wacom from Korea | 字幕&加油添醋 by TY\'s Allen | 感謝heylisa00cavey1001同學熱情提供梗及翻譯',
eb6793ba 702 'uploader': '孫ᄋᄅ',
6271f1ca
PH
703 'title': '[A-made] 變態妍字幕版 太妍 我就是這樣的人',
704 },
06b491eb
S
705 },
706 # url_encoded_fmt_stream_map is empty string
707 {
708 'url': 'qEJwOuvDf7I',
709 'info_dict': {
710 'id': 'qEJwOuvDf7I',
f57b7835 711 'ext': 'webm',
06b491eb
S
712 'title': 'Обсуждение судебной практики по выборам 14 сентября 2014 года в Санкт-Петербурге',
713 'description': '',
714 'upload_date': '20150404',
715 'uploader_id': 'spbelect',
716 'uploader': 'Наблюдатели Петербурга',
717 },
718 'params': {
719 'skip_download': 'requires avconv',
e323cf3f
S
720 },
721 'skip': 'This live event has ended.',
06b491eb 722 },
067aa17e 723 # Extraction from multiple DASH manifests (https://github.com/ytdl-org/youtube-dl/pull/6097)
da77d856
S
724 {
725 'url': 'https://www.youtube.com/watch?v=FIl7x6_3R5Y',
726 'info_dict': {
727 'id': 'FIl7x6_3R5Y',
eb6793ba 728 'ext': 'webm',
da77d856
S
729 'title': 'md5:7b81415841e02ecd4313668cde88737a',
730 'description': 'md5:116377fd2963b81ec4ce64b542173306',
556dbe7f 731 'duration': 220,
da77d856
S
732 'upload_date': '20150625',
733 'uploader_id': 'dorappi2000',
ec85ded8 734 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/dorappi2000',
da77d856 735 'uploader': 'dorappi2000',
eb6793ba 736 'formats': 'mincount:31',
da77d856 737 },
eb6793ba 738 'skip': 'not actual anymore',
2ee8f5d8 739 },
8a1a26ce
YCH
740 # DASH manifest with segment_list
741 {
742 'url': 'https://www.youtube.com/embed/CsmdDsKjzN8',
743 'md5': '8ce563a1d667b599d21064e982ab9e31',
744 'info_dict': {
745 'id': 'CsmdDsKjzN8',
746 'ext': 'mp4',
17ee98e1 747 'upload_date': '20150501', # According to '<meta itemprop="datePublished"', but in other places it's 20150510
8a1a26ce
YCH
748 'uploader': 'Airtek',
749 'description': 'Retransmisión en directo de la XVIII media maratón de Zaragoza.',
750 'uploader_id': 'UCzTzUmjXxxacNnL8I3m4LnQ',
751 'title': 'Retransmisión XVIII Media maratón Zaragoza 2015',
752 },
753 'params': {
754 'youtube_include_dash_manifest': True,
755 'format': '135', # bestvideo
be49068d
S
756 },
757 'skip': 'This live event has ended.',
2ee8f5d8 758 },
cf7e015f
S
759 {
760 # Multifeed videos (multiple cameras), URL is for Main Camera
761 'url': 'https://www.youtube.com/watch?v=jqWvoWXjCVs',
762 'info_dict': {
763 'id': 'jqWvoWXjCVs',
764 'title': 'teamPGP: Rocket League Noob Stream',
765 'description': 'md5:dc7872fb300e143831327f1bae3af010',
766 },
767 'playlist': [{
768 'info_dict': {
769 'id': 'jqWvoWXjCVs',
770 'ext': 'mp4',
771 'title': 'teamPGP: Rocket League Noob Stream (Main Camera)',
772 'description': 'md5:dc7872fb300e143831327f1bae3af010',
556dbe7f 773 'duration': 7335,
cf7e015f
S
774 'upload_date': '20150721',
775 'uploader': 'Beer Games Beer',
776 'uploader_id': 'beergamesbeer',
ec85ded8 777 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/beergamesbeer',
7caf9830 778 'license': 'Standard YouTube License',
cf7e015f
S
779 },
780 }, {
781 'info_dict': {
782 'id': '6h8e8xoXJzg',
783 'ext': 'mp4',
784 'title': 'teamPGP: Rocket League Noob Stream (kreestuh)',
785 'description': 'md5:dc7872fb300e143831327f1bae3af010',
556dbe7f 786 'duration': 7337,
cf7e015f
S
787 'upload_date': '20150721',
788 'uploader': 'Beer Games Beer',
789 'uploader_id': 'beergamesbeer',
ec85ded8 790 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/beergamesbeer',
7caf9830 791 'license': 'Standard YouTube License',
cf7e015f
S
792 },
793 }, {
794 'info_dict': {
795 'id': 'PUOgX5z9xZw',
796 'ext': 'mp4',
797 'title': 'teamPGP: Rocket League Noob Stream (grizzle)',
798 'description': 'md5:dc7872fb300e143831327f1bae3af010',
556dbe7f 799 'duration': 7337,
cf7e015f
S
800 'upload_date': '20150721',
801 'uploader': 'Beer Games Beer',
802 'uploader_id': 'beergamesbeer',
ec85ded8 803 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/beergamesbeer',
7caf9830 804 'license': 'Standard YouTube License',
cf7e015f
S
805 },
806 }, {
807 'info_dict': {
808 'id': 'teuwxikvS5k',
809 'ext': 'mp4',
810 'title': 'teamPGP: Rocket League Noob Stream (zim)',
811 'description': 'md5:dc7872fb300e143831327f1bae3af010',
556dbe7f 812 'duration': 7334,
cf7e015f
S
813 'upload_date': '20150721',
814 'uploader': 'Beer Games Beer',
815 'uploader_id': 'beergamesbeer',
ec85ded8 816 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/beergamesbeer',
7caf9830 817 'license': 'Standard YouTube License',
cf7e015f
S
818 },
819 }],
820 'params': {
821 'skip_download': True,
822 },
4fe54c12 823 'skip': 'This video is not available.',
cbaed4bb 824 },
f9f49d87 825 {
067aa17e 826 # Multifeed video with comma in title (see https://github.com/ytdl-org/youtube-dl/issues/8536)
f9f49d87
S
827 'url': 'https://www.youtube.com/watch?v=gVfLd0zydlo',
828 'info_dict': {
829 'id': 'gVfLd0zydlo',
830 'title': 'DevConf.cz 2016 Day 2 Workshops 1 14:00 - 15:30',
831 },
832 'playlist_count': 2,
be49068d 833 'skip': 'Not multifeed anymore',
f9f49d87 834 },
cbaed4bb 835 {
2d3d2997 836 'url': 'https://vid.plus/FlRa-iH7PGw',
cbaed4bb 837 'only_matching': True,
0e49d9a6 838 },
6d4fc66b 839 {
2d3d2997 840 'url': 'https://zwearz.com/watch/9lWxNJF-ufM/electra-woman-dyna-girl-official-trailer-grace-helbig.html',
6d4fc66b
S
841 'only_matching': True,
842 },
0e49d9a6 843 {
067aa17e 844 # Title with JS-like syntax "};" (see https://github.com/ytdl-org/youtube-dl/issues/7468)
a8776b10 845 # Also tests cut-off URL expansion in video description (see
067aa17e
S
846 # https://github.com/ytdl-org/youtube-dl/issues/1892,
847 # https://github.com/ytdl-org/youtube-dl/issues/8164)
0e49d9a6
LL
848 'url': 'https://www.youtube.com/watch?v=lsguqyKfVQg',
849 'info_dict': {
850 'id': 'lsguqyKfVQg',
851 'ext': 'mp4',
852 'title': '{dark walk}; Loki/AC/Dishonored; collab w/Elflover21',
eb6793ba 853 'alt_title': 'Dark Walk - Position Music',
0e49d9a6 854 'description': 'md5:8085699c11dc3f597ce0410b0dcbb34a',
556dbe7f 855 'duration': 133,
0e49d9a6
LL
856 'upload_date': '20151119',
857 'uploader_id': 'IronSoulElf',
ec85ded8 858 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/IronSoulElf',
0e49d9a6 859 'uploader': 'IronSoulElf',
eb6793ba
S
860 'creator': 'Todd Haberman, Daniel Law Heath and Aaron Kaplan',
861 'track': 'Dark Walk - Position Music',
862 'artist': 'Todd Haberman, Daniel Law Heath and Aaron Kaplan',
92bc97d3 863 'album': 'Position Music - Production Music Vol. 143 - Dark Walk',
0e49d9a6
LL
864 },
865 'params': {
866 'skip_download': True,
867 },
868 },
61f92af1 869 {
067aa17e 870 # Tags with '};' (see https://github.com/ytdl-org/youtube-dl/issues/7468)
61f92af1
S
871 'url': 'https://www.youtube.com/watch?v=Ms7iBXnlUO8',
872 'only_matching': True,
873 },
313dfc45
LL
874 {
875 # Video with yt:stretch=17:0
876 'url': 'https://www.youtube.com/watch?v=Q39EVAstoRM',
877 'info_dict': {
878 'id': 'Q39EVAstoRM',
879 'ext': 'mp4',
880 'title': 'Clash Of Clans#14 Dicas De Ataque Para CV 4',
881 'description': 'md5:ee18a25c350637c8faff806845bddee9',
882 'upload_date': '20151107',
883 'uploader_id': 'UCCr7TALkRbo3EtFzETQF1LA',
884 'uploader': 'CH GAMER DROID',
885 },
886 'params': {
887 'skip_download': True,
888 },
be49068d 889 'skip': 'This video does not exist.',
313dfc45 890 },
7caf9830
S
891 {
892 # Video licensed under Creative Commons
893 'url': 'https://www.youtube.com/watch?v=M4gD1WSo5mA',
894 'info_dict': {
895 'id': 'M4gD1WSo5mA',
896 'ext': 'mp4',
897 'title': 'md5:e41008789470fc2533a3252216f1c1d1',
898 'description': 'md5:a677553cf0840649b731a3024aeff4cc',
556dbe7f 899 'duration': 721,
7caf9830
S
900 'upload_date': '20150127',
901 'uploader_id': 'BerkmanCenter',
ec85ded8 902 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/BerkmanCenter',
556dbe7f 903 'uploader': 'The Berkman Klein Center for Internet & Society',
7caf9830
S
904 'license': 'Creative Commons Attribution license (reuse allowed)',
905 },
906 'params': {
907 'skip_download': True,
908 },
909 },
fd050249
S
910 {
911 # Channel-like uploader_url
912 'url': 'https://www.youtube.com/watch?v=eQcmzGIKrzg',
913 'info_dict': {
914 'id': 'eQcmzGIKrzg',
915 'ext': 'mp4',
916 'title': 'Democratic Socialism and Foreign Policy | Bernie Sanders',
917 'description': 'md5:dda0d780d5a6e120758d1711d062a867',
556dbe7f 918 'duration': 4060,
fd050249 919 'upload_date': '20151119',
eb6793ba 920 'uploader': 'Bernie Sanders',
fd050249 921 'uploader_id': 'UCH1dpzjCEiGAt8CXkryhkZg',
ec85ded8 922 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UCH1dpzjCEiGAt8CXkryhkZg',
fd050249
S
923 'license': 'Creative Commons Attribution license (reuse allowed)',
924 },
925 'params': {
926 'skip_download': True,
927 },
928 },
040ac686
S
929 {
930 'url': 'https://www.youtube.com/watch?feature=player_embedded&amp;amp;v=V36LpHqtcDY',
931 'only_matching': True,
7f29cf54
S
932 },
933 {
067aa17e 934 # YouTube Red paid video (https://github.com/ytdl-org/youtube-dl/issues/10059)
7f29cf54
S
935 'url': 'https://www.youtube.com/watch?v=i1Ko8UG-Tdo',
936 'only_matching': True,
6496ccb4
S
937 },
938 {
939 # Rental video preview
940 'url': 'https://www.youtube.com/watch?v=yYr8q0y5Jfg',
941 'info_dict': {
942 'id': 'uGpuVWrhIzE',
943 'ext': 'mp4',
944 'title': 'Piku - Trailer',
945 'description': 'md5:c36bd60c3fd6f1954086c083c72092eb',
946 'upload_date': '20150811',
947 'uploader': 'FlixMatrix',
948 'uploader_id': 'FlixMatrixKaravan',
ec85ded8 949 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/FlixMatrixKaravan',
6496ccb4
S
950 'license': 'Standard YouTube License',
951 },
952 'params': {
953 'skip_download': True,
954 },
eb6793ba 955 'skip': 'This video is not available.',
022a5d66 956 },
12afdc2a
S
957 {
958 # YouTube Red video with episode data
959 'url': 'https://www.youtube.com/watch?v=iqKdEhx-dD4',
960 'info_dict': {
961 'id': 'iqKdEhx-dD4',
962 'ext': 'mp4',
963 'title': 'Isolation - Mind Field (Ep 1)',
4fe54c12 964 'description': 'md5:46a29be4ceffa65b92d277b93f463c0f',
556dbe7f 965 'duration': 2085,
12afdc2a
S
966 'upload_date': '20170118',
967 'uploader': 'Vsauce',
968 'uploader_id': 'Vsauce',
969 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/Vsauce',
12afdc2a
S
970 'series': 'Mind Field',
971 'season_number': 1,
972 'episode_number': 1,
973 },
974 'params': {
975 'skip_download': True,
976 },
977 'expected_warnings': [
978 'Skipping DASH manifest',
979 ],
980 },
c7121fa7
S
981 {
982 # The following content has been identified by the YouTube community
983 # as inappropriate or offensive to some audiences.
984 'url': 'https://www.youtube.com/watch?v=6SJNVb0GnPI',
985 'info_dict': {
986 'id': '6SJNVb0GnPI',
987 'ext': 'mp4',
988 'title': 'Race Differences in Intelligence',
989 'description': 'md5:5d161533167390427a1f8ee89a1fc6f1',
990 'duration': 965,
991 'upload_date': '20140124',
992 'uploader': 'New Century Foundation',
993 'uploader_id': 'UCEJYpZGqgUob0zVVEaLhvVg',
994 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UCEJYpZGqgUob0zVVEaLhvVg',
c7121fa7
S
995 },
996 'params': {
997 'skip_download': True,
998 },
999 },
022a5d66
S
1000 {
1001 # itag 212
1002 'url': '1t24XAntNCY',
1003 'only_matching': True,
fd5c4aab
S
1004 },
1005 {
1006 # geo restricted to JP
1007 'url': 'sJL6WA-aGkQ',
1008 'only_matching': True,
1009 },
d0ba5587
S
1010 {
1011 'url': 'https://www.youtube.com/watch?v=MuAGGZNfUkU&list=RDMM',
1012 'only_matching': True,
1013 },
cd5a74a2
S
1014 {
1015 'url': 'https://invidio.us/watch?v=BaW_jenozKc',
1016 'only_matching': True,
1017 },
825cd268
RA
1018 {
1019 # DRM protected
1020 'url': 'https://www.youtube.com/watch?v=s7_qI6_mIXc',
1021 'only_matching': True,
4fe54c12
S
1022 },
1023 {
1024 # Video with unsupported adaptive stream type formats
1025 'url': 'https://www.youtube.com/watch?v=Z4Vy8R84T1U',
1026 'info_dict': {
1027 'id': 'Z4Vy8R84T1U',
1028 'ext': 'mp4',
1029 'title': 'saman SMAN 53 Jakarta(Sancety) opening COFFEE4th at SMAN 53 Jakarta',
1030 'description': 'md5:d41d8cd98f00b204e9800998ecf8427e',
1031 'duration': 433,
1032 'upload_date': '20130923',
1033 'uploader': 'Amelia Putri Harwita',
1034 'uploader_id': 'UCpOxM49HJxmC1qCalXyB3_Q',
1035 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UCpOxM49HJxmC1qCalXyB3_Q',
1036 'formats': 'maxcount:10',
1037 },
1038 'params': {
1039 'skip_download': True,
1040 'youtube_include_dash_manifest': False,
1041 },
5429d6a9 1042 'skip': 'not actual anymore',
5caabd3c 1043 },
1044 {
822b9d9c 1045 # Youtube Music Auto-generated description
5caabd3c 1046 'url': 'https://music.youtube.com/watch?v=MgNrAu2pzNs',
1047 'info_dict': {
1048 'id': 'MgNrAu2pzNs',
1049 'ext': 'mp4',
1050 'title': 'Voyeur Girl',
1051 'description': 'md5:7ae382a65843d6df2685993e90a8628f',
1052 'upload_date': '20190312',
5429d6a9
S
1053 'uploader': 'Stephen - Topic',
1054 'uploader_id': 'UC-pWHpBjdGG69N9mM2auIAA',
5caabd3c 1055 'artist': 'Stephen',
1056 'track': 'Voyeur Girl',
1057 'album': 'it\'s too much love to know my dear',
1058 'release_date': '20190313',
1059 'release_year': 2019,
1060 },
1061 'params': {
1062 'skip_download': True,
1063 },
1064 },
1065 {
822b9d9c 1066 # Youtube Music Auto-generated description
5caabd3c 1067 # Retrieve 'artist' field from 'Artist:' in video description
1068 # when it is present on youtube music video
5caabd3c 1069 'url': 'https://www.youtube.com/watch?v=k0jLE7tTwjY',
1070 'info_dict': {
1071 'id': 'k0jLE7tTwjY',
1072 'ext': 'mp4',
1073 'title': 'Latch Feat. Sam Smith',
1074 'description': 'md5:3cb1e8101a7c85fcba9b4fb41b951335',
1075 'upload_date': '20150110',
1076 'uploader': 'Various Artists - Topic',
1077 'uploader_id': 'UCNkEcmYdjrH4RqtNgh7BZ9w',
1078 'artist': 'Disclosure',
1079 'track': 'Latch Feat. Sam Smith',
1080 'album': 'Latch Featuring Sam Smith',
1081 'release_date': '20121008',
1082 'release_year': 2012,
1083 },
1084 'params': {
1085 'skip_download': True,
1086 },
1087 },
1088 {
822b9d9c 1089 # Youtube Music Auto-generated description
5caabd3c 1090 # handle multiple artists on youtube music video
1091 'url': 'https://www.youtube.com/watch?v=74qn0eJSjpA',
1092 'info_dict': {
1093 'id': '74qn0eJSjpA',
1094 'ext': 'mp4',
1095 'title': 'Eastside',
1096 'description': 'md5:290516bb73dcbfab0dcc4efe6c3de5f2',
1097 'upload_date': '20180710',
1098 'uploader': 'Benny Blanco - Topic',
1099 'uploader_id': 'UCzqz_ksRu_WkIzmivMdIS7A',
1100 'artist': 'benny blanco, Halsey, Khalid',
1101 'track': 'Eastside',
1102 'album': 'Eastside',
1103 'release_date': '20180713',
1104 'release_year': 2018,
1105 },
1106 'params': {
1107 'skip_download': True,
1108 },
1109 },
1110 {
822b9d9c 1111 # Youtube Music Auto-generated description
5caabd3c 1112 # handle youtube music video with release_year and no release_date
1113 'url': 'https://www.youtube.com/watch?v=-hcAI0g-f5M',
1114 'info_dict': {
1115 'id': '-hcAI0g-f5M',
1116 'ext': 'mp4',
1117 'title': 'Put It On Me',
5429d6a9 1118 'description': 'md5:f6422397c07c4c907c6638e1fee380a5',
5caabd3c 1119 'upload_date': '20180426',
1120 'uploader': 'Matt Maeson - Topic',
1121 'uploader_id': 'UCnEkIGqtGcQMLk73Kp-Q5LQ',
1122 'artist': 'Matt Maeson',
1123 'track': 'Put It On Me',
1124 'album': 'The Hearse',
1125 'release_date': None,
1126 'release_year': 2018,
1127 },
1128 'params': {
1129 'skip_download': True,
1130 },
1131 },
66b48727
RA
1132 {
1133 'url': 'https://www.youtubekids.com/watch?v=3b8nCWDgZ6Q',
1134 'only_matching': True,
1135 },
011e75e6
S
1136 {
1137 # invalid -> valid video id redirection
1138 'url': 'DJztXj2GPfl',
1139 'info_dict': {
1140 'id': 'DJztXj2GPfk',
1141 'ext': 'mp4',
1142 'title': 'Panjabi MC - Mundian To Bach Ke (The Dictator Soundtrack)',
1143 'description': 'md5:bf577a41da97918e94fa9798d9228825',
1144 'upload_date': '20090125',
1145 'uploader': 'Prochorowka',
1146 'uploader_id': 'Prochorowka',
1147 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/Prochorowka',
1148 'artist': 'Panjabi MC',
1149 'track': 'Beware of the Boys (Mundian to Bach Ke) - Motivo Hi-Lectro Remix',
1150 'album': 'Beware of the Boys (Mundian To Bach Ke)',
1151 },
1152 'params': {
1153 'skip_download': True,
1154 },
ea74e00b
DP
1155 },
1156 {
1157 # empty description results in an empty string
1158 'url': 'https://www.youtube.com/watch?v=x41yOUIvK2k',
1159 'info_dict': {
1160 'id': 'x41yOUIvK2k',
1161 'ext': 'mp4',
1162 'title': 'IMG 3456',
1163 'description': '',
1164 'upload_date': '20170613',
1165 'uploader_id': 'ElevageOrVert',
1166 'uploader': 'ElevageOrVert',
1167 },
1168 'params': {
1169 'skip_download': True,
1170 },
1171 },
2eb88d95
PH
1172 ]
1173
e0df6211
PH
def __init__(self, *args, **kwargs):
    """Initialise the extractor and create its empty player cache."""
    super(YoutubeIE, self).__init__(*args, **kwargs)
    # Filled by _decrypt_signature: maps (player_url, signature layout id)
    # to the signature function extracted for that player.
    self._player_cache = dict()
e0df6211 1177
c5e8d7af
PH
def report_video_info_webpage_download(self, video_id):
    """Tell the user the video info webpage of *video_id* is being fetched."""
    message = '%s: Downloading video info webpage' % video_id
    self.to_screen(message)
c5e8d7af 1181
c5e8d7af
PH
def report_information_extraction(self, video_id):
    """Tell the user that information for *video_id* is being extracted."""
    message = '%s: Extracting video information' % video_id
    self.to_screen(message)
c5e8d7af
PH
1185
def report_unavailable_format(self, video_id, format):
    """Tell the user that *format* is not available for *video_id*."""
    message = '%s: Format %s not available' % (video_id, format)
    self.to_screen(message)
c5e8d7af
PH
1189
def report_rtmp_download(self):
    """Tell the user that the download will use the RTMP protocol."""
    message = 'RTMP download detected'
    self.to_screen(message)
c5e8d7af 1193
60064c53
PH
1194 def _signature_cache_id(self, example_sig):
1195 """ Return a string representation of a signature """
78caa52a 1196 return '.'.join(compat_str(len(part)) for part in example_sig.split('.'))
60064c53 1197
e40c758c
S
@classmethod
def _extract_player_info(cls, player_url):
    """Return the ``(ext, id)`` pair parsed out of *player_url*.

    The patterns in ``cls._PLAYER_INFO_RE`` are tried in order and the first
    one that matches supplies the 'ext' and 'id' groups.

    Raises ExtractorError if none of the patterns matches.
    """
    for pattern in cls._PLAYER_INFO_RE:
        match = re.search(pattern, player_url)
        if match is not None:
            return match.group('ext'), match.group('id')
    raise ExtractorError('Cannot identify player %r' % player_url)
1207
def _extract_signature_function(self, video_id, player_url, example_sig):
    """Return a callable that deciphers signatures shaped like *example_sig*.

    The 'youtube-sigfuncs' cache is consulted first. A cached entry is a list
    of source indices, and the returned function just reorders the
    characters of its argument accordingly. On a cache miss the player is
    downloaded, parsed (JS or SWF), probed with a test string to derive the
    index list, and the list is stored in the cache.

    video_id is only passed through to the download helpers; player_url
    must be recognised by _extract_player_info.

    Raises ExtractorError if the cache id is not a plain file name or the
    player type is neither 'js' nor 'swf' (in addition to whatever the
    helpers raise).
    """
    player_type, player_id = self._extract_player_info(player_url)

    # Read from filesystem cache
    func_id = '%s_%s_%s' % (
        player_type, player_id, self._signature_cache_id(example_sig))
    # The id must not contain any path component (presumably because the
    # cache uses it as a file name). This is a real runtime check, so raise
    # explicitly: an assert would be stripped under ``python -O``.
    if os.path.basename(func_id) != func_id:
        raise ExtractorError('Invalid signature cache id %r' % func_id)

    cache_spec = self._downloader.cache.load('youtube-sigfuncs', func_id)
    if cache_spec is not None:
        return lambda s: ''.join(s[i] for i in cache_spec)

    download_note = (
        'Downloading player %s' % player_url
        if self._downloader.params.get('verbose') else
        'Downloading %s player %s' % (player_type, player_id)
    )
    if player_type == 'js':
        code = self._download_webpage(
            player_url, video_id,
            note=download_note,
            errnote='Download of %s failed' % player_url)
        res = self._parse_sig_js(code)
    elif player_type == 'swf':
        urlh = self._request_webpage(
            player_url, video_id,
            note=download_note,
            errnote='Download of %s failed' % player_url)
        code = urlh.read()
        res = self._parse_sig_swf(code)
    else:
        # Without this guard ``res`` would be unbound below.
        raise ExtractorError('Invalid player type %r' % player_type)

    # Probe with a string whose n-th character has code point n, so the
    # code points of the output are the source indices to cache.
    test_string = ''.join(map(compat_chr, range(len(example_sig))))
    cache_res = res(test_string)
    cache_spec = [ord(c) for c in cache_res]

    self._downloader.cache.store('youtube-sigfuncs', func_id, cache_spec)
    return res
1247
def _print_sig_code(self, func, example_sig):
    """Print Python code equivalent to the signature function *func*.

    *func* is run on a probe string as long as *example_sig* whose n-th
    character has code point n, so the code points of the result are the
    source indices. Runs of consecutive indices (step +1 or -1) are rendered
    as slices of ``s``, everything else as single-item lookups, and the
    resulting expression is shown via to_screen.
    """
    def slice_expr(first, last, stride):
        # Slice covering first..last inclusive, hence the extra stride on
        # the upper bound; a bound below zero means "run to the end".
        lower = str(first) if first != 0 else ''
        bound = last + stride
        upper = (':%d' % bound) if bound >= 0 else ':'
        suffix = (':%d' % stride) if stride != 1 else ''
        return 's[%s%s%s]' % (lower, upper, suffix)

    def index_exprs(indices):
        stride = None
        # Squelch pyflakes warnings - run_start will be set when stride is set
        run_start = '(Never used)'
        for cur, prev in zip(indices[1:], indices[:-1]):
            delta = cur - prev
            if stride is None:
                if delta in (-1, 1):
                    # A new run starts at prev.
                    stride = delta
                    run_start = prev
                else:
                    yield 's[%d]' % prev
            elif delta != stride:
                # The run ended at prev; cur is handled by the next
                # iteration or by the tail below.
                yield slice_expr(run_start, prev, stride)
                stride = None
        if stride is None:
            yield 's[%d]' % cur
        else:
            yield slice_expr(run_start, cur, stride)

    probe = ''.join(map(compat_chr, range(len(example_sig))))
    permutation = [ord(ch) for ch in func(probe)]
    expr_code = ' + '.join(index_exprs(permutation))
    part_lengths = ', '.join(
        compat_str(len(p)) for p in example_sig.split('.'))
    signature_id_tuple = '(%s)' % part_lengths
    code = ('if tuple(len(p) for p in s.split(\'.\')) == %s:\n'
            ' return %s\n') % (signature_id_tuple, expr_code)
    self.to_screen('Extracted signature function:\n' + code)
edf3e38e 1286
e0df6211
PH
def _parse_sig_js(self, jscode):
    """Build the signature function from the JavaScript player source.

    The name of the deciphering function is located with the regexes below
    (tried in order), the function is extracted with JSInterpreter, and a
    one-argument wrapper around it is returned.
    """
    name_patterns = (
        r'\b[cs]\s*&&\s*[adf]\.set\([^,]+\s*,\s*encodeURIComponent\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\(',
        r'\b[a-zA-Z0-9]+\s*&&\s*[a-zA-Z0-9]+\.set\([^,]+\s*,\s*encodeURIComponent\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\(',
        r'(?:\b|[^a-zA-Z0-9$])(?P<sig>[a-zA-Z0-9$]{2})\s*=\s*function\(\s*a\s*\)\s*{\s*a\s*=\s*a\.split\(\s*""\s*\)',
        r'(?P<sig>[a-zA-Z0-9$]+)\s*=\s*function\(\s*a\s*\)\s*{\s*a\s*=\s*a\.split\(\s*""\s*\)',
        # Obsolete patterns
        r'(["\'])signature\1\s*,\s*(?P<sig>[a-zA-Z0-9$]+)\(',
        r'\.sig\|\|(?P<sig>[a-zA-Z0-9$]+)\(',
        r'yt\.akamaized\.net/\)\s*\|\|\s*.*?\s*[cs]\s*&&\s*[adf]\.set\([^,]+\s*,\s*(?:encodeURIComponent\s*\()?\s*(?P<sig>[a-zA-Z0-9$]+)\(',
        r'\b[cs]\s*&&\s*[adf]\.set\([^,]+\s*,\s*(?P<sig>[a-zA-Z0-9$]+)\(',
        r'\b[a-zA-Z0-9]+\s*&&\s*[a-zA-Z0-9]+\.set\([^,]+\s*,\s*(?P<sig>[a-zA-Z0-9$]+)\(',
        r'\bc\s*&&\s*a\.set\([^,]+\s*,\s*\([^)]*\)\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\(',
        r'\bc\s*&&\s*[a-zA-Z0-9]+\.set\([^,]+\s*,\s*\([^)]*\)\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\(',
        # NOTE(review): identical to the previous pattern; kept as found.
        r'\bc\s*&&\s*[a-zA-Z0-9]+\.set\([^,]+\s*,\s*\([^)]*\)\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\(',
    )
    funcname = self._search_regex(
        name_patterns, jscode,
        'Initial JS player signature function name', group='sig')

    decipher = JSInterpreter(jscode).extract_function(funcname)

    def apply_decipher(s):
        # The extracted function takes its arguments as a list.
        return decipher([s])

    return apply_decipher
1307
def _parse_sig_swf(self, file_contents):
    """Build the signature function from the contents of an SWF player.

    The 'decipher' function of the 'SignatureDecipher' class is extracted
    with SWFInterpreter and wrapped in a one-argument callable.
    """
    TARGET_CLASSNAME = 'SignatureDecipher'
    interpreter = SWFInterpreter(file_contents)
    decipher_class = interpreter.extract_class(TARGET_CLASSNAME)
    decipher = interpreter.extract_function(decipher_class, 'decipher')

    def apply_decipher(s):
        # The extracted function takes its arguments as a list.
        return decipher([s])

    return apply_decipher
1314
def _decrypt_signature(self, s, video_id, player_url, age_gate=False):
    """Turn the encrypted s field into a working signature.

    player_url may be protocol-relative ('//...') or relative to
    https://www.youtube.com; it is made absolute before use. Signature
    functions are kept in self._player_cache, keyed by the absolute player
    URL and the layout id of *s*. age_gate is accepted but not used here.

    Raises ExtractorError if player_url is None, or (with the formatted
    traceback in the message) if anything fails during extraction.
    """
    if player_url is None:
        raise ExtractorError('Cannot decrypt signature without player_url')

    if player_url.startswith('//'):
        player_url = 'https:' + player_url
    elif not re.match(r'https?://', player_url):
        player_url = compat_urlparse.urljoin(
            'https://www.youtube.com', player_url)

    try:
        cache_key = (player_url, self._signature_cache_id(s))
        if cache_key in self._player_cache:
            decipher = self._player_cache[cache_key]
        else:
            decipher = self._extract_signature_function(
                video_id, player_url, s)
            self._player_cache[cache_key] = decipher
        if self._downloader.params.get('youtube_print_sig_code'):
            self._print_sig_code(decipher, s)
        return decipher(s)
    except Exception as e:
        raise ExtractorError(
            'Signature extraction failed: ' + traceback.format_exc(),
            cause=e)
e0df6211 1341
def _get_subtitles(self, video_id, webpage, has_live_chat_replay):
    """Return the subtitles available for *video_id*.

    The track list is fetched from the timedtext listing. Each language (the
    first track per language code wins) maps to a list of ``{'url', 'ext'}``
    dicts, one per entry of ``self._SUBTITLE_FORMATS``. If
    has_live_chat_replay is true, a 'live_chat' pseudo-language is added.
    webpage is not used here.

    Returns an empty dict, after a warning, if the listing cannot be
    downloaded or nothing was found.
    """
    try:
        subs_doc = self._download_xml(
            'https://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id,
            video_id, note=False)
    except ExtractorError as err:
        self._downloader.report_warning('unable to download video subtitles: %s' % error_to_compat_str(err))
        return {}

    subtitles = {}
    for track in subs_doc.findall('track'):
        lang = track.attrib['lang_code']
        if lang in subtitles:
            continue
        subtitles[lang] = [{
            'url': 'https://www.youtube.com/api/timedtext?' + compat_urllib_parse_urlencode({
                'lang': lang,
                'v': video_id,
                'fmt': ext,
                'name': track.attrib['name'].encode('utf-8'),
            }),
            'ext': ext,
        } for ext in self._SUBTITLE_FORMATS]

    if has_live_chat_replay:
        live_chat_entry = {
            'video_id': video_id,
            'ext': 'json',
            'protocol': 'youtube_live_chat_replay',
        }
        subtitles['live_chat'] = [live_chat_entry]

    if subtitles:
        return subtitles
    self._downloader.report_warning('video doesn\'t have subtitles')
    return {}
1381
a72778d3
S
def _get_ytplayer_config(self, video_id, webpage):
    """Return the parsed ``ytplayer.config`` object found in *webpage*.

    Returns None if no config is found; JSON parsing is non-fatal.
    """
    # User data may contain arbitrary character sequences that can break
    # regex-based JSON extraction, e.g. when '};' is contained the second
    # regex won't capture the whole JSON. As a workaround the more concrete
    # regex is tried first, until proper quoted-string handling is
    # implemented (see
    # https://github.com/ytdl-org/youtube-dl/issues/7468,
    # https://github.com/ytdl-org/youtube-dl/pull/7599)
    patterns = (
        r';ytplayer\.config\s*=\s*({.+?});ytplayer',
        r';ytplayer\.config\s*=\s*({.+?});',
    )
    config = self._search_regex(
        patterns, webpage, 'ytplayer.config', default=None)
    if not config:
        return None
    return self._parse_json(
        uppercase_escape(config), video_id, fatal=False)
0e49d9a6 1399
def _get_yt_initial_data(self, video_id, webpage):
    """Return the parsed ``ytInitialData`` object found in *webpage*.

    Both the ``window["ytInitialData"] = ...`` and the
    ``var ytInitialData = ...`` forms are recognised. Returns None if
    neither is found; JSON parsing is non-fatal.
    """
    patterns = (
        r'window\["ytInitialData"\]\s*=\s*(.*?)(?<=});',
        r'var\s+ytInitialData\s*=\s*(.*?)(?<=});',
    )
    config = self._search_regex(
        patterns, webpage, 'ytInitialData', default=None)
    if not config:
        return None
    return self._parse_json(
        uppercase_escape(config), video_id, fatal=False)
1408
360e1ca5 1409 def _get_automatic_captions(self, video_id, webpage):
de7f3446
JMF
1410 """We need the webpage for getting the captions url, pass it as an
1411 argument to speed up the process."""
69ea8ca4 1412 self.to_screen('%s: Looking for automatic captions' % video_id)
a72778d3 1413 player_config = self._get_ytplayer_config(video_id, webpage)
78caa52a 1414 err_msg = 'Couldn\'t find automatic captions for %s' % video_id
a72778d3 1415 if not player_config:
de7f3446
JMF
1416 self._downloader.report_warning(err_msg)
1417 return {}
de7f3446 1418 try:
0792d563 1419 args = player_config['args']
b78b292f
S
1420 caption_url = args.get('ttsurl')
1421 if caption_url:
1422 timestamp = args['timestamp']
1423 # We get the available subtitles
15707c7e 1424 list_params = compat_urllib_parse_urlencode({
b78b292f
S
1425 'type': 'list',
1426 'tlangs': 1,
1427 'asrs': 1,
1428 })
1429 list_url = caption_url + '&' + list_params
1430 caption_list = self._download_xml(list_url, video_id)
1431 original_lang_node = caption_list.find('track')
1432 if original_lang_node is None:
1433 self._downloader.report_warning('Video doesn\'t have automatic captions')
1434 return {}
1435 original_lang = original_lang_node.attrib['lang_code']
1436 caption_kind = original_lang_node.attrib.get('kind', '')
1437
1438 sub_lang_list = {}
1439 for lang_node in caption_list.findall('target'):
1440 sub_lang = lang_node.attrib['lang_code']
1441 sub_formats = []
1442 for ext in self._SUBTITLE_FORMATS:
15707c7e 1443 params = compat_urllib_parse_urlencode({
b78b292f
S
1444 'lang': original_lang,
1445 'tlang': sub_lang,
1446 'fmt': ext,
1447 'ts': timestamp,
1448 'kind': caption_kind,
1449 })
1450 sub_formats.append({
1451 'url': caption_url + '&' + params,
1452 'ext': ext,
1453 })
1454 sub_lang_list[sub_lang] = sub_formats
1455 return sub_lang_list
1456
ddbb4c5c
S
1457 def make_captions(sub_url, sub_langs):
1458 parsed_sub_url = compat_urllib_parse_urlparse(sub_url)
1459 caption_qs = compat_parse_qs(parsed_sub_url.query)
1460 captions = {}
1461 for sub_lang in sub_langs:
1462 sub_formats = []
1463 for ext in self._SUBTITLE_FORMATS:
1464 caption_qs.update({
1465 'tlang': [sub_lang],
1466 'fmt': [ext],
1467 })
1468 sub_url = compat_urlparse.urlunparse(parsed_sub_url._replace(
1469 query=compat_urllib_parse_urlencode(caption_qs, True)))
1470 sub_formats.append({
1471 'url': sub_url,
1472 'ext': ext,
1473 })
1474 captions[sub_lang] = sub_formats
1475 return captions
1476
1477 # New captions format as of 22.06.2017
1478 player_response = args.get('player_response')
1479 if player_response and isinstance(player_response, compat_str):
1480 player_response = self._parse_json(
1481 player_response, video_id, fatal=False)
1482 if player_response:
1483 renderer = player_response['captions']['playerCaptionsTracklistRenderer']
7e1cf1a4 1484 caption_tracks = renderer['captionTracks']
1485 for caption_track in caption_tracks:
1486 if 'kind' not in caption_track:
1487 # not an automatic transcription
1488 continue
1489 base_url = caption_track['baseUrl']
1490 sub_lang_list = []
1491 for lang in renderer['translationLanguages']:
1492 lang_code = lang.get('languageCode')
1493 if lang_code:
1494 sub_lang_list.append(lang_code)
1495 return make_captions(base_url, sub_lang_list)
bc842c27 1496
7e1cf1a4 1497 self._downloader.report_warning("Couldn't find automatic captions for %s" % video_id)
1498 return {}
b78b292f
S
1499 # Some videos don't provide ttsurl but rather caption_tracks and
1500 # caption_translation_languages (e.g. 20LmZk1hakA)
ddbb4c5c 1501 # Does not used anymore as of 22.06.2017
b78b292f
S
1502 caption_tracks = args['caption_tracks']
1503 caption_translation_languages = args['caption_translation_languages']
1504 caption_url = compat_parse_qs(caption_tracks.split(',')[0])['u'][0]
ddbb4c5c 1505 sub_lang_list = []
b78b292f
S
1506 for lang in caption_translation_languages.split(','):
1507 lang_qs = compat_parse_qs(compat_urllib_parse_unquote_plus(lang))
1508 sub_lang = lang_qs.get('lc', [None])[0]
ddbb4c5c
S
1509 if sub_lang:
1510 sub_lang_list.append(sub_lang)
1511 return make_captions(caption_url, sub_lang_list)
de7f3446
JMF
1512 # An extractor error can be raised by the download process if there are
1513 # no automatic captions but there are subtitles
ddbb4c5c 1514 except (KeyError, IndexError, ExtractorError):
de7f3446
JMF
1515 self._downloader.report_warning(err_msg)
1516 return {}
1517
21c340b8
S
def _mark_watched(self, video_id, video_info, player_response):
    """Request the playback-tracking URL of a video ('Marking watched').

    The base URL is read from
    player_response['playbackTracking']['videostatsPlaybackUrl']['baseUrl']
    and, when that is missing, from
    video_info['videostats_playback_base_url'][0].  Nothing happens when
    neither source yields a usable URL.  The request itself is non-fatal.
    """
    stats_url = try_get(
        player_response,
        lambda x: x['playbackTracking']['videostatsPlaybackUrl']['baseUrl'])
    if not stats_url:
        stats_url = try_get(
            video_info, lambda x: x['videostats_playback_base_url'][0])
    stats_url = url_or_none(stats_url)
    if not stats_url:
        return

    url_parts = compat_urlparse.urlparse(stats_url)
    query = compat_urlparse.parse_qs(url_parts.query)

    # cpn generation algorithm is reverse engineered from base.js.
    # In fact it works even with dummy cpn.
    alphabet = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789-_'
    cpn_chars = []
    for _ in range(16):
        # "& 63" keeps the index inside the 64-character alphabet
        cpn_chars.append(alphabet[random.randint(0, 256) & 63])

    query['ver'] = ['2']
    query['cpn'] = [''.join(cpn_chars)]

    tracking_url = compat_urlparse.urlunparse(url_parts._replace(
        query=compat_urllib_parse_urlencode(query, True)))

    self._download_webpage(
        tracking_url, video_id, 'Marking watched',
        'Unable to mark watched', fatal=False)
1543
66c9fa36
S
@staticmethod
def _extract_urls(webpage):
    """Collect YouTube embeds referenced in *webpage*.

    Three sources are scanned, in this order:

    * embedded-player markup (iframe/embed/object tags, data-video-url
      attributes and SWFObject calls) -- yields HTML-unescaped URLs;
    * lazyYT embeds -- yields the HTML-unescaped data-youtube-id values;
    * the Wordpress "YouTube Video Importer" plugin -- yields the
      data-video_id values.

    Returns a list of strings (possibly empty).
    """
    # Embedded YouTube player
    # NOTE: the embedSWF alternative used to read "embedSWF\(?:\s*", which
    # in verbose mode means "optional '(' followed by a literal ':'" and
    # therefore never matched a real "embedSWF(" call.
    entries = [
        unescapeHTML(mobj.group('url'))
        for mobj in re.finditer(r'''(?x)
        (?:
            <iframe[^>]+?src=|
            data-video-url=|
            <embed[^>]+?src=|
            embedSWF\(\s*|
            <object[^>]+data=|
            new\s+SWFObject\(
        )
        (["\'])
            (?P<url>(?:https?:)?//(?:www\.)?youtube(?:-nocookie)?\.com/
            (?:embed|v|p)/[0-9A-Za-z_-]{11}.*?)
        \1''', webpage)]

    # lazyYT YouTube embed
    entries.extend(map(
        unescapeHTML,
        re.findall(r'class="lazyYT" data-youtube-id="([^"]+)"', webpage)))

    # Wordpress "YouTube Video Importer" plugin
    matches = re.findall(r'''(?x)<div[^>]+
        class=(?P<q1>[\'"])[^\'"]*\byvii_single_video_player\b[^\'"]*(?P=q1)[^>]+
        data-video_id=(?P<q2>[\'"])([^\'"]+)(?P=q2)''', webpage)
    # Each match is a (q1, q2, video_id) tuple; only the last item is wanted
    entries.extend(m[-1] for m in matches)

    return entries
1575
@staticmethod
def _extract_url(webpage):
    """Return the first entry reported by YoutubeIE._extract_urls, or None."""
    found = YoutubeIE._extract_urls(webpage)
    if not found:
        return None
    return found[0]
1580
97665381
PH
@classmethod
def extract_id(cls, url):
    """Return the second capture group of cls._VALID_URL matched on *url*.

    Raises ExtractorError when *url* does not match the pattern.
    """
    match = re.match(cls._VALID_URL, url, re.VERBOSE)
    if match is not None:
        return match.group(2)
    raise ExtractorError('Invalid URL: %s' % url)
1588
84213ea8
S
def _extract_chapters_from_json(self, webpage, video_id, duration):
    """Build a chapter list from the ytInitialData JSON embedded in *webpage*.

    Returns None when the page, the JSON or the chapter list is missing;
    otherwise a list of dicts with 'start_time', 'end_time' and 'title'.
    A chapter ends where the next one starts; the last one ends at
    *duration*.  Chapters whose start or end cannot be determined are
    left out.
    """
    if not webpage:
        return
    raw_json = self._search_regex(
        r'window\["ytInitialData"\] = (.+);\n', webpage,
        'player args', default='{}')
    initial_data = self._parse_json(raw_json, video_id, fatal=False)
    if not (initial_data and isinstance(initial_data, dict)):
        return

    def find_chapters(data):
        player_bar = (
            data['playerOverlays']
            ['playerOverlayRenderer']
            ['decoratedPlayerBarRenderer']
            ['decoratedPlayerBarRenderer']
            ['playerBar'])
        return player_bar['chapteredPlayerBarRenderer']['chapters']

    chapters_list = try_get(initial_data, find_chapters, list)
    if not chapters_list:
        return

    def start_seconds(entry):
        # timeRangeStartMillis is in milliseconds; scale down to seconds
        millis = try_get(
            entry,
            lambda x: x['chapterRenderer']['timeRangeStartMillis'],
            int)
        return float_or_none(millis, scale=1000)

    total = len(chapters_list)
    result = []
    for index, entry in enumerate(chapters_list):
        begin = start_seconds(entry)
        if begin is None:
            continue
        following = index + 1
        if following < total:
            finish = start_seconds(chapters_list[following])
        else:
            finish = duration
        if finish is None:
            continue
        result.append({
            'start_time': begin,
            'end_time': finish,
            'title': try_get(
                entry,
                lambda x: x['chapterRenderer']['title']['simpleText'],
                compat_str),
        })
    return result
1637
@staticmethod
def _extract_chapters_from_description(description, duration):
    """Build a chapter list from timestamp links in an HTML description.

    A chapter line is a "<br />"-delimited piece of *description* that
    contains an <a onclick="yt.www.watch.player.seekTo..."> link whose text
    is a timestamp (m:ss or h:mm:ss).  The chapter title is the line with
    the link removed.

    *duration* is the video length in seconds and may be None.  When it is
    known, chapters starting after it end the scan and end times are capped
    to it.  When it is None no capping is done and the last chapter, whose
    end cannot be determined, is left out.

    Returns None when there is no description or no chapter line, otherwise
    a (possibly empty) list of dicts with 'start_time', 'end_time', 'title'.
    """
    if not description:
        return None
    chapter_lines = re.findall(
        r'(?:^|<br\s*/>)([^<]*<a[^>]+onclick=["\']yt\.www\.watch\.player\.seekTo[^>]+>(\d{1,2}:\d{1,2}(?::\d{1,2})?)</a>[^>]*)(?=$|<br\s*/>)',
        description)
    if not chapter_lines:
        return None
    chapters = []
    for next_num, (chapter_line, time_point) in enumerate(
            chapter_lines, start=1):
        start_time = parse_duration(time_point)
        if start_time is None:
            continue
        # Comparing a number with None raises TypeError on Python 3, so
        # only apply duration-based checks when the duration is known.
        if duration is not None and start_time > duration:
            break
        end_time = (duration if next_num == len(chapter_lines)
                    else parse_duration(chapter_lines[next_num][1]))
        if end_time is None:
            continue
        if duration is not None and end_time > duration:
            end_time = duration
        if start_time > end_time:
            # Timestamps are no longer increasing: stop here
            break
        chapter_title = re.sub(
            r'<a[^>]+>[^<]+</a>', '', chapter_line).strip(' \t-')
        chapter_title = re.sub(r'\s+', ' ', chapter_title)
        chapters.append({
            'start_time': start_time,
            'end_time': end_time,
            'title': chapter_title,
        })
    return chapters
1672
84213ea8
S
def _extract_chapters(self, webpage, description, video_id, duration):
    """Return chapters from the page JSON, else from the description.

    The description-based extraction is only attempted when the JSON-based
    one yields nothing (None or an empty list).
    """
    chapters = self._extract_chapters_from_json(webpage, video_id, duration)
    if chapters:
        return chapters
    return self._extract_chapters_from_description(description, duration)
1676
c5e8d7af 1677 def _real_extract(self, url):
cf7e015f
S
1678 url, smuggled_data = unsmuggle_url(url, {})
1679
7e8c0af0 1680 proto = (
78caa52a
PH
1681 'http' if self._downloader.params.get('prefer_insecure', False)
1682 else 'https')
7e8c0af0 1683
7c80519c 1684 start_time = None
297a564b 1685 end_time = None
7c80519c
JMF
1686 parsed_url = compat_urllib_parse_urlparse(url)
1687 for component in [parsed_url.fragment, parsed_url.query]:
1688 query = compat_parse_qs(component)
297a564b 1689 if start_time is None and 't' in query:
7c80519c 1690 start_time = parse_duration(query['t'][0])
2929fa0e
JMF
1691 if start_time is None and 'start' in query:
1692 start_time = parse_duration(query['start'][0])
297a564b
JMF
1693 if end_time is None and 'end' in query:
1694 end_time = parse_duration(query['end'][0])
7c80519c 1695
c5e8d7af
PH
1696 # Extract original video URL from URL with redirection, like age verification, using next_url parameter
1697 mobj = re.search(self._NEXT_URL_RE, url)
1698 if mobj:
7fd002c0 1699 url = proto + '://www.youtube.com/' + compat_urllib_parse_unquote(mobj.group(1)).lstrip('/')
97665381 1700 video_id = self.extract_id(url)
c5e8d7af
PH
1701
1702 # Get video webpage
aa79ac0c 1703 url = proto + '://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1&bpctr=9999999999' % video_id
011e75e6
S
1704 video_webpage, urlh = self._download_webpage_handle(url, video_id)
1705
1706 qs = compat_parse_qs(compat_urllib_parse_urlparse(urlh.geturl()).query)
1707 video_id = qs.get('v', [None])[0] or video_id
c5e8d7af
PH
1708
1709 # Attempt to extract SWF player URL
e0df6211 1710 mobj = re.search(r'swfConfig.*?"(https?:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
c5e8d7af
PH
1711 if mobj is not None:
1712 player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
1713 else:
1714 player_url = None
1715
d8d24a92
S
1716 dash_mpds = []
1717
1718 def add_dash_mpd(video_info):
1719 dash_mpd = video_info.get('dashmpd')
1720 if dash_mpd and dash_mpd[0] not in dash_mpds:
1721 dash_mpds.append(dash_mpd[0])
1722
561b456e
S
1723 def add_dash_mpd_pr(pl_response):
1724 dash_mpd = url_or_none(try_get(
1725 pl_response, lambda x: x['streamingData']['dashManifestUrl'],
1726 compat_str))
1727 if dash_mpd and dash_mpd not in dash_mpds:
1728 dash_mpds.append(dash_mpd)
1729
c7121fa7
S
1730 is_live = None
1731 view_count = None
1732
1733 def extract_view_count(v_info):
1734 return int_or_none(try_get(v_info, lambda x: x['view_count'][0]))
1735
c2d125d9
S
1736 def extract_player_response(player_response, video_id):
1737 pl_response = str_or_none(player_response)
1738 if not pl_response:
1739 return
1740 pl_response = self._parse_json(pl_response, video_id, fatal=False)
1741 if isinstance(pl_response, dict):
1742 add_dash_mpd_pr(pl_response)
1743 return pl_response
1744
dbdaaa23
S
1745 player_response = {}
1746
c5e8d7af 1747 # Get video info
43ebf77d 1748 video_info = {}
6449cd80 1749 embed_webpage = None
39e7107d
U
1750 if (self._og_search_property('restrictions:age', video_webpage, default=None) == '18+'
1751 or re.search(r'player-age-gate-content">', video_webpage) is not None):
9d9314cb 1752 cookie_keys = self._get_cookies('https://www.youtube.com').keys()
c108eb73
JMF
1753 age_gate = True
1754 # We simulate the access to the video from www.youtube.com/v/{video_id}
1755 # this can be viewed without login into Youtube
beb95e77
CL
1756 url = proto + '://www.youtube.com/embed/%s' % video_id
1757 embed_webpage = self._download_webpage(url, video_id, 'Downloading embed webpage')
9d9314cb
U
1758 # check if video is only playable on youtube - if so it requires auth (cookies)
1759 if re.search(r'player-unavailable">', embed_webpage) is not None:
c73baf23
U
1760 '''
1761 # TODO apply this patch when Support for Python 2.6(!) and above drops
9d9314cb 1762 if ({'VISITOR_INFO1_LIVE', 'HSID', 'SSID', 'SID'} <= cookie_keys
4bb9c880 1763 or {'VISITOR_INFO1_LIVE', '__Secure-3PSID', 'LOGIN_INFO'} <= cookie_keys):
c73baf23
U
1764 '''
1765 if (set(('VISITOR_INFO1_LIVE', 'HSID', 'SSID', 'SID')) <= set(cookie_keys)
1766 or set(('VISITOR_INFO1_LIVE', '__Secure-3PSID', 'LOGIN_INFO')) <= set(cookie_keys)):
4bb9c880
U
1767 age_gate = False
1768 # Try looking directly into the video webpage
1769 ytplayer_config = self._get_ytplayer_config(video_id, video_webpage)
1770 if ytplayer_config:
1771 args = ytplayer_config['args']
1772 if args.get('url_encoded_fmt_stream_map') or args.get('hlsvp'):
1773 # Convert to the same format returned by compat_parse_qs
1774 video_info = dict((k, [v]) for k, v in args.items())
1775 add_dash_mpd(video_info)
1776 # Rental video is not rented but preview is available (e.g.
1777 # https://www.youtube.com/watch?v=yYr8q0y5Jfg,
1778 # https://github.com/ytdl-org/youtube-dl/issues/10532)
1779 if not video_info and args.get('ypc_vid'):
1780 return self.url_result(
1781 args['ypc_vid'], YoutubeIE.ie_key(), video_id=args['ypc_vid'])
1782 if args.get('livestream') == '1' or args.get('live_playback') == 1:
1783 is_live = True
1784 if not player_response:
1785 player_response = extract_player_response(args.get('player_response'), video_id)
1786 if not video_info or self._downloader.params.get('youtube_include_dash_manifest', True):
1787 add_dash_mpd_pr(player_response)
9d9314cb
U
1788 else:
1789 raise ExtractorError('Video is age restricted and only playable on Youtube. Requires cookies!', expected=True)
1790 else:
1791 data = compat_urllib_parse_urlencode({
1792 'video_id': video_id,
1793 'eurl': 'https://youtube.googleapis.com/v/' + video_id,
1794 'sts': self._search_regex(
1795 r'"sts"\s*:\s*(\d+)', embed_webpage, 'sts', default=''),
1796 })
1797 video_info_url = proto + '://www.youtube.com/get_video_info?' + data
1798 try:
1799 video_info_webpage = self._download_webpage(
1800 video_info_url, video_id,
1801 note='Refetching age-gated info webpage',
1802 errnote='unable to download video info webpage')
1803 except ExtractorError:
1804 video_info_webpage = None
1805 if video_info_webpage:
1806 video_info = compat_parse_qs(video_info_webpage)
1807 pl_response = video_info.get('player_response', [None])[0]
1808 player_response = extract_player_response(pl_response, video_id)
1809 add_dash_mpd(video_info)
1810 view_count = extract_view_count(video_info)
c108eb73
JMF
1811 else:
1812 age_gate = False
d8d24a92 1813 # Try looking directly into the video webpage
a72778d3
S
1814 ytplayer_config = self._get_ytplayer_config(video_id, video_webpage)
1815 if ytplayer_config:
4e62ebe2 1816 args = ytplayer_config['args']
4c76aa06 1817 if args.get('url_encoded_fmt_stream_map') or args.get('hlsvp'):
d8d24a92
S
1818 # Convert to the same format returned by compat_parse_qs
1819 video_info = dict((k, [v]) for k, v in args.items())
1820 add_dash_mpd(video_info)
6496ccb4
S
1821 # Rental video is not rented but preview is available (e.g.
1822 # https://www.youtube.com/watch?v=yYr8q0y5Jfg,
067aa17e 1823 # https://github.com/ytdl-org/youtube-dl/issues/10532)
6496ccb4
S
1824 if not video_info and args.get('ypc_vid'):
1825 return self.url_result(
1826 args['ypc_vid'], YoutubeIE.ie_key(), video_id=args['ypc_vid'])
2fe1ff85
JMF
1827 if args.get('livestream') == '1' or args.get('live_playback') == 1:
1828 is_live = True
dbdaaa23 1829 if not player_response:
c2d125d9 1830 player_response = extract_player_response(args.get('player_response'), video_id)
0a3cf9ad 1831 if not video_info or self._downloader.params.get('youtube_include_dash_manifest', True):
561b456e 1832 add_dash_mpd_pr(player_response)
bbb7c3f7
YCH
1833
1834 def extract_unavailable_message():
0add33ab
S
1835 messages = []
1836 for tag, kind in (('h1', 'message'), ('div', 'submessage')):
1837 msg = self._html_search_regex(
1838 r'(?s)<{tag}[^>]+id=["\']unavailable-{kind}["\'][^>]*>(.+?)</{tag}>'.format(tag=tag, kind=kind),
1839 video_webpage, 'unavailable %s' % kind, default=None)
1840 if msg:
1841 messages.append(msg)
1842 if messages:
1843 return '\n'.join(messages)
bbb7c3f7 1844
f93abcf1 1845 if not video_info and not player_response:
15be3eb5
RA
1846 unavailable_message = extract_unavailable_message()
1847 if not unavailable_message:
1848 unavailable_message = 'Unable to extract video data'
1849 raise ExtractorError(
1850 'YouTube said: %s' % unavailable_message, expected=True, video_id=video_id)
1851
f93abcf1
S
1852 if not isinstance(video_info, dict):
1853 video_info = {}
1854
dbdaaa23
S
1855 video_details = try_get(
1856 player_response, lambda x: x['videoDetails'], dict) or {}
1857
37357d21
S
1858 microformat = try_get(
1859 player_response, lambda x: x['microformat']['playerMicroformatRenderer'], dict) or {}
1860
8dbf751a
RA
1861 video_title = video_info.get('title', [None])[0] or video_details.get('title')
1862 if not video_title:
cf7e015f
S
1863 self._downloader.report_warning('Unable to extract video title')
1864 video_title = '_'
1865
9cafc3fd 1866 description_original = video_description = get_element_by_id("eow-description", video_webpage)
cf7e015f 1867 if video_description:
fa4bc6e7
RA
1868
1869 def replace_url(m):
1870 redir_url = compat_urlparse.urljoin(url, m.group(1))
1871 parsed_redir_url = compat_urllib_parse_urlparse(redir_url)
1872 if re.search(r'^(?:www\.)?(?:youtube(?:-nocookie)?\.com|youtu\.be)$', parsed_redir_url.netloc) and parsed_redir_url.path == '/redirect':
1873 qs = compat_parse_qs(parsed_redir_url.query)
1874 q = qs.get('q')
1875 if q and q[0]:
1876 return q[0]
1877 return redir_url
1878
9cafc3fd 1879 description_original = video_description = re.sub(r'''(?x)
cf7e015f 1880 <a\s+
25cb7a0e 1881 (?:[a-zA-Z-]+="[^"]*"\s+)*?
23f13e97 1882 (?:title|href)="([^"]+)"\s+
25cb7a0e 1883 (?:[a-zA-Z-]+="[^"]*"\s+)*?
525cedb9 1884 class="[^"]*"[^>]*>
23f13e97 1885 [^<]+\.{3}\s*
cf7e015f 1886 </a>
fa4bc6e7 1887 ''', replace_url, video_description)
cf7e015f
S
1888 video_description = clean_html(video_description)
1889 else:
ea74e00b
DP
1890 video_description = video_details.get('shortDescription')
1891 if video_description is None:
1892 video_description = self._html_search_meta('description', video_webpage)
cf7e015f 1893
8fe10494 1894 if not smuggled_data.get('force_singlefeed', False):
5e1eddb9 1895 if not self._downloader.params.get('noplaylist'):
8fe10494
S
1896 multifeed_metadata_list = try_get(
1897 player_response,
1898 lambda x: x['multicamera']['playerLegacyMulticameraRenderer']['metadataList'],
1899 compat_str) or try_get(
1900 video_info, lambda x: x['multifeed_metadata_list'][0], compat_str)
1901 if multifeed_metadata_list:
1902 entries = []
1903 feed_ids = []
1904 for feed in multifeed_metadata_list.split(','):
1905 # Unquote should take place before split on comma (,) since textual
1906 # fields may contain comma as well (see
067aa17e 1907 # https://github.com/ytdl-org/youtube-dl/issues/8536)
8fe10494 1908 feed_data = compat_parse_qs(compat_urllib_parse_unquote_plus(feed))
6b09401b
S
1909
1910 def feed_entry(name):
1911 return try_get(feed_data, lambda x: x[name][0], compat_str)
1912
1913 feed_id = feed_entry('id')
1914 if not feed_id:
1915 continue
1916 feed_title = feed_entry('title')
1917 title = video_title
1918 if feed_title:
1919 title += ' (%s)' % feed_title
8fe10494
S
1920 entries.append({
1921 '_type': 'url_transparent',
1922 'ie_key': 'Youtube',
1923 'url': smuggle_url(
1924 '%s://www.youtube.com/watch?v=%s' % (proto, feed_data['id'][0]),
1925 {'force_singlefeed': True}),
6b09401b 1926 'title': title,
8fe10494 1927 })
6b09401b 1928 feed_ids.append(feed_id)
8fe10494
S
1929 self.to_screen(
1930 'Downloading multifeed video (%s) - add --no-playlist to just download video %s'
1931 % (', '.join(feed_ids), video_id))
1932 return self.playlist_result(entries, video_id, video_title, video_description)
1933 else:
1934 self.to_screen('Downloading just video %s because of --no-playlist' % video_id)
cf7e015f 1935
c7121fa7 1936 if view_count is None:
1c9c8de2 1937 view_count = extract_view_count(video_info)
dbdaaa23
S
1938 if view_count is None and video_details:
1939 view_count = int_or_none(video_details.get('viewCount'))
7b16239a
S
1940 if view_count is None and microformat:
1941 view_count = int_or_none(microformat.get('viewCount'))
1d699755 1942
27019dbb 1943 if is_live is None:
898238e9 1944 is_live = bool_or_none(video_details.get('isLive'))
27019dbb 1945
321bf820 1946 has_live_chat_replay = False
f0f76a33 1947 if not is_live:
321bf820 1948 yt_initial_data = self._get_yt_initial_data(video_id, video_webpage)
1949 try:
1950 yt_initial_data['contents']['twoColumnWatchNextResults']['conversationBar']['liveChatRenderer']['continuations'][0]['reloadContinuationData']['continuation']
1951 has_live_chat_replay = True
f0f76a33 1952 except (KeyError, IndexError, TypeError):
321bf820 1953 pass
1954
c5e8d7af
PH
1955 # Check for "rental" videos
1956 if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
067aa17e 1957 raise ExtractorError('"rental" videos not supported. See https://github.com/ytdl-org/youtube-dl/issues/359 for more information.', expected=True)
c5e8d7af 1958
c63ca0ee
S
1959 def _extract_filesize(media_url):
1960 return int_or_none(self._search_regex(
1961 r'\bclen[=/](\d+)', media_url, 'filesize', default=None))
1962
bf1317d2
S
1963 streaming_formats = try_get(player_response, lambda x: x['streamingData']['formats'], list) or []
1964 streaming_formats.extend(try_get(player_response, lambda x: x['streamingData']['adaptiveFormats'], list) or [])
1965
c5e8d7af
PH
1966 if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
1967 self.report_rtmp_download()
dd27fd17
PH
1968 formats = [{
1969 'format_id': '_rtmp',
1970 'protocol': 'rtmp',
1971 'url': video_info['conn'][0],
1972 'player_url': player_url,
1973 }]
bf1317d2 1974 elif not is_live and (streaming_formats or len(video_info.get('url_encoded_fmt_stream_map', [''])[0]) >= 1 or len(video_info.get('adaptive_fmts', [''])[0]) >= 1):
5f6a1245 1975 encoded_url_map = video_info.get('url_encoded_fmt_stream_map', [''])[0] + ',' + video_info.get('adaptive_fmts', [''])[0]
00fe14fc 1976 if 'rtmpe%3Dyes' in encoded_url_map:
067aa17e 1977 raise ExtractorError('rtmpe downloads are not supported, see https://github.com/ytdl-org/youtube-dl/issues/343 for more information.', expected=True)
bf1317d2 1978 formats = []
3318832e 1979 formats_spec = {}
82156fdb 1980 fmt_list = video_info.get('fmt_list', [''])[0]
1981 if fmt_list:
1982 for fmt in fmt_list.split(','):
1983 spec = fmt.split('/')
3318832e 1984 if len(spec) > 1:
1985 width_height = spec[1].split('x')
1986 if len(width_height) == 2:
1987 formats_spec[spec[0]] = {
1988 'resolution': spec[1],
1989 'width': int_or_none(width_height[0]),
1990 'height': int_or_none(width_height[1]),
1991 }
bf1317d2
S
1992 for fmt in streaming_formats:
1993 itag = str_or_none(fmt.get('itag'))
1994 if not itag:
201e9eaa 1995 continue
bf1317d2
S
1996 quality = fmt.get('quality')
1997 quality_label = fmt.get('qualityLabel') or quality
1998 formats_spec[itag] = {
1999 'asr': int_or_none(fmt.get('audioSampleRate')),
2000 'filesize': int_or_none(fmt.get('contentLength')),
2001 'format_note': quality_label,
2002 'fps': int_or_none(fmt.get('fps')),
2003 'height': int_or_none(fmt.get('height')),
bf1317d2
S
2004 # bitrate for itag 43 is always 2147483647
2005 'tbr': float_or_none(fmt.get('averageBitrate') or fmt.get('bitrate'), 1000) if itag != '43' else None,
2006 'width': int_or_none(fmt.get('width')),
2007 }
2008
2009 for fmt in streaming_formats:
00eb865b 2010 if fmt.get('drmFamilies') or fmt.get('drm_families'):
bf1317d2
S
2011 continue
2012 url = url_or_none(fmt.get('url'))
2013
2014 if not url:
fa3db383 2015 cipher = fmt.get('cipher') or fmt.get('signatureCipher')
bf1317d2
S
2016 if not cipher:
2017 continue
2018 url_data = compat_parse_qs(cipher)
2019 url = url_or_none(try_get(url_data, lambda x: x['url'][0], compat_str))
2020 if not url:
2021 continue
2022 else:
2023 cipher = None
2024 url_data = compat_parse_qs(compat_urllib_parse_urlparse(url).query)
2025
2f483bc1
S
2026 stream_type = int_or_none(try_get(url_data, lambda x: x['stream_type'][0]))
2027 # Unsupported FORMAT_STREAM_TYPE_OTF
2028 if stream_type == 3:
2029 continue
6449cd80 2030
bf1317d2
S
2031 format_id = fmt.get('itag') or url_data['itag'][0]
2032 if not format_id:
2033 continue
2034 format_id = compat_str(format_id)
a49eccdf 2035
bf1317d2
S
2036 if cipher:
2037 if 's' in url_data or self._downloader.params.get('youtube_include_dash_manifest', True):
2038 ASSETS_RE = r'"assets":.+?"js":\s*("[^"]+")'
2039 jsplayer_url_json = self._search_regex(
2040 ASSETS_RE,
2041 embed_webpage if age_gate else video_webpage,
2042 'JS player URL (1)', default=None)
2043 if not jsplayer_url_json and not age_gate:
2044 # We need the embed website after all
2045 if embed_webpage is None:
2046 embed_url = proto + '://www.youtube.com/embed/%s' % video_id
2047 embed_webpage = self._download_webpage(
2048 embed_url, video_id, 'Downloading embed webpage')
2049 jsplayer_url_json = self._search_regex(
2050 ASSETS_RE, embed_webpage, 'JS player URL')
2051
2052 player_url = json.loads(jsplayer_url_json)
cf010131 2053 if player_url is None:
bf1317d2
S
2054 player_url_json = self._search_regex(
2055 r'ytplayer\.config.*?"url"\s*:\s*("[^"]+")',
2056 video_webpage, 'age gate player URL')
2057 player_url = json.loads(player_url_json)
2058
2059 if 'sig' in url_data:
2060 url += '&signature=' + url_data['sig'][0]
2061 elif 's' in url_data:
2062 encrypted_sig = url_data['s'][0]
2063
2064 if self._downloader.params.get('verbose'):
2065 if player_url is None:
bf1317d2 2066 player_desc = 'unknown'
cf010131 2067 else:
e40c758c
S
2068 player_type, player_version = self._extract_player_info(player_url)
2069 player_desc = '%s player %s' % ('flash' if player_type == 'swf' else 'html5', player_version)
bf1317d2
S
2070 parts_sizes = self._signature_cache_id(encrypted_sig)
2071 self.to_screen('{%s} signature length %s, %s' %
2072 (format_id, parts_sizes, player_desc))
2073
2074 signature = self._decrypt_signature(
2075 encrypted_sig, video_id, player_url, age_gate)
2076 sp = try_get(url_data, lambda x: x['sp'][0], compat_str) or 'signature'
2077 url += '&%s=%s' % (sp, signature)
201e9eaa
PH
2078 if 'ratebypass' not in url:
2079 url += '&ratebypass=yes'
c9afb51c 2080
94278f72
YCH
2081 dct = {
2082 'format_id': format_id,
2083 'url': url,
2084 'player_url': player_url,
2085 }
2086 if format_id in self._formats:
2087 dct.update(self._formats[format_id])
3318832e 2088 if format_id in formats_spec:
2089 dct.update(formats_spec[format_id])
94278f72 2090
aabc2be6 2091 # Some itags are not included in DASH manifest thus corresponding formats will
067aa17e 2092 # lack metadata (see https://github.com/ytdl-org/youtube-dl/pull/5993).
aabc2be6
S
2093 # Trying to extract metadata from url_encoded_fmt_stream_map entry.
2094 mobj = re.search(r'^(?P<width>\d+)[xX](?P<height>\d+)$', url_data.get('size', [''])[0])
2095 width, height = (int(mobj.group('width')), int(mobj.group('height'))) if mobj else (None, None)
94278f72 2096
bf1317d2
S
2097 if width is None:
2098 width = int_or_none(fmt.get('width'))
2099 if height is None:
2100 height = int_or_none(fmt.get('height'))
2101
c63ca0ee
S
2102 filesize = int_or_none(url_data.get(
2103 'clen', [None])[0]) or _extract_filesize(url)
2104
bf1317d2
S
2105 quality = url_data.get('quality', [None])[0] or fmt.get('quality')
2106 quality_label = url_data.get('quality_label', [None])[0] or fmt.get('qualityLabel')
2107
4878759f
S
2108 tbr = (float_or_none(url_data.get('bitrate', [None])[0], 1000)
2109 or float_or_none(fmt.get('bitrate'), 1000)) if format_id != '43' else None
bf1317d2 2110 fps = int_or_none(url_data.get('fps', [None])[0]) or int_or_none(fmt.get('fps'))
54fc90aa 2111
94278f72 2112 more_fields = {
c63ca0ee 2113 'filesize': filesize,
bf1317d2 2114 'tbr': tbr,
c9afb51c
AH
2115 'width': width,
2116 'height': height,
bf1317d2
S
2117 'fps': fps,
2118 'format_note': quality_label or quality,
c9afb51c 2119 }
94278f72
YCH
2120 for key, value in more_fields.items():
2121 if value:
2122 dct[key] = value
bf1317d2 2123 type_ = url_data.get('type', [None])[0] or fmt.get('mimeType')
aabc2be6
S
2124 if type_:
2125 type_split = type_.split(';')
2126 kind_ext = type_split[0].split('/')
2127 if len(kind_ext) == 2:
94278f72
YCH
2128 kind, _ = kind_ext
2129 dct['ext'] = mimetype2ext(type_split[0])
aabc2be6
S
2130 if kind in ('audio', 'video'):
2131 codecs = None
2132 for mobj in re.finditer(
2133 r'(?P<key>[a-zA-Z_-]+)=(?P<quote>["\']?)(?P<val>.+?)(?P=quote)(?:;|$)', type_):
2134 if mobj.group('key') == 'codecs':
2135 codecs = mobj.group('val')
2136 break
2137 if codecs:
6310acf5 2138 dct.update(parse_codecs(codecs))
e4a60912
S
2139 if dct.get('acodec') == 'none' or dct.get('vcodec') == 'none':
2140 dct['downloader_options'] = {
2141 # Youtube throttles chunks >~10M
2142 'http_chunk_size': 10485760,
2143 }
aabc2be6 2144 formats.append(dct)
c5e8d7af 2145 else:
c3e54389
S
2146 manifest_url = (
2147 url_or_none(try_get(
2148 player_response,
2149 lambda x: x['streamingData']['hlsManifestUrl'],
3089bc74
S
2150 compat_str))
2151 or url_or_none(try_get(
c3e54389
S
2152 video_info, lambda x: x['hlsvp'][0], compat_str)))
2153 if manifest_url:
2154 formats = []
2155 m3u8_formats = self._extract_m3u8_formats(
2156 manifest_url, video_id, 'mp4', fatal=False)
2157 for a_format in m3u8_formats:
2158 itag = self._search_regex(
2159 r'/itag/(\d+)/', a_format['url'], 'itag', default=None)
2160 if itag:
2161 a_format['format_id'] = itag
2162 if itag in self._formats:
2163 dct = self._formats[itag].copy()
2164 dct.update(a_format)
2165 a_format = dct
2166 a_format['player_url'] = player_url
2167 # Accept-Encoding header causes failures in live streams on Youtube and Youtube Gaming
2168 a_format.setdefault('http_headers', {})['Youtubedl-no-compression'] = 'True'
78895bd3
U
2169 if self._downloader.params.get('youtube_include_hls_manifest', True):
2170 formats.append(a_format)
c3e54389 2171 else:
13577349 2172 error_message = extract_unavailable_message()
c3e54389 2173 if not error_message:
13577349
S
2174 error_message = clean_html(try_get(
2175 player_response, lambda x: x['playabilityStatus']['reason'],
2176 compat_str))
2177 if not error_message:
2178 error_message = clean_html(
2179 try_get(video_info, lambda x: x['reason'][0], compat_str))
c3e54389
S
2180 if error_message:
2181 raise ExtractorError(error_message, expected=True)
2182 raise ExtractorError('no conn, hlsvp, hlsManifestUrl or url_encoded_fmt_stream_map information found in video info')
c5e8d7af 2183
7e72694b 2184 # uploader
dbdaaa23
S
2185 video_uploader = try_get(
2186 video_info, lambda x: x['author'][0],
2187 compat_str) or str_or_none(video_details.get('author'))
7e72694b
S
2188 if video_uploader:
2189 video_uploader = compat_urllib_parse_unquote_plus(video_uploader)
2190 else:
2191 self._downloader.report_warning('unable to extract uploader name')
2192
2193 # uploader_id
2194 video_uploader_id = None
2195 video_uploader_url = None
2196 mobj = re.search(
2197 r'<link itemprop="url" href="(?P<uploader_url>https?://www\.youtube\.com/(?:user|channel)/(?P<uploader_id>[^"]+))">',
2198 video_webpage)
2199 if mobj is not None:
2200 video_uploader_id = mobj.group('uploader_id')
2201 video_uploader_url = mobj.group('uploader_url')
a6211d23
S
2202 else:
2203 owner_profile_url = url_or_none(microformat.get('ownerProfileUrl'))
2204 if owner_profile_url:
2205 video_uploader_id = self._search_regex(
2206 r'(?:user|channel)/([^/]+)', owner_profile_url, 'uploader id',
2207 default=None)
2208 video_uploader_url = owner_profile_url
7e72694b 2209
b45a9e69 2210 channel_id = (
3089bc74
S
2211 str_or_none(video_details.get('channelId'))
2212 or self._html_search_meta(
2213 'channelId', video_webpage, 'channel id', default=None)
2214 or self._search_regex(
b45a9e69 2215 r'data-channel-external-id=(["\'])(?P<id>(?:(?!\1).)+)\1',
2216 video_webpage, 'channel id', default=None, group='id'))
dd4c4492
S
2217 channel_url = 'http://www.youtube.com/channel/%s' % channel_id if channel_id else None
2218
b477fc13
S
2219 thumbnails = []
2220 thumbnails_list = try_get(
2221 video_details, lambda x: x['thumbnail']['thumbnails'], list) or []
2222 for t in thumbnails_list:
2223 if not isinstance(t, dict):
2224 continue
2225 thumbnail_url = url_or_none(t.get('url'))
2226 if not thumbnail_url:
2227 continue
2228 thumbnails.append({
2229 'url': thumbnail_url,
2230 'width': int_or_none(t.get('width')),
2231 'height': int_or_none(t.get('height')),
2232 })
2233
2234 if not thumbnails:
7e72694b 2235 video_thumbnail = None
b477fc13
S
2236 # We try first to get a high quality image:
2237 m_thumb = re.search(r'<span itemprop="thumbnail".*?href="(.*?)">',
2238 video_webpage, re.DOTALL)
2239 if m_thumb is not None:
2240 video_thumbnail = m_thumb.group(1)
2241 thumbnail_url = try_get(video_info, lambda x: x['thumbnail_url'][0], compat_str)
2242 if thumbnail_url:
2243 video_thumbnail = compat_urllib_parse_unquote_plus(thumbnail_url)
2244 if video_thumbnail:
2245 thumbnails.append({'url': video_thumbnail})
7e72694b
S
2246
2247 # upload date
2248 upload_date = self._html_search_meta(
2249 'datePublished', video_webpage, 'upload date', default=None)
2250 if not upload_date:
2251 upload_date = self._search_regex(
2252 [r'(?s)id="eow-date.*?>(.*?)</span>',
2253 r'(?:id="watch-uploader-info".*?>.*?|["\']simpleText["\']\s*:\s*["\'])(?:Published|Uploaded|Streamed live|Started) on (.+?)[<"\']'],
2254 video_webpage, 'upload date', default=None)
37357d21
S
2255 if not upload_date:
2256 upload_date = microformat.get('publishDate') or microformat.get('uploadDate')
7e72694b
S
2257 upload_date = unified_strdate(upload_date)
2258
2259 video_license = self._html_search_regex(
2260 r'<h4[^>]+class="title"[^>]*>\s*License\s*</h4>\s*<ul[^>]*>\s*<li>(.+?)</li',
2261 video_webpage, 'license', default=None)
2262
2263 m_music = re.search(
2264 r'''(?x)
2265 <h4[^>]+class="title"[^>]*>\s*Music\s*</h4>\s*
2266 <ul[^>]*>\s*
2267 <li>(?P<title>.+?)
2268 by (?P<creator>.+?)
2269 (?:
2270 \(.+?\)|
2271 <a[^>]*
2272 (?:
2273 \bhref=["\']/red[^>]*>| # drop possible
2274 >\s*Listen ad-free with YouTube Red # YouTube Red ad
2275 )
2276 .*?
2277 )?</li
2278 ''',
2279 video_webpage)
2280 if m_music:
2281 video_alt_title = remove_quotes(unescapeHTML(m_music.group('title')))
2282 video_creator = clean_html(m_music.group('creator'))
2283 else:
2284 video_alt_title = video_creator = None
2285
2286 def extract_meta(field):
2287 return self._html_search_regex(
2288 r'<h4[^>]+class="title"[^>]*>\s*%s\s*</h4>\s*<ul[^>]*>\s*<li>(.+?)</li>\s*' % field,
2289 video_webpage, field, default=None)
2290
2291 track = extract_meta('Song')
2292 artist = extract_meta('Artist')
92bc97d3 2293 album = extract_meta('Album')
822b9d9c
RA
2294
2295 # Youtube Music Auto-generated description
92bc97d3 2296 release_date = release_year = None
822b9d9c
RA
2297 if video_description:
2298 mobj = re.search(r'(?s)Provided to YouTube by [^\n]+\n+(?P<track>[^·]+)·(?P<artist>[^\n]+)\n+(?P<album>[^\n]+)(?:.+?℗\s*(?P<release_year>\d{4})(?!\d))?(?:.+?Released on\s*:\s*(?P<release_date>\d{4}-\d{2}-\d{2}))?(.+?\nArtist\s*:\s*(?P<clean_artist>[^\n]+))?', video_description)
2299 if mobj:
2300 if not track:
2301 track = mobj.group('track').strip()
2302 if not artist:
2303 artist = mobj.group('clean_artist') or ', '.join(a.strip() for a in mobj.group('artist').split('·'))
92bc97d3
RA
2304 if not album:
2305 album = mobj.group('album'.strip())
822b9d9c
RA
2306 release_year = mobj.group('release_year')
2307 release_date = mobj.group('release_date')
2308 if release_date:
2309 release_date = release_date.replace('-', '')
2310 if not release_year:
2311 release_year = int(release_date[:4])
2312 if release_year:
2313 release_year = int(release_year)
7e72694b
S
2314
2315 m_episode = re.search(
2316 r'<div[^>]+id="watch7-headline"[^>]*>\s*<span[^>]*>.*?>(?P<series>[^<]+)</a></b>\s*S(?P<season>\d+)\s*•\s*E(?P<episode>\d+)</span>',
2317 video_webpage)
2318 if m_episode:
c2dd2dc0 2319 series = unescapeHTML(m_episode.group('series'))
7e72694b
S
2320 season_number = int(m_episode.group('season'))
2321 episode_number = int(m_episode.group('episode'))
2322 else:
2323 series = season_number = episode_number = None
2324
2325 m_cat_container = self._search_regex(
2326 r'(?s)<h4[^>]*>\s*Category\s*</h4>\s*<ul[^>]*>(.*?)</ul>',
2327 video_webpage, 'categories', default=None)
dbeafce5 2328 category = None
7e72694b
S
2329 if m_cat_container:
2330 category = self._html_search_regex(
2331 r'(?s)<a[^<]+>(.*?)</a>', m_cat_container, 'category',
2332 default=None)
dbeafce5
S
2333 if not category:
2334 category = try_get(
2335 microformat, lambda x: x['category'], compat_str)
2336 video_categories = None if category is None else [category]
7e72694b
S
2337
2338 video_tags = [
2339 unescapeHTML(m.group('content'))
2340 for m in re.finditer(self._meta_regex('og:video:tag'), video_webpage)]
dbeafce5
S
2341 if not video_tags:
2342 video_tags = try_get(video_details, lambda x: x['keywords'], list)
7e72694b
S
2343
2344 def _extract_count(count_name):
2345 return str_to_int(self._search_regex(
a6c666d0 2346 r'"accessibilityData":\{"label":"([\d,\w]+) %ss"\}'
7e72694b
S
2347 % re.escape(count_name),
2348 video_webpage, count_name, default=None))
2349
2350 like_count = _extract_count('like')
2351 dislike_count = _extract_count('dislike')
2352
dbdaaa23
S
2353 if view_count is None:
2354 view_count = str_to_int(self._search_regex(
2355 r'<[^>]+class=["\']watch-view-count[^>]+>\s*([\d,\s]+)', video_webpage,
2356 'view count', default=None))
2357
bf3c9326
S
2358 average_rating = (
2359 float_or_none(video_details.get('averageRating'))
2360 or try_get(video_info, lambda x: float_or_none(x['avg_rating'][0])))
2361
7e72694b 2362 # subtitles
321bf820 2363 video_subtitles = self.extract_subtitles(
2364 video_id, video_webpage, has_live_chat_replay)
7e72694b
S
2365 automatic_captions = self.extract_automatic_captions(video_id, video_webpage)
2366
2367 video_duration = try_get(
2368 video_info, lambda x: int_or_none(x['length_seconds'][0]))
dbdaaa23
S
2369 if not video_duration:
2370 video_duration = int_or_none(video_details.get('lengthSeconds'))
7e72694b
S
2371 if not video_duration:
2372 video_duration = parse_duration(self._html_search_meta(
2373 'duration', video_webpage, 'video duration'))
2374
b84071c0
JP
2375 # Get Subscriber Count of channel
2376 subscriber_count = parse_count(self._search_regex(
2377 r'"text":"([\d\.]+\w?) subscribers"',
2378 video_webpage,
2379 'subscriber count',
2380 default=None
2381 ))
2382
7e72694b
S
2383 # annotations
2384 video_annotations = None
2385 if self._downloader.params.get('writeannotations', False):
64b6a4e9
RA
2386 xsrf_token = self._search_regex(
2387 r'([\'"])XSRF_TOKEN\1\s*:\s*([\'"])(?P<xsrf_token>[A-Za-z0-9+/=]+)\2',
2388 video_webpage, 'xsrf token', group='xsrf_token', fatal=False)
2389 invideo_url = try_get(
2390 player_response, lambda x: x['annotations'][0]['playerAnnotationsUrlsRenderer']['invideoUrl'], compat_str)
2391 if xsrf_token and invideo_url:
2392 xsrf_field_name = self._search_regex(
2393 r'([\'"])XSRF_FIELD_NAME\1\s*:\s*([\'"])(?P<xsrf_field_name>\w+)\2',
2394 video_webpage, 'xsrf field name',
2395 group='xsrf_field_name', default='session_token')
2396 video_annotations = self._download_webpage(
2397 self._proto_relative_url(invideo_url),
2398 video_id, note='Downloading annotations',
2399 errnote='Unable to download video annotations', fatal=False,
2400 data=urlencode_postdata({xsrf_field_name: xsrf_token}))
7e72694b 2401
84213ea8 2402 chapters = self._extract_chapters(video_webpage, description_original, video_id, video_duration)
7e72694b 2403
dd27fd17 2404 # Look for the DASH manifest
203fb43f 2405 if self._downloader.params.get('youtube_include_dash_manifest', True):
77c6fb5b 2406 dash_mpd_fatal = True
8ff648e4 2407 for mpd_url in dash_mpds:
d8d24a92 2408 dash_formats = {}
774e208f 2409 try:
05d0d131
YCH
2410 def decrypt_sig(mobj):
2411 s = mobj.group(1)
2412 dec_s = self._decrypt_signature(s, video_id, player_url, age_gate)
2413 return '/signature/%s' % dec_s
2414
8ff648e4 2415 mpd_url = re.sub(r'/s/([a-fA-F0-9\.]+)', decrypt_sig, mpd_url)
2d2fa82d 2416
8ff648e4 2417 for df in self._extract_mpd_formats(
2418 mpd_url, video_id, fatal=dash_mpd_fatal,
2419 formats_dict=self._formats):
c63ca0ee
S
2420 if not df.get('filesize'):
2421 df['filesize'] = _extract_filesize(df['url'])
d8d24a92
S
2422 # Do not overwrite DASH format found in some previous DASH manifest
2423 if df['format_id'] not in dash_formats:
2424 dash_formats[df['format_id']] = df
77c6fb5b
S
2425 # Additional DASH manifests may end up in HTTP Error 403 therefore
2426 # allow them to fail without bug report message if we already have
2427 # some DASH manifest succeeded. This is temporary workaround to reduce
2428 # burst of bug reports until we figure out the reason and whether it
2429 # can be fixed at all.
2430 dash_mpd_fatal = False
774e208f
PH
2431 except (ExtractorError, KeyError) as e:
2432 self.report_warning(
2433 'Skipping DASH manifest: %r' % e, video_id)
d8d24a92 2434 if dash_formats:
04b3b3df
JMF
2435 # Remove the formats we found through non-DASH, they
2436 # contain less info and it can be wrong, because we use
2437 # fixed values (for example the resolution). See
067aa17e 2438 # https://github.com/ytdl-org/youtube-dl/issues/5774 for an
04b3b3df 2439 # example.
d80265cc 2440 formats = [f for f in formats if f['format_id'] not in dash_formats.keys()]
d8d24a92 2441 formats.extend(dash_formats.values())
d80044c2 2442
6271f1ca
PH
2443 # Check for malformed aspect ratio
2444 stretched_m = re.search(
2445 r'<meta\s+property="og:video:tag".*?content="yt:stretch=(?P<w>[0-9]+):(?P<h>[0-9]+)">',
2446 video_webpage)
2447 if stretched_m:
313dfc45
LL
2448 w = float(stretched_m.group('w'))
2449 h = float(stretched_m.group('h'))
5faf9fed
S
2450 # yt:stretch may hold invalid ratio data (e.g. for Q39EVAstoRM ratio is 17:0).
2451 # We will only process correct ratios.
313dfc45 2452 if w > 0 and h > 0:
41f24c32 2453 ratio = w / h
313dfc45
LL
2454 for f in formats:
2455 if f.get('vcodec') != 'none':
2456 f['stretched_ratio'] = ratio
6271f1ca 2457
026fbedc 2458 if not formats:
43ebf77d
S
2459 if 'reason' in video_info:
2460 if 'The uploader has not made this video available in your country.' in video_info['reason']:
2461 regions_allowed = self._html_search_meta(
2462 'regionsAllowed', video_webpage, default=None)
2463 countries = regions_allowed.split(',') if regions_allowed else None
2464 self.raise_geo_restricted(
2465 msg=video_info['reason'][0], countries=countries)
2466 reason = video_info['reason'][0]
2467 if 'Invalid parameters' in reason:
2468 unavailable_message = extract_unavailable_message()
2469 if unavailable_message:
2470 reason = unavailable_message
2471 raise ExtractorError(
2472 'YouTube said: %s' % reason,
2473 expected=True, video_id=video_id)
2474 if video_info.get('license_info') or try_get(player_response, lambda x: x['streamingData']['licenseInfos']):
2475 raise ExtractorError('This video is DRM protected.', expected=True)
0d297518 2476
4bcc7bd1 2477 self._sort_formats(formats)
4ea3be0a 2478
21c340b8 2479 self.mark_watched(video_id, video_info, player_response)
d77ab8e2 2480
4ea3be0a 2481 return {
8bcc8756
JW
2482 'id': video_id,
2483 'uploader': video_uploader,
2484 'uploader_id': video_uploader_id,
fd050249 2485 'uploader_url': video_uploader_url,
dd4c4492
S
2486 'channel_id': channel_id,
2487 'channel_url': channel_url,
8bcc8756 2488 'upload_date': upload_date,
7caf9830 2489 'license': video_license,
936784b2 2490 'creator': video_creator or artist,
8bcc8756 2491 'title': video_title,
936784b2 2492 'alt_title': video_alt_title or track,
b477fc13 2493 'thumbnails': thumbnails,
8bcc8756
JW
2494 'description': video_description,
2495 'categories': video_categories,
000b6b5a 2496 'tags': video_tags,
8bcc8756 2497 'subtitles': video_subtitles,
360e1ca5 2498 'automatic_captions': automatic_captions,
8bcc8756
JW
2499 'duration': video_duration,
2500 'age_limit': 18 if age_gate else 0,
2501 'annotations': video_annotations,
9cafc3fd 2502 'chapters': chapters,
7e8c0af0 2503 'webpage_url': proto + '://www.youtube.com/watch?v=%s' % video_id,
8bcc8756 2504 'view_count': view_count,
4ea3be0a 2505 'like_count': like_count,
2506 'dislike_count': dislike_count,
bf3c9326 2507 'average_rating': average_rating,
8bcc8756 2508 'formats': formats,
2fe1ff85 2509 'is_live': is_live,
7c80519c 2510 'start_time': start_time,
297a564b 2511 'end_time': end_time,
12afdc2a
S
2512 'series': series,
2513 'season_number': season_number,
2514 'episode_number': episode_number,
936784b2
S
2515 'track': track,
2516 'artist': artist,
5caabd3c 2517 'album': album,
2518 'release_date': release_date,
2519 'release_year': release_year,
b84071c0 2520 'subscriber_count': subscriber_count,
4ea3be0a 2521 }
c5e8d7af 2522
5f6a1245 2523
8e7aad20 2524class YoutubePlaylistIE(YoutubePlaylistBaseInfoExtractor):
78caa52a 2525 IE_DESC = 'YouTube.com playlists'
d67cc9fa 2526 _VALID_URL = r"""(?x)(?:
c5e8d7af
PH
2527 (?:https?://)?
2528 (?:\w+\.)?
c5e8d7af 2529 (?:
c0345b82 2530 (?:
66b48727 2531 youtube(?:kids)?\.com|
c0345b82
S
2532 invidio\.us
2533 )
2534 /
feaa5ad7 2535 (?:
87dadd45 2536 (?:course|view_play_list|my_playlists|artist|playlist|watch|embed/(?:videoseries|[0-9A-Za-z_-]{11}))
feaa5ad7
S
2537 \? (?:.*?[&;])*? (?:p|a|list)=
2538 | p/
2539 )|
2540 youtu\.be/[0-9A-Za-z_-]{11}\?.*?\blist=
c5e8d7af 2541 )
d67cc9fa 2542 (
66b48727 2543 (?:PL|LL|EC|UU|FL|RD|UL|TL|PU|OLAK5uy_)?[0-9A-Za-z-_]{10,}
5f6a1245 2544 # Top tracks, they can also include dots
d67cc9fa
JMF
2545 |(?:MC)[\w\.]*
2546 )
c5e8d7af
PH
2547 .*
2548 |
d0ba5587
S
2549 (%(playlist_id)s)
2550 )""" % {'playlist_id': YoutubeBaseInfoExtractor._PLAYLIST_ID_RE}
8d81f3e3 2551 _TEMPLATE_URL = 'https://www.youtube.com/playlist?list=%s'
351f37c0
S
2552 _VIDEO_RE_TPL = r'href="\s*/watch\?v=%s(?:&amp;(?:[^"]*?index=(?P<index>\d+))?(?:[^>]+>(?P<title>[^<]+))?)?'
2553 _VIDEO_RE = _VIDEO_RE_TPL % r'(?P<id>[0-9A-Za-z_-]{11})'
78caa52a 2554 IE_NAME = 'youtube:playlist'
81127aa5 2555 _TESTS = [{
0e30a7b9 2556 'url': 'https://www.youtube.com/playlist?list=PL4lCao7KL_QFVb7Iudeipvc2BCavECqzc',
81127aa5 2557 'info_dict': {
0e30a7b9 2558 'uploader_id': 'UCmlqkdCBesrv2Lak1mF_MxA',
2559 'uploader': 'Sergey M.',
2560 'id': 'PL4lCao7KL_QFVb7Iudeipvc2BCavECqzc',
3867038a 2561 'title': 'youtube-dl public playlist',
81127aa5 2562 },
0e30a7b9 2563 'playlist_count': 1,
9291475f 2564 }, {
0e30a7b9 2565 'url': 'https://www.youtube.com/playlist?list=PL4lCao7KL_QFodcLWhDpGCYnngnHtQ-Xf',
9291475f 2566 'info_dict': {
0e30a7b9 2567 'uploader_id': 'UCmlqkdCBesrv2Lak1mF_MxA',
2568 'uploader': 'Sergey M.',
2569 'id': 'PL4lCao7KL_QFodcLWhDpGCYnngnHtQ-Xf',
3867038a 2570 'title': 'youtube-dl empty playlist',
9291475f
PH
2571 },
2572 'playlist_count': 0,
2573 }, {
2574 'note': 'Playlist with deleted videos (#651). As a bonus, the video #51 is also twice in this list.',
2575 'url': 'https://www.youtube.com/playlist?list=PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC',
2576 'info_dict': {
2577 'title': '29C3: Not my department',
acf757f4 2578 'id': 'PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC',
13a75688
S
2579 'uploader': 'Christiaan008',
2580 'uploader_id': 'ChRiStIaAn008',
9291475f 2581 },
0e30a7b9 2582 'playlist_count': 96,
9291475f
PH
2583 }, {
2584 'note': 'issue #673',
2585 'url': 'PLBB231211A4F62143',
2586 'info_dict': {
f46a8702 2587 'title': '[OLD]Team Fortress 2 (Class-based LP)',
acf757f4 2588 'id': 'PLBB231211A4F62143',
13a75688
S
2589 'uploader': 'Wickydoo',
2590 'uploader_id': 'Wickydoo',
9291475f
PH
2591 },
2592 'playlist_mincount': 26,
2593 }, {
2594 'note': 'Large playlist',
2595 'url': 'https://www.youtube.com/playlist?list=UUBABnxM4Ar9ten8Mdjj1j0Q',
2596 'info_dict': {
2597 'title': 'Uploads from Cauchemar',
acf757f4 2598 'id': 'UUBABnxM4Ar9ten8Mdjj1j0Q',
13a75688
S
2599 'uploader': 'Cauchemar',
2600 'uploader_id': 'Cauchemar89',
9291475f
PH
2601 },
2602 'playlist_mincount': 799,
2603 }, {
2604 'url': 'PLtPgu7CB4gbY9oDN3drwC3cMbJggS7dKl',
2605 'info_dict': {
2606 'title': 'YDL_safe_search',
acf757f4 2607 'id': 'PLtPgu7CB4gbY9oDN3drwC3cMbJggS7dKl',
9291475f
PH
2608 },
2609 'playlist_count': 2,
4201ba13 2610 'skip': 'This playlist is private',
ac7553d0
PH
2611 }, {
2612 'note': 'embedded',
2d3d2997 2613 'url': 'https://www.youtube.com/embed/videoseries?list=PL6IaIsEjSbf96XFRuNccS_RuEXwNdsoEu',
ac7553d0
PH
2614 'playlist_count': 4,
2615 'info_dict': {
2616 'title': 'JODA15',
acf757f4 2617 'id': 'PL6IaIsEjSbf96XFRuNccS_RuEXwNdsoEu',
13a75688
S
2618 'uploader': 'milan',
2619 'uploader_id': 'UCEI1-PVPcYXjB73Hfelbmaw',
ac7553d0 2620 }
87dadd45
S
2621 }, {
2622 'url': 'http://www.youtube.com/embed/_xDOZElKyNU?list=PLsyOSbh5bs16vubvKePAQ1x3PhKavfBIl',
2623 'playlist_mincount': 485,
2624 'info_dict': {
13a75688 2625 'title': '2018 Chinese New Singles (11/6 updated)',
87dadd45 2626 'id': 'PLsyOSbh5bs16vubvKePAQ1x3PhKavfBIl',
13a75688
S
2627 'uploader': 'LBK',
2628 'uploader_id': 'sdragonfang',
87dadd45 2629 }
6b08cdf6
PH
2630 }, {
2631 'note': 'Embedded SWF player',
2d3d2997 2632 'url': 'https://www.youtube.com/p/YN5VISEtHet5D4NEvfTd0zcgFk84NqFZ?hl=en_US&fs=1&rel=0',
6b08cdf6
PH
2633 'playlist_count': 4,
2634 'info_dict': {
2635 'title': 'JODA7',
acf757f4 2636 'id': 'YN5VISEtHet5D4NEvfTd0zcgFk84NqFZ',
13a75688
S
2637 },
2638 'skip': 'This playlist does not exist',
4b7df0d3
JMF
2639 }, {
2640 'note': 'Buggy playlist: the webpage has a "Load more" button but it doesn\'t have more videos',
2641 'url': 'https://www.youtube.com/playlist?list=UUXw-G3eDE9trcvY2sBMM_aA',
2642 'info_dict': {
acf757f4
PH
2643 'title': 'Uploads from Interstellar Movie',
2644 'id': 'UUXw-G3eDE9trcvY2sBMM_aA',
13a75688
S
2645 'uploader': 'Interstellar Movie',
2646 'uploader_id': 'InterstellarMovie1',
4b7df0d3 2647 },
481cc733 2648 'playlist_mincount': 21,
dacb3a86
S
2649 }, {
2650 # Playlist URL that does not actually serve a playlist
2651 'url': 'https://www.youtube.com/watch?v=FqZTN594JQw&list=PLMYEtVRpaqY00V9W81Cwmzp6N6vZqfUKD4',
2652 'info_dict': {
2653 'id': 'FqZTN594JQw',
2654 'ext': 'webm',
2655 'title': "Smiley's People 01 detective, Adventure Series, Action",
2656 'uploader': 'STREEM',
2657 'uploader_id': 'UCyPhqAZgwYWZfxElWVbVJng',
ec85ded8 2658 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UCyPhqAZgwYWZfxElWVbVJng',
dacb3a86
S
2659 'upload_date': '20150526',
2660 'license': 'Standard YouTube License',
2661 'description': 'md5:507cdcb5a49ac0da37a920ece610be80',
2662 'categories': ['People & Blogs'],
2663 'tags': list,
dbdaaa23 2664 'view_count': int,
dacb3a86
S
2665 'like_count': int,
2666 'dislike_count': int,
2667 },
2668 'params': {
2669 'skip_download': True,
2670 },
13a75688 2671 'skip': 'This video is not available.',
dacb3a86 2672 'add_ie': [YoutubeIE.ie_key()],
481cc733
S
2673 }, {
2674 'url': 'https://youtu.be/yeWKywCrFtk?list=PL2qgrgXsNUG5ig9cat4ohreBjYLAPC0J5',
2675 'info_dict': {
2676 'id': 'yeWKywCrFtk',
2677 'ext': 'mp4',
2678 'title': 'Small Scale Baler and Braiding Rugs',
2679 'uploader': 'Backus-Page House Museum',
2680 'uploader_id': 'backuspagemuseum',
ec85ded8 2681 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/backuspagemuseum',
481cc733 2682 'upload_date': '20161008',
481cc733
S
2683 'description': 'md5:800c0c78d5eb128500bffd4f0b4f2e8a',
2684 'categories': ['Nonprofits & Activism'],
2685 'tags': list,
2686 'like_count': int,
2687 'dislike_count': int,
2688 },
2689 'params': {
2690 'noplaylist': True,
2691 'skip_download': True,
2692 },
2e18adec
S
2693 }, {
2694 # https://github.com/ytdl-org/youtube-dl/issues/21844
2695 'url': 'https://www.youtube.com/playlist?list=PLzH6n4zXuckpfMu_4Ff8E7Z1behQks5ba',
2696 'info_dict': {
2697 'title': 'Data Analysis with Dr Mike Pound',
2698 'id': 'PLzH6n4zXuckpfMu_4Ff8E7Z1behQks5ba',
2699 'uploader_id': 'Computerphile',
2700 'uploader': 'Computerphile',
2701 },
2702 'playlist_mincount': 11,
feaa5ad7
S
2703 }, {
2704 'url': 'https://youtu.be/uWyaPkt-VOI?list=PL9D9FC436B881BA21',
2705 'only_matching': True,
a6857510
S
2706 }, {
2707 'url': 'TLGGrESM50VT6acwMjAyMjAxNw',
2708 'only_matching': True,
409b9324
S
2709 }, {
2710 # music album playlist
2711 'url': 'OLAK5uy_m4xAFdmMC5rX3Ji3g93pQe3hqLZw_9LhM',
2712 'only_matching': True,
c0345b82
S
2713 }, {
2714 'url': 'https://invidio.us/playlist?list=PLDIoUOhQQPlXr63I_vwF9GD8sAKh77dWU',
2715 'only_matching': True,
66b48727
RA
2716 }, {
2717 'url': 'https://www.youtubekids.com/watch?v=Agk7R8I8o5U&list=PUZ6jURNr1WQZCNHF0ao-c0g',
2718 'only_matching': True,
81127aa5 2719 }]
c5e8d7af 2720
880e1c52
JMF
def _real_initialize(self):
    """Initialization hook: run the login step via ``self._login()``."""
    self._login()
2723
351f37c0
S
def extract_videos_from_page(self, page):
    """Pair up the video ids and titles found in an HTML page.

    Tags carrying a ``data-video-id`` attribute are scanned first; the
    accumulated lists are then handed to ``extract_videos_from_page_impl``
    with three fallback patterns, in order.

    Returns ``zip(ids, titles)``; a title is ``None`` (or empty) when the
    tag had no usable ``data-title``.
    """
    found_ids, found_titles = [], []

    tag_re = r'(<[^>]*\bdata-video-id\s*=\s*["\'][0-9A-Za-z_-]{11}[^>]+>)'
    for tag in re.findall(tag_re, page):
        tag_attrs = extract_attributes(tag)
        vid = tag_attrs['data-video-id']
        title = unescapeHTML(tag_attrs.get('data-title'))
        # Only strip real titles; a missing/empty title is stored as-is
        found_ids.append(vid)
        found_titles.append(title.strip() if title else title)

    fallback_patterns = (
        # Fallback with old _VIDEO_RE
        self._VIDEO_RE,
        # Relaxed fallbacks
        r'href="\s*/watch\?v\s*=\s*(?P<id>[0-9A-Za-z_-]{11})',
        r'data-video-ids\s*=\s*["\'](?P<id>[0-9A-Za-z_-]{11})',
    )
    for pattern in fallback_patterns:
        self.extract_videos_from_page_impl(
            pattern, page, found_ids, found_titles)

    return zip(found_ids, found_titles)
2751
def _extract_mix(self, playlist_id):
    """Build a playlist result for a mix-style playlist id.

    The mixes are generated from a single video; the id of the playlist
    is just 'RD' + video_id.  Watch pages are downloaded repeatedly, each
    time seeded with the last video id collected so far, until a page
    yields no video id that has not been seen already.
    """
    mix_entry_re = r'''(?xs)data-video-username=".*?".*?
               href="/watch\?v=([0-9A-Za-z_-]{11})&amp;[^"]*?list=%s''' % re.escape(playlist_id)

    collected = []
    seed_id = playlist_id[-11:]
    page_num = 0
    while True:
        page_num += 1
        page_url = 'https://www.youtube.com/watch?v=%s&list=%s' % (seed_id, playlist_id)
        webpage = self._download_webpage(
            page_url, playlist_id,
            'Downloading page {0} of Youtube mix'.format(page_num))
        # Fetch new pages until all the videos are repeated, it seems that
        # there are always 51 unique videos.
        fresh = [
            vid for vid in orderedSet(re.findall(mix_entry_re, webpage))
            if vid not in collected]
        if not fresh:
            break
        collected.extend(fresh)
        seed_id = collected[-1]

    entries = self._ids_to_results(collected)

    # Title comes from the last downloaded page; try the known class
    # names in order and keep the first non-empty element.
    title_html = None
    for css_class in ('playlist-title', 'title long-title', 'title'):
        title_html = get_element_by_attribute('class', css_class, webpage)
        if title_html:
            break

    return self.playlist_result(entries, playlist_id, clean_html(title_html))
2783
def _extract_playlist(self, playlist_id):
    """Download a playlist page and turn it into a playlist result.

    Returns a ``(has_videos, playlist)`` tuple.  ``has_videos`` is False
    only when no title was found on the page and ``self._entries`` yields
    nothing for it.

    Raises ExtractorError (expected) when the page carries an alert saying
    the playlist does not exist, is private, or the parameters are invalid.
    """
    url = self._TEMPLATE_URL % playlist_id
    page = self._download_webpage(url, playlist_id)

    # the yt-alert-message now has tabindex attribute (see https://github.com/ytdl-org/youtube-dl/issues/11604)
    for alert in re.findall(r'<div class="yt-alert-message"[^>]*>([^<]+)</div>', page):
        alert = alert.strip()
        # Check if the playlist exists or is private
        unavailable = re.match(
            r'[^<]*(?:The|This) playlist (?P<reason>does not exist|is private)[^<]*', alert)
        if unavailable:
            why = unavailable.group('reason')
            hint = ', use --username or --netrc to access it' if 'private' in why else ''
            raise ExtractorError(
                ('This playlist %s' % why) + hint + '.', expected=True)
        if re.match(r'[^<]*Invalid parameters[^<]*', alert):
            raise ExtractorError(
                'Invalid parameters. Maybe URL is incorrect.',
                expected=True)
        if re.match(r'[^<]*Choose your language[^<]*', alert):
            continue
        self.report_warning('Youtube gives an alert message: ' + alert)

    playlist_title = self._html_search_regex(
        r'(?s)<h1 class="pl-header-title[^"]*"[^>]*>\s*(.*?)\s*</h1>',
        page, 'title', default=None)

    # Common prefix of the header link that names the uploader
    uploader_link_prefix = r'class=["\']pl-header-details[^>]+>\s*<li>\s*<a[^>]+\bhref='
    uploader = self._html_search_regex(
        r'%s["\']/(?:user|channel)/[^>]+>([^<]+)' % uploader_link_prefix,
        page, 'uploader', default=None)

    uploader_id = uploader_url = None
    link = re.search(
        r'%s(["\'])(?P<path>/(?:user|channel)/(?P<uploader_id>.+?))\1' % uploader_link_prefix,
        page)
    if link:
        uploader_id = link.group('uploader_id')
        uploader_url = compat_urlparse.urljoin(url, link.group('path'))

    has_videos = True
    if not playlist_title:
        # Some playlist URLs don't actually serve a playlist (e.g.
        # https://www.youtube.com/watch?v=FqZTN594JQw&list=PLMYEtVRpaqY00V9W81Cwmzp6N6vZqfUKD4)
        try:
            next(self._entries(page, playlist_id))
        except StopIteration:
            has_videos = False

    playlist = self.playlist_result(
        self._entries(page, playlist_id), playlist_id, playlist_title)
    playlist['uploader'] = uploader
    playlist['uploader_id'] = uploader_id
    playlist['uploader_url'] = uploader_url

    return has_videos, playlist
c5e8d7af 2845
def _check_download_just_video(self, url, playlist_id):
    """Detect a video id in a playlist URL and honour ``noplaylist``.

    Returns a ``(video_id, result)`` tuple:
      * ``(None, None)`` when the URL names no video;
      * ``(video_id, url_result)`` when the ``noplaylist`` param is set;
      * ``(video_id, None)`` otherwise.
    """
    # Check if it's a video-specific URL
    query = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query)
    video_id = query.get('v', [None])[0]
    if not video_id:
        video_id = self._search_regex(
            r'(?:(?:^|//)youtu\.be/|youtube\.com/embed/(?!videoseries))([0-9A-Za-z_-]{11})', url,
            'video id', default=None)
    if not video_id:
        return None, None

    if not self._downloader.params.get('noplaylist'):
        self.to_screen(
            'Downloading playlist %s - add --no-playlist to just download video %s'
            % (playlist_id, video_id))
        return video_id, None

    self.to_screen('Downloading just video %s because of --no-playlist' % video_id)
    return video_id, self.url_result(video_id, 'Youtube', video_id=video_id)
448830ce 2860
ebf1b291
S
def _real_extract(self, url):
    """Resolve a playlist URL to a single video, a mix, or a playlist.

    Raises ExtractorError when ``url`` does not match ``_VALID_URL``.
    """
    # Extract playlist id
    url_match = re.match(self._VALID_URL, url)
    if url_match is None:
        raise ExtractorError('Invalid URL: %s' % url)
    playlist_id = url_match.group(1) or url_match.group(2)

    video_id, single_video = self._check_download_just_video(url, playlist_id)
    if single_video:
        return single_video

    if playlist_id.startswith(('RD', 'UL', 'PU')):
        # Mixes require a custom extraction process
        return self._extract_mix(playlist_id)

    has_videos, playlist = self._extract_playlist(playlist_id)

    # Some playlist URLs don't actually serve a playlist (see
    # https://github.com/ytdl-org/youtube-dl/issues/10537).
    # Fallback to plain video extraction if there is a video id
    # along with playlist id.
    if video_id and not has_videos:
        return self.url_result(video_id, 'Youtube', video_id=video_id)

    return playlist
448830ce 2885
c5e8d7af 2886
648e6a1f 2887class YoutubeChannelIE(YoutubePlaylistBaseInfoExtractor):
78caa52a 2888 IE_DESC = 'YouTube.com channels'
66b48727 2889 _VALID_URL = r'https?://(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie|kids)?\.com|(?:www\.)?invidio\.us)/channel/(?P<id>[0-9A-Za-z_-]+)'
eb0f3e7e 2890 _TEMPLATE_URL = 'https://www.youtube.com/channel/%s/videos'
648e6a1f 2891 _VIDEO_RE = r'(?:title="(?P<title>[^"]+)"[^>]+)?href="/watch\?v=(?P<id>[0-9A-Za-z_-]+)&?'
78caa52a 2892 IE_NAME = 'youtube:channel'
cdc628a4
PH
2893 _TESTS = [{
2894 'note': 'paginated channel',
2895 'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w',
2896 'playlist_mincount': 91,
acf757f4 2897 'info_dict': {
9170ca5b
JMF
2898 'id': 'UUKfVa3S1e4PHvxWcwyMMg8w',
2899 'title': 'Uploads from lex will',
13a75688
S
2900 'uploader': 'lex will',
2901 'uploader_id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
acf757f4 2902 }
5c43afd4
JMF
2903 }, {
2904 'note': 'Age restricted channel',
2905 # from https://www.youtube.com/user/DeusExOfficial
2906 'url': 'https://www.youtube.com/channel/UCs0ifCMCm1icqRbqhUINa0w',
2907 'playlist_mincount': 64,
2908 'info_dict': {
2909 'id': 'UUs0ifCMCm1icqRbqhUINa0w',
2910 'title': 'Uploads from Deus Ex',
13a75688
S
2911 'uploader': 'Deus Ex',
2912 'uploader_id': 'DeusExOfficial',
5c43afd4 2913 },
cd5a74a2
S
2914 }, {
2915 'url': 'https://invidio.us/channel/UC23qupoDRn9YOAVzeoxjOQA',
2916 'only_matching': True,
66b48727
RA
2917 }, {
2918 'url': 'https://www.youtubekids.com/channel/UCyu8StPfZWapR6rfW_JgqcA',
2919 'only_matching': True,
cdc628a4 2920 }]
c5e8d7af 2921
e462474e
S
@classmethod
def suitable(cls, url):
    """Return False for URLs that YoutubePlaylistsIE or YoutubeLiveIE
    accept; otherwise defer to the parent class's ``suitable``."""
    if YoutubePlaylistsIE.suitable(url) or YoutubeLiveIE.suitable(url):
        return False
    return super(YoutubeChannelIE, cls).suitable(url)
e462474e 2926
9558dcec
S
2927 def _build_template_url(self, url, channel_id):
2928 return self._TEMPLATE_URL % channel_id
2929
def _real_extract(self, url):
    """Extract the videos of a channel.

    When a 'UC'-prefixed channel id can be read from the channel page, the
    work is delegated to the 'YoutubePlaylist' extractor through the
    matching 'UU' playlist.  Otherwise the channel page itself is parsed.

    Raises ExtractorError (expected) when the page yields no entries and
    carries an alert message.
    """
    channel_id = self._match_id(url)

    url = self._build_template_url(url, channel_id)

    # Channel by page listing is restricted to 35 pages of 30 items, i.e. 1050 videos total (see #5778)
    # Workaround by extracting as a playlist if managed to obtain channel playlist URL
    # otherwise fallback on channel by page extraction
    listing_page = self._download_webpage(
        url + '?view=57', channel_id,
        'Downloading channel page', fatal=False)

    channel_playlist_id = False
    if listing_page is not False:
        channel_playlist_id = self._html_search_meta(
            'channelId', listing_page, 'channel id', default=None)
        if not channel_playlist_id:
            app_url = self._html_search_meta(
                ('al:ios:url', 'twitter:app:url:iphone', 'twitter:app:url:ipad'),
                listing_page, 'channel url', default=None)
            if app_url:
                channel_playlist_id = self._search_regex(
                    r'vnd\.youtube://user/([0-9A-Za-z_-]+)',
                    app_url, 'channel id', default=None)

    if channel_playlist_id and channel_playlist_id.startswith('UC'):
        uploads_id = 'UU' + channel_playlist_id[2:]
        uploads_url = compat_urlparse.urljoin(
            url, '/playlist?list=%s' % uploads_id)
        return self.url_result(uploads_url, 'YoutubePlaylist')

    channel_page = self._download_webpage(url, channel_id, 'Downloading page #1')

    autogenerated_marker = re.search(r'''(?x)
            class="[^"]*?(?:
                channel-header-autogenerated-label|
                yt-channel-title-autogenerated
            )[^"]*"''', channel_page)
    if autogenerated_marker is not None:
        # The videos are contained in a single page
        # the ajax pages can't be used, they are empty
        entries = []
        for video_id, video_title in self.extract_videos_from_page(channel_page):
            entries.append(self.url_result(
                video_id, 'Youtube', video_id=video_id,
                video_title=video_title))
        return self.playlist_result(entries, channel_id)

    # Probe for at least one entry; an empty page may carry an alert
    # explaining why.
    try:
        next(self._entries(channel_page, channel_id))
    except StopIteration:
        alert_message = self._html_search_regex(
            r'(?s)<div[^>]+class=(["\']).*?\byt-alert-message\b.*?\1[^>]*>(?P<alert>[^<]+)</div>',
            channel_page, 'alert', default=None, group='alert')
        if alert_message:
            raise ExtractorError('Youtube said: %s' % alert_message, expected=True)

    return self.playlist_result(self._entries(channel_page, channel_id), channel_id)
c5e8d7af
PH
2986
2987
eb0f3e7e 2988class YoutubeUserIE(YoutubeChannelIE):
78caa52a 2989 IE_DESC = 'YouTube.com user videos (URL or "ytuser" keyword)'
39e7107d 2990 _VALID_URL = r'(?:(?:https?://(?:\w+\.)?youtube\.com/(?:(?P<user>user|c)/)?(?!(?:attribution_link|watch|results|shared)(?:$|[^a-z_A-Z0-9%-])))|ytuser:)(?!feed/)(?P<id>[A-Za-z0-9_%-]+)'
9558dcec 2991 _TEMPLATE_URL = 'https://www.youtube.com/%s/%s/videos'
78caa52a 2992 IE_NAME = 'youtube:user'
c5e8d7af 2993
cdc628a4
PH
2994 _TESTS = [{
2995 'url': 'https://www.youtube.com/user/TheLinuxFoundation',
2996 'playlist_mincount': 320,
2997 'info_dict': {
73c4ac2c
S
2998 'id': 'UUfX55Sx5hEFjoC3cNs6mCUQ',
2999 'title': 'Uploads from The Linux Foundation',
13a75688
S
3000 'uploader': 'The Linux Foundation',
3001 'uploader_id': 'TheLinuxFoundation',
cdc628a4 3002 }
9558dcec
S
3003 }, {
3004 # Only available via https://www.youtube.com/c/12minuteathlete/videos
3005 # but not https://www.youtube.com/user/12minuteathlete/videos
3006 'url': 'https://www.youtube.com/c/12minuteathlete/videos',
3007 'playlist_mincount': 249,
3008 'info_dict': {
3009 'id': 'UUVjM-zV6_opMDx7WYxnjZiQ',
3010 'title': 'Uploads from 12 Minute Athlete',
13a75688
S
3011 'uploader': '12 Minute Athlete',
3012 'uploader_id': 'the12minuteathlete',
9558dcec 3013 }
cdc628a4
PH
3014 }, {
3015 'url': 'ytuser:phihag',
3016 'only_matching': True,
daa0df9e
YCH
3017 }, {
3018 'url': 'https://www.youtube.com/c/gametrailers',
3019 'only_matching': True,
39e7107d
U
3020 }, {
3021 'url': 'https://www.youtube.com/c/Pawe%C5%82Zadro%C5%BCniak',
3022 'only_matching': True,
9558dcec
S
3023 }, {
3024 'url': 'https://www.youtube.com/gametrailers',
3025 'only_matching': True,
73c4ac2c 3026 }, {
0e879f43 3027 # This channel is not available, geo restricted to JP
73c4ac2c
S
3028 'url': 'https://www.youtube.com/user/kananishinoSMEJ/videos',
3029 'only_matching': True,
cdc628a4
PH
3030 }]
3031
@classmethod
def suitable(cls, url):
    """Return False when any other module-level ``Youtube*IE`` class accepts
    ``url``; otherwise defer to the parent class's ``suitable``."""
    # Don't return True if the url can be extracted with another youtube
    # extractor: this regex is too permissive and would match it as well.
    for name, klass in globals().items():
        if not (name.startswith('Youtube') and name.endswith('IE')):
            continue
        if klass is cls:
            continue
        if klass.suitable(url):
            return False
    return super(YoutubeUserIE, cls).suitable(url)
f4b05232 3041
9558dcec
S
def _build_template_url(self, url, channel_id):
    """Fill ``_TEMPLATE_URL`` from the parts of *url* matched by ``_VALID_URL``.

    The ``user`` group supplies the path prefix and falls back to
    ``'user'`` when it did not participate in the match; the ``id`` group
    supplies the name. *channel_id* is not used here.
    """
    match = re.match(self._VALID_URL, url)
    prefix = match.group('user')
    if not prefix:
        prefix = 'user'
    return self._TEMPLATE_URL % (prefix, match.group('id'))
3045
b05654f0 3046
f07e276a
S
class YoutubeLiveIE(YoutubeBaseInfoExtractor):
    """Resolve a ``.../live`` URL to a single video, or fall back to the base URL."""

    IE_DESC = 'YouTube.com live streams'
    _VALID_URL = r'(?P<base_url>https?://(?:\w+\.)?youtube\.com/(?:(?:user|channel|c)/)?(?P<id>[^/]+))/live'
    IE_NAME = 'youtube:live'

    _TESTS = [{
        'url': 'https://www.youtube.com/user/TheYoungTurks/live',
        'info_dict': {
            'id': 'a48o2S1cPoo',
            'ext': 'mp4',
            'title': 'The Young Turks - Live Main Show',
            'uploader': 'The Young Turks',
            'uploader_id': 'TheYoungTurks',
            'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/TheYoungTurks',
            'upload_date': '20150715',
            'license': 'Standard YouTube License',
            'description': 'md5:438179573adcdff3c97ebb1ee632b891',
            'categories': ['News & Politics'],
            'tags': ['Cenk Uygur (TV Program Creator)', 'The Young Turks (Award-Winning Work)', 'Talk Show (TV Genre)'],
            'like_count': int,
            'dislike_count': int,
        },
        'params': {
            'skip_download': True,
        },
    }, {
        'url': 'https://www.youtube.com/channel/UC1yBKRuGpC1tSM73A0ZjYjQ/live',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/c/CommanderVideoHq/live',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/TheYoungTurks/live',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Return a url_result for the page's video, else for the base URL.

        The page is fetched non-fatally. A video result is produced only
        when the og:type starts with ``video`` and the ``videoId`` meta tag
        holds an 11-character id; in every other case the result points at
        the URL with the trailing ``/live`` removed.
        """
        match = re.match(self._VALID_URL, url)
        channel_id = match.group('id')
        base_url = match.group('base_url')

        webpage = self._download_webpage(url, channel_id, fatal=False)
        if not webpage:
            return self.url_result(base_url)

        page_type = self._og_search_property(
            'type', webpage, 'page type', default='')
        video_id = self._html_search_meta(
            'videoId', webpage, 'video id', default=None)

        looks_like_video = (
            page_type.startswith('video')
            and video_id
            and re.match(r'^[0-9A-Za-z_-]{11}$', video_id))
        if looks_like_video:
            return self.url_result(video_id, YoutubeIE.ie_key())
        return self.url_result(base_url)
3097
3098
e462474e
S
class YoutubePlaylistsIE(YoutubePlaylistsBaseInfoExtractor):
    """Match the ``/playlists`` page of a user, channel or custom (``/c/``) URL.

    Only declarative data lives here; extraction is inherited from the
    base class.
    """

    IE_NAME = 'youtube:playlists'
    IE_DESC = 'YouTube.com user/channel playlists'
    _VALID_URL = r'https?://(?:\w+\.)?youtube\.com/(?:user|channel|c)/(?P<id>[^/]+)/playlists'

    _TESTS = [
        {
            'url': 'https://www.youtube.com/user/ThirstForScience/playlists',
            'playlist_mincount': 4,
            'info_dict': {
                'id': 'ThirstForScience',
                'title': 'ThirstForScience',
            },
        },
        {
            # with "Load more" button
            'url': 'https://www.youtube.com/user/igorkle1/playlists?view=1&sort=dd',
            'playlist_mincount': 70,
            'info_dict': {
                'id': 'igorkle1',
                'title': 'Игорь Клейнер',
            },
        },
        {
            'url': 'https://www.youtube.com/channel/UCiU1dHvZObB2iP6xkJ__Icw/playlists',
            'playlist_mincount': 17,
            'info_dict': {
                'id': 'UCiU1dHvZObB2iP6xkJ__Icw',
                'title': 'Chem Player',
            },
            'skip': 'Blocked',
        },
        {
            'url': 'https://www.youtube.com/c/ChristophLaimer/playlists',
            'only_matching': True,
        },
    ]
0c148415
S
3131
3132
870f3bfc
S
class YoutubeSearchBaseInfoExtractor(YoutubePlaylistBaseInfoExtractor):
    """Shared base for the search extractors; supplies the video-link regex."""

    # Captures the 11-character id from a /watch?v= href and, when one
    # follows in the same tag, the title attribute.
    _VIDEO_RE = r'href="\s*/watch\?v=(?P<id>[0-9A-Za-z_-]{11})(?:[^"]*"[^>]+\btitle="(?P<title>[^"]+))?'
3135
3136
class YoutubeSearchIE(SearchInfoExtractor, YoutubeSearchBaseInfoExtractor):
    """Keyword search extractor (``ytsearch`` key) that pages through results."""

    IE_DESC = 'YouTube.com searches'
    # there doesn't appear to be a real limit, for example if you search for
    # 'python' you get more than 8.000.000 results
    _MAX_RESULTS = float('inf')
    IE_NAME = 'youtube:search'
    _SEARCH_KEY = 'ytsearch'
    _EXTRA_QUERY_ARGS = {}
    _TESTS = []

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query.

        Pages are downloaded until at least *n* videos are collected, a
        page yields no videos, or no "Next" link is found. The collected
        list is cut down to *n* entries before being wrapped in a
        playlist result titled with *query*.

        Raises ExtractorError (expected) when a page carries the
        "search-message" marker, i.e. there are no video results.
        """
        videos = []

        url_query = {
            'search_query': query.encode('utf-8'),
        }
        url_query.update(self._EXTRA_QUERY_ARGS)
        result_url = 'https://www.youtube.com/results?' + compat_urllib_parse_urlencode(url_query)

        for pagenum in itertools.count(1):
            data = self._download_json(
                result_url, video_id='query "%s"' % query,
                note='Downloading page %s' % pagenum,
                errnote='Unable to download API page',
                query={'spf': 'navigate'})
            html_content = data[1]['body']['content']

            if 'class="search-message' in html_content:
                raise ExtractorError(
                    '[youtube] No video results', expected=True)

            new_videos = list(self._process_page(html_content))
            videos += new_videos
            # Stop as soon as enough videos are in hand; ">=" (not ">")
            # avoids downloading one more page when exactly n were found.
            if not new_videos or len(videos) >= n:
                break
            next_link = self._html_search_regex(
                r'href="(/results\?[^"]*\bsp=[^"]+)"[^>]*>\s*<span[^>]+class="[^"]*\byt-uix-button-content\b[^"]*"[^>]*>Next',
                html_content, 'next link', default=None)
            if next_link is None:
                break
            result_url = compat_urlparse.urljoin('https://www.youtube.com/', next_link)

        # n may be float('inf') (see _MAX_RESULTS), which cannot be used as
        # a slice bound, so only slice when there is an actual surplus.
        if len(videos) > n:
            videos = videos[:n]
        return self.playlist_result(videos, query)
75dff0ee 3185
c9ae7b95 3186
class YoutubeSearchDateIE(YoutubeSearchIE):
    """Variant of the keyword search that asks for upload-date ordering."""

    IE_DESC = 'YouTube.com searches, newest videos first'
    IE_NAME = YoutubeSearchIE.IE_NAME + ':date'
    _SEARCH_KEY = 'ytsearchdate'
    # Merged into the results-page query string by the parent class.
    _EXTRA_QUERY_ARGS = {'search_sort': 'video_date_uploaded'}
75dff0ee 3192
c9ae7b95 3193
class YoutubeSearchURLIE(YoutubeSearchBaseInfoExtractor):
    """Extract the videos listed on a ``/results?search_query=...`` page."""

    IE_NAME = 'youtube:search_url'
    IE_DESC = 'YouTube.com search URLs'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/results\?(.*?&)?(?:search_query|q)=(?P<query>[^&]+)(?:[&]|$)'
    # Captures the JSON object assigned to ytInitialData in the page source.
    _SEARCH_DATA = r'(?:window\["ytInitialData"\]|ytInitialData)\W?=\W?({.*?});'
    _TESTS = [{
        'url': 'https://www.youtube.com/results?baz=bar&search_query=youtube-dl+test+video&filters=video&lclk=video',
        'playlist_mincount': 5,
        'info_dict': {
            'title': 'youtube-dl test video',
        }
    }, {
        'url': 'https://www.youtube.com/results?q=test&sp=EgQIBBgB',
        'only_matching': True,
    }]

    def _find_videos_in_json(self, extracted):
        """Return, in traversal order, every dict in *extracted* that has a "videoId" key.

        Lists and dicts are walked depth-first. A dict carrying "videoId"
        is collected as-is and not searched any further.
        """
        def _walk(node):
            # Exact type checks: only plain lists and dicts are traversed.
            if type(node) is list:
                for child in node:
                    for hit in _walk(child):
                        yield hit
            elif type(node) is dict:
                if "videoId" in node:
                    yield node
                else:
                    for child in node.values():
                        for hit in _walk(child):
                            yield hit

        return list(_walk(extracted))

    def extract_videos_from_page_impl(self, page, ids_in_page, titles_in_page):
        """Append ids and titles found in *page* to the two parallel lists.

        An id already present is not added again; its stored title is
        replaced only when it is empty and a non-empty title was found.
        """
        raw_json = self._search_regex(self._SEARCH_DATA, page, 'ytInitialData')
        search_response = self._parse_json(raw_json, None)

        for renderer in self._find_videos_in_json(search_response):
            video_id = try_get(renderer, lambda x: x['videoId'])
            video_title = (
                try_get(renderer, lambda x: x['title']['runs'][0]['text'])
                or try_get(renderer, lambda x: x['title']['simpleText']))

            # we do not have a videoRenderer or title extraction broke
            if video_id is None or video_title is None:
                continue

            video_title = video_title.strip()

            if video_id in ids_in_page:
                position = ids_in_page.index(video_id)
                if video_title and not titles_in_page[position]:
                    titles_in_page[position] = video_title
            else:
                ids_in_page.append(video_id)
                titles_in_page.append(video_title)

    def extract_videos_from_page(self, page):
        """Return zipped (id, title) pairs for the videos found in *page*."""
        ids_in_page, titles_in_page = [], []
        self.extract_videos_from_page_impl(page, ids_in_page, titles_in_page)
        return zip(ids_in_page, titles_in_page)

    def _real_extract(self, url):
        """Download the search URL and return its videos as a playlist titled with the query."""
        encoded_query = re.match(self._VALID_URL, url).group('query')
        query = compat_urllib_parse_unquote_plus(encoded_query)
        webpage = self._download_webpage(url, query)
        entries = self._process_page(webpage)
        return self.playlist_result(entries, playlist_title=query)
c9ae7b95
PH
3267
3268
class YoutubeShowIE(YoutubePlaylistsBaseInfoExtractor):
    """Handle ``/show/<id>`` URLs by delegating to the show's playlists page."""

    IE_NAME = 'youtube:show'
    IE_DESC = 'YouTube.com (multi-season) shows'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/show/(?P<id>[^?#]*)'
    _TESTS = [{
        'url': 'https://www.youtube.com/show/airdisasters',
        'playlist_mincount': 5,
        'info_dict': {
            'id': 'airdisasters',
            'title': 'Air Disasters',
        }
    }]

    def _real_extract(self, url):
        """Rewrite *url* to the show's ``/playlists`` page and let the base class extract it."""
        show_id = self._match_id(url)
        playlists_url = 'https://www.youtube.com/show/%s/playlists' % show_id
        return super(YoutubeShowIE, self)._real_extract(playlists_url)
04cc9617
JMF
3286
3287
class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor):
    """
    Base class for feed extractors
    Subclasses must define the _FEED_NAME and _PLAYLIST_TITLE properties.
    """
    _LOGIN_REQUIRED = True
    # JSON object assigned to ytInitialData in the feed page.
    _FEED_DATA = r'(?:window\["ytInitialData"\]|ytInitialData)\W?=\W?({.*?});'
    # JSON object passed to ytcfg.set(...) in the feed page.
    _YTCFG_DATA = r"ytcfg.set\(({.*?})\)"

    @property
    def IE_NAME(self):
        return 'youtube:%s' % self._FEED_NAME

    def _real_initialize(self):
        self._login()

    def _find_videos_in_json(self, extracted):
        """Collect video dicts and the continuation data from *extracted*.

        Returns a ``(videos, continuation)`` tuple. ``videos`` lists, in
        traversal order, every dict that has a "videoId" key (such dicts
        are not searched further). ``continuation`` is the value of the
        last "nextContinuationData" key encountered, or None if there was
        none.
        """
        videos = []
        found = {}

        def _real_find(obj):
            if type(obj) is list:
                for elem in obj:
                    _real_find(elem)
            elif type(obj) is dict:
                if "videoId" in obj:
                    videos.append(obj)
                elif "nextContinuationData" in obj:
                    # Later occurrences overwrite earlier ones.
                    found["continuation"] = obj["nextContinuationData"]
                else:
                    for child in obj.values():
                        _real_find(child)

        _real_find(extracted)

        return videos, found.get("continuation")

    def _entries(self, page):
        """Yield a url_result for each not-yet-seen video of the feed.

        Starts from the ytInitialData embedded in *page* and keeps
        requesting continuation pages from ``browse_ajax`` until a page
        contributes no new video, or until either the continuation data or
        the ytcfg data is unavailable.
        """
        yt_conf = self._parse_json(
            self._search_regex(self._YTCFG_DATA, page, 'ytcfg.set', default="null"),
            None, fatal=False)

        search_response = self._parse_json(
            self._search_regex(self._FEED_DATA, page, 'ytInitialData'), None)

        # Ids yielded on earlier pages; a set keeps the lookup constant-time
        # instead of rescanning every previously seen video.
        seen_ids = set()

        for page_num in itertools.count(1):
            video_info, continuation = self._find_videos_in_json(search_response)

            new_info = []
            for v in video_info:
                # Only string ids are usable (and hashable for the set).
                v_id = try_get(v, lambda x: x['videoId'], compat_str)
                if v_id and v_id not in seen_ids:
                    new_info.append(v)

            if not new_info:
                break

            seen_ids.update(v['videoId'] for v in new_info)

            for video in new_info:
                yield self.url_result(
                    video['videoId'], YoutubeIE.ie_key(),
                    video_title=try_get(video, lambda x: x['title']['runs'][0]['text'])
                    or try_get(video, lambda x: x['title']['simpleText']))

            if not continuation or not yt_conf:
                break

            header_sources = {
                "X-YouTube-Client-Name": "INNERTUBE_CONTEXT_CLIENT_NAME",
                "X-YouTube-Client-Version": "INNERTUBE_CONTEXT_CLIENT_VERSION",
                "X-Youtube-Identity-Token": "ID_TOKEN",
                "X-YouTube-Device": "DEVICE",
                "X-YouTube-Page-CL": "PAGE_CL",
                "X-YouTube-Page-Label": "PAGE_BUILD_LABEL",
                "X-YouTube-Variants-Checksum": "VARIANTS_CHECKSUM",
            }
            headers = {}
            for header_name, conf_key in header_sources.items():
                value = try_get(yt_conf, lambda x: x[conf_key])
                # A None header value cannot be sent; leave the header out
                # when ytcfg does not provide it.
                if value is not None:
                    headers[header_name] = value

            search_response = self._download_json(
                'https://www.youtube.com/browse_ajax', self._PLAYLIST_TITLE,
                'Downloading page #%s' % page_num,
                transform_source=uppercase_escape,
                query={
                    "ctoken": try_get(continuation, lambda x: x["continuation"]),
                    "continuation": try_get(continuation, lambda x: x["continuation"]),
                    "itct": try_get(continuation, lambda x: x["clickTrackingParams"])
                },
                headers=headers)

    def _real_extract(self, url):
        """Download the feed page for ``_FEED_NAME`` and return its videos as a playlist."""
        page = self._download_webpage(
            'https://www.youtube.com/feed/%s' % self._FEED_NAME,
            self._PLAYLIST_TITLE)
        return self.playlist_result(
            self._entries(page), playlist_title=self._PLAYLIST_TITLE)
25f14e9f
S
3394
3395
class YoutubeWatchLaterIE(YoutubePlaylistIE):
    """Extractor for the ``WL`` (watch later) playlist and its ``:ytwatchlater`` alias."""

    IE_DESC = 'Youtube watch later list, ":ytwatchlater" for short (requires authentication)'
    IE_NAME = 'youtube:watchlater'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/(?:feed/watch_later|(?:playlist|watch)\?(?:.+&)?list=WL)|:ytwatchlater'

    _TESTS = [
        {
            'url': 'https://www.youtube.com/playlist?list=WL',
            'only_matching': True,
        },
        {
            'url': 'https://www.youtube.com/watch?v=bCNU9TrbiRk&index=1&list=WL',
            'only_matching': True,
        },
    ]

    def _real_extract(self, url):
        """Return the single video if the helper yields one, otherwise the WL playlist."""
        single_video = self._check_download_just_video(url, 'WL')[1]
        if single_video:
            return single_video
        return self._extract_playlist('WL')[1]
f459d170 3415
5f6a1245 3416
class YoutubeFavouritesIE(YoutubeBaseInfoExtractor):
    """Resolve the favourites page to its backing playlist."""

    IE_DESC = 'YouTube.com favourite videos, ":ytfav" for short (requires authentication)'
    IE_NAME = 'youtube:favorites'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/my_favorites|:ytfav(?:ou?rites)?'
    _LOGIN_REQUIRED = True

    def _real_extract(self, url):
        """Fetch ``/my_favorites``, read the ``list=`` id from it and hand off to the playlist extractor."""
        favourites_page = self._download_webpage(
            'https://www.youtube.com/my_favorites', 'Youtube Favourites videos')
        list_id = self._search_regex(
            r'list=(.+?)["&]', favourites_page, 'favourites playlist id')
        return self.url_result(list_id, 'YoutubePlaylist')
15870e90
PH
3427
3428
25f14e9f
S
class YoutubeRecommendedIE(YoutubeFeedsInfoExtractor):
    """Feed extractor configured for the ``recommended`` feed."""

    _FEED_NAME = 'recommended'
    _PLAYLIST_TITLE = 'Youtube Recommended videos'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/feed/recommended|:ytrec(?:ommended)?'
    IE_DESC = 'YouTube.com recommended videos, ":ytrec" for short (requires authentication)'
1ed5b5c9 3434
1ed5b5c9 3435
25f14e9f
S
class YoutubeSubscriptionsIE(YoutubeFeedsInfoExtractor):
    """Feed extractor configured for the ``subscriptions`` feed."""

    _FEED_NAME = 'subscriptions'
    _PLAYLIST_TITLE = 'Youtube Subscriptions'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/feed/subscriptions|:ytsubs(?:criptions)?'
    IE_DESC = 'YouTube.com subscriptions feed, "ytsubs" keyword (requires authentication)'
1ed5b5c9 3441
1ed5b5c9 3442
25f14e9f
S
class YoutubeHistoryIE(YoutubeFeedsInfoExtractor):
    """Feed extractor configured for the ``history`` feed."""

    _FEED_NAME = 'history'
    _PLAYLIST_TITLE = 'Youtube History'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/feed/history|:ythistory'
    IE_DESC = 'Youtube watch history, ":ythistory" for short (requires authentication)'
1ed5b5c9
JMF
3448
3449
15870e90
PH
class YoutubeTruncatedURLIE(InfoExtractor):
    """Match watch/attribution URLs that carry no video id and explain the likely cause."""

    IE_NAME = 'youtube:truncated_url'
    IE_DESC = False  # Do not list
    # Verbose-mode pattern: a watch URL with at most one non-video
    # parameter, or an attribution link with only its "a" parameter.
    _VALID_URL = r'''(?x)
        (?:https?://)?
        (?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie)?\.com/
        (?:watch\?(?:
            feature=[a-z_]+|
            annotation_id=annotation_[^&]+|
            x-yt-cl=[0-9]+|
            hl=[^&]*|
            t=[0-9]+
        )?
        |
            attribution_link\?a=[^&]+
        )
        $
    '''

    _TESTS = [
        {
            'url': 'https://www.youtube.com/watch?annotation_id=annotation_3951667041',
            'only_matching': True,
        },
        {
            'url': 'https://www.youtube.com/watch?',
            'only_matching': True,
        },
        {
            'url': 'https://www.youtube.com/watch?x-yt-cl=84503534',
            'only_matching': True,
        },
        {
            'url': 'https://www.youtube.com/watch?feature=foo',
            'only_matching': True,
        },
        {
            'url': 'https://www.youtube.com/watch?hl=en-GB',
            'only_matching': True,
        },
        {
            'url': 'https://www.youtube.com/watch?t=2372',
            'only_matching': True,
        },
    ]

    def _real_extract(self, url):
        """Always raise an expected ExtractorError advising the user to quote the URL."""
        advice = (
            'Did you forget to quote the URL? Remember that & is a meta '
            'character in most shells, so you want to put the URL in quotes, '
            'like  youtube-dl '
            '"https://www.youtube.com/watch?feature=foo&v=BaW_jenozKc" '
            ' or simply  youtube-dl BaW_jenozKc  .')
        raise ExtractorError(advice, expected=True)
772fd5cc
PH
3497
3498
class YoutubeTruncatedIDIE(InfoExtractor):
    """Match watch URLs whose ``v`` value is 1-10 characters long and report them as truncated."""

    IE_NAME = 'youtube:truncated_id'
    IE_DESC = False  # Do not list
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/watch\?v=(?P<id>[0-9A-Za-z_-]{1,10})$'

    _TESTS = [
        {
            'url': 'https://www.youtube.com/watch?v=N_708QY7Ob',
            'only_matching': True,
        },
    ]

    def _real_extract(self, url):
        """Always raise an expected ExtractorError naming the incomplete id and the URL."""
        short_id = self._match_id(url)
        message = 'Incomplete YouTube ID %s. URL %s looks truncated.' % (short_id, url)
        raise ExtractorError(message, expected=True)