]> jfr.im git - yt-dlp.git/blame - youtube_dl/extractor/youtube.py
[README.md] Bind info dict URLs to a fixed blob (closes #18492)
[yt-dlp.git] / youtube_dl / extractor / youtube.py
CommitLineData
c5e8d7af 1# coding: utf-8
c5e8d7af 2
78caa52a
PH
3from __future__ import unicode_literals
4
5
0ca96d48 6import itertools
c5e8d7af 7import json
c4417ddb 8import os.path
d77ab8e2 9import random
c5e8d7af 10import re
42939b61 11import time
e0df6211 12import traceback
c5e8d7af 13
b05654f0 14from .common import InfoExtractor, SearchInfoExtractor
2b25cb5d 15from ..jsinterp import JSInterpreter
54256267 16from ..swfinterp import SWFInterpreter
4bb4a188 17from ..compat import (
edf3e38e 18 compat_chr,
8d81f3e3 19 compat_kwargs,
c5e8d7af 20 compat_parse_qs,
7fd002c0
S
21 compat_urllib_parse_unquote,
22 compat_urllib_parse_unquote_plus,
15707c7e 23 compat_urllib_parse_urlencode,
7c80519c 24 compat_urllib_parse_urlparse,
7c61bd36 25 compat_urlparse,
c5e8d7af 26 compat_str,
4bb4a188
PH
27)
28from ..utils import (
c5e8d7af 29 clean_html,
9b9c5355 30 error_to_compat_str,
c5e8d7af 31 ExtractorError,
2d30521a 32 float_or_none,
4bb4a188
PH
33 get_element_by_attribute,
34 get_element_by_id,
dd27fd17 35 int_or_none,
94278f72 36 mimetype2ext,
4bb4a188 37 orderedSet,
6310acf5 38 parse_codecs,
7c80519c 39 parse_duration,
54fc90aa 40 qualities,
0cb58b02 41 remove_quotes,
3995d37d 42 remove_start,
cf7e015f 43 smuggle_url,
dbdaaa23 44 str_or_none,
c93d53f5 45 str_to_int,
556dbe7f 46 try_get,
c5e8d7af
PH
47 unescapeHTML,
48 unified_strdate,
cf7e015f 49 unsmuggle_url,
81c2f20b 50 uppercase_escape,
6e6bc8da 51 urlencode_postdata,
c5e8d7af
PH
52)
53
5f6a1245 54
class YoutubeBaseInfoExtractor(InfoExtractor):
    """Provide base functions for Youtube extractors"""
    _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
    _TWOFACTOR_URL = 'https://accounts.google.com/signin/challenge'

    _LOOKUP_URL = 'https://accounts.google.com/_/signin/sl/lookup'
    _CHALLENGE_URL = 'https://accounts.google.com/_/signin/sl/challenge'
    _TFA_URL = 'https://accounts.google.com/_/signin/challenge?hl=en&TL={0}'

    _NETRC_MACHINE = 'youtube'
    # If True it will raise an error if no login info is provided
    _LOGIN_REQUIRED = False

    _PLAYLIST_ID_RE = r'(?:PL|LL|EC|UU|FL|RD|UL|TL|OLAK5uy_)[0-9A-Za-z-_]{10,}'

    def _set_language(self):
        """Set the PREF cookie on .youtube.com with hl=en."""
        self._set_cookie(
            '.youtube.com', 'PREF', 'f1=50000000&hl=en',
            # YouTube sets the expire time to about two months
            expire_time=time.time() + 2 * 30 * 24 * 3600)

    def _ids_to_results(self, ids):
        """Wrap every video ID in `ids` into a url_result for the 'Youtube' extractor."""
        return [
            self.url_result(vid_id, 'Youtube', video_id=vid_id)
            for vid_id in ids]

    def _login(self):
        """
        Attempt to log in to YouTube.
        True is returned if successful or skipped.
        False is returned if login failed.

        If _LOGIN_REQUIRED is set and no authentication was provided, an error is raised.
        """
        username, password = self._get_login_info()
        # No authentication to be performed
        if username is None:
            if self._LOGIN_REQUIRED and self._downloader.params.get('cookiefile') is None:
                raise ExtractorError('No login info available, needed for using %s.' % self.IE_NAME, expected=True)
            return True

        login_page = self._download_webpage(
            self._LOGIN_URL, None,
            note='Downloading login page',
            errnote='unable to fetch login page', fatal=False)
        if login_page is False:
            # Explicit False (not None) so the result matches the docstring
            return False

        login_form = self._hidden_inputs(login_page)

        def req(url, f_req, note, errnote):
            # POST the hidden login form fields plus the JSON-encoded f.req
            # payload; returns the parsed JSON or False on failure (fatal=False).
            data = login_form.copy()
            data.update({
                'pstMsg': 1,
                'checkConnection': 'youtube',
                'checkedDomains': 'youtube',
                'hl': 'en',
                'deviceinfo': '[null,null,null,[],null,"US",null,null,[],"GlifWebSignIn",null,[null,null,[]]]',
                'f.req': json.dumps(f_req),
                'flowName': 'GlifWebSignIn',
                'flowEntry': 'ServiceLogin',
            })
            return self._download_json(
                url, None, note=note, errnote=errnote,
                # Drop everything in front of the first '[' before JSON parsing
                transform_source=lambda s: re.sub(r'^[^[]*', '', s),
                fatal=False,
                data=urlencode_postdata(data), headers={
                    'Content-Type': 'application/x-www-form-urlencoded;charset=utf-8',
                    'Google-Accounts-XSRF': 1,
                })

        def warn(message):
            self._downloader.report_warning(message)

        # The same continue URL is embedded in both the lookup and the
        # challenge request payloads.
        service_login_url = 'https://accounts.google.com/ServiceLogin?passive=true&continue=https%3A%2F%2Fwww.youtube.com%2Fsignin%3Fnext%3D%252F%26action_handle_signin%3Dtrue%26hl%3Den%26app%3Ddesktop%26feature%3Dsign_in_button&hl=en&service=youtube&uilel=3&requestPath=%2FServiceLogin&Page=PasswordSeparationSignIn'

        lookup_req = [
            username,
            None, [], None, 'US', None, None, 2, False, True,
            [
                None, None,
                [2, 1, None, 1,
                 service_login_url,
                 None, [], 4],
                1, [None, None, []], None, None, None, True
            ],
            username,
        ]

        lookup_results = req(
            self._LOOKUP_URL, lookup_req,
            'Looking up account info', 'Unable to look up account info')

        if lookup_results is False:
            return False

        user_hash = try_get(lookup_results, lambda x: x[0][2], compat_str)
        if not user_hash:
            warn('Unable to extract user hash')
            return False

        challenge_req = [
            user_hash,
            None, 1, None, [1, None, None, None, [password, None, True]],
            [
                None, None, [2, 1, None, 1, service_login_url, None, [], 4],
                1, [None, None, []], None, None, None, True
            ]]

        challenge_results = req(
            self._CHALLENGE_URL, challenge_req,
            'Logging in', 'Unable to log in')

        if challenge_results is False:
            # Explicit False (not None) so the result matches the docstring
            return False

        login_res = try_get(challenge_results, lambda x: x[0][5], list)
        if login_res:
            login_msg = try_get(login_res, lambda x: x[5], compat_str)
            # Parenthesised so the prefix is kept for every message, not only
            # for the INCORRECT_ANSWER_ENTERED case
            warn('Unable to login: %s' % (
                'Invalid password'
                if login_msg == 'INCORRECT_ANSWER_ENTERED' else login_msg))
            return False

        res = try_get(challenge_results, lambda x: x[0][-1], list)
        if not res:
            warn('Unable to extract result entry')
            return False

        login_challenge = try_get(res, lambda x: x[0][0], list)
        if login_challenge:
            challenge_str = try_get(login_challenge, lambda x: x[2], compat_str)
            if challenge_str == 'TWO_STEP_VERIFICATION':
                # SEND_SUCCESS - TFA code has been successfully sent to phone
                # QUOTA_EXCEEDED - reached the limit of TFA codes
                status = try_get(login_challenge, lambda x: x[5], compat_str)
                if status == 'QUOTA_EXCEEDED':
                    warn('Exceeded the limit of TFA codes, try later')
                    return False

                tl = try_get(challenge_results, lambda x: x[1][2], compat_str)
                if not tl:
                    warn('Unable to extract TL')
                    return False

                tfa_code = self._get_tfa_info('2-step verification code')

                if not tfa_code:
                    warn(
                        'Two-factor authentication required. Provide it either interactively or with --twofactor <code> '
                        '(Note that only TOTP (Google Authenticator App) codes work at this time.)')
                    return False

                tfa_code = remove_start(tfa_code, 'G-')

                tfa_req = [
                    user_hash, None, 2, None,
                    [
                        9, None, None, None, None, None, None, None,
                        [None, tfa_code, True, 2]
                    ]]

                tfa_results = req(
                    self._TFA_URL.format(tl), tfa_req,
                    'Submitting TFA code', 'Unable to submit TFA code')

                if tfa_results is False:
                    return False

                tfa_res = try_get(tfa_results, lambda x: x[0][5], list)
                if tfa_res:
                    tfa_msg = try_get(tfa_res, lambda x: x[5], compat_str)
                    # Parenthesised for the same reason as the password
                    # warning above
                    warn('Unable to finish TFA: %s' % (
                        'Invalid TFA code'
                        if tfa_msg == 'INCORRECT_ANSWER_ENTERED' else tfa_msg))
                    return False

                check_cookie_url = try_get(
                    tfa_results, lambda x: x[0][-1][2], compat_str)
            else:
                CHALLENGES = {
                    'LOGIN_CHALLENGE': "This device isn't recognized. For your security, Google wants to make sure it's really you.",
                    'USERNAME_RECOVERY': 'Please provide additional information to aid in the recovery process.',
                    'REAUTH': "There is something unusual about your activity. For your security, Google wants to make sure it's really you.",
                }
                challenge = CHALLENGES.get(
                    challenge_str,
                    '%s returned error %s.' % (self.IE_NAME, challenge_str))
                warn('%s\nGo to https://accounts.google.com/, login and solve a challenge.' % challenge)
                return False
        else:
            check_cookie_url = try_get(res, lambda x: x[2], compat_str)

        if not check_cookie_url:
            warn('Unable to extract CheckCookie URL')
            return False

        check_cookie_results = self._download_webpage(
            check_cookie_url, None, 'Checking cookie', fatal=False)

        if check_cookie_results is False:
            return False

        if 'https://myaccount.google.com/' not in check_cookie_results:
            warn('Unable to log in')
            return False

        return True

    def _download_webpage_handle(self, *args, **kwargs):
        """Delegate to the parent implementation with disable_polymer=true added to the query."""
        # Work on a copy so the caller's query dict is never mutated
        query = kwargs.get('query', {}).copy()
        query['disable_polymer'] = 'true'
        kwargs['query'] = query
        return super(YoutubeBaseInfoExtractor, self)._download_webpage_handle(
            *args, **compat_kwargs(kwargs))

    def _real_initialize(self):
        """Set the language cookie and attempt to log in; does nothing without a downloader."""
        if self._downloader is None:
            return
        self._set_language()
        if not self._login():
            return
c5e8d7af 275
8377574c 276
class YoutubeEntryListBaseInfoExtractor(YoutubeBaseInfoExtractor):
    # Extract entries from page with "Load more" button
    def _entries(self, page, playlist_id):
        """Yield the entries of `page` and of every follow-up "Load more" page.

        Each chunk of HTML is handed to self._process_page(); the next chunk
        is fetched from the data-uix-load-more-href link found in the
        current "load more" widget HTML. Iteration stops when no such link
        is present or when the downloaded content is blank.
        """
        current_html = page
        widget_html = page
        page_num = 0
        while True:
            page_num += 1
            for entry in self._process_page(current_html):
                yield entry

            load_more = re.search(r'data-uix-load-more-href="/?(?P<more>[^"]+)"', widget_html)
            if load_more is None:
                return

            more = self._download_json(
                'https://youtube.com/%s' % load_more.group('more'), playlist_id,
                'Downloading page #%s' % page_num,
                transform_source=uppercase_escape)
            current_html = more['content_html']
            if not current_html.strip():
                # Some webpages show a "Load more" button but they don't
                # have more videos
                return
            widget_html = more['load_more_widget_html']
299
061a75ed
S
300
class YoutubePlaylistBaseInfoExtractor(YoutubeEntryListBaseInfoExtractor):
    def _process_page(self, content):
        """Yield a 'Youtube' url_result for every (id, title) pair found in `content`."""
        for video_id, video_title in self.extract_videos_from_page(content):
            yield self.url_result(video_id, 'Youtube', video_id, video_title)

    def extract_videos_from_page(self, page):
        """Collect unique video IDs and their titles from `page`.

        Every match of self._VIDEO_RE contributes its 'id' and 'title'
        groups. IDs keep the order of their first occurrence; when an ID
        shows up again, its title is only filled in if the stored one is
        still empty.

        Returns zip(ids, titles).
        """
        ids_in_page = []
        titles_in_page = []
        # Maps each already-seen ID to its position in the two lists above,
        # so duplicates are detected without rescanning the ID list.
        position_by_id = {}
        for mobj in re.finditer(self._VIDEO_RE, page):
            # The link with index 0 is not the first video of the playlist (not sure if still actual)
            if 'index' in mobj.groupdict() and mobj.group('id') == '0':
                continue
            video_id = mobj.group('id')
            video_title = unescapeHTML(mobj.group('title'))
            if video_title:
                video_title = video_title.strip()
            idx = position_by_id.get(video_id)
            if idx is None:
                position_by_id[video_id] = len(ids_in_page)
                ids_in_page.append(video_id)
                titles_in_page.append(video_title)
            elif video_title and not titles_in_page[idx]:
                titles_in_page[idx] = video_title
        return zip(ids_in_page, titles_in_page)
325
326
061a75ed
S
class YoutubePlaylistsBaseInfoExtractor(YoutubeEntryListBaseInfoExtractor):
    def _process_page(self, content):
        """Yield a 'YoutubePlaylist' url_result for each distinct playlist link in `content`."""
        found_ids = re.findall(
            r'<h3[^>]+class="[^"]*yt-lockup-title[^"]*"[^>]*><a[^>]+href="/?playlist\?list=([0-9A-Za-z-_]{10,})"',
            content)
        for playlist_id in orderedSet(found_ids):
            playlist_url = 'https://www.youtube.com/playlist?list=%s' % playlist_id
            yield self.url_result(playlist_url, 'YoutubePlaylist')

    def _real_extract(self, url):
        """Download the page at `url` and return a playlist result built from its entries."""
        playlist_id = self._match_id(url)
        webpage = self._download_webpage(url, playlist_id)
        # The title is optional: a missing og:title is not fatal
        title = self._og_search_title(webpage, fatal=False)
        entries = self._entries(webpage, playlist_id)
        return self.playlist_result(entries, playlist_id, title)
0c148415
S
340
341
360e1ca5 342class YoutubeIE(YoutubeBaseInfoExtractor):
78caa52a 343 IE_DESC = 'YouTube.com'
cb7dfeea 344 _VALID_URL = r"""(?x)^
c5e8d7af 345 (
edb53e2d 346 (?:https?://|//) # http(s):// or protocol-independent URL
cb7dfeea 347 (?:(?:(?:(?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie)?\.com/|
484aaeb2 348 (?:www\.)?deturl\.com/www\.youtube\.com/|
e70dc1d1 349 (?:www\.)?pwnyoutube\.com/|
8b561bfc 350 (?:www\.)?hooktube\.com/|
f7000f3a 351 (?:www\.)?yourepeat\.com/|
e69ae5b9 352 tube\.majestyc\.net/|
cd5a74a2 353 (?:www\.)?invidio\.us/|
e69ae5b9 354 youtube\.googleapis\.com/) # the various hostnames, with wildcard subdomains
c5e8d7af
PH
355 (?:.*?\#/)? # handle anchor (#/) redirect urls
356 (?: # the various things that can precede the ID:
ac7553d0 357 (?:(?:v|embed|e)/(?!videoseries)) # v/ or embed/ or e/
c5e8d7af 358 |(?: # or the v= param in all its forms
f7000f3a 359 (?:(?:watch|movie)(?:_popup)?(?:\.php)?/?)? # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
c5e8d7af 360 (?:\?|\#!?) # the params delimiter ? or # or #!
040ac686 361 (?:.*?[&;])?? # any other preceding param (like /?s=tuff&v=xxxx or ?s=tuff&amp;v=V36LpHqtcDY)
c5e8d7af
PH
362 v=
363 )
f4b05232 364 ))
cbaed4bb
S
365 |(?:
366 youtu\.be| # just youtu.be/xxxx
6d4fc66b
S
367 vid\.plus| # or vid.plus/xxxx
368 zwearz\.com/watch| # or zwearz.com/watch/xxxx
cbaed4bb 369 )/
edb53e2d 370 |(?:www\.)?cleanvideosearch\.com/media/action/yt/watch\?videoId=
f4b05232 371 )
c5e8d7af 372 )? # all until now is optional -> you can pass the naked ID
8963d9c2 373 ([0-9A-Za-z_-]{11}) # here is it! the YouTube video ID
d0ba5587
S
374 (?!.*?\blist=
375 (?:
376 %(playlist_id)s| # combined list/video URLs are handled by the playlist IE
377 WL # WL are handled by the watch later IE
378 )
379 )
c5e8d7af 380 (?(1).+)? # if we found the ID, everything can follow
d0ba5587 381 $""" % {'playlist_id': YoutubeBaseInfoExtractor._PLAYLIST_ID_RE}
c5e8d7af 382 _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
2c62dc26 383 _formats = {
c2d3cb4c 384 '5': {'ext': 'flv', 'width': 400, 'height': 240, 'acodec': 'mp3', 'abr': 64, 'vcodec': 'h263'},
385 '6': {'ext': 'flv', 'width': 450, 'height': 270, 'acodec': 'mp3', 'abr': 64, 'vcodec': 'h263'},
386 '13': {'ext': '3gp', 'acodec': 'aac', 'vcodec': 'mp4v'},
387 '17': {'ext': '3gp', 'width': 176, 'height': 144, 'acodec': 'aac', 'abr': 24, 'vcodec': 'mp4v'},
388 '18': {'ext': 'mp4', 'width': 640, 'height': 360, 'acodec': 'aac', 'abr': 96, 'vcodec': 'h264'},
389 '22': {'ext': 'mp4', 'width': 1280, 'height': 720, 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264'},
390 '34': {'ext': 'flv', 'width': 640, 'height': 360, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'},
391 '35': {'ext': 'flv', 'width': 854, 'height': 480, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'},
3834d3e3 392 # itag 36 videos are either 320x180 (BaW_jenozKc) or 320x240 (__2ABJjxzNo), abr varies as well
c2d3cb4c 393 '36': {'ext': '3gp', 'width': 320, 'acodec': 'aac', 'vcodec': 'mp4v'},
394 '37': {'ext': 'mp4', 'width': 1920, 'height': 1080, 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264'},
395 '38': {'ext': 'mp4', 'width': 4096, 'height': 3072, 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264'},
396 '43': {'ext': 'webm', 'width': 640, 'height': 360, 'acodec': 'vorbis', 'abr': 128, 'vcodec': 'vp8'},
397 '44': {'ext': 'webm', 'width': 854, 'height': 480, 'acodec': 'vorbis', 'abr': 128, 'vcodec': 'vp8'},
398 '45': {'ext': 'webm', 'width': 1280, 'height': 720, 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8'},
e1a0bfdf 399 '46': {'ext': 'webm', 'width': 1920, 'height': 1080, 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8'},
c2d3cb4c 400 '59': {'ext': 'mp4', 'width': 854, 'height': 480, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'},
401 '78': {'ext': 'mp4', 'width': 854, 'height': 480, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'},
e1a0bfdf 402
403
404 # 3D videos
c2d3cb4c 405 '82': {'ext': 'mp4', 'height': 360, 'format_note': '3D', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -20},
406 '83': {'ext': 'mp4', 'height': 480, 'format_note': '3D', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -20},
407 '84': {'ext': 'mp4', 'height': 720, 'format_note': '3D', 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264', 'preference': -20},
408 '85': {'ext': 'mp4', 'height': 1080, 'format_note': '3D', 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264', 'preference': -20},
e1a0bfdf 409 '100': {'ext': 'webm', 'height': 360, 'format_note': '3D', 'acodec': 'vorbis', 'abr': 128, 'vcodec': 'vp8', 'preference': -20},
410 '101': {'ext': 'webm', 'height': 480, 'format_note': '3D', 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8', 'preference': -20},
411 '102': {'ext': 'webm', 'height': 720, 'format_note': '3D', 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8', 'preference': -20},
836a086c 412
96fb5605 413 # Apple HTTP Live Streaming
11f12195 414 '91': {'ext': 'mp4', 'height': 144, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 48, 'vcodec': 'h264', 'preference': -10},
c2d3cb4c 415 '92': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 48, 'vcodec': 'h264', 'preference': -10},
416 '93': {'ext': 'mp4', 'height': 360, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -10},
417 '94': {'ext': 'mp4', 'height': 480, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -10},
418 '95': {'ext': 'mp4', 'height': 720, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 256, 'vcodec': 'h264', 'preference': -10},
419 '96': {'ext': 'mp4', 'height': 1080, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 256, 'vcodec': 'h264', 'preference': -10},
e1a0bfdf 420 '132': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 48, 'vcodec': 'h264', 'preference': -10},
421 '151': {'ext': 'mp4', 'height': 72, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 24, 'vcodec': 'h264', 'preference': -10},
2c62dc26
PH
422
423 # DASH mp4 video
d23028a8
S
424 '133': {'ext': 'mp4', 'height': 240, 'format_note': 'DASH video', 'vcodec': 'h264'},
425 '134': {'ext': 'mp4', 'height': 360, 'format_note': 'DASH video', 'vcodec': 'h264'},
426 '135': {'ext': 'mp4', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'h264'},
427 '136': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'h264'},
428 '137': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'h264'},
429 '138': {'ext': 'mp4', 'format_note': 'DASH video', 'vcodec': 'h264'}, # Height can vary (https://github.com/rg3/youtube-dl/issues/4559)
430 '160': {'ext': 'mp4', 'height': 144, 'format_note': 'DASH video', 'vcodec': 'h264'},
431 '212': {'ext': 'mp4', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'h264'},
432 '264': {'ext': 'mp4', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'h264'},
433 '298': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'h264', 'fps': 60},
434 '299': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'h264', 'fps': 60},
435 '266': {'ext': 'mp4', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'h264'},
836a086c 436
f6f1fc92 437 # Dash mp4 audio
d23028a8
S
438 '139': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 48, 'container': 'm4a_dash'},
439 '140': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 128, 'container': 'm4a_dash'},
440 '141': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 256, 'container': 'm4a_dash'},
441 '256': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'container': 'm4a_dash'},
442 '258': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'container': 'm4a_dash'},
443 '325': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'dtse', 'container': 'm4a_dash'},
444 '328': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'ec-3', 'container': 'm4a_dash'},
836a086c
AZ
445
446 # Dash webm
d23028a8
S
447 '167': {'ext': 'webm', 'height': 360, 'width': 640, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
448 '168': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
449 '169': {'ext': 'webm', 'height': 720, 'width': 1280, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
450 '170': {'ext': 'webm', 'height': 1080, 'width': 1920, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
451 '218': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
452 '219': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
453 '278': {'ext': 'webm', 'height': 144, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp9'},
454 '242': {'ext': 'webm', 'height': 240, 'format_note': 'DASH video', 'vcodec': 'vp9'},
455 '243': {'ext': 'webm', 'height': 360, 'format_note': 'DASH video', 'vcodec': 'vp9'},
456 '244': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'vp9'},
457 '245': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'vp9'},
458 '246': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'vp9'},
459 '247': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'vp9'},
460 '248': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'vp9'},
461 '271': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'vp9'},
4c6b4764 462 # itag 272 videos are either 3840x2160 (e.g. RtoitU2A-3E) or 7680x4320 (sLprVF6d7Ug)
d23028a8
S
463 '272': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'vp9'},
464 '302': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60},
465 '303': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60},
466 '308': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60},
467 '313': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'vp9'},
468 '315': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60},
2c62dc26
PH
469
470 # Dash webm audio
d23028a8
S
471 '171': {'ext': 'webm', 'acodec': 'vorbis', 'format_note': 'DASH audio', 'abr': 128},
472 '172': {'ext': 'webm', 'acodec': 'vorbis', 'format_note': 'DASH audio', 'abr': 256},
ce6b9a2d 473
0857baad 474 # Dash webm audio with opus inside
d23028a8
S
475 '249': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 50},
476 '250': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 70},
477 '251': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 160},
0857baad 478
ce6b9a2d
PH
479 # RTMP (unnamed)
480 '_rtmp': {'protocol': 'rtmp'},
c5e8d7af 481 }
23d17e4b 482 _SUBTITLE_FORMATS = ('ttml', 'vtt')
836a086c 483
fd5c4aab
S
484 _GEO_BYPASS = False
485
78caa52a 486 IE_NAME = 'youtube'
2eb88d95
PH
487 _TESTS = [
488 {
2d3d2997 489 'url': 'https://www.youtube.com/watch?v=BaW_jenozKc&t=1s&end=9',
4bc3a23e
PH
490 'info_dict': {
491 'id': 'BaW_jenozKc',
492 'ext': 'mp4',
493 'title': 'youtube-dl test video "\'/\\ä↭𝕐',
494 'uploader': 'Philipp Hagemeister',
495 'uploader_id': 'phihag',
ec85ded8 496 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/phihag',
dd4c4492
S
497 'channel_id': 'UCLqxVugv74EIW3VWh2NOa3Q',
498 'channel_url': r're:https?://(?:www\.)?youtube\.com/channel/UCLqxVugv74EIW3VWh2NOa3Q',
4bc3a23e 499 'upload_date': '20121002',
7caf9830 500 'license': 'Standard YouTube License',
4bc3a23e
PH
501 'description': 'test chars: "\'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de .',
502 'categories': ['Science & Technology'],
000b6b5a 503 'tags': ['youtube-dl'],
556dbe7f 504 'duration': 10,
dbdaaa23 505 'view_count': int,
3e7c1224
PH
506 'like_count': int,
507 'dislike_count': int,
7c80519c 508 'start_time': 1,
297a564b 509 'end_time': 9,
2eb88d95 510 }
0e853ca4 511 },
0e853ca4 512 {
2d3d2997 513 'url': 'https://www.youtube.com/watch?v=UxxajLWwzqY',
4bc3a23e
PH
514 'note': 'Test generic use_cipher_signature video (#897)',
515 'info_dict': {
516 'id': 'UxxajLWwzqY',
517 'ext': 'mp4',
518 'upload_date': '20120506',
519 'title': 'Icona Pop - I Love It (feat. Charli XCX) [OFFICIAL VIDEO]',
0cb58b02 520 'alt_title': 'I Love It (feat. Charli XCX)',
7caf9830 521 'description': 'md5:f3ceb5ef83a08d95b9d146f973157cc8',
000b6b5a
S
522 'tags': ['Icona Pop i love it', 'sweden', 'pop music', 'big beat records', 'big beat', 'charli',
523 'xcx', 'charli xcx', 'girls', 'hbo', 'i love it', "i don't care", 'icona', 'pop',
524 'iconic ep', 'iconic', 'love', 'it'],
556dbe7f 525 'duration': 180,
4bc3a23e
PH
526 'uploader': 'Icona Pop',
527 'uploader_id': 'IconaPop',
ec85ded8 528 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/IconaPop',
7caf9830 529 'license': 'Standard YouTube License',
0cb58b02 530 'creator': 'Icona Pop',
936784b2
S
531 'track': 'I Love It (feat. Charli XCX)',
532 'artist': 'Icona Pop',
2eb88d95 533 }
c108eb73
JMF
534 },
535 {
4bc3a23e
PH
536 'url': 'https://www.youtube.com/watch?v=07FYdnEawAQ',
537 'note': 'Test VEVO video with age protection (#956)',
538 'info_dict': {
539 'id': '07FYdnEawAQ',
540 'ext': 'mp4',
541 'upload_date': '20130703',
542 'title': 'Justin Timberlake - Tunnel Vision (Explicit)',
0cb58b02 543 'alt_title': 'Tunnel Vision',
4bc3a23e 544 'description': 'md5:64249768eec3bc4276236606ea996373',
556dbe7f 545 'duration': 419,
4bc3a23e
PH
546 'uploader': 'justintimberlakeVEVO',
547 'uploader_id': 'justintimberlakeVEVO',
ec85ded8 548 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/justintimberlakeVEVO',
7caf9830 549 'license': 'Standard YouTube License',
0cb58b02 550 'creator': 'Justin Timberlake',
7e72694b 551 'track': 'Tunnel Vision',
936784b2 552 'artist': 'Justin Timberlake',
34952f09 553 'age_limit': 18,
c108eb73
JMF
554 }
555 },
fccd3771 556 {
4bc3a23e
PH
557 'url': '//www.YouTube.com/watch?v=yZIXLfi8CZQ',
558 'note': 'Embed-only video (#1746)',
559 'info_dict': {
560 'id': 'yZIXLfi8CZQ',
561 'ext': 'mp4',
562 'upload_date': '20120608',
563 'title': 'Principal Sexually Assaults A Teacher - Episode 117 - 8th June 2012',
564 'description': 'md5:09b78bd971f1e3e289601dfba15ca4f7',
565 'uploader': 'SET India',
94bfcd23 566 'uploader_id': 'setindia',
ec85ded8 567 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/setindia',
7caf9830 568 'license': 'Standard YouTube License',
94bfcd23 569 'age_limit': 18,
fccd3771
PH
570 }
571 },
11b56058 572 {
2d3d2997 573 'url': 'https://www.youtube.com/watch?v=BaW_jenozKc&v=UxxajLWwzqY',
11b56058
PM
574 'note': 'Use the first video ID in the URL',
575 'info_dict': {
576 'id': 'BaW_jenozKc',
577 'ext': 'mp4',
578 'title': 'youtube-dl test video "\'/\\ä↭𝕐',
579 'uploader': 'Philipp Hagemeister',
580 'uploader_id': 'phihag',
ec85ded8 581 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/phihag',
11b56058 582 'upload_date': '20121002',
7caf9830 583 'license': 'Standard YouTube License',
11b56058
PM
584 'description': 'test chars: "\'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de .',
585 'categories': ['Science & Technology'],
586 'tags': ['youtube-dl'],
556dbe7f 587 'duration': 10,
dbdaaa23 588 'view_count': int,
11b56058
PM
589 'like_count': int,
590 'dislike_count': int,
34a7de29
S
591 },
592 'params': {
593 'skip_download': True,
594 },
11b56058 595 },
dd27fd17 596 {
2d3d2997 597 'url': 'https://www.youtube.com/watch?v=a9LDPn-MO4I',
4bc3a23e
PH
598 'note': '256k DASH audio (format 141) via DASH manifest',
599 'info_dict': {
600 'id': 'a9LDPn-MO4I',
601 'ext': 'm4a',
602 'upload_date': '20121002',
603 'uploader_id': '8KVIDEO',
ec85ded8 604 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/8KVIDEO',
4bc3a23e
PH
605 'description': '',
606 'uploader': '8KVIDEO',
7caf9830 607 'license': 'Standard YouTube License',
4bc3a23e 608 'title': 'UHDTV TEST 8K VIDEO.mp4'
4919603f 609 },
4bc3a23e
PH
610 'params': {
611 'youtube_include_dash_manifest': True,
612 'format': '141',
4919603f 613 },
de3c7fe0 614 'skip': 'format 141 not served anymore',
dd27fd17 615 },
3489b7d2
JMF
616 # DASH manifest with encrypted signature
617 {
78caa52a
PH
618 'url': 'https://www.youtube.com/watch?v=IB3lcPjvWLA',
619 'info_dict': {
620 'id': 'IB3lcPjvWLA',
621 'ext': 'm4a',
b766eb27 622 'title': 'Afrojack, Spree Wilson - The Spark ft. Spree Wilson',
eb6793ba 623 'description': 'md5:1900ed86ee514927b9e00fbead6969a5',
556dbe7f 624 'duration': 244,
78caa52a
PH
625 'uploader': 'AfrojackVEVO',
626 'uploader_id': 'AfrojackVEVO',
627 'upload_date': '20131011',
7caf9830 628 'license': 'Standard YouTube License',
3489b7d2 629 },
4bc3a23e 630 'params': {
78caa52a 631 'youtube_include_dash_manifest': True,
de3c7fe0 632 'format': '141/bestaudio[ext=m4a]',
3489b7d2
JMF
633 },
634 },
aaeb86f6
S
635 # JS player signature function name containing $
636 {
637 'url': 'https://www.youtube.com/watch?v=nfWlot6h_JM',
638 'info_dict': {
639 'id': 'nfWlot6h_JM',
640 'ext': 'm4a',
641 'title': 'Taylor Swift - Shake It Off',
0cb58b02 642 'alt_title': 'Shake It Off',
f57b7835 643 'description': 'md5:95f66187cd7c8b2c13eb78e1223b63c3',
556dbe7f 644 'duration': 242,
aaeb86f6
S
645 'uploader': 'TaylorSwiftVEVO',
646 'uploader_id': 'TaylorSwiftVEVO',
647 'upload_date': '20140818',
7caf9830 648 'license': 'Standard YouTube License',
0cb58b02 649 'creator': 'Taylor Swift',
aaeb86f6
S
650 },
651 'params': {
652 'youtube_include_dash_manifest': True,
de3c7fe0 653 'format': '141/bestaudio[ext=m4a]',
aaeb86f6
S
654 },
655 },
aa79ac0c
PH
656 # Controversy video
657 {
658 'url': 'https://www.youtube.com/watch?v=T4XJQO3qol8',
659 'info_dict': {
660 'id': 'T4XJQO3qol8',
661 'ext': 'mp4',
556dbe7f 662 'duration': 219,
aa79ac0c 663 'upload_date': '20100909',
eb6793ba 664 'uploader': 'TJ Kirk',
aa79ac0c 665 'uploader_id': 'TheAmazingAtheist',
ec85ded8 666 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/TheAmazingAtheist',
7caf9830 667 'license': 'Standard YouTube License',
aa79ac0c
PH
668 'title': 'Burning Everyone\'s Koran',
669 'description': 'SUBSCRIBE: http://www.youtube.com/saturninefilms\n\nEven Obama has taken a stand against freedom on this issue: http://www.huffingtonpost.com/2010/09/09/obama-gma-interview-quran_n_710282.html',
670 }
c522adb1
JMF
671 },
672 # Normal age-gate video (No vevo, embed allowed)
673 {
2d3d2997 674 'url': 'https://youtube.com/watch?v=HtVdAasjOgU',
c522adb1
JMF
675 'info_dict': {
676 'id': 'HtVdAasjOgU',
677 'ext': 'mp4',
678 'title': 'The Witcher 3: Wild Hunt - The Sword Of Destiny Trailer',
ec85ded8 679 'description': r're:(?s).{100,}About the Game\n.*?The Witcher 3: Wild Hunt.{100,}',
556dbe7f 680 'duration': 142,
c522adb1
JMF
681 'uploader': 'The Witcher',
682 'uploader_id': 'WitcherGame',
ec85ded8 683 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/WitcherGame',
c522adb1 684 'upload_date': '20140605',
7caf9830 685 'license': 'Standard YouTube License',
34952f09 686 'age_limit': 18,
c522adb1
JMF
687 },
688 },
fccae2b9
S
689 # Age-gate video with encrypted signature
690 {
2d3d2997 691 'url': 'https://www.youtube.com/watch?v=6kLq3WMV1nU',
fccae2b9
S
692 'info_dict': {
693 'id': '6kLq3WMV1nU',
eb6793ba 694 'ext': 'webm',
fccae2b9
S
695 'title': 'Dedication To My Ex (Miss That) (Lyric Video)',
696 'description': 'md5:33765bb339e1b47e7e72b5490139bb41',
eb6793ba 697 'duration': 246,
fccae2b9
S
698 'uploader': 'LloydVEVO',
699 'uploader_id': 'LloydVEVO',
ec85ded8 700 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/LloydVEVO',
fccae2b9 701 'upload_date': '20110629',
7caf9830 702 'license': 'Standard YouTube License',
34952f09 703 'age_limit': 18,
fccae2b9
S
704 },
705 },
774e208f 706 # video_info is None (https://github.com/rg3/youtube-dl/issues/4421)
7d02dcfa 707 # YouTube Red ad is not captured for creator
774e208f
PH
708 {
709 'url': '__2ABJjxzNo',
710 'info_dict': {
711 'id': '__2ABJjxzNo',
712 'ext': 'mp4',
556dbe7f 713 'duration': 266,
774e208f
PH
714 'upload_date': '20100430',
715 'uploader_id': 'deadmau5',
ec85ded8 716 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/deadmau5',
0cb58b02 717 'creator': 'deadmau5',
774e208f
PH
718 'description': 'md5:12c56784b8032162bb936a5f76d55360',
719 'uploader': 'deadmau5',
7caf9830 720 'license': 'Standard YouTube License',
774e208f 721 'title': 'Deadmau5 - Some Chords (HD)',
0cb58b02 722 'alt_title': 'Some Chords',
774e208f
PH
723 },
724 'expected_warnings': [
725 'DASH manifest missing',
726 ]
e52a40ab
PH
727 },
728 # Olympics (https://github.com/rg3/youtube-dl/issues/4431)
729 {
730 'url': 'lqQg6PlCWgI',
731 'info_dict': {
732 'id': 'lqQg6PlCWgI',
733 'ext': 'mp4',
556dbe7f 734 'duration': 6085,
90227264 735 'upload_date': '20150827',
cbe2bd91 736 'uploader_id': 'olympic',
ec85ded8 737 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/olympic',
7caf9830 738 'license': 'Standard YouTube License',
cbe2bd91 739 'description': 'HO09 - Women - GER-AUS - Hockey - 31 July 2012 - London 2012 Olympic Games',
be49068d 740 'uploader': 'Olympic',
cbe2bd91
PH
741 'title': 'Hockey - Women - GER-AUS - London 2012 Olympic Games',
742 },
743 'params': {
744 'skip_download': 'requires avconv',
e52a40ab 745 }
cbe2bd91 746 },
6271f1ca
PH
747 # Non-square pixels
748 {
749 'url': 'https://www.youtube.com/watch?v=_b-2C3KPAM0',
750 'info_dict': {
751 'id': '_b-2C3KPAM0',
752 'ext': 'mp4',
753 'stretched_ratio': 16 / 9.,
556dbe7f 754 'duration': 85,
6271f1ca
PH
755 'upload_date': '20110310',
756 'uploader_id': 'AllenMeow',
ec85ded8 757 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/AllenMeow',
6271f1ca 758 'description': 'made by Wacom from Korea | 字幕&加油添醋 by TY\'s Allen | 感謝heylisa00cavey1001同學熱情提供梗及翻譯',
eb6793ba 759 'uploader': '孫ᄋᄅ',
7caf9830 760 'license': 'Standard YouTube License',
6271f1ca
PH
761 'title': '[A-made] 變態妍字幕版 太妍 我就是這樣的人',
762 },
06b491eb
S
763 },
764 # url_encoded_fmt_stream_map is empty string
765 {
766 'url': 'qEJwOuvDf7I',
767 'info_dict': {
768 'id': 'qEJwOuvDf7I',
f57b7835 769 'ext': 'webm',
06b491eb
S
770 'title': 'Обсуждение судебной практики по выборам 14 сентября 2014 года в Санкт-Петербурге',
771 'description': '',
772 'upload_date': '20150404',
773 'uploader_id': 'spbelect',
774 'uploader': 'Наблюдатели Петербурга',
775 },
776 'params': {
777 'skip_download': 'requires avconv',
e323cf3f
S
778 },
779 'skip': 'This live event has ended.',
06b491eb 780 },
da77d856
S
781 # Extraction from multiple DASH manifests (https://github.com/rg3/youtube-dl/pull/6097)
782 {
783 'url': 'https://www.youtube.com/watch?v=FIl7x6_3R5Y',
784 'info_dict': {
785 'id': 'FIl7x6_3R5Y',
eb6793ba 786 'ext': 'webm',
da77d856
S
787 'title': 'md5:7b81415841e02ecd4313668cde88737a',
788 'description': 'md5:116377fd2963b81ec4ce64b542173306',
556dbe7f 789 'duration': 220,
da77d856
S
790 'upload_date': '20150625',
791 'uploader_id': 'dorappi2000',
ec85ded8 792 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/dorappi2000',
da77d856 793 'uploader': 'dorappi2000',
7caf9830 794 'license': 'Standard YouTube License',
eb6793ba 795 'formats': 'mincount:31',
da77d856 796 },
eb6793ba 797 'skip': 'not actual anymore',
2ee8f5d8 798 },
8a1a26ce
YCH
799 # DASH manifest with segment_list
800 {
801 'url': 'https://www.youtube.com/embed/CsmdDsKjzN8',
802 'md5': '8ce563a1d667b599d21064e982ab9e31',
803 'info_dict': {
804 'id': 'CsmdDsKjzN8',
805 'ext': 'mp4',
17ee98e1 806 'upload_date': '20150501', # According to '<meta itemprop="datePublished"', but in other places it's 20150510
8a1a26ce
YCH
807 'uploader': 'Airtek',
808 'description': 'Retransmisión en directo de la XVIII media maratón de Zaragoza.',
809 'uploader_id': 'UCzTzUmjXxxacNnL8I3m4LnQ',
7caf9830 810 'license': 'Standard YouTube License',
8a1a26ce
YCH
811 'title': 'Retransmisión XVIII Media maratón Zaragoza 2015',
812 },
813 'params': {
814 'youtube_include_dash_manifest': True,
815 'format': '135', # bestvideo
be49068d
S
816 },
817 'skip': 'This live event has ended.',
2ee8f5d8 818 },
cf7e015f
S
819 {
820 # Multifeed videos (multiple cameras), URL is for Main Camera
821 'url': 'https://www.youtube.com/watch?v=jqWvoWXjCVs',
822 'info_dict': {
823 'id': 'jqWvoWXjCVs',
824 'title': 'teamPGP: Rocket League Noob Stream',
825 'description': 'md5:dc7872fb300e143831327f1bae3af010',
826 },
827 'playlist': [{
828 'info_dict': {
829 'id': 'jqWvoWXjCVs',
830 'ext': 'mp4',
831 'title': 'teamPGP: Rocket League Noob Stream (Main Camera)',
832 'description': 'md5:dc7872fb300e143831327f1bae3af010',
556dbe7f 833 'duration': 7335,
cf7e015f
S
834 'upload_date': '20150721',
835 'uploader': 'Beer Games Beer',
836 'uploader_id': 'beergamesbeer',
ec85ded8 837 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/beergamesbeer',
7caf9830 838 'license': 'Standard YouTube License',
cf7e015f
S
839 },
840 }, {
841 'info_dict': {
842 'id': '6h8e8xoXJzg',
843 'ext': 'mp4',
844 'title': 'teamPGP: Rocket League Noob Stream (kreestuh)',
845 'description': 'md5:dc7872fb300e143831327f1bae3af010',
556dbe7f 846 'duration': 7337,
cf7e015f
S
847 'upload_date': '20150721',
848 'uploader': 'Beer Games Beer',
849 'uploader_id': 'beergamesbeer',
ec85ded8 850 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/beergamesbeer',
7caf9830 851 'license': 'Standard YouTube License',
cf7e015f
S
852 },
853 }, {
854 'info_dict': {
855 'id': 'PUOgX5z9xZw',
856 'ext': 'mp4',
857 'title': 'teamPGP: Rocket League Noob Stream (grizzle)',
858 'description': 'md5:dc7872fb300e143831327f1bae3af010',
556dbe7f 859 'duration': 7337,
cf7e015f
S
860 'upload_date': '20150721',
861 'uploader': 'Beer Games Beer',
862 'uploader_id': 'beergamesbeer',
ec85ded8 863 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/beergamesbeer',
7caf9830 864 'license': 'Standard YouTube License',
cf7e015f
S
865 },
866 }, {
867 'info_dict': {
868 'id': 'teuwxikvS5k',
869 'ext': 'mp4',
870 'title': 'teamPGP: Rocket League Noob Stream (zim)',
871 'description': 'md5:dc7872fb300e143831327f1bae3af010',
556dbe7f 872 'duration': 7334,
cf7e015f
S
873 'upload_date': '20150721',
874 'uploader': 'Beer Games Beer',
875 'uploader_id': 'beergamesbeer',
ec85ded8 876 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/beergamesbeer',
7caf9830 877 'license': 'Standard YouTube License',
cf7e015f
S
878 },
879 }],
880 'params': {
881 'skip_download': True,
882 },
cbaed4bb 883 },
f9f49d87
S
884 {
885 # Multifeed video with comma in title (see https://github.com/rg3/youtube-dl/issues/8536)
886 'url': 'https://www.youtube.com/watch?v=gVfLd0zydlo',
887 'info_dict': {
888 'id': 'gVfLd0zydlo',
889 'title': 'DevConf.cz 2016 Day 2 Workshops 1 14:00 - 15:30',
890 },
891 'playlist_count': 2,
be49068d 892 'skip': 'Not multifeed anymore',
f9f49d87 893 },
cbaed4bb 894 {
2d3d2997 895 'url': 'https://vid.plus/FlRa-iH7PGw',
cbaed4bb 896 'only_matching': True,
0e49d9a6 897 },
6d4fc66b 898 {
2d3d2997 899 'url': 'https://zwearz.com/watch/9lWxNJF-ufM/electra-woman-dyna-girl-official-trailer-grace-helbig.html',
6d4fc66b
S
900 'only_matching': True,
901 },
0e49d9a6 902 {
61f92af1 903 # Title with JS-like syntax "};" (see https://github.com/rg3/youtube-dl/issues/7468)
a8776b10
S
904 # Also tests cut-off URL expansion in video description (see
905 # https://github.com/rg3/youtube-dl/issues/1892,
906 # https://github.com/rg3/youtube-dl/issues/8164)
0e49d9a6
LL
907 'url': 'https://www.youtube.com/watch?v=lsguqyKfVQg',
908 'info_dict': {
909 'id': 'lsguqyKfVQg',
910 'ext': 'mp4',
911 'title': '{dark walk}; Loki/AC/Dishonored; collab w/Elflover21',
eb6793ba 912 'alt_title': 'Dark Walk - Position Music',
0e49d9a6 913 'description': 'md5:8085699c11dc3f597ce0410b0dcbb34a',
556dbe7f 914 'duration': 133,
0e49d9a6
LL
915 'upload_date': '20151119',
916 'uploader_id': 'IronSoulElf',
ec85ded8 917 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/IronSoulElf',
0e49d9a6 918 'uploader': 'IronSoulElf',
7caf9830 919 'license': 'Standard YouTube License',
eb6793ba
S
920 'creator': 'Todd Haberman, Daniel Law Heath and Aaron Kaplan',
921 'track': 'Dark Walk - Position Music',
922 'artist': 'Todd Haberman, Daniel Law Heath and Aaron Kaplan',
0e49d9a6
LL
923 },
924 'params': {
925 'skip_download': True,
926 },
927 },
61f92af1
S
928 {
929 # Tags with '};' (see https://github.com/rg3/youtube-dl/issues/7468)
930 'url': 'https://www.youtube.com/watch?v=Ms7iBXnlUO8',
931 'only_matching': True,
932 },
313dfc45
LL
933 {
934 # Video with yt:stretch=17:0
935 'url': 'https://www.youtube.com/watch?v=Q39EVAstoRM',
936 'info_dict': {
937 'id': 'Q39EVAstoRM',
938 'ext': 'mp4',
939 'title': 'Clash Of Clans#14 Dicas De Ataque Para CV 4',
940 'description': 'md5:ee18a25c350637c8faff806845bddee9',
941 'upload_date': '20151107',
942 'uploader_id': 'UCCr7TALkRbo3EtFzETQF1LA',
943 'uploader': 'CH GAMER DROID',
944 },
945 'params': {
946 'skip_download': True,
947 },
be49068d 948 'skip': 'This video does not exist.',
313dfc45 949 },
7caf9830
S
950 {
951 # Video licensed under Creative Commons
952 'url': 'https://www.youtube.com/watch?v=M4gD1WSo5mA',
953 'info_dict': {
954 'id': 'M4gD1WSo5mA',
955 'ext': 'mp4',
956 'title': 'md5:e41008789470fc2533a3252216f1c1d1',
957 'description': 'md5:a677553cf0840649b731a3024aeff4cc',
556dbe7f 958 'duration': 721,
7caf9830
S
959 'upload_date': '20150127',
960 'uploader_id': 'BerkmanCenter',
ec85ded8 961 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/BerkmanCenter',
556dbe7f 962 'uploader': 'The Berkman Klein Center for Internet & Society',
7caf9830
S
963 'license': 'Creative Commons Attribution license (reuse allowed)',
964 },
965 'params': {
966 'skip_download': True,
967 },
968 },
fd050249
S
969 {
970 # Channel-like uploader_url
971 'url': 'https://www.youtube.com/watch?v=eQcmzGIKrzg',
972 'info_dict': {
973 'id': 'eQcmzGIKrzg',
974 'ext': 'mp4',
975 'title': 'Democratic Socialism and Foreign Policy | Bernie Sanders',
976 'description': 'md5:dda0d780d5a6e120758d1711d062a867',
556dbe7f 977 'duration': 4060,
fd050249 978 'upload_date': '20151119',
eb6793ba 979 'uploader': 'Bernie Sanders',
fd050249 980 'uploader_id': 'UCH1dpzjCEiGAt8CXkryhkZg',
ec85ded8 981 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UCH1dpzjCEiGAt8CXkryhkZg',
fd050249
S
982 'license': 'Creative Commons Attribution license (reuse allowed)',
983 },
984 'params': {
985 'skip_download': True,
986 },
987 },
040ac686
S
988 {
989 'url': 'https://www.youtube.com/watch?feature=player_embedded&amp;amp;v=V36LpHqtcDY',
990 'only_matching': True,
7f29cf54
S
991 },
992 {
993 # YouTube Red paid video (https://github.com/rg3/youtube-dl/issues/10059)
994 'url': 'https://www.youtube.com/watch?v=i1Ko8UG-Tdo',
995 'only_matching': True,
6496ccb4
S
996 },
997 {
998 # Rental video preview
999 'url': 'https://www.youtube.com/watch?v=yYr8q0y5Jfg',
1000 'info_dict': {
1001 'id': 'uGpuVWrhIzE',
1002 'ext': 'mp4',
1003 'title': 'Piku - Trailer',
1004 'description': 'md5:c36bd60c3fd6f1954086c083c72092eb',
1005 'upload_date': '20150811',
1006 'uploader': 'FlixMatrix',
1007 'uploader_id': 'FlixMatrixKaravan',
ec85ded8 1008 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/FlixMatrixKaravan',
6496ccb4
S
1009 'license': 'Standard YouTube License',
1010 },
1011 'params': {
1012 'skip_download': True,
1013 },
eb6793ba 1014 'skip': 'This video is not available.',
022a5d66 1015 },
12afdc2a
S
1016 {
1017 # YouTube Red video with episode data
1018 'url': 'https://www.youtube.com/watch?v=iqKdEhx-dD4',
1019 'info_dict': {
1020 'id': 'iqKdEhx-dD4',
1021 'ext': 'mp4',
1022 'title': 'Isolation - Mind Field (Ep 1)',
eb6793ba 1023 'description': 'md5:25b78d2f64ae81719f5c96319889b736',
556dbe7f 1024 'duration': 2085,
12afdc2a
S
1025 'upload_date': '20170118',
1026 'uploader': 'Vsauce',
1027 'uploader_id': 'Vsauce',
1028 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/Vsauce',
1029 'license': 'Standard YouTube License',
1030 'series': 'Mind Field',
1031 'season_number': 1,
1032 'episode_number': 1,
1033 },
1034 'params': {
1035 'skip_download': True,
1036 },
1037 'expected_warnings': [
1038 'Skipping DASH manifest',
1039 ],
1040 },
c7121fa7
S
1041 {
1042 # The following content has been identified by the YouTube community
1043 # as inappropriate or offensive to some audiences.
1044 'url': 'https://www.youtube.com/watch?v=6SJNVb0GnPI',
1045 'info_dict': {
1046 'id': '6SJNVb0GnPI',
1047 'ext': 'mp4',
1048 'title': 'Race Differences in Intelligence',
1049 'description': 'md5:5d161533167390427a1f8ee89a1fc6f1',
1050 'duration': 965,
1051 'upload_date': '20140124',
1052 'uploader': 'New Century Foundation',
1053 'uploader_id': 'UCEJYpZGqgUob0zVVEaLhvVg',
1054 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UCEJYpZGqgUob0zVVEaLhvVg',
1055 'license': 'Standard YouTube License',
c7121fa7
S
1056 },
1057 'params': {
1058 'skip_download': True,
1059 },
1060 },
022a5d66
S
1061 {
1062 # itag 212
1063 'url': '1t24XAntNCY',
1064 'only_matching': True,
fd5c4aab
S
1065 },
1066 {
1067 # geo restricted to JP
1068 'url': 'sJL6WA-aGkQ',
1069 'only_matching': True,
1070 },
d0ba5587
S
1071 {
1072 'url': 'https://www.youtube.com/watch?v=MuAGGZNfUkU&list=RDMM',
1073 'only_matching': True,
1074 },
cd5a74a2
S
1075 {
1076 'url': 'https://invidio.us/watch?v=BaW_jenozKc',
1077 'only_matching': True,
1078 },
2eb88d95
PH
1079 ]
1080
e0df6211
PH
def __init__(self, *args, **kwargs):
    """Set up the extractor with an empty in-memory player cache.

    All positional and keyword arguments are forwarded unchanged to the
    parent class initialiser.
    """
    super(YoutubeIE, self).__init__(*args, **kwargs)
    # Filled lazily while signatures are being decrypted
    self._player_cache = dict()
e0df6211 1084
c5e8d7af
PH
def report_video_info_webpage_download(self, video_id):
    """Print a notice that the video info webpage is being downloaded."""
    message = '%s: Downloading video info webpage' % video_id
    self.to_screen(message)
c5e8d7af 1088
c5e8d7af
PH
def report_information_extraction(self, video_id):
    """Print a notice that video information is being extracted."""
    message = '%s: Extracting video information' % video_id
    self.to_screen(message)
c5e8d7af
PH
1092
def report_unavailable_format(self, video_id, format):
    """Print a notice that the given format is not available for the video."""
    message = '%s: Format %s not available' % (video_id, format)
    self.to_screen(message)
c5e8d7af
PH
1096
def report_rtmp_download(self):
    """Print a notice that an RTMP download has been detected."""
    message = 'RTMP download detected'
    self.to_screen(message)
c5e8d7af 1100
60064c53
PH
def _signature_cache_id(self, example_sig):
    """Return a string describing the shape of a signature.

    The signature is split on '.' and the lengths of the resulting parts
    are joined with '.' again (a signature 'abc.de' yields '3.2').
    """
    part_lengths = [compat_str(len(part)) for part in example_sig.split('.')]
    return '.'.join(part_lengths)
60064c53
PH
1104
def _extract_signature_function(self, video_id, player_url, example_sig):
    """Return a callable that decrypts signatures for the given player.

    The player id and type (file extension) are parsed out of player_url.
    A previously stored index list is looked up in the 'youtube-sigfuncs'
    cache first; on a miss the player is downloaded, parsed with the JS or
    SWF signature parser, and the resulting index list is stored in the
    cache before the function is returned.

    video_id is only passed through to the download helpers; example_sig
    determines the cache key and the length of the probe string.

    Raises ExtractorError if the player cannot be identified from the URL,
    if the derived cache id is not a plain file name, or if the player
    type is neither 'js' nor 'swf'.
    """
    id_m = re.match(
        r'.*?-(?P<id>[a-zA-Z0-9_-]+)(?:/watch_as3|/html5player(?:-new)?|(?:/[a-z]{2}_[A-Z]{2})?/base)?\.(?P<ext>[a-z]+)$',
        player_url)
    if not id_m:
        raise ExtractorError('Cannot identify player %r' % player_url)
    player_type = id_m.group('ext')
    player_id = id_m.group('id')

    # Read from filesystem cache
    func_id = '%s_%s_%s' % (
        player_type, player_id, self._signature_cache_id(example_sig))
    # Explicit check instead of an assert: asserts are removed under
    # ``python -O`` and this guards the id that is handed to the cache.
    if os.path.basename(func_id) != func_id:
        raise ExtractorError(
            'Invalid signature function id %r' % func_id)

    cache_spec = self._downloader.cache.load('youtube-sigfuncs', func_id)
    if cache_spec is not None:
        # The cached spec is a list of source indices, one per output char
        return lambda s: ''.join(s[i] for i in cache_spec)

    download_note = (
        'Downloading player %s' % player_url
        if self._downloader.params.get('verbose') else
        'Downloading %s player %s' % (player_type, player_id)
    )
    if player_type == 'js':
        code = self._download_webpage(
            player_url, video_id,
            note=download_note,
            errnote='Download of %s failed' % player_url)
        res = self._parse_sig_js(code)
    elif player_type == 'swf':
        urlh = self._request_webpage(
            player_url, video_id,
            note=download_note,
            errnote='Download of %s failed' % player_url)
        code = urlh.read()
        res = self._parse_sig_swf(code)
    else:
        # Raise explicitly so that ``res`` can never be left unbound,
        # even when assertions are disabled.
        raise ExtractorError('Invalid player type %r' % player_type)

    # Run the function on a string whose i-th character has code point i;
    # the code points of the output are then the source indices.
    test_string = ''.join(map(compat_chr, range(len(example_sig))))
    cache_res = res(test_string)
    cache_spec = [ord(c) for c in cache_res]

    self._downloader.cache.store('youtube-sigfuncs', func_id, cache_spec)
    return res
1150
def _print_sig_code(self, func, example_sig):
    """Print Python code equivalent to the signature function func.

    func is run on a probe string whose i-th character has code point i,
    so the code points of the result are the source indices. These are
    rendered as a sum of index and slice expressions on ``s`` and shown
    via to_screen, guarded by a check on the part lengths of example_sig.
    """
    def slice_expr(first, last, stride):
        # Render an inclusive index run as a Python slice of ``s``
        lower = str(first) if first != 0 else ''
        bound = last + stride
        upper = (':%d' % bound) if bound >= 0 else ':'
        tail = (':%d' % stride) if stride != 1 else ''
        return 's[%s%s%s]' % (lower, upper, tail)

    def sig_expressions(indices):
        stride = None
        # Placeholder to quelch pyflakes; run_start is always assigned
        # together with stride before it is read
        run_start = '(Never used)'
        for current, previous in zip(indices[1:], indices[:-1]):
            delta = current - previous
            if stride is None:
                if delta in (-1, 1):
                    # A run of consecutive indices begins at ``previous``
                    stride = delta
                    run_start = previous
                else:
                    yield 's[%d]' % previous
            elif delta != stride:
                # The current run ended at ``previous``
                yield slice_expr(run_start, previous, stride)
                stride = None
        if stride is None:
            yield 's[%d]' % current
        else:
            yield slice_expr(run_start, current, stride)

    probe = ''.join(map(compat_chr, range(len(example_sig))))
    permutation = [ord(ch) for ch in func(probe)]
    expression = ' + '.join(sig_expressions(permutation))
    part_lengths = ', '.join(
        compat_str(len(part)) for part in example_sig.split('.'))
    code = (
        'if tuple(len(p) for p in s.split(\'.\')) == %s:\n'
        ' return %s\n') % ('(%s)' % part_lengths, expression)
    self.to_screen('Extracted signature function:\n' + code)
edf3e38e 1189
e0df6211
PH
def _parse_sig_js(self, jscode):
    """Return a callable wrapping the signature function of a JS player.

    The function name is located in jscode with a list of regular
    expressions (tried through _search_regex), then extracted with
    JSInterpreter. The returned callable takes the signature string and
    passes it to the extracted function as a one-element argument list.
    """
    name_patterns = (
        r'(["\'])signature\1\s*,\s*(?P<sig>[a-zA-Z0-9$]+)\(',
        r'\.sig\|\|(?P<sig>[a-zA-Z0-9$]+)\(',
        r'yt\.akamaized\.net/\)\s*\|\|\s*.*?\s*c\s*&&\s*d\.set\([^,]+\s*,\s*(?P<sig>[a-zA-Z0-9$]+)\(',
        r'\bc\s*&&\s*d\.set\([^,]+\s*,\s*(?P<sig>[a-zA-Z0-9$]+)\(',
        r'\bc\s*&&\s*d\.set\([^,]+\s*,\s*\([^)]*\)\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\(',
    )
    funcname = self._search_regex(
        name_patterns, jscode,
        'Initial JS player signature function name', group='sig')

    initial_function = JSInterpreter(jscode).extract_function(funcname)
    return lambda s: initial_function([s])
1202
def _parse_sig_swf(self, file_contents):
    """Return a callable wrapping the decipher function of an SWF player.

    The 'decipher' function of the 'SignatureDecipher' class is extracted
    from file_contents with SWFInterpreter. The returned callable takes the
    signature string and passes it on as a one-element argument list.
    """
    interpreter = SWFInterpreter(file_contents)
    decipher_class = interpreter.extract_class('SignatureDecipher')
    initial_function = interpreter.extract_function(
        decipher_class, 'decipher')
    return lambda s: initial_function([s])
1209
def _decrypt_signature(self, s, video_id, player_url, age_gate=False):
    """Turn the encrypted s field into a working signature.

    player_url is normalised to an absolute URL, the matching signature
    function is taken from self._player_cache (or extracted and stored
    there), optionally printed when the 'youtube_print_sig_code' param is
    set, and finally applied to s.

    Raises ExtractorError when player_url is None or when anything goes
    wrong during extraction (the formatted traceback is included in the
    message). age_gate is accepted but not used here.
    """
    if player_url is None:
        raise ExtractorError('Cannot decrypt signature without player_url')

    if player_url.startswith('//'):
        # Protocol-relative URL
        player_url = 'https:' + player_url
    elif not re.match(r'https?://', player_url):
        # Anything else without a scheme is resolved against youtube.com
        player_url = compat_urlparse.urljoin(
            'https://www.youtube.com', player_url)

    try:
        cache_key = (player_url, self._signature_cache_id(s))
        if cache_key in self._player_cache:
            decrypt = self._player_cache[cache_key]
        else:
            decrypt = self._extract_signature_function(
                video_id, player_url, s)
            self._player_cache[cache_key] = decrypt
        if self._downloader.params.get('youtube_print_sig_code'):
            self._print_sig_code(decrypt, s)
        return decrypt(s)
    except Exception as e:
        raise ExtractorError(
            'Signature extraction failed: ' + traceback.format_exc(),
            cause=e)
e0df6211 1236
def _get_subtitles(self, video_id, webpage):
    """Return a dict mapping language codes to lists of subtitle formats.

    The track list is downloaded from video.google.com/timedtext; for the
    first track of every language one entry per extension in
    self._SUBTITLE_FORMATS is built. A warning is reported and {} is
    returned when the list cannot be downloaded or contains no tracks.
    webpage is not used here.
    """
    list_url = (
        'https://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
    try:
        subs_doc = self._download_xml(list_url, video_id, note=False)
    except ExtractorError as err:
        self._downloader.report_warning(
            'unable to download video subtitles: %s'
            % error_to_compat_str(err))
        return {}

    subtitles = {}
    for track in subs_doc.findall('track'):
        lang = track.attrib['lang_code']
        if lang in subtitles:
            # Only the first track of a language is kept
            continue
        subtitles[lang] = [{
            'url': 'https://www.youtube.com/api/timedtext?'
                   + compat_urllib_parse_urlencode({
                       'lang': lang,
                       'v': video_id,
                       'fmt': ext,
                       'name': track.attrib['name'].encode('utf-8'),
                   }),
            'ext': ext,
        } for ext in self._SUBTITLE_FORMATS]

    if subtitles:
        return subtitles
    self._downloader.report_warning('video doesn\'t have subtitles')
    return {}
1268
a72778d3
S
def _get_ytplayer_config(self, video_id, webpage):
    """Return the parsed ytplayer.config JSON found in webpage, if any.

    Returns None when no config is found; JSON parsing is non-fatal.
    """
    # User data may contain arbitrary character sequences that may affect
    # JSON extraction with regex, e.g. when '};' is contained the second
    # regex won't capture the whole JSON. This is worked around by trying
    # the more concrete regex first, until proper quoted string handling
    # replaces the workaround (see
    # https://github.com/rg3/youtube-dl/issues/7468,
    # https://github.com/rg3/youtube-dl/pull/7599)
    config_patterns = (
        r';ytplayer\.config\s*=\s*({.+?});ytplayer',
        r';ytplayer\.config\s*=\s*({.+?});',
    )
    raw_config = self._search_regex(
        config_patterns, webpage, 'ytplayer.config', default=None)
    if not raw_config:
        return None
    return self._parse_json(
        uppercase_escape(raw_config), video_id, fatal=False)
0e49d9a6 1286
def _get_automatic_captions(self, video_id, webpage):
    """Return a dict mapping language codes to automatic caption formats.

    The webpage is needed for getting the captions url, pass it as an
    argument to speed up the process. Three sources are tried in order:
    the 'ttsurl' player argument, the 'player_response' JSON string, and
    the 'caption_tracks'/'caption_translation_languages' arguments. A
    warning is reported and {} is returned when nothing can be found.
    """
    self.to_screen('%s: Looking for automatic captions' % video_id)
    player_config = self._get_ytplayer_config(video_id, webpage)
    err_msg = 'Couldn\'t find automatic captions for %s' % video_id
    if not player_config:
        self._downloader.report_warning(err_msg)
        return {}

    def make_captions(sub_url, sub_langs):
        # Build one URL per (language, format) by overriding the 'tlang'
        # and 'fmt' query parameters of the base caption URL
        parsed_sub_url = compat_urllib_parse_urlparse(sub_url)
        caption_qs = compat_parse_qs(parsed_sub_url.query)
        captions = {}
        for sub_lang in sub_langs:
            lang_formats = []
            for ext in self._SUBTITLE_FORMATS:
                caption_qs.update({
                    'tlang': [sub_lang],
                    'fmt': [ext],
                })
                new_query = compat_urllib_parse_urlencode(caption_qs, True)
                lang_formats.append({
                    'url': compat_urlparse.urlunparse(
                        parsed_sub_url._replace(query=new_query)),
                    'ext': ext,
                })
            captions[sub_lang] = lang_formats
        return captions

    try:
        args = player_config['args']
        tts_url = args.get('ttsurl')
        if tts_url:
            timestamp = args['timestamp']
            # We get the available subtitles
            list_query = compat_urllib_parse_urlencode({
                'type': 'list',
                'tlangs': 1,
                'asrs': 1,
            })
            caption_list = self._download_xml(
                tts_url + '&' + list_query, video_id)
            source_track = caption_list.find('track')
            if source_track is None:
                self._downloader.report_warning(
                    'Video doesn\'t have automatic captions')
                return {}
            source_lang = source_track.attrib['lang_code']
            caption_kind = source_track.attrib.get('kind', '')

            translated = {}
            for target in caption_list.findall('target'):
                target_lang = target.attrib['lang_code']
                translated[target_lang] = [{
                    'url': tts_url + '&' + compat_urllib_parse_urlencode({
                        'lang': source_lang,
                        'tlang': target_lang,
                        'fmt': ext,
                        'ts': timestamp,
                        'kind': caption_kind,
                    }),
                    'ext': ext,
                } for ext in self._SUBTITLE_FORMATS]
            return translated

        # New captions format as of 22.06.2017
        player_response = args.get('player_response')
        if player_response and isinstance(player_response, compat_str):
            player_response = self._parse_json(
                player_response, video_id, fatal=False)
            if player_response:
                renderer = player_response['captions']['playerCaptionsTracklistRenderer']
                base_url = renderer['captionTracks'][0]['baseUrl']
                lang_codes = [
                    code for code in (
                        lang.get('languageCode')
                        for lang in renderer['translationLanguages'])
                    if code]
                return make_captions(base_url, lang_codes)

        # Some videos don't provide ttsurl but rather caption_tracks and
        # caption_translation_languages (e.g. 20LmZk1hakA)
        # Not used anymore as of 22.06.2017
        caption_tracks = args['caption_tracks']
        caption_translation_languages = args['caption_translation_languages']
        first_track = caption_tracks.split(',')[0]
        legacy_url = compat_parse_qs(first_track)['u'][0]
        legacy_langs = []
        for lang in caption_translation_languages.split(','):
            lang_qs = compat_parse_qs(compat_urllib_parse_unquote_plus(lang))
            lang_code = lang_qs.get('lc', [None])[0]
            if lang_code:
                legacy_langs.append(lang_code)
        return make_captions(legacy_url, legacy_langs)
    # An extractor error can be raised by the download process if there are
    # no automatic captions but there are subtitles
    except (KeyError, IndexError, ExtractorError):
        self._downloader.report_warning(err_msg)
        return {}
1388
d77ab8e2
S
def _mark_watched(self, video_id, video_info):
    """Request the playback stats URL so the video is marked as watched.

    Does nothing when video_info has no 'videostats_playback_base_url'.
    The request itself is non-fatal.
    """
    base_url = video_info.get('videostats_playback_base_url', [None])[0]
    if not base_url:
        return
    parsed_url = compat_urlparse.urlparse(base_url)
    query = compat_urlparse.parse_qs(parsed_url.query)

    # cpn generation algorithm is reverse engineered from base.js.
    # In fact it works even with dummy cpn.
    CPN_ALPHABET = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789-_'
    cpn_chars = [
        CPN_ALPHABET[random.randint(0, 256) & 63] for _ in range(0, 16)]
    cpn = ''.join(cpn_chars)

    query.update({
        'ver': ['2'],
        'cpn': [cpn],
    })
    new_query = compat_urllib_parse_urlencode(query, True)
    watched_url = compat_urlparse.urlunparse(
        parsed_url._replace(query=new_query))

    self._download_webpage(
        watched_url, video_id, 'Marking watched',
        'Unable to mark watched', fatal=False)
1411
66c9fa36
S
@staticmethod
def _extract_urls(webpage):
    """Return a list of YouTube embeds found in webpage.

    Three kinds of embeds are collected, in this order: player URLs
    referenced from iframe/embed/object tags, data-video-url attributes
    and SWFObject calls; lazyYT 'data-youtube-id' values; and
    'data-video_id' values of the Wordpress "YouTube Video Importer"
    plugin. Values from the first two kinds are HTML-unescaped. The list
    may be empty.
    """
    # Embedded YouTube player
    # NOTE: the embedSWF alternative used to read ``embedSWF\(?:\s*``,
    # which requires a literal colon and therefore never matched an
    # ``embedSWF("...")`` call. The call form is matched now; the old form
    # is still accepted for backward compatibility.
    entries = [
        unescapeHTML(mobj.group('url'))
        for mobj in re.finditer(r'''(?x)
        (?:
            <iframe[^>]+?src=|
            data-video-url=|
            <embed[^>]+?src=|
            embedSWF(?:\(|\(?:)\s*|
            <object[^>]+data=|
            new\s+SWFObject\(
        )
        (["\'])
        (?P<url>(?:https?:)?//(?:www\.)?youtube(?:-nocookie)?\.com/
            (?:embed|v|p)/[0-9A-Za-z_-]{11}.*?)
        \1''', webpage)]

    # lazyYT YouTube embed
    entries.extend(list(map(
        unescapeHTML,
        re.findall(r'class="lazyYT" data-youtube-id="([^"]+)"', webpage))))

    # Wordpress "YouTube Video Importer" plugin
    matches = re.findall(r'''(?x)<div[^>]+
        class=(?P<q1>[\'"])[^\'"]*\byvii_single_video_player\b[^\'"]*(?P=q1)[^>]+
        data-video_id=(?P<q2>[\'"])([^\'"]+)(?P=q2)''', webpage)
    # Each match is a tuple of groups; the last one is the video id
    entries.extend(m[-1] for m in matches)

    return entries
1443
@staticmethod
def _extract_url(webpage):
    """Return the first YouTube embed found in webpage, or None."""
    found = YoutubeIE._extract_urls(webpage)
    if not found:
        return None
    return found[0]
1448
97665381
PH
@classmethod
def extract_id(cls, url):
    """Return the video id, i.e. the second group of cls._VALID_URL.

    Raises ExtractorError when url does not match cls._VALID_URL.
    """
    match = re.match(cls._VALID_URL, url, re.VERBOSE)
    if match is not None:
        return match.group(2)
    raise ExtractorError('Invalid URL: %s' % url)
1456
1fb07d10
JG
def _extract_annotations(self, video_id):
    """Download and return the annotations document for video_id."""
    annotations_url = (
        'https://www.youtube.com/annotations_invideo?features=1&legacy=1&video_id=%s'
        % video_id)
    return self._download_webpage(
        annotations_url, video_id,
        note='Searching for annotations.',
        errnote='Unable to download video annotations.')
1fb07d10 1460
9cafc3fd
S
1461 @staticmethod
1462 def _extract_chapters(description, duration):
1463 if not description:
1464 return None
1465 chapter_lines = re.findall(
1466 r'(?:^|<br\s*/>)([^<]*<a[^>]+onclick=["\']yt\.www\.watch\.player\.seekTo[^>]+>(\d{1,2}:\d{1,2}(?::\d{1,2})?)</a>[^>]*)(?=$|<br\s*/>)',
1467 description)
1468 if not chapter_lines:
1469 return None
1470 chapters = []
1471 for next_num, (chapter_line, time_point) in enumerate(
1472 chapter_lines, start=1):
1473 start_time = parse_duration(time_point)
1474 if start_time is None:
1475 continue
39d4c1be
S
1476 if start_time > duration:
1477 break
9cafc3fd
S
1478 end_time = (duration if next_num == len(chapter_lines)
1479 else parse_duration(chapter_lines[next_num][1]))
1480 if end_time is None:
1481 continue
39d4c1be
S
1482 if end_time > duration:
1483 end_time = duration
1484 if start_time > end_time:
1485 break
9cafc3fd
S
1486 chapter_title = re.sub(
1487 r'<a[^>]+>[^<]+</a>', '', chapter_line).strip(' \t-')
1488 chapter_title = re.sub(r'\s+', ' ', chapter_title)
1489 chapters.append({
1490 'start_time': start_time,
1491 'end_time': end_time,
1492 'title': chapter_title,
1493 })
1494 return chapters
1495
def _real_extract(self, url):
    """Extract metadata and formats for a single YouTube video URL.

    The watch page is downloaded first; video info is then collected from
    the embedded player config and/or from get_video_info requests (the
    embed page plus get_video_info are used for age-gated videos).
    Formats are built from RTMP ``conn``, the url-encoded stream maps or
    an HLS manifest, and are finally merged with any DASH manifests found.

    Returns an info dict for the video.  May instead return
    ``self.url_result(...)`` for a rental preview (``ypc_vid``) or
    ``self.playlist_result(...)`` for a multifeed video.

    Raises ExtractorError when no usable video info or formats are found;
    geo restriction is reported through ``self.raise_geo_restricted``.
    """
    url, smuggled_data = unsmuggle_url(url, {})

    proto = (
        'http' if self._downloader.params.get('prefer_insecure', False)
        else 'https')

    # Start/end offsets may be given either in the fragment or the query;
    # the fragment is consulted first.
    start_time = None
    end_time = None
    parsed_url = compat_urllib_parse_urlparse(url)
    for component in [parsed_url.fragment, parsed_url.query]:
        query = compat_parse_qs(component)
        if start_time is None and 't' in query:
            start_time = parse_duration(query['t'][0])
        if start_time is None and 'start' in query:
            start_time = parse_duration(query['start'][0])
        if end_time is None and 'end' in query:
            end_time = parse_duration(query['end'][0])

    # Extract original video URL from URL with redirection, like age verification, using next_url parameter
    mobj = re.search(self._NEXT_URL_RE, url)
    if mobj:
        url = proto + '://www.youtube.com/' + compat_urllib_parse_unquote(mobj.group(1)).lstrip('/')
    video_id = self.extract_id(url)

    # Get video webpage
    url = proto + '://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1&bpctr=9999999999' % video_id
    video_webpage = self._download_webpage(url, video_id)

    # Attempt to extract SWF player URL
    mobj = re.search(r'swfConfig.*?"(https?:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
    if mobj is not None:
        player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
    else:
        player_url = None

    dash_mpds = []

    def add_dash_mpd(video_info):
        # Collect distinct DASH manifest URLs from every video info seen
        dash_mpd = video_info.get('dashmpd')
        if dash_mpd and dash_mpd[0] not in dash_mpds:
            dash_mpds.append(dash_mpd[0])

    is_live = None
    view_count = None

    def extract_view_count(v_info):
        return int_or_none(try_get(v_info, lambda x: x['view_count'][0]))

    player_response = {}

    def parse_player_response(raw_response):
        # player_response is delivered as a JSON encoded string, both in
        # the ytplayer config args and in the get_video_info query string,
        # so it must be decoded before it can be used as a dict.
        pl_response = str_or_none(raw_response)
        if not pl_response:
            return None
        pl_response = self._parse_json(pl_response, video_id, fatal=False)
        return pl_response if isinstance(pl_response, dict) else None

    # Get video info
    embed_webpage = None
    if re.search(r'player-age-gate-content">', video_webpage) is not None:
        age_gate = True
        # We simulate the access to the video from www.youtube.com/v/{video_id}
        # this can be viewed without login into Youtube
        url = proto + '://www.youtube.com/embed/%s' % video_id
        embed_webpage = self._download_webpage(url, video_id, 'Downloading embed webpage')
        data = compat_urllib_parse_urlencode({
            'video_id': video_id,
            'eurl': 'https://youtube.googleapis.com/v/' + video_id,
            'sts': self._search_regex(
                r'"sts"\s*:\s*(\d+)', embed_webpage, 'sts', default=''),
        })
        video_info_url = proto + '://www.youtube.com/get_video_info?' + data
        video_info_webpage = self._download_webpage(
            video_info_url, video_id,
            note='Refetching age-gated info webpage',
            errnote='unable to download video info webpage')
        video_info = compat_parse_qs(video_info_webpage)
        add_dash_mpd(video_info)
    else:
        age_gate = False
        video_info = None
        sts = None
        # Try looking directly into the video webpage
        ytplayer_config = self._get_ytplayer_config(video_id, video_webpage)
        if ytplayer_config:
            args = ytplayer_config['args']
            if args.get('url_encoded_fmt_stream_map') or args.get('hlsvp'):
                # Convert to the same format returned by compat_parse_qs
                video_info = dict((k, [v]) for k, v in args.items())
                add_dash_mpd(video_info)
            # Rental video is not rented but preview is available (e.g.
            # https://www.youtube.com/watch?v=yYr8q0y5Jfg,
            # https://github.com/rg3/youtube-dl/issues/10532)
            if not video_info and args.get('ypc_vid'):
                return self.url_result(
                    args['ypc_vid'], YoutubeIE.ie_key(), video_id=args['ypc_vid'])
            if args.get('livestream') == '1' or args.get('live_playback') == 1:
                is_live = True
            sts = ytplayer_config.get('sts')
            if not player_response:
                player_response = parse_player_response(
                    args.get('player_response')) or {}
        if not video_info or self._downloader.params.get('youtube_include_dash_manifest', True):
            # We also try looking in get_video_info since it may contain different dashmpd
            # URL that points to a DASH manifest with possibly different itag set (some itags
            # are missing from DASH manifest pointed by webpage's dashmpd, some - from DASH
            # manifest pointed by get_video_info's dashmpd).
            # The general idea is to take a union of itags of both DASH manifests (for example
            # video with such 'manifest behavior' see https://github.com/rg3/youtube-dl/issues/6093)
            self.report_video_info_webpage_download(video_id)
            for el in ('info', 'embedded', 'detailpage', 'vevo', ''):
                query = {
                    'video_id': video_id,
                    'ps': 'default',
                    'eurl': '',
                    'gl': 'US',
                    'hl': 'en',
                }
                if el:
                    query['el'] = el
                if sts:
                    query['sts'] = sts
                video_info_webpage = self._download_webpage(
                    '%s://www.youtube.com/get_video_info' % proto,
                    video_id, note=False,
                    errnote='unable to download video info webpage',
                    fatal=False, query=query)
                if not video_info_webpage:
                    continue
                get_video_info = compat_parse_qs(video_info_webpage)
                if not player_response:
                    # The query string value is a JSON string, never a
                    # dict, so it has to be parsed before use.
                    player_response = parse_player_response(
                        get_video_info.get('player_response', [None])[0]) or {}
                add_dash_mpd(get_video_info)
                if view_count is None:
                    view_count = extract_view_count(get_video_info)
                if not video_info:
                    video_info = get_video_info
                if 'token' in get_video_info:
                    # Different get_video_info requests may report different results, e.g.
                    # some may report video unavailability, but some may serve it without
                    # any complaint (see https://github.com/rg3/youtube-dl/issues/7362,
                    # the original webpage as well as el=info and el=embedded get_video_info
                    # requests report video unavailability due to geo restriction while
                    # el=detailpage succeeds and returns valid data). This is probably
                    # due to YouTube measures against IP ranges of hosting providers.
                    # Working around by preferring the first succeeded video_info containing
                    # the token if no such video_info yet was found.
                    if 'token' not in video_info:
                        video_info = get_video_info
                    break

    def extract_unavailable_message():
        return self._html_search_regex(
            r'(?s)<h1[^>]+id="unavailable-message"[^>]*>(.+?)</h1>',
            video_webpage, 'unavailable message', default=None)

    # Every get_video_info request may have failed while the webpage gave no
    # usable args; fall through to the regular "no token" error below
    # instead of failing with a TypeError on None.
    if video_info is None:
        video_info = {}

    if 'token' not in video_info:
        if 'reason' in video_info:
            if 'The uploader has not made this video available in your country.' in video_info['reason']:
                regions_allowed = self._html_search_meta(
                    'regionsAllowed', video_webpage, default=None)
                countries = regions_allowed.split(',') if regions_allowed else None
                self.raise_geo_restricted(
                    msg=video_info['reason'][0], countries=countries)
            reason = video_info['reason'][0]
            if 'Invalid parameters' in reason:
                unavailable_message = extract_unavailable_message()
                if unavailable_message:
                    reason = unavailable_message
            raise ExtractorError(
                'YouTube said: %s' % reason,
                expected=True, video_id=video_id)
        else:
            raise ExtractorError(
                '"token" parameter not in video info for unknown reason',
                video_id=video_id)

    video_details = try_get(
        player_response, lambda x: x['videoDetails'], dict) or {}

    # title
    if 'title' in video_info:
        video_title = video_info['title'][0]
    elif 'title' in video_details:
        # The title lives under videoDetails, not at the top level of
        # player_response
        video_title = video_details['title']
    else:
        self._downloader.report_warning('Unable to extract video title')
        video_title = '_'

    # description
    description_original = video_description = get_element_by_id("eow-description", video_webpage)
    if video_description:

        def replace_url(m):
            # Expand shortened links; unwrap youtube.com/redirect?q=... targets
            redir_url = compat_urlparse.urljoin(url, m.group(1))
            parsed_redir_url = compat_urllib_parse_urlparse(redir_url)
            if re.search(r'^(?:www\.)?(?:youtube(?:-nocookie)?\.com|youtu\.be)$', parsed_redir_url.netloc) and parsed_redir_url.path == '/redirect':
                qs = compat_parse_qs(parsed_redir_url.query)
                q = qs.get('q')
                if q and q[0]:
                    return q[0]
            return redir_url

        description_original = video_description = re.sub(r'''(?x)
            <a\s+
                (?:[a-zA-Z-]+="[^"]*"\s+)*?
                (?:title|href)="([^"]+)"\s+
                (?:[a-zA-Z-]+="[^"]*"\s+)*?
                class="[^"]*"[^>]*>
            [^<]+\.{3}\s*
            </a>
        ''', replace_url, video_description)
        video_description = clean_html(video_description)
    else:
        fd_mobj = re.search(r'<meta name="description" content="([^"]+)"', video_webpage)
        if fd_mobj:
            video_description = unescapeHTML(fd_mobj.group(1))
        else:
            video_description = ''

    if not smuggled_data.get('force_singlefeed', False):
        if not self._downloader.params.get('noplaylist'):
            multifeed_metadata_list = try_get(
                player_response,
                lambda x: x['multicamera']['playerLegacyMulticameraRenderer']['metadataList'],
                compat_str) or try_get(
                video_info, lambda x: x['multifeed_metadata_list'][0], compat_str)
            if multifeed_metadata_list:
                entries = []
                feed_ids = []
                for feed in multifeed_metadata_list.split(','):
                    # Unquote should take place before split on comma (,) since textual
                    # fields may contain comma as well (see
                    # https://github.com/rg3/youtube-dl/issues/8536)
                    feed_data = compat_parse_qs(compat_urllib_parse_unquote_plus(feed))
                    entries.append({
                        '_type': 'url_transparent',
                        'ie_key': 'Youtube',
                        'url': smuggle_url(
                            '%s://www.youtube.com/watch?v=%s' % (proto, feed_data['id'][0]),
                            {'force_singlefeed': True}),
                        'title': '%s (%s)' % (video_title, feed_data['title'][0]),
                    })
                    feed_ids.append(feed_data['id'][0])
                self.to_screen(
                    'Downloading multifeed video (%s) - add --no-playlist to just download video %s'
                    % (', '.join(feed_ids), video_id))
                return self.playlist_result(entries, video_id, video_title, video_description)
        else:
            self.to_screen('Downloading just video %s because of --no-playlist' % video_id)

    if view_count is None:
        view_count = extract_view_count(video_info)
    if view_count is None and video_details:
        view_count = int_or_none(video_details.get('viewCount'))

    # Check for "rental" videos
    if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
        raise ExtractorError('"rental" videos not supported. See https://github.com/rg3/youtube-dl/issues/359 for more information.', expected=True)

    def _extract_filesize(media_url):
        return int_or_none(self._search_regex(
            r'\bclen[=/](\d+)', media_url, 'filesize', default=None))

    if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
        self.report_rtmp_download()
        formats = [{
            'format_id': '_rtmp',
            'protocol': 'rtmp',
            'url': video_info['conn'][0],
            'player_url': player_url,
        }]
    elif not is_live and (len(video_info.get('url_encoded_fmt_stream_map', [''])[0]) >= 1 or len(video_info.get('adaptive_fmts', [''])[0]) >= 1):
        encoded_url_map = video_info.get('url_encoded_fmt_stream_map', [''])[0] + ',' + video_info.get('adaptive_fmts', [''])[0]
        if 'rtmpe%3Dyes' in encoded_url_map:
            raise ExtractorError('rtmpe downloads are not supported, see https://github.com/rg3/youtube-dl/issues/343 for more information.', expected=True)
        # Map itag -> resolution info taken from fmt_list ("itag/WxH/...")
        formats_spec = {}
        fmt_list = video_info.get('fmt_list', [''])[0]
        if fmt_list:
            for fmt in fmt_list.split(','):
                spec = fmt.split('/')
                if len(spec) > 1:
                    width_height = spec[1].split('x')
                    if len(width_height) == 2:
                        formats_spec[spec[0]] = {
                            'resolution': spec[1],
                            'width': int_or_none(width_height[0]),
                            'height': int_or_none(width_height[1]),
                        }
        q = qualities(['small', 'medium', 'hd720'])
        formats = []
        for url_data_str in encoded_url_map.split(','):
            url_data = compat_parse_qs(url_data_str)
            if 'itag' not in url_data or 'url' not in url_data:
                continue
            format_id = url_data['itag'][0]
            url = url_data['url'][0]

            if 's' in url_data or self._downloader.params.get('youtube_include_dash_manifest', True):
                ASSETS_RE = r'"assets":.+?"js":\s*("[^"]+")'
                jsplayer_url_json = self._search_regex(
                    ASSETS_RE,
                    embed_webpage if age_gate else video_webpage,
                    'JS player URL (1)', default=None)
                if not jsplayer_url_json and not age_gate:
                    # We need the embed website after all
                    if embed_webpage is None:
                        embed_url = proto + '://www.youtube.com/embed/%s' % video_id
                        embed_webpage = self._download_webpage(
                            embed_url, video_id, 'Downloading embed webpage')
                    jsplayer_url_json = self._search_regex(
                        ASSETS_RE, embed_webpage, 'JS player URL')

                # NOTE(review): for age-gated videos jsplayer_url_json can
                # still be None here (first search has default=None and the
                # retry above is skipped), in which case json.loads raises
                # before the fallback below is reached - confirm intent.
                player_url = json.loads(jsplayer_url_json)
                if player_url is None:
                    player_url_json = self._search_regex(
                        r'ytplayer\.config.*?"url"\s*:\s*("[^"]+")',
                        video_webpage, 'age gate player URL')
                    player_url = json.loads(player_url_json)

            if 'sig' in url_data:
                url += '&signature=' + url_data['sig'][0]
            elif 's' in url_data:
                encrypted_sig = url_data['s'][0]

                if self._downloader.params.get('verbose'):
                    if player_url is None:
                        player_version = 'unknown'
                        player_desc = 'unknown'
                    else:
                        if player_url.endswith('swf'):
                            player_version = self._search_regex(
                                r'-(.+?)(?:/watch_as3)?\.swf$', player_url,
                                'flash player', fatal=False)
                            player_desc = 'flash player %s' % player_version
                        else:
                            player_version = self._search_regex(
                                [r'html5player-([^/]+?)(?:/html5player(?:-new)?)?\.js',
                                 r'(?:www|player)-([^/]+)(?:/[a-z]{2}_[A-Z]{2})?/base\.js'],
                                player_url,
                                'html5 player', fatal=False)
                            player_desc = 'html5 player %s' % player_version

                    parts_sizes = self._signature_cache_id(encrypted_sig)
                    self.to_screen('{%s} signature length %s, %s' %
                                   (format_id, parts_sizes, player_desc))

                signature = self._decrypt_signature(
                    encrypted_sig, video_id, player_url, age_gate)
                url += '&signature=' + signature
            if 'ratebypass' not in url:
                url += '&ratebypass=yes'

            dct = {
                'format_id': format_id,
                'url': url,
                'player_url': player_url,
            }
            if format_id in self._formats:
                dct.update(self._formats[format_id])
            if format_id in formats_spec:
                dct.update(formats_spec[format_id])

            # Some itags are not included in DASH manifest thus corresponding formats will
            # lack metadata (see https://github.com/rg3/youtube-dl/pull/5993).
            # Trying to extract metadata from url_encoded_fmt_stream_map entry.
            mobj = re.search(r'^(?P<width>\d+)[xX](?P<height>\d+)$', url_data.get('size', [''])[0])
            width, height = (int(mobj.group('width')), int(mobj.group('height'))) if mobj else (None, None)

            filesize = int_or_none(url_data.get(
                'clen', [None])[0]) or _extract_filesize(url)

            quality = url_data.get('quality_label', [None])[0] or url_data.get('quality', [None])[0]

            more_fields = {
                'filesize': filesize,
                'tbr': float_or_none(url_data.get('bitrate', [None])[0], 1000),
                'width': width,
                'height': height,
                'fps': int_or_none(url_data.get('fps', [None])[0]),
                'format_note': quality,
                'quality': q(quality),
            }
            # Only truthy values override what the static tables provided
            for key, value in more_fields.items():
                if value:
                    dct[key] = value
            type_ = url_data.get('type', [None])[0]
            if type_:
                type_split = type_.split(';')
                kind_ext = type_split[0].split('/')
                if len(kind_ext) == 2:
                    kind, _ = kind_ext
                    dct['ext'] = mimetype2ext(type_split[0])
                    if kind in ('audio', 'video'):
                        codecs = None
                        for mobj in re.finditer(
                                r'(?P<key>[a-zA-Z_-]+)=(?P<quote>["\']?)(?P<val>.+?)(?P=quote)(?:;|$)', type_):
                            if mobj.group('key') == 'codecs':
                                codecs = mobj.group('val')
                                break
                        if codecs:
                            dct.update(parse_codecs(codecs))
            if dct.get('acodec') == 'none' or dct.get('vcodec') == 'none':
                dct['downloader_options'] = {
                    # Youtube throttles chunks >~10M
                    'http_chunk_size': 10485760,
                }
            formats.append(dct)
    elif video_info.get('hlsvp'):
        manifest_url = video_info['hlsvp'][0]
        formats = []
        m3u8_formats = self._extract_m3u8_formats(
            manifest_url, video_id, 'mp4', fatal=False)
        for a_format in m3u8_formats:
            itag = self._search_regex(
                r'/itag/(\d+)/', a_format['url'], 'itag', default=None)
            if itag:
                a_format['format_id'] = itag
                if itag in self._formats:
                    dct = self._formats[itag].copy()
                    dct.update(a_format)
                    a_format = dct
            a_format['player_url'] = player_url
            # Accept-Encoding header causes failures in live streams on Youtube and Youtube Gaming
            a_format.setdefault('http_headers', {})['Youtubedl-no-compression'] = 'True'
            formats.append(a_format)
    else:
        error_message = clean_html(video_info.get('reason', [None])[0])
        if not error_message:
            error_message = extract_unavailable_message()
        if error_message:
            raise ExtractorError(error_message, expected=True)
        raise ExtractorError('no conn, hlsvp or url_encoded_fmt_stream_map information found in video info')

    # uploader
    video_uploader = try_get(
        video_info, lambda x: x['author'][0],
        compat_str) or str_or_none(video_details.get('author'))
    if video_uploader:
        video_uploader = compat_urllib_parse_unquote_plus(video_uploader)
    else:
        self._downloader.report_warning('unable to extract uploader name')

    # uploader_id
    video_uploader_id = None
    video_uploader_url = None
    mobj = re.search(
        r'<link itemprop="url" href="(?P<uploader_url>https?://www\.youtube\.com/(?:user|channel)/(?P<uploader_id>[^"]+))">',
        video_webpage)
    if mobj is not None:
        video_uploader_id = mobj.group('uploader_id')
        video_uploader_url = mobj.group('uploader_url')
    else:
        self._downloader.report_warning('unable to extract uploader nickname')

    channel_id = self._html_search_meta(
        'channelId', video_webpage, 'channel id')
    channel_url = 'http://www.youtube.com/channel/%s' % channel_id if channel_id else None

    # thumbnail image
    # We try first to get a high quality image:
    m_thumb = re.search(r'<span itemprop="thumbnail".*?href="(.*?)">',
                        video_webpage, re.DOTALL)
    if m_thumb is not None:
        video_thumbnail = m_thumb.group(1)
    elif 'thumbnail_url' not in video_info:
        self._downloader.report_warning('unable to extract video thumbnail')
        video_thumbnail = None
    else:   # don't panic if we can't find it
        video_thumbnail = compat_urllib_parse_unquote_plus(video_info['thumbnail_url'][0])

    # upload date
    upload_date = self._html_search_meta(
        'datePublished', video_webpage, 'upload date', default=None)
    if not upload_date:
        upload_date = self._search_regex(
            [r'(?s)id="eow-date.*?>(.*?)</span>',
             r'(?:id="watch-uploader-info".*?>.*?|["\']simpleText["\']\s*:\s*["\'])(?:Published|Uploaded|Streamed live|Started) on (.+?)[<"\']'],
            video_webpage, 'upload date', default=None)
    upload_date = unified_strdate(upload_date)

    video_license = self._html_search_regex(
        r'<h4[^>]+class="title"[^>]*>\s*License\s*</h4>\s*<ul[^>]*>\s*<li>(.+?)</li',
        video_webpage, 'license', default=None)

    m_music = re.search(
        r'''(?x)
            <h4[^>]+class="title"[^>]*>\s*Music\s*</h4>\s*
            <ul[^>]*>\s*
            <li>(?P<title>.+?)
            by (?P<creator>.+?)
            (?:
                \(.+?\)|
                <a[^>]*
                    (?:
                        \bhref=["\']/red[^>]*>|  # drop possible
                        >\s*Listen ad-free with YouTube Red # YouTube Red ad
                    )
                .*?
            )?</li
        ''',
        video_webpage)
    if m_music:
        video_alt_title = remove_quotes(unescapeHTML(m_music.group('title')))
        video_creator = clean_html(m_music.group('creator'))
    else:
        video_alt_title = video_creator = None

    def extract_meta(field):
        return self._html_search_regex(
            r'<h4[^>]+class="title"[^>]*>\s*%s\s*</h4>\s*<ul[^>]*>\s*<li>(.+?)</li>\s*' % field,
            video_webpage, field, default=None)

    track = extract_meta('Song')
    artist = extract_meta('Artist')

    m_episode = re.search(
        r'<div[^>]+id="watch7-headline"[^>]*>\s*<span[^>]*>.*?>(?P<series>[^<]+)</a></b>\s*S(?P<season>\d+)\s*•\s*E(?P<episode>\d+)</span>',
        video_webpage)
    if m_episode:
        series = m_episode.group('series')
        season_number = int(m_episode.group('season'))
        episode_number = int(m_episode.group('episode'))
    else:
        series = season_number = episode_number = None

    m_cat_container = self._search_regex(
        r'(?s)<h4[^>]*>\s*Category\s*</h4>\s*<ul[^>]*>(.*?)</ul>',
        video_webpage, 'categories', default=None)
    if m_cat_container:
        category = self._html_search_regex(
            r'(?s)<a[^<]+>(.*?)</a>', m_cat_container, 'category',
            default=None)
        video_categories = None if category is None else [category]
    else:
        video_categories = None

    video_tags = [
        unescapeHTML(m.group('content'))
        for m in re.finditer(self._meta_regex('og:video:tag'), video_webpage)]

    def _extract_count(count_name):
        return str_to_int(self._search_regex(
            r'-%s-button[^>]+><span[^>]+class="yt-uix-button-content"[^>]*>([\d,]+)</span>'
            % re.escape(count_name),
            video_webpage, count_name, default=None))

    like_count = _extract_count('like')
    dislike_count = _extract_count('dislike')

    if view_count is None:
        view_count = str_to_int(self._search_regex(
            r'<[^>]+class=["\']watch-view-count[^>]+>\s*([\d,\s]+)', video_webpage,
            'view count', default=None))

    # subtitles
    video_subtitles = self.extract_subtitles(video_id, video_webpage)
    automatic_captions = self.extract_automatic_captions(video_id, video_webpage)

    video_duration = try_get(
        video_info, lambda x: int_or_none(x['length_seconds'][0]))
    if not video_duration:
        video_duration = int_or_none(video_details.get('lengthSeconds'))
    if not video_duration:
        video_duration = parse_duration(self._html_search_meta(
            'duration', video_webpage, 'video duration'))

    # annotations
    video_annotations = None
    if self._downloader.params.get('writeannotations', False):
        video_annotations = self._extract_annotations(video_id)

    chapters = self._extract_chapters(description_original, video_duration)

    # Look for the DASH manifest
    if self._downloader.params.get('youtube_include_dash_manifest', True):
        dash_mpd_fatal = True
        for mpd_url in dash_mpds:
            dash_formats = {}
            try:
                def decrypt_sig(mobj):
                    s = mobj.group(1)
                    dec_s = self._decrypt_signature(s, video_id, player_url, age_gate)
                    return '/signature/%s' % dec_s

                mpd_url = re.sub(r'/s/([a-fA-F0-9\.]+)', decrypt_sig, mpd_url)

                for df in self._extract_mpd_formats(
                        mpd_url, video_id, fatal=dash_mpd_fatal,
                        formats_dict=self._formats):
                    if not df.get('filesize'):
                        df['filesize'] = _extract_filesize(df['url'])
                    # Do not overwrite DASH format found in some previous DASH manifest
                    if df['format_id'] not in dash_formats:
                        dash_formats[df['format_id']] = df
                    # Additional DASH manifests may end up in HTTP Error 403 therefore
                    # allow them to fail without bug report message if we already have
                    # some DASH manifest succeeded. This is temporary workaround to reduce
                    # burst of bug reports until we figure out the reason and whether it
                    # can be fixed at all.
                    dash_mpd_fatal = False
            except (ExtractorError, KeyError) as e:
                self.report_warning(
                    'Skipping DASH manifest: %r' % e, video_id)
            if dash_formats:
                # Remove the formats we found through non-DASH, they
                # contain less info and it can be wrong, because we use
                # fixed values (for example the resolution). See
                # https://github.com/rg3/youtube-dl/issues/5774 for an
                # example.
                formats = [f for f in formats if f['format_id'] not in dash_formats.keys()]
                formats.extend(dash_formats.values())

    # Check for malformed aspect ratio
    stretched_m = re.search(
        r'<meta\s+property="og:video:tag".*?content="yt:stretch=(?P<w>[0-9]+):(?P<h>[0-9]+)">',
        video_webpage)
    if stretched_m:
        w = float(stretched_m.group('w'))
        h = float(stretched_m.group('h'))
        # yt:stretch may hold invalid ratio data (e.g. for Q39EVAstoRM ratio is 17:0).
        # We will only process correct ratios.
        if w > 0 and h > 0:
            ratio = w / h
            for f in formats:
                if f.get('vcodec') != 'none':
                    f['stretched_ratio'] = ratio

    self._sort_formats(formats)

    self.mark_watched(video_id, video_info)

    return {
        'id': video_id,
        'uploader': video_uploader,
        'uploader_id': video_uploader_id,
        'uploader_url': video_uploader_url,
        'channel_id': channel_id,
        'channel_url': channel_url,
        'upload_date': upload_date,
        'license': video_license,
        'creator': video_creator or artist,
        'title': video_title,
        'alt_title': video_alt_title or track,
        'thumbnail': video_thumbnail,
        'description': video_description,
        'categories': video_categories,
        'tags': video_tags,
        'subtitles': video_subtitles,
        'automatic_captions': automatic_captions,
        'duration': video_duration,
        'age_limit': 18 if age_gate else 0,
        'annotations': video_annotations,
        'chapters': chapters,
        'webpage_url': proto + '://www.youtube.com/watch?v=%s' % video_id,
        'view_count': view_count,
        'like_count': like_count,
        'dislike_count': dislike_count,
        'average_rating': float_or_none(video_info.get('avg_rating', [None])[0]),
        'formats': formats,
        'is_live': is_live,
        'start_time': start_time,
        'end_time': end_time,
        'series': series,
        'season_number': season_number,
        'episode_number': episode_number,
        'track': track,
        'artist': artist,
    }
c5e8d7af 2164
5f6a1245 2165
8e7aad20 2166class YoutubePlaylistIE(YoutubePlaylistBaseInfoExtractor):
78caa52a 2167 IE_DESC = 'YouTube.com playlists'
d67cc9fa 2168 _VALID_URL = r"""(?x)(?:
c5e8d7af
PH
2169 (?:https?://)?
2170 (?:\w+\.)?
c5e8d7af 2171 (?:
c0345b82
S
2172 (?:
2173 youtube\.com|
2174 invidio\.us
2175 )
2176 /
feaa5ad7 2177 (?:
87dadd45 2178 (?:course|view_play_list|my_playlists|artist|playlist|watch|embed/(?:videoseries|[0-9A-Za-z_-]{11}))
feaa5ad7
S
2179 \? (?:.*?[&;])*? (?:p|a|list)=
2180 | p/
2181 )|
2182 youtu\.be/[0-9A-Za-z_-]{11}\?.*?\blist=
c5e8d7af 2183 )
d67cc9fa 2184 (
409b9324 2185 (?:PL|LL|EC|UU|FL|RD|UL|TL|OLAK5uy_)?[0-9A-Za-z-_]{10,}
5f6a1245 2186 # Top tracks, they can also include dots
d67cc9fa
JMF
2187 |(?:MC)[\w\.]*
2188 )
c5e8d7af
PH
2189 .*
2190 |
d0ba5587
S
2191 (%(playlist_id)s)
2192 )""" % {'playlist_id': YoutubeBaseInfoExtractor._PLAYLIST_ID_RE}
8d81f3e3 2193 _TEMPLATE_URL = 'https://www.youtube.com/playlist?list=%s'
648e6a1f 2194 _VIDEO_RE = r'href="\s*/watch\?v=(?P<id>[0-9A-Za-z_-]{11})&amp;[^"]*?index=(?P<index>\d+)(?:[^>]+>(?P<title>[^<]+))?'
78caa52a 2195 IE_NAME = 'youtube:playlist'
81127aa5
PH
2196 _TESTS = [{
2197 'url': 'https://www.youtube.com/playlist?list=PLwiyx1dc3P2JR9N8gQaQN_BCvlSlap7re',
2198 'info_dict': {
2199 'title': 'ytdl test PL',
a1cf99d0 2200 'id': 'PLwiyx1dc3P2JR9N8gQaQN_BCvlSlap7re',
81127aa5
PH
2201 },
2202 'playlist_count': 3,
9291475f
PH
2203 }, {
2204 'url': 'https://www.youtube.com/playlist?list=PLtPgu7CB4gbZDA7i_euNxn75ISqxwZPYx',
2205 'info_dict': {
acf757f4 2206 'id': 'PLtPgu7CB4gbZDA7i_euNxn75ISqxwZPYx',
9291475f
PH
2207 'title': 'YDL_Empty_List',
2208 },
2209 'playlist_count': 0,
4201ba13 2210 'skip': 'This playlist is private',
9291475f
PH
2211 }, {
2212 'note': 'Playlist with deleted videos (#651). As a bonus, the video #51 is also twice in this list.',
2213 'url': 'https://www.youtube.com/playlist?list=PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC',
2214 'info_dict': {
2215 'title': '29C3: Not my department',
acf757f4 2216 'id': 'PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC',
9291475f
PH
2217 },
2218 'playlist_count': 95,
2219 }, {
2220 'note': 'issue #673',
2221 'url': 'PLBB231211A4F62143',
2222 'info_dict': {
f46a8702 2223 'title': '[OLD]Team Fortress 2 (Class-based LP)',
acf757f4 2224 'id': 'PLBB231211A4F62143',
9291475f
PH
2225 },
2226 'playlist_mincount': 26,
2227 }, {
2228 'note': 'Large playlist',
2229 'url': 'https://www.youtube.com/playlist?list=UUBABnxM4Ar9ten8Mdjj1j0Q',
2230 'info_dict': {
2231 'title': 'Uploads from Cauchemar',
acf757f4 2232 'id': 'UUBABnxM4Ar9ten8Mdjj1j0Q',
9291475f
PH
2233 },
2234 'playlist_mincount': 799,
2235 }, {
2236 'url': 'PLtPgu7CB4gbY9oDN3drwC3cMbJggS7dKl',
2237 'info_dict': {
2238 'title': 'YDL_safe_search',
acf757f4 2239 'id': 'PLtPgu7CB4gbY9oDN3drwC3cMbJggS7dKl',
9291475f
PH
2240 },
2241 'playlist_count': 2,
4201ba13 2242 'skip': 'This playlist is private',
ac7553d0
PH
2243 }, {
2244 'note': 'embedded',
2d3d2997 2245 'url': 'https://www.youtube.com/embed/videoseries?list=PL6IaIsEjSbf96XFRuNccS_RuEXwNdsoEu',
ac7553d0
PH
2246 'playlist_count': 4,
2247 'info_dict': {
2248 'title': 'JODA15',
acf757f4 2249 'id': 'PL6IaIsEjSbf96XFRuNccS_RuEXwNdsoEu',
ac7553d0 2250 }
87dadd45
S
2251 }, {
2252 'url': 'http://www.youtube.com/embed/_xDOZElKyNU?list=PLsyOSbh5bs16vubvKePAQ1x3PhKavfBIl',
2253 'playlist_mincount': 485,
2254 'info_dict': {
2255 'title': '2017 華語最新單曲 (2/24更新)',
2256 'id': 'PLsyOSbh5bs16vubvKePAQ1x3PhKavfBIl',
2257 }
6b08cdf6
PH
2258 }, {
2259 'note': 'Embedded SWF player',
2d3d2997 2260 'url': 'https://www.youtube.com/p/YN5VISEtHet5D4NEvfTd0zcgFk84NqFZ?hl=en_US&fs=1&rel=0',
6b08cdf6
PH
2261 'playlist_count': 4,
2262 'info_dict': {
2263 'title': 'JODA7',
acf757f4 2264 'id': 'YN5VISEtHet5D4NEvfTd0zcgFk84NqFZ',
6b08cdf6 2265 }
4b7df0d3
JMF
2266 }, {
2267 'note': 'Buggy playlist: the webpage has a "Load more" button but it doesn\'t have more videos',
2268 'url': 'https://www.youtube.com/playlist?list=UUXw-G3eDE9trcvY2sBMM_aA',
2269 'info_dict': {
acf757f4
PH
2270 'title': 'Uploads from Interstellar Movie',
2271 'id': 'UUXw-G3eDE9trcvY2sBMM_aA',
4b7df0d3 2272 },
481cc733 2273 'playlist_mincount': 21,
dacb3a86
S
2274 }, {
2275 # Playlist URL that does not actually serve a playlist
2276 'url': 'https://www.youtube.com/watch?v=FqZTN594JQw&list=PLMYEtVRpaqY00V9W81Cwmzp6N6vZqfUKD4',
2277 'info_dict': {
2278 'id': 'FqZTN594JQw',
2279 'ext': 'webm',
2280 'title': "Smiley's People 01 detective, Adventure Series, Action",
2281 'uploader': 'STREEM',
2282 'uploader_id': 'UCyPhqAZgwYWZfxElWVbVJng',
ec85ded8 2283 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UCyPhqAZgwYWZfxElWVbVJng',
dacb3a86
S
2284 'upload_date': '20150526',
2285 'license': 'Standard YouTube License',
2286 'description': 'md5:507cdcb5a49ac0da37a920ece610be80',
2287 'categories': ['People & Blogs'],
2288 'tags': list,
dbdaaa23 2289 'view_count': int,
dacb3a86
S
2290 'like_count': int,
2291 'dislike_count': int,
2292 },
2293 'params': {
2294 'skip_download': True,
2295 },
2296 'add_ie': [YoutubeIE.ie_key()],
481cc733
S
2297 }, {
2298 'url': 'https://youtu.be/yeWKywCrFtk?list=PL2qgrgXsNUG5ig9cat4ohreBjYLAPC0J5',
2299 'info_dict': {
2300 'id': 'yeWKywCrFtk',
2301 'ext': 'mp4',
2302 'title': 'Small Scale Baler and Braiding Rugs',
2303 'uploader': 'Backus-Page House Museum',
2304 'uploader_id': 'backuspagemuseum',
ec85ded8 2305 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/backuspagemuseum',
481cc733
S
2306 'upload_date': '20161008',
2307 'license': 'Standard YouTube License',
2308 'description': 'md5:800c0c78d5eb128500bffd4f0b4f2e8a',
2309 'categories': ['Nonprofits & Activism'],
2310 'tags': list,
2311 'like_count': int,
2312 'dislike_count': int,
2313 },
2314 'params': {
2315 'noplaylist': True,
2316 'skip_download': True,
2317 },
feaa5ad7
S
2318 }, {
2319 'url': 'https://youtu.be/uWyaPkt-VOI?list=PL9D9FC436B881BA21',
2320 'only_matching': True,
a6857510
S
2321 }, {
2322 'url': 'TLGGrESM50VT6acwMjAyMjAxNw',
2323 'only_matching': True,
409b9324
S
2324 }, {
2325 # music album playlist
2326 'url': 'OLAK5uy_m4xAFdmMC5rX3Ji3g93pQe3hqLZw_9LhM',
2327 'only_matching': True,
c0345b82
S
2328 }, {
2329 'url': 'https://invidio.us/playlist?list=PLDIoUOhQQPlXr63I_vwF9GD8sAKh77dWU',
2330 'only_matching': True,
81127aa5 2331 }]
c5e8d7af 2332
880e1c52
JMF
2333 def _real_initialize(self):
2334 self._login()
2335
def _extract_mix(self, playlist_id):
    """Build a playlist result for a YouTube mix.

    The watch page is downloaded repeatedly, each time seeded with the
    last video id collected so far, until a page yields no video id that
    has not been seen already. The title is read from the last page
    downloaded.
    """
    # The mixes are generated from a single video
    # the id of the playlist is just 'RD' + video_id
    collected_ids = []
    seed_id = playlist_id[-11:]
    # Only the playlist id varies in this pattern, so build it once.
    entry_pattern = r'''(?xs)data-video-username=".*?".*?
               href="/watch\?v=([0-9A-Za-z_-]{11})&amp;[^"]*?list=%s''' % re.escape(playlist_id)

    page_number = 0
    while True:
        page_number += 1
        url = 'https://youtube.com/watch?v=%s&list=%s' % (seed_id, playlist_id)
        webpage = self._download_webpage(
            url, playlist_id,
            'Downloading page {0} of Youtube mix'.format(page_number))
        # Fetch new pages until all the videos are repeated, it seems that
        # there are always 51 unique videos.
        unseen_ids = [
            candidate
            for candidate in orderedSet(re.findall(entry_pattern, webpage))
            if candidate not in collected_ids]
        if not unseen_ids:
            break
        collected_ids.extend(unseen_ids)
        seed_id = collected_ids[-1]

    url_results = self._ids_to_results(collected_ids)

    # Try the known title containers in order and keep the first non-empty
    # hit; if none matches, the result of the last lookup is used as is.
    title_span = None
    for class_name in ('playlist-title', 'title long-title', 'title'):
        title_span = get_element_by_attribute('class', class_name, webpage)
        if title_span:
            break
    title = clean_html(title_span)

    return self.playlist_result(url_results, playlist_id, title)
2367
def _extract_playlist(self, playlist_id):
    """Download the playlist page and turn it into a playlist result.

    Returns a ``(has_videos, playlist)`` tuple. ``has_videos`` is False
    only when the page carries no playlist title and ``self._entries``
    yields nothing for it.

    Raises ExtractorError (expected) when a page alert says the playlist
    does not exist, is private, or was requested with invalid parameters.
    Any other alert, except the "Choose your language" one, is reported
    as a warning.
    """
    url = self._TEMPLATE_URL % playlist_id
    page = self._download_webpage(url, playlist_id)

    # the yt-alert-message now has tabindex attribute (see https://github.com/rg3/youtube-dl/issues/11604)
    for alert in re.findall(r'<div class="yt-alert-message"[^>]*>([^<]+)</div>', page):
        alert = alert.strip()
        # Check if the playlist exists or is private
        unavailable = re.match(
            r'[^<]*(?:The|This) playlist (?P<reason>does not exist|is private)[^<]*', alert)
        if unavailable:
            reason = unavailable.group('reason')
            message = 'This playlist %s' % reason
            if 'private' in reason:
                message += ', use --username or --netrc to access it'
            raise ExtractorError(message + '.', expected=True)
        if re.match(r'[^<]*Invalid parameters[^<]*', alert):
            raise ExtractorError(
                'Invalid parameters. Maybe URL is incorrect.',
                expected=True)
        if re.match(r'[^<]*Choose your language[^<]*', alert):
            continue
        self.report_warning('Youtube gives an alert message: ' + alert)

    playlist_title = self._html_search_regex(
        r'(?s)<h1 class="pl-header-title[^"]*"[^>]*>\s*(.*?)\s*</h1>',
        page, 'title', default=None)

    # Shared prefix locating the uploader link in the playlist header.
    uploader_anchor = r'class=["\']pl-header-details[^>]+>\s*<li>\s*<a[^>]+\bhref='
    uploader = self._search_regex(
        r'%s["\']/(?:user|channel)/[^>]+>([^<]+)' % uploader_anchor,
        page, 'uploader', default=None)
    uploader_match = re.search(
        r'%s(["\'])(?P<path>/(?:user|channel)/(?P<uploader_id>.+?))\1' % uploader_anchor,
        page)
    uploader_id = uploader_url = None
    if uploader_match:
        uploader_id = uploader_match.group('uploader_id')
        uploader_url = compat_urlparse.urljoin(url, uploader_match.group('path'))

    has_videos = True
    if not playlist_title:
        # Some playlist URLs don't actually serve a playlist (e.g.
        # https://www.youtube.com/watch?v=FqZTN594JQw&list=PLMYEtVRpaqY00V9W81Cwmzp6N6vZqfUKD4)
        # Probe for a first entry; the sentinel marks an exhausted iterator.
        nothing = object()
        has_videos = next(self._entries(page, playlist_id), nothing) is not nothing

    playlist = self.playlist_result(
        self._entries(page, playlist_id), playlist_id, playlist_title)
    playlist['uploader'] = uploader
    playlist['uploader_id'] = uploader_id
    playlist['uploader_url'] = uploader_url

    return has_videos, playlist
c5e8d7af 2429
def _check_download_just_video(self, url, playlist_id):
    """Work out whether ``url`` also points at one specific video.

    Returns ``(video_id, result)``:
      * ``(None, None)`` when no video id is found in the URL;
      * ``(video_id, url_result)`` when the ``noplaylist`` parameter is set;
      * ``(video_id, None)`` otherwise, i.e. the playlist should be used.
    """
    # Check if it's a video-specific URL
    query_dict = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query)
    video_id = query_dict.get('v', [None])[0]
    if not video_id:
        # No ?v= parameter: look for the id in a youtu.be / embed style URL.
        video_id = self._search_regex(
            r'(?:(?:^|//)youtu\.be/|youtube\.com/embed/(?!videoseries))([0-9A-Za-z_-]{11})', url,
            'video id', default=None)
    if not video_id:
        return None, None

    if self._downloader.params.get('noplaylist'):
        self.to_screen('Downloading just video %s because of --no-playlist' % video_id)
        return video_id, self.url_result(video_id, 'Youtube', video_id=video_id)

    self.to_screen('Downloading playlist %s - add --no-playlist to just download video %s' % (playlist_id, video_id))
    return video_id, None
448830ce 2444
ebf1b291
S
2445 def _real_extract(self, url):
2446 # Extract playlist id
2447 mobj = re.match(self._VALID_URL, url)
2448 if mobj is None:
2449 raise ExtractorError('Invalid URL: %s' % url)
2450 playlist_id = mobj.group(1) or mobj.group(2)
2451
dacb3a86 2452 video_id, video = self._check_download_just_video(url, playlist_id)
ebf1b291
S
2453 if video:
2454 return video
2455
466a6145 2456 if playlist_id.startswith(('RD', 'UL', 'PU')):
448830ce
S
2457 # Mixes require a custom extraction process
2458 return self._extract_mix(playlist_id)
2459
dacb3a86
S
2460 has_videos, playlist = self._extract_playlist(playlist_id)
2461 if has_videos or not video_id:
2462 return playlist
2463
2464 # Some playlist URLs don't actually serve a playlist (see
2465 # https://github.com/rg3/youtube-dl/issues/10537).
2466 # Fallback to plain video extraction if there is a video id
2467 # along with playlist id.
2468 return self.url_result(video_id, 'Youtube', video_id=video_id)
448830ce 2469
c5e8d7af 2470
648e6a1f 2471class YoutubeChannelIE(YoutubePlaylistBaseInfoExtractor):
78caa52a 2472 IE_DESC = 'YouTube.com channels'
cd5a74a2 2473 _VALID_URL = r'https?://(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com|(?:www\.)?invidio\.us)/channel/(?P<id>[0-9A-Za-z_-]+)'
eb0f3e7e 2474 _TEMPLATE_URL = 'https://www.youtube.com/channel/%s/videos'
648e6a1f 2475 _VIDEO_RE = r'(?:title="(?P<title>[^"]+)"[^>]+)?href="/watch\?v=(?P<id>[0-9A-Za-z_-]+)&?'
78caa52a 2476 IE_NAME = 'youtube:channel'
cdc628a4
PH
2477 _TESTS = [{
2478 'note': 'paginated channel',
2479 'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w',
2480 'playlist_mincount': 91,
acf757f4 2481 'info_dict': {
9170ca5b
JMF
2482 'id': 'UUKfVa3S1e4PHvxWcwyMMg8w',
2483 'title': 'Uploads from lex will',
acf757f4 2484 }
5c43afd4
JMF
2485 }, {
2486 'note': 'Age restricted channel',
2487 # from https://www.youtube.com/user/DeusExOfficial
2488 'url': 'https://www.youtube.com/channel/UCs0ifCMCm1icqRbqhUINa0w',
2489 'playlist_mincount': 64,
2490 'info_dict': {
2491 'id': 'UUs0ifCMCm1icqRbqhUINa0w',
2492 'title': 'Uploads from Deus Ex',
2493 },
cd5a74a2
S
2494 }, {
2495 'url': 'https://invidio.us/channel/UC23qupoDRn9YOAVzeoxjOQA',
2496 'only_matching': True,
cdc628a4 2497 }]
c5e8d7af 2498
e462474e
S
2499 @classmethod
2500 def suitable(cls, url):
f07e276a
S
2501 return (False if YoutubePlaylistsIE.suitable(url) or YoutubeLiveIE.suitable(url)
2502 else super(YoutubeChannelIE, cls).suitable(url))
e462474e 2503
9558dcec
S
2504 def _build_template_url(self, url, channel_id):
2505 return self._TEMPLATE_URL % channel_id
2506
c5e8d7af 2507 def _real_extract(self, url):
9ff67727 2508 channel_id = self._match_id(url)
c5e8d7af 2509
9558dcec 2510 url = self._build_template_url(url, channel_id)
386bdfa6
S
2511
2512 # Channel by page listing is restricted to 35 pages of 30 items, i.e. 1050 videos total (see #5778)
2513 # Workaround by extracting as a playlist if managed to obtain channel playlist URL
2514 # otherwise fallback on channel by page extraction
2515 channel_page = self._download_webpage(
2516 url + '?view=57', channel_id,
2517 'Downloading channel page', fatal=False)
2b3c2546
PH
2518 if channel_page is False:
2519 channel_playlist_id = False
2520 else:
2521 channel_playlist_id = self._html_search_meta(
2522 'channelId', channel_page, 'channel id', default=None)
2523 if not channel_playlist_id:
73c4ac2c
S
2524 channel_url = self._html_search_meta(
2525 ('al:ios:url', 'twitter:app:url:iphone', 'twitter:app:url:ipad'),
2526 channel_page, 'channel url', default=None)
2527 if channel_url:
2528 channel_playlist_id = self._search_regex(
2529 r'vnd\.youtube://user/([0-9A-Za-z_-]+)',
2530 channel_url, 'channel id', default=None)
386bdfa6
S
2531 if channel_playlist_id and channel_playlist_id.startswith('UC'):
2532 playlist_id = 'UU' + channel_playlist_id[2:]
d2a9de78
IK
2533 return self.url_result(
2534 compat_urlparse.urljoin(url, '/playlist?list=%s' % playlist_id), 'YoutubePlaylist')
386bdfa6 2535
60bf45c8 2536 channel_page = self._download_webpage(url, channel_id, 'Downloading page #1')
31812a9e
PH
2537 autogenerated = re.search(r'''(?x)
2538 class="[^"]*?(?:
2539 channel-header-autogenerated-label|
2540 yt-channel-title-autogenerated
2541 )[^"]*"''', channel_page) is not None
c5e8d7af 2542
b9643eed
JMF
2543 if autogenerated:
2544 # The videos are contained in a single page
2545 # the ajax pages can't be used, they are empty
b82f815f 2546 entries = [
fb69240c
S
2547 self.url_result(
2548 video_id, 'Youtube', video_id=video_id,
2549 video_title=video_title)
8f02ad4f 2550 for video_id, video_title in self.extract_videos_from_page(channel_page)]
b82f815f
PH
2551 return self.playlist_result(entries, channel_id)
2552
73c4ac2c
S
2553 try:
2554 next(self._entries(channel_page, channel_id))
2555 except StopIteration:
2556 alert_message = self._html_search_regex(
2557 r'(?s)<div[^>]+class=(["\']).*?\byt-alert-message\b.*?\1[^>]*>(?P<alert>[^<]+)</div>',
2558 channel_page, 'alert', default=None, group='alert')
2559 if alert_message:
2560 raise ExtractorError('Youtube said: %s' % alert_message, expected=True)
2561
648e6a1f 2562 return self.playlist_result(self._entries(channel_page, channel_id), channel_id)
c5e8d7af
PH
2563
2564
eb0f3e7e 2565class YoutubeUserIE(YoutubeChannelIE):
78caa52a 2566 IE_DESC = 'YouTube.com user videos (URL or "ytuser" keyword)'
ea696249 2567 _VALID_URL = r'(?:(?:https?://(?:\w+\.)?youtube\.com/(?:(?P<user>user|c)/)?(?!(?:attribution_link|watch|results|shared)(?:$|[^a-z_A-Z0-9-])))|ytuser:)(?!feed/)(?P<id>[A-Za-z0-9_-]+)'
9558dcec 2568 _TEMPLATE_URL = 'https://www.youtube.com/%s/%s/videos'
78caa52a 2569 IE_NAME = 'youtube:user'
c5e8d7af 2570
cdc628a4
PH
2571 _TESTS = [{
2572 'url': 'https://www.youtube.com/user/TheLinuxFoundation',
2573 'playlist_mincount': 320,
2574 'info_dict': {
73c4ac2c
S
2575 'id': 'UUfX55Sx5hEFjoC3cNs6mCUQ',
2576 'title': 'Uploads from The Linux Foundation',
cdc628a4 2577 }
9558dcec
S
2578 }, {
2579 # Only available via https://www.youtube.com/c/12minuteathlete/videos
2580 # but not https://www.youtube.com/user/12minuteathlete/videos
2581 'url': 'https://www.youtube.com/c/12minuteathlete/videos',
2582 'playlist_mincount': 249,
2583 'info_dict': {
2584 'id': 'UUVjM-zV6_opMDx7WYxnjZiQ',
2585 'title': 'Uploads from 12 Minute Athlete',
2586 }
cdc628a4
PH
2587 }, {
2588 'url': 'ytuser:phihag',
2589 'only_matching': True,
daa0df9e
YCH
2590 }, {
2591 'url': 'https://www.youtube.com/c/gametrailers',
2592 'only_matching': True,
9558dcec
S
2593 }, {
2594 'url': 'https://www.youtube.com/gametrailers',
2595 'only_matching': True,
73c4ac2c 2596 }, {
0e879f43 2597 # This channel is not available, geo restricted to JP
73c4ac2c
S
2598 'url': 'https://www.youtube.com/user/kananishinoSMEJ/videos',
2599 'only_matching': True,
cdc628a4
PH
2600 }]
2601
e3ea4790 2602 @classmethod
f4b05232 2603 def suitable(cls, url):
e3ea4790
JMF
2604 # Don't return True if the url can be extracted with other youtube
2605 # extractor, the regex would is too permissive and it would match.
f3a58d46 2606 other_yt_ies = iter(klass for (name, klass) in globals().items() if name.startswith('Youtube') and name.endswith('IE') and klass is not cls)
2607 if any(ie.suitable(url) for ie in other_yt_ies):
5f6a1245
JW
2608 return False
2609 else:
2610 return super(YoutubeUserIE, cls).suitable(url)
f4b05232 2611
9558dcec
S
2612 def _build_template_url(self, url, channel_id):
2613 mobj = re.match(self._VALID_URL, url)
2614 return self._TEMPLATE_URL % (mobj.group('user') or 'user', mobj.group('id'))
2615
b05654f0 2616
f07e276a
S
class YoutubeLiveIE(YoutubeBaseInfoExtractor):
    """Extractor for ``.../live`` URLs of a user or channel."""
    IE_NAME = 'youtube:live'
    IE_DESC = 'YouTube.com live streams'
    _VALID_URL = r'(?P<base_url>https?://(?:\w+\.)?youtube\.com/(?:(?:user|channel|c)/)?(?P<id>[^/]+))/live'

    _TESTS = [{
        'url': 'https://www.youtube.com/user/TheYoungTurks/live',
        'info_dict': {
            'id': 'a48o2S1cPoo',
            'ext': 'mp4',
            'title': 'The Young Turks - Live Main Show',
            'uploader': 'The Young Turks',
            'uploader_id': 'TheYoungTurks',
            'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/TheYoungTurks',
            'upload_date': '20150715',
            'license': 'Standard YouTube License',
            'description': 'md5:438179573adcdff3c97ebb1ee632b891',
            'categories': ['News & Politics'],
            'tags': ['Cenk Uygur (TV Program Creator)', 'The Young Turks (Award-Winning Work)', 'Talk Show (TV Genre)'],
            'like_count': int,
            'dislike_count': int,
        },
        'params': {
            'skip_download': True,
        },
    }, {
        'url': 'https://www.youtube.com/channel/UC1yBKRuGpC1tSM73A0ZjYjQ/live',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/c/CommanderVideoHq/live',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/TheYoungTurks/live',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Return the video named by the live page, or else the base URL.

        The page is downloaded non-fatally. A video result is returned
        only when the page's og type starts with 'video' and its
        ``videoId`` meta value is an 11-character id; in every other case
        the URL without the ``/live`` suffix is returned as a url result.
        """
        url_match = re.match(self._VALID_URL, url)
        channel_id = url_match.group('id')
        base_url = url_match.group('base_url')

        webpage = self._download_webpage(url, channel_id, fatal=False)
        if not webpage:
            return self.url_result(base_url)

        page_type = self._og_search_property(
            'type', webpage, 'page type', default='')
        video_id = self._html_search_meta(
            'videoId', webpage, 'video id', default=None)
        names_a_video = (
            page_type.startswith('video') and video_id
            and re.match(r'^[0-9A-Za-z_-]{11}$', video_id))
        if names_a_video:
            return self.url_result(video_id, YoutubeIE.ie_key())
        return self.url_result(base_url)
2667
2668
e462474e
S
class YoutubePlaylistsIE(YoutubePlaylistsBaseInfoExtractor):
    """Extractor for the ``/playlists`` page of a user or channel.

    Only declares matching metadata and tests; no methods are defined here.
    """
    IE_NAME = 'youtube:playlists'
    IE_DESC = 'YouTube.com user/channel playlists'
    _VALID_URL = r'https?://(?:\w+\.)?youtube\.com/(?:user|channel)/(?P<id>[^/]+)/playlists'

    _TESTS = [{
        'url': 'https://www.youtube.com/user/ThirstForScience/playlists',
        'playlist_mincount': 4,
        'info_dict': {
            'id': 'ThirstForScience',
            'title': 'Thirst for Science',
        },
    }, {
        # with "Load more" button
        'url': 'https://www.youtube.com/user/igorkle1/playlists?view=1&sort=dd',
        'playlist_mincount': 70,
        'info_dict': {
            'id': 'igorkle1',
            'title': 'Игорь Клейнер',
        },
    }, {
        'url': 'https://www.youtube.com/channel/UCiU1dHvZObB2iP6xkJ__Icw/playlists',
        'playlist_mincount': 17,
        'info_dict': {
            'id': 'UCiU1dHvZObB2iP6xkJ__Icw',
            'title': 'Chem Player',
        },
    }]
0c148415
S
2697
2698
870f3bfc
S
class YoutubeSearchBaseInfoExtractor(YoutubePlaylistBaseInfoExtractor):
    """Shared base for the search extractors; only overrides ``_VIDEO_RE``."""
    # Captures the 11-character video id and, when present, the title
    # attribute that follows the link.
    _VIDEO_RE = r'href="\s*/watch\?v=(?P<id>[0-9A-Za-z_-]{11})(?:[^"]*"[^>]+\btitle="(?P<title>[^"]+))?'
2701
2702
class YoutubeSearchIE(SearchInfoExtractor, YoutubeSearchBaseInfoExtractor):
    """Extractor for ``ytsearch`` keyword searches."""
    IE_DESC = 'YouTube.com searches'
    # there doesn't appear to be a real limit, for example if you search for
    # 'python' you get more than 8.000.000 results
    _MAX_RESULTS = float('inf')
    IE_NAME = 'youtube:search'
    _SEARCH_KEY = 'ytsearch'
    _EXTRA_QUERY_ARGS = {}
    _TESTS = []

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query.

        Result pages are downloaded one after another, following the
        "Next" link, until ``n`` videos have been collected, a page
        contributes no videos, or there is no next link.

        Returns a playlist result holding at most ``n`` entries, with
        ``query`` passed as the playlist id.

        Raises ExtractorError (expected) when a downloaded page contains a
        search message block instead of results.
        """
        videos = []

        url_query = {
            'search_query': query.encode('utf-8'),
        }
        url_query.update(self._EXTRA_QUERY_ARGS)
        result_url = 'https://www.youtube.com/results?' + compat_urllib_parse_urlencode(url_query)

        for pagenum in itertools.count(1):
            data = self._download_json(
                result_url, video_id='query "%s"' % query,
                note='Downloading page %s' % pagenum,
                errnote='Unable to download API page',
                query={'spf': 'navigate'})
            html_content = data[1]['body']['content']

            if 'class="search-message' in html_content:
                raise ExtractorError(
                    '[youtube] No video results', expected=True)

            new_videos = list(self._process_page(html_content))
            videos += new_videos
            # Stop as soon as n results are in hand. The previous strict
            # ">" comparison fetched one more page when exactly n videos
            # had been collected, and that extra page could even raise
            # "No video results" despite the request being satisfied.
            if not new_videos or len(videos) >= n:
                break
            next_link = self._html_search_regex(
                r'href="(/results\?[^"]*\bsp=[^"]+)"[^>]*>\s*<span[^>]+class="[^"]*\byt-uix-button-content\b[^"]*"[^>]*>Next',
                html_content, 'next link', default=None)
            if next_link is None:
                break
            result_url = compat_urlparse.urljoin('https://www.youtube.com/', next_link)

        # Guarded slice: n may be float('inf') (see _MAX_RESULTS), which
        # cannot be used as a slice bound.
        if len(videos) > n:
            videos = videos[:n]
        return self.playlist_result(videos, query)
75dff0ee 2751
c9ae7b95 2752
a3dd9248 2753class YoutubeSearchDateIE(YoutubeSearchIE):
cb7fb546 2754 IE_NAME = YoutubeSearchIE.IE_NAME + ':date'
a3dd9248 2755 _SEARCH_KEY = 'ytsearchdate'
78caa52a 2756 IE_DESC = 'YouTube.com searches, newest videos first'
b4c08069 2757 _EXTRA_QUERY_ARGS = {'search_sort': 'video_date_uploaded'}
75dff0ee 2758
c9ae7b95 2759
870f3bfc 2760class YoutubeSearchURLIE(YoutubeSearchBaseInfoExtractor):
78caa52a
PH
2761 IE_DESC = 'YouTube.com search URLs'
2762 IE_NAME = 'youtube:search_url'
d2c1f79f 2763 _VALID_URL = r'https?://(?:www\.)?youtube\.com/results\?(.*?&)?(?:search_query|q)=(?P<query>[^&]+)(?:[&]|$)'
cdc628a4
PH
2764 _TESTS = [{
2765 'url': 'https://www.youtube.com/results?baz=bar&search_query=youtube-dl+test+video&filters=video&lclk=video',
2766 'playlist_mincount': 5,
2767 'info_dict': {
2768 'title': 'youtube-dl test video',
2769 }
d2c1f79f
S
2770 }, {
2771 'url': 'https://www.youtube.com/results?q=test&sp=EgQIBBgB',
2772 'only_matching': True,
cdc628a4 2773 }]
c9ae7b95
PH
2774
2775 def _real_extract(self, url):
2776 mobj = re.match(self._VALID_URL, url)
7fd002c0 2777 query = compat_urllib_parse_unquote_plus(mobj.group('query'))
c9ae7b95 2778 webpage = self._download_webpage(url, query)
175c2e9e 2779 return self.playlist_result(self._process_page(webpage), playlist_title=query)
c9ae7b95
PH
2780
2781
136dadde 2782class YoutubeShowIE(YoutubePlaylistsBaseInfoExtractor):
78caa52a 2783 IE_DESC = 'YouTube.com (multi-season) shows'
92519402 2784 _VALID_URL = r'https?://(?:www\.)?youtube\.com/show/(?P<id>[^?#]*)'
78caa52a 2785 IE_NAME = 'youtube:show'
cdc628a4 2786 _TESTS = [{
4003bd82 2787 'url': 'https://www.youtube.com/show/airdisasters',
8801255d 2788 'playlist_mincount': 5,
cdc628a4
PH
2789 'info_dict': {
2790 'id': 'airdisasters',
2791 'title': 'Air Disasters',
2792 }
2793 }]
75dff0ee
JMF
2794
2795 def _real_extract(self, url):
136dadde
S
2796 playlist_id = self._match_id(url)
2797 return super(YoutubeShowIE, self)._real_extract(
2798 'https://www.youtube.com/show/%s/playlists' % playlist_id)
04cc9617
JMF
2799
2800
b2e8bc1b 2801class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor):
d7ae0639 2802 """
25f14e9f 2803 Base class for feed extractors
d7ae0639
JMF
2804 Subclasses must define the _FEED_NAME and _PLAYLIST_TITLE properties.
2805 """
b2e8bc1b 2806 _LOGIN_REQUIRED = True
d7ae0639
JMF
2807
2808 @property
2809 def IE_NAME(self):
78caa52a 2810 return 'youtube:%s' % self._FEED_NAME
04cc9617 2811
81f0259b 2812 def _real_initialize(self):
b2e8bc1b 2813 self._login()
81f0259b 2814
3853309f 2815 def _entries(self, page):
2bc43303
JMF
2816 # The extraction process is the same as for playlists, but the regex
2817 # for the video ids doesn't contain an index
2818 ids = []
2819 more_widget_html = content_html = page
2bc43303
JMF
2820 for page_num in itertools.count(1):
2821 matches = re.findall(r'href="\s*/watch\?v=([0-9A-Za-z_-]{11})', content_html)
62c95fd5
S
2822
2823 # 'recommended' feed has infinite 'load more' and each new portion spins
2824 # the same videos in (sometimes) slightly different order, so we'll check
2825 # for unicity and break when portion has no new videos
3853309f 2826 new_ids = list(filter(lambda video_id: video_id not in ids, orderedSet(matches)))
62c95fd5
S
2827 if not new_ids:
2828 break
2829
2bc43303
JMF
2830 ids.extend(new_ids)
2831
3853309f
S
2832 for entry in self._ids_to_results(new_ids):
2833 yield entry
2834
2bc43303
JMF
2835 mobj = re.search(r'data-uix-load-more-href="/?(?P<more>[^"]+)"', more_widget_html)
2836 if not mobj:
2837 break
2838
2839 more = self._download_json(
25f14e9f 2840 'https://youtube.com/%s' % mobj.group('more'), self._PLAYLIST_TITLE,
2bc43303
JMF
2841 'Downloading page #%s' % page_num,
2842 transform_source=uppercase_escape)
2843 content_html = more['content_html']
2844 more_widget_html = more['load_more_widget_html']
2845
3853309f
S
2846 def _real_extract(self, url):
2847 page = self._download_webpage(
2848 'https://www.youtube.com/feed/%s' % self._FEED_NAME,
2849 self._PLAYLIST_TITLE)
25f14e9f 2850 return self.playlist_result(
3853309f 2851 self._entries(page), playlist_title=self._PLAYLIST_TITLE)
25f14e9f
S
2852
2853
class YoutubeWatchLaterIE(YoutubePlaylistIE):
    """Extractor for the watch later list, handled as playlist 'WL'."""
    IE_NAME = 'youtube:watchlater'
    IE_DESC = 'Youtube watch later list, ":ytwatchlater" for short (requires authentication)'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/(?:feed/watch_later|(?:playlist|watch)\?(?:.+&)?list=WL)|:ytwatchlater'

    _TESTS = [{
        'url': 'https://www.youtube.com/playlist?list=WL',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/watch?v=bCNU9TrbiRk&index=1&list=WL',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Return the single video if one was requested, else the 'WL' playlist."""
        single_video = self._check_download_just_video(url, 'WL')[1]
        if single_video:
            return single_video
        # The has_videos flag is not used here; only the playlist matters.
        return self._extract_playlist('WL')[1]
f459d170 2873
5f6a1245 2874
c626a3d9 2875class YoutubeFavouritesIE(YoutubeBaseInfoExtractor):
78caa52a 2876 IE_NAME = 'youtube:favorites'
f3a34072 2877 IE_DESC = 'YouTube.com favourite videos, ":ytfav" for short (requires authentication)'
92519402 2878 _VALID_URL = r'https?://(?:www\.)?youtube\.com/my_favorites|:ytfav(?:ou?rites)?'
c626a3d9
JMF
2879 _LOGIN_REQUIRED = True
2880
2881 def _real_extract(self, url):
2882 webpage = self._download_webpage('https://www.youtube.com/my_favorites', 'Youtube Favourites videos')
78caa52a 2883 playlist_id = self._search_regex(r'list=(.+?)["&]', webpage, 'favourites playlist id')
c626a3d9 2884 return self.url_result(playlist_id, 'YoutubePlaylist')
15870e90
PH
2885
2886
25f14e9f
S
class YoutubeRecommendedIE(YoutubeFeedsInfoExtractor):
    """Feed extractor bound to the 'recommended' feed."""
    _FEED_NAME = 'recommended'
    _PLAYLIST_TITLE = 'Youtube Recommended videos'
    IE_DESC = 'YouTube.com recommended videos, ":ytrec" for short (requires authentication)'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/feed/recommended|:ytrec(?:ommended)?'
1ed5b5c9 2892
1ed5b5c9 2893
25f14e9f
S
class YoutubeSubscriptionsIE(YoutubeFeedsInfoExtractor):
    """Feed extractor bound to the 'subscriptions' feed."""
    _FEED_NAME = 'subscriptions'
    _PLAYLIST_TITLE = 'Youtube Subscriptions'
    IE_DESC = 'YouTube.com subscriptions feed, "ytsubs" keyword (requires authentication)'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/feed/subscriptions|:ytsubs(?:criptions)?'
1ed5b5c9 2899
1ed5b5c9 2900
25f14e9f
S
class YoutubeHistoryIE(YoutubeFeedsInfoExtractor):
    """Feed extractor bound to the 'history' feed."""
    _FEED_NAME = 'history'
    _PLAYLIST_TITLE = 'Youtube History'
    IE_DESC = 'Youtube watch history, ":ythistory" for short (requires authentication)'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/feed/history|:ythistory'
1ed5b5c9
JMF
2906
2907
15870e90
PH
class YoutubeTruncatedURLIE(InfoExtractor):
    """Matches watch/attribution URLs that end without a video id and
    answers them with an explanatory error instead of extracting."""
    IE_NAME = 'youtube:truncated_url'
    IE_DESC = False  # Do not list
    _VALID_URL = r'''(?x)
        (?:https?://)?
        (?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie)?\.com/
        (?:watch\?(?:
            feature=[a-z_]+|
            annotation_id=annotation_[^&]+|
            x-yt-cl=[0-9]+|
            hl=[^&]*|
            t=[0-9]+
        )?
        |
            attribution_link\?a=[^&]+
        )
        $
    '''

    _TESTS = [{
        'url': 'https://www.youtube.com/watch?annotation_id=annotation_3951667041',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/watch?',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/watch?x-yt-cl=84503534',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/watch?feature=foo',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/watch?hl=en-GB',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/watch?t=2372',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Always raise an expected ExtractorError telling the user to
        quote the URL."""
        hint = (
            'Did you forget to quote the URL? Remember that & is a meta '
            'character in most shells, so you want to put the URL in quotes, '
            'like youtube-dl '
            '"https://www.youtube.com/watch?feature=foo&v=BaW_jenozKc" '
            ' or simply youtube-dl BaW_jenozKc .')
        raise ExtractorError(hint, expected=True)
772fd5cc
PH
2955
2956
class YoutubeTruncatedIDIE(InfoExtractor):
    """Matches watch URLs whose ``v=`` value is 1 to 10 characters long
    and reports them as truncated."""
    IE_NAME = 'youtube:truncated_id'
    IE_DESC = False  # Do not list
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/watch\?v=(?P<id>[0-9A-Za-z_-]{1,10})$'

    _TESTS = [{
        'url': 'https://www.youtube.com/watch?v=N_708QY7Ob',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Always raise an expected ExtractorError naming the short id."""
        partial_id = self._match_id(url)
        message = 'Incomplete YouTube ID %s. URL %s looks truncated.' % (partial_id, url)
        raise ExtractorError(message, expected=True)