jfr.im git - yt-dlp.git/blame - youtube_dl/extractor/youtube.py
[extractor/common] Introduce channel meta fields
[yt-dlp.git] / youtube_dl / extractor / youtube.py
CommitLineData
c5e8d7af 1# coding: utf-8
c5e8d7af 2
78caa52a
PH
3from __future__ import unicode_literals
4
5
0ca96d48 6import itertools
c5e8d7af 7import json
c4417ddb 8import os.path
d77ab8e2 9import random
c5e8d7af 10import re
42939b61 11import time
e0df6211 12import traceback
c5e8d7af 13
b05654f0 14from .common import InfoExtractor, SearchInfoExtractor
2b25cb5d 15from ..jsinterp import JSInterpreter
54256267 16from ..swfinterp import SWFInterpreter
4bb4a188 17from ..compat import (
edf3e38e 18 compat_chr,
8d81f3e3 19 compat_kwargs,
c5e8d7af 20 compat_parse_qs,
7fd002c0
S
21 compat_urllib_parse_unquote,
22 compat_urllib_parse_unquote_plus,
15707c7e 23 compat_urllib_parse_urlencode,
7c80519c 24 compat_urllib_parse_urlparse,
7c61bd36 25 compat_urlparse,
c5e8d7af 26 compat_str,
4bb4a188
PH
27)
28from ..utils import (
c5e8d7af 29 clean_html,
9b9c5355 30 error_to_compat_str,
c5e8d7af 31 ExtractorError,
2d30521a 32 float_or_none,
4bb4a188
PH
33 get_element_by_attribute,
34 get_element_by_id,
dd27fd17 35 int_or_none,
94278f72 36 mimetype2ext,
4bb4a188 37 orderedSet,
6310acf5 38 parse_codecs,
7c80519c 39 parse_duration,
54fc90aa 40 qualities,
0cb58b02 41 remove_quotes,
3995d37d 42 remove_start,
cf7e015f 43 smuggle_url,
c93d53f5 44 str_to_int,
556dbe7f 45 try_get,
c5e8d7af
PH
46 unescapeHTML,
47 unified_strdate,
cf7e015f 48 unsmuggle_url,
81c2f20b 49 uppercase_escape,
6e6bc8da 50 urlencode_postdata,
c5e8d7af
PH
51)
52
5f6a1245 53
class YoutubeBaseInfoExtractor(InfoExtractor):
    """Provide base functions for Youtube extractors"""
    _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
    _TWOFACTOR_URL = 'https://accounts.google.com/signin/challenge'

    # Endpoints POSTed to by _login() (account lookup, password challenge,
    # and TFA submission; the latter is formatted with the extracted TL value)
    _LOOKUP_URL = 'https://accounts.google.com/_/signin/sl/lookup'
    _CHALLENGE_URL = 'https://accounts.google.com/_/signin/sl/challenge'
    _TFA_URL = 'https://accounts.google.com/_/signin/challenge?hl=en&TL={0}'

    _NETRC_MACHINE = 'youtube'
    # If True it will raise an error if no login info is provided
    _LOGIN_REQUIRED = False

    _PLAYLIST_ID_RE = r'(?:PL|LL|EC|UU|FL|RD|UL|TL|OLAK5uy_)[0-9A-Za-z-_]{10,}'

    def _set_language(self):
        """Set the PREF cookie (containing hl=en) for .youtube.com."""
        self._set_cookie(
            '.youtube.com', 'PREF', 'f1=50000000&hl=en',
            # YouTube sets the expire time to about two months
            expire_time=time.time() + 2 * 30 * 24 * 3600)

    def _ids_to_results(self, ids):
        """Return a list with one 'Youtube' url_result per video id in ids."""
        return [
            self.url_result(vid_id, 'Youtube', video_id=vid_id)
            for vid_id in ids]

    def _login(self):
        """
        Attempt to log in to YouTube.
        True is returned if successful or skipped.
        False is returned if login failed.

        If _LOGIN_REQUIRED is set and no authentication was provided, an error is raised.
        """
        username, password = self._get_login_info()
        # No authentication to be performed
        if username is None:
            if self._LOGIN_REQUIRED and self._downloader.params.get('cookiefile') is None:
                raise ExtractorError('No login info available, needed for using %s.' % self.IE_NAME, expected=True)
            return True

        login_page = self._download_webpage(
            self._LOGIN_URL, None,
            note='Downloading login page',
            errnote='unable to fetch login page', fatal=False)
        if login_page is False:
            # Report failure explicitly, as promised by the docstring
            return False

        login_form = self._hidden_inputs(login_page)

        def req(url, f_req, note, errnote):
            """POST f_req (JSON-encoded) together with the login form fields.

            Returns the parsed JSON response, or False when the non-fatal
            download fails.
            """
            data = login_form.copy()
            data.update({
                'pstMsg': 1,
                'checkConnection': 'youtube',
                'checkedDomains': 'youtube',
                'hl': 'en',
                'deviceinfo': '[null,null,null,[],null,"US",null,null,[],"GlifWebSignIn",null,[null,null,[]]]',
                'f.req': json.dumps(f_req),
                'flowName': 'GlifWebSignIn',
                'flowEntry': 'ServiceLogin',
            })
            return self._download_json(
                url, None, note=note, errnote=errnote,
                # Drop everything preceding the first '[' so that only the
                # JSON array is handed to the parser
                transform_source=lambda s: re.sub(r'^[^[]*', '', s),
                fatal=False,
                data=urlencode_postdata(data), headers={
                    'Content-Type': 'application/x-www-form-urlencoded;charset=utf-8',
                    'Google-Accounts-XSRF': 1,
                })

        def warn(message):
            self._downloader.report_warning(message)

        # Shared by the lookup and the challenge request payloads
        continue_url = 'https://accounts.google.com/ServiceLogin?passive=true&continue=https%3A%2F%2Fwww.youtube.com%2Fsignin%3Fnext%3D%252F%26action_handle_signin%3Dtrue%26hl%3Den%26app%3Ddesktop%26feature%3Dsign_in_button&hl=en&service=youtube&uilel=3&requestPath=%2FServiceLogin&Page=PasswordSeparationSignIn'

        lookup_req = [
            username,
            None, [], None, 'US', None, None, 2, False, True,
            [
                None, None,
                [2, 1, None, 1,
                 continue_url,
                 None, [], 4],
                1, [None, None, []], None, None, None, True
            ],
            username,
        ]

        lookup_results = req(
            self._LOOKUP_URL, lookup_req,
            'Looking up account info', 'Unable to look up account info')

        if lookup_results is False:
            return False

        user_hash = try_get(lookup_results, lambda x: x[0][2], compat_str)
        if not user_hash:
            warn('Unable to extract user hash')
            return False

        challenge_req = [
            user_hash,
            None, 1, None, [1, None, None, None, [password, None, True]],
            [
                None, None, [2, 1, None, 1, continue_url, None, [], 4],
                1, [None, None, []], None, None, None, True
            ]]

        challenge_results = req(
            self._CHALLENGE_URL, challenge_req,
            'Logging in', 'Unable to log in')

        if challenge_results is False:
            # Report failure explicitly, as promised by the docstring
            return False

        login_res = try_get(challenge_results, lambda x: x[0][5], list)
        if login_res:
            login_msg = try_get(login_res, lambda x: x[5], compat_str)
            # Parenthesised so that the prefix is kept for every message;
            # '%' binds tighter than the conditional expression
            warn('Unable to login: %s' % (
                'Invalid password'
                if login_msg == 'INCORRECT_ANSWER_ENTERED' else login_msg))
            return False

        res = try_get(challenge_results, lambda x: x[0][-1], list)
        if not res:
            warn('Unable to extract result entry')
            return False

        login_challenge = try_get(res, lambda x: x[0][0], list)
        if login_challenge:
            challenge_str = try_get(login_challenge, lambda x: x[2], compat_str)
            if challenge_str == 'TWO_STEP_VERIFICATION':
                # SEND_SUCCESS - TFA code has been successfully sent to phone
                # QUOTA_EXCEEDED - reached the limit of TFA codes
                status = try_get(login_challenge, lambda x: x[5], compat_str)
                if status == 'QUOTA_EXCEEDED':
                    warn('Exceeded the limit of TFA codes, try later')
                    return False

                tl = try_get(challenge_results, lambda x: x[1][2], compat_str)
                if not tl:
                    warn('Unable to extract TL')
                    return False

                tfa_code = self._get_tfa_info('2-step verification code')

                if not tfa_code:
                    warn(
                        'Two-factor authentication required. Provide it either interactively or with --twofactor <code> '
                        '(Note that only TOTP (Google Authenticator App) codes work at this time.)')
                    return False

                tfa_code = remove_start(tfa_code, 'G-')

                tfa_req = [
                    user_hash, None, 2, None,
                    [
                        9, None, None, None, None, None, None, None,
                        [None, tfa_code, True, 2]
                    ]]

                tfa_results = req(
                    self._TFA_URL.format(tl), tfa_req,
                    'Submitting TFA code', 'Unable to submit TFA code')

                if tfa_results is False:
                    return False

                tfa_res = try_get(tfa_results, lambda x: x[0][5], list)
                if tfa_res:
                    tfa_msg = try_get(tfa_res, lambda x: x[5], compat_str)
                    # Same precedence fix as for the password challenge above
                    warn('Unable to finish TFA: %s' % (
                        'Invalid TFA code'
                        if tfa_msg == 'INCORRECT_ANSWER_ENTERED' else tfa_msg))
                    return False

                check_cookie_url = try_get(
                    tfa_results, lambda x: x[0][-1][2], compat_str)
            else:
                CHALLENGES = {
                    'LOGIN_CHALLENGE': "This device isn't recognized. For your security, Google wants to make sure it's really you.",
                    'USERNAME_RECOVERY': 'Please provide additional information to aid in the recovery process.',
                    'REAUTH': "There is something unusual about your activity. For your security, Google wants to make sure it's really you.",
                }
                challenge = CHALLENGES.get(
                    challenge_str,
                    '%s returned error %s.' % (self.IE_NAME, challenge_str))
                warn('%s\nGo to https://accounts.google.com/, login and solve a challenge.' % challenge)
                return False
        else:
            check_cookie_url = try_get(res, lambda x: x[2], compat_str)

        if not check_cookie_url:
            warn('Unable to extract CheckCookie URL')
            return False

        check_cookie_results = self._download_webpage(
            check_cookie_url, None, 'Checking cookie', fatal=False)

        if check_cookie_results is False:
            return False

        if 'https://myaccount.google.com/' not in check_cookie_results:
            warn('Unable to log in')
            return False

        return True

    def _download_webpage_handle(self, *args, **kwargs):
        """Delegate to the parent implementation with disable_polymer=true
        added to the request query."""
        # Work on a copy so the caller's query dict is never mutated, and
        # tolerate an explicit query=None
        query = dict(kwargs.get('query') or {})
        query['disable_polymer'] = 'true'
        kwargs['query'] = query
        return super(YoutubeBaseInfoExtractor, self)._download_webpage_handle(
            *args, **compat_kwargs(kwargs))

    def _real_initialize(self):
        """Set the language cookie and attempt to log in, when a downloader
        is attached."""
        if self._downloader is None:
            return
        self._set_language()
        # A failed login is not fatal here: the result is deliberately ignored
        self._login()
c5e8d7af 272
8377574c 273
class YoutubeEntryListBaseInfoExtractor(YoutubeBaseInfoExtractor):
    # Extract entries from page with "Load more" button
    def _entries(self, page, playlist_id):
        """Yield the entries of page and of every follow-up "Load more" page.

        Each chunk of HTML is passed to self._process_page(); the next chunk
        is fetched from the data-uix-load-more-href link found in the most
        recent load-more widget markup, until no link is left or a chunk
        comes back blank.
        """
        current_html = page
        widget_html = page
        page_num = 0
        while True:
            page_num += 1

            for entry in self._process_page(current_html):
                yield entry

            load_more = re.search(
                r'data-uix-load-more-href="/?(?P<more>[^"]+)"', widget_html)
            if load_more is None:
                break

            more_url = 'https://youtube.com/%s' % load_more.group('more')
            more = self._download_json(
                more_url, playlist_id,
                'Downloading page #%s' % page_num,
                transform_source=uppercase_escape)

            current_html = more['content_html']
            if not current_html.strip():
                # Some webpages show a "Load more" button but they don't
                # have more videos
                break
            widget_html = more['load_more_widget_html']
296
061a75ed
S
297
class YoutubePlaylistBaseInfoExtractor(YoutubeEntryListBaseInfoExtractor):
    def _process_page(self, content):
        """Yield one 'Youtube' url_result per (id, title) pair found in content."""
        for video_id, video_title in self.extract_videos_from_page(content):
            yield self.url_result(video_id, 'Youtube', video_id, video_title)

    def extract_videos_from_page(self, page):
        """Return (video_id, title) pairs for the self._VIDEO_RE matches in page.

        Ids are reported once, in order of first appearance.  When an id
        matches several times, the first non-empty title seen for it is kept.
        """
        ids_in_page = []
        titles_in_page = []
        # Maps a video id to its position in the two lists above, so that a
        # repeated id is found in constant time instead of via list.index()
        position_by_id = {}
        for mobj in re.finditer(self._VIDEO_RE, page):
            # The link with index 0 is not the first video of the playlist (not sure if still actual)
            # NOTE(review): this compares the 'id' group with '0'; given the
            # comment above it may have been meant to test the 'index' group
            # - confirm before changing
            if 'index' in mobj.groupdict() and mobj.group('id') == '0':
                continue
            video_id = mobj.group('id')
            video_title = unescapeHTML(mobj.group('title'))
            if video_title:
                video_title = video_title.strip()
            idx = position_by_id.get(video_id)
            if idx is None:
                position_by_id[video_id] = len(ids_in_page)
                ids_in_page.append(video_id)
                titles_in_page.append(video_title)
            elif video_title and not titles_in_page[idx]:
                titles_in_page[idx] = video_title
        return zip(ids_in_page, titles_in_page)
322
323
061a75ed
S
class YoutubePlaylistsBaseInfoExtractor(YoutubeEntryListBaseInfoExtractor):
    def _process_page(self, content):
        """Yield a 'YoutubePlaylist' url_result for each distinct playlist id
        linked from a yt-lockup-title heading in content."""
        linked_ids = re.findall(
            r'<h3[^>]+class="[^"]*yt-lockup-title[^"]*"[^>]*><a[^>]+href="/?playlist\?list=([0-9A-Za-z-_]{10,})"',
            content)
        for playlist_id in orderedSet(linked_ids):
            playlist_url = 'https://www.youtube.com/playlist?list=%s' % playlist_id
            yield self.url_result(playlist_url, 'YoutubePlaylist')

    def _real_extract(self, url):
        """Download url and return a playlist result built from its entries,
        titled with the page's og title when one is found."""
        playlist_id = self._match_id(url)
        webpage = self._download_webpage(url, playlist_id)
        # A missing title is tolerated (non-fatal lookup)
        title = self._og_search_title(webpage, fatal=False)
        entries = self._entries(webpage, playlist_id)
        return self.playlist_result(entries, playlist_id, title)
0c148415
S
337
338
360e1ca5 339class YoutubeIE(YoutubeBaseInfoExtractor):
78caa52a 340 IE_DESC = 'YouTube.com'
cb7dfeea 341 _VALID_URL = r"""(?x)^
c5e8d7af 342 (
edb53e2d 343 (?:https?://|//) # http(s):// or protocol-independent URL
cb7dfeea 344 (?:(?:(?:(?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie)?\.com/|
484aaeb2 345 (?:www\.)?deturl\.com/www\.youtube\.com/|
e70dc1d1 346 (?:www\.)?pwnyoutube\.com/|
8b561bfc 347 (?:www\.)?hooktube\.com/|
f7000f3a 348 (?:www\.)?yourepeat\.com/|
e69ae5b9
JMF
349 tube\.majestyc\.net/|
350 youtube\.googleapis\.com/) # the various hostnames, with wildcard subdomains
c5e8d7af
PH
351 (?:.*?\#/)? # handle anchor (#/) redirect urls
352 (?: # the various things that can precede the ID:
ac7553d0 353 (?:(?:v|embed|e)/(?!videoseries)) # v/ or embed/ or e/
c5e8d7af 354 |(?: # or the v= param in all its forms
f7000f3a 355 (?:(?:watch|movie)(?:_popup)?(?:\.php)?/?)? # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
c5e8d7af 356 (?:\?|\#!?) # the params delimiter ? or # or #!
040ac686 357 (?:.*?[&;])?? # any other preceding param (like /?s=tuff&v=xxxx or ?s=tuff&amp;v=V36LpHqtcDY)
c5e8d7af
PH
358 v=
359 )
f4b05232 360 ))
cbaed4bb
S
361 |(?:
362 youtu\.be| # just youtu.be/xxxx
6d4fc66b
S
363 vid\.plus| # or vid.plus/xxxx
364 zwearz\.com/watch| # or zwearz.com/watch/xxxx
cbaed4bb 365 )/
edb53e2d 366 |(?:www\.)?cleanvideosearch\.com/media/action/yt/watch\?videoId=
f4b05232 367 )
c5e8d7af 368 )? # all until now is optional -> you can pass the naked ID
8963d9c2 369 ([0-9A-Za-z_-]{11}) # here is it! the YouTube video ID
d0ba5587
S
370 (?!.*?\blist=
371 (?:
372 %(playlist_id)s| # combined list/video URLs are handled by the playlist IE
373 WL # WL are handled by the watch later IE
374 )
375 )
c5e8d7af 376 (?(1).+)? # if we found the ID, everything can follow
d0ba5587 377 $""" % {'playlist_id': YoutubeBaseInfoExtractor._PLAYLIST_ID_RE}
c5e8d7af 378 _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
2c62dc26 379 _formats = {
c2d3cb4c 380 '5': {'ext': 'flv', 'width': 400, 'height': 240, 'acodec': 'mp3', 'abr': 64, 'vcodec': 'h263'},
381 '6': {'ext': 'flv', 'width': 450, 'height': 270, 'acodec': 'mp3', 'abr': 64, 'vcodec': 'h263'},
382 '13': {'ext': '3gp', 'acodec': 'aac', 'vcodec': 'mp4v'},
383 '17': {'ext': '3gp', 'width': 176, 'height': 144, 'acodec': 'aac', 'abr': 24, 'vcodec': 'mp4v'},
384 '18': {'ext': 'mp4', 'width': 640, 'height': 360, 'acodec': 'aac', 'abr': 96, 'vcodec': 'h264'},
385 '22': {'ext': 'mp4', 'width': 1280, 'height': 720, 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264'},
386 '34': {'ext': 'flv', 'width': 640, 'height': 360, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'},
387 '35': {'ext': 'flv', 'width': 854, 'height': 480, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'},
3834d3e3 388 # itag 36 videos are either 320x180 (BaW_jenozKc) or 320x240 (__2ABJjxzNo), abr varies as well
c2d3cb4c 389 '36': {'ext': '3gp', 'width': 320, 'acodec': 'aac', 'vcodec': 'mp4v'},
390 '37': {'ext': 'mp4', 'width': 1920, 'height': 1080, 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264'},
391 '38': {'ext': 'mp4', 'width': 4096, 'height': 3072, 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264'},
392 '43': {'ext': 'webm', 'width': 640, 'height': 360, 'acodec': 'vorbis', 'abr': 128, 'vcodec': 'vp8'},
393 '44': {'ext': 'webm', 'width': 854, 'height': 480, 'acodec': 'vorbis', 'abr': 128, 'vcodec': 'vp8'},
394 '45': {'ext': 'webm', 'width': 1280, 'height': 720, 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8'},
e1a0bfdf 395 '46': {'ext': 'webm', 'width': 1920, 'height': 1080, 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8'},
c2d3cb4c 396 '59': {'ext': 'mp4', 'width': 854, 'height': 480, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'},
397 '78': {'ext': 'mp4', 'width': 854, 'height': 480, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'},
e1a0bfdf 398
399
400 # 3D videos
c2d3cb4c 401 '82': {'ext': 'mp4', 'height': 360, 'format_note': '3D', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -20},
402 '83': {'ext': 'mp4', 'height': 480, 'format_note': '3D', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -20},
403 '84': {'ext': 'mp4', 'height': 720, 'format_note': '3D', 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264', 'preference': -20},
404 '85': {'ext': 'mp4', 'height': 1080, 'format_note': '3D', 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264', 'preference': -20},
e1a0bfdf 405 '100': {'ext': 'webm', 'height': 360, 'format_note': '3D', 'acodec': 'vorbis', 'abr': 128, 'vcodec': 'vp8', 'preference': -20},
406 '101': {'ext': 'webm', 'height': 480, 'format_note': '3D', 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8', 'preference': -20},
407 '102': {'ext': 'webm', 'height': 720, 'format_note': '3D', 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8', 'preference': -20},
836a086c 408
96fb5605 409 # Apple HTTP Live Streaming
11f12195 410 '91': {'ext': 'mp4', 'height': 144, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 48, 'vcodec': 'h264', 'preference': -10},
c2d3cb4c 411 '92': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 48, 'vcodec': 'h264', 'preference': -10},
412 '93': {'ext': 'mp4', 'height': 360, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -10},
413 '94': {'ext': 'mp4', 'height': 480, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -10},
414 '95': {'ext': 'mp4', 'height': 720, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 256, 'vcodec': 'h264', 'preference': -10},
415 '96': {'ext': 'mp4', 'height': 1080, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 256, 'vcodec': 'h264', 'preference': -10},
e1a0bfdf 416 '132': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 48, 'vcodec': 'h264', 'preference': -10},
417 '151': {'ext': 'mp4', 'height': 72, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 24, 'vcodec': 'h264', 'preference': -10},
2c62dc26
PH
418
419 # DASH mp4 video
d23028a8
S
420 '133': {'ext': 'mp4', 'height': 240, 'format_note': 'DASH video', 'vcodec': 'h264'},
421 '134': {'ext': 'mp4', 'height': 360, 'format_note': 'DASH video', 'vcodec': 'h264'},
422 '135': {'ext': 'mp4', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'h264'},
423 '136': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'h264'},
424 '137': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'h264'},
425 '138': {'ext': 'mp4', 'format_note': 'DASH video', 'vcodec': 'h264'}, # Height can vary (https://github.com/rg3/youtube-dl/issues/4559)
426 '160': {'ext': 'mp4', 'height': 144, 'format_note': 'DASH video', 'vcodec': 'h264'},
427 '212': {'ext': 'mp4', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'h264'},
428 '264': {'ext': 'mp4', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'h264'},
429 '298': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'h264', 'fps': 60},
430 '299': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'h264', 'fps': 60},
431 '266': {'ext': 'mp4', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'h264'},
836a086c 432
f6f1fc92 433 # Dash mp4 audio
d23028a8
S
434 '139': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 48, 'container': 'm4a_dash'},
435 '140': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 128, 'container': 'm4a_dash'},
436 '141': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 256, 'container': 'm4a_dash'},
437 '256': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'container': 'm4a_dash'},
438 '258': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'container': 'm4a_dash'},
439 '325': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'dtse', 'container': 'm4a_dash'},
440 '328': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'ec-3', 'container': 'm4a_dash'},
836a086c
AZ
441
442 # Dash webm
d23028a8
S
443 '167': {'ext': 'webm', 'height': 360, 'width': 640, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
444 '168': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
445 '169': {'ext': 'webm', 'height': 720, 'width': 1280, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
446 '170': {'ext': 'webm', 'height': 1080, 'width': 1920, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
447 '218': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
448 '219': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
449 '278': {'ext': 'webm', 'height': 144, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp9'},
450 '242': {'ext': 'webm', 'height': 240, 'format_note': 'DASH video', 'vcodec': 'vp9'},
451 '243': {'ext': 'webm', 'height': 360, 'format_note': 'DASH video', 'vcodec': 'vp9'},
452 '244': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'vp9'},
453 '245': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'vp9'},
454 '246': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'vp9'},
455 '247': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'vp9'},
456 '248': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'vp9'},
457 '271': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'vp9'},
4c6b4764 458 # itag 272 videos are either 3840x2160 (e.g. RtoitU2A-3E) or 7680x4320 (sLprVF6d7Ug)
d23028a8
S
459 '272': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'vp9'},
460 '302': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60},
461 '303': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60},
462 '308': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60},
463 '313': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'vp9'},
464 '315': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60},
2c62dc26
PH
465
466 # Dash webm audio
d23028a8
S
467 '171': {'ext': 'webm', 'acodec': 'vorbis', 'format_note': 'DASH audio', 'abr': 128},
468 '172': {'ext': 'webm', 'acodec': 'vorbis', 'format_note': 'DASH audio', 'abr': 256},
ce6b9a2d 469
0857baad 470 # Dash webm audio with opus inside
d23028a8
S
471 '249': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 50},
472 '250': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 70},
473 '251': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 160},
0857baad 474
ce6b9a2d
PH
475 # RTMP (unnamed)
476 '_rtmp': {'protocol': 'rtmp'},
c5e8d7af 477 }
23d17e4b 478 _SUBTITLE_FORMATS = ('ttml', 'vtt')
836a086c 479
fd5c4aab
S
480 _GEO_BYPASS = False
481
78caa52a 482 IE_NAME = 'youtube'
2eb88d95
PH
483 _TESTS = [
484 {
2d3d2997 485 'url': 'https://www.youtube.com/watch?v=BaW_jenozKc&t=1s&end=9',
4bc3a23e
PH
486 'info_dict': {
487 'id': 'BaW_jenozKc',
488 'ext': 'mp4',
489 'title': 'youtube-dl test video "\'/\\ä↭𝕐',
490 'uploader': 'Philipp Hagemeister',
491 'uploader_id': 'phihag',
ec85ded8 492 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/phihag',
4bc3a23e 493 'upload_date': '20121002',
7caf9830 494 'license': 'Standard YouTube License',
4bc3a23e
PH
495 'description': 'test chars: "\'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de .',
496 'categories': ['Science & Technology'],
000b6b5a 497 'tags': ['youtube-dl'],
556dbe7f 498 'duration': 10,
3e7c1224
PH
499 'like_count': int,
500 'dislike_count': int,
7c80519c 501 'start_time': 1,
297a564b 502 'end_time': 9,
2eb88d95 503 }
0e853ca4 504 },
0e853ca4 505 {
2d3d2997 506 'url': 'https://www.youtube.com/watch?v=UxxajLWwzqY',
4bc3a23e
PH
507 'note': 'Test generic use_cipher_signature video (#897)',
508 'info_dict': {
509 'id': 'UxxajLWwzqY',
510 'ext': 'mp4',
511 'upload_date': '20120506',
512 'title': 'Icona Pop - I Love It (feat. Charli XCX) [OFFICIAL VIDEO]',
0cb58b02 513 'alt_title': 'I Love It (feat. Charli XCX)',
7caf9830 514 'description': 'md5:f3ceb5ef83a08d95b9d146f973157cc8',
000b6b5a
S
515 'tags': ['Icona Pop i love it', 'sweden', 'pop music', 'big beat records', 'big beat', 'charli',
516 'xcx', 'charli xcx', 'girls', 'hbo', 'i love it', "i don't care", 'icona', 'pop',
517 'iconic ep', 'iconic', 'love', 'it'],
556dbe7f 518 'duration': 180,
4bc3a23e
PH
519 'uploader': 'Icona Pop',
520 'uploader_id': 'IconaPop',
ec85ded8 521 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/IconaPop',
7caf9830 522 'license': 'Standard YouTube License',
0cb58b02 523 'creator': 'Icona Pop',
936784b2
S
524 'track': 'I Love It (feat. Charli XCX)',
525 'artist': 'Icona Pop',
2eb88d95 526 }
c108eb73
JMF
527 },
528 {
4bc3a23e
PH
529 'url': 'https://www.youtube.com/watch?v=07FYdnEawAQ',
530 'note': 'Test VEVO video with age protection (#956)',
531 'info_dict': {
532 'id': '07FYdnEawAQ',
533 'ext': 'mp4',
534 'upload_date': '20130703',
535 'title': 'Justin Timberlake - Tunnel Vision (Explicit)',
0cb58b02 536 'alt_title': 'Tunnel Vision',
4bc3a23e 537 'description': 'md5:64249768eec3bc4276236606ea996373',
556dbe7f 538 'duration': 419,
4bc3a23e
PH
539 'uploader': 'justintimberlakeVEVO',
540 'uploader_id': 'justintimberlakeVEVO',
ec85ded8 541 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/justintimberlakeVEVO',
7caf9830 542 'license': 'Standard YouTube License',
0cb58b02 543 'creator': 'Justin Timberlake',
7e72694b 544 'track': 'Tunnel Vision',
936784b2 545 'artist': 'Justin Timberlake',
34952f09 546 'age_limit': 18,
c108eb73
JMF
547 }
548 },
fccd3771 549 {
4bc3a23e
PH
550 'url': '//www.YouTube.com/watch?v=yZIXLfi8CZQ',
551 'note': 'Embed-only video (#1746)',
552 'info_dict': {
553 'id': 'yZIXLfi8CZQ',
554 'ext': 'mp4',
555 'upload_date': '20120608',
556 'title': 'Principal Sexually Assaults A Teacher - Episode 117 - 8th June 2012',
557 'description': 'md5:09b78bd971f1e3e289601dfba15ca4f7',
558 'uploader': 'SET India',
94bfcd23 559 'uploader_id': 'setindia',
ec85ded8 560 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/setindia',
7caf9830 561 'license': 'Standard YouTube License',
94bfcd23 562 'age_limit': 18,
fccd3771
PH
563 }
564 },
11b56058 565 {
2d3d2997 566 'url': 'https://www.youtube.com/watch?v=BaW_jenozKc&v=UxxajLWwzqY',
11b56058
PM
567 'note': 'Use the first video ID in the URL',
568 'info_dict': {
569 'id': 'BaW_jenozKc',
570 'ext': 'mp4',
571 'title': 'youtube-dl test video "\'/\\ä↭𝕐',
572 'uploader': 'Philipp Hagemeister',
573 'uploader_id': 'phihag',
ec85ded8 574 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/phihag',
11b56058 575 'upload_date': '20121002',
7caf9830 576 'license': 'Standard YouTube License',
11b56058
PM
577 'description': 'test chars: "\'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de .',
578 'categories': ['Science & Technology'],
579 'tags': ['youtube-dl'],
556dbe7f 580 'duration': 10,
11b56058
PM
581 'like_count': int,
582 'dislike_count': int,
34a7de29
S
583 },
584 'params': {
585 'skip_download': True,
586 },
11b56058 587 },
dd27fd17 588 {
2d3d2997 589 'url': 'https://www.youtube.com/watch?v=a9LDPn-MO4I',
4bc3a23e
PH
590 'note': '256k DASH audio (format 141) via DASH manifest',
591 'info_dict': {
592 'id': 'a9LDPn-MO4I',
593 'ext': 'm4a',
594 'upload_date': '20121002',
595 'uploader_id': '8KVIDEO',
ec85ded8 596 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/8KVIDEO',
4bc3a23e
PH
597 'description': '',
598 'uploader': '8KVIDEO',
7caf9830 599 'license': 'Standard YouTube License',
4bc3a23e 600 'title': 'UHDTV TEST 8K VIDEO.mp4'
4919603f 601 },
4bc3a23e
PH
602 'params': {
603 'youtube_include_dash_manifest': True,
604 'format': '141',
4919603f 605 },
de3c7fe0 606 'skip': 'format 141 not served anymore',
dd27fd17 607 },
3489b7d2
JMF
608 # DASH manifest with encrypted signature
609 {
78caa52a
PH
610 'url': 'https://www.youtube.com/watch?v=IB3lcPjvWLA',
611 'info_dict': {
612 'id': 'IB3lcPjvWLA',
613 'ext': 'm4a',
b766eb27 614 'title': 'Afrojack, Spree Wilson - The Spark ft. Spree Wilson',
eb6793ba 615 'description': 'md5:1900ed86ee514927b9e00fbead6969a5',
556dbe7f 616 'duration': 244,
78caa52a
PH
617 'uploader': 'AfrojackVEVO',
618 'uploader_id': 'AfrojackVEVO',
619 'upload_date': '20131011',
7caf9830 620 'license': 'Standard YouTube License',
3489b7d2 621 },
4bc3a23e 622 'params': {
78caa52a 623 'youtube_include_dash_manifest': True,
de3c7fe0 624 'format': '141/bestaudio[ext=m4a]',
3489b7d2
JMF
625 },
626 },
aaeb86f6
S
627 # JS player signature function name containing $
628 {
629 'url': 'https://www.youtube.com/watch?v=nfWlot6h_JM',
630 'info_dict': {
631 'id': 'nfWlot6h_JM',
632 'ext': 'm4a',
633 'title': 'Taylor Swift - Shake It Off',
0cb58b02 634 'alt_title': 'Shake It Off',
f57b7835 635 'description': 'md5:95f66187cd7c8b2c13eb78e1223b63c3',
556dbe7f 636 'duration': 242,
aaeb86f6
S
637 'uploader': 'TaylorSwiftVEVO',
638 'uploader_id': 'TaylorSwiftVEVO',
639 'upload_date': '20140818',
7caf9830 640 'license': 'Standard YouTube License',
0cb58b02 641 'creator': 'Taylor Swift',
aaeb86f6
S
642 },
643 'params': {
644 'youtube_include_dash_manifest': True,
de3c7fe0 645 'format': '141/bestaudio[ext=m4a]',
aaeb86f6
S
646 },
647 },
aa79ac0c
PH
648 # Controversy video
649 {
650 'url': 'https://www.youtube.com/watch?v=T4XJQO3qol8',
651 'info_dict': {
652 'id': 'T4XJQO3qol8',
653 'ext': 'mp4',
556dbe7f 654 'duration': 219,
aa79ac0c 655 'upload_date': '20100909',
eb6793ba 656 'uploader': 'TJ Kirk',
aa79ac0c 657 'uploader_id': 'TheAmazingAtheist',
ec85ded8 658 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/TheAmazingAtheist',
7caf9830 659 'license': 'Standard YouTube License',
aa79ac0c
PH
660 'title': 'Burning Everyone\'s Koran',
661 'description': 'SUBSCRIBE: http://www.youtube.com/saturninefilms\n\nEven Obama has taken a stand against freedom on this issue: http://www.huffingtonpost.com/2010/09/09/obama-gma-interview-quran_n_710282.html',
662 }
c522adb1
JMF
663 },
664 # Normal age-gate video (No vevo, embed allowed)
665 {
2d3d2997 666 'url': 'https://youtube.com/watch?v=HtVdAasjOgU',
c522adb1
JMF
667 'info_dict': {
668 'id': 'HtVdAasjOgU',
669 'ext': 'mp4',
670 'title': 'The Witcher 3: Wild Hunt - The Sword Of Destiny Trailer',
ec85ded8 671 'description': r're:(?s).{100,}About the Game\n.*?The Witcher 3: Wild Hunt.{100,}',
556dbe7f 672 'duration': 142,
c522adb1
JMF
673 'uploader': 'The Witcher',
674 'uploader_id': 'WitcherGame',
ec85ded8 675 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/WitcherGame',
c522adb1 676 'upload_date': '20140605',
7caf9830 677 'license': 'Standard YouTube License',
34952f09 678 'age_limit': 18,
c522adb1
JMF
679 },
680 },
fccae2b9
S
681 # Age-gate video with encrypted signature
682 {
2d3d2997 683 'url': 'https://www.youtube.com/watch?v=6kLq3WMV1nU',
fccae2b9
S
684 'info_dict': {
685 'id': '6kLq3WMV1nU',
eb6793ba 686 'ext': 'webm',
fccae2b9
S
687 'title': 'Dedication To My Ex (Miss That) (Lyric Video)',
688 'description': 'md5:33765bb339e1b47e7e72b5490139bb41',
eb6793ba 689 'duration': 246,
fccae2b9
S
690 'uploader': 'LloydVEVO',
691 'uploader_id': 'LloydVEVO',
ec85ded8 692 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/LloydVEVO',
fccae2b9 693 'upload_date': '20110629',
7caf9830 694 'license': 'Standard YouTube License',
34952f09 695 'age_limit': 18,
fccae2b9
S
696 },
697 },
774e208f 698 # video_info is None (https://github.com/rg3/youtube-dl/issues/4421)
7d02dcfa 699 # YouTube Red ad is not captured for creator
774e208f
PH
700 {
701 'url': '__2ABJjxzNo',
702 'info_dict': {
703 'id': '__2ABJjxzNo',
704 'ext': 'mp4',
556dbe7f 705 'duration': 266,
774e208f
PH
706 'upload_date': '20100430',
707 'uploader_id': 'deadmau5',
ec85ded8 708 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/deadmau5',
0cb58b02 709 'creator': 'deadmau5',
774e208f
PH
710 'description': 'md5:12c56784b8032162bb936a5f76d55360',
711 'uploader': 'deadmau5',
7caf9830 712 'license': 'Standard YouTube License',
774e208f 713 'title': 'Deadmau5 - Some Chords (HD)',
0cb58b02 714 'alt_title': 'Some Chords',
774e208f
PH
715 },
716 'expected_warnings': [
717 'DASH manifest missing',
718 ]
e52a40ab
PH
719 },
720 # Olympics (https://github.com/rg3/youtube-dl/issues/4431)
721 {
722 'url': 'lqQg6PlCWgI',
723 'info_dict': {
724 'id': 'lqQg6PlCWgI',
725 'ext': 'mp4',
556dbe7f 726 'duration': 6085,
90227264 727 'upload_date': '20150827',
cbe2bd91 728 'uploader_id': 'olympic',
ec85ded8 729 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/olympic',
7caf9830 730 'license': 'Standard YouTube License',
cbe2bd91 731 'description': 'HO09 - Women - GER-AUS - Hockey - 31 July 2012 - London 2012 Olympic Games',
be49068d 732 'uploader': 'Olympic',
cbe2bd91
PH
733 'title': 'Hockey - Women - GER-AUS - London 2012 Olympic Games',
734 },
735 'params': {
736 'skip_download': 'requires avconv',
e52a40ab 737 }
cbe2bd91 738 },
6271f1ca
PH
739 # Non-square pixels
740 {
741 'url': 'https://www.youtube.com/watch?v=_b-2C3KPAM0',
742 'info_dict': {
743 'id': '_b-2C3KPAM0',
744 'ext': 'mp4',
745 'stretched_ratio': 16 / 9.,
556dbe7f 746 'duration': 85,
6271f1ca
PH
747 'upload_date': '20110310',
748 'uploader_id': 'AllenMeow',
ec85ded8 749 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/AllenMeow',
6271f1ca 750 'description': 'made by Wacom from Korea | 字幕&加油添醋 by TY\'s Allen | 感謝heylisa00cavey1001同學熱情提供梗及翻譯',
eb6793ba 751 'uploader': '孫ᄋᄅ',
7caf9830 752 'license': 'Standard YouTube License',
6271f1ca
PH
753 'title': '[A-made] 變態妍字幕版 太妍 我就是這樣的人',
754 },
06b491eb
S
755 },
756 # url_encoded_fmt_stream_map is empty string
757 {
758 'url': 'qEJwOuvDf7I',
759 'info_dict': {
760 'id': 'qEJwOuvDf7I',
f57b7835 761 'ext': 'webm',
06b491eb
S
762 'title': 'Обсуждение судебной практики по выборам 14 сентября 2014 года в Санкт-Петербурге',
763 'description': '',
764 'upload_date': '20150404',
765 'uploader_id': 'spbelect',
766 'uploader': 'Наблюдатели Петербурга',
767 },
768 'params': {
769 'skip_download': 'requires avconv',
e323cf3f
S
770 },
771 'skip': 'This live event has ended.',
06b491eb 772 },
da77d856
S
773 # Extraction from multiple DASH manifests (https://github.com/rg3/youtube-dl/pull/6097)
774 {
775 'url': 'https://www.youtube.com/watch?v=FIl7x6_3R5Y',
776 'info_dict': {
777 'id': 'FIl7x6_3R5Y',
eb6793ba 778 'ext': 'webm',
da77d856
S
779 'title': 'md5:7b81415841e02ecd4313668cde88737a',
780 'description': 'md5:116377fd2963b81ec4ce64b542173306',
556dbe7f 781 'duration': 220,
da77d856
S
782 'upload_date': '20150625',
783 'uploader_id': 'dorappi2000',
ec85ded8 784 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/dorappi2000',
da77d856 785 'uploader': 'dorappi2000',
7caf9830 786 'license': 'Standard YouTube License',
eb6793ba 787 'formats': 'mincount:31',
da77d856 788 },
eb6793ba 789 'skip': 'not actual anymore',
2ee8f5d8 790 },
8a1a26ce
YCH
791 # DASH manifest with segment_list
792 {
793 'url': 'https://www.youtube.com/embed/CsmdDsKjzN8',
794 'md5': '8ce563a1d667b599d21064e982ab9e31',
795 'info_dict': {
796 'id': 'CsmdDsKjzN8',
797 'ext': 'mp4',
17ee98e1 798 'upload_date': '20150501', # According to '<meta itemprop="datePublished"', but in other places it's 20150510
8a1a26ce
YCH
799 'uploader': 'Airtek',
800 'description': 'Retransmisión en directo de la XVIII media maratón de Zaragoza.',
801 'uploader_id': 'UCzTzUmjXxxacNnL8I3m4LnQ',
7caf9830 802 'license': 'Standard YouTube License',
8a1a26ce
YCH
803 'title': 'Retransmisión XVIII Media maratón Zaragoza 2015',
804 },
805 'params': {
806 'youtube_include_dash_manifest': True,
807 'format': '135', # bestvideo
be49068d
S
808 },
809 'skip': 'This live event has ended.',
2ee8f5d8 810 },
cf7e015f
S
811 {
812 # Multifeed videos (multiple cameras), URL is for Main Camera
813 'url': 'https://www.youtube.com/watch?v=jqWvoWXjCVs',
814 'info_dict': {
815 'id': 'jqWvoWXjCVs',
816 'title': 'teamPGP: Rocket League Noob Stream',
817 'description': 'md5:dc7872fb300e143831327f1bae3af010',
818 },
819 'playlist': [{
820 'info_dict': {
821 'id': 'jqWvoWXjCVs',
822 'ext': 'mp4',
823 'title': 'teamPGP: Rocket League Noob Stream (Main Camera)',
824 'description': 'md5:dc7872fb300e143831327f1bae3af010',
556dbe7f 825 'duration': 7335,
cf7e015f
S
826 'upload_date': '20150721',
827 'uploader': 'Beer Games Beer',
828 'uploader_id': 'beergamesbeer',
ec85ded8 829 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/beergamesbeer',
7caf9830 830 'license': 'Standard YouTube License',
cf7e015f
S
831 },
832 }, {
833 'info_dict': {
834 'id': '6h8e8xoXJzg',
835 'ext': 'mp4',
836 'title': 'teamPGP: Rocket League Noob Stream (kreestuh)',
837 'description': 'md5:dc7872fb300e143831327f1bae3af010',
556dbe7f 838 'duration': 7337,
cf7e015f
S
839 'upload_date': '20150721',
840 'uploader': 'Beer Games Beer',
841 'uploader_id': 'beergamesbeer',
ec85ded8 842 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/beergamesbeer',
7caf9830 843 'license': 'Standard YouTube License',
cf7e015f
S
844 },
845 }, {
846 'info_dict': {
847 'id': 'PUOgX5z9xZw',
848 'ext': 'mp4',
849 'title': 'teamPGP: Rocket League Noob Stream (grizzle)',
850 'description': 'md5:dc7872fb300e143831327f1bae3af010',
556dbe7f 851 'duration': 7337,
cf7e015f
S
852 'upload_date': '20150721',
853 'uploader': 'Beer Games Beer',
854 'uploader_id': 'beergamesbeer',
ec85ded8 855 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/beergamesbeer',
7caf9830 856 'license': 'Standard YouTube License',
cf7e015f
S
857 },
858 }, {
859 'info_dict': {
860 'id': 'teuwxikvS5k',
861 'ext': 'mp4',
862 'title': 'teamPGP: Rocket League Noob Stream (zim)',
863 'description': 'md5:dc7872fb300e143831327f1bae3af010',
556dbe7f 864 'duration': 7334,
cf7e015f
S
865 'upload_date': '20150721',
866 'uploader': 'Beer Games Beer',
867 'uploader_id': 'beergamesbeer',
ec85ded8 868 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/beergamesbeer',
7caf9830 869 'license': 'Standard YouTube License',
cf7e015f
S
870 },
871 }],
872 'params': {
873 'skip_download': True,
874 },
cbaed4bb 875 },
f9f49d87
S
876 {
877 # Multifeed video with comma in title (see https://github.com/rg3/youtube-dl/issues/8536)
878 'url': 'https://www.youtube.com/watch?v=gVfLd0zydlo',
879 'info_dict': {
880 'id': 'gVfLd0zydlo',
881 'title': 'DevConf.cz 2016 Day 2 Workshops 1 14:00 - 15:30',
882 },
883 'playlist_count': 2,
be49068d 884 'skip': 'Not multifeed anymore',
f9f49d87 885 },
cbaed4bb 886 {
2d3d2997 887 'url': 'https://vid.plus/FlRa-iH7PGw',
cbaed4bb 888 'only_matching': True,
0e49d9a6 889 },
6d4fc66b 890 {
2d3d2997 891 'url': 'https://zwearz.com/watch/9lWxNJF-ufM/electra-woman-dyna-girl-official-trailer-grace-helbig.html',
6d4fc66b
S
892 'only_matching': True,
893 },
0e49d9a6 894 {
61f92af1 895 # Title with JS-like syntax "};" (see https://github.com/rg3/youtube-dl/issues/7468)
a8776b10
S
896 # Also tests cut-off URL expansion in video description (see
897 # https://github.com/rg3/youtube-dl/issues/1892,
898 # https://github.com/rg3/youtube-dl/issues/8164)
0e49d9a6
LL
899 'url': 'https://www.youtube.com/watch?v=lsguqyKfVQg',
900 'info_dict': {
901 'id': 'lsguqyKfVQg',
902 'ext': 'mp4',
903 'title': '{dark walk}; Loki/AC/Dishonored; collab w/Elflover21',
eb6793ba 904 'alt_title': 'Dark Walk - Position Music',
0e49d9a6 905 'description': 'md5:8085699c11dc3f597ce0410b0dcbb34a',
556dbe7f 906 'duration': 133,
0e49d9a6
LL
907 'upload_date': '20151119',
908 'uploader_id': 'IronSoulElf',
ec85ded8 909 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/IronSoulElf',
0e49d9a6 910 'uploader': 'IronSoulElf',
7caf9830 911 'license': 'Standard YouTube License',
eb6793ba
S
912 'creator': 'Todd Haberman, Daniel Law Heath and Aaron Kaplan',
913 'track': 'Dark Walk - Position Music',
914 'artist': 'Todd Haberman, Daniel Law Heath and Aaron Kaplan',
0e49d9a6
LL
915 },
916 'params': {
917 'skip_download': True,
918 },
919 },
61f92af1
S
920 {
921 # Tags with '};' (see https://github.com/rg3/youtube-dl/issues/7468)
922 'url': 'https://www.youtube.com/watch?v=Ms7iBXnlUO8',
923 'only_matching': True,
924 },
313dfc45
LL
925 {
926 # Video with yt:stretch=17:0
927 'url': 'https://www.youtube.com/watch?v=Q39EVAstoRM',
928 'info_dict': {
929 'id': 'Q39EVAstoRM',
930 'ext': 'mp4',
931 'title': 'Clash Of Clans#14 Dicas De Ataque Para CV 4',
932 'description': 'md5:ee18a25c350637c8faff806845bddee9',
933 'upload_date': '20151107',
934 'uploader_id': 'UCCr7TALkRbo3EtFzETQF1LA',
935 'uploader': 'CH GAMER DROID',
936 },
937 'params': {
938 'skip_download': True,
939 },
be49068d 940 'skip': 'This video does not exist.',
313dfc45 941 },
7caf9830
S
942 {
943 # Video licensed under Creative Commons
944 'url': 'https://www.youtube.com/watch?v=M4gD1WSo5mA',
945 'info_dict': {
946 'id': 'M4gD1WSo5mA',
947 'ext': 'mp4',
948 'title': 'md5:e41008789470fc2533a3252216f1c1d1',
949 'description': 'md5:a677553cf0840649b731a3024aeff4cc',
556dbe7f 950 'duration': 721,
7caf9830
S
951 'upload_date': '20150127',
952 'uploader_id': 'BerkmanCenter',
ec85ded8 953 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/BerkmanCenter',
556dbe7f 954 'uploader': 'The Berkman Klein Center for Internet & Society',
7caf9830
S
955 'license': 'Creative Commons Attribution license (reuse allowed)',
956 },
957 'params': {
958 'skip_download': True,
959 },
960 },
fd050249
S
961 {
962 # Channel-like uploader_url
963 'url': 'https://www.youtube.com/watch?v=eQcmzGIKrzg',
964 'info_dict': {
965 'id': 'eQcmzGIKrzg',
966 'ext': 'mp4',
967 'title': 'Democratic Socialism and Foreign Policy | Bernie Sanders',
968 'description': 'md5:dda0d780d5a6e120758d1711d062a867',
556dbe7f 969 'duration': 4060,
fd050249 970 'upload_date': '20151119',
eb6793ba 971 'uploader': 'Bernie Sanders',
fd050249 972 'uploader_id': 'UCH1dpzjCEiGAt8CXkryhkZg',
ec85ded8 973 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UCH1dpzjCEiGAt8CXkryhkZg',
fd050249
S
974 'license': 'Creative Commons Attribution license (reuse allowed)',
975 },
976 'params': {
977 'skip_download': True,
978 },
979 },
040ac686
S
980 {
981 'url': 'https://www.youtube.com/watch?feature=player_embedded&amp;amp;v=V36LpHqtcDY',
982 'only_matching': True,
7f29cf54
S
983 },
984 {
985 # YouTube Red paid video (https://github.com/rg3/youtube-dl/issues/10059)
986 'url': 'https://www.youtube.com/watch?v=i1Ko8UG-Tdo',
987 'only_matching': True,
6496ccb4
S
988 },
989 {
990 # Rental video preview
991 'url': 'https://www.youtube.com/watch?v=yYr8q0y5Jfg',
992 'info_dict': {
993 'id': 'uGpuVWrhIzE',
994 'ext': 'mp4',
995 'title': 'Piku - Trailer',
996 'description': 'md5:c36bd60c3fd6f1954086c083c72092eb',
997 'upload_date': '20150811',
998 'uploader': 'FlixMatrix',
999 'uploader_id': 'FlixMatrixKaravan',
ec85ded8 1000 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/FlixMatrixKaravan',
6496ccb4
S
1001 'license': 'Standard YouTube License',
1002 },
1003 'params': {
1004 'skip_download': True,
1005 },
eb6793ba 1006 'skip': 'This video is not available.',
022a5d66 1007 },
12afdc2a
S
1008 {
1009 # YouTube Red video with episode data
1010 'url': 'https://www.youtube.com/watch?v=iqKdEhx-dD4',
1011 'info_dict': {
1012 'id': 'iqKdEhx-dD4',
1013 'ext': 'mp4',
1014 'title': 'Isolation - Mind Field (Ep 1)',
eb6793ba 1015 'description': 'md5:25b78d2f64ae81719f5c96319889b736',
556dbe7f 1016 'duration': 2085,
12afdc2a
S
1017 'upload_date': '20170118',
1018 'uploader': 'Vsauce',
1019 'uploader_id': 'Vsauce',
1020 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/Vsauce',
1021 'license': 'Standard YouTube License',
1022 'series': 'Mind Field',
1023 'season_number': 1,
1024 'episode_number': 1,
1025 },
1026 'params': {
1027 'skip_download': True,
1028 },
1029 'expected_warnings': [
1030 'Skipping DASH manifest',
1031 ],
1032 },
c7121fa7
S
1033 {
1034 # The following content has been identified by the YouTube community
1035 # as inappropriate or offensive to some audiences.
1036 'url': 'https://www.youtube.com/watch?v=6SJNVb0GnPI',
1037 'info_dict': {
1038 'id': '6SJNVb0GnPI',
1039 'ext': 'mp4',
1040 'title': 'Race Differences in Intelligence',
1041 'description': 'md5:5d161533167390427a1f8ee89a1fc6f1',
1042 'duration': 965,
1043 'upload_date': '20140124',
1044 'uploader': 'New Century Foundation',
1045 'uploader_id': 'UCEJYpZGqgUob0zVVEaLhvVg',
1046 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UCEJYpZGqgUob0zVVEaLhvVg',
1047 'license': 'Standard YouTube License',
c7121fa7
S
1048 },
1049 'params': {
1050 'skip_download': True,
1051 },
1052 },
022a5d66
S
1053 {
1054 # itag 212
1055 'url': '1t24XAntNCY',
1056 'only_matching': True,
fd5c4aab
S
1057 },
1058 {
1059 # geo restricted to JP
1060 'url': 'sJL6WA-aGkQ',
1061 'only_matching': True,
1062 },
d0ba5587
S
1063 {
1064 'url': 'https://www.youtube.com/watch?v=MuAGGZNfUkU&list=RDMM',
1065 'only_matching': True,
1066 },
2eb88d95
PH
1067 ]
1068
e0df6211
PH
def __init__(self, *args, **kwargs):
    """Initialise the extractor and create an empty signature-function cache."""
    super(YoutubeIE, self).__init__(*args, **kwargs)
    # Filled lazily by _decrypt_signature; keyed by
    # (player_url, signature cache id).
    self._player_cache = dict()
e0df6211 1072
c5e8d7af
PH
def report_video_info_webpage_download(self, video_id):
    """Print a note that the video info webpage is being downloaded."""
    message = '%s: Downloading video info webpage' % video_id
    self.to_screen(message)
c5e8d7af 1076
c5e8d7af
PH
def report_information_extraction(self, video_id):
    """Print a note that video information is being extracted."""
    message = '%s: Extracting video information' % video_id
    self.to_screen(message)
c5e8d7af
PH
1080
def report_unavailable_format(self, video_id, format):
    """Print a note that the given format is not available for the video."""
    message = '%s: Format %s not available' % (video_id, format)
    self.to_screen(message)
c5e8d7af
PH
1084
def report_rtmp_download(self):
    """Print a note that an RTMP download was detected."""
    message = 'RTMP download detected'
    self.to_screen(message)
c5e8d7af 1088
60064c53
PH
def _signature_cache_id(self, example_sig):
    """Return the lengths of the dot-separated parts of a signature, joined by dots."""
    part_lengths = [
        compat_str(len(chunk)) for chunk in example_sig.split('.')]
    return '.'.join(part_lengths)
60064c53
PH
1092
def _extract_signature_function(self, video_id, player_url, example_sig):
    """Return a callable that transforms an encrypted signature string.

    The player id and type ('js' or 'swf') are parsed from player_url.
    The result is looked up in the downloader's 'youtube-sigfuncs' cache
    first; on a miss the player is downloaded, parsed, and the derived
    index permutation is stored in that cache.

    Raises ExtractorError if the player URL cannot be parsed, if the
    derived cache id is not a plain file name, or if the player type is
    neither 'js' nor 'swf'.
    """
    id_m = re.match(
        r'.*?-(?P<id>[a-zA-Z0-9_-]+)(?:/watch_as3|/html5player(?:-new)?|(?:/[a-z]{2}_[A-Z]{2})?/base)?\.(?P<ext>[a-z]+)$',
        player_url)
    if not id_m:
        raise ExtractorError('Cannot identify player %r' % player_url)
    player_type = id_m.group('ext')
    player_id = id_m.group('id')

    # Read from filesystem cache
    func_id = '%s_%s_%s' % (
        player_type, player_id, self._signature_cache_id(example_sig))
    # Explicit check instead of `assert`: asserts are stripped under -O,
    # and func_id is handed to the cache as an entry name.
    if os.path.basename(func_id) != func_id:
        raise ExtractorError('Invalid signature function id %r' % func_id)

    cache_spec = self._downloader.cache.load('youtube-sigfuncs', func_id)
    if cache_spec is not None:
        # The cached spec is a list of source indices: output character k
        # is input character cache_spec[k].
        return lambda s: ''.join(s[i] for i in cache_spec)

    download_note = (
        'Downloading player %s' % player_url
        if self._downloader.params.get('verbose') else
        'Downloading %s player %s' % (player_type, player_id)
    )
    if player_type == 'js':
        code = self._download_webpage(
            player_url, video_id,
            note=download_note,
            errnote='Download of %s failed' % player_url)
        res = self._parse_sig_js(code)
    elif player_type == 'swf':
        urlh = self._request_webpage(
            player_url, video_id,
            note=download_note,
            errnote='Download of %s failed' % player_url)
        code = urlh.read()
        res = self._parse_sig_swf(code)
    else:
        # Raise explicitly: with `assert False` and -O, execution would
        # fall through to an unbound `res` below.
        raise ExtractorError('Invalid player type %r' % player_type)

    # Run the function on a string whose k-th character has code point k;
    # the code points of the output are then the source indices.
    test_string = ''.join(map(compat_chr, range(len(example_sig))))
    cache_res = res(test_string)
    cache_spec = [ord(c) for c in cache_res]

    self._downloader.cache.store('youtube-sigfuncs', func_id, cache_spec)
    return res
1138
def _print_sig_code(self, func, example_sig):
    """Print Python source equivalent to the signature function *func*.

    *func* is applied to a probe string whose k-th character has code
    point k, so the code points of the result are the source indices.
    Those indices are rendered as a sum of index/slice expressions on
    ``s`` and written out via to_screen.
    """
    def _slice_expr(first, last, stride):
        lower = str(first) if first != 0 else ''
        stop = last + stride
        upper = ':' if stop < 0 else ':%d' % stop
        stride_part = (':%d' % stride) if stride != 1 else ''
        return 's[%s%s%s]' % (lower, upper, stride_part)

    def _expressions(indices):
        stride = None
        # Placeholder to keep pyflakes quiet - run_start is always
        # assigned together with stride
        run_start = '(Never used)'
        for current, previous in zip(indices[1:], indices[:-1]):
            delta = current - previous
            if stride is None:
                if delta in [-1, 1]:
                    # Start of a run of consecutive indices
                    stride = delta
                    run_start = previous
                else:
                    yield 's[%d]' % previous
            elif delta != stride:
                # The run ended at the previous index
                yield _slice_expr(run_start, previous, stride)
                stride = None
        if stride is None:
            yield 's[%d]' % current
        else:
            yield _slice_expr(run_start, current, stride)

    probe = ''.join(map(compat_chr, range(len(example_sig))))
    spec = [ord(ch) for ch in func(probe)]
    expr_code = ' + '.join(_expressions(spec))
    part_lengths = ', '.join(
        compat_str(len(p)) for p in example_sig.split('.'))
    signature_id_tuple = '(%s)' % part_lengths
    code = ('if tuple(len(p) for p in s.split(\'.\')) == %s:\n'
            ' return %s\n') % (signature_id_tuple, expr_code)
    self.to_screen('Extracted signature function:\n' + code)
edf3e38e 1177
e0df6211
PH
def _parse_sig_js(self, jscode):
    """Return a callable wrapping the signature function found in JS player code.

    The function name is located with a list of regexes (tried via
    _search_regex), then extracted with JSInterpreter. The returned
    callable takes the signature string and passes it as the single
    argument of the extracted function.
    """
    name_patterns = (
        r'(["\'])signature\1\s*,\s*(?P<sig>[a-zA-Z0-9$]+)\(',
        r'\.sig\|\|(?P<sig>[a-zA-Z0-9$]+)\(',
        r'yt\.akamaized\.net/\)\s*\|\|\s*.*?\s*c\s*&&\s*d\.set\([^,]+\s*,\s*(?P<sig>[a-zA-Z0-9$]+)\(',
        r'\bc\s*&&\s*d\.set\([^,]+\s*,\s*(?P<sig>[a-zA-Z0-9$]+)\(',
    )
    funcname = self._search_regex(
        name_patterns, jscode,
        'Initial JS player signature function name', group='sig')

    interpreter = JSInterpreter(jscode)
    sig_function = interpreter.extract_function(funcname)

    def decipher(s):
        return sig_function([s])

    return decipher
1189
def _parse_sig_swf(self, file_contents):
    """Return a callable wrapping the 'decipher' function of an SWF player.

    The 'SignatureDecipher' class is extracted from the SWF contents with
    SWFInterpreter; the returned callable passes the signature string as
    the single argument of its 'decipher' function.
    """
    interpreter = SWFInterpreter(file_contents)
    decipher_class = interpreter.extract_class('SignatureDecipher')
    sig_function = interpreter.extract_function(decipher_class, 'decipher')

    def decipher(s):
        return sig_function([s])

    return decipher
1196
def _decrypt_signature(self, s, video_id, player_url, age_gate=False):
    """Turn the encrypted s field into a working signature.

    player_url is normalised to an absolute URL first. The signature
    function is taken from self._player_cache, or extracted and cached
    there on first use. Any exception raised along the way is re-raised
    as an ExtractorError carrying the formatted traceback.

    age_gate is accepted but not used by this method.
    """
    if player_url is None:
        raise ExtractorError('Cannot decrypt signature without player_url')

    if player_url.startswith('//'):
        # Protocol-relative URL
        player_url = 'https:' + player_url
    elif not re.match(r'https?://', player_url):
        # Anything else without a scheme is resolved against youtube.com
        player_url = compat_urlparse.urljoin(
            'https://www.youtube.com', player_url)

    try:
        cache_key = (player_url, self._signature_cache_id(s))
        if cache_key not in self._player_cache:
            self._player_cache[cache_key] = self._extract_signature_function(
                video_id, player_url, s)
        decipher = self._player_cache[cache_key]
        if self._downloader.params.get('youtube_print_sig_code'):
            self._print_sig_code(decipher, s)
        return decipher(s)
    except Exception as e:
        raise ExtractorError(
            'Signature extraction failed: ' + traceback.format_exc(),
            cause=e)
e0df6211 1223
def _get_subtitles(self, video_id, webpage):
    """Return a dict mapping language code to a list of subtitle formats.

    The track list is fetched from video.google.com/timedtext. For every
    language (first track wins) one entry per self._SUBTITLE_FORMATS
    extension is built. Returns {} and emits a warning when the list
    cannot be downloaded or contains no tracks. webpage is not used.
    """
    list_url = 'https://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id
    try:
        subs_doc = self._download_xml(list_url, video_id, note=False)
    except ExtractorError as err:
        self._downloader.report_warning(
            'unable to download video subtitles: %s' % error_to_compat_str(err))
        return {}

    subtitles = {}
    for track in subs_doc.findall('track'):
        lang = track.attrib['lang_code']
        if lang in subtitles:
            # Keep only the first track of each language
            continue
        subtitles[lang] = [{
            'url': 'https://www.youtube.com/api/timedtext?' + compat_urllib_parse_urlencode({
                'lang': lang,
                'v': video_id,
                'fmt': ext,
                'name': track.attrib['name'].encode('utf-8'),
            }),
            'ext': ext,
        } for ext in self._SUBTITLE_FORMATS]

    if subtitles:
        return subtitles
    self._downloader.report_warning('video doesn\'t have subtitles')
    return {}
1255
a72778d3
S
def _get_ytplayer_config(self, video_id, webpage):
    """Return the parsed ``ytplayer.config`` JSON from webpage, or None.

    None is returned when no pattern matches; JSON parsing is non-fatal.
    """
    # User data may contain arbitrary character sequences that can break
    # regex-based JSON extraction, e.g. when '};' is contained the second
    # regex won't capture the whole JSON. As a workaround the more
    # concrete regex is tried first, until proper quoted string handling
    # replaces this (see
    # https://github.com/rg3/youtube-dl/issues/7468,
    # https://github.com/rg3/youtube-dl/pull/7599)
    config_patterns = (
        r';ytplayer\.config\s*=\s*({.+?});ytplayer',
        r';ytplayer\.config\s*=\s*({.+?});',
    )
    raw_config = self._search_regex(
        config_patterns, webpage, 'ytplayer.config', default=None)
    if not raw_config:
        return None
    return self._parse_json(
        uppercase_escape(raw_config), video_id, fatal=False)
0e49d9a6 1273
def _get_automatic_captions(self, video_id, webpage):
    """Return a dict mapping language code to automatic caption formats.

    The already downloaded webpage is passed in so that the player
    config (which holds the caption URLs) does not have to be fetched
    again. Three sources are tried in order: the 'ttsurl' argument, the
    'player_response' JSON, and the legacy 'caption_tracks' arguments.
    Returns {} with a warning when nothing usable is found.
    """
    self.to_screen('%s: Looking for automatic captions' % video_id)
    player_config = self._get_ytplayer_config(video_id, webpage)
    err_msg = 'Couldn\'t find automatic captions for %s' % video_id
    if not player_config:
        self._downloader.report_warning(err_msg)
        return {}

    def make_captions(sub_url, sub_langs):
        # Build one URL per (language, format) by overriding the 'tlang'
        # and 'fmt' query parameters of sub_url.
        parsed_sub_url = compat_urllib_parse_urlparse(sub_url)
        caption_qs = compat_parse_qs(parsed_sub_url.query)
        captions = {}
        for sub_lang in sub_langs:
            formats = []
            for ext in self._SUBTITLE_FORMATS:
                caption_qs.update({
                    'tlang': [sub_lang],
                    'fmt': [ext],
                })
                new_query = compat_urllib_parse_urlencode(caption_qs, True)
                formats.append({
                    'url': compat_urlparse.urlunparse(
                        parsed_sub_url._replace(query=new_query)),
                    'ext': ext,
                })
            captions[sub_lang] = formats
        return captions

    try:
        args = player_config['args']

        caption_url = args.get('ttsurl')
        if caption_url:
            timestamp = args['timestamp']
            # We get the available subtitles
            list_url = caption_url + '&' + compat_urllib_parse_urlencode({
                'type': 'list',
                'tlangs': 1,
                'asrs': 1,
            })
            caption_list = self._download_xml(list_url, video_id)
            original_lang_node = caption_list.find('track')
            if original_lang_node is None:
                self._downloader.report_warning('Video doesn\'t have automatic captions')
                return {}
            original_lang = original_lang_node.attrib['lang_code']
            caption_kind = original_lang_node.attrib.get('kind', '')

            translated = {}
            for lang_node in caption_list.findall('target'):
                sub_lang = lang_node.attrib['lang_code']
                translated[sub_lang] = [{
                    'url': caption_url + '&' + compat_urllib_parse_urlencode({
                        'lang': original_lang,
                        'tlang': sub_lang,
                        'fmt': ext,
                        'ts': timestamp,
                        'kind': caption_kind,
                    }),
                    'ext': ext,
                } for ext in self._SUBTITLE_FORMATS]
            return translated

        # New captions format as of 22.06.2017
        player_response = args.get('player_response')
        if player_response and isinstance(player_response, compat_str):
            player_response = self._parse_json(
                player_response, video_id, fatal=False)
            if player_response:
                renderer = player_response['captions']['playerCaptionsTracklistRenderer']
                base_url = renderer['captionTracks'][0]['baseUrl']
                lang_codes = []
                for lang in renderer['translationLanguages']:
                    code = lang.get('languageCode')
                    if code:
                        lang_codes.append(code)
                return make_captions(base_url, lang_codes)

        # Some videos don't provide ttsurl but rather caption_tracks and
        # caption_translation_languages (e.g. 20LmZk1hakA)
        # Not used anymore as of 22.06.2017
        caption_tracks = args['caption_tracks']
        caption_translation_languages = args['caption_translation_languages']
        caption_url = compat_parse_qs(caption_tracks.split(',')[0])['u'][0]
        lang_codes = []
        for lang in caption_translation_languages.split(','):
            lang_qs = compat_parse_qs(compat_urllib_parse_unquote_plus(lang))
            code = lang_qs.get('lc', [None])[0]
            if code:
                lang_codes.append(code)
        return make_captions(caption_url, lang_codes)
    # An extractor error can be raised by the download process if there
    # are no automatic captions but there are subtitles
    except (KeyError, IndexError, ExtractorError):
        self._downloader.report_warning(err_msg)
        return {}
1375
d77ab8e2
S
def _mark_watched(self, video_id, video_info):
    """Request the video's playback stats URL so it is marked as watched.

    Does nothing when video_info has no 'videostats_playback_base_url'.
    The request is non-fatal.
    """
    playback_url = video_info.get('videostats_playback_base_url', [None])[0]
    if not playback_url:
        return
    parsed_playback_url = compat_urlparse.urlparse(playback_url)
    query = compat_urlparse.parse_qs(parsed_playback_url.query)

    # cpn generation algorithm is reverse engineered from base.js.
    # In fact it works even with dummy cpn.
    CPN_ALPHABET = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789-_'
    cpn_chars = [
        CPN_ALPHABET[random.randint(0, 256) & 63] for _ in range(0, 16)]
    cpn = ''.join(cpn_chars)

    query.update({
        'ver': ['2'],
        'cpn': [cpn],
    })
    new_query = compat_urllib_parse_urlencode(query, True)
    playback_url = compat_urlparse.urlunparse(
        parsed_playback_url._replace(query=new_query))

    self._download_webpage(
        playback_url, video_id, 'Marking watched',
        'Unable to mark watched', fatal=False)
1398
66c9fa36
S
@staticmethod
def _extract_urls(webpage):
    """Return a list of YouTube embed URLs / video ids found in webpage.

    Three kinds of embeds are recognised: generic player embeds (iframe,
    embed, object, data-video-url, embedSWF(...) and new SWFObject(...)),
    lazyYT elements, and the Wordpress "YouTube Video Importer" plugin.
    The first kind yields URLs; the other two yield the raw attribute
    values.
    """
    # Embedded YouTube player
    # NOTE: the embedSWF alternative used to read `embedSWF\(?:\s*`, i.e.
    # an optional paren followed by a literal colon, which cannot match
    # an `embedSWF("...")` call.
    entries = [
        unescapeHTML(mobj.group('url'))
        for mobj in re.finditer(r'''(?x)
        (?:
            <iframe[^>]+?src=|
            data-video-url=|
            <embed[^>]+?src=|
            embedSWF\(\s*|
            <object[^>]+data=|
            new\s+SWFObject\(
        )
        (["\'])
            (?P<url>(?:https?:)?//(?:www\.)?youtube(?:-nocookie)?\.com/
            (?:embed|v|p)/[0-9A-Za-z_-]{11}.*?)
        \1''', webpage)]

    # lazyYT YouTube embed
    entries.extend(
        unescapeHTML(video_id)
        for video_id in re.findall(
            r'class="lazyYT" data-youtube-id="([^"]+)"', webpage))

    # Wordpress "YouTube Video Importer" plugin
    matches = re.findall(r'''(?x)<div[^>]+
        class=(?P<q1>[\'"])[^\'"]*\byvii_single_video_player\b[^\'"]*(?P=q1)[^>]+
        data-video_id=(?P<q2>[\'"])([^\'"]+)(?P=q2)''', webpage)
    # Each match is a tuple of all groups; the last one is the video id
    entries.extend(m[-1] for m in matches)

    return entries
1430
@staticmethod
def _extract_url(webpage):
    """Return the first entry of YoutubeIE._extract_urls(webpage), or None if empty."""
    found = YoutubeIE._extract_urls(webpage)
    if not found:
        return None
    return found[0]
1435
97665381
PH
@classmethod
def extract_id(cls, url):
    """Return group 2 of cls._VALID_URL matched against url.

    Raises ExtractorError when the URL does not match.
    """
    match = re.match(cls._VALID_URL, url, re.VERBOSE)
    if match is not None:
        return match.group(2)
    raise ExtractorError('Invalid URL: %s' % url)
1443
1fb07d10
JG
def _extract_annotations(self, video_id):
    """Download and return the annotations page for the given video id."""
    annotations_url = (
        'https://www.youtube.com/annotations_invideo?features=1&legacy=1&video_id=%s'
        % video_id)
    return self._download_webpage(
        annotations_url, video_id,
        note='Searching for annotations.',
        errnote='Unable to download video annotations.')
1fb07d10 1447
9cafc3fd
S
1448 @staticmethod
1449 def _extract_chapters(description, duration):
1450 if not description:
1451 return None
1452 chapter_lines = re.findall(
1453 r'(?:^|<br\s*/>)([^<]*<a[^>]+onclick=["\']yt\.www\.watch\.player\.seekTo[^>]+>(\d{1,2}:\d{1,2}(?::\d{1,2})?)</a>[^>]*)(?=$|<br\s*/>)',
1454 description)
1455 if not chapter_lines:
1456 return None
1457 chapters = []
1458 for next_num, (chapter_line, time_point) in enumerate(
1459 chapter_lines, start=1):
1460 start_time = parse_duration(time_point)
1461 if start_time is None:
1462 continue
39d4c1be
S
1463 if start_time > duration:
1464 break
9cafc3fd
S
1465 end_time = (duration if next_num == len(chapter_lines)
1466 else parse_duration(chapter_lines[next_num][1]))
1467 if end_time is None:
1468 continue
39d4c1be
S
1469 if end_time > duration:
1470 end_time = duration
1471 if start_time > end_time:
1472 break
9cafc3fd
S
1473 chapter_title = re.sub(
1474 r'<a[^>]+>[^<]+</a>', '', chapter_line).strip(' \t-')
1475 chapter_title = re.sub(r'\s+', ' ', chapter_title)
1476 chapters.append({
1477 'start_time': start_time,
1478 'end_time': end_time,
1479 'title': chapter_title,
1480 })
1481 return chapters
1482
def _real_extract(self, url):
    """Extract metadata and downloadable formats for a single video URL.

    The watch page is downloaded first; depending on what it contains the
    video info is taken from the embedded player config, from the
    ``get_video_info`` endpoint, or (for age-gated pages) from the embed
    page plus ``get_video_info``.  Formats are then collected from RTMP
    ``conn`` data, ``url_encoded_fmt_stream_map``/``adaptive_fmts``, an HLS
    manifest (``hlsvp``) and, unless disabled through the
    ``youtube_include_dash_manifest`` param, any DASH manifests found.

    Returns the info dict built at the end of the method.  Two early
    returns exist: a ``url_result`` pointing at ``ypc_vid`` when only a
    rental preview is available, and a ``playlist_result`` for multifeed
    videos when ``noplaylist`` is not set.

    Raises ExtractorError when no ``token`` is present in the video info,
    for "rental" and rtmpe videos, and when no usable stream information
    is found.
    """
    url, smuggled_data = unsmuggle_url(url, {})

    proto = (
        'http' if self._downloader.params.get('prefer_insecure', False)
        else 'https')

    # Start/end offsets may be given in the fragment or in the query string;
    # the fragment is inspected first and the first value found wins.
    start_time = None
    end_time = None
    parsed_url = compat_urllib_parse_urlparse(url)
    for component in [parsed_url.fragment, parsed_url.query]:
        query = compat_parse_qs(component)
        if start_time is None and 't' in query:
            start_time = parse_duration(query['t'][0])
        if start_time is None and 'start' in query:
            start_time = parse_duration(query['start'][0])
        if end_time is None and 'end' in query:
            end_time = parse_duration(query['end'][0])

    # Extract original video URL from URL with redirection, like age verification, using next_url parameter
    mobj = re.search(self._NEXT_URL_RE, url)
    if mobj:
        url = proto + '://www.youtube.com/' + compat_urllib_parse_unquote(mobj.group(1)).lstrip('/')
    video_id = self.extract_id(url)

    # Get video webpage
    url = proto + '://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1&bpctr=9999999999' % video_id
    video_webpage = self._download_webpage(url, video_id)

    # Attempt to extract SWF player URL
    mobj = re.search(r'swfConfig.*?"(https?:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
    if mobj is not None:
        # Drop the backslashes escaping characters in the matched URL
        player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
    else:
        player_url = None

    # DASH manifest URLs collected from every video info source, without duplicates
    dash_mpds = []

    def add_dash_mpd(video_info):
        dash_mpd = video_info.get('dashmpd')
        if dash_mpd and dash_mpd[0] not in dash_mpds:
            dash_mpds.append(dash_mpd[0])

    is_live = None
    view_count = None

    def extract_view_count(v_info):
        return int_or_none(try_get(v_info, lambda x: x['view_count'][0]))

    # Get video info
    embed_webpage = None
    if re.search(r'player-age-gate-content">', video_webpage) is not None:
        age_gate = True
        # We simulate the access to the video from www.youtube.com/v/{video_id}
        # this can be viewed without login into Youtube
        url = proto + '://www.youtube.com/embed/%s' % video_id
        embed_webpage = self._download_webpage(url, video_id, 'Downloading embed webpage')
        data = compat_urllib_parse_urlencode({
            'video_id': video_id,
            'eurl': 'https://youtube.googleapis.com/v/' + video_id,
            'sts': self._search_regex(
                r'"sts"\s*:\s*(\d+)', embed_webpage, 'sts', default=''),
        })
        video_info_url = proto + '://www.youtube.com/get_video_info?' + data
        video_info_webpage = self._download_webpage(
            video_info_url, video_id,
            note='Refetching age-gated info webpage',
            errnote='unable to download video info webpage')
        video_info = compat_parse_qs(video_info_webpage)
        add_dash_mpd(video_info)
    else:
        age_gate = False
        video_info = None
        sts = None
        # Try looking directly into the video webpage
        ytplayer_config = self._get_ytplayer_config(video_id, video_webpage)
        if ytplayer_config:
            args = ytplayer_config['args']
            if args.get('url_encoded_fmt_stream_map') or args.get('hlsvp'):
                # Convert to the same format returned by compat_parse_qs
                video_info = dict((k, [v]) for k, v in args.items())
                add_dash_mpd(video_info)
            # Rental video is not rented but preview is available (e.g.
            # https://www.youtube.com/watch?v=yYr8q0y5Jfg,
            # https://github.com/rg3/youtube-dl/issues/10532)
            if not video_info and args.get('ypc_vid'):
                return self.url_result(
                    args['ypc_vid'], YoutubeIE.ie_key(), video_id=args['ypc_vid'])
            if args.get('livestream') == '1' or args.get('live_playback') == 1:
                is_live = True
            sts = ytplayer_config.get('sts')
        if not video_info or self._downloader.params.get('youtube_include_dash_manifest', True):
            # We also try looking in get_video_info since it may contain different dashmpd
            # URL that points to a DASH manifest with possibly different itag set (some itags
            # are missing from DASH manifest pointed by webpage's dashmpd, some - from DASH
            # manifest pointed by get_video_info's dashmpd).
            # The general idea is to take a union of itags of both DASH manifests (for example
            # video with such 'manifest behavior' see https://github.com/rg3/youtube-dl/issues/6093)
            self.report_video_info_webpage_download(video_id)
            for el in ('info', 'embedded', 'detailpage', 'vevo', ''):
                query = {
                    'video_id': video_id,
                    'ps': 'default',
                    'eurl': '',
                    'gl': 'US',
                    'hl': 'en',
                }
                if el:
                    query['el'] = el
                if sts:
                    query['sts'] = sts
                video_info_webpage = self._download_webpage(
                    '%s://www.youtube.com/get_video_info' % proto,
                    video_id, note=False,
                    errnote='unable to download video info webpage',
                    fatal=False, query=query)
                if not video_info_webpage:
                    continue
                get_video_info = compat_parse_qs(video_info_webpage)
                add_dash_mpd(get_video_info)
                if view_count is None:
                    view_count = extract_view_count(get_video_info)
                if not video_info:
                    video_info = get_video_info
                if 'token' in get_video_info:
                    # Different get_video_info requests may report different results, e.g.
                    # some may report video unavailability, but some may serve it without
                    # any complaint (see https://github.com/rg3/youtube-dl/issues/7362,
                    # the original webpage as well as el=info and el=embedded get_video_info
                    # requests report video unavailability due to geo restriction while
                    # el=detailpage succeeds and returns valid data). This is probably
                    # due to YouTube measures against IP ranges of hosting providers.
                    # Working around by preferring the first succeeded video_info containing
                    # the token if no such video_info yet was found.
                    if 'token' not in video_info:
                        video_info = get_video_info
                    break

    def extract_unavailable_message():
        return self._html_search_regex(
            r'(?s)<h1[^>]+id="unavailable-message"[^>]*>(.+?)</h1>',
            video_webpage, 'unavailable message', default=None)

    if 'token' not in video_info:
        if 'reason' in video_info:
            # NOTE(review): video_info['reason'] is indexed with [0] below, i.e. it is a
            # list, so this `in` is a list-membership test (exact match of one element)
            # rather than a substring search - confirm that this is intended.
            if 'The uploader has not made this video available in your country.' in video_info['reason']:
                regions_allowed = self._html_search_meta(
                    'regionsAllowed', video_webpage, default=None)
                countries = regions_allowed.split(',') if regions_allowed else None
                self.raise_geo_restricted(
                    msg=video_info['reason'][0], countries=countries)
            reason = video_info['reason'][0]
            if 'Invalid parameters' in reason:
                # Prefer the message shown on the watch page, when there is one
                unavailable_message = extract_unavailable_message()
                if unavailable_message:
                    reason = unavailable_message
            raise ExtractorError(
                'YouTube said: %s' % reason,
                expected=True, video_id=video_id)
        else:
            raise ExtractorError(
                '"token" parameter not in video info for unknown reason',
                video_id=video_id)

    # title
    if 'title' in video_info:
        video_title = video_info['title'][0]
    else:
        self._downloader.report_warning('Unable to extract video title')
        video_title = '_'

    # description
    description_original = video_description = get_element_by_id("eow-description", video_webpage)
    if video_description:

        def replace_url(m):
            # Resolve the link against the page URL; for youtube.com/redirect links
            # return the target carried in the `q` query parameter instead.
            redir_url = compat_urlparse.urljoin(url, m.group(1))
            parsed_redir_url = compat_urllib_parse_urlparse(redir_url)
            if re.search(r'^(?:www\.)?(?:youtube(?:-nocookie)?\.com|youtu\.be)$', parsed_redir_url.netloc) and parsed_redir_url.path == '/redirect':
                qs = compat_parse_qs(parsed_redir_url.query)
                q = qs.get('q')
                if q and q[0]:
                    return q[0]
            return redir_url

        # Replace anchors whose visible text ends with '...' by the URL they point to.
        # description_original keeps the HTML; video_description is cleaned below.
        description_original = video_description = re.sub(r'''(?x)
            <a\s+
                (?:[a-zA-Z-]+="[^"]*"\s+)*?
                (?:title|href)="([^"]+)"\s+
                (?:[a-zA-Z-]+="[^"]*"\s+)*?
                class="[^"]*"[^>]*>
            [^<]+\.{3}\s*
            </a>
        ''', replace_url, video_description)
        video_description = clean_html(video_description)
    else:
        fd_mobj = re.search(r'<meta name="description" content="([^"]+)"', video_webpage)
        if fd_mobj:
            video_description = unescapeHTML(fd_mobj.group(1))
        else:
            video_description = ''

    if 'multifeed_metadata_list' in video_info and not smuggled_data.get('force_singlefeed', False):
        if not self._downloader.params.get('noplaylist'):
            entries = []
            feed_ids = []
            multifeed_metadata_list = video_info['multifeed_metadata_list'][0]
            for feed in multifeed_metadata_list.split(','):
                # Unquote should take place before split on comma (,) since textual
                # fields may contain comma as well (see
                # https://github.com/rg3/youtube-dl/issues/8536)
                feed_data = compat_parse_qs(compat_urllib_parse_unquote_plus(feed))
                entries.append({
                    '_type': 'url_transparent',
                    'ie_key': 'Youtube',
                    'url': smuggle_url(
                        '%s://www.youtube.com/watch?v=%s' % (proto, feed_data['id'][0]),
                        {'force_singlefeed': True}),
                    'title': '%s (%s)' % (video_title, feed_data['title'][0]),
                })
                feed_ids.append(feed_data['id'][0])
            self.to_screen(
                'Downloading multifeed video (%s) - add --no-playlist to just download video %s'
                % (', '.join(feed_ids), video_id))
            return self.playlist_result(entries, video_id, video_title, video_description)
        self.to_screen('Downloading just video %s because of --no-playlist' % video_id)

    if view_count is None:
        view_count = extract_view_count(video_info)

    # Check for "rental" videos
    if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
        raise ExtractorError('"rental" videos not supported. See https://github.com/rg3/youtube-dl/issues/359 for more information.', expected=True)

    def _extract_filesize(media_url):
        return int_or_none(self._search_regex(
            r'\bclen[=/](\d+)', media_url, 'filesize', default=None))

    if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
        self.report_rtmp_download()
        formats = [{
            'format_id': '_rtmp',
            'protocol': 'rtmp',
            'url': video_info['conn'][0],
            'player_url': player_url,
        }]
    elif not is_live and (len(video_info.get('url_encoded_fmt_stream_map', [''])[0]) >= 1 or len(video_info.get('adaptive_fmts', [''])[0]) >= 1):
        encoded_url_map = video_info.get('url_encoded_fmt_stream_map', [''])[0] + ',' + video_info.get('adaptive_fmts', [''])[0]
        if 'rtmpe%3Dyes' in encoded_url_map:
            raise ExtractorError('rtmpe downloads are not supported, see https://github.com/rg3/youtube-dl/issues/343 for more information.', expected=True)
        # fmt_list entries are split on '/'; the first part is used as the format id
        # and the second as a WIDTHxHEIGHT resolution.
        formats_spec = {}
        fmt_list = video_info.get('fmt_list', [''])[0]
        if fmt_list:
            for fmt in fmt_list.split(','):
                spec = fmt.split('/')
                if len(spec) > 1:
                    width_height = spec[1].split('x')
                    if len(width_height) == 2:
                        formats_spec[spec[0]] = {
                            'resolution': spec[1],
                            'width': int_or_none(width_height[0]),
                            'height': int_or_none(width_height[1]),
                        }
        q = qualities(['small', 'medium', 'hd720'])
        formats = []
        for url_data_str in encoded_url_map.split(','):
            url_data = compat_parse_qs(url_data_str)
            if 'itag' not in url_data or 'url' not in url_data:
                continue
            format_id = url_data['itag'][0]
            # NOTE: from here on `url` is the media URL of the current format,
            # no longer the page URL.
            url = url_data['url'][0]

            if 's' in url_data or self._downloader.params.get('youtube_include_dash_manifest', True):
                ASSETS_RE = r'"assets":.+?"js":\s*("[^"]+")'
                jsplayer_url_json = self._search_regex(
                    ASSETS_RE,
                    embed_webpage if age_gate else video_webpage,
                    'JS player URL (1)', default=None)
                if not jsplayer_url_json and not age_gate:
                    # We need the embed website after all
                    if embed_webpage is None:
                        embed_url = proto + '://www.youtube.com/embed/%s' % video_id
                        embed_webpage = self._download_webpage(
                            embed_url, video_id, 'Downloading embed webpage')
                    jsplayer_url_json = self._search_regex(
                        ASSETS_RE, embed_webpage, 'JS player URL')

                # NOTE(review): when age_gate is set and the first search found nothing,
                # jsplayer_url_json is still None here and json.loads(None) would raise
                # TypeError before the fallback below is reached - confirm.
                player_url = json.loads(jsplayer_url_json)
                if player_url is None:
                    player_url_json = self._search_regex(
                        r'ytplayer\.config.*?"url"\s*:\s*("[^"]+")',
                        video_webpage, 'age gate player URL')
                    player_url = json.loads(player_url_json)

            if 'sig' in url_data:
                url += '&signature=' + url_data['sig'][0]
            elif 's' in url_data:
                encrypted_sig = url_data['s'][0]

                if self._downloader.params.get('verbose'):
                    if player_url is None:
                        player_version = 'unknown'
                        player_desc = 'unknown'
                    else:
                        if player_url.endswith('swf'):
                            player_version = self._search_regex(
                                r'-(.+?)(?:/watch_as3)?\.swf$', player_url,
                                'flash player', fatal=False)
                            player_desc = 'flash player %s' % player_version
                        else:
                            player_version = self._search_regex(
                                [r'html5player-([^/]+?)(?:/html5player(?:-new)?)?\.js',
                                 r'(?:www|player)-([^/]+)(?:/[a-z]{2}_[A-Z]{2})?/base\.js'],
                                player_url,
                                'html5 player', fatal=False)
                            player_desc = 'html5 player %s' % player_version

                    parts_sizes = self._signature_cache_id(encrypted_sig)
                    self.to_screen('{%s} signature length %s, %s' %
                                   (format_id, parts_sizes, player_desc))

                signature = self._decrypt_signature(
                    encrypted_sig, video_id, player_url, age_gate)
                url += '&signature=' + signature
            if 'ratebypass' not in url:
                url += '&ratebypass=yes'

            # Later updates override earlier ones: static per-itag table first,
            # then fmt_list resolution, then the non-empty per-URL fields below.
            dct = {
                'format_id': format_id,
                'url': url,
                'player_url': player_url,
            }
            if format_id in self._formats:
                dct.update(self._formats[format_id])
            if format_id in formats_spec:
                dct.update(formats_spec[format_id])

            # Some itags are not included in DASH manifest thus corresponding formats will
            # lack metadata (see https://github.com/rg3/youtube-dl/pull/5993).
            # Trying to extract metadata from url_encoded_fmt_stream_map entry.
            mobj = re.search(r'^(?P<width>\d+)[xX](?P<height>\d+)$', url_data.get('size', [''])[0])
            width, height = (int(mobj.group('width')), int(mobj.group('height'))) if mobj else (None, None)

            filesize = int_or_none(url_data.get(
                'clen', [None])[0]) or _extract_filesize(url)

            quality = url_data.get('quality_label', [None])[0] or url_data.get('quality', [None])[0]

            more_fields = {
                'filesize': filesize,
                'tbr': float_or_none(url_data.get('bitrate', [None])[0], 1000),
                'width': width,
                'height': height,
                'fps': int_or_none(url_data.get('fps', [None])[0]),
                'format_note': quality,
                'quality': q(quality),
            }
            # Only truthy values are copied, so missing fields do not clobber
            # what the tables above already provided.
            for key, value in more_fields.items():
                if value:
                    dct[key] = value
            type_ = url_data.get('type', [None])[0]
            if type_:
                type_split = type_.split(';')
                kind_ext = type_split[0].split('/')
                if len(kind_ext) == 2:
                    kind, _ = kind_ext
                    dct['ext'] = mimetype2ext(type_split[0])
                    if kind in ('audio', 'video'):
                        codecs = None
                        for mobj in re.finditer(
                                r'(?P<key>[a-zA-Z_-]+)=(?P<quote>["\']?)(?P<val>.+?)(?P=quote)(?:;|$)', type_):
                            if mobj.group('key') == 'codecs':
                                codecs = mobj.group('val')
                                break
                        if codecs:
                            dct.update(parse_codecs(codecs))
            if dct.get('acodec') == 'none' or dct.get('vcodec') == 'none':
                dct['downloader_options'] = {
                    # Youtube throttles chunks >~10M
                    'http_chunk_size': 10485760,
                }
            formats.append(dct)
    elif video_info.get('hlsvp'):
        manifest_url = video_info['hlsvp'][0]
        formats = []
        m3u8_formats = self._extract_m3u8_formats(
            manifest_url, video_id, 'mp4', fatal=False)
        for a_format in m3u8_formats:
            itag = self._search_regex(
                r'/itag/(\d+)/', a_format['url'], 'itag', default=None)
            if itag:
                a_format['format_id'] = itag
                if itag in self._formats:
                    # Start from the static per-itag data and let the values
                    # found in the manifest take precedence.
                    dct = self._formats[itag].copy()
                    dct.update(a_format)
                    a_format = dct
            a_format['player_url'] = player_url
            # Accept-Encoding header causes failures in live streams on Youtube and Youtube Gaming
            a_format.setdefault('http_headers', {})['Youtubedl-no-compression'] = 'True'
            formats.append(a_format)
    else:
        error_message = clean_html(video_info.get('reason', [None])[0])
        if not error_message:
            error_message = extract_unavailable_message()
        if error_message:
            raise ExtractorError(error_message, expected=True)
        raise ExtractorError('no conn, hlsvp or url_encoded_fmt_stream_map information found in video info')

    # uploader
    video_uploader = try_get(video_info, lambda x: x['author'][0], compat_str)
    if video_uploader:
        video_uploader = compat_urllib_parse_unquote_plus(video_uploader)
    else:
        self._downloader.report_warning('unable to extract uploader name')

    # uploader_id
    video_uploader_id = None
    video_uploader_url = None
    mobj = re.search(
        r'<link itemprop="url" href="(?P<uploader_url>https?://www\.youtube\.com/(?:user|channel)/(?P<uploader_id>[^"]+))">',
        video_webpage)
    if mobj is not None:
        video_uploader_id = mobj.group('uploader_id')
        video_uploader_url = mobj.group('uploader_url')
    else:
        self._downloader.report_warning('unable to extract uploader nickname')

    # thumbnail image
    # We try first to get a high quality image:
    m_thumb = re.search(r'<span itemprop="thumbnail".*?href="(.*?)">',
                        video_webpage, re.DOTALL)
    if m_thumb is not None:
        video_thumbnail = m_thumb.group(1)
    elif 'thumbnail_url' not in video_info:
        self._downloader.report_warning('unable to extract video thumbnail')
        video_thumbnail = None
    else:   # don't panic if we can't find it
        video_thumbnail = compat_urllib_parse_unquote_plus(video_info['thumbnail_url'][0])

    # upload date
    upload_date = self._html_search_meta(
        'datePublished', video_webpage, 'upload date', default=None)
    if not upload_date:
        upload_date = self._search_regex(
            [r'(?s)id="eow-date.*?>(.*?)</span>',
             r'(?:id="watch-uploader-info".*?>.*?|["\']simpleText["\']\s*:\s*["\'])(?:Published|Uploaded|Streamed live|Started) on (.+?)[<"\']'],
            video_webpage, 'upload date', default=None)
    upload_date = unified_strdate(upload_date)

    video_license = self._html_search_regex(
        r'<h4[^>]+class="title"[^>]*>\s*License\s*</h4>\s*<ul[^>]*>\s*<li>(.+?)</li',
        video_webpage, 'license', default=None)

    # "Music" entry of the watch page metadata: "<title> by <creator>"
    m_music = re.search(
        r'''(?x)
            <h4[^>]+class="title"[^>]*>\s*Music\s*</h4>\s*
            <ul[^>]*>\s*
            <li>(?P<title>.+?)
            by (?P<creator>.+?)
            (?:
                \(.+?\)|
                <a[^>]*
                    (?:
                        \bhref=["\']/red[^>]*>|  # drop possible
                        >\s*Listen ad-free with YouTube Red # YouTube Red ad
                    )
                .*?
            )?</li
        ''',
        video_webpage)
    if m_music:
        video_alt_title = remove_quotes(unescapeHTML(m_music.group('title')))
        video_creator = clean_html(m_music.group('creator'))
    else:
        video_alt_title = video_creator = None

    def extract_meta(field):
        return self._html_search_regex(
            r'<h4[^>]+class="title"[^>]*>\s*%s\s*</h4>\s*<ul[^>]*>\s*<li>(.+?)</li>\s*' % field,
            video_webpage, field, default=None)

    track = extract_meta('Song')
    artist = extract_meta('Artist')

    # Series / season / episode numbers from the watch page headline
    m_episode = re.search(
        r'<div[^>]+id="watch7-headline"[^>]*>\s*<span[^>]*>.*?>(?P<series>[^<]+)</a></b>\s*S(?P<season>\d+)\s*•\s*E(?P<episode>\d+)</span>',
        video_webpage)
    if m_episode:
        series = m_episode.group('series')
        season_number = int(m_episode.group('season'))
        episode_number = int(m_episode.group('episode'))
    else:
        series = season_number = episode_number = None

    m_cat_container = self._search_regex(
        r'(?s)<h4[^>]*>\s*Category\s*</h4>\s*<ul[^>]*>(.*?)</ul>',
        video_webpage, 'categories', default=None)
    if m_cat_container:
        category = self._html_search_regex(
            r'(?s)<a[^<]+>(.*?)</a>', m_cat_container, 'category',
            default=None)
        video_categories = None if category is None else [category]
    else:
        video_categories = None

    video_tags = [
        unescapeHTML(m.group('content'))
        for m in re.finditer(self._meta_regex('og:video:tag'), video_webpage)]

    def _extract_count(count_name):
        return str_to_int(self._search_regex(
            r'-%s-button[^>]+><span[^>]+class="yt-uix-button-content"[^>]*>([\d,]+)</span>'
            % re.escape(count_name),
            video_webpage, count_name, default=None))

    like_count = _extract_count('like')
    dislike_count = _extract_count('dislike')

    # subtitles
    video_subtitles = self.extract_subtitles(video_id, video_webpage)
    automatic_captions = self.extract_automatic_captions(video_id, video_webpage)

    video_duration = try_get(
        video_info, lambda x: int_or_none(x['length_seconds'][0]))
    if not video_duration:
        video_duration = parse_duration(self._html_search_meta(
            'duration', video_webpage, 'video duration'))

    # annotations
    video_annotations = None
    if self._downloader.params.get('writeannotations', False):
        video_annotations = self._extract_annotations(video_id)

    # Chapters are derived from the description that still contains HTML markup
    chapters = self._extract_chapters(description_original, video_duration)

    # Look for the DASH manifest
    if self._downloader.params.get('youtube_include_dash_manifest', True):
        dash_mpd_fatal = True
        for mpd_url in dash_mpds:
            dash_formats = {}
            try:
                def decrypt_sig(mobj):
                    s = mobj.group(1)
                    dec_s = self._decrypt_signature(s, video_id, player_url, age_gate)
                    return '/signature/%s' % dec_s

                # Replace an encrypted '/s/<sig>' path component by '/signature/<decrypted>'
                mpd_url = re.sub(r'/s/([a-fA-F0-9\.]+)', decrypt_sig, mpd_url)

                for df in self._extract_mpd_formats(
                        mpd_url, video_id, fatal=dash_mpd_fatal,
                        formats_dict=self._formats):
                    if not df.get('filesize'):
                        df['filesize'] = _extract_filesize(df['url'])
                    # Do not overwrite DASH format found in some previous DASH manifest
                    if df['format_id'] not in dash_formats:
                        dash_formats[df['format_id']] = df
                    # Additional DASH manifests may end up in HTTP Error 403 therefore
                    # allow them to fail without bug report message if we already have
                    # some DASH manifest succeeded. This is temporary workaround to reduce
                    # burst of bug reports until we figure out the reason and whether it
                    # can be fixed at all.
                    dash_mpd_fatal = False
            except (ExtractorError, KeyError) as e:
                self.report_warning(
                    'Skipping DASH manifest: %r' % e, video_id)
            if dash_formats:
                # Remove the formats we found through non-DASH, they
                # contain less info and it can be wrong, because we use
                # fixed values (for example the resolution). See
                # https://github.com/rg3/youtube-dl/issues/5774 for an
                # example.
                formats = [f for f in formats if f['format_id'] not in dash_formats.keys()]
                formats.extend(dash_formats.values())

    # Check for malformed aspect ratio
    stretched_m = re.search(
        r'<meta\s+property="og:video:tag".*?content="yt:stretch=(?P<w>[0-9]+):(?P<h>[0-9]+)">',
        video_webpage)
    if stretched_m:
        w = float(stretched_m.group('w'))
        h = float(stretched_m.group('h'))
        # yt:stretch may hold invalid ratio data (e.g. for Q39EVAstoRM ratio is 17:0).
        # We will only process correct ratios.
        if w > 0 and h > 0:
            ratio = w / h
            for f in formats:
                if f.get('vcodec') != 'none':
                    f['stretched_ratio'] = ratio

    self._sort_formats(formats)

    self.mark_watched(video_id, video_info)

    return {
        'id': video_id,
        'uploader': video_uploader,
        'uploader_id': video_uploader_id,
        'uploader_url': video_uploader_url,
        'upload_date': upload_date,
        'license': video_license,
        'creator': video_creator or artist,
        'title': video_title,
        'alt_title': video_alt_title or track,
        'thumbnail': video_thumbnail,
        'description': video_description,
        'categories': video_categories,
        'tags': video_tags,
        'subtitles': video_subtitles,
        'automatic_captions': automatic_captions,
        'duration': video_duration,
        'age_limit': 18 if age_gate else 0,
        'annotations': video_annotations,
        'chapters': chapters,
        'webpage_url': proto + '://www.youtube.com/watch?v=%s' % video_id,
        'view_count': view_count,
        'like_count': like_count,
        'dislike_count': dislike_count,
        'average_rating': float_or_none(video_info.get('avg_rating', [None])[0]),
        'formats': formats,
        'is_live': is_live,
        'start_time': start_time,
        'end_time': end_time,
        'series': series,
        'season_number': season_number,
        'episode_number': episode_number,
        'track': track,
        'artist': artist,
    }
c5e8d7af 2111
5f6a1245 2112
8e7aad20 2113class YoutubePlaylistIE(YoutubePlaylistBaseInfoExtractor):
78caa52a 2114 IE_DESC = 'YouTube.com playlists'
d67cc9fa 2115 _VALID_URL = r"""(?x)(?:
c5e8d7af
PH
2116 (?:https?://)?
2117 (?:\w+\.)?
c5e8d7af 2118 (?:
feaa5ad7
S
2119 youtube\.com/
2120 (?:
87dadd45 2121 (?:course|view_play_list|my_playlists|artist|playlist|watch|embed/(?:videoseries|[0-9A-Za-z_-]{11}))
feaa5ad7
S
2122 \? (?:.*?[&;])*? (?:p|a|list)=
2123 | p/
2124 )|
2125 youtu\.be/[0-9A-Za-z_-]{11}\?.*?\blist=
c5e8d7af 2126 )
d67cc9fa 2127 (
409b9324 2128 (?:PL|LL|EC|UU|FL|RD|UL|TL|OLAK5uy_)?[0-9A-Za-z-_]{10,}
5f6a1245 2129 # Top tracks, they can also include dots
d67cc9fa
JMF
2130 |(?:MC)[\w\.]*
2131 )
c5e8d7af
PH
2132 .*
2133 |
d0ba5587
S
2134 (%(playlist_id)s)
2135 )""" % {'playlist_id': YoutubeBaseInfoExtractor._PLAYLIST_ID_RE}
8d81f3e3 2136 _TEMPLATE_URL = 'https://www.youtube.com/playlist?list=%s'
648e6a1f 2137 _VIDEO_RE = r'href="\s*/watch\?v=(?P<id>[0-9A-Za-z_-]{11})&amp;[^"]*?index=(?P<index>\d+)(?:[^>]+>(?P<title>[^<]+))?'
78caa52a 2138 IE_NAME = 'youtube:playlist'
81127aa5
PH
2139 _TESTS = [{
2140 'url': 'https://www.youtube.com/playlist?list=PLwiyx1dc3P2JR9N8gQaQN_BCvlSlap7re',
2141 'info_dict': {
2142 'title': 'ytdl test PL',
a1cf99d0 2143 'id': 'PLwiyx1dc3P2JR9N8gQaQN_BCvlSlap7re',
81127aa5
PH
2144 },
2145 'playlist_count': 3,
9291475f
PH
2146 }, {
2147 'url': 'https://www.youtube.com/playlist?list=PLtPgu7CB4gbZDA7i_euNxn75ISqxwZPYx',
2148 'info_dict': {
acf757f4 2149 'id': 'PLtPgu7CB4gbZDA7i_euNxn75ISqxwZPYx',
9291475f
PH
2150 'title': 'YDL_Empty_List',
2151 },
2152 'playlist_count': 0,
4201ba13 2153 'skip': 'This playlist is private',
9291475f
PH
2154 }, {
2155 'note': 'Playlist with deleted videos (#651). As a bonus, the video #51 is also twice in this list.',
2156 'url': 'https://www.youtube.com/playlist?list=PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC',
2157 'info_dict': {
2158 'title': '29C3: Not my department',
acf757f4 2159 'id': 'PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC',
9291475f
PH
2160 },
2161 'playlist_count': 95,
2162 }, {
2163 'note': 'issue #673',
2164 'url': 'PLBB231211A4F62143',
2165 'info_dict': {
f46a8702 2166 'title': '[OLD]Team Fortress 2 (Class-based LP)',
acf757f4 2167 'id': 'PLBB231211A4F62143',
9291475f
PH
2168 },
2169 'playlist_mincount': 26,
2170 }, {
2171 'note': 'Large playlist',
2172 'url': 'https://www.youtube.com/playlist?list=UUBABnxM4Ar9ten8Mdjj1j0Q',
2173 'info_dict': {
2174 'title': 'Uploads from Cauchemar',
acf757f4 2175 'id': 'UUBABnxM4Ar9ten8Mdjj1j0Q',
9291475f
PH
2176 },
2177 'playlist_mincount': 799,
2178 }, {
2179 'url': 'PLtPgu7CB4gbY9oDN3drwC3cMbJggS7dKl',
2180 'info_dict': {
2181 'title': 'YDL_safe_search',
acf757f4 2182 'id': 'PLtPgu7CB4gbY9oDN3drwC3cMbJggS7dKl',
9291475f
PH
2183 },
2184 'playlist_count': 2,
4201ba13 2185 'skip': 'This playlist is private',
ac7553d0
PH
2186 }, {
2187 'note': 'embedded',
2d3d2997 2188 'url': 'https://www.youtube.com/embed/videoseries?list=PL6IaIsEjSbf96XFRuNccS_RuEXwNdsoEu',
ac7553d0
PH
2189 'playlist_count': 4,
2190 'info_dict': {
2191 'title': 'JODA15',
acf757f4 2192 'id': 'PL6IaIsEjSbf96XFRuNccS_RuEXwNdsoEu',
ac7553d0 2193 }
87dadd45
S
2194 }, {
2195 'url': 'http://www.youtube.com/embed/_xDOZElKyNU?list=PLsyOSbh5bs16vubvKePAQ1x3PhKavfBIl',
2196 'playlist_mincount': 485,
2197 'info_dict': {
2198 'title': '2017 華語最新單曲 (2/24更新)',
2199 'id': 'PLsyOSbh5bs16vubvKePAQ1x3PhKavfBIl',
2200 }
6b08cdf6
PH
2201 }, {
2202 'note': 'Embedded SWF player',
2d3d2997 2203 'url': 'https://www.youtube.com/p/YN5VISEtHet5D4NEvfTd0zcgFk84NqFZ?hl=en_US&fs=1&rel=0',
6b08cdf6
PH
2204 'playlist_count': 4,
2205 'info_dict': {
2206 'title': 'JODA7',
acf757f4 2207 'id': 'YN5VISEtHet5D4NEvfTd0zcgFk84NqFZ',
6b08cdf6 2208 }
4b7df0d3
JMF
2209 }, {
2210 'note': 'Buggy playlist: the webpage has a "Load more" button but it doesn\'t have more videos',
2211 'url': 'https://www.youtube.com/playlist?list=UUXw-G3eDE9trcvY2sBMM_aA',
2212 'info_dict': {
acf757f4
PH
2213 'title': 'Uploads from Interstellar Movie',
2214 'id': 'UUXw-G3eDE9trcvY2sBMM_aA',
4b7df0d3 2215 },
481cc733 2216 'playlist_mincount': 21,
dacb3a86
S
2217 }, {
2218 # Playlist URL that does not actually serve a playlist
2219 'url': 'https://www.youtube.com/watch?v=FqZTN594JQw&list=PLMYEtVRpaqY00V9W81Cwmzp6N6vZqfUKD4',
2220 'info_dict': {
2221 'id': 'FqZTN594JQw',
2222 'ext': 'webm',
2223 'title': "Smiley's People 01 detective, Adventure Series, Action",
2224 'uploader': 'STREEM',
2225 'uploader_id': 'UCyPhqAZgwYWZfxElWVbVJng',
ec85ded8 2226 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UCyPhqAZgwYWZfxElWVbVJng',
dacb3a86
S
2227 'upload_date': '20150526',
2228 'license': 'Standard YouTube License',
2229 'description': 'md5:507cdcb5a49ac0da37a920ece610be80',
2230 'categories': ['People & Blogs'],
2231 'tags': list,
2232 'like_count': int,
2233 'dislike_count': int,
2234 },
2235 'params': {
2236 'skip_download': True,
2237 },
2238 'add_ie': [YoutubeIE.ie_key()],
481cc733
S
2239 }, {
2240 'url': 'https://youtu.be/yeWKywCrFtk?list=PL2qgrgXsNUG5ig9cat4ohreBjYLAPC0J5',
2241 'info_dict': {
2242 'id': 'yeWKywCrFtk',
2243 'ext': 'mp4',
2244 'title': 'Small Scale Baler and Braiding Rugs',
2245 'uploader': 'Backus-Page House Museum',
2246 'uploader_id': 'backuspagemuseum',
ec85ded8 2247 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/backuspagemuseum',
481cc733
S
2248 'upload_date': '20161008',
2249 'license': 'Standard YouTube License',
2250 'description': 'md5:800c0c78d5eb128500bffd4f0b4f2e8a',
2251 'categories': ['Nonprofits & Activism'],
2252 'tags': list,
2253 'like_count': int,
2254 'dislike_count': int,
2255 },
2256 'params': {
2257 'noplaylist': True,
2258 'skip_download': True,
2259 },
feaa5ad7
S
2260 }, {
2261 'url': 'https://youtu.be/uWyaPkt-VOI?list=PL9D9FC436B881BA21',
2262 'only_matching': True,
a6857510
S
2263 }, {
2264 'url': 'TLGGrESM50VT6acwMjAyMjAxNw',
2265 'only_matching': True,
409b9324
S
2266 }, {
2267 # music album playlist
2268 'url': 'OLAK5uy_m4xAFdmMC5rX3Ji3g93pQe3hqLZw_9LhM',
2269 'only_matching': True,
81127aa5 2270 }]
c5e8d7af 2271
880e1c52
JMF
def _real_initialize(self):
    """Initialization hook of the playlist extractor; delegates to ``self._login()``."""
    # NOTE(review): _login is not defined in this part of the file; presumably it
    # signs in with the configured account so private playlists can be read - confirm.
    self._login()
2274
def _extract_mix(self, playlist_id):
    """Build a playlist result for an automatically generated mix.

    Watch pages of the mix are downloaded one after another, each request
    starting from the last video collected so far, until a page brings no
    video id that has not been seen yet.  The playlist title is read from
    the last page downloaded.
    """
    # The mixes are generated from a single video
    # the id of the playlist is just 'RD' + video_id
    entry_re = r'''(?xs)data-video-username=".*?".*?
               href="/watch\?v=([0-9A-Za-z_-]{11})&amp;[^"]*?list=%s''' % re.escape(playlist_id)

    collected = []
    seed_id = playlist_id[-11:]
    page_num = 0
    while True:
        page_num += 1
        webpage = self._download_webpage(
            'https://youtube.com/watch?v=%s&list=%s' % (seed_id, playlist_id),
            playlist_id, 'Downloading page {0} of Youtube mix'.format(page_num))
        # Fetch new pages until all the videos are repeated, it seems that
        # there are always 51 unique videos.
        unseen = []
        for candidate in orderedSet(re.findall(entry_re, webpage)):
            if candidate not in collected:
                unseen.append(candidate)
        if not unseen:
            break
        collected.extend(unseen)
        seed_id = collected[-1]

    url_results = self._ids_to_results(collected)

    # Try the known title containers in order and keep the first non-empty one
    title_span = None
    for class_name in ('playlist-title', 'title long-title', 'title'):
        title_span = get_element_by_attribute('class', class_name, webpage)
        if title_span:
            break

    return self.playlist_result(url_results, playlist_id, clean_html(title_span))
2306
def _extract_playlist(self, playlist_id):
    """Download a playlist page and build a playlist result from it.

    Returns a ``(has_videos, playlist)`` tuple. ``has_videos`` is False only
    when the page has no playlist title and ``self._entries`` yields nothing.

    Raises ExtractorError (expected) when the page carries an alert saying
    that the playlist does not exist, is private, or that the parameters
    are invalid.
    """
    url = self._TEMPLATE_URL % playlist_id
    page = self._download_webpage(url, playlist_id)

    # the yt-alert-message now has tabindex attribute (see https://github.com/rg3/youtube-dl/issues/11604)
    for alert in re.findall(r'<div class="yt-alert-message"[^>]*>([^<]+)</div>', page):
        alert = alert.strip()
        # Check if the playlist exists or is private
        unavailable = re.match(
            r'[^<]*(?:The|This) playlist (?P<reason>does not exist|is private)[^<]*', alert)
        if unavailable:
            reason = unavailable.group('reason')
            hint = ', use --username or --netrc to access it' if 'private' in reason else ''
            raise ExtractorError('This playlist %s' % reason + hint + '.', expected=True)
        if re.match(r'[^<]*Invalid parameters[^<]*', alert):
            raise ExtractorError(
                'Invalid parameters. Maybe URL is incorrect.',
                expected=True)
        if re.match(r'[^<]*Choose your language[^<]*', alert):
            # Harmless language prompt, nothing to report.
            continue
        self.report_warning('Youtube gives an alert message: ' + alert)

    playlist_title = self._html_search_regex(
        r'(?s)<h1 class="pl-header-title[^"]*"[^>]*>\s*(.*?)\s*</h1>',
        page, 'title', default=None)

    # Common prefix of the two uploader patterns below.
    uploader_prefix = r'class=["\']pl-header-details[^>]+>\s*<li>\s*<a[^>]+\bhref='
    uploader = self._search_regex(
        uploader_prefix + r'["\']/(?:user|channel)/[^>]+>([^<]+)',
        page, 'uploader', default=None)
    uploader_id = uploader_url = None
    link = re.search(
        uploader_prefix + r'(["\'])(?P<path>/(?:user|channel)/(?P<uploader_id>.+?))\1',
        page)
    if link:
        uploader_id = link.group('uploader_id')
        uploader_url = compat_urlparse.urljoin(url, link.group('path'))

    has_videos = True
    if not playlist_title:
        # Some playlist URLs don't actually serve a playlist (e.g.
        # https://www.youtube.com/watch?v=FqZTN594JQw&list=PLMYEtVRpaqY00V9W81Cwmzp6N6vZqfUKD4)
        try:
            next(self._entries(page, playlist_id))
        except StopIteration:
            has_videos = False

    playlist = self.playlist_result(
        self._entries(page, playlist_id), playlist_id, playlist_title)
    playlist['uploader'] = uploader
    playlist['uploader_id'] = uploader_id
    playlist['uploader_url'] = uploader_url

    return has_videos, playlist
c5e8d7af 2368
def _check_download_just_video(self, url, playlist_id):
    """Decide whether ``url`` should be handled as a single video.

    Returns a ``(video_id, result)`` tuple:
    * ``(None, None)`` when no video id can be found in ``url``;
    * ``(video_id, url_result)`` when the 'noplaylist' param is set;
    * ``(video_id, None)`` when the playlist should be downloaded.
    """
    # Check if it's a video-specific URL
    query = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query)
    video_id = query.get('v', [None])[0]
    if not video_id:
        video_id = self._search_regex(
            r'(?:(?:^|//)youtu\.be/|youtube\.com/embed/(?!videoseries))([0-9A-Za-z_-]{11})', url,
            'video id', default=None)
    if not video_id:
        return None, None

    if self._downloader.params.get('noplaylist'):
        self.to_screen('Downloading just video %s because of --no-playlist' % video_id)
        return video_id, self.url_result(video_id, 'Youtube', video_id=video_id)

    self.to_screen('Downloading playlist %s - add --no-playlist to just download video %s' % (playlist_id, video_id))
    return video_id, None
448830ce 2383
ebf1b291
S
def _real_extract(self, url):
    """Extract a playlist, a mix, or a single video from ``url``.

    Raises ExtractorError when ``url`` does not match ``_VALID_URL``.
    """
    # Extract playlist id
    url_match = re.match(self._VALID_URL, url)
    if url_match is None:
        raise ExtractorError('Invalid URL: %s' % url)
    playlist_id = url_match.group(1) or url_match.group(2)

    video_id, single_video = self._check_download_just_video(url, playlist_id)
    if single_video:
        return single_video

    is_mix = playlist_id.startswith(('RD', 'UL', 'PU'))
    if is_mix:
        # Mixes require a custom extraction process
        return self._extract_mix(playlist_id)

    has_videos, playlist = self._extract_playlist(playlist_id)
    if not has_videos and video_id:
        # Some playlist URLs don't actually serve a playlist (see
        # https://github.com/rg3/youtube-dl/issues/10537).
        # Fallback to plain video extraction if there is a video id
        # along with playlist id.
        return self.url_result(video_id, 'Youtube', video_id=video_id)

    return playlist
448830ce 2408
c5e8d7af 2409
class YoutubeChannelIE(YoutubePlaylistBaseInfoExtractor):
    """Extractor for the videos listed on a YouTube channel page."""

    IE_DESC = 'YouTube.com channels'
    IE_NAME = 'youtube:channel'
    _VALID_URL = r'https?://(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/(?P<id>[0-9A-Za-z_-]+)'
    _TEMPLATE_URL = 'https://www.youtube.com/channel/%s/videos'
    _VIDEO_RE = r'(?:title="(?P<title>[^"]+)"[^>]+)?href="/watch\?v=(?P<id>[0-9A-Za-z_-]+)&?'
    _TESTS = [{
        'note': 'paginated channel',
        'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w',
        'playlist_mincount': 91,
        'info_dict': {
            'id': 'UUKfVa3S1e4PHvxWcwyMMg8w',
            'title': 'Uploads from lex will',
        }
    }, {
        'note': 'Age restricted channel',
        # from https://www.youtube.com/user/DeusExOfficial
        'url': 'https://www.youtube.com/channel/UCs0ifCMCm1icqRbqhUINa0w',
        'playlist_mincount': 64,
        'info_dict': {
            'id': 'UUs0ifCMCm1icqRbqhUINa0w',
            'title': 'Uploads from Deus Ex',
        },
    }]

    @classmethod
    def suitable(cls, url):
        """Reject URLs claimed by the playlists or live extractors."""
        if YoutubePlaylistsIE.suitable(url) or YoutubeLiveIE.suitable(url):
            return False
        return super(YoutubeChannelIE, cls).suitable(url)

    def _build_template_url(self, url, channel_id):
        """Return the videos-listing URL for ``channel_id``."""
        return self._TEMPLATE_URL % channel_id

    def _real_extract(self, url):
        """Extract the channel, preferring its uploads playlist if found."""
        channel_id = self._match_id(url)
        url = self._build_template_url(url, channel_id)

        # Channel by page listing is restricted to 35 pages of 30 items, i.e. 1050 videos total (see #5778)
        # Workaround by extracting as a playlist if managed to obtain channel playlist URL
        # otherwise fallback on channel by page extraction
        listing_page = self._download_webpage(
            url + '?view=57', channel_id,
            'Downloading channel page', fatal=False)
        channel_playlist_id = False
        if listing_page is not False:
            channel_playlist_id = self._html_search_meta(
                'channelId', listing_page, 'channel id', default=None)
            if not channel_playlist_id:
                app_url = self._html_search_meta(
                    ('al:ios:url', 'twitter:app:url:iphone', 'twitter:app:url:ipad'),
                    listing_page, 'channel url', default=None)
                if app_url:
                    channel_playlist_id = self._search_regex(
                        r'vnd\.youtube://user/([0-9A-Za-z_-]+)',
                        app_url, 'channel id', default=None)
        if channel_playlist_id and channel_playlist_id.startswith('UC'):
            # Swap the 'UC' prefix for 'UU' and delegate to the playlist extractor.
            uploads_id = 'UU' + channel_playlist_id[2:]
            return self.url_result(
                compat_urlparse.urljoin(url, '/playlist?list=%s' % uploads_id), 'YoutubePlaylist')

        channel_page = self._download_webpage(url, channel_id, 'Downloading page #1')
        autogenerated_marker = re.search(r'''(?x)
                class="[^"]*?(?:
                    channel-header-autogenerated-label|
                    yt-channel-title-autogenerated
                )[^"]*"''', channel_page)

        if autogenerated_marker is not None:
            # The videos are contained in a single page
            # the ajax pages can't be used, they are empty
            entries = []
            for video_id, video_title in self.extract_videos_from_page(channel_page):
                entries.append(self.url_result(
                    video_id, 'Youtube', video_id=video_id,
                    video_title=video_title))
            return self.playlist_result(entries, channel_id)

        # Probe for a first entry; if there is none, surface any alert
        # message found on the page as an error.
        try:
            next(self._entries(channel_page, channel_id))
        except StopIteration:
            alert_message = self._html_search_regex(
                r'(?s)<div[^>]+class=(["\']).*?\byt-alert-message\b.*?\1[^>]*>(?P<alert>[^<]+)</div>',
                channel_page, 'alert', default=None, group='alert')
            if alert_message:
                raise ExtractorError('Youtube said: %s' % alert_message, expected=True)

        return self.playlist_result(self._entries(channel_page, channel_id), channel_id)
c5e8d7af
PH
2499
2500
class YoutubeUserIE(YoutubeChannelIE):
    """Extractor for the videos of a YouTube user or custom (/c/) page."""

    IE_DESC = 'YouTube.com user videos (URL or "ytuser" keyword)'
    IE_NAME = 'youtube:user'
    _VALID_URL = r'(?:(?:https?://(?:\w+\.)?youtube\.com/(?:(?P<user>user|c)/)?(?!(?:attribution_link|watch|results|shared)(?:$|[^a-z_A-Z0-9-])))|ytuser:)(?!feed/)(?P<id>[A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'https://www.youtube.com/%s/%s/videos'

    _TESTS = [{
        'url': 'https://www.youtube.com/user/TheLinuxFoundation',
        'playlist_mincount': 320,
        'info_dict': {
            'id': 'UUfX55Sx5hEFjoC3cNs6mCUQ',
            'title': 'Uploads from The Linux Foundation',
        }
    }, {
        # Only available via https://www.youtube.com/c/12minuteathlete/videos
        # but not https://www.youtube.com/user/12minuteathlete/videos
        'url': 'https://www.youtube.com/c/12minuteathlete/videos',
        'playlist_mincount': 249,
        'info_dict': {
            'id': 'UUVjM-zV6_opMDx7WYxnjZiQ',
            'title': 'Uploads from 12 Minute Athlete',
        }
    }, {
        'url': 'ytuser:phihag',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/c/gametrailers',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/gametrailers',
        'only_matching': True,
    }, {
        # This channel is not available, geo restricted to JP
        'url': 'https://www.youtube.com/user/kananishinoSMEJ/videos',
        'only_matching': True,
    }]

    @classmethod
    def suitable(cls, url):
        """Accept ``url`` only if no other Youtube*IE in this module does."""
        # Don't return True if the url can be extracted with another youtube
        # extractor: this regex is too permissive and would match as well.
        for name, klass in globals().items():
            if not name.startswith('Youtube') or not name.endswith('IE') or klass is cls:
                continue
            if klass.suitable(url):
                return False
        return super(YoutubeUserIE, cls).suitable(url)

    def _build_template_url(self, url, channel_id):
        """Return the videos-listing URL, defaulting the path kind to 'user'."""
        url_match = re.match(self._VALID_URL, url)
        path_kind = url_match.group('user') or 'user'
        return self._TEMPLATE_URL % (path_kind, url_match.group('id'))
2551
b05654f0 2552
f07e276a
S
class YoutubeLiveIE(YoutubeBaseInfoExtractor):
    """Extractor for the ``/live`` page of a user or channel."""

    IE_DESC = 'YouTube.com live streams'
    IE_NAME = 'youtube:live'
    _VALID_URL = r'(?P<base_url>https?://(?:\w+\.)?youtube\.com/(?:(?:user|channel|c)/)?(?P<id>[^/]+))/live'

    _TESTS = [{
        'url': 'https://www.youtube.com/user/TheYoungTurks/live',
        'info_dict': {
            'id': 'a48o2S1cPoo',
            'ext': 'mp4',
            'title': 'The Young Turks - Live Main Show',
            'uploader': 'The Young Turks',
            'uploader_id': 'TheYoungTurks',
            'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/TheYoungTurks',
            'upload_date': '20150715',
            'license': 'Standard YouTube License',
            'description': 'md5:438179573adcdff3c97ebb1ee632b891',
            'categories': ['News & Politics'],
            'tags': ['Cenk Uygur (TV Program Creator)', 'The Young Turks (Award-Winning Work)', 'Talk Show (TV Genre)'],
            'like_count': int,
            'dislike_count': int,
        },
        'params': {
            'skip_download': True,
        },
    }, {
        'url': 'https://www.youtube.com/channel/UC1yBKRuGpC1tSM73A0ZjYjQ/live',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/c/CommanderVideoHq/live',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/TheYoungTurks/live',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Return the live video if the page exposes one, else the base URL."""
        url_match = re.match(self._VALID_URL, url)
        channel_id = url_match.group('id')
        fallback = self.url_result(url_match.group('base_url'))

        webpage = self._download_webpage(url, channel_id, fatal=False)
        if not webpage:
            return fallback

        page_type = self._og_search_property(
            'type', webpage, 'page type', default='')
        video_id = self._html_search_meta(
            'videoId', webpage, 'video id', default=None)
        # Only trust the page when it is a video page carrying an
        # 11-character video id.
        looks_like_video = (
            page_type.startswith('video') and video_id
            and re.match(r'^[0-9A-Za-z_-]{11}$', video_id))
        if looks_like_video:
            return self.url_result(video_id, YoutubeIE.ie_key())
        return fallback
2603
2604
e462474e
S
class YoutubePlaylistsIE(YoutubePlaylistsBaseInfoExtractor):
    """Extractor for the ``/playlists`` page of a user or channel.

    Only class attributes are defined here; the extraction logic comes
    from the base class.
    """

    IE_NAME = 'youtube:playlists'
    IE_DESC = 'YouTube.com user/channel playlists'
    _VALID_URL = r'https?://(?:\w+\.)?youtube\.com/(?:user|channel)/(?P<id>[^/]+)/playlists'

    _TESTS = [{
        'url': 'https://www.youtube.com/user/ThirstForScience/playlists',
        'playlist_mincount': 4,
        'info_dict': {
            'id': 'ThirstForScience',
            'title': 'Thirst for Science',
        },
    }, {
        # with "Load more" button
        'url': 'https://www.youtube.com/user/igorkle1/playlists?view=1&sort=dd',
        'playlist_mincount': 70,
        'info_dict': {
            'id': 'igorkle1',
            'title': 'Игорь Клейнер',
        },
    }, {
        'url': 'https://www.youtube.com/channel/UCiU1dHvZObB2iP6xkJ__Icw/playlists',
        'playlist_mincount': 17,
        'info_dict': {
            'id': 'UCiU1dHvZObB2iP6xkJ__Icw',
            'title': 'Chem Player',
        },
    }]
0c148415
S
2633
2634
870f3bfc
S
class YoutubeSearchBaseInfoExtractor(YoutubePlaylistBaseInfoExtractor):
    """Shared base of the search extractors; overrides only ``_VIDEO_RE``."""

    # Captures an 11-character video id and, optionally, a following title
    # attribute.
    _VIDEO_RE = r'href="\s*/watch\?v=(?P<id>[0-9A-Za-z_-]{11})(?:[^"]*"[^>]+\btitle="(?P<title>[^"]+))?'
2637
2638
class YoutubeSearchIE(SearchInfoExtractor, YoutubeSearchBaseInfoExtractor):
    """Extractor for ``ytsearch`` keyword queries."""

    IE_DESC = 'YouTube.com searches'
    # there doesn't appear to be a real limit, for example if you search for
    # 'python' you get more than 8.000.000 results
    _MAX_RESULTS = float('inf')
    IE_NAME = 'youtube:search'
    _SEARCH_KEY = 'ytsearch'
    _EXTRA_QUERY_ARGS = {}
    _TESTS = []

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query.

        Result pages are downloaded one after another, following the
        "Next" link, until ``n`` videos are collected, a page yields no
        video, or there is no next link.

        Raises ExtractorError (expected) when a downloaded page contains a
        search message instead of results.
        """
        videos = []

        url_query = {
            'search_query': query.encode('utf-8'),
        }
        url_query.update(self._EXTRA_QUERY_ARGS)
        result_url = 'https://www.youtube.com/results?' + compat_urllib_parse_urlencode(url_query)

        for pagenum in itertools.count(1):
            data = self._download_json(
                result_url, video_id='query "%s"' % query,
                note='Downloading page %s' % pagenum,
                errnote='Unable to download API page',
                query={'spf': 'navigate'})
            html_content = data[1]['body']['content']

            if 'class="search-message' in html_content:
                raise ExtractorError(
                    '[youtube] No video results', expected=True)

            new_videos = list(self._process_page(html_content))
            videos += new_videos
            # Stop as soon as enough results are collected. Using ">=" avoids
            # downloading one extra page whose results would only be thrown
            # away by the truncation below.
            if not new_videos or len(videos) >= n:
                break
            next_link = self._html_search_regex(
                r'href="(/results\?[^"]*\bsp=[^"]+)"[^>]*>\s*<span[^>]+class="[^"]*\byt-uix-button-content\b[^"]*"[^>]*>Next',
                html_content, 'next link', default=None)
            if next_link is None:
                break
            result_url = compat_urlparse.urljoin('https://www.youtube.com/', next_link)

        # The last page may have pushed the total past the requested count.
        if len(videos) > n:
            videos = videos[:n]
        return self.playlist_result(videos, query)
75dff0ee 2687
c9ae7b95 2688
class YoutubeSearchDateIE(YoutubeSearchIE):
    """Variant of YoutubeSearchIE that adds a sort-by-upload-date query arg."""

    IE_NAME = YoutubeSearchIE.IE_NAME + ':date'
    IE_DESC = 'YouTube.com searches, newest videos first'
    _SEARCH_KEY = 'ytsearchdate'
    _EXTRA_QUERY_ARGS = {'search_sort': 'video_date_uploaded'}
75dff0ee 2694
c9ae7b95 2695
class YoutubeSearchURLIE(YoutubeSearchBaseInfoExtractor):
    """Extractor for YouTube ``/results`` search URLs."""

    IE_NAME = 'youtube:search_url'
    IE_DESC = 'YouTube.com search URLs'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/results\?(.*?&)?(?:search_query|q)=(?P<query>[^&]+)(?:[&]|$)'
    _TESTS = [{
        'url': 'https://www.youtube.com/results?baz=bar&search_query=youtube-dl+test+video&filters=video&lclk=video',
        'playlist_mincount': 5,
        'info_dict': {
            'title': 'youtube-dl test video',
        }
    }, {
        'url': 'https://www.youtube.com/results?q=test&sp=EgQIBBgB',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Download the results page and return its videos as a playlist.

        The playlist title is the URL-decoded search query.
        """
        raw_query = re.match(self._VALID_URL, url).group('query')
        query = compat_urllib_parse_unquote_plus(raw_query)
        results_page = self._download_webpage(url, query)
        entries = self._process_page(results_page)
        return self.playlist_result(entries, playlist_title=query)
c9ae7b95
PH
2716
2717
class YoutubeShowIE(YoutubePlaylistsBaseInfoExtractor):
    """Extractor for YouTube show pages."""

    IE_NAME = 'youtube:show'
    IE_DESC = 'YouTube.com (multi-season) shows'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/show/(?P<id>[^?#]*)'
    _TESTS = [{
        'url': 'https://www.youtube.com/show/airdisasters',
        'playlist_mincount': 5,
        'info_dict': {
            'id': 'airdisasters',
            'title': 'Air Disasters',
        }
    }]

    def _real_extract(self, url):
        """Delegate to the base class using the show's ``/playlists`` URL."""
        show_id = self._match_id(url)
        playlists_url = 'https://www.youtube.com/show/%s/playlists' % show_id
        return super(YoutubeShowIE, self)._real_extract(playlists_url)
04cc9617
JMF
2735
2736
class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor):
    """
    Base class for feed extractors
    Subclasses must define the _FEED_NAME and _PLAYLIST_TITLE properties.
    """
    _LOGIN_REQUIRED = True

    @property
    def IE_NAME(self):
        """Extractor name derived from the subclass's ``_FEED_NAME``."""
        return 'youtube:%s' % self._FEED_NAME

    def _real_initialize(self):
        """Call ``self._login()`` when the extractor is initialised."""
        self._login()

    def _entries(self, page):
        """Yield one result per previously unseen video id in the feed.

        Starts from ``page`` and keeps following the "load more" link until
        a portion brings no new video id or no further link is present.
        """
        # The extraction process is the same as for playlists, but the regex
        # for the video ids doesn't contain an index
        seen_ids = set()
        more_widget_html = content_html = page
        for page_num in itertools.count(1):
            matches = re.findall(r'href="\s*/watch\?v=([0-9A-Za-z_-]{11})', content_html)

            # 'recommended' feed has infinite 'load more' and each new portion spins
            # the same videos in (sometimes) slightly different order, so we'll check
            # for uniqueness and break when portion has no new videos.
            # A set keeps the membership test constant-time however long
            # the feed gets; orderedSet preserves the page order.
            new_ids = [
                video_id for video_id in orderedSet(matches)
                if video_id not in seen_ids]
            if not new_ids:
                break

            seen_ids.update(new_ids)

            for entry in self._ids_to_results(new_ids):
                yield entry

            mobj = re.search(r'data-uix-load-more-href="/?(?P<more>[^"]+)"', more_widget_html)
            if not mobj:
                break

            more = self._download_json(
                'https://youtube.com/%s' % mobj.group('more'), self._PLAYLIST_TITLE,
                'Downloading page #%s' % page_num,
                transform_source=uppercase_escape)
            content_html = more['content_html']
            more_widget_html = more['load_more_widget_html']

    def _real_extract(self, url):
        """Download the feed page and return its entries as a playlist."""
        page = self._download_webpage(
            'https://www.youtube.com/feed/%s' % self._FEED_NAME,
            self._PLAYLIST_TITLE)
        return self.playlist_result(
            self._entries(page), playlist_title=self._PLAYLIST_TITLE)
25f14e9f
S
2788
2789
class YoutubeWatchLaterIE(YoutubePlaylistIE):
    """Extractor for the 'WL' (watch later) playlist."""

    IE_NAME = 'youtube:watchlater'
    IE_DESC = 'Youtube watch later list, ":ytwatchlater" for short (requires authentication)'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/(?:feed/watch_later|(?:playlist|watch)\?(?:.+&)?list=WL)|:ytwatchlater'

    _TESTS = [{
        'url': 'https://www.youtube.com/playlist?list=WL',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/watch?v=bCNU9TrbiRk&index=1&list=WL',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Return a single video if requested, else the 'WL' playlist."""
        single_video = self._check_download_just_video(url, 'WL')[1]
        if single_video:
            return single_video
        # The has_videos flag is not needed here, only the playlist itself.
        return self._extract_playlist('WL')[1]
f459d170 2809
5f6a1245 2810
class YoutubeFavouritesIE(YoutubeBaseInfoExtractor):
    """Extractor that resolves the favourites page to its playlist."""

    IE_NAME = 'youtube:favorites'
    IE_DESC = 'YouTube.com favourite videos, ":ytfav" for short (requires authentication)'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/my_favorites|:ytfav(?:ou?rites)?'
    _LOGIN_REQUIRED = True

    def _real_extract(self, url):
        """Find the playlist id on the favourites page and delegate to it."""
        favourites_page = self._download_webpage(
            'https://www.youtube.com/my_favorites', 'Youtube Favourites videos')
        favourites_id = self._search_regex(
            r'list=(.+?)["&]', favourites_page, 'favourites playlist id')
        return self.url_result(favourites_id, 'YoutubePlaylist')
15870e90
PH
2821
2822
25f14e9f
S
class YoutubeRecommendedIE(YoutubeFeedsInfoExtractor):
    """Feed extractor configured for the 'recommended' feed."""

    IE_DESC = 'YouTube.com recommended videos, ":ytrec" for short (requires authentication)'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/feed/recommended|:ytrec(?:ommended)?'
    _PLAYLIST_TITLE = 'Youtube Recommended videos'
    _FEED_NAME = 'recommended'
1ed5b5c9 2828
1ed5b5c9 2829
25f14e9f
S
class YoutubeSubscriptionsIE(YoutubeFeedsInfoExtractor):
    """Feed extractor configured for the 'subscriptions' feed."""

    IE_DESC = 'YouTube.com subscriptions feed, "ytsubs" keyword (requires authentication)'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/feed/subscriptions|:ytsubs(?:criptions)?'
    _PLAYLIST_TITLE = 'Youtube Subscriptions'
    _FEED_NAME = 'subscriptions'
1ed5b5c9 2835
1ed5b5c9 2836
25f14e9f
S
class YoutubeHistoryIE(YoutubeFeedsInfoExtractor):
    """Feed extractor configured for the 'history' feed."""

    IE_DESC = 'Youtube watch history, ":ythistory" for short (requires authentication)'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/feed/history|:ythistory'
    _PLAYLIST_TITLE = 'Youtube History'
    _FEED_NAME = 'history'
1ed5b5c9
JMF
2842
2843
15870e90
PH
class YoutubeTruncatedURLIE(InfoExtractor):
    """Matches watch URLs lacking a video id and reports a quoting hint."""

    IE_NAME = 'youtube:truncated_url'
    IE_DESC = False  # Do not list
    _VALID_URL = r'''(?x)
        (?:https?://)?
        (?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie)?\.com/
        (?:watch\?(?:
            feature=[a-z_]+|
            annotation_id=annotation_[^&]+|
            x-yt-cl=[0-9]+|
            hl=[^&]*|
            t=[0-9]+
        )?
        |
            attribution_link\?a=[^&]+
        )
        $
    '''

    _TESTS = [{
        'url': 'https://www.youtube.com/watch?annotation_id=annotation_3951667041',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/watch?',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/watch?x-yt-cl=84503534',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/watch?feature=foo',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/watch?hl=en-GB',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/watch?t=2372',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Always raise an expected ExtractorError with a quoting hint."""
        hint = (
            'Did you forget to quote the URL? Remember that & is a meta '
            'character in most shells, so you want to put the URL in quotes, '
            'like  youtube-dl '
            '"https://www.youtube.com/watch?feature=foo&v=BaW_jenozKc" '
            ' or simply  youtube-dl BaW_jenozKc  .')
        raise ExtractorError(hint, expected=True)
772fd5cc
PH
2891
2892
class YoutubeTruncatedIDIE(InfoExtractor):
    """Matches watch URLs whose video id has 1 to 10 characters."""

    IE_NAME = 'youtube:truncated_id'
    IE_DESC = False  # Do not list
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/watch\?v=(?P<id>[0-9A-Za-z_-]{1,10})$'

    _TESTS = [{
        'url': 'https://www.youtube.com/watch?v=N_708QY7Ob',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Always raise an expected ExtractorError naming the short id."""
        short_id = self._match_id(url)
        complaint = 'Incomplete YouTube ID %s. URL %s looks truncated.' % (short_id, url)
        raise ExtractorError(complaint, expected=True)