]> jfr.im git - yt-dlp.git/blame - youtube_dl/extractor/youtube.py
[pornhub:uservideos] Add support for new URLs (closes #17388)
[yt-dlp.git] / youtube_dl / extractor / youtube.py
CommitLineData
c5e8d7af 1# coding: utf-8
c5e8d7af 2
78caa52a
PH
3from __future__ import unicode_literals
4
5
0ca96d48 6import itertools
c5e8d7af 7import json
c4417ddb 8import os.path
d77ab8e2 9import random
c5e8d7af 10import re
42939b61 11import time
e0df6211 12import traceback
c5e8d7af 13
b05654f0 14from .common import InfoExtractor, SearchInfoExtractor
2b25cb5d 15from ..jsinterp import JSInterpreter
54256267 16from ..swfinterp import SWFInterpreter
4bb4a188 17from ..compat import (
edf3e38e 18 compat_chr,
8d81f3e3 19 compat_kwargs,
c5e8d7af 20 compat_parse_qs,
7fd002c0
S
21 compat_urllib_parse_unquote,
22 compat_urllib_parse_unquote_plus,
15707c7e 23 compat_urllib_parse_urlencode,
7c80519c 24 compat_urllib_parse_urlparse,
7c61bd36 25 compat_urlparse,
c5e8d7af 26 compat_str,
4bb4a188
PH
27)
28from ..utils import (
c5e8d7af 29 clean_html,
9b9c5355 30 error_to_compat_str,
c5e8d7af 31 ExtractorError,
2d30521a 32 float_or_none,
4bb4a188
PH
33 get_element_by_attribute,
34 get_element_by_id,
dd27fd17 35 int_or_none,
94278f72 36 mimetype2ext,
4bb4a188 37 orderedSet,
6310acf5 38 parse_codecs,
7c80519c 39 parse_duration,
54fc90aa 40 qualities,
0cb58b02 41 remove_quotes,
3995d37d 42 remove_start,
cf7e015f 43 smuggle_url,
c93d53f5 44 str_to_int,
556dbe7f 45 try_get,
c5e8d7af
PH
46 unescapeHTML,
47 unified_strdate,
cf7e015f 48 unsmuggle_url,
81c2f20b 49 uppercase_escape,
6e6bc8da 50 urlencode_postdata,
c5e8d7af
PH
51)
52
5f6a1245 53
class YoutubeBaseInfoExtractor(InfoExtractor):
    """Provide base functions for Youtube extractors"""
    _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
    _TWOFACTOR_URL = 'https://accounts.google.com/signin/challenge'

    _LOOKUP_URL = 'https://accounts.google.com/_/signin/sl/lookup'
    _CHALLENGE_URL = 'https://accounts.google.com/_/signin/sl/challenge'
    _TFA_URL = 'https://accounts.google.com/_/signin/challenge?hl=en&TL={0}'

    _NETRC_MACHINE = 'youtube'
    # If True it will raise an error if no login info is provided
    _LOGIN_REQUIRED = False

    _PLAYLIST_ID_RE = r'(?:PL|LL|EC|UU|FL|RD|UL|TL|OLAK5uy_)[0-9A-Za-z-_]{10,}'

    def _set_language(self):
        """Set the PREF cookie on .youtube.com (value includes ``hl=en``)."""
        self._set_cookie(
            '.youtube.com', 'PREF', 'f1=50000000&hl=en',
            # YouTube sets the expire time to about two months
            expire_time=time.time() + 2 * 30 * 24 * 3600)

    def _ids_to_results(self, ids):
        """Return a list with one 'Youtube' url_result per video id in ``ids``."""
        return [
            self.url_result(vid_id, 'Youtube', video_id=vid_id)
            for vid_id in ids]

    def _login(self):
        """
        Attempt to log in to YouTube.
        True is returned if successful or skipped.
        False is returned if login failed.

        If _LOGIN_REQUIRED is set and no authentication was provided, an error is raised.
        """
        username, password = self._get_login_info()
        # No authentication to be performed
        if username is None:
            if self._LOGIN_REQUIRED and self._downloader.params.get('cookiefile') is None:
                raise ExtractorError('No login info available, needed for using %s.' % self.IE_NAME, expected=True)
            return True

        login_page = self._download_webpage(
            self._LOGIN_URL, None,
            note='Downloading login page',
            errnote='unable to fetch login page', fatal=False)
        if login_page is False:
            # Return an explicit False (not None) as promised by the docstring
            return False

        login_form = self._hidden_inputs(login_page)

        def req(url, f_req, note, errnote):
            """POST the login form plus ``f_req`` (JSON-encoded) to ``url``.

            Returns the value of the non-fatal ``_download_json`` call;
            callers below treat ``False`` as a failed request.
            """
            data = login_form.copy()
            data.update({
                'pstMsg': 1,
                'checkConnection': 'youtube',
                'checkedDomains': 'youtube',
                'hl': 'en',
                'deviceinfo': '[null,null,null,[],null,"US",null,null,[],"GlifWebSignIn",null,[null,null,[]]]',
                'f.req': json.dumps(f_req),
                'flowName': 'GlifWebSignIn',
                'flowEntry': 'ServiceLogin',
            })
            return self._download_json(
                url, None, note=note, errnote=errnote,
                # Drop everything preceding the first '[' before JSON parsing
                transform_source=lambda s: re.sub(r'^[^[]*', '', s),
                fatal=False,
                data=urlencode_postdata(data), headers={
                    'Content-Type': 'application/x-www-form-urlencoded;charset=utf-8',
                    'Google-Accounts-XSRF': 1,
                })

        def warn(message):
            self._downloader.report_warning(message)

        # The same ServiceLogin URL is embedded in both the lookup and the
        # challenge request payloads
        service_login_url = 'https://accounts.google.com/ServiceLogin?passive=true&continue=https%3A%2F%2Fwww.youtube.com%2Fsignin%3Fnext%3D%252F%26action_handle_signin%3Dtrue%26hl%3Den%26app%3Ddesktop%26feature%3Dsign_in_button&hl=en&service=youtube&uilel=3&requestPath=%2FServiceLogin&Page=PasswordSeparationSignIn'

        lookup_req = [
            username,
            None, [], None, 'US', None, None, 2, False, True,
            [
                None, None,
                [2, 1, None, 1,
                 service_login_url,
                 None, [], 4],
                1, [None, None, []], None, None, None, True
            ],
            username,
        ]

        lookup_results = req(
            self._LOOKUP_URL, lookup_req,
            'Looking up account info', 'Unable to look up account info')

        if lookup_results is False:
            return False

        user_hash = try_get(lookup_results, lambda x: x[0][2], compat_str)
        if not user_hash:
            warn('Unable to extract user hash')
            return False

        challenge_req = [
            user_hash,
            None, 1, None, [1, None, None, None, [password, None, True]],
            [
                None, None, [2, 1, None, 1, service_login_url, None, [], 4],
                1, [None, None, []], None, None, None, True
            ]]

        challenge_results = req(
            self._CHALLENGE_URL, challenge_req,
            'Logging in', 'Unable to log in')

        if challenge_results is False:
            # Return an explicit False (not None) as promised by the docstring
            return False

        login_res = try_get(challenge_results, lambda x: x[0][5], list)
        if login_res:
            login_msg = try_get(login_res, lambda x: x[5], compat_str)
            # The conditional must be parenthesised: '%' binds tighter than
            # 'if/else', so without it the prefix is lost for other messages
            warn('Unable to login: %s' % (
                'Invalid password'
                if login_msg == 'INCORRECT_ANSWER_ENTERED' else login_msg))
            return False

        res = try_get(challenge_results, lambda x: x[0][-1], list)
        if not res:
            warn('Unable to extract result entry')
            return False

        login_challenge = try_get(res, lambda x: x[0][0], list)
        if login_challenge:
            challenge_str = try_get(login_challenge, lambda x: x[2], compat_str)
            if challenge_str == 'TWO_STEP_VERIFICATION':
                # SEND_SUCCESS - TFA code has been successfully sent to phone
                # QUOTA_EXCEEDED - reached the limit of TFA codes
                status = try_get(login_challenge, lambda x: x[5], compat_str)
                if status == 'QUOTA_EXCEEDED':
                    warn('Exceeded the limit of TFA codes, try later')
                    return False

                tl = try_get(challenge_results, lambda x: x[1][2], compat_str)
                if not tl:
                    warn('Unable to extract TL')
                    return False

                tfa_code = self._get_tfa_info('2-step verification code')

                if not tfa_code:
                    warn(
                        'Two-factor authentication required. Provide it either interactively or with --twofactor <code> '
                        '(Note that only TOTP (Google Authenticator App) codes work at this time.)')
                    return False

                tfa_code = remove_start(tfa_code, 'G-')

                tfa_req = [
                    user_hash, None, 2, None,
                    [
                        9, None, None, None, None, None, None, None,
                        [None, tfa_code, True, 2]
                    ]]

                tfa_results = req(
                    self._TFA_URL.format(tl), tfa_req,
                    'Submitting TFA code', 'Unable to submit TFA code')

                if tfa_results is False:
                    return False

                tfa_res = try_get(tfa_results, lambda x: x[0][5], list)
                if tfa_res:
                    tfa_msg = try_get(tfa_res, lambda x: x[5], compat_str)
                    # Parenthesised for the same precedence reason as above
                    warn('Unable to finish TFA: %s' % (
                        'Invalid TFA code'
                        if tfa_msg == 'INCORRECT_ANSWER_ENTERED' else tfa_msg))
                    return False

                check_cookie_url = try_get(
                    tfa_results, lambda x: x[0][-1][2], compat_str)
            else:
                CHALLENGES = {
                    'LOGIN_CHALLENGE': "This device isn't recognized. For your security, Google wants to make sure it's really you.",
                    'USERNAME_RECOVERY': 'Please provide additional information to aid in the recovery process.',
                    'REAUTH': "There is something unusual about your activity. For your security, Google wants to make sure it's really you.",
                }
                challenge = CHALLENGES.get(
                    challenge_str,
                    '%s returned error %s.' % (self.IE_NAME, challenge_str))
                warn('%s\nGo to https://accounts.google.com/, login and solve a challenge.' % challenge)
                return False
        else:
            check_cookie_url = try_get(res, lambda x: x[2], compat_str)

        if not check_cookie_url:
            warn('Unable to extract CheckCookie URL')
            return False

        check_cookie_results = self._download_webpage(
            check_cookie_url, None, 'Checking cookie', fatal=False)

        if check_cookie_results is False:
            return False

        if 'https://myaccount.google.com/' not in check_cookie_results:
            warn('Unable to log in')
            return False

        return True

    def _download_webpage_handle(self, *args, **kwargs):
        """Delegate to the parent implementation with ``disable_polymer=true`` added to the query."""
        # Work on a copy so the caller's query dict is never mutated; this
        # also tolerates an explicit query=None
        query = dict(kwargs.get('query') or {})
        query['disable_polymer'] = 'true'
        kwargs['query'] = query
        return super(YoutubeBaseInfoExtractor, self)._download_webpage_handle(
            *args, **compat_kwargs(kwargs))

    def _real_initialize(self):
        """Set the language cookie and attempt to log in, unless there is no downloader."""
        if self._downloader is None:
            return
        self._set_language()
        # The login result is deliberately not acted upon here
        self._login()
c5e8d7af 272
8377574c 273
class YoutubeEntryListBaseInfoExtractor(YoutubeBaseInfoExtractor):
    # Extract entries from page with "Load more" button
    def _entries(self, page, playlist_id):
        """Yield the entries of ``page`` and of every page behind "Load more".

        ``page`` serves both as the first chunk of content and as the place
        where the first "Load more" link is searched for. Each chunk is
        handed to ``self._process_page``. Iteration ends when no load-more
        link is found or when a follow-up page has blank content.
        """
        load_more_re = r'data-uix-load-more-href="/?(?P<more>[^"]+)"'
        widget_html = page
        chunk_html = page
        page_num = 1
        while True:
            for item in self._process_page(chunk_html):
                yield item

            match = re.search(load_more_re, widget_html)
            if not match:
                break

            next_page = self._download_json(
                'https://youtube.com/%s' % match.group('more'), playlist_id,
                'Downloading page #%s' % page_num,
                transform_source=uppercase_escape)
            chunk_html = next_page['content_html']
            if not chunk_html.strip():
                # Some webpages show a "Load more" button but they don't
                # have more videos
                break
            widget_html = next_page['load_more_widget_html']
            page_num += 1
296
061a75ed
S
297
class YoutubePlaylistBaseInfoExtractor(YoutubeEntryListBaseInfoExtractor):
    def _process_page(self, content):
        """Yield a 'Youtube' url_result for each (id, title) pair found in ``content``."""
        for video_id, video_title in self.extract_videos_from_page(content):
            yield self.url_result(video_id, 'Youtube', video_id, video_title)

    def extract_videos_from_page(self, page):
        """Return (video_id, title) pairs matched by ``self._VIDEO_RE`` in ``page``.

        Each id appears once, in order of first occurrence. When an id is
        matched several times, the first non-empty (stripped) title wins.
        """
        ids_in_page = []
        titles_in_page = []
        # Maps a video id to its position in the two lists above, so repeated
        # ids are found without rescanning ids_in_page for every match
        position_of = {}
        for mobj in re.finditer(self._VIDEO_RE, page):
            # The link with index 0 is not the first video of the playlist (not sure if still actual)
            # NOTE(review): this compares the 'id' group, not 'index', against
            # '0' - looks unintended, kept as is; confirm before changing
            if 'index' in mobj.groupdict() and mobj.group('id') == '0':
                continue
            video_id = mobj.group('id')
            video_title = unescapeHTML(mobj.group('title'))
            if video_title:
                video_title = video_title.strip()
            idx = position_of.get(video_id)
            if idx is None:
                position_of[video_id] = len(ids_in_page)
                ids_in_page.append(video_id)
                titles_in_page.append(video_title)
            elif video_title and not titles_in_page[idx]:
                titles_in_page[idx] = video_title
        return zip(ids_in_page, titles_in_page)
322
323
061a75ed
S
class YoutubePlaylistsBaseInfoExtractor(YoutubeEntryListBaseInfoExtractor):
    def _process_page(self, content):
        """Yield a 'YoutubePlaylist' url_result for each distinct playlist link in ``content``."""
        found_ids = re.findall(
            r'<h3[^>]+class="[^"]*yt-lockup-title[^"]*"[^>]*><a[^>]+href="/?playlist\?list=([0-9A-Za-z-_]{10,})"',
            content)
        # orderedSet is applied to drop repeated ids before yielding
        for playlist_id in orderedSet(found_ids):
            playlist_url = 'https://www.youtube.com/playlist?list=%s' % playlist_id
            yield self.url_result(playlist_url, 'YoutubePlaylist')

    def _real_extract(self, url):
        """Download ``url`` and return a playlist result built from its entries.

        The title is taken from the page's og title and is optional
        (the lookup is non-fatal).
        """
        playlist_id = self._match_id(url)
        webpage = self._download_webpage(url, playlist_id)
        title = self._og_search_title(webpage, fatal=False)
        entries = self._entries(webpage, playlist_id)
        return self.playlist_result(entries, playlist_id, title)
0c148415
S
337
338
360e1ca5 339class YoutubeIE(YoutubeBaseInfoExtractor):
78caa52a 340 IE_DESC = 'YouTube.com'
cb7dfeea 341 _VALID_URL = r"""(?x)^
c5e8d7af 342 (
edb53e2d 343 (?:https?://|//) # http(s):// or protocol-independent URL
cb7dfeea 344 (?:(?:(?:(?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie)?\.com/|
484aaeb2 345 (?:www\.)?deturl\.com/www\.youtube\.com/|
e70dc1d1 346 (?:www\.)?pwnyoutube\.com/|
8b561bfc 347 (?:www\.)?hooktube\.com/|
f7000f3a 348 (?:www\.)?yourepeat\.com/|
e69ae5b9
JMF
349 tube\.majestyc\.net/|
350 youtube\.googleapis\.com/) # the various hostnames, with wildcard subdomains
c5e8d7af
PH
351 (?:.*?\#/)? # handle anchor (#/) redirect urls
352 (?: # the various things that can precede the ID:
ac7553d0 353 (?:(?:v|embed|e)/(?!videoseries)) # v/ or embed/ or e/
c5e8d7af 354 |(?: # or the v= param in all its forms
f7000f3a 355 (?:(?:watch|movie)(?:_popup)?(?:\.php)?/?)? # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
c5e8d7af 356 (?:\?|\#!?) # the params delimiter ? or # or #!
040ac686 357 (?:.*?[&;])?? # any other preceding param (like /?s=tuff&v=xxxx or ?s=tuff&amp;v=V36LpHqtcDY)
c5e8d7af
PH
358 v=
359 )
f4b05232 360 ))
cbaed4bb
S
361 |(?:
362 youtu\.be| # just youtu.be/xxxx
6d4fc66b
S
363 vid\.plus| # or vid.plus/xxxx
364 zwearz\.com/watch| # or zwearz.com/watch/xxxx
cbaed4bb 365 )/
edb53e2d 366 |(?:www\.)?cleanvideosearch\.com/media/action/yt/watch\?videoId=
f4b05232 367 )
c5e8d7af 368 )? # all until now is optional -> you can pass the naked ID
8963d9c2 369 ([0-9A-Za-z_-]{11}) # here is it! the YouTube video ID
d0ba5587
S
370 (?!.*?\blist=
371 (?:
372 %(playlist_id)s| # combined list/video URLs are handled by the playlist IE
373 WL # WL are handled by the watch later IE
374 )
375 )
c5e8d7af 376 (?(1).+)? # if we found the ID, everything can follow
d0ba5587 377 $""" % {'playlist_id': YoutubeBaseInfoExtractor._PLAYLIST_ID_RE}
c5e8d7af 378 _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
2c62dc26 379 _formats = {
c2d3cb4c 380 '5': {'ext': 'flv', 'width': 400, 'height': 240, 'acodec': 'mp3', 'abr': 64, 'vcodec': 'h263'},
381 '6': {'ext': 'flv', 'width': 450, 'height': 270, 'acodec': 'mp3', 'abr': 64, 'vcodec': 'h263'},
382 '13': {'ext': '3gp', 'acodec': 'aac', 'vcodec': 'mp4v'},
383 '17': {'ext': '3gp', 'width': 176, 'height': 144, 'acodec': 'aac', 'abr': 24, 'vcodec': 'mp4v'},
384 '18': {'ext': 'mp4', 'width': 640, 'height': 360, 'acodec': 'aac', 'abr': 96, 'vcodec': 'h264'},
385 '22': {'ext': 'mp4', 'width': 1280, 'height': 720, 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264'},
386 '34': {'ext': 'flv', 'width': 640, 'height': 360, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'},
387 '35': {'ext': 'flv', 'width': 854, 'height': 480, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'},
3834d3e3 388 # itag 36 videos are either 320x180 (BaW_jenozKc) or 320x240 (__2ABJjxzNo), abr varies as well
c2d3cb4c 389 '36': {'ext': '3gp', 'width': 320, 'acodec': 'aac', 'vcodec': 'mp4v'},
390 '37': {'ext': 'mp4', 'width': 1920, 'height': 1080, 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264'},
391 '38': {'ext': 'mp4', 'width': 4096, 'height': 3072, 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264'},
392 '43': {'ext': 'webm', 'width': 640, 'height': 360, 'acodec': 'vorbis', 'abr': 128, 'vcodec': 'vp8'},
393 '44': {'ext': 'webm', 'width': 854, 'height': 480, 'acodec': 'vorbis', 'abr': 128, 'vcodec': 'vp8'},
394 '45': {'ext': 'webm', 'width': 1280, 'height': 720, 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8'},
e1a0bfdf 395 '46': {'ext': 'webm', 'width': 1920, 'height': 1080, 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8'},
c2d3cb4c 396 '59': {'ext': 'mp4', 'width': 854, 'height': 480, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'},
397 '78': {'ext': 'mp4', 'width': 854, 'height': 480, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'},
e1a0bfdf 398
399
400 # 3D videos
c2d3cb4c 401 '82': {'ext': 'mp4', 'height': 360, 'format_note': '3D', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -20},
402 '83': {'ext': 'mp4', 'height': 480, 'format_note': '3D', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -20},
403 '84': {'ext': 'mp4', 'height': 720, 'format_note': '3D', 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264', 'preference': -20},
404 '85': {'ext': 'mp4', 'height': 1080, 'format_note': '3D', 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264', 'preference': -20},
e1a0bfdf 405 '100': {'ext': 'webm', 'height': 360, 'format_note': '3D', 'acodec': 'vorbis', 'abr': 128, 'vcodec': 'vp8', 'preference': -20},
406 '101': {'ext': 'webm', 'height': 480, 'format_note': '3D', 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8', 'preference': -20},
407 '102': {'ext': 'webm', 'height': 720, 'format_note': '3D', 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8', 'preference': -20},
836a086c 408
96fb5605 409 # Apple HTTP Live Streaming
11f12195 410 '91': {'ext': 'mp4', 'height': 144, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 48, 'vcodec': 'h264', 'preference': -10},
c2d3cb4c 411 '92': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 48, 'vcodec': 'h264', 'preference': -10},
412 '93': {'ext': 'mp4', 'height': 360, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -10},
413 '94': {'ext': 'mp4', 'height': 480, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -10},
414 '95': {'ext': 'mp4', 'height': 720, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 256, 'vcodec': 'h264', 'preference': -10},
415 '96': {'ext': 'mp4', 'height': 1080, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 256, 'vcodec': 'h264', 'preference': -10},
e1a0bfdf 416 '132': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 48, 'vcodec': 'h264', 'preference': -10},
417 '151': {'ext': 'mp4', 'height': 72, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 24, 'vcodec': 'h264', 'preference': -10},
2c62dc26
PH
418
419 # DASH mp4 video
d23028a8
S
420 '133': {'ext': 'mp4', 'height': 240, 'format_note': 'DASH video', 'vcodec': 'h264'},
421 '134': {'ext': 'mp4', 'height': 360, 'format_note': 'DASH video', 'vcodec': 'h264'},
422 '135': {'ext': 'mp4', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'h264'},
423 '136': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'h264'},
424 '137': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'h264'},
425 '138': {'ext': 'mp4', 'format_note': 'DASH video', 'vcodec': 'h264'}, # Height can vary (https://github.com/rg3/youtube-dl/issues/4559)
426 '160': {'ext': 'mp4', 'height': 144, 'format_note': 'DASH video', 'vcodec': 'h264'},
427 '212': {'ext': 'mp4', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'h264'},
428 '264': {'ext': 'mp4', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'h264'},
429 '298': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'h264', 'fps': 60},
430 '299': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'h264', 'fps': 60},
431 '266': {'ext': 'mp4', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'h264'},
836a086c 432
f6f1fc92 433 # Dash mp4 audio
d23028a8
S
434 '139': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 48, 'container': 'm4a_dash'},
435 '140': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 128, 'container': 'm4a_dash'},
436 '141': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 256, 'container': 'm4a_dash'},
437 '256': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'container': 'm4a_dash'},
438 '258': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'container': 'm4a_dash'},
439 '325': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'dtse', 'container': 'm4a_dash'},
440 '328': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'ec-3', 'container': 'm4a_dash'},
836a086c
AZ
441
442 # Dash webm
d23028a8
S
443 '167': {'ext': 'webm', 'height': 360, 'width': 640, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
444 '168': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
445 '169': {'ext': 'webm', 'height': 720, 'width': 1280, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
446 '170': {'ext': 'webm', 'height': 1080, 'width': 1920, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
447 '218': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
448 '219': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
449 '278': {'ext': 'webm', 'height': 144, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp9'},
450 '242': {'ext': 'webm', 'height': 240, 'format_note': 'DASH video', 'vcodec': 'vp9'},
451 '243': {'ext': 'webm', 'height': 360, 'format_note': 'DASH video', 'vcodec': 'vp9'},
452 '244': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'vp9'},
453 '245': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'vp9'},
454 '246': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'vp9'},
455 '247': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'vp9'},
456 '248': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'vp9'},
457 '271': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'vp9'},
4c6b4764 458 # itag 272 videos are either 3840x2160 (e.g. RtoitU2A-3E) or 7680x4320 (sLprVF6d7Ug)
d23028a8
S
459 '272': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'vp9'},
460 '302': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60},
461 '303': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60},
462 '308': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60},
463 '313': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'vp9'},
464 '315': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60},
2c62dc26
PH
465
466 # Dash webm audio
d23028a8
S
467 '171': {'ext': 'webm', 'acodec': 'vorbis', 'format_note': 'DASH audio', 'abr': 128},
468 '172': {'ext': 'webm', 'acodec': 'vorbis', 'format_note': 'DASH audio', 'abr': 256},
ce6b9a2d 469
0857baad 470 # Dash webm audio with opus inside
d23028a8
S
471 '249': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 50},
472 '250': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 70},
473 '251': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 160},
0857baad 474
ce6b9a2d
PH
475 # RTMP (unnamed)
476 '_rtmp': {'protocol': 'rtmp'},
c5e8d7af 477 }
23d17e4b 478 _SUBTITLE_FORMATS = ('ttml', 'vtt')
836a086c 479
fd5c4aab
S
480 _GEO_BYPASS = False
481
78caa52a 482 IE_NAME = 'youtube'
2eb88d95
PH
483 _TESTS = [
484 {
2d3d2997 485 'url': 'https://www.youtube.com/watch?v=BaW_jenozKc&t=1s&end=9',
4bc3a23e
PH
486 'info_dict': {
487 'id': 'BaW_jenozKc',
488 'ext': 'mp4',
489 'title': 'youtube-dl test video "\'/\\ä↭𝕐',
490 'uploader': 'Philipp Hagemeister',
491 'uploader_id': 'phihag',
ec85ded8 492 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/phihag',
4bc3a23e 493 'upload_date': '20121002',
7caf9830 494 'license': 'Standard YouTube License',
4bc3a23e
PH
495 'description': 'test chars: "\'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de .',
496 'categories': ['Science & Technology'],
000b6b5a 497 'tags': ['youtube-dl'],
556dbe7f 498 'duration': 10,
3e7c1224
PH
499 'like_count': int,
500 'dislike_count': int,
7c80519c 501 'start_time': 1,
297a564b 502 'end_time': 9,
2eb88d95 503 }
0e853ca4 504 },
0e853ca4 505 {
2d3d2997 506 'url': 'https://www.youtube.com/watch?v=UxxajLWwzqY',
4bc3a23e
PH
507 'note': 'Test generic use_cipher_signature video (#897)',
508 'info_dict': {
509 'id': 'UxxajLWwzqY',
510 'ext': 'mp4',
511 'upload_date': '20120506',
512 'title': 'Icona Pop - I Love It (feat. Charli XCX) [OFFICIAL VIDEO]',
0cb58b02 513 'alt_title': 'I Love It (feat. Charli XCX)',
7caf9830 514 'description': 'md5:f3ceb5ef83a08d95b9d146f973157cc8',
000b6b5a
S
515 'tags': ['Icona Pop i love it', 'sweden', 'pop music', 'big beat records', 'big beat', 'charli',
516 'xcx', 'charli xcx', 'girls', 'hbo', 'i love it', "i don't care", 'icona', 'pop',
517 'iconic ep', 'iconic', 'love', 'it'],
556dbe7f 518 'duration': 180,
4bc3a23e
PH
519 'uploader': 'Icona Pop',
520 'uploader_id': 'IconaPop',
ec85ded8 521 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/IconaPop',
7caf9830 522 'license': 'Standard YouTube License',
0cb58b02 523 'creator': 'Icona Pop',
936784b2
S
524 'track': 'I Love It (feat. Charli XCX)',
525 'artist': 'Icona Pop',
2eb88d95 526 }
c108eb73
JMF
527 },
528 {
4bc3a23e
PH
529 'url': 'https://www.youtube.com/watch?v=07FYdnEawAQ',
530 'note': 'Test VEVO video with age protection (#956)',
531 'info_dict': {
532 'id': '07FYdnEawAQ',
533 'ext': 'mp4',
534 'upload_date': '20130703',
535 'title': 'Justin Timberlake - Tunnel Vision (Explicit)',
0cb58b02 536 'alt_title': 'Tunnel Vision',
4bc3a23e 537 'description': 'md5:64249768eec3bc4276236606ea996373',
556dbe7f 538 'duration': 419,
4bc3a23e
PH
539 'uploader': 'justintimberlakeVEVO',
540 'uploader_id': 'justintimberlakeVEVO',
ec85ded8 541 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/justintimberlakeVEVO',
7caf9830 542 'license': 'Standard YouTube License',
0cb58b02 543 'creator': 'Justin Timberlake',
7e72694b 544 'track': 'Tunnel Vision',
936784b2 545 'artist': 'Justin Timberlake',
34952f09 546 'age_limit': 18,
c108eb73
JMF
547 }
548 },
fccd3771 549 {
4bc3a23e
PH
550 'url': '//www.YouTube.com/watch?v=yZIXLfi8CZQ',
551 'note': 'Embed-only video (#1746)',
552 'info_dict': {
553 'id': 'yZIXLfi8CZQ',
554 'ext': 'mp4',
555 'upload_date': '20120608',
556 'title': 'Principal Sexually Assaults A Teacher - Episode 117 - 8th June 2012',
557 'description': 'md5:09b78bd971f1e3e289601dfba15ca4f7',
558 'uploader': 'SET India',
94bfcd23 559 'uploader_id': 'setindia',
ec85ded8 560 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/setindia',
7caf9830 561 'license': 'Standard YouTube License',
94bfcd23 562 'age_limit': 18,
fccd3771
PH
563 }
564 },
11b56058 565 {
2d3d2997 566 'url': 'https://www.youtube.com/watch?v=BaW_jenozKc&v=UxxajLWwzqY',
11b56058
PM
567 'note': 'Use the first video ID in the URL',
568 'info_dict': {
569 'id': 'BaW_jenozKc',
570 'ext': 'mp4',
571 'title': 'youtube-dl test video "\'/\\ä↭𝕐',
572 'uploader': 'Philipp Hagemeister',
573 'uploader_id': 'phihag',
ec85ded8 574 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/phihag',
11b56058 575 'upload_date': '20121002',
7caf9830 576 'license': 'Standard YouTube License',
11b56058
PM
577 'description': 'test chars: "\'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de .',
578 'categories': ['Science & Technology'],
579 'tags': ['youtube-dl'],
556dbe7f 580 'duration': 10,
11b56058
PM
581 'like_count': int,
582 'dislike_count': int,
34a7de29
S
583 },
584 'params': {
585 'skip_download': True,
586 },
11b56058 587 },
dd27fd17 588 {
2d3d2997 589 'url': 'https://www.youtube.com/watch?v=a9LDPn-MO4I',
4bc3a23e
PH
590 'note': '256k DASH audio (format 141) via DASH manifest',
591 'info_dict': {
592 'id': 'a9LDPn-MO4I',
593 'ext': 'm4a',
594 'upload_date': '20121002',
595 'uploader_id': '8KVIDEO',
ec85ded8 596 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/8KVIDEO',
4bc3a23e
PH
597 'description': '',
598 'uploader': '8KVIDEO',
7caf9830 599 'license': 'Standard YouTube License',
4bc3a23e 600 'title': 'UHDTV TEST 8K VIDEO.mp4'
4919603f 601 },
4bc3a23e
PH
602 'params': {
603 'youtube_include_dash_manifest': True,
604 'format': '141',
4919603f 605 },
de3c7fe0 606 'skip': 'format 141 not served anymore',
dd27fd17 607 },
3489b7d2
JMF
608 # DASH manifest with encrypted signature
609 {
78caa52a
PH
610 'url': 'https://www.youtube.com/watch?v=IB3lcPjvWLA',
611 'info_dict': {
612 'id': 'IB3lcPjvWLA',
613 'ext': 'm4a',
b766eb27 614 'title': 'Afrojack, Spree Wilson - The Spark ft. Spree Wilson',
eb6793ba 615 'description': 'md5:1900ed86ee514927b9e00fbead6969a5',
556dbe7f 616 'duration': 244,
78caa52a
PH
617 'uploader': 'AfrojackVEVO',
618 'uploader_id': 'AfrojackVEVO',
619 'upload_date': '20131011',
7caf9830 620 'license': 'Standard YouTube License',
3489b7d2 621 },
4bc3a23e 622 'params': {
78caa52a 623 'youtube_include_dash_manifest': True,
de3c7fe0 624 'format': '141/bestaudio[ext=m4a]',
3489b7d2
JMF
625 },
626 },
aaeb86f6
S
627 # JS player signature function name containing $
628 {
629 'url': 'https://www.youtube.com/watch?v=nfWlot6h_JM',
630 'info_dict': {
631 'id': 'nfWlot6h_JM',
632 'ext': 'm4a',
633 'title': 'Taylor Swift - Shake It Off',
0cb58b02 634 'alt_title': 'Shake It Off',
f57b7835 635 'description': 'md5:95f66187cd7c8b2c13eb78e1223b63c3',
556dbe7f 636 'duration': 242,
aaeb86f6
S
637 'uploader': 'TaylorSwiftVEVO',
638 'uploader_id': 'TaylorSwiftVEVO',
639 'upload_date': '20140818',
7caf9830 640 'license': 'Standard YouTube License',
0cb58b02 641 'creator': 'Taylor Swift',
aaeb86f6
S
642 },
643 'params': {
644 'youtube_include_dash_manifest': True,
de3c7fe0 645 'format': '141/bestaudio[ext=m4a]',
aaeb86f6
S
646 },
647 },
aa79ac0c
PH
648 # Controversy video
649 {
650 'url': 'https://www.youtube.com/watch?v=T4XJQO3qol8',
651 'info_dict': {
652 'id': 'T4XJQO3qol8',
653 'ext': 'mp4',
556dbe7f 654 'duration': 219,
aa79ac0c 655 'upload_date': '20100909',
eb6793ba 656 'uploader': 'TJ Kirk',
aa79ac0c 657 'uploader_id': 'TheAmazingAtheist',
ec85ded8 658 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/TheAmazingAtheist',
7caf9830 659 'license': 'Standard YouTube License',
aa79ac0c
PH
660 'title': 'Burning Everyone\'s Koran',
661 'description': 'SUBSCRIBE: http://www.youtube.com/saturninefilms\n\nEven Obama has taken a stand against freedom on this issue: http://www.huffingtonpost.com/2010/09/09/obama-gma-interview-quran_n_710282.html',
662 }
c522adb1
JMF
663 },
664 # Normal age-gate video (No vevo, embed allowed)
665 {
2d3d2997 666 'url': 'https://youtube.com/watch?v=HtVdAasjOgU',
c522adb1
JMF
667 'info_dict': {
668 'id': 'HtVdAasjOgU',
669 'ext': 'mp4',
670 'title': 'The Witcher 3: Wild Hunt - The Sword Of Destiny Trailer',
ec85ded8 671 'description': r're:(?s).{100,}About the Game\n.*?The Witcher 3: Wild Hunt.{100,}',
556dbe7f 672 'duration': 142,
c522adb1
JMF
673 'uploader': 'The Witcher',
674 'uploader_id': 'WitcherGame',
ec85ded8 675 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/WitcherGame',
c522adb1 676 'upload_date': '20140605',
7caf9830 677 'license': 'Standard YouTube License',
34952f09 678 'age_limit': 18,
c522adb1
JMF
679 },
680 },
fccae2b9
S
681 # Age-gate video with encrypted signature
682 {
2d3d2997 683 'url': 'https://www.youtube.com/watch?v=6kLq3WMV1nU',
fccae2b9
S
684 'info_dict': {
685 'id': '6kLq3WMV1nU',
eb6793ba 686 'ext': 'webm',
fccae2b9
S
687 'title': 'Dedication To My Ex (Miss That) (Lyric Video)',
688 'description': 'md5:33765bb339e1b47e7e72b5490139bb41',
eb6793ba 689 'duration': 246,
fccae2b9
S
690 'uploader': 'LloydVEVO',
691 'uploader_id': 'LloydVEVO',
ec85ded8 692 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/LloydVEVO',
fccae2b9 693 'upload_date': '20110629',
7caf9830 694 'license': 'Standard YouTube License',
34952f09 695 'age_limit': 18,
fccae2b9
S
696 },
697 },
774e208f 698 # video_info is None (https://github.com/rg3/youtube-dl/issues/4421)
7d02dcfa 699 # YouTube Red ad is not captured for creator
774e208f
PH
700 {
701 'url': '__2ABJjxzNo',
702 'info_dict': {
703 'id': '__2ABJjxzNo',
704 'ext': 'mp4',
556dbe7f 705 'duration': 266,
774e208f
PH
706 'upload_date': '20100430',
707 'uploader_id': 'deadmau5',
ec85ded8 708 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/deadmau5',
0cb58b02 709 'creator': 'deadmau5',
774e208f
PH
710 'description': 'md5:12c56784b8032162bb936a5f76d55360',
711 'uploader': 'deadmau5',
7caf9830 712 'license': 'Standard YouTube License',
774e208f 713 'title': 'Deadmau5 - Some Chords (HD)',
0cb58b02 714 'alt_title': 'Some Chords',
774e208f
PH
715 },
716 'expected_warnings': [
717 'DASH manifest missing',
718 ]
e52a40ab
PH
719 },
720 # Olympics (https://github.com/rg3/youtube-dl/issues/4431)
721 {
722 'url': 'lqQg6PlCWgI',
723 'info_dict': {
724 'id': 'lqQg6PlCWgI',
725 'ext': 'mp4',
556dbe7f 726 'duration': 6085,
90227264 727 'upload_date': '20150827',
cbe2bd91 728 'uploader_id': 'olympic',
ec85ded8 729 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/olympic',
7caf9830 730 'license': 'Standard YouTube License',
cbe2bd91 731 'description': 'HO09 - Women - GER-AUS - Hockey - 31 July 2012 - London 2012 Olympic Games',
be49068d 732 'uploader': 'Olympic',
cbe2bd91
PH
733 'title': 'Hockey - Women - GER-AUS - London 2012 Olympic Games',
734 },
735 'params': {
736 'skip_download': 'requires avconv',
e52a40ab 737 }
cbe2bd91 738 },
6271f1ca
PH
739 # Non-square pixels
740 {
741 'url': 'https://www.youtube.com/watch?v=_b-2C3KPAM0',
742 'info_dict': {
743 'id': '_b-2C3KPAM0',
744 'ext': 'mp4',
745 'stretched_ratio': 16 / 9.,
556dbe7f 746 'duration': 85,
6271f1ca
PH
747 'upload_date': '20110310',
748 'uploader_id': 'AllenMeow',
ec85ded8 749 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/AllenMeow',
6271f1ca 750 'description': 'made by Wacom from Korea | 字幕&加油添醋 by TY\'s Allen | 感謝heylisa00cavey1001同學熱情提供梗及翻譯',
eb6793ba 751 'uploader': '孫ᄋᄅ',
7caf9830 752 'license': 'Standard YouTube License',
6271f1ca
PH
753 'title': '[A-made] 變態妍字幕版 太妍 我就是這樣的人',
754 },
06b491eb
S
755 },
756 # url_encoded_fmt_stream_map is empty string
757 {
758 'url': 'qEJwOuvDf7I',
759 'info_dict': {
760 'id': 'qEJwOuvDf7I',
f57b7835 761 'ext': 'webm',
06b491eb
S
762 'title': 'Обсуждение судебной практики по выборам 14 сентября 2014 года в Санкт-Петербурге',
763 'description': '',
764 'upload_date': '20150404',
765 'uploader_id': 'spbelect',
766 'uploader': 'Наблюдатели Петербурга',
767 },
768 'params': {
769 'skip_download': 'requires avconv',
e323cf3f
S
770 },
771 'skip': 'This live event has ended.',
06b491eb 772 },
da77d856
S
773 # Extraction from multiple DASH manifests (https://github.com/rg3/youtube-dl/pull/6097)
774 {
775 'url': 'https://www.youtube.com/watch?v=FIl7x6_3R5Y',
776 'info_dict': {
777 'id': 'FIl7x6_3R5Y',
eb6793ba 778 'ext': 'webm',
da77d856
S
779 'title': 'md5:7b81415841e02ecd4313668cde88737a',
780 'description': 'md5:116377fd2963b81ec4ce64b542173306',
556dbe7f 781 'duration': 220,
da77d856
S
782 'upload_date': '20150625',
783 'uploader_id': 'dorappi2000',
ec85ded8 784 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/dorappi2000',
da77d856 785 'uploader': 'dorappi2000',
7caf9830 786 'license': 'Standard YouTube License',
eb6793ba 787 'formats': 'mincount:31',
da77d856 788 },
eb6793ba 789 'skip': 'not actual anymore',
2ee8f5d8 790 },
8a1a26ce
YCH
791 # DASH manifest with segment_list
792 {
793 'url': 'https://www.youtube.com/embed/CsmdDsKjzN8',
794 'md5': '8ce563a1d667b599d21064e982ab9e31',
795 'info_dict': {
796 'id': 'CsmdDsKjzN8',
797 'ext': 'mp4',
17ee98e1 798 'upload_date': '20150501', # According to '<meta itemprop="datePublished"', but in other places it's 20150510
8a1a26ce
YCH
799 'uploader': 'Airtek',
800 'description': 'Retransmisión en directo de la XVIII media maratón de Zaragoza.',
801 'uploader_id': 'UCzTzUmjXxxacNnL8I3m4LnQ',
7caf9830 802 'license': 'Standard YouTube License',
8a1a26ce
YCH
803 'title': 'Retransmisión XVIII Media maratón Zaragoza 2015',
804 },
805 'params': {
806 'youtube_include_dash_manifest': True,
807 'format': '135', # bestvideo
be49068d
S
808 },
809 'skip': 'This live event has ended.',
2ee8f5d8 810 },
cf7e015f
S
811 {
812 # Multifeed videos (multiple cameras), URL is for Main Camera
813 'url': 'https://www.youtube.com/watch?v=jqWvoWXjCVs',
814 'info_dict': {
815 'id': 'jqWvoWXjCVs',
816 'title': 'teamPGP: Rocket League Noob Stream',
817 'description': 'md5:dc7872fb300e143831327f1bae3af010',
818 },
819 'playlist': [{
820 'info_dict': {
821 'id': 'jqWvoWXjCVs',
822 'ext': 'mp4',
823 'title': 'teamPGP: Rocket League Noob Stream (Main Camera)',
824 'description': 'md5:dc7872fb300e143831327f1bae3af010',
556dbe7f 825 'duration': 7335,
cf7e015f
S
826 'upload_date': '20150721',
827 'uploader': 'Beer Games Beer',
828 'uploader_id': 'beergamesbeer',
ec85ded8 829 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/beergamesbeer',
7caf9830 830 'license': 'Standard YouTube License',
cf7e015f
S
831 },
832 }, {
833 'info_dict': {
834 'id': '6h8e8xoXJzg',
835 'ext': 'mp4',
836 'title': 'teamPGP: Rocket League Noob Stream (kreestuh)',
837 'description': 'md5:dc7872fb300e143831327f1bae3af010',
556dbe7f 838 'duration': 7337,
cf7e015f
S
839 'upload_date': '20150721',
840 'uploader': 'Beer Games Beer',
841 'uploader_id': 'beergamesbeer',
ec85ded8 842 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/beergamesbeer',
7caf9830 843 'license': 'Standard YouTube License',
cf7e015f
S
844 },
845 }, {
846 'info_dict': {
847 'id': 'PUOgX5z9xZw',
848 'ext': 'mp4',
849 'title': 'teamPGP: Rocket League Noob Stream (grizzle)',
850 'description': 'md5:dc7872fb300e143831327f1bae3af010',
556dbe7f 851 'duration': 7337,
cf7e015f
S
852 'upload_date': '20150721',
853 'uploader': 'Beer Games Beer',
854 'uploader_id': 'beergamesbeer',
ec85ded8 855 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/beergamesbeer',
7caf9830 856 'license': 'Standard YouTube License',
cf7e015f
S
857 },
858 }, {
859 'info_dict': {
860 'id': 'teuwxikvS5k',
861 'ext': 'mp4',
862 'title': 'teamPGP: Rocket League Noob Stream (zim)',
863 'description': 'md5:dc7872fb300e143831327f1bae3af010',
556dbe7f 864 'duration': 7334,
cf7e015f
S
865 'upload_date': '20150721',
866 'uploader': 'Beer Games Beer',
867 'uploader_id': 'beergamesbeer',
ec85ded8 868 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/beergamesbeer',
7caf9830 869 'license': 'Standard YouTube License',
cf7e015f
S
870 },
871 }],
872 'params': {
873 'skip_download': True,
874 },
cbaed4bb 875 },
f9f49d87
S
876 {
877 # Multifeed video with comma in title (see https://github.com/rg3/youtube-dl/issues/8536)
878 'url': 'https://www.youtube.com/watch?v=gVfLd0zydlo',
879 'info_dict': {
880 'id': 'gVfLd0zydlo',
881 'title': 'DevConf.cz 2016 Day 2 Workshops 1 14:00 - 15:30',
882 },
883 'playlist_count': 2,
be49068d 884 'skip': 'Not multifeed anymore',
f9f49d87 885 },
cbaed4bb 886 {
2d3d2997 887 'url': 'https://vid.plus/FlRa-iH7PGw',
cbaed4bb 888 'only_matching': True,
0e49d9a6 889 },
6d4fc66b 890 {
2d3d2997 891 'url': 'https://zwearz.com/watch/9lWxNJF-ufM/electra-woman-dyna-girl-official-trailer-grace-helbig.html',
6d4fc66b
S
892 'only_matching': True,
893 },
0e49d9a6 894 {
61f92af1 895 # Title with JS-like syntax "};" (see https://github.com/rg3/youtube-dl/issues/7468)
a8776b10
S
896 # Also tests cut-off URL expansion in video description (see
897 # https://github.com/rg3/youtube-dl/issues/1892,
898 # https://github.com/rg3/youtube-dl/issues/8164)
0e49d9a6
LL
899 'url': 'https://www.youtube.com/watch?v=lsguqyKfVQg',
900 'info_dict': {
901 'id': 'lsguqyKfVQg',
902 'ext': 'mp4',
903 'title': '{dark walk}; Loki/AC/Dishonored; collab w/Elflover21',
eb6793ba 904 'alt_title': 'Dark Walk - Position Music',
0e49d9a6 905 'description': 'md5:8085699c11dc3f597ce0410b0dcbb34a',
556dbe7f 906 'duration': 133,
0e49d9a6
LL
907 'upload_date': '20151119',
908 'uploader_id': 'IronSoulElf',
ec85ded8 909 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/IronSoulElf',
0e49d9a6 910 'uploader': 'IronSoulElf',
7caf9830 911 'license': 'Standard YouTube License',
eb6793ba
S
912 'creator': 'Todd Haberman, Daniel Law Heath and Aaron Kaplan',
913 'track': 'Dark Walk - Position Music',
914 'artist': 'Todd Haberman, Daniel Law Heath and Aaron Kaplan',
0e49d9a6
LL
915 },
916 'params': {
917 'skip_download': True,
918 },
919 },
61f92af1
S
920 {
921 # Tags with '};' (see https://github.com/rg3/youtube-dl/issues/7468)
922 'url': 'https://www.youtube.com/watch?v=Ms7iBXnlUO8',
923 'only_matching': True,
924 },
313dfc45
LL
925 {
926 # Video with yt:stretch=17:0
927 'url': 'https://www.youtube.com/watch?v=Q39EVAstoRM',
928 'info_dict': {
929 'id': 'Q39EVAstoRM',
930 'ext': 'mp4',
931 'title': 'Clash Of Clans#14 Dicas De Ataque Para CV 4',
932 'description': 'md5:ee18a25c350637c8faff806845bddee9',
933 'upload_date': '20151107',
934 'uploader_id': 'UCCr7TALkRbo3EtFzETQF1LA',
935 'uploader': 'CH GAMER DROID',
936 },
937 'params': {
938 'skip_download': True,
939 },
be49068d 940 'skip': 'This video does not exist.',
313dfc45 941 },
7caf9830
S
942 {
943 # Video licensed under Creative Commons
944 'url': 'https://www.youtube.com/watch?v=M4gD1WSo5mA',
945 'info_dict': {
946 'id': 'M4gD1WSo5mA',
947 'ext': 'mp4',
948 'title': 'md5:e41008789470fc2533a3252216f1c1d1',
949 'description': 'md5:a677553cf0840649b731a3024aeff4cc',
556dbe7f 950 'duration': 721,
7caf9830
S
951 'upload_date': '20150127',
952 'uploader_id': 'BerkmanCenter',
ec85ded8 953 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/BerkmanCenter',
556dbe7f 954 'uploader': 'The Berkman Klein Center for Internet & Society',
7caf9830
S
955 'license': 'Creative Commons Attribution license (reuse allowed)',
956 },
957 'params': {
958 'skip_download': True,
959 },
960 },
fd050249
S
961 {
962 # Channel-like uploader_url
963 'url': 'https://www.youtube.com/watch?v=eQcmzGIKrzg',
964 'info_dict': {
965 'id': 'eQcmzGIKrzg',
966 'ext': 'mp4',
967 'title': 'Democratic Socialism and Foreign Policy | Bernie Sanders',
968 'description': 'md5:dda0d780d5a6e120758d1711d062a867',
556dbe7f 969 'duration': 4060,
fd050249 970 'upload_date': '20151119',
eb6793ba 971 'uploader': 'Bernie Sanders',
fd050249 972 'uploader_id': 'UCH1dpzjCEiGAt8CXkryhkZg',
ec85ded8 973 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UCH1dpzjCEiGAt8CXkryhkZg',
fd050249
S
974 'license': 'Creative Commons Attribution license (reuse allowed)',
975 },
976 'params': {
977 'skip_download': True,
978 },
979 },
040ac686
S
980 {
981 'url': 'https://www.youtube.com/watch?feature=player_embedded&amp;amp;v=V36LpHqtcDY',
982 'only_matching': True,
7f29cf54
S
983 },
984 {
985 # YouTube Red paid video (https://github.com/rg3/youtube-dl/issues/10059)
986 'url': 'https://www.youtube.com/watch?v=i1Ko8UG-Tdo',
987 'only_matching': True,
6496ccb4
S
988 },
989 {
990 # Rental video preview
991 'url': 'https://www.youtube.com/watch?v=yYr8q0y5Jfg',
992 'info_dict': {
993 'id': 'uGpuVWrhIzE',
994 'ext': 'mp4',
995 'title': 'Piku - Trailer',
996 'description': 'md5:c36bd60c3fd6f1954086c083c72092eb',
997 'upload_date': '20150811',
998 'uploader': 'FlixMatrix',
999 'uploader_id': 'FlixMatrixKaravan',
ec85ded8 1000 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/FlixMatrixKaravan',
6496ccb4
S
1001 'license': 'Standard YouTube License',
1002 },
1003 'params': {
1004 'skip_download': True,
1005 },
eb6793ba 1006 'skip': 'This video is not available.',
022a5d66 1007 },
12afdc2a
S
1008 {
1009 # YouTube Red video with episode data
1010 'url': 'https://www.youtube.com/watch?v=iqKdEhx-dD4',
1011 'info_dict': {
1012 'id': 'iqKdEhx-dD4',
1013 'ext': 'mp4',
1014 'title': 'Isolation - Mind Field (Ep 1)',
eb6793ba 1015 'description': 'md5:25b78d2f64ae81719f5c96319889b736',
556dbe7f 1016 'duration': 2085,
12afdc2a
S
1017 'upload_date': '20170118',
1018 'uploader': 'Vsauce',
1019 'uploader_id': 'Vsauce',
1020 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/Vsauce',
1021 'license': 'Standard YouTube License',
1022 'series': 'Mind Field',
1023 'season_number': 1,
1024 'episode_number': 1,
1025 },
1026 'params': {
1027 'skip_download': True,
1028 },
1029 'expected_warnings': [
1030 'Skipping DASH manifest',
1031 ],
1032 },
c7121fa7
S
1033 {
1034 # The following content has been identified by the YouTube community
1035 # as inappropriate or offensive to some audiences.
1036 'url': 'https://www.youtube.com/watch?v=6SJNVb0GnPI',
1037 'info_dict': {
1038 'id': '6SJNVb0GnPI',
1039 'ext': 'mp4',
1040 'title': 'Race Differences in Intelligence',
1041 'description': 'md5:5d161533167390427a1f8ee89a1fc6f1',
1042 'duration': 965,
1043 'upload_date': '20140124',
1044 'uploader': 'New Century Foundation',
1045 'uploader_id': 'UCEJYpZGqgUob0zVVEaLhvVg',
1046 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UCEJYpZGqgUob0zVVEaLhvVg',
1047 'license': 'Standard YouTube License',
c7121fa7
S
1048 },
1049 'params': {
1050 'skip_download': True,
1051 },
1052 },
022a5d66
S
1053 {
1054 # itag 212
1055 'url': '1t24XAntNCY',
1056 'only_matching': True,
fd5c4aab
S
1057 },
1058 {
1059 # geo restricted to JP
1060 'url': 'sJL6WA-aGkQ',
1061 'only_matching': True,
1062 },
d0ba5587
S
1063 {
1064 'url': 'https://www.youtube.com/watch?v=MuAGGZNfUkU&list=RDMM',
1065 'only_matching': True,
1066 },
2eb88d95
PH
1067 ]
1068
def __init__(self, *args, **kwargs):
    """Set up the extractor and start with an empty player cache."""
    super(YoutubeIE, self).__init__(*args, **kwargs)
    # Signature functions already extracted during this run; filled by
    # _decrypt_signature, keyed by (player URL, signature cache id).
    self._player_cache = dict()
e0df6211 1072
def report_video_info_webpage_download(self, video_id):
    """Tell the user that the video info webpage is being downloaded."""
    message = '%s: Downloading video info webpage' % video_id
    self.to_screen(message)
c5e8d7af 1076
def report_information_extraction(self, video_id):
    """Tell the user that video information is being extracted."""
    message = '%s: Extracting video information' % video_id
    self.to_screen(message)
c5e8d7af
PH
1080
def report_unavailable_format(self, video_id, format):
    """Report that the requested format is not available for this video.

    video_id and format are interpolated into the message as given.
    """
    # `format` shadows the builtin, but it is part of the existing
    # signature and is kept so keyword callers keep working.
    self.to_screen('%s: Format %s not available' % (video_id, format))
c5e8d7af
PH
1084
def report_rtmp_download(self):
    """Tell the user that an RTMP download was detected."""
    message = 'RTMP download detected'
    self.to_screen(message)
c5e8d7af 1088
def _signature_cache_id(self, example_sig):
    """Describe a signature by the lengths of its dot-separated parts.

    The lengths are joined with dots, so a signature consisting of a
    3-character and a 2-character part yields '3.2'.
    """
    part_lengths = [compat_str(len(chunk)) for chunk in example_sig.split('.')]
    return '.'.join(part_lengths)
60064c53
PH
1092
def _extract_signature_function(self, video_id, player_url, example_sig):
    """Return a callable that deciphers signatures for the given player.

    The player id and type (file extension) are parsed out of player_url.
    The 'youtube-sigfuncs' downloader cache is consulted first; on a miss
    the player is downloaded, parsed with _parse_sig_js or _parse_sig_swf,
    and the resulting index permutation is stored in the cache.

    Raises ExtractorError when player_url cannot be identified.
    """
    player_match = re.match(
        r'.*?-(?P<id>[a-zA-Z0-9_-]+)(?:/watch_as3|/html5player(?:-new)?|(?:/[a-z]{2}_[A-Z]{2})?/base)?\.(?P<ext>[a-z]+)$',
        player_url)
    if player_match is None:
        raise ExtractorError('Cannot identify player %r' % player_url)
    player_id = player_match.group('id')
    player_type = player_match.group('ext')

    # Cache entries are keyed by player type, player id and signature shape
    sig_shape = self._signature_cache_id(example_sig)
    func_id = '%s_%s_%s' % (player_type, player_id, sig_shape)
    # The cache key must be a bare file name, never a path
    assert os.path.basename(func_id) == func_id

    cached_indices = self._downloader.cache.load('youtube-sigfuncs', func_id)
    if cached_indices is not None:
        def apply_cached(s):
            # Rebuild the signature by picking characters in cached order
            return ''.join(s[i] for i in cached_indices)
        return apply_cached

    if self._downloader.params.get('verbose'):
        download_note = 'Downloading player %s' % player_url
    else:
        download_note = 'Downloading %s player %s' % (player_type, player_id)

    if player_type == 'js':
        player_code = self._download_webpage(
            player_url, video_id,
            note=download_note,
            errnote='Download of %s failed' % player_url)
        res = self._parse_sig_js(player_code)
    elif player_type == 'swf':
        handle = self._request_webpage(
            player_url, video_id,
            note=download_note,
            errnote='Download of %s failed' % player_url)
        player_code = handle.read()
        res = self._parse_sig_swf(player_code)
    else:
        assert False, 'Invalid player type %r' % player_type

    # Run the function on a string whose characters are their own
    # positions; the code points of the output are then the index
    # permutation, which is what gets cached.
    probe = ''.join(map(compat_chr, range(len(example_sig))))
    index_spec = list(map(ord, res(probe)))

    self._downloader.cache.store('youtube-sigfuncs', func_id, index_spec)
    return res
1138
def _print_sig_code(self, func, example_sig):
    """Print Python source equivalent to the signature function func.

    func is probed with a string whose characters are their own positions;
    the resulting index sequence is rendered as a sum of index and slice
    expressions on `s` and written out with to_screen.
    """
    def _slice_expr(first, last, stride):
        # Render the run first..last (inclusive, stepping by stride)
        head = str(first) if first != 0 else ''
        stop = last + stride
        tail = (':%d' % stop) if stop >= 0 else ':'
        stride_part = (':%d' % stride) if stride != 1 else ''
        return 's[%s%s%s]' % (head, tail, stride_part)

    def _index_exprs(indices):
        stride = None
        # Placeholder only; run_start is always assigned together with stride
        run_start = '(Never used)'
        for cur, prev in zip(indices[1:], indices[:-1]):
            delta = cur - prev
            if stride is not None:
                if delta != stride:
                    # The current run ended at prev
                    yield _slice_expr(run_start, prev, stride)
                    stride = None
            elif delta in (-1, 1):
                # A new run of consecutive indices starts at prev
                stride = delta
                run_start = prev
            else:
                yield 's[%d]' % prev
        if stride is None:
            yield 's[%d]' % cur
        else:
            yield _slice_expr(run_start, cur, stride)

    probe = ''.join(map(compat_chr, range(len(example_sig))))
    index_spec = list(map(ord, func(probe)))
    expr_code = ' + '.join(_index_exprs(index_spec))
    part_lengths = [compat_str(len(p)) for p in example_sig.split('.')]
    signature_id_tuple = '(%s)' % ', '.join(part_lengths)
    template = ('if tuple(len(p) for p in s.split(\'.\')) == %s:\n'
                ' return %s\n')
    code = template % (signature_id_tuple, expr_code)
    self.to_screen('Extracted signature function:\n' + code)
edf3e38e 1177
def _parse_sig_js(self, jscode):
    """Return a callable wrapping the signature function of a JS player.

    The function name is searched for in jscode, the function is extracted
    with JSInterpreter, and the returned callable passes its argument to
    it as a one-element argument list.
    """
    name_patterns = (
        r'(["\'])signature\1\s*,\s*(?P<sig>[a-zA-Z0-9$]+)\(',
        r'\.sig\|\|(?P<sig>[a-zA-Z0-9$]+)\(',
    )
    funcname = self._search_regex(
        name_patterns, jscode,
        'Initial JS player signature function name', group='sig')

    initial_function = JSInterpreter(jscode).extract_function(funcname)

    def decipher(s):
        return initial_function([s])
    return decipher
1187
def _parse_sig_swf(self, file_contents):
    """Return a callable wrapping the signature function of an SWF player.

    The 'decipher' function of the 'SignatureDecipher' class is extracted
    with SWFInterpreter; the returned callable passes its argument to it
    as a one-element argument list.
    """
    TARGET_CLASSNAME = 'SignatureDecipher'
    interpreter = SWFInterpreter(file_contents)
    decipher_class = interpreter.extract_class(TARGET_CLASSNAME)
    initial_function = interpreter.extract_function(decipher_class, 'decipher')

    def decipher(s):
        return initial_function([s])
    return decipher
1194
def _decrypt_signature(self, s, video_id, player_url, age_gate=False):
    """Turn the encrypted s field into a working signature.

    player_url may be protocol-relative or site-relative; it is made
    absolute before use. Signature functions are cached per
    (player URL, signature shape) in self._player_cache.

    Raises ExtractorError when player_url is None, or wrapping any error
    that occurs while extracting or applying the signature function.
    """
    if player_url is None:
        raise ExtractorError('Cannot decrypt signature without player_url')

    if player_url.startswith('//'):
        player_url = 'https:' + player_url
    elif re.match(r'https?://', player_url) is None:
        player_url = compat_urlparse.urljoin(
            'https://www.youtube.com', player_url)

    try:
        cache_key = (player_url, self._signature_cache_id(s))
        if cache_key not in self._player_cache:
            self._player_cache[cache_key] = self._extract_signature_function(
                video_id, player_url, s)
        decipher = self._player_cache[cache_key]
        if self._downloader.params.get('youtube_print_sig_code'):
            self._print_sig_code(decipher, s)
        return decipher(s)
    except Exception as e:
        # Include the full traceback so a failure can be diagnosed from
        # the error message alone
        raise ExtractorError(
            'Signature extraction failed: ' + traceback.format_exc(), cause=e)
e0df6211 1221
def _get_subtitles(self, video_id, webpage):
    """Return the subtitle tracks listed for video_id.

    The track list is downloaded as XML; for every language (first track
    per language wins) one entry per format in _SUBTITLE_FORMATS is built.
    Returns a dict mapping a language code to a list of
    {'url': ..., 'ext': ...} dicts, or an empty dict (after a warning)
    when the list cannot be downloaded or contains no tracks.
    webpage is not used here.
    """
    list_url = 'https://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id
    try:
        subs_doc = self._download_xml(list_url, video_id, note=False)
    except ExtractorError as err:
        self._downloader.report_warning('unable to download video subtitles: %s' % error_to_compat_str(err))
        return {}

    subtitles = {}
    for track in subs_doc.findall('track'):
        lang = track.attrib['lang_code']
        if lang in subtitles:
            continue
        subtitles[lang] = [{
            'url': 'https://www.youtube.com/api/timedtext?' + compat_urllib_parse_urlencode({
                'lang': lang,
                'v': video_id,
                'fmt': ext,
                'name': track.attrib['name'].encode('utf-8'),
            }),
            'ext': ext,
        } for ext in self._SUBTITLE_FORMATS]

    if subtitles:
        return subtitles
    self._downloader.report_warning('video doesn\'t have subtitles')
    return {}
1253
def _get_ytplayer_config(self, video_id, webpage):
    """Return the parsed ytplayer.config object found in webpage.

    Returns None when no config is found; JSON parsing is non-fatal.
    """
    # User data may contain arbitrary character sequences that break
    # regex based JSON extraction: when '};' occurs inside the data, the
    # second pattern stops too early. As a workaround the more specific
    # pattern is tried first, until proper quoted string handling
    # replaces this (see
    # https://github.com/rg3/youtube-dl/issues/7468,
    # https://github.com/rg3/youtube-dl/pull/7599)
    config_patterns = (
        r';ytplayer\.config\s*=\s*({.+?});ytplayer',
        r';ytplayer\.config\s*=\s*({.+?});',
    )
    raw_config = self._search_regex(
        config_patterns, webpage, 'ytplayer.config', default=None)
    if not raw_config:
        return None
    return self._parse_json(
        uppercase_escape(raw_config), video_id, fatal=False)
0e49d9a6 1271
def _get_automatic_captions(self, video_id, webpage):
    """Collect the automatic (translated) caption tracks of a video.

    The already downloaded watch page is passed in as webpage so the
    player config can be read from it without another request.

    Returns a dict mapping a language code to a list of
    {'url': ..., 'ext': ...} dicts, one per format in _SUBTITLE_FORMATS.
    An empty dict is returned, after a warning, when nothing is found.
    """
    self.to_screen('%s: Looking for automatic captions' % video_id)
    player_config = self._get_ytplayer_config(video_id, webpage)
    err_msg = 'Couldn\'t find automatic captions for %s' % video_id
    if not player_config:
        self._downloader.report_warning(err_msg)
        return {}

    def make_captions(sub_url, sub_langs):
        # Build one URL per (language, format) from a base caption URL by
        # overriding its 'tlang' and 'fmt' query parameters
        url_parts = compat_urllib_parse_urlparse(sub_url)
        query = compat_parse_qs(url_parts.query)
        captions = {}
        for target_lang in sub_langs:
            formats = []
            for ext in self._SUBTITLE_FORMATS:
                query['tlang'] = [target_lang]
                query['fmt'] = [ext]
                formats.append({
                    'url': compat_urlparse.urlunparse(url_parts._replace(
                        query=compat_urllib_parse_urlencode(query, True))),
                    'ext': ext,
                })
            captions[target_lang] = formats
        return captions

    try:
        args = player_config['args']
        caption_url = args.get('ttsurl')
        if caption_url:
            timestamp = args['timestamp']
            # Fetch the list of available tracks and translation targets
            list_url = caption_url + '&' + compat_urllib_parse_urlencode({
                'type': 'list',
                'tlangs': 1,
                'asrs': 1,
            })
            caption_list = self._download_xml(list_url, video_id)
            original_track = caption_list.find('track')
            if original_track is None:
                self._downloader.report_warning('Video doesn\'t have automatic captions')
                return {}
            original_lang = original_track.attrib['lang_code']
            caption_kind = original_track.attrib.get('kind', '')

            captions = {}
            for target in caption_list.findall('target'):
                target_lang = target.attrib['lang_code']
                captions[target_lang] = [{
                    'url': caption_url + '&' + compat_urllib_parse_urlencode({
                        'lang': original_lang,
                        'tlang': target_lang,
                        'fmt': ext,
                        'ts': timestamp,
                        'kind': caption_kind,
                    }),
                    'ext': ext,
                } for ext in self._SUBTITLE_FORMATS]
            return captions

        # New captions format as of 22.06.2017
        player_response = args.get('player_response')
        if player_response and isinstance(player_response, compat_str):
            player_response = self._parse_json(
                player_response, video_id, fatal=False)
            if player_response:
                renderer = player_response['captions']['playerCaptionsTracklistRenderer']
                base_url = renderer['captionTracks'][0]['baseUrl']
                lang_codes = [
                    lang.get('languageCode')
                    for lang in renderer['translationLanguages']]
                return make_captions(
                    base_url, [code for code in lang_codes if code])

        # Some videos don't provide ttsurl but rather caption_tracks and
        # caption_translation_languages (e.g. 20LmZk1hakA)
        # Not used anymore as of 22.06.2017
        caption_tracks = args['caption_tracks']
        caption_translation_languages = args['caption_translation_languages']
        caption_url = compat_parse_qs(caption_tracks.split(',')[0])['u'][0]
        target_langs = []
        for lang in caption_translation_languages.split(','):
            lang_qs = compat_parse_qs(compat_urllib_parse_unquote_plus(lang))
            target_lang = lang_qs.get('lc', [None])[0]
            if target_lang:
                target_langs.append(target_lang)
        return make_captions(caption_url, target_langs)
    # An extractor error can be raised by the download process if there
    # are no automatic captions but there are subtitles
    except (KeyError, IndexError, ExtractorError):
        self._downloader.report_warning(err_msg)
        return {}
1373
def _mark_watched(self, video_id, video_info):
    """Request the playback stats URL so the video is marked as watched.

    Does nothing when video_info carries no
    'videostats_playback_base_url'. The request is non-fatal.
    """
    playback_url = video_info.get('videostats_playback_base_url', [None])[0]
    if not playback_url:
        return
    parsed_playback_url = compat_urlparse.urlparse(playback_url)
    qs = compat_urlparse.parse_qs(parsed_playback_url.query)

    # cpn generation algorithm is reverse engineered from base.js.
    # In fact it works even with dummy cpn.
    CPN_ALPHABET = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789-_'
    # random.choice picks uniformly from the 64 characters. The former
    # `random.randint(0, 256) & 63` drew from 257 values (randint includes
    # its upper bound), which slightly over-represented index 0.
    cpn = ''.join(random.choice(CPN_ALPHABET) for _ in range(16))

    qs.update({
        'ver': ['2'],
        'cpn': [cpn],
    })
    playback_url = compat_urlparse.urlunparse(
        parsed_playback_url._replace(query=compat_urllib_parse_urlencode(qs, True)))

    self._download_webpage(
        playback_url, video_id, 'Marking watched',
        'Unable to mark watched', fatal=False)
1396
@staticmethod
def _extract_urls(webpage):
    """Return all YouTube embeds found in webpage, in order of discovery.

    Three kinds of embeds are collected: regular player embeds (URLs),
    lazyYT embeds and the Wordpress "YouTube Video Importer" plugin (for
    the latter two the value of the data attribute is returned).
    An empty list is returned when nothing matches.
    """
    # Embedded YouTube player
    # NOTE: the embedSWF alternative used to read `embedSWF\(?:\s*`, a
    # garbled group that required a literal ':' after the name and thus
    # never matched a real `embedSWF("...")` call.
    entries = [
        unescapeHTML(mobj.group('url'))
        for mobj in re.finditer(r'''(?x)
        (?:
            <iframe[^>]+?src=|
            data-video-url=|
            <embed[^>]+?src=|
            embedSWF\(\s*|
            <object[^>]+data=|
            new\s+SWFObject\(
        )
        (["\'])
            (?P<url>(?:https?:)?//(?:www\.)?youtube(?:-nocookie)?\.com/
            (?:embed|v|p)/[0-9A-Za-z_-]{11}.*?)
        \1''', webpage)]

    # lazyYT YouTube embed
    entries.extend(
        unescapeHTML(video_ref) for video_ref in
        re.findall(r'class="lazyYT" data-youtube-id="([^"]+)"', webpage))

    # Wordpress "YouTube Video Importer" plugin
    matches = re.findall(r'''(?x)<div[^>]+
        class=(?P<q1>[\'"])[^\'"]*\byvii_single_video_player\b[^\'"]*(?P=q1)[^>]+
        data-video_id=(?P<q2>[\'"])([^\'"]+)(?P=q2)''', webpage)
    # Each match is a (q1, q2, video id) tuple; only the id is wanted
    entries.extend(m[-1] for m in matches)

    return entries
1428
@staticmethod
def _extract_url(webpage):
    """Return the first YouTube embed found in webpage, or None."""
    found = YoutubeIE._extract_urls(webpage)
    if not found:
        return None
    return found[0]
1433
@classmethod
def extract_id(cls, url):
    """Return the video id contained in url.

    The id is the second group of a verbose-mode match of _VALID_URL.
    Raises ExtractorError when url does not match.
    """
    match = re.match(cls._VALID_URL, url, re.VERBOSE)
    if match is not None:
        return match.group(2)
    raise ExtractorError('Invalid URL: %s' % url)
1441
def _extract_annotations(self, video_id):
    """Download and return the annotations document of video_id."""
    annotations_url = 'https://www.youtube.com/annotations_invideo?features=1&legacy=1&video_id=%s' % video_id
    return self._download_webpage(
        annotations_url, video_id,
        note='Searching for annotations.',
        errnote='Unable to download video annotations.')
1fb07d10 1445
@staticmethod
def _extract_chapters(description, duration):
    """Derive chapters from seek links in an HTML video description.

    Every description line containing a player seekTo link with a time
    stamp starts a chapter; a chapter ends where the next one starts, the
    last one at duration. End times are clamped to duration.

    Returns a list of {'start_time', 'end_time', 'title'} dicts, or None
    when description is empty, duration is unknown (None) or no chapter
    line is found.
    """
    # Without a duration the chapters cannot be bounded; comparing a
    # number with None would raise TypeError on Python 3.
    if not description or duration is None:
        return None
    chapter_lines = re.findall(
        r'(?:^|<br\s*/>)([^<]*<a[^>]+onclick=["\']yt\.www\.watch\.player\.seekTo[^>]+>(\d{1,2}:\d{1,2}(?::\d{1,2})?)</a>[^>]*)(?=$|<br\s*/>)',
        description)
    if not chapter_lines:
        return None
    chapters = []
    # next_num is the index of the following chapter line (1-based count)
    for next_num, (chapter_line, time_point) in enumerate(
            chapter_lines, start=1):
        start_time = parse_duration(time_point)
        if start_time is None:
            continue
        if start_time > duration:
            break
        end_time = (duration if next_num == len(chapter_lines)
                    else parse_duration(chapter_lines[next_num][1]))
        if end_time is None:
            continue
        if end_time > duration:
            end_time = duration
        # Time stamps going backwards: stop at the first such chapter
        if start_time > end_time:
            break
        # The title is the line without its link, trimmed and with
        # whitespace runs collapsed
        chapter_title = re.sub(
            r'<a[^>]+>[^<]+</a>', '', chapter_line).strip(' \t-')
        chapter_title = re.sub(r'\s+', ' ', chapter_title)
        chapters.append({
            'start_time': start_time,
            'end_time': end_time,
            'title': chapter_title,
        })
    return chapters
1480
def _real_extract(self, url):
    """Extract metadata and downloadable formats for a single YouTube video.

    The watch page is downloaded first; for age-gated videos the embed page
    and an age-gate flavoured ``get_video_info`` request are used instead,
    otherwise the embedded player config is combined with several
    ``get_video_info`` variants.  Formats are then collected from an RTMP
    ``conn`` entry, from ``url_encoded_fmt_stream_map``/``adaptive_fmts``
    (decrypting signatures where needed), or from an HLS manifest
    (``hlsvp``), and finally merged with the formats of any DASH manifests
    that were discovered.

    :param url: video URL, possibly carrying smuggled data
        (``force_singlefeed``) and ``t``/``start``/``end`` time parameters
        in its query string or fragment.
    :return: an info dict for the video; a ``url_result`` for rental
        previews (``ypc_vid``); or a ``playlist_result`` for multifeed
        videos unless ``noplaylist`` is set.
    :raises ExtractorError: when YouTube reports the video as unavailable,
        when no ``token`` is present in the video info, for unsupported
        rental/rtmpe videos, or when no format information is found.
    """
    url, smuggled_data = unsmuggle_url(url, {})

    proto = (
        'http' if self._downloader.params.get('prefer_insecure', False)
        else 'https')

    # Start/end offsets may be given either in the fragment or in the query;
    # the fragment is inspected first and the first value found wins.
    start_time = None
    end_time = None
    parsed_url = compat_urllib_parse_urlparse(url)
    for component in [parsed_url.fragment, parsed_url.query]:
        query = compat_parse_qs(component)
        if start_time is None and 't' in query:
            start_time = parse_duration(query['t'][0])
        if start_time is None and 'start' in query:
            start_time = parse_duration(query['start'][0])
        if end_time is None and 'end' in query:
            end_time = parse_duration(query['end'][0])

    # Extract original video URL from URL with redirection, like age verification, using next_url parameter
    mobj = re.search(self._NEXT_URL_RE, url)
    if mobj:
        url = proto + '://www.youtube.com/' + compat_urllib_parse_unquote(mobj.group(1)).lstrip('/')
    video_id = self.extract_id(url)

    # Get video webpage
    url = proto + '://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1&bpctr=9999999999' % video_id
    video_webpage = self._download_webpage(url, video_id)

    # Attempt to extract SWF player URL
    mobj = re.search(r'swfConfig.*?"(https?:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
    if mobj is not None:
        # Drop the backslashes used to escape the URL inside the page's JSON
        player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
    else:
        player_url = None

    dash_mpds = []

    def add_dash_mpd(video_info):
        # Remember each distinct DASH manifest URL once, in discovery order
        dash_mpd = video_info.get('dashmpd')
        if dash_mpd and dash_mpd[0] not in dash_mpds:
            dash_mpds.append(dash_mpd[0])

    is_live = None
    view_count = None

    def extract_view_count(v_info):
        return int_or_none(try_get(v_info, lambda x: x['view_count'][0]))

    # Get video info
    embed_webpage = None
    if re.search(r'player-age-gate-content">', video_webpage) is not None:
        age_gate = True
        # We simulate the access to the video from www.youtube.com/v/{video_id}
        # this can be viewed without login into Youtube
        url = proto + '://www.youtube.com/embed/%s' % video_id
        embed_webpage = self._download_webpage(url, video_id, 'Downloading embed webpage')
        data = compat_urllib_parse_urlencode({
            'video_id': video_id,
            'eurl': 'https://youtube.googleapis.com/v/' + video_id,
            'sts': self._search_regex(
                r'"sts"\s*:\s*(\d+)', embed_webpage, 'sts', default=''),
        })
        video_info_url = proto + '://www.youtube.com/get_video_info?' + data
        video_info_webpage = self._download_webpage(
            video_info_url, video_id,
            note='Refetching age-gated info webpage',
            errnote='unable to download video info webpage')
        video_info = compat_parse_qs(video_info_webpage)
        add_dash_mpd(video_info)
    else:
        age_gate = False
        video_info = None
        sts = None
        # Try looking directly into the video webpage
        ytplayer_config = self._get_ytplayer_config(video_id, video_webpage)
        if ytplayer_config:
            args = ytplayer_config['args']
            if args.get('url_encoded_fmt_stream_map') or args.get('hlsvp'):
                # Convert to the same format returned by compat_parse_qs
                video_info = dict((k, [v]) for k, v in args.items())
                add_dash_mpd(video_info)
            # Rental video is not rented but preview is available (e.g.
            # https://www.youtube.com/watch?v=yYr8q0y5Jfg,
            # https://github.com/rg3/youtube-dl/issues/10532)
            if not video_info and args.get('ypc_vid'):
                return self.url_result(
                    args['ypc_vid'], YoutubeIE.ie_key(), video_id=args['ypc_vid'])
            if args.get('livestream') == '1' or args.get('live_playback') == 1:
                is_live = True
            sts = ytplayer_config.get('sts')
        if not video_info or self._downloader.params.get('youtube_include_dash_manifest', True):
            # We also try looking in get_video_info since it may contain different dashmpd
            # URL that points to a DASH manifest with possibly different itag set (some itags
            # are missing from DASH manifest pointed by webpage's dashmpd, some - from DASH
            # manifest pointed by get_video_info's dashmpd).
            # The general idea is to take a union of itags of both DASH manifests (for example
            # video with such 'manifest behavior' see https://github.com/rg3/youtube-dl/issues/6093)
            self.report_video_info_webpage_download(video_id)
            for el in ('info', 'embedded', 'detailpage', 'vevo', ''):
                query = {
                    'video_id': video_id,
                    'ps': 'default',
                    'eurl': '',
                    'gl': 'US',
                    'hl': 'en',
                }
                if el:
                    query['el'] = el
                if sts:
                    query['sts'] = sts
                video_info_webpage = self._download_webpage(
                    '%s://www.youtube.com/get_video_info' % proto,
                    video_id, note=False,
                    errnote='unable to download video info webpage',
                    fatal=False, query=query)
                if not video_info_webpage:
                    continue
                get_video_info = compat_parse_qs(video_info_webpage)
                add_dash_mpd(get_video_info)
                if view_count is None:
                    view_count = extract_view_count(get_video_info)
                if not video_info:
                    video_info = get_video_info
                if 'token' in get_video_info:
                    # Different get_video_info requests may report different results, e.g.
                    # some may report video unavailability, but some may serve it without
                    # any complaint (see https://github.com/rg3/youtube-dl/issues/7362,
                    # the original webpage as well as el=info and el=embedded get_video_info
                    # requests report video unavailability due to geo restriction while
                    # el=detailpage succeeds and returns valid data). This is probably
                    # due to YouTube measures against IP ranges of hosting providers.
                    # Working around by preferring the first succeeded video_info containing
                    # the token if no such video_info yet was found.
                    if 'token' not in video_info:
                        video_info = get_video_info
                    break

    if video_info is None:
        # Neither the player config nor any get_video_info request produced
        # data.  Use an empty mapping so the checks below raise the regular
        # ExtractorError instead of a TypeError on ``'token' not in None``.
        video_info = {}

    def extract_unavailable_message():
        return self._html_search_regex(
            r'(?s)<h1[^>]+id="unavailable-message"[^>]*>(.+?)</h1>',
            video_webpage, 'unavailable message', default=None)

    if 'token' not in video_info:
        if 'reason' in video_info:
            if 'The uploader has not made this video available in your country.' in video_info['reason']:
                regions_allowed = self._html_search_meta(
                    'regionsAllowed', video_webpage, default=None)
                countries = regions_allowed.split(',') if regions_allowed else None
                self.raise_geo_restricted(
                    msg=video_info['reason'][0], countries=countries)
            reason = video_info['reason'][0]
            if 'Invalid parameters' in reason:
                # Prefer the message shown on the watch page, if there is one
                unavailable_message = extract_unavailable_message()
                if unavailable_message:
                    reason = unavailable_message
            raise ExtractorError(
                'YouTube said: %s' % reason,
                expected=True, video_id=video_id)
        else:
            raise ExtractorError(
                '"token" parameter not in video info for unknown reason',
                video_id=video_id)

    # title
    if 'title' in video_info:
        video_title = video_info['title'][0]
    else:
        self._downloader.report_warning('Unable to extract video title')
        video_title = '_'

    # description
    description_original = video_description = get_element_by_id("eow-description", video_webpage)
    if video_description:

        def replace_url(m):
            # Resolve the link against the current page URL and unwrap
            # YouTube's /redirect?q=<target> links to their real target.
            redir_url = compat_urlparse.urljoin(url, m.group(1))
            parsed_redir_url = compat_urllib_parse_urlparse(redir_url)
            if re.search(r'^(?:www\.)?(?:youtube(?:-nocookie)?\.com|youtu\.be)$', parsed_redir_url.netloc) and parsed_redir_url.path == '/redirect':
                qs = compat_parse_qs(parsed_redir_url.query)
                q = qs.get('q')
                if q and q[0]:
                    return q[0]
            return redir_url

        # Replace links whose visible text was shortened with "..." by the
        # full URL taken from their title/href attribute.
        description_original = video_description = re.sub(r'''(?x)
            <a\s+
                (?:[a-zA-Z-]+="[^"]*"\s+)*?
                (?:title|href)="([^"]+)"\s+
                (?:[a-zA-Z-]+="[^"]*"\s+)*?
                class="[^"]*"[^>]*>
            [^<]+\.{3}\s*
            </a>
        ''', replace_url, video_description)
        video_description = clean_html(video_description)
    else:
        fd_mobj = re.search(r'<meta name="description" content="([^"]+)"', video_webpage)
        if fd_mobj:
            video_description = unescapeHTML(fd_mobj.group(1))
        else:
            video_description = ''

    if 'multifeed_metadata_list' in video_info and not smuggled_data.get('force_singlefeed', False):
        if not self._downloader.params.get('noplaylist'):
            entries = []
            feed_ids = []
            multifeed_metadata_list = video_info['multifeed_metadata_list'][0]
            for feed in multifeed_metadata_list.split(','):
                # Unquote should take place before split on comma (,) since textual
                # fields may contain comma as well (see
                # https://github.com/rg3/youtube-dl/issues/8536)
                feed_data = compat_parse_qs(compat_urllib_parse_unquote_plus(feed))
                entries.append({
                    '_type': 'url_transparent',
                    'ie_key': 'Youtube',
                    'url': smuggle_url(
                        '%s://www.youtube.com/watch?v=%s' % (proto, feed_data['id'][0]),
                        {'force_singlefeed': True}),
                    'title': '%s (%s)' % (video_title, feed_data['title'][0]),
                })
                feed_ids.append(feed_data['id'][0])
            self.to_screen(
                'Downloading multifeed video (%s) - add --no-playlist to just download video %s'
                % (', '.join(feed_ids), video_id))
            return self.playlist_result(entries, video_id, video_title, video_description)
        self.to_screen('Downloading just video %s because of --no-playlist' % video_id)

    if view_count is None:
        view_count = extract_view_count(video_info)

    # Check for "rental" videos
    if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
        raise ExtractorError('"rental" videos not supported. See https://github.com/rg3/youtube-dl/issues/359 for more information.', expected=True)

    def _extract_filesize(media_url):
        return int_or_none(self._search_regex(
            r'\bclen[=/](\d+)', media_url, 'filesize', default=None))

    if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
        self.report_rtmp_download()
        formats = [{
            'format_id': '_rtmp',
            'protocol': 'rtmp',
            'url': video_info['conn'][0],
            'player_url': player_url,
        }]
    elif not is_live and (len(video_info.get('url_encoded_fmt_stream_map', [''])[0]) >= 1 or len(video_info.get('adaptive_fmts', [''])[0]) >= 1):
        encoded_url_map = video_info.get('url_encoded_fmt_stream_map', [''])[0] + ',' + video_info.get('adaptive_fmts', [''])[0]
        if 'rtmpe%3Dyes' in encoded_url_map:
            raise ExtractorError('rtmpe downloads are not supported, see https://github.com/rg3/youtube-dl/issues/343 for more information.', expected=True)
        # fmt_list entries look like "<itag>/<width>x<height>/..."
        formats_spec = {}
        fmt_list = video_info.get('fmt_list', [''])[0]
        if fmt_list:
            for fmt in fmt_list.split(','):
                spec = fmt.split('/')
                if len(spec) > 1:
                    width_height = spec[1].split('x')
                    if len(width_height) == 2:
                        formats_spec[spec[0]] = {
                            'resolution': spec[1],
                            'width': int_or_none(width_height[0]),
                            'height': int_or_none(width_height[1]),
                        }
        q = qualities(['small', 'medium', 'hd720'])
        formats = []
        for url_data_str in encoded_url_map.split(','):
            url_data = compat_parse_qs(url_data_str)
            if 'itag' not in url_data or 'url' not in url_data:
                continue
            format_id = url_data['itag'][0]
            url = url_data['url'][0]

            if 's' in url_data or self._downloader.params.get('youtube_include_dash_manifest', True):
                ASSETS_RE = r'"assets":.+?"js":\s*("[^"]+")'
                jsplayer_url_json = self._search_regex(
                    ASSETS_RE,
                    embed_webpage if age_gate else video_webpage,
                    'JS player URL (1)', default=None)
                if not jsplayer_url_json and not age_gate:
                    # We need the embed website after all
                    if embed_webpage is None:
                        embed_url = proto + '://www.youtube.com/embed/%s' % video_id
                        embed_webpage = self._download_webpage(
                            embed_url, video_id, 'Downloading embed webpage')
                    jsplayer_url_json = self._search_regex(
                        ASSETS_RE, embed_webpage, 'JS player URL')

                # For age-gated videos the first search may legitimately
                # find nothing; json.loads(None) would raise TypeError and
                # make the fallback below unreachable.
                player_url = (
                    json.loads(jsplayer_url_json)
                    if jsplayer_url_json else None)
                if player_url is None:
                    player_url_json = self._search_regex(
                        r'ytplayer\.config.*?"url"\s*:\s*("[^"]+")',
                        video_webpage, 'age gate player URL')
                    player_url = json.loads(player_url_json)

            if 'sig' in url_data:
                url += '&signature=' + url_data['sig'][0]
            elif 's' in url_data:
                encrypted_sig = url_data['s'][0]

                if self._downloader.params.get('verbose'):
                    if player_url is None:
                        player_version = 'unknown'
                        player_desc = 'unknown'
                    else:
                        if player_url.endswith('swf'):
                            player_version = self._search_regex(
                                r'-(.+?)(?:/watch_as3)?\.swf$', player_url,
                                'flash player', fatal=False)
                            player_desc = 'flash player %s' % player_version
                        else:
                            player_version = self._search_regex(
                                [r'html5player-([^/]+?)(?:/html5player(?:-new)?)?\.js',
                                 r'(?:www|player)-([^/]+)(?:/[a-z]{2}_[A-Z]{2})?/base\.js'],
                                player_url,
                                'html5 player', fatal=False)
                            player_desc = 'html5 player %s' % player_version

                    parts_sizes = self._signature_cache_id(encrypted_sig)
                    self.to_screen('{%s} signature length %s, %s' %
                                   (format_id, parts_sizes, player_desc))

                signature = self._decrypt_signature(
                    encrypted_sig, video_id, player_url, age_gate)
                url += '&signature=' + signature
            if 'ratebypass' not in url:
                url += '&ratebypass=yes'

            dct = {
                'format_id': format_id,
                'url': url,
                'player_url': player_url,
            }
            if format_id in self._formats:
                dct.update(self._formats[format_id])
            if format_id in formats_spec:
                dct.update(formats_spec[format_id])

            # Some itags are not included in DASH manifest thus corresponding formats will
            # lack metadata (see https://github.com/rg3/youtube-dl/pull/5993).
            # Trying to extract metadata from url_encoded_fmt_stream_map entry.
            mobj = re.search(r'^(?P<width>\d+)[xX](?P<height>\d+)$', url_data.get('size', [''])[0])
            width, height = (int(mobj.group('width')), int(mobj.group('height'))) if mobj else (None, None)

            filesize = int_or_none(url_data.get(
                'clen', [None])[0]) or _extract_filesize(url)

            quality = url_data.get('quality_label', [None])[0] or url_data.get('quality', [None])[0]

            more_fields = {
                'filesize': filesize,
                'tbr': float_or_none(url_data.get('bitrate', [None])[0], 1000),
                'width': width,
                'height': height,
                'fps': int_or_none(url_data.get('fps', [None])[0]),
                'format_note': quality,
                'quality': q(quality),
            }
            # Only truthy values override what is already known about the format
            for key, value in more_fields.items():
                if value:
                    dct[key] = value
            type_ = url_data.get('type', [None])[0]
            if type_:
                type_split = type_.split(';')
                kind_ext = type_split[0].split('/')
                if len(kind_ext) == 2:
                    kind, _ = kind_ext
                    dct['ext'] = mimetype2ext(type_split[0])
                    if kind in ('audio', 'video'):
                        codecs = None
                        for mobj in re.finditer(
                                r'(?P<key>[a-zA-Z_-]+)=(?P<quote>["\']?)(?P<val>.+?)(?P=quote)(?:;|$)', type_):
                            if mobj.group('key') == 'codecs':
                                codecs = mobj.group('val')
                                break
                        if codecs:
                            dct.update(parse_codecs(codecs))
            if dct.get('acodec') == 'none' or dct.get('vcodec') == 'none':
                dct['downloader_options'] = {
                    # Youtube throttles chunks >~10M
                    'http_chunk_size': 10485760,
                }
            formats.append(dct)
    elif video_info.get('hlsvp'):
        manifest_url = video_info['hlsvp'][0]
        formats = []
        m3u8_formats = self._extract_m3u8_formats(
            manifest_url, video_id, 'mp4', fatal=False)
        for a_format in m3u8_formats:
            itag = self._search_regex(
                r'/itag/(\d+)/', a_format['url'], 'itag', default=None)
            if itag:
                a_format['format_id'] = itag
                if itag in self._formats:
                    # Known itag metadata first, manifest values take precedence
                    dct = self._formats[itag].copy()
                    dct.update(a_format)
                    a_format = dct
            a_format['player_url'] = player_url
            # Accept-Encoding header causes failures in live streams on Youtube and Youtube Gaming
            a_format.setdefault('http_headers', {})['Youtubedl-no-compression'] = 'True'
            formats.append(a_format)
    else:
        error_message = clean_html(video_info.get('reason', [None])[0])
        if not error_message:
            error_message = extract_unavailable_message()
        if error_message:
            raise ExtractorError(error_message, expected=True)
        raise ExtractorError('no conn, hlsvp or url_encoded_fmt_stream_map information found in video info')

    # uploader
    video_uploader = try_get(video_info, lambda x: x['author'][0], compat_str)
    if video_uploader:
        video_uploader = compat_urllib_parse_unquote_plus(video_uploader)
    else:
        self._downloader.report_warning('unable to extract uploader name')

    # uploader_id
    video_uploader_id = None
    video_uploader_url = None
    mobj = re.search(
        r'<link itemprop="url" href="(?P<uploader_url>https?://www\.youtube\.com/(?:user|channel)/(?P<uploader_id>[^"]+))">',
        video_webpage)
    if mobj is not None:
        video_uploader_id = mobj.group('uploader_id')
        video_uploader_url = mobj.group('uploader_url')
    else:
        self._downloader.report_warning('unable to extract uploader nickname')

    # thumbnail image
    # We try first to get a high quality image:
    m_thumb = re.search(r'<span itemprop="thumbnail".*?href="(.*?)">',
                        video_webpage, re.DOTALL)
    if m_thumb is not None:
        video_thumbnail = m_thumb.group(1)
    elif 'thumbnail_url' not in video_info:
        self._downloader.report_warning('unable to extract video thumbnail')
        video_thumbnail = None
    else:   # don't panic if we can't find it
        video_thumbnail = compat_urllib_parse_unquote_plus(video_info['thumbnail_url'][0])

    # upload date
    upload_date = self._html_search_meta(
        'datePublished', video_webpage, 'upload date', default=None)
    if not upload_date:
        upload_date = self._search_regex(
            [r'(?s)id="eow-date.*?>(.*?)</span>',
             r'(?:id="watch-uploader-info".*?>.*?|["\']simpleText["\']\s*:\s*["\'])(?:Published|Uploaded|Streamed live|Started) on (.+?)[<"\']'],
            video_webpage, 'upload date', default=None)
    upload_date = unified_strdate(upload_date)

    video_license = self._html_search_regex(
        r'<h4[^>]+class="title"[^>]*>\s*License\s*</h4>\s*<ul[^>]*>\s*<li>(.+?)</li',
        video_webpage, 'license', default=None)

    m_music = re.search(
        r'''(?x)
            <h4[^>]+class="title"[^>]*>\s*Music\s*</h4>\s*
            <ul[^>]*>\s*
            <li>(?P<title>.+?)
            by (?P<creator>.+?)
            (?:
                \(.+?\)|
                <a[^>]*
                    (?:
                        \bhref=["\']/red[^>]*>|             # drop possible
                        >\s*Listen ad-free with YouTube Red # YouTube Red ad
                    )
                .*?
            )?</li
        ''',
        video_webpage)
    if m_music:
        video_alt_title = remove_quotes(unescapeHTML(m_music.group('title')))
        video_creator = clean_html(m_music.group('creator'))
    else:
        video_alt_title = video_creator = None

    def extract_meta(field):
        return self._html_search_regex(
            r'<h4[^>]+class="title"[^>]*>\s*%s\s*</h4>\s*<ul[^>]*>\s*<li>(.+?)</li>\s*' % field,
            video_webpage, field, default=None)

    track = extract_meta('Song')
    artist = extract_meta('Artist')

    # The season and episode numbers are separated by a bullet (U+2022)
    m_episode = re.search(
        r'<div[^>]+id="watch7-headline"[^>]*>\s*<span[^>]*>.*?>(?P<series>[^<]+)</a></b>\s*S(?P<season>\d+)\s*•\s*E(?P<episode>\d+)</span>',
        video_webpage)
    if m_episode:
        series = m_episode.group('series')
        season_number = int(m_episode.group('season'))
        episode_number = int(m_episode.group('episode'))
    else:
        series = season_number = episode_number = None

    m_cat_container = self._search_regex(
        r'(?s)<h4[^>]*>\s*Category\s*</h4>\s*<ul[^>]*>(.*?)</ul>',
        video_webpage, 'categories', default=None)
    if m_cat_container:
        category = self._html_search_regex(
            r'(?s)<a[^<]+>(.*?)</a>', m_cat_container, 'category',
            default=None)
        video_categories = None if category is None else [category]
    else:
        video_categories = None

    video_tags = [
        unescapeHTML(m.group('content'))
        for m in re.finditer(self._meta_regex('og:video:tag'), video_webpage)]

    def _extract_count(count_name):
        return str_to_int(self._search_regex(
            r'-%s-button[^>]+><span[^>]+class="yt-uix-button-content"[^>]*>([\d,]+)</span>'
            % re.escape(count_name),
            video_webpage, count_name, default=None))

    like_count = _extract_count('like')
    dislike_count = _extract_count('dislike')

    # subtitles
    video_subtitles = self.extract_subtitles(video_id, video_webpage)
    automatic_captions = self.extract_automatic_captions(video_id, video_webpage)

    video_duration = try_get(
        video_info, lambda x: int_or_none(x['length_seconds'][0]))
    if not video_duration:
        video_duration = parse_duration(self._html_search_meta(
            'duration', video_webpage, 'video duration'))

    # annotations
    video_annotations = None
    if self._downloader.params.get('writeannotations', False):
        video_annotations = self._extract_annotations(video_id)

    chapters = self._extract_chapters(description_original, video_duration)

    # Look for the DASH manifest
    if self._downloader.params.get('youtube_include_dash_manifest', True):
        dash_mpd_fatal = True
        for mpd_url in dash_mpds:
            dash_formats = {}
            try:
                def decrypt_sig(mobj):
                    s = mobj.group(1)
                    dec_s = self._decrypt_signature(s, video_id, player_url, age_gate)
                    return '/signature/%s' % dec_s

                mpd_url = re.sub(r'/s/([a-fA-F0-9\.]+)', decrypt_sig, mpd_url)

                for df in self._extract_mpd_formats(
                        mpd_url, video_id, fatal=dash_mpd_fatal,
                        formats_dict=self._formats):
                    if not df.get('filesize'):
                        df['filesize'] = _extract_filesize(df['url'])
                    # Keep the first format seen for each format_id
                    if df['format_id'] not in dash_formats:
                        dash_formats[df['format_id']] = df
                    # Additional DASH manifests may end up in HTTP Error 403 therefore
                    # allow them to fail without bug report message if we already have
                    # some DASH manifest succeeded. This is temporary workaround to reduce
                    # burst of bug reports until we figure out the reason and whether it
                    # can be fixed at all.
                    dash_mpd_fatal = False
            except (ExtractorError, KeyError) as e:
                self.report_warning(
                    'Skipping DASH manifest: %r' % e, video_id)
            if dash_formats:
                # Remove the formats we found through non-DASH, they
                # contain less info and it can be wrong, because we use
                # fixed values (for example the resolution). See
                # https://github.com/rg3/youtube-dl/issues/5774 for an
                # example.
                formats = [f for f in formats if f['format_id'] not in dash_formats]
                formats.extend(dash_formats.values())

    # Check for malformed aspect ratio
    stretched_m = re.search(
        r'<meta\s+property="og:video:tag".*?content="yt:stretch=(?P<w>[0-9]+):(?P<h>[0-9]+)">',
        video_webpage)
    if stretched_m:
        w = float(stretched_m.group('w'))
        h = float(stretched_m.group('h'))
        # yt:stretch may hold invalid ratio data (e.g. for Q39EVAstoRM ratio is 17:0).
        # We will only process correct ratios.
        if w > 0 and h > 0:
            ratio = w / h
            for f in formats:
                if f.get('vcodec') != 'none':
                    f['stretched_ratio'] = ratio

    self._sort_formats(formats)

    self.mark_watched(video_id, video_info)

    return {
        'id': video_id,
        'uploader': video_uploader,
        'uploader_id': video_uploader_id,
        'uploader_url': video_uploader_url,
        'upload_date': upload_date,
        'license': video_license,
        'creator': video_creator or artist,
        'title': video_title,
        'alt_title': video_alt_title or track,
        'thumbnail': video_thumbnail,
        'description': video_description,
        'categories': video_categories,
        'tags': video_tags,
        'subtitles': video_subtitles,
        'automatic_captions': automatic_captions,
        'duration': video_duration,
        'age_limit': 18 if age_gate else 0,
        'annotations': video_annotations,
        'chapters': chapters,
        'webpage_url': proto + '://www.youtube.com/watch?v=%s' % video_id,
        'view_count': view_count,
        'like_count': like_count,
        'dislike_count': dislike_count,
        'average_rating': float_or_none(video_info.get('avg_rating', [None])[0]),
        'formats': formats,
        'is_live': is_live,
        'start_time': start_time,
        'end_time': end_time,
        'series': series,
        'season_number': season_number,
        'episode_number': episode_number,
        'track': track,
        'artist': artist,
    }
c5e8d7af 2109
5f6a1245 2110
8e7aad20 2111class YoutubePlaylistIE(YoutubePlaylistBaseInfoExtractor):
78caa52a 2112 IE_DESC = 'YouTube.com playlists'
d67cc9fa 2113 _VALID_URL = r"""(?x)(?:
c5e8d7af
PH
2114 (?:https?://)?
2115 (?:\w+\.)?
c5e8d7af 2116 (?:
feaa5ad7
S
2117 youtube\.com/
2118 (?:
87dadd45 2119 (?:course|view_play_list|my_playlists|artist|playlist|watch|embed/(?:videoseries|[0-9A-Za-z_-]{11}))
feaa5ad7
S
2120 \? (?:.*?[&;])*? (?:p|a|list)=
2121 | p/
2122 )|
2123 youtu\.be/[0-9A-Za-z_-]{11}\?.*?\blist=
c5e8d7af 2124 )
d67cc9fa 2125 (
409b9324 2126 (?:PL|LL|EC|UU|FL|RD|UL|TL|OLAK5uy_)?[0-9A-Za-z-_]{10,}
5f6a1245 2127 # Top tracks, they can also include dots
d67cc9fa
JMF
2128 |(?:MC)[\w\.]*
2129 )
c5e8d7af
PH
2130 .*
2131 |
d0ba5587
S
2132 (%(playlist_id)s)
2133 )""" % {'playlist_id': YoutubeBaseInfoExtractor._PLAYLIST_ID_RE}
8d81f3e3 2134 _TEMPLATE_URL = 'https://www.youtube.com/playlist?list=%s'
648e6a1f 2135 _VIDEO_RE = r'href="\s*/watch\?v=(?P<id>[0-9A-Za-z_-]{11})&amp;[^"]*?index=(?P<index>\d+)(?:[^>]+>(?P<title>[^<]+))?'
78caa52a 2136 IE_NAME = 'youtube:playlist'
81127aa5
PH
2137 _TESTS = [{
2138 'url': 'https://www.youtube.com/playlist?list=PLwiyx1dc3P2JR9N8gQaQN_BCvlSlap7re',
2139 'info_dict': {
2140 'title': 'ytdl test PL',
a1cf99d0 2141 'id': 'PLwiyx1dc3P2JR9N8gQaQN_BCvlSlap7re',
81127aa5
PH
2142 },
2143 'playlist_count': 3,
9291475f
PH
2144 }, {
2145 'url': 'https://www.youtube.com/playlist?list=PLtPgu7CB4gbZDA7i_euNxn75ISqxwZPYx',
2146 'info_dict': {
acf757f4 2147 'id': 'PLtPgu7CB4gbZDA7i_euNxn75ISqxwZPYx',
9291475f
PH
2148 'title': 'YDL_Empty_List',
2149 },
2150 'playlist_count': 0,
4201ba13 2151 'skip': 'This playlist is private',
9291475f
PH
2152 }, {
2153 'note': 'Playlist with deleted videos (#651). As a bonus, the video #51 is also twice in this list.',
2154 'url': 'https://www.youtube.com/playlist?list=PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC',
2155 'info_dict': {
2156 'title': '29C3: Not my department',
acf757f4 2157 'id': 'PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC',
9291475f
PH
2158 },
2159 'playlist_count': 95,
2160 }, {
2161 'note': 'issue #673',
2162 'url': 'PLBB231211A4F62143',
2163 'info_dict': {
f46a8702 2164 'title': '[OLD]Team Fortress 2 (Class-based LP)',
acf757f4 2165 'id': 'PLBB231211A4F62143',
9291475f
PH
2166 },
2167 'playlist_mincount': 26,
2168 }, {
2169 'note': 'Large playlist',
2170 'url': 'https://www.youtube.com/playlist?list=UUBABnxM4Ar9ten8Mdjj1j0Q',
2171 'info_dict': {
2172 'title': 'Uploads from Cauchemar',
acf757f4 2173 'id': 'UUBABnxM4Ar9ten8Mdjj1j0Q',
9291475f
PH
2174 },
2175 'playlist_mincount': 799,
2176 }, {
2177 'url': 'PLtPgu7CB4gbY9oDN3drwC3cMbJggS7dKl',
2178 'info_dict': {
2179 'title': 'YDL_safe_search',
acf757f4 2180 'id': 'PLtPgu7CB4gbY9oDN3drwC3cMbJggS7dKl',
9291475f
PH
2181 },
2182 'playlist_count': 2,
4201ba13 2183 'skip': 'This playlist is private',
ac7553d0
PH
2184 }, {
2185 'note': 'embedded',
2d3d2997 2186 'url': 'https://www.youtube.com/embed/videoseries?list=PL6IaIsEjSbf96XFRuNccS_RuEXwNdsoEu',
ac7553d0
PH
2187 'playlist_count': 4,
2188 'info_dict': {
2189 'title': 'JODA15',
acf757f4 2190 'id': 'PL6IaIsEjSbf96XFRuNccS_RuEXwNdsoEu',
ac7553d0 2191 }
87dadd45
S
2192 }, {
2193 'url': 'http://www.youtube.com/embed/_xDOZElKyNU?list=PLsyOSbh5bs16vubvKePAQ1x3PhKavfBIl',
2194 'playlist_mincount': 485,
2195 'info_dict': {
2196 'title': '2017 華語最新單曲 (2/24更新)',
2197 'id': 'PLsyOSbh5bs16vubvKePAQ1x3PhKavfBIl',
2198 }
6b08cdf6
PH
2199 }, {
2200 'note': 'Embedded SWF player',
2d3d2997 2201 'url': 'https://www.youtube.com/p/YN5VISEtHet5D4NEvfTd0zcgFk84NqFZ?hl=en_US&fs=1&rel=0',
6b08cdf6
PH
2202 'playlist_count': 4,
2203 'info_dict': {
2204 'title': 'JODA7',
acf757f4 2205 'id': 'YN5VISEtHet5D4NEvfTd0zcgFk84NqFZ',
6b08cdf6 2206 }
4b7df0d3
JMF
2207 }, {
2208 'note': 'Buggy playlist: the webpage has a "Load more" button but it doesn\'t have more videos',
2209 'url': 'https://www.youtube.com/playlist?list=UUXw-G3eDE9trcvY2sBMM_aA',
2210 'info_dict': {
acf757f4
PH
2211 'title': 'Uploads from Interstellar Movie',
2212 'id': 'UUXw-G3eDE9trcvY2sBMM_aA',
4b7df0d3 2213 },
481cc733 2214 'playlist_mincount': 21,
dacb3a86
S
2215 }, {
2216 # Playlist URL that does not actually serve a playlist
2217 'url': 'https://www.youtube.com/watch?v=FqZTN594JQw&list=PLMYEtVRpaqY00V9W81Cwmzp6N6vZqfUKD4',
2218 'info_dict': {
2219 'id': 'FqZTN594JQw',
2220 'ext': 'webm',
2221 'title': "Smiley's People 01 detective, Adventure Series, Action",
2222 'uploader': 'STREEM',
2223 'uploader_id': 'UCyPhqAZgwYWZfxElWVbVJng',
ec85ded8 2224 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UCyPhqAZgwYWZfxElWVbVJng',
dacb3a86
S
2225 'upload_date': '20150526',
2226 'license': 'Standard YouTube License',
2227 'description': 'md5:507cdcb5a49ac0da37a920ece610be80',
2228 'categories': ['People & Blogs'],
2229 'tags': list,
2230 'like_count': int,
2231 'dislike_count': int,
2232 },
2233 'params': {
2234 'skip_download': True,
2235 },
2236 'add_ie': [YoutubeIE.ie_key()],
481cc733
S
2237 }, {
2238 'url': 'https://youtu.be/yeWKywCrFtk?list=PL2qgrgXsNUG5ig9cat4ohreBjYLAPC0J5',
2239 'info_dict': {
2240 'id': 'yeWKywCrFtk',
2241 'ext': 'mp4',
2242 'title': 'Small Scale Baler and Braiding Rugs',
2243 'uploader': 'Backus-Page House Museum',
2244 'uploader_id': 'backuspagemuseum',
ec85ded8 2245 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/backuspagemuseum',
481cc733
S
2246 'upload_date': '20161008',
2247 'license': 'Standard YouTube License',
2248 'description': 'md5:800c0c78d5eb128500bffd4f0b4f2e8a',
2249 'categories': ['Nonprofits & Activism'],
2250 'tags': list,
2251 'like_count': int,
2252 'dislike_count': int,
2253 },
2254 'params': {
2255 'noplaylist': True,
2256 'skip_download': True,
2257 },
feaa5ad7
S
2258 }, {
2259 'url': 'https://youtu.be/uWyaPkt-VOI?list=PL9D9FC436B881BA21',
2260 'only_matching': True,
a6857510
S
2261 }, {
2262 'url': 'TLGGrESM50VT6acwMjAyMjAxNw',
2263 'only_matching': True,
409b9324
S
2264 }, {
2265 # music album playlist
2266 'url': 'OLAK5uy_m4xAFdmMC5rX3Ji3g93pQe3hqLZw_9LhM',
2267 'only_matching': True,
81127aa5 2268 }]
c5e8d7af 2269
880e1c52
JMF
def _real_initialize(self):
    """Initialization hook: run the login step via ``self._login()``.

    The result of ``_login()`` is discarded; this method returns ``None``.
    """
    self._login()
2272
def _extract_mix(self, playlist_id):
    """Collect the videos of a YouTube mix and return them as a playlist.

    The mixes are generated from a single video; the id of the playlist is
    just 'RD' + video_id, so the last 11 characters of ``playlist_id`` seed
    the first request.  Each following request starts from the last video
    collected so far, and paging stops once a page yields no unseen ids.

    :param playlist_id: id of the mix playlist.
    :return: ``self.playlist_result(...)`` built from the collected ids and
        the title found on the last downloaded page.
    """
    # The pattern only depends on the playlist id, so build it once
    mix_entry_re = re.compile(
        r'''(?xs)data-video-username=".*?".*?
               href="/watch\?v=([0-9A-Za-z_-]{11})&amp;[^"]*?list=%s''' % re.escape(playlist_id))

    collected_ids = []
    seed_id = playlist_id[-11:]
    page_num = 0
    while True:
        page_num += 1
        webpage = self._download_webpage(
            'https://youtube.com/watch?v=%s&list=%s' % (seed_id, playlist_id),
            playlist_id, 'Downloading page {0} of Youtube mix'.format(page_num))
        # Fetch new pages until all the videos are repeated, it seems that
        # there are always 51 unique videos.
        unseen_ids = [
            video_id
            for video_id in orderedSet(mix_entry_re.findall(webpage))
            if video_id not in collected_ids]
        if not unseen_ids:
            break
        collected_ids.extend(unseen_ids)
        seed_id = collected_ids[-1]

    url_results = self._ids_to_results(collected_ids)

    # Try the candidate title classes in order of preference and stop at the
    # first one that yields something (searched in the last page fetched).
    title_span = None
    for class_name in ('playlist-title', 'title long-title', 'title'):
        title_span = get_element_by_attribute('class', class_name, webpage)
        if title_span:
            break
    title = clean_html(title_span)

    return self.playlist_result(url_results, playlist_id, title)
2304
def _extract_playlist(self, playlist_id):
    """Download a playlist page and turn it into a playlist result.

    Alert messages found in the page are inspected first: a missing or
    private playlist and an "Invalid parameters" alert raise an expected
    ExtractorError, the "Choose your language" alert is ignored and any
    other alert is reported as a warning.

    Returns a ``(has_videos, playlist)`` tuple.  ``has_videos`` is False
    only when no playlist title was found and ``self._entries`` yields
    nothing for the page.
    """
    url = self._TEMPLATE_URL % playlist_id
    page = self._download_webpage(url, playlist_id)

    # the yt-alert-message now has tabindex attribute (see https://github.com/rg3/youtube-dl/issues/11604)
    for alert in re.findall(r'<div class="yt-alert-message"[^>]*>([^<]+)</div>', page):
        alert = alert.strip()
        # Check if the playlist exists or is private
        unavailable = re.match(
            r'[^<]*(?:The|This) playlist (?P<reason>does not exist|is private)[^<]*', alert)
        if unavailable:
            reason = unavailable.group('reason')
            message = 'This playlist %s' % reason
            if 'private' in reason:
                message += ', use --username or --netrc to access it'
            raise ExtractorError(message + '.', expected=True)
        if re.match(r'[^<]*Invalid parameters[^<]*', alert):
            raise ExtractorError(
                'Invalid parameters. Maybe URL is incorrect.',
                expected=True)
        if re.match(r'[^<]*Choose your language[^<]*', alert):
            continue
        self.report_warning('Youtube gives an alert message: ' + alert)

    playlist_title = self._html_search_regex(
        r'(?s)<h1 class="pl-header-title[^"]*"[^>]*>\s*(.*?)\s*</h1>',
        page, 'title', default=None)

    # Common prefix of the two uploader regexes below
    _UPLOADER_BASE = r'class=["\']pl-header-details[^>]+>\s*<li>\s*<a[^>]+\bhref='
    uploader = self._search_regex(
        r'%s["\']/(?:user|channel)/[^>]+>([^<]+)' % _UPLOADER_BASE,
        page, 'uploader', default=None)
    uploader_id = uploader_url = None
    uploader_mobj = re.search(
        r'%s(["\'])(?P<path>/(?:user|channel)/(?P<uploader_id>.+?))\1' % _UPLOADER_BASE,
        page)
    if uploader_mobj:
        uploader_id = uploader_mobj.group('uploader_id')
        uploader_url = compat_urlparse.urljoin(url, uploader_mobj.group('path'))

    has_videos = True
    if not playlist_title:
        # Some playlist URLs don't actually serve a playlist (e.g.
        # https://www.youtube.com/watch?v=FqZTN594JQw&list=PLMYEtVRpaqY00V9W81Cwmzp6N6vZqfUKD4)
        try:
            next(self._entries(page, playlist_id))
        except StopIteration:
            has_videos = False

    playlist = self.playlist_result(
        self._entries(page, playlist_id), playlist_id, playlist_title)
    playlist.update({
        'uploader': uploader,
        'uploader_id': uploader_id,
        'uploader_url': uploader_url,
    })

    return has_videos, playlist
c5e8d7af 2366
def _check_download_just_video(self, url, playlist_id):
    """Decide whether only a single video should be downloaded.

    The video id is read from the ``v`` query parameter or, failing that,
    from a youtu.be / embed style URL.

    Returns a ``(video_id, video)`` tuple:
      * ``(None, None)`` when the URL carries no video id;
      * ``(video_id, url_result)`` when the 'noplaylist' param is set;
      * ``(video_id, None)`` otherwise.
    """
    # Check if it's a video-specific URL
    query = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query)
    video_id = query.get('v', [None])[0]
    if not video_id:
        video_id = self._search_regex(
            r'(?:(?:^|//)youtu\.be/|youtube\.com/embed/(?!videoseries))([0-9A-Za-z_-]{11})', url,
            'video id', default=None)
    if not video_id:
        return None, None

    if self._downloader.params.get('noplaylist'):
        self.to_screen('Downloading just video %s because of --no-playlist' % video_id)
        return video_id, self.url_result(video_id, 'Youtube', video_id=video_id)

    self.to_screen('Downloading playlist %s - add --no-playlist to just download video %s' % (playlist_id, video_id))
    return video_id, None
448830ce 2381
ebf1b291
S
def _real_extract(self, url):
    """Extract a playlist, a mix, or a single video from a playlist URL.

    Raises ExtractorError when ``url`` does not match ``_VALID_URL``.
    """
    # Extract playlist id
    mobj = re.match(self._VALID_URL, url)
    if mobj is None:
        raise ExtractorError('Invalid URL: %s' % url)
    playlist_id = mobj.group(1) or mobj.group(2)

    video_id, video = self._check_download_just_video(url, playlist_id)
    if video:
        return video

    # Mixes require a custom extraction process
    if playlist_id.startswith(('RD', 'UL', 'PU')):
        return self._extract_mix(playlist_id)

    has_videos, playlist = self._extract_playlist(playlist_id)
    if video_id and not has_videos:
        # Some playlist URLs don't actually serve a playlist (see
        # https://github.com/rg3/youtube-dl/issues/10537).
        # Fallback to plain video extraction if there is a video id
        # along with playlist id.
        return self.url_result(video_id, 'Youtube', video_id=video_id)

    return playlist
448830ce 2406
c5e8d7af 2407
class YoutubeChannelIE(YoutubePlaylistBaseInfoExtractor):
    """Extractor for ``/channel/<id>`` URLs."""
    IE_NAME = 'youtube:channel'
    IE_DESC = 'YouTube.com channels'
    _VALID_URL = r'https?://(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/(?P<id>[0-9A-Za-z_-]+)'
    _TEMPLATE_URL = 'https://www.youtube.com/channel/%s/videos'
    _VIDEO_RE = r'(?:title="(?P<title>[^"]+)"[^>]+)?href="/watch\?v=(?P<id>[0-9A-Za-z_-]+)&?'
    _TESTS = [{
        'note': 'paginated channel',
        'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w',
        'playlist_mincount': 91,
        'info_dict': {
            'id': 'UUKfVa3S1e4PHvxWcwyMMg8w',
            'title': 'Uploads from lex will',
        }
    }, {
        'note': 'Age restricted channel',
        # from https://www.youtube.com/user/DeusExOfficial
        'url': 'https://www.youtube.com/channel/UCs0ifCMCm1icqRbqhUINa0w',
        'playlist_mincount': 64,
        'info_dict': {
            'id': 'UUs0ifCMCm1icqRbqhUINa0w',
            'title': 'Uploads from Deus Ex',
        },
    }]

    @classmethod
    def suitable(cls, url):
        """Reject URLs claimed by the playlists or live extractors."""
        if YoutubePlaylistsIE.suitable(url) or YoutubeLiveIE.suitable(url):
            return False
        return super(YoutubeChannelIE, cls).suitable(url)

    def _build_template_url(self, url, channel_id):
        """Return the videos page URL for ``channel_id``."""
        return self._TEMPLATE_URL % channel_id

    def _real_extract(self, url):
        """Extract the channel's videos, preferring its uploads playlist."""
        channel_id = self._match_id(url)
        url = self._build_template_url(url, channel_id)

        # Channel by page listing is restricted to 35 pages of 30 items, i.e. 1050 videos total (see #5778)
        # Workaround by extracting as a playlist if managed to obtain channel playlist URL
        # otherwise fallback on channel by page extraction
        channel_page = self._download_webpage(
            url + '?view=57', channel_id,
            'Downloading channel page', fatal=False)
        channel_playlist_id = False
        if channel_page is not False:
            channel_playlist_id = self._html_search_meta(
                'channelId', channel_page, 'channel id', default=None)
            if not channel_playlist_id:
                channel_url = self._html_search_meta(
                    ('al:ios:url', 'twitter:app:url:iphone', 'twitter:app:url:ipad'),
                    channel_page, 'channel url', default=None)
                if channel_url:
                    channel_playlist_id = self._search_regex(
                        r'vnd\.youtube://user/([0-9A-Za-z_-]+)',
                        channel_url, 'channel id', default=None)
        if channel_playlist_id and channel_playlist_id.startswith('UC'):
            # Swap the 'UC' prefix for 'UU' and delegate to the playlist extractor
            uploads_playlist_id = 'UU' + channel_playlist_id[2:]
            return self.url_result(
                compat_urlparse.urljoin(url, '/playlist?list=%s' % uploads_playlist_id),
                'YoutubePlaylist')

        channel_page = self._download_webpage(url, channel_id, 'Downloading page #1')
        autogenerated_mobj = re.search(r'''(?x)
                class="[^"]*?(?:
                    channel-header-autogenerated-label|
                    yt-channel-title-autogenerated
                )[^"]*"''', channel_page)

        if autogenerated_mobj is not None:
            # The videos are contained in a single page
            # the ajax pages can't be used, they are empty
            entries = []
            for video_id, video_title in self.extract_videos_from_page(channel_page):
                entries.append(self.url_result(
                    video_id, 'Youtube', video_id=video_id,
                    video_title=video_title))
            return self.playlist_result(entries, channel_id)

        try:
            next(self._entries(channel_page, channel_id))
        except StopIteration:
            # No entry at all: surface the alert shown on the page, if any
            alert_message = self._html_search_regex(
                r'(?s)<div[^>]+class=(["\']).*?\byt-alert-message\b.*?\1[^>]*>(?P<alert>[^<]+)</div>',
                channel_page, 'alert', default=None, group='alert')
            if alert_message:
                raise ExtractorError('Youtube said: %s' % alert_message, expected=True)

        return self.playlist_result(self._entries(channel_page, channel_id), channel_id)
c5e8d7af
PH
2497
2498
class YoutubeUserIE(YoutubeChannelIE):
    """Extractor for user / custom-name URLs and the ``ytuser:`` keyword."""
    IE_NAME = 'youtube:user'
    IE_DESC = 'YouTube.com user videos (URL or "ytuser" keyword)'
    _VALID_URL = r'(?:(?:https?://(?:\w+\.)?youtube\.com/(?:(?P<user>user|c)/)?(?!(?:attribution_link|watch|results|shared)(?:$|[^a-z_A-Z0-9-])))|ytuser:)(?!feed/)(?P<id>[A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'https://www.youtube.com/%s/%s/videos'

    _TESTS = [{
        'url': 'https://www.youtube.com/user/TheLinuxFoundation',
        'playlist_mincount': 320,
        'info_dict': {
            'id': 'UUfX55Sx5hEFjoC3cNs6mCUQ',
            'title': 'Uploads from The Linux Foundation',
        }
    }, {
        # Only available via https://www.youtube.com/c/12minuteathlete/videos
        # but not https://www.youtube.com/user/12minuteathlete/videos
        'url': 'https://www.youtube.com/c/12minuteathlete/videos',
        'playlist_mincount': 249,
        'info_dict': {
            'id': 'UUVjM-zV6_opMDx7WYxnjZiQ',
            'title': 'Uploads from 12 Minute Athlete',
        }
    }, {
        'url': 'ytuser:phihag',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/c/gametrailers',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/gametrailers',
        'only_matching': True,
    }, {
        # This channel is not available, geo restricted to JP
        'url': 'https://www.youtube.com/user/kananishinoSMEJ/videos',
        'only_matching': True,
    }]

    @classmethod
    def suitable(cls, url):
        """Return True only when no other Youtube*IE in this module matches."""
        # This regex is very permissive and would also match URLs that
        # belong to the other youtube extractors, so give them priority.
        for name, klass in globals().items():
            if klass is cls:
                continue
            if not (name.startswith('Youtube') and name.endswith('IE')):
                continue
            if klass.suitable(url):
                return False
        return super(YoutubeUserIE, cls).suitable(url)

    def _build_template_url(self, url, channel_id):
        """Return the videos page URL, defaulting the path prefix to 'user'."""
        mobj = re.match(self._VALID_URL, url)
        prefix = mobj.group('user') or 'user'
        return self._TEMPLATE_URL % (prefix, mobj.group('id'))
2549
b05654f0 2550
f07e276a
S
class YoutubeLiveIE(YoutubeBaseInfoExtractor):
    """Extractor for ``.../live`` URLs of users and channels."""
    IE_NAME = 'youtube:live'
    IE_DESC = 'YouTube.com live streams'
    _VALID_URL = r'(?P<base_url>https?://(?:\w+\.)?youtube\.com/(?:(?:user|channel|c)/)?(?P<id>[^/]+))/live'

    _TESTS = [{
        'url': 'https://www.youtube.com/user/TheYoungTurks/live',
        'info_dict': {
            'id': 'a48o2S1cPoo',
            'ext': 'mp4',
            'title': 'The Young Turks - Live Main Show',
            'uploader': 'The Young Turks',
            'uploader_id': 'TheYoungTurks',
            'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/TheYoungTurks',
            'upload_date': '20150715',
            'license': 'Standard YouTube License',
            'description': 'md5:438179573adcdff3c97ebb1ee632b891',
            'categories': ['News & Politics'],
            'tags': ['Cenk Uygur (TV Program Creator)', 'The Young Turks (Award-Winning Work)', 'Talk Show (TV Genre)'],
            'like_count': int,
            'dislike_count': int,
        },
        'params': {
            'skip_download': True,
        },
    }, {
        'url': 'https://www.youtube.com/channel/UC1yBKRuGpC1tSM73A0ZjYjQ/live',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/c/CommanderVideoHq/live',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/TheYoungTurks/live',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Resolve a live URL to its video, or fall back to the base URL.

        The page is downloaded non-fatally.  When it is a video page that
        exposes an 11-character video id, the result points at that video;
        in every other case the result points at the URL without '/live'.
        """
        channel_id, base_url = re.match(self._VALID_URL, url).group('id', 'base_url')
        webpage = self._download_webpage(url, channel_id, fatal=False)
        if not webpage:
            return self.url_result(base_url)

        page_type = self._og_search_property(
            'type', webpage, 'page type', default='')
        video_id = self._html_search_meta(
            'videoId', webpage, 'video id', default=None)
        is_video_page = page_type.startswith('video')
        if is_video_page and video_id and re.match(r'^[0-9A-Za-z_-]{11}$', video_id):
            return self.url_result(video_id, YoutubeIE.ie_key())
        return self.url_result(base_url)
2601
2602
e462474e
S
class YoutubePlaylistsIE(YoutubePlaylistsBaseInfoExtractor):
    """Extractor for the ``/playlists`` page of a user or channel."""
    IE_NAME = 'youtube:playlists'
    IE_DESC = 'YouTube.com user/channel playlists'
    _VALID_URL = r'https?://(?:\w+\.)?youtube\.com/(?:user|channel)/(?P<id>[^/]+)/playlists'

    _TESTS = [{
        'url': 'https://www.youtube.com/user/ThirstForScience/playlists',
        'playlist_mincount': 4,
        'info_dict': {
            'id': 'ThirstForScience',
            'title': 'Thirst for Science',
        },
    }, {
        # with "Load more" button
        'url': 'https://www.youtube.com/user/igorkle1/playlists?view=1&sort=dd',
        'playlist_mincount': 70,
        'info_dict': {
            'id': 'igorkle1',
            'title': 'Игорь Клейнер',
        },
    }, {
        'url': 'https://www.youtube.com/channel/UCiU1dHvZObB2iP6xkJ__Icw/playlists',
        'playlist_mincount': 17,
        'info_dict': {
            'id': 'UCiU1dHvZObB2iP6xkJ__Icw',
            'title': 'Chem Player',
        },
    }]
0c148415
S
2631
2632
870f3bfc
S
class YoutubeSearchBaseInfoExtractor(YoutubePlaylistBaseInfoExtractor):
    """Shared base of the search extractors; only overrides ``_VIDEO_RE``."""
    # Captures an 11-character video id and, when present, the title attribute
    _VIDEO_RE = r'href="\s*/watch\?v=(?P<id>[0-9A-Za-z_-]{11})(?:[^"]*"[^>]+\btitle="(?P<title>[^"]+))?'
2635
2636
class YoutubeSearchIE(SearchInfoExtractor, YoutubeSearchBaseInfoExtractor):
    """Extractor for the ``ytsearch`` search keyword."""
    IE_DESC = 'YouTube.com searches'
    # there doesn't appear to be a real limit, for example if you search for
    # 'python' you get more than 8.000.000 results
    _MAX_RESULTS = float('inf')
    IE_NAME = 'youtube:search'
    _SEARCH_KEY = 'ytsearch'
    # Extra query string arguments; subclasses override this to alter the search
    _EXTRA_QUERY_ARGS = {}
    _TESTS = []

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query.

        Result pages are downloaded one after another, following the
        "Next" link of each page, until at least ``n`` videos have been
        collected, a page yields no videos, or there is no next link.

        query -- the search terms
        n     -- maximum number of results to return (presumably this may
                 be ``_MAX_RESULTS``, i.e. infinity; verify against caller)

        Returns a playlist result holding at most ``n`` entries.
        Raises ExtractorError when a page carries a "search-message" block.
        """
        videos = []

        url_query = {
            'search_query': query.encode('utf-8'),
        }
        url_query.update(self._EXTRA_QUERY_ARGS)
        result_url = 'https://www.youtube.com/results?' + compat_urllib_parse_urlencode(url_query)

        for pagenum in itertools.count(1):
            data = self._download_json(
                result_url, video_id='query "%s"' % query,
                note='Downloading page %s' % pagenum,
                errnote='Unable to download API page',
                query={'spf': 'navigate'})
            html_content = data[1]['body']['content']

            if 'class="search-message' in html_content:
                raise ExtractorError(
                    '[youtube] No video results', expected=True)

            new_videos = list(self._process_page(html_content))
            videos += new_videos
            # Stop as soon as enough videos were gathered.  Using ">=" (not
            # ">") avoids downloading one more page when exactly n results
            # have already been collected.
            if not new_videos or len(videos) >= n:
                break
            next_link = self._html_search_regex(
                r'href="(/results\?[^"]*\bsp=[^"]+)"[^>]*>\s*<span[^>]+class="[^"]*\byt-uix-button-content\b[^"]*"[^>]*>Next',
                html_content, 'next link', default=None)
            if next_link is None:
                break
            result_url = compat_urlparse.urljoin('https://www.youtube.com/', next_link)

        # The last page may have pushed the total past n
        if len(videos) > n:
            videos = videos[:n]
        return self.playlist_result(videos, query)
75dff0ee 2685
c9ae7b95 2686
class YoutubeSearchDateIE(YoutubeSearchIE):
    """Variant of YoutubeSearchIE that adds a ``search_sort`` query argument."""
    IE_NAME = YoutubeSearchIE.IE_NAME + ':date'
    IE_DESC = 'YouTube.com searches, newest videos first'
    _SEARCH_KEY = 'ytsearchdate'
    _EXTRA_QUERY_ARGS = {'search_sort': 'video_date_uploaded'}
75dff0ee 2692
c9ae7b95 2693
class YoutubeSearchURLIE(YoutubeSearchBaseInfoExtractor):
    """Extractor for ``/results?...`` search URLs."""
    IE_NAME = 'youtube:search_url'
    IE_DESC = 'YouTube.com search URLs'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/results\?(.*?&)?(?:search_query|q)=(?P<query>[^&]+)(?:[&]|$)'
    _TESTS = [{
        'url': 'https://www.youtube.com/results?baz=bar&search_query=youtube-dl+test+video&filters=video&lclk=video',
        'playlist_mincount': 5,
        'info_dict': {
            'title': 'youtube-dl test video',
        }
    }, {
        'url': 'https://www.youtube.com/results?q=test&sp=EgQIBBgB',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Return the videos of the results page, titled with the decoded query."""
        raw_query = re.match(self._VALID_URL, url).group('query')
        query = compat_urllib_parse_unquote_plus(raw_query)
        webpage = self._download_webpage(url, query)
        entries = self._process_page(webpage)
        return self.playlist_result(entries, playlist_title=query)
c9ae7b95
PH
2714
2715
class YoutubeShowIE(YoutubePlaylistsBaseInfoExtractor):
    """Extractor for ``/show/<id>`` URLs."""
    IE_NAME = 'youtube:show'
    IE_DESC = 'YouTube.com (multi-season) shows'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/show/(?P<id>[^?#]*)'
    _TESTS = [{
        'url': 'https://www.youtube.com/show/airdisasters',
        'playlist_mincount': 5,
        'info_dict': {
            'id': 'airdisasters',
            'title': 'Air Disasters',
        }
    }]

    def _real_extract(self, url):
        """Delegate to the parent extractor using the show's playlists URL."""
        playlist_id = self._match_id(url)
        playlists_url = 'https://www.youtube.com/show/%s/playlists' % playlist_id
        return super(YoutubeShowIE, self)._real_extract(playlists_url)
04cc9617
JMF
2733
2734
class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor):
    """
    Base class for feed extractors
    Subclasses must define the _FEED_NAME and _PLAYLIST_TITLE properties.
    """
    _LOGIN_REQUIRED = True

    @property
    def IE_NAME(self):
        """Extractor name derived from the subclass's ``_FEED_NAME``."""
        return 'youtube:%s' % self._FEED_NAME

    def _real_initialize(self):
        """Call ``self._login()`` when the extractor is initialized."""
        self._login()

    def _entries(self, page):
        """Yield results for every new video id found in the feed pages.

        Starts from ``page`` and keeps following the "load more" link
        until a portion brings no unseen id or no link is found.
        """
        # The extraction process is the same as for playlists, but the regex
        # for the video ids doesn't contain an index
        seen_ids = []
        more_widget_html = content_html = page
        for page_num in itertools.count(1):
            matches = re.findall(r'href="\s*/watch\?v=([0-9A-Za-z_-]{11})', content_html)

            # 'recommended' feed has infinite 'load more' and each new portion spins
            # the same videos in (sometimes) slightly different order, so we'll check
            # for unicity and break when portion has no new videos
            new_ids = [
                video_id for video_id in orderedSet(matches)
                if video_id not in seen_ids]
            if not new_ids:
                break

            seen_ids.extend(new_ids)

            for entry in self._ids_to_results(new_ids):
                yield entry

            load_more = re.search(
                r'data-uix-load-more-href="/?(?P<more>[^"]+)"', more_widget_html)
            if not load_more:
                break

            more = self._download_json(
                'https://youtube.com/%s' % load_more.group('more'), self._PLAYLIST_TITLE,
                'Downloading page #%s' % page_num,
                transform_source=uppercase_escape)
            content_html = more['content_html']
            more_widget_html = more['load_more_widget_html']

    def _real_extract(self, url):
        """Download the feed page and return its entries as a playlist."""
        feed_url = 'https://www.youtube.com/feed/%s' % self._FEED_NAME
        page = self._download_webpage(feed_url, self._PLAYLIST_TITLE)
        return self.playlist_result(
            self._entries(page), playlist_title=self._PLAYLIST_TITLE)
25f14e9f
S
2786
2787
class YoutubeWatchLaterIE(YoutubePlaylistIE):
    """Extractor for the 'WL' (watch later) playlist."""
    IE_NAME = 'youtube:watchlater'
    IE_DESC = 'Youtube watch later list, ":ytwatchlater" for short (requires authentication)'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/(?:feed/watch_later|(?:playlist|watch)\?(?:.+&)?list=WL)|:ytwatchlater'

    _TESTS = [{
        'url': 'https://www.youtube.com/playlist?list=WL',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/watch?v=bCNU9TrbiRk&index=1&list=WL',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Return the single requested video, or else the 'WL' playlist."""
        video = self._check_download_just_video(url, 'WL')[1]
        if video:
            return video
        # Only the playlist half of the (has_videos, playlist) pair is needed
        return self._extract_playlist('WL')[1]
f459d170 2807
5f6a1245 2808
class YoutubeFavouritesIE(YoutubeBaseInfoExtractor):
    """Extractor for the favourites page and the ``:ytfav`` keyword."""
    IE_NAME = 'youtube:favorites'
    IE_DESC = 'YouTube.com favourite videos, ":ytfav" for short (requires authentication)'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/my_favorites|:ytfav(?:ou?rites)?'
    _LOGIN_REQUIRED = True

    def _real_extract(self, url):
        """Find the playlist id in the favourites page and delegate to it."""
        favourites_page = self._download_webpage(
            'https://www.youtube.com/my_favorites', 'Youtube Favourites videos')
        playlist_id = self._search_regex(
            r'list=(.+?)["&]', favourites_page, 'favourites playlist id')
        return self.url_result(playlist_id, 'YoutubePlaylist')
15870e90
PH
2819
2820
25f14e9f
S
class YoutubeRecommendedIE(YoutubeFeedsInfoExtractor):
    """Feed extractor configured for the 'recommended' feed."""
    _FEED_NAME = 'recommended'
    _PLAYLIST_TITLE = 'Youtube Recommended videos'
    IE_DESC = 'YouTube.com recommended videos, ":ytrec" for short (requires authentication)'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/feed/recommended|:ytrec(?:ommended)?'
1ed5b5c9 2826
1ed5b5c9 2827
25f14e9f
S
class YoutubeSubscriptionsIE(YoutubeFeedsInfoExtractor):
    """Feed extractor configured for the 'subscriptions' feed."""
    _FEED_NAME = 'subscriptions'
    _PLAYLIST_TITLE = 'Youtube Subscriptions'
    IE_DESC = 'YouTube.com subscriptions feed, "ytsubs" keyword (requires authentication)'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/feed/subscriptions|:ytsubs(?:criptions)?'
1ed5b5c9 2833
1ed5b5c9 2834
25f14e9f
S
class YoutubeHistoryIE(YoutubeFeedsInfoExtractor):
    """Feed extractor configured for the 'history' feed."""
    _FEED_NAME = 'history'
    _PLAYLIST_TITLE = 'Youtube History'
    IE_DESC = 'Youtube watch history, ":ythistory" for short (requires authentication)'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/feed/history|:ythistory'
1ed5b5c9
JMF
2840
2841
15870e90
PH
class YoutubeTruncatedURLIE(InfoExtractor):
    """Matches watch/attribution URLs without a video id and reports an error."""
    IE_NAME = 'youtube:truncated_url'
    IE_DESC = False  # Do not list
    _VALID_URL = r'''(?x)
        (?:https?://)?
        (?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie)?\.com/
        (?:watch\?(?:
            feature=[a-z_]+|
            annotation_id=annotation_[^&]+|
            x-yt-cl=[0-9]+|
            hl=[^&]*|
            t=[0-9]+
        )?
        |
            attribution_link\?a=[^&]+
        )
        $
    '''

    _TESTS = [{
        'url': 'https://www.youtube.com/watch?annotation_id=annotation_3951667041',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/watch?',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/watch?x-yt-cl=84503534',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/watch?feature=foo',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/watch?hl=en-GB',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/watch?t=2372',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Always raise an expected ExtractorError with a quoting hint."""
        hint = (
            'Did you forget to quote the URL? Remember that & is a meta '
            'character in most shells, so you want to put the URL in quotes, '
            'like  youtube-dl '
            '"https://www.youtube.com/watch?feature=foo&v=BaW_jenozKc" '
            ' or simply  youtube-dl BaW_jenozKc  .')
        raise ExtractorError(hint, expected=True)
772fd5cc
PH
2889
2890
class YoutubeTruncatedIDIE(InfoExtractor):
    """Matches watch URLs whose ``v`` id is 1-10 characters and reports an error."""
    IE_NAME = 'youtube:truncated_id'
    IE_DESC = False  # Do not list
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/watch\?v=(?P<id>[0-9A-Za-z_-]{1,10})$'

    _TESTS = [{
        'url': 'https://www.youtube.com/watch?v=N_708QY7Ob',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Always raise an expected ExtractorError naming the short id."""
        video_id = self._match_id(url)
        message = 'Incomplete YouTube ID %s. URL %s looks truncated.' % (video_id, url)
        raise ExtractorError(message, expected=True)